1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 3 /* COMMON Applications Kept Enhanced (CAKE) discipline 4 * 5 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com> 6 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk> 7 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com> 8 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de> 9 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk> 10 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au> 11 * 12 * The CAKE Principles: 13 * (or, how to have your cake and eat it too) 14 * 15 * This is a combination of several shaping, AQM and FQ techniques into one 16 * easy-to-use package: 17 * 18 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE 19 * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), 20 * eliminating the need for any sort of burst parameter (eg. token bucket 21 * depth). Burst support is limited to that necessary to overcome scheduling 22 * latency. 23 * 24 * - A Diffserv-aware priority queue, giving more priority to certain classes, 25 * up to a specified fraction of bandwidth. Above that bandwidth threshold, 26 * the priority is reduced to avoid starving other tins. 27 * 28 * - Each priority tin has a separate Flow Queue system, to isolate traffic 29 * flows from each other. This prevents a burst on one flow from increasing 30 * the delay to another. Flows are distributed to queues using a 31 * set-associative hash function. 32 * 33 * - Each queue is actively managed by Cobalt, which is a combination of the 34 * Codel and Blue AQM algorithms. This serves flows fairly, and signals 35 * congestion early via ECN (if available) and/or packet drops, to keep 36 * latency low. The codel parameters are auto-tuned based on the bandwidth 37 * setting, as is necessary at low bandwidths. 38 * 39 * The configuration parameters are kept deliberately simple for ease of use. 40 * Everything has sane defaults. Complete generality of configuration is *not* 41 * a goal. 42 * 43 * The priority queue operates according to a weighted DRR scheme, combined with 44 * a bandwidth tracker which reuses the shaper logic to detect which side of the 45 * bandwidth sharing threshold the tin is operating. This determines whether a 46 * priority-based weight (high) or a bandwidth-based weight (low) is used for 47 * that tin in the current pass. 48 * 49 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly 50 * granted us permission to leverage. 
51 */ 52 53 #include <linux/module.h> 54 #include <linux/types.h> 55 #include <linux/kernel.h> 56 #include <linux/jiffies.h> 57 #include <linux/string.h> 58 #include <linux/in.h> 59 #include <linux/errno.h> 60 #include <linux/init.h> 61 #include <linux/skbuff.h> 62 #include <linux/jhash.h> 63 #include <linux/slab.h> 64 #include <linux/vmalloc.h> 65 #include <linux/reciprocal_div.h> 66 #include <net/netlink.h> 67 #include <linux/if_vlan.h> 68 #include <net/gso.h> 69 #include <net/pkt_sched.h> 70 #include <net/sch_priv.h> 71 #include <net/pkt_cls.h> 72 #include <net/tcp.h> 73 #include <net/flow_dissector.h> 74 75 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 76 #include <net/netfilter/nf_conntrack_core.h> 77 #endif 78 79 #define CAKE_SET_WAYS (8) 80 #define CAKE_MAX_TINS (8) 81 #define CAKE_QUEUES (1024) 82 #define CAKE_FLOW_MASK 63 83 #define CAKE_FLOW_NAT_FLAG 64 84 85 /* struct cobalt_params - contains codel and blue parameters 86 * @interval: codel initial drop rate 87 * @target: maximum persistent sojourn time & blue update rate 88 * @mtu_time: serialisation delay of maximum-size packet 89 * @p_inc: increment of blue drop probability (0.32 fxp) 90 * @p_dec: decrement of blue drop probability (0.32 fxp) 91 */ 92 struct cobalt_params { 93 u64 interval; 94 u64 target; 95 u64 mtu_time; 96 u32 p_inc; 97 u32 p_dec; 98 }; 99 100 /* struct cobalt_vars - contains codel and blue variables 101 * @count: codel dropping frequency 102 * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 103 * @drop_next: time to drop next packet, or when we dropped last 104 * @blue_timer: Blue time to next drop 105 * @p_drop: BLUE drop probability (0.32 fxp) 106 * @dropping: set if in dropping state 107 * @ecn_marked: set if marked 108 */ 109 struct cobalt_vars { 110 u32 count; 111 u32 rec_inv_sqrt; 112 ktime_t drop_next; 113 ktime_t blue_timer; 114 u32 p_drop; 115 bool dropping; 116 bool ecn_marked; 117 }; 118 119 enum { 120 CAKE_SET_NONE = 0, 121 CAKE_SET_SPARSE, 122 CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ 123 CAKE_SET_BULK, 124 CAKE_SET_DECAYING 125 }; 126 127 struct cake_flow { 128 /* this stuff is all needed per-flow at dequeue time */ 129 struct sk_buff *head; 130 struct sk_buff *tail; 131 struct list_head flowchain; 132 s32 deficit; 133 u32 dropped; 134 struct cobalt_vars cvars; 135 u16 srchost; /* index into cake_host table */ 136 u16 dsthost; 137 u8 set; 138 }; /* please try to keep this structure <= 64 bytes */ 139 140 struct cake_host { 141 u32 srchost_tag; 142 u32 dsthost_tag; 143 u16 srchost_bulk_flow_count; 144 u16 dsthost_bulk_flow_count; 145 }; 146 147 struct cake_heap_entry { 148 u16 t:3, b:10; 149 }; 150 151 struct cake_tin_data { 152 struct cake_flow flows[CAKE_QUEUES]; 153 u32 backlogs[CAKE_QUEUES]; 154 u32 tags[CAKE_QUEUES]; /* for set association */ 155 u16 overflow_idx[CAKE_QUEUES]; 156 struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ 157 u16 flow_quantum; 158 159 struct cobalt_params cparams; 160 u32 drop_overlimit; 161 u16 bulk_flow_count; 162 u16 sparse_flow_count; 163 u16 decaying_flow_count; 164 u16 unresponsive_flow_count; 165 166 u32 max_skblen; 167 168 struct list_head new_flows; 169 struct list_head old_flows; 170 struct list_head decaying_flows; 171 172 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 173 ktime_t time_next_packet; 174 u64 tin_rate_ns; 175 u64 tin_rate_bps; 176 u16 tin_rate_shft; 177 178 u16 tin_quantum; 179 s32 tin_deficit; 180 u32 tin_backlog; 181 u32 tin_dropped; 182 u32 tin_ecn_mark; 183 184 u32 packets; 185 u64 bytes; 186 
187 u32 ack_drops; 188 189 /* moving averages */ 190 u64 avge_delay; 191 u64 peak_delay; 192 u64 base_delay; 193 194 /* hash function stats */ 195 u32 way_directs; 196 u32 way_hits; 197 u32 way_misses; 198 u32 way_collisions; 199 }; /* number of tins is small, so size of this struct doesn't matter much */ 200 201 struct cake_sched_config { 202 u64 rate_bps; 203 u64 interval; 204 u64 target; 205 u64 sync_time; 206 u32 buffer_config_limit; 207 u32 fwmark_mask; 208 u16 fwmark_shft; 209 s16 rate_overhead; 210 u16 rate_mpu; 211 u16 rate_flags; 212 u8 tin_mode; 213 u8 flow_mode; 214 u8 atm_mode; 215 u8 ack_filter; 216 u8 is_shared; 217 }; 218 219 struct cake_sched_data { 220 struct tcf_proto __rcu *filter_list; /* optional external classifier */ 221 struct tcf_block *block; 222 struct cake_tin_data *tins; 223 struct cake_sched_config *config; 224 struct cake_sched_config initial_config; 225 226 struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; 227 228 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 229 ktime_t time_next_packet; 230 ktime_t failsafe_next_packet; 231 u64 rate_ns; 232 u16 rate_shft; 233 u16 overflow_timeout; 234 u16 tin_cnt; 235 236 /* resource tracking */ 237 u32 buffer_used; 238 u32 buffer_max_used; 239 u32 buffer_limit; 240 241 /* indices for dequeue */ 242 u16 cur_tin; 243 u16 cur_flow; 244 245 struct qdisc_watchdog watchdog; 246 const u8 *tin_index; 247 const u8 *tin_order; 248 249 /* bandwidth capacity estimate */ 250 ktime_t last_packet_time; 251 ktime_t avg_window_begin; 252 u64 avg_packet_interval; 253 u64 avg_window_bytes; 254 u64 avg_peak_bandwidth; 255 ktime_t last_reconfig_time; 256 257 /* packet length stats */ 258 u32 avg_netoff; 259 u16 max_netlen; 260 u16 max_adjlen; 261 u16 min_netlen; 262 u16 min_adjlen; 263 264 /* mq sync state */ 265 u64 last_checked_active; 266 u64 last_active; 267 u32 active_queues; 268 }; 269 270 enum { 271 CAKE_FLAG_OVERHEAD = BIT(0), 272 CAKE_FLAG_AUTORATE_INGRESS = BIT(1), 273 CAKE_FLAG_INGRESS = BIT(2), 274 CAKE_FLAG_WASH = BIT(3), 275 CAKE_FLAG_SPLIT_GSO = BIT(4) 276 }; 277 278 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to 279 * obtain the best features of each. Codel is excellent on flows which 280 * respond to congestion signals in a TCP-like way. BLUE is more effective on 281 * unresponsive flows. 
282 */ 283 284 struct cobalt_skb_cb { 285 ktime_t enqueue_time; 286 u32 adjusted_len; 287 }; 288 289 static u64 us_to_ns(u64 us) 290 { 291 return us * NSEC_PER_USEC; 292 } 293 294 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) 295 { 296 qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); 297 return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; 298 } 299 300 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) 301 { 302 return get_cobalt_cb(skb)->enqueue_time; 303 } 304 305 static void cobalt_set_enqueue_time(struct sk_buff *skb, 306 ktime_t now) 307 { 308 get_cobalt_cb(skb)->enqueue_time = now; 309 } 310 311 static u16 quantum_div[CAKE_QUEUES + 1] = {0}; 312 313 /* Diffserv lookup tables */ 314 315 static const u8 precedence[] = { 316 0, 0, 0, 0, 0, 0, 0, 0, 317 1, 1, 1, 1, 1, 1, 1, 1, 318 2, 2, 2, 2, 2, 2, 2, 2, 319 3, 3, 3, 3, 3, 3, 3, 3, 320 4, 4, 4, 4, 4, 4, 4, 4, 321 5, 5, 5, 5, 5, 5, 5, 5, 322 6, 6, 6, 6, 6, 6, 6, 6, 323 7, 7, 7, 7, 7, 7, 7, 7, 324 }; 325 326 static const u8 diffserv8[] = { 327 2, 0, 1, 2, 4, 2, 2, 2, 328 1, 2, 1, 2, 1, 2, 1, 2, 329 5, 2, 4, 2, 4, 2, 4, 2, 330 3, 2, 3, 2, 3, 2, 3, 2, 331 6, 2, 3, 2, 3, 2, 3, 2, 332 6, 2, 2, 2, 6, 2, 6, 2, 333 7, 2, 2, 2, 2, 2, 2, 2, 334 7, 2, 2, 2, 2, 2, 2, 2, 335 }; 336 337 static const u8 diffserv4[] = { 338 0, 1, 0, 0, 2, 0, 0, 0, 339 1, 0, 0, 0, 0, 0, 0, 0, 340 2, 0, 2, 0, 2, 0, 2, 0, 341 2, 0, 2, 0, 2, 0, 2, 0, 342 3, 0, 2, 0, 2, 0, 2, 0, 343 3, 0, 0, 0, 3, 0, 3, 0, 344 3, 0, 0, 0, 0, 0, 0, 0, 345 3, 0, 0, 0, 0, 0, 0, 0, 346 }; 347 348 static const u8 diffserv3[] = { 349 0, 1, 0, 0, 2, 0, 0, 0, 350 1, 0, 0, 0, 0, 0, 0, 0, 351 0, 0, 0, 0, 0, 0, 0, 0, 352 0, 0, 0, 0, 0, 0, 0, 0, 353 0, 0, 0, 0, 0, 0, 0, 0, 354 0, 0, 0, 0, 2, 0, 2, 0, 355 2, 0, 0, 0, 0, 0, 0, 0, 356 2, 0, 0, 0, 0, 0, 0, 0, 357 }; 358 359 static const u8 besteffort[] = { 360 0, 0, 0, 0, 0, 0, 0, 0, 361 0, 0, 0, 0, 0, 0, 0, 0, 362 0, 0, 0, 0, 0, 0, 0, 0, 363 0, 0, 0, 0, 0, 0, 0, 0, 364 0, 0, 0, 0, 0, 0, 0, 0, 365 0, 0, 0, 0, 0, 0, 0, 0, 366 0, 0, 0, 0, 0, 0, 0, 0, 367 0, 0, 0, 0, 0, 0, 0, 0, 368 }; 369 370 /* tin priority order for stats dumping */ 371 372 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; 373 static const u8 bulk_order[] = {1, 0, 2, 3}; 374 375 /* There is a big difference in timing between the accurate values placed in the 376 * cache and the approximations given by a single Newton step for small count 377 * values, particularly when stepping from count 1 to 2 or vice versa. Hence, 378 * these values are calculated using eight Newton steps, using the 379 * implementation below. Above 16, a single Newton step gives sufficient 380 * accuracy in either direction, given the precision stored. 381 * 382 * The magnitude of the error when stepping up to count 2 is such as to give the 383 * value that *should* have been produced at count 4. 
384 */ 385 386 #define REC_INV_SQRT_CACHE (16) 387 static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = { 388 ~0, ~0, 3037000500, 2479700525, 389 2147483647, 1920767767, 1753413056, 1623345051, 390 1518500250, 1431655765, 1358187914, 1294981364, 391 1239850263, 1191209601, 1147878294, 1108955788 392 }; 393 394 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, 395 u64 target_ns, u64 rtt_est_ns); 396 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots 397 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) 398 * 399 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 400 */ 401 402 static void cobalt_newton_step(struct cobalt_vars *vars) 403 { 404 u32 invsqrt, invsqrt2; 405 u64 val; 406 407 invsqrt = vars->rec_inv_sqrt; 408 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; 409 val = (3LL << 32) - ((u64)vars->count * invsqrt2); 410 411 val >>= 2; /* avoid overflow in following multiply */ 412 val = (val * invsqrt) >> (32 - 2 + 1); 413 414 vars->rec_inv_sqrt = val; 415 } 416 417 static void cobalt_invsqrt(struct cobalt_vars *vars) 418 { 419 if (vars->count < REC_INV_SQRT_CACHE) 420 vars->rec_inv_sqrt = inv_sqrt_cache[vars->count]; 421 else 422 cobalt_newton_step(vars); 423 } 424 425 static void cobalt_vars_init(struct cobalt_vars *vars) 426 { 427 memset(vars, 0, sizeof(*vars)); 428 } 429 430 /* CoDel control_law is t + interval/sqrt(count) 431 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid 432 * both sqrt() and divide operation. 433 */ 434 static ktime_t cobalt_control(ktime_t t, 435 u64 interval, 436 u32 rec_inv_sqrt) 437 { 438 return ktime_add_ns(t, reciprocal_scale(interval, 439 rec_inv_sqrt)); 440 } 441 442 /* Call this when a packet had to be dropped due to queue overflow. Returns 443 * true if the BLUE state was quiescent before but active after this call. 444 */ 445 static bool cobalt_queue_full(struct cobalt_vars *vars, 446 struct cobalt_params *p, 447 ktime_t now) 448 { 449 bool up = false; 450 451 if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 452 up = !vars->p_drop; 453 vars->p_drop += p->p_inc; 454 if (vars->p_drop < p->p_inc) 455 vars->p_drop = ~0; 456 vars->blue_timer = now; 457 } 458 vars->dropping = true; 459 vars->drop_next = now; 460 if (!vars->count) 461 vars->count = 1; 462 463 return up; 464 } 465 466 /* Call this when the queue was serviced but turned out to be empty. Returns 467 * true if the BLUE state was active before but quiescent after this call. 468 */ 469 static bool cobalt_queue_empty(struct cobalt_vars *vars, 470 struct cobalt_params *p, 471 ktime_t now) 472 { 473 bool down = false; 474 475 if (vars->p_drop && 476 ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 477 if (vars->p_drop < p->p_dec) 478 vars->p_drop = 0; 479 else 480 vars->p_drop -= p->p_dec; 481 vars->blue_timer = now; 482 down = !vars->p_drop; 483 } 484 vars->dropping = false; 485 486 if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { 487 vars->count--; 488 cobalt_invsqrt(vars); 489 vars->drop_next = cobalt_control(vars->drop_next, 490 p->interval, 491 vars->rec_inv_sqrt); 492 } 493 494 return down; 495 } 496 497 /* Call this with a freshly dequeued packet for possible congestion marking. 498 * Returns true as an instruction to drop the packet, false for delivery. 
499 */ 500 static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars, 501 struct cobalt_params *p, 502 ktime_t now, 503 struct sk_buff *skb, 504 u32 bulk_flows) 505 { 506 enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; 507 bool next_due, over_target; 508 ktime_t schedule; 509 u64 sojourn; 510 511 /* The 'schedule' variable records, in its sign, whether 'now' is before or 512 * after 'drop_next'. This allows 'drop_next' to be updated before the next 513 * scheduling decision is actually branched, without destroying that 514 * information. Similarly, the first 'schedule' value calculated is preserved 515 * in the boolean 'next_due'. 516 * 517 * As for 'drop_next', we take advantage of the fact that 'interval' is both 518 * the delay between first exceeding 'target' and the first signalling event, 519 * *and* the scaling factor for the signalling frequency. It's therefore very 520 * natural to use a single mechanism for both purposes, and eliminates a 521 * significant amount of reference Codel's spaghetti code. To help with this, 522 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close 523 * as possible to 1.0 in fixed-point. 524 */ 525 526 sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 527 schedule = ktime_sub(now, vars->drop_next); 528 over_target = sojourn > p->target && 529 sojourn > p->mtu_time * bulk_flows * 2 && 530 sojourn > p->mtu_time * 4; 531 next_due = vars->count && ktime_to_ns(schedule) >= 0; 532 533 vars->ecn_marked = false; 534 535 if (over_target) { 536 if (!vars->dropping) { 537 vars->dropping = true; 538 vars->drop_next = cobalt_control(now, 539 p->interval, 540 vars->rec_inv_sqrt); 541 } 542 if (!vars->count) 543 vars->count = 1; 544 } else if (vars->dropping) { 545 vars->dropping = false; 546 } 547 548 if (next_due && vars->dropping) { 549 /* Use ECN mark if possible, otherwise drop */ 550 if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) 551 reason = SKB_DROP_REASON_QDISC_CONGESTED; 552 553 vars->count++; 554 if (!vars->count) 555 vars->count--; 556 cobalt_invsqrt(vars); 557 vars->drop_next = cobalt_control(vars->drop_next, 558 p->interval, 559 vars->rec_inv_sqrt); 560 schedule = ktime_sub(now, vars->drop_next); 561 } else { 562 while (next_due) { 563 vars->count--; 564 cobalt_invsqrt(vars); 565 vars->drop_next = cobalt_control(vars->drop_next, 566 p->interval, 567 vars->rec_inv_sqrt); 568 schedule = ktime_sub(now, vars->drop_next); 569 next_due = vars->count && ktime_to_ns(schedule) >= 0; 570 } 571 } 572 573 /* Simple BLUE implementation. Lack of ECN is deliberate. */ 574 if (vars->p_drop && reason == SKB_NOT_DROPPED_YET && 575 get_random_u32() < vars->p_drop) 576 reason = SKB_DROP_REASON_CAKE_FLOOD; 577 578 /* Overload the drop_next field as an activity timeout */ 579 if (!vars->count) 580 vars->drop_next = ktime_add_ns(now, p->interval); 581 else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET) 582 vars->drop_next = now; 583 584 return reason; 585 } 586 587 static bool cake_update_flowkeys(struct flow_keys *keys, 588 const struct sk_buff *skb) 589 { 590 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 591 struct nf_conntrack_tuple tuple = {}; 592 bool rev = !skb->_nfct, upd = false; 593 __be32 ip; 594 595 if (skb_protocol(skb, true) != htons(ETH_P_IP)) 596 return false; 597 598 if (!nf_ct_get_tuple_skb(&tuple, skb)) 599 return false; 600 601 ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; 602 if (ip != keys->addrs.v4addrs.src) { 603 keys->addrs.v4addrs.src = ip; 604 upd = true; 605 } 606 ip = rev ? 
tuple.src.u3.ip : tuple.dst.u3.ip; 607 if (ip != keys->addrs.v4addrs.dst) { 608 keys->addrs.v4addrs.dst = ip; 609 upd = true; 610 } 611 612 if (keys->ports.ports) { 613 __be16 port; 614 615 port = rev ? tuple.dst.u.all : tuple.src.u.all; 616 if (port != keys->ports.src) { 617 keys->ports.src = port; 618 upd = true; 619 } 620 port = rev ? tuple.src.u.all : tuple.dst.u.all; 621 if (port != keys->ports.dst) { 622 port = keys->ports.dst; 623 upd = true; 624 } 625 } 626 return upd; 627 #else 628 return false; 629 #endif 630 } 631 632 /* Cake has several subtle multiple bit settings. In these cases you 633 * would be matching triple isolate mode as well. 634 */ 635 636 static bool cake_dsrc(int flow_mode) 637 { 638 return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; 639 } 640 641 static bool cake_ddst(int flow_mode) 642 { 643 return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; 644 } 645 646 static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q, 647 struct cake_flow *flow, 648 int flow_mode) 649 { 650 if (likely(cake_dsrc(flow_mode) && 651 q->hosts[flow->srchost].srchost_bulk_flow_count)) 652 q->hosts[flow->srchost].srchost_bulk_flow_count--; 653 } 654 655 static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q, 656 struct cake_flow *flow, 657 int flow_mode) 658 { 659 if (likely(cake_dsrc(flow_mode) && 660 q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES)) 661 q->hosts[flow->srchost].srchost_bulk_flow_count++; 662 } 663 664 static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q, 665 struct cake_flow *flow, 666 int flow_mode) 667 { 668 if (likely(cake_ddst(flow_mode) && 669 q->hosts[flow->dsthost].dsthost_bulk_flow_count)) 670 q->hosts[flow->dsthost].dsthost_bulk_flow_count--; 671 } 672 673 static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q, 674 struct cake_flow *flow, 675 int flow_mode) 676 { 677 if (likely(cake_ddst(flow_mode) && 678 q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES)) 679 q->hosts[flow->dsthost].dsthost_bulk_flow_count++; 680 } 681 682 static u16 cake_get_flow_quantum(struct cake_tin_data *q, 683 struct cake_flow *flow, 684 int flow_mode) 685 { 686 u16 host_load = 1; 687 688 if (cake_dsrc(flow_mode)) 689 host_load = max(host_load, 690 q->hosts[flow->srchost].srchost_bulk_flow_count); 691 692 if (cake_ddst(flow_mode)) 693 host_load = max(host_load, 694 q->hosts[flow->dsthost].dsthost_bulk_flow_count); 695 696 /* The get_random_u16() is a way to apply dithering to avoid 697 * accumulating roundoff errors 698 */ 699 return (q->flow_quantum * quantum_div[host_load] + 700 get_random_u16()) >> 16; 701 } 702 703 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, 704 int flow_mode, u16 flow_override, u16 host_override) 705 { 706 bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); 707 bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); 708 bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); 709 u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; 710 u16 reduced_hash, srchost_idx, dsthost_idx; 711 struct flow_keys keys, host_keys; 712 bool use_skbhash = skb->l4_hash; 713 714 if (unlikely(flow_mode == CAKE_FLOW_NONE)) 715 return 0; 716 717 /* If both overrides are set, or we can use the SKB hash and nat mode is 718 * disabled, we can skip packet dissection entirely. If nat mode is 719 * enabled there's another check below after doing the conntrack lookup. 
720 */ 721 if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) 722 goto skip_hash; 723 724 skb_flow_dissect_flow_keys(skb, &keys, 725 FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); 726 727 /* Don't use the SKB hash if we change the lookup keys from conntrack */ 728 if (nat_enabled && cake_update_flowkeys(&keys, skb)) 729 use_skbhash = false; 730 731 /* If we can still use the SKB hash and don't need the host hash, we can 732 * skip the rest of the hashing procedure 733 */ 734 if (use_skbhash && !hash_hosts) 735 goto skip_hash; 736 737 /* flow_hash_from_keys() sorts the addresses by value, so we have 738 * to preserve their order in a separate data structure to treat 739 * src and dst host addresses as independently selectable. 740 */ 741 host_keys = keys; 742 host_keys.ports.ports = 0; 743 host_keys.basic.ip_proto = 0; 744 host_keys.keyid.keyid = 0; 745 host_keys.tags.flow_label = 0; 746 747 switch (host_keys.control.addr_type) { 748 case FLOW_DISSECTOR_KEY_IPV4_ADDRS: 749 host_keys.addrs.v4addrs.src = 0; 750 dsthost_hash = flow_hash_from_keys(&host_keys); 751 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; 752 host_keys.addrs.v4addrs.dst = 0; 753 srchost_hash = flow_hash_from_keys(&host_keys); 754 break; 755 756 case FLOW_DISSECTOR_KEY_IPV6_ADDRS: 757 memset(&host_keys.addrs.v6addrs.src, 0, 758 sizeof(host_keys.addrs.v6addrs.src)); 759 dsthost_hash = flow_hash_from_keys(&host_keys); 760 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 761 memset(&host_keys.addrs.v6addrs.dst, 0, 762 sizeof(host_keys.addrs.v6addrs.dst)); 763 srchost_hash = flow_hash_from_keys(&host_keys); 764 break; 765 766 default: 767 dsthost_hash = 0; 768 srchost_hash = 0; 769 } 770 771 /* This *must* be after the above switch, since as a 772 * side-effect it sorts the src and dst addresses. 773 */ 774 if (hash_flows && !use_skbhash) 775 flow_hash = flow_hash_from_keys(&keys); 776 777 skip_hash: 778 if (flow_override) 779 flow_hash = flow_override - 1; 780 else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS)) 781 flow_hash = skb->hash; 782 if (host_override) { 783 dsthost_hash = host_override - 1; 784 srchost_hash = host_override - 1; 785 } 786 787 if (!(flow_mode & CAKE_FLOW_FLOWS)) { 788 if (flow_mode & CAKE_FLOW_SRC_IP) 789 flow_hash ^= srchost_hash; 790 791 if (flow_mode & CAKE_FLOW_DST_IP) 792 flow_hash ^= dsthost_hash; 793 } 794 795 reduced_hash = flow_hash % CAKE_QUEUES; 796 797 /* set-associative hashing */ 798 /* fast path if no hash collision (direct lookup succeeds) */ 799 if (likely(q->tags[reduced_hash] == flow_hash && 800 q->flows[reduced_hash].set)) { 801 q->way_directs++; 802 } else { 803 u32 inner_hash = reduced_hash % CAKE_SET_WAYS; 804 u32 outer_hash = reduced_hash - inner_hash; 805 bool allocate_src = false; 806 bool allocate_dst = false; 807 u32 i, k; 808 809 /* check if any active queue in the set is reserved for 810 * this flow. 811 */ 812 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 813 i++, k = (k + 1) % CAKE_SET_WAYS) { 814 if (q->tags[outer_hash + k] == flow_hash) { 815 if (i) 816 q->way_hits++; 817 818 if (!q->flows[outer_hash + k].set) { 819 /* need to increment host refcnts */ 820 allocate_src = cake_dsrc(flow_mode); 821 allocate_dst = cake_ddst(flow_mode); 822 } 823 824 goto found; 825 } 826 } 827 828 /* no queue is reserved for this flow, look for an 829 * empty one. 
830 */ 831 for (i = 0; i < CAKE_SET_WAYS; 832 i++, k = (k + 1) % CAKE_SET_WAYS) { 833 if (!q->flows[outer_hash + k].set) { 834 q->way_misses++; 835 allocate_src = cake_dsrc(flow_mode); 836 allocate_dst = cake_ddst(flow_mode); 837 goto found; 838 } 839 } 840 841 /* With no empty queues, default to the original 842 * queue, accept the collision, update the host tags. 843 */ 844 q->way_collisions++; 845 allocate_src = cake_dsrc(flow_mode); 846 allocate_dst = cake_ddst(flow_mode); 847 848 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { 849 cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); 850 cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); 851 } 852 found: 853 /* reserve queue for future packets in same flow */ 854 reduced_hash = outer_hash + k; 855 q->tags[reduced_hash] = flow_hash; 856 857 if (allocate_src) { 858 srchost_idx = srchost_hash % CAKE_QUEUES; 859 inner_hash = srchost_idx % CAKE_SET_WAYS; 860 outer_hash = srchost_idx - inner_hash; 861 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 862 i++, k = (k + 1) % CAKE_SET_WAYS) { 863 if (q->hosts[outer_hash + k].srchost_tag == 864 srchost_hash) 865 goto found_src; 866 } 867 for (i = 0; i < CAKE_SET_WAYS; 868 i++, k = (k + 1) % CAKE_SET_WAYS) { 869 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) 870 break; 871 } 872 q->hosts[outer_hash + k].srchost_tag = srchost_hash; 873 found_src: 874 srchost_idx = outer_hash + k; 875 q->flows[reduced_hash].srchost = srchost_idx; 876 877 if (q->flows[reduced_hash].set == CAKE_SET_BULK) 878 cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); 879 } 880 881 if (allocate_dst) { 882 dsthost_idx = dsthost_hash % CAKE_QUEUES; 883 inner_hash = dsthost_idx % CAKE_SET_WAYS; 884 outer_hash = dsthost_idx - inner_hash; 885 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 886 i++, k = (k + 1) % CAKE_SET_WAYS) { 887 if (q->hosts[outer_hash + k].dsthost_tag == 888 dsthost_hash) 889 goto found_dst; 890 } 891 for (i = 0; i < CAKE_SET_WAYS; 892 i++, k = (k + 1) % CAKE_SET_WAYS) { 893 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) 894 break; 895 } 896 q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; 897 found_dst: 898 dsthost_idx = outer_hash + k; 899 q->flows[reduced_hash].dsthost = dsthost_idx; 900 901 if (q->flows[reduced_hash].set == CAKE_SET_BULK) 902 cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); 903 } 904 } 905 906 return reduced_hash; 907 } 908 909 /* helper functions : might be changed when/if skb use a standard list_head */ 910 /* remove one skb from head of slot queue */ 911 912 static struct sk_buff *dequeue_head(struct cake_flow *flow) 913 { 914 struct sk_buff *skb = flow->head; 915 916 if (skb) { 917 flow->head = skb->next; 918 skb_mark_not_on_list(skb); 919 } 920 921 return skb; 922 } 923 924 /* add skb to flow queue (tail add) */ 925 926 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) 927 { 928 if (!flow->head) 929 flow->head = skb; 930 else 931 flow->tail->next = skb; 932 flow->tail = skb; 933 skb->next = NULL; 934 } 935 936 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, 937 struct ipv6hdr *buf) 938 { 939 unsigned int offset = skb_network_offset(skb); 940 struct iphdr *iph; 941 942 iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); 943 944 if (!iph) 945 return NULL; 946 947 if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) 948 return skb_header_pointer(skb, offset + iph->ihl * 4, 949 sizeof(struct ipv6hdr), buf); 950 951 else if 
(iph->version == 4) 952 return iph; 953 954 else if (iph->version == 6) 955 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), 956 buf); 957 958 return NULL; 959 } 960 961 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, 962 void *buf, unsigned int bufsize) 963 { 964 unsigned int offset = skb_network_offset(skb); 965 const struct ipv6hdr *ipv6h; 966 const struct tcphdr *tcph; 967 const struct iphdr *iph; 968 struct ipv6hdr _ipv6h; 969 struct tcphdr _tcph; 970 971 ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); 972 973 if (!ipv6h) 974 return NULL; 975 976 if (ipv6h->version == 4) { 977 iph = (struct iphdr *)ipv6h; 978 offset += iph->ihl * 4; 979 980 /* special-case 6in4 tunnelling, as that is a common way to get 981 * v6 connectivity in the home 982 */ 983 if (iph->protocol == IPPROTO_IPV6) { 984 ipv6h = skb_header_pointer(skb, offset, 985 sizeof(_ipv6h), &_ipv6h); 986 987 if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) 988 return NULL; 989 990 offset += sizeof(struct ipv6hdr); 991 992 } else if (iph->protocol != IPPROTO_TCP) { 993 return NULL; 994 } 995 996 } else if (ipv6h->version == 6) { 997 if (ipv6h->nexthdr != IPPROTO_TCP) 998 return NULL; 999 1000 offset += sizeof(struct ipv6hdr); 1001 } else { 1002 return NULL; 1003 } 1004 1005 tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); 1006 if (!tcph || tcph->doff < 5) 1007 return NULL; 1008 1009 return skb_header_pointer(skb, offset, 1010 min(__tcp_hdrlen(tcph), bufsize), buf); 1011 } 1012 1013 static const void *cake_get_tcpopt(const struct tcphdr *tcph, 1014 int code, int *oplen) 1015 { 1016 /* inspired by tcp_parse_options in tcp_input.c */ 1017 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1018 const u8 *ptr = (const u8 *)(tcph + 1); 1019 1020 while (length > 0) { 1021 int opcode = *ptr++; 1022 int opsize; 1023 1024 if (opcode == TCPOPT_EOL) 1025 break; 1026 if (opcode == TCPOPT_NOP) { 1027 length--; 1028 continue; 1029 } 1030 if (length < 2) 1031 break; 1032 opsize = *ptr++; 1033 if (opsize < 2 || opsize > length) 1034 break; 1035 1036 if (opcode == code) { 1037 *oplen = opsize; 1038 return ptr; 1039 } 1040 1041 ptr += opsize - 2; 1042 length -= opsize; 1043 } 1044 1045 return NULL; 1046 } 1047 1048 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more 1049 * bytes than the other. In the case where both sequences ACKs bytes that the 1050 * other doesn't, A is considered greater. DSACKs in A also makes A be 1051 * considered greater. 
1052 * 1053 * @return -1, 0 or 1 as normal compare functions 1054 */ 1055 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, 1056 const struct tcphdr *tcph_b) 1057 { 1058 const struct tcp_sack_block_wire *sack_a, *sack_b; 1059 u32 ack_seq_a = ntohl(tcph_a->ack_seq); 1060 u32 bytes_a = 0, bytes_b = 0; 1061 int oplen_a, oplen_b; 1062 bool first = true; 1063 1064 sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); 1065 sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); 1066 1067 /* pointers point to option contents */ 1068 oplen_a -= TCPOLEN_SACK_BASE; 1069 oplen_b -= TCPOLEN_SACK_BASE; 1070 1071 if (sack_a && oplen_a >= sizeof(*sack_a) && 1072 (!sack_b || oplen_b < sizeof(*sack_b))) 1073 return -1; 1074 else if (sack_b && oplen_b >= sizeof(*sack_b) && 1075 (!sack_a || oplen_a < sizeof(*sack_a))) 1076 return 1; 1077 else if ((!sack_a || oplen_a < sizeof(*sack_a)) && 1078 (!sack_b || oplen_b < sizeof(*sack_b))) 1079 return 0; 1080 1081 while (oplen_a >= sizeof(*sack_a)) { 1082 const struct tcp_sack_block_wire *sack_tmp = sack_b; 1083 u32 start_a = get_unaligned_be32(&sack_a->start_seq); 1084 u32 end_a = get_unaligned_be32(&sack_a->end_seq); 1085 int oplen_tmp = oplen_b; 1086 bool found = false; 1087 1088 /* DSACK; always considered greater to prevent dropping */ 1089 if (before(start_a, ack_seq_a)) 1090 return -1; 1091 1092 bytes_a += end_a - start_a; 1093 1094 while (oplen_tmp >= sizeof(*sack_tmp)) { 1095 u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); 1096 u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); 1097 1098 /* first time through we count the total size */ 1099 if (first) 1100 bytes_b += end_b - start_b; 1101 1102 if (!after(start_b, start_a) && !before(end_b, end_a)) { 1103 found = true; 1104 if (!first) 1105 break; 1106 } 1107 oplen_tmp -= sizeof(*sack_tmp); 1108 sack_tmp++; 1109 } 1110 1111 if (!found) 1112 return -1; 1113 1114 oplen_a -= sizeof(*sack_a); 1115 sack_a++; 1116 first = false; 1117 } 1118 1119 /* If we made it this far, all ranges SACKed by A are covered by B, so 1120 * either the SACKs are equal, or B SACKs more bytes. 1121 */ 1122 return bytes_b > bytes_a ? 
1 : 0; 1123 } 1124 1125 static void cake_tcph_get_tstamp(const struct tcphdr *tcph, 1126 u32 *tsval, u32 *tsecr) 1127 { 1128 const u8 *ptr; 1129 int opsize; 1130 1131 ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); 1132 1133 if (ptr && opsize == TCPOLEN_TIMESTAMP) { 1134 *tsval = get_unaligned_be32(ptr); 1135 *tsecr = get_unaligned_be32(ptr + 4); 1136 } 1137 } 1138 1139 static bool cake_tcph_may_drop(const struct tcphdr *tcph, 1140 u32 tstamp_new, u32 tsecr_new) 1141 { 1142 /* inspired by tcp_parse_options in tcp_input.c */ 1143 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1144 const u8 *ptr = (const u8 *)(tcph + 1); 1145 u32 tstamp, tsecr; 1146 1147 /* 3 reserved flags must be unset to avoid future breakage 1148 * ACK must be set 1149 * ECE/CWR are handled separately 1150 * All other flags URG/PSH/RST/SYN/FIN must be unset 1151 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) 1152 * 0x00C00000 = CWR/ECE (handled separately) 1153 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 1154 */ 1155 if (((tcp_flag_word(tcph) & 1156 cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) 1157 return false; 1158 1159 while (length > 0) { 1160 int opcode = *ptr++; 1161 int opsize; 1162 1163 if (opcode == TCPOPT_EOL) 1164 break; 1165 if (opcode == TCPOPT_NOP) { 1166 length--; 1167 continue; 1168 } 1169 if (length < 2) 1170 break; 1171 opsize = *ptr++; 1172 if (opsize < 2 || opsize > length) 1173 break; 1174 1175 switch (opcode) { 1176 case TCPOPT_MD5SIG: /* doesn't influence state */ 1177 break; 1178 1179 case TCPOPT_SACK: /* stricter checking performed later */ 1180 if (opsize % 8 != 2) 1181 return false; 1182 break; 1183 1184 case TCPOPT_TIMESTAMP: 1185 /* only drop timestamps lower than new */ 1186 if (opsize != TCPOLEN_TIMESTAMP) 1187 return false; 1188 tstamp = get_unaligned_be32(ptr); 1189 tsecr = get_unaligned_be32(ptr + 4); 1190 if (after(tstamp, tstamp_new) || 1191 after(tsecr, tsecr_new)) 1192 return false; 1193 break; 1194 1195 case TCPOPT_MSS: /* these should only be set on SYN */ 1196 case TCPOPT_WINDOW: 1197 case TCPOPT_SACK_PERM: 1198 case TCPOPT_FASTOPEN: 1199 case TCPOPT_EXP: 1200 default: /* don't drop if any unknown options are present */ 1201 return false; 1202 } 1203 1204 ptr += opsize - 2; 1205 length -= opsize; 1206 } 1207 1208 return true; 1209 } 1210 1211 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, 1212 struct cake_flow *flow) 1213 { 1214 bool aggressive = q->config->ack_filter == CAKE_ACK_AGGRESSIVE; 1215 struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; 1216 struct sk_buff *skb_check, *skb_prev = NULL; 1217 const struct ipv6hdr *ipv6h, *ipv6h_check; 1218 unsigned char _tcph[64], _tcph_check[64]; 1219 const struct tcphdr *tcph, *tcph_check; 1220 const struct iphdr *iph, *iph_check; 1221 struct ipv6hdr _iph, _iph_check; 1222 const struct sk_buff *skb; 1223 int seglen, num_found = 0; 1224 u32 tstamp = 0, tsecr = 0; 1225 __be32 elig_flags = 0; 1226 int sack_comp; 1227 1228 /* no other possible ACKs to filter */ 1229 if (flow->head == flow->tail) 1230 return NULL; 1231 1232 skb = flow->tail; 1233 tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); 1234 iph = cake_get_iphdr(skb, &_iph); 1235 if (!tcph) 1236 return NULL; 1237 1238 cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); 1239 1240 /* the 'triggering' packet need only have the ACK flag set. 1241 * also check that SYN is not set, as there won't be any previous ACKs. 
1242 */ 1243 if ((tcp_flag_word(tcph) & 1244 (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) 1245 return NULL; 1246 1247 /* the 'triggering' ACK is at the tail of the queue, we have already 1248 * returned if it is the only packet in the flow. loop through the rest 1249 * of the queue looking for pure ACKs with the same 5-tuple as the 1250 * triggering one. 1251 */ 1252 for (skb_check = flow->head; 1253 skb_check && skb_check != skb; 1254 skb_prev = skb_check, skb_check = skb_check->next) { 1255 iph_check = cake_get_iphdr(skb_check, &_iph_check); 1256 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, 1257 sizeof(_tcph_check)); 1258 1259 /* only TCP packets with matching 5-tuple are eligible, and only 1260 * drop safe headers 1261 */ 1262 if (!tcph_check || iph->version != iph_check->version || 1263 tcph_check->source != tcph->source || 1264 tcph_check->dest != tcph->dest) 1265 continue; 1266 1267 if (iph_check->version == 4) { 1268 if (iph_check->saddr != iph->saddr || 1269 iph_check->daddr != iph->daddr) 1270 continue; 1271 1272 seglen = iph_totlen(skb, iph_check) - 1273 (4 * iph_check->ihl); 1274 } else if (iph_check->version == 6) { 1275 ipv6h = (struct ipv6hdr *)iph; 1276 ipv6h_check = (struct ipv6hdr *)iph_check; 1277 1278 if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || 1279 ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) 1280 continue; 1281 1282 seglen = ipv6_payload_len(skb, ipv6h_check); 1283 } else { 1284 WARN_ON(1); /* shouldn't happen */ 1285 continue; 1286 } 1287 1288 /* If the ECE/CWR flags changed from the previous eligible 1289 * packet in the same flow, we should no longer be dropping that 1290 * previous packet as this would lose information. 1291 */ 1292 if (elig_ack && (tcp_flag_word(tcph_check) & 1293 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { 1294 elig_ack = NULL; 1295 elig_ack_prev = NULL; 1296 num_found--; 1297 } 1298 1299 /* Check TCP options and flags, don't drop ACKs with segment 1300 * data, and don't drop ACKs with a higher cumulative ACK 1301 * counter than the triggering packet. Check ACK seqno here to 1302 * avoid parsing SACK options of packets we are going to exclude 1303 * anyway. 1304 */ 1305 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || 1306 (seglen - __tcp_hdrlen(tcph_check)) != 0 || 1307 after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) 1308 continue; 1309 1310 /* Check SACK options. The triggering packet must SACK more data 1311 * than the ACK under consideration, or SACK the same range but 1312 * have a larger cumulative ACK counter. The latter is a 1313 * pathological case, but is contained in the following check 1314 * anyway, just to be safe. 1315 */ 1316 sack_comp = cake_tcph_sack_compare(tcph_check, tcph); 1317 1318 if (sack_comp < 0 || 1319 (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && 1320 sack_comp == 0)) 1321 continue; 1322 1323 /* At this point we have found an eligible pure ACK to drop; if 1324 * we are in aggressive mode, we are done. Otherwise, keep 1325 * searching unless this is the second eligible ACK we 1326 * found. 1327 * 1328 * Since we want to drop ACK closest to the head of the queue, 1329 * save the first eligible ACK we find, even if we need to loop 1330 * again. 1331 */ 1332 if (!elig_ack) { 1333 elig_ack = skb_check; 1334 elig_ack_prev = skb_prev; 1335 elig_flags = (tcp_flag_word(tcph_check) 1336 & (TCP_FLAG_ECE | TCP_FLAG_CWR)); 1337 } 1338 1339 if (num_found++ > 0) 1340 goto found; 1341 } 1342 1343 /* We made it through the queue without finding two eligible ACKs . 
If 1344 * we found a single eligible ACK we can drop it in aggressive mode if 1345 * we can guarantee that this does not interfere with ECN flag 1346 * information. We ensure this by dropping it only if the enqueued 1347 * packet is consecutive with the eligible ACK, and their flags match. 1348 */ 1349 if (elig_ack && aggressive && elig_ack->next == skb && 1350 (elig_flags == (tcp_flag_word(tcph) & 1351 (TCP_FLAG_ECE | TCP_FLAG_CWR)))) 1352 goto found; 1353 1354 return NULL; 1355 1356 found: 1357 if (elig_ack_prev) 1358 elig_ack_prev->next = elig_ack->next; 1359 else 1360 flow->head = elig_ack->next; 1361 1362 skb_mark_not_on_list(elig_ack); 1363 1364 return elig_ack; 1365 } 1366 1367 static u64 cake_ewma(u64 avg, u64 sample, u32 shift) 1368 { 1369 avg -= avg >> shift; 1370 avg += sample >> shift; 1371 return avg; 1372 } 1373 1374 static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off) 1375 { 1376 struct cake_sched_config *q = qd->config; 1377 1378 if (q->rate_flags & CAKE_FLAG_OVERHEAD) 1379 len -= off; 1380 1381 if (qd->max_netlen < len) 1382 qd->max_netlen = len; 1383 if (qd->min_netlen > len) 1384 qd->min_netlen = len; 1385 1386 len += q->rate_overhead; 1387 1388 if (len < q->rate_mpu) 1389 len = q->rate_mpu; 1390 1391 if (q->atm_mode == CAKE_ATM_ATM) { 1392 len += 47; 1393 len /= 48; 1394 len *= 53; 1395 } else if (q->atm_mode == CAKE_ATM_PTM) { 1396 /* Add one byte per 64 bytes or part thereof. 1397 * This is conservative and easier to calculate than the 1398 * precise value. 1399 */ 1400 len += (len + 63) / 64; 1401 } 1402 1403 if (qd->max_adjlen < len) 1404 qd->max_adjlen = len; 1405 if (qd->min_adjlen > len) 1406 qd->min_adjlen = len; 1407 1408 return len; 1409 } 1410 1411 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) 1412 { 1413 const struct skb_shared_info *shinfo = skb_shinfo(skb); 1414 unsigned int hdr_len, last_len = 0; 1415 u32 off = skb_network_offset(skb); 1416 u16 segs = qdisc_pkt_segs(skb); 1417 u32 len = qdisc_pkt_len(skb); 1418 1419 q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); 1420 1421 if (segs == 1) 1422 return cake_calc_overhead(q, len, off); 1423 1424 /* borrowed from qdisc_pkt_len_segs_init() */ 1425 if (!skb->encapsulation) 1426 hdr_len = skb_transport_offset(skb); 1427 else 1428 hdr_len = skb_inner_transport_offset(skb); 1429 1430 /* + transport layer */ 1431 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | 1432 SKB_GSO_TCPV6))) { 1433 const struct tcphdr *th; 1434 struct tcphdr _tcphdr; 1435 1436 th = skb_header_pointer(skb, hdr_len, 1437 sizeof(_tcphdr), &_tcphdr); 1438 if (likely(th)) 1439 hdr_len += __tcp_hdrlen(th); 1440 } else { 1441 struct udphdr _udphdr; 1442 1443 if (skb_header_pointer(skb, hdr_len, 1444 sizeof(_udphdr), &_udphdr)) 1445 hdr_len += sizeof(struct udphdr); 1446 } 1447 1448 len = shinfo->gso_size + hdr_len; 1449 last_len = skb->len - shinfo->gso_size * (segs - 1); 1450 1451 return (cake_calc_overhead(q, len, off) * (segs - 1) + 1452 cake_calc_overhead(q, last_len, off)); 1453 } 1454 1455 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) 1456 { 1457 struct cake_heap_entry ii = q->overflow_heap[i]; 1458 struct cake_heap_entry jj = q->overflow_heap[j]; 1459 1460 q->overflow_heap[i] = jj; 1461 q->overflow_heap[j] = ii; 1462 1463 q->tins[ii.t].overflow_idx[ii.b] = j; 1464 q->tins[jj.t].overflow_idx[jj.b] = i; 1465 } 1466 1467 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) 1468 { 1469 struct cake_heap_entry ii = q->overflow_heap[i]; 1470 1471 
return q->tins[ii.t].backlogs[ii.b]; 1472 } 1473 1474 static void cake_heapify(struct cake_sched_data *q, u16 i) 1475 { 1476 static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; 1477 u32 mb = cake_heap_get_backlog(q, i); 1478 u32 m = i; 1479 1480 while (m < a) { 1481 u32 l = m + m + 1; 1482 u32 r = l + 1; 1483 1484 if (l < a) { 1485 u32 lb = cake_heap_get_backlog(q, l); 1486 1487 if (lb > mb) { 1488 m = l; 1489 mb = lb; 1490 } 1491 } 1492 1493 if (r < a) { 1494 u32 rb = cake_heap_get_backlog(q, r); 1495 1496 if (rb > mb) { 1497 m = r; 1498 mb = rb; 1499 } 1500 } 1501 1502 if (m != i) { 1503 cake_heap_swap(q, i, m); 1504 i = m; 1505 } else { 1506 break; 1507 } 1508 } 1509 } 1510 1511 static void cake_heapify_up(struct cake_sched_data *q, u16 i) 1512 { 1513 while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { 1514 u16 p = (i - 1) >> 1; 1515 u32 ib = cake_heap_get_backlog(q, i); 1516 u32 pb = cake_heap_get_backlog(q, p); 1517 1518 if (ib > pb) { 1519 cake_heap_swap(q, i, p); 1520 i = p; 1521 } else { 1522 break; 1523 } 1524 } 1525 } 1526 1527 static int cake_advance_shaper(struct cake_sched_data *q, 1528 struct cake_tin_data *b, 1529 struct sk_buff *skb, 1530 ktime_t now, bool drop) 1531 { 1532 u32 len = get_cobalt_cb(skb)->adjusted_len; 1533 1534 /* charge packet bandwidth to this tin 1535 * and to the global shaper. 1536 */ 1537 if (q->rate_ns) { 1538 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; 1539 u64 global_dur = (len * q->rate_ns) >> q->rate_shft; 1540 u64 failsafe_dur = global_dur + (global_dur >> 1); 1541 1542 if (ktime_before(b->time_next_packet, now)) 1543 b->time_next_packet = ktime_add_ns(b->time_next_packet, 1544 tin_dur); 1545 1546 else if (ktime_before(b->time_next_packet, 1547 ktime_add_ns(now, tin_dur))) 1548 b->time_next_packet = ktime_add_ns(now, tin_dur); 1549 1550 q->time_next_packet = ktime_add_ns(q->time_next_packet, 1551 global_dur); 1552 if (!drop) 1553 q->failsafe_next_packet = \ 1554 ktime_add_ns(q->failsafe_next_packet, 1555 failsafe_dur); 1556 } 1557 return len; 1558 } 1559 1560 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) 1561 { 1562 struct cake_sched_data *q = qdisc_priv(sch); 1563 ktime_t now = ktime_get(); 1564 u32 idx = 0, tin = 0, len; 1565 struct cake_heap_entry qq; 1566 struct cake_tin_data *b; 1567 struct cake_flow *flow; 1568 struct sk_buff *skb; 1569 1570 if (!q->overflow_timeout) { 1571 int i; 1572 /* Build fresh max-heap */ 1573 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--) 1574 cake_heapify(q, i); 1575 } 1576 q->overflow_timeout = 65535; 1577 1578 /* select longest queue for pruning */ 1579 qq = q->overflow_heap[0]; 1580 tin = qq.t; 1581 idx = qq.b; 1582 1583 b = &q->tins[tin]; 1584 flow = &b->flows[idx]; 1585 skb = dequeue_head(flow); 1586 if (unlikely(!skb)) { 1587 /* heap has gone wrong, rebuild it next time */ 1588 q->overflow_timeout = 0; 1589 return idx + (tin << 16); 1590 } 1591 1592 if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) 1593 b->unresponsive_flow_count++; 1594 1595 len = qdisc_pkt_len(skb); 1596 q->buffer_used -= skb->truesize; 1597 b->backlogs[idx] -= len; 1598 b->tin_backlog -= len; 1599 sch->qstats.backlog -= len; 1600 1601 flow->dropped++; 1602 b->tin_dropped++; 1603 1604 if (q->config->rate_flags & CAKE_FLAG_INGRESS) 1605 cake_advance_shaper(q, b, skb, now, true); 1606 1607 qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); 1608 sch->q.qlen--; 1609 1610 cake_heapify(q, 0); 1611 1612 return idx + (tin << 16); 1613 } 1614 1615 static u8 
cake_handle_diffserv(struct sk_buff *skb, bool wash) 1616 { 1617 const int offset = skb_network_offset(skb); 1618 u16 *buf, buf_; 1619 u8 dscp; 1620 1621 switch (skb_protocol(skb, true)) { 1622 case htons(ETH_P_IP): 1623 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1624 if (unlikely(!buf)) 1625 return 0; 1626 1627 /* ToS is in the second byte of iphdr */ 1628 dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2; 1629 1630 if (wash && dscp) { 1631 const int wlen = offset + sizeof(struct iphdr); 1632 1633 if (!pskb_may_pull(skb, wlen) || 1634 skb_try_make_writable(skb, wlen)) 1635 return 0; 1636 1637 ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); 1638 } 1639 1640 return dscp; 1641 1642 case htons(ETH_P_IPV6): 1643 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1644 if (unlikely(!buf)) 1645 return 0; 1646 1647 /* Traffic class is in the first and second bytes of ipv6hdr */ 1648 dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2; 1649 1650 if (wash && dscp) { 1651 const int wlen = offset + sizeof(struct ipv6hdr); 1652 1653 if (!pskb_may_pull(skb, wlen) || 1654 skb_try_make_writable(skb, wlen)) 1655 return 0; 1656 1657 ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); 1658 } 1659 1660 return dscp; 1661 1662 case htons(ETH_P_ARP): 1663 return 0x38; /* CS7 - Net Control */ 1664 1665 default: 1666 /* If there is no Diffserv field, treat as best-effort */ 1667 return 0; 1668 } 1669 } 1670 1671 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, 1672 struct sk_buff *skb) 1673 { 1674 struct cake_sched_data *qd = qdisc_priv(sch); 1675 struct cake_sched_config *q = qd->config; 1676 u32 tin, mark; 1677 bool wash; 1678 u8 dscp; 1679 1680 /* Tin selection: Default to diffserv-based selection, allow overriding 1681 * using firewall marks or skb->priority. Call DSCP parsing early if 1682 * wash is enabled, otherwise defer to below to skip unneeded parsing. 
1683 */ 1684 mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; 1685 wash = !!(q->rate_flags & CAKE_FLAG_WASH); 1686 if (wash) 1687 dscp = cake_handle_diffserv(skb, wash); 1688 1689 if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) 1690 tin = 0; 1691 1692 else if (mark && mark <= qd->tin_cnt) 1693 tin = qd->tin_order[mark - 1]; 1694 1695 else if (TC_H_MAJ(skb->priority) == sch->handle && 1696 TC_H_MIN(skb->priority) > 0 && 1697 TC_H_MIN(skb->priority) <= qd->tin_cnt) 1698 tin = qd->tin_order[TC_H_MIN(skb->priority) - 1]; 1699 1700 else { 1701 if (!wash) 1702 dscp = cake_handle_diffserv(skb, wash); 1703 tin = qd->tin_index[dscp]; 1704 1705 if (unlikely(tin >= qd->tin_cnt)) 1706 tin = 0; 1707 } 1708 1709 return &qd->tins[tin]; 1710 } 1711 1712 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, 1713 struct sk_buff *skb, int flow_mode, int *qerr) 1714 { 1715 struct cake_sched_data *q = qdisc_priv(sch); 1716 struct tcf_proto *filter; 1717 struct tcf_result res; 1718 u16 flow = 0, host = 0; 1719 int result; 1720 1721 filter = rcu_dereference_bh(q->filter_list); 1722 if (!filter) 1723 goto hash; 1724 1725 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 1726 result = tcf_classify(skb, NULL, filter, &res, false); 1727 1728 if (result >= 0) { 1729 #ifdef CONFIG_NET_CLS_ACT 1730 switch (result) { 1731 case TC_ACT_STOLEN: 1732 case TC_ACT_QUEUED: 1733 case TC_ACT_TRAP: 1734 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; 1735 fallthrough; 1736 case TC_ACT_SHOT: 1737 return 0; 1738 } 1739 #endif 1740 if (TC_H_MIN(res.classid) <= CAKE_QUEUES) 1741 flow = TC_H_MIN(res.classid); 1742 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16)) 1743 host = TC_H_MAJ(res.classid) >> 16; 1744 } 1745 hash: 1746 *t = cake_select_tin(sch, skb); 1747 return cake_hash(*t, skb, flow_mode, flow, host) + 1; 1748 } 1749 1750 static void cake_reconfigure(struct Qdisc *sch); 1751 1752 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1753 struct sk_buff **to_free) 1754 { 1755 u32 idx, tin, prev_qlen, prev_backlog, drop_id; 1756 struct cake_sched_data *q = qdisc_priv(sch); 1757 int len = qdisc_pkt_len(skb), ret; 1758 struct sk_buff *ack = NULL; 1759 ktime_t now = ktime_get(); 1760 struct cake_tin_data *b; 1761 struct cake_flow *flow; 1762 bool same_flow = false; 1763 1764 /* choose flow to insert into */ 1765 idx = cake_classify(sch, &b, skb, q->config->flow_mode, &ret); 1766 if (idx == 0) { 1767 if (ret & __NET_XMIT_BYPASS) 1768 qdisc_qstats_drop(sch); 1769 __qdisc_drop(skb, to_free); 1770 return ret; 1771 } 1772 tin = (u32)(b - q->tins); 1773 idx--; 1774 flow = &b->flows[idx]; 1775 1776 /* ensure shaper state isn't stale */ 1777 if (!b->tin_backlog) { 1778 if (ktime_before(b->time_next_packet, now)) 1779 b->time_next_packet = now; 1780 1781 if (!sch->q.qlen) { 1782 if (ktime_before(q->time_next_packet, now)) { 1783 q->failsafe_next_packet = now; 1784 q->time_next_packet = now; 1785 } else if (ktime_after(q->time_next_packet, now) && 1786 ktime_after(q->failsafe_next_packet, now)) { 1787 u64 next = \ 1788 min(ktime_to_ns(q->time_next_packet), 1789 ktime_to_ns( 1790 q->failsafe_next_packet)); 1791 sch->qstats.overlimits++; 1792 qdisc_watchdog_schedule_ns(&q->watchdog, next); 1793 } 1794 } 1795 } 1796 1797 if (unlikely(len > b->max_skblen)) 1798 b->max_skblen = len; 1799 1800 if (qdisc_pkt_segs(skb) > 1 && q->config->rate_flags & CAKE_FLAG_SPLIT_GSO) { 1801 struct sk_buff *segs, *nskb; 1802 netdev_features_t features = netif_skb_features(skb); 1803 unsigned int slen = 0, numsegs = 0; 1804 1805 segs = 
skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 1806 if (IS_ERR_OR_NULL(segs)) 1807 return qdisc_drop(skb, sch, to_free); 1808 1809 skb_list_walk_safe(segs, segs, nskb) { 1810 skb_mark_not_on_list(segs); 1811 qdisc_skb_cb(segs)->pkt_len = segs->len; 1812 qdisc_skb_cb(segs)->pkt_segs = 1; 1813 cobalt_set_enqueue_time(segs, now); 1814 get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, 1815 segs); 1816 flow_queue_add(flow, segs); 1817 1818 sch->q.qlen++; 1819 numsegs++; 1820 slen += segs->len; 1821 q->buffer_used += segs->truesize; 1822 b->packets++; 1823 } 1824 1825 /* stats */ 1826 b->bytes += slen; 1827 b->backlogs[idx] += slen; 1828 b->tin_backlog += slen; 1829 sch->qstats.backlog += slen; 1830 q->avg_window_bytes += slen; 1831 1832 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); 1833 consume_skb(skb); 1834 } else { 1835 /* not splitting */ 1836 int ack_pkt_len = 0; 1837 1838 cobalt_set_enqueue_time(skb, now); 1839 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); 1840 flow_queue_add(flow, skb); 1841 1842 if (q->config->ack_filter) 1843 ack = cake_ack_filter(q, flow); 1844 1845 if (ack) { 1846 b->ack_drops++; 1847 sch->qstats.drops++; 1848 ack_pkt_len = qdisc_pkt_len(ack); 1849 b->bytes += ack_pkt_len; 1850 q->buffer_used += skb->truesize - ack->truesize; 1851 if (q->config->rate_flags & CAKE_FLAG_INGRESS) 1852 cake_advance_shaper(q, b, ack, now, true); 1853 1854 qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len); 1855 consume_skb(ack); 1856 } else { 1857 sch->q.qlen++; 1858 q->buffer_used += skb->truesize; 1859 } 1860 1861 /* stats */ 1862 b->packets++; 1863 b->bytes += len - ack_pkt_len; 1864 b->backlogs[idx] += len - ack_pkt_len; 1865 b->tin_backlog += len - ack_pkt_len; 1866 sch->qstats.backlog += len - ack_pkt_len; 1867 q->avg_window_bytes += len - ack_pkt_len; 1868 } 1869 1870 if (q->overflow_timeout) 1871 cake_heapify_up(q, b->overflow_idx[idx]); 1872 1873 /* incoming bandwidth capacity estimate */ 1874 if (q->config->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { 1875 u64 packet_interval = \ 1876 ktime_to_ns(ktime_sub(now, q->last_packet_time)); 1877 1878 if (packet_interval > NSEC_PER_SEC) 1879 packet_interval = NSEC_PER_SEC; 1880 1881 /* filter out short-term bursts, eg. wifi aggregation */ 1882 q->avg_packet_interval = \ 1883 cake_ewma(q->avg_packet_interval, 1884 packet_interval, 1885 (packet_interval > q->avg_packet_interval ? 1886 2 : 8)); 1887 1888 q->last_packet_time = now; 1889 1890 if (packet_interval > q->avg_packet_interval) { 1891 u64 window_interval = \ 1892 ktime_to_ns(ktime_sub(now, 1893 q->avg_window_begin)); 1894 u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; 1895 1896 b = div64_u64(b, window_interval); 1897 q->avg_peak_bandwidth = 1898 cake_ewma(q->avg_peak_bandwidth, b, 1899 b > q->avg_peak_bandwidth ? 
2 : 8); 1900 q->avg_window_bytes = 0; 1901 q->avg_window_begin = now; 1902 1903 if (ktime_after(now, 1904 ktime_add_ms(q->last_reconfig_time, 1905 250))) { 1906 q->config->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; 1907 cake_reconfigure(sch); 1908 } 1909 } 1910 } else { 1911 q->avg_window_bytes = 0; 1912 q->last_packet_time = now; 1913 } 1914 1915 /* flowchain */ 1916 if (!flow->set || flow->set == CAKE_SET_DECAYING) { 1917 if (!flow->set) { 1918 list_add_tail(&flow->flowchain, &b->new_flows); 1919 } else { 1920 b->decaying_flow_count--; 1921 list_move_tail(&flow->flowchain, &b->new_flows); 1922 } 1923 flow->set = CAKE_SET_SPARSE; 1924 b->sparse_flow_count++; 1925 1926 flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode); 1927 } else if (flow->set == CAKE_SET_SPARSE_WAIT) { 1928 /* this flow was empty, accounted as a sparse flow, but actually 1929 * in the bulk rotation. 1930 */ 1931 flow->set = CAKE_SET_BULK; 1932 b->sparse_flow_count--; 1933 b->bulk_flow_count++; 1934 1935 cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode); 1936 cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); 1937 } 1938 1939 if (q->buffer_used > q->buffer_max_used) 1940 q->buffer_max_used = q->buffer_used; 1941 1942 if (q->buffer_used <= q->buffer_limit) 1943 return NET_XMIT_SUCCESS; 1944 1945 prev_qlen = sch->q.qlen; 1946 prev_backlog = sch->qstats.backlog; 1947 1948 while (q->buffer_used > q->buffer_limit) { 1949 drop_id = cake_drop(sch, to_free); 1950 if ((drop_id >> 16) == tin && 1951 (drop_id & 0xFFFF) == idx) 1952 same_flow = true; 1953 } 1954 1955 prev_qlen -= sch->q.qlen; 1956 prev_backlog -= sch->qstats.backlog; 1957 b->drop_overlimit += prev_qlen; 1958 1959 if (same_flow) { 1960 qdisc_tree_reduce_backlog(sch, prev_qlen - 1, 1961 prev_backlog - len); 1962 return NET_XMIT_CN; 1963 } 1964 qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog); 1965 return NET_XMIT_SUCCESS; 1966 } 1967 1968 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) 1969 { 1970 struct cake_sched_data *q = qdisc_priv(sch); 1971 struct cake_tin_data *b = &q->tins[q->cur_tin]; 1972 struct cake_flow *flow = &b->flows[q->cur_flow]; 1973 struct sk_buff *skb = NULL; 1974 u32 len; 1975 1976 if (flow->head) { 1977 skb = dequeue_head(flow); 1978 len = qdisc_pkt_len(skb); 1979 b->backlogs[q->cur_flow] -= len; 1980 b->tin_backlog -= len; 1981 sch->qstats.backlog -= len; 1982 q->buffer_used -= skb->truesize; 1983 sch->q.qlen--; 1984 1985 if (q->overflow_timeout) 1986 cake_heapify(q, b->overflow_idx[q->cur_flow]); 1987 } 1988 return skb; 1989 } 1990 1991 /* Discard leftover packets from a tin no longer in use. 
*/ 1992 static void cake_clear_tin(struct Qdisc *sch, u16 tin) 1993 { 1994 struct cake_sched_data *q = qdisc_priv(sch); 1995 struct sk_buff *skb; 1996 1997 q->cur_tin = tin; 1998 for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) 1999 while (!!(skb = cake_dequeue_one(sch))) 2000 kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); 2001 } 2002 2003 static struct sk_buff *cake_dequeue(struct Qdisc *sch) 2004 { 2005 struct cake_sched_data *q = qdisc_priv(sch); 2006 struct cake_tin_data *b = &q->tins[q->cur_tin]; 2007 enum skb_drop_reason reason; 2008 ktime_t now = ktime_get(); 2009 struct cake_flow *flow; 2010 struct list_head *head; 2011 bool first_flow = true; 2012 struct sk_buff *skb; 2013 u64 delay; 2014 u32 len; 2015 2016 if (q->config->is_shared && now - q->last_checked_active >= q->config->sync_time) { 2017 struct net_device *dev = qdisc_dev(sch); 2018 struct cake_sched_data *other_priv; 2019 u64 new_rate = q->config->rate_bps; 2020 u64 other_qlen, other_last_active; 2021 struct Qdisc *other_sch; 2022 u32 num_active_qs = 1; 2023 unsigned int ntx; 2024 2025 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { 2026 other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); 2027 other_priv = qdisc_priv(other_sch); 2028 2029 if (other_priv == q) 2030 continue; 2031 2032 other_qlen = READ_ONCE(other_sch->q.qlen); 2033 other_last_active = READ_ONCE(other_priv->last_active); 2034 2035 if (other_qlen || other_last_active > q->last_checked_active) 2036 num_active_qs++; 2037 } 2038 2039 if (num_active_qs > 1) 2040 new_rate = div64_u64(q->config->rate_bps, num_active_qs); 2041 2042 /* mtu = 0 is used to only update the rate and not mess with cobalt params */ 2043 cake_set_rate(b, new_rate, 0, 0, 0); 2044 q->last_checked_active = now; 2045 q->active_queues = num_active_qs; 2046 q->rate_ns = b->tin_rate_ns; 2047 q->rate_shft = b->tin_rate_shft; 2048 } 2049 2050 begin: 2051 if (!sch->q.qlen) 2052 return NULL; 2053 2054 /* global hard shaper */ 2055 if (ktime_after(q->time_next_packet, now) && 2056 ktime_after(q->failsafe_next_packet, now)) { 2057 u64 next = min(ktime_to_ns(q->time_next_packet), 2058 ktime_to_ns(q->failsafe_next_packet)); 2059 2060 sch->qstats.overlimits++; 2061 qdisc_watchdog_schedule_ns(&q->watchdog, next); 2062 return NULL; 2063 } 2064 2065 /* Choose a class to work on. */ 2066 if (!q->rate_ns) { 2067 /* In unlimited mode, can't rely on shaper timings, just balance 2068 * with DRR 2069 */ 2070 bool wrapped = false, empty = true; 2071 2072 while (b->tin_deficit < 0 || 2073 !(b->sparse_flow_count + b->bulk_flow_count)) { 2074 if (b->tin_deficit <= 0) 2075 b->tin_deficit += b->tin_quantum; 2076 if (b->sparse_flow_count + b->bulk_flow_count) 2077 empty = false; 2078 2079 q->cur_tin++; 2080 b++; 2081 if (q->cur_tin >= q->tin_cnt) { 2082 q->cur_tin = 0; 2083 b = q->tins; 2084 2085 if (wrapped) { 2086 /* It's possible for q->qlen to be 2087 * nonzero when we actually have no 2088 * packets anywhere. 2089 */ 2090 if (empty) 2091 return NULL; 2092 } else { 2093 wrapped = true; 2094 } 2095 } 2096 } 2097 } else { 2098 /* In shaped mode, choose: 2099 * - Highest-priority tin with queue and meeting schedule, or 2100 * - The earliest-scheduled tin with queue. 
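 * (time_to_pkt below is the tin's scheduled departure time minus now;
 * a non-positive value means that tin is already eligible to send.)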
2101 */ 2102 ktime_t best_time = KTIME_MAX; 2103 int tin, best_tin = 0; 2104 2105 for (tin = 0; tin < q->tin_cnt; tin++) { 2106 b = q->tins + tin; 2107 if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { 2108 ktime_t time_to_pkt = \ 2109 ktime_sub(b->time_next_packet, now); 2110 2111 if (ktime_to_ns(time_to_pkt) <= 0 || 2112 ktime_compare(time_to_pkt, 2113 best_time) <= 0) { 2114 best_time = time_to_pkt; 2115 best_tin = tin; 2116 } 2117 } 2118 } 2119 2120 q->cur_tin = best_tin; 2121 b = q->tins + best_tin; 2122 2123 /* No point in going further if no packets to deliver. */ 2124 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) 2125 return NULL; 2126 } 2127 2128 retry: 2129 /* service this class */ 2130 head = &b->decaying_flows; 2131 if (!first_flow || list_empty(head)) { 2132 head = &b->new_flows; 2133 if (list_empty(head)) { 2134 head = &b->old_flows; 2135 if (unlikely(list_empty(head))) { 2136 head = &b->decaying_flows; 2137 if (unlikely(list_empty(head))) 2138 goto begin; 2139 } 2140 } 2141 } 2142 flow = list_first_entry(head, struct cake_flow, flowchain); 2143 q->cur_flow = flow - b->flows; 2144 first_flow = false; 2145 2146 /* flow isolation (DRR++) */ 2147 if (flow->deficit <= 0) { 2148 /* Keep all flows with deficits out of the sparse and decaying 2149 * rotations. No non-empty flow can go into the decaying 2150 * rotation, so they can't get deficits 2151 */ 2152 if (flow->set == CAKE_SET_SPARSE) { 2153 if (flow->head) { 2154 b->sparse_flow_count--; 2155 b->bulk_flow_count++; 2156 2157 cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode); 2158 cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); 2159 2160 flow->set = CAKE_SET_BULK; 2161 } else { 2162 /* we've moved it to the bulk rotation for 2163 * correct deficit accounting but we still want 2164 * to count it as a sparse flow, not a bulk one. 
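 * The flow counts are corrected in cake_enqueue() when the flow next
 * receives a packet (see the CAKE_SET_SPARSE_WAIT handling there).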
2165 */ 2166 flow->set = CAKE_SET_SPARSE_WAIT; 2167 } 2168 } 2169 2170 flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode); 2171 list_move_tail(&flow->flowchain, &b->old_flows); 2172 2173 goto retry; 2174 } 2175 2176 /* Retrieve a packet via the AQM */ 2177 while (1) { 2178 skb = cake_dequeue_one(sch); 2179 if (!skb) { 2180 /* this queue was actually empty */ 2181 if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) 2182 b->unresponsive_flow_count--; 2183 2184 if (flow->cvars.p_drop || flow->cvars.count || 2185 ktime_before(now, flow->cvars.drop_next)) { 2186 /* keep in the flowchain until the state has 2187 * decayed to rest 2188 */ 2189 list_move_tail(&flow->flowchain, 2190 &b->decaying_flows); 2191 if (flow->set == CAKE_SET_BULK) { 2192 b->bulk_flow_count--; 2193 2194 cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode); 2195 cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); 2196 2197 b->decaying_flow_count++; 2198 } else if (flow->set == CAKE_SET_SPARSE || 2199 flow->set == CAKE_SET_SPARSE_WAIT) { 2200 b->sparse_flow_count--; 2201 b->decaying_flow_count++; 2202 } 2203 flow->set = CAKE_SET_DECAYING; 2204 } else { 2205 /* remove empty queue from the flowchain */ 2206 list_del_init(&flow->flowchain); 2207 if (flow->set == CAKE_SET_SPARSE || 2208 flow->set == CAKE_SET_SPARSE_WAIT) 2209 b->sparse_flow_count--; 2210 else if (flow->set == CAKE_SET_BULK) { 2211 b->bulk_flow_count--; 2212 2213 cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode); 2214 cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); 2215 } else 2216 b->decaying_flow_count--; 2217 2218 flow->set = CAKE_SET_NONE; 2219 } 2220 goto begin; 2221 } 2222 2223 reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, 2224 (b->bulk_flow_count * 2225 !!(q->config->rate_flags & 2226 CAKE_FLAG_INGRESS))); 2227 /* Last packet in queue may be marked, shouldn't be dropped */ 2228 if (reason == SKB_NOT_DROPPED_YET || !flow->head) 2229 break; 2230 2231 /* drop this packet, get another one */ 2232 if (q->config->rate_flags & CAKE_FLAG_INGRESS) { 2233 len = cake_advance_shaper(q, b, skb, 2234 now, true); 2235 flow->deficit -= len; 2236 b->tin_deficit -= len; 2237 } 2238 flow->dropped++; 2239 b->tin_dropped++; 2240 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); 2241 qdisc_qstats_drop(sch); 2242 qdisc_dequeue_drop(sch, skb, reason); 2243 if (q->config->rate_flags & CAKE_FLAG_INGRESS) 2244 goto retry; 2245 } 2246 2247 b->tin_ecn_mark += !!flow->cvars.ecn_marked; 2248 qdisc_bstats_update(sch, skb); 2249 WRITE_ONCE(q->last_active, now); 2250 2251 /* collect delay stats */ 2252 delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 2253 b->avge_delay = cake_ewma(b->avge_delay, delay, 8); 2254 b->peak_delay = cake_ewma(b->peak_delay, delay, 2255 delay > b->peak_delay ? 2 : 8); 2256 b->base_delay = cake_ewma(b->base_delay, delay, 2257 delay < b->base_delay ? 
2 : 8); 2258 2259 len = cake_advance_shaper(q, b, skb, now, false); 2260 flow->deficit -= len; 2261 b->tin_deficit -= len; 2262 2263 if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { 2264 u64 next = min(ktime_to_ns(q->time_next_packet), 2265 ktime_to_ns(q->failsafe_next_packet)); 2266 2267 qdisc_watchdog_schedule_ns(&q->watchdog, next); 2268 } else if (!sch->q.qlen) { 2269 int i; 2270 2271 for (i = 0; i < q->tin_cnt; i++) { 2272 if (q->tins[i].decaying_flow_count) { 2273 ktime_t next = \ 2274 ktime_add_ns(now, 2275 q->tins[i].cparams.target); 2276 2277 qdisc_watchdog_schedule_ns(&q->watchdog, 2278 ktime_to_ns(next)); 2279 break; 2280 } 2281 } 2282 } 2283 2284 if (q->overflow_timeout) 2285 q->overflow_timeout--; 2286 2287 return skb; 2288 } 2289 2290 static void cake_reset(struct Qdisc *sch) 2291 { 2292 struct cake_sched_data *q = qdisc_priv(sch); 2293 u32 c; 2294 2295 if (!q->tins) 2296 return; 2297 2298 for (c = 0; c < CAKE_MAX_TINS; c++) 2299 cake_clear_tin(sch, c); 2300 } 2301 2302 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { 2303 [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, 2304 [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, 2305 [TCA_CAKE_ATM] = { .type = NLA_U32 }, 2306 [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, 2307 [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, 2308 [TCA_CAKE_RTT] = { .type = NLA_U32 }, 2309 [TCA_CAKE_TARGET] = { .type = NLA_U32 }, 2310 [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, 2311 [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, 2312 [TCA_CAKE_NAT] = { .type = NLA_U32 }, 2313 [TCA_CAKE_RAW] = { .type = NLA_U32 }, 2314 [TCA_CAKE_WASH] = { .type = NLA_U32 }, 2315 [TCA_CAKE_MPU] = { .type = NLA_U32 }, 2316 [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, 2317 [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, 2318 [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, 2319 [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, 2320 }; 2321 2322 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, 2323 u64 target_ns, u64 rtt_est_ns) 2324 { 2325 /* convert byte-rate into time-per-byte 2326 * so it will always unwedge in reasonable time. 2327 */ 2328 static const u64 MIN_RATE = 64; 2329 u32 byte_target = mtu; 2330 u64 byte_target_ns; 2331 u8 rate_shft = 0; 2332 u64 rate_ns = 0; 2333 2334 b->flow_quantum = 1514; 2335 if (rate) { 2336 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); 2337 rate_shft = 34; 2338 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; 2339 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); 2340 while (!!(rate_ns >> 34)) { 2341 rate_ns >>= 1; 2342 rate_shft--; 2343 } 2344 } /* else unlimited, ie. 
zero delay */ 2345 2346 b->tin_rate_bps = rate; 2347 b->tin_rate_ns = rate_ns; 2348 b->tin_rate_shft = rate_shft; 2349 2350 if (mtu == 0) 2351 return; 2352 2353 byte_target_ns = (byte_target * rate_ns) >> rate_shft; 2354 2355 b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); 2356 b->cparams.interval = max(rtt_est_ns + 2357 b->cparams.target - target_ns, 2358 b->cparams.target * 2); 2359 b->cparams.mtu_time = byte_target_ns; 2360 b->cparams.p_inc = 1 << 24; /* 1/256 */ 2361 b->cparams.p_dec = 1 << 20; /* 1/4096 */ 2362 } 2363 2364 static int cake_config_besteffort(struct Qdisc *sch) 2365 { 2366 struct cake_sched_data *q = qdisc_priv(sch); 2367 struct cake_tin_data *b = &q->tins[0]; 2368 u32 mtu = psched_mtu(qdisc_dev(sch)); 2369 u64 rate = q->config->rate_bps; 2370 2371 q->tin_cnt = 1; 2372 2373 q->tin_index = besteffort; 2374 q->tin_order = normal_order; 2375 2376 cake_set_rate(b, rate, mtu, 2377 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2378 b->tin_quantum = 65535; 2379 2380 return 0; 2381 } 2382 2383 static int cake_config_precedence(struct Qdisc *sch) 2384 { 2385 /* convert high-level (user visible) parameters into internal format */ 2386 struct cake_sched_data *q = qdisc_priv(sch); 2387 u32 mtu = psched_mtu(qdisc_dev(sch)); 2388 u64 rate = q->config->rate_bps; 2389 u32 quantum = 256; 2390 u32 i; 2391 2392 q->tin_cnt = 8; 2393 q->tin_index = precedence; 2394 q->tin_order = normal_order; 2395 2396 for (i = 0; i < q->tin_cnt; i++) { 2397 struct cake_tin_data *b = &q->tins[i]; 2398 2399 cake_set_rate(b, rate, mtu, us_to_ns(q->config->target), 2400 us_to_ns(q->config->interval)); 2401 2402 b->tin_quantum = max_t(u16, 1U, quantum); 2403 2404 /* calculate next class's parameters */ 2405 rate *= 7; 2406 rate >>= 3; 2407 2408 quantum *= 7; 2409 quantum >>= 3; 2410 } 2411 2412 return 0; 2413 } 2414 2415 /* List of known Diffserv codepoints: 2416 * 2417 * Default Forwarding (DF/CS0) - Best Effort 2418 * Max Throughput (TOS2) 2419 * Min Delay (TOS4) 2420 * LLT "La" (TOS5) 2421 * Assured Forwarding 1 (AF1x) - x3 2422 * Assured Forwarding 2 (AF2x) - x3 2423 * Assured Forwarding 3 (AF3x) - x3 2424 * Assured Forwarding 4 (AF4x) - x3 2425 * Precedence Class 1 (CS1) 2426 * Precedence Class 2 (CS2) 2427 * Precedence Class 3 (CS3) 2428 * Precedence Class 4 (CS4) 2429 * Precedence Class 5 (CS5) 2430 * Precedence Class 6 (CS6) 2431 * Precedence Class 7 (CS7) 2432 * Voice Admit (VA) 2433 * Expedited Forwarding (EF) 2434 * Lower Effort (LE) 2435 * 2436 * Total 26 codepoints. 2437 */ 2438 2439 /* List of traffic classes in RFC 4594, updated by RFC 8622: 2440 * (roughly descending order of contended priority) 2441 * (roughly ascending order of uncontended throughput) 2442 * 2443 * Network Control (CS6,CS7) - routing traffic 2444 * Telephony (EF,VA) - aka. VoIP streams 2445 * Signalling (CS5) - VoIP setup 2446 * Multimedia Conferencing (AF4x) - aka. video calls 2447 * Realtime Interactive (CS4) - eg. games 2448 * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch 2449 * Broadcast Video (CS3) 2450 * Low-Latency Data (AF2x,TOS4) - eg. database 2451 * Ops, Admin, Management (CS2) - eg. ssh 2452 * Standard Service (DF & unrecognised codepoints) 2453 * High-Throughput Data (AF1x,TOS2) - eg. web traffic 2454 * Low-Priority Data (LE,CS1) - eg. BitTorrent 2455 * 2456 * Total 12 traffic classes. 
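 *
 * The cake_config_diffserv8/4/3() functions below fold these classes
 * into 8, 4 or 3 tins respectively.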
2457 */ 2458 2459 static int cake_config_diffserv8(struct Qdisc *sch) 2460 { 2461 /* Pruned list of traffic classes for typical applications: 2462 * 2463 * Network Control (CS6, CS7) 2464 * Minimum Latency (EF, VA, CS5, CS4) 2465 * Interactive Shell (CS2) 2466 * Low Latency Transactions (AF2x, TOS4) 2467 * Video Streaming (AF4x, AF3x, CS3) 2468 * Bog Standard (DF etc.) 2469 * High Throughput (AF1x, TOS2, CS1) 2470 * Background Traffic (LE) 2471 * 2472 * Total 8 traffic classes. 2473 */ 2474 2475 struct cake_sched_data *q = qdisc_priv(sch); 2476 u32 mtu = psched_mtu(qdisc_dev(sch)); 2477 u64 rate = q->config->rate_bps; 2478 u32 quantum = 256; 2479 u32 i; 2480 2481 q->tin_cnt = 8; 2482 2483 /* codepoint to class mapping */ 2484 q->tin_index = diffserv8; 2485 q->tin_order = normal_order; 2486 2487 /* class characteristics */ 2488 for (i = 0; i < q->tin_cnt; i++) { 2489 struct cake_tin_data *b = &q->tins[i]; 2490 2491 cake_set_rate(b, rate, mtu, us_to_ns(q->config->target), 2492 us_to_ns(q->config->interval)); 2493 2494 b->tin_quantum = max_t(u16, 1U, quantum); 2495 2496 /* calculate next class's parameters */ 2497 rate *= 7; 2498 rate >>= 3; 2499 2500 quantum *= 7; 2501 quantum >>= 3; 2502 } 2503 2504 return 0; 2505 } 2506 2507 static int cake_config_diffserv4(struct Qdisc *sch) 2508 { 2509 /* Further pruned list of traffic classes for four-class system: 2510 * 2511 * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) 2512 * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2) 2513 * Best Effort (DF, AF1x, TOS2, and those not specified) 2514 * Background Traffic (LE, CS1) 2515 * 2516 * Total 4 traffic classes. 2517 */ 2518 2519 struct cake_sched_data *q = qdisc_priv(sch); 2520 u32 mtu = psched_mtu(qdisc_dev(sch)); 2521 u64 rate = q->config->rate_bps; 2522 u32 quantum = 1024; 2523 2524 q->tin_cnt = 4; 2525 2526 /* codepoint to class mapping */ 2527 q->tin_index = diffserv4; 2528 q->tin_order = bulk_order; 2529 2530 /* class characteristics */ 2531 cake_set_rate(&q->tins[0], rate, mtu, 2532 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2533 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2534 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2535 cake_set_rate(&q->tins[2], rate >> 1, mtu, 2536 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2537 cake_set_rate(&q->tins[3], rate >> 2, mtu, 2538 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2539 2540 /* bandwidth-sharing weights */ 2541 q->tins[0].tin_quantum = quantum; 2542 q->tins[1].tin_quantum = quantum >> 4; 2543 q->tins[2].tin_quantum = quantum >> 1; 2544 q->tins[3].tin_quantum = quantum >> 2; 2545 2546 return 0; 2547 } 2548 2549 static int cake_config_diffserv3(struct Qdisc *sch) 2550 { 2551 /* Simplified Diffserv structure with 3 tins. 
2552 * Latency Sensitive (CS7, CS6, EF, VA, TOS4) 2553 * Best Effort 2554 * Low Priority (LE, CS1) 2555 */ 2556 struct cake_sched_data *q = qdisc_priv(sch); 2557 u32 mtu = psched_mtu(qdisc_dev(sch)); 2558 u64 rate = q->config->rate_bps; 2559 u32 quantum = 1024; 2560 2561 q->tin_cnt = 3; 2562 2563 /* codepoint to class mapping */ 2564 q->tin_index = diffserv3; 2565 q->tin_order = bulk_order; 2566 2567 /* class characteristics */ 2568 cake_set_rate(&q->tins[0], rate, mtu, 2569 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2570 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2571 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2572 cake_set_rate(&q->tins[2], rate >> 2, mtu, 2573 us_to_ns(q->config->target), us_to_ns(q->config->interval)); 2574 2575 /* bandwidth-sharing weights */ 2576 q->tins[0].tin_quantum = quantum; 2577 q->tins[1].tin_quantum = quantum >> 4; 2578 q->tins[2].tin_quantum = quantum >> 2; 2579 2580 return 0; 2581 } 2582 2583 static void cake_reconfigure(struct Qdisc *sch) 2584 { 2585 struct cake_sched_data *qd = qdisc_priv(sch); 2586 struct cake_sched_config *q = qd->config; 2587 int c, ft; 2588 2589 switch (q->tin_mode) { 2590 case CAKE_DIFFSERV_BESTEFFORT: 2591 ft = cake_config_besteffort(sch); 2592 break; 2593 2594 case CAKE_DIFFSERV_PRECEDENCE: 2595 ft = cake_config_precedence(sch); 2596 break; 2597 2598 case CAKE_DIFFSERV_DIFFSERV8: 2599 ft = cake_config_diffserv8(sch); 2600 break; 2601 2602 case CAKE_DIFFSERV_DIFFSERV4: 2603 ft = cake_config_diffserv4(sch); 2604 break; 2605 2606 case CAKE_DIFFSERV_DIFFSERV3: 2607 default: 2608 ft = cake_config_diffserv3(sch); 2609 break; 2610 } 2611 2612 for (c = qd->tin_cnt; c < CAKE_MAX_TINS; c++) { 2613 cake_clear_tin(sch, c); 2614 qd->tins[c].cparams.mtu_time = qd->tins[ft].cparams.mtu_time; 2615 } 2616 2617 qd->rate_ns = qd->tins[ft].tin_rate_ns; 2618 qd->rate_shft = qd->tins[ft].tin_rate_shft; 2619 2620 if (q->buffer_config_limit) { 2621 qd->buffer_limit = q->buffer_config_limit; 2622 } else if (q->rate_bps) { 2623 u64 t = q->rate_bps * q->interval; 2624 2625 do_div(t, USEC_PER_SEC / 4); 2626 qd->buffer_limit = max_t(u32, t, 4U << 20); 2627 } else { 2628 qd->buffer_limit = ~0; 2629 } 2630 2631 sch->flags &= ~TCQ_F_CAN_BYPASS; 2632 2633 qd->buffer_limit = min(qd->buffer_limit, 2634 max(sch->limit * psched_mtu(qdisc_dev(sch)), 2635 q->buffer_config_limit)); 2636 } 2637 2638 static int cake_config_change(struct cake_sched_config *q, struct nlattr *opt, 2639 struct netlink_ext_ack *extack, bool *overhead_changed) 2640 { 2641 struct nlattr *tb[TCA_CAKE_MAX + 1]; 2642 u16 rate_flags = q->rate_flags; 2643 u8 flow_mode = q->flow_mode; 2644 int err; 2645 2646 err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, 2647 extack); 2648 if (err < 0) 2649 return err; 2650 2651 if (tb[TCA_CAKE_NAT]) { 2652 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 2653 flow_mode &= ~CAKE_FLOW_NAT_FLAG; 2654 flow_mode |= CAKE_FLOW_NAT_FLAG * 2655 !!nla_get_u32(tb[TCA_CAKE_NAT]); 2656 #else 2657 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], 2658 "No conntrack support in kernel"); 2659 return -EOPNOTSUPP; 2660 #endif 2661 } 2662 2663 if (tb[TCA_CAKE_AUTORATE]) { 2664 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) { 2665 if (q->is_shared) { 2666 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_AUTORATE], 2667 "Can't use autorate-ingress with cake_mq"); 2668 return -EOPNOTSUPP; 2669 } 2670 rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; 2671 } else { 2672 rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; 2673 } 2674 } 2675 2676 if (tb[TCA_CAKE_BASE_RATE64]) 2677 
WRITE_ONCE(q->rate_bps, 2678 nla_get_u64(tb[TCA_CAKE_BASE_RATE64])); 2679 2680 if (tb[TCA_CAKE_DIFFSERV_MODE]) 2681 WRITE_ONCE(q->tin_mode, 2682 nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE])); 2683 2684 if (tb[TCA_CAKE_WASH]) { 2685 if (!!nla_get_u32(tb[TCA_CAKE_WASH])) 2686 rate_flags |= CAKE_FLAG_WASH; 2687 else 2688 rate_flags &= ~CAKE_FLAG_WASH; 2689 } 2690 2691 if (tb[TCA_CAKE_FLOW_MODE]) 2692 flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) | 2693 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & 2694 CAKE_FLOW_MASK)); 2695 2696 if (tb[TCA_CAKE_ATM]) 2697 WRITE_ONCE(q->atm_mode, 2698 nla_get_u32(tb[TCA_CAKE_ATM])); 2699 2700 if (tb[TCA_CAKE_OVERHEAD]) { 2701 WRITE_ONCE(q->rate_overhead, 2702 nla_get_s32(tb[TCA_CAKE_OVERHEAD])); 2703 rate_flags |= CAKE_FLAG_OVERHEAD; 2704 *overhead_changed = true; 2705 } 2706 2707 if (tb[TCA_CAKE_RAW]) { 2708 rate_flags &= ~CAKE_FLAG_OVERHEAD; 2709 *overhead_changed = true; 2710 } 2711 2712 if (tb[TCA_CAKE_MPU]) 2713 WRITE_ONCE(q->rate_mpu, 2714 nla_get_u32(tb[TCA_CAKE_MPU])); 2715 2716 if (tb[TCA_CAKE_RTT]) { 2717 u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]); 2718 2719 WRITE_ONCE(q->interval, max(interval, 1U)); 2720 } 2721 2722 if (tb[TCA_CAKE_TARGET]) { 2723 u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]); 2724 2725 WRITE_ONCE(q->target, max(target, 1U)); 2726 } 2727 2728 if (tb[TCA_CAKE_INGRESS]) { 2729 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) 2730 rate_flags |= CAKE_FLAG_INGRESS; 2731 else 2732 rate_flags &= ~CAKE_FLAG_INGRESS; 2733 } 2734 2735 if (tb[TCA_CAKE_ACK_FILTER]) 2736 WRITE_ONCE(q->ack_filter, 2737 nla_get_u32(tb[TCA_CAKE_ACK_FILTER])); 2738 2739 if (tb[TCA_CAKE_MEMORY]) 2740 WRITE_ONCE(q->buffer_config_limit, 2741 nla_get_u32(tb[TCA_CAKE_MEMORY])); 2742 2743 if (tb[TCA_CAKE_SPLIT_GSO]) { 2744 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) 2745 rate_flags |= CAKE_FLAG_SPLIT_GSO; 2746 else 2747 rate_flags &= ~CAKE_FLAG_SPLIT_GSO; 2748 } 2749 2750 if (tb[TCA_CAKE_FWMARK]) { 2751 WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK])); 2752 WRITE_ONCE(q->fwmark_shft, 2753 q->fwmark_mask ? 
__ffs(q->fwmark_mask) : 0); 2754 } 2755 2756 WRITE_ONCE(q->rate_flags, rate_flags); 2757 WRITE_ONCE(q->flow_mode, flow_mode); 2758 2759 return 0; 2760 } 2761 2762 static int cake_change(struct Qdisc *sch, struct nlattr *opt, 2763 struct netlink_ext_ack *extack) 2764 { 2765 struct cake_sched_data *qd = qdisc_priv(sch); 2766 struct cake_sched_config *q = qd->config; 2767 bool overhead_changed = false; 2768 int ret; 2769 2770 if (q->is_shared) { 2771 NL_SET_ERR_MSG(extack, "can't reconfigure cake_mq sub-qdiscs"); 2772 return -EOPNOTSUPP; 2773 } 2774 2775 ret = cake_config_change(q, opt, extack, &overhead_changed); 2776 if (ret) 2777 return ret; 2778 2779 if (overhead_changed) { 2780 qd->max_netlen = 0; 2781 qd->max_adjlen = 0; 2782 qd->min_netlen = ~0; 2783 qd->min_adjlen = ~0; 2784 } 2785 2786 if (qd->tins) { 2787 sch_tree_lock(sch); 2788 cake_reconfigure(sch); 2789 sch_tree_unlock(sch); 2790 } 2791 2792 return 0; 2793 } 2794 2795 static void cake_destroy(struct Qdisc *sch) 2796 { 2797 struct cake_sched_data *q = qdisc_priv(sch); 2798 2799 qdisc_watchdog_cancel(&q->watchdog); 2800 tcf_block_put(q->block); 2801 kvfree(q->tins); 2802 } 2803 2804 static void cake_config_init(struct cake_sched_config *q, bool is_shared) 2805 { 2806 q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; 2807 q->flow_mode = CAKE_FLOW_TRIPLE; 2808 2809 q->rate_bps = 0; /* unlimited by default */ 2810 2811 q->interval = 100000; /* 100ms default */ 2812 q->target = 5000; /* 5ms: codel RFC argues 2813 * for 5 to 10% of interval 2814 */ 2815 q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2816 q->is_shared = is_shared; 2817 q->sync_time = 200 * NSEC_PER_USEC; 2818 } 2819 2820 static int cake_init(struct Qdisc *sch, struct nlattr *opt, 2821 struct netlink_ext_ack *extack) 2822 { 2823 struct cake_sched_data *qd = qdisc_priv(sch); 2824 struct cake_sched_config *q = &qd->initial_config; 2825 int i, j, err; 2826 2827 cake_config_init(q, false); 2828 2829 sch->limit = 10240; 2830 sch->flags |= TCQ_F_DEQUEUE_DROPS; 2831 2832 qd->cur_tin = 0; 2833 qd->cur_flow = 0; 2834 qd->config = q; 2835 2836 qdisc_watchdog_init(&qd->watchdog, sch); 2837 2838 if (opt) { 2839 err = cake_change(sch, opt, extack); 2840 if (err) 2841 return err; 2842 } 2843 2844 err = tcf_block_get(&qd->block, &qd->filter_list, sch, extack); 2845 if (err) 2846 return err; 2847 2848 quantum_div[0] = ~0; 2849 for (i = 1; i <= CAKE_QUEUES; i++) 2850 quantum_div[i] = 65535 / i; 2851 2852 qd->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), 2853 GFP_KERNEL); 2854 if (!qd->tins) 2855 return -ENOMEM; 2856 2857 for (i = 0; i < CAKE_MAX_TINS; i++) { 2858 struct cake_tin_data *b = qd->tins + i; 2859 2860 INIT_LIST_HEAD(&b->new_flows); 2861 INIT_LIST_HEAD(&b->old_flows); 2862 INIT_LIST_HEAD(&b->decaying_flows); 2863 b->sparse_flow_count = 0; 2864 b->bulk_flow_count = 0; 2865 b->decaying_flow_count = 0; 2866 2867 for (j = 0; j < CAKE_QUEUES; j++) { 2868 struct cake_flow *flow = b->flows + j; 2869 u32 k = j * CAKE_MAX_TINS + i; 2870 2871 INIT_LIST_HEAD(&flow->flowchain); 2872 cobalt_vars_init(&flow->cvars); 2873 2874 qd->overflow_heap[k].t = i; 2875 qd->overflow_heap[k].b = j; 2876 b->overflow_idx[j] = k; 2877 } 2878 } 2879 2880 cake_reconfigure(sch); 2881 qd->avg_peak_bandwidth = q->rate_bps; 2882 qd->min_netlen = ~0; 2883 qd->min_adjlen = ~0; 2884 qd->active_queues = 0; 2885 qd->last_checked_active = 0; 2886 2887 return 0; 2888 } 2889 2890 static void cake_config_replace(struct Qdisc *sch, struct cake_sched_config *cfg) 2891 { 2892 struct cake_sched_data *qd = qdisc_priv(sch); 2893 
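	/* Point this sub-qdisc at the cake_mq shared configuration and
	 * re-derive its per-tin parameters from it.
	 */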
2894 qd->config = cfg; 2895 cake_reconfigure(sch); 2896 } 2897 2898 static int cake_config_dump(struct cake_sched_config *q, struct sk_buff *skb) 2899 { 2900 struct nlattr *opts; 2901 u16 rate_flags; 2902 u8 flow_mode; 2903 2904 opts = nla_nest_start_noflag(skb, TCA_OPTIONS); 2905 if (!opts) 2906 goto nla_put_failure; 2907 2908 if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, 2909 READ_ONCE(q->rate_bps), TCA_CAKE_PAD)) 2910 goto nla_put_failure; 2911 2912 flow_mode = READ_ONCE(q->flow_mode); 2913 if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK)) 2914 goto nla_put_failure; 2915 2916 if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval))) 2917 goto nla_put_failure; 2918 2919 if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target))) 2920 goto nla_put_failure; 2921 2922 if (nla_put_u32(skb, TCA_CAKE_MEMORY, 2923 READ_ONCE(q->buffer_config_limit))) 2924 goto nla_put_failure; 2925 2926 rate_flags = READ_ONCE(q->rate_flags); 2927 if (nla_put_u32(skb, TCA_CAKE_AUTORATE, 2928 !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) 2929 goto nla_put_failure; 2930 2931 if (nla_put_u32(skb, TCA_CAKE_INGRESS, 2932 !!(rate_flags & CAKE_FLAG_INGRESS))) 2933 goto nla_put_failure; 2934 2935 if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter))) 2936 goto nla_put_failure; 2937 2938 if (nla_put_u32(skb, TCA_CAKE_NAT, 2939 !!(flow_mode & CAKE_FLOW_NAT_FLAG))) 2940 goto nla_put_failure; 2941 2942 if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode))) 2943 goto nla_put_failure; 2944 2945 if (nla_put_u32(skb, TCA_CAKE_WASH, 2946 !!(rate_flags & CAKE_FLAG_WASH))) 2947 goto nla_put_failure; 2948 2949 if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead))) 2950 goto nla_put_failure; 2951 2952 if (!(rate_flags & CAKE_FLAG_OVERHEAD)) 2953 if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) 2954 goto nla_put_failure; 2955 2956 if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode))) 2957 goto nla_put_failure; 2958 2959 if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu))) 2960 goto nla_put_failure; 2961 2962 if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, 2963 !!(rate_flags & CAKE_FLAG_SPLIT_GSO))) 2964 goto nla_put_failure; 2965 2966 if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask))) 2967 goto nla_put_failure; 2968 2969 return nla_nest_end(skb, opts); 2970 2971 nla_put_failure: 2972 return -1; 2973 } 2974 2975 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) 2976 { 2977 struct cake_sched_data *qd = qdisc_priv(sch); 2978 2979 return cake_config_dump(qd->config, skb); 2980 } 2981 2982 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 2983 { 2984 struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 2985 struct cake_sched_data *q = qdisc_priv(sch); 2986 struct nlattr *tstats, *ts; 2987 int i; 2988 2989 if (!stats) 2990 return -1; 2991 2992 #define PUT_STAT_U32(attr, data) do { \ 2993 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 2994 goto nla_put_failure; \ 2995 } while (0) 2996 #define PUT_STAT_U64(attr, data) do { \ 2997 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ 2998 data, TCA_CAKE_STATS_PAD)) \ 2999 goto nla_put_failure; \ 3000 } while (0) 3001 3002 PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); 3003 PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); 3004 PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); 3005 PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); 3006 PUT_STAT_U32(MAX_NETLEN, q->max_netlen); 3007 PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); 3008 
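	/* like the max_* counters above, the min_* counters (initialised to
	 * ~0) are reset whenever the overhead configuration changes
	 */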
PUT_STAT_U32(MIN_NETLEN, q->min_netlen); 3009 PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); 3010 PUT_STAT_U32(ACTIVE_QUEUES, q->active_queues); 3011 3012 #undef PUT_STAT_U32 3013 #undef PUT_STAT_U64 3014 3015 tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS); 3016 if (!tstats) 3017 goto nla_put_failure; 3018 3019 #define PUT_TSTAT_U32(attr, data) do { \ 3020 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ 3021 goto nla_put_failure; \ 3022 } while (0) 3023 #define PUT_TSTAT_U64(attr, data) do { \ 3024 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ 3025 data, TCA_CAKE_TIN_STATS_PAD)) \ 3026 goto nla_put_failure; \ 3027 } while (0) 3028 3029 for (i = 0; i < q->tin_cnt; i++) { 3030 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3031 3032 ts = nla_nest_start_noflag(d->skb, i + 1); 3033 if (!ts) 3034 goto nla_put_failure; 3035 3036 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); 3037 PUT_TSTAT_U64(SENT_BYTES64, b->bytes); 3038 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); 3039 3040 PUT_TSTAT_U32(TARGET_US, 3041 ktime_to_us(ns_to_ktime(b->cparams.target))); 3042 PUT_TSTAT_U32(INTERVAL_US, 3043 ktime_to_us(ns_to_ktime(b->cparams.interval))); 3044 3045 PUT_TSTAT_U32(SENT_PACKETS, b->packets); 3046 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); 3047 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); 3048 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); 3049 3050 PUT_TSTAT_U32(PEAK_DELAY_US, 3051 ktime_to_us(ns_to_ktime(b->peak_delay))); 3052 PUT_TSTAT_U32(AVG_DELAY_US, 3053 ktime_to_us(ns_to_ktime(b->avge_delay))); 3054 PUT_TSTAT_U32(BASE_DELAY_US, 3055 ktime_to_us(ns_to_ktime(b->base_delay))); 3056 3057 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); 3058 PUT_TSTAT_U32(WAY_MISSES, b->way_misses); 3059 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); 3060 3061 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + 3062 b->decaying_flow_count); 3063 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); 3064 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); 3065 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); 3066 3067 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); 3068 nla_nest_end(d->skb, ts); 3069 } 3070 3071 #undef PUT_TSTAT_U32 3072 #undef PUT_TSTAT_U64 3073 3074 nla_nest_end(d->skb, tstats); 3075 return nla_nest_end(d->skb, stats); 3076 3077 nla_put_failure: 3078 nla_nest_cancel(d->skb, stats); 3079 return -1; 3080 } 3081 3082 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) 3083 { 3084 return NULL; 3085 } 3086 3087 static unsigned long cake_find(struct Qdisc *sch, u32 classid) 3088 { 3089 return 0; 3090 } 3091 3092 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, 3093 u32 classid) 3094 { 3095 return 0; 3096 } 3097 3098 static void cake_unbind(struct Qdisc *q, unsigned long cl) 3099 { 3100 } 3101 3102 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, 3103 struct netlink_ext_ack *extack) 3104 { 3105 struct cake_sched_data *q = qdisc_priv(sch); 3106 3107 if (cl) 3108 return NULL; 3109 return q->block; 3110 } 3111 3112 static int cake_dump_class(struct Qdisc *sch, unsigned long cl, 3113 struct sk_buff *skb, struct tcmsg *tcm) 3114 { 3115 tcm->tcm_handle |= TC_H_MIN(cl); 3116 return 0; 3117 } 3118 3119 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, 3120 struct gnet_dump *d) 3121 { 3122 struct cake_sched_data *q = qdisc_priv(sch); 3123 const struct cake_flow *flow = NULL; 3124 struct gnet_stats_queue qs = { 0 }; 3125 struct nlattr *stats; 3126 u32 idx = cl - 1; 
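	/* class IDs handed out by cake_walk() are 1-based; idx encodes the
	 * tin's position in tin_order (idx / CAKE_QUEUES) and the flow index
	 * within that tin (idx % CAKE_QUEUES)
	 */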
3127 3128 if (idx < CAKE_QUEUES * q->tin_cnt) { 3129 const struct cake_tin_data *b = \ 3130 &q->tins[q->tin_order[idx / CAKE_QUEUES]]; 3131 const struct sk_buff *skb; 3132 3133 flow = &b->flows[idx % CAKE_QUEUES]; 3134 3135 if (flow->head) { 3136 sch_tree_lock(sch); 3137 skb = flow->head; 3138 while (skb) { 3139 qs.qlen++; 3140 skb = skb->next; 3141 } 3142 sch_tree_unlock(sch); 3143 } 3144 qs.backlog = b->backlogs[idx % CAKE_QUEUES]; 3145 qs.drops = flow->dropped; 3146 } 3147 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) 3148 return -1; 3149 if (flow) { 3150 ktime_t now = ktime_get(); 3151 3152 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 3153 if (!stats) 3154 return -1; 3155 3156 #define PUT_STAT_U32(attr, data) do { \ 3157 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3158 goto nla_put_failure; \ 3159 } while (0) 3160 #define PUT_STAT_S32(attr, data) do { \ 3161 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3162 goto nla_put_failure; \ 3163 } while (0) 3164 3165 PUT_STAT_S32(DEFICIT, flow->deficit); 3166 PUT_STAT_U32(DROPPING, flow->cvars.dropping); 3167 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); 3168 PUT_STAT_U32(P_DROP, flow->cvars.p_drop); 3169 if (flow->cvars.p_drop) { 3170 PUT_STAT_S32(BLUE_TIMER_US, 3171 ktime_to_us( 3172 ktime_sub(now, 3173 flow->cvars.blue_timer))); 3174 } 3175 if (flow->cvars.dropping) { 3176 PUT_STAT_S32(DROP_NEXT_US, 3177 ktime_to_us( 3178 ktime_sub(now, 3179 flow->cvars.drop_next))); 3180 } 3181 3182 if (nla_nest_end(d->skb, stats) < 0) 3183 return -1; 3184 } 3185 3186 return 0; 3187 3188 nla_put_failure: 3189 nla_nest_cancel(d->skb, stats); 3190 return -1; 3191 } 3192 3193 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) 3194 { 3195 struct cake_sched_data *q = qdisc_priv(sch); 3196 unsigned int i, j; 3197 3198 if (arg->stop) 3199 return; 3200 3201 for (i = 0; i < q->tin_cnt; i++) { 3202 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3203 3204 for (j = 0; j < CAKE_QUEUES; j++) { 3205 if (list_empty(&b->flows[j].flowchain)) { 3206 arg->count++; 3207 continue; 3208 } 3209 if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1, 3210 arg)) 3211 break; 3212 } 3213 } 3214 } 3215 3216 static const struct Qdisc_class_ops cake_class_ops = { 3217 .leaf = cake_leaf, 3218 .find = cake_find, 3219 .tcf_block = cake_tcf_block, 3220 .bind_tcf = cake_bind, 3221 .unbind_tcf = cake_unbind, 3222 .dump = cake_dump_class, 3223 .dump_stats = cake_dump_class_stats, 3224 .walk = cake_walk, 3225 }; 3226 3227 static struct Qdisc_ops cake_qdisc_ops __read_mostly = { 3228 .cl_ops = &cake_class_ops, 3229 .id = "cake", 3230 .priv_size = sizeof(struct cake_sched_data), 3231 .enqueue = cake_enqueue, 3232 .dequeue = cake_dequeue, 3233 .peek = qdisc_peek_dequeued, 3234 .init = cake_init, 3235 .reset = cake_reset, 3236 .destroy = cake_destroy, 3237 .change = cake_change, 3238 .dump = cake_dump, 3239 .dump_stats = cake_dump_stats, 3240 .owner = THIS_MODULE, 3241 }; 3242 MODULE_ALIAS_NET_SCH("cake"); 3243 3244 struct cake_mq_sched { 3245 struct mq_sched mq_priv; /* must be first */ 3246 struct cake_sched_config cake_config; 3247 }; 3248 3249 static void cake_mq_destroy(struct Qdisc *sch) 3250 { 3251 mq_destroy_common(sch); 3252 } 3253 3254 static int cake_mq_init(struct Qdisc *sch, struct nlattr *opt, 3255 struct netlink_ext_ack *extack) 3256 { 3257 struct cake_mq_sched *priv = qdisc_priv(sch); 3258 struct net_device *dev = qdisc_dev(sch); 3259 int ret, ntx; 3260 bool _unused; 3261 3262 cake_config_init(&priv->cake_config, true); 
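	/* Apply any options to the shared config before mq_init_common()
	 * creates the per-queue child qdiscs below, so they start out with
	 * the requested parameters once cake_config_replace() points them
	 * at this config.
	 */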
3263 if (opt) { 3264 ret = cake_config_change(&priv->cake_config, opt, extack, &_unused); 3265 if (ret) 3266 return ret; 3267 } 3268 3269 ret = mq_init_common(sch, opt, extack, &cake_qdisc_ops); 3270 if (ret) 3271 return ret; 3272 3273 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) 3274 cake_config_replace(priv->mq_priv.qdiscs[ntx], &priv->cake_config); 3275 3276 return 0; 3277 } 3278 3279 static int cake_mq_dump(struct Qdisc *sch, struct sk_buff *skb) 3280 { 3281 struct cake_mq_sched *priv = qdisc_priv(sch); 3282 3283 mq_dump_common(sch, skb); 3284 return cake_config_dump(&priv->cake_config, skb); 3285 } 3286 3287 static int cake_mq_change(struct Qdisc *sch, struct nlattr *opt, 3288 struct netlink_ext_ack *extack) 3289 { 3290 struct cake_mq_sched *priv = qdisc_priv(sch); 3291 struct net_device *dev = qdisc_dev(sch); 3292 bool overhead_changed = false; 3293 unsigned int ntx; 3294 int ret; 3295 3296 ret = cake_config_change(&priv->cake_config, opt, extack, &overhead_changed); 3297 if (ret) 3298 return ret; 3299 3300 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { 3301 struct Qdisc *chld = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); 3302 struct cake_sched_data *qd = qdisc_priv(chld); 3303 3304 if (overhead_changed) { 3305 qd->max_netlen = 0; 3306 qd->max_adjlen = 0; 3307 qd->min_netlen = ~0; 3308 qd->min_adjlen = ~0; 3309 } 3310 3311 if (qd->tins) { 3312 sch_tree_lock(chld); 3313 cake_reconfigure(chld); 3314 sch_tree_unlock(chld); 3315 } 3316 } 3317 3318 return 0; 3319 } 3320 3321 static int cake_mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, 3322 struct Qdisc **old, struct netlink_ext_ack *extack) 3323 { 3324 NL_SET_ERR_MSG(extack, "can't replace cake_mq sub-qdiscs"); 3325 return -EOPNOTSUPP; 3326 } 3327 3328 static const struct Qdisc_class_ops cake_mq_class_ops = { 3329 .select_queue = mq_select_queue, 3330 .graft = cake_mq_graft, 3331 .leaf = mq_leaf, 3332 .find = mq_find, 3333 .walk = mq_walk, 3334 .dump = mq_dump_class, 3335 .dump_stats = mq_dump_class_stats, 3336 }; 3337 3338 static struct Qdisc_ops cake_mq_qdisc_ops __read_mostly = { 3339 .cl_ops = &cake_mq_class_ops, 3340 .id = "cake_mq", 3341 .priv_size = sizeof(struct cake_mq_sched), 3342 .init = cake_mq_init, 3343 .destroy = cake_mq_destroy, 3344 .attach = mq_attach, 3345 .change = cake_mq_change, 3346 .change_real_num_tx = mq_change_real_num_tx, 3347 .dump = cake_mq_dump, 3348 .owner = THIS_MODULE, 3349 }; 3350 MODULE_ALIAS_NET_SCH("cake_mq"); 3351 3352 static int __init cake_module_init(void) 3353 { 3354 int ret; 3355 3356 ret = register_qdisc(&cake_qdisc_ops); 3357 if (ret) 3358 return ret; 3359 3360 ret = register_qdisc(&cake_mq_qdisc_ops); 3361 if (ret) 3362 unregister_qdisc(&cake_qdisc_ops); 3363 3364 return ret; 3365 } 3366 3367 static void __exit cake_module_exit(void) 3368 { 3369 unregister_qdisc(&cake_qdisc_ops); 3370 unregister_qdisc(&cake_mq_qdisc_ops); 3371 } 3372 3373 module_init(cake_module_init) 3374 module_exit(cake_module_exit) 3375 MODULE_AUTHOR("Jonathan Morton"); 3376 MODULE_LICENSE("Dual BSD/GPL"); 3377 MODULE_DESCRIPTION("The CAKE shaper."); 3378 MODULE_IMPORT_NS("NET_SCHED_INTERNAL"); 3379
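
/* Example configuration (illustrative only; cake keywords as documented in
 * tc-cake(8), cake_mq support in the tc utility is assumed):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 100Mbit diffserv3 nat
 *   tc qdisc replace dev eth0 root cake_mq bandwidth 1Gbit besteffort
 */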