1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 3 /* COMMON Applications Kept Enhanced (CAKE) discipline 4 * 5 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com> 6 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk> 7 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com> 8 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de> 9 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk> 10 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au> 11 * 12 * The CAKE Principles: 13 * (or, how to have your cake and eat it too) 14 * 15 * This is a combination of several shaping, AQM and FQ techniques into one 16 * easy-to-use package: 17 * 18 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE 19 * equipment and bloated MACs. This operates in deficit mode (as in sch_fq), 20 * eliminating the need for any sort of burst parameter (eg. token bucket 21 * depth). Burst support is limited to that necessary to overcome scheduling 22 * latency. 23 * 24 * - A Diffserv-aware priority queue, giving more priority to certain classes, 25 * up to a specified fraction of bandwidth. Above that bandwidth threshold, 26 * the priority is reduced to avoid starving other tins. 27 * 28 * - Each priority tin has a separate Flow Queue system, to isolate traffic 29 * flows from each other. This prevents a burst on one flow from increasing 30 * the delay to another. Flows are distributed to queues using a 31 * set-associative hash function. 32 * 33 * - Each queue is actively managed by Cobalt, which is a combination of the 34 * Codel and Blue AQM algorithms. This serves flows fairly, and signals 35 * congestion early via ECN (if available) and/or packet drops, to keep 36 * latency low. The codel parameters are auto-tuned based on the bandwidth 37 * setting, as is necessary at low bandwidths. 38 * 39 * The configuration parameters are kept deliberately simple for ease of use. 40 * Everything has sane defaults. Complete generality of configuration is *not* 41 * a goal. 42 * 43 * The priority queue operates according to a weighted DRR scheme, combined with 44 * a bandwidth tracker which reuses the shaper logic to detect which side of the 45 * bandwidth sharing threshold the tin is operating. This determines whether a 46 * priority-based weight (high) or a bandwidth-based weight (low) is used for 47 * that tin in the current pass. 48 * 49 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly 50 * granted us permission to leverage. 
51 */ 52 53 #include <linux/module.h> 54 #include <linux/types.h> 55 #include <linux/kernel.h> 56 #include <linux/jiffies.h> 57 #include <linux/string.h> 58 #include <linux/in.h> 59 #include <linux/errno.h> 60 #include <linux/init.h> 61 #include <linux/skbuff.h> 62 #include <linux/jhash.h> 63 #include <linux/slab.h> 64 #include <linux/vmalloc.h> 65 #include <linux/reciprocal_div.h> 66 #include <net/netlink.h> 67 #include <linux/if_vlan.h> 68 #include <net/gso.h> 69 #include <net/pkt_sched.h> 70 #include <net/pkt_cls.h> 71 #include <net/tcp.h> 72 #include <net/flow_dissector.h> 73 74 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 75 #include <net/netfilter/nf_conntrack_core.h> 76 #endif 77 78 #define CAKE_SET_WAYS (8) 79 #define CAKE_MAX_TINS (8) 80 #define CAKE_QUEUES (1024) 81 #define CAKE_FLOW_MASK 63 82 #define CAKE_FLOW_NAT_FLAG 64 83 84 /* struct cobalt_params - contains codel and blue parameters 85 * @interval: codel initial drop rate 86 * @target: maximum persistent sojourn time & blue update rate 87 * @mtu_time: serialisation delay of maximum-size packet 88 * @p_inc: increment of blue drop probability (0.32 fxp) 89 * @p_dec: decrement of blue drop probability (0.32 fxp) 90 */ 91 struct cobalt_params { 92 u64 interval; 93 u64 target; 94 u64 mtu_time; 95 u32 p_inc; 96 u32 p_dec; 97 }; 98 99 /* struct cobalt_vars - contains codel and blue variables 100 * @count: codel dropping frequency 101 * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 102 * @drop_next: time to drop next packet, or when we dropped last 103 * @blue_timer: Blue time to next drop 104 * @p_drop: BLUE drop probability (0.32 fxp) 105 * @dropping: set if in dropping state 106 * @ecn_marked: set if marked 107 */ 108 struct cobalt_vars { 109 u32 count; 110 u32 rec_inv_sqrt; 111 ktime_t drop_next; 112 ktime_t blue_timer; 113 u32 p_drop; 114 bool dropping; 115 bool ecn_marked; 116 }; 117 118 enum { 119 CAKE_SET_NONE = 0, 120 CAKE_SET_SPARSE, 121 CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */ 122 CAKE_SET_BULK, 123 CAKE_SET_DECAYING 124 }; 125 126 struct cake_flow { 127 /* this stuff is all needed per-flow at dequeue time */ 128 struct sk_buff *head; 129 struct sk_buff *tail; 130 struct list_head flowchain; 131 s32 deficit; 132 u32 dropped; 133 struct cobalt_vars cvars; 134 u16 srchost; /* index into cake_host table */ 135 u16 dsthost; 136 u8 set; 137 }; /* please try to keep this structure <= 64 bytes */ 138 139 struct cake_host { 140 u32 srchost_tag; 141 u32 dsthost_tag; 142 u16 srchost_bulk_flow_count; 143 u16 dsthost_bulk_flow_count; 144 }; 145 146 struct cake_heap_entry { 147 u16 t:3, b:10; 148 }; 149 150 struct cake_tin_data { 151 struct cake_flow flows[CAKE_QUEUES]; 152 u32 backlogs[CAKE_QUEUES]; 153 u32 tags[CAKE_QUEUES]; /* for set association */ 154 u16 overflow_idx[CAKE_QUEUES]; 155 struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */ 156 u16 flow_quantum; 157 158 struct cobalt_params cparams; 159 u32 drop_overlimit; 160 u16 bulk_flow_count; 161 u16 sparse_flow_count; 162 u16 decaying_flow_count; 163 u16 unresponsive_flow_count; 164 165 u32 max_skblen; 166 167 struct list_head new_flows; 168 struct list_head old_flows; 169 struct list_head decaying_flows; 170 171 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 172 ktime_t time_next_packet; 173 u64 tin_rate_ns; 174 u64 tin_rate_bps; 175 u16 tin_rate_shft; 176 177 u16 tin_quantum; 178 s32 tin_deficit; 179 u32 tin_backlog; 180 u32 tin_dropped; 181 u32 tin_ecn_mark; 182 183 u32 packets; 184 u64 bytes; 185 186 u32 ack_drops; 187 188 /* 
moving averages */ 189 u64 avge_delay; 190 u64 peak_delay; 191 u64 base_delay; 192 193 /* hash function stats */ 194 u32 way_directs; 195 u32 way_hits; 196 u32 way_misses; 197 u32 way_collisions; 198 }; /* number of tins is small, so size of this struct doesn't matter much */ 199 200 struct cake_sched_data { 201 struct tcf_proto __rcu *filter_list; /* optional external classifier */ 202 struct tcf_block *block; 203 struct cake_tin_data *tins; 204 205 struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; 206 u16 overflow_timeout; 207 208 u16 tin_cnt; 209 u8 tin_mode; 210 u8 flow_mode; 211 u8 ack_filter; 212 u8 atm_mode; 213 214 u32 fwmark_mask; 215 u16 fwmark_shft; 216 217 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ 218 u16 rate_shft; 219 ktime_t time_next_packet; 220 ktime_t failsafe_next_packet; 221 u64 rate_ns; 222 u64 rate_bps; 223 u16 rate_flags; 224 s16 rate_overhead; 225 u16 rate_mpu; 226 u64 interval; 227 u64 target; 228 229 /* resource tracking */ 230 u32 buffer_used; 231 u32 buffer_max_used; 232 u32 buffer_limit; 233 u32 buffer_config_limit; 234 235 /* indices for dequeue */ 236 u16 cur_tin; 237 u16 cur_flow; 238 239 struct qdisc_watchdog watchdog; 240 const u8 *tin_index; 241 const u8 *tin_order; 242 243 /* bandwidth capacity estimate */ 244 ktime_t last_packet_time; 245 ktime_t avg_window_begin; 246 u64 avg_packet_interval; 247 u64 avg_window_bytes; 248 u64 avg_peak_bandwidth; 249 ktime_t last_reconfig_time; 250 251 /* packet length stats */ 252 u32 avg_netoff; 253 u16 max_netlen; 254 u16 max_adjlen; 255 u16 min_netlen; 256 u16 min_adjlen; 257 }; 258 259 enum { 260 CAKE_FLAG_OVERHEAD = BIT(0), 261 CAKE_FLAG_AUTORATE_INGRESS = BIT(1), 262 CAKE_FLAG_INGRESS = BIT(2), 263 CAKE_FLAG_WASH = BIT(3), 264 CAKE_FLAG_SPLIT_GSO = BIT(4) 265 }; 266 267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to 268 * obtain the best features of each. Codel is excellent on flows which 269 * respond to congestion signals in a TCP-like way. BLUE is more effective on 270 * unresponsive flows. 
271 */ 272 273 struct cobalt_skb_cb { 274 ktime_t enqueue_time; 275 u32 adjusted_len; 276 }; 277 278 static u64 us_to_ns(u64 us) 279 { 280 return us * NSEC_PER_USEC; 281 } 282 283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb) 284 { 285 qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb)); 286 return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data; 287 } 288 289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb) 290 { 291 return get_cobalt_cb(skb)->enqueue_time; 292 } 293 294 static void cobalt_set_enqueue_time(struct sk_buff *skb, 295 ktime_t now) 296 { 297 get_cobalt_cb(skb)->enqueue_time = now; 298 } 299 300 static u16 quantum_div[CAKE_QUEUES + 1] = {0}; 301 302 /* Diffserv lookup tables */ 303 304 static const u8 precedence[] = { 305 0, 0, 0, 0, 0, 0, 0, 0, 306 1, 1, 1, 1, 1, 1, 1, 1, 307 2, 2, 2, 2, 2, 2, 2, 2, 308 3, 3, 3, 3, 3, 3, 3, 3, 309 4, 4, 4, 4, 4, 4, 4, 4, 310 5, 5, 5, 5, 5, 5, 5, 5, 311 6, 6, 6, 6, 6, 6, 6, 6, 312 7, 7, 7, 7, 7, 7, 7, 7, 313 }; 314 315 static const u8 diffserv8[] = { 316 2, 0, 1, 2, 4, 2, 2, 2, 317 1, 2, 1, 2, 1, 2, 1, 2, 318 5, 2, 4, 2, 4, 2, 4, 2, 319 3, 2, 3, 2, 3, 2, 3, 2, 320 6, 2, 3, 2, 3, 2, 3, 2, 321 6, 2, 2, 2, 6, 2, 6, 2, 322 7, 2, 2, 2, 2, 2, 2, 2, 323 7, 2, 2, 2, 2, 2, 2, 2, 324 }; 325 326 static const u8 diffserv4[] = { 327 0, 1, 0, 0, 2, 0, 0, 0, 328 1, 0, 0, 0, 0, 0, 0, 0, 329 2, 0, 2, 0, 2, 0, 2, 0, 330 2, 0, 2, 0, 2, 0, 2, 0, 331 3, 0, 2, 0, 2, 0, 2, 0, 332 3, 0, 0, 0, 3, 0, 3, 0, 333 3, 0, 0, 0, 0, 0, 0, 0, 334 3, 0, 0, 0, 0, 0, 0, 0, 335 }; 336 337 static const u8 diffserv3[] = { 338 0, 1, 0, 0, 2, 0, 0, 0, 339 1, 0, 0, 0, 0, 0, 0, 0, 340 0, 0, 0, 0, 0, 0, 0, 0, 341 0, 0, 0, 0, 0, 0, 0, 0, 342 0, 0, 0, 0, 0, 0, 0, 0, 343 0, 0, 0, 0, 2, 0, 2, 0, 344 2, 0, 0, 0, 0, 0, 0, 0, 345 2, 0, 0, 0, 0, 0, 0, 0, 346 }; 347 348 static const u8 besteffort[] = { 349 0, 0, 0, 0, 0, 0, 0, 0, 350 0, 0, 0, 0, 0, 0, 0, 0, 351 0, 0, 0, 0, 0, 0, 0, 0, 352 0, 0, 0, 0, 0, 0, 0, 0, 353 0, 0, 0, 0, 0, 0, 0, 0, 354 0, 0, 0, 0, 0, 0, 0, 0, 355 0, 0, 0, 0, 0, 0, 0, 0, 356 0, 0, 0, 0, 0, 0, 0, 0, 357 }; 358 359 /* tin priority order for stats dumping */ 360 361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7}; 362 static const u8 bulk_order[] = {1, 0, 2, 3}; 363 364 /* There is a big difference in timing between the accurate values placed in the 365 * cache and the approximations given by a single Newton step for small count 366 * values, particularly when stepping from count 1 to 2 or vice versa. Hence, 367 * these values are calculated using eight Newton steps, using the 368 * implementation below. Above 16, a single Newton step gives sufficient 369 * accuracy in either direction, given the precision stored. 370 * 371 * The magnitude of the error when stepping up to count 2 is such as to give the 372 * value that *should* have been produced at count 4. 
373 */ 374 375 #define REC_INV_SQRT_CACHE (16) 376 static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = { 377 ~0, ~0, 3037000500, 2479700525, 378 2147483647, 1920767767, 1753413056, 1623345051, 379 1518500250, 1431655765, 1358187914, 1294981364, 380 1239850263, 1191209601, 1147878294, 1108955788 381 }; 382 383 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots 384 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) 385 * 386 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 387 */ 388 389 static void cobalt_newton_step(struct cobalt_vars *vars) 390 { 391 u32 invsqrt, invsqrt2; 392 u64 val; 393 394 invsqrt = vars->rec_inv_sqrt; 395 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32; 396 val = (3LL << 32) - ((u64)vars->count * invsqrt2); 397 398 val >>= 2; /* avoid overflow in following multiply */ 399 val = (val * invsqrt) >> (32 - 2 + 1); 400 401 vars->rec_inv_sqrt = val; 402 } 403 404 static void cobalt_invsqrt(struct cobalt_vars *vars) 405 { 406 if (vars->count < REC_INV_SQRT_CACHE) 407 vars->rec_inv_sqrt = inv_sqrt_cache[vars->count]; 408 else 409 cobalt_newton_step(vars); 410 } 411 412 static void cobalt_vars_init(struct cobalt_vars *vars) 413 { 414 memset(vars, 0, sizeof(*vars)); 415 } 416 417 /* CoDel control_law is t + interval/sqrt(count) 418 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid 419 * both sqrt() and divide operation. 420 */ 421 static ktime_t cobalt_control(ktime_t t, 422 u64 interval, 423 u32 rec_inv_sqrt) 424 { 425 return ktime_add_ns(t, reciprocal_scale(interval, 426 rec_inv_sqrt)); 427 } 428 429 /* Call this when a packet had to be dropped due to queue overflow. Returns 430 * true if the BLUE state was quiescent before but active after this call. 431 */ 432 static bool cobalt_queue_full(struct cobalt_vars *vars, 433 struct cobalt_params *p, 434 ktime_t now) 435 { 436 bool up = false; 437 438 if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 439 up = !vars->p_drop; 440 vars->p_drop += p->p_inc; 441 if (vars->p_drop < p->p_inc) 442 vars->p_drop = ~0; 443 vars->blue_timer = now; 444 } 445 vars->dropping = true; 446 vars->drop_next = now; 447 if (!vars->count) 448 vars->count = 1; 449 450 return up; 451 } 452 453 /* Call this when the queue was serviced but turned out to be empty. Returns 454 * true if the BLUE state was active before but quiescent after this call. 455 */ 456 static bool cobalt_queue_empty(struct cobalt_vars *vars, 457 struct cobalt_params *p, 458 ktime_t now) 459 { 460 bool down = false; 461 462 if (vars->p_drop && 463 ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) { 464 if (vars->p_drop < p->p_dec) 465 vars->p_drop = 0; 466 else 467 vars->p_drop -= p->p_dec; 468 vars->blue_timer = now; 469 down = !vars->p_drop; 470 } 471 vars->dropping = false; 472 473 if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) { 474 vars->count--; 475 cobalt_invsqrt(vars); 476 vars->drop_next = cobalt_control(vars->drop_next, 477 p->interval, 478 vars->rec_inv_sqrt); 479 } 480 481 return down; 482 } 483 484 /* Call this with a freshly dequeued packet for possible congestion marking. 485 * Returns true as an instruction to drop the packet, false for delivery. 
486 */ 487 static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars, 488 struct cobalt_params *p, 489 ktime_t now, 490 struct sk_buff *skb, 491 u32 bulk_flows) 492 { 493 enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; 494 bool next_due, over_target; 495 ktime_t schedule; 496 u64 sojourn; 497 498 /* The 'schedule' variable records, in its sign, whether 'now' is before or 499 * after 'drop_next'. This allows 'drop_next' to be updated before the next 500 * scheduling decision is actually branched, without destroying that 501 * information. Similarly, the first 'schedule' value calculated is preserved 502 * in the boolean 'next_due'. 503 * 504 * As for 'drop_next', we take advantage of the fact that 'interval' is both 505 * the delay between first exceeding 'target' and the first signalling event, 506 * *and* the scaling factor for the signalling frequency. It's therefore very 507 * natural to use a single mechanism for both purposes, and eliminates a 508 * significant amount of reference Codel's spaghetti code. To help with this, 509 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close 510 * as possible to 1.0 in fixed-point. 511 */ 512 513 sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 514 schedule = ktime_sub(now, vars->drop_next); 515 over_target = sojourn > p->target && 516 sojourn > p->mtu_time * bulk_flows * 2 && 517 sojourn > p->mtu_time * 4; 518 next_due = vars->count && ktime_to_ns(schedule) >= 0; 519 520 vars->ecn_marked = false; 521 522 if (over_target) { 523 if (!vars->dropping) { 524 vars->dropping = true; 525 vars->drop_next = cobalt_control(now, 526 p->interval, 527 vars->rec_inv_sqrt); 528 } 529 if (!vars->count) 530 vars->count = 1; 531 } else if (vars->dropping) { 532 vars->dropping = false; 533 } 534 535 if (next_due && vars->dropping) { 536 /* Use ECN mark if possible, otherwise drop */ 537 if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) 538 reason = SKB_DROP_REASON_QDISC_CONGESTED; 539 540 vars->count++; 541 if (!vars->count) 542 vars->count--; 543 cobalt_invsqrt(vars); 544 vars->drop_next = cobalt_control(vars->drop_next, 545 p->interval, 546 vars->rec_inv_sqrt); 547 schedule = ktime_sub(now, vars->drop_next); 548 } else { 549 while (next_due) { 550 vars->count--; 551 cobalt_invsqrt(vars); 552 vars->drop_next = cobalt_control(vars->drop_next, 553 p->interval, 554 vars->rec_inv_sqrt); 555 schedule = ktime_sub(now, vars->drop_next); 556 next_due = vars->count && ktime_to_ns(schedule) >= 0; 557 } 558 } 559 560 /* Simple BLUE implementation. Lack of ECN is deliberate. */ 561 if (vars->p_drop && reason == SKB_NOT_DROPPED_YET && 562 get_random_u32() < vars->p_drop) 563 reason = SKB_DROP_REASON_CAKE_FLOOD; 564 565 /* Overload the drop_next field as an activity timeout */ 566 if (!vars->count) 567 vars->drop_next = ktime_add_ns(now, p->interval); 568 else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET) 569 vars->drop_next = now; 570 571 return reason; 572 } 573 574 static bool cake_update_flowkeys(struct flow_keys *keys, 575 const struct sk_buff *skb) 576 { 577 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 578 struct nf_conntrack_tuple tuple = {}; 579 bool rev = !skb->_nfct, upd = false; 580 __be32 ip; 581 582 if (skb_protocol(skb, true) != htons(ETH_P_IP)) 583 return false; 584 585 if (!nf_ct_get_tuple_skb(&tuple, skb)) 586 return false; 587 588 ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; 589 if (ip != keys->addrs.v4addrs.src) { 590 keys->addrs.v4addrs.src = ip; 591 upd = true; 592 } 593 ip = rev ? 
tuple.src.u3.ip : tuple.dst.u3.ip; 594 if (ip != keys->addrs.v4addrs.dst) { 595 keys->addrs.v4addrs.dst = ip; 596 upd = true; 597 } 598 599 if (keys->ports.ports) { 600 __be16 port; 601 602 port = rev ? tuple.dst.u.all : tuple.src.u.all; 603 if (port != keys->ports.src) { 604 keys->ports.src = port; 605 upd = true; 606 } 607 port = rev ? tuple.src.u.all : tuple.dst.u.all; 608 if (port != keys->ports.dst) { 609 port = keys->ports.dst; 610 upd = true; 611 } 612 } 613 return upd; 614 #else 615 return false; 616 #endif 617 } 618 619 /* Cake has several subtle multiple bit settings. In these cases you 620 * would be matching triple isolate mode as well. 621 */ 622 623 static bool cake_dsrc(int flow_mode) 624 { 625 return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC; 626 } 627 628 static bool cake_ddst(int flow_mode) 629 { 630 return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; 631 } 632 633 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, 634 int flow_mode, u16 flow_override, u16 host_override) 635 { 636 bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); 637 bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); 638 bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); 639 u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; 640 u16 reduced_hash, srchost_idx, dsthost_idx; 641 struct flow_keys keys, host_keys; 642 bool use_skbhash = skb->l4_hash; 643 644 if (unlikely(flow_mode == CAKE_FLOW_NONE)) 645 return 0; 646 647 /* If both overrides are set, or we can use the SKB hash and nat mode is 648 * disabled, we can skip packet dissection entirely. If nat mode is 649 * enabled there's another check below after doing the conntrack lookup. 650 */ 651 if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) 652 goto skip_hash; 653 654 skb_flow_dissect_flow_keys(skb, &keys, 655 FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); 656 657 /* Don't use the SKB hash if we change the lookup keys from conntrack */ 658 if (nat_enabled && cake_update_flowkeys(&keys, skb)) 659 use_skbhash = false; 660 661 /* If we can still use the SKB hash and don't need the host hash, we can 662 * skip the rest of the hashing procedure 663 */ 664 if (use_skbhash && !hash_hosts) 665 goto skip_hash; 666 667 /* flow_hash_from_keys() sorts the addresses by value, so we have 668 * to preserve their order in a separate data structure to treat 669 * src and dst host addresses as independently selectable. 670 */ 671 host_keys = keys; 672 host_keys.ports.ports = 0; 673 host_keys.basic.ip_proto = 0; 674 host_keys.keyid.keyid = 0; 675 host_keys.tags.flow_label = 0; 676 677 switch (host_keys.control.addr_type) { 678 case FLOW_DISSECTOR_KEY_IPV4_ADDRS: 679 host_keys.addrs.v4addrs.src = 0; 680 dsthost_hash = flow_hash_from_keys(&host_keys); 681 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; 682 host_keys.addrs.v4addrs.dst = 0; 683 srchost_hash = flow_hash_from_keys(&host_keys); 684 break; 685 686 case FLOW_DISSECTOR_KEY_IPV6_ADDRS: 687 memset(&host_keys.addrs.v6addrs.src, 0, 688 sizeof(host_keys.addrs.v6addrs.src)); 689 dsthost_hash = flow_hash_from_keys(&host_keys); 690 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; 691 memset(&host_keys.addrs.v6addrs.dst, 0, 692 sizeof(host_keys.addrs.v6addrs.dst)); 693 srchost_hash = flow_hash_from_keys(&host_keys); 694 break; 695 696 default: 697 dsthost_hash = 0; 698 srchost_hash = 0; 699 } 700 701 /* This *must* be after the above switch, since as a 702 * side-effect it sorts the src and dst addresses. 
703 */ 704 if (hash_flows && !use_skbhash) 705 flow_hash = flow_hash_from_keys(&keys); 706 707 skip_hash: 708 if (flow_override) 709 flow_hash = flow_override - 1; 710 else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS)) 711 flow_hash = skb->hash; 712 if (host_override) { 713 dsthost_hash = host_override - 1; 714 srchost_hash = host_override - 1; 715 } 716 717 if (!(flow_mode & CAKE_FLOW_FLOWS)) { 718 if (flow_mode & CAKE_FLOW_SRC_IP) 719 flow_hash ^= srchost_hash; 720 721 if (flow_mode & CAKE_FLOW_DST_IP) 722 flow_hash ^= dsthost_hash; 723 } 724 725 reduced_hash = flow_hash % CAKE_QUEUES; 726 727 /* set-associative hashing */ 728 /* fast path if no hash collision (direct lookup succeeds) */ 729 if (likely(q->tags[reduced_hash] == flow_hash && 730 q->flows[reduced_hash].set)) { 731 q->way_directs++; 732 } else { 733 u32 inner_hash = reduced_hash % CAKE_SET_WAYS; 734 u32 outer_hash = reduced_hash - inner_hash; 735 bool allocate_src = false; 736 bool allocate_dst = false; 737 u32 i, k; 738 739 /* check if any active queue in the set is reserved for 740 * this flow. 741 */ 742 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 743 i++, k = (k + 1) % CAKE_SET_WAYS) { 744 if (q->tags[outer_hash + k] == flow_hash) { 745 if (i) 746 q->way_hits++; 747 748 if (!q->flows[outer_hash + k].set) { 749 /* need to increment host refcnts */ 750 allocate_src = cake_dsrc(flow_mode); 751 allocate_dst = cake_ddst(flow_mode); 752 } 753 754 goto found; 755 } 756 } 757 758 /* no queue is reserved for this flow, look for an 759 * empty one. 760 */ 761 for (i = 0; i < CAKE_SET_WAYS; 762 i++, k = (k + 1) % CAKE_SET_WAYS) { 763 if (!q->flows[outer_hash + k].set) { 764 q->way_misses++; 765 allocate_src = cake_dsrc(flow_mode); 766 allocate_dst = cake_ddst(flow_mode); 767 goto found; 768 } 769 } 770 771 /* With no empty queues, default to the original 772 * queue, accept the collision, update the host tags. 
773 */ 774 q->way_collisions++; 775 allocate_src = cake_dsrc(flow_mode); 776 allocate_dst = cake_ddst(flow_mode); 777 778 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { 779 if (allocate_src) 780 q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; 781 if (allocate_dst) 782 q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; 783 } 784 found: 785 /* reserve queue for future packets in same flow */ 786 reduced_hash = outer_hash + k; 787 q->tags[reduced_hash] = flow_hash; 788 789 if (allocate_src) { 790 srchost_idx = srchost_hash % CAKE_QUEUES; 791 inner_hash = srchost_idx % CAKE_SET_WAYS; 792 outer_hash = srchost_idx - inner_hash; 793 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 794 i++, k = (k + 1) % CAKE_SET_WAYS) { 795 if (q->hosts[outer_hash + k].srchost_tag == 796 srchost_hash) 797 goto found_src; 798 } 799 for (i = 0; i < CAKE_SET_WAYS; 800 i++, k = (k + 1) % CAKE_SET_WAYS) { 801 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count) 802 break; 803 } 804 q->hosts[outer_hash + k].srchost_tag = srchost_hash; 805 found_src: 806 srchost_idx = outer_hash + k; 807 if (q->flows[reduced_hash].set == CAKE_SET_BULK) 808 q->hosts[srchost_idx].srchost_bulk_flow_count++; 809 q->flows[reduced_hash].srchost = srchost_idx; 810 } 811 812 if (allocate_dst) { 813 dsthost_idx = dsthost_hash % CAKE_QUEUES; 814 inner_hash = dsthost_idx % CAKE_SET_WAYS; 815 outer_hash = dsthost_idx - inner_hash; 816 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS; 817 i++, k = (k + 1) % CAKE_SET_WAYS) { 818 if (q->hosts[outer_hash + k].dsthost_tag == 819 dsthost_hash) 820 goto found_dst; 821 } 822 for (i = 0; i < CAKE_SET_WAYS; 823 i++, k = (k + 1) % CAKE_SET_WAYS) { 824 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count) 825 break; 826 } 827 q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; 828 found_dst: 829 dsthost_idx = outer_hash + k; 830 if (q->flows[reduced_hash].set == CAKE_SET_BULK) 831 q->hosts[dsthost_idx].dsthost_bulk_flow_count++; 832 q->flows[reduced_hash].dsthost = dsthost_idx; 833 } 834 } 835 836 return reduced_hash; 837 } 838 839 /* helper functions : might be changed when/if skb use a standard list_head */ 840 /* remove one skb from head of slot queue */ 841 842 static struct sk_buff *dequeue_head(struct cake_flow *flow) 843 { 844 struct sk_buff *skb = flow->head; 845 846 if (skb) { 847 flow->head = skb->next; 848 skb_mark_not_on_list(skb); 849 } 850 851 return skb; 852 } 853 854 /* add skb to flow queue (tail add) */ 855 856 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb) 857 { 858 if (!flow->head) 859 flow->head = skb; 860 else 861 flow->tail->next = skb; 862 flow->tail = skb; 863 skb->next = NULL; 864 } 865 866 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb, 867 struct ipv6hdr *buf) 868 { 869 unsigned int offset = skb_network_offset(skb); 870 struct iphdr *iph; 871 872 iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf); 873 874 if (!iph) 875 return NULL; 876 877 if (iph->version == 4 && iph->protocol == IPPROTO_IPV6) 878 return skb_header_pointer(skb, offset + iph->ihl * 4, 879 sizeof(struct ipv6hdr), buf); 880 881 else if (iph->version == 4) 882 return iph; 883 884 else if (iph->version == 6) 885 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr), 886 buf); 887 888 return NULL; 889 } 890 891 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, 892 void *buf, unsigned int bufsize) 893 { 894 unsigned int offset = skb_network_offset(skb); 895 const struct ipv6hdr *ipv6h; 896 const struct 
tcphdr *tcph; 897 const struct iphdr *iph; 898 struct ipv6hdr _ipv6h; 899 struct tcphdr _tcph; 900 901 ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); 902 903 if (!ipv6h) 904 return NULL; 905 906 if (ipv6h->version == 4) { 907 iph = (struct iphdr *)ipv6h; 908 offset += iph->ihl * 4; 909 910 /* special-case 6in4 tunnelling, as that is a common way to get 911 * v6 connectivity in the home 912 */ 913 if (iph->protocol == IPPROTO_IPV6) { 914 ipv6h = skb_header_pointer(skb, offset, 915 sizeof(_ipv6h), &_ipv6h); 916 917 if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) 918 return NULL; 919 920 offset += sizeof(struct ipv6hdr); 921 922 } else if (iph->protocol != IPPROTO_TCP) { 923 return NULL; 924 } 925 926 } else if (ipv6h->version == 6) { 927 if (ipv6h->nexthdr != IPPROTO_TCP) 928 return NULL; 929 930 offset += sizeof(struct ipv6hdr); 931 } else { 932 return NULL; 933 } 934 935 tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); 936 if (!tcph || tcph->doff < 5) 937 return NULL; 938 939 return skb_header_pointer(skb, offset, 940 min(__tcp_hdrlen(tcph), bufsize), buf); 941 } 942 943 static const void *cake_get_tcpopt(const struct tcphdr *tcph, 944 int code, int *oplen) 945 { 946 /* inspired by tcp_parse_options in tcp_input.c */ 947 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 948 const u8 *ptr = (const u8 *)(tcph + 1); 949 950 while (length > 0) { 951 int opcode = *ptr++; 952 int opsize; 953 954 if (opcode == TCPOPT_EOL) 955 break; 956 if (opcode == TCPOPT_NOP) { 957 length--; 958 continue; 959 } 960 if (length < 2) 961 break; 962 opsize = *ptr++; 963 if (opsize < 2 || opsize > length) 964 break; 965 966 if (opcode == code) { 967 *oplen = opsize; 968 return ptr; 969 } 970 971 ptr += opsize - 2; 972 length -= opsize; 973 } 974 975 return NULL; 976 } 977 978 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more 979 * bytes than the other. In the case where both sequences ACKs bytes that the 980 * other doesn't, A is considered greater. DSACKs in A also makes A be 981 * considered greater. 
982 * 983 * @return -1, 0 or 1 as normal compare functions 984 */ 985 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a, 986 const struct tcphdr *tcph_b) 987 { 988 const struct tcp_sack_block_wire *sack_a, *sack_b; 989 u32 ack_seq_a = ntohl(tcph_a->ack_seq); 990 u32 bytes_a = 0, bytes_b = 0; 991 int oplen_a, oplen_b; 992 bool first = true; 993 994 sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a); 995 sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b); 996 997 /* pointers point to option contents */ 998 oplen_a -= TCPOLEN_SACK_BASE; 999 oplen_b -= TCPOLEN_SACK_BASE; 1000 1001 if (sack_a && oplen_a >= sizeof(*sack_a) && 1002 (!sack_b || oplen_b < sizeof(*sack_b))) 1003 return -1; 1004 else if (sack_b && oplen_b >= sizeof(*sack_b) && 1005 (!sack_a || oplen_a < sizeof(*sack_a))) 1006 return 1; 1007 else if ((!sack_a || oplen_a < sizeof(*sack_a)) && 1008 (!sack_b || oplen_b < sizeof(*sack_b))) 1009 return 0; 1010 1011 while (oplen_a >= sizeof(*sack_a)) { 1012 const struct tcp_sack_block_wire *sack_tmp = sack_b; 1013 u32 start_a = get_unaligned_be32(&sack_a->start_seq); 1014 u32 end_a = get_unaligned_be32(&sack_a->end_seq); 1015 int oplen_tmp = oplen_b; 1016 bool found = false; 1017 1018 /* DSACK; always considered greater to prevent dropping */ 1019 if (before(start_a, ack_seq_a)) 1020 return -1; 1021 1022 bytes_a += end_a - start_a; 1023 1024 while (oplen_tmp >= sizeof(*sack_tmp)) { 1025 u32 start_b = get_unaligned_be32(&sack_tmp->start_seq); 1026 u32 end_b = get_unaligned_be32(&sack_tmp->end_seq); 1027 1028 /* first time through we count the total size */ 1029 if (first) 1030 bytes_b += end_b - start_b; 1031 1032 if (!after(start_b, start_a) && !before(end_b, end_a)) { 1033 found = true; 1034 if (!first) 1035 break; 1036 } 1037 oplen_tmp -= sizeof(*sack_tmp); 1038 sack_tmp++; 1039 } 1040 1041 if (!found) 1042 return -1; 1043 1044 oplen_a -= sizeof(*sack_a); 1045 sack_a++; 1046 first = false; 1047 } 1048 1049 /* If we made it this far, all ranges SACKed by A are covered by B, so 1050 * either the SACKs are equal, or B SACKs more bytes. 1051 */ 1052 return bytes_b > bytes_a ? 
1 : 0; 1053 } 1054 1055 static void cake_tcph_get_tstamp(const struct tcphdr *tcph, 1056 u32 *tsval, u32 *tsecr) 1057 { 1058 const u8 *ptr; 1059 int opsize; 1060 1061 ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize); 1062 1063 if (ptr && opsize == TCPOLEN_TIMESTAMP) { 1064 *tsval = get_unaligned_be32(ptr); 1065 *tsecr = get_unaligned_be32(ptr + 4); 1066 } 1067 } 1068 1069 static bool cake_tcph_may_drop(const struct tcphdr *tcph, 1070 u32 tstamp_new, u32 tsecr_new) 1071 { 1072 /* inspired by tcp_parse_options in tcp_input.c */ 1073 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr); 1074 const u8 *ptr = (const u8 *)(tcph + 1); 1075 u32 tstamp, tsecr; 1076 1077 /* 3 reserved flags must be unset to avoid future breakage 1078 * ACK must be set 1079 * ECE/CWR are handled separately 1080 * All other flags URG/PSH/RST/SYN/FIN must be unset 1081 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero) 1082 * 0x00C00000 = CWR/ECE (handled separately) 1083 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000 1084 */ 1085 if (((tcp_flag_word(tcph) & 1086 cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK)) 1087 return false; 1088 1089 while (length > 0) { 1090 int opcode = *ptr++; 1091 int opsize; 1092 1093 if (opcode == TCPOPT_EOL) 1094 break; 1095 if (opcode == TCPOPT_NOP) { 1096 length--; 1097 continue; 1098 } 1099 if (length < 2) 1100 break; 1101 opsize = *ptr++; 1102 if (opsize < 2 || opsize > length) 1103 break; 1104 1105 switch (opcode) { 1106 case TCPOPT_MD5SIG: /* doesn't influence state */ 1107 break; 1108 1109 case TCPOPT_SACK: /* stricter checking performed later */ 1110 if (opsize % 8 != 2) 1111 return false; 1112 break; 1113 1114 case TCPOPT_TIMESTAMP: 1115 /* only drop timestamps lower than new */ 1116 if (opsize != TCPOLEN_TIMESTAMP) 1117 return false; 1118 tstamp = get_unaligned_be32(ptr); 1119 tsecr = get_unaligned_be32(ptr + 4); 1120 if (after(tstamp, tstamp_new) || 1121 after(tsecr, tsecr_new)) 1122 return false; 1123 break; 1124 1125 case TCPOPT_MSS: /* these should only be set on SYN */ 1126 case TCPOPT_WINDOW: 1127 case TCPOPT_SACK_PERM: 1128 case TCPOPT_FASTOPEN: 1129 case TCPOPT_EXP: 1130 default: /* don't drop if any unknown options are present */ 1131 return false; 1132 } 1133 1134 ptr += opsize - 2; 1135 length -= opsize; 1136 } 1137 1138 return true; 1139 } 1140 1141 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, 1142 struct cake_flow *flow) 1143 { 1144 bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; 1145 struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; 1146 struct sk_buff *skb_check, *skb_prev = NULL; 1147 const struct ipv6hdr *ipv6h, *ipv6h_check; 1148 unsigned char _tcph[64], _tcph_check[64]; 1149 const struct tcphdr *tcph, *tcph_check; 1150 const struct iphdr *iph, *iph_check; 1151 struct ipv6hdr _iph, _iph_check; 1152 const struct sk_buff *skb; 1153 int seglen, num_found = 0; 1154 u32 tstamp = 0, tsecr = 0; 1155 __be32 elig_flags = 0; 1156 int sack_comp; 1157 1158 /* no other possible ACKs to filter */ 1159 if (flow->head == flow->tail) 1160 return NULL; 1161 1162 skb = flow->tail; 1163 tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph)); 1164 iph = cake_get_iphdr(skb, &_iph); 1165 if (!tcph) 1166 return NULL; 1167 1168 cake_tcph_get_tstamp(tcph, &tstamp, &tsecr); 1169 1170 /* the 'triggering' packet need only have the ACK flag set. 1171 * also check that SYN is not set, as there won't be any previous ACKs. 
1172 */ 1173 if ((tcp_flag_word(tcph) & 1174 (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK) 1175 return NULL; 1176 1177 /* the 'triggering' ACK is at the tail of the queue, we have already 1178 * returned if it is the only packet in the flow. loop through the rest 1179 * of the queue looking for pure ACKs with the same 5-tuple as the 1180 * triggering one. 1181 */ 1182 for (skb_check = flow->head; 1183 skb_check && skb_check != skb; 1184 skb_prev = skb_check, skb_check = skb_check->next) { 1185 iph_check = cake_get_iphdr(skb_check, &_iph_check); 1186 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check, 1187 sizeof(_tcph_check)); 1188 1189 /* only TCP packets with matching 5-tuple are eligible, and only 1190 * drop safe headers 1191 */ 1192 if (!tcph_check || iph->version != iph_check->version || 1193 tcph_check->source != tcph->source || 1194 tcph_check->dest != tcph->dest) 1195 continue; 1196 1197 if (iph_check->version == 4) { 1198 if (iph_check->saddr != iph->saddr || 1199 iph_check->daddr != iph->daddr) 1200 continue; 1201 1202 seglen = iph_totlen(skb, iph_check) - 1203 (4 * iph_check->ihl); 1204 } else if (iph_check->version == 6) { 1205 ipv6h = (struct ipv6hdr *)iph; 1206 ipv6h_check = (struct ipv6hdr *)iph_check; 1207 1208 if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) || 1209 ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) 1210 continue; 1211 1212 seglen = ntohs(ipv6h_check->payload_len); 1213 } else { 1214 WARN_ON(1); /* shouldn't happen */ 1215 continue; 1216 } 1217 1218 /* If the ECE/CWR flags changed from the previous eligible 1219 * packet in the same flow, we should no longer be dropping that 1220 * previous packet as this would lose information. 1221 */ 1222 if (elig_ack && (tcp_flag_word(tcph_check) & 1223 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) { 1224 elig_ack = NULL; 1225 elig_ack_prev = NULL; 1226 num_found--; 1227 } 1228 1229 /* Check TCP options and flags, don't drop ACKs with segment 1230 * data, and don't drop ACKs with a higher cumulative ACK 1231 * counter than the triggering packet. Check ACK seqno here to 1232 * avoid parsing SACK options of packets we are going to exclude 1233 * anyway. 1234 */ 1235 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) || 1236 (seglen - __tcp_hdrlen(tcph_check)) != 0 || 1237 after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq))) 1238 continue; 1239 1240 /* Check SACK options. The triggering packet must SACK more data 1241 * than the ACK under consideration, or SACK the same range but 1242 * have a larger cumulative ACK counter. The latter is a 1243 * pathological case, but is contained in the following check 1244 * anyway, just to be safe. 1245 */ 1246 sack_comp = cake_tcph_sack_compare(tcph_check, tcph); 1247 1248 if (sack_comp < 0 || 1249 (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) && 1250 sack_comp == 0)) 1251 continue; 1252 1253 /* At this point we have found an eligible pure ACK to drop; if 1254 * we are in aggressive mode, we are done. Otherwise, keep 1255 * searching unless this is the second eligible ACK we 1256 * found. 1257 * 1258 * Since we want to drop ACK closest to the head of the queue, 1259 * save the first eligible ACK we find, even if we need to loop 1260 * again. 1261 */ 1262 if (!elig_ack) { 1263 elig_ack = skb_check; 1264 elig_ack_prev = skb_prev; 1265 elig_flags = (tcp_flag_word(tcph_check) 1266 & (TCP_FLAG_ECE | TCP_FLAG_CWR)); 1267 } 1268 1269 if (num_found++ > 0) 1270 goto found; 1271 } 1272 1273 /* We made it through the queue without finding two eligible ACKs . 
If 1274 * we found a single eligible ACK we can drop it in aggressive mode if 1275 * we can guarantee that this does not interfere with ECN flag 1276 * information. We ensure this by dropping it only if the enqueued 1277 * packet is consecutive with the eligible ACK, and their flags match. 1278 */ 1279 if (elig_ack && aggressive && elig_ack->next == skb && 1280 (elig_flags == (tcp_flag_word(tcph) & 1281 (TCP_FLAG_ECE | TCP_FLAG_CWR)))) 1282 goto found; 1283 1284 return NULL; 1285 1286 found: 1287 if (elig_ack_prev) 1288 elig_ack_prev->next = elig_ack->next; 1289 else 1290 flow->head = elig_ack->next; 1291 1292 skb_mark_not_on_list(elig_ack); 1293 1294 return elig_ack; 1295 } 1296 1297 static u64 cake_ewma(u64 avg, u64 sample, u32 shift) 1298 { 1299 avg -= avg >> shift; 1300 avg += sample >> shift; 1301 return avg; 1302 } 1303 1304 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) 1305 { 1306 if (q->rate_flags & CAKE_FLAG_OVERHEAD) 1307 len -= off; 1308 1309 if (q->max_netlen < len) 1310 q->max_netlen = len; 1311 if (q->min_netlen > len) 1312 q->min_netlen = len; 1313 1314 len += q->rate_overhead; 1315 1316 if (len < q->rate_mpu) 1317 len = q->rate_mpu; 1318 1319 if (q->atm_mode == CAKE_ATM_ATM) { 1320 len += 47; 1321 len /= 48; 1322 len *= 53; 1323 } else if (q->atm_mode == CAKE_ATM_PTM) { 1324 /* Add one byte per 64 bytes or part thereof. 1325 * This is conservative and easier to calculate than the 1326 * precise value. 1327 */ 1328 len += (len + 63) / 64; 1329 } 1330 1331 if (q->max_adjlen < len) 1332 q->max_adjlen = len; 1333 if (q->min_adjlen > len) 1334 q->min_adjlen = len; 1335 1336 return len; 1337 } 1338 1339 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) 1340 { 1341 const struct skb_shared_info *shinfo = skb_shinfo(skb); 1342 unsigned int hdr_len, last_len = 0; 1343 u32 off = skb_network_offset(skb); 1344 u32 len = qdisc_pkt_len(skb); 1345 u16 segs = 1; 1346 1347 q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); 1348 1349 if (!shinfo->gso_size) 1350 return cake_calc_overhead(q, len, off); 1351 1352 /* borrowed from qdisc_pkt_len_init() */ 1353 hdr_len = skb_transport_offset(skb); 1354 1355 /* + transport layer */ 1356 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | 1357 SKB_GSO_TCPV6))) { 1358 const struct tcphdr *th; 1359 struct tcphdr _tcphdr; 1360 1361 th = skb_header_pointer(skb, hdr_len, 1362 sizeof(_tcphdr), &_tcphdr); 1363 if (likely(th)) 1364 hdr_len += __tcp_hdrlen(th); 1365 } else { 1366 struct udphdr _udphdr; 1367 1368 if (skb_header_pointer(skb, hdr_len, 1369 sizeof(_udphdr), &_udphdr)) 1370 hdr_len += sizeof(struct udphdr); 1371 } 1372 1373 if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) 1374 segs = DIV_ROUND_UP(skb->len - hdr_len, 1375 shinfo->gso_size); 1376 else 1377 segs = shinfo->gso_segs; 1378 1379 len = shinfo->gso_size + hdr_len; 1380 last_len = skb->len - shinfo->gso_size * (segs - 1); 1381 1382 return (cake_calc_overhead(q, len, off) * (segs - 1) + 1383 cake_calc_overhead(q, last_len, off)); 1384 } 1385 1386 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j) 1387 { 1388 struct cake_heap_entry ii = q->overflow_heap[i]; 1389 struct cake_heap_entry jj = q->overflow_heap[j]; 1390 1391 q->overflow_heap[i] = jj; 1392 q->overflow_heap[j] = ii; 1393 1394 q->tins[ii.t].overflow_idx[ii.b] = j; 1395 q->tins[jj.t].overflow_idx[jj.b] = i; 1396 } 1397 1398 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i) 1399 { 1400 struct cake_heap_entry ii = q->overflow_heap[i]; 1401 
1402 return q->tins[ii.t].backlogs[ii.b]; 1403 } 1404 1405 static void cake_heapify(struct cake_sched_data *q, u16 i) 1406 { 1407 static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES; 1408 u32 mb = cake_heap_get_backlog(q, i); 1409 u32 m = i; 1410 1411 while (m < a) { 1412 u32 l = m + m + 1; 1413 u32 r = l + 1; 1414 1415 if (l < a) { 1416 u32 lb = cake_heap_get_backlog(q, l); 1417 1418 if (lb > mb) { 1419 m = l; 1420 mb = lb; 1421 } 1422 } 1423 1424 if (r < a) { 1425 u32 rb = cake_heap_get_backlog(q, r); 1426 1427 if (rb > mb) { 1428 m = r; 1429 mb = rb; 1430 } 1431 } 1432 1433 if (m != i) { 1434 cake_heap_swap(q, i, m); 1435 i = m; 1436 } else { 1437 break; 1438 } 1439 } 1440 } 1441 1442 static void cake_heapify_up(struct cake_sched_data *q, u16 i) 1443 { 1444 while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) { 1445 u16 p = (i - 1) >> 1; 1446 u32 ib = cake_heap_get_backlog(q, i); 1447 u32 pb = cake_heap_get_backlog(q, p); 1448 1449 if (ib > pb) { 1450 cake_heap_swap(q, i, p); 1451 i = p; 1452 } else { 1453 break; 1454 } 1455 } 1456 } 1457 1458 static int cake_advance_shaper(struct cake_sched_data *q, 1459 struct cake_tin_data *b, 1460 struct sk_buff *skb, 1461 ktime_t now, bool drop) 1462 { 1463 u32 len = get_cobalt_cb(skb)->adjusted_len; 1464 1465 /* charge packet bandwidth to this tin 1466 * and to the global shaper. 1467 */ 1468 if (q->rate_ns) { 1469 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft; 1470 u64 global_dur = (len * q->rate_ns) >> q->rate_shft; 1471 u64 failsafe_dur = global_dur + (global_dur >> 1); 1472 1473 if (ktime_before(b->time_next_packet, now)) 1474 b->time_next_packet = ktime_add_ns(b->time_next_packet, 1475 tin_dur); 1476 1477 else if (ktime_before(b->time_next_packet, 1478 ktime_add_ns(now, tin_dur))) 1479 b->time_next_packet = ktime_add_ns(now, tin_dur); 1480 1481 q->time_next_packet = ktime_add_ns(q->time_next_packet, 1482 global_dur); 1483 if (!drop) 1484 q->failsafe_next_packet = \ 1485 ktime_add_ns(q->failsafe_next_packet, 1486 failsafe_dur); 1487 } 1488 return len; 1489 } 1490 1491 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) 1492 { 1493 struct cake_sched_data *q = qdisc_priv(sch); 1494 ktime_t now = ktime_get(); 1495 u32 idx = 0, tin = 0, len; 1496 struct cake_heap_entry qq; 1497 struct cake_tin_data *b; 1498 struct cake_flow *flow; 1499 struct sk_buff *skb; 1500 1501 if (!q->overflow_timeout) { 1502 int i; 1503 /* Build fresh max-heap */ 1504 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--) 1505 cake_heapify(q, i); 1506 } 1507 q->overflow_timeout = 65535; 1508 1509 /* select longest queue for pruning */ 1510 qq = q->overflow_heap[0]; 1511 tin = qq.t; 1512 idx = qq.b; 1513 1514 b = &q->tins[tin]; 1515 flow = &b->flows[idx]; 1516 skb = dequeue_head(flow); 1517 if (unlikely(!skb)) { 1518 /* heap has gone wrong, rebuild it next time */ 1519 q->overflow_timeout = 0; 1520 return idx + (tin << 16); 1521 } 1522 1523 if (cobalt_queue_full(&flow->cvars, &b->cparams, now)) 1524 b->unresponsive_flow_count++; 1525 1526 len = qdisc_pkt_len(skb); 1527 q->buffer_used -= skb->truesize; 1528 b->backlogs[idx] -= len; 1529 b->tin_backlog -= len; 1530 sch->qstats.backlog -= len; 1531 1532 flow->dropped++; 1533 b->tin_dropped++; 1534 1535 if (q->rate_flags & CAKE_FLAG_INGRESS) 1536 cake_advance_shaper(q, b, skb, now, true); 1537 1538 qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); 1539 sch->q.qlen--; 1540 qdisc_tree_reduce_backlog(sch, 1, len); 1541 1542 cake_heapify(q, 0); 1543 1544 return idx + (tin << 16); 
1545 } 1546 1547 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) 1548 { 1549 const int offset = skb_network_offset(skb); 1550 u16 *buf, buf_; 1551 u8 dscp; 1552 1553 switch (skb_protocol(skb, true)) { 1554 case htons(ETH_P_IP): 1555 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1556 if (unlikely(!buf)) 1557 return 0; 1558 1559 /* ToS is in the second byte of iphdr */ 1560 dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2; 1561 1562 if (wash && dscp) { 1563 const int wlen = offset + sizeof(struct iphdr); 1564 1565 if (!pskb_may_pull(skb, wlen) || 1566 skb_try_make_writable(skb, wlen)) 1567 return 0; 1568 1569 ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0); 1570 } 1571 1572 return dscp; 1573 1574 case htons(ETH_P_IPV6): 1575 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_); 1576 if (unlikely(!buf)) 1577 return 0; 1578 1579 /* Traffic class is in the first and second bytes of ipv6hdr */ 1580 dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2; 1581 1582 if (wash && dscp) { 1583 const int wlen = offset + sizeof(struct ipv6hdr); 1584 1585 if (!pskb_may_pull(skb, wlen) || 1586 skb_try_make_writable(skb, wlen)) 1587 return 0; 1588 1589 ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0); 1590 } 1591 1592 return dscp; 1593 1594 case htons(ETH_P_ARP): 1595 return 0x38; /* CS7 - Net Control */ 1596 1597 default: 1598 /* If there is no Diffserv field, treat as best-effort */ 1599 return 0; 1600 } 1601 } 1602 1603 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, 1604 struct sk_buff *skb) 1605 { 1606 struct cake_sched_data *q = qdisc_priv(sch); 1607 u32 tin, mark; 1608 bool wash; 1609 u8 dscp; 1610 1611 /* Tin selection: Default to diffserv-based selection, allow overriding 1612 * using firewall marks or skb->priority. Call DSCP parsing early if 1613 * wash is enabled, otherwise defer to below to skip unneeded parsing. 
1614 */ 1615 mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft; 1616 wash = !!(q->rate_flags & CAKE_FLAG_WASH); 1617 if (wash) 1618 dscp = cake_handle_diffserv(skb, wash); 1619 1620 if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) 1621 tin = 0; 1622 1623 else if (mark && mark <= q->tin_cnt) 1624 tin = q->tin_order[mark - 1]; 1625 1626 else if (TC_H_MAJ(skb->priority) == sch->handle && 1627 TC_H_MIN(skb->priority) > 0 && 1628 TC_H_MIN(skb->priority) <= q->tin_cnt) 1629 tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; 1630 1631 else { 1632 if (!wash) 1633 dscp = cake_handle_diffserv(skb, wash); 1634 tin = q->tin_index[dscp]; 1635 1636 if (unlikely(tin >= q->tin_cnt)) 1637 tin = 0; 1638 } 1639 1640 return &q->tins[tin]; 1641 } 1642 1643 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, 1644 struct sk_buff *skb, int flow_mode, int *qerr) 1645 { 1646 struct cake_sched_data *q = qdisc_priv(sch); 1647 struct tcf_proto *filter; 1648 struct tcf_result res; 1649 u16 flow = 0, host = 0; 1650 int result; 1651 1652 filter = rcu_dereference_bh(q->filter_list); 1653 if (!filter) 1654 goto hash; 1655 1656 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 1657 result = tcf_classify(skb, NULL, filter, &res, false); 1658 1659 if (result >= 0) { 1660 #ifdef CONFIG_NET_CLS_ACT 1661 switch (result) { 1662 case TC_ACT_STOLEN: 1663 case TC_ACT_QUEUED: 1664 case TC_ACT_TRAP: 1665 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; 1666 fallthrough; 1667 case TC_ACT_SHOT: 1668 return 0; 1669 } 1670 #endif 1671 if (TC_H_MIN(res.classid) <= CAKE_QUEUES) 1672 flow = TC_H_MIN(res.classid); 1673 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16)) 1674 host = TC_H_MAJ(res.classid) >> 16; 1675 } 1676 hash: 1677 *t = cake_select_tin(sch, skb); 1678 return cake_hash(*t, skb, flow_mode, flow, host) + 1; 1679 } 1680 1681 static void cake_reconfigure(struct Qdisc *sch); 1682 1683 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1684 struct sk_buff **to_free) 1685 { 1686 struct cake_sched_data *q = qdisc_priv(sch); 1687 int len = qdisc_pkt_len(skb); 1688 int ret; 1689 struct sk_buff *ack = NULL; 1690 ktime_t now = ktime_get(); 1691 struct cake_tin_data *b; 1692 struct cake_flow *flow; 1693 u32 idx; 1694 1695 /* choose flow to insert into */ 1696 idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); 1697 if (idx == 0) { 1698 if (ret & __NET_XMIT_BYPASS) 1699 qdisc_qstats_drop(sch); 1700 __qdisc_drop(skb, to_free); 1701 return ret; 1702 } 1703 idx--; 1704 flow = &b->flows[idx]; 1705 1706 /* ensure shaper state isn't stale */ 1707 if (!b->tin_backlog) { 1708 if (ktime_before(b->time_next_packet, now)) 1709 b->time_next_packet = now; 1710 1711 if (!sch->q.qlen) { 1712 if (ktime_before(q->time_next_packet, now)) { 1713 q->failsafe_next_packet = now; 1714 q->time_next_packet = now; 1715 } else if (ktime_after(q->time_next_packet, now) && 1716 ktime_after(q->failsafe_next_packet, now)) { 1717 u64 next = \ 1718 min(ktime_to_ns(q->time_next_packet), 1719 ktime_to_ns( 1720 q->failsafe_next_packet)); 1721 sch->qstats.overlimits++; 1722 qdisc_watchdog_schedule_ns(&q->watchdog, next); 1723 } 1724 } 1725 } 1726 1727 if (unlikely(len > b->max_skblen)) 1728 b->max_skblen = len; 1729 1730 if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { 1731 struct sk_buff *segs, *nskb; 1732 netdev_features_t features = netif_skb_features(skb); 1733 unsigned int slen = 0, numsegs = 0; 1734 1735 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 1736 if (IS_ERR_OR_NULL(segs)) 1737 return qdisc_drop(skb, sch, to_free); 
1738 1739 skb_list_walk_safe(segs, segs, nskb) { 1740 skb_mark_not_on_list(segs); 1741 qdisc_skb_cb(segs)->pkt_len = segs->len; 1742 cobalt_set_enqueue_time(segs, now); 1743 get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, 1744 segs); 1745 flow_queue_add(flow, segs); 1746 1747 sch->q.qlen++; 1748 numsegs++; 1749 slen += segs->len; 1750 q->buffer_used += segs->truesize; 1751 b->packets++; 1752 } 1753 1754 /* stats */ 1755 b->bytes += slen; 1756 b->backlogs[idx] += slen; 1757 b->tin_backlog += slen; 1758 sch->qstats.backlog += slen; 1759 q->avg_window_bytes += slen; 1760 1761 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen); 1762 consume_skb(skb); 1763 } else { 1764 /* not splitting */ 1765 cobalt_set_enqueue_time(skb, now); 1766 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); 1767 flow_queue_add(flow, skb); 1768 1769 if (q->ack_filter) 1770 ack = cake_ack_filter(q, flow); 1771 1772 if (ack) { 1773 b->ack_drops++; 1774 sch->qstats.drops++; 1775 b->bytes += qdisc_pkt_len(ack); 1776 len -= qdisc_pkt_len(ack); 1777 q->buffer_used += skb->truesize - ack->truesize; 1778 if (q->rate_flags & CAKE_FLAG_INGRESS) 1779 cake_advance_shaper(q, b, ack, now, true); 1780 1781 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); 1782 consume_skb(ack); 1783 } else { 1784 sch->q.qlen++; 1785 q->buffer_used += skb->truesize; 1786 } 1787 1788 /* stats */ 1789 b->packets++; 1790 b->bytes += len; 1791 b->backlogs[idx] += len; 1792 b->tin_backlog += len; 1793 sch->qstats.backlog += len; 1794 q->avg_window_bytes += len; 1795 } 1796 1797 if (q->overflow_timeout) 1798 cake_heapify_up(q, b->overflow_idx[idx]); 1799 1800 /* incoming bandwidth capacity estimate */ 1801 if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { 1802 u64 packet_interval = \ 1803 ktime_to_ns(ktime_sub(now, q->last_packet_time)); 1804 1805 if (packet_interval > NSEC_PER_SEC) 1806 packet_interval = NSEC_PER_SEC; 1807 1808 /* filter out short-term bursts, eg. wifi aggregation */ 1809 q->avg_packet_interval = \ 1810 cake_ewma(q->avg_packet_interval, 1811 packet_interval, 1812 (packet_interval > q->avg_packet_interval ? 1813 2 : 8)); 1814 1815 q->last_packet_time = now; 1816 1817 if (packet_interval > q->avg_packet_interval) { 1818 u64 window_interval = \ 1819 ktime_to_ns(ktime_sub(now, 1820 q->avg_window_begin)); 1821 u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC; 1822 1823 b = div64_u64(b, window_interval); 1824 q->avg_peak_bandwidth = 1825 cake_ewma(q->avg_peak_bandwidth, b, 1826 b > q->avg_peak_bandwidth ? 
2 : 8); 1827 q->avg_window_bytes = 0; 1828 q->avg_window_begin = now; 1829 1830 if (ktime_after(now, 1831 ktime_add_ms(q->last_reconfig_time, 1832 250))) { 1833 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; 1834 cake_reconfigure(sch); 1835 } 1836 } 1837 } else { 1838 q->avg_window_bytes = 0; 1839 q->last_packet_time = now; 1840 } 1841 1842 /* flowchain */ 1843 if (!flow->set || flow->set == CAKE_SET_DECAYING) { 1844 struct cake_host *srchost = &b->hosts[flow->srchost]; 1845 struct cake_host *dsthost = &b->hosts[flow->dsthost]; 1846 u16 host_load = 1; 1847 1848 if (!flow->set) { 1849 list_add_tail(&flow->flowchain, &b->new_flows); 1850 } else { 1851 b->decaying_flow_count--; 1852 list_move_tail(&flow->flowchain, &b->new_flows); 1853 } 1854 flow->set = CAKE_SET_SPARSE; 1855 b->sparse_flow_count++; 1856 1857 if (cake_dsrc(q->flow_mode)) 1858 host_load = max(host_load, srchost->srchost_bulk_flow_count); 1859 1860 if (cake_ddst(q->flow_mode)) 1861 host_load = max(host_load, dsthost->dsthost_bulk_flow_count); 1862 1863 flow->deficit = (b->flow_quantum * 1864 quantum_div[host_load]) >> 16; 1865 } else if (flow->set == CAKE_SET_SPARSE_WAIT) { 1866 struct cake_host *srchost = &b->hosts[flow->srchost]; 1867 struct cake_host *dsthost = &b->hosts[flow->dsthost]; 1868 1869 /* this flow was empty, accounted as a sparse flow, but actually 1870 * in the bulk rotation. 1871 */ 1872 flow->set = CAKE_SET_BULK; 1873 b->sparse_flow_count--; 1874 b->bulk_flow_count++; 1875 1876 if (cake_dsrc(q->flow_mode)) 1877 srchost->srchost_bulk_flow_count++; 1878 1879 if (cake_ddst(q->flow_mode)) 1880 dsthost->dsthost_bulk_flow_count++; 1881 1882 } 1883 1884 if (q->buffer_used > q->buffer_max_used) 1885 q->buffer_max_used = q->buffer_used; 1886 1887 if (q->buffer_used > q->buffer_limit) { 1888 u32 dropped = 0; 1889 1890 while (q->buffer_used > q->buffer_limit) { 1891 dropped++; 1892 cake_drop(sch, to_free); 1893 } 1894 b->drop_overlimit += dropped; 1895 } 1896 return NET_XMIT_SUCCESS; 1897 } 1898 1899 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch) 1900 { 1901 struct cake_sched_data *q = qdisc_priv(sch); 1902 struct cake_tin_data *b = &q->tins[q->cur_tin]; 1903 struct cake_flow *flow = &b->flows[q->cur_flow]; 1904 struct sk_buff *skb = NULL; 1905 u32 len; 1906 1907 if (flow->head) { 1908 skb = dequeue_head(flow); 1909 len = qdisc_pkt_len(skb); 1910 b->backlogs[q->cur_flow] -= len; 1911 b->tin_backlog -= len; 1912 sch->qstats.backlog -= len; 1913 q->buffer_used -= skb->truesize; 1914 sch->q.qlen--; 1915 1916 if (q->overflow_timeout) 1917 cake_heapify(q, b->overflow_idx[q->cur_flow]); 1918 } 1919 return skb; 1920 } 1921 1922 /* Discard leftover packets from a tin no longer in use. 
*/ 1923 static void cake_clear_tin(struct Qdisc *sch, u16 tin) 1924 { 1925 struct cake_sched_data *q = qdisc_priv(sch); 1926 struct sk_buff *skb; 1927 1928 q->cur_tin = tin; 1929 for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) 1930 while (!!(skb = cake_dequeue_one(sch))) 1931 kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); 1932 } 1933 1934 static struct sk_buff *cake_dequeue(struct Qdisc *sch) 1935 { 1936 struct cake_sched_data *q = qdisc_priv(sch); 1937 struct cake_tin_data *b = &q->tins[q->cur_tin]; 1938 struct cake_host *srchost, *dsthost; 1939 enum skb_drop_reason reason; 1940 ktime_t now = ktime_get(); 1941 struct cake_flow *flow; 1942 struct list_head *head; 1943 bool first_flow = true; 1944 struct sk_buff *skb; 1945 u16 host_load; 1946 u64 delay; 1947 u32 len; 1948 1949 begin: 1950 if (!sch->q.qlen) 1951 return NULL; 1952 1953 /* global hard shaper */ 1954 if (ktime_after(q->time_next_packet, now) && 1955 ktime_after(q->failsafe_next_packet, now)) { 1956 u64 next = min(ktime_to_ns(q->time_next_packet), 1957 ktime_to_ns(q->failsafe_next_packet)); 1958 1959 sch->qstats.overlimits++; 1960 qdisc_watchdog_schedule_ns(&q->watchdog, next); 1961 return NULL; 1962 } 1963 1964 /* Choose a class to work on. */ 1965 if (!q->rate_ns) { 1966 /* In unlimited mode, can't rely on shaper timings, just balance 1967 * with DRR 1968 */ 1969 bool wrapped = false, empty = true; 1970 1971 while (b->tin_deficit < 0 || 1972 !(b->sparse_flow_count + b->bulk_flow_count)) { 1973 if (b->tin_deficit <= 0) 1974 b->tin_deficit += b->tin_quantum; 1975 if (b->sparse_flow_count + b->bulk_flow_count) 1976 empty = false; 1977 1978 q->cur_tin++; 1979 b++; 1980 if (q->cur_tin >= q->tin_cnt) { 1981 q->cur_tin = 0; 1982 b = q->tins; 1983 1984 if (wrapped) { 1985 /* It's possible for q->qlen to be 1986 * nonzero when we actually have no 1987 * packets anywhere. 1988 */ 1989 if (empty) 1990 return NULL; 1991 } else { 1992 wrapped = true; 1993 } 1994 } 1995 } 1996 } else { 1997 /* In shaped mode, choose: 1998 * - Highest-priority tin with queue and meeting schedule, or 1999 * - The earliest-scheduled tin with queue. 2000 */ 2001 ktime_t best_time = KTIME_MAX; 2002 int tin, best_tin = 0; 2003 2004 for (tin = 0; tin < q->tin_cnt; tin++) { 2005 b = q->tins + tin; 2006 if ((b->sparse_flow_count + b->bulk_flow_count) > 0) { 2007 ktime_t time_to_pkt = \ 2008 ktime_sub(b->time_next_packet, now); 2009 2010 if (ktime_to_ns(time_to_pkt) <= 0 || 2011 ktime_compare(time_to_pkt, 2012 best_time) <= 0) { 2013 best_time = time_to_pkt; 2014 best_tin = tin; 2015 } 2016 } 2017 } 2018 2019 q->cur_tin = best_tin; 2020 b = q->tins + best_tin; 2021 2022 /* No point in going further if no packets to deliver. 
*/ 2023 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count))) 2024 return NULL; 2025 } 2026 2027 retry: 2028 /* service this class */ 2029 head = &b->decaying_flows; 2030 if (!first_flow || list_empty(head)) { 2031 head = &b->new_flows; 2032 if (list_empty(head)) { 2033 head = &b->old_flows; 2034 if (unlikely(list_empty(head))) { 2035 head = &b->decaying_flows; 2036 if (unlikely(list_empty(head))) 2037 goto begin; 2038 } 2039 } 2040 } 2041 flow = list_first_entry(head, struct cake_flow, flowchain); 2042 q->cur_flow = flow - b->flows; 2043 first_flow = false; 2044 2045 /* triple isolation (modified DRR++) */ 2046 srchost = &b->hosts[flow->srchost]; 2047 dsthost = &b->hosts[flow->dsthost]; 2048 host_load = 1; 2049 2050 /* flow isolation (DRR++) */ 2051 if (flow->deficit <= 0) { 2052 /* Keep all flows with deficits out of the sparse and decaying 2053 * rotations. No non-empty flow can go into the decaying 2054 * rotation, so they can't get deficits 2055 */ 2056 if (flow->set == CAKE_SET_SPARSE) { 2057 if (flow->head) { 2058 b->sparse_flow_count--; 2059 b->bulk_flow_count++; 2060 2061 if (cake_dsrc(q->flow_mode)) 2062 srchost->srchost_bulk_flow_count++; 2063 2064 if (cake_ddst(q->flow_mode)) 2065 dsthost->dsthost_bulk_flow_count++; 2066 2067 flow->set = CAKE_SET_BULK; 2068 } else { 2069 /* we've moved it to the bulk rotation for 2070 * correct deficit accounting but we still want 2071 * to count it as a sparse flow, not a bulk one. 2072 */ 2073 flow->set = CAKE_SET_SPARSE_WAIT; 2074 } 2075 } 2076 2077 if (cake_dsrc(q->flow_mode)) 2078 host_load = max(host_load, srchost->srchost_bulk_flow_count); 2079 2080 if (cake_ddst(q->flow_mode)) 2081 host_load = max(host_load, dsthost->dsthost_bulk_flow_count); 2082 2083 WARN_ON(host_load > CAKE_QUEUES); 2084 2085 /* The get_random_u16() is a way to apply dithering to avoid 2086 * accumulating roundoff errors 2087 */ 2088 flow->deficit += (b->flow_quantum * quantum_div[host_load] + 2089 get_random_u16()) >> 16; 2090 list_move_tail(&flow->flowchain, &b->old_flows); 2091 2092 goto retry; 2093 } 2094 2095 /* Retrieve a packet via the AQM */ 2096 while (1) { 2097 skb = cake_dequeue_one(sch); 2098 if (!skb) { 2099 /* this queue was actually empty */ 2100 if (cobalt_queue_empty(&flow->cvars, &b->cparams, now)) 2101 b->unresponsive_flow_count--; 2102 2103 if (flow->cvars.p_drop || flow->cvars.count || 2104 ktime_before(now, flow->cvars.drop_next)) { 2105 /* keep in the flowchain until the state has 2106 * decayed to rest 2107 */ 2108 list_move_tail(&flow->flowchain, 2109 &b->decaying_flows); 2110 if (flow->set == CAKE_SET_BULK) { 2111 b->bulk_flow_count--; 2112 2113 if (cake_dsrc(q->flow_mode)) 2114 srchost->srchost_bulk_flow_count--; 2115 2116 if (cake_ddst(q->flow_mode)) 2117 dsthost->dsthost_bulk_flow_count--; 2118 2119 b->decaying_flow_count++; 2120 } else if (flow->set == CAKE_SET_SPARSE || 2121 flow->set == CAKE_SET_SPARSE_WAIT) { 2122 b->sparse_flow_count--; 2123 b->decaying_flow_count++; 2124 } 2125 flow->set = CAKE_SET_DECAYING; 2126 } else { 2127 /* remove empty queue from the flowchain */ 2128 list_del_init(&flow->flowchain); 2129 if (flow->set == CAKE_SET_SPARSE || 2130 flow->set == CAKE_SET_SPARSE_WAIT) 2131 b->sparse_flow_count--; 2132 else if (flow->set == CAKE_SET_BULK) { 2133 b->bulk_flow_count--; 2134 2135 if (cake_dsrc(q->flow_mode)) 2136 srchost->srchost_bulk_flow_count--; 2137 2138 if (cake_ddst(q->flow_mode)) 2139 dsthost->dsthost_bulk_flow_count--; 2140 2141 } else 2142 b->decaying_flow_count--; 2143 2144 flow->set = CAKE_SET_NONE; 
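/* The flow slot is now completely idle; if its hash is seen again,
 * cake_enqueue() will re-add it to new_flows as CAKE_SET_SPARSE.
 */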
2145 } 2146 goto begin; 2147 } 2148 2149 reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, 2150 (b->bulk_flow_count * 2151 !!(q->rate_flags & 2152 CAKE_FLAG_INGRESS))); 2153 /* Last packet in queue may be marked, shouldn't be dropped */ 2154 if (reason == SKB_NOT_DROPPED_YET || !flow->head) 2155 break; 2156 2157 /* drop this packet, get another one */ 2158 if (q->rate_flags & CAKE_FLAG_INGRESS) { 2159 len = cake_advance_shaper(q, b, skb, 2160 now, true); 2161 flow->deficit -= len; 2162 b->tin_deficit -= len; 2163 } 2164 flow->dropped++; 2165 b->tin_dropped++; 2166 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); 2167 qdisc_qstats_drop(sch); 2168 kfree_skb_reason(skb, reason); 2169 if (q->rate_flags & CAKE_FLAG_INGRESS) 2170 goto retry; 2171 } 2172 2173 b->tin_ecn_mark += !!flow->cvars.ecn_marked; 2174 qdisc_bstats_update(sch, skb); 2175 2176 /* collect delay stats */ 2177 delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); 2178 b->avge_delay = cake_ewma(b->avge_delay, delay, 8); 2179 b->peak_delay = cake_ewma(b->peak_delay, delay, 2180 delay > b->peak_delay ? 2 : 8); 2181 b->base_delay = cake_ewma(b->base_delay, delay, 2182 delay < b->base_delay ? 2 : 8); 2183 2184 len = cake_advance_shaper(q, b, skb, now, false); 2185 flow->deficit -= len; 2186 b->tin_deficit -= len; 2187 2188 if (ktime_after(q->time_next_packet, now) && sch->q.qlen) { 2189 u64 next = min(ktime_to_ns(q->time_next_packet), 2190 ktime_to_ns(q->failsafe_next_packet)); 2191 2192 qdisc_watchdog_schedule_ns(&q->watchdog, next); 2193 } else if (!sch->q.qlen) { 2194 int i; 2195 2196 for (i = 0; i < q->tin_cnt; i++) { 2197 if (q->tins[i].decaying_flow_count) { 2198 ktime_t next = \ 2199 ktime_add_ns(now, 2200 q->tins[i].cparams.target); 2201 2202 qdisc_watchdog_schedule_ns(&q->watchdog, 2203 ktime_to_ns(next)); 2204 break; 2205 } 2206 } 2207 } 2208 2209 if (q->overflow_timeout) 2210 q->overflow_timeout--; 2211 2212 return skb; 2213 } 2214 2215 static void cake_reset(struct Qdisc *sch) 2216 { 2217 struct cake_sched_data *q = qdisc_priv(sch); 2218 u32 c; 2219 2220 if (!q->tins) 2221 return; 2222 2223 for (c = 0; c < CAKE_MAX_TINS; c++) 2224 cake_clear_tin(sch, c); 2225 } 2226 2227 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = { 2228 [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 }, 2229 [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 }, 2230 [TCA_CAKE_ATM] = { .type = NLA_U32 }, 2231 [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 }, 2232 [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 }, 2233 [TCA_CAKE_RTT] = { .type = NLA_U32 }, 2234 [TCA_CAKE_TARGET] = { .type = NLA_U32 }, 2235 [TCA_CAKE_AUTORATE] = { .type = NLA_U32 }, 2236 [TCA_CAKE_MEMORY] = { .type = NLA_U32 }, 2237 [TCA_CAKE_NAT] = { .type = NLA_U32 }, 2238 [TCA_CAKE_RAW] = { .type = NLA_U32 }, 2239 [TCA_CAKE_WASH] = { .type = NLA_U32 }, 2240 [TCA_CAKE_MPU] = { .type = NLA_U32 }, 2241 [TCA_CAKE_INGRESS] = { .type = NLA_U32 }, 2242 [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 }, 2243 [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 }, 2244 [TCA_CAKE_FWMARK] = { .type = NLA_U32 }, 2245 }; 2246 2247 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, 2248 u64 target_ns, u64 rtt_est_ns) 2249 { 2250 /* convert byte-rate into time-per-byte 2251 * so it will always unwedge in reasonable time. 
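 *
 * Worked example: at rate = 100 Mbit/s (12.5e6 bytes/s) the loop below
 * ends with rate_shft == 27 and rate_ns == 0x280000000, so
 * (len * rate_ns) >> rate_shft advances the shaper clock by 80 ns per
 * byte, i.e. roughly 121 us for a 1514-byte packet.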
2252 */ 2253 static const u64 MIN_RATE = 64; 2254 u32 byte_target = mtu; 2255 u64 byte_target_ns; 2256 u8 rate_shft = 0; 2257 u64 rate_ns = 0; 2258 2259 b->flow_quantum = 1514; 2260 if (rate) { 2261 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL); 2262 rate_shft = 34; 2263 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft; 2264 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate)); 2265 while (!!(rate_ns >> 34)) { 2266 rate_ns >>= 1; 2267 rate_shft--; 2268 } 2269 } /* else unlimited, ie. zero delay */ 2270 2271 b->tin_rate_bps = rate; 2272 b->tin_rate_ns = rate_ns; 2273 b->tin_rate_shft = rate_shft; 2274 2275 byte_target_ns = (byte_target * rate_ns) >> rate_shft; 2276 2277 b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); 2278 b->cparams.interval = max(rtt_est_ns + 2279 b->cparams.target - target_ns, 2280 b->cparams.target * 2); 2281 b->cparams.mtu_time = byte_target_ns; 2282 b->cparams.p_inc = 1 << 24; /* 1/256 */ 2283 b->cparams.p_dec = 1 << 20; /* 1/4096 */ 2284 } 2285 2286 static int cake_config_besteffort(struct Qdisc *sch) 2287 { 2288 struct cake_sched_data *q = qdisc_priv(sch); 2289 struct cake_tin_data *b = &q->tins[0]; 2290 u32 mtu = psched_mtu(qdisc_dev(sch)); 2291 u64 rate = q->rate_bps; 2292 2293 q->tin_cnt = 1; 2294 2295 q->tin_index = besteffort; 2296 q->tin_order = normal_order; 2297 2298 cake_set_rate(b, rate, mtu, 2299 us_to_ns(q->target), us_to_ns(q->interval)); 2300 b->tin_quantum = 65535; 2301 2302 return 0; 2303 } 2304 2305 static int cake_config_precedence(struct Qdisc *sch) 2306 { 2307 /* convert high-level (user visible) parameters into internal format */ 2308 struct cake_sched_data *q = qdisc_priv(sch); 2309 u32 mtu = psched_mtu(qdisc_dev(sch)); 2310 u64 rate = q->rate_bps; 2311 u32 quantum = 256; 2312 u32 i; 2313 2314 q->tin_cnt = 8; 2315 q->tin_index = precedence; 2316 q->tin_order = normal_order; 2317 2318 for (i = 0; i < q->tin_cnt; i++) { 2319 struct cake_tin_data *b = &q->tins[i]; 2320 2321 cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2322 us_to_ns(q->interval)); 2323 2324 b->tin_quantum = max_t(u16, 1U, quantum); 2325 2326 /* calculate next class's parameters */ 2327 rate *= 7; 2328 rate >>= 3; 2329 2330 quantum *= 7; 2331 quantum >>= 3; 2332 } 2333 2334 return 0; 2335 } 2336 2337 /* List of known Diffserv codepoints: 2338 * 2339 * Default Forwarding (DF/CS0) - Best Effort 2340 * Max Throughput (TOS2) 2341 * Min Delay (TOS4) 2342 * LLT "La" (TOS5) 2343 * Assured Forwarding 1 (AF1x) - x3 2344 * Assured Forwarding 2 (AF2x) - x3 2345 * Assured Forwarding 3 (AF3x) - x3 2346 * Assured Forwarding 4 (AF4x) - x3 2347 * Precedence Class 1 (CS1) 2348 * Precedence Class 2 (CS2) 2349 * Precedence Class 3 (CS3) 2350 * Precedence Class 4 (CS4) 2351 * Precedence Class 5 (CS5) 2352 * Precedence Class 6 (CS6) 2353 * Precedence Class 7 (CS7) 2354 * Voice Admit (VA) 2355 * Expedited Forwarding (EF) 2356 * Lower Effort (LE) 2357 * 2358 * Total 26 codepoints. 2359 */ 2360 2361 /* List of traffic classes in RFC 4594, updated by RFC 8622: 2362 * (roughly descending order of contended priority) 2363 * (roughly ascending order of uncontended throughput) 2364 * 2365 * Network Control (CS6,CS7) - routing traffic 2366 * Telephony (EF,VA) - aka. VoIP streams 2367 * Signalling (CS5) - VoIP setup 2368 * Multimedia Conferencing (AF4x) - aka. video calls 2369 * Realtime Interactive (CS4) - eg. games 2370 * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch 2371 * Broadcast Video (CS3) 2372 * Low-Latency Data (AF2x,TOS4) - eg. 
database 2373 * Ops, Admin, Management (CS2) - eg. ssh 2374 * Standard Service (DF & unrecognised codepoints) 2375 * High-Throughput Data (AF1x,TOS2) - eg. web traffic 2376 * Low-Priority Data (LE,CS1) - eg. BitTorrent 2377 * 2378 * Total 12 traffic classes. 2379 */ 2380 2381 static int cake_config_diffserv8(struct Qdisc *sch) 2382 { 2383 /* Pruned list of traffic classes for typical applications: 2384 * 2385 * Network Control (CS6, CS7) 2386 * Minimum Latency (EF, VA, CS5, CS4) 2387 * Interactive Shell (CS2) 2388 * Low Latency Transactions (AF2x, TOS4) 2389 * Video Streaming (AF4x, AF3x, CS3) 2390 * Bog Standard (DF etc.) 2391 * High Throughput (AF1x, TOS2, CS1) 2392 * Background Traffic (LE) 2393 * 2394 * Total 8 traffic classes. 2395 */ 2396 2397 struct cake_sched_data *q = qdisc_priv(sch); 2398 u32 mtu = psched_mtu(qdisc_dev(sch)); 2399 u64 rate = q->rate_bps; 2400 u32 quantum = 256; 2401 u32 i; 2402 2403 q->tin_cnt = 8; 2404 2405 /* codepoint to class mapping */ 2406 q->tin_index = diffserv8; 2407 q->tin_order = normal_order; 2408 2409 /* class characteristics */ 2410 for (i = 0; i < q->tin_cnt; i++) { 2411 struct cake_tin_data *b = &q->tins[i]; 2412 2413 cake_set_rate(b, rate, mtu, us_to_ns(q->target), 2414 us_to_ns(q->interval)); 2415 2416 b->tin_quantum = max_t(u16, 1U, quantum); 2417 2418 /* calculate next class's parameters */ 2419 rate *= 7; 2420 rate >>= 3; 2421 2422 quantum *= 7; 2423 quantum >>= 3; 2424 } 2425 2426 return 0; 2427 } 2428 2429 static int cake_config_diffserv4(struct Qdisc *sch) 2430 { 2431 /* Further pruned list of traffic classes for four-class system: 2432 * 2433 * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4) 2434 * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2) 2435 * Best Effort (DF, AF1x, TOS2, and those not specified) 2436 * Background Traffic (LE, CS1) 2437 * 2438 * Total 4 traffic classes. 2439 */ 2440 2441 struct cake_sched_data *q = qdisc_priv(sch); 2442 u32 mtu = psched_mtu(qdisc_dev(sch)); 2443 u64 rate = q->rate_bps; 2444 u32 quantum = 1024; 2445 2446 q->tin_cnt = 4; 2447 2448 /* codepoint to class mapping */ 2449 q->tin_index = diffserv4; 2450 q->tin_order = bulk_order; 2451 2452 /* class characteristics */ 2453 cake_set_rate(&q->tins[0], rate, mtu, 2454 us_to_ns(q->target), us_to_ns(q->interval)); 2455 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2456 us_to_ns(q->target), us_to_ns(q->interval)); 2457 cake_set_rate(&q->tins[2], rate >> 1, mtu, 2458 us_to_ns(q->target), us_to_ns(q->interval)); 2459 cake_set_rate(&q->tins[3], rate >> 2, mtu, 2460 us_to_ns(q->target), us_to_ns(q->interval)); 2461 2462 /* bandwidth-sharing weights */ 2463 q->tins[0].tin_quantum = quantum; 2464 q->tins[1].tin_quantum = quantum >> 4; 2465 q->tins[2].tin_quantum = quantum >> 1; 2466 q->tins[3].tin_quantum = quantum >> 2; 2467 2468 return 0; 2469 } 2470 2471 static int cake_config_diffserv3(struct Qdisc *sch) 2472 { 2473 /* Simplified Diffserv structure with 3 tins. 
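 * (This is the default tin_mode set up by cake_init() and the fallback
 *  chosen by cake_reconfigure() for unrecognised modes.)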
2474 * Latency Sensitive (CS7, CS6, EF, VA, TOS4) 2475 * Best Effort 2476 * Low Priority (LE, CS1) 2477 */ 2478 struct cake_sched_data *q = qdisc_priv(sch); 2479 u32 mtu = psched_mtu(qdisc_dev(sch)); 2480 u64 rate = q->rate_bps; 2481 u32 quantum = 1024; 2482 2483 q->tin_cnt = 3; 2484 2485 /* codepoint to class mapping */ 2486 q->tin_index = diffserv3; 2487 q->tin_order = bulk_order; 2488 2489 /* class characteristics */ 2490 cake_set_rate(&q->tins[0], rate, mtu, 2491 us_to_ns(q->target), us_to_ns(q->interval)); 2492 cake_set_rate(&q->tins[1], rate >> 4, mtu, 2493 us_to_ns(q->target), us_to_ns(q->interval)); 2494 cake_set_rate(&q->tins[2], rate >> 2, mtu, 2495 us_to_ns(q->target), us_to_ns(q->interval)); 2496 2497 /* bandwidth-sharing weights */ 2498 q->tins[0].tin_quantum = quantum; 2499 q->tins[1].tin_quantum = quantum >> 4; 2500 q->tins[2].tin_quantum = quantum >> 2; 2501 2502 return 0; 2503 } 2504 2505 static void cake_reconfigure(struct Qdisc *sch) 2506 { 2507 struct cake_sched_data *q = qdisc_priv(sch); 2508 int c, ft; 2509 2510 switch (q->tin_mode) { 2511 case CAKE_DIFFSERV_BESTEFFORT: 2512 ft = cake_config_besteffort(sch); 2513 break; 2514 2515 case CAKE_DIFFSERV_PRECEDENCE: 2516 ft = cake_config_precedence(sch); 2517 break; 2518 2519 case CAKE_DIFFSERV_DIFFSERV8: 2520 ft = cake_config_diffserv8(sch); 2521 break; 2522 2523 case CAKE_DIFFSERV_DIFFSERV4: 2524 ft = cake_config_diffserv4(sch); 2525 break; 2526 2527 case CAKE_DIFFSERV_DIFFSERV3: 2528 default: 2529 ft = cake_config_diffserv3(sch); 2530 break; 2531 } 2532 2533 for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { 2534 cake_clear_tin(sch, c); 2535 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; 2536 } 2537 2538 q->rate_ns = q->tins[ft].tin_rate_ns; 2539 q->rate_shft = q->tins[ft].tin_rate_shft; 2540 2541 if (q->buffer_config_limit) { 2542 q->buffer_limit = q->buffer_config_limit; 2543 } else if (q->rate_bps) { 2544 u64 t = q->rate_bps * q->interval; 2545 2546 do_div(t, USEC_PER_SEC / 4); 2547 q->buffer_limit = max_t(u32, t, 4U << 20); 2548 } else { 2549 q->buffer_limit = ~0; 2550 } 2551 2552 sch->flags &= ~TCQ_F_CAN_BYPASS; 2553 2554 q->buffer_limit = min(q->buffer_limit, 2555 max(sch->limit * psched_mtu(qdisc_dev(sch)), 2556 q->buffer_config_limit)); 2557 } 2558 2559 static int cake_change(struct Qdisc *sch, struct nlattr *opt, 2560 struct netlink_ext_ack *extack) 2561 { 2562 struct cake_sched_data *q = qdisc_priv(sch); 2563 struct nlattr *tb[TCA_CAKE_MAX + 1]; 2564 u16 rate_flags; 2565 u8 flow_mode; 2566 int err; 2567 2568 err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, 2569 extack); 2570 if (err < 0) 2571 return err; 2572 2573 flow_mode = q->flow_mode; 2574 if (tb[TCA_CAKE_NAT]) { 2575 #if IS_ENABLED(CONFIG_NF_CONNTRACK) 2576 flow_mode &= ~CAKE_FLOW_NAT_FLAG; 2577 flow_mode |= CAKE_FLOW_NAT_FLAG * 2578 !!nla_get_u32(tb[TCA_CAKE_NAT]); 2579 #else 2580 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT], 2581 "No conntrack support in kernel"); 2582 return -EOPNOTSUPP; 2583 #endif 2584 } 2585 2586 if (tb[TCA_CAKE_BASE_RATE64]) 2587 WRITE_ONCE(q->rate_bps, 2588 nla_get_u64(tb[TCA_CAKE_BASE_RATE64])); 2589 2590 if (tb[TCA_CAKE_DIFFSERV_MODE]) 2591 WRITE_ONCE(q->tin_mode, 2592 nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE])); 2593 2594 rate_flags = q->rate_flags; 2595 if (tb[TCA_CAKE_WASH]) { 2596 if (!!nla_get_u32(tb[TCA_CAKE_WASH])) 2597 rate_flags |= CAKE_FLAG_WASH; 2598 else 2599 rate_flags &= ~CAKE_FLAG_WASH; 2600 } 2601 2602 if (tb[TCA_CAKE_FLOW_MODE]) 2603 flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) | 2604 
(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) & 2605 CAKE_FLOW_MASK)); 2606 2607 if (tb[TCA_CAKE_ATM]) 2608 WRITE_ONCE(q->atm_mode, 2609 nla_get_u32(tb[TCA_CAKE_ATM])); 2610 2611 if (tb[TCA_CAKE_OVERHEAD]) { 2612 WRITE_ONCE(q->rate_overhead, 2613 nla_get_s32(tb[TCA_CAKE_OVERHEAD])); 2614 rate_flags |= CAKE_FLAG_OVERHEAD; 2615 2616 q->max_netlen = 0; 2617 q->max_adjlen = 0; 2618 q->min_netlen = ~0; 2619 q->min_adjlen = ~0; 2620 } 2621 2622 if (tb[TCA_CAKE_RAW]) { 2623 rate_flags &= ~CAKE_FLAG_OVERHEAD; 2624 2625 q->max_netlen = 0; 2626 q->max_adjlen = 0; 2627 q->min_netlen = ~0; 2628 q->min_adjlen = ~0; 2629 } 2630 2631 if (tb[TCA_CAKE_MPU]) 2632 WRITE_ONCE(q->rate_mpu, 2633 nla_get_u32(tb[TCA_CAKE_MPU])); 2634 2635 if (tb[TCA_CAKE_RTT]) { 2636 u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]); 2637 2638 WRITE_ONCE(q->interval, max(interval, 1U)); 2639 } 2640 2641 if (tb[TCA_CAKE_TARGET]) { 2642 u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]); 2643 2644 WRITE_ONCE(q->target, max(target, 1U)); 2645 } 2646 2647 if (tb[TCA_CAKE_AUTORATE]) { 2648 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) 2649 rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; 2650 else 2651 rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; 2652 } 2653 2654 if (tb[TCA_CAKE_INGRESS]) { 2655 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) 2656 rate_flags |= CAKE_FLAG_INGRESS; 2657 else 2658 rate_flags &= ~CAKE_FLAG_INGRESS; 2659 } 2660 2661 if (tb[TCA_CAKE_ACK_FILTER]) 2662 WRITE_ONCE(q->ack_filter, 2663 nla_get_u32(tb[TCA_CAKE_ACK_FILTER])); 2664 2665 if (tb[TCA_CAKE_MEMORY]) 2666 WRITE_ONCE(q->buffer_config_limit, 2667 nla_get_u32(tb[TCA_CAKE_MEMORY])); 2668 2669 if (tb[TCA_CAKE_SPLIT_GSO]) { 2670 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO])) 2671 rate_flags |= CAKE_FLAG_SPLIT_GSO; 2672 else 2673 rate_flags &= ~CAKE_FLAG_SPLIT_GSO; 2674 } 2675 2676 if (tb[TCA_CAKE_FWMARK]) { 2677 WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK])); 2678 WRITE_ONCE(q->fwmark_shft, 2679 q->fwmark_mask ? 
__ffs(q->fwmark_mask) : 0); 2680 } 2681 2682 WRITE_ONCE(q->rate_flags, rate_flags); 2683 WRITE_ONCE(q->flow_mode, flow_mode); 2684 if (q->tins) { 2685 sch_tree_lock(sch); 2686 cake_reconfigure(sch); 2687 sch_tree_unlock(sch); 2688 } 2689 2690 return 0; 2691 } 2692 2693 static void cake_destroy(struct Qdisc *sch) 2694 { 2695 struct cake_sched_data *q = qdisc_priv(sch); 2696 2697 qdisc_watchdog_cancel(&q->watchdog); 2698 tcf_block_put(q->block); 2699 kvfree(q->tins); 2700 } 2701 2702 static int cake_init(struct Qdisc *sch, struct nlattr *opt, 2703 struct netlink_ext_ack *extack) 2704 { 2705 struct cake_sched_data *q = qdisc_priv(sch); 2706 int i, j, err; 2707 2708 sch->limit = 10240; 2709 q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; 2710 q->flow_mode = CAKE_FLOW_TRIPLE; 2711 2712 q->rate_bps = 0; /* unlimited by default */ 2713 2714 q->interval = 100000; /* 100ms default */ 2715 q->target = 5000; /* 5ms: codel RFC argues 2716 * for 5 to 10% of interval 2717 */ 2718 q->rate_flags |= CAKE_FLAG_SPLIT_GSO; 2719 q->cur_tin = 0; 2720 q->cur_flow = 0; 2721 2722 qdisc_watchdog_init(&q->watchdog, sch); 2723 2724 if (opt) { 2725 err = cake_change(sch, opt, extack); 2726 2727 if (err) 2728 return err; 2729 } 2730 2731 err = tcf_block_get(&q->block, &q->filter_list, sch, extack); 2732 if (err) 2733 return err; 2734 2735 quantum_div[0] = ~0; 2736 for (i = 1; i <= CAKE_QUEUES; i++) 2737 quantum_div[i] = 65535 / i; 2738 2739 q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), 2740 GFP_KERNEL); 2741 if (!q->tins) 2742 return -ENOMEM; 2743 2744 for (i = 0; i < CAKE_MAX_TINS; i++) { 2745 struct cake_tin_data *b = q->tins + i; 2746 2747 INIT_LIST_HEAD(&b->new_flows); 2748 INIT_LIST_HEAD(&b->old_flows); 2749 INIT_LIST_HEAD(&b->decaying_flows); 2750 b->sparse_flow_count = 0; 2751 b->bulk_flow_count = 0; 2752 b->decaying_flow_count = 0; 2753 2754 for (j = 0; j < CAKE_QUEUES; j++) { 2755 struct cake_flow *flow = b->flows + j; 2756 u32 k = j * CAKE_MAX_TINS + i; 2757 2758 INIT_LIST_HEAD(&flow->flowchain); 2759 cobalt_vars_init(&flow->cvars); 2760 2761 q->overflow_heap[k].t = i; 2762 q->overflow_heap[k].b = j; 2763 b->overflow_idx[j] = k; 2764 } 2765 } 2766 2767 cake_reconfigure(sch); 2768 q->avg_peak_bandwidth = q->rate_bps; 2769 q->min_netlen = ~0; 2770 q->min_adjlen = ~0; 2771 return 0; 2772 } 2773 2774 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) 2775 { 2776 struct cake_sched_data *q = qdisc_priv(sch); 2777 struct nlattr *opts; 2778 u16 rate_flags; 2779 u8 flow_mode; 2780 2781 opts = nla_nest_start_noflag(skb, TCA_OPTIONS); 2782 if (!opts) 2783 goto nla_put_failure; 2784 2785 if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, 2786 READ_ONCE(q->rate_bps), TCA_CAKE_PAD)) 2787 goto nla_put_failure; 2788 2789 flow_mode = READ_ONCE(q->flow_mode); 2790 if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK)) 2791 goto nla_put_failure; 2792 2793 if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval))) 2794 goto nla_put_failure; 2795 2796 if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target))) 2797 goto nla_put_failure; 2798 2799 if (nla_put_u32(skb, TCA_CAKE_MEMORY, 2800 READ_ONCE(q->buffer_config_limit))) 2801 goto nla_put_failure; 2802 2803 rate_flags = READ_ONCE(q->rate_flags); 2804 if (nla_put_u32(skb, TCA_CAKE_AUTORATE, 2805 !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS))) 2806 goto nla_put_failure; 2807 2808 if (nla_put_u32(skb, TCA_CAKE_INGRESS, 2809 !!(rate_flags & CAKE_FLAG_INGRESS))) 2810 goto nla_put_failure; 2811 2812 if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, 
READ_ONCE(q->ack_filter))) 2813 goto nla_put_failure; 2814 2815 if (nla_put_u32(skb, TCA_CAKE_NAT, 2816 !!(flow_mode & CAKE_FLOW_NAT_FLAG))) 2817 goto nla_put_failure; 2818 2819 if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode))) 2820 goto nla_put_failure; 2821 2822 if (nla_put_u32(skb, TCA_CAKE_WASH, 2823 !!(rate_flags & CAKE_FLAG_WASH))) 2824 goto nla_put_failure; 2825 2826 if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead))) 2827 goto nla_put_failure; 2828 2829 if (!(rate_flags & CAKE_FLAG_OVERHEAD)) 2830 if (nla_put_u32(skb, TCA_CAKE_RAW, 0)) 2831 goto nla_put_failure; 2832 2833 if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode))) 2834 goto nla_put_failure; 2835 2836 if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu))) 2837 goto nla_put_failure; 2838 2839 if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO, 2840 !!(rate_flags & CAKE_FLAG_SPLIT_GSO))) 2841 goto nla_put_failure; 2842 2843 if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask))) 2844 goto nla_put_failure; 2845 2846 return nla_nest_end(skb, opts); 2847 2848 nla_put_failure: 2849 return -1; 2850 } 2851 2852 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 2853 { 2854 struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 2855 struct cake_sched_data *q = qdisc_priv(sch); 2856 struct nlattr *tstats, *ts; 2857 int i; 2858 2859 if (!stats) 2860 return -1; 2861 2862 #define PUT_STAT_U32(attr, data) do { \ 2863 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 2864 goto nla_put_failure; \ 2865 } while (0) 2866 #define PUT_STAT_U64(attr, data) do { \ 2867 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \ 2868 data, TCA_CAKE_STATS_PAD)) \ 2869 goto nla_put_failure; \ 2870 } while (0) 2871 2872 PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth); 2873 PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit); 2874 PUT_STAT_U32(MEMORY_USED, q->buffer_max_used); 2875 PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16)); 2876 PUT_STAT_U32(MAX_NETLEN, q->max_netlen); 2877 PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); 2878 PUT_STAT_U32(MIN_NETLEN, q->min_netlen); 2879 PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); 2880 2881 #undef PUT_STAT_U32 2882 #undef PUT_STAT_U64 2883 2884 tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS); 2885 if (!tstats) 2886 goto nla_put_failure; 2887 2888 #define PUT_TSTAT_U32(attr, data) do { \ 2889 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \ 2890 goto nla_put_failure; \ 2891 } while (0) 2892 #define PUT_TSTAT_U64(attr, data) do { \ 2893 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \ 2894 data, TCA_CAKE_TIN_STATS_PAD)) \ 2895 goto nla_put_failure; \ 2896 } while (0) 2897 2898 for (i = 0; i < q->tin_cnt; i++) { 2899 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 2900 2901 ts = nla_nest_start_noflag(d->skb, i + 1); 2902 if (!ts) 2903 goto nla_put_failure; 2904 2905 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps); 2906 PUT_TSTAT_U64(SENT_BYTES64, b->bytes); 2907 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog); 2908 2909 PUT_TSTAT_U32(TARGET_US, 2910 ktime_to_us(ns_to_ktime(b->cparams.target))); 2911 PUT_TSTAT_U32(INTERVAL_US, 2912 ktime_to_us(ns_to_ktime(b->cparams.interval))); 2913 2914 PUT_TSTAT_U32(SENT_PACKETS, b->packets); 2915 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped); 2916 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark); 2917 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops); 2918 2919 PUT_TSTAT_U32(PEAK_DELAY_US, 2920 ktime_to_us(ns_to_ktime(b->peak_delay))); 2921 
PUT_TSTAT_U32(AVG_DELAY_US, 2922 ktime_to_us(ns_to_ktime(b->avge_delay))); 2923 PUT_TSTAT_U32(BASE_DELAY_US, 2924 ktime_to_us(ns_to_ktime(b->base_delay))); 2925 2926 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits); 2927 PUT_TSTAT_U32(WAY_MISSES, b->way_misses); 2928 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions); 2929 2930 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count + 2931 b->decaying_flow_count); 2932 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count); 2933 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count); 2934 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen); 2935 2936 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum); 2937 nla_nest_end(d->skb, ts); 2938 } 2939 2940 #undef PUT_TSTAT_U32 2941 #undef PUT_TSTAT_U64 2942 2943 nla_nest_end(d->skb, tstats); 2944 return nla_nest_end(d->skb, stats); 2945 2946 nla_put_failure: 2947 nla_nest_cancel(d->skb, stats); 2948 return -1; 2949 } 2950 2951 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg) 2952 { 2953 return NULL; 2954 } 2955 2956 static unsigned long cake_find(struct Qdisc *sch, u32 classid) 2957 { 2958 return 0; 2959 } 2960 2961 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent, 2962 u32 classid) 2963 { 2964 return 0; 2965 } 2966 2967 static void cake_unbind(struct Qdisc *q, unsigned long cl) 2968 { 2969 } 2970 2971 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl, 2972 struct netlink_ext_ack *extack) 2973 { 2974 struct cake_sched_data *q = qdisc_priv(sch); 2975 2976 if (cl) 2977 return NULL; 2978 return q->block; 2979 } 2980 2981 static int cake_dump_class(struct Qdisc *sch, unsigned long cl, 2982 struct sk_buff *skb, struct tcmsg *tcm) 2983 { 2984 tcm->tcm_handle |= TC_H_MIN(cl); 2985 return 0; 2986 } 2987 2988 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl, 2989 struct gnet_dump *d) 2990 { 2991 struct cake_sched_data *q = qdisc_priv(sch); 2992 const struct cake_flow *flow = NULL; 2993 struct gnet_stats_queue qs = { 0 }; 2994 struct nlattr *stats; 2995 u32 idx = cl - 1; 2996 2997 if (idx < CAKE_QUEUES * q->tin_cnt) { 2998 const struct cake_tin_data *b = \ 2999 &q->tins[q->tin_order[idx / CAKE_QUEUES]]; 3000 const struct sk_buff *skb; 3001 3002 flow = &b->flows[idx % CAKE_QUEUES]; 3003 3004 if (flow->head) { 3005 sch_tree_lock(sch); 3006 skb = flow->head; 3007 while (skb) { 3008 qs.qlen++; 3009 skb = skb->next; 3010 } 3011 sch_tree_unlock(sch); 3012 } 3013 qs.backlog = b->backlogs[idx % CAKE_QUEUES]; 3014 qs.drops = flow->dropped; 3015 } 3016 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) 3017 return -1; 3018 if (flow) { 3019 ktime_t now = ktime_get(); 3020 3021 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); 3022 if (!stats) 3023 return -1; 3024 3025 #define PUT_STAT_U32(attr, data) do { \ 3026 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3027 goto nla_put_failure; \ 3028 } while (0) 3029 #define PUT_STAT_S32(attr, data) do { \ 3030 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \ 3031 goto nla_put_failure; \ 3032 } while (0) 3033 3034 PUT_STAT_S32(DEFICIT, flow->deficit); 3035 PUT_STAT_U32(DROPPING, flow->cvars.dropping); 3036 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count); 3037 PUT_STAT_U32(P_DROP, flow->cvars.p_drop); 3038 if (flow->cvars.p_drop) { 3039 PUT_STAT_S32(BLUE_TIMER_US, 3040 ktime_to_us( 3041 ktime_sub(now, 3042 flow->cvars.blue_timer))); 3043 } 3044 if (flow->cvars.dropping) { 3045 PUT_STAT_S32(DROP_NEXT_US, 3046 ktime_to_us( 3047 ktime_sub(now, 3048 flow->cvars.drop_next))); 3049 } 3050 3051 if 
(nla_nest_end(d->skb, stats) < 0) 3052 return -1; 3053 } 3054 3055 return 0; 3056 3057 nla_put_failure: 3058 nla_nest_cancel(d->skb, stats); 3059 return -1; 3060 } 3061 3062 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) 3063 { 3064 struct cake_sched_data *q = qdisc_priv(sch); 3065 unsigned int i, j; 3066 3067 if (arg->stop) 3068 return; 3069 3070 for (i = 0; i < q->tin_cnt; i++) { 3071 struct cake_tin_data *b = &q->tins[q->tin_order[i]]; 3072 3073 for (j = 0; j < CAKE_QUEUES; j++) { 3074 if (list_empty(&b->flows[j].flowchain)) { 3075 arg->count++; 3076 continue; 3077 } 3078 if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1, 3079 arg)) 3080 break; 3081 } 3082 } 3083 } 3084 3085 static const struct Qdisc_class_ops cake_class_ops = { 3086 .leaf = cake_leaf, 3087 .find = cake_find, 3088 .tcf_block = cake_tcf_block, 3089 .bind_tcf = cake_bind, 3090 .unbind_tcf = cake_unbind, 3091 .dump = cake_dump_class, 3092 .dump_stats = cake_dump_class_stats, 3093 .walk = cake_walk, 3094 }; 3095 3096 static struct Qdisc_ops cake_qdisc_ops __read_mostly = { 3097 .cl_ops = &cake_class_ops, 3098 .id = "cake", 3099 .priv_size = sizeof(struct cake_sched_data), 3100 .enqueue = cake_enqueue, 3101 .dequeue = cake_dequeue, 3102 .peek = qdisc_peek_dequeued, 3103 .init = cake_init, 3104 .reset = cake_reset, 3105 .destroy = cake_destroy, 3106 .change = cake_change, 3107 .dump = cake_dump, 3108 .dump_stats = cake_dump_stats, 3109 .owner = THIS_MODULE, 3110 }; 3111 MODULE_ALIAS_NET_SCH("cake"); 3112 3113 static int __init cake_module_init(void) 3114 { 3115 return register_qdisc(&cake_qdisc_ops); 3116 } 3117 3118 static void __exit cake_module_exit(void) 3119 { 3120 unregister_qdisc(&cake_qdisc_ops); 3121 } 3122 3123 module_init(cake_module_init) 3124 module_exit(cake_module_exit) 3125 MODULE_AUTHOR("Jonathan Morton"); 3126 MODULE_LICENSE("Dual BSD/GPL"); 3127 MODULE_DESCRIPTION("The CAKE shaper."); 3128
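/* Example configuration from userspace (illustrative only; option names
 * are those understood by the tc-cake frontend):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *
 * which cake_change() receives as TCA_CAKE_BASE_RATE64,
 * TCA_CAKE_DIFFSERV_MODE and TCA_CAKE_NAT respectively.
 */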