// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/* COMMON Applications Kept Enhanced (CAKE) discipline
 *
 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
 *
 * The CAKE Principles:
 *	(or, how to have your cake and eat it too)
 *
 * This is a combination of several shaping, AQM and FQ techniques into one
 * easy-to-use package:
 *
 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
 *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
 *   eliminating the need for any sort of burst parameter (eg. token bucket
 *   depth).  Burst support is limited to that necessary to overcome scheduling
 *   latency.
 *
 * - A Diffserv-aware priority queue, giving more priority to certain classes,
 *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
 *   the priority is reduced to avoid starving other tins.
 *
 * - Each priority tin has a separate Flow Queue system, to isolate traffic
 *   flows from each other.  This prevents a burst on one flow from increasing
 *   the delay to another.  Flows are distributed to queues using a
 *   set-associative hash function.
 *
 * - Each queue is actively managed by Cobalt, which is a combination of the
 *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
 *   congestion early via ECN (if available) and/or packet drops, to keep
 *   latency low.  The Codel parameters are auto-tuned based on the bandwidth
 *   setting, as is necessary at low bandwidths.
 *
 * The configuration parameters are kept deliberately simple for ease of use.
 * Everything has sane defaults.  Complete generality of configuration is *not*
 * a goal.
 *
 * The priority queue operates according to a weighted DRR scheme, combined
 * with a bandwidth tracker which reuses the shaper logic to detect which side
 * of the bandwidth sharing threshold the tin is operating on.  This determines
 * whether a priority-based weight (high) or a bandwidth-based weight (low) is
 * used for that tin in the current pass.
 *
 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
 * granted us permission to leverage.
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/reciprocal_div.h>
#include <net/netlink.h>
#include <linux/if_vlan.h>
#include <net/gso.h>
#include <net/pkt_sched.h>
#include <net/sch_priv.h>
#include <net/pkt_cls.h>
#include <net/tcp.h>
#include <net/flow_dissector.h>

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#endif

#define CAKE_SET_WAYS (8)
#define CAKE_MAX_TINS (8)
#define CAKE_QUEUES (1024)
#define CAKE_FLOW_MASK 63
#define CAKE_FLOW_NAT_FLAG 64

/* struct cobalt_params - contains codel and blue parameters
 * @interval:	codel initial drop rate
 * @target:	maximum persistent sojourn time & blue update rate
 * @mtu_time:	serialisation delay of maximum-size packet
 * @p_inc:	increment of blue drop probability (0.32 fxp)
 * @p_dec:	decrement of blue drop probability (0.32 fxp)
 */
struct cobalt_params {
	u64	interval;
	u64	target;
	u64	mtu_time;
	u32	p_inc;
	u32	p_dec;
};
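
/* For a sense of scale: with the default qdisc settings these come out to
 * roughly interval = 100ms and target = 5ms (the usual Codel constants),
 * with mtu_time derived from the configured shaper rate.  Treat those
 * figures as illustrative defaults rather than guarantees; the actual
 * values are computed from the bandwidth setting elsewhere in this file.
 */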
99
100 /* struct cobalt_vars - contains codel and blue variables
101 * @count: codel dropping frequency
102 * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1
103 * @drop_next: time to drop next packet, or when we dropped last
104 * @blue_timer: Blue time to next drop
105 * @p_drop: BLUE drop probability (0.32 fxp)
106 * @dropping: set if in dropping state
107 * @ecn_marked: set if marked
108 */
109 struct cobalt_vars {
110 u32 count;
111 u32 rec_inv_sqrt;
112 ktime_t drop_next;
113 ktime_t blue_timer;
114 u32 p_drop;
115 bool dropping;
116 bool ecn_marked;
117 };
118
119 enum {
120 CAKE_SET_NONE = 0,
121 CAKE_SET_SPARSE,
122 CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
123 CAKE_SET_BULK,
124 CAKE_SET_DECAYING
125 };
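
/* A rough sketch of the flow lifecycle (the transitions themselves live in
 * the enqueue and dequeue paths): a newly-seen flow starts as SPARSE on the
 * new_flows list; once its deficit runs out it is counted as BULK; a bulk
 * flow that empties out becomes DECAYING, and from there it either returns
 * to SPARSE when a new packet arrives (see cake_enqueue() below) or ages
 * back to NONE.
 */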
126
127 struct cake_flow {
128 /* this stuff is all needed per-flow at dequeue time */
129 struct sk_buff *head;
130 struct sk_buff *tail;
131 struct list_head flowchain;
132 s32 deficit;
133 u32 dropped;
134 struct cobalt_vars cvars;
135 u16 srchost; /* index into cake_host table */
136 u16 dsthost;
137 u8 set;
138 }; /* please try to keep this structure <= 64 bytes */
139
140 struct cake_host {
141 u32 srchost_tag;
142 u32 dsthost_tag;
143 u16 srchost_bulk_flow_count;
144 u16 dsthost_bulk_flow_count;
145 };
146
147 struct cake_heap_entry {
148 u16 t:3, b:10;
149 };
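
/* The bitfield widths match the table sizes above: t:3 can index all
 * CAKE_MAX_TINS (8) tins and b:10 all CAKE_QUEUES (1024) flows, so a heap
 * entry names any (tin, flow) pair in a single 16-bit word.
 */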

struct cake_tin_data {
	struct cake_flow flows[CAKE_QUEUES];
	u32	backlogs[CAKE_QUEUES];
	u32	tags[CAKE_QUEUES]; /* for set association */
	u16	overflow_idx[CAKE_QUEUES];
	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
	u16	flow_quantum;

	struct cobalt_params cparams;
	u32	drop_overlimit;
	u16	bulk_flow_count;
	u16	sparse_flow_count;
	u16	decaying_flow_count;
	u16	unresponsive_flow_count;

	u32	max_skblen;

	struct list_head new_flows;
	struct list_head old_flows;
	struct list_head decaying_flows;

	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
	ktime_t	time_next_packet;
	u64	tin_rate_ns;
	u64	tin_rate_bps;
	u16	tin_rate_shft;

	u16	tin_quantum;
	s32	tin_deficit;
	u32	tin_backlog;
	u32	tin_dropped;
	u32	tin_ecn_mark;

	u32	packets;
	u64	bytes;

	u32	ack_drops;

	/* moving averages */
	u64	avge_delay;
	u64	peak_delay;
	u64	base_delay;

	/* hash function stats */
	u32	way_directs;
	u32	way_hits;
	u32	way_misses;
	u32	way_collisions;
}; /* number of tins is small, so size of this struct doesn't matter much */

struct cake_sched_config {
	u64	rate_bps;
	u64	interval;
	u64	target;
	u64	sync_time;
	u32	buffer_config_limit;
	u32	fwmark_mask;
	u16	fwmark_shft;
	s16	rate_overhead;
	u16	rate_mpu;
	u16	rate_flags;
	u8	tin_mode;
	u8	flow_mode;
	u8	atm_mode;
	u8	ack_filter;
	u8	is_shared;
};

struct cake_sched_data {
	struct tcf_proto __rcu *filter_list; /* optional external classifier */
	struct tcf_block *block;
	struct cake_tin_data *tins;
	struct cake_sched_config *config;
	struct cake_sched_config initial_config;

	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];

	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
	ktime_t	time_next_packet;
	ktime_t	failsafe_next_packet;
	u64	rate_ns;
	u16	rate_shft;
	u16	overflow_timeout;
	u16	tin_cnt;

	/* resource tracking */
	u32	buffer_used;
	u32	buffer_max_used;
	u32	buffer_limit;

	/* indices for dequeue */
	u16	cur_tin;
	u16	cur_flow;

	struct qdisc_watchdog watchdog;
	const u8	*tin_index;
	const u8	*tin_order;

	/* bandwidth capacity estimate */
	ktime_t	last_packet_time;
	ktime_t	avg_window_begin;
	u64	avg_packet_interval;
	u64	avg_window_bytes;
	u64	avg_peak_bandwidth;
	ktime_t	last_reconfig_time;

	/* packet length stats */
	u32	avg_netoff;
	u16	max_netlen;
	u16	max_adjlen;
	u16	min_netlen;
	u16	min_adjlen;

	/* mq sync state */
	u64	last_checked_active;
	u64	last_active;
	u32	active_queues;
};

enum {
	CAKE_FLAG_OVERHEAD	   = BIT(0),
	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
	CAKE_FLAG_INGRESS	   = BIT(2),
	CAKE_FLAG_WASH		   = BIT(3),
	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
};

/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
 * obtain the best features of each.  Codel is excellent on flows which
 * respond to congestion signals in a TCP-like way.  BLUE is more effective on
 * unresponsive flows.
 */

struct cobalt_skb_cb {
	ktime_t enqueue_time;
	u32     adjusted_len;
};
static u64 us_to_ns(u64 us)
{
	return us * NSEC_PER_USEC;
}

static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
}

static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
{
	return get_cobalt_cb(skb)->enqueue_time;
}

static void cobalt_set_enqueue_time(struct sk_buff *skb,
				    ktime_t now)
{
	get_cobalt_cb(skb)->enqueue_time = now;
}

static u16 quantum_div[CAKE_QUEUES + 1] = {0};
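
/* quantum_div[i] caches a 16-bit fixed-point reciprocal of i (roughly
 * 65535 / i), letting cake_get_flow_quantum() divide the flow quantum by
 * the host load with a multiply and a shift instead of a division.  The
 * table is filled in at init time; the fill loop is outside this excerpt.
 */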

/* Diffserv lookup tables */

static const u8 precedence[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 4, 4, 4,
	5, 5, 5, 5, 5, 5, 5, 5,
	6, 6, 6, 6, 6, 6, 6, 6,
	7, 7, 7, 7, 7, 7, 7, 7,
};

static const u8 diffserv8[] = {
	2, 0, 1, 2, 4, 2, 2, 2,
	1, 2, 1, 2, 1, 2, 1, 2,
	5, 2, 4, 2, 4, 2, 4, 2,
	3, 2, 3, 2, 3, 2, 3, 2,
	6, 2, 3, 2, 3, 2, 3, 2,
	6, 2, 2, 2, 6, 2, 6, 2,
	7, 2, 2, 2, 2, 2, 2, 2,
	7, 2, 2, 2, 2, 2, 2, 2,
};

static const u8 diffserv4[] = {
	0, 1, 0, 0, 2, 0, 0, 0,
	1, 0, 0, 0, 0, 0, 0, 0,
	2, 0, 2, 0, 2, 0, 2, 0,
	2, 0, 2, 0, 2, 0, 2, 0,
	3, 0, 2, 0, 2, 0, 2, 0,
	3, 0, 0, 0, 3, 0, 3, 0,
	3, 0, 0, 0, 0, 0, 0, 0,
	3, 0, 0, 0, 0, 0, 0, 0,
};
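
/* Reading diffserv4 by DSCP index, for example: CS0 (0) maps to tin 0
 * (best effort), CS1 (8) to tin 1 (bulk), AF41 (34) to tin 2 (video) and
 * EF (46) to tin 3 (voice).  The tables are indexed by the 6-bit DSCP
 * value extracted in cake_handle_diffserv() below.
 */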

static const u8 diffserv3[] = {
	0, 1, 0, 0, 2, 0, 0, 0,
	1, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 2, 0, 2, 0,
	2, 0, 0, 0, 0, 0, 0, 0,
	2, 0, 0, 0, 0, 0, 0, 0,
};

static const u8 besteffort[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
};

/* tin priority order for stats dumping */

static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
static const u8 bulk_order[] = {1, 0, 2, 3};

/* There is a big difference in timing between the accurate values placed in
 * the cache and the approximations given by a single Newton step for small
 * count values, particularly when stepping from count 1 to 2 or vice versa.
 * Hence, the cached values are each calculated using eight Newton steps of
 * the implementation below.  Above 16, a single Newton step gives sufficient
 * accuracy in either direction, given the precision stored.
 *
 * The magnitude of the error when stepping up to count 2 is such as to give
 * the value that *should* have been produced at count 4.
 */

#define REC_INV_SQRT_CACHE (16)
static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
		~0,         ~0, 3037000500, 2479700525,
	2147483647, 1920767767, 1753413056, 1623345051,
	1518500250, 1431655765, 1358187914, 1294981364,
	1239850263, 1191209601, 1147878294, 1108955788
};
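
/* Spot-checking the cache contents against the Q0.32 format: entry 2 is
 * ~2^32/sqrt(2) = 3037000500 and entry 4 is ~2^32/2 = 2147483647 (one below
 * the exact 0x80000000, as produced by the Newton iteration).  Entries 0
 * and 1 are both ~0 = 0xFFFFFFFF, as close to 1.0 as Q0.32 allows.
 */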

static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust);

/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 *
 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 */

static void cobalt_newton_step(struct cobalt_vars *vars)
{
	u32 invsqrt, invsqrt2;
	u64 val;

	invsqrt = vars->rec_inv_sqrt;
	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
	val = (3LL << 32) - ((u64)vars->count * invsqrt2);

	val >>= 2; /* avoid overflow in following multiply */
	val = (val * invsqrt) >> (32 - 2 + 1);

	vars->rec_inv_sqrt = val;
}
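
/* Fixed-point bookkeeping for the step above, as a worked sketch: with
 * x = invsqrt in Q0.32, invsqrt2 is x^2 in Q0.32 and val starts out as
 * (3 - count * x^2) with 32 fractional bits.  After "val >>= 2" it has 30;
 * multiplying by x adds 32 more for 62; and shifting right by 31 leaves
 * x * (3 - count * x^2) / 2 in Q0.32, exactly the Newton-Raphson update
 * quoted in the comment above.
 */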

static void cobalt_invsqrt(struct cobalt_vars *vars)
{
	if (vars->count < REC_INV_SQRT_CACHE)
		vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
	else
		cobalt_newton_step(vars);
}

static void cobalt_vars_init(struct cobalt_vars *vars)
{
	memset(vars, 0, sizeof(*vars));
}

/* CoDel control_law is t + interval/sqrt(count)
 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
 * both sqrt() and divide operation.
 */
static ktime_t cobalt_control(ktime_t t,
			      u64 interval,
			      u32 rec_inv_sqrt)
{
	return ktime_add_ns(t, reciprocal_scale(interval,
						rec_inv_sqrt));
}
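
/* A worked example of the control law: reciprocal_scale(interval, r)
 * computes (interval * r) >> 32, so with interval = 100ms (in ns) and
 * count = 4 (r ~= 0.5 in Q0.32), the next signalling event is scheduled
 * ~50ms after t, i.e. t + interval/sqrt(count).
 */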

/* Call this when a packet had to be dropped due to queue overflow.  Returns
 * true if the BLUE state was quiescent before but active after this call.
 */
static bool cobalt_queue_full(struct cobalt_vars *vars,
			      struct cobalt_params *p,
			      ktime_t now)
{
	bool up = false;

	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
		up = !vars->p_drop;
		vars->p_drop += p->p_inc;
		if (vars->p_drop < p->p_inc)
			vars->p_drop = ~0;
		vars->blue_timer = now;
	}
	vars->dropping = true;
	vars->drop_next = now;
	if (!vars->count)
		vars->count = 1;

	return up;
}

/* Call this when the queue was serviced but turned out to be empty.  Returns
 * true if the BLUE state was active before but quiescent after this call.
 */
static bool cobalt_queue_empty(struct cobalt_vars *vars,
			       struct cobalt_params *p,
			       ktime_t now)
{
	bool down = false;

	if (vars->p_drop &&
	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
		if (vars->p_drop < p->p_dec)
			vars->p_drop = 0;
		else
			vars->p_drop -= p->p_dec;
		vars->blue_timer = now;
		down = !vars->p_drop;
	}
	vars->dropping = false;

	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
		vars->count--;
		cobalt_invsqrt(vars);
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
	}

	return down;
}

/* Call this with a freshly dequeued packet for possible congestion marking.
 * Returns a drop reason as an instruction to drop the packet, or
 * SKB_NOT_DROPPED_YET for delivery.
 */
static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
					       struct cobalt_params *p,
					       ktime_t now,
					       struct sk_buff *skb,
					       u32 bulk_flows)
{
	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
	bool next_due, over_target;
	ktime_t schedule;
	u64 sojourn;

	/* The 'schedule' variable records, in its sign, whether 'now' is before
	 * or after 'drop_next'.  This allows 'drop_next' to be updated before
	 * the next scheduling decision is actually branched, without destroying
	 * that information.  Similarly, the first 'schedule' value calculated is
	 * preserved in the boolean 'next_due'.
	 *
	 * As for 'drop_next', we take advantage of the fact that 'interval' is
	 * both the delay between first exceeding 'target' and the first
	 * signalling event, *and* the scaling factor for the signalling
	 * frequency.  It's therefore very natural to use a single mechanism for
	 * both purposes, and eliminates a significant amount of reference
	 * Codel's spaghetti code.  To help with this, both the '0' and '1'
	 * entries in the invsqrt cache are 0xFFFFFFFF, as close as possible to
	 * 1.0 in fixed-point.
	 */

	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
	schedule = ktime_sub(now, vars->drop_next);
	over_target = sojourn > p->target &&
		      sojourn > p->mtu_time * bulk_flows * 2 &&
		      sojourn > p->mtu_time * 4;
	next_due = vars->count && ktime_to_ns(schedule) >= 0;

	vars->ecn_marked = false;

	if (over_target) {
		if (!vars->dropping) {
			vars->dropping = true;
			vars->drop_next = cobalt_control(now,
							 p->interval,
							 vars->rec_inv_sqrt);
		}
		if (!vars->count)
			vars->count = 1;
	} else if (vars->dropping) {
		vars->dropping = false;
	}

	if (next_due && vars->dropping) {
		/* Use ECN mark if possible, otherwise drop */
		if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
			reason = SKB_DROP_REASON_QDISC_CONGESTED;

		vars->count++;
		if (!vars->count)
			vars->count--;
		cobalt_invsqrt(vars);
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
		schedule = ktime_sub(now, vars->drop_next);
	} else {
		while (next_due) {
			vars->count--;
			cobalt_invsqrt(vars);
			vars->drop_next = cobalt_control(vars->drop_next,
							 p->interval,
							 vars->rec_inv_sqrt);
			schedule = ktime_sub(now, vars->drop_next);
			next_due = vars->count && ktime_to_ns(schedule) >= 0;
		}
	}

	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
	if (vars->p_drop && reason == SKB_NOT_DROPPED_YET &&
	    get_random_u32() < vars->p_drop)
		reason = SKB_DROP_REASON_CAKE_FLOOD;

	/* Overload the drop_next field as an activity timeout */
	if (!vars->count)
		vars->drop_next = ktime_add_ns(now, p->interval);
	else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET)
		vars->drop_next = now;

	return reason;
}

static bool cake_update_flowkeys(struct flow_keys *keys,
				 const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	struct nf_conntrack_tuple tuple = {};
	bool rev = !skb->_nfct, upd = false;
	__be32 ip;

	if (skb_protocol(skb, true) != htons(ETH_P_IP))
		return false;

	if (!nf_ct_get_tuple_skb(&tuple, skb))
		return false;

	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
	if (ip != keys->addrs.v4addrs.src) {
		keys->addrs.v4addrs.src = ip;
		upd = true;
	}
	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
	if (ip != keys->addrs.v4addrs.dst) {
		keys->addrs.v4addrs.dst = ip;
		upd = true;
	}

	if (keys->ports.ports) {
		__be16 port;

		port = rev ? tuple.dst.u.all : tuple.src.u.all;
		if (port != keys->ports.src) {
			keys->ports.src = port;
			upd = true;
		}
		port = rev ? tuple.src.u.all : tuple.dst.u.all;
		if (port != keys->ports.dst) {
			keys->ports.dst = port;
			upd = true;
		}
	}
	return upd;
#else
	return false;
#endif
}

/* Cake has several subtle multiple bit settings.  In these cases you
 * would be matching triple isolate mode as well.
 */

static bool cake_dsrc(int flow_mode)
{
	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
}

static bool cake_ddst(int flow_mode)
{
	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
}

static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
					     struct cake_flow *flow,
					     int flow_mode)
{
	if (likely(cake_dsrc(flow_mode) &&
		   q->hosts[flow->srchost].srchost_bulk_flow_count))
		q->hosts[flow->srchost].srchost_bulk_flow_count--;
}

static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
					     struct cake_flow *flow,
					     int flow_mode)
{
	if (likely(cake_dsrc(flow_mode) &&
		   q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
		q->hosts[flow->srchost].srchost_bulk_flow_count++;
}

static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
					     struct cake_flow *flow,
					     int flow_mode)
{
	if (likely(cake_ddst(flow_mode) &&
		   q->hosts[flow->dsthost].dsthost_bulk_flow_count))
		q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
}

static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
					     struct cake_flow *flow,
					     int flow_mode)
{
	if (likely(cake_ddst(flow_mode) &&
		   q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
		q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
}

static u16 cake_get_flow_quantum(struct cake_tin_data *q,
				 struct cake_flow *flow,
				 int flow_mode)
{
	u16 host_load = 1;

	if (cake_dsrc(flow_mode))
		host_load = max(host_load,
				q->hosts[flow->srchost].srchost_bulk_flow_count);

	if (cake_ddst(flow_mode))
		host_load = max(host_load,
				q->hosts[flow->dsthost].dsthost_bulk_flow_count);

	/* The get_random_u16() is a way to apply dithering to avoid
	 * accumulating roundoff errors.
	 */
	return (q->flow_quantum * quantum_div[host_load] +
		get_random_u16()) >> 16;
}
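
/* Numerically, with a typical flow quantum of 1514 bytes and host_load = 3:
 * quantum_div[3] ~= 21845, so the product is ~504.6 in 16.16 fixed point;
 * adding a random 16-bit value before the shift rounds to 504 or 505 with
 * proportional probability, averaging out to quantum / host_load.
 */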

static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
		     int flow_mode, u16 flow_override, u16 host_override)
{
	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
	u16 reduced_hash, srchost_idx, dsthost_idx;
	struct flow_keys keys, host_keys;
	bool use_skbhash = skb->l4_hash;

	if (unlikely(flow_mode == CAKE_FLOW_NONE))
		return 0;

	/* If both overrides are set, or we can use the SKB hash and nat mode is
	 * disabled, we can skip packet dissection entirely.  If nat mode is
	 * enabled there's another check below after doing the conntrack lookup.
	 */
	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
		goto skip_hash;

	skb_flow_dissect_flow_keys(skb, &keys,
				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

	/* Don't use the SKB hash if we change the lookup keys from conntrack */
	if (nat_enabled && cake_update_flowkeys(&keys, skb))
		use_skbhash = false;

	/* If we can still use the SKB hash and don't need the host hash, we can
	 * skip the rest of the hashing procedure
	 */
	if (use_skbhash && !hash_hosts)
		goto skip_hash;

	/* flow_hash_from_keys() sorts the addresses by value, so we have
	 * to preserve their order in a separate data structure to treat
	 * src and dst host addresses as independently selectable.
	 */
	host_keys = keys;
	host_keys.ports.ports     = 0;
	host_keys.basic.ip_proto  = 0;
	host_keys.keyid.keyid     = 0;
	host_keys.tags.flow_label = 0;

	switch (host_keys.control.addr_type) {
	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
		host_keys.addrs.v4addrs.src = 0;
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
		host_keys.addrs.v4addrs.dst = 0;
		srchost_hash = flow_hash_from_keys(&host_keys);
		break;

	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
		memset(&host_keys.addrs.v6addrs.src, 0,
		       sizeof(host_keys.addrs.v6addrs.src));
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
		memset(&host_keys.addrs.v6addrs.dst, 0,
		       sizeof(host_keys.addrs.v6addrs.dst));
		srchost_hash = flow_hash_from_keys(&host_keys);
		break;

	default:
		dsthost_hash = 0;
		srchost_hash = 0;
	}

	/* This *must* be after the above switch, since as a
	 * side-effect it sorts the src and dst addresses.
	 */
	if (hash_flows && !use_skbhash)
		flow_hash = flow_hash_from_keys(&keys);

skip_hash:
	if (flow_override)
		flow_hash = flow_override - 1;
	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
		flow_hash = skb->hash;
	if (host_override) {
		dsthost_hash = host_override - 1;
		srchost_hash = host_override - 1;
	}

	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
		if (flow_mode & CAKE_FLOW_SRC_IP)
			flow_hash ^= srchost_hash;

		if (flow_mode & CAKE_FLOW_DST_IP)
			flow_hash ^= dsthost_hash;
	}

	reduced_hash = flow_hash % CAKE_QUEUES;

	/* set-associative hashing */
	/* fast path if no hash collision (direct lookup succeeds) */
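	/* For example: with CAKE_QUEUES = 1024 and CAKE_SET_WAYS = 8,
	 * inner_hash below is a position within an aligned group of eight
	 * queues and outer_hash is that group's base index, so a colliding
	 * flow can be rehoused in any of the eight queues of its set before
	 * a collision has to be accepted.
	 */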
	if (likely(q->tags[reduced_hash] == flow_hash &&
		   q->flows[reduced_hash].set)) {
		q->way_directs++;
	} else {
		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
		u32 outer_hash = reduced_hash - inner_hash;
		bool allocate_src = false;
		bool allocate_dst = false;
		u32 i, k;

		/* check if any active queue in the set is reserved for
		 * this flow.
		 */
		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
		     i++, k = (k + 1) % CAKE_SET_WAYS) {
			if (q->tags[outer_hash + k] == flow_hash) {
				if (i)
					q->way_hits++;

				if (!q->flows[outer_hash + k].set) {
					/* need to increment host refcnts */
					allocate_src = cake_dsrc(flow_mode);
					allocate_dst = cake_ddst(flow_mode);
				}

				goto found;
			}
		}

		/* no queue is reserved for this flow, look for an
		 * empty one.
		 */
		for (i = 0; i < CAKE_SET_WAYS;
		     i++, k = (k + 1) % CAKE_SET_WAYS) {
			if (!q->flows[outer_hash + k].set) {
				q->way_misses++;
				allocate_src = cake_dsrc(flow_mode);
				allocate_dst = cake_ddst(flow_mode);
				goto found;
			}
		}

		/* With no empty queues, default to the original
		 * queue, accept the collision, update the host tags.
		 */
		q->way_collisions++;
		allocate_src = cake_dsrc(flow_mode);
		allocate_dst = cake_ddst(flow_mode);

		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
			cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
			cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
		}
found:
		/* reserve queue for future packets in same flow */
		reduced_hash = outer_hash + k;
		q->tags[reduced_hash] = flow_hash;

		if (allocate_src) {
			srchost_idx = srchost_hash % CAKE_QUEUES;
			inner_hash = srchost_idx % CAKE_SET_WAYS;
			outer_hash = srchost_idx - inner_hash;
			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (q->hosts[outer_hash + k].srchost_tag ==
				    srchost_hash)
					goto found_src;
			}
			for (i = 0; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
					break;
			}
			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
found_src:
			srchost_idx = outer_hash + k;
			q->flows[reduced_hash].srchost = srchost_idx;

			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
				cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
		}

		if (allocate_dst) {
			dsthost_idx = dsthost_hash % CAKE_QUEUES;
			inner_hash = dsthost_idx % CAKE_SET_WAYS;
			outer_hash = dsthost_idx - inner_hash;
			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (q->hosts[outer_hash + k].dsthost_tag ==
				    dsthost_hash)
					goto found_dst;
			}
			for (i = 0; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
					break;
			}
			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
found_dst:
			dsthost_idx = outer_hash + k;
			q->flows[reduced_hash].dsthost = dsthost_idx;

			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
				cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
		}
	}

	return reduced_hash;
}

/* helper functions : might be changed when/if skb use a standard list_head */
/* remove one skb from head of slot queue */

static struct sk_buff *dequeue_head(struct cake_flow *flow)
{
	struct sk_buff *skb = flow->head;

	if (skb) {
		flow->head = skb->next;
		skb_mark_not_on_list(skb);
	}

	return skb;
}

/* add skb to flow queue (tail add) */

static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
{
	if (!flow->head)
		flow->head = skb;
	else
		flow->tail->next = skb;
	flow->tail = skb;
	skb->next = NULL;
}

static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
				    struct ipv6hdr *buf)
{
	unsigned int offset = skb_network_offset(skb);
	struct iphdr *iph;

	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);

	if (!iph)
		return NULL;

	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
		return skb_header_pointer(skb, offset + iph->ihl * 4,
					  sizeof(struct ipv6hdr), buf);

	else if (iph->version == 4)
		return iph;

	else if (iph->version == 6)
		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
					  buf);

	return NULL;
}

static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
				      void *buf, unsigned int bufsize)
{
	unsigned int offset = skb_network_offset(skb);
	const struct ipv6hdr *ipv6h;
	const struct tcphdr *tcph;
	const struct iphdr *iph;
	struct ipv6hdr _ipv6h;
	struct tcphdr _tcph;

	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);

	if (!ipv6h)
		return NULL;

	if (ipv6h->version == 4) {
		iph = (struct iphdr *)ipv6h;
		offset += iph->ihl * 4;

		/* special-case 6in4 tunnelling, as that is a common way to get
		 * v6 connectivity in the home
		 */
		if (iph->protocol == IPPROTO_IPV6) {
			ipv6h = skb_header_pointer(skb, offset,
						   sizeof(_ipv6h), &_ipv6h);

			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
				return NULL;

			offset += sizeof(struct ipv6hdr);

		} else if (iph->protocol != IPPROTO_TCP) {
			return NULL;
		}

	} else if (ipv6h->version == 6) {
		if (ipv6h->nexthdr != IPPROTO_TCP)
			return NULL;

		offset += sizeof(struct ipv6hdr);
	} else {
		return NULL;
	}

	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
	if (!tcph || tcph->doff < 5)
		return NULL;

	return skb_header_pointer(skb, offset,
				  min(__tcp_hdrlen(tcph), bufsize), buf);
}

static const void *cake_get_tcpopt(const struct tcphdr *tcph,
				   int code, int *oplen)
{
	/* inspired by tcp_parse_options in tcp_input.c */
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
	const u8 *ptr = (const u8 *)(tcph + 1);

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {
			length--;
			continue;
		}
		if (length < 2)
			break;
		opsize = *ptr++;
		if (opsize < 2 || opsize > length)
			break;

		if (opcode == code) {
			*oplen = opsize;
			return ptr;
		}

		ptr += opsize - 2;
		length -= opsize;
	}

	return NULL;
}

/* Compare two SACK sequences.  A sequence is considered greater if it SACKs
 * more bytes than the other.  In the case where both sequences ACK bytes
 * that the other doesn't, A is considered greater.  DSACKs in A also make A
 * be considered greater.
 *
 * @return -1, 0 or 1 as normal compare functions
 */
static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
				  const struct tcphdr *tcph_b)
{
	const struct tcp_sack_block_wire *sack_a, *sack_b;
	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
	u32 bytes_a = 0, bytes_b = 0;
	int oplen_a, oplen_b;
	bool first = true;

	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);

	/* pointers point to option contents */
	oplen_a -= TCPOLEN_SACK_BASE;
	oplen_b -= TCPOLEN_SACK_BASE;

	if (sack_a && oplen_a >= sizeof(*sack_a) &&
	    (!sack_b || oplen_b < sizeof(*sack_b)))
		return -1;
	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
		 (!sack_a || oplen_a < sizeof(*sack_a)))
		return 1;
	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
		 (!sack_b || oplen_b < sizeof(*sack_b)))
		return 0;

	while (oplen_a >= sizeof(*sack_a)) {
		const struct tcp_sack_block_wire *sack_tmp = sack_b;
		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
		int oplen_tmp = oplen_b;
		bool found = false;

		/* DSACK; always considered greater to prevent dropping */
		if (before(start_a, ack_seq_a))
			return -1;

		bytes_a += end_a - start_a;

		while (oplen_tmp >= sizeof(*sack_tmp)) {
			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);

			/* first time through we count the total size */
			if (first)
				bytes_b += end_b - start_b;

			if (!after(start_b, start_a) && !before(end_b, end_a)) {
				found = true;
				if (!first)
					break;
			}
			oplen_tmp -= sizeof(*sack_tmp);
			sack_tmp++;
		}

		if (!found)
			return -1;

		oplen_a -= sizeof(*sack_a);
		sack_a++;
		first = false;
	}

	/* If we made it this far, all ranges SACKed by A are covered by B, so
	 * either the SACKs are equal, or B SACKs more bytes.
	 */
	return bytes_b > bytes_a ? 1 : 0;
}

static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
				 u32 *tsval, u32 *tsecr)
{
	const u8 *ptr;
	int opsize;

	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);

	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
		*tsval = get_unaligned_be32(ptr);
		*tsecr = get_unaligned_be32(ptr + 4);
	}
}

static bool cake_tcph_may_drop(const struct tcphdr *tcph,
			       u32 tstamp_new, u32 tsecr_new)
{
	/* inspired by tcp_parse_options in tcp_input.c */
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
	const u8 *ptr = (const u8 *)(tcph + 1);
	u32 tstamp, tsecr;

	/* 3 reserved flags must be unset to avoid future breakage
	 * ACK must be set
	 * ECE/CWR are handled separately
	 * All other flags URG/PSH/RST/SYN/FIN must be unset
	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
	 * 0x00C00000 = CWR/ECE (handled separately)
	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
	 */
	if (((tcp_flag_word(tcph) &
	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
		return false;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {
			length--;
			continue;
		}
		if (length < 2)
			break;
		opsize = *ptr++;
		if (opsize < 2 || opsize > length)
			break;

		switch (opcode) {
		case TCPOPT_MD5SIG: /* doesn't influence state */
			break;

		case TCPOPT_SACK: /* stricter checking performed later */
			if (opsize % 8 != 2)
				return false;
			break;

		case TCPOPT_TIMESTAMP:
			/* only drop timestamps lower than new */
			if (opsize != TCPOLEN_TIMESTAMP)
				return false;
			tstamp = get_unaligned_be32(ptr);
			tsecr = get_unaligned_be32(ptr + 4);
			if (after(tstamp, tstamp_new) ||
			    after(tsecr, tsecr_new))
				return false;
			break;

		case TCPOPT_MSS:  /* these should only be set on SYN */
		case TCPOPT_WINDOW:
		case TCPOPT_SACK_PERM:
		case TCPOPT_FASTOPEN:
		case TCPOPT_EXP:
		default: /* don't drop if any unknown options are present */
			return false;
		}

		ptr += opsize - 2;
		length -= opsize;
	}

	return true;
}

static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
				       struct cake_flow *flow)
{
	bool aggressive = q->config->ack_filter == CAKE_ACK_AGGRESSIVE;
	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
	struct sk_buff *skb_check, *skb_prev = NULL;
	const struct ipv6hdr *ipv6h, *ipv6h_check;
	unsigned char _tcph[64], _tcph_check[64];
	const struct tcphdr *tcph, *tcph_check;
	const struct iphdr *iph, *iph_check;
	struct ipv6hdr _iph, _iph_check;
	const struct sk_buff *skb;
	int seglen, num_found = 0;
	u32 tstamp = 0, tsecr = 0;
	__be32 elig_flags = 0;
	int sack_comp;

	/* no other possible ACKs to filter */
	if (flow->head == flow->tail)
		return NULL;

	skb = flow->tail;
	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
	iph = cake_get_iphdr(skb, &_iph);
	if (!tcph)
		return NULL;

	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);

	/* the 'triggering' packet need only have the ACK flag set.
	 * also check that SYN is not set, as there won't be any previous ACKs.
	 */
	if ((tcp_flag_word(tcph) &
	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
		return NULL;

	/* the 'triggering' ACK is at the tail of the queue, we have already
	 * returned if it is the only packet in the flow.  loop through the rest
	 * of the queue looking for pure ACKs with the same 5-tuple as the
	 * triggering one.
	 */
	for (skb_check = flow->head;
	     skb_check && skb_check != skb;
	     skb_prev = skb_check, skb_check = skb_check->next) {
		iph_check = cake_get_iphdr(skb_check, &_iph_check);
		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
					     sizeof(_tcph_check));

		/* only TCP packets with matching 5-tuple are eligible, and only
		 * drop safe headers
		 */
		if (!tcph_check || iph->version != iph_check->version ||
		    tcph_check->source != tcph->source ||
		    tcph_check->dest != tcph->dest)
			continue;

		if (iph_check->version == 4) {
			if (iph_check->saddr != iph->saddr ||
			    iph_check->daddr != iph->daddr)
				continue;

			seglen = iph_totlen(skb, iph_check) -
				 (4 * iph_check->ihl);
		} else if (iph_check->version == 6) {
			ipv6h = (struct ipv6hdr *)iph;
			ipv6h_check = (struct ipv6hdr *)iph_check;

			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
				continue;

			seglen = ipv6_payload_len(skb, ipv6h_check);
		} else {
			WARN_ON(1);  /* shouldn't happen */
			continue;
		}

		/* If the ECE/CWR flags changed from the previous eligible
		 * packet in the same flow, we should no longer be dropping that
		 * previous packet as this would lose information.
		 */
		if (elig_ack && (tcp_flag_word(tcph_check) &
				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
			elig_ack = NULL;
			elig_ack_prev = NULL;
			num_found--;
		}

		/* Check TCP options and flags, don't drop ACKs with segment
		 * data, and don't drop ACKs with a higher cumulative ACK
		 * counter than the triggering packet.  Check ACK seqno here to
		 * avoid parsing SACK options of packets we are going to exclude
		 * anyway.
		 */
		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
			continue;

		/* Check SACK options.  The triggering packet must SACK more
		 * data than the ACK under consideration, or SACK the same
		 * range but have a larger cumulative ACK counter.  The latter
		 * is a pathological case, but is contained in the following
		 * check anyway, just to be safe.
		 */
		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);

		if (sack_comp < 0 ||
		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
		     sack_comp == 0))
			continue;

		/* At this point we have found an eligible pure ACK to drop; if
		 * we are in aggressive mode, we are done.  Otherwise, keep
		 * searching unless this is the second eligible ACK we
		 * found.
		 *
		 * Since we want to drop ACK closest to the head of the queue,
		 * save the first eligible ACK we find, even if we need to loop
		 * again.
		 */
		if (!elig_ack) {
			elig_ack = skb_check;
			elig_ack_prev = skb_prev;
			elig_flags = (tcp_flag_word(tcph_check)
				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
		}

		if (num_found++ > 0)
			goto found;
	}

	/* We made it through the queue without finding two eligible ACKs.  If
	 * we found a single eligible ACK we can drop it in aggressive mode if
	 * we can guarantee that this does not interfere with ECN flag
	 * information.  We ensure this by dropping it only if the enqueued
	 * packet is consecutive with the eligible ACK, and their flags match.
	 */
	if (elig_ack && aggressive && elig_ack->next == skb &&
	    (elig_flags == (tcp_flag_word(tcph) &
			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
		goto found;

	return NULL;

found:
	if (elig_ack_prev)
		elig_ack_prev->next = elig_ack->next;
	else
		flow->head = elig_ack->next;

	skb_mark_not_on_list(elig_ack);

	return elig_ack;
}

static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
{
	avg -= avg >> shift;
	avg += sample >> shift;
	return avg;
}
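
/* The EWMA weight is 2^-shift: with shift = 8 a new sample contributes
 * 1/256 of its value, with shift = 2 a full quarter.  The callers below use
 * this asymmetrically (shift 2 when the sample would raise the average,
 * shift 8 when it would lower it), so estimates rise quickly and decay
 * slowly.
 */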

static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off)
{
	struct cake_sched_config *q = qd->config;

	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
		len -= off;

	if (qd->max_netlen < len)
		qd->max_netlen = len;
	if (qd->min_netlen > len)
		qd->min_netlen = len;

	len += q->rate_overhead;

	if (len < q->rate_mpu)
		len = q->rate_mpu;

	if (q->atm_mode == CAKE_ATM_ATM) {
		len += 47;
		len /= 48;
		len *= 53;
	} else if (q->atm_mode == CAKE_ATM_PTM) {
		/* Add one byte per 64 bytes or part thereof.
		 * This is conservative and easier to calculate than the
		 * precise value.
		 */
		len += (len + 63) / 64;
	}
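
	/* Worked example of the framing maths: a 100-byte frame in ATM mode
	 * becomes (100 + 47)/48 = 3 cells of 53 bytes = 159 bytes on the
	 * wire, while in PTM mode it gains (100 + 63)/64 = 2 extra bytes to
	 * approximate the 64b/65b line encoding.
	 */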

	if (qd->max_adjlen < len)
		qd->max_adjlen = len;
	if (qd->min_adjlen > len)
		qd->min_adjlen = len;

	return len;
}

static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	unsigned int hdr_len, last_len = 0;
	u32 off = skb_network_offset(skb);
	u16 segs = qdisc_pkt_segs(skb);
	u32 len = qdisc_pkt_len(skb);

	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);

	if (segs == 1)
		return cake_calc_overhead(q, len, off);

	/* borrowed from qdisc_pkt_len_segs_init() */
	if (!skb->encapsulation)
		hdr_len = skb_transport_offset(skb);
	else
		hdr_len = skb_inner_transport_offset(skb);

	/* + transport layer */
	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
				       SKB_GSO_TCPV6))) {
		const struct tcphdr *th;
		struct tcphdr _tcphdr;

		th = skb_header_pointer(skb, hdr_len,
					sizeof(_tcphdr), &_tcphdr);
		if (likely(th))
			hdr_len += __tcp_hdrlen(th);
	} else {
		struct udphdr _udphdr;

		if (skb_header_pointer(skb, hdr_len,
				       sizeof(_udphdr), &_udphdr))
			hdr_len += sizeof(struct udphdr);
	}

	len = shinfo->gso_size + hdr_len;
	last_len = skb->len - shinfo->gso_size * (segs - 1);

	return (cake_calc_overhead(q, len, off) * (segs - 1) +
		cake_calc_overhead(q, last_len, off));
}

static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
{
	struct cake_heap_entry ii = q->overflow_heap[i];
	struct cake_heap_entry jj = q->overflow_heap[j];

	q->overflow_heap[i] = jj;
	q->overflow_heap[j] = ii;

	q->tins[ii.t].overflow_idx[ii.b] = j;
	q->tins[jj.t].overflow_idx[jj.b] = i;
}

static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
{
	struct cake_heap_entry ii = q->overflow_heap[i];

	return q->tins[ii.t].backlogs[ii.b];
}

static void cake_heapify(struct cake_sched_data *q, u16 i)
{
	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
	u32 mb = cake_heap_get_backlog(q, i);
	u32 m = i;

	while (m < a) {
		u32 l = m + m + 1;
		u32 r = l + 1;

		if (l < a) {
			u32 lb = cake_heap_get_backlog(q, l);

			if (lb > mb) {
				m = l;
				mb = lb;
			}
		}

		if (r < a) {
			u32 rb = cake_heap_get_backlog(q, r);

			if (rb > mb) {
				m = r;
				mb = rb;
			}
		}

		if (m != i) {
			cake_heap_swap(q, i, m);
			i = m;
		} else {
			break;
		}
	}
}

static void cake_heapify_up(struct cake_sched_data *q, u16 i)
{
	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
		u16 p = (i - 1) >> 1;
		u32 ib = cake_heap_get_backlog(q, i);
		u32 pb = cake_heap_get_backlog(q, p);

		if (ib > pb) {
			cake_heap_swap(q, i, p);
			i = p;
		} else {
			break;
		}
	}
}

static int cake_advance_shaper(struct cake_sched_data *q,
			       struct cake_tin_data *b,
			       struct sk_buff *skb,
			       ktime_t now, bool drop)
{
	u32 len = get_cobalt_cb(skb)->adjusted_len;

	/* charge packet bandwidth to this tin
	 * and to the global shaper.
	 */
	if (q->rate_ns) {
		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
		u64 failsafe_dur = global_dur + (global_dur >> 1);
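		/* The failsafe schedule advances 1.5x more slowly than the
		 * nominal one, so it only gates dequeue if the main schedule
		 * somehow stalls; see the watchdog arming in cake_enqueue().
		 */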

		if (ktime_before(b->time_next_packet, now))
			b->time_next_packet = ktime_add_ns(b->time_next_packet,
							   tin_dur);

		else if (ktime_before(b->time_next_packet,
				      ktime_add_ns(now, tin_dur)))
			b->time_next_packet = ktime_add_ns(now, tin_dur);

		q->time_next_packet = ktime_add_ns(q->time_next_packet,
						   global_dur);
		if (!drop)
			q->failsafe_next_packet =
				ktime_add_ns(q->failsafe_next_packet,
					     failsafe_dur);
	}
	return len;
}

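/* Evict one packet from the longest queue in the qdisc.  The return value
 * packs the victim's location as (tin << 16) | flow index, which
 * cake_enqueue() compares against the queue it has just enqueued to.
 */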
static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	ktime_t now = ktime_get();
	u32 idx = 0, tin = 0, len;
	struct cake_heap_entry qq;
	struct cake_tin_data *b;
	struct cake_flow *flow;
	struct sk_buff *skb;

	if (!q->overflow_timeout) {
		int i;

		/* Build fresh max-heap */
		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
			cake_heapify(q, i);
	}
	q->overflow_timeout = 65535;

	/* select longest queue for pruning */
	qq = q->overflow_heap[0];
	tin = qq.t;
	idx = qq.b;

	b = &q->tins[tin];
	flow = &b->flows[idx];
	skb = dequeue_head(flow);
	if (unlikely(!skb)) {
		/* heap has gone wrong, rebuild it next time */
		q->overflow_timeout = 0;
		return idx + (tin << 16);
	}

	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
		b->unresponsive_flow_count++;

	len = qdisc_pkt_len(skb);
	q->buffer_used      -= skb->truesize;
	b->backlogs[idx]    -= len;
	b->tin_backlog      -= len;
	sch->qstats.backlog -= len;

	flow->dropped++;
	b->tin_dropped++;

	if (q->config->rate_flags & CAKE_FLAG_INGRESS)
		cake_advance_shaper(q, b, skb, now, true);

	qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
	sch->q.qlen--;

	cake_heapify(q, 0);

	return idx + (tin << 16);
}

static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
{
	const int offset = skb_network_offset(skb);
	u16 *buf, buf_;
	u8 dscp;

	switch (skb_protocol(skb, true)) {
	case htons(ETH_P_IP):
		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
		if (unlikely(!buf))
			return 0;

		/* ToS is in the second byte of iphdr */
		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;

		if (wash && dscp) {
			const int wlen = offset + sizeof(struct iphdr);

			if (!pskb_may_pull(skb, wlen) ||
			    skb_try_make_writable(skb, wlen))
				return 0;

			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
		}

		return dscp;

	case htons(ETH_P_IPV6):
		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
		if (unlikely(!buf))
			return 0;

		/* Traffic class is in the first and second bytes of ipv6hdr */
		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;

		if (wash && dscp) {
			const int wlen = offset + sizeof(struct ipv6hdr);

			if (!pskb_may_pull(skb, wlen) ||
			    skb_try_make_writable(skb, wlen))
				return 0;

			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
		}

		return dscp;

	case htons(ETH_P_ARP):
		return 0x38;  /* CS7 - Net Control */

	default:
		/* If there is no Diffserv field, treat as best-effort */
		return 0;
	}
}

static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
					     struct sk_buff *skb)
{
	struct cake_sched_data *qd = qdisc_priv(sch);
	struct cake_sched_config *q = qd->config;
	u32 tin, mark;
	bool wash;
	u8 dscp;

	/* Tin selection: Default to diffserv-based selection, allow overriding
	 * using firewall marks or skb->priority.  Call DSCP parsing early if
	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
	 */
	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
	if (wash)
		dscp = cake_handle_diffserv(skb, wash);

	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
		tin = 0;

	else if (mark && mark <= qd->tin_cnt)
		tin = qd->tin_order[mark - 1];

	else if (TC_H_MAJ(skb->priority) == sch->handle &&
		 TC_H_MIN(skb->priority) > 0 &&
		 TC_H_MIN(skb->priority) <= qd->tin_cnt)
		tin = qd->tin_order[TC_H_MIN(skb->priority) - 1];

	else {
		if (!wash)
			dscp = cake_handle_diffserv(skb, wash);
		tin = qd->tin_index[dscp];

		if (unlikely(tin >= qd->tin_cnt))
			tin = 0;
	}

	return &qd->tins[tin];
}

static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
			 struct sk_buff *skb, int flow_mode, int *qerr)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct tcf_proto *filter;
	struct tcf_result res;
	u16 flow = 0, host = 0;
	int result;

	filter = rcu_dereference_bh(q->filter_list);
	if (!filter)
		goto hash;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, NULL, filter, &res, false);

	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
			flow = TC_H_MIN(res.classid);
		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
			host = TC_H_MAJ(res.classid) >> 16;
	}
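	/* The minor part of the classid selects a specific flow queue and
	 * the major part a host bucket, both 1-based so that zero keeps the
	 * normal hashing behaviour; cake_hash() undoes the offset with its
	 * "override - 1" adjustments on the skip_hash path.
	 */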
hash:
	*t = cake_select_tin(sch, skb);
	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
}

static void cake_reconfigure(struct Qdisc *sch);

static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			struct sk_buff **to_free)
{
	u32 idx, tin, prev_qlen, prev_backlog, drop_id;
	struct cake_sched_data *q = qdisc_priv(sch);
	int len = qdisc_pkt_len(skb), ret;
	struct sk_buff *ack = NULL;
	ktime_t now = ktime_get();
	struct cake_tin_data *b;
	struct cake_flow *flow;
	bool same_flow = false;

	/* choose flow to insert into */
	idx = cake_classify(sch, &b, skb, q->config->flow_mode, &ret);
	if (idx == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	tin = (u32)(b - q->tins);
	idx--;
	flow = &b->flows[idx];

	/* ensure shaper state isn't stale */
	if (!b->tin_backlog) {
		if (ktime_before(b->time_next_packet, now))
			b->time_next_packet = now;

		if (!sch->q.qlen) {
			if (ktime_before(q->time_next_packet, now)) {
				q->failsafe_next_packet = now;
				q->time_next_packet = now;
			} else if (ktime_after(q->time_next_packet, now) &&
				   ktime_after(q->failsafe_next_packet, now)) {
				u64 next = min(ktime_to_ns(q->time_next_packet),
					       ktime_to_ns(
						       q->failsafe_next_packet));
				sch->qstats.overlimits++;
				qdisc_watchdog_schedule_ns(&q->watchdog, next);
			}
		}
	}

	if (unlikely(len > b->max_skblen))
		b->max_skblen = len;

	if (qdisc_pkt_segs(skb) > 1 && q->config->rate_flags & CAKE_FLAG_SPLIT_GSO) {
		struct sk_buff *segs, *nskb;
		netdev_features_t features = netif_skb_features(skb);
		unsigned int slen = 0, numsegs = 0;

		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
		if (IS_ERR_OR_NULL(segs))
			return qdisc_drop(skb, sch, to_free);

		skb_list_walk_safe(segs, segs, nskb) {
			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			qdisc_skb_cb(segs)->pkt_segs = 1;
			cobalt_set_enqueue_time(segs, now);
			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
									  segs);
			flow_queue_add(flow, segs);

			sch->q.qlen++;
			numsegs++;
			slen += segs->len;
			q->buffer_used += segs->truesize;
			b->packets++;
		}

		/* stats */
		b->bytes	    += slen;
		b->backlogs[idx]    += slen;
		b->tin_backlog	    += slen;
		sch->qstats.backlog += slen;
		q->avg_window_bytes += slen;
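		/* The GSO split replaced one skb of len bytes with numsegs
		 * segments totalling slen bytes; passing the (negative)
		 * differences as a "reduction" propagates the increase up to
		 * any parent qdiscs.
		 */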
1832 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1833 consume_skb(skb);
1834 } else {
1835 /* not splitting */
1836 int ack_pkt_len = 0;
1837
1838 cobalt_set_enqueue_time(skb, now);
1839 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1840 flow_queue_add(flow, skb);
1841
1842 if (q->config->ack_filter)
1843 ack = cake_ack_filter(q, flow);
1844
1845 if (ack) {
1846 b->ack_drops++;
1847 sch->qstats.drops++;
1848 ack_pkt_len = qdisc_pkt_len(ack);
1849 b->bytes += ack_pkt_len;
1850 q->buffer_used += skb->truesize - ack->truesize;
1851 if (q->config->rate_flags & CAKE_FLAG_INGRESS)
1852 cake_advance_shaper(q, b, ack, now, true);
1853
1854 qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len);
1855 consume_skb(ack);
1856 } else {
1857 sch->q.qlen++;
1858 q->buffer_used += skb->truesize;
1859 }
1860
1861 /* stats */
1862 b->packets++;
1863 b->bytes += len - ack_pkt_len;
1864 b->backlogs[idx] += len - ack_pkt_len;
1865 b->tin_backlog += len - ack_pkt_len;
1866 sch->qstats.backlog += len - ack_pkt_len;
1867 q->avg_window_bytes += len - ack_pkt_len;
1868 }
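/* Note on the accounting above: qdisc_tree_reduce_backlog() subtracts its
 * arguments from the parent qdiscs' counters, so the GSO-splitting path
 * passes (1 - numsegs) and (len - slen) -- negative deltas -- to report
 * that segmentation *increased* qlen and backlog.
 */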
1869
1870 if (q->overflow_timeout)
1871 cake_heapify_up(q, b->overflow_idx[idx]);
1872
1873 /* incoming bandwidth capacity estimate */
1874 if (q->config->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1875 u64 packet_interval = ktime_to_ns(ktime_sub(now, q->last_packet_time));
1877
1878 if (packet_interval > NSEC_PER_SEC)
1879 packet_interval = NSEC_PER_SEC;
1880
1881 /* filter out short-term bursts, eg. wifi aggregation */
1882 q->avg_packet_interval =
1883 cake_ewma(q->avg_packet_interval, packet_interval,
1884 packet_interval > q->avg_packet_interval ? 2 : 8);
1887
1888 q->last_packet_time = now;
1889
1890 if (packet_interval > q->avg_packet_interval) {
1891 u64 window_interval = ktime_to_ns(ktime_sub(now, q->avg_window_begin));
1894 u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1895
1896 b = div64_u64(b, window_interval);
1897 q->avg_peak_bandwidth =
1898 cake_ewma(q->avg_peak_bandwidth, b,
1899 b > q->avg_peak_bandwidth ? 2 : 8);
1900 q->avg_window_bytes = 0;
1901 q->avg_window_begin = now;
1902
1903 if (ktime_after(now,
1904 ktime_add_ms(q->last_reconfig_time,
1905 250))) {
1906 q->config->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1907 cake_reconfigure(sch);
1908 }
1909 }
1910 } else {
1911 q->avg_window_bytes = 0;
1912 q->last_packet_time = now;
1913 }
1914
1915 /* flowchain */
1916 if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1917 if (!flow->set) {
1918 list_add_tail(&flow->flowchain, &b->new_flows);
1919 } else {
1920 b->decaying_flow_count--;
1921 list_move_tail(&flow->flowchain, &b->new_flows);
1922 }
1923 flow->set = CAKE_SET_SPARSE;
1924 b->sparse_flow_count++;
1925
1926 flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode);
1927 } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1928 /* this flow was empty, accounted as a sparse flow, but actually
1929 * in the bulk rotation.
1930 */
1931 flow->set = CAKE_SET_BULK;
1932 b->sparse_flow_count--;
1933 b->bulk_flow_count++;
1934
1935 cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
1936 cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
1937 }
1938
1939 if (q->buffer_used > q->buffer_max_used)
1940 q->buffer_max_used = q->buffer_used;
1941
1942 if (q->buffer_used <= q->buffer_limit)
1943 return NET_XMIT_SUCCESS;
1944
1945 prev_qlen = sch->q.qlen;
1946 prev_backlog = sch->qstats.backlog;
1947
1948 while (q->buffer_used > q->buffer_limit) {
1949 drop_id = cake_drop(sch, to_free);
1950 if ((drop_id >> 16) == tin &&
1951 (drop_id & 0xFFFF) == idx)
1952 same_flow = true;
1953 }
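/* cake_drop() encodes its victim as (tin << 16) | flow_idx; matching that
 * against the flow we just enqueued to tells us whether the new packet
 * itself was among those dropped, in which case we must return
 * NET_XMIT_CN below instead of NET_XMIT_SUCCESS.
 */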
1954
1955 prev_qlen -= sch->q.qlen;
1956 prev_backlog -= sch->qstats.backlog;
1957 b->drop_overlimit += prev_qlen;
1958
1959 if (same_flow) {
1960 qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
1961 prev_backlog - len);
1962 return NET_XMIT_CN;
1963 }
1964 qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
1965 return NET_XMIT_SUCCESS;
1966 }
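/* The autorate estimator in cake_enqueue() leans on cake_ewma(), defined
 * earlier in this file as a cheap shift-based moving average. A minimal
 * illustrative restatement follows; ewma_sketch is a hypothetical name,
 * not part of this file:
 */
static inline u64 ewma_sketch(u64 avg, u64 sample, u32 shift)
{
	avg -= avg >> shift;	/* forget 1/2^shift of the old average */
	avg += sample >> shift;	/* blend in 1/2^shift of the new sample */
	return avg;
}
/* With shift == 2 a sample carries weight 1/4 (fast tracking); with
 * shift == 8 only 1/256 (slow drift). cake_enqueue() picks the shift per
 * sample, so the bandwidth estimate rises quickly but decays slowly.
 */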
1967
1968 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1969 {
1970 struct cake_sched_data *q = qdisc_priv(sch);
1971 struct cake_tin_data *b = &q->tins[q->cur_tin];
1972 struct cake_flow *flow = &b->flows[q->cur_flow];
1973 struct sk_buff *skb = NULL;
1974 u32 len;
1975
1976 if (flow->head) {
1977 skb = dequeue_head(flow);
1978 len = qdisc_pkt_len(skb);
1979 b->backlogs[q->cur_flow] -= len;
1980 b->tin_backlog -= len;
1981 sch->qstats.backlog -= len;
1982 q->buffer_used -= skb->truesize;
1983 sch->q.qlen--;
1984
1985 if (q->overflow_timeout)
1986 cake_heapify(q, b->overflow_idx[q->cur_flow]);
1987 }
1988 return skb;
1989 }
1990
1991 /* Discard leftover packets from a tin no longer in use. */
1992 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1993 {
1994 struct cake_sched_data *q = qdisc_priv(sch);
1995 struct sk_buff *skb;
1996
1997 q->cur_tin = tin;
1998 for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1999 while (!!(skb = cake_dequeue_one(sch)))
2000 kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
2001 }
2002
2003 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
2004 {
2005 struct cake_sched_data *q = qdisc_priv(sch);
2006 struct cake_tin_data *b = &q->tins[q->cur_tin];
2007 enum skb_drop_reason reason;
2008 ktime_t now = ktime_get();
2009 struct cake_flow *flow;
2010 struct list_head *head;
2011 bool first_flow = true;
2012 struct sk_buff *skb;
2013 u64 delay;
2014 u32 len;
2015
2016 if (q->config->is_shared && q->rate_ns &&
2017 now - q->last_checked_active >= q->config->sync_time) {
2018 struct net_device *dev = qdisc_dev(sch);
2019 struct cake_sched_data *other_priv;
2020 u64 new_rate = q->config->rate_bps;
2021 u64 other_qlen, other_last_active;
2022 struct Qdisc *other_sch;
2023 u32 num_active_qs = 1;
2024 unsigned int ntx;
2025
2026 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
2027 other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
2028 other_priv = qdisc_priv(other_sch);
2029
2030 if (other_priv == q)
2031 continue;
2032
2033 other_qlen = READ_ONCE(other_sch->q.qlen);
2034 other_last_active = READ_ONCE(other_priv->last_active);
2035
2036 if (other_qlen || other_last_active > q->last_checked_active)
2037 num_active_qs++;
2038 }
2039
2040 if (num_active_qs > 1)
2041 new_rate = div64_u64(q->config->rate_bps, num_active_qs);
2042
2043 cake_configure_rates(sch, new_rate, true);
2044 q->last_checked_active = now;
2045 q->active_queues = num_active_qs;
2046 }
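/* Illustrative numbers: with cake_mq on a four-queue device configured
 * for a rate equivalent to 1 Gbit/s, three busy sibling queues give
 * num_active_qs == 4, so each instance reshapes itself to a quarter of
 * the configured rate until the next sync_time window (200us by default,
 * see cake_config_init()).
 */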
2047
2048 begin:
2049 if (!sch->q.qlen)
2050 return NULL;
2051
2052 /* global hard shaper */
2053 if (ktime_after(q->time_next_packet, now) &&
2054 ktime_after(q->failsafe_next_packet, now)) {
2055 u64 next = min(ktime_to_ns(q->time_next_packet),
2056 ktime_to_ns(q->failsafe_next_packet));
2057
2058 sch->qstats.overlimits++;
2059 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2060 return NULL;
2061 }
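/* Note: dequeue stalls only while *both* timers are in the future; the
 * failsafe timer guarantees forward progress even if the main shaper
 * schedule has been pushed far ahead.
 */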
2062
2063 /* Choose a class to work on. */
2064 if (!q->rate_ns) {
2065 /* In unlimited mode, can't rely on shaper timings, just balance
2066 * with DRR
2067 */
2068 bool wrapped = false, empty = true;
2069
2070 while (b->tin_deficit < 0 ||
2071 !(b->sparse_flow_count + b->bulk_flow_count)) {
2072 if (b->tin_deficit <= 0)
2073 b->tin_deficit += b->tin_quantum;
2074 if (b->sparse_flow_count + b->bulk_flow_count)
2075 empty = false;
2076
2077 q->cur_tin++;
2078 b++;
2079 if (q->cur_tin >= q->tin_cnt) {
2080 q->cur_tin = 0;
2081 b = q->tins;
2082
2083 if (wrapped) {
2084 /* It's possible for q->qlen to be
2085 * nonzero when we actually have no
2086 * packets anywhere.
2087 */
2088 if (empty)
2089 return NULL;
2090 } else {
2091 wrapped = true;
2092 }
2093 }
2094 }
2095 } else {
2096 /* In shaped mode, choose:
2097 * - Highest-priority tin with queue and meeting schedule, or
2098 * - The earliest-scheduled tin with queue.
2099 */
2100 ktime_t best_time = KTIME_MAX;
2101 int tin, best_tin = 0;
2102
2103 for (tin = 0; tin < q->tin_cnt; tin++) {
2104 b = q->tins + tin;
2105 if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2106 ktime_t time_to_pkt = ktime_sub(b->time_next_packet, now);
2108
2109 if (ktime_to_ns(time_to_pkt) <= 0 ||
2110 ktime_compare(time_to_pkt,
2111 best_time) <= 0) {
2112 best_time = time_to_pkt;
2113 best_tin = tin;
2114 }
2115 }
2116 }
2117
2118 q->cur_tin = best_tin;
2119 b = q->tins + best_tin;
2120
2121 /* No point in going further if no packets to deliver. */
2122 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2123 return NULL;
2124 }
2125
2126 retry:
2127 /* service this class */
2128 head = &b->decaying_flows;
2129 if (!first_flow || list_empty(head)) {
2130 head = &b->new_flows;
2131 if (list_empty(head)) {
2132 head = &b->old_flows;
2133 if (unlikely(list_empty(head))) {
2134 head = &b->decaying_flows;
2135 if (unlikely(list_empty(head)))
2136 goto begin;
2137 }
2138 }
2139 }
2140 flow = list_first_entry(head, struct cake_flow, flowchain);
2141 q->cur_flow = flow - b->flows;
2142 first_flow = false;
2143
2144 /* flow isolation (DRR++) */
2145 if (flow->deficit <= 0) {
2146 /* Keep all flows with deficits out of the sparse and decaying
2147 * rotations. No non-empty flow can go into the decaying
2148 * rotation, so they can't get deficits
2149 */
2150 if (flow->set == CAKE_SET_SPARSE) {
2151 if (flow->head) {
2152 b->sparse_flow_count--;
2153 b->bulk_flow_count++;
2154
2155 cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
2156 cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
2157
2158 flow->set = CAKE_SET_BULK;
2159 } else {
2160 /* we've moved it to the bulk rotation for
2161 * correct deficit accounting but we still want
2162 * to count it as a sparse flow, not a bulk one.
2163 */
2164 flow->set = CAKE_SET_SPARSE_WAIT;
2165 }
2166 }
2167
2168 flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode);
2169 list_move_tail(&flow->flowchain, &b->old_flows);
2170
2171 goto retry;
2172 }
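/* DRR++ accounting sketch: a flow may transmit while its deficit is
 * positive; each dequeued packet later charges its adjusted length
 * against the deficit. With a 1514-byte flow_quantum and MTU-sized
 * packets, a bulk flow therefore sends roughly one packet per rotation
 * of the old_flows list.
 */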
2173
2174 /* Retrieve a packet via the AQM */
2175 while (1) {
2176 skb = cake_dequeue_one(sch);
2177 if (!skb) {
2178 /* this queue was actually empty */
2179 if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2180 b->unresponsive_flow_count--;
2181
2182 if (flow->cvars.p_drop || flow->cvars.count ||
2183 ktime_before(now, flow->cvars.drop_next)) {
2184 /* keep in the flowchain until the state has
2185 * decayed to rest
2186 */
2187 list_move_tail(&flow->flowchain,
2188 &b->decaying_flows);
2189 if (flow->set == CAKE_SET_BULK) {
2190 b->bulk_flow_count--;
2191
2192 cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
2193 cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
2194
2195 b->decaying_flow_count++;
2196 } else if (flow->set == CAKE_SET_SPARSE ||
2197 flow->set == CAKE_SET_SPARSE_WAIT) {
2198 b->sparse_flow_count--;
2199 b->decaying_flow_count++;
2200 }
2201 flow->set = CAKE_SET_DECAYING;
2202 } else {
2203 /* remove empty queue from the flowchain */
2204 list_del_init(&flow->flowchain);
2205 if (flow->set == CAKE_SET_SPARSE ||
2206 flow->set == CAKE_SET_SPARSE_WAIT)
2207 b->sparse_flow_count--;
2208 else if (flow->set == CAKE_SET_BULK) {
2209 b->bulk_flow_count--;
2210
2211 cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
2212 cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
2213 } else
2214 b->decaying_flow_count--;
2215
2216 flow->set = CAKE_SET_NONE;
2217 }
2218 goto begin;
2219 }
2220
2221 reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2222 (b->bulk_flow_count *
2223 !!(q->config->rate_flags &
2224 CAKE_FLAG_INGRESS)));
2225 /* Last packet in queue may be marked, shouldn't be dropped */
2226 if (reason == SKB_NOT_DROPPED_YET || !flow->head)
2227 break;
2228
2229 /* drop this packet, get another one */
2230 if (q->config->rate_flags & CAKE_FLAG_INGRESS) {
2231 len = cake_advance_shaper(q, b, skb,
2232 now, true);
2233 flow->deficit -= len;
2234 b->tin_deficit -= len;
2235 }
2236 flow->dropped++;
2237 b->tin_dropped++;
2238 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2239 qdisc_qstats_drop(sch);
2240 qdisc_dequeue_drop(sch, skb, reason);
2241 if (q->config->rate_flags & CAKE_FLAG_INGRESS)
2242 goto retry;
2243 }
2244
2245 b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2246 qdisc_bstats_update(sch, skb);
2247 WRITE_ONCE(q->last_active, now);
2248
2249 /* collect delay stats */
2250 delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2251 b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2252 b->peak_delay = cake_ewma(b->peak_delay, delay,
2253 delay > b->peak_delay ? 2 : 8);
2254 b->base_delay = cake_ewma(b->base_delay, delay,
2255 delay < b->base_delay ? 2 : 8);
2256
2257 len = cake_advance_shaper(q, b, skb, now, false);
2258 flow->deficit -= len;
2259 b->tin_deficit -= len;
2260
2261 if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2262 u64 next = min(ktime_to_ns(q->time_next_packet),
2263 ktime_to_ns(q->failsafe_next_packet));
2264
2265 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2266 } else if (!sch->q.qlen) {
2267 int i;
2268
2269 for (i = 0; i < q->tin_cnt; i++) {
2270 if (q->tins[i].decaying_flow_count) {
2271 ktime_t next = ktime_add_ns(now, q->tins[i].cparams.target);
2274
2275 qdisc_watchdog_schedule_ns(&q->watchdog,
2276 ktime_to_ns(next));
2277 break;
2278 }
2279 }
2280 }
2281
2282 if (q->overflow_timeout)
2283 q->overflow_timeout--;
2284
2285 return skb;
2286 }
2287
2288 static void cake_reset(struct Qdisc *sch)
2289 {
2290 struct cake_sched_data *q = qdisc_priv(sch);
2291 u32 c;
2292
2293 if (!q->tins)
2294 return;
2295
2296 for (c = 0; c < CAKE_MAX_TINS; c++)
2297 cake_clear_tin(sch, c);
2298 }
2299
2300 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2301 [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 },
2302 [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2303 [TCA_CAKE_ATM] = { .type = NLA_U32 },
2304 [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 },
2305 [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 },
2306 [TCA_CAKE_RTT] = { .type = NLA_U32 },
2307 [TCA_CAKE_TARGET] = { .type = NLA_U32 },
2308 [TCA_CAKE_AUTORATE] = { .type = NLA_U32 },
2309 [TCA_CAKE_MEMORY] = { .type = NLA_U32 },
2310 [TCA_CAKE_NAT] = { .type = NLA_U32 },
2311 [TCA_CAKE_RAW] = { .type = NLA_U32 },
2312 [TCA_CAKE_WASH] = { .type = NLA_U32 },
2313 [TCA_CAKE_MPU] = { .type = NLA_U32 },
2314 [TCA_CAKE_INGRESS] = { .type = NLA_U32 },
2315 [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 },
2316 [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 },
2317 [TCA_CAKE_FWMARK] = { .type = NLA_U32 },
2318 };
2319
2320 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2321 u64 target_ns, u64 rtt_est_ns)
2322 {
2323 /* convert byte-rate into time-per-byte
2324 * so it will always unwedge in reasonable time.
2325 */
2326 static const u64 MIN_RATE = 64;
2327 u32 byte_target = mtu;
2328 u64 byte_target_ns;
2329 u8 rate_shft = 0;
2330 u64 rate_ns = 0;
2331
2332 b->flow_quantum = 1514;
2333 if (rate) {
2334 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2335 rate_shft = 34;
2336 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2337 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2338 while (!!(rate_ns >> 34)) {
2339 rate_ns >>= 1;
2340 rate_shft--;
2341 }
2342 } /* else unlimited, ie. zero delay */
2343
2344 b->tin_rate_bps = rate;
2345 b->tin_rate_ns = rate_ns;
2346 b->tin_rate_shft = rate_shft;
2347
2348 if (mtu == 0)
2349 return;
2350
2351 byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2352
2353 b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2354 b->cparams.interval = max(rtt_est_ns +
2355 b->cparams.target - target_ns,
2356 b->cparams.target * 2);
2357 b->cparams.mtu_time = byte_target_ns;
2358 b->cparams.p_inc = 1 << 24; /* 1/256 */
2359 b->cparams.p_dec = 1 << 20; /* 1/4096 */
2360 }
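/* Worked example of the conversion above (illustrative numbers): for a
 * shaped rate of 12.5 Mbyte/s (100 Mbit/s), time-per-byte is 80 ns, so
 *
 *	rate_ns = (NSEC_PER_SEC << 34) / 12500000 = 80 << 34
 *
 * which overflows 34 bits; seven halvings leave rate_ns = 80 << 27 with
 * rate_shft = 27, and the shaper then charges a packet of len bytes as
 * (len * rate_ns) >> rate_shft = len * 80 nanoseconds.
 */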
2361
2362 static int cake_config_besteffort(struct Qdisc *sch, u64 rate, u32 mtu)
2363 {
2364 struct cake_sched_data *q = qdisc_priv(sch);
2365 struct cake_tin_data *b = &q->tins[0];
2366
2367 q->tin_cnt = 1;
2368
2369 q->tin_index = besteffort;
2370 q->tin_order = normal_order;
2371
2372 cake_set_rate(b, rate, mtu,
2373 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2374 b->tin_quantum = 65535;
2375
2376 return 0;
2377 }
2378
2379 static int cake_config_precedence(struct Qdisc *sch, u64 rate, u32 mtu)
2380 {
2381 /* convert high-level (user visible) parameters into internal format */
2382 struct cake_sched_data *q = qdisc_priv(sch);
2383 u32 quantum = 256;
2384 u32 i;
2385
2386 q->tin_cnt = 8;
2387 q->tin_index = precedence;
2388 q->tin_order = normal_order;
2389
2390 for (i = 0; i < q->tin_cnt; i++) {
2391 struct cake_tin_data *b = &q->tins[i];
2392
2393 cake_set_rate(b, rate, mtu, us_to_ns(q->config->target),
2394 us_to_ns(q->config->interval));
2395
2396 b->tin_quantum = max_t(u16, 1U, quantum);
2397
2398 /* calculate next class's parameters */
2399 rate *= 7;
2400 rate >>= 3;
2401
2402 quantum *= 7;
2403 quantum >>= 3;
2404 }
2405
2406 return 0;
2407 }
2408
2409 /* List of known Diffserv codepoints:
2410 *
2411 * Default Forwarding (DF/CS0) - Best Effort
2412 * Max Throughput (TOS2)
2413 * Min Delay (TOS4)
2414 * LLT "La" (TOS5)
2415 * Assured Forwarding 1 (AF1x) - x3
2416 * Assured Forwarding 2 (AF2x) - x3
2417 * Assured Forwarding 3 (AF3x) - x3
2418 * Assured Forwarding 4 (AF4x) - x3
2419 * Precedence Class 1 (CS1)
2420 * Precedence Class 2 (CS2)
2421 * Precedence Class 3 (CS3)
2422 * Precedence Class 4 (CS4)
2423 * Precedence Class 5 (CS5)
2424 * Precedence Class 6 (CS6)
2425 * Precedence Class 7 (CS7)
2426 * Voice Admit (VA)
2427 * Expedited Forwarding (EF)
2428 * Lower Effort (LE)
2429 *
2430 * Total 26 codepoints.
2431 */
2432
2433 /* List of traffic classes in RFC 4594, updated by RFC 8622:
2434 * (roughly descending order of contended priority)
2435 * (roughly ascending order of uncontended throughput)
2436 *
2437 * Network Control (CS6,CS7) - routing traffic
2438 * Telephony (EF,VA) - aka. VoIP streams
2439 * Signalling (CS5) - VoIP setup
2440 * Multimedia Conferencing (AF4x) - aka. video calls
2441 * Realtime Interactive (CS4) - eg. games
2442 * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch
2443 * Broadcast Video (CS3)
2444 * Low-Latency Data (AF2x,TOS4) - eg. database
2445 * Ops, Admin, Management (CS2) - eg. ssh
2446 * Standard Service (DF & unrecognised codepoints)
2447 * High-Throughput Data (AF1x,TOS2) - eg. web traffic
2448 * Low-Priority Data (LE,CS1) - eg. BitTorrent
2449 *
2450 * Total 12 traffic classes.
2451 */
2452
2453 static int cake_config_diffserv8(struct Qdisc *sch, u64 rate, u32 mtu)
2454 {
2455 /* Pruned list of traffic classes for typical applications:
2456 *
2457 * Network Control (CS6, CS7)
2458 * Minimum Latency (EF, VA, CS5, CS4)
2459 * Interactive Shell (CS2)
2460 * Low Latency Transactions (AF2x, TOS4)
2461 * Video Streaming (AF4x, AF3x, CS3)
2462 * Bog Standard (DF etc.)
2463 * High Throughput (AF1x, TOS2, CS1)
2464 * Background Traffic (LE)
2465 *
2466 * Total 8 traffic classes.
2467 */
2468
2469 struct cake_sched_data *q = qdisc_priv(sch);
2470 u32 quantum = 256;
2471 u32 i;
2472
2473 q->tin_cnt = 8;
2474
2475 /* codepoint to class mapping */
2476 q->tin_index = diffserv8;
2477 q->tin_order = normal_order;
2478
2479 /* class characteristics */
2480 for (i = 0; i < q->tin_cnt; i++) {
2481 struct cake_tin_data *b = &q->tins[i];
2482
2483 cake_set_rate(b, rate, mtu, us_to_ns(q->config->target),
2484 us_to_ns(q->config->interval));
2485
2486 b->tin_quantum = max_t(u16, 1U, quantum);
2487
2488 /* calculate next class's parameters */
2489 rate *= 7;
2490 rate >>= 3;
2491
2492 quantum *= 7;
2493 quantum >>= 3;
2494 }
2495
2496 return 0;
2497 }
2498
2499 static int cake_config_diffserv4(struct Qdisc *sch, u64 rate, u32 mtu)
2500 {
2501 /* Further pruned list of traffic classes for four-class system:
2502 *
2503 * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4)
2504 * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2505 * Best Effort (DF, AF1x, TOS2, and those not specified)
2506 * Background Traffic (LE, CS1)
2507 *
2508 * Total 4 traffic classes.
2509 */
2510
2511 struct cake_sched_data *q = qdisc_priv(sch);
2512 u32 quantum = 1024;
2513
2514 q->tin_cnt = 4;
2515
2516 /* codepoint to class mapping */
2517 q->tin_index = diffserv4;
2518 q->tin_order = bulk_order;
2519
2520 /* class characteristics */
2521 cake_set_rate(&q->tins[0], rate, mtu,
2522 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2523 cake_set_rate(&q->tins[1], rate >> 4, mtu,
2524 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2525 cake_set_rate(&q->tins[2], rate >> 1, mtu,
2526 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2527 cake_set_rate(&q->tins[3], rate >> 2, mtu,
2528 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2529
2530 /* bandwidth-sharing weights */
2531 q->tins[0].tin_quantum = quantum;
2532 q->tins[1].tin_quantum = quantum >> 4;
2533 q->tins[2].tin_quantum = quantum >> 1;
2534 q->tins[3].tin_quantum = quantum >> 2;
2535
2536 return 0;
2537 }
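/* Illustrative shares: at a total rate of 16 Mbyte/s the tin thresholds
 * come out as 16, 1 (>> 4), 8 (>> 1) and 4 (>> 2) Mbyte/s, mirrored by
 * the DRR weights 1024, 64, 512 and 256. The thresholds are soft: a tin
 * exceeding its threshold only loses its priority boost; it is not
 * hard-capped.
 */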
2538
2539 static int cake_config_diffserv3(struct Qdisc *sch, u64 rate, u32 mtu)
2540 {
2541 /* Simplified Diffserv structure with 3 tins.
2542 * Latency Sensitive (CS7, CS6, EF, VA, TOS4)
2543 * Best Effort
2544 * Low Priority (LE, CS1)
2545 */
2546 struct cake_sched_data *q = qdisc_priv(sch);
2547 u32 quantum = 1024;
2548
2549 q->tin_cnt = 3;
2550
2551 /* codepoint to class mapping */
2552 q->tin_index = diffserv3;
2553 q->tin_order = bulk_order;
2554
2555 /* class characteristics */
2556 cake_set_rate(&q->tins[0], rate, mtu,
2557 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2558 cake_set_rate(&q->tins[1], rate >> 4, mtu,
2559 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2560 cake_set_rate(&q->tins[2], rate >> 2, mtu,
2561 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2562
2563 /* bandwidth-sharing weights */
2564 q->tins[0].tin_quantum = quantum;
2565 q->tins[1].tin_quantum = quantum >> 4;
2566 q->tins[2].tin_quantum = quantum >> 2;
2567
2568 return 0;
2569 }
2570
2571 static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust)
2572 {
2573 u32 mtu = likely(rate_adjust) ? 0 : psched_mtu(qdisc_dev(sch));
2574 struct cake_sched_data *qd = qdisc_priv(sch);
2575 struct cake_sched_config *q = qd->config;
2576 int c, ft;
2577
2578 switch (q->tin_mode) {
2579 case CAKE_DIFFSERV_BESTEFFORT:
2580 ft = cake_config_besteffort(sch, rate, mtu);
2581 break;
2582
2583 case CAKE_DIFFSERV_PRECEDENCE:
2584 ft = cake_config_precedence(sch, rate, mtu);
2585 break;
2586
2587 case CAKE_DIFFSERV_DIFFSERV8:
2588 ft = cake_config_diffserv8(sch, rate, mtu);
2589 break;
2590
2591 case CAKE_DIFFSERV_DIFFSERV4:
2592 ft = cake_config_diffserv4(sch, rate, mtu);
2593 break;
2594
2595 case CAKE_DIFFSERV_DIFFSERV3:
2596 default:
2597 ft = cake_config_diffserv3(sch, rate, mtu);
2598 break;
2599 }
2600
2601 for (c = qd->tin_cnt; c < CAKE_MAX_TINS; c++) {
2602 cake_clear_tin(sch, c);
2603 qd->tins[c].cparams.mtu_time = qd->tins[ft].cparams.mtu_time;
2604 }
2605
2606 qd->rate_ns = qd->tins[ft].tin_rate_ns;
2607 qd->rate_shft = qd->tins[ft].tin_rate_shft;
2608 }
2609
2610 static void cake_reconfigure(struct Qdisc *sch)
2611 {
2612 struct cake_sched_data *qd = qdisc_priv(sch);
2613 struct cake_sched_config *q = qd->config;
2614
2615 cake_configure_rates(sch, qd->config->rate_bps, false);
2616
2617 if (q->buffer_config_limit) {
2618 qd->buffer_limit = q->buffer_config_limit;
2619 } else if (q->rate_bps) {
2620 u64 t = q->rate_bps * q->interval;
2621
2622 do_div(t, USEC_PER_SEC / 4);
2623 qd->buffer_limit = max_t(u32, t, 4U << 20);
2624 } else {
2625 qd->buffer_limit = ~0;
2626 }
2627
2628 sch->flags &= ~TCQ_F_CAN_BYPASS;
2629
2630 qd->buffer_limit = min(qd->buffer_limit,
2631 max(sch->limit * psched_mtu(qdisc_dev(sch)),
2632 q->buffer_config_limit));
2633 }
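/* Buffer sizing example (illustrative): at 12.5 Mbyte/s with the default
 * 100 ms interval, t = 12500000 * 100000 / (USEC_PER_SEC / 4) = 5 Mbyte,
 * i.e. four intervals' worth of data, floored at 4 Mbyte and then clamped
 * above by sch->limit packets of device MTU.
 */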
2634
2635 static int cake_config_change(struct cake_sched_config *q, struct nlattr *opt,
2636 struct netlink_ext_ack *extack, bool *overhead_changed)
2637 {
2638 struct nlattr *tb[TCA_CAKE_MAX + 1];
2639 u16 rate_flags = q->rate_flags;
2640 u8 flow_mode = q->flow_mode;
2641 int err;
2642
2643 err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2644 extack);
2645 if (err < 0)
2646 return err;
2647
2648 if (tb[TCA_CAKE_NAT]) {
2649 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2650 flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2651 flow_mode |= CAKE_FLOW_NAT_FLAG *
2652 !!nla_get_u32(tb[TCA_CAKE_NAT]);
2653 #else
2654 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2655 "No conntrack support in kernel");
2656 return -EOPNOTSUPP;
2657 #endif
2658 }
2659
2660 if (tb[TCA_CAKE_AUTORATE]) {
2661 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) {
2662 if (q->is_shared) {
2663 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_AUTORATE],
2664 "Can't use autorate-ingress with cake_mq");
2665 return -EOPNOTSUPP;
2666 }
2667 rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2668 } else {
2669 rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2670 }
2671 }
2672
2673 if (tb[TCA_CAKE_BASE_RATE64])
2674 WRITE_ONCE(q->rate_bps,
2675 nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));
2676
2677 if (tb[TCA_CAKE_DIFFSERV_MODE])
2678 WRITE_ONCE(q->tin_mode,
2679 nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));
2680
2681 if (tb[TCA_CAKE_WASH]) {
2682 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2683 rate_flags |= CAKE_FLAG_WASH;
2684 else
2685 rate_flags &= ~CAKE_FLAG_WASH;
2686 }
2687
2688 if (tb[TCA_CAKE_FLOW_MODE])
2689 flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) |
2690 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2691 CAKE_FLOW_MASK));
2692
2693 if (tb[TCA_CAKE_ATM])
2694 WRITE_ONCE(q->atm_mode,
2695 nla_get_u32(tb[TCA_CAKE_ATM]));
2696
2697 if (tb[TCA_CAKE_OVERHEAD]) {
2698 WRITE_ONCE(q->rate_overhead,
2699 nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
2700 rate_flags |= CAKE_FLAG_OVERHEAD;
2701 *overhead_changed = true;
2702 }
2703
2704 if (tb[TCA_CAKE_RAW]) {
2705 rate_flags &= ~CAKE_FLAG_OVERHEAD;
2706 *overhead_changed = true;
2707 }
2708
2709 if (tb[TCA_CAKE_MPU])
2710 WRITE_ONCE(q->rate_mpu,
2711 nla_get_u32(tb[TCA_CAKE_MPU]));
2712
2713 if (tb[TCA_CAKE_RTT]) {
2714 u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2715
2716 WRITE_ONCE(q->interval, max(interval, 1U));
2717 }
2718
2719 if (tb[TCA_CAKE_TARGET]) {
2720 u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2721
2722 WRITE_ONCE(q->target, max(target, 1U));
2723 }
2724
2725 if (tb[TCA_CAKE_INGRESS]) {
2726 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2727 rate_flags |= CAKE_FLAG_INGRESS;
2728 else
2729 rate_flags &= ~CAKE_FLAG_INGRESS;
2730 }
2731
2732 if (tb[TCA_CAKE_ACK_FILTER])
2733 WRITE_ONCE(q->ack_filter,
2734 nla_get_u32(tb[TCA_CAKE_ACK_FILTER]));
2735
2736 if (tb[TCA_CAKE_MEMORY])
2737 WRITE_ONCE(q->buffer_config_limit,
2738 nla_get_u32(tb[TCA_CAKE_MEMORY]));
2739
2740 if (tb[TCA_CAKE_SPLIT_GSO]) {
2741 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2742 rate_flags |= CAKE_FLAG_SPLIT_GSO;
2743 else
2744 rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2745 }
2746
2747 if (tb[TCA_CAKE_FWMARK]) {
2748 WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK]));
2749 WRITE_ONCE(q->fwmark_shft,
2750 q->fwmark_mask ? __ffs(q->fwmark_mask) : 0);
2751 }
2752
2753 WRITE_ONCE(q->rate_flags, rate_flags);
2754 WRITE_ONCE(q->flow_mode, flow_mode);
2755
2756 return 0;
2757 }
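/* The attributes parsed above map one-to-one onto tc(8) keywords. An
 * illustrative egress configuration:
 *
 *	tc qdisc replace dev eth0 root cake bandwidth 20Mbit \
 *		diffserv4 nat overhead 18 mpu 64
 *
 * supplies TCA_CAKE_BASE_RATE64, TCA_CAKE_DIFFSERV_MODE, TCA_CAKE_NAT,
 * TCA_CAKE_OVERHEAD (which also sets CAKE_FLAG_OVERHEAD) and
 * TCA_CAKE_MPU.
 */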
2758
2759 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2760 struct netlink_ext_ack *extack)
2761 {
2762 struct cake_sched_data *qd = qdisc_priv(sch);
2763 struct cake_sched_config *q = qd->config;
2764 bool overhead_changed = false;
2765 int ret;
2766
2767 if (q->is_shared) {
2768 NL_SET_ERR_MSG(extack, "can't reconfigure cake_mq sub-qdiscs");
2769 return -EOPNOTSUPP;
2770 }
2771
2772 ret = cake_config_change(q, opt, extack, &overhead_changed);
2773 if (ret)
2774 return ret;
2775
2776 if (overhead_changed) {
2777 qd->max_netlen = 0;
2778 qd->max_adjlen = 0;
2779 qd->min_netlen = ~0;
2780 qd->min_adjlen = ~0;
2781 }
2782
2783 if (qd->tins) {
2784 sch_tree_lock(sch);
2785 cake_reconfigure(sch);
2786 sch_tree_unlock(sch);
2787 }
2788
2789 return 0;
2790 }
2791
2792 static void cake_destroy(struct Qdisc *sch)
2793 {
2794 struct cake_sched_data *q = qdisc_priv(sch);
2795
2796 qdisc_watchdog_cancel(&q->watchdog);
2797 tcf_block_put(q->block);
2798 kvfree(q->tins);
2799 }
2800
2801 static void cake_config_init(struct cake_sched_config *q, bool is_shared)
2802 {
2803 q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2804 q->flow_mode = CAKE_FLOW_TRIPLE;
2805
2806 q->rate_bps = 0; /* unlimited by default */
2807
2808 q->interval = 100000; /* 100ms default */
2809 q->target = 5000; /* 5ms: codel RFC argues
2810 * for 5 to 10% of interval
2811 */
2812 q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2813 q->is_shared = is_shared;
2814 q->sync_time = 200 * NSEC_PER_USEC;
2815 }
2816
2817 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2818 struct netlink_ext_ack *extack)
2819 {
2820 struct cake_sched_data *qd = qdisc_priv(sch);
2821 struct cake_sched_config *q = &qd->initial_config;
2822 int i, j, err;
2823
2824 cake_config_init(q, false);
2825
2826 sch->limit = 10240;
2827 sch->flags |= TCQ_F_DEQUEUE_DROPS;
2828
2829 qd->cur_tin = 0;
2830 qd->cur_flow = 0;
2831 qd->config = q;
2832
2833 qdisc_watchdog_init(&qd->watchdog, sch);
2834
2835 if (opt) {
2836 err = cake_change(sch, opt, extack);
2837 if (err)
2838 return err;
2839 }
2840
2841 err = tcf_block_get(&qd->block, &qd->filter_list, sch, extack);
2842 if (err)
2843 return err;
2844
2845 quantum_div[0] = ~0;
2846 for (i = 1; i <= CAKE_QUEUES; i++)
2847 quantum_div[i] = 65535 / i;
2848
2849 qd->tins = kvzalloc_objs(struct cake_tin_data, CAKE_MAX_TINS);
2850 if (!qd->tins)
2851 return -ENOMEM;
2852
2853 for (i = 0; i < CAKE_MAX_TINS; i++) {
2854 struct cake_tin_data *b = qd->tins + i;
2855
2856 INIT_LIST_HEAD(&b->new_flows);
2857 INIT_LIST_HEAD(&b->old_flows);
2858 INIT_LIST_HEAD(&b->decaying_flows);
2859 b->sparse_flow_count = 0;
2860 b->bulk_flow_count = 0;
2861 b->decaying_flow_count = 0;
2862
2863 for (j = 0; j < CAKE_QUEUES; j++) {
2864 struct cake_flow *flow = b->flows + j;
2865 u32 k = j * CAKE_MAX_TINS + i;
2866
2867 INIT_LIST_HEAD(&flow->flowchain);
2868 cobalt_vars_init(&flow->cvars);
2869
2870 qd->overflow_heap[k].t = i;
2871 qd->overflow_heap[k].b = j;
2872 b->overflow_idx[j] = k;
2873 }
2874 }
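/* Initial heap layout: the (tin i, flow j) pair starts at heap slot
 * k = j * CAKE_MAX_TINS + i, with overflow_idx[] holding the reverse
 * mapping; later heapify operations swap entries and keep overflow_idx
 * in sync.
 */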
2875
2876 cake_reconfigure(sch);
2877 qd->avg_peak_bandwidth = q->rate_bps;
2878 qd->min_netlen = ~0;
2879 qd->min_adjlen = ~0;
2880 qd->active_queues = 0;
2881 qd->last_checked_active = 0;
2882
2883 return 0;
2884 }
2885
2886 static void cake_config_replace(struct Qdisc *sch, struct cake_sched_config *cfg)
2887 {
2888 struct cake_sched_data *qd = qdisc_priv(sch);
2889
2890 qd->config = cfg;
2891 cake_reconfigure(sch);
2892 }
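/* cake_config_replace() is how cake_mq points every per-txq cake
 * instance at the single shared configuration owned by the parent qdisc
 * (see cake_mq_init() below).
 */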
2893
2894 static int cake_config_dump(struct cake_sched_config *q, struct sk_buff *skb)
2895 {
2896 struct nlattr *opts;
2897 u16 rate_flags;
2898 u8 flow_mode;
2899
2900 opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2901 if (!opts)
2902 goto nla_put_failure;
2903
2904 if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64,
2905 READ_ONCE(q->rate_bps), TCA_CAKE_PAD))
2906 goto nla_put_failure;
2907
2908 flow_mode = READ_ONCE(q->flow_mode);
2909 if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK))
2910 goto nla_put_failure;
2911
2912 if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval)))
2913 goto nla_put_failure;
2914
2915 if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target)))
2916 goto nla_put_failure;
2917
2918 if (nla_put_u32(skb, TCA_CAKE_MEMORY,
2919 READ_ONCE(q->buffer_config_limit)))
2920 goto nla_put_failure;
2921
2922 rate_flags = READ_ONCE(q->rate_flags);
2923 if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2924 !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2925 goto nla_put_failure;
2926
2927 if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2928 !!(rate_flags & CAKE_FLAG_INGRESS)))
2929 goto nla_put_failure;
2930
2931 if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter)))
2932 goto nla_put_failure;
2933
2934 if (nla_put_u32(skb, TCA_CAKE_NAT,
2935 !!(flow_mode & CAKE_FLOW_NAT_FLAG)))
2936 goto nla_put_failure;
2937
2938 if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode)))
2939 goto nla_put_failure;
2940
2941 if (nla_put_u32(skb, TCA_CAKE_WASH,
2942 !!(rate_flags & CAKE_FLAG_WASH)))
2943 goto nla_put_failure;
2944
2945 if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead)))
2946 goto nla_put_failure;
2947
2948 if (!(rate_flags & CAKE_FLAG_OVERHEAD))
2949 if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2950 goto nla_put_failure;
2951
2952 if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode)))
2953 goto nla_put_failure;
2954
2955 if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu)))
2956 goto nla_put_failure;
2957
2958 if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2959 !!(rate_flags & CAKE_FLAG_SPLIT_GSO)))
2960 goto nla_put_failure;
2961
2962 if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask)))
2963 goto nla_put_failure;
2964
2965 return nla_nest_end(skb, opts);
2966
2967 nla_put_failure:
2968 return -1;
2969 }
2970
2971 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2972 {
2973 struct cake_sched_data *qd = qdisc_priv(sch);
2974
2975 return cake_config_dump(qd->config, skb);
2976 }
2977
2978 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2979 {
2980 struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2981 struct cake_sched_data *q = qdisc_priv(sch);
2982 struct nlattr *tstats, *ts;
2983 int i;
2984
2985 if (!stats)
2986 return -1;
2987
2988 #define PUT_STAT_U32(attr, data) do { \
2989 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2990 goto nla_put_failure; \
2991 } while (0)
2992 #define PUT_STAT_U64(attr, data) do { \
2993 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2994 data, TCA_CAKE_STATS_PAD)) \
2995 goto nla_put_failure; \
2996 } while (0)
2997
2998 PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2999 PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
3000 PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
3001 PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
3002 PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
3003 PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
3004 PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
3005 PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
3006 PUT_STAT_U32(ACTIVE_QUEUES, q->active_queues);
3007
3008 #undef PUT_STAT_U32
3009 #undef PUT_STAT_U64
3010
3011 tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
3012 if (!tstats)
3013 goto nla_put_failure;
3014
3015 #define PUT_TSTAT_U32(attr, data) do { \
3016 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
3017 goto nla_put_failure; \
3018 } while (0)
3019 #define PUT_TSTAT_U64(attr, data) do { \
3020 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
3021 data, TCA_CAKE_TIN_STATS_PAD)) \
3022 goto nla_put_failure; \
3023 } while (0)
3024
3025 for (i = 0; i < q->tin_cnt; i++) {
3026 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3027
3028 ts = nla_nest_start_noflag(d->skb, i + 1);
3029 if (!ts)
3030 goto nla_put_failure;
3031
3032 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
3033 PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
3034 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
3035
3036 PUT_TSTAT_U32(TARGET_US,
3037 ktime_to_us(ns_to_ktime(b->cparams.target)));
3038 PUT_TSTAT_U32(INTERVAL_US,
3039 ktime_to_us(ns_to_ktime(b->cparams.interval)));
3040
3041 PUT_TSTAT_U32(SENT_PACKETS, b->packets);
3042 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
3043 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
3044 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
3045
3046 PUT_TSTAT_U32(PEAK_DELAY_US,
3047 ktime_to_us(ns_to_ktime(b->peak_delay)));
3048 PUT_TSTAT_U32(AVG_DELAY_US,
3049 ktime_to_us(ns_to_ktime(b->avge_delay)));
3050 PUT_TSTAT_U32(BASE_DELAY_US,
3051 ktime_to_us(ns_to_ktime(b->base_delay)));
3052
3053 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
3054 PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
3055 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
3056
3057 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
3058 b->decaying_flow_count);
3059 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
3060 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
3061 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
3062
3063 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
3064 nla_nest_end(d->skb, ts);
3065 }
3066
3067 #undef PUT_TSTAT_U32
3068 #undef PUT_TSTAT_U64
3069
3070 nla_nest_end(d->skb, tstats);
3071 return nla_nest_end(d->skb, stats);
3072
3073 nla_put_failure:
3074 nla_nest_cancel(d->skb, stats);
3075 return -1;
3076 }
3077
3078 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
3079 {
3080 return NULL;
3081 }
3082
3083 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
3084 {
3085 return 0;
3086 }
3087
3088 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
3089 u32 classid)
3090 {
3091 return 0;
3092 }
3093
3094 static void cake_unbind(struct Qdisc *q, unsigned long cl)
3095 {
3096 }
3097
3098 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
3099 struct netlink_ext_ack *extack)
3100 {
3101 struct cake_sched_data *q = qdisc_priv(sch);
3102
3103 if (cl)
3104 return NULL;
3105 return q->block;
3106 }
3107
3108 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
3109 struct sk_buff *skb, struct tcmsg *tcm)
3110 {
3111 tcm->tcm_handle |= TC_H_MIN(cl);
3112 return 0;
3113 }
3114
3115 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
3116 struct gnet_dump *d)
3117 {
3118 struct cake_sched_data *q = qdisc_priv(sch);
3119 const struct cake_flow *flow = NULL;
3120 struct gnet_stats_queue qs = { 0 };
3121 struct nlattr *stats;
3122 u32 idx = cl - 1;
3123
3124 if (idx < CAKE_QUEUES * q->tin_cnt) {
3125 const struct cake_tin_data *b = &q->tins[q->tin_order[idx / CAKE_QUEUES]];
3127 const struct sk_buff *skb;
3128
3129 flow = &b->flows[idx % CAKE_QUEUES];
3130
3131 if (flow->head) {
3132 sch_tree_lock(sch);
3133 skb = flow->head;
3134 while (skb) {
3135 qs.qlen++;
3136 skb = skb->next;
3137 }
3138 sch_tree_unlock(sch);
3139 }
3140 qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3141 qs.drops = flow->dropped;
3142 }
3143 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3144 return -1;
3145 if (flow) {
3146 ktime_t now = ktime_get();
3147
3148 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3149 if (!stats)
3150 return -1;
3151
3152 #define PUT_STAT_U32(attr, data) do { \
3153 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3154 goto nla_put_failure; \
3155 } while (0)
3156 #define PUT_STAT_S32(attr, data) do { \
3157 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3158 goto nla_put_failure; \
3159 } while (0)
3160
3161 PUT_STAT_S32(DEFICIT, flow->deficit);
3162 PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3163 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3164 PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3165 if (flow->cvars.p_drop) {
3166 PUT_STAT_S32(BLUE_TIMER_US,
3167 ktime_to_us(
3168 ktime_sub(now,
3169 flow->cvars.blue_timer)));
3170 }
3171 if (flow->cvars.dropping) {
3172 PUT_STAT_S32(DROP_NEXT_US,
3173 ktime_to_us(
3174 ktime_sub(now,
3175 flow->cvars.drop_next)));
3176 }
3177
3178 if (nla_nest_end(d->skb, stats) < 0)
3179 return -1;
3180 }
3181
3182 return 0;
3183
3184 nla_put_failure:
3185 nla_nest_cancel(d->skb, stats);
3186 return -1;
3187 }
3188
3189 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3190 {
3191 struct cake_sched_data *q = qdisc_priv(sch);
3192 unsigned int i, j;
3193
3194 if (arg->stop)
3195 return;
3196
3197 for (i = 0; i < q->tin_cnt; i++) {
3198 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3199
3200 for (j = 0; j < CAKE_QUEUES; j++) {
3201 if (list_empty(&b->flows[j].flowchain)) {
3202 arg->count++;
3203 continue;
3204 }
3205 if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3206 arg))
3207 break;
3208 }
3209 }
3210 }
3211
3212 static const struct Qdisc_class_ops cake_class_ops = {
3213 .leaf = cake_leaf,
3214 .find = cake_find,
3215 .tcf_block = cake_tcf_block,
3216 .bind_tcf = cake_bind,
3217 .unbind_tcf = cake_unbind,
3218 .dump = cake_dump_class,
3219 .dump_stats = cake_dump_class_stats,
3220 .walk = cake_walk,
3221 };
3222
3223 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3224 .cl_ops = &cake_class_ops,
3225 .id = "cake",
3226 .priv_size = sizeof(struct cake_sched_data),
3227 .enqueue = cake_enqueue,
3228 .dequeue = cake_dequeue,
3229 .peek = qdisc_peek_dequeued,
3230 .init = cake_init,
3231 .reset = cake_reset,
3232 .destroy = cake_destroy,
3233 .change = cake_change,
3234 .dump = cake_dump,
3235 .dump_stats = cake_dump_stats,
3236 .owner = THIS_MODULE,
3237 };
3238 MODULE_ALIAS_NET_SCH("cake");
3239
3240 struct cake_mq_sched {
3241 struct mq_sched mq_priv; /* must be first */
3242 struct cake_sched_config cake_config;
3243 };
3244
3245 static void cake_mq_destroy(struct Qdisc *sch)
3246 {
3247 mq_destroy_common(sch);
3248 }
3249
3250 static int cake_mq_init(struct Qdisc *sch, struct nlattr *opt,
3251 struct netlink_ext_ack *extack)
3252 {
3253 struct cake_mq_sched *priv = qdisc_priv(sch);
3254 struct net_device *dev = qdisc_dev(sch);
3255 int ret, ntx;
3256 bool _unused;
3257
3258 cake_config_init(&priv->cake_config, true);
3259 if (opt) {
3260 ret = cake_config_change(&priv->cake_config, opt, extack, &_unused);
3261 if (ret)
3262 return ret;
3263 }
3264
3265 ret = mq_init_common(sch, opt, extack, &cake_qdisc_ops);
3266 if (ret)
3267 return ret;
3268
3269 for (ntx = 0; ntx < dev->num_tx_queues; ntx++)
3270 cake_config_replace(priv->mq_priv.qdiscs[ntx], &priv->cake_config);
3271
3272 return 0;
3273 }
3274
3275 static int cake_mq_dump(struct Qdisc *sch, struct sk_buff *skb)
3276 {
3277 struct cake_mq_sched *priv = qdisc_priv(sch);
3278
3279 mq_dump_common(sch, skb);
3280 return cake_config_dump(&priv->cake_config, skb);
3281 }
3282
3283 static int cake_mq_change(struct Qdisc *sch, struct nlattr *opt,
3284 struct netlink_ext_ack *extack)
3285 {
3286 struct cake_mq_sched *priv = qdisc_priv(sch);
3287 struct net_device *dev = qdisc_dev(sch);
3288 bool overhead_changed = false;
3289 unsigned int ntx;
3290 int ret;
3291
3292 ret = cake_config_change(&priv->cake_config, opt, extack, &overhead_changed);
3293 if (ret)
3294 return ret;
3295
3296 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
3297 struct Qdisc *chld = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
3298 struct cake_sched_data *qd = qdisc_priv(chld);
3299
3300 if (overhead_changed) {
3301 qd->max_netlen = 0;
3302 qd->max_adjlen = 0;
3303 qd->min_netlen = ~0;
3304 qd->min_adjlen = ~0;
3305 }
3306
3307 if (qd->tins) {
3308 sch_tree_lock(chld);
3309 cake_reconfigure(chld);
3310 sch_tree_unlock(chld);
3311 }
3312 }
3313
3314 return 0;
3315 }
3316
3317 static int cake_mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
3318 struct Qdisc **old, struct netlink_ext_ack *extack)
3319 {
3320 NL_SET_ERR_MSG(extack, "can't replace cake_mq sub-qdiscs");
3321 return -EOPNOTSUPP;
3322 }
3323
3324 static const struct Qdisc_class_ops cake_mq_class_ops = {
3325 .select_queue = mq_select_queue,
3326 .graft = cake_mq_graft,
3327 .leaf = mq_leaf,
3328 .find = mq_find,
3329 .walk = mq_walk,
3330 .dump = mq_dump_class,
3331 .dump_stats = mq_dump_class_stats,
3332 };
3333
3334 static struct Qdisc_ops cake_mq_qdisc_ops __read_mostly = {
3335 .cl_ops = &cake_mq_class_ops,
3336 .id = "cake_mq",
3337 .priv_size = sizeof(struct cake_mq_sched),
3338 .init = cake_mq_init,
3339 .destroy = cake_mq_destroy,
3340 .attach = mq_attach,
3341 .change = cake_mq_change,
3342 .change_real_num_tx = mq_change_real_num_tx,
3343 .dump = cake_mq_dump,
3344 .owner = THIS_MODULE,
3345 };
3346 MODULE_ALIAS_NET_SCH("cake_mq");
3347
3348 static int __init cake_module_init(void)
3349 {
3350 int ret;
3351
3352 ret = register_qdisc(&cake_qdisc_ops);
3353 if (ret)
3354 return ret;
3355
3356 ret = register_qdisc(&cake_mq_qdisc_ops);
3357 if (ret)
3358 unregister_qdisc(&cake_qdisc_ops);
3359
3360 return ret;
3361 }
3362
3363 static void __exit cake_module_exit(void)
3364 {
3365 unregister_qdisc(&cake_qdisc_ops);
3366 unregister_qdisc(&cake_mq_qdisc_ops);
3367 }
3368
3369 module_init(cake_module_init)
3370 module_exit(cake_module_exit)
3371 MODULE_AUTHOR("Jonathan Morton");
3372 MODULE_LICENSE("Dual BSD/GPL");
3373 MODULE_DESCRIPTION("The CAKE shaper.");
3374 MODULE_IMPORT_NS("NET_SCHED_INTERNAL");
3375