1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4 *
5 * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6 * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7 * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8 * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9 * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10 * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11 *
12 * The CAKE Principles:
13 * (or, how to have your cake and eat it too)
14 *
15 * This is a combination of several shaping, AQM and FQ techniques into one
16 * easy-to-use package:
17 *
18 * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19 * equipment and bloated MACs. This operates in deficit mode (as in sch_fq),
20 * eliminating the need for any sort of burst parameter (eg. token bucket
21 * depth). Burst support is limited to that necessary to overcome scheduling
22 * latency.
23 *
24 * - A Diffserv-aware priority queue, giving more priority to certain classes,
25 * up to a specified fraction of bandwidth. Above that bandwidth threshold,
26 * the priority is reduced to avoid starving other tins.
27 *
28 * - Each priority tin has a separate Flow Queue system, to isolate traffic
29 * flows from each other. This prevents a burst on one flow from increasing
30 * the delay to another. Flows are distributed to queues using a
31 * set-associative hash function.
32 *
33 * - Each queue is actively managed by Cobalt, which is a combination of the
34 * Codel and Blue AQM algorithms. This serves flows fairly, and signals
35 * congestion early via ECN (if available) and/or packet drops, to keep
36 * latency low. The codel parameters are auto-tuned based on the bandwidth
37 * setting, as is necessary at low bandwidths.
38 *
39 * The configuration parameters are kept deliberately simple for ease of use.
40 * Everything has sane defaults. Complete generality of configuration is *not*
41 * a goal.
42 *
43 * The priority queue operates according to a weighted DRR scheme, combined with
44 * a bandwidth tracker which reuses the shaper logic to detect which side of the
45 * bandwidth sharing threshold the tin is operating. This determines whether a
46 * priority-based weight (high) or a bandwidth-based weight (low) is used for
47 * that tin in the current pass.
48 *
49 * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50 * granted us permission to leverage.
51 */
52
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/sch_priv.h>
71 #include <net/pkt_cls.h>
72 #include <net/tcp.h>
73 #include <net/flow_dissector.h>
74
75 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
76 #include <net/netfilter/nf_conntrack_core.h>
77 #endif
78
79 #define CAKE_SET_WAYS (8)
80 #define CAKE_MAX_TINS (8)
81 #define CAKE_QUEUES (1024)
82 #define CAKE_FLOW_MASK 63
83 #define CAKE_FLOW_NAT_FLAG 64
84
85 /* struct cobalt_params - contains codel and blue parameters
86 * @interval: codel initial drop rate
87 * @target: maximum persistent sojourn time & blue update rate
88 * @mtu_time: serialisation delay of maximum-size packet
89 * @p_inc: increment of blue drop probability (0.32 fxp)
90 * @p_dec: decrement of blue drop probability (0.32 fxp)
91 */
92 struct cobalt_params {
93 u64 interval;
94 u64 target;
95 u64 mtu_time;
96 u32 p_inc;
97 u32 p_dec;
98 };
99
100 /* struct cobalt_vars - contains codel and blue variables
101 * @count: codel dropping frequency
102 * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1
103 * @drop_next: time to drop next packet, or when we dropped last
104 * @blue_timer: Blue time to next drop
105 * @p_drop: BLUE drop probability (0.32 fxp)
106 * @dropping: set if in dropping state
107 * @ecn_marked: set if marked
108 */
109 struct cobalt_vars {
110 u32 count;
111 u32 rec_inv_sqrt;
112 ktime_t drop_next;
113 ktime_t blue_timer;
114 u32 p_drop;
115 bool dropping;
116 bool ecn_marked;
117 };
118
119 enum {
120 CAKE_SET_NONE = 0,
121 CAKE_SET_SPARSE,
122 CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
123 CAKE_SET_BULK,
124 CAKE_SET_DECAYING
125 };
126
127 struct cake_flow {
128 /* this stuff is all needed per-flow at dequeue time */
129 struct sk_buff *head;
130 struct sk_buff *tail;
131 struct list_head flowchain;
132 s32 deficit;
133 u32 dropped;
134 struct cobalt_vars cvars;
135 u16 srchost; /* index into cake_host table */
136 u16 dsthost;
137 u8 set;
138 }; /* please try to keep this structure <= 64 bytes */
139
140 struct cake_host {
141 u32 srchost_tag;
142 u32 dsthost_tag;
143 u16 srchost_bulk_flow_count;
144 u16 dsthost_bulk_flow_count;
145 };
146
147 struct cake_heap_entry {
148 u16 t:3, b:10;
149 };
150
151 struct cake_tin_data {
152 struct cake_flow flows[CAKE_QUEUES];
153 u32 backlogs[CAKE_QUEUES];
154 u32 tags[CAKE_QUEUES]; /* for set association */
155 u16 overflow_idx[CAKE_QUEUES];
156 struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
157 u16 flow_quantum;
158
159 struct cobalt_params cparams;
160 u32 drop_overlimit;
161 u16 bulk_flow_count;
162 u16 sparse_flow_count;
163 u16 decaying_flow_count;
164 u16 unresponsive_flow_count;
165
166 u32 max_skblen;
167
168 struct list_head new_flows;
169 struct list_head old_flows;
170 struct list_head decaying_flows;
171
172 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
173 ktime_t time_next_packet;
174 u64 tin_rate_ns;
175 u64 tin_rate_bps;
176 u16 tin_rate_shft;
177
178 u16 tin_quantum;
179 s32 tin_deficit;
180 u32 tin_backlog;
181 u32 tin_dropped;
182 u32 tin_ecn_mark;
183
184 u32 packets;
185 u64 bytes;
186
187 u32 ack_drops;
188
189 /* moving averages */
190 u64 avge_delay;
191 u64 peak_delay;
192 u64 base_delay;
193
194 /* hash function stats */
195 u32 way_directs;
196 u32 way_hits;
197 u32 way_misses;
198 u32 way_collisions;
199 }; /* number of tins is small, so size of this struct doesn't matter much */
200
201 struct cake_sched_config {
202 u64 rate_bps;
203 u64 interval;
204 u64 target;
205 u64 sync_time;
206 u32 buffer_config_limit;
207 u32 fwmark_mask;
208 u16 fwmark_shft;
209 s16 rate_overhead;
210 u16 rate_mpu;
211 u16 rate_flags;
212 u8 tin_mode;
213 u8 flow_mode;
214 u8 atm_mode;
215 u8 ack_filter;
216 u8 is_shared;
217 };
218
219 struct cake_sched_data {
220 struct tcf_proto __rcu *filter_list; /* optional external classifier */
221 struct tcf_block *block;
222 struct cake_tin_data *tins;
223 struct cake_sched_config *config;
224 struct cake_sched_config initial_config;
225
226 struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
227
228 /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
229 ktime_t time_next_packet;
230 ktime_t failsafe_next_packet;
231 u64 rate_ns;
232 u16 rate_shft;
233 u16 overflow_timeout;
234 u16 tin_cnt;
235
236 /* resource tracking */
237 u32 buffer_used;
238 u32 buffer_max_used;
239 u32 buffer_limit;
240
241 /* indices for dequeue */
242 u16 cur_tin;
243 u16 cur_flow;
244
245 struct qdisc_watchdog watchdog;
246 const u8 *tin_index;
247 const u8 *tin_order;
248
249 /* bandwidth capacity estimate */
250 ktime_t last_packet_time;
251 ktime_t avg_window_begin;
252 u64 avg_packet_interval;
253 u64 avg_window_bytes;
254 u64 avg_peak_bandwidth;
255 ktime_t last_reconfig_time;
256
257 /* packet length stats */
258 u32 avg_netoff;
259 u16 max_netlen;
260 u16 max_adjlen;
261 u16 min_netlen;
262 u16 min_adjlen;
263
264 /* mq sync state */
265 u64 last_checked_active;
266 u64 last_active;
267 u32 active_queues;
268 };
269
270 enum {
271 CAKE_FLAG_OVERHEAD = BIT(0),
272 CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
273 CAKE_FLAG_INGRESS = BIT(2),
274 CAKE_FLAG_WASH = BIT(3),
275 CAKE_FLAG_SPLIT_GSO = BIT(4)
276 };
277
278 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
279 * obtain the best features of each. Codel is excellent on flows which
280 * respond to congestion signals in a TCP-like way. BLUE is more effective on
281 * unresponsive flows.
282 */
283
284 struct cobalt_skb_cb {
285 ktime_t enqueue_time;
286 u32 adjusted_len;
287 };
288
us_to_ns(u64 us)289 static u64 us_to_ns(u64 us)
290 {
291 return us * NSEC_PER_USEC;
292 }
293
get_cobalt_cb(const struct sk_buff * skb)294 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
295 {
296 qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
297 return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
298 }
299
cobalt_get_enqueue_time(const struct sk_buff * skb)300 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
301 {
302 return get_cobalt_cb(skb)->enqueue_time;
303 }
304
cobalt_set_enqueue_time(struct sk_buff * skb,ktime_t now)305 static void cobalt_set_enqueue_time(struct sk_buff *skb,
306 ktime_t now)
307 {
308 get_cobalt_cb(skb)->enqueue_time = now;
309 }
310
311 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
312
313 /* Diffserv lookup tables */
314
315 static const u8 precedence[] = {
316 0, 0, 0, 0, 0, 0, 0, 0,
317 1, 1, 1, 1, 1, 1, 1, 1,
318 2, 2, 2, 2, 2, 2, 2, 2,
319 3, 3, 3, 3, 3, 3, 3, 3,
320 4, 4, 4, 4, 4, 4, 4, 4,
321 5, 5, 5, 5, 5, 5, 5, 5,
322 6, 6, 6, 6, 6, 6, 6, 6,
323 7, 7, 7, 7, 7, 7, 7, 7,
324 };
325
326 static const u8 diffserv8[] = {
327 2, 0, 1, 2, 4, 2, 2, 2,
328 1, 2, 1, 2, 1, 2, 1, 2,
329 5, 2, 4, 2, 4, 2, 4, 2,
330 3, 2, 3, 2, 3, 2, 3, 2,
331 6, 2, 3, 2, 3, 2, 3, 2,
332 6, 2, 2, 2, 6, 2, 6, 2,
333 7, 2, 2, 2, 2, 2, 2, 2,
334 7, 2, 2, 2, 2, 2, 2, 2,
335 };
336
337 static const u8 diffserv4[] = {
338 0, 1, 0, 0, 2, 0, 0, 0,
339 1, 0, 0, 0, 0, 0, 0, 0,
340 2, 0, 2, 0, 2, 0, 2, 0,
341 2, 0, 2, 0, 2, 0, 2, 0,
342 3, 0, 2, 0, 2, 0, 2, 0,
343 3, 0, 0, 0, 3, 0, 3, 0,
344 3, 0, 0, 0, 0, 0, 0, 0,
345 3, 0, 0, 0, 0, 0, 0, 0,
346 };
347
348 static const u8 diffserv3[] = {
349 0, 1, 0, 0, 2, 0, 0, 0,
350 1, 0, 0, 0, 0, 0, 0, 0,
351 0, 0, 0, 0, 0, 0, 0, 0,
352 0, 0, 0, 0, 0, 0, 0, 0,
353 0, 0, 0, 0, 0, 0, 0, 0,
354 0, 0, 0, 0, 2, 0, 2, 0,
355 2, 0, 0, 0, 0, 0, 0, 0,
356 2, 0, 0, 0, 0, 0, 0, 0,
357 };
358
359 static const u8 besteffort[] = {
360 0, 0, 0, 0, 0, 0, 0, 0,
361 0, 0, 0, 0, 0, 0, 0, 0,
362 0, 0, 0, 0, 0, 0, 0, 0,
363 0, 0, 0, 0, 0, 0, 0, 0,
364 0, 0, 0, 0, 0, 0, 0, 0,
365 0, 0, 0, 0, 0, 0, 0, 0,
366 0, 0, 0, 0, 0, 0, 0, 0,
367 0, 0, 0, 0, 0, 0, 0, 0,
368 };
369
370 /* tin priority order for stats dumping */
371
372 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
373 static const u8 bulk_order[] = {1, 0, 2, 3};
374
375 /* There is a big difference in timing between the accurate values placed in the
376 * cache and the approximations given by a single Newton step for small count
377 * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
378 * these values are calculated using eight Newton steps, using the
379 * implementation below. Above 16, a single Newton step gives sufficient
380 * accuracy in either direction, given the precision stored.
381 *
382 * The magnitude of the error when stepping up to count 2 is such as to give the
383 * value that *should* have been produced at count 4.
384 */
385
386 #define REC_INV_SQRT_CACHE (16)
387 static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
388 ~0, ~0, 3037000500, 2479700525,
389 2147483647, 1920767767, 1753413056, 1623345051,
390 1518500250, 1431655765, 1358187914, 1294981364,
391 1239850263, 1191209601, 1147878294, 1108955788
392 };
393
394 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
395 u64 target_ns, u64 rtt_est_ns);
396 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
397 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
398 *
399 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
400 */
401
cobalt_newton_step(struct cobalt_vars * vars)402 static void cobalt_newton_step(struct cobalt_vars *vars)
403 {
404 u32 invsqrt, invsqrt2;
405 u64 val;
406
407 invsqrt = vars->rec_inv_sqrt;
408 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
409 val = (3LL << 32) - ((u64)vars->count * invsqrt2);
410
411 val >>= 2; /* avoid overflow in following multiply */
412 val = (val * invsqrt) >> (32 - 2 + 1);
413
414 vars->rec_inv_sqrt = val;
415 }
416
cobalt_invsqrt(struct cobalt_vars * vars)417 static void cobalt_invsqrt(struct cobalt_vars *vars)
418 {
419 if (vars->count < REC_INV_SQRT_CACHE)
420 vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
421 else
422 cobalt_newton_step(vars);
423 }
424
cobalt_vars_init(struct cobalt_vars * vars)425 static void cobalt_vars_init(struct cobalt_vars *vars)
426 {
427 memset(vars, 0, sizeof(*vars));
428 }
429
430 /* CoDel control_law is t + interval/sqrt(count)
431 * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
432 * both sqrt() and divide operation.
433 */
cobalt_control(ktime_t t,u64 interval,u32 rec_inv_sqrt)434 static ktime_t cobalt_control(ktime_t t,
435 u64 interval,
436 u32 rec_inv_sqrt)
437 {
438 return ktime_add_ns(t, reciprocal_scale(interval,
439 rec_inv_sqrt));
440 }
441
442 /* Call this when a packet had to be dropped due to queue overflow. Returns
443 * true if the BLUE state was quiescent before but active after this call.
444 */
cobalt_queue_full(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)445 static bool cobalt_queue_full(struct cobalt_vars *vars,
446 struct cobalt_params *p,
447 ktime_t now)
448 {
449 bool up = false;
450
451 if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
452 up = !vars->p_drop;
453 vars->p_drop += p->p_inc;
454 if (vars->p_drop < p->p_inc)
455 vars->p_drop = ~0;
456 vars->blue_timer = now;
457 }
458 vars->dropping = true;
459 vars->drop_next = now;
460 if (!vars->count)
461 vars->count = 1;
462
463 return up;
464 }
465
466 /* Call this when the queue was serviced but turned out to be empty. Returns
467 * true if the BLUE state was active before but quiescent after this call.
468 */
cobalt_queue_empty(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)469 static bool cobalt_queue_empty(struct cobalt_vars *vars,
470 struct cobalt_params *p,
471 ktime_t now)
472 {
473 bool down = false;
474
475 if (vars->p_drop &&
476 ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
477 if (vars->p_drop < p->p_dec)
478 vars->p_drop = 0;
479 else
480 vars->p_drop -= p->p_dec;
481 vars->blue_timer = now;
482 down = !vars->p_drop;
483 }
484 vars->dropping = false;
485
486 if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
487 vars->count--;
488 cobalt_invsqrt(vars);
489 vars->drop_next = cobalt_control(vars->drop_next,
490 p->interval,
491 vars->rec_inv_sqrt);
492 }
493
494 return down;
495 }
496
497 /* Call this with a freshly dequeued packet for possible congestion marking.
498 * Returns true as an instruction to drop the packet, false for delivery.
499 */
cobalt_should_drop(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now,struct sk_buff * skb,u32 bulk_flows)500 static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
501 struct cobalt_params *p,
502 ktime_t now,
503 struct sk_buff *skb,
504 u32 bulk_flows)
505 {
506 enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
507 bool next_due, over_target;
508 ktime_t schedule;
509 u64 sojourn;
510
511 /* The 'schedule' variable records, in its sign, whether 'now' is before or
512 * after 'drop_next'. This allows 'drop_next' to be updated before the next
513 * scheduling decision is actually branched, without destroying that
514 * information. Similarly, the first 'schedule' value calculated is preserved
515 * in the boolean 'next_due'.
516 *
517 * As for 'drop_next', we take advantage of the fact that 'interval' is both
518 * the delay between first exceeding 'target' and the first signalling event,
519 * *and* the scaling factor for the signalling frequency. It's therefore very
520 * natural to use a single mechanism for both purposes, and eliminates a
521 * significant amount of reference Codel's spaghetti code. To help with this,
522 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
523 * as possible to 1.0 in fixed-point.
524 */
525
526 sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
527 schedule = ktime_sub(now, vars->drop_next);
528 over_target = sojourn > p->target &&
529 sojourn > p->mtu_time * bulk_flows * 2 &&
530 sojourn > p->mtu_time * 4;
531 next_due = vars->count && ktime_to_ns(schedule) >= 0;
532
533 vars->ecn_marked = false;
534
535 if (over_target) {
536 if (!vars->dropping) {
537 vars->dropping = true;
538 vars->drop_next = cobalt_control(now,
539 p->interval,
540 vars->rec_inv_sqrt);
541 }
542 if (!vars->count)
543 vars->count = 1;
544 } else if (vars->dropping) {
545 vars->dropping = false;
546 }
547
548 if (next_due && vars->dropping) {
549 /* Use ECN mark if possible, otherwise drop */
550 if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
551 reason = SKB_DROP_REASON_QDISC_CONGESTED;
552
553 vars->count++;
554 if (!vars->count)
555 vars->count--;
556 cobalt_invsqrt(vars);
557 vars->drop_next = cobalt_control(vars->drop_next,
558 p->interval,
559 vars->rec_inv_sqrt);
560 schedule = ktime_sub(now, vars->drop_next);
561 } else {
562 while (next_due) {
563 vars->count--;
564 cobalt_invsqrt(vars);
565 vars->drop_next = cobalt_control(vars->drop_next,
566 p->interval,
567 vars->rec_inv_sqrt);
568 schedule = ktime_sub(now, vars->drop_next);
569 next_due = vars->count && ktime_to_ns(schedule) >= 0;
570 }
571 }
572
573 /* Simple BLUE implementation. Lack of ECN is deliberate. */
574 if (vars->p_drop && reason == SKB_NOT_DROPPED_YET &&
575 get_random_u32() < vars->p_drop)
576 reason = SKB_DROP_REASON_CAKE_FLOOD;
577
578 /* Overload the drop_next field as an activity timeout */
579 if (!vars->count)
580 vars->drop_next = ktime_add_ns(now, p->interval);
581 else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET)
582 vars->drop_next = now;
583
584 return reason;
585 }
586
cake_update_flowkeys(struct flow_keys * keys,const struct sk_buff * skb)587 static bool cake_update_flowkeys(struct flow_keys *keys,
588 const struct sk_buff *skb)
589 {
590 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
591 struct nf_conntrack_tuple tuple = {};
592 bool rev = !skb->_nfct, upd = false;
593 __be32 ip;
594
595 if (skb_protocol(skb, true) != htons(ETH_P_IP))
596 return false;
597
598 if (!nf_ct_get_tuple_skb(&tuple, skb))
599 return false;
600
601 ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
602 if (ip != keys->addrs.v4addrs.src) {
603 keys->addrs.v4addrs.src = ip;
604 upd = true;
605 }
606 ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
607 if (ip != keys->addrs.v4addrs.dst) {
608 keys->addrs.v4addrs.dst = ip;
609 upd = true;
610 }
611
612 if (keys->ports.ports) {
613 __be16 port;
614
615 port = rev ? tuple.dst.u.all : tuple.src.u.all;
616 if (port != keys->ports.src) {
617 keys->ports.src = port;
618 upd = true;
619 }
620 port = rev ? tuple.src.u.all : tuple.dst.u.all;
621 if (port != keys->ports.dst) {
622 port = keys->ports.dst;
623 upd = true;
624 }
625 }
626 return upd;
627 #else
628 return false;
629 #endif
630 }
631
632 /* Cake has several subtle multiple bit settings. In these cases you
633 * would be matching triple isolate mode as well.
634 */
635
cake_dsrc(int flow_mode)636 static bool cake_dsrc(int flow_mode)
637 {
638 return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
639 }
640
cake_ddst(int flow_mode)641 static bool cake_ddst(int flow_mode)
642 {
643 return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
644 }
645
cake_dec_srchost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)646 static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
647 struct cake_flow *flow,
648 int flow_mode)
649 {
650 if (likely(cake_dsrc(flow_mode) &&
651 q->hosts[flow->srchost].srchost_bulk_flow_count))
652 q->hosts[flow->srchost].srchost_bulk_flow_count--;
653 }
654
cake_inc_srchost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)655 static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
656 struct cake_flow *flow,
657 int flow_mode)
658 {
659 if (likely(cake_dsrc(flow_mode) &&
660 q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
661 q->hosts[flow->srchost].srchost_bulk_flow_count++;
662 }
663
cake_dec_dsthost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)664 static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
665 struct cake_flow *flow,
666 int flow_mode)
667 {
668 if (likely(cake_ddst(flow_mode) &&
669 q->hosts[flow->dsthost].dsthost_bulk_flow_count))
670 q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
671 }
672
cake_inc_dsthost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)673 static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
674 struct cake_flow *flow,
675 int flow_mode)
676 {
677 if (likely(cake_ddst(flow_mode) &&
678 q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
679 q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
680 }
681
cake_get_flow_quantum(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)682 static u16 cake_get_flow_quantum(struct cake_tin_data *q,
683 struct cake_flow *flow,
684 int flow_mode)
685 {
686 u16 host_load = 1;
687
688 if (cake_dsrc(flow_mode))
689 host_load = max(host_load,
690 q->hosts[flow->srchost].srchost_bulk_flow_count);
691
692 if (cake_ddst(flow_mode))
693 host_load = max(host_load,
694 q->hosts[flow->dsthost].dsthost_bulk_flow_count);
695
696 /* The get_random_u16() is a way to apply dithering to avoid
697 * accumulating roundoff errors
698 */
699 return (q->flow_quantum * quantum_div[host_load] +
700 get_random_u16()) >> 16;
701 }
702
cake_hash(struct cake_tin_data * q,const struct sk_buff * skb,int flow_mode,u16 flow_override,u16 host_override)703 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
704 int flow_mode, u16 flow_override, u16 host_override)
705 {
706 bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
707 bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
708 bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
709 u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
710 u16 reduced_hash, srchost_idx, dsthost_idx;
711 struct flow_keys keys, host_keys;
712 bool use_skbhash = skb->l4_hash;
713
714 if (unlikely(flow_mode == CAKE_FLOW_NONE))
715 return 0;
716
717 /* If both overrides are set, or we can use the SKB hash and nat mode is
718 * disabled, we can skip packet dissection entirely. If nat mode is
719 * enabled there's another check below after doing the conntrack lookup.
720 */
721 if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
722 goto skip_hash;
723
724 skb_flow_dissect_flow_keys(skb, &keys,
725 FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
726
727 /* Don't use the SKB hash if we change the lookup keys from conntrack */
728 if (nat_enabled && cake_update_flowkeys(&keys, skb))
729 use_skbhash = false;
730
731 /* If we can still use the SKB hash and don't need the host hash, we can
732 * skip the rest of the hashing procedure
733 */
734 if (use_skbhash && !hash_hosts)
735 goto skip_hash;
736
737 /* flow_hash_from_keys() sorts the addresses by value, so we have
738 * to preserve their order in a separate data structure to treat
739 * src and dst host addresses as independently selectable.
740 */
741 host_keys = keys;
742 host_keys.ports.ports = 0;
743 host_keys.basic.ip_proto = 0;
744 host_keys.keyid.keyid = 0;
745 host_keys.tags.flow_label = 0;
746
747 switch (host_keys.control.addr_type) {
748 case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
749 host_keys.addrs.v4addrs.src = 0;
750 dsthost_hash = flow_hash_from_keys(&host_keys);
751 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
752 host_keys.addrs.v4addrs.dst = 0;
753 srchost_hash = flow_hash_from_keys(&host_keys);
754 break;
755
756 case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
757 memset(&host_keys.addrs.v6addrs.src, 0,
758 sizeof(host_keys.addrs.v6addrs.src));
759 dsthost_hash = flow_hash_from_keys(&host_keys);
760 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
761 memset(&host_keys.addrs.v6addrs.dst, 0,
762 sizeof(host_keys.addrs.v6addrs.dst));
763 srchost_hash = flow_hash_from_keys(&host_keys);
764 break;
765
766 default:
767 dsthost_hash = 0;
768 srchost_hash = 0;
769 }
770
771 /* This *must* be after the above switch, since as a
772 * side-effect it sorts the src and dst addresses.
773 */
774 if (hash_flows && !use_skbhash)
775 flow_hash = flow_hash_from_keys(&keys);
776
777 skip_hash:
778 if (flow_override)
779 flow_hash = flow_override - 1;
780 else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
781 flow_hash = skb->hash;
782 if (host_override) {
783 dsthost_hash = host_override - 1;
784 srchost_hash = host_override - 1;
785 }
786
787 if (!(flow_mode & CAKE_FLOW_FLOWS)) {
788 if (flow_mode & CAKE_FLOW_SRC_IP)
789 flow_hash ^= srchost_hash;
790
791 if (flow_mode & CAKE_FLOW_DST_IP)
792 flow_hash ^= dsthost_hash;
793 }
794
795 reduced_hash = flow_hash % CAKE_QUEUES;
796
797 /* set-associative hashing */
798 /* fast path if no hash collision (direct lookup succeeds) */
799 if (likely(q->tags[reduced_hash] == flow_hash &&
800 q->flows[reduced_hash].set)) {
801 q->way_directs++;
802 } else {
803 u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
804 u32 outer_hash = reduced_hash - inner_hash;
805 bool allocate_src = false;
806 bool allocate_dst = false;
807 u32 i, k;
808
809 /* check if any active queue in the set is reserved for
810 * this flow.
811 */
812 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
813 i++, k = (k + 1) % CAKE_SET_WAYS) {
814 if (q->tags[outer_hash + k] == flow_hash) {
815 if (i)
816 q->way_hits++;
817
818 if (!q->flows[outer_hash + k].set) {
819 /* need to increment host refcnts */
820 allocate_src = cake_dsrc(flow_mode);
821 allocate_dst = cake_ddst(flow_mode);
822 }
823
824 goto found;
825 }
826 }
827
828 /* no queue is reserved for this flow, look for an
829 * empty one.
830 */
831 for (i = 0; i < CAKE_SET_WAYS;
832 i++, k = (k + 1) % CAKE_SET_WAYS) {
833 if (!q->flows[outer_hash + k].set) {
834 q->way_misses++;
835 allocate_src = cake_dsrc(flow_mode);
836 allocate_dst = cake_ddst(flow_mode);
837 goto found;
838 }
839 }
840
841 /* With no empty queues, default to the original
842 * queue, accept the collision, update the host tags.
843 */
844 q->way_collisions++;
845 allocate_src = cake_dsrc(flow_mode);
846 allocate_dst = cake_ddst(flow_mode);
847
848 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
849 cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
850 cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
851 }
852 found:
853 /* reserve queue for future packets in same flow */
854 reduced_hash = outer_hash + k;
855 q->tags[reduced_hash] = flow_hash;
856
857 if (allocate_src) {
858 srchost_idx = srchost_hash % CAKE_QUEUES;
859 inner_hash = srchost_idx % CAKE_SET_WAYS;
860 outer_hash = srchost_idx - inner_hash;
861 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
862 i++, k = (k + 1) % CAKE_SET_WAYS) {
863 if (q->hosts[outer_hash + k].srchost_tag ==
864 srchost_hash)
865 goto found_src;
866 }
867 for (i = 0; i < CAKE_SET_WAYS;
868 i++, k = (k + 1) % CAKE_SET_WAYS) {
869 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
870 break;
871 }
872 q->hosts[outer_hash + k].srchost_tag = srchost_hash;
873 found_src:
874 srchost_idx = outer_hash + k;
875 q->flows[reduced_hash].srchost = srchost_idx;
876
877 if (q->flows[reduced_hash].set == CAKE_SET_BULK)
878 cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
879 }
880
881 if (allocate_dst) {
882 dsthost_idx = dsthost_hash % CAKE_QUEUES;
883 inner_hash = dsthost_idx % CAKE_SET_WAYS;
884 outer_hash = dsthost_idx - inner_hash;
885 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
886 i++, k = (k + 1) % CAKE_SET_WAYS) {
887 if (q->hosts[outer_hash + k].dsthost_tag ==
888 dsthost_hash)
889 goto found_dst;
890 }
891 for (i = 0; i < CAKE_SET_WAYS;
892 i++, k = (k + 1) % CAKE_SET_WAYS) {
893 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
894 break;
895 }
896 q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
897 found_dst:
898 dsthost_idx = outer_hash + k;
899 q->flows[reduced_hash].dsthost = dsthost_idx;
900
901 if (q->flows[reduced_hash].set == CAKE_SET_BULK)
902 cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
903 }
904 }
905
906 return reduced_hash;
907 }
908
909 /* helper functions : might be changed when/if skb use a standard list_head */
910 /* remove one skb from head of slot queue */
911
dequeue_head(struct cake_flow * flow)912 static struct sk_buff *dequeue_head(struct cake_flow *flow)
913 {
914 struct sk_buff *skb = flow->head;
915
916 if (skb) {
917 flow->head = skb->next;
918 skb_mark_not_on_list(skb);
919 }
920
921 return skb;
922 }
923
924 /* add skb to flow queue (tail add) */
925
flow_queue_add(struct cake_flow * flow,struct sk_buff * skb)926 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
927 {
928 if (!flow->head)
929 flow->head = skb;
930 else
931 flow->tail->next = skb;
932 flow->tail = skb;
933 skb->next = NULL;
934 }
935
cake_get_iphdr(const struct sk_buff * skb,struct ipv6hdr * buf)936 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
937 struct ipv6hdr *buf)
938 {
939 unsigned int offset = skb_network_offset(skb);
940 struct iphdr *iph;
941
942 iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
943
944 if (!iph)
945 return NULL;
946
947 if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
948 return skb_header_pointer(skb, offset + iph->ihl * 4,
949 sizeof(struct ipv6hdr), buf);
950
951 else if (iph->version == 4)
952 return iph;
953
954 else if (iph->version == 6)
955 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
956 buf);
957
958 return NULL;
959 }
960
cake_get_tcphdr(const struct sk_buff * skb,void * buf,unsigned int bufsize)961 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
962 void *buf, unsigned int bufsize)
963 {
964 unsigned int offset = skb_network_offset(skb);
965 const struct ipv6hdr *ipv6h;
966 const struct tcphdr *tcph;
967 const struct iphdr *iph;
968 struct ipv6hdr _ipv6h;
969 struct tcphdr _tcph;
970
971 ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
972
973 if (!ipv6h)
974 return NULL;
975
976 if (ipv6h->version == 4) {
977 iph = (struct iphdr *)ipv6h;
978 offset += iph->ihl * 4;
979
980 /* special-case 6in4 tunnelling, as that is a common way to get
981 * v6 connectivity in the home
982 */
983 if (iph->protocol == IPPROTO_IPV6) {
984 ipv6h = skb_header_pointer(skb, offset,
985 sizeof(_ipv6h), &_ipv6h);
986
987 if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
988 return NULL;
989
990 offset += sizeof(struct ipv6hdr);
991
992 } else if (iph->protocol != IPPROTO_TCP) {
993 return NULL;
994 }
995
996 } else if (ipv6h->version == 6) {
997 if (ipv6h->nexthdr != IPPROTO_TCP)
998 return NULL;
999
1000 offset += sizeof(struct ipv6hdr);
1001 } else {
1002 return NULL;
1003 }
1004
1005 tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
1006 if (!tcph || tcph->doff < 5)
1007 return NULL;
1008
1009 return skb_header_pointer(skb, offset,
1010 min(__tcp_hdrlen(tcph), bufsize), buf);
1011 }
1012
cake_get_tcpopt(const struct tcphdr * tcph,int code,int * oplen)1013 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
1014 int code, int *oplen)
1015 {
1016 /* inspired by tcp_parse_options in tcp_input.c */
1017 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1018 const u8 *ptr = (const u8 *)(tcph + 1);
1019
1020 while (length > 0) {
1021 int opcode = *ptr++;
1022 int opsize;
1023
1024 if (opcode == TCPOPT_EOL)
1025 break;
1026 if (opcode == TCPOPT_NOP) {
1027 length--;
1028 continue;
1029 }
1030 if (length < 2)
1031 break;
1032 opsize = *ptr++;
1033 if (opsize < 2 || opsize > length)
1034 break;
1035
1036 if (opcode == code) {
1037 *oplen = opsize;
1038 return ptr;
1039 }
1040
1041 ptr += opsize - 2;
1042 length -= opsize;
1043 }
1044
1045 return NULL;
1046 }
1047
1048 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
1049 * bytes than the other. In the case where both sequences ACKs bytes that the
1050 * other doesn't, A is considered greater. DSACKs in A also makes A be
1051 * considered greater.
1052 *
1053 * @return -1, 0 or 1 as normal compare functions
1054 */
cake_tcph_sack_compare(const struct tcphdr * tcph_a,const struct tcphdr * tcph_b)1055 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
1056 const struct tcphdr *tcph_b)
1057 {
1058 const struct tcp_sack_block_wire *sack_a, *sack_b;
1059 u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1060 u32 bytes_a = 0, bytes_b = 0;
1061 int oplen_a, oplen_b;
1062 bool first = true;
1063
1064 sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1065 sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1066
1067 /* pointers point to option contents */
1068 oplen_a -= TCPOLEN_SACK_BASE;
1069 oplen_b -= TCPOLEN_SACK_BASE;
1070
1071 if (sack_a && oplen_a >= sizeof(*sack_a) &&
1072 (!sack_b || oplen_b < sizeof(*sack_b)))
1073 return -1;
1074 else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1075 (!sack_a || oplen_a < sizeof(*sack_a)))
1076 return 1;
1077 else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1078 (!sack_b || oplen_b < sizeof(*sack_b)))
1079 return 0;
1080
1081 while (oplen_a >= sizeof(*sack_a)) {
1082 const struct tcp_sack_block_wire *sack_tmp = sack_b;
1083 u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1084 u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1085 int oplen_tmp = oplen_b;
1086 bool found = false;
1087
1088 /* DSACK; always considered greater to prevent dropping */
1089 if (before(start_a, ack_seq_a))
1090 return -1;
1091
1092 bytes_a += end_a - start_a;
1093
1094 while (oplen_tmp >= sizeof(*sack_tmp)) {
1095 u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1096 u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1097
1098 /* first time through we count the total size */
1099 if (first)
1100 bytes_b += end_b - start_b;
1101
1102 if (!after(start_b, start_a) && !before(end_b, end_a)) {
1103 found = true;
1104 if (!first)
1105 break;
1106 }
1107 oplen_tmp -= sizeof(*sack_tmp);
1108 sack_tmp++;
1109 }
1110
1111 if (!found)
1112 return -1;
1113
1114 oplen_a -= sizeof(*sack_a);
1115 sack_a++;
1116 first = false;
1117 }
1118
1119 /* If we made it this far, all ranges SACKed by A are covered by B, so
1120 * either the SACKs are equal, or B SACKs more bytes.
1121 */
1122 return bytes_b > bytes_a ? 1 : 0;
1123 }
1124
cake_tcph_get_tstamp(const struct tcphdr * tcph,u32 * tsval,u32 * tsecr)1125 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1126 u32 *tsval, u32 *tsecr)
1127 {
1128 const u8 *ptr;
1129 int opsize;
1130
1131 ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1132
1133 if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1134 *tsval = get_unaligned_be32(ptr);
1135 *tsecr = get_unaligned_be32(ptr + 4);
1136 }
1137 }
1138
cake_tcph_may_drop(const struct tcphdr * tcph,u32 tstamp_new,u32 tsecr_new)1139 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1140 u32 tstamp_new, u32 tsecr_new)
1141 {
1142 /* inspired by tcp_parse_options in tcp_input.c */
1143 int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1144 const u8 *ptr = (const u8 *)(tcph + 1);
1145 u32 tstamp, tsecr;
1146
1147 /* 3 reserved flags must be unset to avoid future breakage
1148 * ACK must be set
1149 * ECE/CWR are handled separately
1150 * All other flags URG/PSH/RST/SYN/FIN must be unset
1151 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1152 * 0x00C00000 = CWR/ECE (handled separately)
1153 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1154 */
1155 if (((tcp_flag_word(tcph) &
1156 cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1157 return false;
1158
1159 while (length > 0) {
1160 int opcode = *ptr++;
1161 int opsize;
1162
1163 if (opcode == TCPOPT_EOL)
1164 break;
1165 if (opcode == TCPOPT_NOP) {
1166 length--;
1167 continue;
1168 }
1169 if (length < 2)
1170 break;
1171 opsize = *ptr++;
1172 if (opsize < 2 || opsize > length)
1173 break;
1174
1175 switch (opcode) {
1176 case TCPOPT_MD5SIG: /* doesn't influence state */
1177 break;
1178
1179 case TCPOPT_SACK: /* stricter checking performed later */
1180 if (opsize % 8 != 2)
1181 return false;
1182 break;
1183
1184 case TCPOPT_TIMESTAMP:
1185 /* only drop timestamps lower than new */
1186 if (opsize != TCPOLEN_TIMESTAMP)
1187 return false;
1188 tstamp = get_unaligned_be32(ptr);
1189 tsecr = get_unaligned_be32(ptr + 4);
1190 if (after(tstamp, tstamp_new) ||
1191 after(tsecr, tsecr_new))
1192 return false;
1193 break;
1194
1195 case TCPOPT_MSS: /* these should only be set on SYN */
1196 case TCPOPT_WINDOW:
1197 case TCPOPT_SACK_PERM:
1198 case TCPOPT_FASTOPEN:
1199 case TCPOPT_EXP:
1200 default: /* don't drop if any unknown options are present */
1201 return false;
1202 }
1203
1204 ptr += opsize - 2;
1205 length -= opsize;
1206 }
1207
1208 return true;
1209 }
1210
cake_ack_filter(struct cake_sched_data * q,struct cake_flow * flow)1211 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1212 struct cake_flow *flow)
1213 {
1214 bool aggressive = q->config->ack_filter == CAKE_ACK_AGGRESSIVE;
1215 struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1216 struct sk_buff *skb_check, *skb_prev = NULL;
1217 const struct ipv6hdr *ipv6h, *ipv6h_check;
1218 unsigned char _tcph[64], _tcph_check[64];
1219 const struct tcphdr *tcph, *tcph_check;
1220 const struct iphdr *iph, *iph_check;
1221 struct ipv6hdr _iph, _iph_check;
1222 const struct sk_buff *skb;
1223 int seglen, num_found = 0;
1224 u32 tstamp = 0, tsecr = 0;
1225 __be32 elig_flags = 0;
1226 int sack_comp;
1227
1228 /* no other possible ACKs to filter */
1229 if (flow->head == flow->tail)
1230 return NULL;
1231
1232 skb = flow->tail;
1233 tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1234 iph = cake_get_iphdr(skb, &_iph);
1235 if (!tcph)
1236 return NULL;
1237
1238 cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1239
1240 /* the 'triggering' packet need only have the ACK flag set.
1241 * also check that SYN is not set, as there won't be any previous ACKs.
1242 */
1243 if ((tcp_flag_word(tcph) &
1244 (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1245 return NULL;
1246
1247 /* the 'triggering' ACK is at the tail of the queue, we have already
1248 * returned if it is the only packet in the flow. loop through the rest
1249 * of the queue looking for pure ACKs with the same 5-tuple as the
1250 * triggering one.
1251 */
1252 for (skb_check = flow->head;
1253 skb_check && skb_check != skb;
1254 skb_prev = skb_check, skb_check = skb_check->next) {
1255 iph_check = cake_get_iphdr(skb_check, &_iph_check);
1256 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1257 sizeof(_tcph_check));
1258
1259 /* only TCP packets with matching 5-tuple are eligible, and only
1260 * drop safe headers
1261 */
1262 if (!tcph_check || iph->version != iph_check->version ||
1263 tcph_check->source != tcph->source ||
1264 tcph_check->dest != tcph->dest)
1265 continue;
1266
1267 if (iph_check->version == 4) {
1268 if (iph_check->saddr != iph->saddr ||
1269 iph_check->daddr != iph->daddr)
1270 continue;
1271
1272 seglen = iph_totlen(skb, iph_check) -
1273 (4 * iph_check->ihl);
1274 } else if (iph_check->version == 6) {
1275 ipv6h = (struct ipv6hdr *)iph;
1276 ipv6h_check = (struct ipv6hdr *)iph_check;
1277
1278 if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1279 ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1280 continue;
1281
1282 seglen = ipv6_payload_len(skb, ipv6h_check);
1283 } else {
1284 WARN_ON(1); /* shouldn't happen */
1285 continue;
1286 }
1287
1288 /* If the ECE/CWR flags changed from the previous eligible
1289 * packet in the same flow, we should no longer be dropping that
1290 * previous packet as this would lose information.
1291 */
1292 if (elig_ack && (tcp_flag_word(tcph_check) &
1293 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1294 elig_ack = NULL;
1295 elig_ack_prev = NULL;
1296 num_found--;
1297 }
1298
1299 /* Check TCP options and flags, don't drop ACKs with segment
1300 * data, and don't drop ACKs with a higher cumulative ACK
1301 * counter than the triggering packet. Check ACK seqno here to
1302 * avoid parsing SACK options of packets we are going to exclude
1303 * anyway.
1304 */
1305 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1306 (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1307 after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1308 continue;
1309
1310 /* Check SACK options. The triggering packet must SACK more data
1311 * than the ACK under consideration, or SACK the same range but
1312 * have a larger cumulative ACK counter. The latter is a
1313 * pathological case, but is contained in the following check
1314 * anyway, just to be safe.
1315 */
1316 sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1317
1318 if (sack_comp < 0 ||
1319 (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1320 sack_comp == 0))
1321 continue;
1322
1323 /* At this point we have found an eligible pure ACK to drop; if
1324 * we are in aggressive mode, we are done. Otherwise, keep
1325 * searching unless this is the second eligible ACK we
1326 * found.
1327 *
1328 * Since we want to drop ACK closest to the head of the queue,
1329 * save the first eligible ACK we find, even if we need to loop
1330 * again.
1331 */
1332 if (!elig_ack) {
1333 elig_ack = skb_check;
1334 elig_ack_prev = skb_prev;
1335 elig_flags = (tcp_flag_word(tcph_check)
1336 & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1337 }
1338
1339 if (num_found++ > 0)
1340 goto found;
1341 }
1342
1343 /* We made it through the queue without finding two eligible ACKs . If
1344 * we found a single eligible ACK we can drop it in aggressive mode if
1345 * we can guarantee that this does not interfere with ECN flag
1346 * information. We ensure this by dropping it only if the enqueued
1347 * packet is consecutive with the eligible ACK, and their flags match.
1348 */
1349 if (elig_ack && aggressive && elig_ack->next == skb &&
1350 (elig_flags == (tcp_flag_word(tcph) &
1351 (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1352 goto found;
1353
1354 return NULL;
1355
1356 found:
1357 if (elig_ack_prev)
1358 elig_ack_prev->next = elig_ack->next;
1359 else
1360 flow->head = elig_ack->next;
1361
1362 skb_mark_not_on_list(elig_ack);
1363
1364 return elig_ack;
1365 }
1366
cake_ewma(u64 avg,u64 sample,u32 shift)1367 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1368 {
1369 avg -= avg >> shift;
1370 avg += sample >> shift;
1371 return avg;
1372 }
1373
cake_calc_overhead(struct cake_sched_data * qd,u32 len,u32 off)1374 static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off)
1375 {
1376 struct cake_sched_config *q = qd->config;
1377
1378 if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1379 len -= off;
1380
1381 if (qd->max_netlen < len)
1382 qd->max_netlen = len;
1383 if (qd->min_netlen > len)
1384 qd->min_netlen = len;
1385
1386 len += q->rate_overhead;
1387
1388 if (len < q->rate_mpu)
1389 len = q->rate_mpu;
1390
1391 if (q->atm_mode == CAKE_ATM_ATM) {
1392 len += 47;
1393 len /= 48;
1394 len *= 53;
1395 } else if (q->atm_mode == CAKE_ATM_PTM) {
1396 /* Add one byte per 64 bytes or part thereof.
1397 * This is conservative and easier to calculate than the
1398 * precise value.
1399 */
1400 len += (len + 63) / 64;
1401 }
1402
1403 if (qd->max_adjlen < len)
1404 qd->max_adjlen = len;
1405 if (qd->min_adjlen > len)
1406 qd->min_adjlen = len;
1407
1408 return len;
1409 }
1410
cake_overhead(struct cake_sched_data * q,const struct sk_buff * skb)1411 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1412 {
1413 const struct skb_shared_info *shinfo = skb_shinfo(skb);
1414 unsigned int hdr_len, last_len = 0;
1415 u32 off = skb_network_offset(skb);
1416 u16 segs = qdisc_pkt_segs(skb);
1417 u32 len = qdisc_pkt_len(skb);
1418
1419 q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1420
1421 if (segs == 1)
1422 return cake_calc_overhead(q, len, off);
1423
1424 /* borrowed from qdisc_pkt_len_segs_init() */
1425 if (!skb->encapsulation)
1426 hdr_len = skb_transport_offset(skb);
1427 else
1428 hdr_len = skb_inner_transport_offset(skb);
1429
1430 /* + transport layer */
1431 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1432 SKB_GSO_TCPV6))) {
1433 const struct tcphdr *th;
1434 struct tcphdr _tcphdr;
1435
1436 th = skb_header_pointer(skb, hdr_len,
1437 sizeof(_tcphdr), &_tcphdr);
1438 if (likely(th))
1439 hdr_len += __tcp_hdrlen(th);
1440 } else {
1441 struct udphdr _udphdr;
1442
1443 if (skb_header_pointer(skb, hdr_len,
1444 sizeof(_udphdr), &_udphdr))
1445 hdr_len += sizeof(struct udphdr);
1446 }
1447
1448 len = shinfo->gso_size + hdr_len;
1449 last_len = skb->len - shinfo->gso_size * (segs - 1);
1450
1451 return (cake_calc_overhead(q, len, off) * (segs - 1) +
1452 cake_calc_overhead(q, last_len, off));
1453 }
1454
cake_heap_swap(struct cake_sched_data * q,u16 i,u16 j)1455 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1456 {
1457 struct cake_heap_entry ii = q->overflow_heap[i];
1458 struct cake_heap_entry jj = q->overflow_heap[j];
1459
1460 q->overflow_heap[i] = jj;
1461 q->overflow_heap[j] = ii;
1462
1463 q->tins[ii.t].overflow_idx[ii.b] = j;
1464 q->tins[jj.t].overflow_idx[jj.b] = i;
1465 }
1466
cake_heap_get_backlog(const struct cake_sched_data * q,u16 i)1467 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1468 {
1469 struct cake_heap_entry ii = q->overflow_heap[i];
1470
1471 return q->tins[ii.t].backlogs[ii.b];
1472 }
1473
cake_heapify(struct cake_sched_data * q,u16 i)1474 static void cake_heapify(struct cake_sched_data *q, u16 i)
1475 {
1476 static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1477 u32 mb = cake_heap_get_backlog(q, i);
1478 u32 m = i;
1479
1480 while (m < a) {
1481 u32 l = m + m + 1;
1482 u32 r = l + 1;
1483
1484 if (l < a) {
1485 u32 lb = cake_heap_get_backlog(q, l);
1486
1487 if (lb > mb) {
1488 m = l;
1489 mb = lb;
1490 }
1491 }
1492
1493 if (r < a) {
1494 u32 rb = cake_heap_get_backlog(q, r);
1495
1496 if (rb > mb) {
1497 m = r;
1498 mb = rb;
1499 }
1500 }
1501
1502 if (m != i) {
1503 cake_heap_swap(q, i, m);
1504 i = m;
1505 } else {
1506 break;
1507 }
1508 }
1509 }
1510
cake_heapify_up(struct cake_sched_data * q,u16 i)1511 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1512 {
1513 while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1514 u16 p = (i - 1) >> 1;
1515 u32 ib = cake_heap_get_backlog(q, i);
1516 u32 pb = cake_heap_get_backlog(q, p);
1517
1518 if (ib > pb) {
1519 cake_heap_swap(q, i, p);
1520 i = p;
1521 } else {
1522 break;
1523 }
1524 }
1525 }
1526
cake_advance_shaper(struct cake_sched_data * q,struct cake_tin_data * b,struct sk_buff * skb,ktime_t now,bool drop)1527 static int cake_advance_shaper(struct cake_sched_data *q,
1528 struct cake_tin_data *b,
1529 struct sk_buff *skb,
1530 ktime_t now, bool drop)
1531 {
1532 u32 len = get_cobalt_cb(skb)->adjusted_len;
1533
1534 /* charge packet bandwidth to this tin
1535 * and to the global shaper.
1536 */
1537 if (q->rate_ns) {
1538 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1539 u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1540 u64 failsafe_dur = global_dur + (global_dur >> 1);
1541
1542 if (ktime_before(b->time_next_packet, now))
1543 b->time_next_packet = ktime_add_ns(b->time_next_packet,
1544 tin_dur);
1545
1546 else if (ktime_before(b->time_next_packet,
1547 ktime_add_ns(now, tin_dur)))
1548 b->time_next_packet = ktime_add_ns(now, tin_dur);
1549
1550 q->time_next_packet = ktime_add_ns(q->time_next_packet,
1551 global_dur);
1552 if (!drop)
1553 q->failsafe_next_packet = \
1554 ktime_add_ns(q->failsafe_next_packet,
1555 failsafe_dur);
1556 }
1557 return len;
1558 }
1559
cake_drop(struct Qdisc * sch,struct sk_buff ** to_free)1560 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1561 {
1562 struct cake_sched_data *q = qdisc_priv(sch);
1563 ktime_t now = ktime_get();
1564 u32 idx = 0, tin = 0, len;
1565 struct cake_heap_entry qq;
1566 struct cake_tin_data *b;
1567 struct cake_flow *flow;
1568 struct sk_buff *skb;
1569
1570 if (!q->overflow_timeout) {
1571 int i;
1572 /* Build fresh max-heap */
1573 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
1574 cake_heapify(q, i);
1575 }
1576 q->overflow_timeout = 65535;
1577
1578 /* select longest queue for pruning */
1579 qq = q->overflow_heap[0];
1580 tin = qq.t;
1581 idx = qq.b;
1582
1583 b = &q->tins[tin];
1584 flow = &b->flows[idx];
1585 skb = dequeue_head(flow);
1586 if (unlikely(!skb)) {
1587 /* heap has gone wrong, rebuild it next time */
1588 q->overflow_timeout = 0;
1589 return idx + (tin << 16);
1590 }
1591
1592 if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1593 b->unresponsive_flow_count++;
1594
1595 len = qdisc_pkt_len(skb);
1596 q->buffer_used -= skb->truesize;
1597 b->backlogs[idx] -= len;
1598 b->tin_backlog -= len;
1599 sch->qstats.backlog -= len;
1600
1601 flow->dropped++;
1602 b->tin_dropped++;
1603
1604 if (q->config->rate_flags & CAKE_FLAG_INGRESS)
1605 cake_advance_shaper(q, b, skb, now, true);
1606
1607 qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
1608 sch->q.qlen--;
1609
1610 cake_heapify(q, 0);
1611
1612 return idx + (tin << 16);
1613 }
1614
cake_handle_diffserv(struct sk_buff * skb,bool wash)1615 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1616 {
1617 const int offset = skb_network_offset(skb);
1618 u16 *buf, buf_;
1619 u8 dscp;
1620
1621 switch (skb_protocol(skb, true)) {
1622 case htons(ETH_P_IP):
1623 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1624 if (unlikely(!buf))
1625 return 0;
1626
1627 /* ToS is in the second byte of iphdr */
1628 dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1629
1630 if (wash && dscp) {
1631 const int wlen = offset + sizeof(struct iphdr);
1632
1633 if (!pskb_may_pull(skb, wlen) ||
1634 skb_try_make_writable(skb, wlen))
1635 return 0;
1636
1637 ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1638 }
1639
1640 return dscp;
1641
1642 case htons(ETH_P_IPV6):
1643 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1644 if (unlikely(!buf))
1645 return 0;
1646
1647 /* Traffic class is in the first and second bytes of ipv6hdr */
1648 dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1649
1650 if (wash && dscp) {
1651 const int wlen = offset + sizeof(struct ipv6hdr);
1652
1653 if (!pskb_may_pull(skb, wlen) ||
1654 skb_try_make_writable(skb, wlen))
1655 return 0;
1656
1657 ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1658 }
1659
1660 return dscp;
1661
1662 case htons(ETH_P_ARP):
1663 return 0x38; /* CS7 - Net Control */
1664
1665 default:
1666 /* If there is no Diffserv field, treat as best-effort */
1667 return 0;
1668 }
1669 }
1670
cake_select_tin(struct Qdisc * sch,struct sk_buff * skb)1671 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1672 struct sk_buff *skb)
1673 {
1674 struct cake_sched_data *qd = qdisc_priv(sch);
1675 struct cake_sched_config *q = qd->config;
1676 u32 tin, mark;
1677 bool wash;
1678 u8 dscp;
1679
1680 /* Tin selection: Default to diffserv-based selection, allow overriding
1681 * using firewall marks or skb->priority. Call DSCP parsing early if
1682 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1683 */
1684 mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1685 wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1686 if (wash)
1687 dscp = cake_handle_diffserv(skb, wash);
1688
1689 if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1690 tin = 0;
1691
1692 else if (mark && mark <= qd->tin_cnt)
1693 tin = qd->tin_order[mark - 1];
1694
1695 else if (TC_H_MAJ(skb->priority) == sch->handle &&
1696 TC_H_MIN(skb->priority) > 0 &&
1697 TC_H_MIN(skb->priority) <= qd->tin_cnt)
1698 tin = qd->tin_order[TC_H_MIN(skb->priority) - 1];
1699
1700 else {
1701 if (!wash)
1702 dscp = cake_handle_diffserv(skb, wash);
1703 tin = qd->tin_index[dscp];
1704
1705 if (unlikely(tin >= qd->tin_cnt))
1706 tin = 0;
1707 }
1708
1709 return &qd->tins[tin];
1710 }
1711
cake_classify(struct Qdisc * sch,struct cake_tin_data ** t,struct sk_buff * skb,int flow_mode,int * qerr)1712 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1713 struct sk_buff *skb, int flow_mode, int *qerr)
1714 {
1715 struct cake_sched_data *q = qdisc_priv(sch);
1716 struct tcf_proto *filter;
1717 struct tcf_result res;
1718 u16 flow = 0, host = 0;
1719 int result;
1720
1721 filter = rcu_dereference_bh(q->filter_list);
1722 if (!filter)
1723 goto hash;
1724
1725 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1726 result = tcf_classify(skb, NULL, filter, &res, false);
1727
1728 if (result >= 0) {
1729 #ifdef CONFIG_NET_CLS_ACT
1730 switch (result) {
1731 case TC_ACT_STOLEN:
1732 case TC_ACT_QUEUED:
1733 case TC_ACT_TRAP:
1734 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1735 fallthrough;
1736 case TC_ACT_SHOT:
1737 return 0;
1738 }
1739 #endif
1740 if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1741 flow = TC_H_MIN(res.classid);
1742 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1743 host = TC_H_MAJ(res.classid) >> 16;
1744 }
1745 hash:
1746 *t = cake_select_tin(sch, skb);
1747 return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1748 }
1749
1750 static void cake_reconfigure(struct Qdisc *sch);
1751
cake_enqueue(struct sk_buff * skb,struct Qdisc * sch,struct sk_buff ** to_free)1752 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1753 struct sk_buff **to_free)
1754 {
1755 u32 idx, tin, prev_qlen, prev_backlog, drop_id;
1756 struct cake_sched_data *q = qdisc_priv(sch);
1757 int len = qdisc_pkt_len(skb), ret;
1758 struct sk_buff *ack = NULL;
1759 ktime_t now = ktime_get();
1760 struct cake_tin_data *b;
1761 struct cake_flow *flow;
1762 bool same_flow = false;
1763
1764 /* choose flow to insert into */
1765 idx = cake_classify(sch, &b, skb, q->config->flow_mode, &ret);
1766 if (idx == 0) {
1767 if (ret & __NET_XMIT_BYPASS)
1768 qdisc_qstats_drop(sch);
1769 __qdisc_drop(skb, to_free);
1770 return ret;
1771 }
1772 tin = (u32)(b - q->tins);
1773 idx--;
1774 flow = &b->flows[idx];
1775
1776 /* ensure shaper state isn't stale */
1777 if (!b->tin_backlog) {
1778 if (ktime_before(b->time_next_packet, now))
1779 b->time_next_packet = now;
1780
1781 if (!sch->q.qlen) {
1782 if (ktime_before(q->time_next_packet, now)) {
1783 q->failsafe_next_packet = now;
1784 q->time_next_packet = now;
1785 } else if (ktime_after(q->time_next_packet, now) &&
1786 ktime_after(q->failsafe_next_packet, now)) {
1787 u64 next = \
1788 min(ktime_to_ns(q->time_next_packet),
1789 ktime_to_ns(
1790 q->failsafe_next_packet));
1791 sch->qstats.overlimits++;
1792 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1793 }
1794 }
1795 }
1796
1797 if (unlikely(len > b->max_skblen))
1798 b->max_skblen = len;
1799
1800 if (qdisc_pkt_segs(skb) > 1 && q->config->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1801 struct sk_buff *segs, *nskb;
1802 netdev_features_t features = netif_skb_features(skb);
1803 unsigned int slen = 0, numsegs = 0;
1804
1805 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1806 if (IS_ERR_OR_NULL(segs))
1807 return qdisc_drop(skb, sch, to_free);
1808
1809 skb_list_walk_safe(segs, segs, nskb) {
1810 skb_mark_not_on_list(segs);
1811 qdisc_skb_cb(segs)->pkt_len = segs->len;
1812 qdisc_skb_cb(segs)->pkt_segs = 1;
1813 cobalt_set_enqueue_time(segs, now);
1814 get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1815 segs);
1816 flow_queue_add(flow, segs);
1817
1818 sch->q.qlen++;
1819 numsegs++;
1820 slen += segs->len;
1821 q->buffer_used += segs->truesize;
1822 b->packets++;
1823 }
1824
1825 /* stats */
1826 b->bytes += slen;
1827 b->backlogs[idx] += slen;
1828 b->tin_backlog += slen;
1829 sch->qstats.backlog += slen;
1830 q->avg_window_bytes += slen;
1831
1832 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1833 consume_skb(skb);
1834 } else {
1835 /* not splitting */
1836 int ack_pkt_len = 0;
1837
1838 cobalt_set_enqueue_time(skb, now);
1839 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1840 flow_queue_add(flow, skb);
1841
1842 if (q->config->ack_filter)
1843 ack = cake_ack_filter(q, flow);
1844
1845 if (ack) {
1846 b->ack_drops++;
1847 sch->qstats.drops++;
1848 ack_pkt_len = qdisc_pkt_len(ack);
1849 b->bytes += ack_pkt_len;
1850 q->buffer_used += skb->truesize - ack->truesize;
1851 if (q->config->rate_flags & CAKE_FLAG_INGRESS)
1852 cake_advance_shaper(q, b, ack, now, true);
1853
1854 qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len);
1855 consume_skb(ack);
1856 } else {
1857 sch->q.qlen++;
1858 q->buffer_used += skb->truesize;
1859 }
1860
1861 /* stats */
1862 b->packets++;
1863 b->bytes += len - ack_pkt_len;
1864 b->backlogs[idx] += len - ack_pkt_len;
1865 b->tin_backlog += len - ack_pkt_len;
1866 sch->qstats.backlog += len - ack_pkt_len;
1867 q->avg_window_bytes += len - ack_pkt_len;
1868 }
1869
1870 if (q->overflow_timeout)
1871 cake_heapify_up(q, b->overflow_idx[idx]);
1872
1873 /* incoming bandwidth capacity estimate */
1874 if (q->config->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1875 u64 packet_interval = \
1876 ktime_to_ns(ktime_sub(now, q->last_packet_time));
1877
1878 if (packet_interval > NSEC_PER_SEC)
1879 packet_interval = NSEC_PER_SEC;
1880
1881 /* filter out short-term bursts, eg. wifi aggregation */
1882 q->avg_packet_interval = \
1883 cake_ewma(q->avg_packet_interval,
1884 packet_interval,
1885 (packet_interval > q->avg_packet_interval ?
1886 2 : 8));
1887
1888 q->last_packet_time = now;
1889
1890 if (packet_interval > q->avg_packet_interval) {
1891 u64 window_interval = \
1892 ktime_to_ns(ktime_sub(now,
1893 q->avg_window_begin));
1894 u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1895
1896 b = div64_u64(b, window_interval);
1897 q->avg_peak_bandwidth =
1898 cake_ewma(q->avg_peak_bandwidth, b,
1899 b > q->avg_peak_bandwidth ? 2 : 8);
1900 q->avg_window_bytes = 0;
1901 q->avg_window_begin = now;
1902
1903 if (ktime_after(now,
1904 ktime_add_ms(q->last_reconfig_time,
1905 250))) {
1906 q->config->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1907 cake_reconfigure(sch);
1908 }
1909 }
1910 } else {
1911 q->avg_window_bytes = 0;
1912 q->last_packet_time = now;
1913 }
1914
1915 /* flowchain */
1916 if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1917 if (!flow->set) {
1918 list_add_tail(&flow->flowchain, &b->new_flows);
1919 } else {
1920 b->decaying_flow_count--;
1921 list_move_tail(&flow->flowchain, &b->new_flows);
1922 }
1923 flow->set = CAKE_SET_SPARSE;
1924 b->sparse_flow_count++;
1925
1926 flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode);
1927 } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1928 /* this flow was empty, accounted as a sparse flow, but actually
1929 * in the bulk rotation.
1930 */
1931 flow->set = CAKE_SET_BULK;
1932 b->sparse_flow_count--;
1933 b->bulk_flow_count++;
1934
1935 cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
1936 cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
1937 }
1938
1939 if (q->buffer_used > q->buffer_max_used)
1940 q->buffer_max_used = q->buffer_used;
1941
1942 if (q->buffer_used <= q->buffer_limit)
1943 return NET_XMIT_SUCCESS;
1944
1945 prev_qlen = sch->q.qlen;
1946 prev_backlog = sch->qstats.backlog;
1947
1948 while (q->buffer_used > q->buffer_limit) {
1949 drop_id = cake_drop(sch, to_free);
1950 if ((drop_id >> 16) == tin &&
1951 (drop_id & 0xFFFF) == idx)
1952 same_flow = true;
1953 }
1954
1955 prev_qlen -= sch->q.qlen;
1956 prev_backlog -= sch->qstats.backlog;
1957 b->drop_overlimit += prev_qlen;
1958
1959 if (same_flow) {
1960 qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
1961 prev_backlog - len);
1962 return NET_XMIT_CN;
1963 }
1964 qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
1965 return NET_XMIT_SUCCESS;
1966 }
1967
cake_dequeue_one(struct Qdisc * sch)1968 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1969 {
1970 struct cake_sched_data *q = qdisc_priv(sch);
1971 struct cake_tin_data *b = &q->tins[q->cur_tin];
1972 struct cake_flow *flow = &b->flows[q->cur_flow];
1973 struct sk_buff *skb = NULL;
1974 u32 len;
1975
1976 if (flow->head) {
1977 skb = dequeue_head(flow);
1978 len = qdisc_pkt_len(skb);
1979 b->backlogs[q->cur_flow] -= len;
1980 b->tin_backlog -= len;
1981 sch->qstats.backlog -= len;
1982 q->buffer_used -= skb->truesize;
1983 sch->q.qlen--;
1984
1985 if (q->overflow_timeout)
1986 cake_heapify(q, b->overflow_idx[q->cur_flow]);
1987 }
1988 return skb;
1989 }
1990
1991 /* Discard leftover packets from a tin no longer in use. */
cake_clear_tin(struct Qdisc * sch,u16 tin)1992 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1993 {
1994 struct cake_sched_data *q = qdisc_priv(sch);
1995 struct sk_buff *skb;
1996
1997 q->cur_tin = tin;
1998 for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1999 while (!!(skb = cake_dequeue_one(sch)))
2000 kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
2001 }
2002
cake_dequeue(struct Qdisc * sch)2003 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
2004 {
2005 struct cake_sched_data *q = qdisc_priv(sch);
2006 struct cake_tin_data *b = &q->tins[q->cur_tin];
2007 enum skb_drop_reason reason;
2008 ktime_t now = ktime_get();
2009 struct cake_flow *flow;
2010 struct list_head *head;
2011 bool first_flow = true;
2012 struct sk_buff *skb;
2013 u64 delay;
2014 u32 len;
2015
2016 if (q->config->is_shared && now - q->last_checked_active >= q->config->sync_time) {
2017 struct net_device *dev = qdisc_dev(sch);
2018 struct cake_sched_data *other_priv;
2019 u64 new_rate = q->config->rate_bps;
2020 u64 other_qlen, other_last_active;
2021 struct Qdisc *other_sch;
2022 u32 num_active_qs = 1;
2023 unsigned int ntx;
2024
2025 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
2026 other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
2027 other_priv = qdisc_priv(other_sch);
2028
2029 if (other_priv == q)
2030 continue;
2031
2032 other_qlen = READ_ONCE(other_sch->q.qlen);
2033 other_last_active = READ_ONCE(other_priv->last_active);
2034
2035 if (other_qlen || other_last_active > q->last_checked_active)
2036 num_active_qs++;
2037 }
2038
2039 if (num_active_qs > 1)
2040 new_rate = div64_u64(q->config->rate_bps, num_active_qs);
2041
2042 /* mtu = 0 is used to only update the rate and not mess with cobalt params */
2043 cake_set_rate(b, new_rate, 0, 0, 0);
2044 q->last_checked_active = now;
2045 q->active_queues = num_active_qs;
2046 q->rate_ns = b->tin_rate_ns;
2047 q->rate_shft = b->tin_rate_shft;
2048 }
2049
2050 begin:
2051 if (!sch->q.qlen)
2052 return NULL;
2053
2054 /* global hard shaper */
2055 if (ktime_after(q->time_next_packet, now) &&
2056 ktime_after(q->failsafe_next_packet, now)) {
2057 u64 next = min(ktime_to_ns(q->time_next_packet),
2058 ktime_to_ns(q->failsafe_next_packet));
2059
2060 sch->qstats.overlimits++;
2061 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2062 return NULL;
2063 }
2064
2065 /* Choose a class to work on. */
2066 if (!q->rate_ns) {
2067 /* In unlimited mode, can't rely on shaper timings, just balance
2068 * with DRR
2069 */
2070 bool wrapped = false, empty = true;
2071
2072 while (b->tin_deficit < 0 ||
2073 !(b->sparse_flow_count + b->bulk_flow_count)) {
2074 if (b->tin_deficit <= 0)
2075 b->tin_deficit += b->tin_quantum;
2076 if (b->sparse_flow_count + b->bulk_flow_count)
2077 empty = false;
2078
2079 q->cur_tin++;
2080 b++;
2081 if (q->cur_tin >= q->tin_cnt) {
2082 q->cur_tin = 0;
2083 b = q->tins;
2084
2085 if (wrapped) {
2086 /* It's possible for q->qlen to be
2087 * nonzero when we actually have no
2088 * packets anywhere.
2089 */
2090 if (empty)
2091 return NULL;
2092 } else {
2093 wrapped = true;
2094 }
2095 }
2096 }
2097 } else {
2098 /* In shaped mode, choose:
2099 * - Highest-priority tin with queue and meeting schedule, or
2100 * - The earliest-scheduled tin with queue.
2101 */
2102 ktime_t best_time = KTIME_MAX;
2103 int tin, best_tin = 0;
2104
2105 for (tin = 0; tin < q->tin_cnt; tin++) {
2106 b = q->tins + tin;
2107 if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2108 ktime_t time_to_pkt = \
2109 ktime_sub(b->time_next_packet, now);
2110
2111 if (ktime_to_ns(time_to_pkt) <= 0 ||
2112 ktime_compare(time_to_pkt,
2113 best_time) <= 0) {
2114 best_time = time_to_pkt;
2115 best_tin = tin;
2116 }
2117 }
2118 }
2119
2120 q->cur_tin = best_tin;
2121 b = q->tins + best_tin;
2122
2123 /* No point in going further if no packets to deliver. */
2124 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2125 return NULL;
2126 }
2127
2128 retry:
2129 /* service this class */
2130 head = &b->decaying_flows;
2131 if (!first_flow || list_empty(head)) {
2132 head = &b->new_flows;
2133 if (list_empty(head)) {
2134 head = &b->old_flows;
2135 if (unlikely(list_empty(head))) {
2136 head = &b->decaying_flows;
2137 if (unlikely(list_empty(head)))
2138 goto begin;
2139 }
2140 }
2141 }
2142 flow = list_first_entry(head, struct cake_flow, flowchain);
2143 q->cur_flow = flow - b->flows;
2144 first_flow = false;
2145
2146 /* flow isolation (DRR++) */
2147 if (flow->deficit <= 0) {
2148 /* Keep all flows with deficits out of the sparse and decaying
2149 * rotations. No non-empty flow can go into the decaying
2150 * rotation, so they can't get deficits
2151 */
2152 if (flow->set == CAKE_SET_SPARSE) {
2153 if (flow->head) {
2154 b->sparse_flow_count--;
2155 b->bulk_flow_count++;
2156
2157 cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
2158 cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
2159
2160 flow->set = CAKE_SET_BULK;
2161 } else {
2162 /* we've moved it to the bulk rotation for
2163 * correct deficit accounting but we still want
2164 * to count it as a sparse flow, not a bulk one.
2165 */
2166 flow->set = CAKE_SET_SPARSE_WAIT;
2167 }
2168 }
2169
2170 flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode);
2171 list_move_tail(&flow->flowchain, &b->old_flows);
2172
2173 goto retry;
2174 }
2175
2176 /* Retrieve a packet via the AQM */
2177 while (1) {
2178 skb = cake_dequeue_one(sch);
2179 if (!skb) {
2180 /* this queue was actually empty */
2181 if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2182 b->unresponsive_flow_count--;
2183
2184 if (flow->cvars.p_drop || flow->cvars.count ||
2185 ktime_before(now, flow->cvars.drop_next)) {
2186 /* keep in the flowchain until the state has
2187 * decayed to rest
2188 */
2189 list_move_tail(&flow->flowchain,
2190 &b->decaying_flows);
2191 if (flow->set == CAKE_SET_BULK) {
2192 b->bulk_flow_count--;
2193
2194 cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
2195 cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
2196
2197 b->decaying_flow_count++;
2198 } else if (flow->set == CAKE_SET_SPARSE ||
2199 flow->set == CAKE_SET_SPARSE_WAIT) {
2200 b->sparse_flow_count--;
2201 b->decaying_flow_count++;
2202 }
2203 flow->set = CAKE_SET_DECAYING;
2204 } else {
2205 /* remove empty queue from the flowchain */
2206 list_del_init(&flow->flowchain);
2207 if (flow->set == CAKE_SET_SPARSE ||
2208 flow->set == CAKE_SET_SPARSE_WAIT)
2209 b->sparse_flow_count--;
2210 else if (flow->set == CAKE_SET_BULK) {
2211 b->bulk_flow_count--;
2212
2213 cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
2214 cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
2215 } else
2216 b->decaying_flow_count--;
2217
2218 flow->set = CAKE_SET_NONE;
2219 }
2220 goto begin;
2221 }
2222
2223 reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2224 (b->bulk_flow_count *
2225 !!(q->config->rate_flags &
2226 CAKE_FLAG_INGRESS)));
2227 /* Last packet in queue may be marked, shouldn't be dropped */
2228 if (reason == SKB_NOT_DROPPED_YET || !flow->head)
2229 break;
2230
2231 /* drop this packet, get another one */
2232 if (q->config->rate_flags & CAKE_FLAG_INGRESS) {
2233 len = cake_advance_shaper(q, b, skb,
2234 now, true);
2235 flow->deficit -= len;
2236 b->tin_deficit -= len;
2237 }
2238 flow->dropped++;
2239 b->tin_dropped++;
2240 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2241 qdisc_qstats_drop(sch);
2242 qdisc_dequeue_drop(sch, skb, reason);
2243 if (q->config->rate_flags & CAKE_FLAG_INGRESS)
2244 goto retry;
2245 }
2246
2247 b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2248 qdisc_bstats_update(sch, skb);
2249 WRITE_ONCE(q->last_active, now);
2250
2251 /* collect delay stats */
2252 delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2253 b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2254 b->peak_delay = cake_ewma(b->peak_delay, delay,
2255 delay > b->peak_delay ? 2 : 8);
2256 b->base_delay = cake_ewma(b->base_delay, delay,
2257 delay < b->base_delay ? 2 : 8);
2258
2259 len = cake_advance_shaper(q, b, skb, now, false);
2260 flow->deficit -= len;
2261 b->tin_deficit -= len;
2262
2263 if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2264 u64 next = min(ktime_to_ns(q->time_next_packet),
2265 ktime_to_ns(q->failsafe_next_packet));
2266
2267 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2268 } else if (!sch->q.qlen) {
2269 int i;
2270
2271 for (i = 0; i < q->tin_cnt; i++) {
2272 if (q->tins[i].decaying_flow_count) {
2273 ktime_t next = \
2274 ktime_add_ns(now,
2275 q->tins[i].cparams.target);
2276
2277 qdisc_watchdog_schedule_ns(&q->watchdog,
2278 ktime_to_ns(next));
2279 break;
2280 }
2281 }
2282 }
2283
2284 if (q->overflow_timeout)
2285 q->overflow_timeout--;
2286
2287 return skb;
2288 }
2289
cake_reset(struct Qdisc * sch)2290 static void cake_reset(struct Qdisc *sch)
2291 {
2292 struct cake_sched_data *q = qdisc_priv(sch);
2293 u32 c;
2294
2295 if (!q->tins)
2296 return;
2297
2298 for (c = 0; c < CAKE_MAX_TINS; c++)
2299 cake_clear_tin(sch, c);
2300 }
2301
2302 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2303 [TCA_CAKE_BASE_RATE64] = { .type = NLA_U64 },
2304 [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2305 [TCA_CAKE_ATM] = { .type = NLA_U32 },
2306 [TCA_CAKE_FLOW_MODE] = { .type = NLA_U32 },
2307 [TCA_CAKE_OVERHEAD] = { .type = NLA_S32 },
2308 [TCA_CAKE_RTT] = { .type = NLA_U32 },
2309 [TCA_CAKE_TARGET] = { .type = NLA_U32 },
2310 [TCA_CAKE_AUTORATE] = { .type = NLA_U32 },
2311 [TCA_CAKE_MEMORY] = { .type = NLA_U32 },
2312 [TCA_CAKE_NAT] = { .type = NLA_U32 },
2313 [TCA_CAKE_RAW] = { .type = NLA_U32 },
2314 [TCA_CAKE_WASH] = { .type = NLA_U32 },
2315 [TCA_CAKE_MPU] = { .type = NLA_U32 },
2316 [TCA_CAKE_INGRESS] = { .type = NLA_U32 },
2317 [TCA_CAKE_ACK_FILTER] = { .type = NLA_U32 },
2318 [TCA_CAKE_SPLIT_GSO] = { .type = NLA_U32 },
2319 [TCA_CAKE_FWMARK] = { .type = NLA_U32 },
2320 };
2321
cake_set_rate(struct cake_tin_data * b,u64 rate,u32 mtu,u64 target_ns,u64 rtt_est_ns)2322 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2323 u64 target_ns, u64 rtt_est_ns)
2324 {
2325 /* convert byte-rate into time-per-byte
2326 * so it will always unwedge in reasonable time.
2327 */
2328 static const u64 MIN_RATE = 64;
2329 u32 byte_target = mtu;
2330 u64 byte_target_ns;
2331 u8 rate_shft = 0;
2332 u64 rate_ns = 0;
2333
2334 b->flow_quantum = 1514;
2335 if (rate) {
2336 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2337 rate_shft = 34;
2338 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2339 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2340 while (!!(rate_ns >> 34)) {
2341 rate_ns >>= 1;
2342 rate_shft--;
2343 }
2344 } /* else unlimited, ie. zero delay */
2345
2346 b->tin_rate_bps = rate;
2347 b->tin_rate_ns = rate_ns;
2348 b->tin_rate_shft = rate_shft;
2349
2350 if (mtu == 0)
2351 return;
2352
2353 byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2354
2355 b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2356 b->cparams.interval = max(rtt_est_ns +
2357 b->cparams.target - target_ns,
2358 b->cparams.target * 2);
2359 b->cparams.mtu_time = byte_target_ns;
2360 b->cparams.p_inc = 1 << 24; /* 1/256 */
2361 b->cparams.p_dec = 1 << 20; /* 1/4096 */
2362 }
2363
cake_config_besteffort(struct Qdisc * sch)2364 static int cake_config_besteffort(struct Qdisc *sch)
2365 {
2366 struct cake_sched_data *q = qdisc_priv(sch);
2367 struct cake_tin_data *b = &q->tins[0];
2368 u32 mtu = psched_mtu(qdisc_dev(sch));
2369 u64 rate = q->config->rate_bps;
2370
2371 q->tin_cnt = 1;
2372
2373 q->tin_index = besteffort;
2374 q->tin_order = normal_order;
2375
2376 cake_set_rate(b, rate, mtu,
2377 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2378 b->tin_quantum = 65535;
2379
2380 return 0;
2381 }
2382
cake_config_precedence(struct Qdisc * sch)2383 static int cake_config_precedence(struct Qdisc *sch)
2384 {
2385 /* convert high-level (user visible) parameters into internal format */
2386 struct cake_sched_data *q = qdisc_priv(sch);
2387 u32 mtu = psched_mtu(qdisc_dev(sch));
2388 u64 rate = q->config->rate_bps;
2389 u32 quantum = 256;
2390 u32 i;
2391
2392 q->tin_cnt = 8;
2393 q->tin_index = precedence;
2394 q->tin_order = normal_order;
2395
2396 for (i = 0; i < q->tin_cnt; i++) {
2397 struct cake_tin_data *b = &q->tins[i];
2398
2399 cake_set_rate(b, rate, mtu, us_to_ns(q->config->target),
2400 us_to_ns(q->config->interval));
2401
2402 b->tin_quantum = max_t(u16, 1U, quantum);
2403
2404 /* calculate next class's parameters */
2405 rate *= 7;
2406 rate >>= 3;
2407
2408 quantum *= 7;
2409 quantum >>= 3;
2410 }
2411
2412 return 0;
2413 }
2414
2415 /* List of known Diffserv codepoints:
2416 *
2417 * Default Forwarding (DF/CS0) - Best Effort
2418 * Max Throughput (TOS2)
2419 * Min Delay (TOS4)
2420 * LLT "La" (TOS5)
2421 * Assured Forwarding 1 (AF1x) - x3
2422 * Assured Forwarding 2 (AF2x) - x3
2423 * Assured Forwarding 3 (AF3x) - x3
2424 * Assured Forwarding 4 (AF4x) - x3
2425 * Precedence Class 1 (CS1)
2426 * Precedence Class 2 (CS2)
2427 * Precedence Class 3 (CS3)
2428 * Precedence Class 4 (CS4)
2429 * Precedence Class 5 (CS5)
2430 * Precedence Class 6 (CS6)
2431 * Precedence Class 7 (CS7)
2432 * Voice Admit (VA)
2433 * Expedited Forwarding (EF)
2434 * Lower Effort (LE)
2435 *
2436 * Total 26 codepoints.
2437 */
2438
2439 /* List of traffic classes in RFC 4594, updated by RFC 8622:
2440 * (roughly descending order of contended priority)
2441 * (roughly ascending order of uncontended throughput)
2442 *
2443 * Network Control (CS6,CS7) - routing traffic
2444 * Telephony (EF,VA) - aka. VoIP streams
2445 * Signalling (CS5) - VoIP setup
2446 * Multimedia Conferencing (AF4x) - aka. video calls
2447 * Realtime Interactive (CS4) - eg. games
2448 * Multimedia Streaming (AF3x) - eg. YouTube, NetFlix, Twitch
2449 * Broadcast Video (CS3)
2450 * Low-Latency Data (AF2x,TOS4) - eg. database
2451 * Ops, Admin, Management (CS2) - eg. ssh
2452 * Standard Service (DF & unrecognised codepoints)
2453 * High-Throughput Data (AF1x,TOS2) - eg. web traffic
2454 * Low-Priority Data (LE,CS1) - eg. BitTorrent
2455 *
2456 * Total 12 traffic classes.
2457 */
2458
cake_config_diffserv8(struct Qdisc * sch)2459 static int cake_config_diffserv8(struct Qdisc *sch)
2460 {
2461 /* Pruned list of traffic classes for typical applications:
2462 *
2463 * Network Control (CS6, CS7)
2464 * Minimum Latency (EF, VA, CS5, CS4)
2465 * Interactive Shell (CS2)
2466 * Low Latency Transactions (AF2x, TOS4)
2467 * Video Streaming (AF4x, AF3x, CS3)
2468 * Bog Standard (DF etc.)
2469 * High Throughput (AF1x, TOS2, CS1)
2470 * Background Traffic (LE)
2471 *
2472 * Total 8 traffic classes.
2473 */
2474
2475 struct cake_sched_data *q = qdisc_priv(sch);
2476 u32 mtu = psched_mtu(qdisc_dev(sch));
2477 u64 rate = q->config->rate_bps;
2478 u32 quantum = 256;
2479 u32 i;
2480
2481 q->tin_cnt = 8;
2482
2483 /* codepoint to class mapping */
2484 q->tin_index = diffserv8;
2485 q->tin_order = normal_order;
2486
2487 /* class characteristics */
2488 for (i = 0; i < q->tin_cnt; i++) {
2489 struct cake_tin_data *b = &q->tins[i];
2490
2491 cake_set_rate(b, rate, mtu, us_to_ns(q->config->target),
2492 us_to_ns(q->config->interval));
2493
2494 b->tin_quantum = max_t(u16, 1U, quantum);
2495
2496 /* calculate next class's parameters */
2497 rate *= 7;
2498 rate >>= 3;
2499
2500 quantum *= 7;
2501 quantum >>= 3;
2502 }
2503
2504 return 0;
2505 }
2506
cake_config_diffserv4(struct Qdisc * sch)2507 static int cake_config_diffserv4(struct Qdisc *sch)
2508 {
2509 /* Further pruned list of traffic classes for four-class system:
2510 *
2511 * Latency Sensitive (CS7, CS6, EF, VA, CS5, CS4)
2512 * Streaming Media (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2513 * Best Effort (DF, AF1x, TOS2, and those not specified)
2514 * Background Traffic (LE, CS1)
2515 *
2516 * Total 4 traffic classes.
2517 */
2518
2519 struct cake_sched_data *q = qdisc_priv(sch);
2520 u32 mtu = psched_mtu(qdisc_dev(sch));
2521 u64 rate = q->config->rate_bps;
2522 u32 quantum = 1024;
2523
2524 q->tin_cnt = 4;
2525
2526 /* codepoint to class mapping */
2527 q->tin_index = diffserv4;
2528 q->tin_order = bulk_order;
2529
2530 /* class characteristics */
2531 cake_set_rate(&q->tins[0], rate, mtu,
2532 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2533 cake_set_rate(&q->tins[1], rate >> 4, mtu,
2534 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2535 cake_set_rate(&q->tins[2], rate >> 1, mtu,
2536 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2537 cake_set_rate(&q->tins[3], rate >> 2, mtu,
2538 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2539
2540 /* bandwidth-sharing weights */
2541 q->tins[0].tin_quantum = quantum;
2542 q->tins[1].tin_quantum = quantum >> 4;
2543 q->tins[2].tin_quantum = quantum >> 1;
2544 q->tins[3].tin_quantum = quantum >> 2;
2545
2546 return 0;
2547 }
2548
cake_config_diffserv3(struct Qdisc * sch)2549 static int cake_config_diffserv3(struct Qdisc *sch)
2550 {
2551 /* Simplified Diffserv structure with 3 tins.
2552 * Latency Sensitive (CS7, CS6, EF, VA, TOS4)
2553 * Best Effort
2554 * Low Priority (LE, CS1)
2555 */
2556 struct cake_sched_data *q = qdisc_priv(sch);
2557 u32 mtu = psched_mtu(qdisc_dev(sch));
2558 u64 rate = q->config->rate_bps;
2559 u32 quantum = 1024;
2560
2561 q->tin_cnt = 3;
2562
2563 /* codepoint to class mapping */
2564 q->tin_index = diffserv3;
2565 q->tin_order = bulk_order;
2566
2567 /* class characteristics */
2568 cake_set_rate(&q->tins[0], rate, mtu,
2569 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2570 cake_set_rate(&q->tins[1], rate >> 4, mtu,
2571 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2572 cake_set_rate(&q->tins[2], rate >> 2, mtu,
2573 us_to_ns(q->config->target), us_to_ns(q->config->interval));
2574
2575 /* bandwidth-sharing weights */
2576 q->tins[0].tin_quantum = quantum;
2577 q->tins[1].tin_quantum = quantum >> 4;
2578 q->tins[2].tin_quantum = quantum >> 2;
2579
2580 return 0;
2581 }
2582
cake_reconfigure(struct Qdisc * sch)2583 static void cake_reconfigure(struct Qdisc *sch)
2584 {
2585 struct cake_sched_data *qd = qdisc_priv(sch);
2586 struct cake_sched_config *q = qd->config;
2587 int c, ft;
2588
2589 switch (q->tin_mode) {
2590 case CAKE_DIFFSERV_BESTEFFORT:
2591 ft = cake_config_besteffort(sch);
2592 break;
2593
2594 case CAKE_DIFFSERV_PRECEDENCE:
2595 ft = cake_config_precedence(sch);
2596 break;
2597
2598 case CAKE_DIFFSERV_DIFFSERV8:
2599 ft = cake_config_diffserv8(sch);
2600 break;
2601
2602 case CAKE_DIFFSERV_DIFFSERV4:
2603 ft = cake_config_diffserv4(sch);
2604 break;
2605
2606 case CAKE_DIFFSERV_DIFFSERV3:
2607 default:
2608 ft = cake_config_diffserv3(sch);
2609 break;
2610 }
2611
2612 for (c = qd->tin_cnt; c < CAKE_MAX_TINS; c++) {
2613 cake_clear_tin(sch, c);
2614 qd->tins[c].cparams.mtu_time = qd->tins[ft].cparams.mtu_time;
2615 }
2616
2617 qd->rate_ns = qd->tins[ft].tin_rate_ns;
2618 qd->rate_shft = qd->tins[ft].tin_rate_shft;
2619
2620 if (q->buffer_config_limit) {
2621 qd->buffer_limit = q->buffer_config_limit;
2622 } else if (q->rate_bps) {
2623 u64 t = q->rate_bps * q->interval;
2624
2625 do_div(t, USEC_PER_SEC / 4);
2626 qd->buffer_limit = max_t(u32, t, 4U << 20);
2627 } else {
2628 qd->buffer_limit = ~0;
2629 }
2630
2631 sch->flags &= ~TCQ_F_CAN_BYPASS;
2632
2633 qd->buffer_limit = min(qd->buffer_limit,
2634 max(sch->limit * psched_mtu(qdisc_dev(sch)),
2635 q->buffer_config_limit));
2636 }
2637
cake_config_change(struct cake_sched_config * q,struct nlattr * opt,struct netlink_ext_ack * extack,bool * overhead_changed)2638 static int cake_config_change(struct cake_sched_config *q, struct nlattr *opt,
2639 struct netlink_ext_ack *extack, bool *overhead_changed)
2640 {
2641 struct nlattr *tb[TCA_CAKE_MAX + 1];
2642 u16 rate_flags = q->rate_flags;
2643 u8 flow_mode = q->flow_mode;
2644 int err;
2645
2646 err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2647 extack);
2648 if (err < 0)
2649 return err;
2650
2651 if (tb[TCA_CAKE_NAT]) {
2652 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2653 flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2654 flow_mode |= CAKE_FLOW_NAT_FLAG *
2655 !!nla_get_u32(tb[TCA_CAKE_NAT]);
2656 #else
2657 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2658 "No conntrack support in kernel");
2659 return -EOPNOTSUPP;
2660 #endif
2661 }
2662
2663 if (tb[TCA_CAKE_AUTORATE]) {
2664 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) {
2665 if (q->is_shared) {
2666 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_AUTORATE],
2667 "Can't use autorate-ingress with cake_mq");
2668 return -EOPNOTSUPP;
2669 }
2670 rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2671 } else {
2672 rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2673 }
2674 }
2675
2676 if (tb[TCA_CAKE_BASE_RATE64])
2677 WRITE_ONCE(q->rate_bps,
2678 nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));
2679
2680 if (tb[TCA_CAKE_DIFFSERV_MODE])
2681 WRITE_ONCE(q->tin_mode,
2682 nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));
2683
2684 if (tb[TCA_CAKE_WASH]) {
2685 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2686 rate_flags |= CAKE_FLAG_WASH;
2687 else
2688 rate_flags &= ~CAKE_FLAG_WASH;
2689 }
2690
2691 if (tb[TCA_CAKE_FLOW_MODE])
2692 flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) |
2693 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2694 CAKE_FLOW_MASK));
2695
2696 if (tb[TCA_CAKE_ATM])
2697 WRITE_ONCE(q->atm_mode,
2698 nla_get_u32(tb[TCA_CAKE_ATM]));
2699
2700 if (tb[TCA_CAKE_OVERHEAD]) {
2701 WRITE_ONCE(q->rate_overhead,
2702 nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
2703 rate_flags |= CAKE_FLAG_OVERHEAD;
2704 *overhead_changed = true;
2705 }
2706
2707 if (tb[TCA_CAKE_RAW]) {
2708 rate_flags &= ~CAKE_FLAG_OVERHEAD;
2709 *overhead_changed = true;
2710 }
2711
2712 if (tb[TCA_CAKE_MPU])
2713 WRITE_ONCE(q->rate_mpu,
2714 nla_get_u32(tb[TCA_CAKE_MPU]));
2715
2716 if (tb[TCA_CAKE_RTT]) {
2717 u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2718
2719 WRITE_ONCE(q->interval, max(interval, 1U));
2720 }
2721
2722 if (tb[TCA_CAKE_TARGET]) {
2723 u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2724
2725 WRITE_ONCE(q->target, max(target, 1U));
2726 }
2727
2728 if (tb[TCA_CAKE_INGRESS]) {
2729 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2730 rate_flags |= CAKE_FLAG_INGRESS;
2731 else
2732 rate_flags &= ~CAKE_FLAG_INGRESS;
2733 }
2734
2735 if (tb[TCA_CAKE_ACK_FILTER])
2736 WRITE_ONCE(q->ack_filter,
2737 nla_get_u32(tb[TCA_CAKE_ACK_FILTER]));
2738
2739 if (tb[TCA_CAKE_MEMORY])
2740 WRITE_ONCE(q->buffer_config_limit,
2741 nla_get_u32(tb[TCA_CAKE_MEMORY]));
2742
2743 if (tb[TCA_CAKE_SPLIT_GSO]) {
2744 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2745 rate_flags |= CAKE_FLAG_SPLIT_GSO;
2746 else
2747 rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2748 }
2749
2750 if (tb[TCA_CAKE_FWMARK]) {
2751 WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK]));
2752 WRITE_ONCE(q->fwmark_shft,
2753 q->fwmark_mask ? __ffs(q->fwmark_mask) : 0);
2754 }
2755
2756 WRITE_ONCE(q->rate_flags, rate_flags);
2757 WRITE_ONCE(q->flow_mode, flow_mode);
2758
2759 return 0;
2760 }
2761
cake_change(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2762 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2763 struct netlink_ext_ack *extack)
2764 {
2765 struct cake_sched_data *qd = qdisc_priv(sch);
2766 struct cake_sched_config *q = qd->config;
2767 bool overhead_changed = false;
2768 int ret;
2769
2770 if (q->is_shared) {
2771 NL_SET_ERR_MSG(extack, "can't reconfigure cake_mq sub-qdiscs");
2772 return -EOPNOTSUPP;
2773 }
2774
2775 ret = cake_config_change(q, opt, extack, &overhead_changed);
2776 if (ret)
2777 return ret;
2778
2779 if (overhead_changed) {
2780 qd->max_netlen = 0;
2781 qd->max_adjlen = 0;
2782 qd->min_netlen = ~0;
2783 qd->min_adjlen = ~0;
2784 }
2785
2786 if (qd->tins) {
2787 sch_tree_lock(sch);
2788 cake_reconfigure(sch);
2789 sch_tree_unlock(sch);
2790 }
2791
2792 return 0;
2793 }
2794
cake_destroy(struct Qdisc * sch)2795 static void cake_destroy(struct Qdisc *sch)
2796 {
2797 struct cake_sched_data *q = qdisc_priv(sch);
2798
2799 qdisc_watchdog_cancel(&q->watchdog);
2800 tcf_block_put(q->block);
2801 kvfree(q->tins);
2802 }
2803
cake_config_init(struct cake_sched_config * q,bool is_shared)2804 static void cake_config_init(struct cake_sched_config *q, bool is_shared)
2805 {
2806 q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2807 q->flow_mode = CAKE_FLOW_TRIPLE;
2808
2809 q->rate_bps = 0; /* unlimited by default */
2810
2811 q->interval = 100000; /* 100ms default */
2812 q->target = 5000; /* 5ms: codel RFC argues
2813 * for 5 to 10% of interval
2814 */
2815 q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2816 q->is_shared = is_shared;
2817 q->sync_time = 200 * NSEC_PER_USEC;
2818 }
2819
cake_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2820 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2821 struct netlink_ext_ack *extack)
2822 {
2823 struct cake_sched_data *qd = qdisc_priv(sch);
2824 struct cake_sched_config *q = &qd->initial_config;
2825 int i, j, err;
2826
2827 cake_config_init(q, false);
2828
2829 sch->limit = 10240;
2830 sch->flags |= TCQ_F_DEQUEUE_DROPS;
2831
2832 qd->cur_tin = 0;
2833 qd->cur_flow = 0;
2834 qd->config = q;
2835
2836 qdisc_watchdog_init(&qd->watchdog, sch);
2837
2838 if (opt) {
2839 err = cake_change(sch, opt, extack);
2840 if (err)
2841 return err;
2842 }
2843
2844 err = tcf_block_get(&qd->block, &qd->filter_list, sch, extack);
2845 if (err)
2846 return err;
2847
2848 quantum_div[0] = ~0;
2849 for (i = 1; i <= CAKE_QUEUES; i++)
2850 quantum_div[i] = 65535 / i;
2851
2852 qd->tins = kvzalloc_objs(struct cake_tin_data, CAKE_MAX_TINS);
2853 if (!qd->tins)
2854 return -ENOMEM;
2855
2856 for (i = 0; i < CAKE_MAX_TINS; i++) {
2857 struct cake_tin_data *b = qd->tins + i;
2858
2859 INIT_LIST_HEAD(&b->new_flows);
2860 INIT_LIST_HEAD(&b->old_flows);
2861 INIT_LIST_HEAD(&b->decaying_flows);
2862 b->sparse_flow_count = 0;
2863 b->bulk_flow_count = 0;
2864 b->decaying_flow_count = 0;
2865
2866 for (j = 0; j < CAKE_QUEUES; j++) {
2867 struct cake_flow *flow = b->flows + j;
2868 u32 k = j * CAKE_MAX_TINS + i;
2869
2870 INIT_LIST_HEAD(&flow->flowchain);
2871 cobalt_vars_init(&flow->cvars);
2872
2873 qd->overflow_heap[k].t = i;
2874 qd->overflow_heap[k].b = j;
2875 b->overflow_idx[j] = k;
2876 }
2877 }
2878
2879 cake_reconfigure(sch);
2880 qd->avg_peak_bandwidth = q->rate_bps;
2881 qd->min_netlen = ~0;
2882 qd->min_adjlen = ~0;
2883 qd->active_queues = 0;
2884 qd->last_checked_active = 0;
2885
2886 return 0;
2887 }
2888
cake_config_replace(struct Qdisc * sch,struct cake_sched_config * cfg)2889 static void cake_config_replace(struct Qdisc *sch, struct cake_sched_config *cfg)
2890 {
2891 struct cake_sched_data *qd = qdisc_priv(sch);
2892
2893 qd->config = cfg;
2894 cake_reconfigure(sch);
2895 }
2896
cake_config_dump(struct cake_sched_config * q,struct sk_buff * skb)2897 static int cake_config_dump(struct cake_sched_config *q, struct sk_buff *skb)
2898 {
2899 struct nlattr *opts;
2900 u16 rate_flags;
2901 u8 flow_mode;
2902
2903 opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2904 if (!opts)
2905 goto nla_put_failure;
2906
2907 if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64,
2908 READ_ONCE(q->rate_bps), TCA_CAKE_PAD))
2909 goto nla_put_failure;
2910
2911 flow_mode = READ_ONCE(q->flow_mode);
2912 if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK))
2913 goto nla_put_failure;
2914
2915 if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval)))
2916 goto nla_put_failure;
2917
2918 if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target)))
2919 goto nla_put_failure;
2920
2921 if (nla_put_u32(skb, TCA_CAKE_MEMORY,
2922 READ_ONCE(q->buffer_config_limit)))
2923 goto nla_put_failure;
2924
2925 rate_flags = READ_ONCE(q->rate_flags);
2926 if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2927 !!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2928 goto nla_put_failure;
2929
2930 if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2931 !!(rate_flags & CAKE_FLAG_INGRESS)))
2932 goto nla_put_failure;
2933
2934 if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter)))
2935 goto nla_put_failure;
2936
2937 if (nla_put_u32(skb, TCA_CAKE_NAT,
2938 !!(flow_mode & CAKE_FLOW_NAT_FLAG)))
2939 goto nla_put_failure;
2940
2941 if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode)))
2942 goto nla_put_failure;
2943
2944 if (nla_put_u32(skb, TCA_CAKE_WASH,
2945 !!(rate_flags & CAKE_FLAG_WASH)))
2946 goto nla_put_failure;
2947
2948 if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead)))
2949 goto nla_put_failure;
2950
2951 if (!(rate_flags & CAKE_FLAG_OVERHEAD))
2952 if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2953 goto nla_put_failure;
2954
2955 if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode)))
2956 goto nla_put_failure;
2957
2958 if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu)))
2959 goto nla_put_failure;
2960
2961 if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2962 !!(rate_flags & CAKE_FLAG_SPLIT_GSO)))
2963 goto nla_put_failure;
2964
2965 if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask)))
2966 goto nla_put_failure;
2967
2968 return nla_nest_end(skb, opts);
2969
2970 nla_put_failure:
2971 return -1;
2972 }
2973
cake_dump(struct Qdisc * sch,struct sk_buff * skb)2974 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2975 {
2976 struct cake_sched_data *qd = qdisc_priv(sch);
2977
2978 return cake_config_dump(qd->config, skb);
2979 }
2980
cake_dump_stats(struct Qdisc * sch,struct gnet_dump * d)2981 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2982 {
2983 struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2984 struct cake_sched_data *q = qdisc_priv(sch);
2985 struct nlattr *tstats, *ts;
2986 int i;
2987
2988 if (!stats)
2989 return -1;
2990
2991 #define PUT_STAT_U32(attr, data) do { \
2992 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2993 goto nla_put_failure; \
2994 } while (0)
2995 #define PUT_STAT_U64(attr, data) do { \
2996 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2997 data, TCA_CAKE_STATS_PAD)) \
2998 goto nla_put_failure; \
2999 } while (0)
3000
3001 PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
3002 PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
3003 PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
3004 PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
3005 PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
3006 PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
3007 PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
3008 PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
3009 PUT_STAT_U32(ACTIVE_QUEUES, q->active_queues);
3010
3011 #undef PUT_STAT_U32
3012 #undef PUT_STAT_U64
3013
3014 tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
3015 if (!tstats)
3016 goto nla_put_failure;
3017
3018 #define PUT_TSTAT_U32(attr, data) do { \
3019 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
3020 goto nla_put_failure; \
3021 } while (0)
3022 #define PUT_TSTAT_U64(attr, data) do { \
3023 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
3024 data, TCA_CAKE_TIN_STATS_PAD)) \
3025 goto nla_put_failure; \
3026 } while (0)
3027
3028 for (i = 0; i < q->tin_cnt; i++) {
3029 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3030
3031 ts = nla_nest_start_noflag(d->skb, i + 1);
3032 if (!ts)
3033 goto nla_put_failure;
3034
3035 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
3036 PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
3037 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
3038
3039 PUT_TSTAT_U32(TARGET_US,
3040 ktime_to_us(ns_to_ktime(b->cparams.target)));
3041 PUT_TSTAT_U32(INTERVAL_US,
3042 ktime_to_us(ns_to_ktime(b->cparams.interval)));
3043
3044 PUT_TSTAT_U32(SENT_PACKETS, b->packets);
3045 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
3046 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
3047 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
3048
3049 PUT_TSTAT_U32(PEAK_DELAY_US,
3050 ktime_to_us(ns_to_ktime(b->peak_delay)));
3051 PUT_TSTAT_U32(AVG_DELAY_US,
3052 ktime_to_us(ns_to_ktime(b->avge_delay)));
3053 PUT_TSTAT_U32(BASE_DELAY_US,
3054 ktime_to_us(ns_to_ktime(b->base_delay)));
3055
3056 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
3057 PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
3058 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
3059
3060 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
3061 b->decaying_flow_count);
3062 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
3063 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
3064 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
3065
3066 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
3067 nla_nest_end(d->skb, ts);
3068 }
3069
3070 #undef PUT_TSTAT_U32
3071 #undef PUT_TSTAT_U64
3072
3073 nla_nest_end(d->skb, tstats);
3074 return nla_nest_end(d->skb, stats);
3075
3076 nla_put_failure:
3077 nla_nest_cancel(d->skb, stats);
3078 return -1;
3079 }
3080
cake_leaf(struct Qdisc * sch,unsigned long arg)3081 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
3082 {
3083 return NULL;
3084 }
3085
cake_find(struct Qdisc * sch,u32 classid)3086 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
3087 {
3088 return 0;
3089 }
3090
cake_bind(struct Qdisc * sch,unsigned long parent,u32 classid)3091 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
3092 u32 classid)
3093 {
3094 return 0;
3095 }
3096
cake_unbind(struct Qdisc * q,unsigned long cl)3097 static void cake_unbind(struct Qdisc *q, unsigned long cl)
3098 {
3099 }
3100
cake_tcf_block(struct Qdisc * sch,unsigned long cl,struct netlink_ext_ack * extack)3101 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
3102 struct netlink_ext_ack *extack)
3103 {
3104 struct cake_sched_data *q = qdisc_priv(sch);
3105
3106 if (cl)
3107 return NULL;
3108 return q->block;
3109 }
3110
cake_dump_class(struct Qdisc * sch,unsigned long cl,struct sk_buff * skb,struct tcmsg * tcm)3111 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
3112 struct sk_buff *skb, struct tcmsg *tcm)
3113 {
3114 tcm->tcm_handle |= TC_H_MIN(cl);
3115 return 0;
3116 }
3117
cake_dump_class_stats(struct Qdisc * sch,unsigned long cl,struct gnet_dump * d)3118 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
3119 struct gnet_dump *d)
3120 {
3121 struct cake_sched_data *q = qdisc_priv(sch);
3122 const struct cake_flow *flow = NULL;
3123 struct gnet_stats_queue qs = { 0 };
3124 struct nlattr *stats;
3125 u32 idx = cl - 1;
3126
3127 if (idx < CAKE_QUEUES * q->tin_cnt) {
3128 const struct cake_tin_data *b = \
3129 &q->tins[q->tin_order[idx / CAKE_QUEUES]];
3130 const struct sk_buff *skb;
3131
3132 flow = &b->flows[idx % CAKE_QUEUES];
3133
3134 if (flow->head) {
3135 sch_tree_lock(sch);
3136 skb = flow->head;
3137 while (skb) {
3138 qs.qlen++;
3139 skb = skb->next;
3140 }
3141 sch_tree_unlock(sch);
3142 }
3143 qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3144 qs.drops = flow->dropped;
3145 }
3146 if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3147 return -1;
3148 if (flow) {
3149 ktime_t now = ktime_get();
3150
3151 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3152 if (!stats)
3153 return -1;
3154
3155 #define PUT_STAT_U32(attr, data) do { \
3156 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3157 goto nla_put_failure; \
3158 } while (0)
3159 #define PUT_STAT_S32(attr, data) do { \
3160 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3161 goto nla_put_failure; \
3162 } while (0)
3163
3164 PUT_STAT_S32(DEFICIT, flow->deficit);
3165 PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3166 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3167 PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3168 if (flow->cvars.p_drop) {
3169 PUT_STAT_S32(BLUE_TIMER_US,
3170 ktime_to_us(
3171 ktime_sub(now,
3172 flow->cvars.blue_timer)));
3173 }
3174 if (flow->cvars.dropping) {
3175 PUT_STAT_S32(DROP_NEXT_US,
3176 ktime_to_us(
3177 ktime_sub(now,
3178 flow->cvars.drop_next)));
3179 }
3180
3181 if (nla_nest_end(d->skb, stats) < 0)
3182 return -1;
3183 }
3184
3185 return 0;
3186
3187 nla_put_failure:
3188 nla_nest_cancel(d->skb, stats);
3189 return -1;
3190 }
3191
cake_walk(struct Qdisc * sch,struct qdisc_walker * arg)3192 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3193 {
3194 struct cake_sched_data *q = qdisc_priv(sch);
3195 unsigned int i, j;
3196
3197 if (arg->stop)
3198 return;
3199
3200 for (i = 0; i < q->tin_cnt; i++) {
3201 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3202
3203 for (j = 0; j < CAKE_QUEUES; j++) {
3204 if (list_empty(&b->flows[j].flowchain)) {
3205 arg->count++;
3206 continue;
3207 }
3208 if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3209 arg))
3210 break;
3211 }
3212 }
3213 }
3214
3215 static const struct Qdisc_class_ops cake_class_ops = {
3216 .leaf = cake_leaf,
3217 .find = cake_find,
3218 .tcf_block = cake_tcf_block,
3219 .bind_tcf = cake_bind,
3220 .unbind_tcf = cake_unbind,
3221 .dump = cake_dump_class,
3222 .dump_stats = cake_dump_class_stats,
3223 .walk = cake_walk,
3224 };
3225
3226 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3227 .cl_ops = &cake_class_ops,
3228 .id = "cake",
3229 .priv_size = sizeof(struct cake_sched_data),
3230 .enqueue = cake_enqueue,
3231 .dequeue = cake_dequeue,
3232 .peek = qdisc_peek_dequeued,
3233 .init = cake_init,
3234 .reset = cake_reset,
3235 .destroy = cake_destroy,
3236 .change = cake_change,
3237 .dump = cake_dump,
3238 .dump_stats = cake_dump_stats,
3239 .owner = THIS_MODULE,
3240 };
3241 MODULE_ALIAS_NET_SCH("cake");
3242
3243 struct cake_mq_sched {
3244 struct mq_sched mq_priv; /* must be first */
3245 struct cake_sched_config cake_config;
3246 };
3247
cake_mq_destroy(struct Qdisc * sch)3248 static void cake_mq_destroy(struct Qdisc *sch)
3249 {
3250 mq_destroy_common(sch);
3251 }
3252
cake_mq_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)3253 static int cake_mq_init(struct Qdisc *sch, struct nlattr *opt,
3254 struct netlink_ext_ack *extack)
3255 {
3256 struct cake_mq_sched *priv = qdisc_priv(sch);
3257 struct net_device *dev = qdisc_dev(sch);
3258 int ret, ntx;
3259 bool _unused;
3260
3261 cake_config_init(&priv->cake_config, true);
3262 if (opt) {
3263 ret = cake_config_change(&priv->cake_config, opt, extack, &_unused);
3264 if (ret)
3265 return ret;
3266 }
3267
3268 ret = mq_init_common(sch, opt, extack, &cake_qdisc_ops);
3269 if (ret)
3270 return ret;
3271
3272 for (ntx = 0; ntx < dev->num_tx_queues; ntx++)
3273 cake_config_replace(priv->mq_priv.qdiscs[ntx], &priv->cake_config);
3274
3275 return 0;
3276 }
3277
cake_mq_dump(struct Qdisc * sch,struct sk_buff * skb)3278 static int cake_mq_dump(struct Qdisc *sch, struct sk_buff *skb)
3279 {
3280 struct cake_mq_sched *priv = qdisc_priv(sch);
3281
3282 mq_dump_common(sch, skb);
3283 return cake_config_dump(&priv->cake_config, skb);
3284 }
3285
cake_mq_change(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)3286 static int cake_mq_change(struct Qdisc *sch, struct nlattr *opt,
3287 struct netlink_ext_ack *extack)
3288 {
3289 struct cake_mq_sched *priv = qdisc_priv(sch);
3290 struct net_device *dev = qdisc_dev(sch);
3291 bool overhead_changed = false;
3292 unsigned int ntx;
3293 int ret;
3294
3295 ret = cake_config_change(&priv->cake_config, opt, extack, &overhead_changed);
3296 if (ret)
3297 return ret;
3298
3299 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
3300 struct Qdisc *chld = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
3301 struct cake_sched_data *qd = qdisc_priv(chld);
3302
3303 if (overhead_changed) {
3304 qd->max_netlen = 0;
3305 qd->max_adjlen = 0;
3306 qd->min_netlen = ~0;
3307 qd->min_adjlen = ~0;
3308 }
3309
3310 if (qd->tins) {
3311 sch_tree_lock(chld);
3312 cake_reconfigure(chld);
3313 sch_tree_unlock(chld);
3314 }
3315 }
3316
3317 return 0;
3318 }
3319
cake_mq_graft(struct Qdisc * sch,unsigned long cl,struct Qdisc * new,struct Qdisc ** old,struct netlink_ext_ack * extack)3320 static int cake_mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
3321 struct Qdisc **old, struct netlink_ext_ack *extack)
3322 {
3323 NL_SET_ERR_MSG(extack, "can't replace cake_mq sub-qdiscs");
3324 return -EOPNOTSUPP;
3325 }
3326
3327 static const struct Qdisc_class_ops cake_mq_class_ops = {
3328 .select_queue = mq_select_queue,
3329 .graft = cake_mq_graft,
3330 .leaf = mq_leaf,
3331 .find = mq_find,
3332 .walk = mq_walk,
3333 .dump = mq_dump_class,
3334 .dump_stats = mq_dump_class_stats,
3335 };
3336
3337 static struct Qdisc_ops cake_mq_qdisc_ops __read_mostly = {
3338 .cl_ops = &cake_mq_class_ops,
3339 .id = "cake_mq",
3340 .priv_size = sizeof(struct cake_mq_sched),
3341 .init = cake_mq_init,
3342 .destroy = cake_mq_destroy,
3343 .attach = mq_attach,
3344 .change = cake_mq_change,
3345 .change_real_num_tx = mq_change_real_num_tx,
3346 .dump = cake_mq_dump,
3347 .owner = THIS_MODULE,
3348 };
3349 MODULE_ALIAS_NET_SCH("cake_mq");
3350
cake_module_init(void)3351 static int __init cake_module_init(void)
3352 {
3353 int ret;
3354
3355 ret = register_qdisc(&cake_qdisc_ops);
3356 if (ret)
3357 return ret;
3358
3359 ret = register_qdisc(&cake_mq_qdisc_ops);
3360 if (ret)
3361 unregister_qdisc(&cake_qdisc_ops);
3362
3363 return ret;
3364 }
3365
cake_module_exit(void)3366 static void __exit cake_module_exit(void)
3367 {
3368 unregister_qdisc(&cake_qdisc_ops);
3369 unregister_qdisc(&cake_mq_qdisc_ops);
3370 }
3371
3372 module_init(cake_module_init)
3373 module_exit(cake_module_exit)
3374 MODULE_AUTHOR("Jonathan Morton");
3375 MODULE_LICENSE("Dual BSD/GPL");
3376 MODULE_DESCRIPTION("The CAKE shaper.");
3377 MODULE_IMPORT_NS("NET_SCHED_INTERNAL");
3378