xref: /linux/net/sched/sch_cake.c (revision 95f68e06b41b9e88291796efa3969409d13fdd4c)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating on.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
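
/*
 * Editor's note: the following is a minimal, standalone user-space sketch of
 * the deficit idea described above (one quantum of credit per visit, spent as
 * packets are sent), not code from this qdisc.  CAKE's per-tin and per-flow
 * DRR differs in detail; the queue contents and quantum values are invented.
 */
#include <stdio.h>

struct toy_queue {
	int pkts[4];	/* packet lengths in bytes, 0 terminates the queue */
	int head;	/* next packet to send */
	int quantum;	/* credit added per round, sets this queue's share */
	int deficit;	/* unspent credit carried within the round */
};

int main(void)
{
	struct toy_queue q[2] = {
		{ { 1500, 1500, 1500, 0 }, 0,  500, 0 },	/* bulk */
		{ {  100,  100,  100, 0 }, 0, 1500, 0 },	/* interactive */
	};
	int busy = 1;

	while (busy) {
		busy = 0;
		for (int i = 0; i < 2; i++) {
			int len;

			if (!q[i].pkts[q[i].head])
				continue;
			busy = 1;
			q[i].deficit += q[i].quantum;
			/* send while the accumulated credit covers the head packet */
			while ((len = q[i].pkts[q[i].head]) && q[i].deficit >= len) {
				q[i].deficit -= len;
				q[i].head++;
				printf("queue %d sent %d bytes\n", i, len);
			}
		}
	}
	return 0;
}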
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/pkt_cls.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73 
74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75 #include <net/netfilter/nf_conntrack_core.h>
76 #endif
77 
78 #define CAKE_SET_WAYS (8)
79 #define CAKE_MAX_TINS (8)
80 #define CAKE_QUEUES (1024)
81 #define CAKE_FLOW_MASK 63
82 #define CAKE_FLOW_NAT_FLAG 64
83 
84 /* struct cobalt_params - contains codel and blue parameters
85  * @interval:	codel initial drop interval
86  * @target:     maximum persistent sojourn time & blue update rate
87  * @mtu_time:   serialisation delay of maximum-size packet
88  * @p_inc:      increment of blue drop probability (0.32 fxp)
89  * @p_dec:      decrement of blue drop probability (0.32 fxp)
90  */
91 struct cobalt_params {
92 	u64	interval;
93 	u64	target;
94 	u64	mtu_time;
95 	u32	p_inc;
96 	u32	p_dec;
97 };
98 
99 /* struct cobalt_vars - contains codel and blue variables
100  * @count:		codel dropping frequency
101  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
102  * @drop_next:		time to drop next packet, or when we dropped last
103  * @blue_timer:		Blue time to next drop
104  * @p_drop:		BLUE drop probability (0.32 fxp)
105  * @dropping:		set if in dropping state
106  * @ecn_marked:		set if marked
107  */
108 struct cobalt_vars {
109 	u32	count;
110 	u32	rec_inv_sqrt;
111 	ktime_t	drop_next;
112 	ktime_t	blue_timer;
113 	u32     p_drop;
114 	bool	dropping;
115 	bool    ecn_marked;
116 };
117 
118 enum {
119 	CAKE_SET_NONE = 0,
120 	CAKE_SET_SPARSE,
121 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122 	CAKE_SET_BULK,
123 	CAKE_SET_DECAYING
124 };
125 
126 struct cake_flow {
127 	/* this stuff is all needed per-flow at dequeue time */
128 	struct sk_buff	  *head;
129 	struct sk_buff	  *tail;
130 	struct list_head  flowchain;
131 	s32		  deficit;
132 	u32		  dropped;
133 	struct cobalt_vars cvars;
134 	u16		  srchost; /* index into cake_host table */
135 	u16		  dsthost;
136 	u8		  set;
137 }; /* please try to keep this structure <= 64 bytes */
138 
139 struct cake_host {
140 	u32 srchost_tag;
141 	u32 dsthost_tag;
142 	u16 srchost_bulk_flow_count;
143 	u16 dsthost_bulk_flow_count;
144 };
145 
146 struct cake_heap_entry {
147 	u16 t:3, b:10;
148 };
149 
150 struct cake_tin_data {
151 	struct cake_flow flows[CAKE_QUEUES];
152 	u32	backlogs[CAKE_QUEUES];
153 	u32	tags[CAKE_QUEUES]; /* for set association */
154 	u16	overflow_idx[CAKE_QUEUES];
155 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156 	u16	flow_quantum;
157 
158 	struct cobalt_params cparams;
159 	u32	drop_overlimit;
160 	u16	bulk_flow_count;
161 	u16	sparse_flow_count;
162 	u16	decaying_flow_count;
163 	u16	unresponsive_flow_count;
164 
165 	u32	max_skblen;
166 
167 	struct list_head new_flows;
168 	struct list_head old_flows;
169 	struct list_head decaying_flows;
170 
171 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172 	ktime_t	time_next_packet;
173 	u64	tin_rate_ns;
174 	u64	tin_rate_bps;
175 	u16	tin_rate_shft;
176 
177 	u16	tin_quantum;
178 	s32	tin_deficit;
179 	u32	tin_backlog;
180 	u32	tin_dropped;
181 	u32	tin_ecn_mark;
182 
183 	u32	packets;
184 	u64	bytes;
185 
186 	u32	ack_drops;
187 
188 	/* moving averages */
189 	u64 avge_delay;
190 	u64 peak_delay;
191 	u64 base_delay;
192 
193 	/* hash function stats */
194 	u32	way_directs;
195 	u32	way_hits;
196 	u32	way_misses;
197 	u32	way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199 
200 struct cake_sched_data {
201 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
202 	struct tcf_block *block;
203 	struct cake_tin_data *tins;
204 
205 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206 	u16		overflow_timeout;
207 
208 	u16		tin_cnt;
209 	u8		tin_mode;
210 	u8		flow_mode;
211 	u8		ack_filter;
212 	u8		atm_mode;
213 
214 	u32		fwmark_mask;
215 	u16		fwmark_shft;
216 
217 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
218 	u16		rate_shft;
219 	ktime_t		time_next_packet;
220 	ktime_t		failsafe_next_packet;
221 	u64		rate_ns;
222 	u64		rate_bps;
223 	u16		rate_flags;
224 	s16		rate_overhead;
225 	u16		rate_mpu;
226 	u64		interval;
227 	u64		target;
228 
229 	/* resource tracking */
230 	u32		buffer_used;
231 	u32		buffer_max_used;
232 	u32		buffer_limit;
233 	u32		buffer_config_limit;
234 
235 	/* indices for dequeue */
236 	u16		cur_tin;
237 	u16		cur_flow;
238 
239 	struct qdisc_watchdog watchdog;
240 	const u8	*tin_index;
241 	const u8	*tin_order;
242 
243 	/* bandwidth capacity estimate */
244 	ktime_t		last_packet_time;
245 	ktime_t		avg_window_begin;
246 	u64		avg_packet_interval;
247 	u64		avg_window_bytes;
248 	u64		avg_peak_bandwidth;
249 	ktime_t		last_reconfig_time;
250 
251 	/* packet length stats */
252 	u32		avg_netoff;
253 	u16		max_netlen;
254 	u16		max_adjlen;
255 	u16		min_netlen;
256 	u16		min_adjlen;
257 };
258 
259 enum {
260 	CAKE_FLAG_OVERHEAD	   = BIT(0),
261 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
262 	CAKE_FLAG_INGRESS	   = BIT(2),
263 	CAKE_FLAG_WASH		   = BIT(3),
264 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
265 };
266 
267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
268  * obtain the best features of each.  Codel is excellent on flows which
269  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
270  * unresponsive flows.
271  */
272 
273 struct cobalt_skb_cb {
274 	ktime_t enqueue_time;
275 	u32     adjusted_len;
276 };
277 
278 static u64 us_to_ns(u64 us)
279 {
280 	return us * NSEC_PER_USEC;
281 }
282 
283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
284 {
285 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
287 }
288 
289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
290 {
291 	return get_cobalt_cb(skb)->enqueue_time;
292 }
293 
294 static void cobalt_set_enqueue_time(struct sk_buff *skb,
295 				    ktime_t now)
296 {
297 	get_cobalt_cb(skb)->enqueue_time = now;
298 }
299 
300 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
301 
302 /* Diffserv lookup tables */
303 
304 static const u8 precedence[] = {
305 	0, 0, 0, 0, 0, 0, 0, 0,
306 	1, 1, 1, 1, 1, 1, 1, 1,
307 	2, 2, 2, 2, 2, 2, 2, 2,
308 	3, 3, 3, 3, 3, 3, 3, 3,
309 	4, 4, 4, 4, 4, 4, 4, 4,
310 	5, 5, 5, 5, 5, 5, 5, 5,
311 	6, 6, 6, 6, 6, 6, 6, 6,
312 	7, 7, 7, 7, 7, 7, 7, 7,
313 };
314 
315 static const u8 diffserv8[] = {
316 	2, 0, 1, 2, 4, 2, 2, 2,
317 	1, 2, 1, 2, 1, 2, 1, 2,
318 	5, 2, 4, 2, 4, 2, 4, 2,
319 	3, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 3, 2, 3, 2, 3, 2,
321 	6, 2, 2, 2, 6, 2, 6, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 	7, 2, 2, 2, 2, 2, 2, 2,
324 };
325 
326 static const u8 diffserv4[] = {
327 	0, 1, 0, 0, 2, 0, 0, 0,
328 	1, 0, 0, 0, 0, 0, 0, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	2, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 2, 0, 2, 0, 2, 0,
332 	3, 0, 0, 0, 3, 0, 3, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 	3, 0, 0, 0, 0, 0, 0, 0,
335 };
336 
337 static const u8 diffserv3[] = {
338 	0, 1, 0, 0, 2, 0, 0, 0,
339 	1, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 0, 0, 0, 0,
343 	0, 0, 0, 0, 2, 0, 2, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 	2, 0, 0, 0, 0, 0, 0, 0,
346 };
347 
348 static const u8 besteffort[] = {
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 	0, 0, 0, 0, 0, 0, 0, 0,
357 };
358 
359 /* tin priority order for stats dumping */
360 
361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
362 static const u8 bulk_order[] = {1, 0, 2, 3};
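
/*
 * Editor's note: illustrative user-space sketch (not part of sch_cake.c)
 * of how the lookup tables above are used: the six-bit DSCP value (the ToS
 * byte shifted right by two, as done later in cake_handle_diffserv()) indexes
 * the table and yields a tin number.  The entries below are copied from
 * diffserv4[] above; unlisted codepoints default to tin 0 just as most
 * entries in that table do.
 */
#include <stdio.h>

int main(void)
{
	static const unsigned char tbl[64] = {
		[0]  = 0,	/* CS0  -> tin 0, per diffserv4[0]  */
		[8]  = 1,	/* CS1  -> tin 1, per diffserv4[8]  */
		[46] = 3,	/* EF   -> tin 3, per diffserv4[46] */
	};
	unsigned char tos = 0xb8;		/* ToS byte carrying DSCP 46 (EF) */
	unsigned char dscp = tos >> 2;

	printf("DSCP %d -> tin %d\n", dscp, tbl[dscp]);
	return 0;
}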
363 
364 /* There is a big difference in timing between the accurate values placed in the
365  * cache and the approximations given by a single Newton step for small count
366  * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
367  * these values are calculated using eight Newton steps, using the
368  * implementation below. Above 16, a single Newton step gives sufficient
369  * accuracy in either direction, given the precision stored.
370  *
371  * The magnitude of the error when stepping up to count 2 is such as to give the
372  * value that *should* have been produced at count 4.
373  */
374 
375 #define REC_INV_SQRT_CACHE (16)
376 static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
377 		~0,         ~0, 3037000500, 2479700525,
378 	2147483647, 1920767767, 1753413056, 1623345051,
379 	1518500250, 1431655765, 1358187914, 1294981364,
380 	1239850263, 1191209601, 1147878294, 1108955788
381 };
382 
383 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
384  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
385  *
386  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
387  */
388 
389 static void cobalt_newton_step(struct cobalt_vars *vars)
390 {
391 	u32 invsqrt, invsqrt2;
392 	u64 val;
393 
394 	invsqrt = vars->rec_inv_sqrt;
395 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
396 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
397 
398 	val >>= 2; /* avoid overflow in following multiply */
399 	val = (val * invsqrt) >> (32 - 2 + 1);
400 
401 	vars->rec_inv_sqrt = val;
402 }
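
/*
 * Editor's note: standalone user-space sketch (not kernel code) of the Q0.32
 * Newton-Raphson step above, checked against a floating-point reference.
 * Build with e.g. "cc -O2 demo.c -lm"; the seed and count are invented.
 */
#include <stdio.h>
#include <stdint.h>
#include <math.h>

static uint32_t newton_step(uint32_t count, uint32_t invsqrt)
{
	uint32_t invsqrt2 = ((uint64_t)invsqrt * invsqrt) >> 32;	/* ~1/count in Q0.32 */
	uint64_t val = (3ULL << 32) - ((uint64_t)count * invsqrt2);

	val >>= 2;				/* avoid overflow, as in cobalt_newton_step() */
	return (val * invsqrt) >> (32 - 2 + 1);	/* invsqrt * (3 - count*invsqrt^2) / 2 */
}

int main(void)
{
	uint32_t count = 20;
	uint32_t approx = 1108955788;	/* inv_sqrt_cache[15] above, used as a seed */
	double exact = 4294967296.0 / sqrt(count);

	for (int i = 0; i < 4; i++)
		approx = newton_step(count, approx);

	printf("count=%u  newton=%u  exact=%.0f\n", count, approx, exact);
	return 0;
}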
403 
404 static void cobalt_invsqrt(struct cobalt_vars *vars)
405 {
406 	if (vars->count < REC_INV_SQRT_CACHE)
407 		vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
408 	else
409 		cobalt_newton_step(vars);
410 }
411 
412 static void cobalt_vars_init(struct cobalt_vars *vars)
413 {
414 	memset(vars, 0, sizeof(*vars));
415 }
416 
417 /* CoDel control_law is t + interval/sqrt(count)
418  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
419  * both sqrt() and divide operation.
420  */
421 static ktime_t cobalt_control(ktime_t t,
422 			      u64 interval,
423 			      u32 rec_inv_sqrt)
424 {
425 	return ktime_add_ns(t, reciprocal_scale(interval,
426 						rec_inv_sqrt));
427 }
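
/*
 * Editor's note: user-space sketch (not kernel code) of the fixed-point
 * control law above.  The kernel's reciprocal_scale(x, r) computes
 * ((u64)x * r) >> 32, so with rec_inv_sqrt = 2^32/sqrt(count) the next
 * signalling time advances by interval/sqrt(count).  The values below are
 * illustrative: interval 100 ms (cake's default) and count 4.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t reciprocal_scale_demo(uint32_t val, uint32_t ep_ro)
{
	return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
	uint32_t interval_ns = 100 * 1000 * 1000;	/* 100 ms */
	uint32_t rec_inv_sqrt = 2147483647;		/* inv_sqrt_cache[4] above */

	/* count == 4  ->  sqrt(count) == 2  ->  expect ~50,000,000 ns */
	printf("delay to next signal: %u ns\n",
	       reciprocal_scale_demo(interval_ns, rec_inv_sqrt));
	return 0;
}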
428 
429 /* Call this when a packet had to be dropped due to queue overflow.  Returns
430  * true if the BLUE state was quiescent before but active after this call.
431  */
432 static bool cobalt_queue_full(struct cobalt_vars *vars,
433 			      struct cobalt_params *p,
434 			      ktime_t now)
435 {
436 	bool up = false;
437 
438 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
439 		up = !vars->p_drop;
440 		vars->p_drop += p->p_inc;
441 		if (vars->p_drop < p->p_inc)
442 			vars->p_drop = ~0;
443 		vars->blue_timer = now;
444 	}
445 	vars->dropping = true;
446 	vars->drop_next = now;
447 	if (!vars->count)
448 		vars->count = 1;
449 
450 	return up;
451 }
452 
453 /* Call this when the queue was serviced but turned out to be empty.  Returns
454  * true if the BLUE state was active before but quiescent after this call.
455  */
456 static bool cobalt_queue_empty(struct cobalt_vars *vars,
457 			       struct cobalt_params *p,
458 			       ktime_t now)
459 {
460 	bool down = false;
461 
462 	if (vars->p_drop &&
463 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
464 		if (vars->p_drop < p->p_dec)
465 			vars->p_drop = 0;
466 		else
467 			vars->p_drop -= p->p_dec;
468 		vars->blue_timer = now;
469 		down = !vars->p_drop;
470 	}
471 	vars->dropping = false;
472 
473 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
474 		vars->count--;
475 		cobalt_invsqrt(vars);
476 		vars->drop_next = cobalt_control(vars->drop_next,
477 						 p->interval,
478 						 vars->rec_inv_sqrt);
479 	}
480 
481 	return down;
482 }
483 
484 /* Call this with a freshly dequeued packet for possible congestion marking.
485  * Returns a drop reason to drop the packet, or SKB_NOT_DROPPED_YET for delivery.
486  */
487 static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
488 					       struct cobalt_params *p,
489 					       ktime_t now,
490 					       struct sk_buff *skb,
491 					       u32 bulk_flows)
492 {
493 	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
494 	bool next_due, over_target;
495 	ktime_t schedule;
496 	u64 sojourn;
497 
498 /* The 'schedule' variable records, in its sign, whether 'now' is before or
499  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
500  * scheduling decision is actually branched on, without destroying that
501  * information.  Similarly, the first 'schedule' value calculated is preserved
502  * in the boolean 'next_due'.
503  *
504  * As for 'drop_next', we take advantage of the fact that 'interval' is both
505  * the delay between first exceeding 'target' and the first signalling event,
506  * *and* the scaling factor for the signalling frequency.  It's therefore very
507  * natural to use a single mechanism for both purposes, and eliminates a
508  * significant amount of reference Codel's spaghetti code.  To help with this,
509  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
510  * as possible to 1.0 in fixed-point.
511  */
512 
513 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
514 	schedule = ktime_sub(now, vars->drop_next);
515 	over_target = sojourn > p->target &&
516 		      sojourn > p->mtu_time * bulk_flows * 2 &&
517 		      sojourn > p->mtu_time * 4;
518 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
519 
520 	vars->ecn_marked = false;
521 
522 	if (over_target) {
523 		if (!vars->dropping) {
524 			vars->dropping = true;
525 			vars->drop_next = cobalt_control(now,
526 							 p->interval,
527 							 vars->rec_inv_sqrt);
528 		}
529 		if (!vars->count)
530 			vars->count = 1;
531 	} else if (vars->dropping) {
532 		vars->dropping = false;
533 	}
534 
535 	if (next_due && vars->dropping) {
536 		/* Use ECN mark if possible, otherwise drop */
537 		if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
538 			reason = SKB_DROP_REASON_QDISC_CONGESTED;
539 
540 		vars->count++;
541 		if (!vars->count)
542 			vars->count--;
543 		cobalt_invsqrt(vars);
544 		vars->drop_next = cobalt_control(vars->drop_next,
545 						 p->interval,
546 						 vars->rec_inv_sqrt);
547 		schedule = ktime_sub(now, vars->drop_next);
548 	} else {
549 		while (next_due) {
550 			vars->count--;
551 			cobalt_invsqrt(vars);
552 			vars->drop_next = cobalt_control(vars->drop_next,
553 							 p->interval,
554 							 vars->rec_inv_sqrt);
555 			schedule = ktime_sub(now, vars->drop_next);
556 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
557 		}
558 	}
559 
560 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
561 	if (vars->p_drop && reason == SKB_NOT_DROPPED_YET &&
562 	    get_random_u32() < vars->p_drop)
563 		reason = SKB_DROP_REASON_CAKE_FLOOD;
564 
565 	/* Overload the drop_next field as an activity timeout */
566 	if (!vars->count)
567 		vars->drop_next = ktime_add_ns(now, p->interval);
568 	else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET)
569 		vars->drop_next = now;
570 
571 	return reason;
572 }
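
/*
 * Editor's note: user-space sketch (not kernel code) of the over_target test
 * above.  The sojourn time must exceed not only the target but also a
 * multiple of the MTU serialisation time scaled by the bulk flow count, so a
 * slow link shared by many flows is not over-signalled.  All figures
 * (10 Mbit/s link, 1514-byte MTU, 5 ms target, 8 bulk flows) are invented.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t rate_bps = 10 * 1000 * 1000;
	uint64_t mtu_time_ns = 1514ULL * 8 * 1000000000 / rate_bps;	/* ~1.21 ms */
	uint64_t target_ns = 5 * 1000 * 1000;
	uint32_t bulk_flows = 8;
	uint64_t sojourn_ns = 15 * 1000 * 1000;				/* 15 ms queued */

	int over_target = sojourn_ns > target_ns &&
			  sojourn_ns > mtu_time_ns * bulk_flows * 2 &&
			  sojourn_ns > mtu_time_ns * 4;

	/* over_target is 0 here: 15 ms exceeds the target but not 16 MTU-times */
	printf("mtu_time=%llu ns, over_target=%d\n",
	       (unsigned long long)mtu_time_ns, over_target);
	return 0;
}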
573 
574 static bool cake_update_flowkeys(struct flow_keys *keys,
575 				 const struct sk_buff *skb)
576 {
577 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
578 	struct nf_conntrack_tuple tuple = {};
579 	bool rev = !skb->_nfct, upd = false;
580 	__be32 ip;
581 
582 	if (skb_protocol(skb, true) != htons(ETH_P_IP))
583 		return false;
584 
585 	if (!nf_ct_get_tuple_skb(&tuple, skb))
586 		return false;
587 
588 	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
589 	if (ip != keys->addrs.v4addrs.src) {
590 		keys->addrs.v4addrs.src = ip;
591 		upd = true;
592 	}
593 	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
594 	if (ip != keys->addrs.v4addrs.dst) {
595 		keys->addrs.v4addrs.dst = ip;
596 		upd = true;
597 	}
598 
599 	if (keys->ports.ports) {
600 		__be16 port;
601 
602 		port = rev ? tuple.dst.u.all : tuple.src.u.all;
603 		if (port != keys->ports.src) {
604 			keys->ports.src = port;
605 			upd = true;
606 		}
607 		port = rev ? tuple.src.u.all : tuple.dst.u.all;
608 		if (port != keys->ports.dst) {
609 			keys->ports.dst = port;
610 			upd = true;
611 		}
612 	}
613 	return upd;
614 #else
615 	return false;
616 #endif
617 }
618 
619 /* Cake's flow_mode has several multi-bit settings; the dual-src/dual-dst
620  * checks below also match when triple-isolate mode is selected.
621  */
622 
623 static bool cake_dsrc(int flow_mode)
624 {
625 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
626 }
627 
628 static bool cake_ddst(int flow_mode)
629 {
630 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
631 }
632 
633 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
634 		     int flow_mode, u16 flow_override, u16 host_override)
635 {
636 	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
637 	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
638 	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
639 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
640 	u16 reduced_hash, srchost_idx, dsthost_idx;
641 	struct flow_keys keys, host_keys;
642 	bool use_skbhash = skb->l4_hash;
643 
644 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
645 		return 0;
646 
647 	/* If both overrides are set, or we can use the SKB hash and nat mode is
648 	 * disabled, we can skip packet dissection entirely. If nat mode is
649 	 * enabled there's another check below after doing the conntrack lookup.
650 	 */
651 	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
652 		goto skip_hash;
653 
654 	skb_flow_dissect_flow_keys(skb, &keys,
655 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
656 
657 	/* Don't use the SKB hash if we change the lookup keys from conntrack */
658 	if (nat_enabled && cake_update_flowkeys(&keys, skb))
659 		use_skbhash = false;
660 
661 	/* If we can still use the SKB hash and don't need the host hash, we can
662 	 * skip the rest of the hashing procedure
663 	 */
664 	if (use_skbhash && !hash_hosts)
665 		goto skip_hash;
666 
667 	/* flow_hash_from_keys() sorts the addresses by value, so we have
668 	 * to preserve their order in a separate data structure to treat
669 	 * src and dst host addresses as independently selectable.
670 	 */
671 	host_keys = keys;
672 	host_keys.ports.ports     = 0;
673 	host_keys.basic.ip_proto  = 0;
674 	host_keys.keyid.keyid     = 0;
675 	host_keys.tags.flow_label = 0;
676 
677 	switch (host_keys.control.addr_type) {
678 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
679 		host_keys.addrs.v4addrs.src = 0;
680 		dsthost_hash = flow_hash_from_keys(&host_keys);
681 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
682 		host_keys.addrs.v4addrs.dst = 0;
683 		srchost_hash = flow_hash_from_keys(&host_keys);
684 		break;
685 
686 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
687 		memset(&host_keys.addrs.v6addrs.src, 0,
688 		       sizeof(host_keys.addrs.v6addrs.src));
689 		dsthost_hash = flow_hash_from_keys(&host_keys);
690 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
691 		memset(&host_keys.addrs.v6addrs.dst, 0,
692 		       sizeof(host_keys.addrs.v6addrs.dst));
693 		srchost_hash = flow_hash_from_keys(&host_keys);
694 		break;
695 
696 	default:
697 		dsthost_hash = 0;
698 		srchost_hash = 0;
699 	}
700 
701 	/* This *must* be after the above switch, since as a
702 	 * side-effect it sorts the src and dst addresses.
703 	 */
704 	if (hash_flows && !use_skbhash)
705 		flow_hash = flow_hash_from_keys(&keys);
706 
707 skip_hash:
708 	if (flow_override)
709 		flow_hash = flow_override - 1;
710 	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
711 		flow_hash = skb->hash;
712 	if (host_override) {
713 		dsthost_hash = host_override - 1;
714 		srchost_hash = host_override - 1;
715 	}
716 
717 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
718 		if (flow_mode & CAKE_FLOW_SRC_IP)
719 			flow_hash ^= srchost_hash;
720 
721 		if (flow_mode & CAKE_FLOW_DST_IP)
722 			flow_hash ^= dsthost_hash;
723 	}
724 
725 	reduced_hash = flow_hash % CAKE_QUEUES;
726 
727 	/* set-associative hashing */
728 	/* fast path if no hash collision (direct lookup succeeds) */
729 	if (likely(q->tags[reduced_hash] == flow_hash &&
730 		   q->flows[reduced_hash].set)) {
731 		q->way_directs++;
732 	} else {
733 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
734 		u32 outer_hash = reduced_hash - inner_hash;
735 		bool allocate_src = false;
736 		bool allocate_dst = false;
737 		u32 i, k;
738 
739 		/* check if any active queue in the set is reserved for
740 		 * this flow.
741 		 */
742 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
743 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
744 			if (q->tags[outer_hash + k] == flow_hash) {
745 				if (i)
746 					q->way_hits++;
747 
748 				if (!q->flows[outer_hash + k].set) {
749 					/* need to increment host refcnts */
750 					allocate_src = cake_dsrc(flow_mode);
751 					allocate_dst = cake_ddst(flow_mode);
752 				}
753 
754 				goto found;
755 			}
756 		}
757 
758 		/* no queue is reserved for this flow, look for an
759 		 * empty one.
760 		 */
761 		for (i = 0; i < CAKE_SET_WAYS;
762 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
763 			if (!q->flows[outer_hash + k].set) {
764 				q->way_misses++;
765 				allocate_src = cake_dsrc(flow_mode);
766 				allocate_dst = cake_ddst(flow_mode);
767 				goto found;
768 			}
769 		}
770 
771 		/* With no empty queues, default to the original
772 		 * queue, accept the collision, update the host tags.
773 		 */
774 		q->way_collisions++;
775 		allocate_src = cake_dsrc(flow_mode);
776 		allocate_dst = cake_ddst(flow_mode);
777 
778 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
779 			if (allocate_src)
780 				q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
781 			if (allocate_dst)
782 				q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
783 		}
784 found:
785 		/* reserve queue for future packets in same flow */
786 		reduced_hash = outer_hash + k;
787 		q->tags[reduced_hash] = flow_hash;
788 
789 		if (allocate_src) {
790 			srchost_idx = srchost_hash % CAKE_QUEUES;
791 			inner_hash = srchost_idx % CAKE_SET_WAYS;
792 			outer_hash = srchost_idx - inner_hash;
793 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
794 				i++, k = (k + 1) % CAKE_SET_WAYS) {
795 				if (q->hosts[outer_hash + k].srchost_tag ==
796 				    srchost_hash)
797 					goto found_src;
798 			}
799 			for (i = 0; i < CAKE_SET_WAYS;
800 				i++, k = (k + 1) % CAKE_SET_WAYS) {
801 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
802 					break;
803 			}
804 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
805 found_src:
806 			srchost_idx = outer_hash + k;
807 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
808 				q->hosts[srchost_idx].srchost_bulk_flow_count++;
809 			q->flows[reduced_hash].srchost = srchost_idx;
810 		}
811 
812 		if (allocate_dst) {
813 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
814 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
815 			outer_hash = dsthost_idx - inner_hash;
816 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
817 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
818 				if (q->hosts[outer_hash + k].dsthost_tag ==
819 				    dsthost_hash)
820 					goto found_dst;
821 			}
822 			for (i = 0; i < CAKE_SET_WAYS;
823 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
824 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
825 					break;
826 			}
827 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
828 found_dst:
829 			dsthost_idx = outer_hash + k;
830 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
831 				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
832 			q->flows[reduced_hash].dsthost = dsthost_idx;
833 		}
834 	}
835 
836 	return reduced_hash;
837 }
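
/*
 * Editor's note: simplified user-space sketch (not kernel code) of the
 * set-associative idea used above: the reduced hash selects a set of
 * CAKE_SET_WAYS consecutive queues, and the full hash is stored as a tag so
 * colliding flows can be steered to a free way within the same set.  The
 * constants mirror the ones above, but the search logic is abbreviated
 * (no host accounting, no eviction of bulk-flow state).
 */
#include <stdio.h>
#include <stdint.h>

#define WAYS   8
#define QUEUES 1024

static uint32_t tags[QUEUES];
static uint8_t  busy[QUEUES];

static uint32_t lookup(uint32_t flow_hash)
{
	uint32_t reduced = flow_hash % QUEUES;
	uint32_t inner = reduced % WAYS;	/* position inside the set */
	uint32_t outer = reduced - inner;	/* first queue of the set */
	uint32_t i, k;

	/* first pass: is some way in this set already tagged for us? */
	for (i = 0, k = inner; i < WAYS; i++, k = (k + 1) % WAYS)
		if (busy[outer + k] && tags[outer + k] == flow_hash)
			return outer + k;

	/* second pass: claim the first free way, else accept the collision */
	for (i = 0, k = inner; i < WAYS; i++, k = (k + 1) % WAYS)
		if (!busy[outer + k])
			break;

	k = (i == WAYS) ? inner : k;
	busy[outer + k] = 1;
	tags[outer + k] = flow_hash;
	return outer + k;
}

int main(void)
{
	/* two different hashes that collide on the same reduced index */
	printf("flow A -> queue %u\n", lookup(0x12345));
	printf("flow B -> queue %u\n", lookup(0x12345 + QUEUES));
	printf("flow A again -> queue %u\n", lookup(0x12345));
	return 0;
}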
838 
839 /* helper functions : might be changed when/if skb use a standard list_head */
840 /* remove one skb from head of slot queue */
841 
842 static struct sk_buff *dequeue_head(struct cake_flow *flow)
843 {
844 	struct sk_buff *skb = flow->head;
845 
846 	if (skb) {
847 		flow->head = skb->next;
848 		skb_mark_not_on_list(skb);
849 	}
850 
851 	return skb;
852 }
853 
854 /* add skb to flow queue (tail add) */
855 
856 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
857 {
858 	if (!flow->head)
859 		flow->head = skb;
860 	else
861 		flow->tail->next = skb;
862 	flow->tail = skb;
863 	skb->next = NULL;
864 }
865 
866 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
867 				    struct ipv6hdr *buf)
868 {
869 	unsigned int offset = skb_network_offset(skb);
870 	struct iphdr *iph;
871 
872 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
873 
874 	if (!iph)
875 		return NULL;
876 
877 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
878 		return skb_header_pointer(skb, offset + iph->ihl * 4,
879 					  sizeof(struct ipv6hdr), buf);
880 
881 	else if (iph->version == 4)
882 		return iph;
883 
884 	else if (iph->version == 6)
885 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
886 					  buf);
887 
888 	return NULL;
889 }
890 
891 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
892 				      void *buf, unsigned int bufsize)
893 {
894 	unsigned int offset = skb_network_offset(skb);
895 	const struct ipv6hdr *ipv6h;
896 	const struct tcphdr *tcph;
897 	const struct iphdr *iph;
898 	struct ipv6hdr _ipv6h;
899 	struct tcphdr _tcph;
900 
901 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
902 
903 	if (!ipv6h)
904 		return NULL;
905 
906 	if (ipv6h->version == 4) {
907 		iph = (struct iphdr *)ipv6h;
908 		offset += iph->ihl * 4;
909 
910 		/* special-case 6in4 tunnelling, as that is a common way to get
911 		 * v6 connectivity in the home
912 		 */
913 		if (iph->protocol == IPPROTO_IPV6) {
914 			ipv6h = skb_header_pointer(skb, offset,
915 						   sizeof(_ipv6h), &_ipv6h);
916 
917 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
918 				return NULL;
919 
920 			offset += sizeof(struct ipv6hdr);
921 
922 		} else if (iph->protocol != IPPROTO_TCP) {
923 			return NULL;
924 		}
925 
926 	} else if (ipv6h->version == 6) {
927 		if (ipv6h->nexthdr != IPPROTO_TCP)
928 			return NULL;
929 
930 		offset += sizeof(struct ipv6hdr);
931 	} else {
932 		return NULL;
933 	}
934 
935 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
936 	if (!tcph || tcph->doff < 5)
937 		return NULL;
938 
939 	return skb_header_pointer(skb, offset,
940 				  min(__tcp_hdrlen(tcph), bufsize), buf);
941 }
942 
943 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
944 				   int code, int *oplen)
945 {
946 	/* inspired by tcp_parse_options in tcp_input.c */
947 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
948 	const u8 *ptr = (const u8 *)(tcph + 1);
949 
950 	while (length > 0) {
951 		int opcode = *ptr++;
952 		int opsize;
953 
954 		if (opcode == TCPOPT_EOL)
955 			break;
956 		if (opcode == TCPOPT_NOP) {
957 			length--;
958 			continue;
959 		}
960 		if (length < 2)
961 			break;
962 		opsize = *ptr++;
963 		if (opsize < 2 || opsize > length)
964 			break;
965 
966 		if (opcode == code) {
967 			*oplen = opsize;
968 			return ptr;
969 		}
970 
971 		ptr += opsize - 2;
972 		length -= opsize;
973 	}
974 
975 	return NULL;
976 }
977 
978 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
979  * bytes than the other. In the case where both sequences ACK bytes that the
980  * other doesn't, A is considered greater. DSACKs in A also make A be
981  * considered greater.
982  *
983  * @return -1, 0 or 1 as normal compare functions
984  */
985 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
986 				  const struct tcphdr *tcph_b)
987 {
988 	const struct tcp_sack_block_wire *sack_a, *sack_b;
989 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
990 	u32 bytes_a = 0, bytes_b = 0;
991 	int oplen_a, oplen_b;
992 	bool first = true;
993 
994 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
995 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
996 
997 	/* pointers point to option contents */
998 	oplen_a -= TCPOLEN_SACK_BASE;
999 	oplen_b -= TCPOLEN_SACK_BASE;
1000 
1001 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1002 	    (!sack_b || oplen_b < sizeof(*sack_b)))
1003 		return -1;
1004 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1005 		 (!sack_a || oplen_a < sizeof(*sack_a)))
1006 		return 1;
1007 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1008 		 (!sack_b || oplen_b < sizeof(*sack_b)))
1009 		return 0;
1010 
1011 	while (oplen_a >= sizeof(*sack_a)) {
1012 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1013 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1014 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1015 		int oplen_tmp = oplen_b;
1016 		bool found = false;
1017 
1018 		/* DSACK; always considered greater to prevent dropping */
1019 		if (before(start_a, ack_seq_a))
1020 			return -1;
1021 
1022 		bytes_a += end_a - start_a;
1023 
1024 		while (oplen_tmp >= sizeof(*sack_tmp)) {
1025 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1026 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1027 
1028 			/* first time through we count the total size */
1029 			if (first)
1030 				bytes_b += end_b - start_b;
1031 
1032 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1033 				found = true;
1034 				if (!first)
1035 					break;
1036 			}
1037 			oplen_tmp -= sizeof(*sack_tmp);
1038 			sack_tmp++;
1039 		}
1040 
1041 		if (!found)
1042 			return -1;
1043 
1044 		oplen_a -= sizeof(*sack_a);
1045 		sack_a++;
1046 		first = false;
1047 	}
1048 
1049 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1050 	 * either the SACKs are equal, or B SACKs more bytes.
1051 	 */
1052 	return bytes_b > bytes_a ? 1 : 0;
1053 }
1054 
1055 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1056 				 u32 *tsval, u32 *tsecr)
1057 {
1058 	const u8 *ptr;
1059 	int opsize;
1060 
1061 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1062 
1063 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1064 		*tsval = get_unaligned_be32(ptr);
1065 		*tsecr = get_unaligned_be32(ptr + 4);
1066 	}
1067 }
1068 
1069 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1070 			       u32 tstamp_new, u32 tsecr_new)
1071 {
1072 	/* inspired by tcp_parse_options in tcp_input.c */
1073 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1074 	const u8 *ptr = (const u8 *)(tcph + 1);
1075 	u32 tstamp, tsecr;
1076 
1077 	/* 3 reserved flags must be unset to avoid future breakage
1078 	 * ACK must be set
1079 	 * ECE/CWR are handled separately
1080 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1081 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1082 	 * 0x00C00000 = CWR/ECE (handled separately)
1083 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1084 	 */
1085 	if (((tcp_flag_word(tcph) &
1086 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1087 		return false;
1088 
1089 	while (length > 0) {
1090 		int opcode = *ptr++;
1091 		int opsize;
1092 
1093 		if (opcode == TCPOPT_EOL)
1094 			break;
1095 		if (opcode == TCPOPT_NOP) {
1096 			length--;
1097 			continue;
1098 		}
1099 		if (length < 2)
1100 			break;
1101 		opsize = *ptr++;
1102 		if (opsize < 2 || opsize > length)
1103 			break;
1104 
1105 		switch (opcode) {
1106 		case TCPOPT_MD5SIG: /* doesn't influence state */
1107 			break;
1108 
1109 		case TCPOPT_SACK: /* stricter checking performed later */
1110 			if (opsize % 8 != 2)
1111 				return false;
1112 			break;
1113 
1114 		case TCPOPT_TIMESTAMP:
1115 			/* only drop timestamps lower than new */
1116 			if (opsize != TCPOLEN_TIMESTAMP)
1117 				return false;
1118 			tstamp = get_unaligned_be32(ptr);
1119 			tsecr = get_unaligned_be32(ptr + 4);
1120 			if (after(tstamp, tstamp_new) ||
1121 			    after(tsecr, tsecr_new))
1122 				return false;
1123 			break;
1124 
1125 		case TCPOPT_MSS:  /* these should only be set on SYN */
1126 		case TCPOPT_WINDOW:
1127 		case TCPOPT_SACK_PERM:
1128 		case TCPOPT_FASTOPEN:
1129 		case TCPOPT_EXP:
1130 		default: /* don't drop if any unknown options are present */
1131 			return false;
1132 		}
1133 
1134 		ptr += opsize - 2;
1135 		length -= opsize;
1136 	}
1137 
1138 	return true;
1139 }
1140 
1141 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1142 				       struct cake_flow *flow)
1143 {
1144 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1145 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1146 	struct sk_buff *skb_check, *skb_prev = NULL;
1147 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1148 	unsigned char _tcph[64], _tcph_check[64];
1149 	const struct tcphdr *tcph, *tcph_check;
1150 	const struct iphdr *iph, *iph_check;
1151 	struct ipv6hdr _iph, _iph_check;
1152 	const struct sk_buff *skb;
1153 	int seglen, num_found = 0;
1154 	u32 tstamp = 0, tsecr = 0;
1155 	__be32 elig_flags = 0;
1156 	int sack_comp;
1157 
1158 	/* no other possible ACKs to filter */
1159 	if (flow->head == flow->tail)
1160 		return NULL;
1161 
1162 	skb = flow->tail;
1163 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1164 	iph = cake_get_iphdr(skb, &_iph);
1165 	if (!tcph)
1166 		return NULL;
1167 
1168 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1169 
1170 	/* the 'triggering' packet need only have the ACK flag set.
1171 	 * also check that SYN is not set, as there won't be any previous ACKs.
1172 	 */
1173 	if ((tcp_flag_word(tcph) &
1174 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1175 		return NULL;
1176 
1177 	/* the 'triggering' ACK is at the tail of the queue, we have already
1178 	 * returned if it is the only packet in the flow. loop through the rest
1179 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1180 	 * triggering one.
1181 	 */
1182 	for (skb_check = flow->head;
1183 	     skb_check && skb_check != skb;
1184 	     skb_prev = skb_check, skb_check = skb_check->next) {
1185 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1186 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1187 					     sizeof(_tcph_check));
1188 
1189 		/* only TCP packets with matching 5-tuple are eligible, and only
1190 		 * drop safe headers
1191 		 */
1192 		if (!tcph_check || iph->version != iph_check->version ||
1193 		    tcph_check->source != tcph->source ||
1194 		    tcph_check->dest != tcph->dest)
1195 			continue;
1196 
1197 		if (iph_check->version == 4) {
1198 			if (iph_check->saddr != iph->saddr ||
1199 			    iph_check->daddr != iph->daddr)
1200 				continue;
1201 
1202 			seglen = iph_totlen(skb, iph_check) -
1203 				       (4 * iph_check->ihl);
1204 		} else if (iph_check->version == 6) {
1205 			ipv6h = (struct ipv6hdr *)iph;
1206 			ipv6h_check = (struct ipv6hdr *)iph_check;
1207 
1208 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1209 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1210 				continue;
1211 
1212 			seglen = ntohs(ipv6h_check->payload_len);
1213 		} else {
1214 			WARN_ON(1);  /* shouldn't happen */
1215 			continue;
1216 		}
1217 
1218 		/* If the ECE/CWR flags changed from the previous eligible
1219 		 * packet in the same flow, we should no longer be dropping that
1220 		 * previous packet as this would lose information.
1221 		 */
1222 		if (elig_ack && (tcp_flag_word(tcph_check) &
1223 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1224 			elig_ack = NULL;
1225 			elig_ack_prev = NULL;
1226 			num_found--;
1227 		}
1228 
1229 		/* Check TCP options and flags, don't drop ACKs with segment
1230 		 * data, and don't drop ACKs with a higher cumulative ACK
1231 		 * counter than the triggering packet. Check ACK seqno here to
1232 		 * avoid parsing SACK options of packets we are going to exclude
1233 		 * anyway.
1234 		 */
1235 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1236 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1237 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1238 			continue;
1239 
1240 		/* Check SACK options. The triggering packet must SACK more data
1241 		 * than the ACK under consideration, or SACK the same range but
1242 		 * have a larger cumulative ACK counter. The latter is a
1243 		 * pathological case, but is contained in the following check
1244 		 * anyway, just to be safe.
1245 		 */
1246 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1247 
1248 		if (sack_comp < 0 ||
1249 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1250 		     sack_comp == 0))
1251 			continue;
1252 
1253 		/* At this point we have found an eligible pure ACK to drop; if
1254 		 * we are in aggressive mode, we are done. Otherwise, keep
1255 		 * searching unless this is the second eligible ACK we
1256 		 * found.
1257 		 *
1258 		 * Since we want to drop ACK closest to the head of the queue,
1259 		 * save the first eligible ACK we find, even if we need to loop
1260 		 * again.
1261 		 */
1262 		if (!elig_ack) {
1263 			elig_ack = skb_check;
1264 			elig_ack_prev = skb_prev;
1265 			elig_flags = (tcp_flag_word(tcph_check)
1266 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1267 		}
1268 
1269 		if (num_found++ > 0)
1270 			goto found;
1271 	}
1272 
1273 /* We made it through the queue without finding two eligible ACKs.  If
1274 	 * we found a single eligible ACK we can drop it in aggressive mode if
1275 	 * we can guarantee that this does not interfere with ECN flag
1276 	 * information. We ensure this by dropping it only if the enqueued
1277 	 * packet is consecutive with the eligible ACK, and their flags match.
1278 	 */
1279 	if (elig_ack && aggressive && elig_ack->next == skb &&
1280 	    (elig_flags == (tcp_flag_word(tcph) &
1281 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1282 		goto found;
1283 
1284 	return NULL;
1285 
1286 found:
1287 	if (elig_ack_prev)
1288 		elig_ack_prev->next = elig_ack->next;
1289 	else
1290 		flow->head = elig_ack->next;
1291 
1292 	skb_mark_not_on_list(elig_ack);
1293 
1294 	return elig_ack;
1295 }
1296 
1297 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1298 {
1299 	avg -= avg >> shift;
1300 	avg += sample >> shift;
1301 	return avg;
1302 }
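
/*
 * Editor's note: user-space sketch (not kernel code) of the EWMA above: each
 * call moves the average 1/2^shift of the way towards the new sample, i.e.
 * avg += (sample - avg) / 2^shift with integer truncation.  The sample
 * stream below is invented.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t ewma(uint64_t avg, uint64_t sample, uint32_t shift)
{
	avg -= avg >> shift;
	avg += sample >> shift;
	return avg;
}

int main(void)
{
	uint64_t avg = 0;

	/* feed a constant 1000-byte sample; avg converges towards 1000 */
	for (int i = 1; i <= 32; i++) {
		avg = ewma(avg, 1000, 3);	/* shift 3 -> weight 1/8 */
		if (i == 1 || i == 8 || i == 32)
			printf("after %2d samples: avg = %llu\n",
			       i, (unsigned long long)avg);
	}
	return 0;
}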
1303 
1304 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1305 {
1306 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1307 		len -= off;
1308 
1309 	if (q->max_netlen < len)
1310 		q->max_netlen = len;
1311 	if (q->min_netlen > len)
1312 		q->min_netlen = len;
1313 
1314 	len += q->rate_overhead;
1315 
1316 	if (len < q->rate_mpu)
1317 		len = q->rate_mpu;
1318 
1319 	if (q->atm_mode == CAKE_ATM_ATM) {
1320 		len += 47;
1321 		len /= 48;
1322 		len *= 53;
1323 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1324 		/* Add one byte per 64 bytes or part thereof.
1325 		 * This is conservative and easier to calculate than the
1326 		 * precise value.
1327 		 */
1328 		len += (len + 63) / 64;
1329 	}
1330 
1331 	if (q->max_adjlen < len)
1332 		q->max_adjlen = len;
1333 	if (q->min_adjlen > len)
1334 		q->min_adjlen = len;
1335 
1336 	return len;
1337 }
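
/*
 * Editor's note: user-space sketch (not kernel code) of the link-layer
 * framing compensation above.  A 1500-byte packet is used as an invented
 * example; per-packet overhead and MPU are left at zero for clarity.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t len = 1500;

	/* ATM/AAL5: payload carried in 48-byte cells of 53 bytes each */
	uint32_t atm = ((len + 47) / 48) * 53;		/* ceil(len/48) * 53 = 1696 */

	/* PTM: one extra byte per started 64-byte block (64b/65b encoding) */
	uint32_t ptm = len + (len + 63) / 64;		/* 1500 + 24 = 1524 */

	printf("wire sizes: raw=%u  atm=%u  ptm=%u\n", len, atm, ptm);
	return 0;
}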
1338 
1339 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1340 {
1341 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1342 	unsigned int hdr_len, last_len = 0;
1343 	u32 off = skb_network_offset(skb);
1344 	u32 len = qdisc_pkt_len(skb);
1345 	u16 segs = 1;
1346 
1347 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1348 
1349 	if (!shinfo->gso_size)
1350 		return cake_calc_overhead(q, len, off);
1351 
1352 	/* borrowed from qdisc_pkt_len_init() */
1353 	hdr_len = skb_transport_offset(skb);
1354 
1355 	/* + transport layer */
1356 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1357 						SKB_GSO_TCPV6))) {
1358 		const struct tcphdr *th;
1359 		struct tcphdr _tcphdr;
1360 
1361 		th = skb_header_pointer(skb, hdr_len,
1362 					sizeof(_tcphdr), &_tcphdr);
1363 		if (likely(th))
1364 			hdr_len += __tcp_hdrlen(th);
1365 	} else {
1366 		struct udphdr _udphdr;
1367 
1368 		if (skb_header_pointer(skb, hdr_len,
1369 				       sizeof(_udphdr), &_udphdr))
1370 			hdr_len += sizeof(struct udphdr);
1371 	}
1372 
1373 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1374 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1375 				    shinfo->gso_size);
1376 	else
1377 		segs = shinfo->gso_segs;
1378 
1379 	len = shinfo->gso_size + hdr_len;
1380 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1381 
1382 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1383 		cake_calc_overhead(q, last_len, off));
1384 }
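
/*
 * Editor's note: arithmetic sketch (user-space, not kernel code) of the GSO
 * accounting above, with invented numbers: a TCP super-packet carrying two
 * full segments of 1448 payload bytes plus a 500-byte tail, 66 bytes of
 * headers, and no shaper overhead configured.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t gso_size = 1448, hdr_len = 66, segs = 3;
	uint32_t skb_len = hdr_len + 2 * gso_size + 500;	/* 3462 */

	uint32_t seg_len  = gso_size + hdr_len;			/* 1514 */
	uint32_t last_len = skb_len - gso_size * (segs - 1);	/* 566  */

	/* with zero overhead/mpu and no ATM/PTM, cake_calc_overhead() is the identity */
	uint32_t adjusted = seg_len * (segs - 1) + last_len;	/* 3594 */

	printf("adjusted_len = %u bytes for a %u byte super-packet\n",
	       adjusted, skb_len);
	return 0;
}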
1385 
1386 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1387 {
1388 	struct cake_heap_entry ii = q->overflow_heap[i];
1389 	struct cake_heap_entry jj = q->overflow_heap[j];
1390 
1391 	q->overflow_heap[i] = jj;
1392 	q->overflow_heap[j] = ii;
1393 
1394 	q->tins[ii.t].overflow_idx[ii.b] = j;
1395 	q->tins[jj.t].overflow_idx[jj.b] = i;
1396 }
1397 
1398 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1399 {
1400 	struct cake_heap_entry ii = q->overflow_heap[i];
1401 
1402 	return q->tins[ii.t].backlogs[ii.b];
1403 }
1404 
1405 static void cake_heapify(struct cake_sched_data *q, u16 i)
1406 {
1407 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1408 	u32 mb = cake_heap_get_backlog(q, i);
1409 	u32 m = i;
1410 
1411 	while (m < a) {
1412 		u32 l = m + m + 1;
1413 		u32 r = l + 1;
1414 
1415 		if (l < a) {
1416 			u32 lb = cake_heap_get_backlog(q, l);
1417 
1418 			if (lb > mb) {
1419 				m  = l;
1420 				mb = lb;
1421 			}
1422 		}
1423 
1424 		if (r < a) {
1425 			u32 rb = cake_heap_get_backlog(q, r);
1426 
1427 			if (rb > mb) {
1428 				m  = r;
1429 				mb = rb;
1430 			}
1431 		}
1432 
1433 		if (m != i) {
1434 			cake_heap_swap(q, i, m);
1435 			i = m;
1436 		} else {
1437 			break;
1438 		}
1439 	}
1440 }
1441 
1442 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1443 {
1444 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1445 		u16 p = (i - 1) >> 1;
1446 		u32 ib = cake_heap_get_backlog(q, i);
1447 		u32 pb = cake_heap_get_backlog(q, p);
1448 
1449 		if (ib > pb) {
1450 			cake_heap_swap(q, i, p);
1451 			i = p;
1452 		} else {
1453 			break;
1454 		}
1455 	}
1456 }
1457 
1458 static int cake_advance_shaper(struct cake_sched_data *q,
1459 			       struct cake_tin_data *b,
1460 			       struct sk_buff *skb,
1461 			       ktime_t now, bool drop)
1462 {
1463 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1464 
1465 	/* charge packet bandwidth to this tin
1466 	 * and to the global shaper.
1467 	 */
1468 	if (q->rate_ns) {
1469 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1470 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1471 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1472 
1473 		if (ktime_before(b->time_next_packet, now))
1474 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1475 							   tin_dur);
1476 
1477 		else if (ktime_before(b->time_next_packet,
1478 				      ktime_add_ns(now, tin_dur)))
1479 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1480 
1481 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1482 						   global_dur);
1483 		if (!drop)
1484 			q->failsafe_next_packet = \
1485 				ktime_add_ns(q->failsafe_next_packet,
1486 					     failsafe_dur);
1487 	}
1488 	return len;
1489 }
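
/*
 * Editor's note: user-space sketch (not kernel code) of the fixed-point rate
 * maths used by the shaper above: rate_ns >> rate_shft is nanoseconds per
 * byte.  The split between rate_ns and rate_shft below is an invented but
 * consistent pair for 20 Mbit/s; cake derives its own split in
 * cake_set_rate(), outside this excerpt.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t rate_ns = 400ULL << 16;	/* 400 ns/byte == 20 Mbit/s */
	uint32_t rate_shft = 16;
	uint32_t len = 1500;

	uint64_t dur = (len * rate_ns) >> rate_shft;	/* 600000 ns */

	/* the failsafe schedule above advances by 1.5x this, i.e. 900000 ns */
	printf("a %u byte packet occupies the link for %llu ns\n",
	       len, (unsigned long long)dur);
	return 0;
}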
1490 
1491 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1492 {
1493 	struct cake_sched_data *q = qdisc_priv(sch);
1494 	ktime_t now = ktime_get();
1495 	u32 idx = 0, tin = 0, len;
1496 	struct cake_heap_entry qq;
1497 	struct cake_tin_data *b;
1498 	struct cake_flow *flow;
1499 	struct sk_buff *skb;
1500 
1501 	if (!q->overflow_timeout) {
1502 		int i;
1503 		/* Build fresh max-heap */
1504 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
1505 			cake_heapify(q, i);
1506 	}
1507 	q->overflow_timeout = 65535;
1508 
1509 	/* select longest queue for pruning */
1510 	qq  = q->overflow_heap[0];
1511 	tin = qq.t;
1512 	idx = qq.b;
1513 
1514 	b = &q->tins[tin];
1515 	flow = &b->flows[idx];
1516 	skb = dequeue_head(flow);
1517 	if (unlikely(!skb)) {
1518 		/* heap has gone wrong, rebuild it next time */
1519 		q->overflow_timeout = 0;
1520 		return idx + (tin << 16);
1521 	}
1522 
1523 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1524 		b->unresponsive_flow_count++;
1525 
1526 	len = qdisc_pkt_len(skb);
1527 	q->buffer_used      -= skb->truesize;
1528 	b->backlogs[idx]    -= len;
1529 	b->tin_backlog      -= len;
1530 	sch->qstats.backlog -= len;
1531 
1532 	flow->dropped++;
1533 	b->tin_dropped++;
1534 
1535 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1536 		cake_advance_shaper(q, b, skb, now, true);
1537 
1538 	qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
1539 	sch->q.qlen--;
1540 	qdisc_tree_reduce_backlog(sch, 1, len);
1541 
1542 	cake_heapify(q, 0);
1543 
1544 	return idx + (tin << 16);
1545 }
1546 
1547 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1548 {
1549 	const int offset = skb_network_offset(skb);
1550 	u16 *buf, buf_;
1551 	u8 dscp;
1552 
1553 	switch (skb_protocol(skb, true)) {
1554 	case htons(ETH_P_IP):
1555 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1556 		if (unlikely(!buf))
1557 			return 0;
1558 
1559 		/* ToS is in the second byte of iphdr */
1560 		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1561 
1562 		if (wash && dscp) {
1563 			const int wlen = offset + sizeof(struct iphdr);
1564 
1565 			if (!pskb_may_pull(skb, wlen) ||
1566 			    skb_try_make_writable(skb, wlen))
1567 				return 0;
1568 
1569 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1570 		}
1571 
1572 		return dscp;
1573 
1574 	case htons(ETH_P_IPV6):
1575 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1576 		if (unlikely(!buf))
1577 			return 0;
1578 
1579 		/* Traffic class is in the first and second bytes of ipv6hdr */
1580 		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1581 
1582 		if (wash && dscp) {
1583 			const int wlen = offset + sizeof(struct ipv6hdr);
1584 
1585 			if (!pskb_may_pull(skb, wlen) ||
1586 			    skb_try_make_writable(skb, wlen))
1587 				return 0;
1588 
1589 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1590 		}
1591 
1592 		return dscp;
1593 
1594 	case htons(ETH_P_ARP):
1595 		return 0x38;  /* CS7 - Net Control */
1596 
1597 	default:
1598 		/* If there is no Diffserv field, treat as best-effort */
1599 		return 0;
1600 	}
1601 }
1602 
1603 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1604 					     struct sk_buff *skb)
1605 {
1606 	struct cake_sched_data *q = qdisc_priv(sch);
1607 	u32 tin, mark;
1608 	bool wash;
1609 	u8 dscp;
1610 
1611 	/* Tin selection: Default to diffserv-based selection, allow overriding
1612 	 * using firewall marks or skb->priority. Call DSCP parsing early if
1613 	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1614 	 */
1615 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1616 	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1617 	if (wash)
1618 		dscp = cake_handle_diffserv(skb, wash);
1619 
1620 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1621 		tin = 0;
1622 
1623 	else if (mark && mark <= q->tin_cnt)
1624 		tin = q->tin_order[mark - 1];
1625 
1626 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1627 		 TC_H_MIN(skb->priority) > 0 &&
1628 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1629 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1630 
1631 	else {
1632 		if (!wash)
1633 			dscp = cake_handle_diffserv(skb, wash);
1634 		tin = q->tin_index[dscp];
1635 
1636 		if (unlikely(tin >= q->tin_cnt))
1637 			tin = 0;
1638 	}
1639 
1640 	return &q->tins[tin];
1641 }
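
/*
 * Editor's note: user-space sketch (not kernel code) of the selection order
 * implemented above: firewall mark first, then skb->priority, then DSCP.
 * The mask, shift, tin count and mark value are invented for illustration.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t fwmark_mask = 0x0000ff00, fwmark_shft = 8;
	uint32_t tin_cnt = 4;
	uint32_t skb_mark = 0x00000300;		/* mark field set by netfilter */

	uint32_t mark = (skb_mark & fwmark_mask) >> fwmark_shft;	/* 3 */

	if (mark && mark <= tin_cnt)
		printf("fwmark %u overrides: use tin_order[%u]\n", mark, mark - 1);
	else
		printf("no usable mark: fall back to skb->priority, then DSCP\n");
	return 0;
}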
1642 
1643 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1644 			 struct sk_buff *skb, int flow_mode, int *qerr)
1645 {
1646 	struct cake_sched_data *q = qdisc_priv(sch);
1647 	struct tcf_proto *filter;
1648 	struct tcf_result res;
1649 	u16 flow = 0, host = 0;
1650 	int result;
1651 
1652 	filter = rcu_dereference_bh(q->filter_list);
1653 	if (!filter)
1654 		goto hash;
1655 
1656 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1657 	result = tcf_classify(skb, NULL, filter, &res, false);
1658 
1659 	if (result >= 0) {
1660 #ifdef CONFIG_NET_CLS_ACT
1661 		switch (result) {
1662 		case TC_ACT_STOLEN:
1663 		case TC_ACT_QUEUED:
1664 		case TC_ACT_TRAP:
1665 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1666 			fallthrough;
1667 		case TC_ACT_SHOT:
1668 			return 0;
1669 		}
1670 #endif
1671 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1672 			flow = TC_H_MIN(res.classid);
1673 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1674 			host = TC_H_MAJ(res.classid) >> 16;
1675 	}
1676 hash:
1677 	*t = cake_select_tin(sch, skb);
1678 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1679 }
1680 
1681 static void cake_reconfigure(struct Qdisc *sch);
1682 
1683 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1684 			struct sk_buff **to_free)
1685 {
1686 	struct cake_sched_data *q = qdisc_priv(sch);
1687 	int len = qdisc_pkt_len(skb);
1688 	int ret;
1689 	struct sk_buff *ack = NULL;
1690 	ktime_t now = ktime_get();
1691 	struct cake_tin_data *b;
1692 	struct cake_flow *flow;
1693 	u32 idx;
1694 
1695 	/* choose flow to insert into */
1696 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1697 	if (idx == 0) {
1698 		if (ret & __NET_XMIT_BYPASS)
1699 			qdisc_qstats_drop(sch);
1700 		__qdisc_drop(skb, to_free);
1701 		return ret;
1702 	}
1703 	idx--;
1704 	flow = &b->flows[idx];
1705 
1706 	/* ensure shaper state isn't stale */
1707 	if (!b->tin_backlog) {
1708 		if (ktime_before(b->time_next_packet, now))
1709 			b->time_next_packet = now;
1710 
1711 		if (!sch->q.qlen) {
1712 			if (ktime_before(q->time_next_packet, now)) {
1713 				q->failsafe_next_packet = now;
1714 				q->time_next_packet = now;
1715 			} else if (ktime_after(q->time_next_packet, now) &&
1716 				   ktime_after(q->failsafe_next_packet, now)) {
1717 				u64 next = \
1718 					min(ktime_to_ns(q->time_next_packet),
1719 					    ktime_to_ns(
1720 						   q->failsafe_next_packet));
1721 				sch->qstats.overlimits++;
1722 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1723 			}
1724 		}
1725 	}
1726 
1727 	if (unlikely(len > b->max_skblen))
1728 		b->max_skblen = len;
1729 
1730 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1731 		struct sk_buff *segs, *nskb;
1732 		netdev_features_t features = netif_skb_features(skb);
1733 		unsigned int slen = 0, numsegs = 0;
1734 
1735 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1736 		if (IS_ERR_OR_NULL(segs))
1737 			return qdisc_drop(skb, sch, to_free);
1738 
1739 		skb_list_walk_safe(segs, segs, nskb) {
1740 			skb_mark_not_on_list(segs);
1741 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1742 			cobalt_set_enqueue_time(segs, now);
1743 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1744 									  segs);
1745 			flow_queue_add(flow, segs);
1746 
1747 			sch->q.qlen++;
1748 			numsegs++;
1749 			slen += segs->len;
1750 			q->buffer_used += segs->truesize;
1751 			b->packets++;
1752 		}
1753 
1754 		/* stats */
1755 		b->bytes	    += slen;
1756 		b->backlogs[idx]    += slen;
1757 		b->tin_backlog      += slen;
1758 		sch->qstats.backlog += slen;
1759 		q->avg_window_bytes += slen;
1760 
1761 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1762 		consume_skb(skb);
1763 	} else {
1764 		/* not splitting */
1765 		cobalt_set_enqueue_time(skb, now);
1766 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1767 		flow_queue_add(flow, skb);
1768 
1769 		if (q->ack_filter)
1770 			ack = cake_ack_filter(q, flow);
1771 
1772 		if (ack) {
1773 			b->ack_drops++;
1774 			sch->qstats.drops++;
1775 			b->bytes += qdisc_pkt_len(ack);
1776 			len -= qdisc_pkt_len(ack);
1777 			q->buffer_used += skb->truesize - ack->truesize;
1778 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1779 				cake_advance_shaper(q, b, ack, now, true);
1780 
1781 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1782 			consume_skb(ack);
1783 		} else {
1784 			sch->q.qlen++;
1785 			q->buffer_used      += skb->truesize;
1786 		}
1787 
1788 		/* stats */
1789 		b->packets++;
1790 		b->bytes	    += len;
1791 		b->backlogs[idx]    += len;
1792 		b->tin_backlog      += len;
1793 		sch->qstats.backlog += len;
1794 		q->avg_window_bytes += len;
1795 	}
1796 
1797 	if (q->overflow_timeout)
1798 		cake_heapify_up(q, b->overflow_idx[idx]);
1799 
1800 	/* incoming bandwidth capacity estimate */
1801 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1802 		u64 packet_interval = \
1803 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1804 
1805 		if (packet_interval > NSEC_PER_SEC)
1806 			packet_interval = NSEC_PER_SEC;
1807 
1808 		/* filter out short-term bursts, eg. wifi aggregation */
1809 		q->avg_packet_interval = \
1810 			cake_ewma(q->avg_packet_interval,
1811 				  packet_interval,
1812 				  (packet_interval > q->avg_packet_interval ?
1813 					  2 : 8));
1814 
1815 		q->last_packet_time = now;
1816 
1817 		if (packet_interval > q->avg_packet_interval) {
1818 			u64 window_interval = \
1819 				ktime_to_ns(ktime_sub(now,
1820 						      q->avg_window_begin));
1821 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1822 
1823 			b = div64_u64(b, window_interval);
1824 			q->avg_peak_bandwidth =
1825 				cake_ewma(q->avg_peak_bandwidth, b,
1826 					  b > q->avg_peak_bandwidth ? 2 : 8);
1827 			q->avg_window_bytes = 0;
1828 			q->avg_window_begin = now;
1829 
1830 			if (ktime_after(now,
1831 					ktime_add_ms(q->last_reconfig_time,
1832 						     250))) {
1833 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1834 				cake_reconfigure(sch);
1835 			}
1836 		}
1837 	} else {
1838 		q->avg_window_bytes = 0;
1839 		q->last_packet_time = now;
1840 	}
1841 
1842 	/* flowchain */
1843 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1844 		struct cake_host *srchost = &b->hosts[flow->srchost];
1845 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1846 		u16 host_load = 1;
1847 
1848 		if (!flow->set) {
1849 			list_add_tail(&flow->flowchain, &b->new_flows);
1850 		} else {
1851 			b->decaying_flow_count--;
1852 			list_move_tail(&flow->flowchain, &b->new_flows);
1853 		}
1854 		flow->set = CAKE_SET_SPARSE;
1855 		b->sparse_flow_count++;
1856 
1857 		if (cake_dsrc(q->flow_mode))
1858 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
1859 
1860 		if (cake_ddst(q->flow_mode))
1861 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1862 
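		/* Scale the new flow's quantum by 1/host_load: quantum_div[n]
		 * is roughly 65535 / n (see cake_init()), and the >> 16
		 * undoes the scaling, approximating flow_quantum / host_load.
		 */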
1863 		flow->deficit = (b->flow_quantum *
1864 				 quantum_div[host_load]) >> 16;
1865 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1866 		struct cake_host *srchost = &b->hosts[flow->srchost];
1867 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1868 
1869 		/* This flow was empty and was accounted as a sparse flow, but it
1870 		 * is actually in the bulk rotation.
1871 		 */
1872 		flow->set = CAKE_SET_BULK;
1873 		b->sparse_flow_count--;
1874 		b->bulk_flow_count++;
1875 
1876 		if (cake_dsrc(q->flow_mode))
1877 			srchost->srchost_bulk_flow_count++;
1878 
1879 		if (cake_ddst(q->flow_mode))
1880 			dsthost->dsthost_bulk_flow_count++;
1881 
1882 	}
1883 
1884 	if (q->buffer_used > q->buffer_max_used)
1885 		q->buffer_max_used = q->buffer_used;
1886 
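	/* When over the memory limit, cake_drop() evicts from the head of
	 * the currently longest queue, located via the overflow heap, until
	 * we are back under the limit.
	 */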
1887 	if (q->buffer_used > q->buffer_limit) {
1888 		u32 dropped = 0;
1889 
1890 		while (q->buffer_used > q->buffer_limit) {
1891 			dropped++;
1892 			cake_drop(sch, to_free);
1893 		}
1894 		b->drop_overlimit += dropped;
1895 	}
1896 	return NET_XMIT_SUCCESS;
1897 }
1898 
1899 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1900 {
1901 	struct cake_sched_data *q = qdisc_priv(sch);
1902 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1903 	struct cake_flow *flow = &b->flows[q->cur_flow];
1904 	struct sk_buff *skb = NULL;
1905 	u32 len;
1906 
1907 	if (flow->head) {
1908 		skb = dequeue_head(flow);
1909 		len = qdisc_pkt_len(skb);
1910 		b->backlogs[q->cur_flow] -= len;
1911 		b->tin_backlog		 -= len;
1912 		sch->qstats.backlog      -= len;
1913 		q->buffer_used		 -= skb->truesize;
1914 		sch->q.qlen--;
1915 
1916 		if (q->overflow_timeout)
1917 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1918 	}
1919 	return skb;
1920 }
1921 
1922 /* Discard leftover packets from a tin no longer in use. */
1923 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1924 {
1925 	struct cake_sched_data *q = qdisc_priv(sch);
1926 	struct sk_buff *skb;
1927 
1928 	q->cur_tin = tin;
1929 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1930 		while (!!(skb = cake_dequeue_one(sch)))
1931 			kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
1932 }
1933 
1934 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1935 {
1936 	struct cake_sched_data *q = qdisc_priv(sch);
1937 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1938 	struct cake_host *srchost, *dsthost;
1939 	enum skb_drop_reason reason;
1940 	ktime_t now = ktime_get();
1941 	struct cake_flow *flow;
1942 	struct list_head *head;
1943 	bool first_flow = true;
1944 	struct sk_buff *skb;
1945 	u16 host_load;
1946 	u64 delay;
1947 	u32 len;
1948 
1949 begin:
1950 	if (!sch->q.qlen)
1951 		return NULL;
1952 
1953 	/* global hard shaper */
1954 	if (ktime_after(q->time_next_packet, now) &&
1955 	    ktime_after(q->failsafe_next_packet, now)) {
1956 		u64 next = min(ktime_to_ns(q->time_next_packet),
1957 			       ktime_to_ns(q->failsafe_next_packet));
1958 
1959 		sch->qstats.overlimits++;
1960 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1961 		return NULL;
1962 	}
1963 
1964 	/* Choose a class to work on. */
1965 	if (!q->rate_ns) {
1966 		/* In unlimited mode, can't rely on shaper timings, just balance
1967 		 * with DRR
1968 		 */
1969 		bool wrapped = false, empty = true;
1970 
1971 		while (b->tin_deficit < 0 ||
1972 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1973 			if (b->tin_deficit <= 0)
1974 				b->tin_deficit += b->tin_quantum;
1975 			if (b->sparse_flow_count + b->bulk_flow_count)
1976 				empty = false;
1977 
1978 			q->cur_tin++;
1979 			b++;
1980 			if (q->cur_tin >= q->tin_cnt) {
1981 				q->cur_tin = 0;
1982 				b = q->tins;
1983 
1984 				if (wrapped) {
1985 					/* It's possible for q->qlen to be
1986 					 * nonzero when we actually have no
1987 					 * packets anywhere.
1988 					 */
1989 					if (empty)
1990 						return NULL;
1991 				} else {
1992 					wrapped = true;
1993 				}
1994 			}
1995 		}
1996 	} else {
1997 		/* In shaped mode, choose:
1998 		 * - Highest-priority tin with queue and meeting schedule, or
1999 		 * - The earliest-scheduled tin with queue.
2000 		 */
2001 		ktime_t best_time = KTIME_MAX;
2002 		int tin, best_tin = 0;
2003 
2004 		for (tin = 0; tin < q->tin_cnt; tin++) {
2005 			b = q->tins + tin;
2006 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2007 				ktime_t time_to_pkt =
2008 					ktime_sub(b->time_next_packet, now);
2009 
2010 				if (ktime_to_ns(time_to_pkt) <= 0 ||
2011 				    ktime_compare(time_to_pkt,
2012 						  best_time) <= 0) {
2013 					best_time = time_to_pkt;
2014 					best_tin = tin;
2015 				}
2016 			}
2017 		}
2018 
2019 		q->cur_tin = best_tin;
2020 		b = q->tins + best_tin;
2021 
2022 		/* No point in going further if no packets to deliver. */
2023 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2024 			return NULL;
2025 	}
2026 
2027 retry:
2028 	/* service this class */
2029 	head = &b->decaying_flows;
2030 	if (!first_flow || list_empty(head)) {
2031 		head = &b->new_flows;
2032 		if (list_empty(head)) {
2033 			head = &b->old_flows;
2034 			if (unlikely(list_empty(head))) {
2035 				head = &b->decaying_flows;
2036 				if (unlikely(list_empty(head)))
2037 					goto begin;
2038 			}
2039 		}
2040 	}
2041 	flow = list_first_entry(head, struct cake_flow, flowchain);
2042 	q->cur_flow = flow - b->flows;
2043 	first_flow = false;
2044 
2045 	/* triple isolation (modified DRR++) */
2046 	srchost = &b->hosts[flow->srchost];
2047 	dsthost = &b->hosts[flow->dsthost];
2048 	host_load = 1;
2049 
2050 	/* flow isolation (DRR++) */
2051 	if (flow->deficit <= 0) {
2052 		/* Keep all flows with deficits out of the sparse and decaying
2053 		 * rotations.  No non-empty flow can go into the decaying
2054 		 * rotation, so they can't get deficits
2055 		 */
2056 		if (flow->set == CAKE_SET_SPARSE) {
2057 			if (flow->head) {
2058 				b->sparse_flow_count--;
2059 				b->bulk_flow_count++;
2060 
2061 				if (cake_dsrc(q->flow_mode))
2062 					srchost->srchost_bulk_flow_count++;
2063 
2064 				if (cake_ddst(q->flow_mode))
2065 					dsthost->dsthost_bulk_flow_count++;
2066 
2067 				flow->set = CAKE_SET_BULK;
2068 			} else {
2069 				/* we've moved it to the bulk rotation for
2070 				 * correct deficit accounting but we still want
2071 				 * to count it as a sparse flow, not a bulk one.
2072 				 */
2073 				flow->set = CAKE_SET_SPARSE_WAIT;
2074 			}
2075 		}
2076 
2077 		if (cake_dsrc(q->flow_mode))
2078 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
2079 
2080 		if (cake_ddst(q->flow_mode))
2081 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2082 
2083 		WARN_ON(host_load > CAKE_QUEUES);
2084 
2085 		/* Adding get_random_u16() dithers the rounding, so repeated
2086 		 * truncation by >> 16 does not accumulate round-off error.
2087 		 */
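		/* e.g. flow_quantum = 1514 and host_load = 3: quantum_div[3]
		 * is 65535 / 3 = 21845, so the refill is
		 * (1514 * 21845 + dither) >> 16 = 504 or 505 bytes,
		 * i.e. roughly 1514 / 3.
		 */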
2088 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2089 				  get_random_u16()) >> 16;
2090 		list_move_tail(&flow->flowchain, &b->old_flows);
2091 
2092 		goto retry;
2093 	}
2094 
2095 	/* Retrieve a packet via the AQM */
2096 	while (1) {
2097 		skb = cake_dequeue_one(sch);
2098 		if (!skb) {
2099 			/* this queue was actually empty */
2100 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2101 				b->unresponsive_flow_count--;
2102 
2103 			if (flow->cvars.p_drop || flow->cvars.count ||
2104 			    ktime_before(now, flow->cvars.drop_next)) {
2105 				/* keep in the flowchain until the state has
2106 				 * decayed to rest
2107 				 */
2108 				list_move_tail(&flow->flowchain,
2109 					       &b->decaying_flows);
2110 				if (flow->set == CAKE_SET_BULK) {
2111 					b->bulk_flow_count--;
2112 
2113 					if (cake_dsrc(q->flow_mode))
2114 						srchost->srchost_bulk_flow_count--;
2115 
2116 					if (cake_ddst(q->flow_mode))
2117 						dsthost->dsthost_bulk_flow_count--;
2118 
2119 					b->decaying_flow_count++;
2120 				} else if (flow->set == CAKE_SET_SPARSE ||
2121 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2122 					b->sparse_flow_count--;
2123 					b->decaying_flow_count++;
2124 				}
2125 				flow->set = CAKE_SET_DECAYING;
2126 			} else {
2127 				/* remove empty queue from the flowchain */
2128 				list_del_init(&flow->flowchain);
2129 				if (flow->set == CAKE_SET_SPARSE ||
2130 				    flow->set == CAKE_SET_SPARSE_WAIT)
2131 					b->sparse_flow_count--;
2132 				else if (flow->set == CAKE_SET_BULK) {
2133 					b->bulk_flow_count--;
2134 
2135 					if (cake_dsrc(q->flow_mode))
2136 						srchost->srchost_bulk_flow_count--;
2137 
2138 					if (cake_ddst(q->flow_mode))
2139 						dsthost->dsthost_bulk_flow_count--;
2140 
2141 				} else
2142 					b->decaying_flow_count--;
2143 
2144 				flow->set = CAKE_SET_NONE;
2145 			}
2146 			goto begin;
2147 		}
2148 
2149 		reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2150 					    (b->bulk_flow_count *
2151 					     !!(q->rate_flags &
2152 						CAKE_FLAG_INGRESS)));
2153 		/* Last packet in queue may be marked, shouldn't be dropped */
2154 		if (reason == SKB_NOT_DROPPED_YET || !flow->head)
2155 			break;
2156 
2157 		/* drop this packet, get another one */
2158 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2159 			len = cake_advance_shaper(q, b, skb,
2160 						  now, true);
2161 			flow->deficit -= len;
2162 			b->tin_deficit -= len;
2163 		}
2164 		flow->dropped++;
2165 		b->tin_dropped++;
2166 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2167 		qdisc_qstats_drop(sch);
2168 		kfree_skb_reason(skb, reason);
2169 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2170 			goto retry;
2171 	}
2172 
2173 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2174 	qdisc_bstats_update(sch, skb);
2175 
2176 	/* collect delay stats */
2177 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2178 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2179 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2180 				  delay > b->peak_delay ? 2 : 8);
2181 	b->base_delay = cake_ewma(b->base_delay, delay,
2182 				  delay < b->base_delay ? 2 : 8);
2183 
2184 	len = cake_advance_shaper(q, b, skb, now, false);
2185 	flow->deficit -= len;
2186 	b->tin_deficit -= len;
2187 
2188 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2189 		u64 next = min(ktime_to_ns(q->time_next_packet),
2190 			       ktime_to_ns(q->failsafe_next_packet));
2191 
2192 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2193 	} else if (!sch->q.qlen) {
2194 		int i;
2195 
2196 		for (i = 0; i < q->tin_cnt; i++) {
2197 			if (q->tins[i].decaying_flow_count) {
2198 				ktime_t next =
2199 					ktime_add_ns(now,
2200 						     q->tins[i].cparams.target);
2201 
2202 				qdisc_watchdog_schedule_ns(&q->watchdog,
2203 							   ktime_to_ns(next));
2204 				break;
2205 			}
2206 		}
2207 	}
2208 
2209 	if (q->overflow_timeout)
2210 		q->overflow_timeout--;
2211 
2212 	return skb;
2213 }
2214 
2215 static void cake_reset(struct Qdisc *sch)
2216 {
2217 	struct cake_sched_data *q = qdisc_priv(sch);
2218 	u32 c;
2219 
2220 	if (!q->tins)
2221 		return;
2222 
2223 	for (c = 0; c < CAKE_MAX_TINS; c++)
2224 		cake_clear_tin(sch, c);
2225 }
2226 
2227 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2228 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2229 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2230 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2231 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2232 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2233 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2234 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2235 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2236 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2237 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2238 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2239 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2240 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2241 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2242 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2243 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2244 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2245 };
2246 
2247 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2248 			  u64 target_ns, u64 rtt_est_ns)
2249 {
2250 	/* Convert byte-rate into time-per-byte.  The divisor is clamped to
2251 	 * MIN_RATE below, so the shaper always unwedges in reasonable time.
2252 	 */
2253 	static const u64 MIN_RATE = 64;
2254 	u32 byte_target = mtu;
2255 	u64 byte_target_ns;
2256 	u8  rate_shft = 0;
2257 	u64 rate_ns = 0;
2258 
2259 	b->flow_quantum = 1514;
2260 	if (rate) {
2261 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2262 		rate_shft = 34;
2263 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2264 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2265 		while (!!(rate_ns >> 34)) {
2266 			rate_ns >>= 1;
2267 			rate_shft--;
2268 		}
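		/* e.g. rate = 1,250,000 bytes/s (10 Mbit/s): rate_ns starts
		 * at 800 << 34 and is halved ten times to 800 << 24 with
		 * rate_shft ending at 24, so (len * rate_ns) >> rate_shft
		 * still works out to 800 ns per byte.
		 */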
2269 	} /* else unlimited, ie. zero delay */
2270 
2271 	b->tin_rate_bps  = rate;
2272 	b->tin_rate_ns   = rate_ns;
2273 	b->tin_rate_shft = rate_shft;
2274 
2275 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2276 
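	/* At very low rates the AQM target is floored at 1.5x the MTU
	 * serialisation time and the interval at twice the target, so the
	 * Codel machinery keeps working even when a single packet takes
	 * several milliseconds to transmit.
	 */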
2277 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2278 	b->cparams.interval = max(rtt_est_ns +
2279 				     b->cparams.target - target_ns,
2280 				     b->cparams.target * 2);
2281 	b->cparams.mtu_time = byte_target_ns;
2282 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2283 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2284 }
2285 
2286 static int cake_config_besteffort(struct Qdisc *sch)
2287 {
2288 	struct cake_sched_data *q = qdisc_priv(sch);
2289 	struct cake_tin_data *b = &q->tins[0];
2290 	u32 mtu = psched_mtu(qdisc_dev(sch));
2291 	u64 rate = q->rate_bps;
2292 
2293 	q->tin_cnt = 1;
2294 
2295 	q->tin_index = besteffort;
2296 	q->tin_order = normal_order;
2297 
2298 	cake_set_rate(b, rate, mtu,
2299 		      us_to_ns(q->target), us_to_ns(q->interval));
2300 	b->tin_quantum = 65535;
2301 
2302 	return 0;
2303 }
2304 
2305 static int cake_config_precedence(struct Qdisc *sch)
2306 {
2307 	/* convert high-level (user visible) parameters into internal format */
2308 	struct cake_sched_data *q = qdisc_priv(sch);
2309 	u32 mtu = psched_mtu(qdisc_dev(sch));
2310 	u64 rate = q->rate_bps;
2311 	u32 quantum = 256;
2312 	u32 i;
2313 
2314 	q->tin_cnt = 8;
2315 	q->tin_index = precedence;
2316 	q->tin_order = normal_order;
2317 
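	/* Each successive class gets 7/8 of the previous class's rate
	 * threshold and DRR weight, so the last of the eight tins ends up
	 * at (7/8)^7, roughly 39% of the base rate.
	 */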
2318 	for (i = 0; i < q->tin_cnt; i++) {
2319 		struct cake_tin_data *b = &q->tins[i];
2320 
2321 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2322 			      us_to_ns(q->interval));
2323 
2324 		b->tin_quantum = max_t(u16, 1U, quantum);
2325 
2326 		/* calculate next class's parameters */
2327 		rate  *= 7;
2328 		rate >>= 3;
2329 
2330 		quantum  *= 7;
2331 		quantum >>= 3;
2332 	}
2333 
2334 	return 0;
2335 }
2336 
2337 /*	List of known Diffserv codepoints:
2338  *
2339  *	Default Forwarding (DF/CS0) - Best Effort
2340  *	Max Throughput (TOS2)
2341  *	Min Delay (TOS4)
2342  *	LLT "La" (TOS5)
2343  *	Assured Forwarding 1 (AF1x) - x3
2344  *	Assured Forwarding 2 (AF2x) - x3
2345  *	Assured Forwarding 3 (AF3x) - x3
2346  *	Assured Forwarding 4 (AF4x) - x3
2347  *	Precedence Class 1 (CS1)
2348  *	Precedence Class 2 (CS2)
2349  *	Precedence Class 3 (CS3)
2350  *	Precedence Class 4 (CS4)
2351  *	Precedence Class 5 (CS5)
2352  *	Precedence Class 6 (CS6)
2353  *	Precedence Class 7 (CS7)
2354  *	Voice Admit (VA)
2355  *	Expedited Forwarding (EF)
2356  *	Lower Effort (LE)
2357  *
2358  *	Total 26 codepoints.
2359  */
2360 
2361 /*	List of traffic classes in RFC 4594, updated by RFC 8622:
2362  *		(roughly descending order of contended priority)
2363  *		(roughly ascending order of uncontended throughput)
2364  *
2365  *	Network Control (CS6,CS7)      - routing traffic
2366  *	Telephony (EF,VA)              - aka. VoIP streams
2367  *	Signalling (CS5)               - VoIP setup
2368  *	Multimedia Conferencing (AF4x) - aka. video calls
2369  *	Realtime Interactive (CS4)     - eg. games
2370  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2371  *	Broadcast Video (CS3)
2372  *	Low-Latency Data (AF2x,TOS4)      - eg. database
2373  *	Ops, Admin, Management (CS2)      - eg. ssh
2374  *	Standard Service (DF & unrecognised codepoints)
2375  *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2376  *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2377  *
2378  *	Total 12 traffic classes.
2379  */
2380 
2381 static int cake_config_diffserv8(struct Qdisc *sch)
2382 {
2383 /*	Pruned list of traffic classes for typical applications:
2384  *
2385  *		Network Control          (CS6, CS7)
2386  *		Minimum Latency          (EF, VA, CS5, CS4)
2387  *		Interactive Shell        (CS2)
2388  *		Low Latency Transactions (AF2x, TOS4)
2389  *		Video Streaming          (AF4x, AF3x, CS3)
2390  *		Bog Standard             (DF etc.)
2391  *		High Throughput          (AF1x, TOS2, CS1)
2392  *		Background Traffic       (LE)
2393  *
2394  *		Total 8 traffic classes.
2395  */
2396 
2397 	struct cake_sched_data *q = qdisc_priv(sch);
2398 	u32 mtu = psched_mtu(qdisc_dev(sch));
2399 	u64 rate = q->rate_bps;
2400 	u32 quantum = 256;
2401 	u32 i;
2402 
2403 	q->tin_cnt = 8;
2404 
2405 	/* codepoint to class mapping */
2406 	q->tin_index = diffserv8;
2407 	q->tin_order = normal_order;
2408 
2409 	/* class characteristics */
2410 	for (i = 0; i < q->tin_cnt; i++) {
2411 		struct cake_tin_data *b = &q->tins[i];
2412 
2413 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2414 			      us_to_ns(q->interval));
2415 
2416 		b->tin_quantum = max_t(u16, 1U, quantum);
2417 
2418 		/* calculate next class's parameters */
2419 		rate  *= 7;
2420 		rate >>= 3;
2421 
2422 		quantum  *= 7;
2423 		quantum >>= 3;
2424 	}
2425 
2426 	return 0;
2427 }
2428 
2429 static int cake_config_diffserv4(struct Qdisc *sch)
2430 {
2431 /*  Further pruned list of traffic classes for four-class system:
2432  *
2433  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2434  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2435  *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2436  *	    Background Traffic (LE, CS1)
2437  *
2438  *		Total 4 traffic classes.
2439  */
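/*	The rate shifts below give these tins thresholds of roughly 100%
 *	(Best Effort), 6.25% (Background Traffic), 50% (Streaming Media) and
 *	25% (Latency Sensitive) of the base rate, with matching DRR weights.
 */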
2440 
2441 	struct cake_sched_data *q = qdisc_priv(sch);
2442 	u32 mtu = psched_mtu(qdisc_dev(sch));
2443 	u64 rate = q->rate_bps;
2444 	u32 quantum = 1024;
2445 
2446 	q->tin_cnt = 4;
2447 
2448 	/* codepoint to class mapping */
2449 	q->tin_index = diffserv4;
2450 	q->tin_order = bulk_order;
2451 
2452 	/* class characteristics */
2453 	cake_set_rate(&q->tins[0], rate, mtu,
2454 		      us_to_ns(q->target), us_to_ns(q->interval));
2455 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2456 		      us_to_ns(q->target), us_to_ns(q->interval));
2457 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2458 		      us_to_ns(q->target), us_to_ns(q->interval));
2459 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2460 		      us_to_ns(q->target), us_to_ns(q->interval));
2461 
2462 	/* bandwidth-sharing weights */
2463 	q->tins[0].tin_quantum = quantum;
2464 	q->tins[1].tin_quantum = quantum >> 4;
2465 	q->tins[2].tin_quantum = quantum >> 1;
2466 	q->tins[3].tin_quantum = quantum >> 2;
2467 
2468 	return 0;
2469 }
2470 
2471 static int cake_config_diffserv3(struct Qdisc *sch)
2472 {
2473 /*  Simplified Diffserv structure with 3 tins.
2474  *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2475  *		Best Effort
2476  *		Low Priority		(LE, CS1)
2477  */
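/*	The rate shifts below give Best Effort the full base rate, Low
 *	Priority 1/16 (6.25%) and Latency Sensitive 1/4 (25%), with matching
 *	DRR weights.
 */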
2478 	struct cake_sched_data *q = qdisc_priv(sch);
2479 	u32 mtu = psched_mtu(qdisc_dev(sch));
2480 	u64 rate = q->rate_bps;
2481 	u32 quantum = 1024;
2482 
2483 	q->tin_cnt = 3;
2484 
2485 	/* codepoint to class mapping */
2486 	q->tin_index = diffserv3;
2487 	q->tin_order = bulk_order;
2488 
2489 	/* class characteristics */
2490 	cake_set_rate(&q->tins[0], rate, mtu,
2491 		      us_to_ns(q->target), us_to_ns(q->interval));
2492 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2493 		      us_to_ns(q->target), us_to_ns(q->interval));
2494 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2495 		      us_to_ns(q->target), us_to_ns(q->interval));
2496 
2497 	/* bandwidth-sharing weights */
2498 	q->tins[0].tin_quantum = quantum;
2499 	q->tins[1].tin_quantum = quantum >> 4;
2500 	q->tins[2].tin_quantum = quantum >> 2;
2501 
2502 	return 0;
2503 }
2504 
2505 static void cake_reconfigure(struct Qdisc *sch)
2506 {
2507 	struct cake_sched_data *q = qdisc_priv(sch);
2508 	int c, ft;
2509 
2510 	switch (q->tin_mode) {
2511 	case CAKE_DIFFSERV_BESTEFFORT:
2512 		ft = cake_config_besteffort(sch);
2513 		break;
2514 
2515 	case CAKE_DIFFSERV_PRECEDENCE:
2516 		ft = cake_config_precedence(sch);
2517 		break;
2518 
2519 	case CAKE_DIFFSERV_DIFFSERV8:
2520 		ft = cake_config_diffserv8(sch);
2521 		break;
2522 
2523 	case CAKE_DIFFSERV_DIFFSERV4:
2524 		ft = cake_config_diffserv4(sch);
2525 		break;
2526 
2527 	case CAKE_DIFFSERV_DIFFSERV3:
2528 	default:
2529 		ft = cake_config_diffserv3(sch);
2530 		break;
2531 	}
2532 
2533 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2534 		cake_clear_tin(sch, c);
2535 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2536 	}
2537 
2538 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2539 	q->rate_shft = q->tins[ft].tin_rate_shft;
2540 
2541 	if (q->buffer_config_limit) {
2542 		q->buffer_limit = q->buffer_config_limit;
2543 	} else if (q->rate_bps) {
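		/* Default to four times the rate-interval product in bytes,
		 * with a 4 MiB floor: e.g. 1,250,000 bytes/s over the default
		 * 100 ms interval gives 500 kB, so the floor applies.
		 */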
2544 		u64 t = q->rate_bps * q->interval;
2545 
2546 		do_div(t, USEC_PER_SEC / 4);
2547 		q->buffer_limit = max_t(u32, t, 4U << 20);
2548 	} else {
2549 		q->buffer_limit = ~0;
2550 	}
2551 
2552 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2553 
2554 	q->buffer_limit = min(q->buffer_limit,
2555 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2556 				  q->buffer_config_limit));
2557 }
2558 
2559 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2560 		       struct netlink_ext_ack *extack)
2561 {
2562 	struct cake_sched_data *q = qdisc_priv(sch);
2563 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2564 	u16 rate_flags;
2565 	u8 flow_mode;
2566 	int err;
2567 
2568 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2569 					  extack);
2570 	if (err < 0)
2571 		return err;
2572 
2573 	flow_mode = q->flow_mode;
2574 	if (tb[TCA_CAKE_NAT]) {
2575 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2576 		flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2577 		flow_mode |= CAKE_FLOW_NAT_FLAG *
2578 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2579 #else
2580 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2581 				    "No conntrack support in kernel");
2582 		return -EOPNOTSUPP;
2583 #endif
2584 	}
2585 
2586 	if (tb[TCA_CAKE_BASE_RATE64])
2587 		WRITE_ONCE(q->rate_bps,
2588 			   nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));
2589 
2590 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2591 		WRITE_ONCE(q->tin_mode,
2592 			   nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));
2593 
2594 	rate_flags = q->rate_flags;
2595 	if (tb[TCA_CAKE_WASH]) {
2596 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2597 			rate_flags |= CAKE_FLAG_WASH;
2598 		else
2599 			rate_flags &= ~CAKE_FLAG_WASH;
2600 	}
2601 
2602 	if (tb[TCA_CAKE_FLOW_MODE])
2603 		flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) |
2604 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2605 					CAKE_FLOW_MASK));
2606 
2607 	if (tb[TCA_CAKE_ATM])
2608 		WRITE_ONCE(q->atm_mode,
2609 			   nla_get_u32(tb[TCA_CAKE_ATM]));
2610 
2611 	if (tb[TCA_CAKE_OVERHEAD]) {
2612 		WRITE_ONCE(q->rate_overhead,
2613 			   nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
2614 		rate_flags |= CAKE_FLAG_OVERHEAD;
2615 
2616 		q->max_netlen = 0;
2617 		q->max_adjlen = 0;
2618 		q->min_netlen = ~0;
2619 		q->min_adjlen = ~0;
2620 	}
2621 
2622 	if (tb[TCA_CAKE_RAW]) {
2623 		rate_flags &= ~CAKE_FLAG_OVERHEAD;
2624 
2625 		q->max_netlen = 0;
2626 		q->max_adjlen = 0;
2627 		q->min_netlen = ~0;
2628 		q->min_adjlen = ~0;
2629 	}
2630 
2631 	if (tb[TCA_CAKE_MPU])
2632 		WRITE_ONCE(q->rate_mpu,
2633 			   nla_get_u32(tb[TCA_CAKE_MPU]));
2634 
2635 	if (tb[TCA_CAKE_RTT]) {
2636 		u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2637 
2638 		WRITE_ONCE(q->interval, max(interval, 1U));
2639 	}
2640 
2641 	if (tb[TCA_CAKE_TARGET]) {
2642 		u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2643 
2644 		WRITE_ONCE(q->target, max(target, 1U));
2645 	}
2646 
2647 	if (tb[TCA_CAKE_AUTORATE]) {
2648 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2649 			rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2650 		else
2651 			rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2652 	}
2653 
2654 	if (tb[TCA_CAKE_INGRESS]) {
2655 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2656 			rate_flags |= CAKE_FLAG_INGRESS;
2657 		else
2658 			rate_flags &= ~CAKE_FLAG_INGRESS;
2659 	}
2660 
2661 	if (tb[TCA_CAKE_ACK_FILTER])
2662 		WRITE_ONCE(q->ack_filter,
2663 			   nla_get_u32(tb[TCA_CAKE_ACK_FILTER]));
2664 
2665 	if (tb[TCA_CAKE_MEMORY])
2666 		WRITE_ONCE(q->buffer_config_limit,
2667 			   nla_get_u32(tb[TCA_CAKE_MEMORY]));
2668 
2669 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2670 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2671 			rate_flags |= CAKE_FLAG_SPLIT_GSO;
2672 		else
2673 			rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2674 	}
2675 
2676 	if (tb[TCA_CAKE_FWMARK]) {
2677 		WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK]));
2678 		WRITE_ONCE(q->fwmark_shft,
2679 			   q->fwmark_mask ? __ffs(q->fwmark_mask) : 0);
2680 	}
2681 
2682 	WRITE_ONCE(q->rate_flags, rate_flags);
2683 	WRITE_ONCE(q->flow_mode, flow_mode);
2684 	if (q->tins) {
2685 		sch_tree_lock(sch);
2686 		cake_reconfigure(sch);
2687 		sch_tree_unlock(sch);
2688 	}
2689 
2690 	return 0;
2691 }
2692 
2693 static void cake_destroy(struct Qdisc *sch)
2694 {
2695 	struct cake_sched_data *q = qdisc_priv(sch);
2696 
2697 	qdisc_watchdog_cancel(&q->watchdog);
2698 	tcf_block_put(q->block);
2699 	kvfree(q->tins);
2700 }
2701 
2702 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2703 		     struct netlink_ext_ack *extack)
2704 {
2705 	struct cake_sched_data *q = qdisc_priv(sch);
2706 	int i, j, err;
2707 
2708 	sch->limit = 10240;
2709 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2710 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2711 
2712 	q->rate_bps = 0; /* unlimited by default */
2713 
2714 	q->interval = 100000; /* 100ms default */
2715 	q->target   =   5000; /* 5ms: codel RFC argues
2716 			       * for 5 to 10% of interval
2717 			       */
2718 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2719 	q->cur_tin = 0;
2720 	q->cur_flow  = 0;
2721 
2722 	qdisc_watchdog_init(&q->watchdog, sch);
2723 
2724 	if (opt) {
2725 		err = cake_change(sch, opt, extack);
2726 
2727 		if (err)
2728 			return err;
2729 	}
2730 
2731 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2732 	if (err)
2733 		return err;
2734 
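	/* Reciprocal table: quantum_div[n] is roughly 65535 / n, letting the
	 * enqueue and dequeue paths scale a quantum by 1/host_load with a
	 * multiply and shift instead of a division.  host_load is always at
	 * least 1, so entry 0 is only a guard value.
	 */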
2735 	quantum_div[0] = ~0;
2736 	for (i = 1; i <= CAKE_QUEUES; i++)
2737 		quantum_div[i] = 65535 / i;
2738 
2739 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2740 			   GFP_KERNEL);
2741 	if (!q->tins)
2742 		return -ENOMEM;
2743 
2744 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2745 		struct cake_tin_data *b = q->tins + i;
2746 
2747 		INIT_LIST_HEAD(&b->new_flows);
2748 		INIT_LIST_HEAD(&b->old_flows);
2749 		INIT_LIST_HEAD(&b->decaying_flows);
2750 		b->sparse_flow_count = 0;
2751 		b->bulk_flow_count = 0;
2752 		b->decaying_flow_count = 0;
2753 
2754 		for (j = 0; j < CAKE_QUEUES; j++) {
2755 			struct cake_flow *flow = b->flows + j;
2756 			u32 k = j * CAKE_MAX_TINS + i;
2757 
2758 			INIT_LIST_HEAD(&flow->flowchain);
2759 			cobalt_vars_init(&flow->cvars);
2760 
2761 			q->overflow_heap[k].t = i;
2762 			q->overflow_heap[k].b = j;
2763 			b->overflow_idx[j] = k;
2764 		}
2765 	}
2766 
2767 	cake_reconfigure(sch);
2768 	q->avg_peak_bandwidth = q->rate_bps;
2769 	q->min_netlen = ~0;
2770 	q->min_adjlen = ~0;
2771 	return 0;
2772 }
2773 
2774 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2775 {
2776 	struct cake_sched_data *q = qdisc_priv(sch);
2777 	struct nlattr *opts;
2778 	u16 rate_flags;
2779 	u8 flow_mode;
2780 
2781 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2782 	if (!opts)
2783 		goto nla_put_failure;
2784 
2785 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64,
2786 			      READ_ONCE(q->rate_bps), TCA_CAKE_PAD))
2787 		goto nla_put_failure;
2788 
2789 	flow_mode = READ_ONCE(q->flow_mode);
2790 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK))
2791 		goto nla_put_failure;
2792 
2793 	if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval)))
2794 		goto nla_put_failure;
2795 
2796 	if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target)))
2797 		goto nla_put_failure;
2798 
2799 	if (nla_put_u32(skb, TCA_CAKE_MEMORY,
2800 			READ_ONCE(q->buffer_config_limit)))
2801 		goto nla_put_failure;
2802 
2803 	rate_flags = READ_ONCE(q->rate_flags);
2804 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2805 			!!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2806 		goto nla_put_failure;
2807 
2808 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2809 			!!(rate_flags & CAKE_FLAG_INGRESS)))
2810 		goto nla_put_failure;
2811 
2812 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter)))
2813 		goto nla_put_failure;
2814 
2815 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2816 			!!(flow_mode & CAKE_FLOW_NAT_FLAG)))
2817 		goto nla_put_failure;
2818 
2819 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode)))
2820 		goto nla_put_failure;
2821 
2822 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2823 			!!(rate_flags & CAKE_FLAG_WASH)))
2824 		goto nla_put_failure;
2825 
2826 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead)))
2827 		goto nla_put_failure;
2828 
2829 	if (!(rate_flags & CAKE_FLAG_OVERHEAD))
2830 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2831 			goto nla_put_failure;
2832 
2833 	if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode)))
2834 		goto nla_put_failure;
2835 
2836 	if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu)))
2837 		goto nla_put_failure;
2838 
2839 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2840 			!!(rate_flags & CAKE_FLAG_SPLIT_GSO)))
2841 		goto nla_put_failure;
2842 
2843 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask)))
2844 		goto nla_put_failure;
2845 
2846 	return nla_nest_end(skb, opts);
2847 
2848 nla_put_failure:
2849 	return -1;
2850 }
2851 
2852 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2853 {
2854 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2855 	struct cake_sched_data *q = qdisc_priv(sch);
2856 	struct nlattr *tstats, *ts;
2857 	int i;
2858 
2859 	if (!stats)
2860 		return -1;
2861 
2862 #define PUT_STAT_U32(attr, data) do {				       \
2863 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2864 			goto nla_put_failure;			       \
2865 	} while (0)
2866 #define PUT_STAT_U64(attr, data) do {				       \
2867 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2868 					data, TCA_CAKE_STATS_PAD)) \
2869 			goto nla_put_failure;			       \
2870 	} while (0)
2871 
2872 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2873 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2874 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2875 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2876 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2877 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2878 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2879 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2880 
2881 #undef PUT_STAT_U32
2882 #undef PUT_STAT_U64
2883 
2884 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2885 	if (!tstats)
2886 		goto nla_put_failure;
2887 
2888 #define PUT_TSTAT_U32(attr, data) do {					\
2889 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2890 			goto nla_put_failure;				\
2891 	} while (0)
2892 #define PUT_TSTAT_U64(attr, data) do {					\
2893 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2894 					data, TCA_CAKE_TIN_STATS_PAD))	\
2895 			goto nla_put_failure;				\
2896 	} while (0)
2897 
2898 	for (i = 0; i < q->tin_cnt; i++) {
2899 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2900 
2901 		ts = nla_nest_start_noflag(d->skb, i + 1);
2902 		if (!ts)
2903 			goto nla_put_failure;
2904 
2905 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2906 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2907 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2908 
2909 		PUT_TSTAT_U32(TARGET_US,
2910 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2911 		PUT_TSTAT_U32(INTERVAL_US,
2912 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2913 
2914 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2915 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2916 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2917 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2918 
2919 		PUT_TSTAT_U32(PEAK_DELAY_US,
2920 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2921 		PUT_TSTAT_U32(AVG_DELAY_US,
2922 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2923 		PUT_TSTAT_U32(BASE_DELAY_US,
2924 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2925 
2926 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2927 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2928 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2929 
2930 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2931 					    b->decaying_flow_count);
2932 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2933 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2934 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2935 
2936 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2937 		nla_nest_end(d->skb, ts);
2938 	}
2939 
2940 #undef PUT_TSTAT_U32
2941 #undef PUT_TSTAT_U64
2942 
2943 	nla_nest_end(d->skb, tstats);
2944 	return nla_nest_end(d->skb, stats);
2945 
2946 nla_put_failure:
2947 	nla_nest_cancel(d->skb, stats);
2948 	return -1;
2949 }
2950 
2951 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2952 {
2953 	return NULL;
2954 }
2955 
2956 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2957 {
2958 	return 0;
2959 }
2960 
2961 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2962 			       u32 classid)
2963 {
2964 	return 0;
2965 }
2966 
2967 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2968 {
2969 }
2970 
2971 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2972 					struct netlink_ext_ack *extack)
2973 {
2974 	struct cake_sched_data *q = qdisc_priv(sch);
2975 
2976 	if (cl)
2977 		return NULL;
2978 	return q->block;
2979 }
2980 
2981 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2982 			   struct sk_buff *skb, struct tcmsg *tcm)
2983 {
2984 	tcm->tcm_handle |= TC_H_MIN(cl);
2985 	return 0;
2986 }
2987 
2988 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2989 				 struct gnet_dump *d)
2990 {
2991 	struct cake_sched_data *q = qdisc_priv(sch);
2992 	const struct cake_flow *flow = NULL;
2993 	struct gnet_stats_queue qs = { 0 };
2994 	struct nlattr *stats;
2995 	u32 idx = cl - 1;
2996 
2997 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2998 		const struct cake_tin_data *b =
2999 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
3000 		const struct sk_buff *skb;
3001 
3002 		flow = &b->flows[idx % CAKE_QUEUES];
3003 
3004 		if (flow->head) {
3005 			sch_tree_lock(sch);
3006 			skb = flow->head;
3007 			while (skb) {
3008 				qs.qlen++;
3009 				skb = skb->next;
3010 			}
3011 			sch_tree_unlock(sch);
3012 		}
3013 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3014 		qs.drops = flow->dropped;
3015 	}
3016 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3017 		return -1;
3018 	if (flow) {
3019 		ktime_t now = ktime_get();
3020 
3021 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3022 		if (!stats)
3023 			return -1;
3024 
3025 #define PUT_STAT_U32(attr, data) do {				       \
3026 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3027 			goto nla_put_failure;			       \
3028 	} while (0)
3029 #define PUT_STAT_S32(attr, data) do {				       \
3030 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3031 			goto nla_put_failure;			       \
3032 	} while (0)
3033 
3034 		PUT_STAT_S32(DEFICIT, flow->deficit);
3035 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3036 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3037 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3038 		if (flow->cvars.p_drop) {
3039 			PUT_STAT_S32(BLUE_TIMER_US,
3040 				     ktime_to_us(
3041 					     ktime_sub(now,
3042 						       flow->cvars.blue_timer)));
3043 		}
3044 		if (flow->cvars.dropping) {
3045 			PUT_STAT_S32(DROP_NEXT_US,
3046 				     ktime_to_us(
3047 					     ktime_sub(now,
3048 						       flow->cvars.drop_next)));
3049 		}
3050 
3051 		if (nla_nest_end(d->skb, stats) < 0)
3052 			return -1;
3053 	}
3054 
3055 	return 0;
3056 
3057 nla_put_failure:
3058 	nla_nest_cancel(d->skb, stats);
3059 	return -1;
3060 }
3061 
3062 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3063 {
3064 	struct cake_sched_data *q = qdisc_priv(sch);
3065 	unsigned int i, j;
3066 
3067 	if (arg->stop)
3068 		return;
3069 
3070 	for (i = 0; i < q->tin_cnt; i++) {
3071 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3072 
3073 		for (j = 0; j < CAKE_QUEUES; j++) {
3074 			if (list_empty(&b->flows[j].flowchain)) {
3075 				arg->count++;
3076 				continue;
3077 			}
3078 			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3079 						 arg))
3080 				break;
3081 		}
3082 	}
3083 }
3084 
3085 static const struct Qdisc_class_ops cake_class_ops = {
3086 	.leaf		=	cake_leaf,
3087 	.find		=	cake_find,
3088 	.tcf_block	=	cake_tcf_block,
3089 	.bind_tcf	=	cake_bind,
3090 	.unbind_tcf	=	cake_unbind,
3091 	.dump		=	cake_dump_class,
3092 	.dump_stats	=	cake_dump_class_stats,
3093 	.walk		=	cake_walk,
3094 };
3095 
3096 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3097 	.cl_ops		=	&cake_class_ops,
3098 	.id		=	"cake",
3099 	.priv_size	=	sizeof(struct cake_sched_data),
3100 	.enqueue	=	cake_enqueue,
3101 	.dequeue	=	cake_dequeue,
3102 	.peek		=	qdisc_peek_dequeued,
3103 	.init		=	cake_init,
3104 	.reset		=	cake_reset,
3105 	.destroy	=	cake_destroy,
3106 	.change		=	cake_change,
3107 	.dump		=	cake_dump,
3108 	.dump_stats	=	cake_dump_stats,
3109 	.owner		=	THIS_MODULE,
3110 };
3111 MODULE_ALIAS_NET_SCH("cake");
3112 
3113 static int __init cake_module_init(void)
3114 {
3115 	return register_qdisc(&cake_qdisc_ops);
3116 }
3117 
3118 static void __exit cake_module_exit(void)
3119 {
3120 	unregister_qdisc(&cake_qdisc_ops);
3121 }
3122 
3123 module_init(cake_module_init)
3124 module_exit(cake_module_exit)
3125 MODULE_AUTHOR("Jonathan Morton");
3126 MODULE_LICENSE("Dual BSD/GPL");
3127 MODULE_DESCRIPTION("The CAKE shaper.");
3128