xref: /linux/net/sched/sch_cake.c (revision d3b35898de024796c43415f9535fd0bc69cb8f1b)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/sch_priv.h>
71 #include <net/pkt_cls.h>
72 #include <net/tcp.h>
73 #include <net/flow_dissector.h>
74 
75 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
76 #include <net/netfilter/nf_conntrack_core.h>
77 #endif
78 
#define CAKE_SET_WAYS (8)	/* associativity of the set-associative flow hash */
#define CAKE_MAX_TINS (8)	/* maximum number of priority tins */
#define CAKE_QUEUES (1024)	/* flow queues per tin */
#define CAKE_FLOW_MASK 63	/* low bits of flow_mode select the keying scheme */
#define CAKE_FLOW_NAT_FLAG 64	/* flow_mode bit: rewrite keys from conntrack (NAT awareness) */
84 
/* struct cobalt_params - contains codel and blue parameters
 * @interval:	codel initial drop rate
 * @target:     maximum persistent sojourn time & blue update rate
 * @mtu_time:   serialisation delay of maximum-size packet
 * @p_inc:      increment of blue drop probability (0.32 fxp)
 * @p_dec:      decrement of blue drop probability (0.32 fxp)
 *
 * The 0.32 fixed-point values map the probability range [0, 1) onto the
 * full u32 range, so saturation at ~0 represents probability ~1.
 */
struct cobalt_params {
	u64	interval;
	u64	target;
	u64	mtu_time;
	u32	p_inc;
	u32	p_dec;
};
99 
/* struct cobalt_vars - contains codel and blue variables
 * @count:		codel dropping frequency
 * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
 * @drop_next:		time to drop next packet, or when we dropped last
 * @blue_timer:		Blue time to next drop
 * @p_drop:		BLUE drop probability (0.32 fxp)
 * @dropping:		set if in dropping state
 * @ecn_marked:		set if marked
 *
 * One instance is embedded in each cake_flow; all state is zeroed by
 * cobalt_vars_init().
 */
struct cobalt_vars {
	u32	count;
	u32	rec_inv_sqrt;
	ktime_t	drop_next;
	ktime_t	blue_timer;
	u32     p_drop;
	bool	dropping;
	bool    ecn_marked;
};
118 
/* Per-flow occupancy states, stored in cake_flow::set.  A flow moves from
 * SPARSE through SPARSE_WAIT to BULK as it accumulates backlog, and through
 * DECAYING back to NONE when it empties.
 */
enum {
	CAKE_SET_NONE = 0,
	CAKE_SET_SPARSE,
	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
	CAKE_SET_BULK,
	CAKE_SET_DECAYING
};
126 
/* Per-flow queue state: a singly-linked skb list plus DRR deficit and
 * COBALT AQM state.  Kept deliberately small as there are CAKE_QUEUES of
 * these per tin.
 */
struct cake_flow {
	/* this stuff is all needed per-flow at dequeue time */
	struct sk_buff	  *head;
	struct sk_buff	  *tail;
	struct list_head  flowchain;	/* link in new/old/decaying flow lists */
	s32		  deficit;	/* DRR byte deficit */
	u32		  dropped;
	struct cobalt_vars cvars;	/* per-flow AQM state */
	u16		  srchost; /* index into cake_host table */
	u16		  dsthost;
	u8		  set;		/* CAKE_SET_* occupancy state */
}; /* please try to keep this structure <= 64 bytes */
139 
/* Per-host accounting used by the dual/triple isolation modes: a tag to
 * identify the host hash occupying the slot, and the number of bulk flows
 * currently attributed to it in each direction.
 */
struct cake_host {
	u32 srchost_tag;
	u32 dsthost_tag;
	u16 srchost_bulk_flow_count;
	u16 dsthost_bulk_flow_count;
};
146 
/* Compact (tin, bulk-queue) index pair for the overflow heap: 3 bits of
 * tin (CAKE_MAX_TINS == 8) and 10 bits of queue (CAKE_QUEUES == 1024).
 */
struct cake_heap_entry {
	u16 t:3, b:10;
};
150 
/* Per-tin state: the flow queues with their AQM state, DRR bookkeeping for
 * scheduling both within and between tins, and statistics.
 */
struct cake_tin_data {
	struct cake_flow flows[CAKE_QUEUES];
	u32	backlogs[CAKE_QUEUES];	/* queued bytes per flow */
	u32	tags[CAKE_QUEUES]; /* for set association */
	u16	overflow_idx[CAKE_QUEUES];
	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
	u16	flow_quantum;		/* base DRR quantum per flow */

	struct cobalt_params cparams;	/* AQM parameters shared by this tin's flows */
	u32	drop_overlimit;
	u16	bulk_flow_count;
	u16	sparse_flow_count;
	u16	decaying_flow_count;
	u16	unresponsive_flow_count;

	u32	max_skblen;

	/* DRR flow lists, by occupancy state (see CAKE_SET_*) */
	struct list_head new_flows;
	struct list_head old_flows;
	struct list_head decaying_flows;

	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
	ktime_t	time_next_packet;
	u64	tin_rate_ns;
	u64	tin_rate_bps;
	u16	tin_rate_shft;

	u16	tin_quantum;		/* DRR quantum for inter-tin scheduling */
	s32	tin_deficit;
	u32	tin_backlog;
	u32	tin_dropped;
	u32	tin_ecn_mark;

	u32	packets;
	u64	bytes;

	u32	ack_drops;		/* drops attributed to the ACK filter */

	/* moving averages */
	u64 avge_delay;
	u64 peak_delay;
	u64 base_delay;

	/* hash function stats */
	u32	way_directs;
	u32	way_hits;
	u32	way_misses;
	u32	way_collisions;
}; /* number of tins is small, so size of this struct doesn't matter much */
200 
/* User-visible configuration, kept separate from the per-instance runtime
 * state.  NOTE(review): @is_shared and @sync_time suggest this struct can be
 * shared between multiple qdisc instances (mq setups) — confirm against the
 * code that allocates it, which is not visible in this chunk.
 */
struct cake_sched_config {
	u64		rate_bps;	/* configured shaper rate */
	u64		interval;	/* codel interval (ns) */
	u64		target;		/* codel target (ns) */
	u64		sync_time;
	u32		buffer_config_limit;
	u32		fwmark_mask;	/* mask applied to skb->mark for tin selection */
	u16		fwmark_shft;
	s16		rate_overhead;	/* signed per-packet overhead adjustment */
	u16		rate_mpu;	/* minimum packet unit after adjustment */
	u16		rate_flags;	/* CAKE_FLAG_* bits */
	u8		tin_mode;
	u8		flow_mode;
	u8		atm_mode;
	u8		ack_filter;
	u8		is_shared;
};
218 
/* Per-qdisc-instance runtime state. */
struct cake_sched_data {
	struct tcf_proto __rcu *filter_list; /* optional external classifier */
	struct tcf_block *block;
	struct cake_tin_data *tins;	/* array of tin_cnt tins */
	struct cake_sched_config *config;

	/* heap of the most-backlogged queues, for overflow dropping */
	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];

	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
	ktime_t		time_next_packet;
	ktime_t		failsafe_next_packet;
	u64		rate_ns;
	u16		rate_shft;
	u16		overflow_timeout;
	u16		tin_cnt;

	/* resource tracking */
	u32		buffer_used;
	u32		buffer_max_used;
	u32		buffer_limit;

	/* indices for dequeue */
	u16		cur_tin;
	u16		cur_flow;

	struct qdisc_watchdog watchdog;
	const u8	*tin_index;	/* DSCP -> tin lookup (one of the tables below) */
	const u8	*tin_order;	/* tin priority order for stats dumping */

	/* bandwidth capacity estimate */
	ktime_t		last_packet_time;
	ktime_t		avg_window_begin;
	u64		avg_packet_interval;
	u64		avg_window_bytes;
	u64		avg_peak_bandwidth;
	ktime_t		last_reconfig_time;

	/* packet length stats */
	u32		avg_netoff;
	u16		max_netlen;
	u16		max_adjlen;
	u16		min_netlen;
	u16		min_adjlen;

	/* mq sync state -- NOTE(review): presumably used when config->is_shared
	 * is set to coordinate sibling queues; confirm against code below.
	 */
	u64		last_checked_active;
	u64		last_active;
	u32		active_queues;
};
268 
/* Bit flags, presumably carried in cake_sched_config::rate_flags (the only
 * *_flags field in this chunk) -- confirm against the option parser.
 */
enum {
	CAKE_FLAG_OVERHEAD	   = BIT(0),
	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
	CAKE_FLAG_INGRESS	   = BIT(2),
	CAKE_FLAG_WASH		   = BIT(3),
	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
};
276 
/* COBALT operates the Codel and BLUE algorithms in parallel, in order to
 * obtain the best features of each.  Codel is excellent on flows which
 * respond to congestion signals in a TCP-like way.  BLUE is more effective on
 * unresponsive flows.
 */

/* Per-skb state stored in the qdisc control block: the enqueue timestamp
 * (for sojourn-time measurement) and the overhead-adjusted length.
 */
struct cobalt_skb_cb {
	ktime_t enqueue_time;
	u32     adjusted_len;
};
287 
288 static u64 us_to_ns(u64 us)
289 {
290 	return us * NSEC_PER_USEC;
291 }
292 
/* Return the COBALT control block embedded in the skb's qdisc cb area,
 * after validating that it fits.
 */
static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
}
298 
299 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
300 {
301 	return get_cobalt_cb(skb)->enqueue_time;
302 }
303 
304 static void cobalt_set_enqueue_time(struct sk_buff *skb,
305 				    ktime_t now)
306 {
307 	get_cobalt_cb(skb)->enqueue_time = now;
308 }
309 
310 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
311 
/* Diffserv lookup tables */

/* Map each of the 64 DSCP values to a tin equal to its top three bits
 * (the legacy IP Precedence field), yielding eight tins.
 */
static const u8 precedence[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1,
	2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 4, 4, 4,
	5, 5, 5, 5, 5, 5, 5, 5,
	6, 6, 6, 6, 6, 6, 6, 6,
	7, 7, 7, 7, 7, 7, 7, 7,
};
324 
/* DSCP -> tin map for the eight-tin diffserv scheme; unlisted codepoints
 * fall into tin 2, the most common value in the table.
 */
static const u8 diffserv8[] = {
	2, 0, 1, 2, 4, 2, 2, 2,
	1, 2, 1, 2, 1, 2, 1, 2,
	5, 2, 4, 2, 4, 2, 4, 2,
	3, 2, 3, 2, 3, 2, 3, 2,
	6, 2, 3, 2, 3, 2, 3, 2,
	6, 2, 2, 2, 6, 2, 6, 2,
	7, 2, 2, 2, 2, 2, 2, 2,
	7, 2, 2, 2, 2, 2, 2, 2,
};
335 
/* DSCP -> tin map for the four-tin diffserv scheme; tin 0 is the default. */
static const u8 diffserv4[] = {
	0, 1, 0, 0, 2, 0, 0, 0,
	1, 0, 0, 0, 0, 0, 0, 0,
	2, 0, 2, 0, 2, 0, 2, 0,
	2, 0, 2, 0, 2, 0, 2, 0,
	3, 0, 2, 0, 2, 0, 2, 0,
	3, 0, 0, 0, 3, 0, 3, 0,
	3, 0, 0, 0, 0, 0, 0, 0,
	3, 0, 0, 0, 0, 0, 0, 0,
};
346 
/* DSCP -> tin map for the three-tin diffserv scheme; tin 0 is the default. */
static const u8 diffserv3[] = {
	0, 1, 0, 0, 2, 0, 0, 0,
	1, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 2, 0, 2, 0,
	2, 0, 0, 0, 0, 0, 0, 0,
	2, 0, 0, 0, 0, 0, 0, 0,
};
357 
/* Degenerate map for single-tin (best-effort) mode: every DSCP -> tin 0. */
static const u8 besteffort[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
};
368 
/* tin priority order for stats dumping */

static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
/* diffserv4/diffserv3 place the background tin first; swap it after
 * best-effort when dumping stats
 */
static const u8 bulk_order[] = {1, 0, 2, 3};
373 
/* There is a big difference in timing between the accurate values placed in the
 * cache and the approximations given by a single Newton step for small count
 * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
 * these values are calculated using eight Newton steps, using the
 * implementation below. Above 16, a single Newton step gives sufficient
 * accuracy in either direction, given the precision stored.
 *
 * The magnitude of the error when stepping up to count 2 is such as to give the
 * value that *should* have been produced at count 4.
 */

#define REC_INV_SQRT_CACHE (16)
/* Q0.32 values of 1/sqrt(count) for count 0..15; entries 0 and 1 are both
 * ~0 (as close to 1.0 as representable).
 */
static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
		~0,         ~0, 3037000500, 2479700525,
	2147483647, 1920767767, 1753413056, 1623345051,
	1518500250, 1431655765, 1358187914, 1294981364,
	1239850263, 1191209601, 1147878294, 1108955788
};
392 
/* Forward declaration; cake_set_rate() is defined later in the file (its
 * definition is not visible in this chunk).
 */
static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
			  u64 target_ns, u64 rtt_est_ns);
/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
 *
 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
 */

/* Perform one Newton-Raphson refinement of rec_inv_sqrt towards
 * 1/sqrt(count), entirely in Q0.32 fixed point.
 */
static void cobalt_newton_step(struct cobalt_vars *vars)
{
	u32 invsqrt, invsqrt2;
	u64 val;

	invsqrt = vars->rec_inv_sqrt;
	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;	/* invsqrt^2 in Q0.32 */
	val = (3LL << 32) - ((u64)vars->count * invsqrt2);

	val >>= 2; /* avoid overflow in following multiply */
	val = (val * invsqrt) >> (32 - 2 + 1);	/* restore scale, divide by 2 */

	vars->rec_inv_sqrt = val;
}
415 
416 static void cobalt_invsqrt(struct cobalt_vars *vars)
417 {
418 	if (vars->count < REC_INV_SQRT_CACHE)
419 		vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
420 	else
421 		cobalt_newton_step(vars);
422 }
423 
/* Reset all COBALT state (count, probabilities, timers) to zero. */
static void cobalt_vars_init(struct cobalt_vars *vars)
{
	memset(vars, 0, sizeof(*vars));
}
428 
429 /* CoDel control_law is t + interval/sqrt(count)
430  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
431  * both sqrt() and divide operation.
432  */
433 static ktime_t cobalt_control(ktime_t t,
434 			      u64 interval,
435 			      u32 rec_inv_sqrt)
436 {
437 	return ktime_add_ns(t, reciprocal_scale(interval,
438 						rec_inv_sqrt));
439 }
440 
/* Call this when a packet had to be dropped due to queue overflow.  Returns
 * true if the BLUE state was quiescent before but active after this call.
 */
static bool cobalt_queue_full(struct cobalt_vars *vars,
			      struct cobalt_params *p,
			      ktime_t now)
{
	bool up = false;

	/* rate-limit BLUE probability increases to one per target period */
	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
		up = !vars->p_drop;
		vars->p_drop += p->p_inc;
		if (vars->p_drop < p->p_inc)
			vars->p_drop = ~0;	/* saturate on wrap-around */
		vars->blue_timer = now;
	}
	/* an overflow also puts Codel straight into dropping state */
	vars->dropping = true;
	vars->drop_next = now;
	if (!vars->count)
		vars->count = 1;

	return up;
}
464 
/* Call this when the queue was serviced but turned out to be empty.  Returns
 * true if the BLUE state was active before but quiescent after this call.
 */
static bool cobalt_queue_empty(struct cobalt_vars *vars,
			       struct cobalt_params *p,
			       ktime_t now)
{
	bool down = false;

	/* decay BLUE's drop probability, at most once per target period */
	if (vars->p_drop &&
	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
		if (vars->p_drop < p->p_dec)
			vars->p_drop = 0;
		else
			vars->p_drop -= p->p_dec;
		vars->blue_timer = now;
		down = !vars->p_drop;
	}
	vars->dropping = false;

	/* let Codel's count relax once its next scheduled drop has passed */
	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
		vars->count--;
		cobalt_invsqrt(vars);
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
	}

	return down;
}
495 
/* Call this with a freshly dequeued packet for possible congestion marking.
 * Returns true as an instruction to drop the packet, false for delivery.
 * (Despite the old wording above, the return type is now a drop reason:
 * SKB_NOT_DROPPED_YET means deliver.)
 */
static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
					       struct cobalt_params *p,
					       ktime_t now,
					       struct sk_buff *skb,
					       u32 bulk_flows)
{
	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
	bool next_due, over_target;
	ktime_t schedule;
	u64 sojourn;

/* The 'schedule' variable records, in its sign, whether 'now' is before or
 * after 'drop_next'.  This allows 'drop_next' to be updated before the next
 * scheduling decision is actually branched, without destroying that
 * information.  Similarly, the first 'schedule' value calculated is preserved
 * in the boolean 'next_due'.
 *
 * As for 'drop_next', we take advantage of the fact that 'interval' is both
 * the delay between first exceeding 'target' and the first signalling event,
 * *and* the scaling factor for the signalling frequency.  It's therefore very
 * natural to use a single mechanism for both purposes, and eliminates a
 * significant amount of reference Codel's spaghetti code.  To help with this,
 * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
 * as possible to 1.0 in fixed-point.
 */

	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
	schedule = ktime_sub(now, vars->drop_next);
	/* only count as over-target when the sojourn also exceeds the
	 * serialisation delay of the current bulk backlog, to avoid
	 * signalling on pure line-rate queuing
	 */
	over_target = sojourn > p->target &&
		      sojourn > p->mtu_time * bulk_flows * 2 &&
		      sojourn > p->mtu_time * 4;
	next_due = vars->count && ktime_to_ns(schedule) >= 0;

	vars->ecn_marked = false;

	if (over_target) {
		if (!vars->dropping) {
			vars->dropping = true;
			vars->drop_next = cobalt_control(now,
							 p->interval,
							 vars->rec_inv_sqrt);
		}
		if (!vars->count)
			vars->count = 1;
	} else if (vars->dropping) {
		vars->dropping = false;
	}

	if (next_due && vars->dropping) {
		/* Use ECN mark if possible, otherwise drop */
		if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
			reason = SKB_DROP_REASON_QDISC_CONGESTED;

		vars->count++;
		if (!vars->count)	/* saturate instead of wrapping to 0 */
			vars->count--;
		cobalt_invsqrt(vars);
		vars->drop_next = cobalt_control(vars->drop_next,
						 p->interval,
						 vars->rec_inv_sqrt);
		schedule = ktime_sub(now, vars->drop_next);
	} else {
		/* not dropping: unwind count for each missed signalling slot */
		while (next_due) {
			vars->count--;
			cobalt_invsqrt(vars);
			vars->drop_next = cobalt_control(vars->drop_next,
							 p->interval,
							 vars->rec_inv_sqrt);
			schedule = ktime_sub(now, vars->drop_next);
			next_due = vars->count && ktime_to_ns(schedule) >= 0;
		}
	}

	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
	if (vars->p_drop && reason == SKB_NOT_DROPPED_YET &&
	    get_random_u32() < vars->p_drop)
		reason = SKB_DROP_REASON_CAKE_FLOOD;

	/* Overload the drop_next field as an activity timeout */
	if (!vars->count)
		vars->drop_next = ktime_add_ns(now, p->interval);
	else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET)
		vars->drop_next = now;

	return reason;
}
585 
586 static bool cake_update_flowkeys(struct flow_keys *keys,
587 				 const struct sk_buff *skb)
588 {
589 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
590 	struct nf_conntrack_tuple tuple = {};
591 	bool rev = !skb->_nfct, upd = false;
592 	__be32 ip;
593 
594 	if (skb_protocol(skb, true) != htons(ETH_P_IP))
595 		return false;
596 
597 	if (!nf_ct_get_tuple_skb(&tuple, skb))
598 		return false;
599 
600 	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
601 	if (ip != keys->addrs.v4addrs.src) {
602 		keys->addrs.v4addrs.src = ip;
603 		upd = true;
604 	}
605 	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
606 	if (ip != keys->addrs.v4addrs.dst) {
607 		keys->addrs.v4addrs.dst = ip;
608 		upd = true;
609 	}
610 
611 	if (keys->ports.ports) {
612 		__be16 port;
613 
614 		port = rev ? tuple.dst.u.all : tuple.src.u.all;
615 		if (port != keys->ports.src) {
616 			keys->ports.src = port;
617 			upd = true;
618 		}
619 		port = rev ? tuple.src.u.all : tuple.dst.u.all;
620 		if (port != keys->ports.dst) {
621 			port = keys->ports.dst;
622 			upd = true;
623 		}
624 	}
625 	return upd;
626 #else
627 	return false;
628 #endif
629 }
630 
631 /* Cake has several subtle multiple bit settings. In these cases you
632  *  would be matching triple isolate mode as well.
633  */
634 
635 static bool cake_dsrc(int flow_mode)
636 {
637 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
638 }
639 
640 static bool cake_ddst(int flow_mode)
641 {
642 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
643 }
644 
/* Decrement the bulk-flow refcount of @flow's source host, if dual-src
 * isolation is active and the count is non-zero (guards against underflow).
 */
static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
					     struct cake_flow *flow,
					     int flow_mode)
{
	if (likely(cake_dsrc(flow_mode) &&
		   q->hosts[flow->srchost].srchost_bulk_flow_count))
		q->hosts[flow->srchost].srchost_bulk_flow_count--;
}
653 
/* Increment the bulk-flow refcount of @flow's source host, if dual-src
 * isolation is active; capped at CAKE_QUEUES to prevent overflow.
 */
static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
					     struct cake_flow *flow,
					     int flow_mode)
{
	if (likely(cake_dsrc(flow_mode) &&
		   q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
		q->hosts[flow->srchost].srchost_bulk_flow_count++;
}
662 
/* Decrement the bulk-flow refcount of @flow's destination host, if dual-dst
 * isolation is active and the count is non-zero (guards against underflow).
 */
static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
					     struct cake_flow *flow,
					     int flow_mode)
{
	if (likely(cake_ddst(flow_mode) &&
		   q->hosts[flow->dsthost].dsthost_bulk_flow_count))
		q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
}
671 
/* Increment the bulk-flow refcount of @flow's destination host, if dual-dst
 * isolation is active; capped at CAKE_QUEUES to prevent overflow.
 */
static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
					     struct cake_flow *flow,
					     int flow_mode)
{
	if (likely(cake_ddst(flow_mode) &&
		   q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
		q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
}
680 
/* Compute @flow's effective DRR quantum: the tin's base quantum divided by
 * the largest bulk-flow count among the hosts this flow belongs to, so
 * hosts with many flows don't get more than their share.
 */
static u16 cake_get_flow_quantum(struct cake_tin_data *q,
				 struct cake_flow *flow,
				 int flow_mode)
{
	u16 host_load = 1;

	if (cake_dsrc(flow_mode))
		host_load = max(host_load,
				q->hosts[flow->srchost].srchost_bulk_flow_count);

	if (cake_ddst(flow_mode))
		host_load = max(host_load,
				q->hosts[flow->dsthost].dsthost_bulk_flow_count);

	/* The get_random_u16() is a way to apply dithering to avoid
	 * accumulating roundoff errors
	 */
	return (q->flow_quantum * quantum_div[host_load] +
		get_random_u16()) >> 16;
}
701 
/* Map an skb to a flow-queue index within tin @q.  Computes separate flow,
 * source-host and destination-host hashes (or takes the 1-based overrides),
 * then resolves the flow hash to a queue via set-associative lookup,
 * updating the per-host bulk-flow refcounts when a queue changes owner.
 */
static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
		     int flow_mode, u16 flow_override, u16 host_override)
{
	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
	u16 reduced_hash, srchost_idx, dsthost_idx;
	struct flow_keys keys, host_keys;
	bool use_skbhash = skb->l4_hash;

	if (unlikely(flow_mode == CAKE_FLOW_NONE))
		return 0;

	/* If both overrides are set, or we can use the SKB hash and nat mode is
	 * disabled, we can skip packet dissection entirely. If nat mode is
	 * enabled there's another check below after doing the conntrack lookup.
	 */
	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
		goto skip_hash;

	skb_flow_dissect_flow_keys(skb, &keys,
				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

	/* Don't use the SKB hash if we change the lookup keys from conntrack */
	if (nat_enabled && cake_update_flowkeys(&keys, skb))
		use_skbhash = false;

	/* If we can still use the SKB hash and don't need the host hash, we can
	 * skip the rest of the hashing procedure
	 */
	if (use_skbhash && !hash_hosts)
		goto skip_hash;

	/* flow_hash_from_keys() sorts the addresses by value, so we have
	 * to preserve their order in a separate data structure to treat
	 * src and dst host addresses as independently selectable.
	 */
	host_keys = keys;
	host_keys.ports.ports     = 0;
	host_keys.basic.ip_proto  = 0;
	host_keys.keyid.keyid     = 0;
	host_keys.tags.flow_label = 0;

	/* zero out one address at a time so each host hash depends on only
	 * the other endpoint
	 */
	switch (host_keys.control.addr_type) {
	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
		host_keys.addrs.v4addrs.src = 0;
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
		host_keys.addrs.v4addrs.dst = 0;
		srchost_hash = flow_hash_from_keys(&host_keys);
		break;

	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
		memset(&host_keys.addrs.v6addrs.src, 0,
		       sizeof(host_keys.addrs.v6addrs.src));
		dsthost_hash = flow_hash_from_keys(&host_keys);
		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
		memset(&host_keys.addrs.v6addrs.dst, 0,
		       sizeof(host_keys.addrs.v6addrs.dst));
		srchost_hash = flow_hash_from_keys(&host_keys);
		break;

	default:
		dsthost_hash = 0;
		srchost_hash = 0;
	}

	/* This *must* be after the above switch, since as a
	 * side-effect it sorts the src and dst addresses.
	 */
	if (hash_flows && !use_skbhash)
		flow_hash = flow_hash_from_keys(&keys);

skip_hash:
	if (flow_override)
		flow_hash = flow_override - 1;	/* overrides are 1-based */
	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
		flow_hash = skb->hash;
	if (host_override) {
		dsthost_hash = host_override - 1;
		srchost_hash = host_override - 1;
	}

	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
		if (flow_mode & CAKE_FLOW_SRC_IP)
			flow_hash ^= srchost_hash;

		if (flow_mode & CAKE_FLOW_DST_IP)
			flow_hash ^= dsthost_hash;
	}

	reduced_hash = flow_hash % CAKE_QUEUES;

	/* set-associative hashing */
	/* fast path if no hash collision (direct lookup succeeds) */
	if (likely(q->tags[reduced_hash] == flow_hash &&
		   q->flows[reduced_hash].set)) {
		q->way_directs++;
	} else {
		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
		u32 outer_hash = reduced_hash - inner_hash;
		bool allocate_src = false;
		bool allocate_dst = false;
		u32 i, k;

		/* check if any active queue in the set is reserved for
		 * this flow.
		 */
		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
		     i++, k = (k + 1) % CAKE_SET_WAYS) {
			if (q->tags[outer_hash + k] == flow_hash) {
				if (i)
					q->way_hits++;

				if (!q->flows[outer_hash + k].set) {
					/* need to increment host refcnts */
					allocate_src = cake_dsrc(flow_mode);
					allocate_dst = cake_ddst(flow_mode);
				}

				goto found;
			}
		}

		/* no queue is reserved for this flow, look for an
		 * empty one.  (k deliberately continues from the loop above.)
		 */
		for (i = 0; i < CAKE_SET_WAYS;
			 i++, k = (k + 1) % CAKE_SET_WAYS) {
			if (!q->flows[outer_hash + k].set) {
				q->way_misses++;
				allocate_src = cake_dsrc(flow_mode);
				allocate_dst = cake_ddst(flow_mode);
				goto found;
			}
		}

		/* With no empty queues, default to the original
		 * queue, accept the collision, update the host tags.
		 */
		q->way_collisions++;
		allocate_src = cake_dsrc(flow_mode);
		allocate_dst = cake_ddst(flow_mode);

		/* the evicted flow's hosts lose a bulk-flow reference */
		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
			cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
			cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
		}
found:
		/* reserve queue for future packets in same flow */
		reduced_hash = outer_hash + k;
		q->tags[reduced_hash] = flow_hash;

		if (allocate_src) {
			/* same set-associative scheme for the src host table */
			srchost_idx = srchost_hash % CAKE_QUEUES;
			inner_hash = srchost_idx % CAKE_SET_WAYS;
			outer_hash = srchost_idx - inner_hash;
			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
				i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (q->hosts[outer_hash + k].srchost_tag ==
				    srchost_hash)
					goto found_src;
			}
			for (i = 0; i < CAKE_SET_WAYS;
				i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
					break;
			}
			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
found_src:
			srchost_idx = outer_hash + k;
			q->flows[reduced_hash].srchost = srchost_idx;

			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
				cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
		}

		if (allocate_dst) {
			/* same set-associative scheme for the dst host table */
			dsthost_idx = dsthost_hash % CAKE_QUEUES;
			inner_hash = dsthost_idx % CAKE_SET_WAYS;
			outer_hash = dsthost_idx - inner_hash;
			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (q->hosts[outer_hash + k].dsthost_tag ==
				    dsthost_hash)
					goto found_dst;
			}
			for (i = 0; i < CAKE_SET_WAYS;
			     i++, k = (k + 1) % CAKE_SET_WAYS) {
				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
					break;
			}
			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
found_dst:
			dsthost_idx = outer_hash + k;
			q->flows[reduced_hash].dsthost = dsthost_idx;

			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
				cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
		}
	}

	return reduced_hash;
}
907 
908 /* helper functions : might be changed when/if skb use a standard list_head */
909 /* remove one skb from head of slot queue */
910 
911 static struct sk_buff *dequeue_head(struct cake_flow *flow)
912 {
913 	struct sk_buff *skb = flow->head;
914 
915 	if (skb) {
916 		flow->head = skb->next;
917 		skb_mark_not_on_list(skb);
918 	}
919 
920 	return skb;
921 }
922 
923 /* add skb to flow queue (tail add) */
924 
925 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
926 {
927 	if (!flow->head)
928 		flow->head = skb;
929 	else
930 		flow->tail->next = skb;
931 	flow->tail = skb;
932 	skb->next = NULL;
933 }
934 
/* Return a pointer to the packet's innermost relevant IP header, copying
 * into @buf if the skb is non-linear.  For 6in4 tunnels the inner IPv6
 * header is returned.  NOTE: the return value may actually point to a
 * struct ipv6hdr despite the declared type; callers must check the version
 * field.  Returns NULL if no recognisable IP header is present.
 */
static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
				    struct ipv6hdr *buf)
{
	unsigned int offset = skb_network_offset(skb);
	struct iphdr *iph;

	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);

	if (!iph)
		return NULL;

	/* 6in4: skip the outer IPv4 header and return the inner IPv6 one */
	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
		return skb_header_pointer(skb, offset + iph->ihl * 4,
					  sizeof(struct ipv6hdr), buf);

	else if (iph->version == 4)
		return iph;

	else if (iph->version == 6)
		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
					  buf);

	return NULL;
}
959 
/* Return a pointer to the packet's TCP header (including options, up to
 * @bufsize bytes, copied into @buf if non-linear), handling plain IPv4,
 * plain IPv6 and 6in4 tunnels.  Returns NULL for non-TCP packets or if the
 * headers cannot be read.
 */
static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
				      void *buf, unsigned int bufsize)
{
	unsigned int offset = skb_network_offset(skb);
	const struct ipv6hdr *ipv6h;
	const struct tcphdr *tcph;
	const struct iphdr *iph;
	struct ipv6hdr _ipv6h;
	struct tcphdr _tcph;

	/* read as IPv6 (the larger header) and inspect the version field */
	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);

	if (!ipv6h)
		return NULL;

	if (ipv6h->version == 4) {
		iph = (struct iphdr *)ipv6h;
		offset += iph->ihl * 4;

		/* special-case 6in4 tunnelling, as that is a common way to get
		 * v6 connectivity in the home
		 */
		if (iph->protocol == IPPROTO_IPV6) {
			ipv6h = skb_header_pointer(skb, offset,
						   sizeof(_ipv6h), &_ipv6h);

			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
				return NULL;

			offset += sizeof(struct ipv6hdr);

		} else if (iph->protocol != IPPROTO_TCP) {
			return NULL;
		}

	} else if (ipv6h->version == 6) {
		if (ipv6h->nexthdr != IPPROTO_TCP)
			return NULL;

		offset += sizeof(struct ipv6hdr);
	} else {
		return NULL;
	}

	/* doff < 5 would mean a malformed header shorter than 20 bytes */
	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
	if (!tcph || tcph->doff < 5)
		return NULL;

	return skb_header_pointer(skb, offset,
				  min(__tcp_hdrlen(tcph), bufsize), buf);
}
1011 
/* Scan the TCP options of @tcph for option @code.  On success, store the
 * option's total length (including kind and length octets) in *oplen and
 * return a pointer to the option payload; otherwise return NULL (*oplen is
 * left untouched).
 */
static const void *cake_get_tcpopt(const struct tcphdr *tcph,
				   int code, int *oplen)
{
	/* inspired by tcp_parse_options in tcp_input.c */
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
	const u8 *ptr = (const u8 *)(tcph + 1);

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {
			/* single-octet padding option */
			length--;
			continue;
		}
		if (length < 2)
			break;
		opsize = *ptr++;
		if (opsize < 2 || opsize > length)
			break;	/* malformed option; stop parsing */

		if (opcode == code) {
			*oplen = opsize;
			return ptr;
		}

		ptr += opsize - 2;
		length -= opsize;
	}

	return NULL;
}
1046 
1047 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
1048  * bytes than the other. In the case where both sequences ACKs bytes that the
1049  * other doesn't, A is considered greater. DSACKs in A also makes A be
1050  * considered greater.
1051  *
1052  * @return -1, 0 or 1 as normal compare functions
1053  */
static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
				  const struct tcphdr *tcph_b)
{
	const struct tcp_sack_block_wire *sack_a, *sack_b;
	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
	u32 bytes_a = 0, bytes_b = 0;
	int oplen_a, oplen_b;
	bool first = true;

	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);

	/* pointers point to option contents */
	/* NOTE(review): oplen_a/oplen_b remain uninitialized when the SACK
	 * option is absent (cake_get_tcpopt only sets *oplen on success);
	 * the sack_a/sack_b NULL checks below short-circuit before the
	 * stale values can affect the result.
	 */
	oplen_a -= TCPOLEN_SACK_BASE;
	oplen_b -= TCPOLEN_SACK_BASE;

	/* if only one side carries at least one full SACK block, that
	 * side is greater; if neither does, they compare equal
	 */
	if (sack_a && oplen_a >= sizeof(*sack_a) &&
	    (!sack_b || oplen_b < sizeof(*sack_b)))
		return -1;
	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
		 (!sack_a || oplen_a < sizeof(*sack_a)))
		return 1;
	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
		 (!sack_b || oplen_b < sizeof(*sack_b)))
		return 0;

	while (oplen_a >= sizeof(*sack_a)) {
		const struct tcp_sack_block_wire *sack_tmp = sack_b;
		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
		int oplen_tmp = oplen_b;
		bool found = false;

		/* DSACK; always considered greater to prevent dropping */
		if (before(start_a, ack_seq_a))
			return -1;

		bytes_a += end_a - start_a;

		/* look for a block in B that fully covers this block of A */
		while (oplen_tmp >= sizeof(*sack_tmp)) {
			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);

			/* first time through we count the total size */
			if (first)
				bytes_b += end_b - start_b;

			if (!after(start_b, start_a) && !before(end_b, end_a)) {
				found = true;
				if (!first)
					break;
			}
			oplen_tmp -= sizeof(*sack_tmp);
			sack_tmp++;
		}

		/* A SACKs a range that B does not cover: A is greater */
		if (!found)
			return -1;

		oplen_a -= sizeof(*sack_a);
		sack_a++;
		first = false;
	}

	/* If we made it this far, all ranges SACKed by A are covered by B, so
	 * either the SACKs are equal, or B SACKs more bytes.
	 */
	return bytes_b > bytes_a ? 1 : 0;
}
1123 
1124 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1125 				 u32 *tsval, u32 *tsecr)
1126 {
1127 	const u8 *ptr;
1128 	int opsize;
1129 
1130 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1131 
1132 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1133 		*tsval = get_unaligned_be32(ptr);
1134 		*tsecr = get_unaligned_be32(ptr + 4);
1135 	}
1136 }
1137 
/* Decide whether @tcph describes an ACK that is safe to drop without
 * losing information: ACK set and no other flags (ECE/CWR are checked by
 * the caller), only options whose loss is harmless, and any timestamp no
 * newer than (@tstamp_new, @tsecr_new) taken from the triggering packet.
 */
static bool cake_tcph_may_drop(const struct tcphdr *tcph,
			       u32 tstamp_new, u32 tsecr_new)
{
	/* inspired by tcp_parse_options in tcp_input.c */
	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
	const u8 *ptr = (const u8 *)(tcph + 1);
	u32 tstamp, tsecr;

	/* 3 reserved flags must be unset to avoid future breakage
	 * ACK must be set
	 * ECE/CWR are handled separately
	 * All other flags URG/PSH/RST/SYN/FIN must be unset
	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
	 * 0x00C00000 = CWR/ECE (handled separately)
	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
	 */
	if (((tcp_flag_word(tcph) &
	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
		return false;

	/* walk the options; any option that could carry state we cannot
	 * safely discard makes the packet ineligible
	 */
	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		if (opcode == TCPOPT_EOL)
			break;
		if (opcode == TCPOPT_NOP) {
			length--;
			continue;
		}
		if (length < 2)
			break;
		opsize = *ptr++;
		if (opsize < 2 || opsize > length)
			break;

		switch (opcode) {
		case TCPOPT_MD5SIG: /* doesn't influence state */
			break;

		case TCPOPT_SACK: /* stricter checking performed later */
			/* SACK blocks are 8 bytes each after a 2-byte base */
			if (opsize % 8 != 2)
				return false;
			break;

		case TCPOPT_TIMESTAMP:
			/* only drop timestamps lower than new */
			if (opsize != TCPOLEN_TIMESTAMP)
				return false;
			tstamp = get_unaligned_be32(ptr);
			tsecr = get_unaligned_be32(ptr + 4);
			if (after(tstamp, tstamp_new) ||
			    after(tsecr, tsecr_new))
				return false;
			break;

		case TCPOPT_MSS:  /* these should only be set on SYN */
		case TCPOPT_WINDOW:
		case TCPOPT_SACK_PERM:
		case TCPOPT_FASTOPEN:
		case TCPOPT_EXP:
		default: /* don't drop if any unknown options are present */
			return false;
		}

		ptr += opsize - 2;
		length -= opsize;
	}

	return true;
}
1209 
/* Scan @flow's queue for a pure TCP ACK made redundant by the ACK just
 * enqueued at the tail, unlink it and return it (NULL if none found).
 * In conservative mode an ACK is only dropped when at least two eligible
 * ones are queued; aggressive mode may drop a single eligible ACK.  The
 * victim chosen is the eligible ACK closest to the head of the queue.
 */
static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
				       struct cake_flow *flow)
{
	bool aggressive = q->config->ack_filter == CAKE_ACK_AGGRESSIVE;
	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
	struct sk_buff *skb_check, *skb_prev = NULL;
	const struct ipv6hdr *ipv6h, *ipv6h_check;
	unsigned char _tcph[64], _tcph_check[64];	/* header + options */
	const struct tcphdr *tcph, *tcph_check;
	const struct iphdr *iph, *iph_check;
	struct ipv6hdr _iph, _iph_check;
	const struct sk_buff *skb;
	int seglen, num_found = 0;
	u32 tstamp = 0, tsecr = 0;
	__be32 elig_flags = 0;
	int sack_comp;

	/* no other possible ACKs to filter */
	if (flow->head == flow->tail)
		return NULL;

	skb = flow->tail;
	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
	iph = cake_get_iphdr(skb, &_iph);
	if (!tcph)
		return NULL;

	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);

	/* the 'triggering' packet need only have the ACK flag set.
	 * also check that SYN is not set, as there won't be any previous ACKs.
	 */
	if ((tcp_flag_word(tcph) &
	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
		return NULL;

	/* the 'triggering' ACK is at the tail of the queue, we have already
	 * returned if it is the only packet in the flow. loop through the rest
	 * of the queue looking for pure ACKs with the same 5-tuple as the
	 * triggering one.
	 */
	for (skb_check = flow->head;
	     skb_check && skb_check != skb;
	     skb_prev = skb_check, skb_check = skb_check->next) {
		iph_check = cake_get_iphdr(skb_check, &_iph_check);
		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
					     sizeof(_tcph_check));

		/* only TCP packets with matching 5-tuple are eligible, and only
		 * drop safe headers
		 */
		if (!tcph_check || iph->version != iph_check->version ||
		    tcph_check->source != tcph->source ||
		    tcph_check->dest != tcph->dest)
			continue;

		if (iph_check->version == 4) {
			if (iph_check->saddr != iph->saddr ||
			    iph_check->daddr != iph->daddr)
				continue;

			seglen = iph_totlen(skb, iph_check) -
				       (4 * iph_check->ihl);
		} else if (iph_check->version == 6) {
			ipv6h = (struct ipv6hdr *)iph;
			ipv6h_check = (struct ipv6hdr *)iph_check;

			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
				continue;

			seglen = ntohs(ipv6h_check->payload_len);
		} else {
			WARN_ON(1);  /* shouldn't happen */
			continue;
		}

		/* If the ECE/CWR flags changed from the previous eligible
		 * packet in the same flow, we should no longer be dropping that
		 * previous packet as this would lose information.
		 */
		if (elig_ack && (tcp_flag_word(tcph_check) &
				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
			elig_ack = NULL;
			elig_ack_prev = NULL;
			num_found--;
		}

		/* Check TCP options and flags, don't drop ACKs with segment
		 * data, and don't drop ACKs with a higher cumulative ACK
		 * counter than the triggering packet. Check ACK seqno here to
		 * avoid parsing SACK options of packets we are going to exclude
		 * anyway.
		 */
		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
			continue;

		/* Check SACK options. The triggering packet must SACK more data
		 * than the ACK under consideration, or SACK the same range but
		 * have a larger cumulative ACK counter. The latter is a
		 * pathological case, but is contained in the following check
		 * anyway, just to be safe.
		 */
		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);

		if (sack_comp < 0 ||
		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
		     sack_comp == 0))
			continue;

		/* At this point we have found an eligible pure ACK to drop; if
		 * we are in aggressive mode, we are done. Otherwise, keep
		 * searching unless this is the second eligible ACK we
		 * found.
		 *
		 * Since we want to drop ACK closest to the head of the queue,
		 * save the first eligible ACK we find, even if we need to loop
		 * again.
		 */
		if (!elig_ack) {
			elig_ack = skb_check;
			elig_ack_prev = skb_prev;
			elig_flags = (tcp_flag_word(tcph_check)
				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
		}

		if (num_found++ > 0)
			goto found;
	}

	/* We made it through the queue without finding two eligible ACKs . If
	 * we found a single eligible ACK we can drop it in aggressive mode if
	 * we can guarantee that this does not interfere with ECN flag
	 * information. We ensure this by dropping it only if the enqueued
	 * packet is consecutive with the eligible ACK, and their flags match.
	 */
	if (elig_ack && aggressive && elig_ack->next == skb &&
	    (elig_flags == (tcp_flag_word(tcph) &
			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
		goto found;

	return NULL;

found:
	/* unlink the victim from the singly-linked flow queue */
	if (elig_ack_prev)
		elig_ack_prev->next = elig_ack->next;
	else
		flow->head = elig_ack->next;

	skb_mark_not_on_list(elig_ack);

	return elig_ack;
}
1365 
1366 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1367 {
1368 	avg -= avg >> shift;
1369 	avg += sample >> shift;
1370 	return avg;
1371 }
1372 
1373 static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off)
1374 {
1375 	struct cake_sched_config *q = qd->config;
1376 
1377 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1378 		len -= off;
1379 
1380 	if (qd->max_netlen < len)
1381 		qd->max_netlen = len;
1382 	if (qd->min_netlen > len)
1383 		qd->min_netlen = len;
1384 
1385 	len += q->rate_overhead;
1386 
1387 	if (len < q->rate_mpu)
1388 		len = q->rate_mpu;
1389 
1390 	if (q->atm_mode == CAKE_ATM_ATM) {
1391 		len += 47;
1392 		len /= 48;
1393 		len *= 53;
1394 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1395 		/* Add one byte per 64 bytes or part thereof.
1396 		 * This is conservative and easier to calculate than the
1397 		 * precise value.
1398 		 */
1399 		len += (len + 63) / 64;
1400 	}
1401 
1402 	if (qd->max_adjlen < len)
1403 		qd->max_adjlen = len;
1404 	if (qd->min_adjlen > len)
1405 		qd->min_adjlen = len;
1406 
1407 	return len;
1408 }
1409 
/* Compute the shaper-relevant ("adjusted") length of @skb including the
 * configured per-packet overhead.  For GSO super-packets the overhead is
 * charged once per segment, based on reconstructed per-segment sizes.
 */
static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);
	unsigned int hdr_len, last_len = 0;
	u32 off = skb_network_offset(skb);
	u16 segs = qdisc_pkt_segs(skb);
	u32 len = qdisc_pkt_len(skb);

	/* running average of the network-header offset, in 16.16 fixed point */
	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);

	/* non-GSO packets take the simple path */
	if (segs == 1)
		return cake_calc_overhead(q, len, off);

	/* borrowed from qdisc_pkt_len_segs_init() */
	if (!skb->encapsulation)
		hdr_len = skb_transport_offset(skb);
	else
		hdr_len = skb_inner_transport_offset(skb);

	/* + transport layer */
	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
						SKB_GSO_TCPV6))) {
		const struct tcphdr *th;
		struct tcphdr _tcphdr;

		th = skb_header_pointer(skb, hdr_len,
					sizeof(_tcphdr), &_tcphdr);
		if (likely(th))
			hdr_len += __tcp_hdrlen(th);
	} else {
		struct udphdr _udphdr;

		if (skb_header_pointer(skb, hdr_len,
				       sizeof(_udphdr), &_udphdr))
			hdr_len += sizeof(struct udphdr);
	}

	/* every segment carries gso_size payload except possibly the last */
	len = shinfo->gso_size + hdr_len;
	last_len = skb->len - shinfo->gso_size * (segs - 1);

	return (cake_calc_overhead(q, len, off) * (segs - 1) +
		cake_calc_overhead(q, last_len, off));
}
1453 
1454 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1455 {
1456 	struct cake_heap_entry ii = q->overflow_heap[i];
1457 	struct cake_heap_entry jj = q->overflow_heap[j];
1458 
1459 	q->overflow_heap[i] = jj;
1460 	q->overflow_heap[j] = ii;
1461 
1462 	q->tins[ii.t].overflow_idx[ii.b] = j;
1463 	q->tins[jj.t].overflow_idx[jj.b] = i;
1464 }
1465 
1466 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1467 {
1468 	struct cake_heap_entry ii = q->overflow_heap[i];
1469 
1470 	return q->tins[ii.t].backlogs[ii.b];
1471 }
1472 
/* Sift the overflow-heap entry at index @i downwards so that each parent's
 * backlog is at least that of its children (max-heap on queue backlog).
 *
 * NOTE(review): 'mb' is not re-read from the node's new position after a
 * swap, so comparisons on later iterations can use a stale (larger) value
 * and stop sifting early — presumably acceptable since the heap is only a
 * heuristic for picking the longest queue; confirm before relying on a
 * strict heap invariant.
 */
static void cake_heapify(struct cake_sched_data *q, u16 i)
{
	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;	/* heap size */
	u32 mb = cake_heap_get_backlog(q, i);	/* current max candidate */
	u32 m = i;	/* index of the largest of i and its children */

	while (m < a) {
		u32 l = m + m + 1;	/* left child */
		u32 r = l + 1;		/* right child */

		if (l < a) {
			u32 lb = cake_heap_get_backlog(q, l);

			if (lb > mb) {
				m  = l;
				mb = lb;
			}
		}

		if (r < a) {
			u32 rb = cake_heap_get_backlog(q, r);

			if (rb > mb) {
				m  = r;
				mb = rb;
			}
		}

		if (m != i) {
			/* a child has a larger backlog: move it up */
			cake_heap_swap(q, i, m);
			i = m;
		} else {
			break;
		}
	}
}
1509 
1510 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1511 {
1512 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1513 		u16 p = (i - 1) >> 1;
1514 		u32 ib = cake_heap_get_backlog(q, i);
1515 		u32 pb = cake_heap_get_backlog(q, p);
1516 
1517 		if (ib > pb) {
1518 			cake_heap_swap(q, i, p);
1519 			i = p;
1520 		} else {
1521 			break;
1522 		}
1523 	}
1524 }
1525 
/* Charge the (adjusted) length of @skb to the shaper schedules of tin @b
 * and of the qdisc as a whole, advancing their next-transmit times.  The
 * failsafe schedule is not advanced for dropped packets (@drop true).
 * Returns the adjusted length that was charged.
 */
static int cake_advance_shaper(struct cake_sched_data *q,
			       struct cake_tin_data *b,
			       struct sk_buff *skb,
			       ktime_t now, bool drop)
{
	u32 len = get_cobalt_cb(skb)->adjusted_len;

	/* charge packet bandwidth to this tin
	 * and to the global shaper.
	 */
	if (q->rate_ns) {
		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
		u64 failsafe_dur = global_dur + (global_dur >> 1);

		/* advance the tin's schedule; if it had fallen behind,
		 * step forward from where it was rather than from now
		 */
		if (ktime_before(b->time_next_packet, now))
			b->time_next_packet = ktime_add_ns(b->time_next_packet,
							   tin_dur);

		else if (ktime_before(b->time_next_packet,
				      ktime_add_ns(now, tin_dur)))
			b->time_next_packet = ktime_add_ns(now, tin_dur);

		q->time_next_packet = ktime_add_ns(q->time_next_packet,
						   global_dur);
		/* the failsafe (1.5x duration) only advances for packets
		 * actually transmitted
		 */
		if (!drop)
			q->failsafe_next_packet = \
				ktime_add_ns(q->failsafe_next_packet,
					     failsafe_dur);
	}
	return len;
}
1558 
/* Relieve buffer pressure by dropping one packet from the head of the
 * longest queue (the root of the overflow heap).  Returns the victim's
 * flow index in the low 16 bits and its tin index in the high 16 bits.
 */
static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	ktime_t now = ktime_get();
	u32 idx = 0, tin = 0, len;
	struct cake_heap_entry qq;
	struct cake_tin_data *b;
	struct cake_flow *flow;
	struct sk_buff *skb;

	if (!q->overflow_timeout) {
		int i;
		/* Build fresh max-heap */
		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
			cake_heapify(q, i);
	}
	q->overflow_timeout = 65535;

	/* select longest queue for pruning */
	qq  = q->overflow_heap[0];
	tin = qq.t;
	idx = qq.b;

	b = &q->tins[tin];
	flow = &b->flows[idx];
	skb = dequeue_head(flow);
	if (unlikely(!skb)) {
		/* heap has gone wrong, rebuild it next time */
		q->overflow_timeout = 0;
		return idx + (tin << 16);
	}

	/* NOTE(review): cobalt_queue_full appears to flag flows whose queue
	 * is persistently overloaded — confirm against its definition
	 */
	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
		b->unresponsive_flow_count++;

	/* undo the enqueue-time accounting for the dropped packet */
	len = qdisc_pkt_len(skb);
	q->buffer_used      -= skb->truesize;
	b->backlogs[idx]    -= len;
	b->tin_backlog      -= len;
	sch->qstats.backlog -= len;

	flow->dropped++;
	b->tin_dropped++;

	/* in ingress mode the dropped packet already consumed link capacity */
	if (q->config->rate_flags & CAKE_FLAG_INGRESS)
		cake_advance_shaper(q, b, skb, now, true);

	qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
	sch->q.qlen--;

	/* re-sink the now-shorter queue within the heap */
	cake_heapify(q, 0);

	return idx + (tin << 16);
}
1613 
/* Extract the DSCP value of @skb; when @wash is set and the DSCP is
 * non-zero, also clear the Diffserv field in the packet (preserving the
 * ECN bits).  Returns 0 (best effort) for non-IP packets, except ARP
 * which is mapped to CS7, and 0 on any header-access failure.
 */
static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
{
	const int offset = skb_network_offset(skb);
	u16 *buf, buf_;
	u8 dscp;

	switch (skb_protocol(skb, true)) {
	case htons(ETH_P_IP):
		/* a u16 probe covers the version and ToS bytes */
		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
		if (unlikely(!buf))
			return 0;

		/* ToS is in the second byte of iphdr */
		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;

		if (wash && dscp) {
			const int wlen = offset + sizeof(struct iphdr);

			/* the header must be linear and writable to wash it */
			if (!pskb_may_pull(skb, wlen) ||
			    skb_try_make_writable(skb, wlen))
				return 0;

			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
		}

		return dscp;

	case htons(ETH_P_IPV6):
		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
		if (unlikely(!buf))
			return 0;

		/* Traffic class is in the first and second bytes of ipv6hdr */
		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;

		if (wash && dscp) {
			const int wlen = offset + sizeof(struct ipv6hdr);

			if (!pskb_may_pull(skb, wlen) ||
			    skb_try_make_writable(skb, wlen))
				return 0;

			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
		}

		return dscp;

	case htons(ETH_P_ARP):
		return 0x38;  /* CS7 - Net Control */

	default:
		/* If there is no Diffserv field, treat as best-effort */
		return 0;
	}
}
1669 
/* Choose the priority tin for @skb: firewall mark first, then
 * skb->priority (when its major number matches this qdisc), then the
 * packet's DSCP, falling back to tin 0 in besteffort mode or when the
 * mapping is out of range.
 */
static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
					     struct sk_buff *skb)
{
	struct cake_sched_data *qd = qdisc_priv(sch);
	struct cake_sched_config *q = qd->config;
	u32 tin, mark;
	bool wash;
	u8 dscp;

	/* Tin selection: Default to diffserv-based selection, allow overriding
	 * using firewall marks or skb->priority. Call DSCP parsing early if
	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
	 */
	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
	if (wash)
		dscp = cake_handle_diffserv(skb, wash);

	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
		tin = 0;

	else if (mark && mark <= qd->tin_cnt)
		tin = qd->tin_order[mark - 1];

	else if (TC_H_MAJ(skb->priority) == sch->handle &&
		 TC_H_MIN(skb->priority) > 0 &&
		 TC_H_MIN(skb->priority) <= qd->tin_cnt)
		tin = qd->tin_order[TC_H_MIN(skb->priority) - 1];

	else {
		/* dscp was already computed above if wash was enabled */
		if (!wash)
			dscp = cake_handle_diffserv(skb, wash);
		tin = qd->tin_index[dscp];

		if (unlikely(tin >= qd->tin_cnt))
			tin = 0;
	}

	return &qd->tins[tin];
}
1710 
/* Select the tin (via *t) and flow index for @skb, consulting any
 * attached TC filters for flow/host overrides.  Returns the 1-based flow
 * index, or 0 when a filter verdict consumed or dropped the packet, in
 * which case *qerr holds the enqueue return code.
 */
static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
			 struct sk_buff *skb, int flow_mode, int *qerr)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct tcf_proto *filter;
	struct tcf_result res;
	u16 flow = 0, host = 0;
	int result;

	filter = rcu_dereference_bh(q->filter_list);
	if (!filter)
		goto hash;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, NULL, filter, &res, false);

	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		/* minor classid overrides the flow, major overrides the host */
		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
			flow = TC_H_MIN(res.classid);
		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
			host = TC_H_MAJ(res.classid) >> 16;
	}
hash:
	*t = cake_select_tin(sch, skb);
	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
}
1748 
1749 static void cake_reconfigure(struct Qdisc *sch);
1750 
/* Enqueue entry point: classify @skb into a tin and flow, optionally
 * split GSO super-packets, run the ACK filter, feed the ingress
 * bandwidth estimator, and shed packets from the longest queue when the
 * buffer limit is exceeded.
 */
static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			struct sk_buff **to_free)
{
	u32 idx, tin, prev_qlen, prev_backlog, drop_id;
	struct cake_sched_data *q = qdisc_priv(sch);
	int len = qdisc_pkt_len(skb), ret;
	struct sk_buff *ack = NULL;
	ktime_t now = ktime_get();
	struct cake_tin_data *b;
	struct cake_flow *flow;
	bool same_flow = false;

	/* choose flow to insert into */
	idx = cake_classify(sch, &b, skb, q->config->flow_mode, &ret);
	if (idx == 0) {
		/* a filter verdict dropped or stole the packet */
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	tin = (u32)(b - q->tins);
	idx--;
	flow = &b->flows[idx];

	/* ensure shaper state isn't stale */
	if (!b->tin_backlog) {
		if (ktime_before(b->time_next_packet, now))
			b->time_next_packet = now;

		if (!sch->q.qlen) {
			if (ktime_before(q->time_next_packet, now)) {
				q->failsafe_next_packet = now;
				q->time_next_packet = now;
			} else if (ktime_after(q->time_next_packet, now) &&
				   ktime_after(q->failsafe_next_packet, now)) {
				u64 next = \
					min(ktime_to_ns(q->time_next_packet),
					    ktime_to_ns(
						   q->failsafe_next_packet));
				sch->qstats.overlimits++;
				qdisc_watchdog_schedule_ns(&q->watchdog, next);
			}
		}
	}

	if (unlikely(len > b->max_skblen))
		b->max_skblen = len;

	/* split GSO super-packets into their segments when configured to */
	if (qdisc_pkt_segs(skb) > 1 && q->config->rate_flags & CAKE_FLAG_SPLIT_GSO) {
		struct sk_buff *segs, *nskb;
		netdev_features_t features = netif_skb_features(skb);
		unsigned int slen = 0, numsegs = 0;

		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
		if (IS_ERR_OR_NULL(segs))
			return qdisc_drop(skb, sch, to_free);

		skb_list_walk_safe(segs, segs, nskb) {
			skb_mark_not_on_list(segs);
			qdisc_skb_cb(segs)->pkt_len = segs->len;
			qdisc_skb_cb(segs)->pkt_segs = 1;
			cobalt_set_enqueue_time(segs, now);
			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
									  segs);
			flow_queue_add(flow, segs);

			sch->q.qlen++;
			numsegs++;
			slen += segs->len;
			q->buffer_used += segs->truesize;
			b->packets++;
		}

		/* stats */
		b->bytes	    += slen;
		b->backlogs[idx]    += slen;
		b->tin_backlog      += slen;
		sch->qstats.backlog += slen;
		q->avg_window_bytes += slen;

		/* tell the parent qdiscs that 1 skb became numsegs skbs */
		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
		consume_skb(skb);
	} else {
		/* not splitting */
		int ack_pkt_len = 0;

		cobalt_set_enqueue_time(skb, now);
		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
		flow_queue_add(flow, skb);

		if (q->config->ack_filter)
			ack = cake_ack_filter(q, flow);

		if (ack) {
			/* a redundant older ACK was removed from the queue */
			b->ack_drops++;
			sch->qstats.drops++;
			ack_pkt_len = qdisc_pkt_len(ack);
			b->bytes += ack_pkt_len;
			q->buffer_used += skb->truesize - ack->truesize;
			if (q->config->rate_flags & CAKE_FLAG_INGRESS)
				cake_advance_shaper(q, b, ack, now, true);

			qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len);
			consume_skb(ack);
		} else {
			sch->q.qlen++;
			q->buffer_used      += skb->truesize;
		}

		/* stats */
		b->packets++;
		b->bytes	    += len - ack_pkt_len;
		b->backlogs[idx]    += len - ack_pkt_len;
		b->tin_backlog      += len - ack_pkt_len;
		sch->qstats.backlog += len - ack_pkt_len;
		q->avg_window_bytes += len - ack_pkt_len;
	}

	if (q->overflow_timeout)
		cake_heapify_up(q, b->overflow_idx[idx]);

	/* incoming bandwidth capacity estimate */
	if (q->config->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
		u64 packet_interval = \
			ktime_to_ns(ktime_sub(now, q->last_packet_time));

		if (packet_interval > NSEC_PER_SEC)
			packet_interval = NSEC_PER_SEC;

		/* filter out short-term bursts, eg. wifi aggregation */
		q->avg_packet_interval = \
			cake_ewma(q->avg_packet_interval,
				  packet_interval,
				  (packet_interval > q->avg_packet_interval ?
					  2 : 8));

		q->last_packet_time = now;

		if (packet_interval > q->avg_packet_interval) {
			u64 window_interval = \
				ktime_to_ns(ktime_sub(now,
						      q->avg_window_begin));
			/* note: this local 'b' shadows the tin pointer above */
			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;

			b = div64_u64(b, window_interval);
			q->avg_peak_bandwidth =
				cake_ewma(q->avg_peak_bandwidth, b,
					  b > q->avg_peak_bandwidth ? 2 : 8);
			q->avg_window_bytes = 0;
			q->avg_window_begin = now;

			/* reconfigure at most every 250 ms */
			if (ktime_after(now,
					ktime_add_ms(q->last_reconfig_time,
						     250))) {
				/* shape to 15/16 of the estimated capacity */
				q->config->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
				cake_reconfigure(sch);
			}
		}
	} else {
		q->avg_window_bytes = 0;
		q->last_packet_time = now;
	}

	/* flowchain */
	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
		if (!flow->set) {
			list_add_tail(&flow->flowchain, &b->new_flows);
		} else {
			b->decaying_flow_count--;
			list_move_tail(&flow->flowchain, &b->new_flows);
		}
		flow->set = CAKE_SET_SPARSE;
		b->sparse_flow_count++;

		flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode);
	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
		/* this flow was empty, accounted as a sparse flow, but actually
		 * in the bulk rotation.
		 */
		flow->set = CAKE_SET_BULK;
		b->sparse_flow_count--;
		b->bulk_flow_count++;

		cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
		cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
	}

	if (q->buffer_used > q->buffer_max_used)
		q->buffer_max_used = q->buffer_used;

	if (q->buffer_used <= q->buffer_limit)
		return NET_XMIT_SUCCESS;

	/* over the buffer limit: drop from the longest queue until under it */
	prev_qlen = sch->q.qlen;
	prev_backlog = sch->qstats.backlog;

	while (q->buffer_used > q->buffer_limit) {
		drop_id = cake_drop(sch, to_free);
		if ((drop_id >> 16) == tin &&
		    (drop_id & 0xFFFF) == idx)
			same_flow = true;
	}

	/* now holding the deltas dropped, not the previous totals */
	prev_qlen -= sch->q.qlen;
	prev_backlog -= sch->qstats.backlog;
	b->drop_overlimit += prev_qlen;

	if (same_flow) {
		/* drops hit the flow we enqueued to: report congestion */
		qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
					  prev_backlog - len);
		return NET_XMIT_CN;
	}
	qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
	return NET_XMIT_SUCCESS;
}
1966 
1967 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1968 {
1969 	struct cake_sched_data *q = qdisc_priv(sch);
1970 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1971 	struct cake_flow *flow = &b->flows[q->cur_flow];
1972 	struct sk_buff *skb = NULL;
1973 	u32 len;
1974 
1975 	if (flow->head) {
1976 		skb = dequeue_head(flow);
1977 		len = qdisc_pkt_len(skb);
1978 		b->backlogs[q->cur_flow] -= len;
1979 		b->tin_backlog		 -= len;
1980 		sch->qstats.backlog      -= len;
1981 		q->buffer_used		 -= skb->truesize;
1982 		sch->q.qlen--;
1983 
1984 		if (q->overflow_timeout)
1985 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1986 	}
1987 	return skb;
1988 }
1989 
1990 /* Discard leftover packets from a tin no longer in use. */
1991 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1992 {
1993 	struct cake_sched_data *q = qdisc_priv(sch);
1994 	struct sk_buff *skb;
1995 
1996 	q->cur_tin = tin;
1997 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1998 		while (!!(skb = cake_dequeue_one(sch)))
1999 			kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
2000 }
2001 
/* Qdisc dequeue entry point.  Enforces the global shaper, selects a tin
 * (Diffserv class) and then a flow within it (DRR++), and pulls one
 * packet through the Cobalt AQM.  For cake_mq (shared config), it also
 * periodically rebalances the configured rate across the sibling tx
 * queues that are currently active.
 */
static struct sk_buff *cake_dequeue(struct Qdisc *sch)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	struct cake_tin_data *b = &q->tins[q->cur_tin];
	enum skb_drop_reason reason;
	ktime_t now = ktime_get();
	struct cake_flow *flow;
	struct list_head *head;
	bool first_flow = true;
	struct sk_buff *skb;
	u64 delay;
	u32 len;

	/* cake_mq rebalance: every sync_time ns, count sibling queues that
	 * are backlogged or have been active since our last check, and take
	 * an equal share of the configured rate.  We always count ourselves.
	 */
	if (q->config->is_shared && now - q->last_checked_active >= q->config->sync_time) {
		struct net_device *dev = qdisc_dev(sch);
		struct cake_sched_data *other_priv;
		u64 new_rate = q->config->rate_bps;
		u64 other_qlen, other_last_active;
		struct Qdisc *other_sch;
		u32 num_active_qs = 1;
		unsigned int ntx;

		for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
			other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
			other_priv = qdisc_priv(other_sch);

			if (other_priv == q)
				continue;

			other_qlen = READ_ONCE(other_sch->q.qlen);
			other_last_active = READ_ONCE(other_priv->last_active);

			/* "active" = has backlog now, or dequeued since our
			 * previous rebalance pass
			 */
			if (other_qlen || other_last_active > q->last_checked_active)
				num_active_qs++;
		}

		if (num_active_qs > 1)
			new_rate = div64_u64(q->config->rate_bps, num_active_qs);

		/* mtu = 0 is used to only update the rate and not mess with cobalt params */
		cake_set_rate(b, new_rate, 0, 0, 0);
		q->last_checked_active = now;
		q->active_queues = num_active_qs;
		/* cache the rebalanced rate for the shaper below */
		q->rate_ns = b->tin_rate_ns;
		q->rate_shft = b->tin_rate_shft;
	}

begin:
	if (!sch->q.qlen)
		return NULL;

	/* global hard shaper */
	if (ktime_after(q->time_next_packet, now) &&
	    ktime_after(q->failsafe_next_packet, now)) {
		u64 next = min(ktime_to_ns(q->time_next_packet),
			       ktime_to_ns(q->failsafe_next_packet));

		sch->qstats.overlimits++;
		qdisc_watchdog_schedule_ns(&q->watchdog, next);
		return NULL;
	}

	/* Choose a class to work on. */
	if (!q->rate_ns) {
		/* In unlimited mode, can't rely on shaper timings, just balance
		 * with DRR
		 */
		bool wrapped = false, empty = true;

		/* advance past tins that are out of deficit or have no flows,
		 * topping up deficits as we pass
		 */
		while (b->tin_deficit < 0 ||
		       !(b->sparse_flow_count + b->bulk_flow_count)) {
			if (b->tin_deficit <= 0)
				b->tin_deficit += b->tin_quantum;
			if (b->sparse_flow_count + b->bulk_flow_count)
				empty = false;

			q->cur_tin++;
			b++;
			if (q->cur_tin >= q->tin_cnt) {
				q->cur_tin = 0;
				b = q->tins;

				if (wrapped) {
					/* It's possible for q->qlen to be
					 * nonzero when we actually have no
					 * packets anywhere.
					 */
					if (empty)
						return NULL;
				} else {
					wrapped = true;
				}
			}
		}
	} else {
		/* In shaped mode, choose:
		 * - Highest-priority tin with queue and meeting schedule, or
		 * - The earliest-scheduled tin with queue.
		 */
		ktime_t best_time = KTIME_MAX;
		int tin, best_tin = 0;

		for (tin = 0; tin < q->tin_cnt; tin++) {
			b = q->tins + tin;
			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
				ktime_t time_to_pkt = \
					ktime_sub(b->time_next_packet, now);

				if (ktime_to_ns(time_to_pkt) <= 0 ||
				    ktime_compare(time_to_pkt,
						  best_time) <= 0) {
					best_time = time_to_pkt;
					best_tin = tin;
				}
			}
		}

		q->cur_tin = best_tin;
		b = q->tins + best_tin;

		/* No point in going further if no packets to deliver. */
		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
			return NULL;
	}

retry:
	/* service this class: decaying flows are tried first (only on the
	 * first pass), then new (sparse), then old (bulk) flows
	 */
	head = &b->decaying_flows;
	if (!first_flow || list_empty(head)) {
		head = &b->new_flows;
		if (list_empty(head)) {
			head = &b->old_flows;
			if (unlikely(list_empty(head))) {
				head = &b->decaying_flows;
				if (unlikely(list_empty(head)))
					goto begin;
			}
		}
	}
	flow = list_first_entry(head, struct cake_flow, flowchain);
	q->cur_flow = flow - b->flows;
	first_flow = false;

	/* flow isolation (DRR++) */
	if (flow->deficit <= 0) {
		/* Keep all flows with deficits out of the sparse and decaying
		 * rotations.  No non-empty flow can go into the decaying
		 * rotation, so they can't get deficits
		 */
		if (flow->set == CAKE_SET_SPARSE) {
			if (flow->head) {
				b->sparse_flow_count--;
				b->bulk_flow_count++;

				cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
				cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);

				flow->set = CAKE_SET_BULK;
			} else {
				/* we've moved it to the bulk rotation for
				 * correct deficit accounting but we still want
				 * to count it as a sparse flow, not a bulk one.
				 */
				flow->set = CAKE_SET_SPARSE_WAIT;
			}
		}

		/* top up the deficit and rotate to the back of the old list */
		flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode);
		list_move_tail(&flow->flowchain, &b->old_flows);

		goto retry;
	}

	/* Retrieve a packet via the AQM */
	while (1) {
		skb = cake_dequeue_one(sch);
		if (!skb) {
			/* this queue was actually empty */
			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
				b->unresponsive_flow_count--;

			if (flow->cvars.p_drop || flow->cvars.count ||
			    ktime_before(now, flow->cvars.drop_next)) {
				/* keep in the flowchain until the state has
				 * decayed to rest
				 */
				list_move_tail(&flow->flowchain,
					       &b->decaying_flows);
				if (flow->set == CAKE_SET_BULK) {
					b->bulk_flow_count--;

					cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
					cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);

					b->decaying_flow_count++;
				} else if (flow->set == CAKE_SET_SPARSE ||
					   flow->set == CAKE_SET_SPARSE_WAIT) {
					b->sparse_flow_count--;
					b->decaying_flow_count++;
				}
				flow->set = CAKE_SET_DECAYING;
			} else {
				/* remove empty queue from the flowchain */
				list_del_init(&flow->flowchain);
				if (flow->set == CAKE_SET_SPARSE ||
				    flow->set == CAKE_SET_SPARSE_WAIT)
					b->sparse_flow_count--;
				else if (flow->set == CAKE_SET_BULK) {
					b->bulk_flow_count--;

					cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
					cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
				} else
					b->decaying_flow_count--;

				flow->set = CAKE_SET_NONE;
			}
			goto begin;
		}

		/* the bulk flow count is passed to Cobalt only in ingress
		 * mode (zero otherwise)
		 */
		reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
					    (b->bulk_flow_count *
					     !!(q->config->rate_flags &
						CAKE_FLAG_INGRESS)));
		/* Last packet in queue may be marked, shouldn't be dropped */
		if (reason == SKB_NOT_DROPPED_YET || !flow->head)
			break;

		/* drop this packet, get another one */
		if (q->config->rate_flags & CAKE_FLAG_INGRESS) {
			/* in ingress mode dropped packets still consumed
			 * capacity upstream, so charge the shaper for them
			 */
			len = cake_advance_shaper(q, b, skb,
						  now, true);
			flow->deficit -= len;
			b->tin_deficit -= len;
		}
		flow->dropped++;
		b->tin_dropped++;
		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
		qdisc_qstats_drop(sch);
		qdisc_dequeue_drop(sch, skb, reason);
		if (q->config->rate_flags & CAKE_FLAG_INGRESS)
			goto retry;
	}

	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
	qdisc_bstats_update(sch, skb);
	/* record activity for the cake_mq rebalance logic above */
	WRITE_ONCE(q->last_active, now);

	/* collect delay stats: EWMAs with asymmetric gain so peak rises and
	 * base falls quickly
	 */
	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
	b->peak_delay = cake_ewma(b->peak_delay, delay,
				  delay > b->peak_delay ? 2 : 8);
	b->base_delay = cake_ewma(b->base_delay, delay,
				  delay < b->base_delay ? 2 : 8);

	/* charge the shaper for the delivered packet */
	len = cake_advance_shaper(q, b, skb, now, false);
	flow->deficit -= len;
	b->tin_deficit -= len;

	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
		u64 next = min(ktime_to_ns(q->time_next_packet),
			       ktime_to_ns(q->failsafe_next_packet));

		qdisc_watchdog_schedule_ns(&q->watchdog, next);
	} else if (!sch->q.qlen) {
		int i;

		/* nothing queued: if any tin still has decaying flows, wake
		 * up after one target interval to let their AQM state decay
		 */
		for (i = 0; i < q->tin_cnt; i++) {
			if (q->tins[i].decaying_flow_count) {
				ktime_t next = \
					ktime_add_ns(now,
						     q->tins[i].cparams.target);

				qdisc_watchdog_schedule_ns(&q->watchdog,
							   ktime_to_ns(next));
				break;
			}
		}
	}

	if (q->overflow_timeout)
		q->overflow_timeout--;

	return skb;
}
2288 
2289 static void cake_reset(struct Qdisc *sch)
2290 {
2291 	struct cake_sched_data *q = qdisc_priv(sch);
2292 	u32 c;
2293 
2294 	if (!q->tins)
2295 		return;
2296 
2297 	for (c = 0; c < CAKE_MAX_TINS; c++)
2298 		cake_clear_tin(sch, c);
2299 }
2300 
/* Netlink attribute policy for the TCA_CAKE_* options parsed in
 * cake_config_change() and emitted in cake_config_dump().
 */
static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
};
2320 
2321 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2322 			  u64 target_ns, u64 rtt_est_ns)
2323 {
2324 	/* convert byte-rate into time-per-byte
2325 	 * so it will always unwedge in reasonable time.
2326 	 */
2327 	static const u64 MIN_RATE = 64;
2328 	u32 byte_target = mtu;
2329 	u64 byte_target_ns;
2330 	u8  rate_shft = 0;
2331 	u64 rate_ns = 0;
2332 
2333 	b->flow_quantum = 1514;
2334 	if (rate) {
2335 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2336 		rate_shft = 34;
2337 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2338 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2339 		while (!!(rate_ns >> 34)) {
2340 			rate_ns >>= 1;
2341 			rate_shft--;
2342 		}
2343 	} /* else unlimited, ie. zero delay */
2344 
2345 	b->tin_rate_bps  = rate;
2346 	b->tin_rate_ns   = rate_ns;
2347 	b->tin_rate_shft = rate_shft;
2348 
2349 	if (mtu == 0)
2350 		return;
2351 
2352 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2353 
2354 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2355 	b->cparams.interval = max(rtt_est_ns +
2356 				     b->cparams.target - target_ns,
2357 				     b->cparams.target * 2);
2358 	b->cparams.mtu_time = byte_target_ns;
2359 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2360 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2361 }
2362 
2363 static int cake_config_besteffort(struct Qdisc *sch)
2364 {
2365 	struct cake_sched_data *q = qdisc_priv(sch);
2366 	struct cake_tin_data *b = &q->tins[0];
2367 	u32 mtu = psched_mtu(qdisc_dev(sch));
2368 	u64 rate = q->config->rate_bps;
2369 
2370 	q->tin_cnt = 1;
2371 
2372 	q->tin_index = besteffort;
2373 	q->tin_order = normal_order;
2374 
2375 	cake_set_rate(b, rate, mtu,
2376 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2377 	b->tin_quantum = 65535;
2378 
2379 	return 0;
2380 }
2381 
2382 static int cake_config_precedence(struct Qdisc *sch)
2383 {
2384 	/* convert high-level (user visible) parameters into internal format */
2385 	struct cake_sched_data *q = qdisc_priv(sch);
2386 	u32 mtu = psched_mtu(qdisc_dev(sch));
2387 	u64 rate = q->config->rate_bps;
2388 	u32 quantum = 256;
2389 	u32 i;
2390 
2391 	q->tin_cnt = 8;
2392 	q->tin_index = precedence;
2393 	q->tin_order = normal_order;
2394 
2395 	for (i = 0; i < q->tin_cnt; i++) {
2396 		struct cake_tin_data *b = &q->tins[i];
2397 
2398 		cake_set_rate(b, rate, mtu, us_to_ns(q->config->target),
2399 			      us_to_ns(q->config->interval));
2400 
2401 		b->tin_quantum = max_t(u16, 1U, quantum);
2402 
2403 		/* calculate next class's parameters */
2404 		rate  *= 7;
2405 		rate >>= 3;
2406 
2407 		quantum  *= 7;
2408 		quantum >>= 3;
2409 	}
2410 
2411 	return 0;
2412 }
2413 
2414 /*	List of known Diffserv codepoints:
2415  *
2416  *	Default Forwarding (DF/CS0) - Best Effort
2417  *	Max Throughput (TOS2)
2418  *	Min Delay (TOS4)
2419  *	LLT "La" (TOS5)
2420  *	Assured Forwarding 1 (AF1x) - x3
2421  *	Assured Forwarding 2 (AF2x) - x3
2422  *	Assured Forwarding 3 (AF3x) - x3
2423  *	Assured Forwarding 4 (AF4x) - x3
2424  *	Precedence Class 1 (CS1)
2425  *	Precedence Class 2 (CS2)
2426  *	Precedence Class 3 (CS3)
2427  *	Precedence Class 4 (CS4)
2428  *	Precedence Class 5 (CS5)
2429  *	Precedence Class 6 (CS6)
2430  *	Precedence Class 7 (CS7)
2431  *	Voice Admit (VA)
2432  *	Expedited Forwarding (EF)
2433  *	Lower Effort (LE)
2434  *
2435  *	Total 26 codepoints.
2436  */
2437 
2438 /*	List of traffic classes in RFC 4594, updated by RFC 8622:
2439  *		(roughly descending order of contended priority)
2440  *		(roughly ascending order of uncontended throughput)
2441  *
2442  *	Network Control (CS6,CS7)      - routing traffic
2443  *	Telephony (EF,VA)         - aka. VoIP streams
2444  *	Signalling (CS5)               - VoIP setup
2445  *	Multimedia Conferencing (AF4x) - aka. video calls
2446  *	Realtime Interactive (CS4)     - eg. games
2447  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2448  *	Broadcast Video (CS3)
2449  *	Low-Latency Data (AF2x,TOS4)      - eg. database
2450  *	Ops, Admin, Management (CS2)      - eg. ssh
2451  *	Standard Service (DF & unrecognised codepoints)
2452  *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2453  *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2454  *
2455  *	Total 12 traffic classes.
2456  */
2457 
2458 static int cake_config_diffserv8(struct Qdisc *sch)
2459 {
2460 /*	Pruned list of traffic classes for typical applications:
2461  *
2462  *		Network Control          (CS6, CS7)
2463  *		Minimum Latency          (EF, VA, CS5, CS4)
2464  *		Interactive Shell        (CS2)
2465  *		Low Latency Transactions (AF2x, TOS4)
2466  *		Video Streaming          (AF4x, AF3x, CS3)
2467  *		Bog Standard             (DF etc.)
2468  *		High Throughput          (AF1x, TOS2, CS1)
2469  *		Background Traffic       (LE)
2470  *
2471  *		Total 8 traffic classes.
2472  */
2473 
2474 	struct cake_sched_data *q = qdisc_priv(sch);
2475 	u32 mtu = psched_mtu(qdisc_dev(sch));
2476 	u64 rate = q->config->rate_bps;
2477 	u32 quantum = 256;
2478 	u32 i;
2479 
2480 	q->tin_cnt = 8;
2481 
2482 	/* codepoint to class mapping */
2483 	q->tin_index = diffserv8;
2484 	q->tin_order = normal_order;
2485 
2486 	/* class characteristics */
2487 	for (i = 0; i < q->tin_cnt; i++) {
2488 		struct cake_tin_data *b = &q->tins[i];
2489 
2490 		cake_set_rate(b, rate, mtu, us_to_ns(q->config->target),
2491 			      us_to_ns(q->config->interval));
2492 
2493 		b->tin_quantum = max_t(u16, 1U, quantum);
2494 
2495 		/* calculate next class's parameters */
2496 		rate  *= 7;
2497 		rate >>= 3;
2498 
2499 		quantum  *= 7;
2500 		quantum >>= 3;
2501 	}
2502 
2503 	return 0;
2504 }
2505 
2506 static int cake_config_diffserv4(struct Qdisc *sch)
2507 {
2508 /*  Further pruned list of traffic classes for four-class system:
2509  *
2510  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2511  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2512  *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2513  *	    Background Traffic (LE, CS1)
2514  *
2515  *		Total 4 traffic classes.
2516  */
2517 
2518 	struct cake_sched_data *q = qdisc_priv(sch);
2519 	u32 mtu = psched_mtu(qdisc_dev(sch));
2520 	u64 rate = q->config->rate_bps;
2521 	u32 quantum = 1024;
2522 
2523 	q->tin_cnt = 4;
2524 
2525 	/* codepoint to class mapping */
2526 	q->tin_index = diffserv4;
2527 	q->tin_order = bulk_order;
2528 
2529 	/* class characteristics */
2530 	cake_set_rate(&q->tins[0], rate, mtu,
2531 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2532 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2533 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2534 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2535 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2536 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2537 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2538 
2539 	/* bandwidth-sharing weights */
2540 	q->tins[0].tin_quantum = quantum;
2541 	q->tins[1].tin_quantum = quantum >> 4;
2542 	q->tins[2].tin_quantum = quantum >> 1;
2543 	q->tins[3].tin_quantum = quantum >> 2;
2544 
2545 	return 0;
2546 }
2547 
2548 static int cake_config_diffserv3(struct Qdisc *sch)
2549 {
2550 /*  Simplified Diffserv structure with 3 tins.
2551  *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2552  *		Best Effort
2553  *		Low Priority		(LE, CS1)
2554  */
2555 	struct cake_sched_data *q = qdisc_priv(sch);
2556 	u32 mtu = psched_mtu(qdisc_dev(sch));
2557 	u64 rate = q->config->rate_bps;
2558 	u32 quantum = 1024;
2559 
2560 	q->tin_cnt = 3;
2561 
2562 	/* codepoint to class mapping */
2563 	q->tin_index = diffserv3;
2564 	q->tin_order = bulk_order;
2565 
2566 	/* class characteristics */
2567 	cake_set_rate(&q->tins[0], rate, mtu,
2568 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2569 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2570 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2571 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2572 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2573 
2574 	/* bandwidth-sharing weights */
2575 	q->tins[0].tin_quantum = quantum;
2576 	q->tins[1].tin_quantum = quantum >> 4;
2577 	q->tins[2].tin_quantum = quantum >> 2;
2578 
2579 	return 0;
2580 }
2581 
/* Recompute the tin layout, per-tin rates, cached shaper rate and
 * memory buffer limit from the current config.  Called from cake_init()
 * and, under sch_tree_lock(), from cake_change() and
 * cake_config_replace().
 */
static void cake_reconfigure(struct Qdisc *sch)
{
	struct cake_sched_data *qd = qdisc_priv(sch);
	struct cake_sched_config *q = qd->config;
	int c, ft;

	/* ft receives the config helper's return value (all helpers above
	 * currently return 0, i.e. the first tin)
	 */
	switch (q->tin_mode) {
	case CAKE_DIFFSERV_BESTEFFORT:
		ft = cake_config_besteffort(sch);
		break;

	case CAKE_DIFFSERV_PRECEDENCE:
		ft = cake_config_precedence(sch);
		break;

	case CAKE_DIFFSERV_DIFFSERV8:
		ft = cake_config_diffserv8(sch);
		break;

	case CAKE_DIFFSERV_DIFFSERV4:
		ft = cake_config_diffserv4(sch);
		break;

	case CAKE_DIFFSERV_DIFFSERV3:
	default:
		ft = cake_config_diffserv3(sch);
		break;
	}

	/* purge tins beyond the new tin count; copy mtu_time from tin ft so
	 * the unused tins keep a sane value
	 */
	for (c = qd->tin_cnt; c < CAKE_MAX_TINS; c++) {
		cake_clear_tin(sch, c);
		qd->tins[c].cparams.mtu_time = qd->tins[ft].cparams.mtu_time;
	}

	/* cache the base tin's rate for the global shaper */
	qd->rate_ns   = qd->tins[ft].tin_rate_ns;
	qd->rate_shft = qd->tins[ft].tin_rate_shft;

	if (q->buffer_config_limit) {
		/* explicit memory limit from userspace */
		qd->buffer_limit = q->buffer_config_limit;
	} else if (q->rate_bps) {
		/* default to ~4 intervals' worth of data at the configured
		 * rate, but at least 4 MiB
		 */
		u64 t = q->rate_bps * q->interval;

		do_div(t, USEC_PER_SEC / 4);
		qd->buffer_limit = max_t(u32, t, 4U << 20);
	} else {
		/* unlimited rate: effectively no memory limit here */
		qd->buffer_limit = ~0;
	}

	sch->flags &= ~TCQ_F_CAN_BYPASS;

	/* never exceed the packet-count limit expressed in bytes (or the
	 * explicit config limit, whichever is larger)
	 */
	qd->buffer_limit = min(qd->buffer_limit,
			       max(sch->limit * psched_mtu(qdisc_dev(sch)),
				   q->buffer_config_limit));
}
2636 
/* Parse TCA_CAKE_* netlink attributes into @q.  Only the attributes
 * present in @opt are touched; flag bits are accumulated locally and
 * published once at the end with WRITE_ONCE().  Sets *@overhead_changed
 * when the framing compensation (OVERHEAD/RAW) changed so the caller
 * can reset its packet-size tracking.  Returns 0 or a negative errno.
 */
static int cake_config_change(struct cake_sched_config *q, struct nlattr *opt,
			      struct netlink_ext_ack *extack, bool *overhead_changed)
{
	struct nlattr *tb[TCA_CAKE_MAX + 1];
	u16 rate_flags = q->rate_flags;
	u8 flow_mode = q->flow_mode;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
					  extack);
	if (err < 0)
		return err;

	if (tb[TCA_CAKE_NAT]) {
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
		flow_mode &= ~CAKE_FLOW_NAT_FLAG;
		flow_mode |= CAKE_FLOW_NAT_FLAG *
			!!nla_get_u32(tb[TCA_CAKE_NAT]);
#else
		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
				    "No conntrack support in kernel");
		return -EOPNOTSUPP;
#endif
	}

	if (tb[TCA_CAKE_AUTORATE]) {
		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) {
			/* autorate is per-queue; incompatible with a config
			 * shared across cake_mq sub-qdiscs
			 */
			if (q->is_shared) {
				NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_AUTORATE],
						    "Can't use autorate-ingress with cake_mq");
				return -EOPNOTSUPP;
			}
			rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
		} else {
			rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
		}
	}

	if (tb[TCA_CAKE_BASE_RATE64])
		WRITE_ONCE(q->rate_bps,
			   nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));

	if (tb[TCA_CAKE_DIFFSERV_MODE])
		WRITE_ONCE(q->tin_mode,
			   nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));

	if (tb[TCA_CAKE_WASH]) {
		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
			rate_flags |= CAKE_FLAG_WASH;
		else
			rate_flags &= ~CAKE_FLAG_WASH;
	}

	/* replace the hash keying mode but preserve the NAT flag */
	if (tb[TCA_CAKE_FLOW_MODE])
		flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) |
				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
					CAKE_FLOW_MASK));

	if (tb[TCA_CAKE_ATM])
		WRITE_ONCE(q->atm_mode,
			   nla_get_u32(tb[TCA_CAKE_ATM]));

	if (tb[TCA_CAKE_OVERHEAD]) {
		WRITE_ONCE(q->rate_overhead,
			   nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
		rate_flags |= CAKE_FLAG_OVERHEAD;
		*overhead_changed = true;
	}

	/* RAW cancels any overhead compensation */
	if (tb[TCA_CAKE_RAW]) {
		rate_flags &= ~CAKE_FLAG_OVERHEAD;
		*overhead_changed = true;
	}

	if (tb[TCA_CAKE_MPU])
		WRITE_ONCE(q->rate_mpu,
			   nla_get_u32(tb[TCA_CAKE_MPU]));

	/* interval and target are in microseconds, clamped to >= 1 */
	if (tb[TCA_CAKE_RTT]) {
		u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]);

		WRITE_ONCE(q->interval, max(interval, 1U));
	}

	if (tb[TCA_CAKE_TARGET]) {
		u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]);

		WRITE_ONCE(q->target, max(target, 1U));
	}

	if (tb[TCA_CAKE_INGRESS]) {
		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
			rate_flags |= CAKE_FLAG_INGRESS;
		else
			rate_flags &= ~CAKE_FLAG_INGRESS;
	}

	if (tb[TCA_CAKE_ACK_FILTER])
		WRITE_ONCE(q->ack_filter,
			   nla_get_u32(tb[TCA_CAKE_ACK_FILTER]));

	if (tb[TCA_CAKE_MEMORY])
		WRITE_ONCE(q->buffer_config_limit,
			   nla_get_u32(tb[TCA_CAKE_MEMORY]));

	if (tb[TCA_CAKE_SPLIT_GSO]) {
		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
			rate_flags |= CAKE_FLAG_SPLIT_GSO;
		else
			rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
	}

	if (tb[TCA_CAKE_FWMARK]) {
		WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK]));
		WRITE_ONCE(q->fwmark_shft,
			   q->fwmark_mask ? __ffs(q->fwmark_mask) : 0);
	}

	/* publish the accumulated flag/mode words last */
	WRITE_ONCE(q->rate_flags, rate_flags);
	WRITE_ONCE(q->flow_mode, flow_mode);

	return 0;
}
2760 
2761 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2762 		       struct netlink_ext_ack *extack)
2763 {
2764 	struct cake_sched_data *qd = qdisc_priv(sch);
2765 	struct cake_sched_config *q = qd->config;
2766 	bool overhead_changed = false;
2767 	int ret;
2768 
2769 	if (q->is_shared) {
2770 		NL_SET_ERR_MSG(extack, "can't reconfigure cake_mq sub-qdiscs");
2771 		return -EOPNOTSUPP;
2772 	}
2773 
2774 	ret = cake_config_change(q, opt, extack, &overhead_changed);
2775 	if (ret)
2776 		return ret;
2777 
2778 	if (overhead_changed) {
2779 		qd->max_netlen = 0;
2780 		qd->max_adjlen = 0;
2781 		qd->min_netlen = ~0;
2782 		qd->min_adjlen = ~0;
2783 	}
2784 
2785 	if (qd->tins) {
2786 		sch_tree_lock(sch);
2787 		cake_reconfigure(sch);
2788 		sch_tree_unlock(sch);
2789 	}
2790 
2791 	return 0;
2792 }
2793 
2794 static void cake_destroy(struct Qdisc *sch)
2795 {
2796 	struct cake_sched_data *q = qdisc_priv(sch);
2797 
2798 	qdisc_watchdog_cancel(&q->watchdog);
2799 	tcf_block_put(q->block);
2800 	kvfree(q->tins);
2801 	if (q->config && !q->config->is_shared)
2802 		kvfree(q->config);
2803 }
2804 
2805 static void cake_config_init(struct cake_sched_config *q, bool is_shared)
2806 {
2807 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2808 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2809 
2810 	q->rate_bps = 0; /* unlimited by default */
2811 
2812 	q->interval = 100000; /* 100ms default */
2813 	q->target   =   5000; /* 5ms: codel RFC argues
2814 			       * for 5 to 10% of interval
2815 			       */
2816 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2817 	q->is_shared = is_shared;
2818 	q->sync_time = 200 * NSEC_PER_USEC;
2819 }
2820 
/* Qdisc init: allocate a private config, apply any initial options,
 * set up the tcf block, the tin/flow structures and the overflow heap,
 * then derive the runtime parameters.  Returns 0 or a negative errno;
 * on failure the private config is freed here, the remaining state is
 * released by cake_destroy().
 */
static int cake_init(struct Qdisc *sch, struct nlattr *opt,
		     struct netlink_ext_ack *extack)
{
	struct cake_sched_data *qd = qdisc_priv(sch);
	struct cake_sched_config *q;
	int i, j, err;

	/* standalone instance: config is private, not shared */
	q = kzalloc(sizeof(*q), GFP_KERNEL);
	if (!q)
		return -ENOMEM;

	cake_config_init(q, false);

	sch->limit = 10240;
	sch->flags |= TCQ_F_DEQUEUE_DROPS;

	qd->cur_tin = 0;
	qd->cur_flow  = 0;
	qd->config = q;

	qdisc_watchdog_init(&qd->watchdog, sch);

	if (opt) {
		/* qd->tins is still NULL, so this only parses options */
		err = cake_change(sch, opt, extack);

		if (err)
			goto err;
	}

	err = tcf_block_get(&qd->block, &qd->filter_list, sch, extack);
	if (err)
		goto err;

	/* reciprocal table used for host-fairness weighting */
	quantum_div[0] = ~0;
	for (i = 1; i <= CAKE_QUEUES; i++)
		quantum_div[i] = 65535 / i;

	qd->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
			    GFP_KERNEL);
	if (!qd->tins) {
		err = -ENOMEM;
		goto err;
	}

	for (i = 0; i < CAKE_MAX_TINS; i++) {
		struct cake_tin_data *b = qd->tins + i;

		INIT_LIST_HEAD(&b->new_flows);
		INIT_LIST_HEAD(&b->old_flows);
		INIT_LIST_HEAD(&b->decaying_flows);
		b->sparse_flow_count = 0;
		b->bulk_flow_count = 0;
		b->decaying_flow_count = 0;

		for (j = 0; j < CAKE_QUEUES; j++) {
			struct cake_flow *flow = b->flows + j;
			/* heap slot for tin i / flow j; overflow_idx maps
			 * back from the flow to its heap position
			 */
			u32 k = j * CAKE_MAX_TINS + i;

			INIT_LIST_HEAD(&flow->flowchain);
			cobalt_vars_init(&flow->cvars);

			qd->overflow_heap[k].t = i;
			qd->overflow_heap[k].b = j;
			b->overflow_idx[j] = k;
		}
	}

	cake_reconfigure(sch);
	qd->avg_peak_bandwidth = q->rate_bps;
	qd->min_netlen = ~0;
	qd->min_adjlen = ~0;
	qd->active_queues = 0;
	qd->last_checked_active = 0;

	return 0;
err:
	/* free only the private config; cake_destroy() handles the rest */
	kvfree(qd->config);
	qd->config = NULL;
	return err;
}
2901 
2902 static void cake_config_replace(struct Qdisc *sch, struct cake_sched_config *cfg)
2903 {
2904 	struct cake_sched_data *qd = qdisc_priv(sch);
2905 	struct cake_sched_config *q = qd->config;
2906 
2907 	qd->config = cfg;
2908 
2909 	if (!q->is_shared)
2910 		kvfree(q);
2911 
2912 	cake_reconfigure(sch);
2913 }
2914 
/* Emit the configuration in @q as a nested TCA_OPTIONS attribute set.
 * Uses READ_ONCE() to pair with the WRITE_ONCE() updates in
 * cake_config_change().  Returns the nest size on success, -1 on a
 * netlink put failure.
 */
static int cake_config_dump(struct cake_sched_config *q, struct sk_buff *skb)
{
	struct nlattr *opts;
	u16 rate_flags;
	u8 flow_mode;

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (!opts)
		goto nla_put_failure;

	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64,
			      READ_ONCE(q->rate_bps), TCA_CAKE_PAD))
		goto nla_put_failure;

	/* flow mode and NAT flag are packed in the same byte; dump them
	 * as separate attributes
	 */
	flow_mode = READ_ONCE(q->flow_mode);
	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_MEMORY,
			READ_ONCE(q->buffer_config_limit)))
		goto nla_put_failure;

	rate_flags = READ_ONCE(q->rate_flags);
	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
			!!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
			!!(rate_flags & CAKE_FLAG_INGRESS)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_NAT,
			!!(flow_mode & CAKE_FLOW_NAT_FLAG)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_WASH,
			!!(rate_flags & CAKE_FLAG_WASH)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead)))
		goto nla_put_failure;

	/* RAW is reported only when overhead compensation is off */
	if (!(rate_flags & CAKE_FLAG_OVERHEAD))
		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
			goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
			!!(rate_flags & CAKE_FLAG_SPLIT_GSO)))
		goto nla_put_failure;

	if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask)))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	return -1;
}
2991 
2992 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2993 {
2994 	struct cake_sched_data *qd = qdisc_priv(sch);
2995 
2996 	return cake_config_dump(qd->config, skb);
2997 }
2998 
/* Dump qdisc-wide statistics plus one nested stats block per active
 * tin (in tin_order) under TCA_STATS_APP.  Returns the nest size on
 * success, -1 on a netlink put failure (with the nest cancelled).
 */
static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
	struct cake_sched_data *q = qdisc_priv(sch);
	struct nlattr *tstats, *ts;
	int i;

	if (!stats)
		return -1;

#define PUT_STAT_U32(attr, data) do {				       \
		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
			goto nla_put_failure;			       \
	} while (0)
#define PUT_STAT_U64(attr, data) do {				       \
		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
					data, TCA_CAKE_STATS_PAD)) \
			goto nla_put_failure;			       \
	} while (0)

	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
	/* avg_netoff is 16.16 fixed point; +0x8000 rounds to nearest */
	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
	PUT_STAT_U32(ACTIVE_QUEUES, q->active_queues);

#undef PUT_STAT_U32
#undef PUT_STAT_U64

	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
	if (!tstats)
		goto nla_put_failure;

#define PUT_TSTAT_U32(attr, data) do {					\
		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
			goto nla_put_failure;				\
	} while (0)
#define PUT_TSTAT_U64(attr, data) do {					\
		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
					data, TCA_CAKE_TIN_STATS_PAD))	\
			goto nla_put_failure;				\
	} while (0)

	/* one nested block per tin, in display (tin_order) order */
	for (i = 0; i < q->tin_cnt; i++) {
		struct cake_tin_data *b = &q->tins[q->tin_order[i]];

		ts = nla_nest_start_noflag(d->skb, i + 1);
		if (!ts)
			goto nla_put_failure;

		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);

		PUT_TSTAT_U32(TARGET_US,
			      ktime_to_us(ns_to_ktime(b->cparams.target)));
		PUT_TSTAT_U32(INTERVAL_US,
			      ktime_to_us(ns_to_ktime(b->cparams.interval)));

		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);

		PUT_TSTAT_U32(PEAK_DELAY_US,
			      ktime_to_us(ns_to_ktime(b->peak_delay)));
		PUT_TSTAT_U32(AVG_DELAY_US,
			      ktime_to_us(ns_to_ktime(b->avge_delay)));
		PUT_TSTAT_U32(BASE_DELAY_US,
			      ktime_to_us(ns_to_ktime(b->base_delay)));

		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);

		/* decaying flows are reported with the sparse count */
		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
					    b->decaying_flow_count);
		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);

		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
		nla_nest_end(d->skb, ts);
	}

#undef PUT_TSTAT_U32
#undef PUT_TSTAT_U64

	nla_nest_end(d->skb, tstats);
	return nla_nest_end(d->skb, stats);

nla_put_failure:
	nla_nest_cancel(d->skb, stats);
	return -1;
}
3098 
/* CAKE is classless from tc's point of view: its "classes" are internal
 * flow queues with no attachable child qdisc, so leaf lookup always
 * yields NULL.
 */
static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}
3103 
/* No user-configurable classes exist; returning 0 makes every lookup by
 * classid fail.
 */
static unsigned long cake_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}
3108 
/* Filter-to-class binding stub: returning 0 means no class is bound,
 * which is correct since CAKE exposes no bindable classes.
 */
static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
			       u32 classid)
{
	return 0;
}
3114 
/* Nothing was bound in cake_bind(), so there is nothing to release. */
static void cake_unbind(struct Qdisc *q, unsigned long cl)
{
}
3118 
3119 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
3120 					struct netlink_ext_ack *extack)
3121 {
3122 	struct cake_sched_data *q = qdisc_priv(sch);
3123 
3124 	if (cl)
3125 		return NULL;
3126 	return q->block;
3127 }
3128 
/* Fill in the class handle for a class dump; the minor number is the
 * (1-based) flow index encoded in cl.
 */
static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
			   struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}
3135 
/* Dump per-flow statistics for the "class" identified by cl.  Class IDs
 * are 1-based: idx = cl - 1 selects tin idx / CAKE_QUEUES (via the
 * tin_order display mapping) and flow idx % CAKE_QUEUES within it.
 * Emits generic queue stats plus a TCA_STATS_APP nest with cake's
 * per-flow AQM state.  Returns 0 on success, -1 on netlink failure.
 */
static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				 struct gnet_dump *d)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	const struct cake_flow *flow = NULL;
	struct gnet_stats_queue qs = { 0 };
	struct nlattr *stats;
	u32 idx = cl - 1;

	if (idx < CAKE_QUEUES * q->tin_cnt) {
		const struct cake_tin_data *b = \
			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
		const struct sk_buff *skb;

		flow = &b->flows[idx % CAKE_QUEUES];

		if (flow->head) {
			/* Walk the skb chain under the tree lock so the
			 * count can't race with enqueue/dequeue.
			 */
			sch_tree_lock(sch);
			skb = flow->head;
			while (skb) {
				qs.qlen++;
				skb = skb->next;
			}
			sch_tree_unlock(sch);
		}
		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
		qs.drops = flow->dropped;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	if (flow) {
		ktime_t now = ktime_get();

		/* App-specific nest: deficit, COBALT/BLUE state timers */
		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
		if (!stats)
			return -1;

#define PUT_STAT_U32(attr, data) do {				       \
		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
			goto nla_put_failure;			       \
	} while (0)
#define PUT_STAT_S32(attr, data) do {				       \
		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
			goto nla_put_failure;			       \
	} while (0)

		PUT_STAT_S32(DEFICIT, flow->deficit);
		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
		/* Timers are reported relative to "now"; only emitted
		 * when the corresponding state is active.
		 */
		if (flow->cvars.p_drop) {
			PUT_STAT_S32(BLUE_TIMER_US,
				     ktime_to_us(
					     ktime_sub(now,
						       flow->cvars.blue_timer)));
		}
		if (flow->cvars.dropping) {
			PUT_STAT_S32(DROP_NEXT_US,
				     ktime_to_us(
					     ktime_sub(now,
						       flow->cvars.drop_next)));
		}

		if (nla_nest_end(d->skb, stats) < 0)
			return -1;
	}

	return 0;

nla_put_failure:
	nla_nest_cancel(d->skb, stats);
	return -1;
}
3209 
/* Iterate over all flow queues, presenting each active flow as a class
 * to the walker callback.  Class IDs are i * CAKE_QUEUES + j + 1
 * (1-based).  Flows whose flowchain is unlinked (inactive) are counted
 * but not reported to the callback.
 */
static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cake_sched_data *q = qdisc_priv(sch);
	unsigned int i, j;

	if (arg->stop)
		return;

	for (i = 0; i < q->tin_cnt; i++) {
		struct cake_tin_data *b = &q->tins[q->tin_order[i]];

		for (j = 0; j < CAKE_QUEUES; j++) {
			if (list_empty(&b->flows[j].flowchain)) {
				arg->count++;
				continue;
			}
			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
						 arg))
				break;
		}
	}
}
3232 
/* Class ops: mostly stubs, since cake's "classes" are read-only views
 * of its internal flow queues (dump/dump_stats/walk are the useful ones).
 */
static const struct Qdisc_class_ops cake_class_ops = {
	.leaf		=	cake_leaf,
	.find		=	cake_find,
	.tcf_block	=	cake_tcf_block,
	.bind_tcf	=	cake_bind,
	.unbind_tcf	=	cake_unbind,
	.dump		=	cake_dump_class,
	.dump_stats	=	cake_dump_class_stats,
	.walk		=	cake_walk,
};
3243 
/* The plain "cake" qdisc, attachable to a single queue. */
static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
	.cl_ops		=	&cake_class_ops,
	.id		=	"cake",
	.priv_size	=	sizeof(struct cake_sched_data),
	.enqueue	=	cake_enqueue,
	.dequeue	=	cake_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	cake_init,
	.reset		=	cake_reset,
	.destroy	=	cake_destroy,
	.change		=	cake_change,
	.dump		=	cake_dump,
	.dump_stats	=	cake_dump_stats,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("cake");
3260 
/* Private data for the cake_mq root qdisc: generic mq state plus one
 * cake configuration shared by all per-txqueue child qdiscs.
 */
struct cake_mq_sched {
	struct mq_sched mq_priv; /* must be first */
	struct cake_sched_config cake_config;
};
3265 
/* Teardown: the generic mq destroy path releases all child qdiscs;
 * the shared config lives in our priv area and needs no extra freeing.
 */
static void cake_mq_destroy(struct Qdisc *sch)
{
	mq_destroy_common(sch);
}
3270 
3271 static int cake_mq_init(struct Qdisc *sch, struct nlattr *opt,
3272 			struct netlink_ext_ack *extack)
3273 {
3274 	struct cake_mq_sched *priv = qdisc_priv(sch);
3275 	struct net_device *dev = qdisc_dev(sch);
3276 	int ret, ntx;
3277 	bool _unused;
3278 
3279 	cake_config_init(&priv->cake_config, true);
3280 	if (opt) {
3281 		ret = cake_config_change(&priv->cake_config, opt, extack, &_unused);
3282 		if (ret)
3283 			return ret;
3284 	}
3285 
3286 	ret = mq_init_common(sch, opt, extack, &cake_qdisc_ops);
3287 	if (ret)
3288 		return ret;
3289 
3290 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++)
3291 		cake_config_replace(priv->mq_priv.qdiscs[ntx], &priv->cake_config);
3292 
3293 	return 0;
3294 }
3295 
/* Dump the mq-level attributes followed by the shared cake options.
 * Returns the result of the cake config dump (negative on failure).
 */
static int cake_mq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cake_mq_sched *priv = qdisc_priv(sch);

	mq_dump_common(sch, skb);
	return cake_config_dump(&priv->cake_config, skb);
}
3303 
/* Apply changed cake options to the shared config, then push the new
 * settings to every per-txqueue child.  When the overhead-compensation
 * parameters changed, reset each child's min/max packet-length tracking
 * so the stats reflect the new accounting.  Children that have not
 * finished initialization (qd->tins == NULL) are skipped.
 *
 * Returns 0 on success or a negative errno from cake_config_change().
 */
static int cake_mq_change(struct Qdisc *sch, struct nlattr *opt,
			  struct netlink_ext_ack *extack)
{
	struct cake_mq_sched *priv = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	bool overhead_changed = false;
	unsigned int ntx;
	int ret;

	ret = cake_config_change(&priv->cake_config, opt, extack, &overhead_changed);
	if (ret)
		return ret;

	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
		struct Qdisc *chld = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
		struct cake_sched_data *qd = qdisc_priv(chld);

		if (overhead_changed) {
			/* restart the packet-size observation range */
			qd->max_netlen = 0;
			qd->max_adjlen = 0;
			qd->min_netlen = ~0;
			qd->min_adjlen = ~0;
		}

		if (qd->tins) {
			/* reconfigure under the child's tree lock */
			sch_tree_lock(chld);
			cake_reconfigure(chld);
			sch_tree_unlock(chld);
		}
	}

	return 0;
}
3337 
/* Children of cake_mq are created and owned internally; refuse any
 * attempt to graft a different qdisc in their place.
 */
static int cake_mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
			 struct Qdisc **old, struct netlink_ext_ack *extack)
{
	NL_SET_ERR_MSG(extack, "can't replace cake_mq sub-qdiscs");
	return -EOPNOTSUPP;
}
3344 
/* Class ops for cake_mq: reuse the generic mq implementations except
 * for graft, which is rejected to keep the cake children in place.
 */
static const struct Qdisc_class_ops cake_mq_class_ops = {
	.select_queue	= mq_select_queue,
	.graft		= cake_mq_graft,
	.leaf		= mq_leaf,
	.find		= mq_find,
	.walk		= mq_walk,
	.dump		= mq_dump_class,
	.dump_stats	= mq_dump_class_stats,
};
3354 
/* "cake_mq": an mq-style root that instantiates one cake qdisc per TX
 * queue, all driven by a single shared configuration.
 */
static struct Qdisc_ops cake_mq_qdisc_ops __read_mostly = {
	.cl_ops		=	&cake_mq_class_ops,
	.id		=	"cake_mq",
	.priv_size	=	sizeof(struct cake_mq_sched),
	.init		=	cake_mq_init,
	.destroy	=	cake_mq_destroy,
	.attach		=	mq_attach,
	.change		=	cake_mq_change,
	.change_real_num_tx = mq_change_real_num_tx,
	.dump		=	cake_mq_dump,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("cake_mq");
3368 
3369 static int __init cake_module_init(void)
3370 {
3371 	int ret;
3372 
3373 	ret = register_qdisc(&cake_qdisc_ops);
3374 	if (ret)
3375 		return ret;
3376 
3377 	ret = register_qdisc(&cake_mq_qdisc_ops);
3378 	if (ret)
3379 		unregister_qdisc(&cake_qdisc_ops);
3380 
3381 	return ret;
3382 }
3383 
/* Unregister both qdisc flavours on module unload. */
static void __exit cake_module_exit(void)
{
	unregister_qdisc(&cake_qdisc_ops);
	unregister_qdisc(&cake_mq_qdisc_ops);
}
3389 
/* Module entry points and metadata */
module_init(cake_module_init)
module_exit(cake_module_exit)
MODULE_AUTHOR("Jonathan Morton");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("The CAKE shaper.");
MODULE_IMPORT_NS("NET_SCHED_INTERNAL");
3396