xref: /linux/net/sched/sch_cake.c (revision 94901b7a74d82bfd30420f1d9d00898278fdc8bf)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/pkt_cls.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73 
74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75 #include <net/netfilter/nf_conntrack_core.h>
76 #endif
77 
78 #define CAKE_SET_WAYS (8)
79 #define CAKE_MAX_TINS (8)
80 #define CAKE_QUEUES (1024)
81 #define CAKE_FLOW_MASK 63
82 #define CAKE_FLOW_NAT_FLAG 64
83 
84 /* struct cobalt_params - contains codel and blue parameters
85  * @interval:	codel initial drop rate
86  * @target:     maximum persistent sojourn time & blue update rate
87  * @mtu_time:   serialisation delay of maximum-size packet
88  * @p_inc:      increment of blue drop probability (0.32 fxp)
89  * @p_dec:      decrement of blue drop probability (0.32 fxp)
90  */
91 struct cobalt_params {
92 	u64	interval;
93 	u64	target;
94 	u64	mtu_time;
95 	u32	p_inc;
96 	u32	p_dec;
97 };
98 
99 /* struct cobalt_vars - contains codel and blue variables
100  * @count:		codel dropping frequency
101  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
102  * @drop_next:		time to drop next packet, or when we dropped last
103  * @blue_timer:		Blue time to next drop
104  * @p_drop:		BLUE drop probability (0.32 fxp)
105  * @dropping:		set if in dropping state
106  * @ecn_marked:		set if marked
107  */
108 struct cobalt_vars {
109 	u32	count;
110 	u32	rec_inv_sqrt;
111 	ktime_t	drop_next;
112 	ktime_t	blue_timer;
113 	u32     p_drop;
114 	bool	dropping;
115 	bool    ecn_marked;
116 };
117 
118 enum {
119 	CAKE_SET_NONE = 0,
120 	CAKE_SET_SPARSE,
121 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122 	CAKE_SET_BULK,
123 	CAKE_SET_DECAYING
124 };
125 
126 struct cake_flow {
127 	/* this stuff is all needed per-flow at dequeue time */
128 	struct sk_buff	  *head;
129 	struct sk_buff	  *tail;
130 	struct list_head  flowchain;
131 	s32		  deficit;
132 	u32		  dropped;
133 	struct cobalt_vars cvars;
134 	u16		  srchost; /* index into cake_host table */
135 	u16		  dsthost;
136 	u8		  set;
137 }; /* please try to keep this structure <= 64 bytes */
138 
139 struct cake_host {
140 	u32 srchost_tag;
141 	u32 dsthost_tag;
142 	u16 srchost_bulk_flow_count;
143 	u16 dsthost_bulk_flow_count;
144 };
145 
146 struct cake_heap_entry {
147 	u16 t:3, b:10;
148 };
149 
150 struct cake_tin_data {
151 	struct cake_flow flows[CAKE_QUEUES];
152 	u32	backlogs[CAKE_QUEUES];
153 	u32	tags[CAKE_QUEUES]; /* for set association */
154 	u16	overflow_idx[CAKE_QUEUES];
155 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156 	u16	flow_quantum;
157 
158 	struct cobalt_params cparams;
159 	u32	drop_overlimit;
160 	u16	bulk_flow_count;
161 	u16	sparse_flow_count;
162 	u16	decaying_flow_count;
163 	u16	unresponsive_flow_count;
164 
165 	u32	max_skblen;
166 
167 	struct list_head new_flows;
168 	struct list_head old_flows;
169 	struct list_head decaying_flows;
170 
171 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172 	ktime_t	time_next_packet;
173 	u64	tin_rate_ns;
174 	u64	tin_rate_bps;
175 	u16	tin_rate_shft;
176 
177 	u16	tin_quantum;
178 	s32	tin_deficit;
179 	u32	tin_backlog;
180 	u32	tin_dropped;
181 	u32	tin_ecn_mark;
182 
183 	u32	packets;
184 	u64	bytes;
185 
186 	u32	ack_drops;
187 
188 	/* moving averages */
189 	u64 avge_delay;
190 	u64 peak_delay;
191 	u64 base_delay;
192 
193 	/* hash function stats */
194 	u32	way_directs;
195 	u32	way_hits;
196 	u32	way_misses;
197 	u32	way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199 
200 struct cake_sched_data {
201 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
202 	struct tcf_block *block;
203 	struct cake_tin_data *tins;
204 
205 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206 	u16		overflow_timeout;
207 
208 	u16		tin_cnt;
209 	u8		tin_mode;
210 	u8		flow_mode;
211 	u8		ack_filter;
212 	u8		atm_mode;
213 
214 	u32		fwmark_mask;
215 	u16		fwmark_shft;
216 
217 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
218 	u16		rate_shft;
219 	ktime_t		time_next_packet;
220 	ktime_t		failsafe_next_packet;
221 	u64		rate_ns;
222 	u64		rate_bps;
223 	u16		rate_flags;
224 	s16		rate_overhead;
225 	u16		rate_mpu;
226 	u64		interval;
227 	u64		target;
228 
229 	/* resource tracking */
230 	u32		buffer_used;
231 	u32		buffer_max_used;
232 	u32		buffer_limit;
233 	u32		buffer_config_limit;
234 
235 	/* indices for dequeue */
236 	u16		cur_tin;
237 	u16		cur_flow;
238 
239 	struct qdisc_watchdog watchdog;
240 	const u8	*tin_index;
241 	const u8	*tin_order;
242 
243 	/* bandwidth capacity estimate */
244 	ktime_t		last_packet_time;
245 	ktime_t		avg_window_begin;
246 	u64		avg_packet_interval;
247 	u64		avg_window_bytes;
248 	u64		avg_peak_bandwidth;
249 	ktime_t		last_reconfig_time;
250 
251 	/* packet length stats */
252 	u32		avg_netoff;
253 	u16		max_netlen;
254 	u16		max_adjlen;
255 	u16		min_netlen;
256 	u16		min_adjlen;
257 };
258 
259 enum {
260 	CAKE_FLAG_OVERHEAD	   = BIT(0),
261 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
262 	CAKE_FLAG_INGRESS	   = BIT(2),
263 	CAKE_FLAG_WASH		   = BIT(3),
264 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
265 };
266 
267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
268  * obtain the best features of each.  Codel is excellent on flows which
269  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
270  * unresponsive flows.
271  */
272 
273 struct cobalt_skb_cb {
274 	ktime_t enqueue_time;
275 	u32     adjusted_len;
276 };
277 
278 static u64 us_to_ns(u64 us)
279 {
280 	return us * NSEC_PER_USEC;
281 }
282 
283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
284 {
285 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
287 }
288 
289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
290 {
291 	return get_cobalt_cb(skb)->enqueue_time;
292 }
293 
294 static void cobalt_set_enqueue_time(struct sk_buff *skb,
295 				    ktime_t now)
296 {
297 	get_cobalt_cb(skb)->enqueue_time = now;
298 }
299 
300 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
301 
302 /* Diffserv lookup tables */
303 
304 static const u8 precedence[] = {
305 	0, 0, 0, 0, 0, 0, 0, 0,
306 	1, 1, 1, 1, 1, 1, 1, 1,
307 	2, 2, 2, 2, 2, 2, 2, 2,
308 	3, 3, 3, 3, 3, 3, 3, 3,
309 	4, 4, 4, 4, 4, 4, 4, 4,
310 	5, 5, 5, 5, 5, 5, 5, 5,
311 	6, 6, 6, 6, 6, 6, 6, 6,
312 	7, 7, 7, 7, 7, 7, 7, 7,
313 };
314 
315 static const u8 diffserv8[] = {
316 	2, 0, 1, 2, 4, 2, 2, 2,
317 	1, 2, 1, 2, 1, 2, 1, 2,
318 	5, 2, 4, 2, 4, 2, 4, 2,
319 	3, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 3, 2, 3, 2, 3, 2,
321 	6, 2, 2, 2, 6, 2, 6, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 	7, 2, 2, 2, 2, 2, 2, 2,
324 };
325 
326 static const u8 diffserv4[] = {
327 	0, 1, 0, 0, 2, 0, 0, 0,
328 	1, 0, 0, 0, 0, 0, 0, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	2, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 2, 0, 2, 0, 2, 0,
332 	3, 0, 0, 0, 3, 0, 3, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 	3, 0, 0, 0, 0, 0, 0, 0,
335 };
336 
337 static const u8 diffserv3[] = {
338 	0, 1, 0, 0, 2, 0, 0, 0,
339 	1, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 0, 0, 0, 0,
343 	0, 0, 0, 0, 2, 0, 2, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 	2, 0, 0, 0, 0, 0, 0, 0,
346 };
347 
348 static const u8 besteffort[] = {
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 	0, 0, 0, 0, 0, 0, 0, 0,
357 };
358 
359 /* tin priority order for stats dumping */
360 
361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
362 static const u8 bulk_order[] = {1, 0, 2, 3};
363 
364 /* There is a big difference in timing between the accurate values placed in the
365  * cache and the approximations given by a single Newton step for small count
366  * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
367  * these values are calculated using eight Newton steps, using the
368  * implementation below. Above 16, a single Newton step gives sufficient
369  * accuracy in either direction, given the precision stored.
370  *
371  * The magnitude of the error when stepping up to count 2 is such as to give the
372  * value that *should* have been produced at count 4.
373  */
374 
375 #define REC_INV_SQRT_CACHE (16)
376 static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
377 		~0,         ~0, 3037000500, 2479700525,
378 	2147483647, 1920767767, 1753413056, 1623345051,
379 	1518500250, 1431655765, 1358187914, 1294981364,
380 	1239850263, 1191209601, 1147878294, 1108955788
381 };
382 
383 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
384  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
385  *
386  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
387  */
388 
389 static void cobalt_newton_step(struct cobalt_vars *vars)
390 {
391 	u32 invsqrt, invsqrt2;
392 	u64 val;
393 
394 	invsqrt = vars->rec_inv_sqrt;
395 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
396 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
397 
398 	val >>= 2; /* avoid overflow in following multiply */
399 	val = (val * invsqrt) >> (32 - 2 + 1);
400 
401 	vars->rec_inv_sqrt = val;
402 }
403 
404 static void cobalt_invsqrt(struct cobalt_vars *vars)
405 {
406 	if (vars->count < REC_INV_SQRT_CACHE)
407 		vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
408 	else
409 		cobalt_newton_step(vars);
410 }
411 
412 static void cobalt_vars_init(struct cobalt_vars *vars)
413 {
414 	memset(vars, 0, sizeof(*vars));
415 }
416 
417 /* CoDel control_law is t + interval/sqrt(count)
418  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
419  * both sqrt() and divide operation.
420  */
421 static ktime_t cobalt_control(ktime_t t,
422 			      u64 interval,
423 			      u32 rec_inv_sqrt)
424 {
425 	return ktime_add_ns(t, reciprocal_scale(interval,
426 						rec_inv_sqrt));
427 }
428 
429 /* Call this when a packet had to be dropped due to queue overflow.  Returns
430  * true if the BLUE state was quiescent before but active after this call.
431  */
432 static bool cobalt_queue_full(struct cobalt_vars *vars,
433 			      struct cobalt_params *p,
434 			      ktime_t now)
435 {
436 	bool up = false;
437 
438 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
439 		up = !vars->p_drop;
440 		vars->p_drop += p->p_inc;
441 		if (vars->p_drop < p->p_inc)
442 			vars->p_drop = ~0;
443 		vars->blue_timer = now;
444 	}
445 	vars->dropping = true;
446 	vars->drop_next = now;
447 	if (!vars->count)
448 		vars->count = 1;
449 
450 	return up;
451 }
452 
453 /* Call this when the queue was serviced but turned out to be empty.  Returns
454  * true if the BLUE state was active before but quiescent after this call.
455  */
456 static bool cobalt_queue_empty(struct cobalt_vars *vars,
457 			       struct cobalt_params *p,
458 			       ktime_t now)
459 {
460 	bool down = false;
461 
462 	if (vars->p_drop &&
463 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
464 		if (vars->p_drop < p->p_dec)
465 			vars->p_drop = 0;
466 		else
467 			vars->p_drop -= p->p_dec;
468 		vars->blue_timer = now;
469 		down = !vars->p_drop;
470 	}
471 	vars->dropping = false;
472 
473 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
474 		vars->count--;
475 		cobalt_invsqrt(vars);
476 		vars->drop_next = cobalt_control(vars->drop_next,
477 						 p->interval,
478 						 vars->rec_inv_sqrt);
479 	}
480 
481 	return down;
482 }
483 
484 /* Call this with a freshly dequeued packet for possible congestion marking.
485  * Returns true as an instruction to drop the packet, false for delivery.
486  */
487 static bool cobalt_should_drop(struct cobalt_vars *vars,
488 			       struct cobalt_params *p,
489 			       ktime_t now,
490 			       struct sk_buff *skb,
491 			       u32 bulk_flows)
492 {
493 	bool next_due, over_target, drop = false;
494 	ktime_t schedule;
495 	u64 sojourn;
496 
497 /* The 'schedule' variable records, in its sign, whether 'now' is before or
498  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
499  * scheduling decision is actually branched, without destroying that
500  * information.  Similarly, the first 'schedule' value calculated is preserved
501  * in the boolean 'next_due'.
502  *
503  * As for 'drop_next', we take advantage of the fact that 'interval' is both
504  * the delay between first exceeding 'target' and the first signalling event,
505  * *and* the scaling factor for the signalling frequency.  It's therefore very
506  * natural to use a single mechanism for both purposes, and eliminates a
507  * significant amount of reference Codel's spaghetti code.  To help with this,
508  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
509  * as possible to 1.0 in fixed-point.
510  */
511 
512 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
513 	schedule = ktime_sub(now, vars->drop_next);
514 	over_target = sojourn > p->target &&
515 		      sojourn > p->mtu_time * bulk_flows * 2 &&
516 		      sojourn > p->mtu_time * 4;
517 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
518 
519 	vars->ecn_marked = false;
520 
521 	if (over_target) {
522 		if (!vars->dropping) {
523 			vars->dropping = true;
524 			vars->drop_next = cobalt_control(now,
525 							 p->interval,
526 							 vars->rec_inv_sqrt);
527 		}
528 		if (!vars->count)
529 			vars->count = 1;
530 	} else if (vars->dropping) {
531 		vars->dropping = false;
532 	}
533 
534 	if (next_due && vars->dropping) {
535 		/* Use ECN mark if possible, otherwise drop */
536 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
537 
538 		vars->count++;
539 		if (!vars->count)
540 			vars->count--;
541 		cobalt_invsqrt(vars);
542 		vars->drop_next = cobalt_control(vars->drop_next,
543 						 p->interval,
544 						 vars->rec_inv_sqrt);
545 		schedule = ktime_sub(now, vars->drop_next);
546 	} else {
547 		while (next_due) {
548 			vars->count--;
549 			cobalt_invsqrt(vars);
550 			vars->drop_next = cobalt_control(vars->drop_next,
551 							 p->interval,
552 							 vars->rec_inv_sqrt);
553 			schedule = ktime_sub(now, vars->drop_next);
554 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
555 		}
556 	}
557 
558 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
559 	if (vars->p_drop)
560 		drop |= (get_random_u32() < vars->p_drop);
561 
562 	/* Overload the drop_next field as an activity timeout */
563 	if (!vars->count)
564 		vars->drop_next = ktime_add_ns(now, p->interval);
565 	else if (ktime_to_ns(schedule) > 0 && !drop)
566 		vars->drop_next = now;
567 
568 	return drop;
569 }
570 
571 static bool cake_update_flowkeys(struct flow_keys *keys,
572 				 const struct sk_buff *skb)
573 {
574 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
575 	struct nf_conntrack_tuple tuple = {};
576 	bool rev = !skb->_nfct, upd = false;
577 	__be32 ip;
578 
579 	if (skb_protocol(skb, true) != htons(ETH_P_IP))
580 		return false;
581 
582 	if (!nf_ct_get_tuple_skb(&tuple, skb))
583 		return false;
584 
585 	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
586 	if (ip != keys->addrs.v4addrs.src) {
587 		keys->addrs.v4addrs.src = ip;
588 		upd = true;
589 	}
590 	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
591 	if (ip != keys->addrs.v4addrs.dst) {
592 		keys->addrs.v4addrs.dst = ip;
593 		upd = true;
594 	}
595 
596 	if (keys->ports.ports) {
597 		__be16 port;
598 
599 		port = rev ? tuple.dst.u.all : tuple.src.u.all;
600 		if (port != keys->ports.src) {
601 			keys->ports.src = port;
602 			upd = true;
603 		}
604 		port = rev ? tuple.src.u.all : tuple.dst.u.all;
605 		if (port != keys->ports.dst) {
606 			port = keys->ports.dst;
607 			upd = true;
608 		}
609 	}
610 	return upd;
611 #else
612 	return false;
613 #endif
614 }
615 
616 /* Cake has several subtle multiple bit settings. In these cases you
617  *  would be matching triple isolate mode as well.
618  */
619 
620 static bool cake_dsrc(int flow_mode)
621 {
622 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
623 }
624 
625 static bool cake_ddst(int flow_mode)
626 {
627 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
628 }
629 
630 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
631 		     int flow_mode, u16 flow_override, u16 host_override)
632 {
633 	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
634 	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
635 	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
636 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
637 	u16 reduced_hash, srchost_idx, dsthost_idx;
638 	struct flow_keys keys, host_keys;
639 	bool use_skbhash = skb->l4_hash;
640 
641 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
642 		return 0;
643 
644 	/* If both overrides are set, or we can use the SKB hash and nat mode is
645 	 * disabled, we can skip packet dissection entirely. If nat mode is
646 	 * enabled there's another check below after doing the conntrack lookup.
647 	 */
648 	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
649 		goto skip_hash;
650 
651 	skb_flow_dissect_flow_keys(skb, &keys,
652 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
653 
654 	/* Don't use the SKB hash if we change the lookup keys from conntrack */
655 	if (nat_enabled && cake_update_flowkeys(&keys, skb))
656 		use_skbhash = false;
657 
658 	/* If we can still use the SKB hash and don't need the host hash, we can
659 	 * skip the rest of the hashing procedure
660 	 */
661 	if (use_skbhash && !hash_hosts)
662 		goto skip_hash;
663 
664 	/* flow_hash_from_keys() sorts the addresses by value, so we have
665 	 * to preserve their order in a separate data structure to treat
666 	 * src and dst host addresses as independently selectable.
667 	 */
668 	host_keys = keys;
669 	host_keys.ports.ports     = 0;
670 	host_keys.basic.ip_proto  = 0;
671 	host_keys.keyid.keyid     = 0;
672 	host_keys.tags.flow_label = 0;
673 
674 	switch (host_keys.control.addr_type) {
675 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
676 		host_keys.addrs.v4addrs.src = 0;
677 		dsthost_hash = flow_hash_from_keys(&host_keys);
678 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
679 		host_keys.addrs.v4addrs.dst = 0;
680 		srchost_hash = flow_hash_from_keys(&host_keys);
681 		break;
682 
683 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
684 		memset(&host_keys.addrs.v6addrs.src, 0,
685 		       sizeof(host_keys.addrs.v6addrs.src));
686 		dsthost_hash = flow_hash_from_keys(&host_keys);
687 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
688 		memset(&host_keys.addrs.v6addrs.dst, 0,
689 		       sizeof(host_keys.addrs.v6addrs.dst));
690 		srchost_hash = flow_hash_from_keys(&host_keys);
691 		break;
692 
693 	default:
694 		dsthost_hash = 0;
695 		srchost_hash = 0;
696 	}
697 
698 	/* This *must* be after the above switch, since as a
699 	 * side-effect it sorts the src and dst addresses.
700 	 */
701 	if (hash_flows && !use_skbhash)
702 		flow_hash = flow_hash_from_keys(&keys);
703 
704 skip_hash:
705 	if (flow_override)
706 		flow_hash = flow_override - 1;
707 	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
708 		flow_hash = skb->hash;
709 	if (host_override) {
710 		dsthost_hash = host_override - 1;
711 		srchost_hash = host_override - 1;
712 	}
713 
714 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
715 		if (flow_mode & CAKE_FLOW_SRC_IP)
716 			flow_hash ^= srchost_hash;
717 
718 		if (flow_mode & CAKE_FLOW_DST_IP)
719 			flow_hash ^= dsthost_hash;
720 	}
721 
722 	reduced_hash = flow_hash % CAKE_QUEUES;
723 
724 	/* set-associative hashing */
725 	/* fast path if no hash collision (direct lookup succeeds) */
726 	if (likely(q->tags[reduced_hash] == flow_hash &&
727 		   q->flows[reduced_hash].set)) {
728 		q->way_directs++;
729 	} else {
730 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
731 		u32 outer_hash = reduced_hash - inner_hash;
732 		bool allocate_src = false;
733 		bool allocate_dst = false;
734 		u32 i, k;
735 
736 		/* check if any active queue in the set is reserved for
737 		 * this flow.
738 		 */
739 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
740 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
741 			if (q->tags[outer_hash + k] == flow_hash) {
742 				if (i)
743 					q->way_hits++;
744 
745 				if (!q->flows[outer_hash + k].set) {
746 					/* need to increment host refcnts */
747 					allocate_src = cake_dsrc(flow_mode);
748 					allocate_dst = cake_ddst(flow_mode);
749 				}
750 
751 				goto found;
752 			}
753 		}
754 
755 		/* no queue is reserved for this flow, look for an
756 		 * empty one.
757 		 */
758 		for (i = 0; i < CAKE_SET_WAYS;
759 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
760 			if (!q->flows[outer_hash + k].set) {
761 				q->way_misses++;
762 				allocate_src = cake_dsrc(flow_mode);
763 				allocate_dst = cake_ddst(flow_mode);
764 				goto found;
765 			}
766 		}
767 
768 		/* With no empty queues, default to the original
769 		 * queue, accept the collision, update the host tags.
770 		 */
771 		q->way_collisions++;
772 		allocate_src = cake_dsrc(flow_mode);
773 		allocate_dst = cake_ddst(flow_mode);
774 
775 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
776 			if (allocate_src)
777 				q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
778 			if (allocate_dst)
779 				q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
780 		}
781 found:
782 		/* reserve queue for future packets in same flow */
783 		reduced_hash = outer_hash + k;
784 		q->tags[reduced_hash] = flow_hash;
785 
786 		if (allocate_src) {
787 			srchost_idx = srchost_hash % CAKE_QUEUES;
788 			inner_hash = srchost_idx % CAKE_SET_WAYS;
789 			outer_hash = srchost_idx - inner_hash;
790 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
791 				i++, k = (k + 1) % CAKE_SET_WAYS) {
792 				if (q->hosts[outer_hash + k].srchost_tag ==
793 				    srchost_hash)
794 					goto found_src;
795 			}
796 			for (i = 0; i < CAKE_SET_WAYS;
797 				i++, k = (k + 1) % CAKE_SET_WAYS) {
798 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
799 					break;
800 			}
801 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
802 found_src:
803 			srchost_idx = outer_hash + k;
804 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
805 				q->hosts[srchost_idx].srchost_bulk_flow_count++;
806 			q->flows[reduced_hash].srchost = srchost_idx;
807 		}
808 
809 		if (allocate_dst) {
810 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
811 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
812 			outer_hash = dsthost_idx - inner_hash;
813 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
814 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
815 				if (q->hosts[outer_hash + k].dsthost_tag ==
816 				    dsthost_hash)
817 					goto found_dst;
818 			}
819 			for (i = 0; i < CAKE_SET_WAYS;
820 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
821 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
822 					break;
823 			}
824 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
825 found_dst:
826 			dsthost_idx = outer_hash + k;
827 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
828 				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
829 			q->flows[reduced_hash].dsthost = dsthost_idx;
830 		}
831 	}
832 
833 	return reduced_hash;
834 }
835 
836 /* helper functions : might be changed when/if skb use a standard list_head */
837 /* remove one skb from head of slot queue */
838 
839 static struct sk_buff *dequeue_head(struct cake_flow *flow)
840 {
841 	struct sk_buff *skb = flow->head;
842 
843 	if (skb) {
844 		flow->head = skb->next;
845 		skb_mark_not_on_list(skb);
846 	}
847 
848 	return skb;
849 }
850 
851 /* add skb to flow queue (tail add) */
852 
853 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
854 {
855 	if (!flow->head)
856 		flow->head = skb;
857 	else
858 		flow->tail->next = skb;
859 	flow->tail = skb;
860 	skb->next = NULL;
861 }
862 
863 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
864 				    struct ipv6hdr *buf)
865 {
866 	unsigned int offset = skb_network_offset(skb);
867 	struct iphdr *iph;
868 
869 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
870 
871 	if (!iph)
872 		return NULL;
873 
874 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
875 		return skb_header_pointer(skb, offset + iph->ihl * 4,
876 					  sizeof(struct ipv6hdr), buf);
877 
878 	else if (iph->version == 4)
879 		return iph;
880 
881 	else if (iph->version == 6)
882 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
883 					  buf);
884 
885 	return NULL;
886 }
887 
888 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
889 				      void *buf, unsigned int bufsize)
890 {
891 	unsigned int offset = skb_network_offset(skb);
892 	const struct ipv6hdr *ipv6h;
893 	const struct tcphdr *tcph;
894 	const struct iphdr *iph;
895 	struct ipv6hdr _ipv6h;
896 	struct tcphdr _tcph;
897 
898 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
899 
900 	if (!ipv6h)
901 		return NULL;
902 
903 	if (ipv6h->version == 4) {
904 		iph = (struct iphdr *)ipv6h;
905 		offset += iph->ihl * 4;
906 
907 		/* special-case 6in4 tunnelling, as that is a common way to get
908 		 * v6 connectivity in the home
909 		 */
910 		if (iph->protocol == IPPROTO_IPV6) {
911 			ipv6h = skb_header_pointer(skb, offset,
912 						   sizeof(_ipv6h), &_ipv6h);
913 
914 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
915 				return NULL;
916 
917 			offset += sizeof(struct ipv6hdr);
918 
919 		} else if (iph->protocol != IPPROTO_TCP) {
920 			return NULL;
921 		}
922 
923 	} else if (ipv6h->version == 6) {
924 		if (ipv6h->nexthdr != IPPROTO_TCP)
925 			return NULL;
926 
927 		offset += sizeof(struct ipv6hdr);
928 	} else {
929 		return NULL;
930 	}
931 
932 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
933 	if (!tcph || tcph->doff < 5)
934 		return NULL;
935 
936 	return skb_header_pointer(skb, offset,
937 				  min(__tcp_hdrlen(tcph), bufsize), buf);
938 }
939 
940 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
941 				   int code, int *oplen)
942 {
943 	/* inspired by tcp_parse_options in tcp_input.c */
944 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
945 	const u8 *ptr = (const u8 *)(tcph + 1);
946 
947 	while (length > 0) {
948 		int opcode = *ptr++;
949 		int opsize;
950 
951 		if (opcode == TCPOPT_EOL)
952 			break;
953 		if (opcode == TCPOPT_NOP) {
954 			length--;
955 			continue;
956 		}
957 		if (length < 2)
958 			break;
959 		opsize = *ptr++;
960 		if (opsize < 2 || opsize > length)
961 			break;
962 
963 		if (opcode == code) {
964 			*oplen = opsize;
965 			return ptr;
966 		}
967 
968 		ptr += opsize - 2;
969 		length -= opsize;
970 	}
971 
972 	return NULL;
973 }
974 
975 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
976  * bytes than the other. In the case where both sequences ACKs bytes that the
977  * other doesn't, A is considered greater. DSACKs in A also makes A be
978  * considered greater.
979  *
980  * @return -1, 0 or 1 as normal compare functions
981  */
982 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
983 				  const struct tcphdr *tcph_b)
984 {
985 	const struct tcp_sack_block_wire *sack_a, *sack_b;
986 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
987 	u32 bytes_a = 0, bytes_b = 0;
988 	int oplen_a, oplen_b;
989 	bool first = true;
990 
991 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
992 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
993 
994 	/* pointers point to option contents */
995 	oplen_a -= TCPOLEN_SACK_BASE;
996 	oplen_b -= TCPOLEN_SACK_BASE;
997 
998 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
999 	    (!sack_b || oplen_b < sizeof(*sack_b)))
1000 		return -1;
1001 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1002 		 (!sack_a || oplen_a < sizeof(*sack_a)))
1003 		return 1;
1004 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1005 		 (!sack_b || oplen_b < sizeof(*sack_b)))
1006 		return 0;
1007 
1008 	while (oplen_a >= sizeof(*sack_a)) {
1009 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1010 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1011 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1012 		int oplen_tmp = oplen_b;
1013 		bool found = false;
1014 
1015 		/* DSACK; always considered greater to prevent dropping */
1016 		if (before(start_a, ack_seq_a))
1017 			return -1;
1018 
1019 		bytes_a += end_a - start_a;
1020 
1021 		while (oplen_tmp >= sizeof(*sack_tmp)) {
1022 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1023 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1024 
1025 			/* first time through we count the total size */
1026 			if (first)
1027 				bytes_b += end_b - start_b;
1028 
1029 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1030 				found = true;
1031 				if (!first)
1032 					break;
1033 			}
1034 			oplen_tmp -= sizeof(*sack_tmp);
1035 			sack_tmp++;
1036 		}
1037 
1038 		if (!found)
1039 			return -1;
1040 
1041 		oplen_a -= sizeof(*sack_a);
1042 		sack_a++;
1043 		first = false;
1044 	}
1045 
1046 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1047 	 * either the SACKs are equal, or B SACKs more bytes.
1048 	 */
1049 	return bytes_b > bytes_a ? 1 : 0;
1050 }
1051 
1052 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1053 				 u32 *tsval, u32 *tsecr)
1054 {
1055 	const u8 *ptr;
1056 	int opsize;
1057 
1058 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1059 
1060 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1061 		*tsval = get_unaligned_be32(ptr);
1062 		*tsecr = get_unaligned_be32(ptr + 4);
1063 	}
1064 }
1065 
1066 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1067 			       u32 tstamp_new, u32 tsecr_new)
1068 {
1069 	/* inspired by tcp_parse_options in tcp_input.c */
1070 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1071 	const u8 *ptr = (const u8 *)(tcph + 1);
1072 	u32 tstamp, tsecr;
1073 
1074 	/* 3 reserved flags must be unset to avoid future breakage
1075 	 * ACK must be set
1076 	 * ECE/CWR are handled separately
1077 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1078 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1079 	 * 0x00C00000 = CWR/ECE (handled separately)
1080 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1081 	 */
1082 	if (((tcp_flag_word(tcph) &
1083 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1084 		return false;
1085 
1086 	while (length > 0) {
1087 		int opcode = *ptr++;
1088 		int opsize;
1089 
1090 		if (opcode == TCPOPT_EOL)
1091 			break;
1092 		if (opcode == TCPOPT_NOP) {
1093 			length--;
1094 			continue;
1095 		}
1096 		if (length < 2)
1097 			break;
1098 		opsize = *ptr++;
1099 		if (opsize < 2 || opsize > length)
1100 			break;
1101 
1102 		switch (opcode) {
1103 		case TCPOPT_MD5SIG: /* doesn't influence state */
1104 			break;
1105 
1106 		case TCPOPT_SACK: /* stricter checking performed later */
1107 			if (opsize % 8 != 2)
1108 				return false;
1109 			break;
1110 
1111 		case TCPOPT_TIMESTAMP:
1112 			/* only drop timestamps lower than new */
1113 			if (opsize != TCPOLEN_TIMESTAMP)
1114 				return false;
1115 			tstamp = get_unaligned_be32(ptr);
1116 			tsecr = get_unaligned_be32(ptr + 4);
1117 			if (after(tstamp, tstamp_new) ||
1118 			    after(tsecr, tsecr_new))
1119 				return false;
1120 			break;
1121 
1122 		case TCPOPT_MSS:  /* these should only be set on SYN */
1123 		case TCPOPT_WINDOW:
1124 		case TCPOPT_SACK_PERM:
1125 		case TCPOPT_FASTOPEN:
1126 		case TCPOPT_EXP:
1127 		default: /* don't drop if any unknown options are present */
1128 			return false;
1129 		}
1130 
1131 		ptr += opsize - 2;
1132 		length -= opsize;
1133 	}
1134 
1135 	return true;
1136 }
1137 
1138 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1139 				       struct cake_flow *flow)
1140 {
1141 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1142 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1143 	struct sk_buff *skb_check, *skb_prev = NULL;
1144 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1145 	unsigned char _tcph[64], _tcph_check[64];
1146 	const struct tcphdr *tcph, *tcph_check;
1147 	const struct iphdr *iph, *iph_check;
1148 	struct ipv6hdr _iph, _iph_check;
1149 	const struct sk_buff *skb;
1150 	int seglen, num_found = 0;
1151 	u32 tstamp = 0, tsecr = 0;
1152 	__be32 elig_flags = 0;
1153 	int sack_comp;
1154 
1155 	/* no other possible ACKs to filter */
1156 	if (flow->head == flow->tail)
1157 		return NULL;
1158 
1159 	skb = flow->tail;
1160 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1161 	iph = cake_get_iphdr(skb, &_iph);
1162 	if (!tcph)
1163 		return NULL;
1164 
1165 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1166 
1167 	/* the 'triggering' packet need only have the ACK flag set.
1168 	 * also check that SYN is not set, as there won't be any previous ACKs.
1169 	 */
1170 	if ((tcp_flag_word(tcph) &
1171 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1172 		return NULL;
1173 
1174 	/* the 'triggering' ACK is at the tail of the queue, we have already
1175 	 * returned if it is the only packet in the flow. loop through the rest
1176 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1177 	 * triggering one.
1178 	 */
1179 	for (skb_check = flow->head;
1180 	     skb_check && skb_check != skb;
1181 	     skb_prev = skb_check, skb_check = skb_check->next) {
1182 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1183 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1184 					     sizeof(_tcph_check));
1185 
1186 		/* only TCP packets with matching 5-tuple are eligible, and only
1187 		 * drop safe headers
1188 		 */
1189 		if (!tcph_check || iph->version != iph_check->version ||
1190 		    tcph_check->source != tcph->source ||
1191 		    tcph_check->dest != tcph->dest)
1192 			continue;
1193 
1194 		if (iph_check->version == 4) {
1195 			if (iph_check->saddr != iph->saddr ||
1196 			    iph_check->daddr != iph->daddr)
1197 				continue;
1198 
1199 			seglen = iph_totlen(skb, iph_check) -
1200 				       (4 * iph_check->ihl);
1201 		} else if (iph_check->version == 6) {
1202 			ipv6h = (struct ipv6hdr *)iph;
1203 			ipv6h_check = (struct ipv6hdr *)iph_check;
1204 
1205 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1206 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1207 				continue;
1208 
1209 			seglen = ntohs(ipv6h_check->payload_len);
1210 		} else {
1211 			WARN_ON(1);  /* shouldn't happen */
1212 			continue;
1213 		}
1214 
1215 		/* If the ECE/CWR flags changed from the previous eligible
1216 		 * packet in the same flow, we should no longer be dropping that
1217 		 * previous packet as this would lose information.
1218 		 */
1219 		if (elig_ack && (tcp_flag_word(tcph_check) &
1220 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1221 			elig_ack = NULL;
1222 			elig_ack_prev = NULL;
1223 			num_found--;
1224 		}
1225 
1226 		/* Check TCP options and flags, don't drop ACKs with segment
1227 		 * data, and don't drop ACKs with a higher cumulative ACK
1228 		 * counter than the triggering packet. Check ACK seqno here to
1229 		 * avoid parsing SACK options of packets we are going to exclude
1230 		 * anyway.
1231 		 */
1232 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1233 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1234 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1235 			continue;
1236 
1237 		/* Check SACK options. The triggering packet must SACK more data
1238 		 * than the ACK under consideration, or SACK the same range but
1239 		 * have a larger cumulative ACK counter. The latter is a
1240 		 * pathological case, but is contained in the following check
1241 		 * anyway, just to be safe.
1242 		 */
1243 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1244 
1245 		if (sack_comp < 0 ||
1246 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1247 		     sack_comp == 0))
1248 			continue;
1249 
1250 		/* At this point we have found an eligible pure ACK to drop; if
1251 		 * we are in aggressive mode, we are done. Otherwise, keep
1252 		 * searching unless this is the second eligible ACK we
1253 		 * found.
1254 		 *
1255 		 * Since we want to drop ACK closest to the head of the queue,
1256 		 * save the first eligible ACK we find, even if we need to loop
1257 		 * again.
1258 		 */
1259 		if (!elig_ack) {
1260 			elig_ack = skb_check;
1261 			elig_ack_prev = skb_prev;
1262 			elig_flags = (tcp_flag_word(tcph_check)
1263 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1264 		}
1265 
1266 		if (num_found++ > 0)
1267 			goto found;
1268 	}
1269 
1270 	/* We made it through the queue without finding two eligible ACKs . If
1271 	 * we found a single eligible ACK we can drop it in aggressive mode if
1272 	 * we can guarantee that this does not interfere with ECN flag
1273 	 * information. We ensure this by dropping it only if the enqueued
1274 	 * packet is consecutive with the eligible ACK, and their flags match.
1275 	 */
1276 	if (elig_ack && aggressive && elig_ack->next == skb &&
1277 	    (elig_flags == (tcp_flag_word(tcph) &
1278 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1279 		goto found;
1280 
1281 	return NULL;
1282 
1283 found:
1284 	if (elig_ack_prev)
1285 		elig_ack_prev->next = elig_ack->next;
1286 	else
1287 		flow->head = elig_ack->next;
1288 
1289 	skb_mark_not_on_list(elig_ack);
1290 
1291 	return elig_ack;
1292 }
1293 
1294 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1295 {
1296 	avg -= avg >> shift;
1297 	avg += sample >> shift;
1298 	return avg;
1299 }
1300 
1301 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1302 {
1303 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1304 		len -= off;
1305 
1306 	if (q->max_netlen < len)
1307 		q->max_netlen = len;
1308 	if (q->min_netlen > len)
1309 		q->min_netlen = len;
1310 
1311 	len += q->rate_overhead;
1312 
1313 	if (len < q->rate_mpu)
1314 		len = q->rate_mpu;
1315 
1316 	if (q->atm_mode == CAKE_ATM_ATM) {
1317 		len += 47;
1318 		len /= 48;
1319 		len *= 53;
1320 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1321 		/* Add one byte per 64 bytes or part thereof.
1322 		 * This is conservative and easier to calculate than the
1323 		 * precise value.
1324 		 */
1325 		len += (len + 63) / 64;
1326 	}
1327 
1328 	if (q->max_adjlen < len)
1329 		q->max_adjlen = len;
1330 	if (q->min_adjlen > len)
1331 		q->min_adjlen = len;
1332 
1333 	return len;
1334 }
1335 
1336 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1337 {
1338 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1339 	unsigned int hdr_len, last_len = 0;
1340 	u32 off = skb_network_offset(skb);
1341 	u32 len = qdisc_pkt_len(skb);
1342 	u16 segs = 1;
1343 
1344 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1345 
1346 	if (!shinfo->gso_size)
1347 		return cake_calc_overhead(q, len, off);
1348 
1349 	/* borrowed from qdisc_pkt_len_init() */
1350 	hdr_len = skb_transport_offset(skb);
1351 
1352 	/* + transport layer */
1353 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1354 						SKB_GSO_TCPV6))) {
1355 		const struct tcphdr *th;
1356 		struct tcphdr _tcphdr;
1357 
1358 		th = skb_header_pointer(skb, hdr_len,
1359 					sizeof(_tcphdr), &_tcphdr);
1360 		if (likely(th))
1361 			hdr_len += __tcp_hdrlen(th);
1362 	} else {
1363 		struct udphdr _udphdr;
1364 
1365 		if (skb_header_pointer(skb, hdr_len,
1366 				       sizeof(_udphdr), &_udphdr))
1367 			hdr_len += sizeof(struct udphdr);
1368 	}
1369 
1370 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1371 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1372 				    shinfo->gso_size);
1373 	else
1374 		segs = shinfo->gso_segs;
1375 
1376 	len = shinfo->gso_size + hdr_len;
1377 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1378 
1379 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1380 		cake_calc_overhead(q, last_len, off));
1381 }
1382 
1383 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1384 {
1385 	struct cake_heap_entry ii = q->overflow_heap[i];
1386 	struct cake_heap_entry jj = q->overflow_heap[j];
1387 
1388 	q->overflow_heap[i] = jj;
1389 	q->overflow_heap[j] = ii;
1390 
1391 	q->tins[ii.t].overflow_idx[ii.b] = j;
1392 	q->tins[jj.t].overflow_idx[jj.b] = i;
1393 }
1394 
1395 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1396 {
1397 	struct cake_heap_entry ii = q->overflow_heap[i];
1398 
1399 	return q->tins[ii.t].backlogs[ii.b];
1400 }
1401 
1402 static void cake_heapify(struct cake_sched_data *q, u16 i)
1403 {
1404 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1405 	u32 mb = cake_heap_get_backlog(q, i);
1406 	u32 m = i;
1407 
1408 	while (m < a) {
1409 		u32 l = m + m + 1;
1410 		u32 r = l + 1;
1411 
1412 		if (l < a) {
1413 			u32 lb = cake_heap_get_backlog(q, l);
1414 
1415 			if (lb > mb) {
1416 				m  = l;
1417 				mb = lb;
1418 			}
1419 		}
1420 
1421 		if (r < a) {
1422 			u32 rb = cake_heap_get_backlog(q, r);
1423 
1424 			if (rb > mb) {
1425 				m  = r;
1426 				mb = rb;
1427 			}
1428 		}
1429 
1430 		if (m != i) {
1431 			cake_heap_swap(q, i, m);
1432 			i = m;
1433 		} else {
1434 			break;
1435 		}
1436 	}
1437 }
1438 
1439 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1440 {
1441 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1442 		u16 p = (i - 1) >> 1;
1443 		u32 ib = cake_heap_get_backlog(q, i);
1444 		u32 pb = cake_heap_get_backlog(q, p);
1445 
1446 		if (ib > pb) {
1447 			cake_heap_swap(q, i, p);
1448 			i = p;
1449 		} else {
1450 			break;
1451 		}
1452 	}
1453 }
1454 
1455 static int cake_advance_shaper(struct cake_sched_data *q,
1456 			       struct cake_tin_data *b,
1457 			       struct sk_buff *skb,
1458 			       ktime_t now, bool drop)
1459 {
1460 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1461 
1462 	/* charge packet bandwidth to this tin
1463 	 * and to the global shaper.
1464 	 */
1465 	if (q->rate_ns) {
1466 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1467 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1468 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1469 
1470 		if (ktime_before(b->time_next_packet, now))
1471 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1472 							   tin_dur);
1473 
1474 		else if (ktime_before(b->time_next_packet,
1475 				      ktime_add_ns(now, tin_dur)))
1476 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1477 
1478 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1479 						   global_dur);
1480 		if (!drop)
1481 			q->failsafe_next_packet = \
1482 				ktime_add_ns(q->failsafe_next_packet,
1483 					     failsafe_dur);
1484 	}
1485 	return len;
1486 }
1487 
1488 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1489 {
1490 	struct cake_sched_data *q = qdisc_priv(sch);
1491 	ktime_t now = ktime_get();
1492 	u32 idx = 0, tin = 0, len;
1493 	struct cake_heap_entry qq;
1494 	struct cake_tin_data *b;
1495 	struct cake_flow *flow;
1496 	struct sk_buff *skb;
1497 
1498 	if (!q->overflow_timeout) {
1499 		int i;
1500 		/* Build fresh max-heap */
1501 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
1502 			cake_heapify(q, i);
1503 	}
1504 	q->overflow_timeout = 65535;
1505 
1506 	/* select longest queue for pruning */
1507 	qq  = q->overflow_heap[0];
1508 	tin = qq.t;
1509 	idx = qq.b;
1510 
1511 	b = &q->tins[tin];
1512 	flow = &b->flows[idx];
1513 	skb = dequeue_head(flow);
1514 	if (unlikely(!skb)) {
1515 		/* heap has gone wrong, rebuild it next time */
1516 		q->overflow_timeout = 0;
1517 		return idx + (tin << 16);
1518 	}
1519 
1520 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1521 		b->unresponsive_flow_count++;
1522 
1523 	len = qdisc_pkt_len(skb);
1524 	q->buffer_used      -= skb->truesize;
1525 	b->backlogs[idx]    -= len;
1526 	b->tin_backlog      -= len;
1527 	sch->qstats.backlog -= len;
1528 
1529 	flow->dropped++;
1530 	b->tin_dropped++;
1531 	sch->qstats.drops++;
1532 
1533 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1534 		cake_advance_shaper(q, b, skb, now, true);
1535 
1536 	__qdisc_drop(skb, to_free);
1537 	sch->q.qlen--;
1538 	qdisc_tree_reduce_backlog(sch, 1, len);
1539 
1540 	cake_heapify(q, 0);
1541 
1542 	return idx + (tin << 16);
1543 }
1544 
1545 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1546 {
1547 	const int offset = skb_network_offset(skb);
1548 	u16 *buf, buf_;
1549 	u8 dscp;
1550 
1551 	switch (skb_protocol(skb, true)) {
1552 	case htons(ETH_P_IP):
1553 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1554 		if (unlikely(!buf))
1555 			return 0;
1556 
1557 		/* ToS is in the second byte of iphdr */
1558 		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1559 
1560 		if (wash && dscp) {
1561 			const int wlen = offset + sizeof(struct iphdr);
1562 
1563 			if (!pskb_may_pull(skb, wlen) ||
1564 			    skb_try_make_writable(skb, wlen))
1565 				return 0;
1566 
1567 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1568 		}
1569 
1570 		return dscp;
1571 
1572 	case htons(ETH_P_IPV6):
1573 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1574 		if (unlikely(!buf))
1575 			return 0;
1576 
1577 		/* Traffic class is in the first and second bytes of ipv6hdr */
1578 		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1579 
1580 		if (wash && dscp) {
1581 			const int wlen = offset + sizeof(struct ipv6hdr);
1582 
1583 			if (!pskb_may_pull(skb, wlen) ||
1584 			    skb_try_make_writable(skb, wlen))
1585 				return 0;
1586 
1587 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1588 		}
1589 
1590 		return dscp;
1591 
1592 	case htons(ETH_P_ARP):
1593 		return 0x38;  /* CS7 - Net Control */
1594 
1595 	default:
1596 		/* If there is no Diffserv field, treat as best-effort */
1597 		return 0;
1598 	}
1599 }
1600 
1601 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1602 					     struct sk_buff *skb)
1603 {
1604 	struct cake_sched_data *q = qdisc_priv(sch);
1605 	u32 tin, mark;
1606 	bool wash;
1607 	u8 dscp;
1608 
1609 	/* Tin selection: Default to diffserv-based selection, allow overriding
1610 	 * using firewall marks or skb->priority. Call DSCP parsing early if
1611 	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1612 	 */
1613 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1614 	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1615 	if (wash)
1616 		dscp = cake_handle_diffserv(skb, wash);
1617 
1618 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1619 		tin = 0;
1620 
1621 	else if (mark && mark <= q->tin_cnt)
1622 		tin = q->tin_order[mark - 1];
1623 
1624 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1625 		 TC_H_MIN(skb->priority) > 0 &&
1626 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1627 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1628 
1629 	else {
1630 		if (!wash)
1631 			dscp = cake_handle_diffserv(skb, wash);
1632 		tin = q->tin_index[dscp];
1633 
1634 		if (unlikely(tin >= q->tin_cnt))
1635 			tin = 0;
1636 	}
1637 
1638 	return &q->tins[tin];
1639 }
1640 
1641 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1642 			 struct sk_buff *skb, int flow_mode, int *qerr)
1643 {
1644 	struct cake_sched_data *q = qdisc_priv(sch);
1645 	struct tcf_proto *filter;
1646 	struct tcf_result res;
1647 	u16 flow = 0, host = 0;
1648 	int result;
1649 
1650 	filter = rcu_dereference_bh(q->filter_list);
1651 	if (!filter)
1652 		goto hash;
1653 
1654 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1655 	result = tcf_classify(skb, NULL, filter, &res, false);
1656 
1657 	if (result >= 0) {
1658 #ifdef CONFIG_NET_CLS_ACT
1659 		switch (result) {
1660 		case TC_ACT_STOLEN:
1661 		case TC_ACT_QUEUED:
1662 		case TC_ACT_TRAP:
1663 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1664 			fallthrough;
1665 		case TC_ACT_SHOT:
1666 			return 0;
1667 		}
1668 #endif
1669 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1670 			flow = TC_H_MIN(res.classid);
1671 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1672 			host = TC_H_MAJ(res.classid) >> 16;
1673 	}
1674 hash:
1675 	*t = cake_select_tin(sch, skb);
1676 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1677 }
1678 
1679 static void cake_reconfigure(struct Qdisc *sch);
1680 
1681 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1682 			struct sk_buff **to_free)
1683 {
1684 	struct cake_sched_data *q = qdisc_priv(sch);
1685 	int len = qdisc_pkt_len(skb);
1686 	int ret;
1687 	struct sk_buff *ack = NULL;
1688 	ktime_t now = ktime_get();
1689 	struct cake_tin_data *b;
1690 	struct cake_flow *flow;
1691 	u32 idx;
1692 
1693 	/* choose flow to insert into */
1694 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1695 	if (idx == 0) {
1696 		if (ret & __NET_XMIT_BYPASS)
1697 			qdisc_qstats_drop(sch);
1698 		__qdisc_drop(skb, to_free);
1699 		return ret;
1700 	}
1701 	idx--;
1702 	flow = &b->flows[idx];
1703 
1704 	/* ensure shaper state isn't stale */
1705 	if (!b->tin_backlog) {
1706 		if (ktime_before(b->time_next_packet, now))
1707 			b->time_next_packet = now;
1708 
1709 		if (!sch->q.qlen) {
1710 			if (ktime_before(q->time_next_packet, now)) {
1711 				q->failsafe_next_packet = now;
1712 				q->time_next_packet = now;
1713 			} else if (ktime_after(q->time_next_packet, now) &&
1714 				   ktime_after(q->failsafe_next_packet, now)) {
1715 				u64 next = \
1716 					min(ktime_to_ns(q->time_next_packet),
1717 					    ktime_to_ns(
1718 						   q->failsafe_next_packet));
1719 				sch->qstats.overlimits++;
1720 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1721 			}
1722 		}
1723 	}
1724 
1725 	if (unlikely(len > b->max_skblen))
1726 		b->max_skblen = len;
1727 
1728 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1729 		struct sk_buff *segs, *nskb;
1730 		netdev_features_t features = netif_skb_features(skb);
1731 		unsigned int slen = 0, numsegs = 0;
1732 
1733 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1734 		if (IS_ERR_OR_NULL(segs))
1735 			return qdisc_drop(skb, sch, to_free);
1736 
1737 		skb_list_walk_safe(segs, segs, nskb) {
1738 			skb_mark_not_on_list(segs);
1739 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1740 			cobalt_set_enqueue_time(segs, now);
1741 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1742 									  segs);
1743 			flow_queue_add(flow, segs);
1744 
1745 			sch->q.qlen++;
1746 			numsegs++;
1747 			slen += segs->len;
1748 			q->buffer_used += segs->truesize;
1749 			b->packets++;
1750 		}
1751 
1752 		/* stats */
1753 		b->bytes	    += slen;
1754 		b->backlogs[idx]    += slen;
1755 		b->tin_backlog      += slen;
1756 		sch->qstats.backlog += slen;
1757 		q->avg_window_bytes += slen;
1758 
1759 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1760 		consume_skb(skb);
1761 	} else {
1762 		/* not splitting */
1763 		cobalt_set_enqueue_time(skb, now);
1764 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1765 		flow_queue_add(flow, skb);
1766 
1767 		if (q->ack_filter)
1768 			ack = cake_ack_filter(q, flow);
1769 
1770 		if (ack) {
1771 			b->ack_drops++;
1772 			sch->qstats.drops++;
1773 			b->bytes += qdisc_pkt_len(ack);
1774 			len -= qdisc_pkt_len(ack);
1775 			q->buffer_used += skb->truesize - ack->truesize;
1776 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1777 				cake_advance_shaper(q, b, ack, now, true);
1778 
1779 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1780 			consume_skb(ack);
1781 		} else {
1782 			sch->q.qlen++;
1783 			q->buffer_used      += skb->truesize;
1784 		}
1785 
1786 		/* stats */
1787 		b->packets++;
1788 		b->bytes	    += len;
1789 		b->backlogs[idx]    += len;
1790 		b->tin_backlog      += len;
1791 		sch->qstats.backlog += len;
1792 		q->avg_window_bytes += len;
1793 	}
1794 
1795 	if (q->overflow_timeout)
1796 		cake_heapify_up(q, b->overflow_idx[idx]);
1797 
1798 	/* incoming bandwidth capacity estimate */
1799 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1800 		u64 packet_interval = \
1801 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1802 
1803 		if (packet_interval > NSEC_PER_SEC)
1804 			packet_interval = NSEC_PER_SEC;
1805 
1806 		/* filter out short-term bursts, eg. wifi aggregation */
1807 		q->avg_packet_interval = \
1808 			cake_ewma(q->avg_packet_interval,
1809 				  packet_interval,
1810 				  (packet_interval > q->avg_packet_interval ?
1811 					  2 : 8));
1812 
1813 		q->last_packet_time = now;
1814 
1815 		if (packet_interval > q->avg_packet_interval) {
1816 			u64 window_interval = \
1817 				ktime_to_ns(ktime_sub(now,
1818 						      q->avg_window_begin));
1819 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1820 
1821 			b = div64_u64(b, window_interval);
1822 			q->avg_peak_bandwidth =
1823 				cake_ewma(q->avg_peak_bandwidth, b,
1824 					  b > q->avg_peak_bandwidth ? 2 : 8);
1825 			q->avg_window_bytes = 0;
1826 			q->avg_window_begin = now;
1827 
1828 			if (ktime_after(now,
1829 					ktime_add_ms(q->last_reconfig_time,
1830 						     250))) {
1831 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1832 				cake_reconfigure(sch);
1833 			}
1834 		}
1835 	} else {
1836 		q->avg_window_bytes = 0;
1837 		q->last_packet_time = now;
1838 	}
1839 
1840 	/* flowchain */
1841 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1842 		struct cake_host *srchost = &b->hosts[flow->srchost];
1843 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1844 		u16 host_load = 1;
1845 
1846 		if (!flow->set) {
1847 			list_add_tail(&flow->flowchain, &b->new_flows);
1848 		} else {
1849 			b->decaying_flow_count--;
1850 			list_move_tail(&flow->flowchain, &b->new_flows);
1851 		}
1852 		flow->set = CAKE_SET_SPARSE;
1853 		b->sparse_flow_count++;
1854 
1855 		if (cake_dsrc(q->flow_mode))
1856 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
1857 
1858 		if (cake_ddst(q->flow_mode))
1859 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1860 
1861 		flow->deficit = (b->flow_quantum *
1862 				 quantum_div[host_load]) >> 16;
1863 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1864 		struct cake_host *srchost = &b->hosts[flow->srchost];
1865 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1866 
1867 		/* this flow was empty, accounted as a sparse flow, but actually
1868 		 * in the bulk rotation.
1869 		 */
1870 		flow->set = CAKE_SET_BULK;
1871 		b->sparse_flow_count--;
1872 		b->bulk_flow_count++;
1873 
1874 		if (cake_dsrc(q->flow_mode))
1875 			srchost->srchost_bulk_flow_count++;
1876 
1877 		if (cake_ddst(q->flow_mode))
1878 			dsthost->dsthost_bulk_flow_count++;
1879 
1880 	}
1881 
1882 	if (q->buffer_used > q->buffer_max_used)
1883 		q->buffer_max_used = q->buffer_used;
1884 
1885 	if (q->buffer_used > q->buffer_limit) {
1886 		u32 dropped = 0;
1887 
1888 		while (q->buffer_used > q->buffer_limit) {
1889 			dropped++;
1890 			cake_drop(sch, to_free);
1891 		}
1892 		b->drop_overlimit += dropped;
1893 	}
1894 	return NET_XMIT_SUCCESS;
1895 }
1896 
1897 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1898 {
1899 	struct cake_sched_data *q = qdisc_priv(sch);
1900 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1901 	struct cake_flow *flow = &b->flows[q->cur_flow];
1902 	struct sk_buff *skb = NULL;
1903 	u32 len;
1904 
1905 	if (flow->head) {
1906 		skb = dequeue_head(flow);
1907 		len = qdisc_pkt_len(skb);
1908 		b->backlogs[q->cur_flow] -= len;
1909 		b->tin_backlog		 -= len;
1910 		sch->qstats.backlog      -= len;
1911 		q->buffer_used		 -= skb->truesize;
1912 		sch->q.qlen--;
1913 
1914 		if (q->overflow_timeout)
1915 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1916 	}
1917 	return skb;
1918 }
1919 
1920 /* Discard leftover packets from a tin no longer in use. */
1921 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1922 {
1923 	struct cake_sched_data *q = qdisc_priv(sch);
1924 	struct sk_buff *skb;
1925 
1926 	q->cur_tin = tin;
1927 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1928 		while (!!(skb = cake_dequeue_one(sch)))
1929 			kfree_skb(skb);
1930 }
1931 
1932 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1933 {
1934 	struct cake_sched_data *q = qdisc_priv(sch);
1935 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1936 	struct cake_host *srchost, *dsthost;
1937 	ktime_t now = ktime_get();
1938 	struct cake_flow *flow;
1939 	struct list_head *head;
1940 	bool first_flow = true;
1941 	struct sk_buff *skb;
1942 	u16 host_load;
1943 	u64 delay;
1944 	u32 len;
1945 
1946 begin:
1947 	if (!sch->q.qlen)
1948 		return NULL;
1949 
1950 	/* global hard shaper */
1951 	if (ktime_after(q->time_next_packet, now) &&
1952 	    ktime_after(q->failsafe_next_packet, now)) {
1953 		u64 next = min(ktime_to_ns(q->time_next_packet),
1954 			       ktime_to_ns(q->failsafe_next_packet));
1955 
1956 		sch->qstats.overlimits++;
1957 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1958 		return NULL;
1959 	}
1960 
1961 	/* Choose a class to work on. */
1962 	if (!q->rate_ns) {
1963 		/* In unlimited mode, can't rely on shaper timings, just balance
1964 		 * with DRR
1965 		 */
1966 		bool wrapped = false, empty = true;
1967 
1968 		while (b->tin_deficit < 0 ||
1969 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1970 			if (b->tin_deficit <= 0)
1971 				b->tin_deficit += b->tin_quantum;
1972 			if (b->sparse_flow_count + b->bulk_flow_count)
1973 				empty = false;
1974 
1975 			q->cur_tin++;
1976 			b++;
1977 			if (q->cur_tin >= q->tin_cnt) {
1978 				q->cur_tin = 0;
1979 				b = q->tins;
1980 
1981 				if (wrapped) {
1982 					/* It's possible for q->qlen to be
1983 					 * nonzero when we actually have no
1984 					 * packets anywhere.
1985 					 */
1986 					if (empty)
1987 						return NULL;
1988 				} else {
1989 					wrapped = true;
1990 				}
1991 			}
1992 		}
1993 	} else {
1994 		/* In shaped mode, choose:
1995 		 * - Highest-priority tin with queue and meeting schedule, or
1996 		 * - The earliest-scheduled tin with queue.
1997 		 */
1998 		ktime_t best_time = KTIME_MAX;
1999 		int tin, best_tin = 0;
2000 
2001 		for (tin = 0; tin < q->tin_cnt; tin++) {
2002 			b = q->tins + tin;
2003 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2004 				ktime_t time_to_pkt = \
2005 					ktime_sub(b->time_next_packet, now);
2006 
2007 				if (ktime_to_ns(time_to_pkt) <= 0 ||
2008 				    ktime_compare(time_to_pkt,
2009 						  best_time) <= 0) {
2010 					best_time = time_to_pkt;
2011 					best_tin = tin;
2012 				}
2013 			}
2014 		}
2015 
2016 		q->cur_tin = best_tin;
2017 		b = q->tins + best_tin;
2018 
2019 		/* No point in going further if no packets to deliver. */
2020 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2021 			return NULL;
2022 	}
2023 
2024 retry:
2025 	/* service this class */
2026 	head = &b->decaying_flows;
2027 	if (!first_flow || list_empty(head)) {
2028 		head = &b->new_flows;
2029 		if (list_empty(head)) {
2030 			head = &b->old_flows;
2031 			if (unlikely(list_empty(head))) {
2032 				head = &b->decaying_flows;
2033 				if (unlikely(list_empty(head)))
2034 					goto begin;
2035 			}
2036 		}
2037 	}
2038 	flow = list_first_entry(head, struct cake_flow, flowchain);
2039 	q->cur_flow = flow - b->flows;
2040 	first_flow = false;
2041 
2042 	/* triple isolation (modified DRR++) */
2043 	srchost = &b->hosts[flow->srchost];
2044 	dsthost = &b->hosts[flow->dsthost];
2045 	host_load = 1;
2046 
2047 	/* flow isolation (DRR++) */
2048 	if (flow->deficit <= 0) {
2049 		/* Keep all flows with deficits out of the sparse and decaying
2050 		 * rotations.  No non-empty flow can go into the decaying
2051 		 * rotation, so they can't get deficits
2052 		 */
2053 		if (flow->set == CAKE_SET_SPARSE) {
2054 			if (flow->head) {
2055 				b->sparse_flow_count--;
2056 				b->bulk_flow_count++;
2057 
2058 				if (cake_dsrc(q->flow_mode))
2059 					srchost->srchost_bulk_flow_count++;
2060 
2061 				if (cake_ddst(q->flow_mode))
2062 					dsthost->dsthost_bulk_flow_count++;
2063 
2064 				flow->set = CAKE_SET_BULK;
2065 			} else {
2066 				/* we've moved it to the bulk rotation for
2067 				 * correct deficit accounting but we still want
2068 				 * to count it as a sparse flow, not a bulk one.
2069 				 */
2070 				flow->set = CAKE_SET_SPARSE_WAIT;
2071 			}
2072 		}
2073 
2074 		if (cake_dsrc(q->flow_mode))
2075 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
2076 
2077 		if (cake_ddst(q->flow_mode))
2078 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2079 
2080 		WARN_ON(host_load > CAKE_QUEUES);
2081 
2082 		/* The get_random_u16() is a way to apply dithering to avoid
2083 		 * accumulating roundoff errors
2084 		 */
2085 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2086 				  get_random_u16()) >> 16;
2087 		list_move_tail(&flow->flowchain, &b->old_flows);
2088 
2089 		goto retry;
2090 	}
2091 
2092 	/* Retrieve a packet via the AQM */
2093 	while (1) {
2094 		skb = cake_dequeue_one(sch);
2095 		if (!skb) {
2096 			/* this queue was actually empty */
2097 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2098 				b->unresponsive_flow_count--;
2099 
2100 			if (flow->cvars.p_drop || flow->cvars.count ||
2101 			    ktime_before(now, flow->cvars.drop_next)) {
2102 				/* keep in the flowchain until the state has
2103 				 * decayed to rest
2104 				 */
2105 				list_move_tail(&flow->flowchain,
2106 					       &b->decaying_flows);
2107 				if (flow->set == CAKE_SET_BULK) {
2108 					b->bulk_flow_count--;
2109 
2110 					if (cake_dsrc(q->flow_mode))
2111 						srchost->srchost_bulk_flow_count--;
2112 
2113 					if (cake_ddst(q->flow_mode))
2114 						dsthost->dsthost_bulk_flow_count--;
2115 
2116 					b->decaying_flow_count++;
2117 				} else if (flow->set == CAKE_SET_SPARSE ||
2118 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2119 					b->sparse_flow_count--;
2120 					b->decaying_flow_count++;
2121 				}
2122 				flow->set = CAKE_SET_DECAYING;
2123 			} else {
2124 				/* remove empty queue from the flowchain */
2125 				list_del_init(&flow->flowchain);
2126 				if (flow->set == CAKE_SET_SPARSE ||
2127 				    flow->set == CAKE_SET_SPARSE_WAIT)
2128 					b->sparse_flow_count--;
2129 				else if (flow->set == CAKE_SET_BULK) {
2130 					b->bulk_flow_count--;
2131 
2132 					if (cake_dsrc(q->flow_mode))
2133 						srchost->srchost_bulk_flow_count--;
2134 
2135 					if (cake_ddst(q->flow_mode))
2136 						dsthost->dsthost_bulk_flow_count--;
2137 
2138 				} else
2139 					b->decaying_flow_count--;
2140 
2141 				flow->set = CAKE_SET_NONE;
2142 			}
2143 			goto begin;
2144 		}
2145 
2146 		/* Last packet in queue may be marked, shouldn't be dropped */
2147 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2148 					(b->bulk_flow_count *
2149 					 !!(q->rate_flags &
2150 					    CAKE_FLAG_INGRESS))) ||
2151 		    !flow->head)
2152 			break;
2153 
2154 		/* drop this packet, get another one */
2155 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2156 			len = cake_advance_shaper(q, b, skb,
2157 						  now, true);
2158 			flow->deficit -= len;
2159 			b->tin_deficit -= len;
2160 		}
2161 		flow->dropped++;
2162 		b->tin_dropped++;
2163 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2164 		qdisc_qstats_drop(sch);
2165 		kfree_skb(skb);
2166 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2167 			goto retry;
2168 	}
2169 
2170 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2171 	qdisc_bstats_update(sch, skb);
2172 
2173 	/* collect delay stats */
2174 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2175 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2176 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2177 				  delay > b->peak_delay ? 2 : 8);
2178 	b->base_delay = cake_ewma(b->base_delay, delay,
2179 				  delay < b->base_delay ? 2 : 8);
2180 
2181 	len = cake_advance_shaper(q, b, skb, now, false);
2182 	flow->deficit -= len;
2183 	b->tin_deficit -= len;
2184 
2185 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2186 		u64 next = min(ktime_to_ns(q->time_next_packet),
2187 			       ktime_to_ns(q->failsafe_next_packet));
2188 
2189 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2190 	} else if (!sch->q.qlen) {
2191 		int i;
2192 
2193 		for (i = 0; i < q->tin_cnt; i++) {
2194 			if (q->tins[i].decaying_flow_count) {
2195 				ktime_t next = \
2196 					ktime_add_ns(now,
2197 						     q->tins[i].cparams.target);
2198 
2199 				qdisc_watchdog_schedule_ns(&q->watchdog,
2200 							   ktime_to_ns(next));
2201 				break;
2202 			}
2203 		}
2204 	}
2205 
2206 	if (q->overflow_timeout)
2207 		q->overflow_timeout--;
2208 
2209 	return skb;
2210 }
2211 
2212 static void cake_reset(struct Qdisc *sch)
2213 {
2214 	struct cake_sched_data *q = qdisc_priv(sch);
2215 	u32 c;
2216 
2217 	if (!q->tins)
2218 		return;
2219 
2220 	for (c = 0; c < CAKE_MAX_TINS; c++)
2221 		cake_clear_tin(sch, c);
2222 }
2223 
2224 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2225 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2226 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2227 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2228 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2229 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2230 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2231 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2232 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2233 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2234 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2235 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2236 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2237 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2238 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2239 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2240 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2241 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2242 };
2243 
2244 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2245 			  u64 target_ns, u64 rtt_est_ns)
2246 {
2247 	/* convert byte-rate into time-per-byte
2248 	 * so it will always unwedge in reasonable time.
2249 	 */
2250 	static const u64 MIN_RATE = 64;
2251 	u32 byte_target = mtu;
2252 	u64 byte_target_ns;
2253 	u8  rate_shft = 0;
2254 	u64 rate_ns = 0;
2255 
2256 	b->flow_quantum = 1514;
2257 	if (rate) {
2258 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2259 		rate_shft = 34;
2260 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2261 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2262 		while (!!(rate_ns >> 34)) {
2263 			rate_ns >>= 1;
2264 			rate_shft--;
2265 		}
2266 	} /* else unlimited, ie. zero delay */
2267 
2268 	b->tin_rate_bps  = rate;
2269 	b->tin_rate_ns   = rate_ns;
2270 	b->tin_rate_shft = rate_shft;
2271 
2272 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2273 
2274 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2275 	b->cparams.interval = max(rtt_est_ns +
2276 				     b->cparams.target - target_ns,
2277 				     b->cparams.target * 2);
2278 	b->cparams.mtu_time = byte_target_ns;
2279 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2280 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2281 }
2282 
2283 static int cake_config_besteffort(struct Qdisc *sch)
2284 {
2285 	struct cake_sched_data *q = qdisc_priv(sch);
2286 	struct cake_tin_data *b = &q->tins[0];
2287 	u32 mtu = psched_mtu(qdisc_dev(sch));
2288 	u64 rate = q->rate_bps;
2289 
2290 	q->tin_cnt = 1;
2291 
2292 	q->tin_index = besteffort;
2293 	q->tin_order = normal_order;
2294 
2295 	cake_set_rate(b, rate, mtu,
2296 		      us_to_ns(q->target), us_to_ns(q->interval));
2297 	b->tin_quantum = 65535;
2298 
2299 	return 0;
2300 }
2301 
2302 static int cake_config_precedence(struct Qdisc *sch)
2303 {
2304 	/* convert high-level (user visible) parameters into internal format */
2305 	struct cake_sched_data *q = qdisc_priv(sch);
2306 	u32 mtu = psched_mtu(qdisc_dev(sch));
2307 	u64 rate = q->rate_bps;
2308 	u32 quantum = 256;
2309 	u32 i;
2310 
2311 	q->tin_cnt = 8;
2312 	q->tin_index = precedence;
2313 	q->tin_order = normal_order;
2314 
2315 	for (i = 0; i < q->tin_cnt; i++) {
2316 		struct cake_tin_data *b = &q->tins[i];
2317 
2318 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2319 			      us_to_ns(q->interval));
2320 
2321 		b->tin_quantum = max_t(u16, 1U, quantum);
2322 
2323 		/* calculate next class's parameters */
2324 		rate  *= 7;
2325 		rate >>= 3;
2326 
2327 		quantum  *= 7;
2328 		quantum >>= 3;
2329 	}
2330 
2331 	return 0;
2332 }
2333 
2334 /*	List of known Diffserv codepoints:
2335  *
2336  *	Default Forwarding (DF/CS0) - Best Effort
2337  *	Max Throughput (TOS2)
2338  *	Min Delay (TOS4)
2339  *	LLT "La" (TOS5)
2340  *	Assured Forwarding 1 (AF1x) - x3
2341  *	Assured Forwarding 2 (AF2x) - x3
2342  *	Assured Forwarding 3 (AF3x) - x3
2343  *	Assured Forwarding 4 (AF4x) - x3
2344  *	Precedence Class 1 (CS1)
2345  *	Precedence Class 2 (CS2)
2346  *	Precedence Class 3 (CS3)
2347  *	Precedence Class 4 (CS4)
2348  *	Precedence Class 5 (CS5)
2349  *	Precedence Class 6 (CS6)
2350  *	Precedence Class 7 (CS7)
2351  *	Voice Admit (VA)
2352  *	Expedited Forwarding (EF)
2353  *	Lower Effort (LE)
2354  *
2355  *	Total 26 codepoints.
2356  */
2357 
2358 /*	List of traffic classes in RFC 4594, updated by RFC 8622:
2359  *		(roughly descending order of contended priority)
2360  *		(roughly ascending order of uncontended throughput)
2361  *
2362  *	Network Control (CS6,CS7)      - routing traffic
2363  *	Telephony (EF,VA)         - aka. VoIP streams
2364  *	Signalling (CS5)               - VoIP setup
2365  *	Multimedia Conferencing (AF4x) - aka. video calls
2366  *	Realtime Interactive (CS4)     - eg. games
2367  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2368  *	Broadcast Video (CS3)
2369  *	Low-Latency Data (AF2x,TOS4)      - eg. database
2370  *	Ops, Admin, Management (CS2)      - eg. ssh
2371  *	Standard Service (DF & unrecognised codepoints)
2372  *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2373  *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2374  *
2375  *	Total 12 traffic classes.
2376  */
2377 
2378 static int cake_config_diffserv8(struct Qdisc *sch)
2379 {
2380 /*	Pruned list of traffic classes for typical applications:
2381  *
2382  *		Network Control          (CS6, CS7)
2383  *		Minimum Latency          (EF, VA, CS5, CS4)
2384  *		Interactive Shell        (CS2)
2385  *		Low Latency Transactions (AF2x, TOS4)
2386  *		Video Streaming          (AF4x, AF3x, CS3)
2387  *		Bog Standard             (DF etc.)
2388  *		High Throughput          (AF1x, TOS2, CS1)
2389  *		Background Traffic       (LE)
2390  *
2391  *		Total 8 traffic classes.
2392  */
2393 
2394 	struct cake_sched_data *q = qdisc_priv(sch);
2395 	u32 mtu = psched_mtu(qdisc_dev(sch));
2396 	u64 rate = q->rate_bps;
2397 	u32 quantum = 256;
2398 	u32 i;
2399 
2400 	q->tin_cnt = 8;
2401 
2402 	/* codepoint to class mapping */
2403 	q->tin_index = diffserv8;
2404 	q->tin_order = normal_order;
2405 
2406 	/* class characteristics */
2407 	for (i = 0; i < q->tin_cnt; i++) {
2408 		struct cake_tin_data *b = &q->tins[i];
2409 
2410 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2411 			      us_to_ns(q->interval));
2412 
2413 		b->tin_quantum = max_t(u16, 1U, quantum);
2414 
2415 		/* calculate next class's parameters */
2416 		rate  *= 7;
2417 		rate >>= 3;
2418 
2419 		quantum  *= 7;
2420 		quantum >>= 3;
2421 	}
2422 
2423 	return 0;
2424 }
2425 
2426 static int cake_config_diffserv4(struct Qdisc *sch)
2427 {
2428 /*  Further pruned list of traffic classes for four-class system:
2429  *
2430  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2431  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2432  *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2433  *	    Background Traffic (LE, CS1)
2434  *
2435  *		Total 4 traffic classes.
2436  */
2437 
2438 	struct cake_sched_data *q = qdisc_priv(sch);
2439 	u32 mtu = psched_mtu(qdisc_dev(sch));
2440 	u64 rate = q->rate_bps;
2441 	u32 quantum = 1024;
2442 
2443 	q->tin_cnt = 4;
2444 
2445 	/* codepoint to class mapping */
2446 	q->tin_index = diffserv4;
2447 	q->tin_order = bulk_order;
2448 
2449 	/* class characteristics */
2450 	cake_set_rate(&q->tins[0], rate, mtu,
2451 		      us_to_ns(q->target), us_to_ns(q->interval));
2452 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2453 		      us_to_ns(q->target), us_to_ns(q->interval));
2454 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2455 		      us_to_ns(q->target), us_to_ns(q->interval));
2456 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2457 		      us_to_ns(q->target), us_to_ns(q->interval));
2458 
2459 	/* bandwidth-sharing weights */
2460 	q->tins[0].tin_quantum = quantum;
2461 	q->tins[1].tin_quantum = quantum >> 4;
2462 	q->tins[2].tin_quantum = quantum >> 1;
2463 	q->tins[3].tin_quantum = quantum >> 2;
2464 
2465 	return 0;
2466 }
2467 
2468 static int cake_config_diffserv3(struct Qdisc *sch)
2469 {
2470 /*  Simplified Diffserv structure with 3 tins.
2471  *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2472  *		Best Effort
2473  *		Low Priority		(LE, CS1)
2474  */
2475 	struct cake_sched_data *q = qdisc_priv(sch);
2476 	u32 mtu = psched_mtu(qdisc_dev(sch));
2477 	u64 rate = q->rate_bps;
2478 	u32 quantum = 1024;
2479 
2480 	q->tin_cnt = 3;
2481 
2482 	/* codepoint to class mapping */
2483 	q->tin_index = diffserv3;
2484 	q->tin_order = bulk_order;
2485 
2486 	/* class characteristics */
2487 	cake_set_rate(&q->tins[0], rate, mtu,
2488 		      us_to_ns(q->target), us_to_ns(q->interval));
2489 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2490 		      us_to_ns(q->target), us_to_ns(q->interval));
2491 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2492 		      us_to_ns(q->target), us_to_ns(q->interval));
2493 
2494 	/* bandwidth-sharing weights */
2495 	q->tins[0].tin_quantum = quantum;
2496 	q->tins[1].tin_quantum = quantum >> 4;
2497 	q->tins[2].tin_quantum = quantum >> 2;
2498 
2499 	return 0;
2500 }
2501 
2502 static void cake_reconfigure(struct Qdisc *sch)
2503 {
2504 	struct cake_sched_data *q = qdisc_priv(sch);
2505 	int c, ft;
2506 
2507 	switch (q->tin_mode) {
2508 	case CAKE_DIFFSERV_BESTEFFORT:
2509 		ft = cake_config_besteffort(sch);
2510 		break;
2511 
2512 	case CAKE_DIFFSERV_PRECEDENCE:
2513 		ft = cake_config_precedence(sch);
2514 		break;
2515 
2516 	case CAKE_DIFFSERV_DIFFSERV8:
2517 		ft = cake_config_diffserv8(sch);
2518 		break;
2519 
2520 	case CAKE_DIFFSERV_DIFFSERV4:
2521 		ft = cake_config_diffserv4(sch);
2522 		break;
2523 
2524 	case CAKE_DIFFSERV_DIFFSERV3:
2525 	default:
2526 		ft = cake_config_diffserv3(sch);
2527 		break;
2528 	}
2529 
2530 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2531 		cake_clear_tin(sch, c);
2532 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2533 	}
2534 
2535 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2536 	q->rate_shft = q->tins[ft].tin_rate_shft;
2537 
2538 	if (q->buffer_config_limit) {
2539 		q->buffer_limit = q->buffer_config_limit;
2540 	} else if (q->rate_bps) {
2541 		u64 t = q->rate_bps * q->interval;
2542 
2543 		do_div(t, USEC_PER_SEC / 4);
2544 		q->buffer_limit = max_t(u32, t, 4U << 20);
2545 	} else {
2546 		q->buffer_limit = ~0;
2547 	}
2548 
2549 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2550 
2551 	q->buffer_limit = min(q->buffer_limit,
2552 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2553 				  q->buffer_config_limit));
2554 }
2555 
2556 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2557 		       struct netlink_ext_ack *extack)
2558 {
2559 	struct cake_sched_data *q = qdisc_priv(sch);
2560 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2561 	u16 rate_flags;
2562 	u8 flow_mode;
2563 	int err;
2564 
2565 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2566 					  extack);
2567 	if (err < 0)
2568 		return err;
2569 
2570 	flow_mode = q->flow_mode;
2571 	if (tb[TCA_CAKE_NAT]) {
2572 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2573 		flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2574 		flow_mode |= CAKE_FLOW_NAT_FLAG *
2575 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2576 #else
2577 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2578 				    "No conntrack support in kernel");
2579 		return -EOPNOTSUPP;
2580 #endif
2581 	}
2582 
2583 	if (tb[TCA_CAKE_BASE_RATE64])
2584 		WRITE_ONCE(q->rate_bps,
2585 			   nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));
2586 
2587 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2588 		WRITE_ONCE(q->tin_mode,
2589 			   nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));
2590 
2591 	rate_flags = q->rate_flags;
2592 	if (tb[TCA_CAKE_WASH]) {
2593 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2594 			rate_flags |= CAKE_FLAG_WASH;
2595 		else
2596 			rate_flags &= ~CAKE_FLAG_WASH;
2597 	}
2598 
2599 	if (tb[TCA_CAKE_FLOW_MODE])
2600 		flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) |
2601 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2602 					CAKE_FLOW_MASK));
2603 
2604 	if (tb[TCA_CAKE_ATM])
2605 		WRITE_ONCE(q->atm_mode,
2606 			   nla_get_u32(tb[TCA_CAKE_ATM]));
2607 
2608 	if (tb[TCA_CAKE_OVERHEAD]) {
2609 		WRITE_ONCE(q->rate_overhead,
2610 			   nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
2611 		rate_flags |= CAKE_FLAG_OVERHEAD;
2612 
2613 		q->max_netlen = 0;
2614 		q->max_adjlen = 0;
2615 		q->min_netlen = ~0;
2616 		q->min_adjlen = ~0;
2617 	}
2618 
2619 	if (tb[TCA_CAKE_RAW]) {
2620 		rate_flags &= ~CAKE_FLAG_OVERHEAD;
2621 
2622 		q->max_netlen = 0;
2623 		q->max_adjlen = 0;
2624 		q->min_netlen = ~0;
2625 		q->min_adjlen = ~0;
2626 	}
2627 
2628 	if (tb[TCA_CAKE_MPU])
2629 		WRITE_ONCE(q->rate_mpu,
2630 			   nla_get_u32(tb[TCA_CAKE_MPU]));
2631 
2632 	if (tb[TCA_CAKE_RTT]) {
2633 		u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2634 
2635 		WRITE_ONCE(q->interval, max(interval, 1U));
2636 	}
2637 
2638 	if (tb[TCA_CAKE_TARGET]) {
2639 		u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2640 
2641 		WRITE_ONCE(q->target, max(target, 1U));
2642 	}
2643 
2644 	if (tb[TCA_CAKE_AUTORATE]) {
2645 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2646 			rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2647 		else
2648 			rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2649 	}
2650 
2651 	if (tb[TCA_CAKE_INGRESS]) {
2652 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2653 			rate_flags |= CAKE_FLAG_INGRESS;
2654 		else
2655 			rate_flags &= ~CAKE_FLAG_INGRESS;
2656 	}
2657 
2658 	if (tb[TCA_CAKE_ACK_FILTER])
2659 		WRITE_ONCE(q->ack_filter,
2660 			   nla_get_u32(tb[TCA_CAKE_ACK_FILTER]));
2661 
2662 	if (tb[TCA_CAKE_MEMORY])
2663 		WRITE_ONCE(q->buffer_config_limit,
2664 			   nla_get_u32(tb[TCA_CAKE_MEMORY]));
2665 
2666 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2667 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2668 			rate_flags |= CAKE_FLAG_SPLIT_GSO;
2669 		else
2670 			rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2671 	}
2672 
2673 	if (tb[TCA_CAKE_FWMARK]) {
2674 		WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK]));
2675 		WRITE_ONCE(q->fwmark_shft,
2676 			   q->fwmark_mask ? __ffs(q->fwmark_mask) : 0);
2677 	}
2678 
2679 	WRITE_ONCE(q->rate_flags, rate_flags);
2680 	WRITE_ONCE(q->flow_mode, flow_mode);
2681 	if (q->tins) {
2682 		sch_tree_lock(sch);
2683 		cake_reconfigure(sch);
2684 		sch_tree_unlock(sch);
2685 	}
2686 
2687 	return 0;
2688 }
2689 
2690 static void cake_destroy(struct Qdisc *sch)
2691 {
2692 	struct cake_sched_data *q = qdisc_priv(sch);
2693 
2694 	qdisc_watchdog_cancel(&q->watchdog);
2695 	tcf_block_put(q->block);
2696 	kvfree(q->tins);
2697 }
2698 
2699 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2700 		     struct netlink_ext_ack *extack)
2701 {
2702 	struct cake_sched_data *q = qdisc_priv(sch);
2703 	int i, j, err;
2704 
2705 	sch->limit = 10240;
2706 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2707 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2708 
2709 	q->rate_bps = 0; /* unlimited by default */
2710 
2711 	q->interval = 100000; /* 100ms default */
2712 	q->target   =   5000; /* 5ms: codel RFC argues
2713 			       * for 5 to 10% of interval
2714 			       */
2715 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2716 	q->cur_tin = 0;
2717 	q->cur_flow  = 0;
2718 
2719 	qdisc_watchdog_init(&q->watchdog, sch);
2720 
2721 	if (opt) {
2722 		err = cake_change(sch, opt, extack);
2723 
2724 		if (err)
2725 			return err;
2726 	}
2727 
2728 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2729 	if (err)
2730 		return err;
2731 
2732 	quantum_div[0] = ~0;
2733 	for (i = 1; i <= CAKE_QUEUES; i++)
2734 		quantum_div[i] = 65535 / i;
2735 
2736 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2737 			   GFP_KERNEL);
2738 	if (!q->tins)
2739 		return -ENOMEM;
2740 
2741 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2742 		struct cake_tin_data *b = q->tins + i;
2743 
2744 		INIT_LIST_HEAD(&b->new_flows);
2745 		INIT_LIST_HEAD(&b->old_flows);
2746 		INIT_LIST_HEAD(&b->decaying_flows);
2747 		b->sparse_flow_count = 0;
2748 		b->bulk_flow_count = 0;
2749 		b->decaying_flow_count = 0;
2750 
2751 		for (j = 0; j < CAKE_QUEUES; j++) {
2752 			struct cake_flow *flow = b->flows + j;
2753 			u32 k = j * CAKE_MAX_TINS + i;
2754 
2755 			INIT_LIST_HEAD(&flow->flowchain);
2756 			cobalt_vars_init(&flow->cvars);
2757 
2758 			q->overflow_heap[k].t = i;
2759 			q->overflow_heap[k].b = j;
2760 			b->overflow_idx[j] = k;
2761 		}
2762 	}
2763 
2764 	cake_reconfigure(sch);
2765 	q->avg_peak_bandwidth = q->rate_bps;
2766 	q->min_netlen = ~0;
2767 	q->min_adjlen = ~0;
2768 	return 0;
2769 }
2770 
2771 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2772 {
2773 	struct cake_sched_data *q = qdisc_priv(sch);
2774 	struct nlattr *opts;
2775 	u16 rate_flags;
2776 	u8 flow_mode;
2777 
2778 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2779 	if (!opts)
2780 		goto nla_put_failure;
2781 
2782 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64,
2783 			      READ_ONCE(q->rate_bps), TCA_CAKE_PAD))
2784 		goto nla_put_failure;
2785 
2786 	flow_mode = READ_ONCE(q->flow_mode);
2787 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK))
2788 		goto nla_put_failure;
2789 
2790 	if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval)))
2791 		goto nla_put_failure;
2792 
2793 	if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target)))
2794 		goto nla_put_failure;
2795 
2796 	if (nla_put_u32(skb, TCA_CAKE_MEMORY,
2797 			READ_ONCE(q->buffer_config_limit)))
2798 		goto nla_put_failure;
2799 
2800 	rate_flags = READ_ONCE(q->rate_flags);
2801 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2802 			!!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2803 		goto nla_put_failure;
2804 
2805 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2806 			!!(rate_flags & CAKE_FLAG_INGRESS)))
2807 		goto nla_put_failure;
2808 
2809 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter)))
2810 		goto nla_put_failure;
2811 
2812 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2813 			!!(flow_mode & CAKE_FLOW_NAT_FLAG)))
2814 		goto nla_put_failure;
2815 
2816 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode)))
2817 		goto nla_put_failure;
2818 
2819 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2820 			!!(rate_flags & CAKE_FLAG_WASH)))
2821 		goto nla_put_failure;
2822 
2823 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead)))
2824 		goto nla_put_failure;
2825 
2826 	if (!(rate_flags & CAKE_FLAG_OVERHEAD))
2827 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2828 			goto nla_put_failure;
2829 
2830 	if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode)))
2831 		goto nla_put_failure;
2832 
2833 	if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu)))
2834 		goto nla_put_failure;
2835 
2836 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2837 			!!(rate_flags & CAKE_FLAG_SPLIT_GSO)))
2838 		goto nla_put_failure;
2839 
2840 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask)))
2841 		goto nla_put_failure;
2842 
2843 	return nla_nest_end(skb, opts);
2844 
2845 nla_put_failure:
2846 	return -1;
2847 }
2848 
2849 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2850 {
2851 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2852 	struct cake_sched_data *q = qdisc_priv(sch);
2853 	struct nlattr *tstats, *ts;
2854 	int i;
2855 
2856 	if (!stats)
2857 		return -1;
2858 
2859 #define PUT_STAT_U32(attr, data) do {				       \
2860 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2861 			goto nla_put_failure;			       \
2862 	} while (0)
2863 #define PUT_STAT_U64(attr, data) do {				       \
2864 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2865 					data, TCA_CAKE_STATS_PAD)) \
2866 			goto nla_put_failure;			       \
2867 	} while (0)
2868 
2869 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2870 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2871 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2872 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2873 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2874 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2875 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2876 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2877 
2878 #undef PUT_STAT_U32
2879 #undef PUT_STAT_U64
2880 
2881 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2882 	if (!tstats)
2883 		goto nla_put_failure;
2884 
2885 #define PUT_TSTAT_U32(attr, data) do {					\
2886 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2887 			goto nla_put_failure;				\
2888 	} while (0)
2889 #define PUT_TSTAT_U64(attr, data) do {					\
2890 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2891 					data, TCA_CAKE_TIN_STATS_PAD))	\
2892 			goto nla_put_failure;				\
2893 	} while (0)
2894 
2895 	for (i = 0; i < q->tin_cnt; i++) {
2896 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2897 
2898 		ts = nla_nest_start_noflag(d->skb, i + 1);
2899 		if (!ts)
2900 			goto nla_put_failure;
2901 
2902 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2903 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2904 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2905 
2906 		PUT_TSTAT_U32(TARGET_US,
2907 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2908 		PUT_TSTAT_U32(INTERVAL_US,
2909 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2910 
2911 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2912 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2913 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2914 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2915 
2916 		PUT_TSTAT_U32(PEAK_DELAY_US,
2917 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2918 		PUT_TSTAT_U32(AVG_DELAY_US,
2919 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2920 		PUT_TSTAT_U32(BASE_DELAY_US,
2921 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2922 
2923 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2924 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2925 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2926 
2927 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2928 					    b->decaying_flow_count);
2929 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2930 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2931 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2932 
2933 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2934 		nla_nest_end(d->skb, ts);
2935 	}
2936 
2937 #undef PUT_TSTAT_U32
2938 #undef PUT_TSTAT_U64
2939 
2940 	nla_nest_end(d->skb, tstats);
2941 	return nla_nest_end(d->skb, stats);
2942 
2943 nla_put_failure:
2944 	nla_nest_cancel(d->skb, stats);
2945 	return -1;
2946 }
2947 
2948 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2949 {
2950 	return NULL;
2951 }
2952 
2953 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2954 {
2955 	return 0;
2956 }
2957 
2958 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2959 			       u32 classid)
2960 {
2961 	return 0;
2962 }
2963 
2964 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2965 {
2966 }
2967 
2968 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2969 					struct netlink_ext_ack *extack)
2970 {
2971 	struct cake_sched_data *q = qdisc_priv(sch);
2972 
2973 	if (cl)
2974 		return NULL;
2975 	return q->block;
2976 }
2977 
2978 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2979 			   struct sk_buff *skb, struct tcmsg *tcm)
2980 {
2981 	tcm->tcm_handle |= TC_H_MIN(cl);
2982 	return 0;
2983 }
2984 
2985 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2986 				 struct gnet_dump *d)
2987 {
2988 	struct cake_sched_data *q = qdisc_priv(sch);
2989 	const struct cake_flow *flow = NULL;
2990 	struct gnet_stats_queue qs = { 0 };
2991 	struct nlattr *stats;
2992 	u32 idx = cl - 1;
2993 
2994 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2995 		const struct cake_tin_data *b = \
2996 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2997 		const struct sk_buff *skb;
2998 
2999 		flow = &b->flows[idx % CAKE_QUEUES];
3000 
3001 		if (flow->head) {
3002 			sch_tree_lock(sch);
3003 			skb = flow->head;
3004 			while (skb) {
3005 				qs.qlen++;
3006 				skb = skb->next;
3007 			}
3008 			sch_tree_unlock(sch);
3009 		}
3010 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3011 		qs.drops = flow->dropped;
3012 	}
3013 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3014 		return -1;
3015 	if (flow) {
3016 		ktime_t now = ktime_get();
3017 
3018 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3019 		if (!stats)
3020 			return -1;
3021 
3022 #define PUT_STAT_U32(attr, data) do {				       \
3023 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3024 			goto nla_put_failure;			       \
3025 	} while (0)
3026 #define PUT_STAT_S32(attr, data) do {				       \
3027 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3028 			goto nla_put_failure;			       \
3029 	} while (0)
3030 
3031 		PUT_STAT_S32(DEFICIT, flow->deficit);
3032 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3033 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3034 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3035 		if (flow->cvars.p_drop) {
3036 			PUT_STAT_S32(BLUE_TIMER_US,
3037 				     ktime_to_us(
3038 					     ktime_sub(now,
3039 						       flow->cvars.blue_timer)));
3040 		}
3041 		if (flow->cvars.dropping) {
3042 			PUT_STAT_S32(DROP_NEXT_US,
3043 				     ktime_to_us(
3044 					     ktime_sub(now,
3045 						       flow->cvars.drop_next)));
3046 		}
3047 
3048 		if (nla_nest_end(d->skb, stats) < 0)
3049 			return -1;
3050 	}
3051 
3052 	return 0;
3053 
3054 nla_put_failure:
3055 	nla_nest_cancel(d->skb, stats);
3056 	return -1;
3057 }
3058 
3059 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3060 {
3061 	struct cake_sched_data *q = qdisc_priv(sch);
3062 	unsigned int i, j;
3063 
3064 	if (arg->stop)
3065 		return;
3066 
3067 	for (i = 0; i < q->tin_cnt; i++) {
3068 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3069 
3070 		for (j = 0; j < CAKE_QUEUES; j++) {
3071 			if (list_empty(&b->flows[j].flowchain)) {
3072 				arg->count++;
3073 				continue;
3074 			}
3075 			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3076 						 arg))
3077 				break;
3078 		}
3079 	}
3080 }
3081 
3082 static const struct Qdisc_class_ops cake_class_ops = {
3083 	.leaf		=	cake_leaf,
3084 	.find		=	cake_find,
3085 	.tcf_block	=	cake_tcf_block,
3086 	.bind_tcf	=	cake_bind,
3087 	.unbind_tcf	=	cake_unbind,
3088 	.dump		=	cake_dump_class,
3089 	.dump_stats	=	cake_dump_class_stats,
3090 	.walk		=	cake_walk,
3091 };
3092 
3093 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3094 	.cl_ops		=	&cake_class_ops,
3095 	.id		=	"cake",
3096 	.priv_size	=	sizeof(struct cake_sched_data),
3097 	.enqueue	=	cake_enqueue,
3098 	.dequeue	=	cake_dequeue,
3099 	.peek		=	qdisc_peek_dequeued,
3100 	.init		=	cake_init,
3101 	.reset		=	cake_reset,
3102 	.destroy	=	cake_destroy,
3103 	.change		=	cake_change,
3104 	.dump		=	cake_dump,
3105 	.dump_stats	=	cake_dump_stats,
3106 	.owner		=	THIS_MODULE,
3107 };
3108 MODULE_ALIAS_NET_SCH("cake");
3109 
3110 static int __init cake_module_init(void)
3111 {
3112 	return register_qdisc(&cake_qdisc_ops);
3113 }
3114 
3115 static void __exit cake_module_exit(void)
3116 {
3117 	unregister_qdisc(&cake_qdisc_ops);
3118 }
3119 
3120 module_init(cake_module_init)
3121 module_exit(cake_module_exit)
3122 MODULE_AUTHOR("Jonathan Morton");
3123 MODULE_LICENSE("Dual BSD/GPL");
3124 MODULE_DESCRIPTION("The CAKE shaper.");
3125