xref: /linux/net/sched/sch_cake.c (revision 6a35ddc5445a8291ced6247a67977e110275acde)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating on.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
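
/* As an illustration of the "simple configuration" goal, a typical
 * deployment needs little more than the shaped bandwidth and the Diffserv
 * scheme, e.g. (illustrative tc-cake(8) invocation, not part of this file):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *
 * The AQM parameters are then auto-tuned from that bandwidth, and everything
 * else falls back to its defaults.
 */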
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/pkt_sched.h>
69 #include <net/pkt_cls.h>
70 #include <net/tcp.h>
71 #include <net/flow_dissector.h>
72 
73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
74 #include <net/netfilter/nf_conntrack_core.h>
75 #endif
76 
77 #define CAKE_SET_WAYS (8)
78 #define CAKE_MAX_TINS (8)
79 #define CAKE_QUEUES (1024)
80 #define CAKE_FLOW_MASK 63
81 #define CAKE_FLOW_NAT_FLAG 64
82 
83 /* struct cobalt_params - contains codel and blue parameters
84  * @interval:	codel initial drop interval
85  * @target:     maximum persistent sojourn time & blue update rate
86  * @mtu_time:   serialisation delay of maximum-size packet
87  * @p_inc:      increment of blue drop probability (0.32 fxp)
88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
89  */
90 struct cobalt_params {
91 	u64	interval;
92 	u64	target;
93 	u64	mtu_time;
94 	u32	p_inc;
95 	u32	p_dec;
96 };
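
/* Illustrative numbers, assuming the usual 5 ms target and 100 ms interval
 * defaults: at a 10 Mbit/s shaper rate a 1500-byte packet serialises in
 * 1500 * 8 / 10^7 s = 1.2 ms, which is what @mtu_time records; the AQM
 * declines to signal on sojourn times below a few such packet times.
 */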
97 
98 /* struct cobalt_vars - contains codel and blue variables
99  * @count:		codel dropping frequency
100  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
101  * @drop_next:		time to drop next packet, or when we dropped last
102  * @blue_timer:		Blue time to next drop
103  * @p_drop:		BLUE drop probability (0.32 fxp)
104  * @dropping:		set if in dropping state
105  * @ecn_marked:		set if marked
106  */
107 struct cobalt_vars {
108 	u32	count;
109 	u32	rec_inv_sqrt;
110 	ktime_t	drop_next;
111 	ktime_t	blue_timer;
112 	u32     p_drop;
113 	bool	dropping;
114 	bool    ecn_marked;
115 };
116 
117 enum {
118 	CAKE_SET_NONE = 0,
119 	CAKE_SET_SPARSE,
120 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121 	CAKE_SET_BULK,
122 	CAKE_SET_DECAYING
123 };
124 
125 struct cake_flow {
126 	/* this stuff is all needed per-flow at dequeue time */
127 	struct sk_buff	  *head;
128 	struct sk_buff	  *tail;
129 	struct list_head  flowchain;
130 	s32		  deficit;
131 	u32		  dropped;
132 	struct cobalt_vars cvars;
133 	u16		  srchost; /* index into cake_host table */
134 	u16		  dsthost;
135 	u8		  set;
136 }; /* please try to keep this structure <= 64 bytes */
137 
138 struct cake_host {
139 	u32 srchost_tag;
140 	u32 dsthost_tag;
141 	u16 srchost_bulk_flow_count;
142 	u16 dsthost_bulk_flow_count;
143 };
144 
145 struct cake_heap_entry {
146 	u16 t:3, b:10;
147 };
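
/* In the bitfield above, t indexes one of CAKE_MAX_TINS (8 = 2^3) tins and
 * b one of CAKE_QUEUES (1024 = 2^10) flow queues, so a heap entry packs
 * into a single 16-bit word.
 */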
148 
149 struct cake_tin_data {
150 	struct cake_flow flows[CAKE_QUEUES];
151 	u32	backlogs[CAKE_QUEUES];
152 	u32	tags[CAKE_QUEUES]; /* for set association */
153 	u16	overflow_idx[CAKE_QUEUES];
154 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155 	u16	flow_quantum;
156 
157 	struct cobalt_params cparams;
158 	u32	drop_overlimit;
159 	u16	bulk_flow_count;
160 	u16	sparse_flow_count;
161 	u16	decaying_flow_count;
162 	u16	unresponsive_flow_count;
163 
164 	u32	max_skblen;
165 
166 	struct list_head new_flows;
167 	struct list_head old_flows;
168 	struct list_head decaying_flows;
169 
170 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171 	ktime_t	time_next_packet;
172 	u64	tin_rate_ns;
173 	u64	tin_rate_bps;
174 	u16	tin_rate_shft;
175 
176 	u16	tin_quantum;
177 	s32	tin_deficit;
178 	u32	tin_backlog;
179 	u32	tin_dropped;
180 	u32	tin_ecn_mark;
181 
182 	u32	packets;
183 	u64	bytes;
184 
185 	u32	ack_drops;
186 
187 	/* moving averages */
188 	u64 avge_delay;
189 	u64 peak_delay;
190 	u64 base_delay;
191 
192 	/* hash function stats */
193 	u32	way_directs;
194 	u32	way_hits;
195 	u32	way_misses;
196 	u32	way_collisions;
197 }; /* number of tins is small, so size of this struct doesn't matter much */
198 
199 struct cake_sched_data {
200 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
201 	struct tcf_block *block;
202 	struct cake_tin_data *tins;
203 
204 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
205 	u16		overflow_timeout;
206 
207 	u16		tin_cnt;
208 	u8		tin_mode;
209 	u8		flow_mode;
210 	u8		ack_filter;
211 	u8		atm_mode;
212 
213 	u32		fwmark_mask;
214 	u16		fwmark_shft;
215 
216 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
217 	u16		rate_shft;
218 	ktime_t		time_next_packet;
219 	ktime_t		failsafe_next_packet;
220 	u64		rate_ns;
221 	u64		rate_bps;
222 	u16		rate_flags;
223 	s16		rate_overhead;
224 	u16		rate_mpu;
225 	u64		interval;
226 	u64		target;
227 
228 	/* resource tracking */
229 	u32		buffer_used;
230 	u32		buffer_max_used;
231 	u32		buffer_limit;
232 	u32		buffer_config_limit;
233 
234 	/* indices for dequeue */
235 	u16		cur_tin;
236 	u16		cur_flow;
237 
238 	struct qdisc_watchdog watchdog;
239 	const u8	*tin_index;
240 	const u8	*tin_order;
241 
242 	/* bandwidth capacity estimate */
243 	ktime_t		last_packet_time;
244 	ktime_t		avg_window_begin;
245 	u64		avg_packet_interval;
246 	u64		avg_window_bytes;
247 	u64		avg_peak_bandwidth;
248 	ktime_t		last_reconfig_time;
249 
250 	/* packet length stats */
251 	u32		avg_netoff;
252 	u16		max_netlen;
253 	u16		max_adjlen;
254 	u16		min_netlen;
255 	u16		min_adjlen;
256 };
257 
258 enum {
259 	CAKE_FLAG_OVERHEAD	   = BIT(0),
260 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
261 	CAKE_FLAG_INGRESS	   = BIT(2),
262 	CAKE_FLAG_WASH		   = BIT(3),
263 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
264 };
265 
266 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
267  * obtain the best features of each.  Codel is excellent on flows which
268  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
269  * unresponsive flows.
270  */
271 
272 struct cobalt_skb_cb {
273 	ktime_t enqueue_time;
274 	u32     adjusted_len;
275 };
276 
277 static u64 us_to_ns(u64 us)
278 {
279 	return us * NSEC_PER_USEC;
280 }
281 
282 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
283 {
284 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
285 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
286 }
287 
288 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
289 {
290 	return get_cobalt_cb(skb)->enqueue_time;
291 }
292 
293 static void cobalt_set_enqueue_time(struct sk_buff *skb,
294 				    ktime_t now)
295 {
296 	get_cobalt_cb(skb)->enqueue_time = now;
297 }
298 
299 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
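
/* quantum_div[] is a reciprocal table (filled at init time with roughly
 * 65535 / i) so the per-host fair share can be computed with a multiply and
 * a shift instead of a divide; e.g. for a host_load of 3,
 * (flow_quantum * quantum_div[3]) >> 16 ~= flow_quantum * 21845 / 65536,
 * i.e. about a third of the quantum.
 */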
300 
301 /* Diffserv lookup tables */
302 
303 static const u8 precedence[] = {
304 	0, 0, 0, 0, 0, 0, 0, 0,
305 	1, 1, 1, 1, 1, 1, 1, 1,
306 	2, 2, 2, 2, 2, 2, 2, 2,
307 	3, 3, 3, 3, 3, 3, 3, 3,
308 	4, 4, 4, 4, 4, 4, 4, 4,
309 	5, 5, 5, 5, 5, 5, 5, 5,
310 	6, 6, 6, 6, 6, 6, 6, 6,
311 	7, 7, 7, 7, 7, 7, 7, 7,
312 };
313 
314 static const u8 diffserv8[] = {
315 	2, 5, 1, 2, 4, 2, 2, 2,
316 	0, 2, 1, 2, 1, 2, 1, 2,
317 	5, 2, 4, 2, 4, 2, 4, 2,
318 	3, 2, 3, 2, 3, 2, 3, 2,
319 	6, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 2, 2, 6, 2, 6, 2,
321 	7, 2, 2, 2, 2, 2, 2, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 };
324 
325 static const u8 diffserv4[] = {
326 	0, 2, 0, 0, 2, 0, 0, 0,
327 	1, 0, 0, 0, 0, 0, 0, 0,
328 	2, 0, 2, 0, 2, 0, 2, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	3, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 0, 0, 3, 0, 3, 0,
332 	3, 0, 0, 0, 0, 0, 0, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 };
335 
336 static const u8 diffserv3[] = {
337 	0, 0, 0, 0, 2, 0, 0, 0,
338 	1, 0, 0, 0, 0, 0, 0, 0,
339 	0, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 2, 0, 2, 0,
343 	2, 0, 0, 0, 0, 0, 0, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 };
346 
347 static const u8 besteffort[] = {
348 	0, 0, 0, 0, 0, 0, 0, 0,
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 };
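
/* Each table above is indexed by the 6-bit DSCP value (0..63) and returns
 * the tin number used by that scheme.  For example, under diffserv4,
 * DSCP 46 (EF) maps to tin 3 and DSCP 8 (CS1) to tin 1, while codepoints
 * not singled out by the scheme fall through to tin 0.
 */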
357 
358 /* tin priority order for stats dumping */
359 
360 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
361 static const u8 bulk_order[] = {1, 0, 2, 3};
362 
363 #define REC_INV_SQRT_CACHE (16)
364 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
365 
366 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
367  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
368  *
369  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
370  */
371 
372 static void cobalt_newton_step(struct cobalt_vars *vars)
373 {
374 	u32 invsqrt, invsqrt2;
375 	u64 val;
376 
377 	invsqrt = vars->rec_inv_sqrt;
378 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
379 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
380 
381 	val >>= 2; /* avoid overflow in following multiply */
382 	val = (val * invsqrt) >> (32 - 2 + 1);
383 
384 	vars->rec_inv_sqrt = val;
385 }
386 
387 static void cobalt_invsqrt(struct cobalt_vars *vars)
388 {
389 	if (vars->count < REC_INV_SQRT_CACHE)
390 		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
391 	else
392 		cobalt_newton_step(vars);
393 }
394 
395 /* There is a big difference in timing between the accurate values placed in
396  * the cache and the approximations given by a single Newton step for small
397  * count values, particularly when stepping from count 1 to 2 or vice versa.
398  * Above 16, a single Newton step gives sufficient accuracy in either
399  * direction, given the precision stored.
400  *
401  * The magnitude of the error when stepping up to count 2 is such as to give
402  * the value that *should* have been produced at count 4.
403  */
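
/* Illustrative numbers for the above: stepping from count 1
 * (invsqrt ~= 1.0) towards count 2, a single Newton iteration yields
 * 1.0 * (3 - 2 * 1.0^2) / 2 = 0.5, i.e. the value appropriate for count 4
 * rather than 1/sqrt(2) ~= 0.707.  This is why counts below
 * REC_INV_SQRT_CACHE are served from the precomputed cache instead.
 */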
404 
405 static void cobalt_cache_init(void)
406 {
407 	struct cobalt_vars v;
408 
409 	memset(&v, 0, sizeof(v));
410 	v.rec_inv_sqrt = ~0U;
411 	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
412 
413 	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
414 		cobalt_newton_step(&v);
415 		cobalt_newton_step(&v);
416 		cobalt_newton_step(&v);
417 		cobalt_newton_step(&v);
418 
419 		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
420 	}
421 }
422 
423 static void cobalt_vars_init(struct cobalt_vars *vars)
424 {
425 	memset(vars, 0, sizeof(*vars));
426 
427 	if (!cobalt_rec_inv_sqrt_cache[0]) {
428 		cobalt_cache_init();
429 		cobalt_rec_inv_sqrt_cache[0] = ~0;
430 	}
431 }
432 
433 /* CoDel control_law is t + interval/sqrt(count)
434  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
435  * both sqrt() and divide operation.
436  */
437 static ktime_t cobalt_control(ktime_t t,
438 			      u64 interval,
439 			      u32 rec_inv_sqrt)
440 {
441 	return ktime_add_ns(t, reciprocal_scale(interval,
442 						rec_inv_sqrt));
443 }
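
/* reciprocal_scale(interval, rec_inv_sqrt) evaluates
 * (interval * rec_inv_sqrt) >> 32, i.e. interval / sqrt(count) with
 * rec_inv_sqrt in Q0.32.  For example, with a 100 ms interval and
 * count == 4 (rec_inv_sqrt ~= 2^31), the next signal is scheduled roughly
 * 50 ms after t.
 */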
444 
445 /* Call this when a packet had to be dropped due to queue overflow.  Returns
446  * true if the BLUE state was quiescent before but active after this call.
447  */
448 static bool cobalt_queue_full(struct cobalt_vars *vars,
449 			      struct cobalt_params *p,
450 			      ktime_t now)
451 {
452 	bool up = false;
453 
454 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
455 		up = !vars->p_drop;
456 		vars->p_drop += p->p_inc;
457 		if (vars->p_drop < p->p_inc)
458 			vars->p_drop = ~0;
459 		vars->blue_timer = now;
460 	}
461 	vars->dropping = true;
462 	vars->drop_next = now;
463 	if (!vars->count)
464 		vars->count = 1;
465 
466 	return up;
467 }
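
/* p_drop is a 0.32 fixed-point probability, so ~0U is as close to 1.0 as it
 * gets.  The "p_drop < p_inc" test above catches wrap-around of the addition
 * and saturates the probability at that maximum instead of letting it wrap
 * back to a small value.
 */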
468 
469 /* Call this when the queue was serviced but turned out to be empty.  Returns
470  * true if the BLUE state was active before but quiescent after this call.
471  */
472 static bool cobalt_queue_empty(struct cobalt_vars *vars,
473 			       struct cobalt_params *p,
474 			       ktime_t now)
475 {
476 	bool down = false;
477 
478 	if (vars->p_drop &&
479 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
480 		if (vars->p_drop < p->p_dec)
481 			vars->p_drop = 0;
482 		else
483 			vars->p_drop -= p->p_dec;
484 		vars->blue_timer = now;
485 		down = !vars->p_drop;
486 	}
487 	vars->dropping = false;
488 
489 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
490 		vars->count--;
491 		cobalt_invsqrt(vars);
492 		vars->drop_next = cobalt_control(vars->drop_next,
493 						 p->interval,
494 						 vars->rec_inv_sqrt);
495 	}
496 
497 	return down;
498 }
499 
500 /* Call this with a freshly dequeued packet for possible congestion marking.
501  * Returns true as an instruction to drop the packet, false for delivery.
502  */
503 static bool cobalt_should_drop(struct cobalt_vars *vars,
504 			       struct cobalt_params *p,
505 			       ktime_t now,
506 			       struct sk_buff *skb,
507 			       u32 bulk_flows)
508 {
509 	bool next_due, over_target, drop = false;
510 	ktime_t schedule;
511 	u64 sojourn;
512 
513 /* The 'schedule' variable records, in its sign, whether 'now' is before or
514  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
515  * scheduling decision is actually branched, without destroying that
516  * information.  Similarly, the first 'schedule' value calculated is preserved
517  * in the boolean 'next_due'.
518  *
519  * As for 'drop_next', we take advantage of the fact that 'interval' is both
520  * the delay between first exceeding 'target' and the first signalling event,
521  * *and* the scaling factor for the signalling frequency.  It's therefore very
522  * natural to use a single mechanism for both purposes, and eliminates a
523  * significant amount of reference Codel's spaghetti code.  To help with this,
524  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
525  * as possible to 1.0 in fixed-point.
526  */
527 
528 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
529 	schedule = ktime_sub(now, vars->drop_next);
530 	over_target = sojourn > p->target &&
531 		      sojourn > p->mtu_time * bulk_flows * 2 &&
532 		      sojourn > p->mtu_time * 4;
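	/* Illustrative numbers: at 10 Mbit/s (mtu_time ~= 1.2 ms for a
	 * 1500-byte MTU), with 8 bulk flows and the usual 5 ms target, a
	 * packet only counts as over target once its sojourn time exceeds
	 * max(5 ms, 2 * 1.2 ms * 8, 4 * 1.2 ms) ~= 19.2 ms.
	 */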
533 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
534 
535 	vars->ecn_marked = false;
536 
537 	if (over_target) {
538 		if (!vars->dropping) {
539 			vars->dropping = true;
540 			vars->drop_next = cobalt_control(now,
541 							 p->interval,
542 							 vars->rec_inv_sqrt);
543 		}
544 		if (!vars->count)
545 			vars->count = 1;
546 	} else if (vars->dropping) {
547 		vars->dropping = false;
548 	}
549 
550 	if (next_due && vars->dropping) {
551 		/* Use ECN mark if possible, otherwise drop */
552 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
553 
554 		vars->count++;
555 		if (!vars->count)
556 			vars->count--;
557 		cobalt_invsqrt(vars);
558 		vars->drop_next = cobalt_control(vars->drop_next,
559 						 p->interval,
560 						 vars->rec_inv_sqrt);
561 		schedule = ktime_sub(now, vars->drop_next);
562 	} else {
563 		while (next_due) {
564 			vars->count--;
565 			cobalt_invsqrt(vars);
566 			vars->drop_next = cobalt_control(vars->drop_next,
567 							 p->interval,
568 							 vars->rec_inv_sqrt);
569 			schedule = ktime_sub(now, vars->drop_next);
570 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
571 		}
572 	}
573 
574 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
575 	if (vars->p_drop)
576 		drop |= (prandom_u32() < vars->p_drop);
577 
578 	/* Overload the drop_next field as an activity timeout */
579 	if (!vars->count)
580 		vars->drop_next = ktime_add_ns(now, p->interval);
581 	else if (ktime_to_ns(schedule) > 0 && !drop)
582 		vars->drop_next = now;
583 
584 	return drop;
585 }
586 
587 static void cake_update_flowkeys(struct flow_keys *keys,
588 				 const struct sk_buff *skb)
589 {
590 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
591 	struct nf_conntrack_tuple tuple = {};
592 	bool rev = !skb->_nfct;
593 
594 	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
595 		return;
596 
597 	if (!nf_ct_get_tuple_skb(&tuple, skb))
598 		return;
599 
600 	keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
601 	keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
602 
603 	if (keys->ports.ports) {
604 		keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
605 		keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
606 	}
607 #endif
608 }
609 
610 /* CAKE's flow modes are multi-bit values.  The dual-host modes share their
611  * bits with triple-isolate mode, so these helpers match that mode as well.
612  */
613 
614 static bool cake_dsrc(int flow_mode)
615 {
616 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
617 }
618 
619 static bool cake_ddst(int flow_mode)
620 {
621 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
622 }
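
/* Concretely: CAKE_FLOW_TRIPLE is the union of the CAKE_FLOW_DUAL_SRC and
 * CAKE_FLOW_DUAL_DST bit patterns, so the equality tests above are true for
 * the respective dual mode and for triple-isolate, but not for the plain
 * host- or flow-keyed modes.
 */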
623 
624 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
625 		     int flow_mode, u16 flow_override, u16 host_override)
626 {
627 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
628 	u16 reduced_hash, srchost_idx, dsthost_idx;
629 	struct flow_keys keys, host_keys;
630 
631 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
632 		return 0;
633 
634 	/* If both overrides are set we can skip packet dissection entirely */
635 	if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) &&
636 	    (host_override || !(flow_mode & CAKE_FLOW_HOSTS)))
637 		goto skip_hash;
638 
639 	skb_flow_dissect_flow_keys(skb, &keys,
640 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
641 
642 	if (flow_mode & CAKE_FLOW_NAT_FLAG)
643 		cake_update_flowkeys(&keys, skb);
644 
645 	/* flow_hash_from_keys() sorts the addresses by value, so we have
646 	 * to preserve their order in a separate data structure to treat
647 	 * src and dst host addresses as independently selectable.
648 	 */
649 	host_keys = keys;
650 	host_keys.ports.ports     = 0;
651 	host_keys.basic.ip_proto  = 0;
652 	host_keys.keyid.keyid     = 0;
653 	host_keys.tags.flow_label = 0;
654 
655 	switch (host_keys.control.addr_type) {
656 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
657 		host_keys.addrs.v4addrs.src = 0;
658 		dsthost_hash = flow_hash_from_keys(&host_keys);
659 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
660 		host_keys.addrs.v4addrs.dst = 0;
661 		srchost_hash = flow_hash_from_keys(&host_keys);
662 		break;
663 
664 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
665 		memset(&host_keys.addrs.v6addrs.src, 0,
666 		       sizeof(host_keys.addrs.v6addrs.src));
667 		dsthost_hash = flow_hash_from_keys(&host_keys);
668 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
669 		memset(&host_keys.addrs.v6addrs.dst, 0,
670 		       sizeof(host_keys.addrs.v6addrs.dst));
671 		srchost_hash = flow_hash_from_keys(&host_keys);
672 		break;
673 
674 	default:
675 		dsthost_hash = 0;
676 		srchost_hash = 0;
677 	}
678 
679 	/* This *must* be after the above switch, since as a
680 	 * side-effect it sorts the src and dst addresses.
681 	 */
682 	if (flow_mode & CAKE_FLOW_FLOWS)
683 		flow_hash = flow_hash_from_keys(&keys);
684 
685 skip_hash:
686 	if (flow_override)
687 		flow_hash = flow_override - 1;
688 	if (host_override) {
689 		dsthost_hash = host_override - 1;
690 		srchost_hash = host_override - 1;
691 	}
692 
693 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
694 		if (flow_mode & CAKE_FLOW_SRC_IP)
695 			flow_hash ^= srchost_hash;
696 
697 		if (flow_mode & CAKE_FLOW_DST_IP)
698 			flow_hash ^= dsthost_hash;
699 	}
700 
701 	reduced_hash = flow_hash % CAKE_QUEUES;
702 
703 	/* set-associative hashing */
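	/* e.g. a reduced_hash of 273 falls in the 8-way set based at index 272
	 * (inner_hash 1, outer_hash 272); collisions are resolved by probing
	 * queues 273..279 and finally 272 within that set.
	 */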
704 	/* fast path if no hash collision (direct lookup succeeds) */
705 	if (likely(q->tags[reduced_hash] == flow_hash &&
706 		   q->flows[reduced_hash].set)) {
707 		q->way_directs++;
708 	} else {
709 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
710 		u32 outer_hash = reduced_hash - inner_hash;
711 		bool allocate_src = false;
712 		bool allocate_dst = false;
713 		u32 i, k;
714 
715 		/* check if any active queue in the set is reserved for
716 		 * this flow.
717 		 */
718 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
719 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
720 			if (q->tags[outer_hash + k] == flow_hash) {
721 				if (i)
722 					q->way_hits++;
723 
724 				if (!q->flows[outer_hash + k].set) {
725 					/* need to increment host refcnts */
726 					allocate_src = cake_dsrc(flow_mode);
727 					allocate_dst = cake_ddst(flow_mode);
728 				}
729 
730 				goto found;
731 			}
732 		}
733 
734 		/* no queue is reserved for this flow, look for an
735 		 * empty one.
736 		 */
737 		for (i = 0; i < CAKE_SET_WAYS;
738 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
739 			if (!q->flows[outer_hash + k].set) {
740 				q->way_misses++;
741 				allocate_src = cake_dsrc(flow_mode);
742 				allocate_dst = cake_ddst(flow_mode);
743 				goto found;
744 			}
745 		}
746 
747 		/* With no empty queues, default to the original
748 		 * queue, accept the collision, update the host tags.
749 		 */
750 		q->way_collisions++;
751 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
752 			q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
753 			q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
754 		}
755 		allocate_src = cake_dsrc(flow_mode);
756 		allocate_dst = cake_ddst(flow_mode);
757 found:
758 		/* reserve queue for future packets in same flow */
759 		reduced_hash = outer_hash + k;
760 		q->tags[reduced_hash] = flow_hash;
761 
762 		if (allocate_src) {
763 			srchost_idx = srchost_hash % CAKE_QUEUES;
764 			inner_hash = srchost_idx % CAKE_SET_WAYS;
765 			outer_hash = srchost_idx - inner_hash;
766 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
767 				i++, k = (k + 1) % CAKE_SET_WAYS) {
768 				if (q->hosts[outer_hash + k].srchost_tag ==
769 				    srchost_hash)
770 					goto found_src;
771 			}
772 			for (i = 0; i < CAKE_SET_WAYS;
773 				i++, k = (k + 1) % CAKE_SET_WAYS) {
774 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
775 					break;
776 			}
777 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
778 found_src:
779 			srchost_idx = outer_hash + k;
780 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
781 				q->hosts[srchost_idx].srchost_bulk_flow_count++;
782 			q->flows[reduced_hash].srchost = srchost_idx;
783 		}
784 
785 		if (allocate_dst) {
786 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
787 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
788 			outer_hash = dsthost_idx - inner_hash;
789 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
790 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
791 				if (q->hosts[outer_hash + k].dsthost_tag ==
792 				    dsthost_hash)
793 					goto found_dst;
794 			}
795 			for (i = 0; i < CAKE_SET_WAYS;
796 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
797 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
798 					break;
799 			}
800 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
801 found_dst:
802 			dsthost_idx = outer_hash + k;
803 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
804 				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
805 			q->flows[reduced_hash].dsthost = dsthost_idx;
806 		}
807 	}
808 
809 	return reduced_hash;
810 }
811 
812 /* helper functions: might be changed when/if skb uses a standard list_head */
813 /* remove one skb from head of slot queue */
814 
815 static struct sk_buff *dequeue_head(struct cake_flow *flow)
816 {
817 	struct sk_buff *skb = flow->head;
818 
819 	if (skb) {
820 		flow->head = skb->next;
821 		skb_mark_not_on_list(skb);
822 	}
823 
824 	return skb;
825 }
826 
827 /* add skb to flow queue (tail add) */
828 
829 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
830 {
831 	if (!flow->head)
832 		flow->head = skb;
833 	else
834 		flow->tail->next = skb;
835 	flow->tail = skb;
836 	skb->next = NULL;
837 }
838 
839 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
840 				    struct ipv6hdr *buf)
841 {
842 	unsigned int offset = skb_network_offset(skb);
843 	struct iphdr *iph;
844 
845 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
846 
847 	if (!iph)
848 		return NULL;
849 
850 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
851 		return skb_header_pointer(skb, offset + iph->ihl * 4,
852 					  sizeof(struct ipv6hdr), buf);
853 
854 	else if (iph->version == 4)
855 		return iph;
856 
857 	else if (iph->version == 6)
858 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
859 					  buf);
860 
861 	return NULL;
862 }
863 
864 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
865 				      void *buf, unsigned int bufsize)
866 {
867 	unsigned int offset = skb_network_offset(skb);
868 	const struct ipv6hdr *ipv6h;
869 	const struct tcphdr *tcph;
870 	const struct iphdr *iph;
871 	struct ipv6hdr _ipv6h;
872 	struct tcphdr _tcph;
873 
874 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
875 
876 	if (!ipv6h)
877 		return NULL;
878 
879 	if (ipv6h->version == 4) {
880 		iph = (struct iphdr *)ipv6h;
881 		offset += iph->ihl * 4;
882 
883 		/* special-case 6in4 tunnelling, as that is a common way to get
884 		 * v6 connectivity in the home
885 		 */
886 		if (iph->protocol == IPPROTO_IPV6) {
887 			ipv6h = skb_header_pointer(skb, offset,
888 						   sizeof(_ipv6h), &_ipv6h);
889 
890 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
891 				return NULL;
892 
893 			offset += sizeof(struct ipv6hdr);
894 
895 		} else if (iph->protocol != IPPROTO_TCP) {
896 			return NULL;
897 		}
898 
899 	} else if (ipv6h->version == 6) {
900 		if (ipv6h->nexthdr != IPPROTO_TCP)
901 			return NULL;
902 
903 		offset += sizeof(struct ipv6hdr);
904 	} else {
905 		return NULL;
906 	}
907 
908 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
909 	if (!tcph)
910 		return NULL;
911 
912 	return skb_header_pointer(skb, offset,
913 				  min(__tcp_hdrlen(tcph), bufsize), buf);
914 }
915 
916 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
917 				   int code, int *oplen)
918 {
919 	/* inspired by tcp_parse_options in tcp_input.c */
920 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
921 	const u8 *ptr = (const u8 *)(tcph + 1);
922 
923 	while (length > 0) {
924 		int opcode = *ptr++;
925 		int opsize;
926 
927 		if (opcode == TCPOPT_EOL)
928 			break;
929 		if (opcode == TCPOPT_NOP) {
930 			length--;
931 			continue;
932 		}
933 		opsize = *ptr++;
934 		if (opsize < 2 || opsize > length)
935 			break;
936 
937 		if (opcode == code) {
938 			*oplen = opsize;
939 			return ptr;
940 		}
941 
942 		ptr += opsize - 2;
943 		length -= opsize;
944 	}
945 
946 	return NULL;
947 }
948 
949 /* Compare two SACK sequences. A sequence is considered greater if it SACKs
950  * more bytes than the other. In the case where both sequences SACK bytes that
951  * the other doesn't, A is considered greater. DSACKs in A also make A
952  * considered greater.
953  *
954  * @return -1, 0 or 1, as normal compare functions do
955  */
956 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
957 				  const struct tcphdr *tcph_b)
958 {
959 	const struct tcp_sack_block_wire *sack_a, *sack_b;
960 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
961 	u32 bytes_a = 0, bytes_b = 0;
962 	int oplen_a, oplen_b;
963 	bool first = true;
964 
965 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
966 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
967 
968 	/* pointers point to option contents */
969 	oplen_a -= TCPOLEN_SACK_BASE;
970 	oplen_b -= TCPOLEN_SACK_BASE;
971 
972 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
973 	    (!sack_b || oplen_b < sizeof(*sack_b)))
974 		return -1;
975 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
976 		 (!sack_a || oplen_a < sizeof(*sack_a)))
977 		return 1;
978 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
979 		 (!sack_b || oplen_b < sizeof(*sack_b)))
980 		return 0;
981 
982 	while (oplen_a >= sizeof(*sack_a)) {
983 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
984 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
985 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
986 		int oplen_tmp = oplen_b;
987 		bool found = false;
988 
989 		/* DSACK; always considered greater to prevent dropping */
990 		if (before(start_a, ack_seq_a))
991 			return -1;
992 
993 		bytes_a += end_a - start_a;
994 
995 		while (oplen_tmp >= sizeof(*sack_tmp)) {
996 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
997 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
998 
999 			/* first time through we count the total size */
1000 			if (first)
1001 				bytes_b += end_b - start_b;
1002 
1003 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1004 				found = true;
1005 				if (!first)
1006 					break;
1007 			}
1008 			oplen_tmp -= sizeof(*sack_tmp);
1009 			sack_tmp++;
1010 		}
1011 
1012 		if (!found)
1013 			return -1;
1014 
1015 		oplen_a -= sizeof(*sack_a);
1016 		sack_a++;
1017 		first = false;
1018 	}
1019 
1020 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1021 	 * either the SACKs are equal, or B SACKs more bytes.
1022 	 */
1023 	return bytes_b > bytes_a ? 1 : 0;
1024 }
1025 
1026 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1027 				 u32 *tsval, u32 *tsecr)
1028 {
1029 	const u8 *ptr;
1030 	int opsize;
1031 
1032 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1033 
1034 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1035 		*tsval = get_unaligned_be32(ptr);
1036 		*tsecr = get_unaligned_be32(ptr + 4);
1037 	}
1038 }
1039 
1040 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1041 			       u32 tstamp_new, u32 tsecr_new)
1042 {
1043 	/* inspired by tcp_parse_options in tcp_input.c */
1044 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1045 	const u8 *ptr = (const u8 *)(tcph + 1);
1046 	u32 tstamp, tsecr;
1047 
1048 	/* 3 reserved flags must be unset to avoid future breakage
1049 	 * ACK must be set
1050 	 * ECE/CWR are handled separately
1051 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1052 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1053 	 * 0x00C00000 = CWR/ECE (handled separately)
1054 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1055 	 */
1056 	if (((tcp_flag_word(tcph) &
1057 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1058 		return false;
1059 
1060 	while (length > 0) {
1061 		int opcode = *ptr++;
1062 		int opsize;
1063 
1064 		if (opcode == TCPOPT_EOL)
1065 			break;
1066 		if (opcode == TCPOPT_NOP) {
1067 			length--;
1068 			continue;
1069 		}
1070 		opsize = *ptr++;
1071 		if (opsize < 2 || opsize > length)
1072 			break;
1073 
1074 		switch (opcode) {
1075 		case TCPOPT_MD5SIG: /* doesn't influence state */
1076 			break;
1077 
1078 		case TCPOPT_SACK: /* stricter checking performed later */
1079 			if (opsize % 8 != 2)
1080 				return false;
1081 			break;
1082 
1083 		case TCPOPT_TIMESTAMP:
1084 			/* only drop timestamps lower than new */
1085 			if (opsize != TCPOLEN_TIMESTAMP)
1086 				return false;
1087 			tstamp = get_unaligned_be32(ptr);
1088 			tsecr = get_unaligned_be32(ptr + 4);
1089 			if (after(tstamp, tstamp_new) ||
1090 			    after(tsecr, tsecr_new))
1091 				return false;
1092 			break;
1093 
1094 		case TCPOPT_MSS:  /* these should only be set on SYN */
1095 		case TCPOPT_WINDOW:
1096 		case TCPOPT_SACK_PERM:
1097 		case TCPOPT_FASTOPEN:
1098 		case TCPOPT_EXP:
1099 		default: /* don't drop if any unknown options are present */
1100 			return false;
1101 		}
1102 
1103 		ptr += opsize - 2;
1104 		length -= opsize;
1105 	}
1106 
1107 	return true;
1108 }
1109 
1110 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1111 				       struct cake_flow *flow)
1112 {
1113 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1114 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1115 	struct sk_buff *skb_check, *skb_prev = NULL;
1116 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1117 	unsigned char _tcph[64], _tcph_check[64];
1118 	const struct tcphdr *tcph, *tcph_check;
1119 	const struct iphdr *iph, *iph_check;
1120 	struct ipv6hdr _iph, _iph_check;
1121 	const struct sk_buff *skb;
1122 	int seglen, num_found = 0;
1123 	u32 tstamp = 0, tsecr = 0;
1124 	__be32 elig_flags = 0;
1125 	int sack_comp;
1126 
1127 	/* no other possible ACKs to filter */
1128 	if (flow->head == flow->tail)
1129 		return NULL;
1130 
1131 	skb = flow->tail;
1132 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1133 	iph = cake_get_iphdr(skb, &_iph);
1134 	if (!tcph)
1135 		return NULL;
1136 
1137 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1138 
1139 	/* the 'triggering' packet need only have the ACK flag set.
1140 	 * also check that SYN is not set, as there won't be any previous ACKs.
1141 	 */
1142 	if ((tcp_flag_word(tcph) &
1143 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1144 		return NULL;
1145 
1146 	/* the 'triggering' ACK is at the tail of the queue; we have already
1147 	 * returned if it is the only packet in the flow. Loop through the rest
1148 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1149 	 * triggering one.
1150 	 */
1151 	for (skb_check = flow->head;
1152 	     skb_check && skb_check != skb;
1153 	     skb_prev = skb_check, skb_check = skb_check->next) {
1154 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1155 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1156 					     sizeof(_tcph_check));
1157 
1158 		/* only TCP packets with a matching 5-tuple are eligible, and
1159 		 * only ones whose headers are safe to drop
1160 		 */
1161 		if (!tcph_check || iph->version != iph_check->version ||
1162 		    tcph_check->source != tcph->source ||
1163 		    tcph_check->dest != tcph->dest)
1164 			continue;
1165 
1166 		if (iph_check->version == 4) {
1167 			if (iph_check->saddr != iph->saddr ||
1168 			    iph_check->daddr != iph->daddr)
1169 				continue;
1170 
1171 			seglen = ntohs(iph_check->tot_len) -
1172 				       (4 * iph_check->ihl);
1173 		} else if (iph_check->version == 6) {
1174 			ipv6h = (struct ipv6hdr *)iph;
1175 			ipv6h_check = (struct ipv6hdr *)iph_check;
1176 
1177 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1178 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1179 				continue;
1180 
1181 			seglen = ntohs(ipv6h_check->payload_len);
1182 		} else {
1183 			WARN_ON(1);  /* shouldn't happen */
1184 			continue;
1185 		}
1186 
1187 		/* If the ECE/CWR flags changed from the previous eligible
1188 		 * packet in the same flow, we should no longer be dropping that
1189 		 * previous packet as this would lose information.
1190 		 */
1191 		if (elig_ack && (tcp_flag_word(tcph_check) &
1192 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1193 			elig_ack = NULL;
1194 			elig_ack_prev = NULL;
1195 			num_found--;
1196 		}
1197 
1198 		/* Check TCP options and flags, don't drop ACKs with segment
1199 		 * data, and don't drop ACKs with a higher cumulative ACK
1200 		 * counter than the triggering packet. Check ACK seqno here to
1201 		 * avoid parsing SACK options of packets we are going to exclude
1202 		 * anyway.
1203 		 */
1204 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1205 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1206 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1207 			continue;
1208 
1209 		/* Check SACK options. The triggering packet must SACK more data
1210 		 * than the ACK under consideration, or SACK the same range but
1211 		 * have a larger cumulative ACK counter. The latter is a
1212 		 * pathological case, but is contained in the following check
1213 		 * anyway, just to be safe.
1214 		 */
1215 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1216 
1217 		if (sack_comp < 0 ||
1218 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1219 		     sack_comp == 0))
1220 			continue;
1221 
1222 		/* At this point we have found an eligible pure ACK to drop; if
1223 		 * we are in aggressive mode, we are done. Otherwise, keep
1224 		 * searching unless this is the second eligible ACK we
1225 		 * found.
1226 		 *
1227 		 * Since we want to drop ACK closest to the head of the queue,
1228 		 * save the first eligible ACK we find, even if we need to loop
1229 		 * again.
1230 		 */
1231 		if (!elig_ack) {
1232 			elig_ack = skb_check;
1233 			elig_ack_prev = skb_prev;
1234 			elig_flags = (tcp_flag_word(tcph_check)
1235 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1236 		}
1237 
1238 		if (num_found++ > 0)
1239 			goto found;
1240 	}
1241 
1242 	/* We made it through the queue without finding two eligible ACKs. If
1243 	 * we found a single eligible ACK we can drop it in aggressive mode if
1244 	 * we can guarantee that this does not interfere with ECN flag
1245 	 * information. We ensure this by dropping it only if the enqueued
1246 	 * packet is consecutive with the eligible ACK, and their flags match.
1247 	 */
1248 	if (elig_ack && aggressive && elig_ack->next == skb &&
1249 	    (elig_flags == (tcp_flag_word(tcph) &
1250 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1251 		goto found;
1252 
1253 	return NULL;
1254 
1255 found:
1256 	if (elig_ack_prev)
1257 		elig_ack_prev->next = elig_ack->next;
1258 	else
1259 		flow->head = elig_ack->next;
1260 
1261 	skb_mark_not_on_list(elig_ack);
1262 
1263 	return elig_ack;
1264 }
1265 
1266 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1267 {
1268 	avg -= avg >> shift;
1269 	avg += sample >> shift;
1270 	return avg;
1271 }
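
/* This is an exponentially weighted moving average with weight 2^-shift:
 * avg' = avg - avg/2^shift + sample/2^shift.  With shift == 8 a new sample
 * contributes roughly 1/256 ~= 0.4% to the average, with shift == 2 about
 * 25%, which is how the callers below track increases quickly but decay
 * slowly.
 */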
1272 
1273 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1274 {
1275 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1276 		len -= off;
1277 
1278 	if (q->max_netlen < len)
1279 		q->max_netlen = len;
1280 	if (q->min_netlen > len)
1281 		q->min_netlen = len;
1282 
1283 	len += q->rate_overhead;
1284 
1285 	if (len < q->rate_mpu)
1286 		len = q->rate_mpu;
1287 
1288 	if (q->atm_mode == CAKE_ATM_ATM) {
1289 		len += 47;
1290 		len /= 48;
1291 		len *= 53;
1292 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1293 		/* Add one byte per 64 bytes or part thereof.
1294 		 * This is conservative and easier to calculate than the
1295 		 * precise value.
1296 		 */
1297 		len += (len + 63) / 64;
1298 	}
1299 
1300 	if (q->max_adjlen < len)
1301 		q->max_adjlen = len;
1302 	if (q->min_adjlen > len)
1303 		q->min_adjlen = len;
1304 
1305 	return len;
1306 }
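
/* Worked example for the framing compensation above: a 64-byte frame (after
 * overhead and MPU adjustment) occupies (64 + 47) / 48 = 2 ATM cells and is
 * charged as 2 * 53 = 106 bytes; in PTM mode the same frame is charged
 * 64 + 1 = 65 bytes to account for the 64b/65b encoding.
 */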
1307 
1308 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1309 {
1310 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1311 	unsigned int hdr_len, last_len = 0;
1312 	u32 off = skb_network_offset(skb);
1313 	u32 len = qdisc_pkt_len(skb);
1314 	u16 segs = 1;
1315 
1316 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1317 
1318 	if (!shinfo->gso_size)
1319 		return cake_calc_overhead(q, len, off);
1320 
1321 	/* borrowed from qdisc_pkt_len_init() */
1322 	hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1323 
1324 	/* + transport layer */
1325 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1326 						SKB_GSO_TCPV6))) {
1327 		const struct tcphdr *th;
1328 		struct tcphdr _tcphdr;
1329 
1330 		th = skb_header_pointer(skb, skb_transport_offset(skb),
1331 					sizeof(_tcphdr), &_tcphdr);
1332 		if (likely(th))
1333 			hdr_len += __tcp_hdrlen(th);
1334 	} else {
1335 		struct udphdr _udphdr;
1336 
1337 		if (skb_header_pointer(skb, skb_transport_offset(skb),
1338 				       sizeof(_udphdr), &_udphdr))
1339 			hdr_len += sizeof(struct udphdr);
1340 	}
1341 
1342 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1343 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1344 				    shinfo->gso_size);
1345 	else
1346 		segs = shinfo->gso_segs;
1347 
1348 	len = shinfo->gso_size + hdr_len;
1349 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1350 
1351 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1352 		cake_calc_overhead(q, last_len, off));
1353 }
1354 
1355 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1356 {
1357 	struct cake_heap_entry ii = q->overflow_heap[i];
1358 	struct cake_heap_entry jj = q->overflow_heap[j];
1359 
1360 	q->overflow_heap[i] = jj;
1361 	q->overflow_heap[j] = ii;
1362 
1363 	q->tins[ii.t].overflow_idx[ii.b] = j;
1364 	q->tins[jj.t].overflow_idx[jj.b] = i;
1365 }
1366 
1367 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1368 {
1369 	struct cake_heap_entry ii = q->overflow_heap[i];
1370 
1371 	return q->tins[ii.t].backlogs[ii.b];
1372 }
1373 
1374 static void cake_heapify(struct cake_sched_data *q, u16 i)
1375 {
1376 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1377 	u32 mb = cake_heap_get_backlog(q, i);
1378 	u32 m = i;
1379 
1380 	while (m < a) {
1381 		u32 l = m + m + 1;
1382 		u32 r = l + 1;
1383 
1384 		if (l < a) {
1385 			u32 lb = cake_heap_get_backlog(q, l);
1386 
1387 			if (lb > mb) {
1388 				m  = l;
1389 				mb = lb;
1390 			}
1391 		}
1392 
1393 		if (r < a) {
1394 			u32 rb = cake_heap_get_backlog(q, r);
1395 
1396 			if (rb > mb) {
1397 				m  = r;
1398 				mb = rb;
1399 			}
1400 		}
1401 
1402 		if (m != i) {
1403 			cake_heap_swap(q, i, m);
1404 			i = m;
1405 		} else {
1406 			break;
1407 		}
1408 	}
1409 }
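
/* The overflow heap is a conventional array-encoded binary max-heap keyed
 * on per-queue backlog (children of node m live at 2m + 1 and 2m + 2), so
 * entry 0 always identifies the fattest queue; cake_drop() below uses it to
 * pick the queue to shorten when the buffer limit is exceeded.
 */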
1410 
1411 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1412 {
1413 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1414 		u16 p = (i - 1) >> 1;
1415 		u32 ib = cake_heap_get_backlog(q, i);
1416 		u32 pb = cake_heap_get_backlog(q, p);
1417 
1418 		if (ib > pb) {
1419 			cake_heap_swap(q, i, p);
1420 			i = p;
1421 		} else {
1422 			break;
1423 		}
1424 	}
1425 }
1426 
1427 static int cake_advance_shaper(struct cake_sched_data *q,
1428 			       struct cake_tin_data *b,
1429 			       struct sk_buff *skb,
1430 			       ktime_t now, bool drop)
1431 {
1432 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1433 
1434 	/* charge packet bandwidth to this tin
1435 	 * and to the global shaper.
1436 	 */
1437 	if (q->rate_ns) {
1438 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1439 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1440 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1441 
1442 		if (ktime_before(b->time_next_packet, now))
1443 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1444 							   tin_dur);
1445 
1446 		else if (ktime_before(b->time_next_packet,
1447 				      ktime_add_ns(now, tin_dur)))
1448 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1449 
1450 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1451 						   global_dur);
1452 		if (!drop)
1453 			q->failsafe_next_packet = \
1454 				ktime_add_ns(q->failsafe_next_packet,
1455 					     failsafe_dur);
1456 	}
1457 	return len;
1458 }
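
/* The byte-to-time conversion is the formula noted next to the rate fields:
 * duration = (len * rate_ns) >> rate_shft.  As an illustration, a 1514-byte
 * frame at a 100 Mbit/s shaper rate advances the global clock by about
 * 1514 * 8 / 1e8 s ~= 121 us (and the tin clock by the same, assuming the
 * tin runs at the full rate), with the failsafe clock advancing 1.5x that.
 */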
1459 
1460 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1461 {
1462 	struct cake_sched_data *q = qdisc_priv(sch);
1463 	ktime_t now = ktime_get();
1464 	u32 idx = 0, tin = 0, len;
1465 	struct cake_heap_entry qq;
1466 	struct cake_tin_data *b;
1467 	struct cake_flow *flow;
1468 	struct sk_buff *skb;
1469 
1470 	if (!q->overflow_timeout) {
1471 		int i;
1472 		/* Build fresh max-heap */
1473 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1474 			cake_heapify(q, i);
1475 	}
1476 	q->overflow_timeout = 65535;
1477 
1478 	/* select longest queue for pruning */
1479 	qq  = q->overflow_heap[0];
1480 	tin = qq.t;
1481 	idx = qq.b;
1482 
1483 	b = &q->tins[tin];
1484 	flow = &b->flows[idx];
1485 	skb = dequeue_head(flow);
1486 	if (unlikely(!skb)) {
1487 		/* heap has gone wrong, rebuild it next time */
1488 		q->overflow_timeout = 0;
1489 		return idx + (tin << 16);
1490 	}
1491 
1492 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1493 		b->unresponsive_flow_count++;
1494 
1495 	len = qdisc_pkt_len(skb);
1496 	q->buffer_used      -= skb->truesize;
1497 	b->backlogs[idx]    -= len;
1498 	b->tin_backlog      -= len;
1499 	sch->qstats.backlog -= len;
1500 	qdisc_tree_reduce_backlog(sch, 1, len);
1501 
1502 	flow->dropped++;
1503 	b->tin_dropped++;
1504 	sch->qstats.drops++;
1505 
1506 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1507 		cake_advance_shaper(q, b, skb, now, true);
1508 
1509 	__qdisc_drop(skb, to_free);
1510 	sch->q.qlen--;
1511 
1512 	cake_heapify(q, 0);
1513 
1514 	return idx + (tin << 16);
1515 }
1516 
1517 static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
1518 {
1519 	int wlen = skb_network_offset(skb);
1520 	u8 dscp;
1521 
1522 	switch (tc_skb_protocol(skb)) {
1523 	case htons(ETH_P_IP):
1524 		wlen += sizeof(struct iphdr);
1525 		if (!pskb_may_pull(skb, wlen) ||
1526 		    skb_try_make_writable(skb, wlen))
1527 			return 0;
1528 
1529 		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
1530 		if (wash && dscp)
1531 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1532 		return dscp;
1533 
1534 	case htons(ETH_P_IPV6):
1535 		wlen += sizeof(struct ipv6hdr);
1536 		if (!pskb_may_pull(skb, wlen) ||
1537 		    skb_try_make_writable(skb, wlen))
1538 			return 0;
1539 
1540 		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
1541 		if (wash && dscp)
1542 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1543 		return dscp;
1544 
1545 	case htons(ETH_P_ARP):
1546 		return 0x38;  /* CS7 - Net Control */
1547 
1548 	default:
1549 		/* If there is no Diffserv field, treat as best-effort */
1550 		return 0;
1551 	}
1552 }
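
/* Example: an IPv4 TOS byte of 0xb8 carries DSCP 0xb8 >> 2 = 46 (EF).  With
 * "wash" enabled, the change_dsfield() call keeps only the bits covered by
 * INET_ECN_MASK, i.e. the DSCP is zeroed while the ECN field is preserved.
 */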
1553 
1554 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1555 					     struct sk_buff *skb)
1556 {
1557 	struct cake_sched_data *q = qdisc_priv(sch);
1558 	u32 tin, mark;
1559 	u8 dscp;
1560 
1561 	/* Tin selection: Default to diffserv-based selection, allow overriding
1562 	 * using firewall marks or skb->priority.
1563 	 */
1564 	dscp = cake_handle_diffserv(skb,
1565 				    q->rate_flags & CAKE_FLAG_WASH);
1566 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1567 
1568 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1569 		tin = 0;
1570 
1571 	else if (mark && mark <= q->tin_cnt)
1572 		tin = q->tin_order[mark - 1];
1573 
1574 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1575 		 TC_H_MIN(skb->priority) > 0 &&
1576 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1577 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1578 
1579 	else {
1580 		tin = q->tin_index[dscp];
1581 
1582 		if (unlikely(tin >= q->tin_cnt))
1583 			tin = 0;
1584 	}
1585 
1586 	return &q->tins[tin];
1587 }
1588 
1589 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1590 			 struct sk_buff *skb, int flow_mode, int *qerr)
1591 {
1592 	struct cake_sched_data *q = qdisc_priv(sch);
1593 	struct tcf_proto *filter;
1594 	struct tcf_result res;
1595 	u16 flow = 0, host = 0;
1596 	int result;
1597 
1598 	filter = rcu_dereference_bh(q->filter_list);
1599 	if (!filter)
1600 		goto hash;
1601 
1602 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1603 	result = tcf_classify(skb, filter, &res, false);
1604 
1605 	if (result >= 0) {
1606 #ifdef CONFIG_NET_CLS_ACT
1607 		switch (result) {
1608 		case TC_ACT_STOLEN:
1609 		case TC_ACT_QUEUED:
1610 		case TC_ACT_TRAP:
1611 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1612 			/* fall through */
1613 		case TC_ACT_SHOT:
1614 			return 0;
1615 		}
1616 #endif
1617 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1618 			flow = TC_H_MIN(res.classid);
1619 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1620 			host = TC_H_MAJ(res.classid) >> 16;
1621 	}
1622 hash:
1623 	*t = cake_select_tin(sch, skb);
1624 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1625 }
1626 
1627 static void cake_reconfigure(struct Qdisc *sch);
1628 
1629 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1630 			struct sk_buff **to_free)
1631 {
1632 	struct cake_sched_data *q = qdisc_priv(sch);
1633 	int len = qdisc_pkt_len(skb);
1634 	int uninitialized_var(ret);
1635 	struct sk_buff *ack = NULL;
1636 	ktime_t now = ktime_get();
1637 	struct cake_tin_data *b;
1638 	struct cake_flow *flow;
1639 	u32 idx;
1640 
1641 	/* choose flow to insert into */
1642 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1643 	if (idx == 0) {
1644 		if (ret & __NET_XMIT_BYPASS)
1645 			qdisc_qstats_drop(sch);
1646 		__qdisc_drop(skb, to_free);
1647 		return ret;
1648 	}
1649 	idx--;
1650 	flow = &b->flows[idx];
1651 
1652 	/* ensure shaper state isn't stale */
1653 	if (!b->tin_backlog) {
1654 		if (ktime_before(b->time_next_packet, now))
1655 			b->time_next_packet = now;
1656 
1657 		if (!sch->q.qlen) {
1658 			if (ktime_before(q->time_next_packet, now)) {
1659 				q->failsafe_next_packet = now;
1660 				q->time_next_packet = now;
1661 			} else if (ktime_after(q->time_next_packet, now) &&
1662 				   ktime_after(q->failsafe_next_packet, now)) {
1663 				u64 next = \
1664 					min(ktime_to_ns(q->time_next_packet),
1665 					    ktime_to_ns(
1666 						   q->failsafe_next_packet));
1667 				sch->qstats.overlimits++;
1668 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1669 			}
1670 		}
1671 	}
1672 
1673 	if (unlikely(len > b->max_skblen))
1674 		b->max_skblen = len;
1675 
1676 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1677 		struct sk_buff *segs, *nskb;
1678 		netdev_features_t features = netif_skb_features(skb);
1679 		unsigned int slen = 0, numsegs = 0;
1680 
1681 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1682 		if (IS_ERR_OR_NULL(segs))
1683 			return qdisc_drop(skb, sch, to_free);
1684 
1685 		while (segs) {
1686 			nskb = segs->next;
1687 			skb_mark_not_on_list(segs);
1688 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1689 			cobalt_set_enqueue_time(segs, now);
1690 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1691 									  segs);
1692 			flow_queue_add(flow, segs);
1693 
1694 			sch->q.qlen++;
1695 			numsegs++;
1696 			slen += segs->len;
1697 			q->buffer_used += segs->truesize;
1698 			b->packets++;
1699 			segs = nskb;
1700 		}
1701 
1702 		/* stats */
1703 		b->bytes	    += slen;
1704 		b->backlogs[idx]    += slen;
1705 		b->tin_backlog      += slen;
1706 		sch->qstats.backlog += slen;
1707 		q->avg_window_bytes += slen;
1708 
1709 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1710 		consume_skb(skb);
1711 	} else {
1712 		/* not splitting */
1713 		cobalt_set_enqueue_time(skb, now);
1714 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1715 		flow_queue_add(flow, skb);
1716 
1717 		if (q->ack_filter)
1718 			ack = cake_ack_filter(q, flow);
1719 
1720 		if (ack) {
1721 			b->ack_drops++;
1722 			sch->qstats.drops++;
1723 			b->bytes += qdisc_pkt_len(ack);
1724 			len -= qdisc_pkt_len(ack);
1725 			q->buffer_used += skb->truesize - ack->truesize;
1726 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1727 				cake_advance_shaper(q, b, ack, now, true);
1728 
1729 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1730 			consume_skb(ack);
1731 		} else {
1732 			sch->q.qlen++;
1733 			q->buffer_used      += skb->truesize;
1734 		}
1735 
1736 		/* stats */
1737 		b->packets++;
1738 		b->bytes	    += len;
1739 		b->backlogs[idx]    += len;
1740 		b->tin_backlog      += len;
1741 		sch->qstats.backlog += len;
1742 		q->avg_window_bytes += len;
1743 	}
1744 
1745 	if (q->overflow_timeout)
1746 		cake_heapify_up(q, b->overflow_idx[idx]);
1747 
1748 	/* incoming bandwidth capacity estimate */
1749 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1750 		u64 packet_interval = \
1751 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1752 
1753 		if (packet_interval > NSEC_PER_SEC)
1754 			packet_interval = NSEC_PER_SEC;
1755 
1756 		/* filter out short-term bursts, eg. wifi aggregation */
1757 		q->avg_packet_interval = \
1758 			cake_ewma(q->avg_packet_interval,
1759 				  packet_interval,
1760 				  (packet_interval > q->avg_packet_interval ?
1761 					  2 : 8));
1762 
1763 		q->last_packet_time = now;
1764 
1765 		if (packet_interval > q->avg_packet_interval) {
1766 			u64 window_interval = \
1767 				ktime_to_ns(ktime_sub(now,
1768 						      q->avg_window_begin));
1769 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1770 
1771 			do_div(b, window_interval);
1772 			q->avg_peak_bandwidth =
1773 				cake_ewma(q->avg_peak_bandwidth, b,
1774 					  b > q->avg_peak_bandwidth ? 2 : 8);
1775 			q->avg_window_bytes = 0;
1776 			q->avg_window_begin = now;
1777 
1778 			if (ktime_after(now,
1779 					ktime_add_ms(q->last_reconfig_time,
1780 						     250))) {
1781 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1782 				cake_reconfigure(sch);
1783 			}
1784 		}
1785 	} else {
1786 		q->avg_window_bytes = 0;
1787 		q->last_packet_time = now;
1788 	}
1789 
1790 	/* flowchain */
1791 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1792 		struct cake_host *srchost = &b->hosts[flow->srchost];
1793 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1794 		u16 host_load = 1;
1795 
1796 		if (!flow->set) {
1797 			list_add_tail(&flow->flowchain, &b->new_flows);
1798 		} else {
1799 			b->decaying_flow_count--;
1800 			list_move_tail(&flow->flowchain, &b->new_flows);
1801 		}
1802 		flow->set = CAKE_SET_SPARSE;
1803 		b->sparse_flow_count++;
1804 
1805 		if (cake_dsrc(q->flow_mode))
1806 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
1807 
1808 		if (cake_ddst(q->flow_mode))
1809 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1810 
1811 		flow->deficit = (b->flow_quantum *
1812 				 quantum_div[host_load]) >> 16;
1813 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1814 		struct cake_host *srchost = &b->hosts[flow->srchost];
1815 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1816 
1817 		/* this flow was empty, accounted as a sparse flow, but actually
1818 		 * in the bulk rotation.
1819 		 */
1820 		flow->set = CAKE_SET_BULK;
1821 		b->sparse_flow_count--;
1822 		b->bulk_flow_count++;
1823 
1824 		if (cake_dsrc(q->flow_mode))
1825 			srchost->srchost_bulk_flow_count++;
1826 
1827 		if (cake_ddst(q->flow_mode))
1828 			dsthost->dsthost_bulk_flow_count++;
1829 
1830 	}
1831 
1832 	if (q->buffer_used > q->buffer_max_used)
1833 		q->buffer_max_used = q->buffer_used;
1834 
1835 	if (q->buffer_used > q->buffer_limit) {
1836 		u32 dropped = 0;
1837 
1838 		while (q->buffer_used > q->buffer_limit) {
1839 			dropped++;
1840 			cake_drop(sch, to_free);
1841 		}
1842 		b->drop_overlimit += dropped;
1843 	}
1844 	return NET_XMIT_SUCCESS;
1845 }
1846 
1847 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1848 {
1849 	struct cake_sched_data *q = qdisc_priv(sch);
1850 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1851 	struct cake_flow *flow = &b->flows[q->cur_flow];
1852 	struct sk_buff *skb = NULL;
1853 	u32 len;
1854 
1855 	if (flow->head) {
1856 		skb = dequeue_head(flow);
1857 		len = qdisc_pkt_len(skb);
1858 		b->backlogs[q->cur_flow] -= len;
1859 		b->tin_backlog		 -= len;
1860 		sch->qstats.backlog      -= len;
1861 		q->buffer_used		 -= skb->truesize;
1862 		sch->q.qlen--;
1863 
1864 		if (q->overflow_timeout)
1865 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1866 	}
1867 	return skb;
1868 }
1869 
1870 /* Discard leftover packets from a tin no longer in use. */
1871 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1872 {
1873 	struct cake_sched_data *q = qdisc_priv(sch);
1874 	struct sk_buff *skb;
1875 
1876 	q->cur_tin = tin;
1877 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1878 		while (!!(skb = cake_dequeue_one(sch)))
1879 			kfree_skb(skb);
1880 }
1881 
1882 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1883 {
1884 	struct cake_sched_data *q = qdisc_priv(sch);
1885 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1886 	struct cake_host *srchost, *dsthost;
1887 	ktime_t now = ktime_get();
1888 	struct cake_flow *flow;
1889 	struct list_head *head;
1890 	bool first_flow = true;
1891 	struct sk_buff *skb;
1892 	u16 host_load;
1893 	u64 delay;
1894 	u32 len;
1895 
1896 begin:
1897 	if (!sch->q.qlen)
1898 		return NULL;
1899 
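	/* Dequeue is gated on two clocks: time_next_packet is the shaper's
	 * virtual departure time, while failsafe_next_packet bounds how long
	 * the qdisc may stay throttled even if the main clock has been
	 * pushed far into the future (e.g. by drops charged to the shaper in
	 * ingress mode).  A packet may be released once either clock has
	 * expired.
	 */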
1900 	/* global hard shaper */
1901 	if (ktime_after(q->time_next_packet, now) &&
1902 	    ktime_after(q->failsafe_next_packet, now)) {
1903 		u64 next = min(ktime_to_ns(q->time_next_packet),
1904 			       ktime_to_ns(q->failsafe_next_packet));
1905 
1906 		sch->qstats.overlimits++;
1907 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1908 		return NULL;
1909 	}
1910 
1911 	/* Choose a class to work on. */
1912 	if (!q->rate_ns) {
1913 		/* In unlimited mode, can't rely on shaper timings, just balance
1914 		 * with DRR
1915 		 */
1916 		bool wrapped = false, empty = true;
1917 
1918 		while (b->tin_deficit < 0 ||
1919 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1920 			if (b->tin_deficit <= 0)
1921 				b->tin_deficit += b->tin_quantum;
1922 			if (b->sparse_flow_count + b->bulk_flow_count)
1923 				empty = false;
1924 
1925 			q->cur_tin++;
1926 			b++;
1927 			if (q->cur_tin >= q->tin_cnt) {
1928 				q->cur_tin = 0;
1929 				b = q->tins;
1930 
1931 				if (wrapped) {
1932 					/* It's possible for sch->q.qlen to be
1933 					 * nonzero when we actually have no
1934 					 * packets anywhere.
1935 					 */
1936 					if (empty)
1937 						return NULL;
1938 				} else {
1939 					wrapped = true;
1940 				}
1941 			}
1942 		}
1943 	} else {
1944 		/* In shaped mode, choose:
1945 		 * - Highest-priority tin with queue and meeting schedule, or
1946 		 * - The earliest-scheduled tin with queue.
1947 		 */
1948 		ktime_t best_time = KTIME_MAX;
1949 		int tin, best_tin = 0;
1950 
1951 		for (tin = 0; tin < q->tin_cnt; tin++) {
1952 			b = q->tins + tin;
1953 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1954 				ktime_t time_to_pkt = \
1955 					ktime_sub(b->time_next_packet, now);
1956 
1957 				if (ktime_to_ns(time_to_pkt) <= 0 ||
1958 				    ktime_compare(time_to_pkt,
1959 						  best_time) <= 0) {
1960 					best_time = time_to_pkt;
1961 					best_tin = tin;
1962 				}
1963 			}
1964 		}
1965 
1966 		q->cur_tin = best_tin;
1967 		b = q->tins + best_tin;
1968 
1969 		/* No point in going further if no packets to deliver. */
1970 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1971 			return NULL;
1972 	}
1973 
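	/* Within the chosen tin, service order is: the decaying list first
	 * (only on the first pass, so lingering AQM state keeps draining),
	 * then new (sparse) flows, then old (bulk) flows, in the usual
	 * DRR++ fashion.
	 */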
1974 retry:
1975 	/* service this class */
1976 	head = &b->decaying_flows;
1977 	if (!first_flow || list_empty(head)) {
1978 		head = &b->new_flows;
1979 		if (list_empty(head)) {
1980 			head = &b->old_flows;
1981 			if (unlikely(list_empty(head))) {
1982 				head = &b->decaying_flows;
1983 				if (unlikely(list_empty(head)))
1984 					goto begin;
1985 			}
1986 		}
1987 	}
1988 	flow = list_first_entry(head, struct cake_flow, flowchain);
1989 	q->cur_flow = flow - b->flows;
1990 	first_flow = false;
1991 
1992 	/* triple isolation (modified DRR++) */
1993 	srchost = &b->hosts[flow->srchost];
1994 	dsthost = &b->hosts[flow->dsthost];
1995 	host_load = 1;
1996 
1997 	/* flow isolation (DRR++) */
1998 	if (flow->deficit <= 0) {
1999 		/* Keep all flows with deficits out of the sparse and decaying
2000 		 * rotations.  No non-empty flow can go into the decaying
2001 		 * rotation, so they can't get deficits
2002 		 */
2003 		if (flow->set == CAKE_SET_SPARSE) {
2004 			if (flow->head) {
2005 				b->sparse_flow_count--;
2006 				b->bulk_flow_count++;
2007 
2008 				if (cake_dsrc(q->flow_mode))
2009 					srchost->srchost_bulk_flow_count++;
2010 
2011 				if (cake_ddst(q->flow_mode))
2012 					dsthost->dsthost_bulk_flow_count++;
2013 
2014 				flow->set = CAKE_SET_BULK;
2015 			} else {
2016 				/* we've moved it to the bulk rotation for
2017 				 * correct deficit accounting but we still want
2018 				 * to count it as a sparse flow, not a bulk one.
2019 				 */
2020 				flow->set = CAKE_SET_SPARSE_WAIT;
2021 			}
2022 		}
2023 
2024 		if (cake_dsrc(q->flow_mode))
2025 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
2026 
2027 		if (cake_ddst(q->flow_mode))
2028 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2029 
2030 		WARN_ON(host_load > CAKE_QUEUES);
2031 
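		/* quantum_div[n] is 65535/n, so the increment below adds
		 * roughly flow_quantum / host_load to the deficit: e.g.
		 * flow_quantum 1514 with host_load 4 credits about 378
		 * bytes per round.
		 */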
2032 		/* The shifted prandom_u32() is a way to apply dithering to
2033 		 * avoid accumulating roundoff errors
2034 		 */
2035 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2036 				  (prandom_u32() >> 16)) >> 16;
2037 		list_move_tail(&flow->flowchain, &b->old_flows);
2038 
2039 		goto retry;
2040 	}
2041 
2042 	/* Retrieve a packet via the AQM */
2043 	while (1) {
2044 		skb = cake_dequeue_one(sch);
2045 		if (!skb) {
2046 			/* this queue was actually empty */
2047 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2048 				b->unresponsive_flow_count--;
2049 
2050 			if (flow->cvars.p_drop || flow->cvars.count ||
2051 			    ktime_before(now, flow->cvars.drop_next)) {
2052 				/* keep in the flowchain until the state has
2053 				 * decayed to rest
2054 				 */
2055 				list_move_tail(&flow->flowchain,
2056 					       &b->decaying_flows);
2057 				if (flow->set == CAKE_SET_BULK) {
2058 					b->bulk_flow_count--;
2059 
2060 					if (cake_dsrc(q->flow_mode))
2061 						srchost->srchost_bulk_flow_count--;
2062 
2063 					if (cake_ddst(q->flow_mode))
2064 						dsthost->dsthost_bulk_flow_count--;
2065 
2066 					b->decaying_flow_count++;
2067 				} else if (flow->set == CAKE_SET_SPARSE ||
2068 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2069 					b->sparse_flow_count--;
2070 					b->decaying_flow_count++;
2071 				}
2072 				flow->set = CAKE_SET_DECAYING;
2073 			} else {
2074 				/* remove empty queue from the flowchain */
2075 				list_del_init(&flow->flowchain);
2076 				if (flow->set == CAKE_SET_SPARSE ||
2077 				    flow->set == CAKE_SET_SPARSE_WAIT)
2078 					b->sparse_flow_count--;
2079 				else if (flow->set == CAKE_SET_BULK) {
2080 					b->bulk_flow_count--;
2081 
2082 					if (cake_dsrc(q->flow_mode))
2083 						srchost->srchost_bulk_flow_count--;
2084 
2085 					if (cake_ddst(q->flow_mode))
2086 						dsthost->dsthost_bulk_flow_count--;
2087 
2088 				} else
2089 					b->decaying_flow_count--;
2090 
2091 				flow->set = CAKE_SET_NONE;
2092 			}
2093 			goto begin;
2094 		}
2095 
2096 		/* The last packet in the queue may be ECN-marked; don't drop it */
2097 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2098 					(b->bulk_flow_count *
2099 					 !!(q->rate_flags &
2100 					    CAKE_FLAG_INGRESS))) ||
2101 		    !flow->head)
2102 			break;
2103 
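		/* In ingress mode the dropped packet has already crossed the
		 * bottleneck link, so its transmission time is still charged
		 * to the shaper and to the flow/tin deficits below.
		 */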
2104 		/* drop this packet, get another one */
2105 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2106 			len = cake_advance_shaper(q, b, skb,
2107 						  now, true);
2108 			flow->deficit -= len;
2109 			b->tin_deficit -= len;
2110 		}
2111 		flow->dropped++;
2112 		b->tin_dropped++;
2113 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2114 		qdisc_qstats_drop(sch);
2115 		kfree_skb(skb);
2116 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2117 			goto retry;
2118 	}
2119 
2120 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2121 	qdisc_bstats_update(sch, skb);
2122 
2123 	/* collect delay stats */
2124 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2125 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2126 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2127 				  delay > b->peak_delay ? 2 : 8);
2128 	b->base_delay = cake_ewma(b->base_delay, delay,
2129 				  delay < b->base_delay ? 2 : 8);
2130 
2131 	len = cake_advance_shaper(q, b, skb, now, false);
2132 	flow->deficit -= len;
2133 	b->tin_deficit -= len;
2134 
2135 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2136 		u64 next = min(ktime_to_ns(q->time_next_packet),
2137 			       ktime_to_ns(q->failsafe_next_packet));
2138 
2139 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2140 	} else if (!sch->q.qlen) {
2141 		int i;
2142 
2143 		for (i = 0; i < q->tin_cnt; i++) {
2144 			if (q->tins[i].decaying_flow_count) {
2145 				ktime_t next = \
2146 					ktime_add_ns(now,
2147 						     q->tins[i].cparams.target);
2148 
2149 				qdisc_watchdog_schedule_ns(&q->watchdog,
2150 							   ktime_to_ns(next));
2151 				break;
2152 			}
2153 		}
2154 	}
2155 
2156 	if (q->overflow_timeout)
2157 		q->overflow_timeout--;
2158 
2159 	return skb;
2160 }
2161 
2162 static void cake_reset(struct Qdisc *sch)
2163 {
2164 	u32 c;
2165 
2166 	for (c = 0; c < CAKE_MAX_TINS; c++)
2167 		cake_clear_tin(sch, c);
2168 }
2169 
2170 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2171 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2172 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2173 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2174 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2175 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2176 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2177 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2178 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2179 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2180 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2181 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2182 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2183 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2184 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2185 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2186 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2187 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2188 };
2189 
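/* cake_set_rate() converts a byte rate into a fixed-point "nanoseconds per
 * byte" value: rate_ns = (NSEC_PER_SEC << rate_shft) / rate, normalised so
 * that rate_ns fits in 34 bits.  Worked example at 12.5Mbyte/s (100Mbit/s):
 * 10^9 / 12.5e6 = 80ns per byte, so (len * rate_ns) >> rate_shft yields
 * 80ns for a one-byte packet and about 121us for a 1514-byte frame.  The
 * Codel target is then set to at least 1.5x the MTU serialisation time,
 * and the interval to at least twice the target.
 */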
2190 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2191 			  u64 target_ns, u64 rtt_est_ns)
2192 {
2193 	/* Convert byte-rate into time-per-byte, flooring the rate at
2194 	 * MIN_RATE so the shaper always unwedges in a reasonable time.
2195 	 */
2196 	static const u64 MIN_RATE = 64;
2197 	u32 byte_target = mtu;
2198 	u64 byte_target_ns;
2199 	u8  rate_shft = 0;
2200 	u64 rate_ns = 0;
2201 
2202 	b->flow_quantum = 1514;
2203 	if (rate) {
2204 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2205 		rate_shft = 34;
2206 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2207 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2208 		while (!!(rate_ns >> 34)) {
2209 			rate_ns >>= 1;
2210 			rate_shft--;
2211 		}
2212 	} /* else unlimited, ie. zero delay */
2213 
2214 	b->tin_rate_bps  = rate;
2215 	b->tin_rate_ns   = rate_ns;
2216 	b->tin_rate_shft = rate_shft;
2217 
2218 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2219 
2220 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2221 	b->cparams.interval = max(rtt_est_ns +
2222 				     b->cparams.target - target_ns,
2223 				     b->cparams.target * 2);
2224 	b->cparams.mtu_time = byte_target_ns;
2225 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2226 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2227 }
2228 
2229 static int cake_config_besteffort(struct Qdisc *sch)
2230 {
2231 	struct cake_sched_data *q = qdisc_priv(sch);
2232 	struct cake_tin_data *b = &q->tins[0];
2233 	u32 mtu = psched_mtu(qdisc_dev(sch));
2234 	u64 rate = q->rate_bps;
2235 
2236 	q->tin_cnt = 1;
2237 
2238 	q->tin_index = besteffort;
2239 	q->tin_order = normal_order;
2240 
2241 	cake_set_rate(b, rate, mtu,
2242 		      us_to_ns(q->target), us_to_ns(q->interval));
2243 	b->tin_quantum = 65535;
2244 
2245 	return 0;
2246 }
2247 
2248 static int cake_config_precedence(struct Qdisc *sch)
2249 {
2250 	/* convert high-level (user visible) parameters into internal format */
2251 	struct cake_sched_data *q = qdisc_priv(sch);
2252 	u32 mtu = psched_mtu(qdisc_dev(sch));
2253 	u64 rate = q->rate_bps;
2254 	u32 quantum = 256;
2255 	u32 i;
2256 
2257 	q->tin_cnt = 8;
2258 	q->tin_index = precedence;
2259 	q->tin_order = normal_order;
2260 
2261 	for (i = 0; i < q->tin_cnt; i++) {
2262 		struct cake_tin_data *b = &q->tins[i];
2263 
2264 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2265 			      us_to_ns(q->interval));
2266 
2267 		b->tin_quantum = max_t(u16, 1U, quantum);
2268 
2269 		/* calculate next class's parameters */
2270 		rate  *= 7;
2271 		rate >>= 3;
2272 
2273 		quantum  *= 7;
2274 		quantum >>= 3;
2275 	}
2276 
2277 	return 0;
2278 }
2279 
2280 /*	List of known Diffserv codepoints:
2281  *
2282  *	Least Effort (CS1)
2283  *	Best Effort (CS0)
2284  *	Max Reliability & LLT "Lo" (TOS1)
2285  *	Max Throughput (TOS2)
2286  *	Min Delay (TOS4)
2287  *	LLT "La" (TOS5)
2288  *	Assured Forwarding 1 (AF1x) - x3
2289  *	Assured Forwarding 2 (AF2x) - x3
2290  *	Assured Forwarding 3 (AF3x) - x3
2291  *	Assured Forwarding 4 (AF4x) - x3
2292  *	Precedence Class 2 (CS2)
2293  *	Precedence Class 3 (CS3)
2294  *	Precedence Class 4 (CS4)
2295  *	Precedence Class 5 (CS5)
2296  *	Precedence Class 6 (CS6)
2297  *	Precedence Class 7 (CS7)
2298  *	Voice Admit (VA)
2299  *	Expedited Forwarding (EF)
2300  *
2301  *	Total 25 codepoints.
2302  */
2303 
2304 /*	List of traffic classes in RFC 4594:
2305  *		(roughly descending order of contended priority)
2306  *		(roughly ascending order of uncontended throughput)
2307  *
2308  *	Network Control (CS6,CS7)      - routing traffic
2309  *	Telephony (EF,VA)         - aka. VoIP streams
2310  *	Signalling (CS5)               - VoIP setup
2311  *	Multimedia Conferencing (AF4x) - aka. video calls
2312  *	Realtime Interactive (CS4)     - eg. games
2313  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2314  *	Broadcast Video (CS3)
2315  *	Low Latency Data (AF2x,TOS4)      - eg. database
2316  *	Ops, Admin, Management (CS2,TOS1) - eg. ssh
2317  *	Standard Service (CS0 & unrecognised codepoints)
2318  *	High Throughput Data (AF1x,TOS2)  - eg. web traffic
2319  *	Low Priority Data (CS1)           - eg. BitTorrent
2320  *
2321  *	Total 12 traffic classes.
2322  */
2323 
2324 static int cake_config_diffserv8(struct Qdisc *sch)
2325 {
2326 /*	Pruned list of traffic classes for typical applications:
2327  *
2328  *		Network Control          (CS6, CS7)
2329  *		Minimum Latency          (EF, VA, CS5, CS4)
2330  *		Interactive Shell        (CS2, TOS1)
2331  *		Low Latency Transactions (AF2x, TOS4)
2332  *		Video Streaming          (AF4x, AF3x, CS3)
2333  *		Bog Standard             (CS0 etc.)
2334  *		High Throughput          (AF1x, TOS2)
2335  *		Background Traffic       (CS1)
2336  *
2337  *		Total 8 traffic classes.
2338  */
2339 
2340 	struct cake_sched_data *q = qdisc_priv(sch);
2341 	u32 mtu = psched_mtu(qdisc_dev(sch));
2342 	u64 rate = q->rate_bps;
2343 	u32 quantum = 256;
2344 	u32 i;
2345 
2346 	q->tin_cnt = 8;
2347 
2348 	/* codepoint to class mapping */
2349 	q->tin_index = diffserv8;
2350 	q->tin_order = normal_order;
2351 
2352 	/* class characteristics */
2353 	for (i = 0; i < q->tin_cnt; i++) {
2354 		struct cake_tin_data *b = &q->tins[i];
2355 
2356 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2357 			      us_to_ns(q->interval));
2358 
2359 		b->tin_quantum = max_t(u16, 1U, quantum);
2360 
2361 		/* calculate next class's parameters */
2362 		rate  *= 7;
2363 		rate >>= 3;
2364 
2365 		quantum  *= 7;
2366 		quantum >>= 3;
2367 	}
2368 
2369 	return 0;
2370 }
2371 
2372 static int cake_config_diffserv4(struct Qdisc *sch)
2373 {
2374 /*  Further pruned list of traffic classes for four-class system:
2375  *
2376  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2377  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2378  *	    Best Effort        (CS0, AF1x, TOS2, and those not specified)
2379  *	    Background Traffic (CS1)
2380  *
2381  *		Total 4 traffic classes.
2382  */
2383 
2384 	struct cake_sched_data *q = qdisc_priv(sch);
2385 	u32 mtu = psched_mtu(qdisc_dev(sch));
2386 	u64 rate = q->rate_bps;
2387 	u32 quantum = 1024;
2388 
2389 	q->tin_cnt = 4;
2390 
2391 	/* codepoint to class mapping */
2392 	q->tin_index = diffserv4;
2393 	q->tin_order = bulk_order;
2394 
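	/* The four tins appear to correspond to Best Effort (tin 0, full
	 * rate), Bulk (tin 1, 1/16 of the rate), Video (tin 2, 1/2) and
	 * Voice (tin 3, 1/4); bulk_order above only changes the order in
	 * which the tins are reported.
	 */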
2395 	/* class characteristics */
2396 	cake_set_rate(&q->tins[0], rate, mtu,
2397 		      us_to_ns(q->target), us_to_ns(q->interval));
2398 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2399 		      us_to_ns(q->target), us_to_ns(q->interval));
2400 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2401 		      us_to_ns(q->target), us_to_ns(q->interval));
2402 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2403 		      us_to_ns(q->target), us_to_ns(q->interval));
2404 
2405 	/* bandwidth-sharing weights */
2406 	q->tins[0].tin_quantum = quantum;
2407 	q->tins[1].tin_quantum = quantum >> 4;
2408 	q->tins[2].tin_quantum = quantum >> 1;
2409 	q->tins[3].tin_quantum = quantum >> 2;
2410 
2411 	return 0;
2412 }
2413 
2414 static int cake_config_diffserv3(struct Qdisc *sch)
2415 {
2416 /*  Simplified Diffserv structure with 3 tins.
2417  *		Low Priority		(CS1)
2418  *		Best Effort
2419  *		Latency Sensitive	(TOS4, VA, EF, CS6, CS7)
2420  */
2421 	struct cake_sched_data *q = qdisc_priv(sch);
2422 	u32 mtu = psched_mtu(qdisc_dev(sch));
2423 	u64 rate = q->rate_bps;
2424 	u32 quantum = 1024;
2425 
2426 	q->tin_cnt = 3;
2427 
2428 	/* codepoint to class mapping */
2429 	q->tin_index = diffserv3;
2430 	q->tin_order = bulk_order;
2431 
2432 	/* class characteristics */
2433 	cake_set_rate(&q->tins[0], rate, mtu,
2434 		      us_to_ns(q->target), us_to_ns(q->interval));
2435 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2436 		      us_to_ns(q->target), us_to_ns(q->interval));
2437 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2438 		      us_to_ns(q->target), us_to_ns(q->interval));
2439 
2440 	/* bandwidth-sharing weights */
2441 	q->tins[0].tin_quantum = quantum;
2442 	q->tins[1].tin_quantum = quantum >> 4;
2443 	q->tins[2].tin_quantum = quantum >> 2;
2444 
2445 	return 0;
2446 }
2447 
2448 static void cake_reconfigure(struct Qdisc *sch)
2449 {
2450 	struct cake_sched_data *q = qdisc_priv(sch);
2451 	int c, ft;
2452 
2453 	switch (q->tin_mode) {
2454 	case CAKE_DIFFSERV_BESTEFFORT:
2455 		ft = cake_config_besteffort(sch);
2456 		break;
2457 
2458 	case CAKE_DIFFSERV_PRECEDENCE:
2459 		ft = cake_config_precedence(sch);
2460 		break;
2461 
2462 	case CAKE_DIFFSERV_DIFFSERV8:
2463 		ft = cake_config_diffserv8(sch);
2464 		break;
2465 
2466 	case CAKE_DIFFSERV_DIFFSERV4:
2467 		ft = cake_config_diffserv4(sch);
2468 		break;
2469 
2470 	case CAKE_DIFFSERV_DIFFSERV3:
2471 	default:
2472 		ft = cake_config_diffserv3(sch);
2473 		break;
2474 	}
2475 
2476 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2477 		cake_clear_tin(sch, c);
2478 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2479 	}
2480 
2481 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2482 	q->rate_shft = q->tins[ft].tin_rate_shft;
2483 
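	/* Size the packet buffer: an explicit "memory" setting wins;
	 * otherwise allow four intervals' worth of bytes at the shaped rate
	 * with a 4MB floor, e.g. 100Mbit/s x 100ms ~= 1.25MB per interval,
	 * so a 5MB buffer.  Unlimited mode starts from "no limit"; in all
	 * cases the result is then capped at sch->limit packets of MTU size
	 * (or at the configured memory limit, if that is larger).
	 */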
2484 	if (q->buffer_config_limit) {
2485 		q->buffer_limit = q->buffer_config_limit;
2486 	} else if (q->rate_bps) {
2487 		u64 t = q->rate_bps * q->interval;
2488 
2489 		do_div(t, USEC_PER_SEC / 4);
2490 		q->buffer_limit = max_t(u32, t, 4U << 20);
2491 	} else {
2492 		q->buffer_limit = ~0;
2493 	}
2494 
2495 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2496 
2497 	q->buffer_limit = min(q->buffer_limit,
2498 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2499 				  q->buffer_config_limit));
2500 }
2501 
2502 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2503 		       struct netlink_ext_ack *extack)
2504 {
2505 	struct cake_sched_data *q = qdisc_priv(sch);
2506 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2507 	int err;
2508 
2509 	if (!opt)
2510 		return -EINVAL;
2511 
2512 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2513 					  extack);
2514 	if (err < 0)
2515 		return err;
2516 
2517 	if (tb[TCA_CAKE_NAT]) {
2518 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2519 		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2520 		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2521 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2522 #else
2523 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2524 				    "No conntrack support in kernel");
2525 		return -EOPNOTSUPP;
2526 #endif
2527 	}
2528 
2529 	if (tb[TCA_CAKE_BASE_RATE64])
2530 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2531 
2532 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2533 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2534 
2535 	if (tb[TCA_CAKE_WASH]) {
2536 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2537 			q->rate_flags |= CAKE_FLAG_WASH;
2538 		else
2539 			q->rate_flags &= ~CAKE_FLAG_WASH;
2540 	}
2541 
2542 	if (tb[TCA_CAKE_FLOW_MODE])
2543 		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2544 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2545 					CAKE_FLOW_MASK));
2546 
2547 	if (tb[TCA_CAKE_ATM])
2548 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2549 
2550 	if (tb[TCA_CAKE_OVERHEAD]) {
2551 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2552 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2553 
2554 		q->max_netlen = 0;
2555 		q->max_adjlen = 0;
2556 		q->min_netlen = ~0;
2557 		q->min_adjlen = ~0;
2558 	}
2559 
2560 	if (tb[TCA_CAKE_RAW]) {
2561 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2562 
2563 		q->max_netlen = 0;
2564 		q->max_adjlen = 0;
2565 		q->min_netlen = ~0;
2566 		q->min_adjlen = ~0;
2567 	}
2568 
2569 	if (tb[TCA_CAKE_MPU])
2570 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2571 
2572 	if (tb[TCA_CAKE_RTT]) {
2573 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2574 
2575 		if (!q->interval)
2576 			q->interval = 1;
2577 	}
2578 
2579 	if (tb[TCA_CAKE_TARGET]) {
2580 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2581 
2582 		if (!q->target)
2583 			q->target = 1;
2584 	}
2585 
2586 	if (tb[TCA_CAKE_AUTORATE]) {
2587 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2588 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2589 		else
2590 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2591 	}
2592 
2593 	if (tb[TCA_CAKE_INGRESS]) {
2594 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2595 			q->rate_flags |= CAKE_FLAG_INGRESS;
2596 		else
2597 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2598 	}
2599 
2600 	if (tb[TCA_CAKE_ACK_FILTER])
2601 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2602 
2603 	if (tb[TCA_CAKE_MEMORY])
2604 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2605 
2606 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2607 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2608 			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2609 		else
2610 			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2611 	}
2612 
2613 	if (tb[TCA_CAKE_FWMARK]) {
2614 		q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2615 		q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2616 	}
2617 
2618 	if (q->tins) {
2619 		sch_tree_lock(sch);
2620 		cake_reconfigure(sch);
2621 		sch_tree_unlock(sch);
2622 	}
2623 
2624 	return 0;
2625 }
2626 
2627 static void cake_destroy(struct Qdisc *sch)
2628 {
2629 	struct cake_sched_data *q = qdisc_priv(sch);
2630 
2631 	qdisc_watchdog_cancel(&q->watchdog);
2632 	tcf_block_put(q->block);
2633 	kvfree(q->tins);
2634 }
2635 
2636 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2637 		     struct netlink_ext_ack *extack)
2638 {
2639 	struct cake_sched_data *q = qdisc_priv(sch);
2640 	int i, j, err;
2641 
2642 	sch->limit = 10240;
2643 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2644 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2645 
2646 	q->rate_bps = 0; /* unlimited by default */
2647 
2648 	q->interval = 100000; /* 100ms default */
2649 	q->target   =   5000; /* 5ms: codel RFC argues
2650 			       * for 5 to 10% of interval
2651 			       */
2652 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2653 	q->cur_tin = 0;
2654 	q->cur_flow  = 0;
2655 
2656 	qdisc_watchdog_init(&q->watchdog, sch);
2657 
2658 	if (opt) {
2659 		int err = cake_change(sch, opt, extack);
2660 
2661 		if (err)
2662 			return err;
2663 	}
2664 
2665 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2666 	if (err)
2667 		return err;
2668 
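	/* quantum_div[] is a reciprocal table (65535 / n) so the per-host
	 * share of a flow quantum can be computed on the fast path with a
	 * multiply and a shift instead of a division.
	 */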
2669 	quantum_div[0] = ~0;
2670 	for (i = 1; i <= CAKE_QUEUES; i++)
2671 		quantum_div[i] = 65535 / i;
2672 
2673 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2674 			   GFP_KERNEL);
2675 	if (!q->tins)
2676 		goto nomem;
2677 
2678 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2679 		struct cake_tin_data *b = q->tins + i;
2680 
2681 		INIT_LIST_HEAD(&b->new_flows);
2682 		INIT_LIST_HEAD(&b->old_flows);
2683 		INIT_LIST_HEAD(&b->decaying_flows);
2684 		b->sparse_flow_count = 0;
2685 		b->bulk_flow_count = 0;
2686 		b->decaying_flow_count = 0;
2687 
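		/* Wire every (tin, queue) pair into the shared overflow
		 * heap so that, under memory pressure, the queue with the
		 * largest backlog across all tins can be found quickly.
		 */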
2688 		for (j = 0; j < CAKE_QUEUES; j++) {
2689 			struct cake_flow *flow = b->flows + j;
2690 			u32 k = j * CAKE_MAX_TINS + i;
2691 
2692 			INIT_LIST_HEAD(&flow->flowchain);
2693 			cobalt_vars_init(&flow->cvars);
2694 
2695 			q->overflow_heap[k].t = i;
2696 			q->overflow_heap[k].b = j;
2697 			b->overflow_idx[j] = k;
2698 		}
2699 	}
2700 
2701 	cake_reconfigure(sch);
2702 	q->avg_peak_bandwidth = q->rate_bps;
2703 	q->min_netlen = ~0;
2704 	q->min_adjlen = ~0;
2705 	return 0;
2706 
2707 nomem:
2708 	cake_destroy(sch);
2709 	return -ENOMEM;
2710 }
2711 
2712 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2713 {
2714 	struct cake_sched_data *q = qdisc_priv(sch);
2715 	struct nlattr *opts;
2716 
2717 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2718 	if (!opts)
2719 		goto nla_put_failure;
2720 
2721 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2722 			      TCA_CAKE_PAD))
2723 		goto nla_put_failure;
2724 
2725 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2726 			q->flow_mode & CAKE_FLOW_MASK))
2727 		goto nla_put_failure;
2728 
2729 	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2730 		goto nla_put_failure;
2731 
2732 	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2733 		goto nla_put_failure;
2734 
2735 	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2736 		goto nla_put_failure;
2737 
2738 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2739 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2740 		goto nla_put_failure;
2741 
2742 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2743 			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2744 		goto nla_put_failure;
2745 
2746 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2747 		goto nla_put_failure;
2748 
2749 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2750 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2751 		goto nla_put_failure;
2752 
2753 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2754 		goto nla_put_failure;
2755 
2756 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2757 			!!(q->rate_flags & CAKE_FLAG_WASH)))
2758 		goto nla_put_failure;
2759 
2760 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2761 		goto nla_put_failure;
2762 
2763 	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2764 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2765 			goto nla_put_failure;
2766 
2767 	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2768 		goto nla_put_failure;
2769 
2770 	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2771 		goto nla_put_failure;
2772 
2773 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2774 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2775 		goto nla_put_failure;
2776 
2777 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2778 		goto nla_put_failure;
2779 
2780 	return nla_nest_end(skb, opts);
2781 
2782 nla_put_failure:
2783 	return -1;
2784 }
2785 
2786 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2787 {
2788 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2789 	struct cake_sched_data *q = qdisc_priv(sch);
2790 	struct nlattr *tstats, *ts;
2791 	int i;
2792 
2793 	if (!stats)
2794 		return -1;
2795 
2796 #define PUT_STAT_U32(attr, data) do {				       \
2797 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2798 			goto nla_put_failure;			       \
2799 	} while (0)
2800 #define PUT_STAT_U64(attr, data) do {				       \
2801 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2802 					data, TCA_CAKE_STATS_PAD)) \
2803 			goto nla_put_failure;			       \
2804 	} while (0)
2805 
2806 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2807 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2808 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2809 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2810 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2811 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2812 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2813 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2814 
2815 #undef PUT_STAT_U32
2816 #undef PUT_STAT_U64
2817 
2818 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2819 	if (!tstats)
2820 		goto nla_put_failure;
2821 
2822 #define PUT_TSTAT_U32(attr, data) do {					\
2823 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2824 			goto nla_put_failure;				\
2825 	} while (0)
2826 #define PUT_TSTAT_U64(attr, data) do {					\
2827 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2828 					data, TCA_CAKE_TIN_STATS_PAD))	\
2829 			goto nla_put_failure;				\
2830 	} while (0)
2831 
2832 	for (i = 0; i < q->tin_cnt; i++) {
2833 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2834 
2835 		ts = nla_nest_start_noflag(d->skb, i + 1);
2836 		if (!ts)
2837 			goto nla_put_failure;
2838 
2839 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2840 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2841 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2842 
2843 		PUT_TSTAT_U32(TARGET_US,
2844 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2845 		PUT_TSTAT_U32(INTERVAL_US,
2846 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2847 
2848 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2849 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2850 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2851 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2852 
2853 		PUT_TSTAT_U32(PEAK_DELAY_US,
2854 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2855 		PUT_TSTAT_U32(AVG_DELAY_US,
2856 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2857 		PUT_TSTAT_U32(BASE_DELAY_US,
2858 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2859 
2860 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2861 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2862 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2863 
2864 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2865 					    b->decaying_flow_count);
2866 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2867 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2868 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2869 
2870 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2871 		nla_nest_end(d->skb, ts);
2872 	}
2873 
2874 #undef PUT_TSTAT_U32
2875 #undef PUT_TSTAT_U64
2876 
2877 	nla_nest_end(d->skb, tstats);
2878 	return nla_nest_end(d->skb, stats);
2879 
2880 nla_put_failure:
2881 	nla_nest_cancel(d->skb, stats);
2882 	return -1;
2883 }
2884 
2885 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2886 {
2887 	return NULL;
2888 }
2889 
2890 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2891 {
2892 	return 0;
2893 }
2894 
2895 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2896 			       u32 classid)
2897 {
2898 	return 0;
2899 }
2900 
2901 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2902 {
2903 }
2904 
2905 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2906 					struct netlink_ext_ack *extack)
2907 {
2908 	struct cake_sched_data *q = qdisc_priv(sch);
2909 
2910 	if (cl)
2911 		return NULL;
2912 	return q->block;
2913 }
2914 
2915 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2916 			   struct sk_buff *skb, struct tcmsg *tcm)
2917 {
2918 	tcm->tcm_handle |= TC_H_MIN(cl);
2919 	return 0;
2920 }
2921 
2922 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2923 				 struct gnet_dump *d)
2924 {
2925 	struct cake_sched_data *q = qdisc_priv(sch);
2926 	const struct cake_flow *flow = NULL;
2927 	struct gnet_stats_queue qs = { 0 };
2928 	struct nlattr *stats;
2929 	u32 idx = cl - 1;
2930 
2931 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2932 		const struct cake_tin_data *b = \
2933 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2934 		const struct sk_buff *skb;
2935 
2936 		flow = &b->flows[idx % CAKE_QUEUES];
2937 
2938 		if (flow->head) {
2939 			sch_tree_lock(sch);
2940 			skb = flow->head;
2941 			while (skb) {
2942 				qs.qlen++;
2943 				skb = skb->next;
2944 			}
2945 			sch_tree_unlock(sch);
2946 		}
2947 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2948 		qs.drops = flow->dropped;
2949 	}
2950 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2951 		return -1;
2952 	if (flow) {
2953 		ktime_t now = ktime_get();
2954 
2955 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2956 		if (!stats)
2957 			return -1;
2958 
2959 #define PUT_STAT_U32(attr, data) do {				       \
2960 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2961 			goto nla_put_failure;			       \
2962 	} while (0)
2963 #define PUT_STAT_S32(attr, data) do {				       \
2964 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2965 			goto nla_put_failure;			       \
2966 	} while (0)
2967 
2968 		PUT_STAT_S32(DEFICIT, flow->deficit);
2969 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2970 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2971 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2972 		if (flow->cvars.p_drop) {
2973 			PUT_STAT_S32(BLUE_TIMER_US,
2974 				     ktime_to_us(
2975 					     ktime_sub(now,
2976 						     flow->cvars.blue_timer)));
2977 		}
2978 		if (flow->cvars.dropping) {
2979 			PUT_STAT_S32(DROP_NEXT_US,
2980 				     ktime_to_us(
2981 					     ktime_sub(now,
2982 						       flow->cvars.drop_next)));
2983 		}
2984 
2985 		if (nla_nest_end(d->skb, stats) < 0)
2986 			return -1;
2987 	}
2988 
2989 	return 0;
2990 
2991 nla_put_failure:
2992 	nla_nest_cancel(d->skb, stats);
2993 	return -1;
2994 }
2995 
2996 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2997 {
2998 	struct cake_sched_data *q = qdisc_priv(sch);
2999 	unsigned int i, j;
3000 
3001 	if (arg->stop)
3002 		return;
3003 
3004 	for (i = 0; i < q->tin_cnt; i++) {
3005 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3006 
3007 		for (j = 0; j < CAKE_QUEUES; j++) {
3008 			if (list_empty(&b->flows[j].flowchain) ||
3009 			    arg->count < arg->skip) {
3010 				arg->count++;
3011 				continue;
3012 			}
3013 			if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
3014 				arg->stop = 1;
3015 				break;
3016 			}
3017 			arg->count++;
3018 		}
3019 	}
3020 }
3021 
3022 static const struct Qdisc_class_ops cake_class_ops = {
3023 	.leaf		=	cake_leaf,
3024 	.find		=	cake_find,
3025 	.tcf_block	=	cake_tcf_block,
3026 	.bind_tcf	=	cake_bind,
3027 	.unbind_tcf	=	cake_unbind,
3028 	.dump		=	cake_dump_class,
3029 	.dump_stats	=	cake_dump_class_stats,
3030 	.walk		=	cake_walk,
3031 };
3032 
3033 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3034 	.cl_ops		=	&cake_class_ops,
3035 	.id		=	"cake",
3036 	.priv_size	=	sizeof(struct cake_sched_data),
3037 	.enqueue	=	cake_enqueue,
3038 	.dequeue	=	cake_dequeue,
3039 	.peek		=	qdisc_peek_dequeued,
3040 	.init		=	cake_init,
3041 	.reset		=	cake_reset,
3042 	.destroy	=	cake_destroy,
3043 	.change		=	cake_change,
3044 	.dump		=	cake_dump,
3045 	.dump_stats	=	cake_dump_stats,
3046 	.owner		=	THIS_MODULE,
3047 };
3048 
3049 static int __init cake_module_init(void)
3050 {
3051 	return register_qdisc(&cake_qdisc_ops);
3052 }
3053 
3054 static void __exit cake_module_exit(void)
3055 {
3056 	unregister_qdisc(&cake_qdisc_ops);
3057 }
3058 
3059 module_init(cake_module_init)
3060 module_exit(cake_module_exit)
3061 MODULE_AUTHOR("Jonathan Morton");
3062 MODULE_LICENSE("Dual BSD/GPL");
3063 MODULE_DESCRIPTION("The CAKE shaper.");
3064