xref: /linux/net/sched/sch_cake.c (revision dc0d1c4519095a6c6bbd9ec4a808674aba502741)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/version.h>
68 #include <linux/if_vlan.h>
69 #include <net/pkt_sched.h>
70 #include <net/pkt_cls.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73 
74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75 #include <net/netfilter/nf_conntrack_core.h>
76 #endif
77 
78 #define CAKE_SET_WAYS (8)
79 #define CAKE_MAX_TINS (8)
80 #define CAKE_QUEUES (1024)
81 #define CAKE_FLOW_MASK 63
82 #define CAKE_FLOW_NAT_FLAG 64
83 
84 /* struct cobalt_params - contains codel and blue parameters
85  * @interval:	codel initial drop rate
86  * @target:     maximum persistent sojourn time & blue update rate
87  * @mtu_time:   serialisation delay of maximum-size packet
88  * @p_inc:      increment of blue drop probability (0.32 fxp)
89  * @p_dec:      decrement of blue drop probability (0.32 fxp)
90  */
91 struct cobalt_params {
92 	u64	interval;
93 	u64	target;
94 	u64	mtu_time;
95 	u32	p_inc;
96 	u32	p_dec;
97 };
98 
99 /* struct cobalt_vars - contains codel and blue variables
100  * @count:		codel dropping frequency
101  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
102  * @drop_next:		time to drop next packet, or when we dropped last
103  * @blue_timer:		Blue time to next drop
104  * @p_drop:		BLUE drop probability (0.32 fxp)
105  * @dropping:		set if in dropping state
106  * @ecn_marked:		set if marked
107  */
108 struct cobalt_vars {
109 	u32	count;
110 	u32	rec_inv_sqrt;
111 	ktime_t	drop_next;
112 	ktime_t	blue_timer;
113 	u32     p_drop;
114 	bool	dropping;
115 	bool    ecn_marked;
116 };
117 
118 enum {
119 	CAKE_SET_NONE = 0,
120 	CAKE_SET_SPARSE,
121 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122 	CAKE_SET_BULK,
123 	CAKE_SET_DECAYING
124 };
125 
126 struct cake_flow {
127 	/* this stuff is all needed per-flow at dequeue time */
128 	struct sk_buff	  *head;
129 	struct sk_buff	  *tail;
130 	struct list_head  flowchain;
131 	s32		  deficit;
132 	u32		  dropped;
133 	struct cobalt_vars cvars;
134 	u16		  srchost; /* index into cake_host table */
135 	u16		  dsthost;
136 	u8		  set;
137 }; /* please try to keep this structure <= 64 bytes */
138 
139 struct cake_host {
140 	u32 srchost_tag;
141 	u32 dsthost_tag;
142 	u16 srchost_refcnt;
143 	u16 dsthost_refcnt;
144 };
145 
146 struct cake_heap_entry {
147 	u16 t:3, b:10;
148 };
149 
150 struct cake_tin_data {
151 	struct cake_flow flows[CAKE_QUEUES];
152 	u32	backlogs[CAKE_QUEUES];
153 	u32	tags[CAKE_QUEUES]; /* for set association */
154 	u16	overflow_idx[CAKE_QUEUES];
155 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156 	u16	flow_quantum;
157 
158 	struct cobalt_params cparams;
159 	u32	drop_overlimit;
160 	u16	bulk_flow_count;
161 	u16	sparse_flow_count;
162 	u16	decaying_flow_count;
163 	u16	unresponsive_flow_count;
164 
165 	u32	max_skblen;
166 
167 	struct list_head new_flows;
168 	struct list_head old_flows;
169 	struct list_head decaying_flows;
170 
171 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172 	ktime_t	time_next_packet;
173 	u64	tin_rate_ns;
174 	u64	tin_rate_bps;
175 	u16	tin_rate_shft;
176 
177 	u16	tin_quantum_prio;
178 	u16	tin_quantum_band;
179 	s32	tin_deficit;
180 	u32	tin_backlog;
181 	u32	tin_dropped;
182 	u32	tin_ecn_mark;
183 
184 	u32	packets;
185 	u64	bytes;
186 
187 	u32	ack_drops;
188 
189 	/* moving averages */
190 	u64 avge_delay;
191 	u64 peak_delay;
192 	u64 base_delay;
193 
194 	/* hash function stats */
195 	u32	way_directs;
196 	u32	way_hits;
197 	u32	way_misses;
198 	u32	way_collisions;
199 }; /* number of tins is small, so size of this struct doesn't matter much */
200 
201 struct cake_sched_data {
202 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
203 	struct tcf_block *block;
204 	struct cake_tin_data *tins;
205 
206 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
207 	u16		overflow_timeout;
208 
209 	u16		tin_cnt;
210 	u8		tin_mode;
211 	u8		flow_mode;
212 	u8		ack_filter;
213 	u8		atm_mode;
214 
215 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
216 	u16		rate_shft;
217 	ktime_t		time_next_packet;
218 	ktime_t		failsafe_next_packet;
219 	u64		rate_ns;
220 	u64		rate_bps;
221 	u16		rate_flags;
222 	s16		rate_overhead;
223 	u16		rate_mpu;
224 	u64		interval;
225 	u64		target;
226 
227 	/* resource tracking */
228 	u32		buffer_used;
229 	u32		buffer_max_used;
230 	u32		buffer_limit;
231 	u32		buffer_config_limit;
232 
233 	/* indices for dequeue */
234 	u16		cur_tin;
235 	u16		cur_flow;
236 
237 	struct qdisc_watchdog watchdog;
238 	const u8	*tin_index;
239 	const u8	*tin_order;
240 
241 	/* bandwidth capacity estimate */
242 	ktime_t		last_packet_time;
243 	ktime_t		avg_window_begin;
244 	u64		avg_packet_interval;
245 	u64		avg_window_bytes;
246 	u64		avg_peak_bandwidth;
247 	ktime_t		last_reconfig_time;
248 
249 	/* packet length stats */
250 	u32		avg_netoff;
251 	u16		max_netlen;
252 	u16		max_adjlen;
253 	u16		min_netlen;
254 	u16		min_adjlen;
255 };
256 
257 enum {
258 	CAKE_FLAG_OVERHEAD	   = BIT(0),
259 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
260 	CAKE_FLAG_INGRESS	   = BIT(2),
261 	CAKE_FLAG_WASH		   = BIT(3),
262 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
263 };
264 
265 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
266  * obtain the best features of each.  Codel is excellent on flows which
267  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
268  * unresponsive flows.
269  */
270 
271 struct cobalt_skb_cb {
272 	ktime_t enqueue_time;
273 	u32     adjusted_len;
274 };
275 
276 static u64 us_to_ns(u64 us)
277 {
278 	return us * NSEC_PER_USEC;
279 }
280 
281 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
282 {
283 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
284 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
285 }
286 
287 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
288 {
289 	return get_cobalt_cb(skb)->enqueue_time;
290 }
291 
292 static void cobalt_set_enqueue_time(struct sk_buff *skb,
293 				    ktime_t now)
294 {
295 	get_cobalt_cb(skb)->enqueue_time = now;
296 }
297 
298 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
299 
300 /* Diffserv lookup tables */
301 
302 static const u8 precedence[] = {
303 	0, 0, 0, 0, 0, 0, 0, 0,
304 	1, 1, 1, 1, 1, 1, 1, 1,
305 	2, 2, 2, 2, 2, 2, 2, 2,
306 	3, 3, 3, 3, 3, 3, 3, 3,
307 	4, 4, 4, 4, 4, 4, 4, 4,
308 	5, 5, 5, 5, 5, 5, 5, 5,
309 	6, 6, 6, 6, 6, 6, 6, 6,
310 	7, 7, 7, 7, 7, 7, 7, 7,
311 };
312 
313 static const u8 diffserv8[] = {
314 	2, 5, 1, 2, 4, 2, 2, 2,
315 	0, 2, 1, 2, 1, 2, 1, 2,
316 	5, 2, 4, 2, 4, 2, 4, 2,
317 	3, 2, 3, 2, 3, 2, 3, 2,
318 	6, 2, 3, 2, 3, 2, 3, 2,
319 	6, 2, 2, 2, 6, 2, 6, 2,
320 	7, 2, 2, 2, 2, 2, 2, 2,
321 	7, 2, 2, 2, 2, 2, 2, 2,
322 };
323 
324 static const u8 diffserv4[] = {
325 	0, 2, 0, 0, 2, 0, 0, 0,
326 	1, 0, 0, 0, 0, 0, 0, 0,
327 	2, 0, 2, 0, 2, 0, 2, 0,
328 	2, 0, 2, 0, 2, 0, 2, 0,
329 	3, 0, 2, 0, 2, 0, 2, 0,
330 	3, 0, 0, 0, 3, 0, 3, 0,
331 	3, 0, 0, 0, 0, 0, 0, 0,
332 	3, 0, 0, 0, 0, 0, 0, 0,
333 };
334 
335 static const u8 diffserv3[] = {
336 	0, 0, 0, 0, 2, 0, 0, 0,
337 	1, 0, 0, 0, 0, 0, 0, 0,
338 	0, 0, 0, 0, 0, 0, 0, 0,
339 	0, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 2, 0, 2, 0,
342 	2, 0, 0, 0, 0, 0, 0, 0,
343 	2, 0, 0, 0, 0, 0, 0, 0,
344 };
345 
346 static const u8 besteffort[] = {
347 	0, 0, 0, 0, 0, 0, 0, 0,
348 	0, 0, 0, 0, 0, 0, 0, 0,
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 };
356 
357 /* tin priority order for stats dumping */
358 
359 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
360 static const u8 bulk_order[] = {1, 0, 2, 3};
361 
362 #define REC_INV_SQRT_CACHE (16)
363 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
364 
365 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
366  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
367  *
368  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
369  */
370 
371 static void cobalt_newton_step(struct cobalt_vars *vars)
372 {
373 	u32 invsqrt, invsqrt2;
374 	u64 val;
375 
376 	invsqrt = vars->rec_inv_sqrt;
377 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
378 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
379 
380 	val >>= 2; /* avoid overflow in following multiply */
381 	val = (val * invsqrt) >> (32 - 2 + 1);
382 
383 	vars->rec_inv_sqrt = val;
384 }
385 
386 static void cobalt_invsqrt(struct cobalt_vars *vars)
387 {
388 	if (vars->count < REC_INV_SQRT_CACHE)
389 		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
390 	else
391 		cobalt_newton_step(vars);
392 }
393 
394 /* There is a big difference in timing between the accurate values placed in
395  * the cache and the approximations given by a single Newton step for small
396  * count values, particularly when stepping from count 1 to 2 or vice versa.
397  * Above 16, a single Newton step gives sufficient accuracy in either
398  * direction, given the precision stored.
399  *
400  * The magnitude of the error when stepping up to count 2 is such as to give
401  * the value that *should* have been produced at count 4.
402  */
403 
404 static void cobalt_cache_init(void)
405 {
406 	struct cobalt_vars v;
407 
408 	memset(&v, 0, sizeof(v));
409 	v.rec_inv_sqrt = ~0U;
410 	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
411 
412 	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
413 		cobalt_newton_step(&v);
414 		cobalt_newton_step(&v);
415 		cobalt_newton_step(&v);
416 		cobalt_newton_step(&v);
417 
418 		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
419 	}
420 }
421 
422 static void cobalt_vars_init(struct cobalt_vars *vars)
423 {
424 	memset(vars, 0, sizeof(*vars));
425 
426 	if (!cobalt_rec_inv_sqrt_cache[0]) {
427 		cobalt_cache_init();
428 		cobalt_rec_inv_sqrt_cache[0] = ~0;
429 	}
430 }
431 
432 /* CoDel control_law is t + interval/sqrt(count)
433  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
434  * both sqrt() and divide operation.
435  */
436 static ktime_t cobalt_control(ktime_t t,
437 			      u64 interval,
438 			      u32 rec_inv_sqrt)
439 {
440 	return ktime_add_ns(t, reciprocal_scale(interval,
441 						rec_inv_sqrt));
442 }
443 
444 /* Call this when a packet had to be dropped due to queue overflow.  Returns
445  * true if the BLUE state was quiescent before but active after this call.
446  */
447 static bool cobalt_queue_full(struct cobalt_vars *vars,
448 			      struct cobalt_params *p,
449 			      ktime_t now)
450 {
451 	bool up = false;
452 
453 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
454 		up = !vars->p_drop;
455 		vars->p_drop += p->p_inc;
456 		if (vars->p_drop < p->p_inc)
457 			vars->p_drop = ~0;
458 		vars->blue_timer = now;
459 	}
460 	vars->dropping = true;
461 	vars->drop_next = now;
462 	if (!vars->count)
463 		vars->count = 1;
464 
465 	return up;
466 }
467 
468 /* Call this when the queue was serviced but turned out to be empty.  Returns
469  * true if the BLUE state was active before but quiescent after this call.
470  */
471 static bool cobalt_queue_empty(struct cobalt_vars *vars,
472 			       struct cobalt_params *p,
473 			       ktime_t now)
474 {
475 	bool down = false;
476 
477 	if (vars->p_drop &&
478 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
479 		if (vars->p_drop < p->p_dec)
480 			vars->p_drop = 0;
481 		else
482 			vars->p_drop -= p->p_dec;
483 		vars->blue_timer = now;
484 		down = !vars->p_drop;
485 	}
486 	vars->dropping = false;
487 
488 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
489 		vars->count--;
490 		cobalt_invsqrt(vars);
491 		vars->drop_next = cobalt_control(vars->drop_next,
492 						 p->interval,
493 						 vars->rec_inv_sqrt);
494 	}
495 
496 	return down;
497 }
498 
499 /* Call this with a freshly dequeued packet for possible congestion marking.
500  * Returns true as an instruction to drop the packet, false for delivery.
501  */
502 static bool cobalt_should_drop(struct cobalt_vars *vars,
503 			       struct cobalt_params *p,
504 			       ktime_t now,
505 			       struct sk_buff *skb,
506 			       u32 bulk_flows)
507 {
508 	bool next_due, over_target, drop = false;
509 	ktime_t schedule;
510 	u64 sojourn;
511 
512 /* The 'schedule' variable records, in its sign, whether 'now' is before or
513  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
514  * scheduling decision is actually branched, without destroying that
515  * information.  Similarly, the first 'schedule' value calculated is preserved
516  * in the boolean 'next_due'.
517  *
518  * As for 'drop_next', we take advantage of the fact that 'interval' is both
519  * the delay between first exceeding 'target' and the first signalling event,
520  * *and* the scaling factor for the signalling frequency.  It's therefore very
521  * natural to use a single mechanism for both purposes, and eliminates a
522  * significant amount of reference Codel's spaghetti code.  To help with this,
523  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
524  * as possible to 1.0 in fixed-point.
525  */
526 
527 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
528 	schedule = ktime_sub(now, vars->drop_next);
529 	over_target = sojourn > p->target &&
530 		      sojourn > p->mtu_time * bulk_flows * 2 &&
531 		      sojourn > p->mtu_time * 4;
532 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
533 
534 	vars->ecn_marked = false;
535 
536 	if (over_target) {
537 		if (!vars->dropping) {
538 			vars->dropping = true;
539 			vars->drop_next = cobalt_control(now,
540 							 p->interval,
541 							 vars->rec_inv_sqrt);
542 		}
543 		if (!vars->count)
544 			vars->count = 1;
545 	} else if (vars->dropping) {
546 		vars->dropping = false;
547 	}
548 
549 	if (next_due && vars->dropping) {
550 		/* Use ECN mark if possible, otherwise drop */
551 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
552 
553 		vars->count++;
554 		if (!vars->count)
555 			vars->count--;
556 		cobalt_invsqrt(vars);
557 		vars->drop_next = cobalt_control(vars->drop_next,
558 						 p->interval,
559 						 vars->rec_inv_sqrt);
560 		schedule = ktime_sub(now, vars->drop_next);
561 	} else {
562 		while (next_due) {
563 			vars->count--;
564 			cobalt_invsqrt(vars);
565 			vars->drop_next = cobalt_control(vars->drop_next,
566 							 p->interval,
567 							 vars->rec_inv_sqrt);
568 			schedule = ktime_sub(now, vars->drop_next);
569 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
570 		}
571 	}
572 
573 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
574 	if (vars->p_drop)
575 		drop |= (prandom_u32() < vars->p_drop);
576 
577 	/* Overload the drop_next field as an activity timeout */
578 	if (!vars->count)
579 		vars->drop_next = ktime_add_ns(now, p->interval);
580 	else if (ktime_to_ns(schedule) > 0 && !drop)
581 		vars->drop_next = now;
582 
583 	return drop;
584 }
585 
586 static void cake_update_flowkeys(struct flow_keys *keys,
587 				 const struct sk_buff *skb)
588 {
589 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
590 	struct nf_conntrack_tuple tuple = {};
591 	bool rev = !skb->_nfct;
592 
593 	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
594 		return;
595 
596 	if (!nf_ct_get_tuple_skb(&tuple, skb))
597 		return;
598 
599 	keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
600 	keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
601 
602 	if (keys->ports.ports) {
603 		keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
604 		keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
605 	}
606 #endif
607 }
608 
609 /* Cake has several subtle multiple bit settings. In these cases you
610  *  would be matching triple isolate mode as well.
611  */
612 
613 static bool cake_dsrc(int flow_mode)
614 {
615 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
616 }
617 
618 static bool cake_ddst(int flow_mode)
619 {
620 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
621 }
622 
623 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
624 		     int flow_mode)
625 {
626 	u32 flow_hash = 0, srchost_hash, dsthost_hash;
627 	u16 reduced_hash, srchost_idx, dsthost_idx;
628 	struct flow_keys keys, host_keys;
629 
630 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
631 		return 0;
632 
633 	skb_flow_dissect_flow_keys(skb, &keys,
634 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
635 
636 	if (flow_mode & CAKE_FLOW_NAT_FLAG)
637 		cake_update_flowkeys(&keys, skb);
638 
639 	/* flow_hash_from_keys() sorts the addresses by value, so we have
640 	 * to preserve their order in a separate data structure to treat
641 	 * src and dst host addresses as independently selectable.
642 	 */
643 	host_keys = keys;
644 	host_keys.ports.ports     = 0;
645 	host_keys.basic.ip_proto  = 0;
646 	host_keys.keyid.keyid     = 0;
647 	host_keys.tags.flow_label = 0;
648 
649 	switch (host_keys.control.addr_type) {
650 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
651 		host_keys.addrs.v4addrs.src = 0;
652 		dsthost_hash = flow_hash_from_keys(&host_keys);
653 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
654 		host_keys.addrs.v4addrs.dst = 0;
655 		srchost_hash = flow_hash_from_keys(&host_keys);
656 		break;
657 
658 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
659 		memset(&host_keys.addrs.v6addrs.src, 0,
660 		       sizeof(host_keys.addrs.v6addrs.src));
661 		dsthost_hash = flow_hash_from_keys(&host_keys);
662 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
663 		memset(&host_keys.addrs.v6addrs.dst, 0,
664 		       sizeof(host_keys.addrs.v6addrs.dst));
665 		srchost_hash = flow_hash_from_keys(&host_keys);
666 		break;
667 
668 	default:
669 		dsthost_hash = 0;
670 		srchost_hash = 0;
671 	}
672 
673 	/* This *must* be after the above switch, since as a
674 	 * side-effect it sorts the src and dst addresses.
675 	 */
676 	if (flow_mode & CAKE_FLOW_FLOWS)
677 		flow_hash = flow_hash_from_keys(&keys);
678 
679 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
680 		if (flow_mode & CAKE_FLOW_SRC_IP)
681 			flow_hash ^= srchost_hash;
682 
683 		if (flow_mode & CAKE_FLOW_DST_IP)
684 			flow_hash ^= dsthost_hash;
685 	}
686 
687 	reduced_hash = flow_hash % CAKE_QUEUES;
688 
689 	/* set-associative hashing */
690 	/* fast path if no hash collision (direct lookup succeeds) */
691 	if (likely(q->tags[reduced_hash] == flow_hash &&
692 		   q->flows[reduced_hash].set)) {
693 		q->way_directs++;
694 	} else {
695 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
696 		u32 outer_hash = reduced_hash - inner_hash;
697 		bool allocate_src = false;
698 		bool allocate_dst = false;
699 		u32 i, k;
700 
701 		/* check if any active queue in the set is reserved for
702 		 * this flow.
703 		 */
704 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
705 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
706 			if (q->tags[outer_hash + k] == flow_hash) {
707 				if (i)
708 					q->way_hits++;
709 
710 				if (!q->flows[outer_hash + k].set) {
711 					/* need to increment host refcnts */
712 					allocate_src = cake_dsrc(flow_mode);
713 					allocate_dst = cake_ddst(flow_mode);
714 				}
715 
716 				goto found;
717 			}
718 		}
719 
720 		/* no queue is reserved for this flow, look for an
721 		 * empty one.
722 		 */
723 		for (i = 0; i < CAKE_SET_WAYS;
724 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
725 			if (!q->flows[outer_hash + k].set) {
726 				q->way_misses++;
727 				allocate_src = cake_dsrc(flow_mode);
728 				allocate_dst = cake_ddst(flow_mode);
729 				goto found;
730 			}
731 		}
732 
733 		/* With no empty queues, default to the original
734 		 * queue, accept the collision, update the host tags.
735 		 */
736 		q->way_collisions++;
737 		q->hosts[q->flows[reduced_hash].srchost].srchost_refcnt--;
738 		q->hosts[q->flows[reduced_hash].dsthost].dsthost_refcnt--;
739 		allocate_src = cake_dsrc(flow_mode);
740 		allocate_dst = cake_ddst(flow_mode);
741 found:
742 		/* reserve queue for future packets in same flow */
743 		reduced_hash = outer_hash + k;
744 		q->tags[reduced_hash] = flow_hash;
745 
746 		if (allocate_src) {
747 			srchost_idx = srchost_hash % CAKE_QUEUES;
748 			inner_hash = srchost_idx % CAKE_SET_WAYS;
749 			outer_hash = srchost_idx - inner_hash;
750 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
751 				i++, k = (k + 1) % CAKE_SET_WAYS) {
752 				if (q->hosts[outer_hash + k].srchost_tag ==
753 				    srchost_hash)
754 					goto found_src;
755 			}
756 			for (i = 0; i < CAKE_SET_WAYS;
757 				i++, k = (k + 1) % CAKE_SET_WAYS) {
758 				if (!q->hosts[outer_hash + k].srchost_refcnt)
759 					break;
760 			}
761 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
762 found_src:
763 			srchost_idx = outer_hash + k;
764 			q->hosts[srchost_idx].srchost_refcnt++;
765 			q->flows[reduced_hash].srchost = srchost_idx;
766 		}
767 
768 		if (allocate_dst) {
769 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
770 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
771 			outer_hash = dsthost_idx - inner_hash;
772 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
773 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
774 				if (q->hosts[outer_hash + k].dsthost_tag ==
775 				    dsthost_hash)
776 					goto found_dst;
777 			}
778 			for (i = 0; i < CAKE_SET_WAYS;
779 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
780 				if (!q->hosts[outer_hash + k].dsthost_refcnt)
781 					break;
782 			}
783 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
784 found_dst:
785 			dsthost_idx = outer_hash + k;
786 			q->hosts[dsthost_idx].dsthost_refcnt++;
787 			q->flows[reduced_hash].dsthost = dsthost_idx;
788 		}
789 	}
790 
791 	return reduced_hash;
792 }
793 
794 /* helper functions : might be changed when/if skb use a standard list_head */
795 /* remove one skb from head of slot queue */
796 
797 static struct sk_buff *dequeue_head(struct cake_flow *flow)
798 {
799 	struct sk_buff *skb = flow->head;
800 
801 	if (skb) {
802 		flow->head = skb->next;
803 		skb->next = NULL;
804 	}
805 
806 	return skb;
807 }
808 
809 /* add skb to flow queue (tail add) */
810 
811 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
812 {
813 	if (!flow->head)
814 		flow->head = skb;
815 	else
816 		flow->tail->next = skb;
817 	flow->tail = skb;
818 	skb->next = NULL;
819 }
820 
821 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
822 				    struct ipv6hdr *buf)
823 {
824 	unsigned int offset = skb_network_offset(skb);
825 	struct iphdr *iph;
826 
827 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
828 
829 	if (!iph)
830 		return NULL;
831 
832 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
833 		return skb_header_pointer(skb, offset + iph->ihl * 4,
834 					  sizeof(struct ipv6hdr), buf);
835 
836 	else if (iph->version == 4)
837 		return iph;
838 
839 	else if (iph->version == 6)
840 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
841 					  buf);
842 
843 	return NULL;
844 }
845 
846 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
847 				      void *buf, unsigned int bufsize)
848 {
849 	unsigned int offset = skb_network_offset(skb);
850 	const struct ipv6hdr *ipv6h;
851 	const struct tcphdr *tcph;
852 	const struct iphdr *iph;
853 	struct ipv6hdr _ipv6h;
854 	struct tcphdr _tcph;
855 
856 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
857 
858 	if (!ipv6h)
859 		return NULL;
860 
861 	if (ipv6h->version == 4) {
862 		iph = (struct iphdr *)ipv6h;
863 		offset += iph->ihl * 4;
864 
865 		/* special-case 6in4 tunnelling, as that is a common way to get
866 		 * v6 connectivity in the home
867 		 */
868 		if (iph->protocol == IPPROTO_IPV6) {
869 			ipv6h = skb_header_pointer(skb, offset,
870 						   sizeof(_ipv6h), &_ipv6h);
871 
872 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
873 				return NULL;
874 
875 			offset += sizeof(struct ipv6hdr);
876 
877 		} else if (iph->protocol != IPPROTO_TCP) {
878 			return NULL;
879 		}
880 
881 	} else if (ipv6h->version == 6) {
882 		if (ipv6h->nexthdr != IPPROTO_TCP)
883 			return NULL;
884 
885 		offset += sizeof(struct ipv6hdr);
886 	} else {
887 		return NULL;
888 	}
889 
890 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
891 	if (!tcph)
892 		return NULL;
893 
894 	return skb_header_pointer(skb, offset,
895 				  min(__tcp_hdrlen(tcph), bufsize), buf);
896 }
897 
898 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
899 				   int code, int *oplen)
900 {
901 	/* inspired by tcp_parse_options in tcp_input.c */
902 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
903 	const u8 *ptr = (const u8 *)(tcph + 1);
904 
905 	while (length > 0) {
906 		int opcode = *ptr++;
907 		int opsize;
908 
909 		if (opcode == TCPOPT_EOL)
910 			break;
911 		if (opcode == TCPOPT_NOP) {
912 			length--;
913 			continue;
914 		}
915 		opsize = *ptr++;
916 		if (opsize < 2 || opsize > length)
917 			break;
918 
919 		if (opcode == code) {
920 			*oplen = opsize;
921 			return ptr;
922 		}
923 
924 		ptr += opsize - 2;
925 		length -= opsize;
926 	}
927 
928 	return NULL;
929 }
930 
931 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
932  * bytes than the other. In the case where both sequences ACKs bytes that the
933  * other doesn't, A is considered greater. DSACKs in A also makes A be
934  * considered greater.
935  *
936  * @return -1, 0 or 1 as normal compare functions
937  */
938 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
939 				  const struct tcphdr *tcph_b)
940 {
941 	const struct tcp_sack_block_wire *sack_a, *sack_b;
942 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
943 	u32 bytes_a = 0, bytes_b = 0;
944 	int oplen_a, oplen_b;
945 	bool first = true;
946 
947 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
948 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
949 
950 	/* pointers point to option contents */
951 	oplen_a -= TCPOLEN_SACK_BASE;
952 	oplen_b -= TCPOLEN_SACK_BASE;
953 
954 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
955 	    (!sack_b || oplen_b < sizeof(*sack_b)))
956 		return -1;
957 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
958 		 (!sack_a || oplen_a < sizeof(*sack_a)))
959 		return 1;
960 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
961 		 (!sack_b || oplen_b < sizeof(*sack_b)))
962 		return 0;
963 
964 	while (oplen_a >= sizeof(*sack_a)) {
965 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
966 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
967 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
968 		int oplen_tmp = oplen_b;
969 		bool found = false;
970 
971 		/* DSACK; always considered greater to prevent dropping */
972 		if (before(start_a, ack_seq_a))
973 			return -1;
974 
975 		bytes_a += end_a - start_a;
976 
977 		while (oplen_tmp >= sizeof(*sack_tmp)) {
978 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
979 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
980 
981 			/* first time through we count the total size */
982 			if (first)
983 				bytes_b += end_b - start_b;
984 
985 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
986 				found = true;
987 				if (!first)
988 					break;
989 			}
990 			oplen_tmp -= sizeof(*sack_tmp);
991 			sack_tmp++;
992 		}
993 
994 		if (!found)
995 			return -1;
996 
997 		oplen_a -= sizeof(*sack_a);
998 		sack_a++;
999 		first = false;
1000 	}
1001 
1002 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1003 	 * either the SACKs are equal, or B SACKs more bytes.
1004 	 */
1005 	return bytes_b > bytes_a ? 1 : 0;
1006 }
1007 
1008 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1009 				 u32 *tsval, u32 *tsecr)
1010 {
1011 	const u8 *ptr;
1012 	int opsize;
1013 
1014 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1015 
1016 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1017 		*tsval = get_unaligned_be32(ptr);
1018 		*tsecr = get_unaligned_be32(ptr + 4);
1019 	}
1020 }
1021 
1022 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1023 			       u32 tstamp_new, u32 tsecr_new)
1024 {
1025 	/* inspired by tcp_parse_options in tcp_input.c */
1026 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1027 	const u8 *ptr = (const u8 *)(tcph + 1);
1028 	u32 tstamp, tsecr;
1029 
1030 	/* 3 reserved flags must be unset to avoid future breakage
1031 	 * ACK must be set
1032 	 * ECE/CWR are handled separately
1033 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1034 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1035 	 * 0x00C00000 = CWR/ECE (handled separately)
1036 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1037 	 */
1038 	if (((tcp_flag_word(tcph) &
1039 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1040 		return false;
1041 
1042 	while (length > 0) {
1043 		int opcode = *ptr++;
1044 		int opsize;
1045 
1046 		if (opcode == TCPOPT_EOL)
1047 			break;
1048 		if (opcode == TCPOPT_NOP) {
1049 			length--;
1050 			continue;
1051 		}
1052 		opsize = *ptr++;
1053 		if (opsize < 2 || opsize > length)
1054 			break;
1055 
1056 		switch (opcode) {
1057 		case TCPOPT_MD5SIG: /* doesn't influence state */
1058 			break;
1059 
1060 		case TCPOPT_SACK: /* stricter checking performed later */
1061 			if (opsize % 8 != 2)
1062 				return false;
1063 			break;
1064 
1065 		case TCPOPT_TIMESTAMP:
1066 			/* only drop timestamps lower than new */
1067 			if (opsize != TCPOLEN_TIMESTAMP)
1068 				return false;
1069 			tstamp = get_unaligned_be32(ptr);
1070 			tsecr = get_unaligned_be32(ptr + 4);
1071 			if (after(tstamp, tstamp_new) ||
1072 			    after(tsecr, tsecr_new))
1073 				return false;
1074 			break;
1075 
1076 		case TCPOPT_MSS:  /* these should only be set on SYN */
1077 		case TCPOPT_WINDOW:
1078 		case TCPOPT_SACK_PERM:
1079 		case TCPOPT_FASTOPEN:
1080 		case TCPOPT_EXP:
1081 		default: /* don't drop if any unknown options are present */
1082 			return false;
1083 		}
1084 
1085 		ptr += opsize - 2;
1086 		length -= opsize;
1087 	}
1088 
1089 	return true;
1090 }
1091 
1092 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1093 				       struct cake_flow *flow)
1094 {
1095 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1096 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1097 	struct sk_buff *skb_check, *skb_prev = NULL;
1098 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1099 	unsigned char _tcph[64], _tcph_check[64];
1100 	const struct tcphdr *tcph, *tcph_check;
1101 	const struct iphdr *iph, *iph_check;
1102 	struct ipv6hdr _iph, _iph_check;
1103 	const struct sk_buff *skb;
1104 	int seglen, num_found = 0;
1105 	u32 tstamp = 0, tsecr = 0;
1106 	__be32 elig_flags = 0;
1107 	int sack_comp;
1108 
1109 	/* no other possible ACKs to filter */
1110 	if (flow->head == flow->tail)
1111 		return NULL;
1112 
1113 	skb = flow->tail;
1114 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1115 	iph = cake_get_iphdr(skb, &_iph);
1116 	if (!tcph)
1117 		return NULL;
1118 
1119 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1120 
1121 	/* the 'triggering' packet need only have the ACK flag set.
1122 	 * also check that SYN is not set, as there won't be any previous ACKs.
1123 	 */
1124 	if ((tcp_flag_word(tcph) &
1125 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1126 		return NULL;
1127 
1128 	/* the 'triggering' ACK is at the tail of the queue, we have already
1129 	 * returned if it is the only packet in the flow. loop through the rest
1130 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1131 	 * triggering one.
1132 	 */
1133 	for (skb_check = flow->head;
1134 	     skb_check && skb_check != skb;
1135 	     skb_prev = skb_check, skb_check = skb_check->next) {
1136 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1137 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1138 					     sizeof(_tcph_check));
1139 
1140 		/* only TCP packets with matching 5-tuple are eligible, and only
1141 		 * drop safe headers
1142 		 */
1143 		if (!tcph_check || iph->version != iph_check->version ||
1144 		    tcph_check->source != tcph->source ||
1145 		    tcph_check->dest != tcph->dest)
1146 			continue;
1147 
1148 		if (iph_check->version == 4) {
1149 			if (iph_check->saddr != iph->saddr ||
1150 			    iph_check->daddr != iph->daddr)
1151 				continue;
1152 
1153 			seglen = ntohs(iph_check->tot_len) -
1154 				       (4 * iph_check->ihl);
1155 		} else if (iph_check->version == 6) {
1156 			ipv6h = (struct ipv6hdr *)iph;
1157 			ipv6h_check = (struct ipv6hdr *)iph_check;
1158 
1159 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1160 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1161 				continue;
1162 
1163 			seglen = ntohs(ipv6h_check->payload_len);
1164 		} else {
1165 			WARN_ON(1);  /* shouldn't happen */
1166 			continue;
1167 		}
1168 
1169 		/* If the ECE/CWR flags changed from the previous eligible
1170 		 * packet in the same flow, we should no longer be dropping that
1171 		 * previous packet as this would lose information.
1172 		 */
1173 		if (elig_ack && (tcp_flag_word(tcph_check) &
1174 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1175 			elig_ack = NULL;
1176 			elig_ack_prev = NULL;
1177 			num_found--;
1178 		}
1179 
1180 		/* Check TCP options and flags, don't drop ACKs with segment
1181 		 * data, and don't drop ACKs with a higher cumulative ACK
1182 		 * counter than the triggering packet. Check ACK seqno here to
1183 		 * avoid parsing SACK options of packets we are going to exclude
1184 		 * anyway.
1185 		 */
1186 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1187 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1188 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1189 			continue;
1190 
1191 		/* Check SACK options. The triggering packet must SACK more data
1192 		 * than the ACK under consideration, or SACK the same range but
1193 		 * have a larger cumulative ACK counter. The latter is a
1194 		 * pathological case, but is contained in the following check
1195 		 * anyway, just to be safe.
1196 		 */
1197 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1198 
1199 		if (sack_comp < 0 ||
1200 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1201 		     sack_comp == 0))
1202 			continue;
1203 
1204 		/* At this point we have found an eligible pure ACK to drop; if
1205 		 * we are in aggressive mode, we are done. Otherwise, keep
1206 		 * searching unless this is the second eligible ACK we
1207 		 * found.
1208 		 *
1209 		 * Since we want to drop ACK closest to the head of the queue,
1210 		 * save the first eligible ACK we find, even if we need to loop
1211 		 * again.
1212 		 */
1213 		if (!elig_ack) {
1214 			elig_ack = skb_check;
1215 			elig_ack_prev = skb_prev;
1216 			elig_flags = (tcp_flag_word(tcph_check)
1217 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1218 		}
1219 
1220 		if (num_found++ > 0)
1221 			goto found;
1222 	}
1223 
1224 	/* We made it through the queue without finding two eligible ACKs . If
1225 	 * we found a single eligible ACK we can drop it in aggressive mode if
1226 	 * we can guarantee that this does not interfere with ECN flag
1227 	 * information. We ensure this by dropping it only if the enqueued
1228 	 * packet is consecutive with the eligible ACK, and their flags match.
1229 	 */
1230 	if (elig_ack && aggressive && elig_ack->next == skb &&
1231 	    (elig_flags == (tcp_flag_word(tcph) &
1232 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1233 		goto found;
1234 
1235 	return NULL;
1236 
1237 found:
1238 	if (elig_ack_prev)
1239 		elig_ack_prev->next = elig_ack->next;
1240 	else
1241 		flow->head = elig_ack->next;
1242 
1243 	elig_ack->next = NULL;
1244 
1245 	return elig_ack;
1246 }
1247 
1248 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1249 {
1250 	avg -= avg >> shift;
1251 	avg += sample >> shift;
1252 	return avg;
1253 }
1254 
1255 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1256 {
1257 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1258 		len -= off;
1259 
1260 	if (q->max_netlen < len)
1261 		q->max_netlen = len;
1262 	if (q->min_netlen > len)
1263 		q->min_netlen = len;
1264 
1265 	len += q->rate_overhead;
1266 
1267 	if (len < q->rate_mpu)
1268 		len = q->rate_mpu;
1269 
1270 	if (q->atm_mode == CAKE_ATM_ATM) {
1271 		len += 47;
1272 		len /= 48;
1273 		len *= 53;
1274 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1275 		/* Add one byte per 64 bytes or part thereof.
1276 		 * This is conservative and easier to calculate than the
1277 		 * precise value.
1278 		 */
1279 		len += (len + 63) / 64;
1280 	}
1281 
1282 	if (q->max_adjlen < len)
1283 		q->max_adjlen = len;
1284 	if (q->min_adjlen > len)
1285 		q->min_adjlen = len;
1286 
1287 	return len;
1288 }
1289 
1290 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1291 {
1292 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1293 	unsigned int hdr_len, last_len = 0;
1294 	u32 off = skb_network_offset(skb);
1295 	u32 len = qdisc_pkt_len(skb);
1296 	u16 segs = 1;
1297 
1298 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1299 
1300 	if (!shinfo->gso_size)
1301 		return cake_calc_overhead(q, len, off);
1302 
1303 	/* borrowed from qdisc_pkt_len_init() */
1304 	hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1305 
1306 	/* + transport layer */
1307 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1308 						SKB_GSO_TCPV6))) {
1309 		const struct tcphdr *th;
1310 		struct tcphdr _tcphdr;
1311 
1312 		th = skb_header_pointer(skb, skb_transport_offset(skb),
1313 					sizeof(_tcphdr), &_tcphdr);
1314 		if (likely(th))
1315 			hdr_len += __tcp_hdrlen(th);
1316 	} else {
1317 		struct udphdr _udphdr;
1318 
1319 		if (skb_header_pointer(skb, skb_transport_offset(skb),
1320 				       sizeof(_udphdr), &_udphdr))
1321 			hdr_len += sizeof(struct udphdr);
1322 	}
1323 
1324 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1325 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1326 				    shinfo->gso_size);
1327 	else
1328 		segs = shinfo->gso_segs;
1329 
1330 	len = shinfo->gso_size + hdr_len;
1331 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1332 
1333 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1334 		cake_calc_overhead(q, last_len, off));
1335 }
1336 
1337 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1338 {
1339 	struct cake_heap_entry ii = q->overflow_heap[i];
1340 	struct cake_heap_entry jj = q->overflow_heap[j];
1341 
1342 	q->overflow_heap[i] = jj;
1343 	q->overflow_heap[j] = ii;
1344 
1345 	q->tins[ii.t].overflow_idx[ii.b] = j;
1346 	q->tins[jj.t].overflow_idx[jj.b] = i;
1347 }
1348 
1349 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1350 {
1351 	struct cake_heap_entry ii = q->overflow_heap[i];
1352 
1353 	return q->tins[ii.t].backlogs[ii.b];
1354 }
1355 
1356 static void cake_heapify(struct cake_sched_data *q, u16 i)
1357 {
1358 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1359 	u32 mb = cake_heap_get_backlog(q, i);
1360 	u32 m = i;
1361 
1362 	while (m < a) {
1363 		u32 l = m + m + 1;
1364 		u32 r = l + 1;
1365 
1366 		if (l < a) {
1367 			u32 lb = cake_heap_get_backlog(q, l);
1368 
1369 			if (lb > mb) {
1370 				m  = l;
1371 				mb = lb;
1372 			}
1373 		}
1374 
1375 		if (r < a) {
1376 			u32 rb = cake_heap_get_backlog(q, r);
1377 
1378 			if (rb > mb) {
1379 				m  = r;
1380 				mb = rb;
1381 			}
1382 		}
1383 
1384 		if (m != i) {
1385 			cake_heap_swap(q, i, m);
1386 			i = m;
1387 		} else {
1388 			break;
1389 		}
1390 	}
1391 }
1392 
1393 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1394 {
1395 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1396 		u16 p = (i - 1) >> 1;
1397 		u32 ib = cake_heap_get_backlog(q, i);
1398 		u32 pb = cake_heap_get_backlog(q, p);
1399 
1400 		if (ib > pb) {
1401 			cake_heap_swap(q, i, p);
1402 			i = p;
1403 		} else {
1404 			break;
1405 		}
1406 	}
1407 }
1408 
1409 static int cake_advance_shaper(struct cake_sched_data *q,
1410 			       struct cake_tin_data *b,
1411 			       struct sk_buff *skb,
1412 			       ktime_t now, bool drop)
1413 {
1414 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1415 
1416 	/* charge packet bandwidth to this tin
1417 	 * and to the global shaper.
1418 	 */
1419 	if (q->rate_ns) {
1420 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1421 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1422 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1423 
1424 		if (ktime_before(b->time_next_packet, now))
1425 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1426 							   tin_dur);
1427 
1428 		else if (ktime_before(b->time_next_packet,
1429 				      ktime_add_ns(now, tin_dur)))
1430 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1431 
1432 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1433 						   global_dur);
1434 		if (!drop)
1435 			q->failsafe_next_packet = \
1436 				ktime_add_ns(q->failsafe_next_packet,
1437 					     failsafe_dur);
1438 	}
1439 	return len;
1440 }
1441 
1442 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1443 {
1444 	struct cake_sched_data *q = qdisc_priv(sch);
1445 	ktime_t now = ktime_get();
1446 	u32 idx = 0, tin = 0, len;
1447 	struct cake_heap_entry qq;
1448 	struct cake_tin_data *b;
1449 	struct cake_flow *flow;
1450 	struct sk_buff *skb;
1451 
1452 	if (!q->overflow_timeout) {
1453 		int i;
1454 		/* Build fresh max-heap */
1455 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1456 			cake_heapify(q, i);
1457 	}
1458 	q->overflow_timeout = 65535;
1459 
1460 	/* select longest queue for pruning */
1461 	qq  = q->overflow_heap[0];
1462 	tin = qq.t;
1463 	idx = qq.b;
1464 
1465 	b = &q->tins[tin];
1466 	flow = &b->flows[idx];
1467 	skb = dequeue_head(flow);
1468 	if (unlikely(!skb)) {
1469 		/* heap has gone wrong, rebuild it next time */
1470 		q->overflow_timeout = 0;
1471 		return idx + (tin << 16);
1472 	}
1473 
1474 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1475 		b->unresponsive_flow_count++;
1476 
1477 	len = qdisc_pkt_len(skb);
1478 	q->buffer_used      -= skb->truesize;
1479 	b->backlogs[idx]    -= len;
1480 	b->tin_backlog      -= len;
1481 	sch->qstats.backlog -= len;
1482 	qdisc_tree_reduce_backlog(sch, 1, len);
1483 
1484 	flow->dropped++;
1485 	b->tin_dropped++;
1486 	sch->qstats.drops++;
1487 
1488 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1489 		cake_advance_shaper(q, b, skb, now, true);
1490 
1491 	__qdisc_drop(skb, to_free);
1492 	sch->q.qlen--;
1493 
1494 	cake_heapify(q, 0);
1495 
1496 	return idx + (tin << 16);
1497 }
1498 
1499 static void cake_wash_diffserv(struct sk_buff *skb)
1500 {
1501 	switch (skb->protocol) {
1502 	case htons(ETH_P_IP):
1503 		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1504 		break;
1505 	case htons(ETH_P_IPV6):
1506 		ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1507 		break;
1508 	default:
1509 		break;
1510 	}
1511 }
1512 
1513 static u8 cake_handle_diffserv(struct sk_buff *skb, u16 wash)
1514 {
1515 	u8 dscp;
1516 
1517 	switch (skb->protocol) {
1518 	case htons(ETH_P_IP):
1519 		dscp = ipv4_get_dsfield(ip_hdr(skb)) >> 2;
1520 		if (wash && dscp)
1521 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1522 		return dscp;
1523 
1524 	case htons(ETH_P_IPV6):
1525 		dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> 2;
1526 		if (wash && dscp)
1527 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1528 		return dscp;
1529 
1530 	case htons(ETH_P_ARP):
1531 		return 0x38;  /* CS7 - Net Control */
1532 
1533 	default:
1534 		/* If there is no Diffserv field, treat as best-effort */
1535 		return 0;
1536 	}
1537 }
1538 
1539 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1540 					     struct sk_buff *skb)
1541 {
1542 	struct cake_sched_data *q = qdisc_priv(sch);
1543 	u32 tin;
1544 
1545 	if (TC_H_MAJ(skb->priority) == sch->handle &&
1546 	    TC_H_MIN(skb->priority) > 0 &&
1547 	    TC_H_MIN(skb->priority) <= q->tin_cnt) {
1548 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1549 
1550 		if (q->rate_flags & CAKE_FLAG_WASH)
1551 			cake_wash_diffserv(skb);
1552 	} else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) {
1553 		/* extract the Diffserv Precedence field, if it exists */
1554 		/* and clear DSCP bits if washing */
1555 		tin = q->tin_index[cake_handle_diffserv(skb,
1556 				q->rate_flags & CAKE_FLAG_WASH)];
1557 		if (unlikely(tin >= q->tin_cnt))
1558 			tin = 0;
1559 	} else {
1560 		tin = 0;
1561 		if (q->rate_flags & CAKE_FLAG_WASH)
1562 			cake_wash_diffserv(skb);
1563 	}
1564 
1565 	return &q->tins[tin];
1566 }
1567 
1568 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1569 			 struct sk_buff *skb, int flow_mode, int *qerr)
1570 {
1571 	struct cake_sched_data *q = qdisc_priv(sch);
1572 	struct tcf_proto *filter;
1573 	struct tcf_result res;
1574 	u32 flow = 0;
1575 	int result;
1576 
1577 	filter = rcu_dereference_bh(q->filter_list);
1578 	if (!filter)
1579 		goto hash;
1580 
1581 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1582 	result = tcf_classify(skb, filter, &res, false);
1583 
1584 	if (result >= 0) {
1585 #ifdef CONFIG_NET_CLS_ACT
1586 		switch (result) {
1587 		case TC_ACT_STOLEN:
1588 		case TC_ACT_QUEUED:
1589 		case TC_ACT_TRAP:
1590 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1591 			/* fall through */
1592 		case TC_ACT_SHOT:
1593 			return 0;
1594 		}
1595 #endif
1596 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1597 			flow = TC_H_MIN(res.classid);
1598 	}
1599 hash:
1600 	*t = cake_select_tin(sch, skb);
1601 	return flow ?: cake_hash(*t, skb, flow_mode) + 1;
1602 }
1603 
1604 static void cake_reconfigure(struct Qdisc *sch);
1605 
1606 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1607 			struct sk_buff **to_free)
1608 {
1609 	struct cake_sched_data *q = qdisc_priv(sch);
1610 	int len = qdisc_pkt_len(skb);
1611 	int uninitialized_var(ret);
1612 	struct sk_buff *ack = NULL;
1613 	ktime_t now = ktime_get();
1614 	struct cake_tin_data *b;
1615 	struct cake_flow *flow;
1616 	u32 idx;
1617 
1618 	/* choose flow to insert into */
1619 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1620 	if (idx == 0) {
1621 		if (ret & __NET_XMIT_BYPASS)
1622 			qdisc_qstats_drop(sch);
1623 		__qdisc_drop(skb, to_free);
1624 		return ret;
1625 	}
1626 	idx--;
1627 	flow = &b->flows[idx];
1628 
1629 	/* ensure shaper state isn't stale */
1630 	if (!b->tin_backlog) {
1631 		if (ktime_before(b->time_next_packet, now))
1632 			b->time_next_packet = now;
1633 
1634 		if (!sch->q.qlen) {
1635 			if (ktime_before(q->time_next_packet, now)) {
1636 				q->failsafe_next_packet = now;
1637 				q->time_next_packet = now;
1638 			} else if (ktime_after(q->time_next_packet, now) &&
1639 				   ktime_after(q->failsafe_next_packet, now)) {
1640 				u64 next = \
1641 					min(ktime_to_ns(q->time_next_packet),
1642 					    ktime_to_ns(
1643 						   q->failsafe_next_packet));
1644 				sch->qstats.overlimits++;
1645 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1646 			}
1647 		}
1648 	}
1649 
1650 	if (unlikely(len > b->max_skblen))
1651 		b->max_skblen = len;
1652 
1653 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1654 		struct sk_buff *segs, *nskb;
1655 		netdev_features_t features = netif_skb_features(skb);
1656 		unsigned int slen = 0;
1657 
1658 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1659 		if (IS_ERR_OR_NULL(segs))
1660 			return qdisc_drop(skb, sch, to_free);
1661 
1662 		while (segs) {
1663 			nskb = segs->next;
1664 			segs->next = NULL;
1665 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1666 			cobalt_set_enqueue_time(segs, now);
1667 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1668 									  segs);
1669 			flow_queue_add(flow, segs);
1670 
1671 			sch->q.qlen++;
1672 			slen += segs->len;
1673 			q->buffer_used += segs->truesize;
1674 			b->packets++;
1675 			segs = nskb;
1676 		}
1677 
1678 		/* stats */
1679 		b->bytes	    += slen;
1680 		b->backlogs[idx]    += slen;
1681 		b->tin_backlog      += slen;
1682 		sch->qstats.backlog += slen;
1683 		q->avg_window_bytes += slen;
1684 
1685 		qdisc_tree_reduce_backlog(sch, 1, len);
1686 		consume_skb(skb);
1687 	} else {
1688 		/* not splitting */
1689 		cobalt_set_enqueue_time(skb, now);
1690 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1691 		flow_queue_add(flow, skb);
1692 
1693 		if (q->ack_filter)
1694 			ack = cake_ack_filter(q, flow);
1695 
1696 		if (ack) {
1697 			b->ack_drops++;
1698 			sch->qstats.drops++;
1699 			b->bytes += qdisc_pkt_len(ack);
1700 			len -= qdisc_pkt_len(ack);
1701 			q->buffer_used += skb->truesize - ack->truesize;
1702 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1703 				cake_advance_shaper(q, b, ack, now, true);
1704 
1705 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1706 			consume_skb(ack);
1707 		} else {
1708 			sch->q.qlen++;
1709 			q->buffer_used      += skb->truesize;
1710 		}
1711 
1712 		/* stats */
1713 		b->packets++;
1714 		b->bytes	    += len;
1715 		b->backlogs[idx]    += len;
1716 		b->tin_backlog      += len;
1717 		sch->qstats.backlog += len;
1718 		q->avg_window_bytes += len;
1719 	}
1720 
1721 	if (q->overflow_timeout)
1722 		cake_heapify_up(q, b->overflow_idx[idx]);
1723 
1724 	/* incoming bandwidth capacity estimate */
1725 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1726 		u64 packet_interval = \
1727 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1728 
1729 		if (packet_interval > NSEC_PER_SEC)
1730 			packet_interval = NSEC_PER_SEC;
1731 
1732 		/* filter out short-term bursts, eg. wifi aggregation */
1733 		q->avg_packet_interval = \
1734 			cake_ewma(q->avg_packet_interval,
1735 				  packet_interval,
1736 				  (packet_interval > q->avg_packet_interval ?
1737 					  2 : 8));
1738 
1739 		q->last_packet_time = now;
1740 
1741 		if (packet_interval > q->avg_packet_interval) {
1742 			u64 window_interval = \
1743 				ktime_to_ns(ktime_sub(now,
1744 						      q->avg_window_begin));
1745 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1746 
1747 			do_div(b, window_interval);
1748 			q->avg_peak_bandwidth =
1749 				cake_ewma(q->avg_peak_bandwidth, b,
1750 					  b > q->avg_peak_bandwidth ? 2 : 8);
1751 			q->avg_window_bytes = 0;
1752 			q->avg_window_begin = now;
1753 
1754 			if (ktime_after(now,
1755 					ktime_add_ms(q->last_reconfig_time,
1756 						     250))) {
1757 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1758 				cake_reconfigure(sch);
1759 			}
1760 		}
1761 	} else {
1762 		q->avg_window_bytes = 0;
1763 		q->last_packet_time = now;
1764 	}
1765 
1766 	/* flowchain */
1767 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1768 		struct cake_host *srchost = &b->hosts[flow->srchost];
1769 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1770 		u16 host_load = 1;
1771 
1772 		if (!flow->set) {
1773 			list_add_tail(&flow->flowchain, &b->new_flows);
1774 		} else {
1775 			b->decaying_flow_count--;
1776 			list_move_tail(&flow->flowchain, &b->new_flows);
1777 		}
1778 		flow->set = CAKE_SET_SPARSE;
1779 		b->sparse_flow_count++;
1780 
1781 		if (cake_dsrc(q->flow_mode))
1782 			host_load = max(host_load, srchost->srchost_refcnt);
1783 
1784 		if (cake_ddst(q->flow_mode))
1785 			host_load = max(host_load, dsthost->dsthost_refcnt);
1786 
1787 		flow->deficit = (b->flow_quantum *
1788 				 quantum_div[host_load]) >> 16;
1789 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1790 		/* this flow was empty, accounted as a sparse flow, but actually
1791 		 * in the bulk rotation.
1792 		 */
1793 		flow->set = CAKE_SET_BULK;
1794 		b->sparse_flow_count--;
1795 		b->bulk_flow_count++;
1796 	}
1797 
1798 	if (q->buffer_used > q->buffer_max_used)
1799 		q->buffer_max_used = q->buffer_used;
1800 
1801 	if (q->buffer_used > q->buffer_limit) {
1802 		u32 dropped = 0;
1803 
1804 		while (q->buffer_used > q->buffer_limit) {
1805 			dropped++;
1806 			cake_drop(sch, to_free);
1807 		}
1808 		b->drop_overlimit += dropped;
1809 	}
1810 	return NET_XMIT_SUCCESS;
1811 }
1812 
1813 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1814 {
1815 	struct cake_sched_data *q = qdisc_priv(sch);
1816 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1817 	struct cake_flow *flow = &b->flows[q->cur_flow];
1818 	struct sk_buff *skb = NULL;
1819 	u32 len;
1820 
1821 	if (flow->head) {
1822 		skb = dequeue_head(flow);
1823 		len = qdisc_pkt_len(skb);
1824 		b->backlogs[q->cur_flow] -= len;
1825 		b->tin_backlog		 -= len;
1826 		sch->qstats.backlog      -= len;
1827 		q->buffer_used		 -= skb->truesize;
1828 		sch->q.qlen--;
1829 
1830 		if (q->overflow_timeout)
1831 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1832 	}
1833 	return skb;
1834 }
1835 
1836 /* Discard leftover packets from a tin no longer in use. */
1837 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1838 {
1839 	struct cake_sched_data *q = qdisc_priv(sch);
1840 	struct sk_buff *skb;
1841 
1842 	q->cur_tin = tin;
1843 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1844 		while (!!(skb = cake_dequeue_one(sch)))
1845 			kfree_skb(skb);
1846 }
1847 
1848 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1849 {
1850 	struct cake_sched_data *q = qdisc_priv(sch);
1851 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1852 	struct cake_host *srchost, *dsthost;
1853 	ktime_t now = ktime_get();
1854 	struct cake_flow *flow;
1855 	struct list_head *head;
1856 	bool first_flow = true;
1857 	struct sk_buff *skb;
1858 	u16 host_load;
1859 	u64 delay;
1860 	u32 len;
1861 
1862 begin:
1863 	if (!sch->q.qlen)
1864 		return NULL;
1865 
1866 	/* global hard shaper */
1867 	if (ktime_after(q->time_next_packet, now) &&
1868 	    ktime_after(q->failsafe_next_packet, now)) {
1869 		u64 next = min(ktime_to_ns(q->time_next_packet),
1870 			       ktime_to_ns(q->failsafe_next_packet));
1871 
1872 		sch->qstats.overlimits++;
1873 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1874 		return NULL;
1875 	}
1876 
1877 	/* Choose a class to work on. */
1878 	if (!q->rate_ns) {
1879 		/* In unlimited mode, can't rely on shaper timings, just balance
1880 		 * with DRR
1881 		 */
1882 		bool wrapped = false, empty = true;
1883 
1884 		while (b->tin_deficit < 0 ||
1885 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1886 			if (b->tin_deficit <= 0)
1887 				b->tin_deficit += b->tin_quantum_band;
1888 			if (b->sparse_flow_count + b->bulk_flow_count)
1889 				empty = false;
1890 
1891 			q->cur_tin++;
1892 			b++;
1893 			if (q->cur_tin >= q->tin_cnt) {
1894 				q->cur_tin = 0;
1895 				b = q->tins;
1896 
1897 				if (wrapped) {
1898 					/* It's possible for q->qlen to be
1899 					 * nonzero when we actually have no
1900 					 * packets anywhere.
1901 					 */
1902 					if (empty)
1903 						return NULL;
1904 				} else {
1905 					wrapped = true;
1906 				}
1907 			}
1908 		}
1909 	} else {
1910 		/* In shaped mode, choose:
1911 		 * - Highest-priority tin with queue and meeting schedule, or
1912 		 * - The earliest-scheduled tin with queue.
1913 		 */
1914 		ktime_t best_time = KTIME_MAX;
1915 		int tin, best_tin = 0;
1916 
1917 		for (tin = 0; tin < q->tin_cnt; tin++) {
1918 			b = q->tins + tin;
1919 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
1920 				ktime_t time_to_pkt = \
1921 					ktime_sub(b->time_next_packet, now);
1922 
1923 				if (ktime_to_ns(time_to_pkt) <= 0 ||
1924 				    ktime_compare(time_to_pkt,
1925 						  best_time) <= 0) {
1926 					best_time = time_to_pkt;
1927 					best_tin = tin;
1928 				}
1929 			}
1930 		}
1931 
1932 		q->cur_tin = best_tin;
1933 		b = q->tins + best_tin;
1934 
1935 		/* No point in going further if no packets to deliver. */
1936 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
1937 			return NULL;
1938 	}
1939 
1940 retry:
1941 	/* service this class */
1942 	head = &b->decaying_flows;
1943 	if (!first_flow || list_empty(head)) {
1944 		head = &b->new_flows;
1945 		if (list_empty(head)) {
1946 			head = &b->old_flows;
1947 			if (unlikely(list_empty(head))) {
1948 				head = &b->decaying_flows;
1949 				if (unlikely(list_empty(head)))
1950 					goto begin;
1951 			}
1952 		}
1953 	}
1954 	flow = list_first_entry(head, struct cake_flow, flowchain);
1955 	q->cur_flow = flow - b->flows;
1956 	first_flow = false;
1957 
1958 	/* triple isolation (modified DRR++) */
1959 	srchost = &b->hosts[flow->srchost];
1960 	dsthost = &b->hosts[flow->dsthost];
1961 	host_load = 1;
1962 
1963 	if (cake_dsrc(q->flow_mode))
1964 		host_load = max(host_load, srchost->srchost_refcnt);
1965 
1966 	if (cake_ddst(q->flow_mode))
1967 		host_load = max(host_load, dsthost->dsthost_refcnt);
1968 
1969 	WARN_ON(host_load > CAKE_QUEUES);
1970 
1971 	/* flow isolation (DRR++) */
1972 	if (flow->deficit <= 0) {
1973 		/* The shifted prandom_u32() is a way to apply dithering to
1974 		 * avoid accumulating roundoff errors
1975 		 */
1976 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
1977 				  (prandom_u32() >> 16)) >> 16;
1978 		list_move_tail(&flow->flowchain, &b->old_flows);
1979 
1980 		/* Keep all flows with deficits out of the sparse and decaying
1981 		 * rotations.  No non-empty flow can go into the decaying
1982 		 * rotation, so they can't get deficits
1983 		 */
1984 		if (flow->set == CAKE_SET_SPARSE) {
1985 			if (flow->head) {
1986 				b->sparse_flow_count--;
1987 				b->bulk_flow_count++;
1988 				flow->set = CAKE_SET_BULK;
1989 			} else {
1990 				/* we've moved it to the bulk rotation for
1991 				 * correct deficit accounting but we still want
1992 				 * to count it as a sparse flow, not a bulk one.
1993 				 */
1994 				flow->set = CAKE_SET_SPARSE_WAIT;
1995 			}
1996 		}
1997 		goto retry;
1998 	}
1999 
2000 	/* Retrieve a packet via the AQM */
2001 	while (1) {
2002 		skb = cake_dequeue_one(sch);
2003 		if (!skb) {
2004 			/* this queue was actually empty */
2005 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2006 				b->unresponsive_flow_count--;
2007 
2008 			if (flow->cvars.p_drop || flow->cvars.count ||
2009 			    ktime_before(now, flow->cvars.drop_next)) {
2010 				/* keep in the flowchain until the state has
2011 				 * decayed to rest
2012 				 */
2013 				list_move_tail(&flow->flowchain,
2014 					       &b->decaying_flows);
2015 				if (flow->set == CAKE_SET_BULK) {
2016 					b->bulk_flow_count--;
2017 					b->decaying_flow_count++;
2018 				} else if (flow->set == CAKE_SET_SPARSE ||
2019 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2020 					b->sparse_flow_count--;
2021 					b->decaying_flow_count++;
2022 				}
2023 				flow->set = CAKE_SET_DECAYING;
2024 			} else {
2025 				/* remove empty queue from the flowchain */
2026 				list_del_init(&flow->flowchain);
2027 				if (flow->set == CAKE_SET_SPARSE ||
2028 				    flow->set == CAKE_SET_SPARSE_WAIT)
2029 					b->sparse_flow_count--;
2030 				else if (flow->set == CAKE_SET_BULK)
2031 					b->bulk_flow_count--;
2032 				else
2033 					b->decaying_flow_count--;
2034 
2035 				flow->set = CAKE_SET_NONE;
2036 				srchost->srchost_refcnt--;
2037 				dsthost->dsthost_refcnt--;
2038 			}
2039 			goto begin;
2040 		}
2041 
2042 		/* Last packet in queue may be marked, shouldn't be dropped */
2043 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2044 					(b->bulk_flow_count *
2045 					 !!(q->rate_flags &
2046 					    CAKE_FLAG_INGRESS))) ||
2047 		    !flow->head)
2048 			break;
2049 
2050 		/* drop this packet, get another one */
2051 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2052 			len = cake_advance_shaper(q, b, skb,
2053 						  now, true);
2054 			flow->deficit -= len;
2055 			b->tin_deficit -= len;
2056 		}
2057 		flow->dropped++;
2058 		b->tin_dropped++;
2059 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2060 		qdisc_qstats_drop(sch);
2061 		kfree_skb(skb);
2062 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2063 			goto retry;
2064 	}
2065 
2066 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2067 	qdisc_bstats_update(sch, skb);
2068 
2069 	/* collect delay stats */
2070 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2071 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2072 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2073 				  delay > b->peak_delay ? 2 : 8);
2074 	b->base_delay = cake_ewma(b->base_delay, delay,
2075 				  delay < b->base_delay ? 2 : 8);
2076 
2077 	len = cake_advance_shaper(q, b, skb, now, false);
2078 	flow->deficit -= len;
2079 	b->tin_deficit -= len;
2080 
2081 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2082 		u64 next = min(ktime_to_ns(q->time_next_packet),
2083 			       ktime_to_ns(q->failsafe_next_packet));
2084 
2085 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2086 	} else if (!sch->q.qlen) {
2087 		int i;
2088 
2089 		for (i = 0; i < q->tin_cnt; i++) {
2090 			if (q->tins[i].decaying_flow_count) {
2091 				ktime_t next = \
2092 					ktime_add_ns(now,
2093 						     q->tins[i].cparams.target);
2094 
2095 				qdisc_watchdog_schedule_ns(&q->watchdog,
2096 							   ktime_to_ns(next));
2097 				break;
2098 			}
2099 		}
2100 	}
2101 
2102 	if (q->overflow_timeout)
2103 		q->overflow_timeout--;
2104 
2105 	return skb;
2106 }
2107 
2108 static void cake_reset(struct Qdisc *sch)
2109 {
2110 	u32 c;
2111 
2112 	for (c = 0; c < CAKE_MAX_TINS; c++)
2113 		cake_clear_tin(sch, c);
2114 }
2115 
2116 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2117 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2118 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2119 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2120 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2121 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2122 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2123 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2124 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2125 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2126 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2127 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2128 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2129 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2130 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2131 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2132 };
2133 
2134 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2135 			  u64 target_ns, u64 rtt_est_ns)
2136 {
2137 	/* convert byte-rate into time-per-byte
2138 	 * so it will always unwedge in reasonable time.
2139 	 */
2140 	static const u64 MIN_RATE = 64;
2141 	u32 byte_target = mtu;
2142 	u64 byte_target_ns;
2143 	u8  rate_shft = 0;
2144 	u64 rate_ns = 0;
2145 
2146 	b->flow_quantum = 1514;
2147 	if (rate) {
2148 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2149 		rate_shft = 34;
2150 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2151 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2152 		while (!!(rate_ns >> 34)) {
2153 			rate_ns >>= 1;
2154 			rate_shft--;
2155 		}
2156 	} /* else unlimited, ie. zero delay */
2157 
2158 	b->tin_rate_bps  = rate;
2159 	b->tin_rate_ns   = rate_ns;
2160 	b->tin_rate_shft = rate_shft;
2161 
2162 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2163 
2164 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2165 	b->cparams.interval = max(rtt_est_ns +
2166 				     b->cparams.target - target_ns,
2167 				     b->cparams.target * 2);
2168 	b->cparams.mtu_time = byte_target_ns;
2169 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2170 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2171 }
2172 
2173 static int cake_config_besteffort(struct Qdisc *sch)
2174 {
2175 	struct cake_sched_data *q = qdisc_priv(sch);
2176 	struct cake_tin_data *b = &q->tins[0];
2177 	u32 mtu = psched_mtu(qdisc_dev(sch));
2178 	u64 rate = q->rate_bps;
2179 
2180 	q->tin_cnt = 1;
2181 
2182 	q->tin_index = besteffort;
2183 	q->tin_order = normal_order;
2184 
2185 	cake_set_rate(b, rate, mtu,
2186 		      us_to_ns(q->target), us_to_ns(q->interval));
2187 	b->tin_quantum_band = 65535;
2188 	b->tin_quantum_prio = 65535;
2189 
2190 	return 0;
2191 }
2192 
2193 static int cake_config_precedence(struct Qdisc *sch)
2194 {
2195 	/* convert high-level (user visible) parameters into internal format */
2196 	struct cake_sched_data *q = qdisc_priv(sch);
2197 	u32 mtu = psched_mtu(qdisc_dev(sch));
2198 	u64 rate = q->rate_bps;
2199 	u32 quantum1 = 256;
2200 	u32 quantum2 = 256;
2201 	u32 i;
2202 
2203 	q->tin_cnt = 8;
2204 	q->tin_index = precedence;
2205 	q->tin_order = normal_order;
2206 
2207 	for (i = 0; i < q->tin_cnt; i++) {
2208 		struct cake_tin_data *b = &q->tins[i];
2209 
2210 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2211 			      us_to_ns(q->interval));
2212 
2213 		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2214 		b->tin_quantum_band = max_t(u16, 1U, quantum2);
2215 
2216 		/* calculate next class's parameters */
2217 		rate  *= 7;
2218 		rate >>= 3;
2219 
2220 		quantum1  *= 3;
2221 		quantum1 >>= 1;
2222 
2223 		quantum2  *= 7;
2224 		quantum2 >>= 3;
2225 	}
2226 
2227 	return 0;
2228 }
2229 
2230 /*	List of known Diffserv codepoints:
2231  *
2232  *	Least Effort (CS1)
2233  *	Best Effort (CS0)
2234  *	Max Reliability & LLT "Lo" (TOS1)
2235  *	Max Throughput (TOS2)
2236  *	Min Delay (TOS4)
2237  *	LLT "La" (TOS5)
2238  *	Assured Forwarding 1 (AF1x) - x3
2239  *	Assured Forwarding 2 (AF2x) - x3
2240  *	Assured Forwarding 3 (AF3x) - x3
2241  *	Assured Forwarding 4 (AF4x) - x3
2242  *	Precedence Class 2 (CS2)
2243  *	Precedence Class 3 (CS3)
2244  *	Precedence Class 4 (CS4)
2245  *	Precedence Class 5 (CS5)
2246  *	Precedence Class 6 (CS6)
2247  *	Precedence Class 7 (CS7)
2248  *	Voice Admit (VA)
2249  *	Expedited Forwarding (EF)
2250 
2251  *	Total 25 codepoints.
2252  */
2253 
2254 /*	List of traffic classes in RFC 4594:
2255  *		(roughly descending order of contended priority)
2256  *		(roughly ascending order of uncontended throughput)
2257  *
2258  *	Network Control (CS6,CS7)      - routing traffic
2259  *	Telephony (EF,VA)         - aka. VoIP streams
2260  *	Signalling (CS5)               - VoIP setup
2261  *	Multimedia Conferencing (AF4x) - aka. video calls
2262  *	Realtime Interactive (CS4)     - eg. games
2263  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2264  *	Broadcast Video (CS3)
2265  *	Low Latency Data (AF2x,TOS4)      - eg. database
2266  *	Ops, Admin, Management (CS2,TOS1) - eg. ssh
2267  *	Standard Service (CS0 & unrecognised codepoints)
2268  *	High Throughput Data (AF1x,TOS2)  - eg. web traffic
2269  *	Low Priority Data (CS1)           - eg. BitTorrent
2270 
2271  *	Total 12 traffic classes.
2272  */
2273 
2274 static int cake_config_diffserv8(struct Qdisc *sch)
2275 {
2276 /*	Pruned list of traffic classes for typical applications:
2277  *
2278  *		Network Control          (CS6, CS7)
2279  *		Minimum Latency          (EF, VA, CS5, CS4)
2280  *		Interactive Shell        (CS2, TOS1)
2281  *		Low Latency Transactions (AF2x, TOS4)
2282  *		Video Streaming          (AF4x, AF3x, CS3)
2283  *		Bog Standard             (CS0 etc.)
2284  *		High Throughput          (AF1x, TOS2)
2285  *		Background Traffic       (CS1)
2286  *
2287  *		Total 8 traffic classes.
2288  */
2289 
2290 	struct cake_sched_data *q = qdisc_priv(sch);
2291 	u32 mtu = psched_mtu(qdisc_dev(sch));
2292 	u64 rate = q->rate_bps;
2293 	u32 quantum1 = 256;
2294 	u32 quantum2 = 256;
2295 	u32 i;
2296 
2297 	q->tin_cnt = 8;
2298 
2299 	/* codepoint to class mapping */
2300 	q->tin_index = diffserv8;
2301 	q->tin_order = normal_order;
2302 
2303 	/* class characteristics */
2304 	for (i = 0; i < q->tin_cnt; i++) {
2305 		struct cake_tin_data *b = &q->tins[i];
2306 
2307 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2308 			      us_to_ns(q->interval));
2309 
2310 		b->tin_quantum_prio = max_t(u16, 1U, quantum1);
2311 		b->tin_quantum_band = max_t(u16, 1U, quantum2);
2312 
2313 		/* calculate next class's parameters */
2314 		rate  *= 7;
2315 		rate >>= 3;
2316 
2317 		quantum1  *= 3;
2318 		quantum1 >>= 1;
2319 
2320 		quantum2  *= 7;
2321 		quantum2 >>= 3;
2322 	}
2323 
2324 	return 0;
2325 }
2326 
2327 static int cake_config_diffserv4(struct Qdisc *sch)
2328 {
2329 /*  Further pruned list of traffic classes for four-class system:
2330  *
2331  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2332  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2333  *	    Best Effort        (CS0, AF1x, TOS2, and those not specified)
2334  *	    Background Traffic (CS1)
2335  *
2336  *		Total 4 traffic classes.
2337  */
2338 
2339 	struct cake_sched_data *q = qdisc_priv(sch);
2340 	u32 mtu = psched_mtu(qdisc_dev(sch));
2341 	u64 rate = q->rate_bps;
2342 	u32 quantum = 1024;
2343 
2344 	q->tin_cnt = 4;
2345 
2346 	/* codepoint to class mapping */
2347 	q->tin_index = diffserv4;
2348 	q->tin_order = bulk_order;
2349 
2350 	/* class characteristics */
2351 	cake_set_rate(&q->tins[0], rate, mtu,
2352 		      us_to_ns(q->target), us_to_ns(q->interval));
2353 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2354 		      us_to_ns(q->target), us_to_ns(q->interval));
2355 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2356 		      us_to_ns(q->target), us_to_ns(q->interval));
2357 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2358 		      us_to_ns(q->target), us_to_ns(q->interval));
2359 
2360 	/* priority weights */
2361 	q->tins[0].tin_quantum_prio = quantum;
2362 	q->tins[1].tin_quantum_prio = quantum >> 4;
2363 	q->tins[2].tin_quantum_prio = quantum << 2;
2364 	q->tins[3].tin_quantum_prio = quantum << 4;
2365 
2366 	/* bandwidth-sharing weights */
2367 	q->tins[0].tin_quantum_band = quantum;
2368 	q->tins[1].tin_quantum_band = quantum >> 4;
2369 	q->tins[2].tin_quantum_band = quantum >> 1;
2370 	q->tins[3].tin_quantum_band = quantum >> 2;
2371 
2372 	return 0;
2373 }
2374 
2375 static int cake_config_diffserv3(struct Qdisc *sch)
2376 {
2377 /*  Simplified Diffserv structure with 3 tins.
2378  *		Low Priority		(CS1)
2379  *		Best Effort
2380  *		Latency Sensitive	(TOS4, VA, EF, CS6, CS7)
2381  */
2382 	struct cake_sched_data *q = qdisc_priv(sch);
2383 	u32 mtu = psched_mtu(qdisc_dev(sch));
2384 	u64 rate = q->rate_bps;
2385 	u32 quantum = 1024;
2386 
2387 	q->tin_cnt = 3;
2388 
2389 	/* codepoint to class mapping */
2390 	q->tin_index = diffserv3;
2391 	q->tin_order = bulk_order;
2392 
2393 	/* class characteristics */
2394 	cake_set_rate(&q->tins[0], rate, mtu,
2395 		      us_to_ns(q->target), us_to_ns(q->interval));
2396 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2397 		      us_to_ns(q->target), us_to_ns(q->interval));
2398 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2399 		      us_to_ns(q->target), us_to_ns(q->interval));
2400 
2401 	/* priority weights */
2402 	q->tins[0].tin_quantum_prio = quantum;
2403 	q->tins[1].tin_quantum_prio = quantum >> 4;
2404 	q->tins[2].tin_quantum_prio = quantum << 4;
2405 
2406 	/* bandwidth-sharing weights */
2407 	q->tins[0].tin_quantum_band = quantum;
2408 	q->tins[1].tin_quantum_band = quantum >> 4;
2409 	q->tins[2].tin_quantum_band = quantum >> 2;
2410 
2411 	return 0;
2412 }
2413 
2414 static void cake_reconfigure(struct Qdisc *sch)
2415 {
2416 	struct cake_sched_data *q = qdisc_priv(sch);
2417 	int c, ft;
2418 
2419 	switch (q->tin_mode) {
2420 	case CAKE_DIFFSERV_BESTEFFORT:
2421 		ft = cake_config_besteffort(sch);
2422 		break;
2423 
2424 	case CAKE_DIFFSERV_PRECEDENCE:
2425 		ft = cake_config_precedence(sch);
2426 		break;
2427 
2428 	case CAKE_DIFFSERV_DIFFSERV8:
2429 		ft = cake_config_diffserv8(sch);
2430 		break;
2431 
2432 	case CAKE_DIFFSERV_DIFFSERV4:
2433 		ft = cake_config_diffserv4(sch);
2434 		break;
2435 
2436 	case CAKE_DIFFSERV_DIFFSERV3:
2437 	default:
2438 		ft = cake_config_diffserv3(sch);
2439 		break;
2440 	}
2441 
2442 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2443 		cake_clear_tin(sch, c);
2444 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2445 	}
2446 
2447 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2448 	q->rate_shft = q->tins[ft].tin_rate_shft;
2449 
2450 	if (q->buffer_config_limit) {
2451 		q->buffer_limit = q->buffer_config_limit;
2452 	} else if (q->rate_bps) {
2453 		u64 t = q->rate_bps * q->interval;
2454 
2455 		do_div(t, USEC_PER_SEC / 4);
2456 		q->buffer_limit = max_t(u32, t, 4U << 20);
2457 	} else {
2458 		q->buffer_limit = ~0;
2459 	}
2460 
2461 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2462 
2463 	q->buffer_limit = min(q->buffer_limit,
2464 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2465 				  q->buffer_config_limit));
2466 }
2467 
2468 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2469 		       struct netlink_ext_ack *extack)
2470 {
2471 	struct cake_sched_data *q = qdisc_priv(sch);
2472 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2473 	int err;
2474 
2475 	if (!opt)
2476 		return -EINVAL;
2477 
2478 	err = nla_parse_nested(tb, TCA_CAKE_MAX, opt, cake_policy, extack);
2479 	if (err < 0)
2480 		return err;
2481 
2482 	if (tb[TCA_CAKE_NAT]) {
2483 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2484 		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2485 		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2486 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2487 #else
2488 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2489 				    "No conntrack support in kernel");
2490 		return -EOPNOTSUPP;
2491 #endif
2492 	}
2493 
2494 	if (tb[TCA_CAKE_BASE_RATE64])
2495 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2496 
2497 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2498 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2499 
2500 	if (tb[TCA_CAKE_WASH]) {
2501 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2502 			q->rate_flags |= CAKE_FLAG_WASH;
2503 		else
2504 			q->rate_flags &= ~CAKE_FLAG_WASH;
2505 	}
2506 
2507 	if (tb[TCA_CAKE_FLOW_MODE])
2508 		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2509 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2510 					CAKE_FLOW_MASK));
2511 
2512 	if (tb[TCA_CAKE_ATM])
2513 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2514 
2515 	if (tb[TCA_CAKE_OVERHEAD]) {
2516 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2517 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2518 
2519 		q->max_netlen = 0;
2520 		q->max_adjlen = 0;
2521 		q->min_netlen = ~0;
2522 		q->min_adjlen = ~0;
2523 	}
2524 
2525 	if (tb[TCA_CAKE_RAW]) {
2526 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2527 
2528 		q->max_netlen = 0;
2529 		q->max_adjlen = 0;
2530 		q->min_netlen = ~0;
2531 		q->min_adjlen = ~0;
2532 	}
2533 
2534 	if (tb[TCA_CAKE_MPU])
2535 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2536 
2537 	if (tb[TCA_CAKE_RTT]) {
2538 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2539 
2540 		if (!q->interval)
2541 			q->interval = 1;
2542 	}
2543 
2544 	if (tb[TCA_CAKE_TARGET]) {
2545 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2546 
2547 		if (!q->target)
2548 			q->target = 1;
2549 	}
2550 
2551 	if (tb[TCA_CAKE_AUTORATE]) {
2552 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2553 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2554 		else
2555 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2556 	}
2557 
2558 	if (tb[TCA_CAKE_INGRESS]) {
2559 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2560 			q->rate_flags |= CAKE_FLAG_INGRESS;
2561 		else
2562 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2563 	}
2564 
2565 	if (tb[TCA_CAKE_ACK_FILTER])
2566 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2567 
2568 	if (tb[TCA_CAKE_MEMORY])
2569 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2570 
2571 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2572 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2573 			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2574 		else
2575 			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2576 	}
2577 
2578 	if (q->tins) {
2579 		sch_tree_lock(sch);
2580 		cake_reconfigure(sch);
2581 		sch_tree_unlock(sch);
2582 	}
2583 
2584 	return 0;
2585 }
2586 
2587 static void cake_destroy(struct Qdisc *sch)
2588 {
2589 	struct cake_sched_data *q = qdisc_priv(sch);
2590 
2591 	qdisc_watchdog_cancel(&q->watchdog);
2592 	tcf_block_put(q->block);
2593 	kvfree(q->tins);
2594 }
2595 
2596 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2597 		     struct netlink_ext_ack *extack)
2598 {
2599 	struct cake_sched_data *q = qdisc_priv(sch);
2600 	int i, j, err;
2601 
2602 	sch->limit = 10240;
2603 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2604 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2605 
2606 	q->rate_bps = 0; /* unlimited by default */
2607 
2608 	q->interval = 100000; /* 100ms default */
2609 	q->target   =   5000; /* 5ms: codel RFC argues
2610 			       * for 5 to 10% of interval
2611 			       */
2612 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2613 	q->cur_tin = 0;
2614 	q->cur_flow  = 0;
2615 
2616 	qdisc_watchdog_init(&q->watchdog, sch);
2617 
2618 	if (opt) {
2619 		int err = cake_change(sch, opt, extack);
2620 
2621 		if (err)
2622 			return err;
2623 	}
2624 
2625 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2626 	if (err)
2627 		return err;
2628 
2629 	quantum_div[0] = ~0;
2630 	for (i = 1; i <= CAKE_QUEUES; i++)
2631 		quantum_div[i] = 65535 / i;
2632 
2633 	q->tins = kvzalloc(CAKE_MAX_TINS * sizeof(struct cake_tin_data),
2634 			   GFP_KERNEL);
2635 	if (!q->tins)
2636 		goto nomem;
2637 
2638 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2639 		struct cake_tin_data *b = q->tins + i;
2640 
2641 		INIT_LIST_HEAD(&b->new_flows);
2642 		INIT_LIST_HEAD(&b->old_flows);
2643 		INIT_LIST_HEAD(&b->decaying_flows);
2644 		b->sparse_flow_count = 0;
2645 		b->bulk_flow_count = 0;
2646 		b->decaying_flow_count = 0;
2647 
2648 		for (j = 0; j < CAKE_QUEUES; j++) {
2649 			struct cake_flow *flow = b->flows + j;
2650 			u32 k = j * CAKE_MAX_TINS + i;
2651 
2652 			INIT_LIST_HEAD(&flow->flowchain);
2653 			cobalt_vars_init(&flow->cvars);
2654 
2655 			q->overflow_heap[k].t = i;
2656 			q->overflow_heap[k].b = j;
2657 			b->overflow_idx[j] = k;
2658 		}
2659 	}
2660 
2661 	cake_reconfigure(sch);
2662 	q->avg_peak_bandwidth = q->rate_bps;
2663 	q->min_netlen = ~0;
2664 	q->min_adjlen = ~0;
2665 	return 0;
2666 
2667 nomem:
2668 	cake_destroy(sch);
2669 	return -ENOMEM;
2670 }
2671 
2672 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2673 {
2674 	struct cake_sched_data *q = qdisc_priv(sch);
2675 	struct nlattr *opts;
2676 
2677 	opts = nla_nest_start(skb, TCA_OPTIONS);
2678 	if (!opts)
2679 		goto nla_put_failure;
2680 
2681 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2682 			      TCA_CAKE_PAD))
2683 		goto nla_put_failure;
2684 
2685 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2686 			q->flow_mode & CAKE_FLOW_MASK))
2687 		goto nla_put_failure;
2688 
2689 	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2690 		goto nla_put_failure;
2691 
2692 	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2693 		goto nla_put_failure;
2694 
2695 	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2696 		goto nla_put_failure;
2697 
2698 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2699 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2700 		goto nla_put_failure;
2701 
2702 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2703 			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2704 		goto nla_put_failure;
2705 
2706 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2707 		goto nla_put_failure;
2708 
2709 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2710 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2711 		goto nla_put_failure;
2712 
2713 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2714 		goto nla_put_failure;
2715 
2716 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2717 			!!(q->rate_flags & CAKE_FLAG_WASH)))
2718 		goto nla_put_failure;
2719 
2720 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2721 		goto nla_put_failure;
2722 
2723 	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2724 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2725 			goto nla_put_failure;
2726 
2727 	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2728 		goto nla_put_failure;
2729 
2730 	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2731 		goto nla_put_failure;
2732 
2733 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2734 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2735 		goto nla_put_failure;
2736 
2737 	return nla_nest_end(skb, opts);
2738 
2739 nla_put_failure:
2740 	return -1;
2741 }
2742 
2743 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2744 {
2745 	struct nlattr *stats = nla_nest_start(d->skb, TCA_STATS_APP);
2746 	struct cake_sched_data *q = qdisc_priv(sch);
2747 	struct nlattr *tstats, *ts;
2748 	int i;
2749 
2750 	if (!stats)
2751 		return -1;
2752 
2753 #define PUT_STAT_U32(attr, data) do {				       \
2754 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2755 			goto nla_put_failure;			       \
2756 	} while (0)
2757 #define PUT_STAT_U64(attr, data) do {				       \
2758 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2759 					data, TCA_CAKE_STATS_PAD)) \
2760 			goto nla_put_failure;			       \
2761 	} while (0)
2762 
2763 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2764 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2765 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2766 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2767 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2768 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2769 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2770 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2771 
2772 #undef PUT_STAT_U32
2773 #undef PUT_STAT_U64
2774 
2775 	tstats = nla_nest_start(d->skb, TCA_CAKE_STATS_TIN_STATS);
2776 	if (!tstats)
2777 		goto nla_put_failure;
2778 
2779 #define PUT_TSTAT_U32(attr, data) do {					\
2780 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2781 			goto nla_put_failure;				\
2782 	} while (0)
2783 #define PUT_TSTAT_U64(attr, data) do {					\
2784 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2785 					data, TCA_CAKE_TIN_STATS_PAD))	\
2786 			goto nla_put_failure;				\
2787 	} while (0)
2788 
2789 	for (i = 0; i < q->tin_cnt; i++) {
2790 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2791 
2792 		ts = nla_nest_start(d->skb, i + 1);
2793 		if (!ts)
2794 			goto nla_put_failure;
2795 
2796 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2797 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2798 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2799 
2800 		PUT_TSTAT_U32(TARGET_US,
2801 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2802 		PUT_TSTAT_U32(INTERVAL_US,
2803 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2804 
2805 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2806 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2807 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2808 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2809 
2810 		PUT_TSTAT_U32(PEAK_DELAY_US,
2811 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2812 		PUT_TSTAT_U32(AVG_DELAY_US,
2813 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2814 		PUT_TSTAT_U32(BASE_DELAY_US,
2815 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2816 
2817 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2818 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2819 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2820 
2821 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2822 					    b->decaying_flow_count);
2823 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2824 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2825 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2826 
2827 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2828 		nla_nest_end(d->skb, ts);
2829 	}
2830 
2831 #undef PUT_TSTAT_U32
2832 #undef PUT_TSTAT_U64
2833 
2834 	nla_nest_end(d->skb, tstats);
2835 	return nla_nest_end(d->skb, stats);
2836 
2837 nla_put_failure:
2838 	nla_nest_cancel(d->skb, stats);
2839 	return -1;
2840 }
2841 
2842 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2843 {
2844 	return NULL;
2845 }
2846 
2847 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2848 {
2849 	return 0;
2850 }
2851 
2852 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2853 			       u32 classid)
2854 {
2855 	return 0;
2856 }
2857 
2858 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2859 {
2860 }
2861 
2862 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2863 					struct netlink_ext_ack *extack)
2864 {
2865 	struct cake_sched_data *q = qdisc_priv(sch);
2866 
2867 	if (cl)
2868 		return NULL;
2869 	return q->block;
2870 }
2871 
2872 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2873 			   struct sk_buff *skb, struct tcmsg *tcm)
2874 {
2875 	tcm->tcm_handle |= TC_H_MIN(cl);
2876 	return 0;
2877 }
2878 
2879 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2880 				 struct gnet_dump *d)
2881 {
2882 	struct cake_sched_data *q = qdisc_priv(sch);
2883 	const struct cake_flow *flow = NULL;
2884 	struct gnet_stats_queue qs = { 0 };
2885 	struct nlattr *stats;
2886 	u32 idx = cl - 1;
2887 
2888 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2889 		const struct cake_tin_data *b = \
2890 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2891 		const struct sk_buff *skb;
2892 
2893 		flow = &b->flows[idx % CAKE_QUEUES];
2894 
2895 		if (flow->head) {
2896 			sch_tree_lock(sch);
2897 			skb = flow->head;
2898 			while (skb) {
2899 				qs.qlen++;
2900 				skb = skb->next;
2901 			}
2902 			sch_tree_unlock(sch);
2903 		}
2904 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
2905 		qs.drops = flow->dropped;
2906 	}
2907 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
2908 		return -1;
2909 	if (flow) {
2910 		ktime_t now = ktime_get();
2911 
2912 		stats = nla_nest_start(d->skb, TCA_STATS_APP);
2913 		if (!stats)
2914 			return -1;
2915 
2916 #define PUT_STAT_U32(attr, data) do {				       \
2917 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2918 			goto nla_put_failure;			       \
2919 	} while (0)
2920 #define PUT_STAT_S32(attr, data) do {				       \
2921 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2922 			goto nla_put_failure;			       \
2923 	} while (0)
2924 
2925 		PUT_STAT_S32(DEFICIT, flow->deficit);
2926 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
2927 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
2928 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
2929 		if (flow->cvars.p_drop) {
2930 			PUT_STAT_S32(BLUE_TIMER_US,
2931 				     ktime_to_us(
2932 					     ktime_sub(now,
2933 						     flow->cvars.blue_timer)));
2934 		}
2935 		if (flow->cvars.dropping) {
2936 			PUT_STAT_S32(DROP_NEXT_US,
2937 				     ktime_to_us(
2938 					     ktime_sub(now,
2939 						       flow->cvars.drop_next)));
2940 		}
2941 
2942 		if (nla_nest_end(d->skb, stats) < 0)
2943 			return -1;
2944 	}
2945 
2946 	return 0;
2947 
2948 nla_put_failure:
2949 	nla_nest_cancel(d->skb, stats);
2950 	return -1;
2951 }
2952 
2953 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2954 {
2955 	struct cake_sched_data *q = qdisc_priv(sch);
2956 	unsigned int i, j;
2957 
2958 	if (arg->stop)
2959 		return;
2960 
2961 	for (i = 0; i < q->tin_cnt; i++) {
2962 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2963 
2964 		for (j = 0; j < CAKE_QUEUES; j++) {
2965 			if (list_empty(&b->flows[j].flowchain) ||
2966 			    arg->count < arg->skip) {
2967 				arg->count++;
2968 				continue;
2969 			}
2970 			if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
2971 				arg->stop = 1;
2972 				break;
2973 			}
2974 			arg->count++;
2975 		}
2976 	}
2977 }
2978 
2979 static const struct Qdisc_class_ops cake_class_ops = {
2980 	.leaf		=	cake_leaf,
2981 	.find		=	cake_find,
2982 	.tcf_block	=	cake_tcf_block,
2983 	.bind_tcf	=	cake_bind,
2984 	.unbind_tcf	=	cake_unbind,
2985 	.dump		=	cake_dump_class,
2986 	.dump_stats	=	cake_dump_class_stats,
2987 	.walk		=	cake_walk,
2988 };
2989 
2990 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
2991 	.cl_ops		=	&cake_class_ops,
2992 	.id		=	"cake",
2993 	.priv_size	=	sizeof(struct cake_sched_data),
2994 	.enqueue	=	cake_enqueue,
2995 	.dequeue	=	cake_dequeue,
2996 	.peek		=	qdisc_peek_dequeued,
2997 	.init		=	cake_init,
2998 	.reset		=	cake_reset,
2999 	.destroy	=	cake_destroy,
3000 	.change		=	cake_change,
3001 	.dump		=	cake_dump,
3002 	.dump_stats	=	cake_dump_stats,
3003 	.owner		=	THIS_MODULE,
3004 };
3005 
3006 static int __init cake_module_init(void)
3007 {
3008 	return register_qdisc(&cake_qdisc_ops);
3009 }
3010 
3011 static void __exit cake_module_exit(void)
3012 {
3013 	unregister_qdisc(&cake_qdisc_ops);
3014 }
3015 
3016 module_init(cake_module_init)
3017 module_exit(cake_module_exit)
3018 MODULE_AUTHOR("Jonathan Morton");
3019 MODULE_LICENSE("Dual BSD/GPL");
3020 MODULE_DESCRIPTION("The CAKE shaper.");
3021