xref: /linux/net/sched/sch_cake.c (revision 67dc6c56b871617deac85b9f72500b69b1fdf835)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating on.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/sch_priv.h>
71 #include <net/pkt_cls.h>
72 #include <net/tcp.h>
73 #include <net/flow_dissector.h>
74 
75 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
76 #include <net/netfilter/nf_conntrack_core.h>
77 #endif
78 
79 #define CAKE_SET_WAYS (8)
80 #define CAKE_MAX_TINS (8)
81 #define CAKE_QUEUES (1024)
82 #define CAKE_FLOW_MASK 63
83 #define CAKE_FLOW_NAT_FLAG 64
84 
85 /* struct cobalt_params - contains codel and blue parameters
86  * @interval:	codel initial drop rate
87  * @target:     maximum persistent sojourn time & blue update rate
88  * @mtu_time:   serialisation delay of maximum-size packet
89  * @p_inc:      increment of blue drop probability (0.32 fxp)
90  * @p_dec:      decrement of blue drop probability (0.32 fxp)
91  */
92 struct cobalt_params {
93 	u64	interval;
94 	u64	target;
95 	u64	mtu_time;
96 	u32	p_inc;
97 	u32	p_dec;
98 };
99 
100 /* struct cobalt_vars - contains codel and blue variables
101  * @count:		codel dropping frequency
102  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
103  * @drop_next:		time to drop next packet, or when we dropped last
104  * @blue_timer:		Blue time to next drop
105  * @p_drop:		BLUE drop probability (0.32 fxp)
106  * @dropping:		set if in dropping state
107  * @ecn_marked:		set if marked
108  */
109 struct cobalt_vars {
110 	u32	count;
111 	u32	rec_inv_sqrt;
112 	ktime_t	drop_next;
113 	ktime_t	blue_timer;
114 	u32     p_drop;
115 	bool	dropping;
116 	bool    ecn_marked;
117 };
118 
119 enum {
120 	CAKE_SET_NONE = 0,
121 	CAKE_SET_SPARSE,
122 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
123 	CAKE_SET_BULK,
124 	CAKE_SET_DECAYING
125 };
126 
127 struct cake_flow {
128 	/* this stuff is all needed per-flow at dequeue time */
129 	struct sk_buff	  *head;
130 	struct sk_buff	  *tail;
131 	struct list_head  flowchain;
132 	s32		  deficit;
133 	u32		  dropped;
134 	struct cobalt_vars cvars;
135 	u16		  srchost; /* index into cake_host table */
136 	u16		  dsthost;
137 	u8		  set;
138 }; /* please try to keep this structure <= 64 bytes */
139 
140 struct cake_host {
141 	u32 srchost_tag;
142 	u32 dsthost_tag;
143 	u16 srchost_bulk_flow_count;
144 	u16 dsthost_bulk_flow_count;
145 };
146 
147 struct cake_heap_entry {
148 	u16 t:3, b:10;
149 };
150 
151 struct cake_tin_data {
152 	struct cake_flow flows[CAKE_QUEUES];
153 	u32	backlogs[CAKE_QUEUES];
154 	u32	tags[CAKE_QUEUES]; /* for set association */
155 	u16	overflow_idx[CAKE_QUEUES];
156 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
157 	u16	flow_quantum;
158 
159 	struct cobalt_params cparams;
160 	u32	drop_overlimit;
161 	u16	bulk_flow_count;
162 	u16	sparse_flow_count;
163 	u16	decaying_flow_count;
164 	u16	unresponsive_flow_count;
165 
166 	u32	max_skblen;
167 
168 	struct list_head new_flows;
169 	struct list_head old_flows;
170 	struct list_head decaying_flows;
171 
172 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
173 	ktime_t	time_next_packet;
174 	u64	tin_rate_ns;
175 	u64	tin_rate_bps;
176 	u16	tin_rate_shft;
177 
178 	u16	tin_quantum;
179 	s32	tin_deficit;
180 	u32	tin_backlog;
181 	u32	tin_dropped;
182 	u32	tin_ecn_mark;
183 
184 	u32	packets;
185 	u64	bytes;
186 
187 	u32	ack_drops;
188 
189 	/* moving averages */
190 	u64 avge_delay;
191 	u64 peak_delay;
192 	u64 base_delay;
193 
194 	/* hash function stats */
195 	u32	way_directs;
196 	u32	way_hits;
197 	u32	way_misses;
198 	u32	way_collisions;
199 }; /* number of tins is small, so size of this struct doesn't matter much */
200 
201 struct cake_sched_config {
202 	u64		rate_bps;
203 	u64		interval;
204 	u64		target;
205 	u64		sync_time;
206 	u32		buffer_config_limit;
207 	u32		fwmark_mask;
208 	u16		fwmark_shft;
209 	s16		rate_overhead;
210 	u16		rate_mpu;
211 	u16		rate_flags;
212 	u8		tin_mode;
213 	u8		flow_mode;
214 	u8		atm_mode;
215 	u8		ack_filter;
216 	u8		is_shared;
217 };
218 
219 struct cake_sched_data {
220 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
221 	struct tcf_block *block;
222 	struct cake_tin_data *tins;
223 	struct cake_sched_config *config;
224 	struct cake_sched_config initial_config;
225 
226 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
227 
228 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
229 	ktime_t		time_next_packet;
230 	ktime_t		failsafe_next_packet;
231 	u64		rate_ns;
232 	u16		rate_shft;
233 	u16		overflow_timeout;
234 	u16		tin_cnt;
235 
236 	/* resource tracking */
237 	u32		buffer_used;
238 	u32		buffer_max_used;
239 	u32		buffer_limit;
240 
241 	/* indices for dequeue */
242 	u16		cur_tin;
243 	u16		cur_flow;
244 
245 	struct qdisc_watchdog watchdog;
246 	const u8	*tin_index;
247 	const u8	*tin_order;
248 
249 	/* bandwidth capacity estimate */
250 	ktime_t		last_packet_time;
251 	ktime_t		avg_window_begin;
252 	u64		avg_packet_interval;
253 	u64		avg_window_bytes;
254 	u64		avg_peak_bandwidth;
255 	ktime_t		last_reconfig_time;
256 
257 	/* packet length stats */
258 	u32		avg_netoff;
259 	u16		max_netlen;
260 	u16		max_adjlen;
261 	u16		min_netlen;
262 	u16		min_adjlen;
263 
264 	/* mq sync state */
265 	u64		last_checked_active;
266 	u64		last_active;
267 	u32		active_queues;
268 };
269 
270 enum {
271 	CAKE_FLAG_OVERHEAD	   = BIT(0),
272 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
273 	CAKE_FLAG_INGRESS	   = BIT(2),
274 	CAKE_FLAG_WASH		   = BIT(3),
275 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
276 };
277 
278 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
279  * obtain the best features of each.  Codel is excellent on flows which
280  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
281  * unresponsive flows.
282  */
283 
284 struct cobalt_skb_cb {
285 	ktime_t enqueue_time;
286 	u32     adjusted_len;
287 };
288 
289 static u64 us_to_ns(u64 us)
290 {
291 	return us * NSEC_PER_USEC;
292 }
293 
294 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
295 {
296 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
297 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
298 }
299 
300 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
301 {
302 	return get_cobalt_cb(skb)->enqueue_time;
303 }
304 
305 static void cobalt_set_enqueue_time(struct sk_buff *skb,
306 				    ktime_t now)
307 {
308 	get_cobalt_cb(skb)->enqueue_time = now;
309 }
310 
311 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
312 
313 /* Diffserv lookup tables */
314 
315 static const u8 precedence[] = {
316 	0, 0, 0, 0, 0, 0, 0, 0,
317 	1, 1, 1, 1, 1, 1, 1, 1,
318 	2, 2, 2, 2, 2, 2, 2, 2,
319 	3, 3, 3, 3, 3, 3, 3, 3,
320 	4, 4, 4, 4, 4, 4, 4, 4,
321 	5, 5, 5, 5, 5, 5, 5, 5,
322 	6, 6, 6, 6, 6, 6, 6, 6,
323 	7, 7, 7, 7, 7, 7, 7, 7,
324 };
325 
326 static const u8 diffserv8[] = {
327 	2, 0, 1, 2, 4, 2, 2, 2,
328 	1, 2, 1, 2, 1, 2, 1, 2,
329 	5, 2, 4, 2, 4, 2, 4, 2,
330 	3, 2, 3, 2, 3, 2, 3, 2,
331 	6, 2, 3, 2, 3, 2, 3, 2,
332 	6, 2, 2, 2, 6, 2, 6, 2,
333 	7, 2, 2, 2, 2, 2, 2, 2,
334 	7, 2, 2, 2, 2, 2, 2, 2,
335 };
336 
337 static const u8 diffserv4[] = {
338 	0, 1, 0, 0, 2, 0, 0, 0,
339 	1, 0, 0, 0, 0, 0, 0, 0,
340 	2, 0, 2, 0, 2, 0, 2, 0,
341 	2, 0, 2, 0, 2, 0, 2, 0,
342 	3, 0, 2, 0, 2, 0, 2, 0,
343 	3, 0, 0, 0, 3, 0, 3, 0,
344 	3, 0, 0, 0, 0, 0, 0, 0,
345 	3, 0, 0, 0, 0, 0, 0, 0,
346 };
347 
348 static const u8 diffserv3[] = {
349 	0, 1, 0, 0, 2, 0, 0, 0,
350 	1, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 2, 0, 2, 0,
355 	2, 0, 0, 0, 0, 0, 0, 0,
356 	2, 0, 0, 0, 0, 0, 0, 0,
357 };
358 
359 static const u8 besteffort[] = {
360 	0, 0, 0, 0, 0, 0, 0, 0,
361 	0, 0, 0, 0, 0, 0, 0, 0,
362 	0, 0, 0, 0, 0, 0, 0, 0,
363 	0, 0, 0, 0, 0, 0, 0, 0,
364 	0, 0, 0, 0, 0, 0, 0, 0,
365 	0, 0, 0, 0, 0, 0, 0, 0,
366 	0, 0, 0, 0, 0, 0, 0, 0,
367 	0, 0, 0, 0, 0, 0, 0, 0,
368 };
369 
370 /* tin priority order for stats dumping */
371 
372 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
373 static const u8 bulk_order[] = {1, 0, 2, 3};
374 
375 /* There is a big difference in timing between the accurate values placed in the
376  * cache and the approximations given by a single Newton step for small count
377  * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
378  * the cached values were precomputed with eight Newton steps of the
379  * implementation below. Above 16, a single Newton step gives sufficient
380  * accuracy in either direction, given the precision stored.
381  *
382  * The magnitude of the error when stepping up to count 2 is such as to give the
383  * value that *should* have been produced at count 4.
384  */
385 
386 #define REC_INV_SQRT_CACHE (16)
387 static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
388 		~0,         ~0, 3037000500, 2479700525,
389 	2147483647, 1920767767, 1753413056, 1623345051,
390 	1518500250, 1431655765, 1358187914, 1294981364,
391 	1239850263, 1191209601, 1147878294, 1108955788
392 };
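
/* Each cache entry above is simply 2^32 / sqrt(count) expressed in Q0.32
 * (entries 0 and 1 are ~0, i.e. as close to 1.0 as the format allows).  A
 * minimal userspace sketch - an illustration, not part of the kernel build -
 * reproduces the table to within one least-significant bit:
 *
 *	#include <math.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		int n;
 *
 *		for (n = 2; n < 16; n++)
 *			printf("%u\n", (unsigned int)(4294967296.0 / sqrt(n) + 0.5));
 *		return 0;
 *	}
 */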
393 
394 static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust);
395 
396 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
397  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
398  *
399  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
400  */
401 
402 static void cobalt_newton_step(struct cobalt_vars *vars, u32 count)
403 {
404 	u32 invsqrt, invsqrt2;
405 	u64 val;
406 
407 	invsqrt = vars->rec_inv_sqrt;
408 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
409 	val = (3LL << 32) - ((u64)count * invsqrt2);
410 
411 	val >>= 2; /* avoid overflow in following multiply */
412 	val = (val * invsqrt) >> (32 - 2 + 1);
413 
414 	vars->rec_inv_sqrt = val;
415 }
416 
417 static void cobalt_invsqrt(struct cobalt_vars *vars, u32 count)
418 {
419 	if (count < REC_INV_SQRT_CACHE)
420 		vars->rec_inv_sqrt = inv_sqrt_cache[count];
421 	else
422 		cobalt_newton_step(vars, count);
423 }
424 
425 static void cobalt_vars_init(struct cobalt_vars *vars)
426 {
427 	memset(vars, 0, sizeof(*vars));
428 }
429 
430 /* CoDel control_law is t + interval/sqrt(count)
431  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
432  * both sqrt() and divide operation.
433  */
434 static ktime_t cobalt_control(ktime_t t,
435 			      u64 interval,
436 			      u32 rec_inv_sqrt)
437 {
438 	return ktime_add_ns(t, reciprocal_scale(interval,
439 						rec_inv_sqrt));
440 }
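
/* Worked example: with interval = 100 ms and count = 4, rec_inv_sqrt holds
 * roughly 0.5 in Q0.32, so reciprocal_scale(interval, rec_inv_sqrt) yields
 * about 50 ms and the next signal is scheduled interval/sqrt(count) after t,
 * which is exactly the CoDel control law stated above.
 */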
441 
442 /* Call this when a packet had to be dropped due to queue overflow.  Returns
443  * true if the BLUE state was quiescent before but active after this call.
444  */
445 static bool cobalt_queue_full(struct cobalt_vars *vars,
446 			      struct cobalt_params *p,
447 			      ktime_t now)
448 {
449 	bool up = false;
450 
451 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
452 		u32 p_drop = vars->p_drop;
453 
454 		up = !p_drop;
455 		p_drop += p->p_inc;
456 		if (p_drop < p->p_inc)
457 			p_drop = ~0;
458 		WRITE_ONCE(vars->p_drop, p_drop);
459 		WRITE_ONCE(vars->blue_timer, now);
460 	}
461 	WRITE_ONCE(vars->dropping, true);
462 	WRITE_ONCE(vars->drop_next, now);
463 	if (!vars->count)
464 		WRITE_ONCE(vars->count, 1);
465 
466 	return up;
467 }
468 
469 /* Call this when the queue was serviced but turned out to be empty.  Returns
470  * true if the BLUE state was active before but quiescent after this call.
471  */
472 static bool cobalt_queue_empty(struct cobalt_vars *vars,
473 			       struct cobalt_params *p,
474 			       ktime_t now)
475 {
476 	bool down = false;
477 
478 	if (vars->p_drop &&
479 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
480 		if (vars->p_drop < p->p_dec)
481 			WRITE_ONCE(vars->p_drop, 0);
482 		else
483 			WRITE_ONCE(vars->p_drop, vars->p_drop - p->p_dec);
484 		WRITE_ONCE(vars->blue_timer, now);
485 		down = !vars->p_drop;
486 	}
487 	WRITE_ONCE(vars->dropping, false);
488 
489 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
490 		WRITE_ONCE(vars->count, vars->count - 1);
491 		cobalt_invsqrt(vars, vars->count);
492 		WRITE_ONCE(vars->drop_next,
493 			   cobalt_control(vars->drop_next, p->interval,
494 					  vars->rec_inv_sqrt));
495 	}
496 
497 	return down;
498 }
499 
500 /* Call this with a freshly dequeued packet for possible congestion marking.
501  * Returns a drop reason to drop the packet, or QDISC_DROP_UNSPEC for delivery.
502  */
503 static enum qdisc_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
504 						 struct cobalt_params *p,
505 						 ktime_t now,
506 						 struct sk_buff *skb,
507 						 u32 bulk_flows)
508 {
509 	enum qdisc_drop_reason reason = QDISC_DROP_UNSPEC;
510 	bool next_due, over_target;
511 	ktime_t schedule;
512 	u64 sojourn;
513 	u32 count;
514 
515 /* The 'schedule' variable records, in its sign, whether 'now' is before or
516  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
517  * scheduling decision is actually branched, without destroying that
518  * information.  Similarly, the first 'schedule' value calculated is preserved
519  * in the boolean 'next_due'.
520  *
521  * As for 'drop_next', we take advantage of the fact that 'interval' is both
522  * the delay between first exceeding 'target' and the first signalling event,
523  * *and* the scaling factor for the signalling frequency.  It's therefore very
524  * natural to use a single mechanism for both purposes, and eliminates a
525  * significant amount of reference Codel's spaghetti code.  To help with this,
526  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
527  * as possible to 1.0 in fixed-point.
528  */
529 
530 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
531 	schedule = ktime_sub(now, vars->drop_next);
532 	over_target = sojourn > p->target &&
533 		      sojourn > p->mtu_time * bulk_flows * 2 &&
534 		      sojourn > p->mtu_time * 4;
535 	count = vars->count;
536 	next_due = count && ktime_to_ns(schedule) >= 0;
537 
538 	vars->ecn_marked = false;
539 
540 	if (over_target) {
541 		if (!vars->dropping) {
542 			WRITE_ONCE(vars->dropping, true);
543 			WRITE_ONCE(vars->drop_next,
544 				   cobalt_control(now, p->interval,
545 						  vars->rec_inv_sqrt));
546 		}
547 		if (!count)
548 			count = 1;
549 	} else if (vars->dropping) {
550 		WRITE_ONCE(vars->dropping, false);
551 	}
552 
553 	if (next_due && vars->dropping) {
554 		/* Use ECN mark if possible, otherwise drop */
555 		if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
556 			reason = QDISC_DROP_CONGESTED;
557 
558 		count++;
559 		if (!count)
560 			count--;
561 		cobalt_invsqrt(vars, count);
562 		WRITE_ONCE(vars->drop_next,
563 			   cobalt_control(vars->drop_next, p->interval,
564 					  vars->rec_inv_sqrt));
565 		schedule = ktime_sub(now, vars->drop_next);
566 	} else {
567 		while (next_due) {
568 			count--;
569 			cobalt_invsqrt(vars, count);
570 			WRITE_ONCE(vars->drop_next,
571 				   cobalt_control(vars->drop_next, p->interval,
572 						  vars->rec_inv_sqrt));
573 			schedule = ktime_sub(now, vars->drop_next);
574 			next_due = count && ktime_to_ns(schedule) >= 0;
575 		}
576 	}
577 
578 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
579 	if (vars->p_drop && reason == QDISC_DROP_UNSPEC &&
580 	    get_random_u32() < vars->p_drop)
581 		reason = QDISC_DROP_FLOOD_PROTECTION;
582 
583 	WRITE_ONCE(vars->count, count);
584 	/* Overload the drop_next field as an activity timeout */
585 	if (!count)
586 		WRITE_ONCE(vars->drop_next, ktime_add_ns(now, p->interval));
587 	else if (ktime_to_ns(schedule) > 0 && reason == QDISC_DROP_UNSPEC)
588 		WRITE_ONCE(vars->drop_next, now);
589 
590 	return reason;
591 }
592 
593 static bool cake_update_flowkeys(struct flow_keys *keys,
594 				 const struct sk_buff *skb)
595 {
596 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
597 	struct nf_conntrack_tuple tuple = {};
598 	bool rev = !skb->_nfct, upd = false;
599 	__be32 ip;
600 
601 	if (skb_protocol(skb, true) != htons(ETH_P_IP))
602 		return false;
603 
604 	if (!nf_ct_get_tuple_skb(&tuple, skb))
605 		return false;
606 
607 	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
608 	if (ip != keys->addrs.v4addrs.src) {
609 		keys->addrs.v4addrs.src = ip;
610 		upd = true;
611 	}
612 	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
613 	if (ip != keys->addrs.v4addrs.dst) {
614 		keys->addrs.v4addrs.dst = ip;
615 		upd = true;
616 	}
617 
618 	if (keys->ports.ports) {
619 		__be16 port;
620 
621 		port = rev ? tuple.dst.u.all : tuple.src.u.all;
622 		if (port != keys->ports.src) {
623 			keys->ports.src = port;
624 			upd = true;
625 		}
626 		port = rev ? tuple.src.u.all : tuple.dst.u.all;
627 		if (port != keys->ports.dst) {
628 			keys->ports.dst = port;
629 			upd = true;
630 		}
631 	}
632 	return upd;
633 #else
634 	return false;
635 #endif
636 }
637 
638 /* Cake's flow modes use overlapping bit patterns: the dual-srchost and
639  * dual-dsthost checks below also match triple-isolate mode, which sets both.
640  */
641 
642 static bool cake_dsrc(int flow_mode)
643 {
644 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
645 }
646 
647 static bool cake_ddst(int flow_mode)
648 {
649 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
650 }
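
/* For reference, the flow-mode constants from the uapi header make the
 * overlap explicit: CAKE_FLOW_SRC_IP = 1, CAKE_FLOW_DST_IP = 2 and
 * CAKE_FLOW_FLOWS = 4, so CAKE_FLOW_DUAL_SRC = 5, CAKE_FLOW_DUAL_DST = 6 and
 * CAKE_FLOW_TRIPLE = 7.  Hence cake_dsrc() is true for dual-srchost (5) and
 * triple-isolate (7) alike, and cake_ddst() for dual-dsthost (6) and
 * triple-isolate (7).
 */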
651 
652 static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
653 					     struct cake_flow *flow,
654 					     int flow_mode)
655 {
656 	if (likely(cake_dsrc(flow_mode) &&
657 		   q->hosts[flow->srchost].srchost_bulk_flow_count))
658 		q->hosts[flow->srchost].srchost_bulk_flow_count--;
659 }
660 
661 static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
662 					     struct cake_flow *flow,
663 					     int flow_mode)
664 {
665 	if (likely(cake_dsrc(flow_mode) &&
666 		   q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
667 		q->hosts[flow->srchost].srchost_bulk_flow_count++;
668 }
669 
670 static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
671 					     struct cake_flow *flow,
672 					     int flow_mode)
673 {
674 	if (likely(cake_ddst(flow_mode) &&
675 		   q->hosts[flow->dsthost].dsthost_bulk_flow_count))
676 		q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
677 }
678 
679 static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
680 					     struct cake_flow *flow,
681 					     int flow_mode)
682 {
683 	if (likely(cake_ddst(flow_mode) &&
684 		   q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
685 		q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
686 }
687 
688 static u16 cake_get_flow_quantum(struct cake_tin_data *q,
689 				 struct cake_flow *flow,
690 				 int flow_mode)
691 {
692 	u16 host_load = 1;
693 
694 	if (cake_dsrc(flow_mode))
695 		host_load = max(host_load,
696 				q->hosts[flow->srchost].srchost_bulk_flow_count);
697 
698 	if (cake_ddst(flow_mode))
699 		host_load = max(host_load,
700 				q->hosts[flow->dsthost].dsthost_bulk_flow_count);
701 
702 	/* The get_random_u16() is a way to apply dithering to avoid
703 	 * accumulating roundoff errors
704 	 */
705 	return (q->flow_quantum * quantum_div[host_load] +
706 		get_random_u16()) >> 16;
707 }
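
/* quantum_div[] holds 65535 / host_load (filled in at module init), so the
 * expression above approximates flow_quantum / host_load in 16.16 fixed
 * point.  Illustrative numbers: flow_quantum = 1514 and host_load = 3 gives
 * 1514 * 21845 = 33073330, or ~504.7 before the >> 16; the random dither
 * makes the result 504 or 505 so the long-run average stays near 1514/3
 * rather than always rounding down.
 */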
708 
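/* Set-associative hashing, as implemented by cake_hash() below: the 1024
 * queues are treated as 128 aligned sets of CAKE_SET_WAYS (8) ways.  A
 * flow's full hash is reduced modulo CAKE_QUEUES to pick its home queue; if
 * the direct lookup fails, the other seven ways of the same set are probed,
 * first for a matching tag and then for an empty queue, before a collision
 * is accepted.  Worked example: flow_hash 0x1234abcd reduces to 973, whose
 * set spans queues 968..975 and is probed as 973, 974, 975, 968, ..., 972.
 */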
709 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
710 		     int flow_mode, u16 flow_override, u16 host_override)
711 {
712 	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
713 	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
714 	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
715 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
716 	u16 reduced_hash, srchost_idx, dsthost_idx;
717 	struct flow_keys keys, host_keys;
718 	bool use_skbhash = skb->l4_hash;
719 
720 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
721 		return 0;
722 
723 	/* If both overrides are set, or we can use the SKB hash and nat mode is
724 	 * disabled, we can skip packet dissection entirely. If nat mode is
725 	 * enabled there's another check below after doing the conntrack lookup.
726 	 */
727 	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
728 		goto skip_hash;
729 
730 	skb_flow_dissect_flow_keys(skb, &keys,
731 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
732 
733 	/* Don't use the SKB hash if we change the lookup keys from conntrack */
734 	if (nat_enabled && cake_update_flowkeys(&keys, skb))
735 		use_skbhash = false;
736 
737 	/* If we can still use the SKB hash and don't need the host hash, we can
738 	 * skip the rest of the hashing procedure
739 	 */
740 	if (use_skbhash && !hash_hosts)
741 		goto skip_hash;
742 
743 	/* flow_hash_from_keys() sorts the addresses by value, so we have
744 	 * to preserve their order in a separate data structure to treat
745 	 * src and dst host addresses as independently selectable.
746 	 */
747 	host_keys = keys;
748 	host_keys.ports.ports     = 0;
749 	host_keys.basic.ip_proto  = 0;
750 	host_keys.keyid.keyid     = 0;
751 	host_keys.tags.flow_label = 0;
752 
753 	switch (host_keys.control.addr_type) {
754 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
755 		host_keys.addrs.v4addrs.src = 0;
756 		dsthost_hash = flow_hash_from_keys(&host_keys);
757 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
758 		host_keys.addrs.v4addrs.dst = 0;
759 		srchost_hash = flow_hash_from_keys(&host_keys);
760 		break;
761 
762 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
763 		memset(&host_keys.addrs.v6addrs.src, 0,
764 		       sizeof(host_keys.addrs.v6addrs.src));
765 		dsthost_hash = flow_hash_from_keys(&host_keys);
766 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
767 		memset(&host_keys.addrs.v6addrs.dst, 0,
768 		       sizeof(host_keys.addrs.v6addrs.dst));
769 		srchost_hash = flow_hash_from_keys(&host_keys);
770 		break;
771 
772 	default:
773 		dsthost_hash = 0;
774 		srchost_hash = 0;
775 	}
776 
777 	/* This *must* be after the above switch, since as a
778 	 * side-effect it sorts the src and dst addresses.
779 	 */
780 	if (hash_flows && !use_skbhash)
781 		flow_hash = flow_hash_from_keys(&keys);
782 
783 skip_hash:
784 	if (flow_override)
785 		flow_hash = flow_override - 1;
786 	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
787 		flow_hash = skb->hash;
788 	if (host_override) {
789 		dsthost_hash = host_override - 1;
790 		srchost_hash = host_override - 1;
791 	}
792 
793 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
794 		if (flow_mode & CAKE_FLOW_SRC_IP)
795 			flow_hash ^= srchost_hash;
796 
797 		if (flow_mode & CAKE_FLOW_DST_IP)
798 			flow_hash ^= dsthost_hash;
799 	}
800 
801 	reduced_hash = flow_hash % CAKE_QUEUES;
802 
803 	/* set-associative hashing */
804 	/* fast path if no hash collision (direct lookup succeeds) */
805 	if (likely(q->tags[reduced_hash] == flow_hash &&
806 		   q->flows[reduced_hash].set)) {
807 		q->way_directs++;
808 	} else {
809 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
810 		u32 outer_hash = reduced_hash - inner_hash;
811 		bool allocate_src = false;
812 		bool allocate_dst = false;
813 		u32 i, k;
814 
815 		/* check if any active queue in the set is reserved for
816 		 * this flow.
817 		 */
818 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
819 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
820 			if (q->tags[outer_hash + k] == flow_hash) {
821 				if (i)
822 					WRITE_ONCE(q->way_hits, q->way_hits + 1);
823 
824 				if (!q->flows[outer_hash + k].set) {
825 					/* need to increment host refcnts */
826 					allocate_src = cake_dsrc(flow_mode);
827 					allocate_dst = cake_ddst(flow_mode);
828 				}
829 
830 				goto found;
831 			}
832 		}
833 
834 		/* no queue is reserved for this flow, look for an
835 		 * empty one.
836 		 */
837 		for (i = 0; i < CAKE_SET_WAYS;
838 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
839 			if (!q->flows[outer_hash + k].set) {
840 				WRITE_ONCE(q->way_misses, q->way_misses + 1);
841 				allocate_src = cake_dsrc(flow_mode);
842 				allocate_dst = cake_ddst(flow_mode);
843 				goto found;
844 			}
845 		}
846 
847 		/* With no empty queues, default to the original
848 		 * queue, accept the collision, update the host tags.
849 		 */
850 		WRITE_ONCE(q->way_collisions, q->way_collisions + 1);
851 		allocate_src = cake_dsrc(flow_mode);
852 		allocate_dst = cake_ddst(flow_mode);
853 
854 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
855 			cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
856 			cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
857 		}
858 found:
859 		/* reserve queue for future packets in same flow */
860 		reduced_hash = outer_hash + k;
861 		q->tags[reduced_hash] = flow_hash;
862 
863 		if (allocate_src) {
864 			srchost_idx = srchost_hash % CAKE_QUEUES;
865 			inner_hash = srchost_idx % CAKE_SET_WAYS;
866 			outer_hash = srchost_idx - inner_hash;
867 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
868 				i++, k = (k + 1) % CAKE_SET_WAYS) {
869 				if (q->hosts[outer_hash + k].srchost_tag ==
870 				    srchost_hash)
871 					goto found_src;
872 			}
873 			for (i = 0; i < CAKE_SET_WAYS;
874 				i++, k = (k + 1) % CAKE_SET_WAYS) {
875 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
876 					break;
877 			}
878 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
879 found_src:
880 			srchost_idx = outer_hash + k;
881 			q->flows[reduced_hash].srchost = srchost_idx;
882 
883 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
884 				cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
885 		}
886 
887 		if (allocate_dst) {
888 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
889 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
890 			outer_hash = dsthost_idx - inner_hash;
891 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
892 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
893 				if (q->hosts[outer_hash + k].dsthost_tag ==
894 				    dsthost_hash)
895 					goto found_dst;
896 			}
897 			for (i = 0; i < CAKE_SET_WAYS;
898 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
899 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
900 					break;
901 			}
902 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
903 found_dst:
904 			dsthost_idx = outer_hash + k;
905 			q->flows[reduced_hash].dsthost = dsthost_idx;
906 
907 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
908 				cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
909 		}
910 	}
911 
912 	return reduced_hash;
913 }
914 
915 /* helper functions: might be changed when/if skb uses a standard list_head */
916 /* remove one skb from head of slot queue */
917 
918 static struct sk_buff *dequeue_head(struct cake_flow *flow)
919 {
920 	struct sk_buff *skb = flow->head;
921 
922 	if (skb) {
923 		WRITE_ONCE(flow->head, skb->next);
924 		skb_mark_not_on_list(skb);
925 	}
926 
927 	return skb;
928 }
929 
930 /* add skb to flow queue (tail add) */
931 
932 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
933 {
934 	if (!flow->head)
935 		WRITE_ONCE(flow->head, skb);
936 	else
937 		flow->tail->next = skb;
938 	flow->tail = skb;
939 	skb->next = NULL;
940 }
941 
942 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
943 				    struct ipv6hdr *buf)
944 {
945 	unsigned int offset = skb_network_offset(skb);
946 	struct iphdr *iph;
947 
948 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
949 
950 	if (!iph)
951 		return NULL;
952 
953 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
954 		return skb_header_pointer(skb, offset + iph->ihl * 4,
955 					  sizeof(struct ipv6hdr), buf);
956 
957 	else if (iph->version == 4)
958 		return iph;
959 
960 	else if (iph->version == 6)
961 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
962 					  buf);
963 
964 	return NULL;
965 }
966 
967 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
968 				      void *buf, unsigned int bufsize)
969 {
970 	unsigned int offset = skb_network_offset(skb);
971 	const struct ipv6hdr *ipv6h;
972 	const struct tcphdr *tcph;
973 	const struct iphdr *iph;
974 	struct ipv6hdr _ipv6h;
975 	struct tcphdr _tcph;
976 
977 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
978 
979 	if (!ipv6h)
980 		return NULL;
981 
982 	if (ipv6h->version == 4) {
983 		iph = (struct iphdr *)ipv6h;
984 		offset += iph->ihl * 4;
985 
986 		/* special-case 6in4 tunnelling, as that is a common way to get
987 		 * v6 connectivity in the home
988 		 */
989 		if (iph->protocol == IPPROTO_IPV6) {
990 			ipv6h = skb_header_pointer(skb, offset,
991 						   sizeof(_ipv6h), &_ipv6h);
992 
993 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
994 				return NULL;
995 
996 			offset += sizeof(struct ipv6hdr);
997 
998 		} else if (iph->protocol != IPPROTO_TCP) {
999 			return NULL;
1000 		}
1001 
1002 	} else if (ipv6h->version == 6) {
1003 		if (ipv6h->nexthdr != IPPROTO_TCP)
1004 			return NULL;
1005 
1006 		offset += sizeof(struct ipv6hdr);
1007 	} else {
1008 		return NULL;
1009 	}
1010 
1011 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
1012 	if (!tcph || tcph->doff < 5)
1013 		return NULL;
1014 
1015 	return skb_header_pointer(skb, offset,
1016 				  min(__tcp_hdrlen(tcph), bufsize), buf);
1017 }
1018 
1019 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
1020 				   int code, int *oplen)
1021 {
1022 	/* inspired by tcp_parse_options in tcp_input.c */
1023 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1024 	const u8 *ptr = (const u8 *)(tcph + 1);
1025 
1026 	while (length > 0) {
1027 		int opcode = *ptr++;
1028 		int opsize;
1029 
1030 		if (opcode == TCPOPT_EOL)
1031 			break;
1032 		if (opcode == TCPOPT_NOP) {
1033 			length--;
1034 			continue;
1035 		}
1036 		if (length < 2)
1037 			break;
1038 		opsize = *ptr++;
1039 		if (opsize < 2 || opsize > length)
1040 			break;
1041 
1042 		if (opcode == code) {
1043 			*oplen = opsize;
1044 			return ptr;
1045 		}
1046 
1047 		ptr += opsize - 2;
1048 		length -= opsize;
1049 	}
1050 
1051 	return NULL;
1052 }
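
/* TCP options use the usual kind/length/value layout: one byte of kind, one
 * byte of total length, then (length - 2) bytes of data (EOL and NOP are
 * single-byte).  Example: the timestamp option is kind 8 (TCPOPT_TIMESTAMP)
 * with length 10 (TCPOLEN_TIMESTAMP), so cake_get_tcpopt(tcph,
 * TCPOPT_TIMESTAMP, &len) returns a pointer to its 8 bytes of TSval/TSecr
 * data with *oplen set to 10.
 */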
1053 
1054 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
1055  * bytes than the other. In the case where both sequences ACK bytes that the
1056  * other doesn't, A is considered greater. DSACKs in A also make A be
1057  * considered greater.
1058  *
1059  * @return -1, 0 or 1 as normal compare functions
1060  */
1061 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
1062 				  const struct tcphdr *tcph_b)
1063 {
1064 	const struct tcp_sack_block_wire *sack_a, *sack_b;
1065 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1066 	u32 bytes_a = 0, bytes_b = 0;
1067 	int oplen_a, oplen_b;
1068 	bool first = true;
1069 
1070 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1071 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1072 
1073 	/* pointers point to option contents */
1074 	oplen_a -= TCPOLEN_SACK_BASE;
1075 	oplen_b -= TCPOLEN_SACK_BASE;
1076 
1077 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1078 	    (!sack_b || oplen_b < sizeof(*sack_b)))
1079 		return -1;
1080 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1081 		 (!sack_a || oplen_a < sizeof(*sack_a)))
1082 		return 1;
1083 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1084 		 (!sack_b || oplen_b < sizeof(*sack_b)))
1085 		return 0;
1086 
1087 	while (oplen_a >= sizeof(*sack_a)) {
1088 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1089 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1090 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1091 		int oplen_tmp = oplen_b;
1092 		bool found = false;
1093 
1094 		/* DSACK; always considered greater to prevent dropping */
1095 		if (before(start_a, ack_seq_a))
1096 			return -1;
1097 
1098 		bytes_a += end_a - start_a;
1099 
1100 		while (oplen_tmp >= sizeof(*sack_tmp)) {
1101 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1102 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1103 
1104 			/* first time through we count the total size */
1105 			if (first)
1106 				bytes_b += end_b - start_b;
1107 
1108 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1109 				found = true;
1110 				if (!first)
1111 					break;
1112 			}
1113 			oplen_tmp -= sizeof(*sack_tmp);
1114 			sack_tmp++;
1115 		}
1116 
1117 		if (!found)
1118 			return -1;
1119 
1120 		oplen_a -= sizeof(*sack_a);
1121 		sack_a++;
1122 		first = false;
1123 	}
1124 
1125 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1126 	 * either the SACKs are equal, or B SACKs more bytes.
1127 	 */
1128 	return bytes_b > bytes_a ? 1 : 0;
1129 }
1130 
1131 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1132 				 u32 *tsval, u32 *tsecr)
1133 {
1134 	const u8 *ptr;
1135 	int opsize;
1136 
1137 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1138 
1139 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1140 		*tsval = get_unaligned_be32(ptr);
1141 		*tsecr = get_unaligned_be32(ptr + 4);
1142 	}
1143 }
1144 
1145 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1146 			       u32 tstamp_new, u32 tsecr_new)
1147 {
1148 	/* inspired by tcp_parse_options in tcp_input.c */
1149 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1150 	const u8 *ptr = (const u8 *)(tcph + 1);
1151 	u32 tstamp, tsecr;
1152 
1153 	/* 3 reserved flags must be unset to avoid future breakage
1154 	 * ACK must be set
1155 	 * ECE/CWR are handled separately
1156 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1157 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1158 	 * 0x00C00000 = CWR/ECE (handled separately)
1159 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1160 	 */
1161 	if (((tcp_flag_word(tcph) &
1162 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1163 		return false;
1164 
1165 	while (length > 0) {
1166 		int opcode = *ptr++;
1167 		int opsize;
1168 
1169 		if (opcode == TCPOPT_EOL)
1170 			break;
1171 		if (opcode == TCPOPT_NOP) {
1172 			length--;
1173 			continue;
1174 		}
1175 		if (length < 2)
1176 			break;
1177 		opsize = *ptr++;
1178 		if (opsize < 2 || opsize > length)
1179 			break;
1180 
1181 		switch (opcode) {
1182 		case TCPOPT_MD5SIG: /* doesn't influence state */
1183 			break;
1184 
1185 		case TCPOPT_SACK: /* stricter checking performed later */
1186 			if (opsize % 8 != 2)
1187 				return false;
1188 			break;
1189 
1190 		case TCPOPT_TIMESTAMP:
1191 			/* only drop timestamps lower than new */
1192 			if (opsize != TCPOLEN_TIMESTAMP)
1193 				return false;
1194 			tstamp = get_unaligned_be32(ptr);
1195 			tsecr = get_unaligned_be32(ptr + 4);
1196 			if (after(tstamp, tstamp_new) ||
1197 			    after(tsecr, tsecr_new))
1198 				return false;
1199 			break;
1200 
1201 		case TCPOPT_MSS:  /* these should only be set on SYN */
1202 		case TCPOPT_WINDOW:
1203 		case TCPOPT_SACK_PERM:
1204 		case TCPOPT_FASTOPEN:
1205 		case TCPOPT_EXP:
1206 		default: /* don't drop if any unknown options are present */
1207 			return false;
1208 		}
1209 
1210 		ptr += opsize - 2;
1211 		length -= opsize;
1212 	}
1213 
1214 	return true;
1215 }
1216 
1217 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1218 				       struct cake_flow *flow)
1219 {
1220 	bool aggressive = q->config->ack_filter == CAKE_ACK_AGGRESSIVE;
1221 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1222 	struct sk_buff *skb_check, *skb_prev = NULL;
1223 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1224 	unsigned char _tcph[64], _tcph_check[64];
1225 	const struct tcphdr *tcph, *tcph_check;
1226 	const struct iphdr *iph, *iph_check;
1227 	struct ipv6hdr _iph, _iph_check;
1228 	const struct sk_buff *skb;
1229 	int seglen, num_found = 0;
1230 	u32 tstamp = 0, tsecr = 0;
1231 	__be32 elig_flags = 0;
1232 	int sack_comp;
1233 
1234 	/* no other possible ACKs to filter */
1235 	if (flow->head == flow->tail)
1236 		return NULL;
1237 
1238 	skb = flow->tail;
1239 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1240 	iph = cake_get_iphdr(skb, &_iph);
1241 	if (!tcph)
1242 		return NULL;
1243 
1244 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1245 
1246 	/* the 'triggering' packet need only have the ACK flag set.
1247 	 * also check that SYN is not set, as there won't be any previous ACKs.
1248 	 */
1249 	if ((tcp_flag_word(tcph) &
1250 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1251 		return NULL;
1252 
1253 	/* the 'triggering' ACK is at the tail of the queue, we have already
1254 	 * returned if it is the only packet in the flow. loop through the rest
1255 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1256 	 * triggering one.
1257 	 */
1258 	for (skb_check = flow->head;
1259 	     skb_check && skb_check != skb;
1260 	     skb_prev = skb_check, skb_check = skb_check->next) {
1261 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1262 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1263 					     sizeof(_tcph_check));
1264 
1265 		/* only TCP packets with matching 5-tuple are eligible, and only
1266 		 * drop safe headers
1267 		 */
1268 		if (!tcph_check || iph->version != iph_check->version ||
1269 		    tcph_check->source != tcph->source ||
1270 		    tcph_check->dest != tcph->dest)
1271 			continue;
1272 
1273 		if (iph_check->version == 4) {
1274 			if (iph_check->saddr != iph->saddr ||
1275 			    iph_check->daddr != iph->daddr)
1276 				continue;
1277 
1278 			seglen = iph_totlen(skb, iph_check) -
1279 				       (4 * iph_check->ihl);
1280 		} else if (iph_check->version == 6) {
1281 			ipv6h = (struct ipv6hdr *)iph;
1282 			ipv6h_check = (struct ipv6hdr *)iph_check;
1283 
1284 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1285 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1286 				continue;
1287 
1288 			seglen = ipv6_payload_len(skb, ipv6h_check);
1289 		} else {
1290 			WARN_ON(1);  /* shouldn't happen */
1291 			continue;
1292 		}
1293 
1294 		/* If the ECE/CWR flags changed from the previous eligible
1295 		 * packet in the same flow, we should no longer be dropping that
1296 		 * previous packet as this would lose information.
1297 		 */
1298 		if (elig_ack && (tcp_flag_word(tcph_check) &
1299 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1300 			elig_ack = NULL;
1301 			elig_ack_prev = NULL;
1302 			num_found--;
1303 		}
1304 
1305 		/* Check TCP options and flags, don't drop ACKs with segment
1306 		 * data, and don't drop ACKs with a higher cumulative ACK
1307 		 * counter than the triggering packet. Check ACK seqno here to
1308 		 * avoid parsing SACK options of packets we are going to exclude
1309 		 * anyway.
1310 		 */
1311 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1312 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1313 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1314 			continue;
1315 
1316 		/* Check SACK options. The triggering packet must SACK more data
1317 		 * than the ACK under consideration, or SACK the same range but
1318 		 * have a larger cumulative ACK counter. The latter is a
1319 		 * pathological case, but is contained in the following check
1320 		 * anyway, just to be safe.
1321 		 */
1322 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1323 
1324 		if (sack_comp < 0 ||
1325 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1326 		     sack_comp == 0))
1327 			continue;
1328 
1329 		/* At this point we have found an eligible pure ACK to drop; if
1330 		 * we are in aggressive mode, we are done. Otherwise, keep
1331 		 * searching unless this is the second eligible ACK we
1332 		 * found.
1333 		 *
1334 		 * Since we want to drop ACK closest to the head of the queue,
1335 		 * save the first eligible ACK we find, even if we need to loop
1336 		 * again.
1337 		 */
1338 		if (!elig_ack) {
1339 			elig_ack = skb_check;
1340 			elig_ack_prev = skb_prev;
1341 			elig_flags = (tcp_flag_word(tcph_check)
1342 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1343 		}
1344 
1345 		if (num_found++ > 0)
1346 			goto found;
1347 	}
1348 
1349 	/* We made it through the queue without finding two eligible ACKs. If
1350 	 * we found a single eligible ACK we can drop it in aggressive mode if
1351 	 * we can guarantee that this does not interfere with ECN flag
1352 	 * information. We ensure this by dropping it only if the enqueued
1353 	 * packet is consecutive with the eligible ACK, and their flags match.
1354 	 */
1355 	if (elig_ack && aggressive && elig_ack->next == skb &&
1356 	    (elig_flags == (tcp_flag_word(tcph) &
1357 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1358 		goto found;
1359 
1360 	return NULL;
1361 
1362 found:
1363 	if (elig_ack_prev)
1364 		elig_ack_prev->next = elig_ack->next;
1365 	else
1366 		WRITE_ONCE(flow->head, elig_ack->next);
1367 
1368 	skb_mark_not_on_list(elig_ack);
1369 
1370 	return elig_ack;
1371 }
1372 
1373 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1374 {
1375 	avg -= avg >> shift;
1376 	avg += sample >> shift;
1377 	return avg;
1378 }
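
/* Standard exponentially-weighted moving average with weight 1/2^shift for
 * the new sample.  Example: avg = 1000, sample = 2000, shift = 2 gives
 * 1000 - 250 + 500 = 1250, a quarter of the way towards the sample; the
 * shift of 8 used for avg_netoff below moves only 1/256 of the way per
 * sample.
 */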
1379 
1380 static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off)
1381 {
1382 	struct cake_sched_config *q = qd->config;
1383 
1384 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1385 		len -= off;
1386 
1387 	if (qd->max_netlen < len)
1388 		WRITE_ONCE(qd->max_netlen, len);
1389 	if (qd->min_netlen > len)
1390 		WRITE_ONCE(qd->min_netlen, len);
1391 
1392 	len += q->rate_overhead;
1393 
1394 	if (len < q->rate_mpu)
1395 		len = q->rate_mpu;
1396 
1397 	if (q->atm_mode == CAKE_ATM_ATM) {
1398 		len += 47;
1399 		len /= 48;
1400 		len *= 53;
1401 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1402 		/* Add one byte per 64 bytes or part thereof.
1403 		 * This is conservative and easier to calculate than the
1404 		 * precise value.
1405 		 */
1406 		len += (len + 63) / 64;
1407 	}
1408 
1409 	if (qd->max_adjlen < len)
1410 		WRITE_ONCE(qd->max_adjlen, len);
1411 	if (qd->min_adjlen > len)
1412 		WRITE_ONCE(qd->min_adjlen, len);
1413 
1414 	return len;
1415 }
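
/* Worked example of the framing compensation above (illustrative sizes): in
 * CAKE_ATM_ATM mode a 1500 byte packet becomes (1500 + 47) / 48 = 32 cells,
 * charged as 32 * 53 = 1696 bytes on the wire; in CAKE_ATM_PTM mode the same
 * packet is charged 1500 + (1500 + 63) / 64 = 1524 bytes, one extra byte for
 * every started 64-byte block to cover the 64b/65b encoding.
 */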
1416 
1417 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1418 {
1419 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1420 	unsigned int hdr_len, last_len = 0;
1421 	u32 off = skb_network_offset(skb);
1422 	u16 segs = qdisc_pkt_segs(skb);
1423 	u32 len = qdisc_pkt_len(skb);
1424 
1425 	WRITE_ONCE(q->avg_netoff, cake_ewma(q->avg_netoff, off << 16, 8));
1426 
1427 	if (segs == 1)
1428 		return cake_calc_overhead(q, len, off);
1429 
1430 	/* borrowed from qdisc_pkt_len_segs_init() */
1431 	if (!skb->encapsulation)
1432 		hdr_len = skb_transport_offset(skb);
1433 	else
1434 		hdr_len = skb_inner_transport_offset(skb);
1435 
1436 	/* + transport layer */
1437 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1438 						SKB_GSO_TCPV6))) {
1439 		const struct tcphdr *th;
1440 		struct tcphdr _tcphdr;
1441 
1442 		th = skb_header_pointer(skb, hdr_len,
1443 					sizeof(_tcphdr), &_tcphdr);
1444 		if (likely(th))
1445 			hdr_len += __tcp_hdrlen(th);
1446 	} else {
1447 		struct udphdr _udphdr;
1448 
1449 		if (skb_header_pointer(skb, hdr_len,
1450 				       sizeof(_udphdr), &_udphdr))
1451 			hdr_len += sizeof(struct udphdr);
1452 	}
1453 
1454 	len = shinfo->gso_size + hdr_len;
1455 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1456 
1457 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1458 		cake_calc_overhead(q, last_len, off));
1459 }
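
/* For a GSO super-packet the overhead is charged per segment.  Illustrative
 * example: a TCP skb with a 66 byte header, gso_size = 1448 and 10 segments
 * has skb->len = 66 + 10 * 1448 = 14546; each of the first nine segments is
 * costed as 1448 + 66 = 1514 bytes and the last as 14546 - 9 * 1448 = 1514,
 * so the shaper sees ten wire-sized packets instead of one oversized one.
 */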
1460 
1461 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1462 {
1463 	struct cake_heap_entry ii = q->overflow_heap[i];
1464 	struct cake_heap_entry jj = q->overflow_heap[j];
1465 
1466 	q->overflow_heap[i] = jj;
1467 	q->overflow_heap[j] = ii;
1468 
1469 	q->tins[ii.t].overflow_idx[ii.b] = j;
1470 	q->tins[jj.t].overflow_idx[jj.b] = i;
1471 }
1472 
1473 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1474 {
1475 	struct cake_heap_entry ii = q->overflow_heap[i];
1476 
1477 	return q->tins[ii.t].backlogs[ii.b];
1478 }
1479 
1480 static void cake_heapify(struct cake_sched_data *q, u16 i)
1481 {
1482 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1483 	u32 mb = cake_heap_get_backlog(q, i);
1484 	u32 m = i;
1485 
1486 	while (m < a) {
1487 		u32 l = m + m + 1;
1488 		u32 r = l + 1;
1489 
1490 		if (l < a) {
1491 			u32 lb = cake_heap_get_backlog(q, l);
1492 
1493 			if (lb > mb) {
1494 				m  = l;
1495 				mb = lb;
1496 			}
1497 		}
1498 
1499 		if (r < a) {
1500 			u32 rb = cake_heap_get_backlog(q, r);
1501 
1502 			if (rb > mb) {
1503 				m  = r;
1504 				mb = rb;
1505 			}
1506 		}
1507 
1508 		if (m != i) {
1509 			cake_heap_swap(q, i, m);
1510 			i = m;
1511 		} else {
1512 			break;
1513 		}
1514 	}
1515 }
1516 
1517 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1518 {
1519 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1520 		u16 p = (i - 1) >> 1;
1521 		u32 ib = cake_heap_get_backlog(q, i);
1522 		u32 pb = cake_heap_get_backlog(q, p);
1523 
1524 		if (ib > pb) {
1525 			cake_heap_swap(q, i, p);
1526 			i = p;
1527 		} else {
1528 			break;
1529 		}
1530 	}
1531 }
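
/* The overflow heap is an implicit binary max-heap over all
 * CAKE_MAX_TINS * CAKE_QUEUES queues, keyed on per-queue backlog: slot i has
 * its children at 2*i + 1 and 2*i + 2 and its parent at (i - 1) / 2, so the
 * root (slot 0) always names the fattest queue, which is what cake_drop()
 * below prunes when the buffer overflows.
 */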
1532 
1533 static int cake_advance_shaper(struct cake_sched_data *q,
1534 			       struct cake_tin_data *b,
1535 			       struct sk_buff *skb,
1536 			       ktime_t now, bool drop)
1537 {
1538 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1539 
1540 	/* charge packet bandwidth to this tin
1541 	 * and to the global shaper.
1542 	 */
1543 	if (q->rate_ns) {
1544 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1545 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1546 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1547 
1548 		if (ktime_before(b->time_next_packet, now))
1549 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1550 							   tin_dur);
1551 
1552 		else if (ktime_before(b->time_next_packet,
1553 				      ktime_add_ns(now, tin_dur)))
1554 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1555 
1556 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1557 						   global_dur);
1558 		if (!drop)
1559 			q->failsafe_next_packet = \
1560 				ktime_add_ns(q->failsafe_next_packet,
1561 					     failsafe_dur);
1562 	}
1563 	return len;
1564 }
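
/* The shaper works in units of transmission time: rate_ns and rate_shft are
 * derived from the configured rate elsewhere in this file such that
 * (len * rate_ns) >> rate_shft approximates len * NSEC_PER_SEC / byte_rate.
 * Rough example: at 100 Mbit/s (12.5 Mbyte/s) one byte costs 80 ns, so a
 * 1514 byte packet advances time_next_packet by about 121 us and, unless the
 * packet was dropped, the failsafe deadline by 1.5 times that.
 */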
1565 
1566 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1567 {
1568 	struct cake_sched_data *q = qdisc_priv(sch);
1569 	ktime_t now = ktime_get();
1570 	u32 idx = 0, tin = 0, len;
1571 	struct cake_heap_entry qq;
1572 	struct cake_tin_data *b;
1573 	struct cake_flow *flow;
1574 	struct sk_buff *skb;
1575 
1576 	if (!q->overflow_timeout) {
1577 		int i;
1578 		/* Build fresh max-heap */
1579 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
1580 			cake_heapify(q, i);
1581 	}
1582 	q->overflow_timeout = 65535;
1583 
1584 	/* select longest queue for pruning */
1585 	qq  = q->overflow_heap[0];
1586 	tin = qq.t;
1587 	idx = qq.b;
1588 
1589 	b = &q->tins[tin];
1590 	flow = &b->flows[idx];
1591 	skb = dequeue_head(flow);
1592 	if (unlikely(!skb)) {
1593 		/* heap has gone wrong, rebuild it next time */
1594 		q->overflow_timeout = 0;
1595 		return idx + (tin << 16);
1596 	}
1597 
1598 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1599 		WRITE_ONCE(b->unresponsive_flow_count,
1600 			   b->unresponsive_flow_count + 1);
1601 
1602 	len = qdisc_pkt_len(skb);
1603 	q->buffer_used      -= skb->truesize;
1604 	WRITE_ONCE(b->tin_backlog, b->tin_backlog - len);
1605 	WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] - len);
1606 	sch->qstats.backlog -= len;
1607 
1608 	WRITE_ONCE(flow->dropped, flow->dropped + 1);
1609 	WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1);
1610 
1611 	if (q->config->rate_flags & CAKE_FLAG_INGRESS)
1612 		cake_advance_shaper(q, b, skb, now, true);
1613 
1614 	qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT);
1615 	sch->q.qlen--;
1616 
1617 	cake_heapify(q, 0);
1618 
1619 	return idx + (tin << 16);
1620 }
1621 
1622 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1623 {
1624 	const int offset = skb_network_offset(skb);
1625 	u16 *buf, buf_;
1626 	u8 dscp;
1627 
1628 	switch (skb_protocol(skb, true)) {
1629 	case htons(ETH_P_IP):
1630 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1631 		if (unlikely(!buf))
1632 			return 0;
1633 
1634 		/* ToS is in the second byte of iphdr */
1635 		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1636 
1637 		if (wash && dscp) {
1638 			const int wlen = offset + sizeof(struct iphdr);
1639 
1640 			if (!pskb_may_pull(skb, wlen) ||
1641 			    skb_try_make_writable(skb, wlen))
1642 				return 0;
1643 
1644 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1645 		}
1646 
1647 		return dscp;
1648 
1649 	case htons(ETH_P_IPV6):
1650 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1651 		if (unlikely(!buf))
1652 			return 0;
1653 
1654 		/* Traffic class is in the first and second bytes of ipv6hdr */
1655 		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1656 
1657 		if (wash && dscp) {
1658 			const int wlen = offset + sizeof(struct ipv6hdr);
1659 
1660 			if (!pskb_may_pull(skb, wlen) ||
1661 			    skb_try_make_writable(skb, wlen))
1662 				return 0;
1663 
1664 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1665 		}
1666 
1667 		return dscp;
1668 
1669 	case htons(ETH_P_ARP):
1670 		return 0x38;  /* CS7 - Net Control */
1671 
1672 	default:
1673 		/* If there is no Diffserv field, treat as best-effort */
1674 		return 0;
1675 	}
1676 }
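
/* The dsfield helpers return the whole ToS / Traffic Class octet, so the
 * ">> 2" above strips the two ECN bits and leaves the 6-bit DSCP.  Example:
 * an EF-marked packet carries dsfield 0xb8, giving DSCP 0xb8 >> 2 = 46,
 * while the ARP special case returns 0x38 (56), i.e. CS7, directly.
 */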
1677 
1678 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1679 					     struct sk_buff *skb)
1680 {
1681 	struct cake_sched_data *qd = qdisc_priv(sch);
1682 	struct cake_sched_config *q = qd->config;
1683 	u32 tin, mark;
1684 	bool wash;
1685 	u8 dscp;
1686 
1687 	/* Tin selection: Default to diffserv-based selection, allow overriding
1688 	 * using firewall marks or skb->priority. Call DSCP parsing early if
1689 	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1690 	 */
1691 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1692 	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1693 	if (wash)
1694 		dscp = cake_handle_diffserv(skb, wash);
1695 
1696 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1697 		tin = 0;
1698 
1699 	else if (mark && mark <= qd->tin_cnt)
1700 		tin = qd->tin_order[mark - 1];
1701 
1702 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1703 		 TC_H_MIN(skb->priority) > 0 &&
1704 		 TC_H_MIN(skb->priority) <= qd->tin_cnt)
1705 		tin = qd->tin_order[TC_H_MIN(skb->priority) - 1];
1706 
1707 	else {
1708 		if (!wash)
1709 			dscp = cake_handle_diffserv(skb, wash);
1710 		tin = qd->tin_index[dscp];
1711 
1712 		if (unlikely(tin >= qd->tin_cnt))
1713 			tin = 0;
1714 	}
1715 
1716 	return &qd->tins[tin];
1717 }
1718 
1719 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1720 			 struct sk_buff *skb, int flow_mode, int *qerr)
1721 {
1722 	struct cake_sched_data *q = qdisc_priv(sch);
1723 	struct tcf_proto *filter;
1724 	struct tcf_result res;
1725 	u16 flow = 0, host = 0;
1726 	int result;
1727 
1728 	filter = rcu_dereference_bh(q->filter_list);
1729 	if (!filter)
1730 		goto hash;
1731 
1732 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1733 	result = tcf_classify(skb, NULL, filter, &res, false);
1734 
1735 	if (result >= 0) {
1736 #ifdef CONFIG_NET_CLS_ACT
1737 		switch (result) {
1738 		case TC_ACT_STOLEN:
1739 		case TC_ACT_QUEUED:
1740 		case TC_ACT_TRAP:
1741 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1742 			fallthrough;
1743 		case TC_ACT_SHOT:
1744 			return 0;
1745 		}
1746 #endif
1747 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1748 			flow = TC_H_MIN(res.classid);
1749 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1750 			host = TC_H_MAJ(res.classid) >> 16;
1751 	}
1752 hash:
1753 	*t = cake_select_tin(sch, skb);
1754 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1755 }
1756 
1757 static void cake_reconfigure(struct Qdisc *sch);
1758 
1759 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1760 			struct sk_buff **to_free)
1761 {
1762 	u32 idx, tin, prev_qlen, prev_backlog, drop_id;
1763 	struct cake_sched_data *q = qdisc_priv(sch);
1764 	int len = qdisc_pkt_len(skb), ret;
1765 	struct sk_buff *ack = NULL;
1766 	ktime_t now = ktime_get();
1767 	struct cake_tin_data *b;
1768 	struct cake_flow *flow;
1769 	bool same_flow = false;
1770 
1771 	/* choose flow to insert into */
1772 	idx = cake_classify(sch, &b, skb, q->config->flow_mode, &ret);
1773 	if (idx == 0) {
1774 		if (ret & __NET_XMIT_BYPASS)
1775 			qdisc_qstats_drop(sch);
1776 		__qdisc_drop(skb, to_free);
1777 		return ret;
1778 	}
1779 	tin = (u32)(b - q->tins);
1780 	idx--;
1781 	flow = &b->flows[idx];
1782 
1783 	/* ensure shaper state isn't stale */
1784 	if (!b->tin_backlog) {
1785 		if (ktime_before(b->time_next_packet, now))
1786 			b->time_next_packet = now;
1787 
1788 		if (!sch->q.qlen) {
1789 			if (ktime_before(q->time_next_packet, now)) {
1790 				q->failsafe_next_packet = now;
1791 				q->time_next_packet = now;
1792 			} else if (ktime_after(q->time_next_packet, now) &&
1793 				   ktime_after(q->failsafe_next_packet, now)) {
1794 				u64 next = \
1795 					min(ktime_to_ns(q->time_next_packet),
1796 					    ktime_to_ns(
1797 						   q->failsafe_next_packet));
1798 				sch->qstats.overlimits++;
1799 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1800 			}
1801 		}
1802 	}
1803 
1804 	if (unlikely(len > b->max_skblen))
1805 		WRITE_ONCE(b->max_skblen, len);
1806 
1807 	if (qdisc_pkt_segs(skb) > 1 && q->config->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1808 		struct sk_buff *segs, *nskb;
1809 		netdev_features_t features = netif_skb_features(skb);
1810 		unsigned int slen = 0, numsegs = 0;
1811 
1812 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1813 		if (IS_ERR_OR_NULL(segs))
1814 			return qdisc_drop(skb, sch, to_free);
1815 
1816 		skb_list_walk_safe(segs, segs, nskb) {
1817 			skb_mark_not_on_list(segs);
1818 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1819 			qdisc_skb_cb(segs)->pkt_segs = 1;
1820 			cobalt_set_enqueue_time(segs, now);
1821 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1822 									  segs);
1823 			flow_queue_add(flow, segs);
1824 
1825 			sch->q.qlen++;
1826 			numsegs++;
1827 			slen += segs->len;
1828 			q->buffer_used += segs->truesize;
1829 			WRITE_ONCE(b->packets, b->packets + 1);
1830 		}
1831 
1832 		/* stats */
1833 		sch->qstats.backlog += slen;
1834 		q->avg_window_bytes += slen;
1835 		WRITE_ONCE(b->bytes, b->bytes + slen);
1836 		WRITE_ONCE(b->tin_backlog, b->tin_backlog + slen);
1837 		WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] + slen);
1838 
1839 		qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
1840 		consume_skb(skb);
1841 	} else {
1842 		/* not splitting */
1843 		int ack_pkt_len = 0;
1844 
1845 		cobalt_set_enqueue_time(skb, now);
1846 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1847 		flow_queue_add(flow, skb);
1848 
1849 		if (q->config->ack_filter)
1850 			ack = cake_ack_filter(q, flow);
1851 
1852 		if (ack) {
1853 			WRITE_ONCE(b->ack_drops, b->ack_drops + 1);
1854 			sch->qstats.drops++;
1855 			ack_pkt_len = qdisc_pkt_len(ack);
1856 			WRITE_ONCE(b->bytes, b->bytes + ack_pkt_len);
1857 			q->buffer_used += skb->truesize - ack->truesize;
1858 			if (q->config->rate_flags & CAKE_FLAG_INGRESS)
1859 				cake_advance_shaper(q, b, ack, now, true);
1860 
1861 			qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len);
1862 			consume_skb(ack);
1863 		} else {
1864 			sch->q.qlen++;
1865 			q->buffer_used      += skb->truesize;
1866 		}
1867 
1868 		/* stats */
1869 		WRITE_ONCE(b->packets, b->packets + 1);
1870 		sch->qstats.backlog += len - ack_pkt_len;
1871 		q->avg_window_bytes += len - ack_pkt_len;
1872 		WRITE_ONCE(b->bytes, b->bytes + len - ack_pkt_len);
1873 		WRITE_ONCE(b->tin_backlog, b->tin_backlog + len - ack_pkt_len);
1874 		WRITE_ONCE(b->backlogs[idx], b->backlogs[idx] + len - ack_pkt_len);
1875 	}
1876 
1877 	if (q->overflow_timeout)
1878 		cake_heapify_up(q, b->overflow_idx[idx]);
1879 
1880 	/* incoming bandwidth capacity estimate */
1881 	if (q->config->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1882 		u64 packet_interval =
1883 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1884 
1885 		if (packet_interval > NSEC_PER_SEC)
1886 			packet_interval = NSEC_PER_SEC;
1887 
1888 		/* filter out short-term bursts, eg. wifi aggregation */
1889 		q->avg_packet_interval =
1890 			cake_ewma(q->avg_packet_interval,
1891 				  packet_interval,
1892 				  (packet_interval > q->avg_packet_interval ?
1893 					  2 : 8));
1894 
1895 		q->last_packet_time = now;
1896 
1897 		if (packet_interval > q->avg_packet_interval) {
1898 			u64 window_interval =
1899 				ktime_to_ns(ktime_sub(now,
1900 						      q->avg_window_begin));
1901 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1902 
1903 			b = div64_u64(b, window_interval);
1904 			WRITE_ONCE(q->avg_peak_bandwidth,
1905 				   cake_ewma(q->avg_peak_bandwidth, b,
1906 					     b > q->avg_peak_bandwidth ? 2 : 8));
1907 			q->avg_window_bytes = 0;
1908 			q->avg_window_begin = now;
1909 
1910 			if (ktime_after(now,
1911 					ktime_add_ms(q->last_reconfig_time,
1912 						     250))) {
1913 				q->config->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1914 				cake_reconfigure(sch);
1915 			}
1916 		}
1917 	} else {
1918 		q->avg_window_bytes = 0;
1919 		q->last_packet_time = now;
1920 	}
1921 
1922 	/* flowchain */
1923 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1924 		if (!flow->set) {
1925 			list_add_tail(&flow->flowchain, &b->new_flows);
1926 		} else {
1927 			WRITE_ONCE(b->decaying_flow_count, b->decaying_flow_count - 1);
1928 			list_move_tail(&flow->flowchain, &b->new_flows);
1929 		}
1930 		flow->set = CAKE_SET_SPARSE;
1931 		WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count + 1);
1932 
1933 		WRITE_ONCE(flow->deficit, cake_get_flow_quantum(b, flow, q->config->flow_mode));
1934 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1935 		/* this flow was empty, accounted as a sparse flow, but actually
1936 		 * in the bulk rotation.
1937 		 */
1938 		flow->set = CAKE_SET_BULK;
1939 		WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1);
1940 		WRITE_ONCE(b->bulk_flow_count, b->bulk_flow_count + 1);
1941 
1942 		cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
1943 		cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
1944 	}
1945 
1946 	if (q->buffer_used > q->buffer_max_used)
1947 		WRITE_ONCE(q->buffer_max_used, q->buffer_used);
1948 
1949 	if (q->buffer_used <= q->buffer_limit)
1950 		return NET_XMIT_SUCCESS;
1951 
1952 	prev_qlen = sch->q.qlen;
1953 	prev_backlog = sch->qstats.backlog;
1954 
1955 	while (q->buffer_used > q->buffer_limit) {
1956 		drop_id = cake_drop(sch, to_free);
1957 		if ((drop_id >> 16) == tin &&
1958 		    (drop_id & 0xFFFF) == idx)
1959 			same_flow = true;
1960 	}
1961 
1962 	prev_qlen -= sch->q.qlen;
1963 	prev_backlog -= sch->qstats.backlog;
1964 	b->drop_overlimit += prev_qlen;
1965 
1966 	if (same_flow) {
1967 		qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
1968 					  prev_backlog - len);
1969 		return NET_XMIT_CN;
1970 	}
1971 	qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
1972 	return NET_XMIT_SUCCESS;
1973 }
1974 
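/* Remove the head packet of the flow selected by q->cur_tin and q->cur_flow,
 * updating the per-flow, per-tin and qdisc backlog accounting and, while the
 * overflow heap is in use, rebalancing it.  Returns NULL if the flow queue
 * is empty.
 */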
1975 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1976 {
1977 	struct cake_sched_data *q = qdisc_priv(sch);
1978 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1979 	struct cake_flow *flow = &b->flows[q->cur_flow];
1980 	struct sk_buff *skb = NULL;
1981 	u32 len;
1982 
1983 	if (flow->head) {
1984 		skb = dequeue_head(flow);
1985 		len = qdisc_pkt_len(skb);
1986 		WRITE_ONCE(b->backlogs[q->cur_flow], b->backlogs[q->cur_flow] - len);
1987 		WRITE_ONCE(b->tin_backlog, b->tin_backlog - len);
1988 		sch->qstats.backlog      -= len;
1989 		q->buffer_used		 -= skb->truesize;
1990 		sch->q.qlen--;
1991 
1992 		if (q->overflow_timeout)
1993 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1994 	}
1995 	return skb;
1996 }
1997 
1998 /* Discard leftover packets from a tin no longer in use. */
1999 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
2000 {
2001 	struct cake_sched_data *q = qdisc_priv(sch);
2002 	struct sk_buff *skb;
2003 
2004 	q->cur_tin = tin;
2005 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
2006 		while (!!(skb = cake_dequeue_one(sch)))
2007 			kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
2008 }
2009 
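/* Main dequeue: enforce the global shaper, pick a tin (plain DRR between
 * tins in unlimited mode, otherwise the highest-priority tin that is due to
 * transmit or, failing that, the earliest-scheduled one), then serve flows
 * DRR++-style from the new/old/decaying rotations, consulting the COBALT AQM
 * for each packet pulled out.  Arms the qdisc watchdog whenever the shaper
 * or a decaying flow needs a future wake-up.
 */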
2010 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
2011 {
2012 	struct cake_sched_data *q = qdisc_priv(sch);
2013 	struct cake_tin_data *b = &q->tins[q->cur_tin];
2014 	enum qdisc_drop_reason reason;
2015 	ktime_t now = ktime_get();
2016 	struct cake_flow *flow;
2017 	struct list_head *head;
2018 	bool first_flow = true;
2019 	struct sk_buff *skb;
2020 	u64 delay;
2021 	u32 len;
2022 
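	/* For cake_mq, the configured rate is shared between all tx queues:
	 * every sync_time, count the sibling qdiscs that have a backlog or
	 * have been active since the last check, and split the base rate
	 * evenly among the active queues before shaping this one.
	 */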
2023 	if (q->config->is_shared && q->rate_ns &&
2024 	    now - q->last_checked_active >= q->config->sync_time) {
2025 		struct net_device *dev = qdisc_dev(sch);
2026 		struct cake_sched_data *other_priv;
2027 		u64 new_rate = q->config->rate_bps;
2028 		u64 other_qlen, other_last_active;
2029 		struct Qdisc *other_sch;
2030 		u32 num_active_qs = 1;
2031 		unsigned int ntx;
2032 
2033 		for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
2034 			other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
2035 			other_priv = qdisc_priv(other_sch);
2036 
2037 			if (other_priv == q)
2038 				continue;
2039 
2040 			other_qlen = READ_ONCE(other_sch->q.qlen);
2041 			other_last_active = READ_ONCE(other_priv->last_active);
2042 
2043 			if (other_qlen || other_last_active > q->last_checked_active)
2044 				num_active_qs++;
2045 		}
2046 
2047 		if (num_active_qs > 1)
2048 			new_rate = div64_u64(q->config->rate_bps, num_active_qs);
2049 
2050 		cake_configure_rates(sch, new_rate, true);
2051 		q->last_checked_active = now;
2052 		WRITE_ONCE(q->active_queues, num_active_qs);
2053 	}
2054 
2055 begin:
2056 	if (!sch->q.qlen)
2057 		return NULL;
2058 
2059 	/* global hard shaper */
2060 	if (ktime_after(q->time_next_packet, now) &&
2061 	    ktime_after(q->failsafe_next_packet, now)) {
2062 		u64 next = min(ktime_to_ns(q->time_next_packet),
2063 			       ktime_to_ns(q->failsafe_next_packet));
2064 
2065 		sch->qstats.overlimits++;
2066 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2067 		return NULL;
2068 	}
2069 
2070 	/* Choose a class to work on. */
2071 	if (!q->rate_ns) {
2072 		/* In unlimited mode, can't rely on shaper timings, just balance
2073 		 * with DRR
2074 		 */
2075 		bool wrapped = false, empty = true;
2076 
2077 		while (b->tin_deficit < 0 ||
2078 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
2079 			if (b->tin_deficit <= 0)
2080 				b->tin_deficit += b->tin_quantum;
2081 			if (b->sparse_flow_count + b->bulk_flow_count)
2082 				empty = false;
2083 
2084 			q->cur_tin++;
2085 			b++;
2086 			if (q->cur_tin >= q->tin_cnt) {
2087 				q->cur_tin = 0;
2088 				b = q->tins;
2089 
2090 				if (wrapped) {
2091 					/* It's possible for q->qlen to be
2092 					 * nonzero when we actually have no
2093 					 * packets anywhere.
2094 					 */
2095 					if (empty)
2096 						return NULL;
2097 				} else {
2098 					wrapped = true;
2099 				}
2100 			}
2101 		}
2102 	} else {
2103 		/* In shaped mode, choose the highest-priority tin that has
2104 		 * packets queued and is due to transmit or, failing that,
2105 		 * the earliest-scheduled tin with packets queued.
2106 		 */
2107 		ktime_t best_time = KTIME_MAX;
2108 		int tin, best_tin = 0;
2109 
2110 		for (tin = 0; tin < q->tin_cnt; tin++) {
2111 			b = q->tins + tin;
2112 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2113 				ktime_t time_to_pkt =
2114 					ktime_sub(b->time_next_packet, now);
2115 
2116 				if (ktime_to_ns(time_to_pkt) <= 0 ||
2117 				    ktime_compare(time_to_pkt,
2118 						  best_time) <= 0) {
2119 					best_time = time_to_pkt;
2120 					best_tin = tin;
2121 				}
2122 			}
2123 		}
2124 
2125 		q->cur_tin = best_tin;
2126 		b = q->tins + best_tin;
2127 
2128 		/* No point in going further if no packets to deliver. */
2129 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2130 			return NULL;
2131 	}
2132 
2133 retry:
2134 	/* service this class */
2135 	head = &b->decaying_flows;
2136 	if (!first_flow || list_empty(head)) {
2137 		head = &b->new_flows;
2138 		if (list_empty(head)) {
2139 			head = &b->old_flows;
2140 			if (unlikely(list_empty(head))) {
2141 				head = &b->decaying_flows;
2142 				if (unlikely(list_empty(head)))
2143 					goto begin;
2144 			}
2145 		}
2146 	}
2147 	flow = list_first_entry(head, struct cake_flow, flowchain);
2148 	q->cur_flow = flow - b->flows;
2149 	first_flow = false;
2150 
2151 	/* flow isolation (DRR++) */
2152 	if (flow->deficit <= 0) {
2153 		/* Keep all flows with deficits out of the sparse and decaying
2154 		 * rotations.  No non-empty flow can go into the decaying
2155 		 * rotation, so they can't get deficits
2156 		 */
2157 		if (flow->set == CAKE_SET_SPARSE) {
2158 			if (flow->head) {
2159 				WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1);
2160 				WRITE_ONCE(b->bulk_flow_count, b->bulk_flow_count + 1);
2161 
2162 				cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
2163 				cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
2164 
2165 				flow->set = CAKE_SET_BULK;
2166 			} else {
2167 				/* we've moved it to the bulk rotation for
2168 				 * correct deficit accounting but we still want
2169 				 * to count it as a sparse flow, not a bulk one.
2170 				 */
2171 				flow->set = CAKE_SET_SPARSE_WAIT;
2172 			}
2173 		}
2174 
2175 		WRITE_ONCE(flow->deficit,
2176 			   flow->deficit + cake_get_flow_quantum(b, flow, q->config->flow_mode));
2177 		list_move_tail(&flow->flowchain, &b->old_flows);
2178 
2179 		goto retry;
2180 	}
2181 
2182 	/* Retrieve a packet via the AQM */
2183 	while (1) {
2184 		skb = cake_dequeue_one(sch);
2185 		if (!skb) {
2186 			/* this queue was actually empty */
2187 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2188 				WRITE_ONCE(b->unresponsive_flow_count,
2189 					   b->unresponsive_flow_count - 1);
2190 
2191 			if (flow->cvars.p_drop || flow->cvars.count ||
2192 			    ktime_before(now, flow->cvars.drop_next)) {
2193 				/* keep in the flowchain until the state has
2194 				 * decayed to rest
2195 				 */
2196 				list_move_tail(&flow->flowchain,
2197 					       &b->decaying_flows);
2198 				if (flow->set == CAKE_SET_BULK) {
2199 					WRITE_ONCE(b->bulk_flow_count, b->bulk_flow_count - 1);
2200 
2201 					cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
2202 					cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
2203 
2204 					WRITE_ONCE(b->decaying_flow_count, b->decaying_flow_count + 1);
2205 				} else if (flow->set == CAKE_SET_SPARSE ||
2206 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2207 					WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1);
2208 					WRITE_ONCE(b->decaying_flow_count, b->decaying_flow_count + 1);
2209 				}
2210 				flow->set = CAKE_SET_DECAYING;
2211 			} else {
2212 				/* remove empty queue from the flowchain */
2213 				list_del_init(&flow->flowchain);
2214 				if (flow->set == CAKE_SET_SPARSE ||
2215 				    flow->set == CAKE_SET_SPARSE_WAIT) {
2216 					WRITE_ONCE(b->sparse_flow_count, b->sparse_flow_count - 1);
2217 				} else if (flow->set == CAKE_SET_BULK) {
2218 					WRITE_ONCE(b->bulk_flow_count, b->bulk_flow_count - 1);
2219 
2220 					cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
2221 					cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
2222 				} else {
2223 					WRITE_ONCE(b->decaying_flow_count, b->decaying_flow_count - 1);
2224 				}
2225 				flow->set = CAKE_SET_NONE;
2226 			}
2227 			goto begin;
2228 		}
2229 
2230 		reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2231 					    (b->bulk_flow_count *
2232 					     !!(q->config->rate_flags &
2233 						CAKE_FLAG_INGRESS)));
2234 		/* Last packet in queue may be marked, shouldn't be dropped */
2235 		if (reason == QDISC_DROP_UNSPEC || !flow->head)
2236 			break;
2237 
2238 		/* drop this packet, get another one */
2239 		if (q->config->rate_flags & CAKE_FLAG_INGRESS) {
2240 			len = cake_advance_shaper(q, b, skb,
2241 						  now, true);
2242 			WRITE_ONCE(flow->deficit, flow->deficit - len);
2243 			b->tin_deficit -= len;
2244 		}
2245 		WRITE_ONCE(flow->dropped, flow->dropped + 1);
2246 		WRITE_ONCE(b->tin_dropped, b->tin_dropped + 1);
2247 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2248 		qdisc_qstats_drop(sch);
2249 		qdisc_dequeue_drop(sch, skb, reason);
2250 		if (q->config->rate_flags & CAKE_FLAG_INGRESS)
2251 			goto retry;
2252 	}
2253 
2254 	WRITE_ONCE(b->tin_ecn_mark, b->tin_ecn_mark + !!flow->cvars.ecn_marked);
2255 	qdisc_bstats_update(sch, skb);
2256 	WRITE_ONCE(q->last_active, now);
2257 
2258 	/* collect delay stats */
2259 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2260 	WRITE_ONCE(b->avge_delay, cake_ewma(b->avge_delay, delay, 8));
2261 	WRITE_ONCE(b->peak_delay,
2262 		   cake_ewma(b->peak_delay, delay,
2263 			     delay > b->peak_delay ? 2 : 8));
2264 	WRITE_ONCE(b->base_delay,
2265 		   cake_ewma(b->base_delay, delay,
2266 			     delay < b->base_delay ? 2 : 8));
2267 
2268 	len = cake_advance_shaper(q, b, skb, now, false);
2269 	WRITE_ONCE(flow->deficit, flow->deficit - len);
2270 	b->tin_deficit -= len;
2271 
2272 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2273 		u64 next = min(ktime_to_ns(q->time_next_packet),
2274 			       ktime_to_ns(q->failsafe_next_packet));
2275 
2276 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2277 	} else if (!sch->q.qlen) {
2278 		int i;
2279 
2280 		for (i = 0; i < q->tin_cnt; i++) {
2281 			if (q->tins[i].decaying_flow_count) {
2282 				ktime_t next =
2283 					ktime_add_ns(now,
2284 						     q->tins[i].cparams.target);
2285 
2286 				qdisc_watchdog_schedule_ns(&q->watchdog,
2287 							   ktime_to_ns(next));
2288 				break;
2289 			}
2290 		}
2291 	}
2292 
2293 	if (q->overflow_timeout)
2294 		q->overflow_timeout--;
2295 
2296 	return skb;
2297 }
2298 
2299 static void cake_reset(struct Qdisc *sch)
2300 {
2301 	struct cake_sched_data *q = qdisc_priv(sch);
2302 	u32 c;
2303 
2304 	if (!q->tins)
2305 		return;
2306 
2307 	for (c = 0; c < CAKE_MAX_TINS; c++)
2308 		cake_clear_tin(sch, c);
2309 }
2310 
2311 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2312 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2313 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2314 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2315 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2316 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2317 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2318 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2319 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2320 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2321 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2322 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2323 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2324 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2325 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2326 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2327 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2328 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2329 };
2330 
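/* Program one tin's shaper and AQM parameters.  The byte rate is converted
 * into a nanoseconds-per-byte value scaled up by 2^rate_shft so the shaper
 * can compute serialisation times with a multiply and a shift; the per-flow
 * quantum scales with the rate, clamped to the 300..1514 byte range.  The
 * Codel target is stretched to at least 1.5x the time needed to transmit
 * one MTU of data, and the interval is kept at no less than twice the
 * target.
 */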
2331 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2332 			  u64 target_ns, u64 rtt_est_ns)
2333 {
2334 	/* convert byte-rate into time-per-byte
2335 	 * so it will always unwedge in reasonable time.
2336 	 */
2337 	static const u64 MIN_RATE = 64;
2338 	u32 byte_target = mtu;
2339 	u64 byte_target_ns;
2340 	u8  rate_shft = 0;
2341 	u64 rate_ns = 0;
2342 
2343 	if (rate) {
2344 		WRITE_ONCE(b->flow_quantum,
2345 			   max(min(rate >> 12, 1514ULL), 300ULL));
2346 		rate_shft = 34;
2347 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2348 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2349 		while (!!(rate_ns >> 34)) {
2350 			rate_ns >>= 1;
2351 			rate_shft--;
2352 		}
2353 	} else {
2354 		/* else unlimited, ie. zero delay */
2355 		WRITE_ONCE(b->flow_quantum, 1514);
2356 	}
2357 	WRITE_ONCE(b->tin_rate_bps, rate);
2358 	b->tin_rate_ns   = rate_ns;
2359 	b->tin_rate_shft = rate_shft;
2360 
2361 	if (mtu == 0)
2362 		return;
2363 
2364 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2365 
2366 	WRITE_ONCE(b->cparams.target,
2367 		   max((byte_target_ns * 3) / 2, target_ns));
2368 	WRITE_ONCE(b->cparams.interval,
2369 		   max(rtt_est_ns + b->cparams.target - target_ns,
2370 		       b->cparams.target * 2));
2371 	b->cparams.mtu_time = byte_target_ns;
2372 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2373 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2374 }
2375 
2376 static int cake_config_besteffort(struct Qdisc *sch, u64 rate, u32 mtu)
2377 {
2378 	struct cake_sched_data *q = qdisc_priv(sch);
2379 	struct cake_tin_data *b = &q->tins[0];
2380 
2381 	q->tin_cnt = 1;
2382 
2383 	q->tin_index = besteffort;
2384 	q->tin_order = normal_order;
2385 
2386 	cake_set_rate(b, rate, mtu,
2387 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2388 	b->tin_quantum = 65535;
2389 
2390 	return 0;
2391 }
2392 
2393 static int cake_config_precedence(struct Qdisc *sch, u64 rate, u32 mtu)
2394 {
2395 	/* convert high-level (user visible) parameters into internal format */
2396 	struct cake_sched_data *q = qdisc_priv(sch);
2397 	u32 quantum = 256;
2398 	u32 i;
2399 
2400 	q->tin_cnt = 8;
2401 	q->tin_index = precedence;
2402 	q->tin_order = normal_order;
2403 
2404 	for (i = 0; i < q->tin_cnt; i++) {
2405 		struct cake_tin_data *b = &q->tins[i];
2406 
2407 		cake_set_rate(b, rate, mtu, us_to_ns(q->config->target),
2408 			      us_to_ns(q->config->interval));
2409 
2410 		b->tin_quantum = max_t(u16, 1U, quantum);
2411 
2412 		/* calculate next class's parameters */
2413 		rate  *= 7;
2414 		rate >>= 3;
2415 
2416 		quantum  *= 7;
2417 		quantum >>= 3;
2418 	}
2419 
2420 	return 0;
2421 }
2422 
2423 /*	List of known Diffserv codepoints:
2424  *
2425  *	Default Forwarding (DF/CS0) - Best Effort
2426  *	Max Throughput (TOS2)
2427  *	Min Delay (TOS4)
2428  *	LLT "La" (TOS5)
2429  *	Assured Forwarding 1 (AF1x) - x3
2430  *	Assured Forwarding 2 (AF2x) - x3
2431  *	Assured Forwarding 3 (AF3x) - x3
2432  *	Assured Forwarding 4 (AF4x) - x3
2433  *	Precedence Class 1 (CS1)
2434  *	Precedence Class 2 (CS2)
2435  *	Precedence Class 3 (CS3)
2436  *	Precedence Class 4 (CS4)
2437  *	Precedence Class 5 (CS5)
2438  *	Precedence Class 6 (CS6)
2439  *	Precedence Class 7 (CS7)
2440  *	Voice Admit (VA)
2441  *	Expedited Forwarding (EF)
2442  *	Lower Effort (LE)
2443  *
2444  *	Total 26 codepoints.
2445  */
2446 
2447 /*	List of traffic classes in RFC 4594, updated by RFC 8622:
2448  *		(roughly descending order of contended priority)
2449  *		(roughly ascending order of uncontended throughput)
2450  *
2451  *	Network Control (CS6,CS7)      - routing traffic
2452  *	Telephony (EF,VA)              - aka. VoIP streams
2453  *	Signalling (CS5)               - VoIP setup
2454  *	Multimedia Conferencing (AF4x) - aka. video calls
2455  *	Realtime Interactive (CS4)     - eg. games
2456  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2457  *	Broadcast Video (CS3)
2458  *	Low-Latency Data (AF2x,TOS4)      - eg. database
2459  *	Ops, Admin, Management (CS2)      - eg. ssh
2460  *	Standard Service (DF & unrecognised codepoints)
2461  *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2462  *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2463  *
2464  *	Total 12 traffic classes.
2465  */
2466 
2467 static int cake_config_diffserv8(struct Qdisc *sch, u64 rate, u32 mtu)
2468 {
2469 /*	Pruned list of traffic classes for typical applications:
2470  *
2471  *		Network Control          (CS6, CS7)
2472  *		Minimum Latency          (EF, VA, CS5, CS4)
2473  *		Interactive Shell        (CS2)
2474  *		Low Latency Transactions (AF2x, TOS4)
2475  *		Video Streaming          (AF4x, AF3x, CS3)
2476  *		Bog Standard             (DF etc.)
2477  *		High Throughput          (AF1x, TOS2, CS1)
2478  *		Background Traffic       (LE)
2479  *
2480  *		Total 8 traffic classes.
2481  */
2482 
2483 	struct cake_sched_data *q = qdisc_priv(sch);
2484 	u32 quantum = 256;
2485 	u32 i;
2486 
2487 	q->tin_cnt = 8;
2488 
2489 	/* codepoint to class mapping */
2490 	q->tin_index = diffserv8;
2491 	q->tin_order = normal_order;
2492 
2493 	/* class characteristics */
2494 	for (i = 0; i < q->tin_cnt; i++) {
2495 		struct cake_tin_data *b = &q->tins[i];
2496 
2497 		cake_set_rate(b, rate, mtu, us_to_ns(q->config->target),
2498 			      us_to_ns(q->config->interval));
2499 
2500 		b->tin_quantum = max_t(u16, 1U, quantum);
2501 
2502 		/* calculate next class's parameters */
2503 		rate  *= 7;
2504 		rate >>= 3;
2505 
2506 		quantum  *= 7;
2507 		quantum >>= 3;
2508 	}
2509 
2510 	return 0;
2511 }
2512 
2513 static int cake_config_diffserv4(struct Qdisc *sch, u64 rate, u32 mtu)
2514 {
2515 /*  Further pruned list of traffic classes for four-class system:
2516  *
2517  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2518  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2519  *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2520  *	    Background Traffic (LE, CS1)
2521  *
2522  *		Total 4 traffic classes.
2523  */
2524 
2525 	struct cake_sched_data *q = qdisc_priv(sch);
2526 	u32 quantum = 1024;
2527 
2528 	q->tin_cnt = 4;
2529 
2530 	/* codepoint to class mapping */
2531 	q->tin_index = diffserv4;
2532 	q->tin_order = bulk_order;
2533 
2534 	/* class characteristics */
2535 	cake_set_rate(&q->tins[0], rate, mtu,
2536 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2537 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2538 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2539 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2540 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2541 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2542 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2543 
2544 	/* bandwidth-sharing weights */
2545 	q->tins[0].tin_quantum = quantum;
2546 	q->tins[1].tin_quantum = quantum >> 4;
2547 	q->tins[2].tin_quantum = quantum >> 1;
2548 	q->tins[3].tin_quantum = quantum >> 2;
2549 
2550 	return 0;
2551 }
2552 
2553 static int cake_config_diffserv3(struct Qdisc *sch, u64 rate, u32 mtu)
2554 {
2555 /*  Simplified Diffserv structure with 3 tins.
2556  *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2557  *		Best Effort
2558  *		Low Priority		(LE, CS1)
2559  */
2560 	struct cake_sched_data *q = qdisc_priv(sch);
2561 	u32 quantum = 1024;
2562 
2563 	q->tin_cnt = 3;
2564 
2565 	/* codepoint to class mapping */
2566 	q->tin_index = diffserv3;
2567 	q->tin_order = bulk_order;
2568 
2569 	/* class characteristics */
2570 	cake_set_rate(&q->tins[0], rate, mtu,
2571 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2572 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2573 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2574 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2575 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2576 
2577 	/* bandwidth-sharing weights */
2578 	q->tins[0].tin_quantum = quantum;
2579 	q->tins[1].tin_quantum = quantum >> 4;
2580 	q->tins[2].tin_quantum = quantum >> 2;
2581 
2582 	return 0;
2583 }
2584 
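/* Apply @rate to all tins according to the configured Diffserv mode.  When
 * @rate_adjust is set (the periodic cake_mq rebalance), an MTU of zero makes
 * cake_set_rate() skip the AQM target/interval recalculation so only the
 * shaper rates and flow quanta are updated.  Tins beyond tin_cnt are
 * flushed, and the global shaper rate is taken from the first tin, which is
 * always configured at the full rate.
 */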
2585 static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust)
2586 {
2587 	u32 mtu = likely(rate_adjust) ? 0 : psched_mtu(qdisc_dev(sch));
2588 	struct cake_sched_data *qd = qdisc_priv(sch);
2589 	struct cake_sched_config *q = qd->config;
2590 	int c, ft;
2591 
2592 	switch (q->tin_mode) {
2593 	case CAKE_DIFFSERV_BESTEFFORT:
2594 		ft = cake_config_besteffort(sch, rate, mtu);
2595 		break;
2596 
2597 	case CAKE_DIFFSERV_PRECEDENCE:
2598 		ft = cake_config_precedence(sch, rate, mtu);
2599 		break;
2600 
2601 	case CAKE_DIFFSERV_DIFFSERV8:
2602 		ft = cake_config_diffserv8(sch, rate, mtu);
2603 		break;
2604 
2605 	case CAKE_DIFFSERV_DIFFSERV4:
2606 		ft = cake_config_diffserv4(sch, rate, mtu);
2607 		break;
2608 
2609 	case CAKE_DIFFSERV_DIFFSERV3:
2610 	default:
2611 		ft = cake_config_diffserv3(sch, rate, mtu);
2612 		break;
2613 	}
2614 
2615 	for (c = qd->tin_cnt; c < CAKE_MAX_TINS; c++) {
2616 		cake_clear_tin(sch, c);
2617 		qd->tins[c].cparams.mtu_time = qd->tins[ft].cparams.mtu_time;
2618 	}
2619 
2620 	qd->rate_ns   = qd->tins[ft].tin_rate_ns;
2621 	qd->rate_shft = qd->tins[ft].tin_rate_shft;
2622 }
2623 
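/* Recompute derived state after a configuration change: per-tin shaper and
 * AQM parameters plus the memory limit.  Unless an explicit memory limit is
 * configured, the limit defaults to roughly four rtt intervals' worth of
 * traffic at the shaped rate with a 4MB floor, and is finally capped at the
 * packet limit times the device MTU (or the explicit limit, if larger).
 */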
2624 static void cake_reconfigure(struct Qdisc *sch)
2625 {
2626 	struct cake_sched_data *qd = qdisc_priv(sch);
2627 	struct cake_sched_config *q = qd->config;
2628 	u32 buffer_limit;
2629 
2630 	cake_configure_rates(sch, qd->config->rate_bps, false);
2631 
2632 	if (q->buffer_config_limit) {
2633 		buffer_limit = q->buffer_config_limit;
2634 	} else if (q->rate_bps) {
2635 		u64 t = q->rate_bps * q->interval;
2636 
2637 		do_div(t, USEC_PER_SEC / 4);
2638 		buffer_limit = max_t(u32, t, 4U << 20);
2639 	} else {
2640 		buffer_limit = ~0;
2641 	}
2642 
2643 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2644 
2645 	WRITE_ONCE(qd->buffer_limit,
2646 		   min(buffer_limit,
2647 		       max(sch->limit * psched_mtu(qdisc_dev(sch)),
2648 			   q->buffer_config_limit)));
2649 }
2650 
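/* Parse netlink attributes into a cake_sched_config.  Shared between the
 * plain cake qdisc and cake_mq, this only updates the config struct; it
 * flags overhead-related changes so callers can reset the min/max length
 * tracking before re-running cake_reconfigure() on the affected qdiscs.
 */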
2651 static int cake_config_change(struct cake_sched_config *q, struct nlattr *opt,
2652 			      struct netlink_ext_ack *extack, bool *overhead_changed)
2653 {
2654 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2655 	u16 rate_flags = q->rate_flags;
2656 	u8 flow_mode = q->flow_mode;
2657 	int err;
2658 
2659 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2660 					  extack);
2661 	if (err < 0)
2662 		return err;
2663 
2664 	if (tb[TCA_CAKE_NAT]) {
2665 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2666 		flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2667 		flow_mode |= CAKE_FLOW_NAT_FLAG *
2668 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2669 #else
2670 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2671 				    "No conntrack support in kernel");
2672 		return -EOPNOTSUPP;
2673 #endif
2674 	}
2675 
2676 	if (tb[TCA_CAKE_AUTORATE]) {
2677 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) {
2678 			if (q->is_shared) {
2679 				NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_AUTORATE],
2680 						    "Can't use autorate-ingress with cake_mq");
2681 				return -EOPNOTSUPP;
2682 			}
2683 			rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2684 		} else {
2685 			rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2686 		}
2687 	}
2688 
2689 	if (tb[TCA_CAKE_BASE_RATE64])
2690 		WRITE_ONCE(q->rate_bps,
2691 			   nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));
2692 
2693 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2694 		WRITE_ONCE(q->tin_mode,
2695 			   nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));
2696 
2697 	if (tb[TCA_CAKE_WASH]) {
2698 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2699 			rate_flags |= CAKE_FLAG_WASH;
2700 		else
2701 			rate_flags &= ~CAKE_FLAG_WASH;
2702 	}
2703 
2704 	if (tb[TCA_CAKE_FLOW_MODE])
2705 		flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) |
2706 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2707 					CAKE_FLOW_MASK));
2708 
2709 	if (tb[TCA_CAKE_ATM])
2710 		WRITE_ONCE(q->atm_mode,
2711 			   nla_get_u32(tb[TCA_CAKE_ATM]));
2712 
2713 	if (tb[TCA_CAKE_OVERHEAD]) {
2714 		WRITE_ONCE(q->rate_overhead,
2715 			   nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
2716 		rate_flags |= CAKE_FLAG_OVERHEAD;
2717 		*overhead_changed = true;
2718 	}
2719 
2720 	if (tb[TCA_CAKE_RAW]) {
2721 		rate_flags &= ~CAKE_FLAG_OVERHEAD;
2722 		*overhead_changed = true;
2723 	}
2724 
2725 	if (tb[TCA_CAKE_MPU])
2726 		WRITE_ONCE(q->rate_mpu,
2727 			   nla_get_u32(tb[TCA_CAKE_MPU]));
2728 
2729 	if (tb[TCA_CAKE_RTT]) {
2730 		u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2731 
2732 		WRITE_ONCE(q->interval, max(interval, 1U));
2733 	}
2734 
2735 	if (tb[TCA_CAKE_TARGET]) {
2736 		u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2737 
2738 		WRITE_ONCE(q->target, max(target, 1U));
2739 	}
2740 
2741 	if (tb[TCA_CAKE_INGRESS]) {
2742 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2743 			rate_flags |= CAKE_FLAG_INGRESS;
2744 		else
2745 			rate_flags &= ~CAKE_FLAG_INGRESS;
2746 	}
2747 
2748 	if (tb[TCA_CAKE_ACK_FILTER])
2749 		WRITE_ONCE(q->ack_filter,
2750 			   nla_get_u32(tb[TCA_CAKE_ACK_FILTER]));
2751 
2752 	if (tb[TCA_CAKE_MEMORY])
2753 		WRITE_ONCE(q->buffer_config_limit,
2754 			   nla_get_u32(tb[TCA_CAKE_MEMORY]));
2755 
2756 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2757 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2758 			rate_flags |= CAKE_FLAG_SPLIT_GSO;
2759 		else
2760 			rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2761 	}
2762 
2763 	if (tb[TCA_CAKE_FWMARK]) {
2764 		WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK]));
2765 		WRITE_ONCE(q->fwmark_shft,
2766 			   q->fwmark_mask ? __ffs(q->fwmark_mask) : 0);
2767 	}
2768 
2769 	WRITE_ONCE(q->rate_flags, rate_flags);
2770 	WRITE_ONCE(q->flow_mode, flow_mode);
2771 
2772 	return 0;
2773 }
2774 
2775 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2776 		       struct netlink_ext_ack *extack)
2777 {
2778 	struct cake_sched_data *qd = qdisc_priv(sch);
2779 	struct cake_sched_config *q = qd->config;
2780 	bool overhead_changed = false;
2781 	int ret;
2782 
2783 	if (q->is_shared) {
2784 		NL_SET_ERR_MSG(extack, "can't reconfigure cake_mq sub-qdiscs");
2785 		return -EOPNOTSUPP;
2786 	}
2787 
2788 	ret = cake_config_change(q, opt, extack, &overhead_changed);
2789 	if (ret)
2790 		return ret;
2791 
2792 	if (overhead_changed) {
2793 		WRITE_ONCE(qd->max_netlen, 0);
2794 		WRITE_ONCE(qd->max_adjlen, 0);
2795 		WRITE_ONCE(qd->min_netlen, ~0);
2796 		WRITE_ONCE(qd->min_adjlen, ~0);
2797 	}
2798 
2799 	if (qd->tins) {
2800 		sch_tree_lock(sch);
2801 		cake_reconfigure(sch);
2802 		sch_tree_unlock(sch);
2803 	}
2804 
2805 	return 0;
2806 }
2807 
2808 static void cake_destroy(struct Qdisc *sch)
2809 {
2810 	struct cake_sched_data *q = qdisc_priv(sch);
2811 
2812 	qdisc_watchdog_cancel(&q->watchdog);
2813 	tcf_block_put(q->block);
2814 	kvfree(q->tins);
2815 }
2816 
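/* Default configuration: diffserv3 tins, triple-isolate flow hashing,
 * unlimited rate, 100ms interval with a 5ms target, GSO splitting enabled,
 * and a 200us rate re-sync period for the cake_mq case.
 */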
2817 static void cake_config_init(struct cake_sched_config *q, bool is_shared)
2818 {
2819 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2820 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2821 
2822 	q->rate_bps = 0; /* unlimited by default */
2823 
2824 	q->interval = 100000; /* 100ms default */
2825 	q->target   =   5000; /* 5ms: codel RFC argues
2826 			       * for 5 to 10% of interval
2827 			       */
2828 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2829 	q->is_shared = is_shared;
2830 	q->sync_time = 200 * NSEC_PER_USEC;
2831 }
2832 
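/* Instantiate a standalone cake qdisc: apply any initial options, set up the
 * classifier block, build the quantum division table, allocate the tin and
 * flow state together with the overflow heap, then derive the runtime
 * parameters from the configuration via cake_reconfigure().
 */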
2833 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2834 		     struct netlink_ext_ack *extack)
2835 {
2836 	struct cake_sched_data *qd = qdisc_priv(sch);
2837 	struct cake_sched_config *q = &qd->initial_config;
2838 	int i, j, err;
2839 
2840 	cake_config_init(q, false);
2841 
2842 	sch->limit = 10240;
2843 	sch->flags |= TCQ_F_DEQUEUE_DROPS;
2844 
2845 	qd->cur_tin = 0;
2846 	qd->cur_flow  = 0;
2847 	qd->config = q;
2848 
2849 	qdisc_watchdog_init(&qd->watchdog, sch);
2850 
2851 	if (opt) {
2852 		err = cake_change(sch, opt, extack);
2853 		if (err)
2854 			return err;
2855 	}
2856 
2857 	err = tcf_block_get(&qd->block, &qd->filter_list, sch, extack);
2858 	if (err)
2859 		return err;
2860 
2861 	quantum_div[0] = ~0;
2862 	for (i = 1; i <= CAKE_QUEUES; i++)
2863 		quantum_div[i] = 65535 / i;
2864 
2865 	qd->tins = kvzalloc_objs(struct cake_tin_data, CAKE_MAX_TINS);
2866 	if (!qd->tins)
2867 		return -ENOMEM;
2868 
2869 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2870 		struct cake_tin_data *b = qd->tins + i;
2871 
2872 		INIT_LIST_HEAD(&b->new_flows);
2873 		INIT_LIST_HEAD(&b->old_flows);
2874 		INIT_LIST_HEAD(&b->decaying_flows);
2875 		b->sparse_flow_count = 0;
2876 		b->bulk_flow_count = 0;
2877 		b->decaying_flow_count = 0;
2878 
2879 		for (j = 0; j < CAKE_QUEUES; j++) {
2880 			struct cake_flow *flow = b->flows + j;
2881 			u32 k = j * CAKE_MAX_TINS + i;
2882 
2883 			INIT_LIST_HEAD(&flow->flowchain);
2884 			cobalt_vars_init(&flow->cvars);
2885 
2886 			qd->overflow_heap[k].t = i;
2887 			qd->overflow_heap[k].b = j;
2888 			b->overflow_idx[j] = k;
2889 		}
2890 	}
2891 
2892 	cake_reconfigure(sch);
2893 	qd->avg_peak_bandwidth = q->rate_bps;
2894 	qd->min_netlen = ~0;
2895 	qd->min_adjlen = ~0;
2896 	qd->active_queues = 0;
2897 	qd->last_checked_active = 0;
2898 
2899 	return 0;
2900 }
2901 
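/* Point a per-queue cake instance at the shared cake_mq configuration and
 * re-derive its rates and limits from it.
 */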
2902 static void cake_config_replace(struct Qdisc *sch, struct cake_sched_config *cfg)
2903 {
2904 	struct cake_sched_data *qd = qdisc_priv(sch);
2905 
2906 	qd->config = cfg;
2907 	cake_reconfigure(sch);
2908 }
2909 
2910 static int cake_config_dump(struct cake_sched_config *q, struct sk_buff *skb)
2911 {
2912 	struct nlattr *opts;
2913 	u16 rate_flags;
2914 	u8 flow_mode;
2915 
2916 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2917 	if (!opts)
2918 		goto nla_put_failure;
2919 
2920 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64,
2921 			      READ_ONCE(q->rate_bps), TCA_CAKE_PAD))
2922 		goto nla_put_failure;
2923 
2924 	flow_mode = READ_ONCE(q->flow_mode);
2925 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK))
2926 		goto nla_put_failure;
2927 
2928 	if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval)))
2929 		goto nla_put_failure;
2930 
2931 	if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target)))
2932 		goto nla_put_failure;
2933 
2934 	if (nla_put_u32(skb, TCA_CAKE_MEMORY,
2935 			READ_ONCE(q->buffer_config_limit)))
2936 		goto nla_put_failure;
2937 
2938 	rate_flags = READ_ONCE(q->rate_flags);
2939 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2940 			!!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2941 		goto nla_put_failure;
2942 
2943 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2944 			!!(rate_flags & CAKE_FLAG_INGRESS)))
2945 		goto nla_put_failure;
2946 
2947 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter)))
2948 		goto nla_put_failure;
2949 
2950 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2951 			!!(flow_mode & CAKE_FLOW_NAT_FLAG)))
2952 		goto nla_put_failure;
2953 
2954 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode)))
2955 		goto nla_put_failure;
2956 
2957 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2958 			!!(rate_flags & CAKE_FLAG_WASH)))
2959 		goto nla_put_failure;
2960 
2961 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead)))
2962 		goto nla_put_failure;
2963 
2964 	if (!(rate_flags & CAKE_FLAG_OVERHEAD))
2965 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2966 			goto nla_put_failure;
2967 
2968 	if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode)))
2969 		goto nla_put_failure;
2970 
2971 	if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu)))
2972 		goto nla_put_failure;
2973 
2974 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2975 			!!(rate_flags & CAKE_FLAG_SPLIT_GSO)))
2976 		goto nla_put_failure;
2977 
2978 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask)))
2979 		goto nla_put_failure;
2980 
2981 	return nla_nest_end(skb, opts);
2982 
2983 nla_put_failure:
2984 	return -1;
2985 }
2986 
2987 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2988 {
2989 	struct cake_sched_data *qd = qdisc_priv(sch);
2990 
2991 	return cake_config_dump(qd->config, skb);
2992 }
2993 
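/* Dump the global statistics followed by one nested block per active tin,
 * all inside a TCA_STATS_APP attribute.
 */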
2994 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2995 {
2996 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2997 	struct cake_sched_data *q = qdisc_priv(sch);
2998 	struct nlattr *tstats, *ts;
2999 	int i;
3000 
3001 	if (!stats)
3002 		return -1;
3003 
3004 #define PUT_STAT_U32(attr, data) do {				       \
3005 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3006 			goto nla_put_failure;			       \
3007 	} while (0)
3008 #define PUT_STAT_U64(attr, data) do {				       \
3009 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
3010 					data, TCA_CAKE_STATS_PAD)) \
3011 			goto nla_put_failure;			       \
3012 	} while (0)
3013 
3014 	PUT_STAT_U64(CAPACITY_ESTIMATE64, READ_ONCE(q->avg_peak_bandwidth));
3015 	PUT_STAT_U32(MEMORY_LIMIT, READ_ONCE(q->buffer_limit));
3016 	PUT_STAT_U32(MEMORY_USED, READ_ONCE(q->buffer_max_used));
3017 	PUT_STAT_U32(AVG_NETOFF, ((READ_ONCE(q->avg_netoff) + 0x8000) >> 16));
3018 	PUT_STAT_U32(MAX_NETLEN, READ_ONCE(q->max_netlen));
3019 	PUT_STAT_U32(MAX_ADJLEN, READ_ONCE(q->max_adjlen));
3020 	PUT_STAT_U32(MIN_NETLEN, READ_ONCE(q->min_netlen));
3021 	PUT_STAT_U32(MIN_ADJLEN, READ_ONCE(q->min_adjlen));
3022 	PUT_STAT_U32(ACTIVE_QUEUES, READ_ONCE(q->active_queues));
3023 
3024 #undef PUT_STAT_U32
3025 #undef PUT_STAT_U64
3026 
3027 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
3028 	if (!tstats)
3029 		goto nla_put_failure;
3030 
3031 #define PUT_TSTAT_U32(attr, data) do {					\
3032 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
3033 			goto nla_put_failure;				\
3034 	} while (0)
3035 #define PUT_TSTAT_U64(attr, data) do {					\
3036 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
3037 					data, TCA_CAKE_TIN_STATS_PAD))	\
3038 			goto nla_put_failure;				\
3039 	} while (0)
3040 
3041 	for (i = 0; i < q->tin_cnt; i++) {
3042 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3043 
3044 		ts = nla_nest_start_noflag(d->skb, i + 1);
3045 		if (!ts)
3046 			goto nla_put_failure;
3047 
3048 		PUT_TSTAT_U64(THRESHOLD_RATE64, READ_ONCE(b->tin_rate_bps));
3049 		PUT_TSTAT_U64(SENT_BYTES64, READ_ONCE(b->bytes));
3050 		PUT_TSTAT_U32(BACKLOG_BYTES, READ_ONCE(b->tin_backlog));
3051 
3052 		PUT_TSTAT_U32(TARGET_US,
3053 			      ktime_to_us(ns_to_ktime(READ_ONCE(b->cparams.target))));
3054 		PUT_TSTAT_U32(INTERVAL_US,
3055 			      ktime_to_us(ns_to_ktime(READ_ONCE(b->cparams.interval))));
3056 
3057 		PUT_TSTAT_U32(SENT_PACKETS, READ_ONCE(b->packets));
3058 		PUT_TSTAT_U32(DROPPED_PACKETS, READ_ONCE(b->tin_dropped));
3059 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, READ_ONCE(b->tin_ecn_mark));
3060 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, READ_ONCE(b->ack_drops));
3061 
3062 		PUT_TSTAT_U32(PEAK_DELAY_US,
3063 			      ktime_to_us(ns_to_ktime(READ_ONCE(b->peak_delay))));
3064 		PUT_TSTAT_U32(AVG_DELAY_US,
3065 			      ktime_to_us(ns_to_ktime(READ_ONCE(b->avge_delay))));
3066 		PUT_TSTAT_U32(BASE_DELAY_US,
3067 			      ktime_to_us(ns_to_ktime(READ_ONCE(b->base_delay))));
3068 
3069 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, READ_ONCE(b->way_hits));
3070 		PUT_TSTAT_U32(WAY_MISSES, READ_ONCE(b->way_misses));
3071 		PUT_TSTAT_U32(WAY_COLLISIONS, READ_ONCE(b->way_collisions));
3072 
3073 		PUT_TSTAT_U32(SPARSE_FLOWS, READ_ONCE(b->sparse_flow_count) +
3074 					    READ_ONCE(b->decaying_flow_count));
3075 		PUT_TSTAT_U32(BULK_FLOWS, READ_ONCE(b->bulk_flow_count));
3076 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, READ_ONCE(b->unresponsive_flow_count));
3077 		PUT_TSTAT_U32(MAX_SKBLEN, READ_ONCE(b->max_skblen));
3078 
3079 		PUT_TSTAT_U32(FLOW_QUANTUM, READ_ONCE(b->flow_quantum));
3080 		nla_nest_end(d->skb, ts);
3081 	}
3082 
3083 #undef PUT_TSTAT_U32
3084 #undef PUT_TSTAT_U64
3085 
3086 	nla_nest_end(d->skb, tstats);
3087 	return nla_nest_end(d->skb, stats);
3088 
3089 nla_put_failure:
3090 	nla_nest_cancel(d->skb, stats);
3091 	return -1;
3092 }
3093 
3094 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
3095 {
3096 	return NULL;
3097 }
3098 
3099 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
3100 {
3101 	return 0;
3102 }
3103 
3104 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
3105 			       u32 classid)
3106 {
3107 	return 0;
3108 }
3109 
3110 static void cake_unbind(struct Qdisc *q, unsigned long cl)
3111 {
3112 }
3113 
3114 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
3115 					struct netlink_ext_ack *extack)
3116 {
3117 	struct cake_sched_data *q = qdisc_priv(sch);
3118 
3119 	if (cl)
3120 		return NULL;
3121 	return q->block;
3122 }
3123 
3124 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
3125 			   struct sk_buff *skb, struct tcmsg *tcm)
3126 {
3127 	tcm->tcm_handle |= TC_H_MIN(cl);
3128 	return 0;
3129 }
3130 
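/* Per-class statistics: every flow queue of every tin is exposed as a class,
 * so walk the selected flow's queue under the tree lock to report its length
 * alongside its backlog, drop count and COBALT state.
 */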
3131 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
3132 				 struct gnet_dump *d)
3133 {
3134 	struct cake_sched_data *q = qdisc_priv(sch);
3135 	const struct cake_flow *flow = NULL;
3136 	struct gnet_stats_queue qs = { 0 };
3137 	struct nlattr *stats;
3138 	u32 idx = cl - 1;
3139 
3140 	if (idx < CAKE_QUEUES * q->tin_cnt) {
3141 		const struct cake_tin_data *b =
3142 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
3143 		const struct sk_buff *skb;
3144 
3145 		flow = &b->flows[idx % CAKE_QUEUES];
3146 
3147 		if (READ_ONCE(flow->head)) {
3148 			sch_tree_lock(sch);
3149 			skb = flow->head;
3150 			while (skb) {
3151 				qs.qlen++;
3152 				skb = skb->next;
3153 			}
3154 			sch_tree_unlock(sch);
3155 		}
3156 		qs.backlog = READ_ONCE(b->backlogs[idx % CAKE_QUEUES]);
3157 		qs.drops = READ_ONCE(flow->dropped);
3158 	}
3159 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3160 		return -1;
3161 	if (flow) {
3162 		ktime_t now = ktime_get();
3163 		bool dropping;
3164 		u32 p_drop;
3165 
3166 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3167 		if (!stats)
3168 			return -1;
3169 
3170 #define PUT_STAT_U32(attr, data) do {				       \
3171 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3172 			goto nla_put_failure;			       \
3173 	} while (0)
3174 #define PUT_STAT_S32(attr, data) do {				       \
3175 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3176 			goto nla_put_failure;			       \
3177 	} while (0)
3178 
3179 		PUT_STAT_S32(DEFICIT, READ_ONCE(flow->deficit));
3180 		dropping = READ_ONCE(flow->cvars.dropping);
3181 		PUT_STAT_U32(DROPPING, dropping);
3182 		PUT_STAT_U32(COBALT_COUNT, READ_ONCE(flow->cvars.count));
3183 		p_drop = READ_ONCE(flow->cvars.p_drop);
3184 		PUT_STAT_U32(P_DROP, p_drop);
3185 		if (p_drop) {
3186 			PUT_STAT_S32(BLUE_TIMER_US,
3187 				     ktime_to_us(
3188 					     ktime_sub(now,
3189 						       READ_ONCE(flow->cvars.blue_timer))));
3190 		}
3191 		if (dropping) {
3192 			PUT_STAT_S32(DROP_NEXT_US,
3193 				     ktime_to_us(
3194 					     ktime_sub(now,
3195 						       READ_ONCE(flow->cvars.drop_next))));
3196 		}
3197 
3198 		if (nla_nest_end(d->skb, stats) < 0)
3199 			return -1;
3200 	}
3201 
3202 	return 0;
3203 
3204 nla_put_failure:
3205 	nla_nest_cancel(d->skb, stats);
3206 	return -1;
3207 }
3208 
3209 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3210 {
3211 	struct cake_sched_data *q = qdisc_priv(sch);
3212 	unsigned int i, j;
3213 
3214 	if (arg->stop)
3215 		return;
3216 
3217 	for (i = 0; i < q->tin_cnt; i++) {
3218 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3219 
3220 		for (j = 0; j < CAKE_QUEUES; j++) {
3221 			if (list_empty(&b->flows[j].flowchain)) {
3222 				arg->count++;
3223 				continue;
3224 			}
3225 			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3226 						 arg))
3227 				break;
3228 		}
3229 	}
3230 }
3231 
3232 static const struct Qdisc_class_ops cake_class_ops = {
3233 	.leaf		=	cake_leaf,
3234 	.find		=	cake_find,
3235 	.tcf_block	=	cake_tcf_block,
3236 	.bind_tcf	=	cake_bind,
3237 	.unbind_tcf	=	cake_unbind,
3238 	.dump		=	cake_dump_class,
3239 	.dump_stats	=	cake_dump_class_stats,
3240 	.walk		=	cake_walk,
3241 };
3242 
3243 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3244 	.cl_ops		=	&cake_class_ops,
3245 	.id		=	"cake",
3246 	.priv_size	=	sizeof(struct cake_sched_data),
3247 	.enqueue	=	cake_enqueue,
3248 	.dequeue	=	cake_dequeue,
3249 	.peek		=	qdisc_peek_dequeued,
3250 	.init		=	cake_init,
3251 	.reset		=	cake_reset,
3252 	.destroy	=	cake_destroy,
3253 	.change		=	cake_change,
3254 	.dump		=	cake_dump,
3255 	.dump_stats	=	cake_dump_stats,
3256 	.owner		=	THIS_MODULE,
3257 };
3258 MODULE_ALIAS_NET_SCH("cake");
3259 
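/* cake_mq wraps the generic mq scheduler, attaching one cake instance to
 * every tx queue with all instances sharing this single configuration.  The
 * embedded mq_sched stays first so the shared mq_* helpers can treat the
 * private data as a plain struct mq_sched.
 */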
3260 struct cake_mq_sched {
3261 	struct mq_sched mq_priv; /* must be first */
3262 	struct cake_sched_config cake_config;
3263 };
3264 
3265 static void cake_mq_destroy(struct Qdisc *sch)
3266 {
3267 	mq_destroy_common(sch);
3268 }
3269 
3270 static int cake_mq_init(struct Qdisc *sch, struct nlattr *opt,
3271 			struct netlink_ext_ack *extack)
3272 {
3273 	struct cake_mq_sched *priv = qdisc_priv(sch);
3274 	struct net_device *dev = qdisc_dev(sch);
3275 	int ret, ntx;
3276 	bool _unused;
3277 
3278 	cake_config_init(&priv->cake_config, true);
3279 	if (opt) {
3280 		ret = cake_config_change(&priv->cake_config, opt, extack, &_unused);
3281 		if (ret)
3282 			return ret;
3283 	}
3284 
3285 	ret = mq_init_common(sch, opt, extack, &cake_qdisc_ops);
3286 	if (ret)
3287 		return ret;
3288 
3289 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++)
3290 		cake_config_replace(priv->mq_priv.qdiscs[ntx], &priv->cake_config);
3291 
3292 	return 0;
3293 }
3294 
3295 static int cake_mq_dump(struct Qdisc *sch, struct sk_buff *skb)
3296 {
3297 	struct cake_mq_sched *priv = qdisc_priv(sch);
3298 
3299 	mq_dump_common(sch, skb);
3300 	return cake_config_dump(&priv->cake_config, skb);
3301 }
3302 
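/* Reconfigure cake_mq: update the shared config once, then push the result
 * down to every per-queue cake instance under its own tree lock.
 */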
3303 static int cake_mq_change(struct Qdisc *sch, struct nlattr *opt,
3304 			  struct netlink_ext_ack *extack)
3305 {
3306 	struct cake_mq_sched *priv = qdisc_priv(sch);
3307 	struct net_device *dev = qdisc_dev(sch);
3308 	bool overhead_changed = false;
3309 	unsigned int ntx;
3310 	int ret;
3311 
3312 	ret = cake_config_change(&priv->cake_config, opt, extack, &overhead_changed);
3313 	if (ret)
3314 		return ret;
3315 
3316 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
3317 		struct Qdisc *chld = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
3318 		struct cake_sched_data *qd = qdisc_priv(chld);
3319 
3320 		if (overhead_changed) {
3321 			WRITE_ONCE(qd->max_netlen, 0);
3322 			WRITE_ONCE(qd->max_adjlen, 0);
3323 			WRITE_ONCE(qd->min_netlen, ~0);
3324 			WRITE_ONCE(qd->min_adjlen, ~0);
3325 		}
3326 
3327 		if (qd->tins) {
3328 			sch_tree_lock(chld);
3329 			cake_reconfigure(chld);
3330 			sch_tree_unlock(chld);
3331 		}
3332 	}
3333 
3334 	return 0;
3335 }
3336 
3337 static int cake_mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
3338 			 struct Qdisc **old, struct netlink_ext_ack *extack)
3339 {
3340 	NL_SET_ERR_MSG(extack, "can't replace cake_mq sub-qdiscs");
3341 	return -EOPNOTSUPP;
3342 }
3343 
3344 static const struct Qdisc_class_ops cake_mq_class_ops = {
3345 	.select_queue	= mq_select_queue,
3346 	.graft		= cake_mq_graft,
3347 	.leaf		= mq_leaf,
3348 	.find		= mq_find,
3349 	.walk		= mq_walk,
3350 	.dump		= mq_dump_class,
3351 	.dump_stats	= mq_dump_class_stats,
3352 };
3353 
3354 static struct Qdisc_ops cake_mq_qdisc_ops __read_mostly = {
3355 	.cl_ops		=	&cake_mq_class_ops,
3356 	.id		=	"cake_mq",
3357 	.priv_size	=	sizeof(struct cake_mq_sched),
3358 	.init		=	cake_mq_init,
3359 	.destroy	=	cake_mq_destroy,
3360 	.attach		=	mq_attach,
3361 	.change		=	cake_mq_change,
3362 	.change_real_num_tx = mq_change_real_num_tx,
3363 	.dump		=	cake_mq_dump,
3364 	.owner		=	THIS_MODULE,
3365 };
3366 MODULE_ALIAS_NET_SCH("cake_mq");
3367 
3368 static int __init cake_module_init(void)
3369 {
3370 	int ret;
3371 
3372 	ret = register_qdisc(&cake_qdisc_ops);
3373 	if (ret)
3374 		return ret;
3375 
3376 	ret = register_qdisc(&cake_mq_qdisc_ops);
3377 	if (ret)
3378 		unregister_qdisc(&cake_qdisc_ops);
3379 
3380 	return ret;
3381 }
3382 
3383 static void __exit cake_module_exit(void)
3384 {
3385 	unregister_qdisc(&cake_qdisc_ops);
3386 	unregister_qdisc(&cake_mq_qdisc_ops);
3387 }
3388 
3389 module_init(cake_module_init)
3390 module_exit(cake_module_exit)
3391 MODULE_AUTHOR("Jonathan Morton");
3392 MODULE_LICENSE("Dual BSD/GPL");
3393 MODULE_DESCRIPTION("The CAKE shaper.");
3394 MODULE_IMPORT_NS("NET_SCHED_INTERNAL");
3395