xref: /linux/net/sched/sch_cake.c (revision abacaf559950eec0d99d37ff6b92049409af5943)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/sch_priv.h>
71 #include <net/pkt_cls.h>
72 #include <net/tcp.h>
73 #include <net/flow_dissector.h>
74 
75 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
76 #include <net/netfilter/nf_conntrack_core.h>
77 #endif
78 
79 #define CAKE_SET_WAYS (8)
80 #define CAKE_MAX_TINS (8)
81 #define CAKE_QUEUES (1024)
82 #define CAKE_FLOW_MASK 63
83 #define CAKE_FLOW_NAT_FLAG 64
84 
85 /* struct cobalt_params - contains codel and blue parameters
86  * @interval:	codel initial drop rate
87  * @target:     maximum persistent sojourn time & blue update rate
88  * @mtu_time:   serialisation delay of maximum-size packet
89  * @p_inc:      increment of blue drop probability (0.32 fxp)
90  * @p_dec:      decrement of blue drop probability (0.32 fxp)
91  */
92 struct cobalt_params {
93 	u64	interval;
94 	u64	target;
95 	u64	mtu_time;
96 	u32	p_inc;
97 	u32	p_dec;
98 };
99 
100 /* struct cobalt_vars - contains codel and blue variables
101  * @count:		codel dropping frequency
102  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
103  * @drop_next:		time to drop next packet, or when we dropped last
104  * @blue_timer:		Blue time to next drop
105  * @p_drop:		BLUE drop probability (0.32 fxp)
106  * @dropping:		set if in dropping state
107  * @ecn_marked:		set if marked
108  */
109 struct cobalt_vars {
110 	u32	count;
111 	u32	rec_inv_sqrt;
112 	ktime_t	drop_next;
113 	ktime_t	blue_timer;
114 	u32     p_drop;
115 	bool	dropping;
116 	bool    ecn_marked;
117 };
118 
119 enum {
120 	CAKE_SET_NONE = 0,
121 	CAKE_SET_SPARSE,
122 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
123 	CAKE_SET_BULK,
124 	CAKE_SET_DECAYING
125 };
126 
127 struct cake_flow {
128 	/* this stuff is all needed per-flow at dequeue time */
129 	struct sk_buff	  *head;
130 	struct sk_buff	  *tail;
131 	struct list_head  flowchain;
132 	s32		  deficit;
133 	u32		  dropped;
134 	struct cobalt_vars cvars;
135 	u16		  srchost; /* index into cake_host table */
136 	u16		  dsthost;
137 	u8		  set;
138 }; /* please try to keep this structure <= 64 bytes */
139 
140 struct cake_host {
141 	u32 srchost_tag;
142 	u32 dsthost_tag;
143 	u16 srchost_bulk_flow_count;
144 	u16 dsthost_bulk_flow_count;
145 };
146 
147 struct cake_heap_entry {
148 	u16 t:3, b:10;
149 };
150 
151 struct cake_tin_data {
152 	struct cake_flow flows[CAKE_QUEUES];
153 	u32	backlogs[CAKE_QUEUES];
154 	u32	tags[CAKE_QUEUES]; /* for set association */
155 	u16	overflow_idx[CAKE_QUEUES];
156 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
157 	u16	flow_quantum;
158 
159 	struct cobalt_params cparams;
160 	u32	drop_overlimit;
161 	u16	bulk_flow_count;
162 	u16	sparse_flow_count;
163 	u16	decaying_flow_count;
164 	u16	unresponsive_flow_count;
165 
166 	u32	max_skblen;
167 
168 	struct list_head new_flows;
169 	struct list_head old_flows;
170 	struct list_head decaying_flows;
171 
172 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
173 	ktime_t	time_next_packet;
174 	u64	tin_rate_ns;
175 	u64	tin_rate_bps;
176 	u16	tin_rate_shft;
177 
178 	u16	tin_quantum;
179 	s32	tin_deficit;
180 	u32	tin_backlog;
181 	u32	tin_dropped;
182 	u32	tin_ecn_mark;
183 
184 	u32	packets;
185 	u64	bytes;
186 
187 	u32	ack_drops;
188 
189 	/* moving averages */
190 	u64 avge_delay;
191 	u64 peak_delay;
192 	u64 base_delay;
193 
194 	/* hash function stats */
195 	u32	way_directs;
196 	u32	way_hits;
197 	u32	way_misses;
198 	u32	way_collisions;
199 }; /* number of tins is small, so size of this struct doesn't matter much */
200 
201 struct cake_sched_config {
202 	u64		rate_bps;
203 	u64		interval;
204 	u64		target;
205 	u64		sync_time;
206 	u32		buffer_config_limit;
207 	u32		fwmark_mask;
208 	u16		fwmark_shft;
209 	s16		rate_overhead;
210 	u16		rate_mpu;
211 	u16		rate_flags;
212 	u8		tin_mode;
213 	u8		flow_mode;
214 	u8		atm_mode;
215 	u8		ack_filter;
216 	u8		is_shared;
217 };
218 
219 struct cake_sched_data {
220 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
221 	struct tcf_block *block;
222 	struct cake_tin_data *tins;
223 	struct cake_sched_config *config;
224 	struct cake_sched_config initial_config;
225 
226 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
227 
228 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
229 	ktime_t		time_next_packet;
230 	ktime_t		failsafe_next_packet;
231 	u64		rate_ns;
232 	u16		rate_shft;
233 	u16		overflow_timeout;
234 	u16		tin_cnt;
235 
236 	/* resource tracking */
237 	u32		buffer_used;
238 	u32		buffer_max_used;
239 	u32		buffer_limit;
240 
241 	/* indices for dequeue */
242 	u16		cur_tin;
243 	u16		cur_flow;
244 
245 	struct qdisc_watchdog watchdog;
246 	const u8	*tin_index;
247 	const u8	*tin_order;
248 
249 	/* bandwidth capacity estimate */
250 	ktime_t		last_packet_time;
251 	ktime_t		avg_window_begin;
252 	u64		avg_packet_interval;
253 	u64		avg_window_bytes;
254 	u64		avg_peak_bandwidth;
255 	ktime_t		last_reconfig_time;
256 
257 	/* packet length stats */
258 	u32		avg_netoff;
259 	u16		max_netlen;
260 	u16		max_adjlen;
261 	u16		min_netlen;
262 	u16		min_adjlen;
263 
264 	/* mq sync state */
265 	u64		last_checked_active;
266 	u64		last_active;
267 	u32		active_queues;
268 };
269 
270 enum {
271 	CAKE_FLAG_OVERHEAD	   = BIT(0),
272 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
273 	CAKE_FLAG_INGRESS	   = BIT(2),
274 	CAKE_FLAG_WASH		   = BIT(3),
275 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
276 };
277 
278 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
279  * obtain the best features of each.  Codel is excellent on flows which
280  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
281  * unresponsive flows.
282  */
283 
284 struct cobalt_skb_cb {
285 	ktime_t enqueue_time;
286 	u32     adjusted_len;
287 };
288 
us_to_ns(u64 us)289 static u64 us_to_ns(u64 us)
290 {
291 	return us * NSEC_PER_USEC;
292 }
293 
get_cobalt_cb(const struct sk_buff * skb)294 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
295 {
296 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
297 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
298 }
299 
cobalt_get_enqueue_time(const struct sk_buff * skb)300 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
301 {
302 	return get_cobalt_cb(skb)->enqueue_time;
303 }
304 
cobalt_set_enqueue_time(struct sk_buff * skb,ktime_t now)305 static void cobalt_set_enqueue_time(struct sk_buff *skb,
306 				    ktime_t now)
307 {
308 	get_cobalt_cb(skb)->enqueue_time = now;
309 }
310 
311 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
312 
313 /* Diffserv lookup tables */
314 
315 static const u8 precedence[] = {
316 	0, 0, 0, 0, 0, 0, 0, 0,
317 	1, 1, 1, 1, 1, 1, 1, 1,
318 	2, 2, 2, 2, 2, 2, 2, 2,
319 	3, 3, 3, 3, 3, 3, 3, 3,
320 	4, 4, 4, 4, 4, 4, 4, 4,
321 	5, 5, 5, 5, 5, 5, 5, 5,
322 	6, 6, 6, 6, 6, 6, 6, 6,
323 	7, 7, 7, 7, 7, 7, 7, 7,
324 };
325 
326 static const u8 diffserv8[] = {
327 	2, 0, 1, 2, 4, 2, 2, 2,
328 	1, 2, 1, 2, 1, 2, 1, 2,
329 	5, 2, 4, 2, 4, 2, 4, 2,
330 	3, 2, 3, 2, 3, 2, 3, 2,
331 	6, 2, 3, 2, 3, 2, 3, 2,
332 	6, 2, 2, 2, 6, 2, 6, 2,
333 	7, 2, 2, 2, 2, 2, 2, 2,
334 	7, 2, 2, 2, 2, 2, 2, 2,
335 };
336 
337 static const u8 diffserv4[] = {
338 	0, 1, 0, 0, 2, 0, 0, 0,
339 	1, 0, 0, 0, 0, 0, 0, 0,
340 	2, 0, 2, 0, 2, 0, 2, 0,
341 	2, 0, 2, 0, 2, 0, 2, 0,
342 	3, 0, 2, 0, 2, 0, 2, 0,
343 	3, 0, 0, 0, 3, 0, 3, 0,
344 	3, 0, 0, 0, 0, 0, 0, 0,
345 	3, 0, 0, 0, 0, 0, 0, 0,
346 };
347 
348 static const u8 diffserv3[] = {
349 	0, 1, 0, 0, 2, 0, 0, 0,
350 	1, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 2, 0, 2, 0,
355 	2, 0, 0, 0, 0, 0, 0, 0,
356 	2, 0, 0, 0, 0, 0, 0, 0,
357 };
358 
359 static const u8 besteffort[] = {
360 	0, 0, 0, 0, 0, 0, 0, 0,
361 	0, 0, 0, 0, 0, 0, 0, 0,
362 	0, 0, 0, 0, 0, 0, 0, 0,
363 	0, 0, 0, 0, 0, 0, 0, 0,
364 	0, 0, 0, 0, 0, 0, 0, 0,
365 	0, 0, 0, 0, 0, 0, 0, 0,
366 	0, 0, 0, 0, 0, 0, 0, 0,
367 	0, 0, 0, 0, 0, 0, 0, 0,
368 };
369 
370 /* tin priority order for stats dumping */
371 
372 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
373 static const u8 bulk_order[] = {1, 0, 2, 3};
374 
375 /* There is a big difference in timing between the accurate values placed in the
376  * cache and the approximations given by a single Newton step for small count
377  * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
378  * these values are calculated using eight Newton steps, using the
379  * implementation below. Above 16, a single Newton step gives sufficient
380  * accuracy in either direction, given the precision stored.
381  *
382  * The magnitude of the error when stepping up to count 2 is such as to give the
383  * value that *should* have been produced at count 4.
384  */
385 
386 #define REC_INV_SQRT_CACHE (16)
387 static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
388 		~0,         ~0, 3037000500, 2479700525,
389 	2147483647, 1920767767, 1753413056, 1623345051,
390 	1518500250, 1431655765, 1358187914, 1294981364,
391 	1239850263, 1191209601, 1147878294, 1108955788
392 };
393 
394 static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust);
395 
396 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
397  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
398  *
399  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
400  */
401 
cobalt_newton_step(struct cobalt_vars * vars)402 static void cobalt_newton_step(struct cobalt_vars *vars)
403 {
404 	u32 invsqrt, invsqrt2;
405 	u64 val;
406 
407 	invsqrt = vars->rec_inv_sqrt;
408 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
409 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
410 
411 	val >>= 2; /* avoid overflow in following multiply */
412 	val = (val * invsqrt) >> (32 - 2 + 1);
413 
414 	vars->rec_inv_sqrt = val;
415 }
416 
cobalt_invsqrt(struct cobalt_vars * vars)417 static void cobalt_invsqrt(struct cobalt_vars *vars)
418 {
419 	if (vars->count < REC_INV_SQRT_CACHE)
420 		vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
421 	else
422 		cobalt_newton_step(vars);
423 }
424 
cobalt_vars_init(struct cobalt_vars * vars)425 static void cobalt_vars_init(struct cobalt_vars *vars)
426 {
427 	memset(vars, 0, sizeof(*vars));
428 }
429 
430 /* CoDel control_law is t + interval/sqrt(count)
431  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
432  * both sqrt() and divide operation.
433  */
cobalt_control(ktime_t t,u64 interval,u32 rec_inv_sqrt)434 static ktime_t cobalt_control(ktime_t t,
435 			      u64 interval,
436 			      u32 rec_inv_sqrt)
437 {
438 	return ktime_add_ns(t, reciprocal_scale(interval,
439 						rec_inv_sqrt));
440 }
441 
442 /* Call this when a packet had to be dropped due to queue overflow.  Returns
443  * true if the BLUE state was quiescent before but active after this call.
444  */
cobalt_queue_full(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)445 static bool cobalt_queue_full(struct cobalt_vars *vars,
446 			      struct cobalt_params *p,
447 			      ktime_t now)
448 {
449 	bool up = false;
450 
451 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
452 		up = !vars->p_drop;
453 		vars->p_drop += p->p_inc;
454 		if (vars->p_drop < p->p_inc)
455 			vars->p_drop = ~0;
456 		vars->blue_timer = now;
457 	}
458 	vars->dropping = true;
459 	vars->drop_next = now;
460 	if (!vars->count)
461 		vars->count = 1;
462 
463 	return up;
464 }
465 
466 /* Call this when the queue was serviced but turned out to be empty.  Returns
467  * true if the BLUE state was active before but quiescent after this call.
468  */
cobalt_queue_empty(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)469 static bool cobalt_queue_empty(struct cobalt_vars *vars,
470 			       struct cobalt_params *p,
471 			       ktime_t now)
472 {
473 	bool down = false;
474 
475 	if (vars->p_drop &&
476 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
477 		if (vars->p_drop < p->p_dec)
478 			vars->p_drop = 0;
479 		else
480 			vars->p_drop -= p->p_dec;
481 		vars->blue_timer = now;
482 		down = !vars->p_drop;
483 	}
484 	vars->dropping = false;
485 
486 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
487 		vars->count--;
488 		cobalt_invsqrt(vars);
489 		vars->drop_next = cobalt_control(vars->drop_next,
490 						 p->interval,
491 						 vars->rec_inv_sqrt);
492 	}
493 
494 	return down;
495 }
496 
497 /* Call this with a freshly dequeued packet for possible congestion marking.
498  * Returns true as an instruction to drop the packet, false for delivery.
499  */
cobalt_should_drop(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now,struct sk_buff * skb,u32 bulk_flows)500 static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
501 					       struct cobalt_params *p,
502 					       ktime_t now,
503 					       struct sk_buff *skb,
504 					       u32 bulk_flows)
505 {
506 	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
507 	bool next_due, over_target;
508 	ktime_t schedule;
509 	u64 sojourn;
510 
511 /* The 'schedule' variable records, in its sign, whether 'now' is before or
512  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
513  * scheduling decision is actually branched, without destroying that
514  * information.  Similarly, the first 'schedule' value calculated is preserved
515  * in the boolean 'next_due'.
516  *
517  * As for 'drop_next', we take advantage of the fact that 'interval' is both
518  * the delay between first exceeding 'target' and the first signalling event,
519  * *and* the scaling factor for the signalling frequency.  It's therefore very
520  * natural to use a single mechanism for both purposes, and eliminates a
521  * significant amount of reference Codel's spaghetti code.  To help with this,
522  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
523  * as possible to 1.0 in fixed-point.
524  */
525 
526 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
527 	schedule = ktime_sub(now, vars->drop_next);
528 	over_target = sojourn > p->target &&
529 		      sojourn > p->mtu_time * bulk_flows * 2 &&
530 		      sojourn > p->mtu_time * 4;
531 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
532 
533 	vars->ecn_marked = false;
534 
535 	if (over_target) {
536 		if (!vars->dropping) {
537 			vars->dropping = true;
538 			vars->drop_next = cobalt_control(now,
539 							 p->interval,
540 							 vars->rec_inv_sqrt);
541 		}
542 		if (!vars->count)
543 			vars->count = 1;
544 	} else if (vars->dropping) {
545 		vars->dropping = false;
546 	}
547 
548 	if (next_due && vars->dropping) {
549 		/* Use ECN mark if possible, otherwise drop */
550 		if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
551 			reason = SKB_DROP_REASON_QDISC_CONGESTED;
552 
553 		vars->count++;
554 		if (!vars->count)
555 			vars->count--;
556 		cobalt_invsqrt(vars);
557 		vars->drop_next = cobalt_control(vars->drop_next,
558 						 p->interval,
559 						 vars->rec_inv_sqrt);
560 		schedule = ktime_sub(now, vars->drop_next);
561 	} else {
562 		while (next_due) {
563 			vars->count--;
564 			cobalt_invsqrt(vars);
565 			vars->drop_next = cobalt_control(vars->drop_next,
566 							 p->interval,
567 							 vars->rec_inv_sqrt);
568 			schedule = ktime_sub(now, vars->drop_next);
569 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
570 		}
571 	}
572 
573 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
574 	if (vars->p_drop && reason == SKB_NOT_DROPPED_YET &&
575 	    get_random_u32() < vars->p_drop)
576 		reason = SKB_DROP_REASON_CAKE_FLOOD;
577 
578 	/* Overload the drop_next field as an activity timeout */
579 	if (!vars->count)
580 		vars->drop_next = ktime_add_ns(now, p->interval);
581 	else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET)
582 		vars->drop_next = now;
583 
584 	return reason;
585 }
586 
cake_update_flowkeys(struct flow_keys * keys,const struct sk_buff * skb)587 static bool cake_update_flowkeys(struct flow_keys *keys,
588 				 const struct sk_buff *skb)
589 {
590 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
591 	struct nf_conntrack_tuple tuple = {};
592 	bool rev = !skb->_nfct, upd = false;
593 	__be32 ip;
594 
595 	if (skb_protocol(skb, true) != htons(ETH_P_IP))
596 		return false;
597 
598 	if (!nf_ct_get_tuple_skb(&tuple, skb))
599 		return false;
600 
601 	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
602 	if (ip != keys->addrs.v4addrs.src) {
603 		keys->addrs.v4addrs.src = ip;
604 		upd = true;
605 	}
606 	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
607 	if (ip != keys->addrs.v4addrs.dst) {
608 		keys->addrs.v4addrs.dst = ip;
609 		upd = true;
610 	}
611 
612 	if (keys->ports.ports) {
613 		__be16 port;
614 
615 		port = rev ? tuple.dst.u.all : tuple.src.u.all;
616 		if (port != keys->ports.src) {
617 			keys->ports.src = port;
618 			upd = true;
619 		}
620 		port = rev ? tuple.src.u.all : tuple.dst.u.all;
621 		if (port != keys->ports.dst) {
622 			port = keys->ports.dst;
623 			upd = true;
624 		}
625 	}
626 	return upd;
627 #else
628 	return false;
629 #endif
630 }
631 
632 /* Cake has several subtle multiple bit settings. In these cases you
633  *  would be matching triple isolate mode as well.
634  */
635 
cake_dsrc(int flow_mode)636 static bool cake_dsrc(int flow_mode)
637 {
638 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
639 }
640 
cake_ddst(int flow_mode)641 static bool cake_ddst(int flow_mode)
642 {
643 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
644 }
645 
cake_dec_srchost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)646 static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
647 					     struct cake_flow *flow,
648 					     int flow_mode)
649 {
650 	if (likely(cake_dsrc(flow_mode) &&
651 		   q->hosts[flow->srchost].srchost_bulk_flow_count))
652 		q->hosts[flow->srchost].srchost_bulk_flow_count--;
653 }
654 
cake_inc_srchost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)655 static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
656 					     struct cake_flow *flow,
657 					     int flow_mode)
658 {
659 	if (likely(cake_dsrc(flow_mode) &&
660 		   q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
661 		q->hosts[flow->srchost].srchost_bulk_flow_count++;
662 }
663 
cake_dec_dsthost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)664 static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
665 					     struct cake_flow *flow,
666 					     int flow_mode)
667 {
668 	if (likely(cake_ddst(flow_mode) &&
669 		   q->hosts[flow->dsthost].dsthost_bulk_flow_count))
670 		q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
671 }
672 
cake_inc_dsthost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)673 static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
674 					     struct cake_flow *flow,
675 					     int flow_mode)
676 {
677 	if (likely(cake_ddst(flow_mode) &&
678 		   q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
679 		q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
680 }
681 
cake_get_flow_quantum(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)682 static u16 cake_get_flow_quantum(struct cake_tin_data *q,
683 				 struct cake_flow *flow,
684 				 int flow_mode)
685 {
686 	u16 host_load = 1;
687 
688 	if (cake_dsrc(flow_mode))
689 		host_load = max(host_load,
690 				q->hosts[flow->srchost].srchost_bulk_flow_count);
691 
692 	if (cake_ddst(flow_mode))
693 		host_load = max(host_load,
694 				q->hosts[flow->dsthost].dsthost_bulk_flow_count);
695 
696 	/* The get_random_u16() is a way to apply dithering to avoid
697 	 * accumulating roundoff errors
698 	 */
699 	return (q->flow_quantum * quantum_div[host_load] +
700 		get_random_u16()) >> 16;
701 }
702 
cake_hash(struct cake_tin_data * q,const struct sk_buff * skb,int flow_mode,u16 flow_override,u16 host_override)703 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
704 		     int flow_mode, u16 flow_override, u16 host_override)
705 {
706 	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
707 	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
708 	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
709 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
710 	u16 reduced_hash, srchost_idx, dsthost_idx;
711 	struct flow_keys keys, host_keys;
712 	bool use_skbhash = skb->l4_hash;
713 
714 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
715 		return 0;
716 
717 	/* If both overrides are set, or we can use the SKB hash and nat mode is
718 	 * disabled, we can skip packet dissection entirely. If nat mode is
719 	 * enabled there's another check below after doing the conntrack lookup.
720 	 */
721 	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
722 		goto skip_hash;
723 
724 	skb_flow_dissect_flow_keys(skb, &keys,
725 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
726 
727 	/* Don't use the SKB hash if we change the lookup keys from conntrack */
728 	if (nat_enabled && cake_update_flowkeys(&keys, skb))
729 		use_skbhash = false;
730 
731 	/* If we can still use the SKB hash and don't need the host hash, we can
732 	 * skip the rest of the hashing procedure
733 	 */
734 	if (use_skbhash && !hash_hosts)
735 		goto skip_hash;
736 
737 	/* flow_hash_from_keys() sorts the addresses by value, so we have
738 	 * to preserve their order in a separate data structure to treat
739 	 * src and dst host addresses as independently selectable.
740 	 */
741 	host_keys = keys;
742 	host_keys.ports.ports     = 0;
743 	host_keys.basic.ip_proto  = 0;
744 	host_keys.keyid.keyid     = 0;
745 	host_keys.tags.flow_label = 0;
746 
747 	switch (host_keys.control.addr_type) {
748 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
749 		host_keys.addrs.v4addrs.src = 0;
750 		dsthost_hash = flow_hash_from_keys(&host_keys);
751 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
752 		host_keys.addrs.v4addrs.dst = 0;
753 		srchost_hash = flow_hash_from_keys(&host_keys);
754 		break;
755 
756 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
757 		memset(&host_keys.addrs.v6addrs.src, 0,
758 		       sizeof(host_keys.addrs.v6addrs.src));
759 		dsthost_hash = flow_hash_from_keys(&host_keys);
760 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
761 		memset(&host_keys.addrs.v6addrs.dst, 0,
762 		       sizeof(host_keys.addrs.v6addrs.dst));
763 		srchost_hash = flow_hash_from_keys(&host_keys);
764 		break;
765 
766 	default:
767 		dsthost_hash = 0;
768 		srchost_hash = 0;
769 	}
770 
771 	/* This *must* be after the above switch, since as a
772 	 * side-effect it sorts the src and dst addresses.
773 	 */
774 	if (hash_flows && !use_skbhash)
775 		flow_hash = flow_hash_from_keys(&keys);
776 
777 skip_hash:
778 	if (flow_override)
779 		flow_hash = flow_override - 1;
780 	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
781 		flow_hash = skb->hash;
782 	if (host_override) {
783 		dsthost_hash = host_override - 1;
784 		srchost_hash = host_override - 1;
785 	}
786 
787 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
788 		if (flow_mode & CAKE_FLOW_SRC_IP)
789 			flow_hash ^= srchost_hash;
790 
791 		if (flow_mode & CAKE_FLOW_DST_IP)
792 			flow_hash ^= dsthost_hash;
793 	}
794 
795 	reduced_hash = flow_hash % CAKE_QUEUES;
796 
797 	/* set-associative hashing */
798 	/* fast path if no hash collision (direct lookup succeeds) */
799 	if (likely(q->tags[reduced_hash] == flow_hash &&
800 		   q->flows[reduced_hash].set)) {
801 		q->way_directs++;
802 	} else {
803 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
804 		u32 outer_hash = reduced_hash - inner_hash;
805 		bool allocate_src = false;
806 		bool allocate_dst = false;
807 		u32 i, k;
808 
809 		/* check if any active queue in the set is reserved for
810 		 * this flow.
811 		 */
812 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
813 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
814 			if (q->tags[outer_hash + k] == flow_hash) {
815 				if (i)
816 					q->way_hits++;
817 
818 				if (!q->flows[outer_hash + k].set) {
819 					/* need to increment host refcnts */
820 					allocate_src = cake_dsrc(flow_mode);
821 					allocate_dst = cake_ddst(flow_mode);
822 				}
823 
824 				goto found;
825 			}
826 		}
827 
828 		/* no queue is reserved for this flow, look for an
829 		 * empty one.
830 		 */
831 		for (i = 0; i < CAKE_SET_WAYS;
832 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
833 			if (!q->flows[outer_hash + k].set) {
834 				q->way_misses++;
835 				allocate_src = cake_dsrc(flow_mode);
836 				allocate_dst = cake_ddst(flow_mode);
837 				goto found;
838 			}
839 		}
840 
841 		/* With no empty queues, default to the original
842 		 * queue, accept the collision, update the host tags.
843 		 */
844 		q->way_collisions++;
845 		allocate_src = cake_dsrc(flow_mode);
846 		allocate_dst = cake_ddst(flow_mode);
847 
848 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
849 			cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
850 			cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
851 		}
852 found:
853 		/* reserve queue for future packets in same flow */
854 		reduced_hash = outer_hash + k;
855 		q->tags[reduced_hash] = flow_hash;
856 
857 		if (allocate_src) {
858 			srchost_idx = srchost_hash % CAKE_QUEUES;
859 			inner_hash = srchost_idx % CAKE_SET_WAYS;
860 			outer_hash = srchost_idx - inner_hash;
861 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
862 				i++, k = (k + 1) % CAKE_SET_WAYS) {
863 				if (q->hosts[outer_hash + k].srchost_tag ==
864 				    srchost_hash)
865 					goto found_src;
866 			}
867 			for (i = 0; i < CAKE_SET_WAYS;
868 				i++, k = (k + 1) % CAKE_SET_WAYS) {
869 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
870 					break;
871 			}
872 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
873 found_src:
874 			srchost_idx = outer_hash + k;
875 			q->flows[reduced_hash].srchost = srchost_idx;
876 
877 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
878 				cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
879 		}
880 
881 		if (allocate_dst) {
882 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
883 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
884 			outer_hash = dsthost_idx - inner_hash;
885 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
886 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
887 				if (q->hosts[outer_hash + k].dsthost_tag ==
888 				    dsthost_hash)
889 					goto found_dst;
890 			}
891 			for (i = 0; i < CAKE_SET_WAYS;
892 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
893 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
894 					break;
895 			}
896 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
897 found_dst:
898 			dsthost_idx = outer_hash + k;
899 			q->flows[reduced_hash].dsthost = dsthost_idx;
900 
901 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
902 				cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
903 		}
904 	}
905 
906 	return reduced_hash;
907 }
908 
909 /* helper functions : might be changed when/if skb use a standard list_head */
910 /* remove one skb from head of slot queue */
911 
dequeue_head(struct cake_flow * flow)912 static struct sk_buff *dequeue_head(struct cake_flow *flow)
913 {
914 	struct sk_buff *skb = flow->head;
915 
916 	if (skb) {
917 		flow->head = skb->next;
918 		skb_mark_not_on_list(skb);
919 	}
920 
921 	return skb;
922 }
923 
924 /* add skb to flow queue (tail add) */
925 
flow_queue_add(struct cake_flow * flow,struct sk_buff * skb)926 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
927 {
928 	if (!flow->head)
929 		flow->head = skb;
930 	else
931 		flow->tail->next = skb;
932 	flow->tail = skb;
933 	skb->next = NULL;
934 }
935 
cake_get_iphdr(const struct sk_buff * skb,struct ipv6hdr * buf)936 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
937 				    struct ipv6hdr *buf)
938 {
939 	unsigned int offset = skb_network_offset(skb);
940 	struct iphdr *iph;
941 
942 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
943 
944 	if (!iph)
945 		return NULL;
946 
947 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
948 		return skb_header_pointer(skb, offset + iph->ihl * 4,
949 					  sizeof(struct ipv6hdr), buf);
950 
951 	else if (iph->version == 4)
952 		return iph;
953 
954 	else if (iph->version == 6)
955 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
956 					  buf);
957 
958 	return NULL;
959 }
960 
cake_get_tcphdr(const struct sk_buff * skb,void * buf,unsigned int bufsize)961 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
962 				      void *buf, unsigned int bufsize)
963 {
964 	unsigned int offset = skb_network_offset(skb);
965 	const struct ipv6hdr *ipv6h;
966 	const struct tcphdr *tcph;
967 	const struct iphdr *iph;
968 	struct ipv6hdr _ipv6h;
969 	struct tcphdr _tcph;
970 
971 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
972 
973 	if (!ipv6h)
974 		return NULL;
975 
976 	if (ipv6h->version == 4) {
977 		iph = (struct iphdr *)ipv6h;
978 		offset += iph->ihl * 4;
979 
980 		/* special-case 6in4 tunnelling, as that is a common way to get
981 		 * v6 connectivity in the home
982 		 */
983 		if (iph->protocol == IPPROTO_IPV6) {
984 			ipv6h = skb_header_pointer(skb, offset,
985 						   sizeof(_ipv6h), &_ipv6h);
986 
987 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
988 				return NULL;
989 
990 			offset += sizeof(struct ipv6hdr);
991 
992 		} else if (iph->protocol != IPPROTO_TCP) {
993 			return NULL;
994 		}
995 
996 	} else if (ipv6h->version == 6) {
997 		if (ipv6h->nexthdr != IPPROTO_TCP)
998 			return NULL;
999 
1000 		offset += sizeof(struct ipv6hdr);
1001 	} else {
1002 		return NULL;
1003 	}
1004 
1005 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
1006 	if (!tcph || tcph->doff < 5)
1007 		return NULL;
1008 
1009 	return skb_header_pointer(skb, offset,
1010 				  min(__tcp_hdrlen(tcph), bufsize), buf);
1011 }
1012 
cake_get_tcpopt(const struct tcphdr * tcph,int code,int * oplen)1013 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
1014 				   int code, int *oplen)
1015 {
1016 	/* inspired by tcp_parse_options in tcp_input.c */
1017 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1018 	const u8 *ptr = (const u8 *)(tcph + 1);
1019 
1020 	while (length > 0) {
1021 		int opcode = *ptr++;
1022 		int opsize;
1023 
1024 		if (opcode == TCPOPT_EOL)
1025 			break;
1026 		if (opcode == TCPOPT_NOP) {
1027 			length--;
1028 			continue;
1029 		}
1030 		if (length < 2)
1031 			break;
1032 		opsize = *ptr++;
1033 		if (opsize < 2 || opsize > length)
1034 			break;
1035 
1036 		if (opcode == code) {
1037 			*oplen = opsize;
1038 			return ptr;
1039 		}
1040 
1041 		ptr += opsize - 2;
1042 		length -= opsize;
1043 	}
1044 
1045 	return NULL;
1046 }
1047 
1048 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
1049  * bytes than the other. In the case where both sequences ACKs bytes that the
1050  * other doesn't, A is considered greater. DSACKs in A also makes A be
1051  * considered greater.
1052  *
1053  * @return -1, 0 or 1 as normal compare functions
1054  */
cake_tcph_sack_compare(const struct tcphdr * tcph_a,const struct tcphdr * tcph_b)1055 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
1056 				  const struct tcphdr *tcph_b)
1057 {
1058 	const struct tcp_sack_block_wire *sack_a, *sack_b;
1059 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1060 	u32 bytes_a = 0, bytes_b = 0;
1061 	int oplen_a, oplen_b;
1062 	bool first = true;
1063 
1064 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1065 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1066 
1067 	/* pointers point to option contents */
1068 	oplen_a -= TCPOLEN_SACK_BASE;
1069 	oplen_b -= TCPOLEN_SACK_BASE;
1070 
1071 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1072 	    (!sack_b || oplen_b < sizeof(*sack_b)))
1073 		return -1;
1074 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1075 		 (!sack_a || oplen_a < sizeof(*sack_a)))
1076 		return 1;
1077 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1078 		 (!sack_b || oplen_b < sizeof(*sack_b)))
1079 		return 0;
1080 
1081 	while (oplen_a >= sizeof(*sack_a)) {
1082 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1083 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1084 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1085 		int oplen_tmp = oplen_b;
1086 		bool found = false;
1087 
1088 		/* DSACK; always considered greater to prevent dropping */
1089 		if (before(start_a, ack_seq_a))
1090 			return -1;
1091 
1092 		bytes_a += end_a - start_a;
1093 
1094 		while (oplen_tmp >= sizeof(*sack_tmp)) {
1095 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1096 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1097 
1098 			/* first time through we count the total size */
1099 			if (first)
1100 				bytes_b += end_b - start_b;
1101 
1102 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1103 				found = true;
1104 				if (!first)
1105 					break;
1106 			}
1107 			oplen_tmp -= sizeof(*sack_tmp);
1108 			sack_tmp++;
1109 		}
1110 
1111 		if (!found)
1112 			return -1;
1113 
1114 		oplen_a -= sizeof(*sack_a);
1115 		sack_a++;
1116 		first = false;
1117 	}
1118 
1119 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1120 	 * either the SACKs are equal, or B SACKs more bytes.
1121 	 */
1122 	return bytes_b > bytes_a ? 1 : 0;
1123 }
1124 
cake_tcph_get_tstamp(const struct tcphdr * tcph,u32 * tsval,u32 * tsecr)1125 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1126 				 u32 *tsval, u32 *tsecr)
1127 {
1128 	const u8 *ptr;
1129 	int opsize;
1130 
1131 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1132 
1133 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1134 		*tsval = get_unaligned_be32(ptr);
1135 		*tsecr = get_unaligned_be32(ptr + 4);
1136 	}
1137 }
1138 
cake_tcph_may_drop(const struct tcphdr * tcph,u32 tstamp_new,u32 tsecr_new)1139 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1140 			       u32 tstamp_new, u32 tsecr_new)
1141 {
1142 	/* inspired by tcp_parse_options in tcp_input.c */
1143 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1144 	const u8 *ptr = (const u8 *)(tcph + 1);
1145 	u32 tstamp, tsecr;
1146 
1147 	/* 3 reserved flags must be unset to avoid future breakage
1148 	 * ACK must be set
1149 	 * ECE/CWR are handled separately
1150 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1151 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1152 	 * 0x00C00000 = CWR/ECE (handled separately)
1153 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1154 	 */
1155 	if (((tcp_flag_word(tcph) &
1156 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1157 		return false;
1158 
1159 	while (length > 0) {
1160 		int opcode = *ptr++;
1161 		int opsize;
1162 
1163 		if (opcode == TCPOPT_EOL)
1164 			break;
1165 		if (opcode == TCPOPT_NOP) {
1166 			length--;
1167 			continue;
1168 		}
1169 		if (length < 2)
1170 			break;
1171 		opsize = *ptr++;
1172 		if (opsize < 2 || opsize > length)
1173 			break;
1174 
1175 		switch (opcode) {
1176 		case TCPOPT_MD5SIG: /* doesn't influence state */
1177 			break;
1178 
1179 		case TCPOPT_SACK: /* stricter checking performed later */
1180 			if (opsize % 8 != 2)
1181 				return false;
1182 			break;
1183 
1184 		case TCPOPT_TIMESTAMP:
1185 			/* only drop timestamps lower than new */
1186 			if (opsize != TCPOLEN_TIMESTAMP)
1187 				return false;
1188 			tstamp = get_unaligned_be32(ptr);
1189 			tsecr = get_unaligned_be32(ptr + 4);
1190 			if (after(tstamp, tstamp_new) ||
1191 			    after(tsecr, tsecr_new))
1192 				return false;
1193 			break;
1194 
1195 		case TCPOPT_MSS:  /* these should only be set on SYN */
1196 		case TCPOPT_WINDOW:
1197 		case TCPOPT_SACK_PERM:
1198 		case TCPOPT_FASTOPEN:
1199 		case TCPOPT_EXP:
1200 		default: /* don't drop if any unknown options are present */
1201 			return false;
1202 		}
1203 
1204 		ptr += opsize - 2;
1205 		length -= opsize;
1206 	}
1207 
1208 	return true;
1209 }
1210 
cake_ack_filter(struct cake_sched_data * q,struct cake_flow * flow)1211 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1212 				       struct cake_flow *flow)
1213 {
1214 	bool aggressive = q->config->ack_filter == CAKE_ACK_AGGRESSIVE;
1215 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1216 	struct sk_buff *skb_check, *skb_prev = NULL;
1217 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1218 	unsigned char _tcph[64], _tcph_check[64];
1219 	const struct tcphdr *tcph, *tcph_check;
1220 	const struct iphdr *iph, *iph_check;
1221 	struct ipv6hdr _iph, _iph_check;
1222 	const struct sk_buff *skb;
1223 	int seglen, num_found = 0;
1224 	u32 tstamp = 0, tsecr = 0;
1225 	__be32 elig_flags = 0;
1226 	int sack_comp;
1227 
1228 	/* no other possible ACKs to filter */
1229 	if (flow->head == flow->tail)
1230 		return NULL;
1231 
1232 	skb = flow->tail;
1233 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1234 	iph = cake_get_iphdr(skb, &_iph);
1235 	if (!tcph)
1236 		return NULL;
1237 
1238 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1239 
1240 	/* the 'triggering' packet need only have the ACK flag set.
1241 	 * also check that SYN is not set, as there won't be any previous ACKs.
1242 	 */
1243 	if ((tcp_flag_word(tcph) &
1244 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1245 		return NULL;
1246 
1247 	/* the 'triggering' ACK is at the tail of the queue, we have already
1248 	 * returned if it is the only packet in the flow. loop through the rest
1249 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1250 	 * triggering one.
1251 	 */
1252 	for (skb_check = flow->head;
1253 	     skb_check && skb_check != skb;
1254 	     skb_prev = skb_check, skb_check = skb_check->next) {
1255 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1256 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1257 					     sizeof(_tcph_check));
1258 
1259 		/* only TCP packets with matching 5-tuple are eligible, and only
1260 		 * drop safe headers
1261 		 */
1262 		if (!tcph_check || iph->version != iph_check->version ||
1263 		    tcph_check->source != tcph->source ||
1264 		    tcph_check->dest != tcph->dest)
1265 			continue;
1266 
1267 		if (iph_check->version == 4) {
1268 			if (iph_check->saddr != iph->saddr ||
1269 			    iph_check->daddr != iph->daddr)
1270 				continue;
1271 
1272 			seglen = iph_totlen(skb, iph_check) -
1273 				       (4 * iph_check->ihl);
1274 		} else if (iph_check->version == 6) {
1275 			ipv6h = (struct ipv6hdr *)iph;
1276 			ipv6h_check = (struct ipv6hdr *)iph_check;
1277 
1278 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1279 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1280 				continue;
1281 
1282 			seglen = ipv6_payload_len(skb, ipv6h_check);
1283 		} else {
1284 			WARN_ON(1);  /* shouldn't happen */
1285 			continue;
1286 		}
1287 
1288 		/* If the ECE/CWR flags changed from the previous eligible
1289 		 * packet in the same flow, we should no longer be dropping that
1290 		 * previous packet as this would lose information.
1291 		 */
1292 		if (elig_ack && (tcp_flag_word(tcph_check) &
1293 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1294 			elig_ack = NULL;
1295 			elig_ack_prev = NULL;
1296 			num_found--;
1297 		}
1298 
1299 		/* Check TCP options and flags, don't drop ACKs with segment
1300 		 * data, and don't drop ACKs with a higher cumulative ACK
1301 		 * counter than the triggering packet. Check ACK seqno here to
1302 		 * avoid parsing SACK options of packets we are going to exclude
1303 		 * anyway.
1304 		 */
1305 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1306 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1307 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1308 			continue;
1309 
1310 		/* Check SACK options. The triggering packet must SACK more data
1311 		 * than the ACK under consideration, or SACK the same range but
1312 		 * have a larger cumulative ACK counter. The latter is a
1313 		 * pathological case, but is contained in the following check
1314 		 * anyway, just to be safe.
1315 		 */
1316 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1317 
1318 		if (sack_comp < 0 ||
1319 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1320 		     sack_comp == 0))
1321 			continue;
1322 
1323 		/* At this point we have found an eligible pure ACK to drop; if
1324 		 * we are in aggressive mode, we are done. Otherwise, keep
1325 		 * searching unless this is the second eligible ACK we
1326 		 * found.
1327 		 *
1328 		 * Since we want to drop ACK closest to the head of the queue,
1329 		 * save the first eligible ACK we find, even if we need to loop
1330 		 * again.
1331 		 */
1332 		if (!elig_ack) {
1333 			elig_ack = skb_check;
1334 			elig_ack_prev = skb_prev;
1335 			elig_flags = (tcp_flag_word(tcph_check)
1336 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1337 		}
1338 
1339 		if (num_found++ > 0)
1340 			goto found;
1341 	}
1342 
1343 	/* We made it through the queue without finding two eligible ACKs . If
1344 	 * we found a single eligible ACK we can drop it in aggressive mode if
1345 	 * we can guarantee that this does not interfere with ECN flag
1346 	 * information. We ensure this by dropping it only if the enqueued
1347 	 * packet is consecutive with the eligible ACK, and their flags match.
1348 	 */
1349 	if (elig_ack && aggressive && elig_ack->next == skb &&
1350 	    (elig_flags == (tcp_flag_word(tcph) &
1351 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1352 		goto found;
1353 
1354 	return NULL;
1355 
1356 found:
1357 	if (elig_ack_prev)
1358 		elig_ack_prev->next = elig_ack->next;
1359 	else
1360 		flow->head = elig_ack->next;
1361 
1362 	skb_mark_not_on_list(elig_ack);
1363 
1364 	return elig_ack;
1365 }
1366 
cake_ewma(u64 avg,u64 sample,u32 shift)1367 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1368 {
1369 	avg -= avg >> shift;
1370 	avg += sample >> shift;
1371 	return avg;
1372 }
1373 
cake_calc_overhead(struct cake_sched_data * qd,u32 len,u32 off)1374 static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off)
1375 {
1376 	struct cake_sched_config *q = qd->config;
1377 
1378 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1379 		len -= off;
1380 
1381 	if (qd->max_netlen < len)
1382 		qd->max_netlen = len;
1383 	if (qd->min_netlen > len)
1384 		qd->min_netlen = len;
1385 
1386 	len += q->rate_overhead;
1387 
1388 	if (len < q->rate_mpu)
1389 		len = q->rate_mpu;
1390 
1391 	if (q->atm_mode == CAKE_ATM_ATM) {
1392 		len += 47;
1393 		len /= 48;
1394 		len *= 53;
1395 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1396 		/* Add one byte per 64 bytes or part thereof.
1397 		 * This is conservative and easier to calculate than the
1398 		 * precise value.
1399 		 */
1400 		len += (len + 63) / 64;
1401 	}
1402 
1403 	if (qd->max_adjlen < len)
1404 		qd->max_adjlen = len;
1405 	if (qd->min_adjlen > len)
1406 		qd->min_adjlen = len;
1407 
1408 	return len;
1409 }
1410 
cake_overhead(struct cake_sched_data * q,const struct sk_buff * skb)1411 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1412 {
1413 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1414 	unsigned int hdr_len, last_len = 0;
1415 	u32 off = skb_network_offset(skb);
1416 	u16 segs = qdisc_pkt_segs(skb);
1417 	u32 len = qdisc_pkt_len(skb);
1418 
1419 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1420 
1421 	if (segs == 1)
1422 		return cake_calc_overhead(q, len, off);
1423 
1424 	/* borrowed from qdisc_pkt_len_segs_init() */
1425 	if (!skb->encapsulation)
1426 		hdr_len = skb_transport_offset(skb);
1427 	else
1428 		hdr_len = skb_inner_transport_offset(skb);
1429 
1430 	/* + transport layer */
1431 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1432 						SKB_GSO_TCPV6))) {
1433 		const struct tcphdr *th;
1434 		struct tcphdr _tcphdr;
1435 
1436 		th = skb_header_pointer(skb, hdr_len,
1437 					sizeof(_tcphdr), &_tcphdr);
1438 		if (likely(th))
1439 			hdr_len += __tcp_hdrlen(th);
1440 	} else {
1441 		struct udphdr _udphdr;
1442 
1443 		if (skb_header_pointer(skb, hdr_len,
1444 				       sizeof(_udphdr), &_udphdr))
1445 			hdr_len += sizeof(struct udphdr);
1446 	}
1447 
1448 	len = shinfo->gso_size + hdr_len;
1449 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1450 
1451 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1452 		cake_calc_overhead(q, last_len, off));
1453 }
1454 
cake_heap_swap(struct cake_sched_data * q,u16 i,u16 j)1455 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1456 {
1457 	struct cake_heap_entry ii = q->overflow_heap[i];
1458 	struct cake_heap_entry jj = q->overflow_heap[j];
1459 
1460 	q->overflow_heap[i] = jj;
1461 	q->overflow_heap[j] = ii;
1462 
1463 	q->tins[ii.t].overflow_idx[ii.b] = j;
1464 	q->tins[jj.t].overflow_idx[jj.b] = i;
1465 }
1466 
cake_heap_get_backlog(const struct cake_sched_data * q,u16 i)1467 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1468 {
1469 	struct cake_heap_entry ii = q->overflow_heap[i];
1470 
1471 	return q->tins[ii.t].backlogs[ii.b];
1472 }
1473 
cake_heapify(struct cake_sched_data * q,u16 i)1474 static void cake_heapify(struct cake_sched_data *q, u16 i)
1475 {
1476 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1477 	u32 mb = cake_heap_get_backlog(q, i);
1478 	u32 m = i;
1479 
1480 	while (m < a) {
1481 		u32 l = m + m + 1;
1482 		u32 r = l + 1;
1483 
1484 		if (l < a) {
1485 			u32 lb = cake_heap_get_backlog(q, l);
1486 
1487 			if (lb > mb) {
1488 				m  = l;
1489 				mb = lb;
1490 			}
1491 		}
1492 
1493 		if (r < a) {
1494 			u32 rb = cake_heap_get_backlog(q, r);
1495 
1496 			if (rb > mb) {
1497 				m  = r;
1498 				mb = rb;
1499 			}
1500 		}
1501 
1502 		if (m != i) {
1503 			cake_heap_swap(q, i, m);
1504 			i = m;
1505 		} else {
1506 			break;
1507 		}
1508 	}
1509 }
1510 
cake_heapify_up(struct cake_sched_data * q,u16 i)1511 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1512 {
1513 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1514 		u16 p = (i - 1) >> 1;
1515 		u32 ib = cake_heap_get_backlog(q, i);
1516 		u32 pb = cake_heap_get_backlog(q, p);
1517 
1518 		if (ib > pb) {
1519 			cake_heap_swap(q, i, p);
1520 			i = p;
1521 		} else {
1522 			break;
1523 		}
1524 	}
1525 }
1526 
cake_advance_shaper(struct cake_sched_data * q,struct cake_tin_data * b,struct sk_buff * skb,ktime_t now,bool drop)1527 static int cake_advance_shaper(struct cake_sched_data *q,
1528 			       struct cake_tin_data *b,
1529 			       struct sk_buff *skb,
1530 			       ktime_t now, bool drop)
1531 {
1532 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1533 
1534 	/* charge packet bandwidth to this tin
1535 	 * and to the global shaper.
1536 	 */
1537 	if (q->rate_ns) {
1538 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1539 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1540 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1541 
1542 		if (ktime_before(b->time_next_packet, now))
1543 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1544 							   tin_dur);
1545 
1546 		else if (ktime_before(b->time_next_packet,
1547 				      ktime_add_ns(now, tin_dur)))
1548 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1549 
1550 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1551 						   global_dur);
1552 		if (!drop)
1553 			q->failsafe_next_packet = \
1554 				ktime_add_ns(q->failsafe_next_packet,
1555 					     failsafe_dur);
1556 	}
1557 	return len;
1558 }
1559 
cake_drop(struct Qdisc * sch,struct sk_buff ** to_free)1560 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1561 {
1562 	struct cake_sched_data *q = qdisc_priv(sch);
1563 	ktime_t now = ktime_get();
1564 	u32 idx = 0, tin = 0, len;
1565 	struct cake_heap_entry qq;
1566 	struct cake_tin_data *b;
1567 	struct cake_flow *flow;
1568 	struct sk_buff *skb;
1569 
1570 	if (!q->overflow_timeout) {
1571 		int i;
1572 		/* Build fresh max-heap */
1573 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
1574 			cake_heapify(q, i);
1575 	}
1576 	q->overflow_timeout = 65535;
1577 
1578 	/* select longest queue for pruning */
1579 	qq  = q->overflow_heap[0];
1580 	tin = qq.t;
1581 	idx = qq.b;
1582 
1583 	b = &q->tins[tin];
1584 	flow = &b->flows[idx];
1585 	skb = dequeue_head(flow);
1586 	if (unlikely(!skb)) {
1587 		/* heap has gone wrong, rebuild it next time */
1588 		q->overflow_timeout = 0;
1589 		return idx + (tin << 16);
1590 	}
1591 
1592 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1593 		b->unresponsive_flow_count++;
1594 
1595 	len = qdisc_pkt_len(skb);
1596 	q->buffer_used      -= skb->truesize;
1597 	b->backlogs[idx]    -= len;
1598 	b->tin_backlog      -= len;
1599 	sch->qstats.backlog -= len;
1600 
1601 	flow->dropped++;
1602 	b->tin_dropped++;
1603 
1604 	if (q->config->rate_flags & CAKE_FLAG_INGRESS)
1605 		cake_advance_shaper(q, b, skb, now, true);
1606 
1607 	qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
1608 	sch->q.qlen--;
1609 
1610 	cake_heapify(q, 0);
1611 
1612 	return idx + (tin << 16);
1613 }
1614 
cake_handle_diffserv(struct sk_buff * skb,bool wash)1615 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1616 {
1617 	const int offset = skb_network_offset(skb);
1618 	u16 *buf, buf_;
1619 	u8 dscp;
1620 
1621 	switch (skb_protocol(skb, true)) {
1622 	case htons(ETH_P_IP):
1623 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1624 		if (unlikely(!buf))
1625 			return 0;
1626 
1627 		/* ToS is in the second byte of iphdr */
1628 		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1629 
1630 		if (wash && dscp) {
1631 			const int wlen = offset + sizeof(struct iphdr);
1632 
1633 			if (!pskb_may_pull(skb, wlen) ||
1634 			    skb_try_make_writable(skb, wlen))
1635 				return 0;
1636 
1637 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1638 		}
1639 
1640 		return dscp;
1641 
1642 	case htons(ETH_P_IPV6):
1643 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1644 		if (unlikely(!buf))
1645 			return 0;
1646 
1647 		/* Traffic class is in the first and second bytes of ipv6hdr */
1648 		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1649 
1650 		if (wash && dscp) {
1651 			const int wlen = offset + sizeof(struct ipv6hdr);
1652 
1653 			if (!pskb_may_pull(skb, wlen) ||
1654 			    skb_try_make_writable(skb, wlen))
1655 				return 0;
1656 
1657 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1658 		}
1659 
1660 		return dscp;
1661 
1662 	case htons(ETH_P_ARP):
1663 		return 0x38;  /* CS7 - Net Control */
1664 
1665 	default:
1666 		/* If there is no Diffserv field, treat as best-effort */
1667 		return 0;
1668 	}
1669 }
1670 
cake_select_tin(struct Qdisc * sch,struct sk_buff * skb)1671 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1672 					     struct sk_buff *skb)
1673 {
1674 	struct cake_sched_data *qd = qdisc_priv(sch);
1675 	struct cake_sched_config *q = qd->config;
1676 	u32 tin, mark;
1677 	bool wash;
1678 	u8 dscp;
1679 
1680 	/* Tin selection: Default to diffserv-based selection, allow overriding
1681 	 * using firewall marks or skb->priority. Call DSCP parsing early if
1682 	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1683 	 */
1684 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1685 	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1686 	if (wash)
1687 		dscp = cake_handle_diffserv(skb, wash);
1688 
1689 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1690 		tin = 0;
1691 
1692 	else if (mark && mark <= qd->tin_cnt)
1693 		tin = qd->tin_order[mark - 1];
1694 
1695 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1696 		 TC_H_MIN(skb->priority) > 0 &&
1697 		 TC_H_MIN(skb->priority) <= qd->tin_cnt)
1698 		tin = qd->tin_order[TC_H_MIN(skb->priority) - 1];
1699 
1700 	else {
1701 		if (!wash)
1702 			dscp = cake_handle_diffserv(skb, wash);
1703 		tin = qd->tin_index[dscp];
1704 
1705 		if (unlikely(tin >= qd->tin_cnt))
1706 			tin = 0;
1707 	}
1708 
1709 	return &qd->tins[tin];
1710 }
1711 
cake_classify(struct Qdisc * sch,struct cake_tin_data ** t,struct sk_buff * skb,int flow_mode,int * qerr)1712 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1713 			 struct sk_buff *skb, int flow_mode, int *qerr)
1714 {
1715 	struct cake_sched_data *q = qdisc_priv(sch);
1716 	struct tcf_proto *filter;
1717 	struct tcf_result res;
1718 	u16 flow = 0, host = 0;
1719 	int result;
1720 
1721 	filter = rcu_dereference_bh(q->filter_list);
1722 	if (!filter)
1723 		goto hash;
1724 
1725 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1726 	result = tcf_classify(skb, NULL, filter, &res, false);
1727 
1728 	if (result >= 0) {
1729 #ifdef CONFIG_NET_CLS_ACT
1730 		switch (result) {
1731 		case TC_ACT_STOLEN:
1732 		case TC_ACT_QUEUED:
1733 		case TC_ACT_TRAP:
1734 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1735 			fallthrough;
1736 		case TC_ACT_SHOT:
1737 			return 0;
1738 		}
1739 #endif
1740 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1741 			flow = TC_H_MIN(res.classid);
1742 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1743 			host = TC_H_MAJ(res.classid) >> 16;
1744 	}
1745 hash:
1746 	*t = cake_select_tin(sch, skb);
1747 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1748 }
1749 
1750 static void cake_reconfigure(struct Qdisc *sch);
1751 
cake_enqueue(struct sk_buff * skb,struct Qdisc * sch,struct sk_buff ** to_free)1752 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1753 			struct sk_buff **to_free)
1754 {
1755 	u32 idx, tin, prev_qlen, prev_backlog, drop_id;
1756 	struct cake_sched_data *q = qdisc_priv(sch);
1757 	int len = qdisc_pkt_len(skb), ret;
1758 	struct sk_buff *ack = NULL;
1759 	ktime_t now = ktime_get();
1760 	struct cake_tin_data *b;
1761 	struct cake_flow *flow;
1762 	bool same_flow = false;
1763 
1764 	/* choose flow to insert into */
1765 	idx = cake_classify(sch, &b, skb, q->config->flow_mode, &ret);
1766 	if (idx == 0) {
1767 		if (ret & __NET_XMIT_BYPASS)
1768 			qdisc_qstats_drop(sch);
1769 		__qdisc_drop(skb, to_free);
1770 		return ret;
1771 	}
1772 	tin = (u32)(b - q->tins);
1773 	idx--;
1774 	flow = &b->flows[idx];
1775 
1776 	/* ensure shaper state isn't stale */
1777 	if (!b->tin_backlog) {
1778 		if (ktime_before(b->time_next_packet, now))
1779 			b->time_next_packet = now;
1780 
1781 		if (!sch->q.qlen) {
1782 			if (ktime_before(q->time_next_packet, now)) {
1783 				q->failsafe_next_packet = now;
1784 				q->time_next_packet = now;
1785 			} else if (ktime_after(q->time_next_packet, now) &&
1786 				   ktime_after(q->failsafe_next_packet, now)) {
1787 				u64 next = \
1788 					min(ktime_to_ns(q->time_next_packet),
1789 					    ktime_to_ns(
1790 						   q->failsafe_next_packet));
1791 				sch->qstats.overlimits++;
1792 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1793 			}
1794 		}
1795 	}
1796 
1797 	if (unlikely(len > b->max_skblen))
1798 		b->max_skblen = len;
1799 
1800 	if (qdisc_pkt_segs(skb) > 1 && q->config->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1801 		struct sk_buff *segs, *nskb;
1802 		netdev_features_t features = netif_skb_features(skb);
1803 		unsigned int slen = 0, numsegs = 0;
1804 
1805 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1806 		if (IS_ERR_OR_NULL(segs))
1807 			return qdisc_drop(skb, sch, to_free);
1808 
1809 		skb_list_walk_safe(segs, segs, nskb) {
1810 			skb_mark_not_on_list(segs);
1811 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1812 			qdisc_skb_cb(segs)->pkt_segs = 1;
1813 			cobalt_set_enqueue_time(segs, now);
1814 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1815 									  segs);
1816 			flow_queue_add(flow, segs);
1817 
1818 			sch->q.qlen++;
1819 			numsegs++;
1820 			slen += segs->len;
1821 			q->buffer_used += segs->truesize;
1822 			b->packets++;
1823 		}
1824 
1825 		/* stats */
1826 		b->bytes	    += slen;
1827 		b->backlogs[idx]    += slen;
1828 		b->tin_backlog      += slen;
1829 		sch->qstats.backlog += slen;
1830 		q->avg_window_bytes += slen;
1831 
1832 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1833 		consume_skb(skb);
1834 	} else {
1835 		/* not splitting */
1836 		int ack_pkt_len = 0;
1837 
1838 		cobalt_set_enqueue_time(skb, now);
1839 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1840 		flow_queue_add(flow, skb);
1841 
1842 		if (q->config->ack_filter)
1843 			ack = cake_ack_filter(q, flow);
1844 
1845 		if (ack) {
1846 			b->ack_drops++;
1847 			sch->qstats.drops++;
1848 			ack_pkt_len = qdisc_pkt_len(ack);
1849 			b->bytes += ack_pkt_len;
1850 			q->buffer_used += skb->truesize - ack->truesize;
1851 			if (q->config->rate_flags & CAKE_FLAG_INGRESS)
1852 				cake_advance_shaper(q, b, ack, now, true);
1853 
1854 			qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len);
1855 			consume_skb(ack);
1856 		} else {
1857 			sch->q.qlen++;
1858 			q->buffer_used      += skb->truesize;
1859 		}
1860 
1861 		/* stats */
1862 		b->packets++;
1863 		b->bytes	    += len - ack_pkt_len;
1864 		b->backlogs[idx]    += len - ack_pkt_len;
1865 		b->tin_backlog      += len - ack_pkt_len;
1866 		sch->qstats.backlog += len - ack_pkt_len;
1867 		q->avg_window_bytes += len - ack_pkt_len;
1868 	}
1869 
1870 	if (q->overflow_timeout)
1871 		cake_heapify_up(q, b->overflow_idx[idx]);
1872 
1873 	/* incoming bandwidth capacity estimate */
1874 	if (q->config->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1875 		u64 packet_interval = \
1876 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1877 
1878 		if (packet_interval > NSEC_PER_SEC)
1879 			packet_interval = NSEC_PER_SEC;
1880 
1881 		/* filter out short-term bursts, eg. wifi aggregation */
1882 		q->avg_packet_interval = \
1883 			cake_ewma(q->avg_packet_interval,
1884 				  packet_interval,
1885 				  (packet_interval > q->avg_packet_interval ?
1886 					  2 : 8));
1887 
1888 		q->last_packet_time = now;
1889 
1890 		if (packet_interval > q->avg_packet_interval) {
1891 			u64 window_interval = \
1892 				ktime_to_ns(ktime_sub(now,
1893 						      q->avg_window_begin));
1894 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1895 
1896 			b = div64_u64(b, window_interval);
1897 			q->avg_peak_bandwidth =
1898 				cake_ewma(q->avg_peak_bandwidth, b,
1899 					  b > q->avg_peak_bandwidth ? 2 : 8);
1900 			q->avg_window_bytes = 0;
1901 			q->avg_window_begin = now;
1902 
1903 			if (ktime_after(now,
1904 					ktime_add_ms(q->last_reconfig_time,
1905 						     250))) {
1906 				q->config->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1907 				cake_reconfigure(sch);
1908 			}
1909 		}
1910 	} else {
1911 		q->avg_window_bytes = 0;
1912 		q->last_packet_time = now;
1913 	}
1914 
1915 	/* flowchain */
1916 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1917 		if (!flow->set) {
1918 			list_add_tail(&flow->flowchain, &b->new_flows);
1919 		} else {
1920 			b->decaying_flow_count--;
1921 			list_move_tail(&flow->flowchain, &b->new_flows);
1922 		}
1923 		flow->set = CAKE_SET_SPARSE;
1924 		b->sparse_flow_count++;
1925 
1926 		flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode);
1927 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1928 		/* this flow was empty, accounted as a sparse flow, but actually
1929 		 * in the bulk rotation.
1930 		 */
1931 		flow->set = CAKE_SET_BULK;
1932 		b->sparse_flow_count--;
1933 		b->bulk_flow_count++;
1934 
1935 		cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
1936 		cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
1937 	}
1938 
1939 	if (q->buffer_used > q->buffer_max_used)
1940 		q->buffer_max_used = q->buffer_used;
1941 
1942 	if (q->buffer_used <= q->buffer_limit)
1943 		return NET_XMIT_SUCCESS;
1944 
1945 	prev_qlen = sch->q.qlen;
1946 	prev_backlog = sch->qstats.backlog;
1947 
1948 	while (q->buffer_used > q->buffer_limit) {
1949 		drop_id = cake_drop(sch, to_free);
1950 		if ((drop_id >> 16) == tin &&
1951 		    (drop_id & 0xFFFF) == idx)
1952 			same_flow = true;
1953 	}
1954 
1955 	prev_qlen -= sch->q.qlen;
1956 	prev_backlog -= sch->qstats.backlog;
1957 	b->drop_overlimit += prev_qlen;
1958 
1959 	if (same_flow) {
1960 		qdisc_tree_reduce_backlog(sch, prev_qlen - 1,
1961 					  prev_backlog - len);
1962 		return NET_XMIT_CN;
1963 	}
1964 	qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog);
1965 	return NET_XMIT_SUCCESS;
1966 }
1967 
cake_dequeue_one(struct Qdisc * sch)1968 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1969 {
1970 	struct cake_sched_data *q = qdisc_priv(sch);
1971 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1972 	struct cake_flow *flow = &b->flows[q->cur_flow];
1973 	struct sk_buff *skb = NULL;
1974 	u32 len;
1975 
1976 	if (flow->head) {
1977 		skb = dequeue_head(flow);
1978 		len = qdisc_pkt_len(skb);
1979 		b->backlogs[q->cur_flow] -= len;
1980 		b->tin_backlog		 -= len;
1981 		sch->qstats.backlog      -= len;
1982 		q->buffer_used		 -= skb->truesize;
1983 		sch->q.qlen--;
1984 
1985 		if (q->overflow_timeout)
1986 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1987 	}
1988 	return skb;
1989 }
1990 
1991 /* Discard leftover packets from a tin no longer in use. */
cake_clear_tin(struct Qdisc * sch,u16 tin)1992 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1993 {
1994 	struct cake_sched_data *q = qdisc_priv(sch);
1995 	struct sk_buff *skb;
1996 
1997 	q->cur_tin = tin;
1998 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1999 		while (!!(skb = cake_dequeue_one(sch)))
2000 			kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
2001 }
2002 
cake_dequeue(struct Qdisc * sch)2003 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
2004 {
2005 	struct cake_sched_data *q = qdisc_priv(sch);
2006 	struct cake_tin_data *b = &q->tins[q->cur_tin];
2007 	enum skb_drop_reason reason;
2008 	ktime_t now = ktime_get();
2009 	struct cake_flow *flow;
2010 	struct list_head *head;
2011 	bool first_flow = true;
2012 	struct sk_buff *skb;
2013 	u64 delay;
2014 	u32 len;
2015 
2016 	if (q->config->is_shared && q->rate_ns &&
2017 	    now - q->last_checked_active >= q->config->sync_time) {
2018 		struct net_device *dev = qdisc_dev(sch);
2019 		struct cake_sched_data *other_priv;
2020 		u64 new_rate = q->config->rate_bps;
2021 		u64 other_qlen, other_last_active;
2022 		struct Qdisc *other_sch;
2023 		u32 num_active_qs = 1;
2024 		unsigned int ntx;
2025 
2026 		for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
2027 			other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
2028 			other_priv = qdisc_priv(other_sch);
2029 
2030 			if (other_priv == q)
2031 				continue;
2032 
2033 			other_qlen = READ_ONCE(other_sch->q.qlen);
2034 			other_last_active = READ_ONCE(other_priv->last_active);
2035 
2036 			if (other_qlen || other_last_active > q->last_checked_active)
2037 				num_active_qs++;
2038 		}
2039 
2040 		if (num_active_qs > 1)
2041 			new_rate = div64_u64(q->config->rate_bps, num_active_qs);
2042 
2043 		cake_configure_rates(sch, new_rate, true);
2044 		q->last_checked_active = now;
2045 		q->active_queues = num_active_qs;
2046 	}
2047 
2048 begin:
2049 	if (!sch->q.qlen)
2050 		return NULL;
2051 
2052 	/* global hard shaper */
2053 	if (ktime_after(q->time_next_packet, now) &&
2054 	    ktime_after(q->failsafe_next_packet, now)) {
2055 		u64 next = min(ktime_to_ns(q->time_next_packet),
2056 			       ktime_to_ns(q->failsafe_next_packet));
2057 
2058 		sch->qstats.overlimits++;
2059 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2060 		return NULL;
2061 	}
2062 
2063 	/* Choose a class to work on. */
2064 	if (!q->rate_ns) {
2065 		/* In unlimited mode, can't rely on shaper timings, just balance
2066 		 * with DRR
2067 		 */
2068 		bool wrapped = false, empty = true;
2069 
2070 		while (b->tin_deficit < 0 ||
2071 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
2072 			if (b->tin_deficit <= 0)
2073 				b->tin_deficit += b->tin_quantum;
2074 			if (b->sparse_flow_count + b->bulk_flow_count)
2075 				empty = false;
2076 
2077 			q->cur_tin++;
2078 			b++;
2079 			if (q->cur_tin >= q->tin_cnt) {
2080 				q->cur_tin = 0;
2081 				b = q->tins;
2082 
2083 				if (wrapped) {
2084 					/* It's possible for q->qlen to be
2085 					 * nonzero when we actually have no
2086 					 * packets anywhere.
2087 					 */
2088 					if (empty)
2089 						return NULL;
2090 				} else {
2091 					wrapped = true;
2092 				}
2093 			}
2094 		}
2095 	} else {
2096 		/* In shaped mode, choose:
2097 		 * - Highest-priority tin with queue and meeting schedule, or
2098 		 * - The earliest-scheduled tin with queue.
2099 		 */
2100 		ktime_t best_time = KTIME_MAX;
2101 		int tin, best_tin = 0;
2102 
2103 		for (tin = 0; tin < q->tin_cnt; tin++) {
2104 			b = q->tins + tin;
2105 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2106 				ktime_t time_to_pkt = \
2107 					ktime_sub(b->time_next_packet, now);
2108 
2109 				if (ktime_to_ns(time_to_pkt) <= 0 ||
2110 				    ktime_compare(time_to_pkt,
2111 						  best_time) <= 0) {
2112 					best_time = time_to_pkt;
2113 					best_tin = tin;
2114 				}
2115 			}
2116 		}
2117 
2118 		q->cur_tin = best_tin;
2119 		b = q->tins + best_tin;
2120 
2121 		/* No point in going further if no packets to deliver. */
2122 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2123 			return NULL;
2124 	}
2125 
2126 retry:
2127 	/* service this class */
2128 	head = &b->decaying_flows;
2129 	if (!first_flow || list_empty(head)) {
2130 		head = &b->new_flows;
2131 		if (list_empty(head)) {
2132 			head = &b->old_flows;
2133 			if (unlikely(list_empty(head))) {
2134 				head = &b->decaying_flows;
2135 				if (unlikely(list_empty(head)))
2136 					goto begin;
2137 			}
2138 		}
2139 	}
2140 	flow = list_first_entry(head, struct cake_flow, flowchain);
2141 	q->cur_flow = flow - b->flows;
2142 	first_flow = false;
2143 
2144 	/* flow isolation (DRR++) */
2145 	if (flow->deficit <= 0) {
2146 		/* Keep all flows with deficits out of the sparse and decaying
2147 		 * rotations.  No non-empty flow can go into the decaying
2148 		 * rotation, so they can't get deficits
2149 		 */
2150 		if (flow->set == CAKE_SET_SPARSE) {
2151 			if (flow->head) {
2152 				b->sparse_flow_count--;
2153 				b->bulk_flow_count++;
2154 
2155 				cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
2156 				cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
2157 
2158 				flow->set = CAKE_SET_BULK;
2159 			} else {
2160 				/* we've moved it to the bulk rotation for
2161 				 * correct deficit accounting but we still want
2162 				 * to count it as a sparse flow, not a bulk one.
2163 				 */
2164 				flow->set = CAKE_SET_SPARSE_WAIT;
2165 			}
2166 		}
2167 
2168 		flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode);
2169 		list_move_tail(&flow->flowchain, &b->old_flows);
2170 
2171 		goto retry;
2172 	}
2173 
2174 	/* Retrieve a packet via the AQM */
2175 	while (1) {
2176 		skb = cake_dequeue_one(sch);
2177 		if (!skb) {
2178 			/* this queue was actually empty */
2179 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2180 				b->unresponsive_flow_count--;
2181 
2182 			if (flow->cvars.p_drop || flow->cvars.count ||
2183 			    ktime_before(now, flow->cvars.drop_next)) {
2184 				/* keep in the flowchain until the state has
2185 				 * decayed to rest
2186 				 */
2187 				list_move_tail(&flow->flowchain,
2188 					       &b->decaying_flows);
2189 				if (flow->set == CAKE_SET_BULK) {
2190 					b->bulk_flow_count--;
2191 
2192 					cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
2193 					cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
2194 
2195 					b->decaying_flow_count++;
2196 				} else if (flow->set == CAKE_SET_SPARSE ||
2197 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2198 					b->sparse_flow_count--;
2199 					b->decaying_flow_count++;
2200 				}
2201 				flow->set = CAKE_SET_DECAYING;
2202 			} else {
2203 				/* remove empty queue from the flowchain */
2204 				list_del_init(&flow->flowchain);
2205 				if (flow->set == CAKE_SET_SPARSE ||
2206 				    flow->set == CAKE_SET_SPARSE_WAIT)
2207 					b->sparse_flow_count--;
2208 				else if (flow->set == CAKE_SET_BULK) {
2209 					b->bulk_flow_count--;
2210 
2211 					cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
2212 					cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
2213 				} else
2214 					b->decaying_flow_count--;
2215 
2216 				flow->set = CAKE_SET_NONE;
2217 			}
2218 			goto begin;
2219 		}
2220 
2221 		reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2222 					    (b->bulk_flow_count *
2223 					     !!(q->config->rate_flags &
2224 						CAKE_FLAG_INGRESS)));
2225 		/* Last packet in queue may be marked, shouldn't be dropped */
2226 		if (reason == SKB_NOT_DROPPED_YET || !flow->head)
2227 			break;
2228 
2229 		/* drop this packet, get another one */
2230 		if (q->config->rate_flags & CAKE_FLAG_INGRESS) {
2231 			len = cake_advance_shaper(q, b, skb,
2232 						  now, true);
2233 			flow->deficit -= len;
2234 			b->tin_deficit -= len;
2235 		}
2236 		flow->dropped++;
2237 		b->tin_dropped++;
2238 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2239 		qdisc_qstats_drop(sch);
2240 		qdisc_dequeue_drop(sch, skb, reason);
2241 		if (q->config->rate_flags & CAKE_FLAG_INGRESS)
2242 			goto retry;
2243 	}
2244 
2245 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2246 	qdisc_bstats_update(sch, skb);
2247 	WRITE_ONCE(q->last_active, now);
2248 
2249 	/* collect delay stats */
2250 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2251 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2252 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2253 				  delay > b->peak_delay ? 2 : 8);
2254 	b->base_delay = cake_ewma(b->base_delay, delay,
2255 				  delay < b->base_delay ? 2 : 8);
2256 
2257 	len = cake_advance_shaper(q, b, skb, now, false);
2258 	flow->deficit -= len;
2259 	b->tin_deficit -= len;
2260 
2261 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2262 		u64 next = min(ktime_to_ns(q->time_next_packet),
2263 			       ktime_to_ns(q->failsafe_next_packet));
2264 
2265 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2266 	} else if (!sch->q.qlen) {
2267 		int i;
2268 
2269 		for (i = 0; i < q->tin_cnt; i++) {
2270 			if (q->tins[i].decaying_flow_count) {
2271 				ktime_t next = \
2272 					ktime_add_ns(now,
2273 						     q->tins[i].cparams.target);
2274 
2275 				qdisc_watchdog_schedule_ns(&q->watchdog,
2276 							   ktime_to_ns(next));
2277 				break;
2278 			}
2279 		}
2280 	}
2281 
2282 	if (q->overflow_timeout)
2283 		q->overflow_timeout--;
2284 
2285 	return skb;
2286 }
2287 
cake_reset(struct Qdisc * sch)2288 static void cake_reset(struct Qdisc *sch)
2289 {
2290 	struct cake_sched_data *q = qdisc_priv(sch);
2291 	u32 c;
2292 
2293 	if (!q->tins)
2294 		return;
2295 
2296 	for (c = 0; c < CAKE_MAX_TINS; c++)
2297 		cake_clear_tin(sch, c);
2298 }
2299 
2300 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2301 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2302 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2303 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2304 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2305 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2306 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2307 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2308 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2309 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2310 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2311 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2312 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2313 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2314 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2315 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2316 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2317 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2318 };
2319 
cake_set_rate(struct cake_tin_data * b,u64 rate,u32 mtu,u64 target_ns,u64 rtt_est_ns)2320 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2321 			  u64 target_ns, u64 rtt_est_ns)
2322 {
2323 	/* convert byte-rate into time-per-byte
2324 	 * so it will always unwedge in reasonable time.
2325 	 */
2326 	static const u64 MIN_RATE = 64;
2327 	u32 byte_target = mtu;
2328 	u64 byte_target_ns;
2329 	u8  rate_shft = 0;
2330 	u64 rate_ns = 0;
2331 
2332 	b->flow_quantum = 1514;
2333 	if (rate) {
2334 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2335 		rate_shft = 34;
2336 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2337 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2338 		while (!!(rate_ns >> 34)) {
2339 			rate_ns >>= 1;
2340 			rate_shft--;
2341 		}
2342 	} /* else unlimited, ie. zero delay */
2343 
2344 	b->tin_rate_bps  = rate;
2345 	b->tin_rate_ns   = rate_ns;
2346 	b->tin_rate_shft = rate_shft;
2347 
2348 	if (mtu == 0)
2349 		return;
2350 
2351 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2352 
2353 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2354 	b->cparams.interval = max(rtt_est_ns +
2355 				     b->cparams.target - target_ns,
2356 				     b->cparams.target * 2);
2357 	b->cparams.mtu_time = byte_target_ns;
2358 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2359 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2360 }
2361 
cake_config_besteffort(struct Qdisc * sch,u64 rate,u32 mtu)2362 static int cake_config_besteffort(struct Qdisc *sch, u64 rate, u32 mtu)
2363 {
2364 	struct cake_sched_data *q = qdisc_priv(sch);
2365 	struct cake_tin_data *b = &q->tins[0];
2366 
2367 	q->tin_cnt = 1;
2368 
2369 	q->tin_index = besteffort;
2370 	q->tin_order = normal_order;
2371 
2372 	cake_set_rate(b, rate, mtu,
2373 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2374 	b->tin_quantum = 65535;
2375 
2376 	return 0;
2377 }
2378 
cake_config_precedence(struct Qdisc * sch,u64 rate,u32 mtu)2379 static int cake_config_precedence(struct Qdisc *sch, u64 rate, u32 mtu)
2380 {
2381 	/* convert high-level (user visible) parameters into internal format */
2382 	struct cake_sched_data *q = qdisc_priv(sch);
2383 	u32 quantum = 256;
2384 	u32 i;
2385 
2386 	q->tin_cnt = 8;
2387 	q->tin_index = precedence;
2388 	q->tin_order = normal_order;
2389 
2390 	for (i = 0; i < q->tin_cnt; i++) {
2391 		struct cake_tin_data *b = &q->tins[i];
2392 
2393 		cake_set_rate(b, rate, mtu, us_to_ns(q->config->target),
2394 			      us_to_ns(q->config->interval));
2395 
2396 		b->tin_quantum = max_t(u16, 1U, quantum);
2397 
2398 		/* calculate next class's parameters */
2399 		rate  *= 7;
2400 		rate >>= 3;
2401 
2402 		quantum  *= 7;
2403 		quantum >>= 3;
2404 	}
2405 
2406 	return 0;
2407 }
2408 
2409 /*	List of known Diffserv codepoints:
2410  *
2411  *	Default Forwarding (DF/CS0) - Best Effort
2412  *	Max Throughput (TOS2)
2413  *	Min Delay (TOS4)
2414  *	LLT "La" (TOS5)
2415  *	Assured Forwarding 1 (AF1x) - x3
2416  *	Assured Forwarding 2 (AF2x) - x3
2417  *	Assured Forwarding 3 (AF3x) - x3
2418  *	Assured Forwarding 4 (AF4x) - x3
2419  *	Precedence Class 1 (CS1)
2420  *	Precedence Class 2 (CS2)
2421  *	Precedence Class 3 (CS3)
2422  *	Precedence Class 4 (CS4)
2423  *	Precedence Class 5 (CS5)
2424  *	Precedence Class 6 (CS6)
2425  *	Precedence Class 7 (CS7)
2426  *	Voice Admit (VA)
2427  *	Expedited Forwarding (EF)
2428  *	Lower Effort (LE)
2429  *
2430  *	Total 26 codepoints.
2431  */
2432 
2433 /*	List of traffic classes in RFC 4594, updated by RFC 8622:
2434  *		(roughly descending order of contended priority)
2435  *		(roughly ascending order of uncontended throughput)
2436  *
2437  *	Network Control (CS6,CS7)      - routing traffic
2438  *	Telephony (EF,VA)         - aka. VoIP streams
2439  *	Signalling (CS5)               - VoIP setup
2440  *	Multimedia Conferencing (AF4x) - aka. video calls
2441  *	Realtime Interactive (CS4)     - eg. games
2442  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2443  *	Broadcast Video (CS3)
2444  *	Low-Latency Data (AF2x,TOS4)      - eg. database
2445  *	Ops, Admin, Management (CS2)      - eg. ssh
2446  *	Standard Service (DF & unrecognised codepoints)
2447  *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2448  *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2449  *
2450  *	Total 12 traffic classes.
2451  */
2452 
cake_config_diffserv8(struct Qdisc * sch,u64 rate,u32 mtu)2453 static int cake_config_diffserv8(struct Qdisc *sch, u64 rate, u32 mtu)
2454 {
2455 /*	Pruned list of traffic classes for typical applications:
2456  *
2457  *		Network Control          (CS6, CS7)
2458  *		Minimum Latency          (EF, VA, CS5, CS4)
2459  *		Interactive Shell        (CS2)
2460  *		Low Latency Transactions (AF2x, TOS4)
2461  *		Video Streaming          (AF4x, AF3x, CS3)
2462  *		Bog Standard             (DF etc.)
2463  *		High Throughput          (AF1x, TOS2, CS1)
2464  *		Background Traffic       (LE)
2465  *
2466  *		Total 8 traffic classes.
2467  */
2468 
2469 	struct cake_sched_data *q = qdisc_priv(sch);
2470 	u32 quantum = 256;
2471 	u32 i;
2472 
2473 	q->tin_cnt = 8;
2474 
2475 	/* codepoint to class mapping */
2476 	q->tin_index = diffserv8;
2477 	q->tin_order = normal_order;
2478 
2479 	/* class characteristics */
2480 	for (i = 0; i < q->tin_cnt; i++) {
2481 		struct cake_tin_data *b = &q->tins[i];
2482 
2483 		cake_set_rate(b, rate, mtu, us_to_ns(q->config->target),
2484 			      us_to_ns(q->config->interval));
2485 
2486 		b->tin_quantum = max_t(u16, 1U, quantum);
2487 
2488 		/* calculate next class's parameters */
2489 		rate  *= 7;
2490 		rate >>= 3;
2491 
2492 		quantum  *= 7;
2493 		quantum >>= 3;
2494 	}
2495 
2496 	return 0;
2497 }
2498 
cake_config_diffserv4(struct Qdisc * sch,u64 rate,u32 mtu)2499 static int cake_config_diffserv4(struct Qdisc *sch, u64 rate, u32 mtu)
2500 {
2501 /*  Further pruned list of traffic classes for four-class system:
2502  *
2503  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2504  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2505  *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2506  *	    Background Traffic (LE, CS1)
2507  *
2508  *		Total 4 traffic classes.
2509  */
2510 
2511 	struct cake_sched_data *q = qdisc_priv(sch);
2512 	u32 quantum = 1024;
2513 
2514 	q->tin_cnt = 4;
2515 
2516 	/* codepoint to class mapping */
2517 	q->tin_index = diffserv4;
2518 	q->tin_order = bulk_order;
2519 
2520 	/* class characteristics */
2521 	cake_set_rate(&q->tins[0], rate, mtu,
2522 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2523 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2524 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2525 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2526 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2527 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2528 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2529 
2530 	/* bandwidth-sharing weights */
2531 	q->tins[0].tin_quantum = quantum;
2532 	q->tins[1].tin_quantum = quantum >> 4;
2533 	q->tins[2].tin_quantum = quantum >> 1;
2534 	q->tins[3].tin_quantum = quantum >> 2;
2535 
2536 	return 0;
2537 }
2538 
cake_config_diffserv3(struct Qdisc * sch,u64 rate,u32 mtu)2539 static int cake_config_diffserv3(struct Qdisc *sch, u64 rate, u32 mtu)
2540 {
2541 /*  Simplified Diffserv structure with 3 tins.
2542  *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2543  *		Best Effort
2544  *		Low Priority		(LE, CS1)
2545  */
2546 	struct cake_sched_data *q = qdisc_priv(sch);
2547 	u32 quantum = 1024;
2548 
2549 	q->tin_cnt = 3;
2550 
2551 	/* codepoint to class mapping */
2552 	q->tin_index = diffserv3;
2553 	q->tin_order = bulk_order;
2554 
2555 	/* class characteristics */
2556 	cake_set_rate(&q->tins[0], rate, mtu,
2557 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2558 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2559 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2560 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2561 		      us_to_ns(q->config->target), us_to_ns(q->config->interval));
2562 
2563 	/* bandwidth-sharing weights */
2564 	q->tins[0].tin_quantum = quantum;
2565 	q->tins[1].tin_quantum = quantum >> 4;
2566 	q->tins[2].tin_quantum = quantum >> 2;
2567 
2568 	return 0;
2569 }
2570 
cake_configure_rates(struct Qdisc * sch,u64 rate,bool rate_adjust)2571 static void cake_configure_rates(struct Qdisc *sch, u64 rate, bool rate_adjust)
2572 {
2573 	u32 mtu = likely(rate_adjust) ? 0 : psched_mtu(qdisc_dev(sch));
2574 	struct cake_sched_data *qd = qdisc_priv(sch);
2575 	struct cake_sched_config *q = qd->config;
2576 	int c, ft;
2577 
2578 	switch (q->tin_mode) {
2579 	case CAKE_DIFFSERV_BESTEFFORT:
2580 		ft = cake_config_besteffort(sch, rate, mtu);
2581 		break;
2582 
2583 	case CAKE_DIFFSERV_PRECEDENCE:
2584 		ft = cake_config_precedence(sch, rate, mtu);
2585 		break;
2586 
2587 	case CAKE_DIFFSERV_DIFFSERV8:
2588 		ft = cake_config_diffserv8(sch, rate, mtu);
2589 		break;
2590 
2591 	case CAKE_DIFFSERV_DIFFSERV4:
2592 		ft = cake_config_diffserv4(sch, rate, mtu);
2593 		break;
2594 
2595 	case CAKE_DIFFSERV_DIFFSERV3:
2596 	default:
2597 		ft = cake_config_diffserv3(sch, rate, mtu);
2598 		break;
2599 	}
2600 
2601 	for (c = qd->tin_cnt; c < CAKE_MAX_TINS; c++) {
2602 		cake_clear_tin(sch, c);
2603 		qd->tins[c].cparams.mtu_time = qd->tins[ft].cparams.mtu_time;
2604 	}
2605 
2606 	qd->rate_ns   = qd->tins[ft].tin_rate_ns;
2607 	qd->rate_shft = qd->tins[ft].tin_rate_shft;
2608 }
2609 
cake_reconfigure(struct Qdisc * sch)2610 static void cake_reconfigure(struct Qdisc *sch)
2611 {
2612 	struct cake_sched_data *qd = qdisc_priv(sch);
2613 	struct cake_sched_config *q = qd->config;
2614 
2615 	cake_configure_rates(sch, qd->config->rate_bps, false);
2616 
2617 	if (q->buffer_config_limit) {
2618 		qd->buffer_limit = q->buffer_config_limit;
2619 	} else if (q->rate_bps) {
2620 		u64 t = q->rate_bps * q->interval;
2621 
2622 		do_div(t, USEC_PER_SEC / 4);
2623 		qd->buffer_limit = max_t(u32, t, 4U << 20);
2624 	} else {
2625 		qd->buffer_limit = ~0;
2626 	}
2627 
2628 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2629 
2630 	qd->buffer_limit = min(qd->buffer_limit,
2631 			       max(sch->limit * psched_mtu(qdisc_dev(sch)),
2632 				   q->buffer_config_limit));
2633 }
2634 
cake_config_change(struct cake_sched_config * q,struct nlattr * opt,struct netlink_ext_ack * extack,bool * overhead_changed)2635 static int cake_config_change(struct cake_sched_config *q, struct nlattr *opt,
2636 			      struct netlink_ext_ack *extack, bool *overhead_changed)
2637 {
2638 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2639 	u16 rate_flags = q->rate_flags;
2640 	u8 flow_mode = q->flow_mode;
2641 	int err;
2642 
2643 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2644 					  extack);
2645 	if (err < 0)
2646 		return err;
2647 
2648 	if (tb[TCA_CAKE_NAT]) {
2649 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2650 		flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2651 		flow_mode |= CAKE_FLOW_NAT_FLAG *
2652 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2653 #else
2654 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2655 				    "No conntrack support in kernel");
2656 		return -EOPNOTSUPP;
2657 #endif
2658 	}
2659 
2660 	if (tb[TCA_CAKE_AUTORATE]) {
2661 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) {
2662 			if (q->is_shared) {
2663 				NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_AUTORATE],
2664 						    "Can't use autorate-ingress with cake_mq");
2665 				return -EOPNOTSUPP;
2666 			}
2667 			rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2668 		} else {
2669 			rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2670 		}
2671 	}
2672 
2673 	if (tb[TCA_CAKE_BASE_RATE64])
2674 		WRITE_ONCE(q->rate_bps,
2675 			   nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));
2676 
2677 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2678 		WRITE_ONCE(q->tin_mode,
2679 			   nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));
2680 
2681 	if (tb[TCA_CAKE_WASH]) {
2682 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2683 			rate_flags |= CAKE_FLAG_WASH;
2684 		else
2685 			rate_flags &= ~CAKE_FLAG_WASH;
2686 	}
2687 
2688 	if (tb[TCA_CAKE_FLOW_MODE])
2689 		flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) |
2690 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2691 					CAKE_FLOW_MASK));
2692 
2693 	if (tb[TCA_CAKE_ATM])
2694 		WRITE_ONCE(q->atm_mode,
2695 			   nla_get_u32(tb[TCA_CAKE_ATM]));
2696 
2697 	if (tb[TCA_CAKE_OVERHEAD]) {
2698 		WRITE_ONCE(q->rate_overhead,
2699 			   nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
2700 		rate_flags |= CAKE_FLAG_OVERHEAD;
2701 		*overhead_changed = true;
2702 	}
2703 
2704 	if (tb[TCA_CAKE_RAW]) {
2705 		rate_flags &= ~CAKE_FLAG_OVERHEAD;
2706 		*overhead_changed = true;
2707 	}
2708 
2709 	if (tb[TCA_CAKE_MPU])
2710 		WRITE_ONCE(q->rate_mpu,
2711 			   nla_get_u32(tb[TCA_CAKE_MPU]));
2712 
2713 	if (tb[TCA_CAKE_RTT]) {
2714 		u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2715 
2716 		WRITE_ONCE(q->interval, max(interval, 1U));
2717 	}
2718 
2719 	if (tb[TCA_CAKE_TARGET]) {
2720 		u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2721 
2722 		WRITE_ONCE(q->target, max(target, 1U));
2723 	}
2724 
2725 	if (tb[TCA_CAKE_INGRESS]) {
2726 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2727 			rate_flags |= CAKE_FLAG_INGRESS;
2728 		else
2729 			rate_flags &= ~CAKE_FLAG_INGRESS;
2730 	}
2731 
2732 	if (tb[TCA_CAKE_ACK_FILTER])
2733 		WRITE_ONCE(q->ack_filter,
2734 			   nla_get_u32(tb[TCA_CAKE_ACK_FILTER]));
2735 
2736 	if (tb[TCA_CAKE_MEMORY])
2737 		WRITE_ONCE(q->buffer_config_limit,
2738 			   nla_get_u32(tb[TCA_CAKE_MEMORY]));
2739 
2740 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2741 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2742 			rate_flags |= CAKE_FLAG_SPLIT_GSO;
2743 		else
2744 			rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2745 	}
2746 
2747 	if (tb[TCA_CAKE_FWMARK]) {
2748 		WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK]));
2749 		WRITE_ONCE(q->fwmark_shft,
2750 			   q->fwmark_mask ? __ffs(q->fwmark_mask) : 0);
2751 	}
2752 
2753 	WRITE_ONCE(q->rate_flags, rate_flags);
2754 	WRITE_ONCE(q->flow_mode, flow_mode);
2755 
2756 	return 0;
2757 }
2758 
cake_change(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2759 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2760 		       struct netlink_ext_ack *extack)
2761 {
2762 	struct cake_sched_data *qd = qdisc_priv(sch);
2763 	struct cake_sched_config *q = qd->config;
2764 	bool overhead_changed = false;
2765 	int ret;
2766 
2767 	if (q->is_shared) {
2768 		NL_SET_ERR_MSG(extack, "can't reconfigure cake_mq sub-qdiscs");
2769 		return -EOPNOTSUPP;
2770 	}
2771 
2772 	ret = cake_config_change(q, opt, extack, &overhead_changed);
2773 	if (ret)
2774 		return ret;
2775 
2776 	if (overhead_changed) {
2777 		qd->max_netlen = 0;
2778 		qd->max_adjlen = 0;
2779 		qd->min_netlen = ~0;
2780 		qd->min_adjlen = ~0;
2781 	}
2782 
2783 	if (qd->tins) {
2784 		sch_tree_lock(sch);
2785 		cake_reconfigure(sch);
2786 		sch_tree_unlock(sch);
2787 	}
2788 
2789 	return 0;
2790 }
2791 
cake_destroy(struct Qdisc * sch)2792 static void cake_destroy(struct Qdisc *sch)
2793 {
2794 	struct cake_sched_data *q = qdisc_priv(sch);
2795 
2796 	qdisc_watchdog_cancel(&q->watchdog);
2797 	tcf_block_put(q->block);
2798 	kvfree(q->tins);
2799 }
2800 
cake_config_init(struct cake_sched_config * q,bool is_shared)2801 static void cake_config_init(struct cake_sched_config *q, bool is_shared)
2802 {
2803 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2804 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2805 
2806 	q->rate_bps = 0; /* unlimited by default */
2807 
2808 	q->interval = 100000; /* 100ms default */
2809 	q->target   =   5000; /* 5ms: codel RFC argues
2810 			       * for 5 to 10% of interval
2811 			       */
2812 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2813 	q->is_shared = is_shared;
2814 	q->sync_time = 200 * NSEC_PER_USEC;
2815 }
2816 
cake_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2817 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2818 		     struct netlink_ext_ack *extack)
2819 {
2820 	struct cake_sched_data *qd = qdisc_priv(sch);
2821 	struct cake_sched_config *q = &qd->initial_config;
2822 	int i, j, err;
2823 
2824 	cake_config_init(q, false);
2825 
2826 	sch->limit = 10240;
2827 	sch->flags |= TCQ_F_DEQUEUE_DROPS;
2828 
2829 	qd->cur_tin = 0;
2830 	qd->cur_flow  = 0;
2831 	qd->config = q;
2832 
2833 	qdisc_watchdog_init(&qd->watchdog, sch);
2834 
2835 	if (opt) {
2836 		err = cake_change(sch, opt, extack);
2837 		if (err)
2838 			return err;
2839 	}
2840 
2841 	err = tcf_block_get(&qd->block, &qd->filter_list, sch, extack);
2842 	if (err)
2843 		return err;
2844 
2845 	quantum_div[0] = ~0;
2846 	for (i = 1; i <= CAKE_QUEUES; i++)
2847 		quantum_div[i] = 65535 / i;
2848 
2849 	qd->tins = kvzalloc_objs(struct cake_tin_data, CAKE_MAX_TINS);
2850 	if (!qd->tins)
2851 		return -ENOMEM;
2852 
2853 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2854 		struct cake_tin_data *b = qd->tins + i;
2855 
2856 		INIT_LIST_HEAD(&b->new_flows);
2857 		INIT_LIST_HEAD(&b->old_flows);
2858 		INIT_LIST_HEAD(&b->decaying_flows);
2859 		b->sparse_flow_count = 0;
2860 		b->bulk_flow_count = 0;
2861 		b->decaying_flow_count = 0;
2862 
2863 		for (j = 0; j < CAKE_QUEUES; j++) {
2864 			struct cake_flow *flow = b->flows + j;
2865 			u32 k = j * CAKE_MAX_TINS + i;
2866 
2867 			INIT_LIST_HEAD(&flow->flowchain);
2868 			cobalt_vars_init(&flow->cvars);
2869 
2870 			qd->overflow_heap[k].t = i;
2871 			qd->overflow_heap[k].b = j;
2872 			b->overflow_idx[j] = k;
2873 		}
2874 	}
2875 
2876 	cake_reconfigure(sch);
2877 	qd->avg_peak_bandwidth = q->rate_bps;
2878 	qd->min_netlen = ~0;
2879 	qd->min_adjlen = ~0;
2880 	qd->active_queues = 0;
2881 	qd->last_checked_active = 0;
2882 
2883 	return 0;
2884 }
2885 
cake_config_replace(struct Qdisc * sch,struct cake_sched_config * cfg)2886 static void cake_config_replace(struct Qdisc *sch, struct cake_sched_config *cfg)
2887 {
2888 	struct cake_sched_data *qd = qdisc_priv(sch);
2889 
2890 	qd->config = cfg;
2891 	cake_reconfigure(sch);
2892 }
2893 
cake_config_dump(struct cake_sched_config * q,struct sk_buff * skb)2894 static int cake_config_dump(struct cake_sched_config *q, struct sk_buff *skb)
2895 {
2896 	struct nlattr *opts;
2897 	u16 rate_flags;
2898 	u8 flow_mode;
2899 
2900 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2901 	if (!opts)
2902 		goto nla_put_failure;
2903 
2904 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64,
2905 			      READ_ONCE(q->rate_bps), TCA_CAKE_PAD))
2906 		goto nla_put_failure;
2907 
2908 	flow_mode = READ_ONCE(q->flow_mode);
2909 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK))
2910 		goto nla_put_failure;
2911 
2912 	if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval)))
2913 		goto nla_put_failure;
2914 
2915 	if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target)))
2916 		goto nla_put_failure;
2917 
2918 	if (nla_put_u32(skb, TCA_CAKE_MEMORY,
2919 			READ_ONCE(q->buffer_config_limit)))
2920 		goto nla_put_failure;
2921 
2922 	rate_flags = READ_ONCE(q->rate_flags);
2923 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2924 			!!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2925 		goto nla_put_failure;
2926 
2927 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2928 			!!(rate_flags & CAKE_FLAG_INGRESS)))
2929 		goto nla_put_failure;
2930 
2931 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter)))
2932 		goto nla_put_failure;
2933 
2934 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2935 			!!(flow_mode & CAKE_FLOW_NAT_FLAG)))
2936 		goto nla_put_failure;
2937 
2938 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode)))
2939 		goto nla_put_failure;
2940 
2941 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2942 			!!(rate_flags & CAKE_FLAG_WASH)))
2943 		goto nla_put_failure;
2944 
2945 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead)))
2946 		goto nla_put_failure;
2947 
2948 	if (!(rate_flags & CAKE_FLAG_OVERHEAD))
2949 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2950 			goto nla_put_failure;
2951 
2952 	if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode)))
2953 		goto nla_put_failure;
2954 
2955 	if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu)))
2956 		goto nla_put_failure;
2957 
2958 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2959 			!!(rate_flags & CAKE_FLAG_SPLIT_GSO)))
2960 		goto nla_put_failure;
2961 
2962 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask)))
2963 		goto nla_put_failure;
2964 
2965 	return nla_nest_end(skb, opts);
2966 
2967 nla_put_failure:
2968 	return -1;
2969 }
2970 
cake_dump(struct Qdisc * sch,struct sk_buff * skb)2971 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2972 {
2973 	struct cake_sched_data *qd = qdisc_priv(sch);
2974 
2975 	return cake_config_dump(qd->config, skb);
2976 }
2977 
cake_dump_stats(struct Qdisc * sch,struct gnet_dump * d)2978 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2979 {
2980 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2981 	struct cake_sched_data *q = qdisc_priv(sch);
2982 	struct nlattr *tstats, *ts;
2983 	int i;
2984 
2985 	if (!stats)
2986 		return -1;
2987 
2988 #define PUT_STAT_U32(attr, data) do {				       \
2989 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2990 			goto nla_put_failure;			       \
2991 	} while (0)
2992 #define PUT_STAT_U64(attr, data) do {				       \
2993 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2994 					data, TCA_CAKE_STATS_PAD)) \
2995 			goto nla_put_failure;			       \
2996 	} while (0)
2997 
2998 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2999 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
3000 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
3001 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
3002 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
3003 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
3004 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
3005 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
3006 	PUT_STAT_U32(ACTIVE_QUEUES, q->active_queues);
3007 
3008 #undef PUT_STAT_U32
3009 #undef PUT_STAT_U64
3010 
3011 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
3012 	if (!tstats)
3013 		goto nla_put_failure;
3014 
3015 #define PUT_TSTAT_U32(attr, data) do {					\
3016 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
3017 			goto nla_put_failure;				\
3018 	} while (0)
3019 #define PUT_TSTAT_U64(attr, data) do {					\
3020 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
3021 					data, TCA_CAKE_TIN_STATS_PAD))	\
3022 			goto nla_put_failure;				\
3023 	} while (0)
3024 
3025 	for (i = 0; i < q->tin_cnt; i++) {
3026 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3027 
3028 		ts = nla_nest_start_noflag(d->skb, i + 1);
3029 		if (!ts)
3030 			goto nla_put_failure;
3031 
3032 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
3033 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
3034 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
3035 
3036 		PUT_TSTAT_U32(TARGET_US,
3037 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
3038 		PUT_TSTAT_U32(INTERVAL_US,
3039 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
3040 
3041 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
3042 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
3043 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
3044 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
3045 
3046 		PUT_TSTAT_U32(PEAK_DELAY_US,
3047 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
3048 		PUT_TSTAT_U32(AVG_DELAY_US,
3049 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
3050 		PUT_TSTAT_U32(BASE_DELAY_US,
3051 			      ktime_to_us(ns_to_ktime(b->base_delay)));
3052 
3053 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
3054 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
3055 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
3056 
3057 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
3058 					    b->decaying_flow_count);
3059 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
3060 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
3061 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
3062 
3063 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
3064 		nla_nest_end(d->skb, ts);
3065 	}
3066 
3067 #undef PUT_TSTAT_U32
3068 #undef PUT_TSTAT_U64
3069 
3070 	nla_nest_end(d->skb, tstats);
3071 	return nla_nest_end(d->skb, stats);
3072 
3073 nla_put_failure:
3074 	nla_nest_cancel(d->skb, stats);
3075 	return -1;
3076 }
3077 
cake_leaf(struct Qdisc * sch,unsigned long arg)3078 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
3079 {
3080 	return NULL;
3081 }
3082 
cake_find(struct Qdisc * sch,u32 classid)3083 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
3084 {
3085 	return 0;
3086 }
3087 
cake_bind(struct Qdisc * sch,unsigned long parent,u32 classid)3088 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
3089 			       u32 classid)
3090 {
3091 	return 0;
3092 }
3093 
cake_unbind(struct Qdisc * q,unsigned long cl)3094 static void cake_unbind(struct Qdisc *q, unsigned long cl)
3095 {
3096 }
3097 
cake_tcf_block(struct Qdisc * sch,unsigned long cl,struct netlink_ext_ack * extack)3098 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
3099 					struct netlink_ext_ack *extack)
3100 {
3101 	struct cake_sched_data *q = qdisc_priv(sch);
3102 
3103 	if (cl)
3104 		return NULL;
3105 	return q->block;
3106 }
3107 
cake_dump_class(struct Qdisc * sch,unsigned long cl,struct sk_buff * skb,struct tcmsg * tcm)3108 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
3109 			   struct sk_buff *skb, struct tcmsg *tcm)
3110 {
3111 	tcm->tcm_handle |= TC_H_MIN(cl);
3112 	return 0;
3113 }
3114 
cake_dump_class_stats(struct Qdisc * sch,unsigned long cl,struct gnet_dump * d)3115 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
3116 				 struct gnet_dump *d)
3117 {
3118 	struct cake_sched_data *q = qdisc_priv(sch);
3119 	const struct cake_flow *flow = NULL;
3120 	struct gnet_stats_queue qs = { 0 };
3121 	struct nlattr *stats;
3122 	u32 idx = cl - 1;
3123 
3124 	if (idx < CAKE_QUEUES * q->tin_cnt) {
3125 		const struct cake_tin_data *b = \
3126 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
3127 		const struct sk_buff *skb;
3128 
3129 		flow = &b->flows[idx % CAKE_QUEUES];
3130 
3131 		if (flow->head) {
3132 			sch_tree_lock(sch);
3133 			skb = flow->head;
3134 			while (skb) {
3135 				qs.qlen++;
3136 				skb = skb->next;
3137 			}
3138 			sch_tree_unlock(sch);
3139 		}
3140 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3141 		qs.drops = flow->dropped;
3142 	}
3143 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3144 		return -1;
3145 	if (flow) {
3146 		ktime_t now = ktime_get();
3147 
3148 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3149 		if (!stats)
3150 			return -1;
3151 
3152 #define PUT_STAT_U32(attr, data) do {				       \
3153 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3154 			goto nla_put_failure;			       \
3155 	} while (0)
3156 #define PUT_STAT_S32(attr, data) do {				       \
3157 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3158 			goto nla_put_failure;			       \
3159 	} while (0)
3160 
3161 		PUT_STAT_S32(DEFICIT, flow->deficit);
3162 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3163 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3164 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3165 		if (flow->cvars.p_drop) {
3166 			PUT_STAT_S32(BLUE_TIMER_US,
3167 				     ktime_to_us(
3168 					     ktime_sub(now,
3169 						       flow->cvars.blue_timer)));
3170 		}
3171 		if (flow->cvars.dropping) {
3172 			PUT_STAT_S32(DROP_NEXT_US,
3173 				     ktime_to_us(
3174 					     ktime_sub(now,
3175 						       flow->cvars.drop_next)));
3176 		}
3177 
3178 		if (nla_nest_end(d->skb, stats) < 0)
3179 			return -1;
3180 	}
3181 
3182 	return 0;
3183 
3184 nla_put_failure:
3185 	nla_nest_cancel(d->skb, stats);
3186 	return -1;
3187 }
3188 
cake_walk(struct Qdisc * sch,struct qdisc_walker * arg)3189 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3190 {
3191 	struct cake_sched_data *q = qdisc_priv(sch);
3192 	unsigned int i, j;
3193 
3194 	if (arg->stop)
3195 		return;
3196 
3197 	for (i = 0; i < q->tin_cnt; i++) {
3198 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3199 
3200 		for (j = 0; j < CAKE_QUEUES; j++) {
3201 			if (list_empty(&b->flows[j].flowchain)) {
3202 				arg->count++;
3203 				continue;
3204 			}
3205 			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3206 						 arg))
3207 				break;
3208 		}
3209 	}
3210 }
3211 
3212 static const struct Qdisc_class_ops cake_class_ops = {
3213 	.leaf		=	cake_leaf,
3214 	.find		=	cake_find,
3215 	.tcf_block	=	cake_tcf_block,
3216 	.bind_tcf	=	cake_bind,
3217 	.unbind_tcf	=	cake_unbind,
3218 	.dump		=	cake_dump_class,
3219 	.dump_stats	=	cake_dump_class_stats,
3220 	.walk		=	cake_walk,
3221 };
3222 
3223 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3224 	.cl_ops		=	&cake_class_ops,
3225 	.id		=	"cake",
3226 	.priv_size	=	sizeof(struct cake_sched_data),
3227 	.enqueue	=	cake_enqueue,
3228 	.dequeue	=	cake_dequeue,
3229 	.peek		=	qdisc_peek_dequeued,
3230 	.init		=	cake_init,
3231 	.reset		=	cake_reset,
3232 	.destroy	=	cake_destroy,
3233 	.change		=	cake_change,
3234 	.dump		=	cake_dump,
3235 	.dump_stats	=	cake_dump_stats,
3236 	.owner		=	THIS_MODULE,
3237 };
3238 MODULE_ALIAS_NET_SCH("cake");
3239 
3240 struct cake_mq_sched {
3241 	struct mq_sched mq_priv; /* must be first */
3242 	struct cake_sched_config cake_config;
3243 };
3244 
cake_mq_destroy(struct Qdisc * sch)3245 static void cake_mq_destroy(struct Qdisc *sch)
3246 {
3247 	mq_destroy_common(sch);
3248 }
3249 
cake_mq_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)3250 static int cake_mq_init(struct Qdisc *sch, struct nlattr *opt,
3251 			struct netlink_ext_ack *extack)
3252 {
3253 	struct cake_mq_sched *priv = qdisc_priv(sch);
3254 	struct net_device *dev = qdisc_dev(sch);
3255 	int ret, ntx;
3256 	bool _unused;
3257 
3258 	cake_config_init(&priv->cake_config, true);
3259 	if (opt) {
3260 		ret = cake_config_change(&priv->cake_config, opt, extack, &_unused);
3261 		if (ret)
3262 			return ret;
3263 	}
3264 
3265 	ret = mq_init_common(sch, opt, extack, &cake_qdisc_ops);
3266 	if (ret)
3267 		return ret;
3268 
3269 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++)
3270 		cake_config_replace(priv->mq_priv.qdiscs[ntx], &priv->cake_config);
3271 
3272 	return 0;
3273 }
3274 
cake_mq_dump(struct Qdisc * sch,struct sk_buff * skb)3275 static int cake_mq_dump(struct Qdisc *sch, struct sk_buff *skb)
3276 {
3277 	struct cake_mq_sched *priv = qdisc_priv(sch);
3278 
3279 	mq_dump_common(sch, skb);
3280 	return cake_config_dump(&priv->cake_config, skb);
3281 }
3282 
cake_mq_change(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)3283 static int cake_mq_change(struct Qdisc *sch, struct nlattr *opt,
3284 			  struct netlink_ext_ack *extack)
3285 {
3286 	struct cake_mq_sched *priv = qdisc_priv(sch);
3287 	struct net_device *dev = qdisc_dev(sch);
3288 	bool overhead_changed = false;
3289 	unsigned int ntx;
3290 	int ret;
3291 
3292 	ret = cake_config_change(&priv->cake_config, opt, extack, &overhead_changed);
3293 	if (ret)
3294 		return ret;
3295 
3296 	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
3297 		struct Qdisc *chld = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
3298 		struct cake_sched_data *qd = qdisc_priv(chld);
3299 
3300 		if (overhead_changed) {
3301 			qd->max_netlen = 0;
3302 			qd->max_adjlen = 0;
3303 			qd->min_netlen = ~0;
3304 			qd->min_adjlen = ~0;
3305 		}
3306 
3307 		if (qd->tins) {
3308 			sch_tree_lock(chld);
3309 			cake_reconfigure(chld);
3310 			sch_tree_unlock(chld);
3311 		}
3312 	}
3313 
3314 	return 0;
3315 }
3316 
cake_mq_graft(struct Qdisc * sch,unsigned long cl,struct Qdisc * new,struct Qdisc ** old,struct netlink_ext_ack * extack)3317 static int cake_mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
3318 			 struct Qdisc **old, struct netlink_ext_ack *extack)
3319 {
3320 	NL_SET_ERR_MSG(extack, "can't replace cake_mq sub-qdiscs");
3321 	return -EOPNOTSUPP;
3322 }
3323 
3324 static const struct Qdisc_class_ops cake_mq_class_ops = {
3325 	.select_queue	= mq_select_queue,
3326 	.graft		= cake_mq_graft,
3327 	.leaf		= mq_leaf,
3328 	.find		= mq_find,
3329 	.walk		= mq_walk,
3330 	.dump		= mq_dump_class,
3331 	.dump_stats	= mq_dump_class_stats,
3332 };
3333 
3334 static struct Qdisc_ops cake_mq_qdisc_ops __read_mostly = {
3335 	.cl_ops		=	&cake_mq_class_ops,
3336 	.id		=	"cake_mq",
3337 	.priv_size	=	sizeof(struct cake_mq_sched),
3338 	.init		=	cake_mq_init,
3339 	.destroy	=	cake_mq_destroy,
3340 	.attach		=	mq_attach,
3341 	.change		=	cake_mq_change,
3342 	.change_real_num_tx = mq_change_real_num_tx,
3343 	.dump		=	cake_mq_dump,
3344 	.owner		=	THIS_MODULE,
3345 };
3346 MODULE_ALIAS_NET_SCH("cake_mq");
3347 
cake_module_init(void)3348 static int __init cake_module_init(void)
3349 {
3350 	int ret;
3351 
3352 	ret = register_qdisc(&cake_qdisc_ops);
3353 	if (ret)
3354 		return ret;
3355 
3356 	ret = register_qdisc(&cake_mq_qdisc_ops);
3357 	if (ret)
3358 		unregister_qdisc(&cake_qdisc_ops);
3359 
3360 	return ret;
3361 }
3362 
cake_module_exit(void)3363 static void __exit cake_module_exit(void)
3364 {
3365 	unregister_qdisc(&cake_qdisc_ops);
3366 	unregister_qdisc(&cake_mq_qdisc_ops);
3367 }
3368 
3369 module_init(cake_module_init)
3370 module_exit(cake_module_exit)
3371 MODULE_AUTHOR("Jonathan Morton");
3372 MODULE_LICENSE("Dual BSD/GPL");
3373 MODULE_DESCRIPTION("The CAKE shaper.");
3374 MODULE_IMPORT_NS("NET_SCHED_INTERNAL");
3375