xref: /linux/net/sched/sch_cake.c (revision 6439a0e64c355d2e375bd094f365d56ce81faba3)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/pkt_cls.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73 
74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75 #include <net/netfilter/nf_conntrack_core.h>
76 #endif
77 
78 #define CAKE_SET_WAYS (8)
79 #define CAKE_MAX_TINS (8)
80 #define CAKE_QUEUES (1024)
81 #define CAKE_FLOW_MASK 63
82 #define CAKE_FLOW_NAT_FLAG 64
83 
84 /* struct cobalt_params - contains codel and blue parameters
85  * @interval:	codel initial drop rate
86  * @target:     maximum persistent sojourn time & blue update rate
87  * @mtu_time:   serialisation delay of maximum-size packet
88  * @p_inc:      increment of blue drop probability (0.32 fxp)
89  * @p_dec:      decrement of blue drop probability (0.32 fxp)
90  */
91 struct cobalt_params {
92 	u64	interval;
93 	u64	target;
94 	u64	mtu_time;
95 	u32	p_inc;
96 	u32	p_dec;
97 };
98 
99 /* struct cobalt_vars - contains codel and blue variables
100  * @count:		codel dropping frequency
101  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
102  * @drop_next:		time to drop next packet, or when we dropped last
103  * @blue_timer:		Blue time to next drop
104  * @p_drop:		BLUE drop probability (0.32 fxp)
105  * @dropping:		set if in dropping state
106  * @ecn_marked:		set if marked
107  */
108 struct cobalt_vars {
109 	u32	count;
110 	u32	rec_inv_sqrt;
111 	ktime_t	drop_next;
112 	ktime_t	blue_timer;
113 	u32     p_drop;
114 	bool	dropping;
115 	bool    ecn_marked;
116 };
117 
118 enum {
119 	CAKE_SET_NONE = 0,
120 	CAKE_SET_SPARSE,
121 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122 	CAKE_SET_BULK,
123 	CAKE_SET_DECAYING
124 };
125 
126 struct cake_flow {
127 	/* this stuff is all needed per-flow at dequeue time */
128 	struct sk_buff	  *head;
129 	struct sk_buff	  *tail;
130 	struct list_head  flowchain;
131 	s32		  deficit;
132 	u32		  dropped;
133 	struct cobalt_vars cvars;
134 	u16		  srchost; /* index into cake_host table */
135 	u16		  dsthost;
136 	u8		  set;
137 }; /* please try to keep this structure <= 64 bytes */
138 
139 struct cake_host {
140 	u32 srchost_tag;
141 	u32 dsthost_tag;
142 	u16 srchost_bulk_flow_count;
143 	u16 dsthost_bulk_flow_count;
144 };
145 
146 struct cake_heap_entry {
147 	u16 t:3, b:10;
148 };
149 
150 struct cake_tin_data {
151 	struct cake_flow flows[CAKE_QUEUES];
152 	u32	backlogs[CAKE_QUEUES];
153 	u32	tags[CAKE_QUEUES]; /* for set association */
154 	u16	overflow_idx[CAKE_QUEUES];
155 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156 	u16	flow_quantum;
157 
158 	struct cobalt_params cparams;
159 	u32	drop_overlimit;
160 	u16	bulk_flow_count;
161 	u16	sparse_flow_count;
162 	u16	decaying_flow_count;
163 	u16	unresponsive_flow_count;
164 
165 	u32	max_skblen;
166 
167 	struct list_head new_flows;
168 	struct list_head old_flows;
169 	struct list_head decaying_flows;
170 
171 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172 	ktime_t	time_next_packet;
173 	u64	tin_rate_ns;
174 	u64	tin_rate_bps;
175 	u16	tin_rate_shft;
176 
177 	u16	tin_quantum;
178 	s32	tin_deficit;
179 	u32	tin_backlog;
180 	u32	tin_dropped;
181 	u32	tin_ecn_mark;
182 
183 	u32	packets;
184 	u64	bytes;
185 
186 	u32	ack_drops;
187 
188 	/* moving averages */
189 	u64 avge_delay;
190 	u64 peak_delay;
191 	u64 base_delay;
192 
193 	/* hash function stats */
194 	u32	way_directs;
195 	u32	way_hits;
196 	u32	way_misses;
197 	u32	way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199 
200 struct cake_sched_data {
201 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
202 	struct tcf_block *block;
203 	struct cake_tin_data *tins;
204 
205 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206 	u16		overflow_timeout;
207 
208 	u16		tin_cnt;
209 	u8		tin_mode;
210 	u8		flow_mode;
211 	u8		ack_filter;
212 	u8		atm_mode;
213 
214 	u32		fwmark_mask;
215 	u16		fwmark_shft;
216 
217 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
218 	u16		rate_shft;
219 	ktime_t		time_next_packet;
220 	ktime_t		failsafe_next_packet;
221 	u64		rate_ns;
222 	u64		rate_bps;
223 	u16		rate_flags;
224 	s16		rate_overhead;
225 	u16		rate_mpu;
226 	u64		interval;
227 	u64		target;
228 
229 	/* resource tracking */
230 	u32		buffer_used;
231 	u32		buffer_max_used;
232 	u32		buffer_limit;
233 	u32		buffer_config_limit;
234 
235 	/* indices for dequeue */
236 	u16		cur_tin;
237 	u16		cur_flow;
238 
239 	struct qdisc_watchdog watchdog;
240 	const u8	*tin_index;
241 	const u8	*tin_order;
242 
243 	/* bandwidth capacity estimate */
244 	ktime_t		last_packet_time;
245 	ktime_t		avg_window_begin;
246 	u64		avg_packet_interval;
247 	u64		avg_window_bytes;
248 	u64		avg_peak_bandwidth;
249 	ktime_t		last_reconfig_time;
250 
251 	/* packet length stats */
252 	u32		avg_netoff;
253 	u16		max_netlen;
254 	u16		max_adjlen;
255 	u16		min_netlen;
256 	u16		min_adjlen;
257 };
258 
259 enum {
260 	CAKE_FLAG_OVERHEAD	   = BIT(0),
261 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
262 	CAKE_FLAG_INGRESS	   = BIT(2),
263 	CAKE_FLAG_WASH		   = BIT(3),
264 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
265 };
266 
267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
268  * obtain the best features of each.  Codel is excellent on flows which
269  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
270  * unresponsive flows.
271  */
272 
273 struct cobalt_skb_cb {
274 	ktime_t enqueue_time;
275 	u32     adjusted_len;
276 };
277 
us_to_ns(u64 us)278 static u64 us_to_ns(u64 us)
279 {
280 	return us * NSEC_PER_USEC;
281 }
282 
get_cobalt_cb(const struct sk_buff * skb)283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
284 {
285 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
287 }
288 
cobalt_get_enqueue_time(const struct sk_buff * skb)289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
290 {
291 	return get_cobalt_cb(skb)->enqueue_time;
292 }
293 
cobalt_set_enqueue_time(struct sk_buff * skb,ktime_t now)294 static void cobalt_set_enqueue_time(struct sk_buff *skb,
295 				    ktime_t now)
296 {
297 	get_cobalt_cb(skb)->enqueue_time = now;
298 }
299 
300 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
301 
302 /* Diffserv lookup tables */
303 
304 static const u8 precedence[] = {
305 	0, 0, 0, 0, 0, 0, 0, 0,
306 	1, 1, 1, 1, 1, 1, 1, 1,
307 	2, 2, 2, 2, 2, 2, 2, 2,
308 	3, 3, 3, 3, 3, 3, 3, 3,
309 	4, 4, 4, 4, 4, 4, 4, 4,
310 	5, 5, 5, 5, 5, 5, 5, 5,
311 	6, 6, 6, 6, 6, 6, 6, 6,
312 	7, 7, 7, 7, 7, 7, 7, 7,
313 };
314 
315 static const u8 diffserv8[] = {
316 	2, 0, 1, 2, 4, 2, 2, 2,
317 	1, 2, 1, 2, 1, 2, 1, 2,
318 	5, 2, 4, 2, 4, 2, 4, 2,
319 	3, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 3, 2, 3, 2, 3, 2,
321 	6, 2, 2, 2, 6, 2, 6, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 	7, 2, 2, 2, 2, 2, 2, 2,
324 };
325 
326 static const u8 diffserv4[] = {
327 	0, 1, 0, 0, 2, 0, 0, 0,
328 	1, 0, 0, 0, 0, 0, 0, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	2, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 2, 0, 2, 0, 2, 0,
332 	3, 0, 0, 0, 3, 0, 3, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 	3, 0, 0, 0, 0, 0, 0, 0,
335 };
336 
337 static const u8 diffserv3[] = {
338 	0, 1, 0, 0, 2, 0, 0, 0,
339 	1, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 0, 0, 0, 0,
343 	0, 0, 0, 0, 2, 0, 2, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 	2, 0, 0, 0, 0, 0, 0, 0,
346 };
347 
348 static const u8 besteffort[] = {
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 	0, 0, 0, 0, 0, 0, 0, 0,
357 };
358 
359 /* tin priority order for stats dumping */
360 
361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
362 static const u8 bulk_order[] = {1, 0, 2, 3};
363 
364 /* There is a big difference in timing between the accurate values placed in the
365  * cache and the approximations given by a single Newton step for small count
366  * values, particularly when stepping from count 1 to 2 or vice versa. Hence,
367  * these values are calculated using eight Newton steps, using the
368  * implementation below. Above 16, a single Newton step gives sufficient
369  * accuracy in either direction, given the precision stored.
370  *
371  * The magnitude of the error when stepping up to count 2 is such as to give the
372  * value that *should* have been produced at count 4.
373  */
374 
375 #define REC_INV_SQRT_CACHE (16)
376 static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
377 		~0,         ~0, 3037000500, 2479700525,
378 	2147483647, 1920767767, 1753413056, 1623345051,
379 	1518500250, 1431655765, 1358187914, 1294981364,
380 	1239850263, 1191209601, 1147878294, 1108955788
381 };
382 
383 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
384  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
385  *
386  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
387  */
388 
cobalt_newton_step(struct cobalt_vars * vars)389 static void cobalt_newton_step(struct cobalt_vars *vars)
390 {
391 	u32 invsqrt, invsqrt2;
392 	u64 val;
393 
394 	invsqrt = vars->rec_inv_sqrt;
395 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
396 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
397 
398 	val >>= 2; /* avoid overflow in following multiply */
399 	val = (val * invsqrt) >> (32 - 2 + 1);
400 
401 	vars->rec_inv_sqrt = val;
402 }
403 
cobalt_invsqrt(struct cobalt_vars * vars)404 static void cobalt_invsqrt(struct cobalt_vars *vars)
405 {
406 	if (vars->count < REC_INV_SQRT_CACHE)
407 		vars->rec_inv_sqrt = inv_sqrt_cache[vars->count];
408 	else
409 		cobalt_newton_step(vars);
410 }
411 
cobalt_vars_init(struct cobalt_vars * vars)412 static void cobalt_vars_init(struct cobalt_vars *vars)
413 {
414 	memset(vars, 0, sizeof(*vars));
415 }
416 
417 /* CoDel control_law is t + interval/sqrt(count)
418  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
419  * both sqrt() and divide operation.
420  */
cobalt_control(ktime_t t,u64 interval,u32 rec_inv_sqrt)421 static ktime_t cobalt_control(ktime_t t,
422 			      u64 interval,
423 			      u32 rec_inv_sqrt)
424 {
425 	return ktime_add_ns(t, reciprocal_scale(interval,
426 						rec_inv_sqrt));
427 }
428 
429 /* Call this when a packet had to be dropped due to queue overflow.  Returns
430  * true if the BLUE state was quiescent before but active after this call.
431  */
cobalt_queue_full(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)432 static bool cobalt_queue_full(struct cobalt_vars *vars,
433 			      struct cobalt_params *p,
434 			      ktime_t now)
435 {
436 	bool up = false;
437 
438 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
439 		up = !vars->p_drop;
440 		vars->p_drop += p->p_inc;
441 		if (vars->p_drop < p->p_inc)
442 			vars->p_drop = ~0;
443 		vars->blue_timer = now;
444 	}
445 	vars->dropping = true;
446 	vars->drop_next = now;
447 	if (!vars->count)
448 		vars->count = 1;
449 
450 	return up;
451 }
452 
453 /* Call this when the queue was serviced but turned out to be empty.  Returns
454  * true if the BLUE state was active before but quiescent after this call.
455  */
cobalt_queue_empty(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)456 static bool cobalt_queue_empty(struct cobalt_vars *vars,
457 			       struct cobalt_params *p,
458 			       ktime_t now)
459 {
460 	bool down = false;
461 
462 	if (vars->p_drop &&
463 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
464 		if (vars->p_drop < p->p_dec)
465 			vars->p_drop = 0;
466 		else
467 			vars->p_drop -= p->p_dec;
468 		vars->blue_timer = now;
469 		down = !vars->p_drop;
470 	}
471 	vars->dropping = false;
472 
473 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
474 		vars->count--;
475 		cobalt_invsqrt(vars);
476 		vars->drop_next = cobalt_control(vars->drop_next,
477 						 p->interval,
478 						 vars->rec_inv_sqrt);
479 	}
480 
481 	return down;
482 }
483 
484 /* Call this with a freshly dequeued packet for possible congestion marking.
485  * Returns true as an instruction to drop the packet, false for delivery.
486  */
cobalt_should_drop(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now,struct sk_buff * skb,u32 bulk_flows)487 static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
488 					       struct cobalt_params *p,
489 					       ktime_t now,
490 					       struct sk_buff *skb,
491 					       u32 bulk_flows)
492 {
493 	enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
494 	bool next_due, over_target;
495 	ktime_t schedule;
496 	u64 sojourn;
497 
498 /* The 'schedule' variable records, in its sign, whether 'now' is before or
499  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
500  * scheduling decision is actually branched, without destroying that
501  * information.  Similarly, the first 'schedule' value calculated is preserved
502  * in the boolean 'next_due'.
503  *
504  * As for 'drop_next', we take advantage of the fact that 'interval' is both
505  * the delay between first exceeding 'target' and the first signalling event,
506  * *and* the scaling factor for the signalling frequency.  It's therefore very
507  * natural to use a single mechanism for both purposes, and eliminates a
508  * significant amount of reference Codel's spaghetti code.  To help with this,
509  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
510  * as possible to 1.0 in fixed-point.
511  */
512 
513 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
514 	schedule = ktime_sub(now, vars->drop_next);
515 	over_target = sojourn > p->target &&
516 		      sojourn > p->mtu_time * bulk_flows * 2 &&
517 		      sojourn > p->mtu_time * 4;
518 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
519 
520 	vars->ecn_marked = false;
521 
522 	if (over_target) {
523 		if (!vars->dropping) {
524 			vars->dropping = true;
525 			vars->drop_next = cobalt_control(now,
526 							 p->interval,
527 							 vars->rec_inv_sqrt);
528 		}
529 		if (!vars->count)
530 			vars->count = 1;
531 	} else if (vars->dropping) {
532 		vars->dropping = false;
533 	}
534 
535 	if (next_due && vars->dropping) {
536 		/* Use ECN mark if possible, otherwise drop */
537 		if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
538 			reason = SKB_DROP_REASON_QDISC_CONGESTED;
539 
540 		vars->count++;
541 		if (!vars->count)
542 			vars->count--;
543 		cobalt_invsqrt(vars);
544 		vars->drop_next = cobalt_control(vars->drop_next,
545 						 p->interval,
546 						 vars->rec_inv_sqrt);
547 		schedule = ktime_sub(now, vars->drop_next);
548 	} else {
549 		while (next_due) {
550 			vars->count--;
551 			cobalt_invsqrt(vars);
552 			vars->drop_next = cobalt_control(vars->drop_next,
553 							 p->interval,
554 							 vars->rec_inv_sqrt);
555 			schedule = ktime_sub(now, vars->drop_next);
556 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
557 		}
558 	}
559 
560 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
561 	if (vars->p_drop && reason == SKB_NOT_DROPPED_YET &&
562 	    get_random_u32() < vars->p_drop)
563 		reason = SKB_DROP_REASON_CAKE_FLOOD;
564 
565 	/* Overload the drop_next field as an activity timeout */
566 	if (!vars->count)
567 		vars->drop_next = ktime_add_ns(now, p->interval);
568 	else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET)
569 		vars->drop_next = now;
570 
571 	return reason;
572 }
573 
cake_update_flowkeys(struct flow_keys * keys,const struct sk_buff * skb)574 static bool cake_update_flowkeys(struct flow_keys *keys,
575 				 const struct sk_buff *skb)
576 {
577 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
578 	struct nf_conntrack_tuple tuple = {};
579 	bool rev = !skb->_nfct, upd = false;
580 	__be32 ip;
581 
582 	if (skb_protocol(skb, true) != htons(ETH_P_IP))
583 		return false;
584 
585 	if (!nf_ct_get_tuple_skb(&tuple, skb))
586 		return false;
587 
588 	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
589 	if (ip != keys->addrs.v4addrs.src) {
590 		keys->addrs.v4addrs.src = ip;
591 		upd = true;
592 	}
593 	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
594 	if (ip != keys->addrs.v4addrs.dst) {
595 		keys->addrs.v4addrs.dst = ip;
596 		upd = true;
597 	}
598 
599 	if (keys->ports.ports) {
600 		__be16 port;
601 
602 		port = rev ? tuple.dst.u.all : tuple.src.u.all;
603 		if (port != keys->ports.src) {
604 			keys->ports.src = port;
605 			upd = true;
606 		}
607 		port = rev ? tuple.src.u.all : tuple.dst.u.all;
608 		if (port != keys->ports.dst) {
609 			port = keys->ports.dst;
610 			upd = true;
611 		}
612 	}
613 	return upd;
614 #else
615 	return false;
616 #endif
617 }
618 
619 /* Cake has several subtle multiple bit settings. In these cases you
620  *  would be matching triple isolate mode as well.
621  */
622 
cake_dsrc(int flow_mode)623 static bool cake_dsrc(int flow_mode)
624 {
625 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
626 }
627 
cake_ddst(int flow_mode)628 static bool cake_ddst(int flow_mode)
629 {
630 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
631 }
632 
cake_dec_srchost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)633 static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q,
634 					     struct cake_flow *flow,
635 					     int flow_mode)
636 {
637 	if (likely(cake_dsrc(flow_mode) &&
638 		   q->hosts[flow->srchost].srchost_bulk_flow_count))
639 		q->hosts[flow->srchost].srchost_bulk_flow_count--;
640 }
641 
cake_inc_srchost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)642 static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q,
643 					     struct cake_flow *flow,
644 					     int flow_mode)
645 {
646 	if (likely(cake_dsrc(flow_mode) &&
647 		   q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES))
648 		q->hosts[flow->srchost].srchost_bulk_flow_count++;
649 }
650 
cake_dec_dsthost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)651 static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q,
652 					     struct cake_flow *flow,
653 					     int flow_mode)
654 {
655 	if (likely(cake_ddst(flow_mode) &&
656 		   q->hosts[flow->dsthost].dsthost_bulk_flow_count))
657 		q->hosts[flow->dsthost].dsthost_bulk_flow_count--;
658 }
659 
cake_inc_dsthost_bulk_flow_count(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)660 static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q,
661 					     struct cake_flow *flow,
662 					     int flow_mode)
663 {
664 	if (likely(cake_ddst(flow_mode) &&
665 		   q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES))
666 		q->hosts[flow->dsthost].dsthost_bulk_flow_count++;
667 }
668 
cake_get_flow_quantum(struct cake_tin_data * q,struct cake_flow * flow,int flow_mode)669 static u16 cake_get_flow_quantum(struct cake_tin_data *q,
670 				 struct cake_flow *flow,
671 				 int flow_mode)
672 {
673 	u16 host_load = 1;
674 
675 	if (cake_dsrc(flow_mode))
676 		host_load = max(host_load,
677 				q->hosts[flow->srchost].srchost_bulk_flow_count);
678 
679 	if (cake_ddst(flow_mode))
680 		host_load = max(host_load,
681 				q->hosts[flow->dsthost].dsthost_bulk_flow_count);
682 
683 	/* The get_random_u16() is a way to apply dithering to avoid
684 	 * accumulating roundoff errors
685 	 */
686 	return (q->flow_quantum * quantum_div[host_load] +
687 		get_random_u16()) >> 16;
688 }
689 
cake_hash(struct cake_tin_data * q,const struct sk_buff * skb,int flow_mode,u16 flow_override,u16 host_override)690 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
691 		     int flow_mode, u16 flow_override, u16 host_override)
692 {
693 	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
694 	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
695 	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
696 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
697 	u16 reduced_hash, srchost_idx, dsthost_idx;
698 	struct flow_keys keys, host_keys;
699 	bool use_skbhash = skb->l4_hash;
700 
701 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
702 		return 0;
703 
704 	/* If both overrides are set, or we can use the SKB hash and nat mode is
705 	 * disabled, we can skip packet dissection entirely. If nat mode is
706 	 * enabled there's another check below after doing the conntrack lookup.
707 	 */
708 	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
709 		goto skip_hash;
710 
711 	skb_flow_dissect_flow_keys(skb, &keys,
712 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
713 
714 	/* Don't use the SKB hash if we change the lookup keys from conntrack */
715 	if (nat_enabled && cake_update_flowkeys(&keys, skb))
716 		use_skbhash = false;
717 
718 	/* If we can still use the SKB hash and don't need the host hash, we can
719 	 * skip the rest of the hashing procedure
720 	 */
721 	if (use_skbhash && !hash_hosts)
722 		goto skip_hash;
723 
724 	/* flow_hash_from_keys() sorts the addresses by value, so we have
725 	 * to preserve their order in a separate data structure to treat
726 	 * src and dst host addresses as independently selectable.
727 	 */
728 	host_keys = keys;
729 	host_keys.ports.ports     = 0;
730 	host_keys.basic.ip_proto  = 0;
731 	host_keys.keyid.keyid     = 0;
732 	host_keys.tags.flow_label = 0;
733 
734 	switch (host_keys.control.addr_type) {
735 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
736 		host_keys.addrs.v4addrs.src = 0;
737 		dsthost_hash = flow_hash_from_keys(&host_keys);
738 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
739 		host_keys.addrs.v4addrs.dst = 0;
740 		srchost_hash = flow_hash_from_keys(&host_keys);
741 		break;
742 
743 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
744 		memset(&host_keys.addrs.v6addrs.src, 0,
745 		       sizeof(host_keys.addrs.v6addrs.src));
746 		dsthost_hash = flow_hash_from_keys(&host_keys);
747 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
748 		memset(&host_keys.addrs.v6addrs.dst, 0,
749 		       sizeof(host_keys.addrs.v6addrs.dst));
750 		srchost_hash = flow_hash_from_keys(&host_keys);
751 		break;
752 
753 	default:
754 		dsthost_hash = 0;
755 		srchost_hash = 0;
756 	}
757 
758 	/* This *must* be after the above switch, since as a
759 	 * side-effect it sorts the src and dst addresses.
760 	 */
761 	if (hash_flows && !use_skbhash)
762 		flow_hash = flow_hash_from_keys(&keys);
763 
764 skip_hash:
765 	if (flow_override)
766 		flow_hash = flow_override - 1;
767 	else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
768 		flow_hash = skb->hash;
769 	if (host_override) {
770 		dsthost_hash = host_override - 1;
771 		srchost_hash = host_override - 1;
772 	}
773 
774 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
775 		if (flow_mode & CAKE_FLOW_SRC_IP)
776 			flow_hash ^= srchost_hash;
777 
778 		if (flow_mode & CAKE_FLOW_DST_IP)
779 			flow_hash ^= dsthost_hash;
780 	}
781 
782 	reduced_hash = flow_hash % CAKE_QUEUES;
783 
784 	/* set-associative hashing */
785 	/* fast path if no hash collision (direct lookup succeeds) */
786 	if (likely(q->tags[reduced_hash] == flow_hash &&
787 		   q->flows[reduced_hash].set)) {
788 		q->way_directs++;
789 	} else {
790 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
791 		u32 outer_hash = reduced_hash - inner_hash;
792 		bool allocate_src = false;
793 		bool allocate_dst = false;
794 		u32 i, k;
795 
796 		/* check if any active queue in the set is reserved for
797 		 * this flow.
798 		 */
799 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
800 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
801 			if (q->tags[outer_hash + k] == flow_hash) {
802 				if (i)
803 					q->way_hits++;
804 
805 				if (!q->flows[outer_hash + k].set) {
806 					/* need to increment host refcnts */
807 					allocate_src = cake_dsrc(flow_mode);
808 					allocate_dst = cake_ddst(flow_mode);
809 				}
810 
811 				goto found;
812 			}
813 		}
814 
815 		/* no queue is reserved for this flow, look for an
816 		 * empty one.
817 		 */
818 		for (i = 0; i < CAKE_SET_WAYS;
819 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
820 			if (!q->flows[outer_hash + k].set) {
821 				q->way_misses++;
822 				allocate_src = cake_dsrc(flow_mode);
823 				allocate_dst = cake_ddst(flow_mode);
824 				goto found;
825 			}
826 		}
827 
828 		/* With no empty queues, default to the original
829 		 * queue, accept the collision, update the host tags.
830 		 */
831 		q->way_collisions++;
832 		allocate_src = cake_dsrc(flow_mode);
833 		allocate_dst = cake_ddst(flow_mode);
834 
835 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
836 			cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
837 			cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode);
838 		}
839 found:
840 		/* reserve queue for future packets in same flow */
841 		reduced_hash = outer_hash + k;
842 		q->tags[reduced_hash] = flow_hash;
843 
844 		if (allocate_src) {
845 			srchost_idx = srchost_hash % CAKE_QUEUES;
846 			inner_hash = srchost_idx % CAKE_SET_WAYS;
847 			outer_hash = srchost_idx - inner_hash;
848 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
849 				i++, k = (k + 1) % CAKE_SET_WAYS) {
850 				if (q->hosts[outer_hash + k].srchost_tag ==
851 				    srchost_hash)
852 					goto found_src;
853 			}
854 			for (i = 0; i < CAKE_SET_WAYS;
855 				i++, k = (k + 1) % CAKE_SET_WAYS) {
856 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
857 					break;
858 			}
859 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
860 found_src:
861 			srchost_idx = outer_hash + k;
862 			q->flows[reduced_hash].srchost = srchost_idx;
863 
864 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
865 				cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
866 		}
867 
868 		if (allocate_dst) {
869 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
870 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
871 			outer_hash = dsthost_idx - inner_hash;
872 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
873 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
874 				if (q->hosts[outer_hash + k].dsthost_tag ==
875 				    dsthost_hash)
876 					goto found_dst;
877 			}
878 			for (i = 0; i < CAKE_SET_WAYS;
879 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
880 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
881 					break;
882 			}
883 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
884 found_dst:
885 			dsthost_idx = outer_hash + k;
886 			q->flows[reduced_hash].dsthost = dsthost_idx;
887 
888 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
889 				cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode);
890 		}
891 	}
892 
893 	return reduced_hash;
894 }
895 
896 /* helper functions : might be changed when/if skb use a standard list_head */
897 /* remove one skb from head of slot queue */
898 
dequeue_head(struct cake_flow * flow)899 static struct sk_buff *dequeue_head(struct cake_flow *flow)
900 {
901 	struct sk_buff *skb = flow->head;
902 
903 	if (skb) {
904 		flow->head = skb->next;
905 		skb_mark_not_on_list(skb);
906 	}
907 
908 	return skb;
909 }
910 
911 /* add skb to flow queue (tail add) */
912 
flow_queue_add(struct cake_flow * flow,struct sk_buff * skb)913 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
914 {
915 	if (!flow->head)
916 		flow->head = skb;
917 	else
918 		flow->tail->next = skb;
919 	flow->tail = skb;
920 	skb->next = NULL;
921 }
922 
cake_get_iphdr(const struct sk_buff * skb,struct ipv6hdr * buf)923 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
924 				    struct ipv6hdr *buf)
925 {
926 	unsigned int offset = skb_network_offset(skb);
927 	struct iphdr *iph;
928 
929 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
930 
931 	if (!iph)
932 		return NULL;
933 
934 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
935 		return skb_header_pointer(skb, offset + iph->ihl * 4,
936 					  sizeof(struct ipv6hdr), buf);
937 
938 	else if (iph->version == 4)
939 		return iph;
940 
941 	else if (iph->version == 6)
942 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
943 					  buf);
944 
945 	return NULL;
946 }
947 
cake_get_tcphdr(const struct sk_buff * skb,void * buf,unsigned int bufsize)948 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
949 				      void *buf, unsigned int bufsize)
950 {
951 	unsigned int offset = skb_network_offset(skb);
952 	const struct ipv6hdr *ipv6h;
953 	const struct tcphdr *tcph;
954 	const struct iphdr *iph;
955 	struct ipv6hdr _ipv6h;
956 	struct tcphdr _tcph;
957 
958 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
959 
960 	if (!ipv6h)
961 		return NULL;
962 
963 	if (ipv6h->version == 4) {
964 		iph = (struct iphdr *)ipv6h;
965 		offset += iph->ihl * 4;
966 
967 		/* special-case 6in4 tunnelling, as that is a common way to get
968 		 * v6 connectivity in the home
969 		 */
970 		if (iph->protocol == IPPROTO_IPV6) {
971 			ipv6h = skb_header_pointer(skb, offset,
972 						   sizeof(_ipv6h), &_ipv6h);
973 
974 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
975 				return NULL;
976 
977 			offset += sizeof(struct ipv6hdr);
978 
979 		} else if (iph->protocol != IPPROTO_TCP) {
980 			return NULL;
981 		}
982 
983 	} else if (ipv6h->version == 6) {
984 		if (ipv6h->nexthdr != IPPROTO_TCP)
985 			return NULL;
986 
987 		offset += sizeof(struct ipv6hdr);
988 	} else {
989 		return NULL;
990 	}
991 
992 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
993 	if (!tcph || tcph->doff < 5)
994 		return NULL;
995 
996 	return skb_header_pointer(skb, offset,
997 				  min(__tcp_hdrlen(tcph), bufsize), buf);
998 }
999 
cake_get_tcpopt(const struct tcphdr * tcph,int code,int * oplen)1000 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
1001 				   int code, int *oplen)
1002 {
1003 	/* inspired by tcp_parse_options in tcp_input.c */
1004 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1005 	const u8 *ptr = (const u8 *)(tcph + 1);
1006 
1007 	while (length > 0) {
1008 		int opcode = *ptr++;
1009 		int opsize;
1010 
1011 		if (opcode == TCPOPT_EOL)
1012 			break;
1013 		if (opcode == TCPOPT_NOP) {
1014 			length--;
1015 			continue;
1016 		}
1017 		if (length < 2)
1018 			break;
1019 		opsize = *ptr++;
1020 		if (opsize < 2 || opsize > length)
1021 			break;
1022 
1023 		if (opcode == code) {
1024 			*oplen = opsize;
1025 			return ptr;
1026 		}
1027 
1028 		ptr += opsize - 2;
1029 		length -= opsize;
1030 	}
1031 
1032 	return NULL;
1033 }
1034 
1035 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
1036  * bytes than the other. In the case where both sequences ACKs bytes that the
1037  * other doesn't, A is considered greater. DSACKs in A also makes A be
1038  * considered greater.
1039  *
1040  * @return -1, 0 or 1 as normal compare functions
1041  */
cake_tcph_sack_compare(const struct tcphdr * tcph_a,const struct tcphdr * tcph_b)1042 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
1043 				  const struct tcphdr *tcph_b)
1044 {
1045 	const struct tcp_sack_block_wire *sack_a, *sack_b;
1046 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1047 	u32 bytes_a = 0, bytes_b = 0;
1048 	int oplen_a, oplen_b;
1049 	bool first = true;
1050 
1051 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1052 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1053 
1054 	/* pointers point to option contents */
1055 	oplen_a -= TCPOLEN_SACK_BASE;
1056 	oplen_b -= TCPOLEN_SACK_BASE;
1057 
1058 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1059 	    (!sack_b || oplen_b < sizeof(*sack_b)))
1060 		return -1;
1061 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1062 		 (!sack_a || oplen_a < sizeof(*sack_a)))
1063 		return 1;
1064 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1065 		 (!sack_b || oplen_b < sizeof(*sack_b)))
1066 		return 0;
1067 
1068 	while (oplen_a >= sizeof(*sack_a)) {
1069 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1070 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1071 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1072 		int oplen_tmp = oplen_b;
1073 		bool found = false;
1074 
1075 		/* DSACK; always considered greater to prevent dropping */
1076 		if (before(start_a, ack_seq_a))
1077 			return -1;
1078 
1079 		bytes_a += end_a - start_a;
1080 
1081 		while (oplen_tmp >= sizeof(*sack_tmp)) {
1082 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1083 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1084 
1085 			/* first time through we count the total size */
1086 			if (first)
1087 				bytes_b += end_b - start_b;
1088 
1089 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1090 				found = true;
1091 				if (!first)
1092 					break;
1093 			}
1094 			oplen_tmp -= sizeof(*sack_tmp);
1095 			sack_tmp++;
1096 		}
1097 
1098 		if (!found)
1099 			return -1;
1100 
1101 		oplen_a -= sizeof(*sack_a);
1102 		sack_a++;
1103 		first = false;
1104 	}
1105 
1106 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1107 	 * either the SACKs are equal, or B SACKs more bytes.
1108 	 */
1109 	return bytes_b > bytes_a ? 1 : 0;
1110 }
1111 
cake_tcph_get_tstamp(const struct tcphdr * tcph,u32 * tsval,u32 * tsecr)1112 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1113 				 u32 *tsval, u32 *tsecr)
1114 {
1115 	const u8 *ptr;
1116 	int opsize;
1117 
1118 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1119 
1120 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1121 		*tsval = get_unaligned_be32(ptr);
1122 		*tsecr = get_unaligned_be32(ptr + 4);
1123 	}
1124 }
1125 
cake_tcph_may_drop(const struct tcphdr * tcph,u32 tstamp_new,u32 tsecr_new)1126 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1127 			       u32 tstamp_new, u32 tsecr_new)
1128 {
1129 	/* inspired by tcp_parse_options in tcp_input.c */
1130 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1131 	const u8 *ptr = (const u8 *)(tcph + 1);
1132 	u32 tstamp, tsecr;
1133 
1134 	/* 3 reserved flags must be unset to avoid future breakage
1135 	 * ACK must be set
1136 	 * ECE/CWR are handled separately
1137 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1138 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1139 	 * 0x00C00000 = CWR/ECE (handled separately)
1140 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1141 	 */
1142 	if (((tcp_flag_word(tcph) &
1143 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1144 		return false;
1145 
1146 	while (length > 0) {
1147 		int opcode = *ptr++;
1148 		int opsize;
1149 
1150 		if (opcode == TCPOPT_EOL)
1151 			break;
1152 		if (opcode == TCPOPT_NOP) {
1153 			length--;
1154 			continue;
1155 		}
1156 		if (length < 2)
1157 			break;
1158 		opsize = *ptr++;
1159 		if (opsize < 2 || opsize > length)
1160 			break;
1161 
1162 		switch (opcode) {
1163 		case TCPOPT_MD5SIG: /* doesn't influence state */
1164 			break;
1165 
1166 		case TCPOPT_SACK: /* stricter checking performed later */
1167 			if (opsize % 8 != 2)
1168 				return false;
1169 			break;
1170 
1171 		case TCPOPT_TIMESTAMP:
1172 			/* only drop timestamps lower than new */
1173 			if (opsize != TCPOLEN_TIMESTAMP)
1174 				return false;
1175 			tstamp = get_unaligned_be32(ptr);
1176 			tsecr = get_unaligned_be32(ptr + 4);
1177 			if (after(tstamp, tstamp_new) ||
1178 			    after(tsecr, tsecr_new))
1179 				return false;
1180 			break;
1181 
1182 		case TCPOPT_MSS:  /* these should only be set on SYN */
1183 		case TCPOPT_WINDOW:
1184 		case TCPOPT_SACK_PERM:
1185 		case TCPOPT_FASTOPEN:
1186 		case TCPOPT_EXP:
1187 		default: /* don't drop if any unknown options are present */
1188 			return false;
1189 		}
1190 
1191 		ptr += opsize - 2;
1192 		length -= opsize;
1193 	}
1194 
1195 	return true;
1196 }
1197 
cake_ack_filter(struct cake_sched_data * q,struct cake_flow * flow)1198 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1199 				       struct cake_flow *flow)
1200 {
1201 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1202 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1203 	struct sk_buff *skb_check, *skb_prev = NULL;
1204 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1205 	unsigned char _tcph[64], _tcph_check[64];
1206 	const struct tcphdr *tcph, *tcph_check;
1207 	const struct iphdr *iph, *iph_check;
1208 	struct ipv6hdr _iph, _iph_check;
1209 	const struct sk_buff *skb;
1210 	int seglen, num_found = 0;
1211 	u32 tstamp = 0, tsecr = 0;
1212 	__be32 elig_flags = 0;
1213 	int sack_comp;
1214 
1215 	/* no other possible ACKs to filter */
1216 	if (flow->head == flow->tail)
1217 		return NULL;
1218 
1219 	skb = flow->tail;
1220 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1221 	iph = cake_get_iphdr(skb, &_iph);
1222 	if (!tcph)
1223 		return NULL;
1224 
1225 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1226 
1227 	/* the 'triggering' packet need only have the ACK flag set.
1228 	 * also check that SYN is not set, as there won't be any previous ACKs.
1229 	 */
1230 	if ((tcp_flag_word(tcph) &
1231 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1232 		return NULL;
1233 
1234 	/* the 'triggering' ACK is at the tail of the queue, we have already
1235 	 * returned if it is the only packet in the flow. loop through the rest
1236 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1237 	 * triggering one.
1238 	 */
1239 	for (skb_check = flow->head;
1240 	     skb_check && skb_check != skb;
1241 	     skb_prev = skb_check, skb_check = skb_check->next) {
1242 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1243 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1244 					     sizeof(_tcph_check));
1245 
1246 		/* only TCP packets with matching 5-tuple are eligible, and only
1247 		 * drop safe headers
1248 		 */
1249 		if (!tcph_check || iph->version != iph_check->version ||
1250 		    tcph_check->source != tcph->source ||
1251 		    tcph_check->dest != tcph->dest)
1252 			continue;
1253 
1254 		if (iph_check->version == 4) {
1255 			if (iph_check->saddr != iph->saddr ||
1256 			    iph_check->daddr != iph->daddr)
1257 				continue;
1258 
1259 			seglen = iph_totlen(skb, iph_check) -
1260 				       (4 * iph_check->ihl);
1261 		} else if (iph_check->version == 6) {
1262 			ipv6h = (struct ipv6hdr *)iph;
1263 			ipv6h_check = (struct ipv6hdr *)iph_check;
1264 
1265 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1266 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1267 				continue;
1268 
1269 			seglen = ntohs(ipv6h_check->payload_len);
1270 		} else {
1271 			WARN_ON(1);  /* shouldn't happen */
1272 			continue;
1273 		}
1274 
1275 		/* If the ECE/CWR flags changed from the previous eligible
1276 		 * packet in the same flow, we should no longer be dropping that
1277 		 * previous packet as this would lose information.
1278 		 */
1279 		if (elig_ack && (tcp_flag_word(tcph_check) &
1280 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1281 			elig_ack = NULL;
1282 			elig_ack_prev = NULL;
1283 			num_found--;
1284 		}
1285 
1286 		/* Check TCP options and flags, don't drop ACKs with segment
1287 		 * data, and don't drop ACKs with a higher cumulative ACK
1288 		 * counter than the triggering packet. Check ACK seqno here to
1289 		 * avoid parsing SACK options of packets we are going to exclude
1290 		 * anyway.
1291 		 */
1292 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1293 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1294 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1295 			continue;
1296 
1297 		/* Check SACK options. The triggering packet must SACK more data
1298 		 * than the ACK under consideration, or SACK the same range but
1299 		 * have a larger cumulative ACK counter. The latter is a
1300 		 * pathological case, but is contained in the following check
1301 		 * anyway, just to be safe.
1302 		 */
1303 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1304 
1305 		if (sack_comp < 0 ||
1306 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1307 		     sack_comp == 0))
1308 			continue;
1309 
1310 		/* At this point we have found an eligible pure ACK to drop; if
1311 		 * we are in aggressive mode, we are done. Otherwise, keep
1312 		 * searching unless this is the second eligible ACK we
1313 		 * found.
1314 		 *
1315 		 * Since we want to drop ACK closest to the head of the queue,
1316 		 * save the first eligible ACK we find, even if we need to loop
1317 		 * again.
1318 		 */
1319 		if (!elig_ack) {
1320 			elig_ack = skb_check;
1321 			elig_ack_prev = skb_prev;
1322 			elig_flags = (tcp_flag_word(tcph_check)
1323 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1324 		}
1325 
1326 		if (num_found++ > 0)
1327 			goto found;
1328 	}
1329 
1330 	/* We made it through the queue without finding two eligible ACKs . If
1331 	 * we found a single eligible ACK we can drop it in aggressive mode if
1332 	 * we can guarantee that this does not interfere with ECN flag
1333 	 * information. We ensure this by dropping it only if the enqueued
1334 	 * packet is consecutive with the eligible ACK, and their flags match.
1335 	 */
1336 	if (elig_ack && aggressive && elig_ack->next == skb &&
1337 	    (elig_flags == (tcp_flag_word(tcph) &
1338 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1339 		goto found;
1340 
1341 	return NULL;
1342 
1343 found:
1344 	if (elig_ack_prev)
1345 		elig_ack_prev->next = elig_ack->next;
1346 	else
1347 		flow->head = elig_ack->next;
1348 
1349 	skb_mark_not_on_list(elig_ack);
1350 
1351 	return elig_ack;
1352 }
1353 
cake_ewma(u64 avg,u64 sample,u32 shift)1354 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1355 {
1356 	avg -= avg >> shift;
1357 	avg += sample >> shift;
1358 	return avg;
1359 }
1360 
cake_calc_overhead(struct cake_sched_data * q,u32 len,u32 off)1361 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1362 {
1363 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1364 		len -= off;
1365 
1366 	if (q->max_netlen < len)
1367 		q->max_netlen = len;
1368 	if (q->min_netlen > len)
1369 		q->min_netlen = len;
1370 
1371 	len += q->rate_overhead;
1372 
1373 	if (len < q->rate_mpu)
1374 		len = q->rate_mpu;
1375 
1376 	if (q->atm_mode == CAKE_ATM_ATM) {
1377 		len += 47;
1378 		len /= 48;
1379 		len *= 53;
1380 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1381 		/* Add one byte per 64 bytes or part thereof.
1382 		 * This is conservative and easier to calculate than the
1383 		 * precise value.
1384 		 */
1385 		len += (len + 63) / 64;
1386 	}
1387 
1388 	if (q->max_adjlen < len)
1389 		q->max_adjlen = len;
1390 	if (q->min_adjlen > len)
1391 		q->min_adjlen = len;
1392 
1393 	return len;
1394 }
1395 
cake_overhead(struct cake_sched_data * q,const struct sk_buff * skb)1396 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1397 {
1398 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1399 	unsigned int hdr_len, last_len = 0;
1400 	u32 off = skb_network_offset(skb);
1401 	u32 len = qdisc_pkt_len(skb);
1402 	u16 segs = 1;
1403 
1404 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1405 
1406 	if (!shinfo->gso_size)
1407 		return cake_calc_overhead(q, len, off);
1408 
1409 	/* borrowed from qdisc_pkt_len_init() */
1410 	if (!skb->encapsulation)
1411 		hdr_len = skb_transport_offset(skb);
1412 	else
1413 		hdr_len = skb_inner_transport_offset(skb);
1414 
1415 	/* + transport layer */
1416 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1417 						SKB_GSO_TCPV6))) {
1418 		const struct tcphdr *th;
1419 		struct tcphdr _tcphdr;
1420 
1421 		th = skb_header_pointer(skb, hdr_len,
1422 					sizeof(_tcphdr), &_tcphdr);
1423 		if (likely(th))
1424 			hdr_len += __tcp_hdrlen(th);
1425 	} else {
1426 		struct udphdr _udphdr;
1427 
1428 		if (skb_header_pointer(skb, hdr_len,
1429 				       sizeof(_udphdr), &_udphdr))
1430 			hdr_len += sizeof(struct udphdr);
1431 	}
1432 
1433 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1434 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1435 				    shinfo->gso_size);
1436 	else
1437 		segs = shinfo->gso_segs;
1438 
1439 	len = shinfo->gso_size + hdr_len;
1440 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1441 
1442 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1443 		cake_calc_overhead(q, last_len, off));
1444 }
1445 
cake_heap_swap(struct cake_sched_data * q,u16 i,u16 j)1446 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1447 {
1448 	struct cake_heap_entry ii = q->overflow_heap[i];
1449 	struct cake_heap_entry jj = q->overflow_heap[j];
1450 
1451 	q->overflow_heap[i] = jj;
1452 	q->overflow_heap[j] = ii;
1453 
1454 	q->tins[ii.t].overflow_idx[ii.b] = j;
1455 	q->tins[jj.t].overflow_idx[jj.b] = i;
1456 }
1457 
cake_heap_get_backlog(const struct cake_sched_data * q,u16 i)1458 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1459 {
1460 	struct cake_heap_entry ii = q->overflow_heap[i];
1461 
1462 	return q->tins[ii.t].backlogs[ii.b];
1463 }
1464 
cake_heapify(struct cake_sched_data * q,u16 i)1465 static void cake_heapify(struct cake_sched_data *q, u16 i)
1466 {
1467 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1468 	u32 mb = cake_heap_get_backlog(q, i);
1469 	u32 m = i;
1470 
1471 	while (m < a) {
1472 		u32 l = m + m + 1;
1473 		u32 r = l + 1;
1474 
1475 		if (l < a) {
1476 			u32 lb = cake_heap_get_backlog(q, l);
1477 
1478 			if (lb > mb) {
1479 				m  = l;
1480 				mb = lb;
1481 			}
1482 		}
1483 
1484 		if (r < a) {
1485 			u32 rb = cake_heap_get_backlog(q, r);
1486 
1487 			if (rb > mb) {
1488 				m  = r;
1489 				mb = rb;
1490 			}
1491 		}
1492 
1493 		if (m != i) {
1494 			cake_heap_swap(q, i, m);
1495 			i = m;
1496 		} else {
1497 			break;
1498 		}
1499 	}
1500 }
1501 
cake_heapify_up(struct cake_sched_data * q,u16 i)1502 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1503 {
1504 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1505 		u16 p = (i - 1) >> 1;
1506 		u32 ib = cake_heap_get_backlog(q, i);
1507 		u32 pb = cake_heap_get_backlog(q, p);
1508 
1509 		if (ib > pb) {
1510 			cake_heap_swap(q, i, p);
1511 			i = p;
1512 		} else {
1513 			break;
1514 		}
1515 	}
1516 }
1517 
cake_advance_shaper(struct cake_sched_data * q,struct cake_tin_data * b,struct sk_buff * skb,ktime_t now,bool drop)1518 static int cake_advance_shaper(struct cake_sched_data *q,
1519 			       struct cake_tin_data *b,
1520 			       struct sk_buff *skb,
1521 			       ktime_t now, bool drop)
1522 {
1523 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1524 
1525 	/* charge packet bandwidth to this tin
1526 	 * and to the global shaper.
1527 	 */
1528 	if (q->rate_ns) {
1529 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1530 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1531 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1532 
1533 		if (ktime_before(b->time_next_packet, now))
1534 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1535 							   tin_dur);
1536 
1537 		else if (ktime_before(b->time_next_packet,
1538 				      ktime_add_ns(now, tin_dur)))
1539 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1540 
1541 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1542 						   global_dur);
1543 		if (!drop)
1544 			q->failsafe_next_packet = \
1545 				ktime_add_ns(q->failsafe_next_packet,
1546 					     failsafe_dur);
1547 	}
1548 	return len;
1549 }
1550 
cake_drop(struct Qdisc * sch,struct sk_buff ** to_free)1551 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1552 {
1553 	struct cake_sched_data *q = qdisc_priv(sch);
1554 	ktime_t now = ktime_get();
1555 	u32 idx = 0, tin = 0, len;
1556 	struct cake_heap_entry qq;
1557 	struct cake_tin_data *b;
1558 	struct cake_flow *flow;
1559 	struct sk_buff *skb;
1560 
1561 	if (!q->overflow_timeout) {
1562 		int i;
1563 		/* Build fresh max-heap */
1564 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2 - 1; i >= 0; i--)
1565 			cake_heapify(q, i);
1566 	}
1567 	q->overflow_timeout = 65535;
1568 
1569 	/* select longest queue for pruning */
1570 	qq  = q->overflow_heap[0];
1571 	tin = qq.t;
1572 	idx = qq.b;
1573 
1574 	b = &q->tins[tin];
1575 	flow = &b->flows[idx];
1576 	skb = dequeue_head(flow);
1577 	if (unlikely(!skb)) {
1578 		/* heap has gone wrong, rebuild it next time */
1579 		q->overflow_timeout = 0;
1580 		return idx + (tin << 16);
1581 	}
1582 
1583 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1584 		b->unresponsive_flow_count++;
1585 
1586 	len = qdisc_pkt_len(skb);
1587 	q->buffer_used      -= skb->truesize;
1588 	b->backlogs[idx]    -= len;
1589 	b->tin_backlog      -= len;
1590 	sch->qstats.backlog -= len;
1591 
1592 	flow->dropped++;
1593 	b->tin_dropped++;
1594 
1595 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1596 		cake_advance_shaper(q, b, skb, now, true);
1597 
1598 	qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
1599 	sch->q.qlen--;
1600 	qdisc_tree_reduce_backlog(sch, 1, len);
1601 
1602 	cake_heapify(q, 0);
1603 
1604 	return idx + (tin << 16);
1605 }
1606 
cake_handle_diffserv(struct sk_buff * skb,bool wash)1607 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1608 {
1609 	const int offset = skb_network_offset(skb);
1610 	u16 *buf, buf_;
1611 	u8 dscp;
1612 
1613 	switch (skb_protocol(skb, true)) {
1614 	case htons(ETH_P_IP):
1615 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1616 		if (unlikely(!buf))
1617 			return 0;
1618 
1619 		/* ToS is in the second byte of iphdr */
1620 		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1621 
1622 		if (wash && dscp) {
1623 			const int wlen = offset + sizeof(struct iphdr);
1624 
1625 			if (!pskb_may_pull(skb, wlen) ||
1626 			    skb_try_make_writable(skb, wlen))
1627 				return 0;
1628 
1629 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1630 		}
1631 
1632 		return dscp;
1633 
1634 	case htons(ETH_P_IPV6):
1635 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1636 		if (unlikely(!buf))
1637 			return 0;
1638 
1639 		/* Traffic class is in the first and second bytes of ipv6hdr */
1640 		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1641 
1642 		if (wash && dscp) {
1643 			const int wlen = offset + sizeof(struct ipv6hdr);
1644 
1645 			if (!pskb_may_pull(skb, wlen) ||
1646 			    skb_try_make_writable(skb, wlen))
1647 				return 0;
1648 
1649 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1650 		}
1651 
1652 		return dscp;
1653 
1654 	case htons(ETH_P_ARP):
1655 		return 0x38;  /* CS7 - Net Control */
1656 
1657 	default:
1658 		/* If there is no Diffserv field, treat as best-effort */
1659 		return 0;
1660 	}
1661 }
1662 
cake_select_tin(struct Qdisc * sch,struct sk_buff * skb)1663 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1664 					     struct sk_buff *skb)
1665 {
1666 	struct cake_sched_data *q = qdisc_priv(sch);
1667 	u32 tin, mark;
1668 	bool wash;
1669 	u8 dscp;
1670 
1671 	/* Tin selection: Default to diffserv-based selection, allow overriding
1672 	 * using firewall marks or skb->priority. Call DSCP parsing early if
1673 	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1674 	 */
1675 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1676 	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1677 	if (wash)
1678 		dscp = cake_handle_diffserv(skb, wash);
1679 
1680 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1681 		tin = 0;
1682 
1683 	else if (mark && mark <= q->tin_cnt)
1684 		tin = q->tin_order[mark - 1];
1685 
1686 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1687 		 TC_H_MIN(skb->priority) > 0 &&
1688 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1689 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1690 
1691 	else {
1692 		if (!wash)
1693 			dscp = cake_handle_diffserv(skb, wash);
1694 		tin = q->tin_index[dscp];
1695 
1696 		if (unlikely(tin >= q->tin_cnt))
1697 			tin = 0;
1698 	}
1699 
1700 	return &q->tins[tin];
1701 }
1702 
cake_classify(struct Qdisc * sch,struct cake_tin_data ** t,struct sk_buff * skb,int flow_mode,int * qerr)1703 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1704 			 struct sk_buff *skb, int flow_mode, int *qerr)
1705 {
1706 	struct cake_sched_data *q = qdisc_priv(sch);
1707 	struct tcf_proto *filter;
1708 	struct tcf_result res;
1709 	u16 flow = 0, host = 0;
1710 	int result;
1711 
1712 	filter = rcu_dereference_bh(q->filter_list);
1713 	if (!filter)
1714 		goto hash;
1715 
1716 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1717 	result = tcf_classify(skb, NULL, filter, &res, false);
1718 
1719 	if (result >= 0) {
1720 #ifdef CONFIG_NET_CLS_ACT
1721 		switch (result) {
1722 		case TC_ACT_STOLEN:
1723 		case TC_ACT_QUEUED:
1724 		case TC_ACT_TRAP:
1725 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1726 			fallthrough;
1727 		case TC_ACT_SHOT:
1728 			return 0;
1729 		}
1730 #endif
1731 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1732 			flow = TC_H_MIN(res.classid);
1733 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1734 			host = TC_H_MAJ(res.classid) >> 16;
1735 	}
1736 hash:
1737 	*t = cake_select_tin(sch, skb);
1738 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1739 }
1740 
1741 static void cake_reconfigure(struct Qdisc *sch);
1742 
cake_enqueue(struct sk_buff * skb,struct Qdisc * sch,struct sk_buff ** to_free)1743 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1744 			struct sk_buff **to_free)
1745 {
1746 	struct cake_sched_data *q = qdisc_priv(sch);
1747 	int len = qdisc_pkt_len(skb);
1748 	int ret;
1749 	struct sk_buff *ack = NULL;
1750 	ktime_t now = ktime_get();
1751 	struct cake_tin_data *b;
1752 	struct cake_flow *flow;
1753 	u32 idx, tin;
1754 
1755 	/* choose flow to insert into */
1756 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1757 	if (idx == 0) {
1758 		if (ret & __NET_XMIT_BYPASS)
1759 			qdisc_qstats_drop(sch);
1760 		__qdisc_drop(skb, to_free);
1761 		return ret;
1762 	}
1763 	tin = (u32)(b - q->tins);
1764 	idx--;
1765 	flow = &b->flows[idx];
1766 
1767 	/* ensure shaper state isn't stale */
1768 	if (!b->tin_backlog) {
1769 		if (ktime_before(b->time_next_packet, now))
1770 			b->time_next_packet = now;
1771 
1772 		if (!sch->q.qlen) {
1773 			if (ktime_before(q->time_next_packet, now)) {
1774 				q->failsafe_next_packet = now;
1775 				q->time_next_packet = now;
1776 			} else if (ktime_after(q->time_next_packet, now) &&
1777 				   ktime_after(q->failsafe_next_packet, now)) {
1778 				u64 next = \
1779 					min(ktime_to_ns(q->time_next_packet),
1780 					    ktime_to_ns(
1781 						   q->failsafe_next_packet));
1782 				sch->qstats.overlimits++;
1783 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1784 			}
1785 		}
1786 	}
1787 
1788 	if (unlikely(len > b->max_skblen))
1789 		b->max_skblen = len;
1790 
1791 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1792 		struct sk_buff *segs, *nskb;
1793 		netdev_features_t features = netif_skb_features(skb);
1794 		unsigned int slen = 0, numsegs = 0;
1795 
1796 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1797 		if (IS_ERR_OR_NULL(segs))
1798 			return qdisc_drop(skb, sch, to_free);
1799 
1800 		skb_list_walk_safe(segs, segs, nskb) {
1801 			skb_mark_not_on_list(segs);
1802 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1803 			cobalt_set_enqueue_time(segs, now);
1804 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1805 									  segs);
1806 			flow_queue_add(flow, segs);
1807 
1808 			sch->q.qlen++;
1809 			numsegs++;
1810 			slen += segs->len;
1811 			q->buffer_used += segs->truesize;
1812 			b->packets++;
1813 		}
1814 
1815 		/* stats */
1816 		b->bytes	    += slen;
1817 		b->backlogs[idx]    += slen;
1818 		b->tin_backlog      += slen;
1819 		sch->qstats.backlog += slen;
1820 		q->avg_window_bytes += slen;
1821 
1822 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1823 		consume_skb(skb);
1824 	} else {
1825 		/* not splitting */
1826 		cobalt_set_enqueue_time(skb, now);
1827 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1828 		flow_queue_add(flow, skb);
1829 
1830 		if (q->ack_filter)
1831 			ack = cake_ack_filter(q, flow);
1832 
1833 		if (ack) {
1834 			b->ack_drops++;
1835 			sch->qstats.drops++;
1836 			b->bytes += qdisc_pkt_len(ack);
1837 			len -= qdisc_pkt_len(ack);
1838 			q->buffer_used += skb->truesize - ack->truesize;
1839 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1840 				cake_advance_shaper(q, b, ack, now, true);
1841 
1842 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1843 			consume_skb(ack);
1844 		} else {
1845 			sch->q.qlen++;
1846 			q->buffer_used      += skb->truesize;
1847 		}
1848 
1849 		/* stats */
1850 		b->packets++;
1851 		b->bytes	    += len;
1852 		b->backlogs[idx]    += len;
1853 		b->tin_backlog      += len;
1854 		sch->qstats.backlog += len;
1855 		q->avg_window_bytes += len;
1856 	}
1857 
1858 	if (q->overflow_timeout)
1859 		cake_heapify_up(q, b->overflow_idx[idx]);
1860 
1861 	/* incoming bandwidth capacity estimate */
1862 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1863 		u64 packet_interval = \
1864 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1865 
1866 		if (packet_interval > NSEC_PER_SEC)
1867 			packet_interval = NSEC_PER_SEC;
1868 
1869 		/* filter out short-term bursts, eg. wifi aggregation */
1870 		q->avg_packet_interval = \
1871 			cake_ewma(q->avg_packet_interval,
1872 				  packet_interval,
1873 				  (packet_interval > q->avg_packet_interval ?
1874 					  2 : 8));
1875 
1876 		q->last_packet_time = now;
1877 
1878 		if (packet_interval > q->avg_packet_interval) {
1879 			u64 window_interval = \
1880 				ktime_to_ns(ktime_sub(now,
1881 						      q->avg_window_begin));
1882 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1883 
1884 			b = div64_u64(b, window_interval);
1885 			q->avg_peak_bandwidth =
1886 				cake_ewma(q->avg_peak_bandwidth, b,
1887 					  b > q->avg_peak_bandwidth ? 2 : 8);
1888 			q->avg_window_bytes = 0;
1889 			q->avg_window_begin = now;
1890 
1891 			if (ktime_after(now,
1892 					ktime_add_ms(q->last_reconfig_time,
1893 						     250))) {
1894 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1895 				cake_reconfigure(sch);
1896 			}
1897 		}
1898 	} else {
1899 		q->avg_window_bytes = 0;
1900 		q->last_packet_time = now;
1901 	}
1902 
1903 	/* flowchain */
1904 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1905 		if (!flow->set) {
1906 			list_add_tail(&flow->flowchain, &b->new_flows);
1907 		} else {
1908 			b->decaying_flow_count--;
1909 			list_move_tail(&flow->flowchain, &b->new_flows);
1910 		}
1911 		flow->set = CAKE_SET_SPARSE;
1912 		b->sparse_flow_count++;
1913 
1914 		flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode);
1915 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1916 		/* this flow was empty, accounted as a sparse flow, but actually
1917 		 * in the bulk rotation.
1918 		 */
1919 		flow->set = CAKE_SET_BULK;
1920 		b->sparse_flow_count--;
1921 		b->bulk_flow_count++;
1922 
1923 		cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
1924 		cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
1925 	}
1926 
1927 	if (q->buffer_used > q->buffer_max_used)
1928 		q->buffer_max_used = q->buffer_used;
1929 
1930 	if (q->buffer_used > q->buffer_limit) {
1931 		bool same_flow = false;
1932 		u32 dropped = 0;
1933 		u32 drop_id;
1934 
1935 		while (q->buffer_used > q->buffer_limit) {
1936 			dropped++;
1937 			drop_id = cake_drop(sch, to_free);
1938 
1939 			if ((drop_id >> 16) == tin &&
1940 			    (drop_id & 0xFFFF) == idx)
1941 				same_flow = true;
1942 		}
1943 		b->drop_overlimit += dropped;
1944 
1945 		if (same_flow)
1946 			return NET_XMIT_CN;
1947 	}
1948 	return NET_XMIT_SUCCESS;
1949 }
1950 
cake_dequeue_one(struct Qdisc * sch)1951 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1952 {
1953 	struct cake_sched_data *q = qdisc_priv(sch);
1954 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1955 	struct cake_flow *flow = &b->flows[q->cur_flow];
1956 	struct sk_buff *skb = NULL;
1957 	u32 len;
1958 
1959 	if (flow->head) {
1960 		skb = dequeue_head(flow);
1961 		len = qdisc_pkt_len(skb);
1962 		b->backlogs[q->cur_flow] -= len;
1963 		b->tin_backlog		 -= len;
1964 		sch->qstats.backlog      -= len;
1965 		q->buffer_used		 -= skb->truesize;
1966 		sch->q.qlen--;
1967 
1968 		if (q->overflow_timeout)
1969 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1970 	}
1971 	return skb;
1972 }
1973 
1974 /* Discard leftover packets from a tin no longer in use. */
cake_clear_tin(struct Qdisc * sch,u16 tin)1975 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1976 {
1977 	struct cake_sched_data *q = qdisc_priv(sch);
1978 	struct sk_buff *skb;
1979 
1980 	q->cur_tin = tin;
1981 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1982 		while (!!(skb = cake_dequeue_one(sch)))
1983 			kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
1984 }
1985 
cake_dequeue(struct Qdisc * sch)1986 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1987 {
1988 	struct cake_sched_data *q = qdisc_priv(sch);
1989 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1990 	enum skb_drop_reason reason;
1991 	ktime_t now = ktime_get();
1992 	struct cake_flow *flow;
1993 	struct list_head *head;
1994 	bool first_flow = true;
1995 	struct sk_buff *skb;
1996 	u64 delay;
1997 	u32 len;
1998 
1999 begin:
2000 	if (!sch->q.qlen)
2001 		return NULL;
2002 
2003 	/* global hard shaper */
2004 	if (ktime_after(q->time_next_packet, now) &&
2005 	    ktime_after(q->failsafe_next_packet, now)) {
2006 		u64 next = min(ktime_to_ns(q->time_next_packet),
2007 			       ktime_to_ns(q->failsafe_next_packet));
2008 
2009 		sch->qstats.overlimits++;
2010 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2011 		return NULL;
2012 	}
2013 
2014 	/* Choose a class to work on. */
2015 	if (!q->rate_ns) {
2016 		/* In unlimited mode, can't rely on shaper timings, just balance
2017 		 * with DRR
2018 		 */
2019 		bool wrapped = false, empty = true;
2020 
2021 		while (b->tin_deficit < 0 ||
2022 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
2023 			if (b->tin_deficit <= 0)
2024 				b->tin_deficit += b->tin_quantum;
2025 			if (b->sparse_flow_count + b->bulk_flow_count)
2026 				empty = false;
2027 
2028 			q->cur_tin++;
2029 			b++;
2030 			if (q->cur_tin >= q->tin_cnt) {
2031 				q->cur_tin = 0;
2032 				b = q->tins;
2033 
2034 				if (wrapped) {
2035 					/* It's possible for q->qlen to be
2036 					 * nonzero when we actually have no
2037 					 * packets anywhere.
2038 					 */
2039 					if (empty)
2040 						return NULL;
2041 				} else {
2042 					wrapped = true;
2043 				}
2044 			}
2045 		}
2046 	} else {
2047 		/* In shaped mode, choose:
2048 		 * - Highest-priority tin with queue and meeting schedule, or
2049 		 * - The earliest-scheduled tin with queue.
2050 		 */
2051 		ktime_t best_time = KTIME_MAX;
2052 		int tin, best_tin = 0;
2053 
2054 		for (tin = 0; tin < q->tin_cnt; tin++) {
2055 			b = q->tins + tin;
2056 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2057 				ktime_t time_to_pkt = \
2058 					ktime_sub(b->time_next_packet, now);
2059 
2060 				if (ktime_to_ns(time_to_pkt) <= 0 ||
2061 				    ktime_compare(time_to_pkt,
2062 						  best_time) <= 0) {
2063 					best_time = time_to_pkt;
2064 					best_tin = tin;
2065 				}
2066 			}
2067 		}
2068 
2069 		q->cur_tin = best_tin;
2070 		b = q->tins + best_tin;
2071 
2072 		/* No point in going further if no packets to deliver. */
2073 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2074 			return NULL;
2075 	}
2076 
2077 retry:
2078 	/* service this class */
2079 	head = &b->decaying_flows;
2080 	if (!first_flow || list_empty(head)) {
2081 		head = &b->new_flows;
2082 		if (list_empty(head)) {
2083 			head = &b->old_flows;
2084 			if (unlikely(list_empty(head))) {
2085 				head = &b->decaying_flows;
2086 				if (unlikely(list_empty(head)))
2087 					goto begin;
2088 			}
2089 		}
2090 	}
2091 	flow = list_first_entry(head, struct cake_flow, flowchain);
2092 	q->cur_flow = flow - b->flows;
2093 	first_flow = false;
2094 
2095 	/* flow isolation (DRR++) */
2096 	if (flow->deficit <= 0) {
2097 		/* Keep all flows with deficits out of the sparse and decaying
2098 		 * rotations.  No non-empty flow can go into the decaying
2099 		 * rotation, so they can't get deficits
2100 		 */
2101 		if (flow->set == CAKE_SET_SPARSE) {
2102 			if (flow->head) {
2103 				b->sparse_flow_count--;
2104 				b->bulk_flow_count++;
2105 
2106 				cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
2107 				cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2108 
2109 				flow->set = CAKE_SET_BULK;
2110 			} else {
2111 				/* we've moved it to the bulk rotation for
2112 				 * correct deficit accounting but we still want
2113 				 * to count it as a sparse flow, not a bulk one.
2114 				 */
2115 				flow->set = CAKE_SET_SPARSE_WAIT;
2116 			}
2117 		}
2118 
2119 		flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode);
2120 		list_move_tail(&flow->flowchain, &b->old_flows);
2121 
2122 		goto retry;
2123 	}
2124 
2125 	/* Retrieve a packet via the AQM */
2126 	while (1) {
2127 		skb = cake_dequeue_one(sch);
2128 		if (!skb) {
2129 			/* this queue was actually empty */
2130 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2131 				b->unresponsive_flow_count--;
2132 
2133 			if (flow->cvars.p_drop || flow->cvars.count ||
2134 			    ktime_before(now, flow->cvars.drop_next)) {
2135 				/* keep in the flowchain until the state has
2136 				 * decayed to rest
2137 				 */
2138 				list_move_tail(&flow->flowchain,
2139 					       &b->decaying_flows);
2140 				if (flow->set == CAKE_SET_BULK) {
2141 					b->bulk_flow_count--;
2142 
2143 					cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2144 					cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2145 
2146 					b->decaying_flow_count++;
2147 				} else if (flow->set == CAKE_SET_SPARSE ||
2148 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2149 					b->sparse_flow_count--;
2150 					b->decaying_flow_count++;
2151 				}
2152 				flow->set = CAKE_SET_DECAYING;
2153 			} else {
2154 				/* remove empty queue from the flowchain */
2155 				list_del_init(&flow->flowchain);
2156 				if (flow->set == CAKE_SET_SPARSE ||
2157 				    flow->set == CAKE_SET_SPARSE_WAIT)
2158 					b->sparse_flow_count--;
2159 				else if (flow->set == CAKE_SET_BULK) {
2160 					b->bulk_flow_count--;
2161 
2162 					cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
2163 					cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
2164 				} else
2165 					b->decaying_flow_count--;
2166 
2167 				flow->set = CAKE_SET_NONE;
2168 			}
2169 			goto begin;
2170 		}
2171 
2172 		reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2173 					    (b->bulk_flow_count *
2174 					     !!(q->rate_flags &
2175 						CAKE_FLAG_INGRESS)));
2176 		/* Last packet in queue may be marked, shouldn't be dropped */
2177 		if (reason == SKB_NOT_DROPPED_YET || !flow->head)
2178 			break;
2179 
2180 		/* drop this packet, get another one */
2181 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2182 			len = cake_advance_shaper(q, b, skb,
2183 						  now, true);
2184 			flow->deficit -= len;
2185 			b->tin_deficit -= len;
2186 		}
2187 		flow->dropped++;
2188 		b->tin_dropped++;
2189 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2190 		qdisc_qstats_drop(sch);
2191 		kfree_skb_reason(skb, reason);
2192 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2193 			goto retry;
2194 	}
2195 
2196 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2197 	qdisc_bstats_update(sch, skb);
2198 
2199 	/* collect delay stats */
2200 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2201 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2202 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2203 				  delay > b->peak_delay ? 2 : 8);
2204 	b->base_delay = cake_ewma(b->base_delay, delay,
2205 				  delay < b->base_delay ? 2 : 8);
2206 
2207 	len = cake_advance_shaper(q, b, skb, now, false);
2208 	flow->deficit -= len;
2209 	b->tin_deficit -= len;
2210 
2211 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2212 		u64 next = min(ktime_to_ns(q->time_next_packet),
2213 			       ktime_to_ns(q->failsafe_next_packet));
2214 
2215 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2216 	} else if (!sch->q.qlen) {
2217 		int i;
2218 
2219 		for (i = 0; i < q->tin_cnt; i++) {
2220 			if (q->tins[i].decaying_flow_count) {
2221 				ktime_t next = \
2222 					ktime_add_ns(now,
2223 						     q->tins[i].cparams.target);
2224 
2225 				qdisc_watchdog_schedule_ns(&q->watchdog,
2226 							   ktime_to_ns(next));
2227 				break;
2228 			}
2229 		}
2230 	}
2231 
2232 	if (q->overflow_timeout)
2233 		q->overflow_timeout--;
2234 
2235 	return skb;
2236 }
2237 
cake_reset(struct Qdisc * sch)2238 static void cake_reset(struct Qdisc *sch)
2239 {
2240 	struct cake_sched_data *q = qdisc_priv(sch);
2241 	u32 c;
2242 
2243 	if (!q->tins)
2244 		return;
2245 
2246 	for (c = 0; c < CAKE_MAX_TINS; c++)
2247 		cake_clear_tin(sch, c);
2248 }
2249 
2250 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2251 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2252 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2253 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2254 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2255 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2256 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2257 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2258 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2259 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2260 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2261 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2262 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2263 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2264 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2265 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2266 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2267 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2268 };
2269 
cake_set_rate(struct cake_tin_data * b,u64 rate,u32 mtu,u64 target_ns,u64 rtt_est_ns)2270 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2271 			  u64 target_ns, u64 rtt_est_ns)
2272 {
2273 	/* convert byte-rate into time-per-byte
2274 	 * so it will always unwedge in reasonable time.
2275 	 */
2276 	static const u64 MIN_RATE = 64;
2277 	u32 byte_target = mtu;
2278 	u64 byte_target_ns;
2279 	u8  rate_shft = 0;
2280 	u64 rate_ns = 0;
2281 
2282 	b->flow_quantum = 1514;
2283 	if (rate) {
2284 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2285 		rate_shft = 34;
2286 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2287 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2288 		while (!!(rate_ns >> 34)) {
2289 			rate_ns >>= 1;
2290 			rate_shft--;
2291 		}
2292 	} /* else unlimited, ie. zero delay */
2293 
2294 	b->tin_rate_bps  = rate;
2295 	b->tin_rate_ns   = rate_ns;
2296 	b->tin_rate_shft = rate_shft;
2297 
2298 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2299 
2300 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2301 	b->cparams.interval = max(rtt_est_ns +
2302 				     b->cparams.target - target_ns,
2303 				     b->cparams.target * 2);
2304 	b->cparams.mtu_time = byte_target_ns;
2305 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2306 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2307 }
2308 
cake_config_besteffort(struct Qdisc * sch)2309 static int cake_config_besteffort(struct Qdisc *sch)
2310 {
2311 	struct cake_sched_data *q = qdisc_priv(sch);
2312 	struct cake_tin_data *b = &q->tins[0];
2313 	u32 mtu = psched_mtu(qdisc_dev(sch));
2314 	u64 rate = q->rate_bps;
2315 
2316 	q->tin_cnt = 1;
2317 
2318 	q->tin_index = besteffort;
2319 	q->tin_order = normal_order;
2320 
2321 	cake_set_rate(b, rate, mtu,
2322 		      us_to_ns(q->target), us_to_ns(q->interval));
2323 	b->tin_quantum = 65535;
2324 
2325 	return 0;
2326 }
2327 
cake_config_precedence(struct Qdisc * sch)2328 static int cake_config_precedence(struct Qdisc *sch)
2329 {
2330 	/* convert high-level (user visible) parameters into internal format */
2331 	struct cake_sched_data *q = qdisc_priv(sch);
2332 	u32 mtu = psched_mtu(qdisc_dev(sch));
2333 	u64 rate = q->rate_bps;
2334 	u32 quantum = 256;
2335 	u32 i;
2336 
2337 	q->tin_cnt = 8;
2338 	q->tin_index = precedence;
2339 	q->tin_order = normal_order;
2340 
2341 	for (i = 0; i < q->tin_cnt; i++) {
2342 		struct cake_tin_data *b = &q->tins[i];
2343 
2344 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2345 			      us_to_ns(q->interval));
2346 
2347 		b->tin_quantum = max_t(u16, 1U, quantum);
2348 
2349 		/* calculate next class's parameters */
2350 		rate  *= 7;
2351 		rate >>= 3;
2352 
2353 		quantum  *= 7;
2354 		quantum >>= 3;
2355 	}
2356 
2357 	return 0;
2358 }
2359 
2360 /*	List of known Diffserv codepoints:
2361  *
2362  *	Default Forwarding (DF/CS0) - Best Effort
2363  *	Max Throughput (TOS2)
2364  *	Min Delay (TOS4)
2365  *	LLT "La" (TOS5)
2366  *	Assured Forwarding 1 (AF1x) - x3
2367  *	Assured Forwarding 2 (AF2x) - x3
2368  *	Assured Forwarding 3 (AF3x) - x3
2369  *	Assured Forwarding 4 (AF4x) - x3
2370  *	Precedence Class 1 (CS1)
2371  *	Precedence Class 2 (CS2)
2372  *	Precedence Class 3 (CS3)
2373  *	Precedence Class 4 (CS4)
2374  *	Precedence Class 5 (CS5)
2375  *	Precedence Class 6 (CS6)
2376  *	Precedence Class 7 (CS7)
2377  *	Voice Admit (VA)
2378  *	Expedited Forwarding (EF)
2379  *	Lower Effort (LE)
2380  *
2381  *	Total 26 codepoints.
2382  */
2383 
2384 /*	List of traffic classes in RFC 4594, updated by RFC 8622:
2385  *		(roughly descending order of contended priority)
2386  *		(roughly ascending order of uncontended throughput)
2387  *
2388  *	Network Control (CS6,CS7)      - routing traffic
2389  *	Telephony (EF,VA)         - aka. VoIP streams
2390  *	Signalling (CS5)               - VoIP setup
2391  *	Multimedia Conferencing (AF4x) - aka. video calls
2392  *	Realtime Interactive (CS4)     - eg. games
2393  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2394  *	Broadcast Video (CS3)
2395  *	Low-Latency Data (AF2x,TOS4)      - eg. database
2396  *	Ops, Admin, Management (CS2)      - eg. ssh
2397  *	Standard Service (DF & unrecognised codepoints)
2398  *	High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2399  *	Low-Priority Data (LE,CS1)        - eg. BitTorrent
2400  *
2401  *	Total 12 traffic classes.
2402  */
2403 
cake_config_diffserv8(struct Qdisc * sch)2404 static int cake_config_diffserv8(struct Qdisc *sch)
2405 {
2406 /*	Pruned list of traffic classes for typical applications:
2407  *
2408  *		Network Control          (CS6, CS7)
2409  *		Minimum Latency          (EF, VA, CS5, CS4)
2410  *		Interactive Shell        (CS2)
2411  *		Low Latency Transactions (AF2x, TOS4)
2412  *		Video Streaming          (AF4x, AF3x, CS3)
2413  *		Bog Standard             (DF etc.)
2414  *		High Throughput          (AF1x, TOS2, CS1)
2415  *		Background Traffic       (LE)
2416  *
2417  *		Total 8 traffic classes.
2418  */
2419 
2420 	struct cake_sched_data *q = qdisc_priv(sch);
2421 	u32 mtu = psched_mtu(qdisc_dev(sch));
2422 	u64 rate = q->rate_bps;
2423 	u32 quantum = 256;
2424 	u32 i;
2425 
2426 	q->tin_cnt = 8;
2427 
2428 	/* codepoint to class mapping */
2429 	q->tin_index = diffserv8;
2430 	q->tin_order = normal_order;
2431 
2432 	/* class characteristics */
2433 	for (i = 0; i < q->tin_cnt; i++) {
2434 		struct cake_tin_data *b = &q->tins[i];
2435 
2436 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2437 			      us_to_ns(q->interval));
2438 
2439 		b->tin_quantum = max_t(u16, 1U, quantum);
2440 
2441 		/* calculate next class's parameters */
2442 		rate  *= 7;
2443 		rate >>= 3;
2444 
2445 		quantum  *= 7;
2446 		quantum >>= 3;
2447 	}
2448 
2449 	return 0;
2450 }
2451 
cake_config_diffserv4(struct Qdisc * sch)2452 static int cake_config_diffserv4(struct Qdisc *sch)
2453 {
2454 /*  Further pruned list of traffic classes for four-class system:
2455  *
2456  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2457  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2458  *	    Best Effort        (DF, AF1x, TOS2, and those not specified)
2459  *	    Background Traffic (LE, CS1)
2460  *
2461  *		Total 4 traffic classes.
2462  */
2463 
2464 	struct cake_sched_data *q = qdisc_priv(sch);
2465 	u32 mtu = psched_mtu(qdisc_dev(sch));
2466 	u64 rate = q->rate_bps;
2467 	u32 quantum = 1024;
2468 
2469 	q->tin_cnt = 4;
2470 
2471 	/* codepoint to class mapping */
2472 	q->tin_index = diffserv4;
2473 	q->tin_order = bulk_order;
2474 
2475 	/* class characteristics */
2476 	cake_set_rate(&q->tins[0], rate, mtu,
2477 		      us_to_ns(q->target), us_to_ns(q->interval));
2478 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2479 		      us_to_ns(q->target), us_to_ns(q->interval));
2480 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2481 		      us_to_ns(q->target), us_to_ns(q->interval));
2482 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2483 		      us_to_ns(q->target), us_to_ns(q->interval));
2484 
2485 	/* bandwidth-sharing weights */
2486 	q->tins[0].tin_quantum = quantum;
2487 	q->tins[1].tin_quantum = quantum >> 4;
2488 	q->tins[2].tin_quantum = quantum >> 1;
2489 	q->tins[3].tin_quantum = quantum >> 2;
2490 
2491 	return 0;
2492 }
2493 
cake_config_diffserv3(struct Qdisc * sch)2494 static int cake_config_diffserv3(struct Qdisc *sch)
2495 {
2496 /*  Simplified Diffserv structure with 3 tins.
2497  *		Latency Sensitive	(CS7, CS6, EF, VA, TOS4)
2498  *		Best Effort
2499  *		Low Priority		(LE, CS1)
2500  */
2501 	struct cake_sched_data *q = qdisc_priv(sch);
2502 	u32 mtu = psched_mtu(qdisc_dev(sch));
2503 	u64 rate = q->rate_bps;
2504 	u32 quantum = 1024;
2505 
2506 	q->tin_cnt = 3;
2507 
2508 	/* codepoint to class mapping */
2509 	q->tin_index = diffserv3;
2510 	q->tin_order = bulk_order;
2511 
2512 	/* class characteristics */
2513 	cake_set_rate(&q->tins[0], rate, mtu,
2514 		      us_to_ns(q->target), us_to_ns(q->interval));
2515 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2516 		      us_to_ns(q->target), us_to_ns(q->interval));
2517 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2518 		      us_to_ns(q->target), us_to_ns(q->interval));
2519 
2520 	/* bandwidth-sharing weights */
2521 	q->tins[0].tin_quantum = quantum;
2522 	q->tins[1].tin_quantum = quantum >> 4;
2523 	q->tins[2].tin_quantum = quantum >> 2;
2524 
2525 	return 0;
2526 }
2527 
cake_reconfigure(struct Qdisc * sch)2528 static void cake_reconfigure(struct Qdisc *sch)
2529 {
2530 	struct cake_sched_data *q = qdisc_priv(sch);
2531 	int c, ft;
2532 
2533 	switch (q->tin_mode) {
2534 	case CAKE_DIFFSERV_BESTEFFORT:
2535 		ft = cake_config_besteffort(sch);
2536 		break;
2537 
2538 	case CAKE_DIFFSERV_PRECEDENCE:
2539 		ft = cake_config_precedence(sch);
2540 		break;
2541 
2542 	case CAKE_DIFFSERV_DIFFSERV8:
2543 		ft = cake_config_diffserv8(sch);
2544 		break;
2545 
2546 	case CAKE_DIFFSERV_DIFFSERV4:
2547 		ft = cake_config_diffserv4(sch);
2548 		break;
2549 
2550 	case CAKE_DIFFSERV_DIFFSERV3:
2551 	default:
2552 		ft = cake_config_diffserv3(sch);
2553 		break;
2554 	}
2555 
2556 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2557 		cake_clear_tin(sch, c);
2558 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2559 	}
2560 
2561 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2562 	q->rate_shft = q->tins[ft].tin_rate_shft;
2563 
2564 	if (q->buffer_config_limit) {
2565 		q->buffer_limit = q->buffer_config_limit;
2566 	} else if (q->rate_bps) {
2567 		u64 t = q->rate_bps * q->interval;
2568 
2569 		do_div(t, USEC_PER_SEC / 4);
2570 		q->buffer_limit = max_t(u32, t, 4U << 20);
2571 	} else {
2572 		q->buffer_limit = ~0;
2573 	}
2574 
2575 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2576 
2577 	q->buffer_limit = min(q->buffer_limit,
2578 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2579 				  q->buffer_config_limit));
2580 }
2581 
cake_change(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2582 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2583 		       struct netlink_ext_ack *extack)
2584 {
2585 	struct cake_sched_data *q = qdisc_priv(sch);
2586 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2587 	u16 rate_flags;
2588 	u8 flow_mode;
2589 	int err;
2590 
2591 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2592 					  extack);
2593 	if (err < 0)
2594 		return err;
2595 
2596 	flow_mode = q->flow_mode;
2597 	if (tb[TCA_CAKE_NAT]) {
2598 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2599 		flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2600 		flow_mode |= CAKE_FLOW_NAT_FLAG *
2601 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2602 #else
2603 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2604 				    "No conntrack support in kernel");
2605 		return -EOPNOTSUPP;
2606 #endif
2607 	}
2608 
2609 	if (tb[TCA_CAKE_BASE_RATE64])
2610 		WRITE_ONCE(q->rate_bps,
2611 			   nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));
2612 
2613 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2614 		WRITE_ONCE(q->tin_mode,
2615 			   nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));
2616 
2617 	rate_flags = q->rate_flags;
2618 	if (tb[TCA_CAKE_WASH]) {
2619 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2620 			rate_flags |= CAKE_FLAG_WASH;
2621 		else
2622 			rate_flags &= ~CAKE_FLAG_WASH;
2623 	}
2624 
2625 	if (tb[TCA_CAKE_FLOW_MODE])
2626 		flow_mode = ((flow_mode & CAKE_FLOW_NAT_FLAG) |
2627 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2628 					CAKE_FLOW_MASK));
2629 
2630 	if (tb[TCA_CAKE_ATM])
2631 		WRITE_ONCE(q->atm_mode,
2632 			   nla_get_u32(tb[TCA_CAKE_ATM]));
2633 
2634 	if (tb[TCA_CAKE_OVERHEAD]) {
2635 		WRITE_ONCE(q->rate_overhead,
2636 			   nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
2637 		rate_flags |= CAKE_FLAG_OVERHEAD;
2638 
2639 		q->max_netlen = 0;
2640 		q->max_adjlen = 0;
2641 		q->min_netlen = ~0;
2642 		q->min_adjlen = ~0;
2643 	}
2644 
2645 	if (tb[TCA_CAKE_RAW]) {
2646 		rate_flags &= ~CAKE_FLAG_OVERHEAD;
2647 
2648 		q->max_netlen = 0;
2649 		q->max_adjlen = 0;
2650 		q->min_netlen = ~0;
2651 		q->min_adjlen = ~0;
2652 	}
2653 
2654 	if (tb[TCA_CAKE_MPU])
2655 		WRITE_ONCE(q->rate_mpu,
2656 			   nla_get_u32(tb[TCA_CAKE_MPU]));
2657 
2658 	if (tb[TCA_CAKE_RTT]) {
2659 		u32 interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2660 
2661 		WRITE_ONCE(q->interval, max(interval, 1U));
2662 	}
2663 
2664 	if (tb[TCA_CAKE_TARGET]) {
2665 		u32 target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2666 
2667 		WRITE_ONCE(q->target, max(target, 1U));
2668 	}
2669 
2670 	if (tb[TCA_CAKE_AUTORATE]) {
2671 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2672 			rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2673 		else
2674 			rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2675 	}
2676 
2677 	if (tb[TCA_CAKE_INGRESS]) {
2678 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2679 			rate_flags |= CAKE_FLAG_INGRESS;
2680 		else
2681 			rate_flags &= ~CAKE_FLAG_INGRESS;
2682 	}
2683 
2684 	if (tb[TCA_CAKE_ACK_FILTER])
2685 		WRITE_ONCE(q->ack_filter,
2686 			   nla_get_u32(tb[TCA_CAKE_ACK_FILTER]));
2687 
2688 	if (tb[TCA_CAKE_MEMORY])
2689 		WRITE_ONCE(q->buffer_config_limit,
2690 			   nla_get_u32(tb[TCA_CAKE_MEMORY]));
2691 
2692 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2693 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2694 			rate_flags |= CAKE_FLAG_SPLIT_GSO;
2695 		else
2696 			rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2697 	}
2698 
2699 	if (tb[TCA_CAKE_FWMARK]) {
2700 		WRITE_ONCE(q->fwmark_mask, nla_get_u32(tb[TCA_CAKE_FWMARK]));
2701 		WRITE_ONCE(q->fwmark_shft,
2702 			   q->fwmark_mask ? __ffs(q->fwmark_mask) : 0);
2703 	}
2704 
2705 	WRITE_ONCE(q->rate_flags, rate_flags);
2706 	WRITE_ONCE(q->flow_mode, flow_mode);
2707 	if (q->tins) {
2708 		sch_tree_lock(sch);
2709 		cake_reconfigure(sch);
2710 		sch_tree_unlock(sch);
2711 	}
2712 
2713 	return 0;
2714 }
2715 
cake_destroy(struct Qdisc * sch)2716 static void cake_destroy(struct Qdisc *sch)
2717 {
2718 	struct cake_sched_data *q = qdisc_priv(sch);
2719 
2720 	qdisc_watchdog_cancel(&q->watchdog);
2721 	tcf_block_put(q->block);
2722 	kvfree(q->tins);
2723 }
2724 
cake_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2725 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2726 		     struct netlink_ext_ack *extack)
2727 {
2728 	struct cake_sched_data *q = qdisc_priv(sch);
2729 	int i, j, err;
2730 
2731 	sch->limit = 10240;
2732 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2733 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2734 
2735 	q->rate_bps = 0; /* unlimited by default */
2736 
2737 	q->interval = 100000; /* 100ms default */
2738 	q->target   =   5000; /* 5ms: codel RFC argues
2739 			       * for 5 to 10% of interval
2740 			       */
2741 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2742 	q->cur_tin = 0;
2743 	q->cur_flow  = 0;
2744 
2745 	qdisc_watchdog_init(&q->watchdog, sch);
2746 
2747 	if (opt) {
2748 		err = cake_change(sch, opt, extack);
2749 
2750 		if (err)
2751 			return err;
2752 	}
2753 
2754 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2755 	if (err)
2756 		return err;
2757 
2758 	quantum_div[0] = ~0;
2759 	for (i = 1; i <= CAKE_QUEUES; i++)
2760 		quantum_div[i] = 65535 / i;
2761 
2762 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2763 			   GFP_KERNEL);
2764 	if (!q->tins)
2765 		return -ENOMEM;
2766 
2767 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2768 		struct cake_tin_data *b = q->tins + i;
2769 
2770 		INIT_LIST_HEAD(&b->new_flows);
2771 		INIT_LIST_HEAD(&b->old_flows);
2772 		INIT_LIST_HEAD(&b->decaying_flows);
2773 		b->sparse_flow_count = 0;
2774 		b->bulk_flow_count = 0;
2775 		b->decaying_flow_count = 0;
2776 
2777 		for (j = 0; j < CAKE_QUEUES; j++) {
2778 			struct cake_flow *flow = b->flows + j;
2779 			u32 k = j * CAKE_MAX_TINS + i;
2780 
2781 			INIT_LIST_HEAD(&flow->flowchain);
2782 			cobalt_vars_init(&flow->cvars);
2783 
2784 			q->overflow_heap[k].t = i;
2785 			q->overflow_heap[k].b = j;
2786 			b->overflow_idx[j] = k;
2787 		}
2788 	}
2789 
2790 	cake_reconfigure(sch);
2791 	q->avg_peak_bandwidth = q->rate_bps;
2792 	q->min_netlen = ~0;
2793 	q->min_adjlen = ~0;
2794 	return 0;
2795 }
2796 
cake_dump(struct Qdisc * sch,struct sk_buff * skb)2797 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2798 {
2799 	struct cake_sched_data *q = qdisc_priv(sch);
2800 	struct nlattr *opts;
2801 	u16 rate_flags;
2802 	u8 flow_mode;
2803 
2804 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2805 	if (!opts)
2806 		goto nla_put_failure;
2807 
2808 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64,
2809 			      READ_ONCE(q->rate_bps), TCA_CAKE_PAD))
2810 		goto nla_put_failure;
2811 
2812 	flow_mode = READ_ONCE(q->flow_mode);
2813 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE, flow_mode & CAKE_FLOW_MASK))
2814 		goto nla_put_failure;
2815 
2816 	if (nla_put_u32(skb, TCA_CAKE_RTT, READ_ONCE(q->interval)))
2817 		goto nla_put_failure;
2818 
2819 	if (nla_put_u32(skb, TCA_CAKE_TARGET, READ_ONCE(q->target)))
2820 		goto nla_put_failure;
2821 
2822 	if (nla_put_u32(skb, TCA_CAKE_MEMORY,
2823 			READ_ONCE(q->buffer_config_limit)))
2824 		goto nla_put_failure;
2825 
2826 	rate_flags = READ_ONCE(q->rate_flags);
2827 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2828 			!!(rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2829 		goto nla_put_failure;
2830 
2831 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2832 			!!(rate_flags & CAKE_FLAG_INGRESS)))
2833 		goto nla_put_failure;
2834 
2835 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, READ_ONCE(q->ack_filter)))
2836 		goto nla_put_failure;
2837 
2838 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2839 			!!(flow_mode & CAKE_FLOW_NAT_FLAG)))
2840 		goto nla_put_failure;
2841 
2842 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, READ_ONCE(q->tin_mode)))
2843 		goto nla_put_failure;
2844 
2845 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2846 			!!(rate_flags & CAKE_FLAG_WASH)))
2847 		goto nla_put_failure;
2848 
2849 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, READ_ONCE(q->rate_overhead)))
2850 		goto nla_put_failure;
2851 
2852 	if (!(rate_flags & CAKE_FLAG_OVERHEAD))
2853 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2854 			goto nla_put_failure;
2855 
2856 	if (nla_put_u32(skb, TCA_CAKE_ATM, READ_ONCE(q->atm_mode)))
2857 		goto nla_put_failure;
2858 
2859 	if (nla_put_u32(skb, TCA_CAKE_MPU, READ_ONCE(q->rate_mpu)))
2860 		goto nla_put_failure;
2861 
2862 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2863 			!!(rate_flags & CAKE_FLAG_SPLIT_GSO)))
2864 		goto nla_put_failure;
2865 
2866 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, READ_ONCE(q->fwmark_mask)))
2867 		goto nla_put_failure;
2868 
2869 	return nla_nest_end(skb, opts);
2870 
2871 nla_put_failure:
2872 	return -1;
2873 }
2874 
cake_dump_stats(struct Qdisc * sch,struct gnet_dump * d)2875 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2876 {
2877 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2878 	struct cake_sched_data *q = qdisc_priv(sch);
2879 	struct nlattr *tstats, *ts;
2880 	int i;
2881 
2882 	if (!stats)
2883 		return -1;
2884 
2885 #define PUT_STAT_U32(attr, data) do {				       \
2886 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2887 			goto nla_put_failure;			       \
2888 	} while (0)
2889 #define PUT_STAT_U64(attr, data) do {				       \
2890 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2891 					data, TCA_CAKE_STATS_PAD)) \
2892 			goto nla_put_failure;			       \
2893 	} while (0)
2894 
2895 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2896 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2897 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2898 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2899 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2900 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2901 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2902 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2903 
2904 #undef PUT_STAT_U32
2905 #undef PUT_STAT_U64
2906 
2907 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2908 	if (!tstats)
2909 		goto nla_put_failure;
2910 
2911 #define PUT_TSTAT_U32(attr, data) do {					\
2912 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2913 			goto nla_put_failure;				\
2914 	} while (0)
2915 #define PUT_TSTAT_U64(attr, data) do {					\
2916 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2917 					data, TCA_CAKE_TIN_STATS_PAD))	\
2918 			goto nla_put_failure;				\
2919 	} while (0)
2920 
2921 	for (i = 0; i < q->tin_cnt; i++) {
2922 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2923 
2924 		ts = nla_nest_start_noflag(d->skb, i + 1);
2925 		if (!ts)
2926 			goto nla_put_failure;
2927 
2928 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2929 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2930 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2931 
2932 		PUT_TSTAT_U32(TARGET_US,
2933 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2934 		PUT_TSTAT_U32(INTERVAL_US,
2935 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2936 
2937 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2938 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2939 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2940 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2941 
2942 		PUT_TSTAT_U32(PEAK_DELAY_US,
2943 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2944 		PUT_TSTAT_U32(AVG_DELAY_US,
2945 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2946 		PUT_TSTAT_U32(BASE_DELAY_US,
2947 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2948 
2949 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2950 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2951 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2952 
2953 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2954 					    b->decaying_flow_count);
2955 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2956 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2957 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2958 
2959 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2960 		nla_nest_end(d->skb, ts);
2961 	}
2962 
2963 #undef PUT_TSTAT_U32
2964 #undef PUT_TSTAT_U64
2965 
2966 	nla_nest_end(d->skb, tstats);
2967 	return nla_nest_end(d->skb, stats);
2968 
2969 nla_put_failure:
2970 	nla_nest_cancel(d->skb, stats);
2971 	return -1;
2972 }
2973 
cake_leaf(struct Qdisc * sch,unsigned long arg)2974 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2975 {
2976 	return NULL;
2977 }
2978 
cake_find(struct Qdisc * sch,u32 classid)2979 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2980 {
2981 	return 0;
2982 }
2983 
cake_bind(struct Qdisc * sch,unsigned long parent,u32 classid)2984 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2985 			       u32 classid)
2986 {
2987 	return 0;
2988 }
2989 
cake_unbind(struct Qdisc * q,unsigned long cl)2990 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2991 {
2992 }
2993 
cake_tcf_block(struct Qdisc * sch,unsigned long cl,struct netlink_ext_ack * extack)2994 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2995 					struct netlink_ext_ack *extack)
2996 {
2997 	struct cake_sched_data *q = qdisc_priv(sch);
2998 
2999 	if (cl)
3000 		return NULL;
3001 	return q->block;
3002 }
3003 
cake_dump_class(struct Qdisc * sch,unsigned long cl,struct sk_buff * skb,struct tcmsg * tcm)3004 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
3005 			   struct sk_buff *skb, struct tcmsg *tcm)
3006 {
3007 	tcm->tcm_handle |= TC_H_MIN(cl);
3008 	return 0;
3009 }
3010 
cake_dump_class_stats(struct Qdisc * sch,unsigned long cl,struct gnet_dump * d)3011 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
3012 				 struct gnet_dump *d)
3013 {
3014 	struct cake_sched_data *q = qdisc_priv(sch);
3015 	const struct cake_flow *flow = NULL;
3016 	struct gnet_stats_queue qs = { 0 };
3017 	struct nlattr *stats;
3018 	u32 idx = cl - 1;
3019 
3020 	if (idx < CAKE_QUEUES * q->tin_cnt) {
3021 		const struct cake_tin_data *b = \
3022 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
3023 		const struct sk_buff *skb;
3024 
3025 		flow = &b->flows[idx % CAKE_QUEUES];
3026 
3027 		if (flow->head) {
3028 			sch_tree_lock(sch);
3029 			skb = flow->head;
3030 			while (skb) {
3031 				qs.qlen++;
3032 				skb = skb->next;
3033 			}
3034 			sch_tree_unlock(sch);
3035 		}
3036 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3037 		qs.drops = flow->dropped;
3038 	}
3039 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3040 		return -1;
3041 	if (flow) {
3042 		ktime_t now = ktime_get();
3043 
3044 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3045 		if (!stats)
3046 			return -1;
3047 
3048 #define PUT_STAT_U32(attr, data) do {				       \
3049 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3050 			goto nla_put_failure;			       \
3051 	} while (0)
3052 #define PUT_STAT_S32(attr, data) do {				       \
3053 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3054 			goto nla_put_failure;			       \
3055 	} while (0)
3056 
3057 		PUT_STAT_S32(DEFICIT, flow->deficit);
3058 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3059 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3060 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3061 		if (flow->cvars.p_drop) {
3062 			PUT_STAT_S32(BLUE_TIMER_US,
3063 				     ktime_to_us(
3064 					     ktime_sub(now,
3065 						       flow->cvars.blue_timer)));
3066 		}
3067 		if (flow->cvars.dropping) {
3068 			PUT_STAT_S32(DROP_NEXT_US,
3069 				     ktime_to_us(
3070 					     ktime_sub(now,
3071 						       flow->cvars.drop_next)));
3072 		}
3073 
3074 		if (nla_nest_end(d->skb, stats) < 0)
3075 			return -1;
3076 	}
3077 
3078 	return 0;
3079 
3080 nla_put_failure:
3081 	nla_nest_cancel(d->skb, stats);
3082 	return -1;
3083 }
3084 
cake_walk(struct Qdisc * sch,struct qdisc_walker * arg)3085 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3086 {
3087 	struct cake_sched_data *q = qdisc_priv(sch);
3088 	unsigned int i, j;
3089 
3090 	if (arg->stop)
3091 		return;
3092 
3093 	for (i = 0; i < q->tin_cnt; i++) {
3094 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3095 
3096 		for (j = 0; j < CAKE_QUEUES; j++) {
3097 			if (list_empty(&b->flows[j].flowchain)) {
3098 				arg->count++;
3099 				continue;
3100 			}
3101 			if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3102 						 arg))
3103 				break;
3104 		}
3105 	}
3106 }
3107 
3108 static const struct Qdisc_class_ops cake_class_ops = {
3109 	.leaf		=	cake_leaf,
3110 	.find		=	cake_find,
3111 	.tcf_block	=	cake_tcf_block,
3112 	.bind_tcf	=	cake_bind,
3113 	.unbind_tcf	=	cake_unbind,
3114 	.dump		=	cake_dump_class,
3115 	.dump_stats	=	cake_dump_class_stats,
3116 	.walk		=	cake_walk,
3117 };
3118 
3119 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3120 	.cl_ops		=	&cake_class_ops,
3121 	.id		=	"cake",
3122 	.priv_size	=	sizeof(struct cake_sched_data),
3123 	.enqueue	=	cake_enqueue,
3124 	.dequeue	=	cake_dequeue,
3125 	.peek		=	qdisc_peek_dequeued,
3126 	.init		=	cake_init,
3127 	.reset		=	cake_reset,
3128 	.destroy	=	cake_destroy,
3129 	.change		=	cake_change,
3130 	.dump		=	cake_dump,
3131 	.dump_stats	=	cake_dump_stats,
3132 	.owner		=	THIS_MODULE,
3133 };
3134 MODULE_ALIAS_NET_SCH("cake");
3135 
cake_module_init(void)3136 static int __init cake_module_init(void)
3137 {
3138 	return register_qdisc(&cake_qdisc_ops);
3139 }
3140 
cake_module_exit(void)3141 static void __exit cake_module_exit(void)
3142 {
3143 	unregister_qdisc(&cake_qdisc_ops);
3144 }
3145 
3146 module_init(cake_module_init)
3147 module_exit(cake_module_exit)
3148 MODULE_AUTHOR("Jonathan Morton");
3149 MODULE_LICENSE("Dual BSD/GPL");
3150 MODULE_DESCRIPTION("The CAKE shaper.");
3151