xref: /linux/net/sched/sch_sfq.c (revision a06c3fad49a50d5d5eb078f93e70f4d3eca5d5a5)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  */
7 
8 #include <linux/module.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/jiffies.h>
12 #include <linux/string.h>
13 #include <linux/in.h>
14 #include <linux/errno.h>
15 #include <linux/init.h>
16 #include <linux/skbuff.h>
17 #include <linux/siphash.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 #include <net/pkt_cls.h>
23 #include <net/red.h>
24 
25 
26 /*	Stochastic Fairness Queuing algorithm.
27 	=======================================
28 
29 	Source:
30 	Paul E. McKenney "Stochastic Fairness Queuing",
31 	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.
32 
33 	Paul E. McKenney "Stochastic Fairness Queuing",
34 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
35 
36 
37 	See also:
38 	M. Shreedhar and George Varghese "Efficient Fair
39 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
40 
41 
42 	This is not what is usually called (W)FQ nowadays.
43 	It does not use any timestamp mechanism, but instead
44 	processes queues in round-robin order.
45 
46 	ADVANTAGE:
47 
48 	- It is very cheap. Both CPU and memory requirements are minimal.
49 
50 	DRAWBACKS:
51 
52 	- "Stochastic" -> It is not 100% fair.
53 	When hash collisions occur, several flows are considered as one.
54 
55 	- "Round-robin" -> It introduces larger delays than virtual clock
56 	based schemes, and should not be used for isolating interactive
57 	traffic from non-interactive. This means that this scheduler
58 	should be used as a leaf of CBQ or P3, which put interactive traffic
59 	into a higher priority band.
60 
61 	We still need true WFQ for the top-level CSZ, but using WFQ
62 	for best-effort traffic is absolutely pointless:
63 	SFQ is superior for this purpose.
64 
65 	IMPLEMENTATION:
66 	This implementation limits:
67 	- the maximal queue length per flow to 127 packets;
68 	- the maximal MTU to 2^18 - 1;
69 	- the number of flows to 65408;
70 	- the number of hash buckets to 65536.
71 
72 	It is easy to increase these limits, but not while the qdisc is in use.  */
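
/* Example (illustrative only): these limits are normally tuned from user
 * space.  Assuming the iproute2 'tc' utility and a device named 'eth0', a
 * typical setup might look like
 *
 *	tc qdisc add dev eth0 root sfq perturb 10 quantum 1514 limit 127
 *
 * whose parameters arrive here as a struct tc_sfq_qopt and are applied by
 * sfq_change() below.
 */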
73 
74 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
75 #define SFQ_DEFAULT_FLOWS	128
76 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
77 #define SFQ_EMPTY_SLOT		0xffff
78 #define SFQ_DEFAULT_HASH_DIVISOR 1024
79 
80 /* We use 16 bits to store allot, and want to handle packets up to 64K.
81  * Scale allot by 8 (1<<3) so that no overflow occurs.
82  */
83 #define SFQ_ALLOT_SHIFT		3
84 #define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
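
/* Worked example: a 1514-byte Ethernet frame costs
 * SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8) = 190 scaled units, and even a
 * 64KB packet costs only 8192 units, well within the range of the short
 * 'allot' field.
 */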
85 
86 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
87 typedef u16 sfq_index;
88 
89 /*
90  * We don't use pointers, to save space.
91  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' into the slots[] array,
92  * while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
93  * are 'pointers' into the dep[] array.
94  */
95 struct sfq_head {
96 	sfq_index	next;
97 	sfq_index	prev;
98 };
99 
100 struct sfq_slot {
101 	struct sk_buff	*skblist_next;
102 	struct sk_buff	*skblist_prev;
103 	sfq_index	qlen; /* number of skbs in skblist */
104 	sfq_index	next; /* next slot in sfq RR chain */
105 	struct sfq_head dep; /* anchor in dep[] chains */
106 	unsigned short	hash; /* hash value (index in ht[]) */
107 	short		allot; /* credit for this slot */
108 
109 	unsigned int    backlog;
110 	struct red_vars vars;
111 };
112 
113 struct sfq_sched_data {
114 /* frequently used fields */
115 	int		limit;		/* limit of total number of packets in this qdisc */
116 	unsigned int	divisor;	/* number of slots in hash table */
117 	u8		headdrop;
118 	u8		maxdepth;	/* limit of packets per flow */
119 
120 	siphash_key_t 	perturbation;
121 	u8		cur_depth;	/* depth of longest slot */
122 	u8		flags;
123 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
124 	struct tcf_proto __rcu *filter_list;
125 	struct tcf_block *block;
126 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
127 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
128 
129 	struct red_parms *red_parms;
130 	struct tc_sfqred_stats stats;
131 	struct sfq_slot *tail;		/* current slot in round */
132 
133 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
134 					/* Linked lists of slots, indexed by depth
135 					 * dep[0] : list of unused flows
136 					 * dep[1] : list of flows with 1 packet
137 					 * dep[X] : list of flows with X packets
138 					 */
139 
140 	unsigned int	maxflows;	/* number of flows in flows array */
141 	int		perturb_period;
142 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
143 	struct timer_list perturb_timer;
144 	struct Qdisc	*sch;
145 };
146 
147 /*
148  * sfq_head are either in a sfq_slot or in dep[] array
149  */
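/* For example, val == 3 refers to q->slots[3].dep, while
 * val == SFQ_MAX_FLOWS + 2 refers to q->dep[2], the list of flows currently
 * holding exactly two packets.
 */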
150 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
151 {
152 	if (val < SFQ_MAX_FLOWS)
153 		return &q->slots[val].dep;
154 	return &q->dep[val - SFQ_MAX_FLOWS];
155 }
156 
157 static unsigned int sfq_hash(const struct sfq_sched_data *q,
158 			     const struct sk_buff *skb)
159 {
160 	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
161 }
162 
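/* Map an skb to a 1-based bucket number.  skb->priority can select a bucket
 * directly; otherwise attached filters or the perturbed flow hash decide.
 * A return value of 0 means "drop"; *qerr then carries the verdict to report.
 */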
163 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
164 				 int *qerr)
165 {
166 	struct sfq_sched_data *q = qdisc_priv(sch);
167 	struct tcf_result res;
168 	struct tcf_proto *fl;
169 	int result;
170 
171 	if (TC_H_MAJ(skb->priority) == sch->handle &&
172 	    TC_H_MIN(skb->priority) > 0 &&
173 	    TC_H_MIN(skb->priority) <= q->divisor)
174 		return TC_H_MIN(skb->priority);
175 
176 	fl = rcu_dereference_bh(q->filter_list);
177 	if (!fl)
178 		return sfq_hash(q, skb) + 1;
179 
180 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
181 	result = tcf_classify(skb, NULL, fl, &res, false);
182 	if (result >= 0) {
183 #ifdef CONFIG_NET_CLS_ACT
184 		switch (result) {
185 		case TC_ACT_STOLEN:
186 		case TC_ACT_QUEUED:
187 		case TC_ACT_TRAP:
188 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
189 			fallthrough;
190 		case TC_ACT_SHOT:
191 			return 0;
192 		}
193 #endif
194 		if (TC_H_MIN(res.classid) <= q->divisor)
195 			return TC_H_MIN(res.classid);
196 	}
197 	return 0;
198 }
199 
200 /*
201  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
202  */
203 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
204 {
205 	sfq_index p, n;
206 	struct sfq_slot *slot = &q->slots[x];
207 	int qlen = slot->qlen;
208 
209 	p = qlen + SFQ_MAX_FLOWS;
210 	n = q->dep[qlen].next;
211 
212 	slot->dep.next = n;
213 	slot->dep.prev = p;
214 
215 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
216 	sfq_dep_head(q, n)->prev = x;
217 }
218 
219 #define sfq_unlink(q, x, n, p)			\
220 	do {					\
221 		n = q->slots[x].dep.next;	\
222 		p = q->slots[x].dep.prev;	\
223 		sfq_dep_head(q, p)->next = n;	\
224 		sfq_dep_head(q, n)->prev = p;	\
225 	} while (0)
226 
227 
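/* sfq_dec()/sfq_inc() move slot x to the dep[] list matching its new queue
 * length and keep q->cur_depth (the depth of the longest flow) up to date,
 * so sfq_drop() can find a victim flow in O(1).
 */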
228 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
229 {
230 	sfq_index p, n;
231 	int d;
232 
233 	sfq_unlink(q, x, n, p);
234 
235 	d = q->slots[x].qlen--;
236 	if (n == p && q->cur_depth == d)
237 		q->cur_depth--;
238 	sfq_link(q, x);
239 }
240 
241 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
242 {
243 	sfq_index p, n;
244 	int d;
245 
246 	sfq_unlink(q, x, n, p);
247 
248 	d = ++q->slots[x].qlen;
249 	if (q->cur_depth < d)
250 		q->cur_depth = d;
251 	sfq_link(q, x);
252 }
253 
254 /* helper functions: might be changed when/if skbs use a standard list_head */
255 
256 /* remove one skb from tail of slot queue */
257 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
258 {
259 	struct sk_buff *skb = slot->skblist_prev;
260 
261 	slot->skblist_prev = skb->prev;
262 	skb->prev->next = (struct sk_buff *)slot;
263 	skb->next = skb->prev = NULL;
264 	return skb;
265 }
266 
267 /* remove one skb from head of slot queue */
268 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
269 {
270 	struct sk_buff *skb = slot->skblist_next;
271 
272 	slot->skblist_next = skb->next;
273 	skb->next->prev = (struct sk_buff *)slot;
274 	skb->next = skb->prev = NULL;
275 	return skb;
276 }
277 
278 static inline void slot_queue_init(struct sfq_slot *slot)
279 {
280 	memset(slot, 0, sizeof(*slot));
281 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
282 }
283 
284 /* add skb to slot queue (tail add) */
285 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
286 {
287 	skb->prev = slot->skblist_prev;
288 	skb->next = (struct sk_buff *)slot;
289 	slot->skblist_prev->next = skb;
290 	slot->skblist_prev = skb;
291 }
292 
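/* Evict one packet from the flow with the longest backlog (tail of its list,
 * or head when q->headdrop is set).  Returns the length of the dropped
 * packet, or 0 if no queued packet could be found.
 */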
293 static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
294 {
295 	struct sfq_sched_data *q = qdisc_priv(sch);
296 	sfq_index x, d = q->cur_depth;
297 	struct sk_buff *skb;
298 	unsigned int len;
299 	struct sfq_slot *slot;
300 
301 	/* Queue is full! Find the longest slot and drop a packet from it (tail, or head if q->headdrop is set) */
302 	if (d > 1) {
303 		x = q->dep[d].next;
304 		slot = &q->slots[x];
305 drop:
306 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
307 		len = qdisc_pkt_len(skb);
308 		slot->backlog -= len;
309 		sfq_dec(q, x);
310 		sch->q.qlen--;
311 		qdisc_qstats_backlog_dec(sch, skb);
312 		qdisc_drop(skb, sch, to_free);
313 		return len;
314 	}
315 
316 	if (d == 1) {
317 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
318 		x = q->tail->next;
319 		slot = &q->slots[x];
320 		q->tail->next = slot->next;
321 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
322 		goto drop;
323 	}
324 
325 	return 0;
326 }
327 
328 /* Is the ECN parameter configured? */
329 static int sfq_prob_mark(const struct sfq_sched_data *q)
330 {
331 	return q->flags & TC_RED_ECN;
332 }
333 
334 /* Should packets over the max threshold just be marked? */
335 static int sfq_hard_mark(const struct sfq_sched_data *q)
336 {
337 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
338 }
339 
340 static int sfq_headdrop(const struct sfq_sched_data *q)
341 {
342 	return q->headdrop;
343 }
344 
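/* Enqueue: classify to a bucket, bind the bucket to a free slot if needed,
 * run per-flow RED when configured, then tail-add the packet (or, in
 * head-drop mode, drop the flow's head instead of the new packet).  If the
 * qdisc limit is exceeded, sfq_drop() evicts from the longest flow.
 */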
345 static int
346 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
347 {
348 	struct sfq_sched_data *q = qdisc_priv(sch);
349 	unsigned int hash, dropped;
350 	sfq_index x, qlen;
351 	struct sfq_slot *slot;
352 	int ret;
353 	struct sk_buff *head;
354 	int delta;
355 
356 	hash = sfq_classify(skb, sch, &ret);
357 	if (hash == 0) {
358 		if (ret & __NET_XMIT_BYPASS)
359 			qdisc_qstats_drop(sch);
360 		__qdisc_drop(skb, to_free);
361 		return ret;
362 	}
363 	hash--;
364 
365 	x = q->ht[hash];
366 	slot = &q->slots[x];
367 	if (x == SFQ_EMPTY_SLOT) {
368 		x = q->dep[0].next; /* get a free slot */
369 		if (x >= SFQ_MAX_FLOWS)
370 			return qdisc_drop(skb, sch, to_free);
371 		q->ht[hash] = x;
372 		slot = &q->slots[x];
373 		slot->hash = hash;
374 		slot->backlog = 0; /* should already be 0 anyway... */
375 		red_set_vars(&slot->vars);
376 		goto enqueue;
377 	}
378 	if (q->red_parms) {
379 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
380 							&slot->vars,
381 							slot->backlog);
382 		switch (red_action(q->red_parms,
383 				   &slot->vars,
384 				   slot->vars.qavg)) {
385 		case RED_DONT_MARK:
386 			break;
387 
388 		case RED_PROB_MARK:
389 			qdisc_qstats_overlimit(sch);
390 			if (sfq_prob_mark(q)) {
391 				/* We know we have at least one packet in queue */
392 				if (sfq_headdrop(q) &&
393 				    INET_ECN_set_ce(slot->skblist_next)) {
394 					q->stats.prob_mark_head++;
395 					break;
396 				}
397 				if (INET_ECN_set_ce(skb)) {
398 					q->stats.prob_mark++;
399 					break;
400 				}
401 			}
402 			q->stats.prob_drop++;
403 			goto congestion_drop;
404 
405 		case RED_HARD_MARK:
406 			qdisc_qstats_overlimit(sch);
407 			if (sfq_hard_mark(q)) {
408 				/* We know we have at least one packet in queue */
409 				if (sfq_headdrop(q) &&
410 				    INET_ECN_set_ce(slot->skblist_next)) {
411 					q->stats.forced_mark_head++;
412 					break;
413 				}
414 				if (INET_ECN_set_ce(skb)) {
415 					q->stats.forced_mark++;
416 					break;
417 				}
418 			}
419 			q->stats.forced_drop++;
420 			goto congestion_drop;
421 		}
422 	}
423 
424 	if (slot->qlen >= q->maxdepth) {
425 congestion_drop:
426 		if (!sfq_headdrop(q))
427 			return qdisc_drop(skb, sch, to_free);
428 
429 		/* We know we have at least one packet in queue */
430 		head = slot_dequeue_head(slot);
431 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
432 		sch->qstats.backlog -= delta;
433 		slot->backlog -= delta;
434 		qdisc_drop(head, sch, to_free);
435 
436 		slot_queue_add(slot, skb);
437 		qdisc_tree_reduce_backlog(sch, 0, delta);
438 		return NET_XMIT_CN;
439 	}
440 
441 enqueue:
442 	qdisc_qstats_backlog_inc(sch, skb);
443 	slot->backlog += qdisc_pkt_len(skb);
444 	slot_queue_add(slot, skb);
445 	sfq_inc(q, x);
446 	if (slot->qlen == 1) {		/* The flow is new */
447 		if (q->tail == NULL) {	/* It is the first flow */
448 			slot->next = x;
449 		} else {
450 			slot->next = q->tail->next;
451 			q->tail->next = x;
452 		}
453 		/* We put this flow at the end of our flow list.
454 		 * It might sound unfair for a new flow to wait behind old ones,
455 		 * but otherwise we could end up servicing only new flows and starving old ones.
456 		 */
457 		q->tail = slot;
458 		/* We could use a bigger initial quantum for new flows */
459 		slot->allot = q->scaled_quantum;
460 	}
461 	if (++sch->q.qlen <= q->limit)
462 		return NET_XMIT_SUCCESS;
463 
464 	qlen = slot->qlen;
465 	dropped = sfq_drop(sch, to_free);
466 	/* Return Congestion Notification only if we dropped a packet
467 	 * from this flow.
468 	 */
469 	if (qlen != slot->qlen) {
470 		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
471 		return NET_XMIT_CN;
472 	}
473 
474 	/* As we dropped a packet, better let the upper stack know about it */
475 	qdisc_tree_reduce_backlog(sch, 1, dropped);
476 	return NET_XMIT_SUCCESS;
477 }
478 
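/* Dequeue: serve flows round-robin.  A flow keeps the turn while its scaled
 * byte credit (allot) is positive; once it goes non-positive the flow is
 * refilled with a quantum and moved to the back of the ring.
 */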
479 static struct sk_buff *
480 sfq_dequeue(struct Qdisc *sch)
481 {
482 	struct sfq_sched_data *q = qdisc_priv(sch);
483 	struct sk_buff *skb;
484 	sfq_index a, next_a;
485 	struct sfq_slot *slot;
486 
487 	/* No active slots */
488 	if (q->tail == NULL)
489 		return NULL;
490 
491 next_slot:
492 	a = q->tail->next;
493 	slot = &q->slots[a];
494 	if (slot->allot <= 0) {
495 		q->tail = slot;
496 		slot->allot += q->scaled_quantum;
497 		goto next_slot;
498 	}
499 	skb = slot_dequeue_head(slot);
500 	sfq_dec(q, a);
501 	qdisc_bstats_update(sch, skb);
502 	sch->q.qlen--;
503 	qdisc_qstats_backlog_dec(sch, skb);
504 	slot->backlog -= qdisc_pkt_len(skb);
505 	/* Is the slot empty? */
506 	if (slot->qlen == 0) {
507 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
508 		next_a = slot->next;
509 		if (a == next_a) {
510 			q->tail = NULL; /* no more active slots */
511 			return skb;
512 		}
513 		q->tail->next = next_a;
514 	} else {
515 		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
516 	}
517 	return skb;
518 }
519 
520 static void
521 sfq_reset(struct Qdisc *sch)
522 {
523 	struct sk_buff *skb;
524 
525 	while ((skb = sfq_dequeue(sch)) != NULL)
526 		rtnl_kfree_skbs(skb, skb);
527 }
528 
529 /*
530  * When q->perturbation is changed, we rehash all queued skbs
531  * to avoid OOO (Out Of Order) effects.
532  * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
533  * counters.
534  */
535 static void sfq_rehash(struct Qdisc *sch)
536 {
537 	struct sfq_sched_data *q = qdisc_priv(sch);
538 	struct sk_buff *skb;
539 	int i;
540 	struct sfq_slot *slot;
541 	struct sk_buff_head list;
542 	int dropped = 0;
543 	unsigned int drop_len = 0;
544 
545 	__skb_queue_head_init(&list);
546 
547 	for (i = 0; i < q->maxflows; i++) {
548 		slot = &q->slots[i];
549 		if (!slot->qlen)
550 			continue;
551 		while (slot->qlen) {
552 			skb = slot_dequeue_head(slot);
553 			sfq_dec(q, i);
554 			__skb_queue_tail(&list, skb);
555 		}
556 		slot->backlog = 0;
557 		red_set_vars(&slot->vars);
558 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
559 	}
560 	q->tail = NULL;
561 
562 	while ((skb = __skb_dequeue(&list)) != NULL) {
563 		unsigned int hash = sfq_hash(q, skb);
564 		sfq_index x = q->ht[hash];
565 
566 		slot = &q->slots[x];
567 		if (x == SFQ_EMPTY_SLOT) {
568 			x = q->dep[0].next; /* get a free slot */
569 			if (x >= SFQ_MAX_FLOWS) {
570 drop:
571 				qdisc_qstats_backlog_dec(sch, skb);
572 				drop_len += qdisc_pkt_len(skb);
573 				kfree_skb(skb);
574 				dropped++;
575 				continue;
576 			}
577 			q->ht[hash] = x;
578 			slot = &q->slots[x];
579 			slot->hash = hash;
580 		}
581 		if (slot->qlen >= q->maxdepth)
582 			goto drop;
583 		slot_queue_add(slot, skb);
584 		if (q->red_parms)
585 			slot->vars.qavg = red_calc_qavg(q->red_parms,
586 							&slot->vars,
587 							slot->backlog);
588 		slot->backlog += qdisc_pkt_len(skb);
589 		sfq_inc(q, x);
590 		if (slot->qlen == 1) {		/* The flow is new */
591 			if (q->tail == NULL) {	/* It is the first flow */
592 				slot->next = x;
593 			} else {
594 				slot->next = q->tail->next;
595 				q->tail->next = x;
596 			}
597 			q->tail = slot;
598 			slot->allot = q->scaled_quantum;
599 		}
600 	}
601 	sch->q.qlen -= dropped;
602 	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
603 }
604 
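/* Timer callback: install a fresh siphash key so flows spread over different
 * buckets, rehash already-queued packets (when no filters are attached) so
 * old and new placements agree, and re-arm while perturb_period is non-zero.
 */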
605 static void sfq_perturbation(struct timer_list *t)
606 {
607 	struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
608 	struct Qdisc *sch = q->sch;
609 	spinlock_t *root_lock;
610 	siphash_key_t nkey;
611 	int period;
612 
613 	get_random_bytes(&nkey, sizeof(nkey));
614 	rcu_read_lock();
615 	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
616 	spin_lock(root_lock);
617 	q->perturbation = nkey;
618 	if (!q->filter_list && q->tail)
619 		sfq_rehash(sch);
620 	spin_unlock(root_lock);
621 
622 	/* q->perturb_period can change under us from
623 	 * sfq_change() and sfq_destroy().
624 	 */
625 	period = READ_ONCE(q->perturb_period);
626 	if (period)
627 		mod_timer(&q->perturb_timer, jiffies + period);
628 	rcu_read_unlock();
629 }
630 
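/* Apply a tc_sfq_qopt (or the larger tc_sfq_qopt_v1 carrying RED parameters)
 * under the qdisc tree lock, then drop packets until the queue fits any
 * reduced limit, and restart the perturbation timer if requested.
 */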
631 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
632 {
633 	struct sfq_sched_data *q = qdisc_priv(sch);
634 	struct tc_sfq_qopt *ctl = nla_data(opt);
635 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
636 	unsigned int qlen, dropped = 0;
637 	struct red_parms *p = NULL;
638 	struct sk_buff *to_free = NULL;
639 	struct sk_buff *tail = NULL;
640 
641 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
642 		return -EINVAL;
643 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
644 		ctl_v1 = nla_data(opt);
645 	if (ctl->divisor &&
646 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
647 		return -EINVAL;
648 
649 	/* slot->allot is a short; make sure quantum is not too big. */
650 	if (ctl->quantum) {
651 		unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum);
652 
653 		if (scaled <= 0 || scaled > SHRT_MAX)
654 			return -EINVAL;
655 	}
656 
657 	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
658 					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
659 		return -EINVAL;
660 	if (ctl_v1 && ctl_v1->qth_min) {
661 		p = kmalloc(sizeof(*p), GFP_KERNEL);
662 		if (!p)
663 			return -ENOMEM;
664 	}
665 	sch_tree_lock(sch);
666 	if (ctl->quantum) {
667 		q->quantum = ctl->quantum;
668 		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
669 	}
670 	WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ);
671 	if (ctl->flows)
672 		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
673 	if (ctl->divisor) {
674 		q->divisor = ctl->divisor;
675 		q->maxflows = min_t(u32, q->maxflows, q->divisor);
676 	}
677 	if (ctl_v1) {
678 		if (ctl_v1->depth)
679 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
680 		if (p) {
681 			swap(q->red_parms, p);
682 			red_set_parms(q->red_parms,
683 				      ctl_v1->qth_min, ctl_v1->qth_max,
684 				      ctl_v1->Wlog,
685 				      ctl_v1->Plog, ctl_v1->Scell_log,
686 				      NULL,
687 				      ctl_v1->max_P);
688 		}
689 		q->flags = ctl_v1->flags;
690 		q->headdrop = ctl_v1->headdrop;
691 	}
692 	if (ctl->limit) {
693 		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
694 		q->maxflows = min_t(u32, q->maxflows, q->limit);
695 	}
696 
697 	qlen = sch->q.qlen;
698 	while (sch->q.qlen > q->limit) {
699 		dropped += sfq_drop(sch, &to_free);
700 		if (!tail)
701 			tail = to_free;
702 	}
703 
704 	rtnl_kfree_skbs(to_free, tail);
705 	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
706 
707 	del_timer(&q->perturb_timer);
708 	if (q->perturb_period) {
709 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
710 		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
711 	}
712 	sch_tree_unlock(sch);
713 	kfree(p);
714 	return 0;
715 }
716 
717 static void *sfq_alloc(size_t sz)
718 {
719 	return kvmalloc(sz, GFP_KERNEL);
720 }
721 
722 static void sfq_free(void *addr)
723 {
724 	kvfree(addr);
725 }
726 
727 static void sfq_destroy(struct Qdisc *sch)
728 {
729 	struct sfq_sched_data *q = qdisc_priv(sch);
730 
731 	tcf_block_put(q->block);
732 	WRITE_ONCE(q->perturb_period, 0);
733 	del_timer_sync(&q->perturb_timer);
734 	sfq_free(q->ht);
735 	sfq_free(q->slots);
736 	kfree(q->red_parms);
737 }
738 
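/* One-time setup: defaults (127-packet limit, 1024 hash buckets, an MTU-sized
 * quantum, no perturbation), optional overrides via sfq_change(), then
 * allocation of the hash table and the flow slots.
 */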
739 static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
740 		    struct netlink_ext_ack *extack)
741 {
742 	struct sfq_sched_data *q = qdisc_priv(sch);
743 	int i;
744 	int err;
745 
746 	q->sch = sch;
747 	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
748 
749 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
750 	if (err)
751 		return err;
752 
753 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
754 		q->dep[i].next = i + SFQ_MAX_FLOWS;
755 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
756 	}
757 
758 	q->limit = SFQ_MAX_DEPTH;
759 	q->maxdepth = SFQ_MAX_DEPTH;
760 	q->cur_depth = 0;
761 	q->tail = NULL;
762 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
763 	q->maxflows = SFQ_DEFAULT_FLOWS;
764 	q->quantum = psched_mtu(qdisc_dev(sch));
765 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
766 	q->perturb_period = 0;
767 	get_random_bytes(&q->perturbation, sizeof(q->perturbation));
768 
769 	if (opt) {
770 		int err = sfq_change(sch, opt);
771 		if (err)
772 			return err;
773 	}
774 
775 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
776 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
777 	if (!q->ht || !q->slots) {
778 		/* Note: sfq_destroy() will be called by our caller */
779 		return -ENOMEM;
780 	}
781 
782 	for (i = 0; i < q->divisor; i++)
783 		q->ht[i] = SFQ_EMPTY_SLOT;
784 
785 	for (i = 0; i < q->maxflows; i++) {
786 		slot_queue_init(&q->slots[i]);
787 		sfq_link(q, i);
788 	}
789 	if (q->limit >= 1)
790 		sch->flags |= TCQ_F_CAN_BYPASS;
791 	else
792 		sch->flags &= ~TCQ_F_CAN_BYPASS;
793 	return 0;
794 }
795 
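/* Dump always emits the v1 option structure, so depth, headdrop, RED
 * parameters and the extended statistics are reported even when RED is not
 * in use.
 */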
796 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
797 {
798 	struct sfq_sched_data *q = qdisc_priv(sch);
799 	unsigned char *b = skb_tail_pointer(skb);
800 	struct tc_sfq_qopt_v1 opt;
801 	struct red_parms *p = q->red_parms;
802 
803 	memset(&opt, 0, sizeof(opt));
804 	opt.v0.quantum	= q->quantum;
805 	opt.v0.perturb_period = q->perturb_period / HZ;
806 	opt.v0.limit	= q->limit;
807 	opt.v0.divisor	= q->divisor;
808 	opt.v0.flows	= q->maxflows;
809 	opt.depth	= q->maxdepth;
810 	opt.headdrop	= q->headdrop;
811 
812 	if (p) {
813 		opt.qth_min	= p->qth_min >> p->Wlog;
814 		opt.qth_max	= p->qth_max >> p->Wlog;
815 		opt.Wlog	= p->Wlog;
816 		opt.Plog	= p->Plog;
817 		opt.Scell_log	= p->Scell_log;
818 		opt.max_P	= p->max_P;
819 	}
820 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
821 	opt.flags	= q->flags;
822 
823 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
824 		goto nla_put_failure;
825 
826 	return skb->len;
827 
828 nla_put_failure:
829 	nlmsg_trim(skb, b);
830 	return -1;
831 }
832 
833 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
834 {
835 	return NULL;
836 }
837 
838 static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
839 {
840 	return 0;
841 }
842 
843 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
844 			      u32 classid)
845 {
846 	return 0;
847 }
848 
849 static void sfq_unbind(struct Qdisc *q, unsigned long cl)
850 {
851 }
852 
853 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
854 				       struct netlink_ext_ack *extack)
855 {
856 	struct sfq_sched_data *q = qdisc_priv(sch);
857 
858 	if (cl)
859 		return NULL;
860 	return q->block;
861 }
862 
863 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
864 			  struct sk_buff *skb, struct tcmsg *tcm)
865 {
866 	tcm->tcm_handle |= TC_H_MIN(cl);
867 	return 0;
868 }
869 
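/* cl is the 1-based bucket number handed out by sfq_classify()/sfq_walk();
 * report the depth and backlog of the slot currently bound to that bucket.
 */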
870 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
871 				struct gnet_dump *d)
872 {
873 	struct sfq_sched_data *q = qdisc_priv(sch);
874 	sfq_index idx = q->ht[cl - 1];
875 	struct gnet_stats_queue qs = { 0 };
876 	struct tc_sfq_xstats xstats = { 0 };
877 
878 	if (idx != SFQ_EMPTY_SLOT) {
879 		const struct sfq_slot *slot = &q->slots[idx];
880 
881 		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
882 		qs.qlen = slot->qlen;
883 		qs.backlog = slot->backlog;
884 	}
885 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
886 		return -1;
887 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
888 }
889 
890 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
891 {
892 	struct sfq_sched_data *q = qdisc_priv(sch);
893 	unsigned int i;
894 
895 	if (arg->stop)
896 		return;
897 
898 	for (i = 0; i < q->divisor; i++) {
899 		if (q->ht[i] == SFQ_EMPTY_SLOT) {
900 			arg->count++;
901 			continue;
902 		}
903 		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
904 			break;
905 	}
906 }
907 
908 static const struct Qdisc_class_ops sfq_class_ops = {
909 	.leaf		=	sfq_leaf,
910 	.find		=	sfq_find,
911 	.tcf_block	=	sfq_tcf_block,
912 	.bind_tcf	=	sfq_bind,
913 	.unbind_tcf	=	sfq_unbind,
914 	.dump		=	sfq_dump_class,
915 	.dump_stats	=	sfq_dump_class_stats,
916 	.walk		=	sfq_walk,
917 };
918 
919 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
920 	.cl_ops		=	&sfq_class_ops,
921 	.id		=	"sfq",
922 	.priv_size	=	sizeof(struct sfq_sched_data),
923 	.enqueue	=	sfq_enqueue,
924 	.dequeue	=	sfq_dequeue,
925 	.peek		=	qdisc_peek_dequeued,
926 	.init		=	sfq_init,
927 	.reset		=	sfq_reset,
928 	.destroy	=	sfq_destroy,
929 	.change		=	NULL,
930 	.dump		=	sfq_dump,
931 	.owner		=	THIS_MODULE,
932 };
933 MODULE_ALIAS_NET_SCH("sfq");
934 
935 static int __init sfq_module_init(void)
936 {
937 	return register_qdisc(&sfq_qdisc_ops);
938 }
939 static void __exit sfq_module_exit(void)
940 {
941 	unregister_qdisc(&sfq_qdisc_ops);
942 }
943 module_init(sfq_module_init)
944 module_exit(sfq_module_exit)
945 MODULE_LICENSE("GPL");
946 MODULE_DESCRIPTION("Stochastic Fairness qdisc");
947