xref: /linux/net/sched/sch_sfq.c (revision 13d68a16430312fc21990f48326366eb73891202)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  */
7 
8 #include <linux/module.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/jiffies.h>
12 #include <linux/string.h>
13 #include <linux/in.h>
14 #include <linux/errno.h>
15 #include <linux/init.h>
16 #include <linux/skbuff.h>
17 #include <linux/siphash.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 #include <net/pkt_cls.h>
23 #include <net/red.h>
24 
25 
26 /*	Stochastic Fairness Queuing algorithm.
27 	=======================================
28 
29 	Source:
30 	Paul E. McKenney "Stochastic Fairness Queuing",
31 	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
32 
33 	Paul E. McKenney "Stochastic Fairness Queuing",
34 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
35 
36 
37 	See also:
38 	M. Shreedhar and George Varghese "Efficient Fair
39 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
40 
41 
42 	This is not the thing that is usually called (W)FQ nowadays.
43 	It does not use any timestamp mechanism, but instead
44 	processes queues in round-robin order.
45 
46 	ADVANTAGE:
47 
48 	- It is very cheap. Both CPU and memory requirements are minimal.
49 
50 	DRAWBACKS:
51 
52 	- "Stochastic" -> It is not 100% fair.
53 	When hash collisions occur, several flows are considered as one.
54 
55 	- "Round-robin" -> It introduces larger delays than virtual clock
56 	based schemes, and should not be used for isolating interactive
57 	traffic	from non-interactive. It means, that this scheduler
58 	should be used as leaf of CBQ or P3, which put interactive traffic
59 	to higher priority band.
60 
61 	We still need true WFQ for top level CSZ, but using WFQ
62 	for the best effort traffic is absolutely pointless:
63 	SFQ is superior for this purpose.
64 
65 	IMPLEMENTATION:
66 	This implementation limits :
67 	- maximal queue length per flow to 127 packets.
68 	- max mtu to 2^18-1;
69 	- max 65408 flows,
70 	- number of hash buckets to 65536.
71 
72 	It is easy to increase these values, but not in flight.  */
73 
74 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
75 #define SFQ_DEFAULT_FLOWS	128
76 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
77 #define SFQ_EMPTY_SLOT		0xffff
78 #define SFQ_DEFAULT_HASH_DIVISOR 1024
79 
80 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
81 typedef u16 sfq_index;
82 
83 /*
84  * We dont use pointers to save space.
85  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
86  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
87  * are 'pointers' to dep[] array
88  */
89 struct sfq_head {
90 	sfq_index	next;
91 	sfq_index	prev;
92 };
93 
94 struct sfq_slot {
95 	struct sk_buff	*skblist_next;
96 	struct sk_buff	*skblist_prev;
97 	sfq_index	qlen; /* number of skbs in skblist */
98 	sfq_index	next; /* next slot in sfq RR chain */
99 	struct sfq_head dep; /* anchor in dep[] chains */
100 	unsigned short	hash; /* hash value (index in ht[]) */
101 	int		allot; /* credit for this slot */
102 
103 	unsigned int    backlog;
104 	struct red_vars vars;
105 };
106 
107 struct sfq_sched_data {
108 /* frequently used fields */
109 	int		limit;		/* limit of total number of packets in this qdisc */
110 	unsigned int	divisor;	/* number of slots in hash table */
111 	u8		headdrop;
112 	u8		maxdepth;	/* limit of packets per flow */
113 
114 	siphash_key_t 	perturbation;
115 	u8		cur_depth;	/* depth of longest slot */
116 	u8		flags;
117 	struct tcf_proto __rcu *filter_list;
118 	struct tcf_block *block;
119 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
120 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
121 
122 	struct red_parms *red_parms;
123 	struct tc_sfqred_stats stats;
124 	struct sfq_slot *tail;		/* current slot in round */
125 
126 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
127 					/* Linked lists of slots, indexed by depth
128 					 * dep[0] : list of unused flows
129 					 * dep[1] : list of flows with 1 packet
130 					 * dep[X] : list of flows with X packets
131 					 */
132 
133 	unsigned int	maxflows;	/* number of flows in flows array */
134 	int		perturb_period;
135 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
136 	struct timer_list perturb_timer;
137 	struct Qdisc	*sch;
138 };
139 
140 /*
141  * sfq_head are either in a sfq_slot or in dep[] array
142  */
143 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
144 {
145 	if (val < SFQ_MAX_FLOWS)
146 		return &q->slots[val].dep;
147 	return &q->dep[val - SFQ_MAX_FLOWS];
148 }
149 
150 static unsigned int sfq_hash(const struct sfq_sched_data *q,
151 			     const struct sk_buff *skb)
152 {
153 	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
154 }
155 
156 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
157 				 int *qerr)
158 {
159 	struct sfq_sched_data *q = qdisc_priv(sch);
160 	struct tcf_result res;
161 	struct tcf_proto *fl;
162 	int result;
163 
164 	if (TC_H_MAJ(skb->priority) == sch->handle &&
165 	    TC_H_MIN(skb->priority) > 0 &&
166 	    TC_H_MIN(skb->priority) <= q->divisor)
167 		return TC_H_MIN(skb->priority);
168 
169 	fl = rcu_dereference_bh(q->filter_list);
170 	if (!fl)
171 		return sfq_hash(q, skb) + 1;
172 
173 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
174 	result = tcf_classify(skb, NULL, fl, &res, false);
175 	if (result >= 0) {
176 #ifdef CONFIG_NET_CLS_ACT
177 		switch (result) {
178 		case TC_ACT_STOLEN:
179 		case TC_ACT_QUEUED:
180 		case TC_ACT_TRAP:
181 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
182 			fallthrough;
183 		case TC_ACT_SHOT:
184 			return 0;
185 		}
186 #endif
187 		if (TC_H_MIN(res.classid) <= q->divisor)
188 			return TC_H_MIN(res.classid);
189 	}
190 	return 0;
191 }
192 
193 /*
194  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
195  */
196 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
197 {
198 	sfq_index p, n;
199 	struct sfq_slot *slot = &q->slots[x];
200 	int qlen = slot->qlen;
201 
202 	p = qlen + SFQ_MAX_FLOWS;
203 	n = q->dep[qlen].next;
204 
205 	slot->dep.next = n;
206 	slot->dep.prev = p;
207 
208 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
209 	sfq_dep_head(q, n)->prev = x;
210 }
211 
212 #define sfq_unlink(q, x, n, p)			\
213 	do {					\
214 		n = q->slots[x].dep.next;	\
215 		p = q->slots[x].dep.prev;	\
216 		sfq_dep_head(q, p)->next = n;	\
217 		sfq_dep_head(q, n)->prev = p;	\
218 	} while (0)
219 
220 
221 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
222 {
223 	sfq_index p, n;
224 	int d;
225 
226 	sfq_unlink(q, x, n, p);
227 
228 	d = q->slots[x].qlen--;
229 	if (n == p && q->cur_depth == d)
230 		q->cur_depth--;
231 	sfq_link(q, x);
232 }
233 
234 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
235 {
236 	sfq_index p, n;
237 	int d;
238 
239 	sfq_unlink(q, x, n, p);
240 
241 	d = ++q->slots[x].qlen;
242 	if (q->cur_depth < d)
243 		q->cur_depth = d;
244 	sfq_link(q, x);
245 }
246 
247 /* helper functions : might be changed when/if skb use a standard list_head */
248 
249 /* remove one skb from tail of slot queue */
250 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
251 {
252 	struct sk_buff *skb = slot->skblist_prev;
253 
254 	slot->skblist_prev = skb->prev;
255 	skb->prev->next = (struct sk_buff *)slot;
256 	skb->next = skb->prev = NULL;
257 	return skb;
258 }
259 
260 /* remove one skb from head of slot queue */
261 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
262 {
263 	struct sk_buff *skb = slot->skblist_next;
264 
265 	slot->skblist_next = skb->next;
266 	skb->next->prev = (struct sk_buff *)slot;
267 	skb->next = skb->prev = NULL;
268 	return skb;
269 }
270 
271 static inline void slot_queue_init(struct sfq_slot *slot)
272 {
273 	memset(slot, 0, sizeof(*slot));
274 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
275 }
276 
277 /* add skb to slot queue (tail add) */
278 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
279 {
280 	skb->prev = slot->skblist_prev;
281 	skb->next = (struct sk_buff *)slot;
282 	slot->skblist_prev->next = skb;
283 	slot->skblist_prev = skb;
284 }
285 
286 static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
287 {
288 	struct sfq_sched_data *q = qdisc_priv(sch);
289 	sfq_index x, d = q->cur_depth;
290 	struct sk_buff *skb;
291 	unsigned int len;
292 	struct sfq_slot *slot;
293 
294 	/* Queue is full! Find the longest slot and drop tail packet from it */
295 	if (d > 1) {
296 		x = q->dep[d].next;
297 		slot = &q->slots[x];
298 drop:
299 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
300 		len = qdisc_pkt_len(skb);
301 		slot->backlog -= len;
302 		sfq_dec(q, x);
303 		sch->q.qlen--;
304 		qdisc_qstats_backlog_dec(sch, skb);
305 		qdisc_drop(skb, sch, to_free);
306 		return len;
307 	}
308 
309 	if (d == 1) {
310 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
311 		x = q->tail->next;
312 		slot = &q->slots[x];
313 		q->tail->next = slot->next;
314 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
315 		goto drop;
316 	}
317 
318 	return 0;
319 }
320 
321 /* Is ECN parameter configured */
322 static int sfq_prob_mark(const struct sfq_sched_data *q)
323 {
324 	return q->flags & TC_RED_ECN;
325 }
326 
327 /* Should packets over max threshold just be marked */
328 static int sfq_hard_mark(const struct sfq_sched_data *q)
329 {
330 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
331 }
332 
333 static int sfq_headdrop(const struct sfq_sched_data *q)
334 {
335 	return q->headdrop;
336 }
337 
338 static int
339 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
340 {
341 	struct sfq_sched_data *q = qdisc_priv(sch);
342 	unsigned int hash, dropped;
343 	sfq_index x, qlen;
344 	struct sfq_slot *slot;
345 	int ret;
346 	struct sk_buff *head;
347 	int delta;
348 
349 	hash = sfq_classify(skb, sch, &ret);
350 	if (hash == 0) {
351 		if (ret & __NET_XMIT_BYPASS)
352 			qdisc_qstats_drop(sch);
353 		__qdisc_drop(skb, to_free);
354 		return ret;
355 	}
356 	hash--;
357 
358 	x = q->ht[hash];
359 	slot = &q->slots[x];
360 	if (x == SFQ_EMPTY_SLOT) {
361 		x = q->dep[0].next; /* get a free slot */
362 		if (x >= SFQ_MAX_FLOWS)
363 			return qdisc_drop(skb, sch, to_free);
364 		q->ht[hash] = x;
365 		slot = &q->slots[x];
366 		slot->hash = hash;
367 		slot->backlog = 0; /* should already be 0 anyway... */
368 		red_set_vars(&slot->vars);
369 		goto enqueue;
370 	}
371 	if (q->red_parms) {
372 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
373 							&slot->vars,
374 							slot->backlog);
375 		switch (red_action(q->red_parms,
376 				   &slot->vars,
377 				   slot->vars.qavg)) {
378 		case RED_DONT_MARK:
379 			break;
380 
381 		case RED_PROB_MARK:
382 			qdisc_qstats_overlimit(sch);
383 			if (sfq_prob_mark(q)) {
384 				/* We know we have at least one packet in queue */
385 				if (sfq_headdrop(q) &&
386 				    INET_ECN_set_ce(slot->skblist_next)) {
387 					q->stats.prob_mark_head++;
388 					break;
389 				}
390 				if (INET_ECN_set_ce(skb)) {
391 					q->stats.prob_mark++;
392 					break;
393 				}
394 			}
395 			q->stats.prob_drop++;
396 			goto congestion_drop;
397 
398 		case RED_HARD_MARK:
399 			qdisc_qstats_overlimit(sch);
400 			if (sfq_hard_mark(q)) {
401 				/* We know we have at least one packet in queue */
402 				if (sfq_headdrop(q) &&
403 				    INET_ECN_set_ce(slot->skblist_next)) {
404 					q->stats.forced_mark_head++;
405 					break;
406 				}
407 				if (INET_ECN_set_ce(skb)) {
408 					q->stats.forced_mark++;
409 					break;
410 				}
411 			}
412 			q->stats.forced_drop++;
413 			goto congestion_drop;
414 		}
415 	}
416 
417 	if (slot->qlen >= q->maxdepth) {
418 congestion_drop:
419 		if (!sfq_headdrop(q))
420 			return qdisc_drop(skb, sch, to_free);
421 
422 		/* We know we have at least one packet in queue */
423 		head = slot_dequeue_head(slot);
424 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
425 		sch->qstats.backlog -= delta;
426 		slot->backlog -= delta;
427 		qdisc_drop(head, sch, to_free);
428 
429 		slot_queue_add(slot, skb);
430 		qdisc_tree_reduce_backlog(sch, 0, delta);
431 		return NET_XMIT_CN;
432 	}
433 
434 enqueue:
435 	qdisc_qstats_backlog_inc(sch, skb);
436 	slot->backlog += qdisc_pkt_len(skb);
437 	slot_queue_add(slot, skb);
438 	sfq_inc(q, x);
439 	if (slot->qlen == 1) {		/* The flow is new */
440 		if (q->tail == NULL) {	/* It is the first flow */
441 			slot->next = x;
442 		} else {
443 			slot->next = q->tail->next;
444 			q->tail->next = x;
445 		}
446 		/* We put this flow at the end of our flow list.
447 		 * This might sound unfair for a new flow to wait after old ones,
448 		 * but we could endup servicing new flows only, and freeze old ones.
449 		 */
450 		q->tail = slot;
451 		/* We could use a bigger initial quantum for new flows */
452 		slot->allot = q->quantum;
453 	}
454 	if (++sch->q.qlen <= q->limit)
455 		return NET_XMIT_SUCCESS;
456 
457 	qlen = slot->qlen;
458 	dropped = sfq_drop(sch, to_free);
459 	/* Return Congestion Notification only if we dropped a packet
460 	 * from this flow.
461 	 */
462 	if (qlen != slot->qlen) {
463 		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
464 		return NET_XMIT_CN;
465 	}
466 
467 	/* As we dropped a packet, better let upper stack know this */
468 	qdisc_tree_reduce_backlog(sch, 1, dropped);
469 	return NET_XMIT_SUCCESS;
470 }
471 
472 static struct sk_buff *
473 sfq_dequeue(struct Qdisc *sch)
474 {
475 	struct sfq_sched_data *q = qdisc_priv(sch);
476 	struct sk_buff *skb;
477 	sfq_index a, next_a;
478 	struct sfq_slot *slot;
479 
480 	/* No active slots */
481 	if (q->tail == NULL)
482 		return NULL;
483 
484 next_slot:
485 	a = q->tail->next;
486 	slot = &q->slots[a];
487 	if (slot->allot <= 0) {
488 		q->tail = slot;
489 		slot->allot += q->quantum;
490 		goto next_slot;
491 	}
492 	skb = slot_dequeue_head(slot);
493 	sfq_dec(q, a);
494 	qdisc_bstats_update(sch, skb);
495 	sch->q.qlen--;
496 	qdisc_qstats_backlog_dec(sch, skb);
497 	slot->backlog -= qdisc_pkt_len(skb);
498 	/* Is the slot empty? */
499 	if (slot->qlen == 0) {
500 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
501 		next_a = slot->next;
502 		if (a == next_a) {
503 			q->tail = NULL; /* no more active slots */
504 			return skb;
505 		}
506 		q->tail->next = next_a;
507 	} else {
508 		slot->allot -= qdisc_pkt_len(skb);
509 	}
510 	return skb;
511 }
512 
513 static void
514 sfq_reset(struct Qdisc *sch)
515 {
516 	struct sk_buff *skb;
517 
518 	while ((skb = sfq_dequeue(sch)) != NULL)
519 		rtnl_kfree_skbs(skb, skb);
520 }
521 
522 /*
523  * When q->perturbation is changed, we rehash all queued skbs
524  * to avoid OOO (Out Of Order) effects.
525  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
526  * counters.
527  */
528 static void sfq_rehash(struct Qdisc *sch)
529 {
530 	struct sfq_sched_data *q = qdisc_priv(sch);
531 	struct sk_buff *skb;
532 	int i;
533 	struct sfq_slot *slot;
534 	struct sk_buff_head list;
535 	int dropped = 0;
536 	unsigned int drop_len = 0;
537 
538 	__skb_queue_head_init(&list);
539 
540 	for (i = 0; i < q->maxflows; i++) {
541 		slot = &q->slots[i];
542 		if (!slot->qlen)
543 			continue;
544 		while (slot->qlen) {
545 			skb = slot_dequeue_head(slot);
546 			sfq_dec(q, i);
547 			__skb_queue_tail(&list, skb);
548 		}
549 		slot->backlog = 0;
550 		red_set_vars(&slot->vars);
551 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
552 	}
553 	q->tail = NULL;
554 
555 	while ((skb = __skb_dequeue(&list)) != NULL) {
556 		unsigned int hash = sfq_hash(q, skb);
557 		sfq_index x = q->ht[hash];
558 
559 		slot = &q->slots[x];
560 		if (x == SFQ_EMPTY_SLOT) {
561 			x = q->dep[0].next; /* get a free slot */
562 			if (x >= SFQ_MAX_FLOWS) {
563 drop:
564 				qdisc_qstats_backlog_dec(sch, skb);
565 				drop_len += qdisc_pkt_len(skb);
566 				kfree_skb(skb);
567 				dropped++;
568 				continue;
569 			}
570 			q->ht[hash] = x;
571 			slot = &q->slots[x];
572 			slot->hash = hash;
573 		}
574 		if (slot->qlen >= q->maxdepth)
575 			goto drop;
576 		slot_queue_add(slot, skb);
577 		if (q->red_parms)
578 			slot->vars.qavg = red_calc_qavg(q->red_parms,
579 							&slot->vars,
580 							slot->backlog);
581 		slot->backlog += qdisc_pkt_len(skb);
582 		sfq_inc(q, x);
583 		if (slot->qlen == 1) {		/* The flow is new */
584 			if (q->tail == NULL) {	/* It is the first flow */
585 				slot->next = x;
586 			} else {
587 				slot->next = q->tail->next;
588 				q->tail->next = x;
589 			}
590 			q->tail = slot;
591 			slot->allot = q->quantum;
592 		}
593 	}
594 	sch->q.qlen -= dropped;
595 	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
596 }
597 
598 static void sfq_perturbation(struct timer_list *t)
599 {
600 	struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
601 	struct Qdisc *sch = q->sch;
602 	spinlock_t *root_lock;
603 	siphash_key_t nkey;
604 	int period;
605 
606 	get_random_bytes(&nkey, sizeof(nkey));
607 	rcu_read_lock();
608 	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
609 	spin_lock(root_lock);
610 	q->perturbation = nkey;
611 	if (!q->filter_list && q->tail)
612 		sfq_rehash(sch);
613 	spin_unlock(root_lock);
614 
615 	/* q->perturb_period can change under us from
616 	 * sfq_change() and sfq_destroy().
617 	 */
618 	period = READ_ONCE(q->perturb_period);
619 	if (period)
620 		mod_timer(&q->perturb_timer, jiffies + period);
621 	rcu_read_unlock();
622 }
623 
624 static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
625 		      struct netlink_ext_ack *extack)
626 {
627 	struct sfq_sched_data *q = qdisc_priv(sch);
628 	struct tc_sfq_qopt *ctl = nla_data(opt);
629 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
630 	unsigned int qlen, dropped = 0;
631 	struct red_parms *p = NULL;
632 	struct sk_buff *to_free = NULL;
633 	struct sk_buff *tail = NULL;
634 
635 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
636 		return -EINVAL;
637 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
638 		ctl_v1 = nla_data(opt);
639 	if (ctl->divisor &&
640 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
641 		return -EINVAL;
642 
643 	if ((int)ctl->quantum < 0) {
644 		NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
645 		return -EINVAL;
646 	}
647 	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
648 					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
649 		return -EINVAL;
650 	if (ctl_v1 && ctl_v1->qth_min) {
651 		p = kmalloc(sizeof(*p), GFP_KERNEL);
652 		if (!p)
653 			return -ENOMEM;
654 	}
655 	sch_tree_lock(sch);
656 	if (ctl->quantum)
657 		q->quantum = ctl->quantum;
658 	WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ);
659 	if (ctl->flows)
660 		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
661 	if (ctl->divisor) {
662 		q->divisor = ctl->divisor;
663 		q->maxflows = min_t(u32, q->maxflows, q->divisor);
664 	}
665 	if (ctl_v1) {
666 		if (ctl_v1->depth)
667 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
668 		if (p) {
669 			swap(q->red_parms, p);
670 			red_set_parms(q->red_parms,
671 				      ctl_v1->qth_min, ctl_v1->qth_max,
672 				      ctl_v1->Wlog,
673 				      ctl_v1->Plog, ctl_v1->Scell_log,
674 				      NULL,
675 				      ctl_v1->max_P);
676 		}
677 		q->flags = ctl_v1->flags;
678 		q->headdrop = ctl_v1->headdrop;
679 	}
680 	if (ctl->limit) {
681 		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
682 		q->maxflows = min_t(u32, q->maxflows, q->limit);
683 	}
684 
685 	qlen = sch->q.qlen;
686 	while (sch->q.qlen > q->limit) {
687 		dropped += sfq_drop(sch, &to_free);
688 		if (!tail)
689 			tail = to_free;
690 	}
691 
692 	rtnl_kfree_skbs(to_free, tail);
693 	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
694 
695 	del_timer(&q->perturb_timer);
696 	if (q->perturb_period) {
697 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
698 		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
699 	}
700 	sch_tree_unlock(sch);
701 	kfree(p);
702 	return 0;
703 }
704 
705 static void *sfq_alloc(size_t sz)
706 {
707 	return  kvmalloc(sz, GFP_KERNEL);
708 }
709 
710 static void sfq_free(void *addr)
711 {
712 	kvfree(addr);
713 }
714 
715 static void sfq_destroy(struct Qdisc *sch)
716 {
717 	struct sfq_sched_data *q = qdisc_priv(sch);
718 
719 	tcf_block_put(q->block);
720 	WRITE_ONCE(q->perturb_period, 0);
721 	del_timer_sync(&q->perturb_timer);
722 	sfq_free(q->ht);
723 	sfq_free(q->slots);
724 	kfree(q->red_parms);
725 }
726 
727 static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
728 		    struct netlink_ext_ack *extack)
729 {
730 	struct sfq_sched_data *q = qdisc_priv(sch);
731 	int i;
732 	int err;
733 
734 	q->sch = sch;
735 	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
736 
737 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
738 	if (err)
739 		return err;
740 
741 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
742 		q->dep[i].next = i + SFQ_MAX_FLOWS;
743 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
744 	}
745 
746 	q->limit = SFQ_MAX_DEPTH;
747 	q->maxdepth = SFQ_MAX_DEPTH;
748 	q->cur_depth = 0;
749 	q->tail = NULL;
750 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
751 	q->maxflows = SFQ_DEFAULT_FLOWS;
752 	q->quantum = psched_mtu(qdisc_dev(sch));
753 	q->perturb_period = 0;
754 	get_random_bytes(&q->perturbation, sizeof(q->perturbation));
755 
756 	if (opt) {
757 		int err = sfq_change(sch, opt, extack);
758 		if (err)
759 			return err;
760 	}
761 
762 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
763 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
764 	if (!q->ht || !q->slots) {
765 		/* Note: sfq_destroy() will be called by our caller */
766 		return -ENOMEM;
767 	}
768 
769 	for (i = 0; i < q->divisor; i++)
770 		q->ht[i] = SFQ_EMPTY_SLOT;
771 
772 	for (i = 0; i < q->maxflows; i++) {
773 		slot_queue_init(&q->slots[i]);
774 		sfq_link(q, i);
775 	}
776 	if (q->limit >= 1)
777 		sch->flags |= TCQ_F_CAN_BYPASS;
778 	else
779 		sch->flags &= ~TCQ_F_CAN_BYPASS;
780 	return 0;
781 }
782 
783 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
784 {
785 	struct sfq_sched_data *q = qdisc_priv(sch);
786 	unsigned char *b = skb_tail_pointer(skb);
787 	struct tc_sfq_qopt_v1 opt;
788 	struct red_parms *p = q->red_parms;
789 
790 	memset(&opt, 0, sizeof(opt));
791 	opt.v0.quantum	= q->quantum;
792 	opt.v0.perturb_period = q->perturb_period / HZ;
793 	opt.v0.limit	= q->limit;
794 	opt.v0.divisor	= q->divisor;
795 	opt.v0.flows	= q->maxflows;
796 	opt.depth	= q->maxdepth;
797 	opt.headdrop	= q->headdrop;
798 
799 	if (p) {
800 		opt.qth_min	= p->qth_min >> p->Wlog;
801 		opt.qth_max	= p->qth_max >> p->Wlog;
802 		opt.Wlog	= p->Wlog;
803 		opt.Plog	= p->Plog;
804 		opt.Scell_log	= p->Scell_log;
805 		opt.max_P	= p->max_P;
806 	}
807 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
808 	opt.flags	= q->flags;
809 
810 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
811 		goto nla_put_failure;
812 
813 	return skb->len;
814 
815 nla_put_failure:
816 	nlmsg_trim(skb, b);
817 	return -1;
818 }
819 
820 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
821 {
822 	return NULL;
823 }
824 
825 static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
826 {
827 	return 0;
828 }
829 
830 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
831 			      u32 classid)
832 {
833 	return 0;
834 }
835 
836 static void sfq_unbind(struct Qdisc *q, unsigned long cl)
837 {
838 }
839 
840 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
841 				       struct netlink_ext_ack *extack)
842 {
843 	struct sfq_sched_data *q = qdisc_priv(sch);
844 
845 	if (cl)
846 		return NULL;
847 	return q->block;
848 }
849 
850 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
851 			  struct sk_buff *skb, struct tcmsg *tcm)
852 {
853 	tcm->tcm_handle |= TC_H_MIN(cl);
854 	return 0;
855 }
856 
857 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
858 				struct gnet_dump *d)
859 {
860 	struct sfq_sched_data *q = qdisc_priv(sch);
861 	sfq_index idx = q->ht[cl - 1];
862 	struct gnet_stats_queue qs = { 0 };
863 	struct tc_sfq_xstats xstats = { 0 };
864 
865 	if (idx != SFQ_EMPTY_SLOT) {
866 		const struct sfq_slot *slot = &q->slots[idx];
867 
868 		xstats.allot = slot->allot;
869 		qs.qlen = slot->qlen;
870 		qs.backlog = slot->backlog;
871 	}
872 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
873 		return -1;
874 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
875 }
876 
877 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
878 {
879 	struct sfq_sched_data *q = qdisc_priv(sch);
880 	unsigned int i;
881 
882 	if (arg->stop)
883 		return;
884 
885 	for (i = 0; i < q->divisor; i++) {
886 		if (q->ht[i] == SFQ_EMPTY_SLOT) {
887 			arg->count++;
888 			continue;
889 		}
890 		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
891 			break;
892 	}
893 }
894 
895 static const struct Qdisc_class_ops sfq_class_ops = {
896 	.leaf		=	sfq_leaf,
897 	.find		=	sfq_find,
898 	.tcf_block	=	sfq_tcf_block,
899 	.bind_tcf	=	sfq_bind,
900 	.unbind_tcf	=	sfq_unbind,
901 	.dump		=	sfq_dump_class,
902 	.dump_stats	=	sfq_dump_class_stats,
903 	.walk		=	sfq_walk,
904 };
905 
906 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
907 	.cl_ops		=	&sfq_class_ops,
908 	.id		=	"sfq",
909 	.priv_size	=	sizeof(struct sfq_sched_data),
910 	.enqueue	=	sfq_enqueue,
911 	.dequeue	=	sfq_dequeue,
912 	.peek		=	qdisc_peek_dequeued,
913 	.init		=	sfq_init,
914 	.reset		=	sfq_reset,
915 	.destroy	=	sfq_destroy,
916 	.change		=	NULL,
917 	.dump		=	sfq_dump,
918 	.owner		=	THIS_MODULE,
919 };
920 MODULE_ALIAS_NET_SCH("sfq");
921 
922 static int __init sfq_module_init(void)
923 {
924 	return register_qdisc(&sfq_qdisc_ops);
925 }
926 static void __exit sfq_module_exit(void)
927 {
928 	unregister_qdisc(&sfq_qdisc_ops);
929 }
930 module_init(sfq_module_init)
931 module_exit(sfq_module_exit)
932 MODULE_LICENSE("GPL");
933 MODULE_DESCRIPTION("Stochastic Fairness qdisc");
934