xref: /linux/net/sched/sch_sfq.c (revision 0e5b05f44d818d1efe5154832d2e3f3c44352c02)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  */
7 
8 #include <linux/module.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/jiffies.h>
12 #include <linux/string.h>
13 #include <linux/in.h>
14 #include <linux/errno.h>
15 #include <linux/init.h>
16 #include <linux/skbuff.h>
17 #include <linux/siphash.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 #include <net/pkt_cls.h>
23 #include <net/red.h>
24 
25 
26 /*	Stochastic Fairness Queuing algorithm.
27 	=======================================
28 
29 	Source:
30 	Paul E. McKenney "Stochastic Fairness Queuing",
31 	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
32 
33 	Paul E. McKenney "Stochastic Fairness Queuing",
34 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
35 
36 
37 	See also:
38 	M. Shreedhar and George Varghese "Efficient Fair
39 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
40 
41 
42 	This is not the thing that is usually called (W)FQ nowadays.
43 	It does not use any timestamp mechanism, but instead
44 	processes queues in round-robin order.
45 
46 	ADVANTAGE:
47 
48 	- It is very cheap. Both CPU and memory requirements are minimal.
49 
50 	DRAWBACKS:
51 
52 	- "Stochastic" -> It is not 100% fair.
53 	When hash collisions occur, several flows are considered as one.
54 
55 	- "Round-robin" -> It introduces larger delays than virtual clock
56 	based schemes, and should not be used for isolating interactive
57 	traffic	from non-interactive. It means, that this scheduler
58 	should be used as leaf of CBQ or P3, which put interactive traffic
59 	to higher priority band.
60 
61 	We still need true WFQ for top level CSZ, but using WFQ
62 	for the best effort traffic is absolutely pointless:
63 	SFQ is superior for this purpose.
64 
65 	IMPLEMENTATION:
66 	This implementation limits :
67 	- maximal queue length per flow to 127 packets.
68 	- max mtu to 2^18-1;
69 	- max 65408 flows,
70 	- number of hash buckets to 65536.
71 
72 	It is easy to increase these values, but not in flight.  */
73 
74 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
75 #define SFQ_DEFAULT_FLOWS	128
76 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
77 #define SFQ_EMPTY_SLOT		0xffff
78 #define SFQ_DEFAULT_HASH_DIVISOR 1024
79 
80 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
81 typedef u16 sfq_index;
82 
83 /*
84  * We dont use pointers to save space.
85  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
86  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
87  * are 'pointers' to dep[] array
88  */
89 struct sfq_head {
90 	sfq_index	next;
91 	sfq_index	prev;
92 };
93 
94 struct sfq_slot {
95 	struct sk_buff	*skblist_next;
96 	struct sk_buff	*skblist_prev;
97 	sfq_index	qlen; /* number of skbs in skblist */
98 	sfq_index	next; /* next slot in sfq RR chain */
99 	struct sfq_head dep; /* anchor in dep[] chains */
100 	unsigned short	hash; /* hash value (index in ht[]) */
101 	int		allot; /* credit for this slot */
102 
103 	unsigned int    backlog;
104 	struct red_vars vars;
105 };
106 
107 struct sfq_sched_data {
108 /* frequently used fields */
109 	int		limit;		/* limit of total number of packets in this qdisc */
110 	unsigned int	divisor;	/* number of slots in hash table */
111 	u8		headdrop;
112 	u8		maxdepth;	/* limit of packets per flow */
113 
114 	siphash_key_t 	perturbation;
115 	u8		cur_depth;	/* depth of longest slot */
116 	u8		flags;
117 	struct tcf_proto __rcu *filter_list;
118 	struct tcf_block *block;
119 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
120 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
121 
122 	struct red_parms *red_parms;
123 	struct tc_sfqred_stats stats;
124 	struct sfq_slot *tail;		/* current slot in round */
125 
126 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
127 					/* Linked lists of slots, indexed by depth
128 					 * dep[0] : list of unused flows
129 					 * dep[1] : list of flows with 1 packet
130 					 * dep[X] : list of flows with X packets
131 					 */
132 
133 	unsigned int	maxflows;	/* number of flows in flows array */
134 	int		perturb_period;
135 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
136 	struct timer_list perturb_timer;
137 	struct Qdisc	*sch;
138 };
139 
140 /*
141  * sfq_head are either in a sfq_slot or in dep[] array
142  */
143 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
144 {
145 	if (val < SFQ_MAX_FLOWS)
146 		return &q->slots[val].dep;
147 	return &q->dep[val - SFQ_MAX_FLOWS];
148 }
149 
150 static unsigned int sfq_hash(const struct sfq_sched_data *q,
151 			     const struct sk_buff *skb)
152 {
153 	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
154 }
155 
156 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
157 				 int *qerr)
158 {
159 	struct sfq_sched_data *q = qdisc_priv(sch);
160 	struct tcf_result res;
161 	struct tcf_proto *fl;
162 	int result;
163 
164 	if (TC_H_MAJ(skb->priority) == sch->handle &&
165 	    TC_H_MIN(skb->priority) > 0 &&
166 	    TC_H_MIN(skb->priority) <= q->divisor)
167 		return TC_H_MIN(skb->priority);
168 
169 	fl = rcu_dereference_bh(q->filter_list);
170 	if (!fl)
171 		return sfq_hash(q, skb) + 1;
172 
173 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
174 	result = tcf_classify(skb, NULL, fl, &res, false);
175 	if (result >= 0) {
176 #ifdef CONFIG_NET_CLS_ACT
177 		switch (result) {
178 		case TC_ACT_STOLEN:
179 		case TC_ACT_QUEUED:
180 		case TC_ACT_TRAP:
181 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
182 			fallthrough;
183 		case TC_ACT_SHOT:
184 			return 0;
185 		}
186 #endif
187 		if (TC_H_MIN(res.classid) <= q->divisor)
188 			return TC_H_MIN(res.classid);
189 	}
190 	return 0;
191 }
192 
193 /*
194  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
195  */
196 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
197 {
198 	sfq_index p, n;
199 	struct sfq_slot *slot = &q->slots[x];
200 	int qlen = slot->qlen;
201 
202 	p = qlen + SFQ_MAX_FLOWS;
203 	n = q->dep[qlen].next;
204 
205 	slot->dep.next = n;
206 	slot->dep.prev = p;
207 
208 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
209 	sfq_dep_head(q, n)->prev = x;
210 }
211 
212 #define sfq_unlink(q, x, n, p)			\
213 	do {					\
214 		n = q->slots[x].dep.next;	\
215 		p = q->slots[x].dep.prev;	\
216 		sfq_dep_head(q, p)->next = n;	\
217 		sfq_dep_head(q, n)->prev = p;	\
218 	} while (0)
219 
220 
221 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
222 {
223 	sfq_index p, n;
224 	int d;
225 
226 	sfq_unlink(q, x, n, p);
227 
228 	d = q->slots[x].qlen;
229 	WRITE_ONCE(q->slots[x].qlen, d - 1);
230 	if (n == p && q->cur_depth == d)
231 		q->cur_depth--;
232 	sfq_link(q, x);
233 }
234 
235 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
236 {
237 	sfq_index p, n;
238 	int d;
239 
240 	sfq_unlink(q, x, n, p);
241 
242 	d = q->slots[x].qlen + 1;
243 	WRITE_ONCE(q->slots[x].qlen, d);
244 	if (q->cur_depth < d)
245 		q->cur_depth = d;
246 	sfq_link(q, x);
247 }
248 
249 /* helper functions : might be changed when/if skb use a standard list_head */
250 
251 /* remove one skb from tail of slot queue */
252 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
253 {
254 	struct sk_buff *skb = slot->skblist_prev;
255 
256 	slot->skblist_prev = skb->prev;
257 	skb->prev->next = (struct sk_buff *)slot;
258 	skb->next = skb->prev = NULL;
259 	return skb;
260 }
261 
262 /* remove one skb from head of slot queue */
263 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
264 {
265 	struct sk_buff *skb = slot->skblist_next;
266 
267 	slot->skblist_next = skb->next;
268 	skb->next->prev = (struct sk_buff *)slot;
269 	skb->next = skb->prev = NULL;
270 	return skb;
271 }
272 
273 static inline void slot_queue_init(struct sfq_slot *slot)
274 {
275 	memset(slot, 0, sizeof(*slot));
276 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
277 }
278 
279 /* add skb to slot queue (tail add) */
280 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
281 {
282 	skb->prev = slot->skblist_prev;
283 	skb->next = (struct sk_buff *)slot;
284 	slot->skblist_prev->next = skb;
285 	slot->skblist_prev = skb;
286 }
287 
288 static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
289 {
290 	struct sfq_sched_data *q = qdisc_priv(sch);
291 	sfq_index x, d = q->cur_depth;
292 	struct sk_buff *skb;
293 	unsigned int len;
294 	struct sfq_slot *slot;
295 
296 	/* Queue is full! Find the longest slot and drop tail packet from it */
297 	if (d > 1) {
298 		x = q->dep[d].next;
299 		slot = &q->slots[x];
300 drop:
301 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
302 		len = qdisc_pkt_len(skb);
303 		WRITE_ONCE(slot->backlog, slot->backlog - len);
304 		sfq_dec(q, x);
305 		qdisc_qlen_dec(sch);
306 		qdisc_qstats_backlog_dec(sch, skb);
307 		qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT);
308 		return len;
309 	}
310 
311 	if (d == 1) {
312 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
313 		x = q->tail->next;
314 		slot = &q->slots[x];
315 		if (slot->next == x)
316 			q->tail = NULL; /* no more active slots */
317 		else
318 			q->tail->next = slot->next;
319 		WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
320 		goto drop;
321 	}
322 
323 	return 0;
324 }
325 
326 /* Is ECN parameter configured */
327 static int sfq_prob_mark(const struct sfq_sched_data *q)
328 {
329 	return q->flags & TC_RED_ECN;
330 }
331 
332 /* Should packets over max threshold just be marked */
333 static int sfq_hard_mark(const struct sfq_sched_data *q)
334 {
335 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
336 }
337 
338 static int sfq_headdrop(const struct sfq_sched_data *q)
339 {
340 	return q->headdrop;
341 }
342 
343 static int
344 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
345 {
346 	struct sfq_sched_data *q = qdisc_priv(sch);
347 	unsigned int hash, dropped;
348 	sfq_index x, qlen;
349 	struct sfq_slot *slot;
350 	int ret;
351 	struct sk_buff *head;
352 	int delta;
353 
354 	hash = sfq_classify(skb, sch, &ret);
355 	if (hash == 0) {
356 		if (ret & __NET_XMIT_BYPASS)
357 			qdisc_qstats_drop(sch);
358 		__qdisc_drop(skb, to_free);
359 		return ret;
360 	}
361 	hash--;
362 
363 	x = q->ht[hash];
364 	slot = &q->slots[x];
365 	if (x == SFQ_EMPTY_SLOT) {
366 		x = q->dep[0].next; /* get a free slot */
367 		if (x >= SFQ_MAX_FLOWS)
368 			return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_MAXFLOWS);
369 		WRITE_ONCE(q->ht[hash], x);
370 		slot = &q->slots[x];
371 		slot->hash = hash;
372 		WRITE_ONCE(slot->backlog, 0); /* should already be 0 anyway... */
373 		red_set_vars(&slot->vars);
374 		goto enqueue;
375 	}
376 	if (q->red_parms) {
377 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
378 							&slot->vars,
379 							slot->backlog);
380 		switch (red_action(q->red_parms,
381 				   &slot->vars,
382 				   slot->vars.qavg)) {
383 		case RED_DONT_MARK:
384 			break;
385 
386 		case RED_PROB_MARK:
387 			qdisc_qstats_overlimit(sch);
388 			if (sfq_prob_mark(q)) {
389 				/* We know we have at least one packet in queue */
390 				if (sfq_headdrop(q) &&
391 				    INET_ECN_set_ce(slot->skblist_next)) {
392 					q->stats.prob_mark_head++;
393 					break;
394 				}
395 				if (INET_ECN_set_ce(skb)) {
396 					q->stats.prob_mark++;
397 					break;
398 				}
399 			}
400 			q->stats.prob_drop++;
401 			goto congestion_drop;
402 
403 		case RED_HARD_MARK:
404 			qdisc_qstats_overlimit(sch);
405 			if (sfq_hard_mark(q)) {
406 				/* We know we have at least one packet in queue */
407 				if (sfq_headdrop(q) &&
408 				    INET_ECN_set_ce(slot->skblist_next)) {
409 					q->stats.forced_mark_head++;
410 					break;
411 				}
412 				if (INET_ECN_set_ce(skb)) {
413 					q->stats.forced_mark++;
414 					break;
415 				}
416 			}
417 			q->stats.forced_drop++;
418 			goto congestion_drop;
419 		}
420 	}
421 
422 	if (slot->qlen >= q->maxdepth) {
423 congestion_drop:
424 		if (!sfq_headdrop(q))
425 			return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_FLOW_LIMIT);
426 
427 		/* We know we have at least one packet in queue */
428 		head = slot_dequeue_head(slot);
429 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
430 		qstats_backlog_sub(sch, delta);
431 		WRITE_ONCE(slot->backlog, slot->backlog - delta);
432 		qdisc_drop_reason(head, sch, to_free, QDISC_DROP_FLOW_LIMIT);
433 
434 		slot_queue_add(slot, skb);
435 		qdisc_tree_reduce_backlog(sch, 0, delta);
436 		return NET_XMIT_CN;
437 	}
438 
439 enqueue:
440 	qdisc_qstats_backlog_inc(sch, skb);
441 	WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb));
442 	slot_queue_add(slot, skb);
443 	sfq_inc(q, x);
444 	if (slot->qlen == 1) {		/* The flow is new */
445 		if (q->tail == NULL) {	/* It is the first flow */
446 			slot->next = x;
447 		} else {
448 			slot->next = q->tail->next;
449 			q->tail->next = x;
450 		}
451 		/* We put this flow at the end of our flow list.
452 		 * This might sound unfair for a new flow to wait after old ones,
453 		 * but we could endup servicing new flows only, and freeze old ones.
454 		 */
455 		q->tail = slot;
456 		/* We could use a bigger initial quantum for new flows */
457 		WRITE_ONCE(slot->allot, q->quantum);
458 	}
459 	qdisc_qlen_inc(sch);
460 	if (sch->q.qlen <= q->limit)
461 		return NET_XMIT_SUCCESS;
462 
463 	qlen = slot->qlen;
464 	dropped = sfq_drop(sch, to_free);
465 	/* Return Congestion Notification only if we dropped a packet
466 	 * from this flow.
467 	 */
468 	if (qlen != slot->qlen) {
469 		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
470 		return NET_XMIT_CN;
471 	}
472 
473 	/* As we dropped a packet, better let upper stack know this */
474 	qdisc_tree_reduce_backlog(sch, 1, dropped);
475 	return NET_XMIT_SUCCESS;
476 }
477 
478 static struct sk_buff *
479 sfq_dequeue(struct Qdisc *sch)
480 {
481 	struct sfq_sched_data *q = qdisc_priv(sch);
482 	struct sk_buff *skb;
483 	sfq_index a, next_a;
484 	struct sfq_slot *slot;
485 
486 	/* No active slots */
487 	if (q->tail == NULL)
488 		return NULL;
489 
490 next_slot:
491 	a = q->tail->next;
492 	slot = &q->slots[a];
493 	if (slot->allot <= 0) {
494 		q->tail = slot;
495 		WRITE_ONCE(slot->allot, slot->allot + q->quantum);
496 		goto next_slot;
497 	}
498 	skb = slot_dequeue_head(slot);
499 	sfq_dec(q, a);
500 	qdisc_bstats_update(sch, skb);
501 	qdisc_qlen_dec(sch);
502 	qdisc_qstats_backlog_dec(sch, skb);
503 	WRITE_ONCE(slot->backlog, slot->backlog - qdisc_pkt_len(skb));
504 	/* Is the slot empty? */
505 	if (slot->qlen == 0) {
506 		WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
507 		next_a = slot->next;
508 		if (a == next_a) {
509 			q->tail = NULL; /* no more active slots */
510 			return skb;
511 		}
512 		q->tail->next = next_a;
513 	} else {
514 		WRITE_ONCE(slot->allot, slot->allot - qdisc_pkt_len(skb));
515 	}
516 	return skb;
517 }
518 
519 static void
520 sfq_reset(struct Qdisc *sch)
521 {
522 	struct sk_buff *skb;
523 
524 	while ((skb = sfq_dequeue(sch)) != NULL)
525 		rtnl_kfree_skbs(skb, skb);
526 }
527 
528 /*
529  * When q->perturbation is changed, we rehash all queued skbs
530  * to avoid OOO (Out Of Order) effects.
531  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
532  * counters.
533  */
534 static void sfq_rehash(struct Qdisc *sch)
535 {
536 	struct sfq_sched_data *q = qdisc_priv(sch);
537 	struct sk_buff *skb;
538 	int i;
539 	struct sfq_slot *slot;
540 	struct sk_buff_head list;
541 	int dropped = 0;
542 	unsigned int drop_len = 0;
543 
544 	__skb_queue_head_init(&list);
545 
546 	for (i = 0; i < q->maxflows; i++) {
547 		slot = &q->slots[i];
548 		if (!slot->qlen)
549 			continue;
550 		while (slot->qlen) {
551 			skb = slot_dequeue_head(slot);
552 			sfq_dec(q, i);
553 			__skb_queue_tail(&list, skb);
554 		}
555 		WRITE_ONCE(slot->backlog, 0);
556 		red_set_vars(&slot->vars);
557 		WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
558 	}
559 	q->tail = NULL;
560 
561 	while ((skb = __skb_dequeue(&list)) != NULL) {
562 		unsigned int hash = sfq_hash(q, skb);
563 		sfq_index x = q->ht[hash];
564 
565 		slot = &q->slots[x];
566 		if (x == SFQ_EMPTY_SLOT) {
567 			x = q->dep[0].next; /* get a free slot */
568 			if (x >= SFQ_MAX_FLOWS) {
569 drop:
570 				qdisc_qstats_backlog_dec(sch, skb);
571 				drop_len += qdisc_pkt_len(skb);
572 				kfree_skb(skb);
573 				dropped++;
574 				continue;
575 			}
576 			WRITE_ONCE(q->ht[hash], x);
577 			slot = &q->slots[x];
578 			slot->hash = hash;
579 		}
580 		if (slot->qlen >= q->maxdepth)
581 			goto drop;
582 		slot_queue_add(slot, skb);
583 		if (q->red_parms)
584 			slot->vars.qavg = red_calc_qavg(q->red_parms,
585 							&slot->vars,
586 							slot->backlog);
587 		WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb));
588 		sfq_inc(q, x);
589 		if (slot->qlen == 1) {		/* The flow is new */
590 			if (q->tail == NULL) {	/* It is the first flow */
591 				slot->next = x;
592 			} else {
593 				slot->next = q->tail->next;
594 				q->tail->next = x;
595 			}
596 			q->tail = slot;
597 			WRITE_ONCE(slot->allot, q->quantum);
598 		}
599 	}
600 	WRITE_ONCE(sch->q.qlen, sch->q.qlen - dropped);
601 	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
602 }
603 
604 static void sfq_perturbation(struct timer_list *t)
605 {
606 	struct sfq_sched_data *q = timer_container_of(q, t, perturb_timer);
607 	struct Qdisc *sch = q->sch;
608 	spinlock_t *root_lock;
609 	siphash_key_t nkey;
610 	int period;
611 
612 	get_random_bytes(&nkey, sizeof(nkey));
613 	rcu_read_lock();
614 	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
615 	spin_lock(root_lock);
616 	q->perturbation = nkey;
617 	if (!q->filter_list && q->tail)
618 		sfq_rehash(sch);
619 	spin_unlock(root_lock);
620 
621 	/* q->perturb_period can change under us from
622 	 * sfq_change() and sfq_destroy().
623 	 */
624 	period = READ_ONCE(q->perturb_period);
625 	if (period)
626 		mod_timer(&q->perturb_timer, jiffies + period);
627 	rcu_read_unlock();
628 }
629 
630 static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
631 		      struct netlink_ext_ack *extack)
632 {
633 	struct sfq_sched_data *q = qdisc_priv(sch);
634 	struct tc_sfq_qopt *ctl = nla_data(opt);
635 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
636 	unsigned int qlen, dropped = 0;
637 	struct red_parms *p = NULL;
638 	struct sk_buff *to_free = NULL;
639 	struct sk_buff *tail = NULL;
640 	unsigned int maxflows;
641 	unsigned int quantum;
642 	unsigned int divisor;
643 	int perturb_period;
644 	u8 headdrop;
645 	u8 maxdepth;
646 	int limit;
647 	u8 flags;
648 
649 
650 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
651 		return -EINVAL;
652 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
653 		ctl_v1 = nla_data(opt);
654 	if (ctl->divisor &&
655 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
656 		return -EINVAL;
657 
658 	if ((int)ctl->quantum < 0) {
659 		NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
660 		return -EINVAL;
661 	}
662 
663 	if (ctl->perturb_period < 0 ||
664 	    ctl->perturb_period > INT_MAX / HZ) {
665 		NL_SET_ERR_MSG_MOD(extack, "invalid perturb period");
666 		return -EINVAL;
667 	}
668 	perturb_period = ctl->perturb_period * HZ;
669 
670 	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
671 					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
672 		return -EINVAL;
673 	if (ctl_v1 && ctl_v1->qth_min) {
674 		p = kmalloc_obj(*p);
675 		if (!p)
676 			return -ENOMEM;
677 	}
678 
679 	sch_tree_lock(sch);
680 
681 	limit = q->limit;
682 	divisor = q->divisor;
683 	headdrop = q->headdrop;
684 	maxdepth = q->maxdepth;
685 	maxflows = q->maxflows;
686 	quantum = q->quantum;
687 	flags = q->flags;
688 
689 	/* update and validate configuration */
690 	if (ctl->quantum)
691 		quantum = ctl->quantum;
692 	if (ctl->flows)
693 		maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
694 	if (ctl->divisor) {
695 		divisor = ctl->divisor;
696 		maxflows = min_t(u32, maxflows, divisor);
697 	}
698 	if (ctl_v1) {
699 		if (ctl_v1->depth)
700 			maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
701 		if (p) {
702 			red_set_parms(p,
703 				      ctl_v1->qth_min, ctl_v1->qth_max,
704 				      ctl_v1->Wlog,
705 				      ctl_v1->Plog, ctl_v1->Scell_log,
706 				      NULL,
707 				      ctl_v1->max_P);
708 		}
709 		flags = ctl_v1->flags;
710 		headdrop = ctl_v1->headdrop;
711 	}
712 	if (ctl->limit) {
713 		limit = min_t(u32, ctl->limit, maxdepth * maxflows);
714 		maxflows = min_t(u32, maxflows, limit);
715 	}
716 	if (limit == 1) {
717 		sch_tree_unlock(sch);
718 		kfree(p);
719 		NL_SET_ERR_MSG_MOD(extack, "invalid limit");
720 		return -EINVAL;
721 	}
722 
723 	/* commit configuration */
724 	q->limit = limit;
725 	q->divisor = divisor;
726 	q->headdrop = headdrop;
727 	q->maxdepth = maxdepth;
728 	q->maxflows = maxflows;
729 	WRITE_ONCE(q->perturb_period, perturb_period);
730 	q->quantum = quantum;
731 	q->flags = flags;
732 	if (p)
733 		swap(q->red_parms, p);
734 
735 	qlen = sch->q.qlen;
736 	while (sch->q.qlen > q->limit) {
737 		dropped += sfq_drop(sch, &to_free);
738 		if (!tail)
739 			tail = to_free;
740 	}
741 
742 	rtnl_kfree_skbs(to_free, tail);
743 	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
744 
745 	timer_delete(&q->perturb_timer);
746 	if (q->perturb_period) {
747 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
748 		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
749 	}
750 	sch_tree_unlock(sch);
751 	kfree(p);
752 	return 0;
753 }
754 
755 static void *sfq_alloc(size_t sz)
756 {
757 	return  kvmalloc(sz, GFP_KERNEL);
758 }
759 
760 static void sfq_free(void *addr)
761 {
762 	kvfree(addr);
763 }
764 
765 static void sfq_destroy(struct Qdisc *sch)
766 {
767 	struct sfq_sched_data *q = qdisc_priv(sch);
768 
769 	tcf_block_put(q->block);
770 	WRITE_ONCE(q->perturb_period, 0);
771 	timer_delete_sync(&q->perturb_timer);
772 	sfq_free(q->ht);
773 	sfq_free(q->slots);
774 	kfree(q->red_parms);
775 }
776 
777 static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
778 		    struct netlink_ext_ack *extack)
779 {
780 	struct sfq_sched_data *q = qdisc_priv(sch);
781 	int i;
782 	int err;
783 
784 	q->sch = sch;
785 	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
786 
787 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
788 	if (err)
789 		return err;
790 
791 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
792 		q->dep[i].next = i + SFQ_MAX_FLOWS;
793 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
794 	}
795 
796 	q->limit = SFQ_MAX_DEPTH;
797 	q->maxdepth = SFQ_MAX_DEPTH;
798 	q->cur_depth = 0;
799 	q->tail = NULL;
800 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
801 	q->maxflows = SFQ_DEFAULT_FLOWS;
802 	q->quantum = psched_mtu(qdisc_dev(sch));
803 	q->perturb_period = 0;
804 	get_random_bytes(&q->perturbation, sizeof(q->perturbation));
805 
806 	if (opt) {
807 		int err = sfq_change(sch, opt, extack);
808 		if (err)
809 			return err;
810 	}
811 
812 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
813 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
814 	if (!q->ht || !q->slots) {
815 		/* Note: sfq_destroy() will be called by our caller */
816 		return -ENOMEM;
817 	}
818 
819 	for (i = 0; i < q->divisor; i++)
820 		q->ht[i] = SFQ_EMPTY_SLOT;
821 
822 	for (i = 0; i < q->maxflows; i++) {
823 		slot_queue_init(&q->slots[i]);
824 		sfq_link(q, i);
825 	}
826 	if (q->limit >= 1)
827 		sch->flags |= TCQ_F_CAN_BYPASS;
828 	else
829 		sch->flags &= ~TCQ_F_CAN_BYPASS;
830 	return 0;
831 }
832 
833 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
834 {
835 	struct sfq_sched_data *q = qdisc_priv(sch);
836 	unsigned char *b = skb_tail_pointer(skb);
837 	struct tc_sfq_qopt_v1 opt;
838 	struct red_parms *p = q->red_parms;
839 
840 	memset(&opt, 0, sizeof(opt));
841 	opt.v0.quantum	= q->quantum;
842 	opt.v0.perturb_period = q->perturb_period / HZ;
843 	opt.v0.limit	= q->limit;
844 	opt.v0.divisor	= q->divisor;
845 	opt.v0.flows	= q->maxflows;
846 	opt.depth	= q->maxdepth;
847 	opt.headdrop	= q->headdrop;
848 
849 	if (p) {
850 		opt.qth_min	= p->qth_min >> p->Wlog;
851 		opt.qth_max	= p->qth_max >> p->Wlog;
852 		opt.Wlog	= p->Wlog;
853 		opt.Plog	= p->Plog;
854 		opt.Scell_log	= p->Scell_log;
855 		opt.max_P	= p->max_P;
856 	}
857 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
858 	opt.flags	= q->flags;
859 
860 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
861 		goto nla_put_failure;
862 
863 	return skb->len;
864 
865 nla_put_failure:
866 	nlmsg_trim(skb, b);
867 	return -1;
868 }
869 
870 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
871 {
872 	return NULL;
873 }
874 
875 static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
876 {
877 	return 0;
878 }
879 
880 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
881 			      u32 classid)
882 {
883 	return 0;
884 }
885 
886 static void sfq_unbind(struct Qdisc *q, unsigned long cl)
887 {
888 }
889 
890 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
891 				       struct netlink_ext_ack *extack)
892 {
893 	struct sfq_sched_data *q = qdisc_priv(sch);
894 
895 	if (cl)
896 		return NULL;
897 	return q->block;
898 }
899 
900 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
901 			  struct sk_buff *skb, struct tcmsg *tcm)
902 {
903 	tcm->tcm_handle |= TC_H_MIN(cl);
904 	return 0;
905 }
906 
907 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
908 				struct gnet_dump *d)
909 {
910 	struct sfq_sched_data *q = qdisc_priv(sch);
911 	sfq_index idx = READ_ONCE(q->ht[cl - 1]);
912 	struct gnet_stats_queue qs = { 0 };
913 	struct tc_sfq_xstats xstats = { 0 };
914 
915 	if (idx != SFQ_EMPTY_SLOT) {
916 		const struct sfq_slot *slot = &q->slots[idx];
917 
918 		xstats.allot = READ_ONCE(slot->allot);
919 		qs.qlen = READ_ONCE(slot->qlen);
920 		qs.backlog = READ_ONCE(slot->backlog);
921 	}
922 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
923 		return -1;
924 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
925 }
926 
927 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
928 {
929 	struct sfq_sched_data *q = qdisc_priv(sch);
930 	unsigned int i;
931 
932 	if (arg->stop)
933 		return;
934 
935 	for (i = 0; i < q->divisor; i++) {
936 		if (READ_ONCE(q->ht[i]) == SFQ_EMPTY_SLOT) {
937 			arg->count++;
938 			continue;
939 		}
940 		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
941 			break;
942 	}
943 }
944 
945 static const struct Qdisc_class_ops sfq_class_ops = {
946 	.leaf		=	sfq_leaf,
947 	.find		=	sfq_find,
948 	.tcf_block	=	sfq_tcf_block,
949 	.bind_tcf	=	sfq_bind,
950 	.unbind_tcf	=	sfq_unbind,
951 	.dump		=	sfq_dump_class,
952 	.dump_stats	=	sfq_dump_class_stats,
953 	.walk		=	sfq_walk,
954 };
955 
956 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
957 	.cl_ops		=	&sfq_class_ops,
958 	.id		=	"sfq",
959 	.priv_size	=	sizeof(struct sfq_sched_data),
960 	.enqueue	=	sfq_enqueue,
961 	.dequeue	=	sfq_dequeue,
962 	.peek		=	qdisc_peek_dequeued,
963 	.init		=	sfq_init,
964 	.reset		=	sfq_reset,
965 	.destroy	=	sfq_destroy,
966 	.change		=	NULL,
967 	.dump		=	sfq_dump,
968 	.owner		=	THIS_MODULE,
969 };
970 MODULE_ALIAS_NET_SCH("sfq");
971 
972 static int __init sfq_module_init(void)
973 {
974 	return register_qdisc(&sfq_qdisc_ops);
975 }
976 static void __exit sfq_module_exit(void)
977 {
978 	unregister_qdisc(&sfq_qdisc_ops);
979 }
980 module_init(sfq_module_init)
981 module_exit(sfq_module_exit)
982 MODULE_LICENSE("GPL");
983 MODULE_DESCRIPTION("Stochastic Fairness qdisc");
984