// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/siphash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive traffic
	into a higher priority band.

	We still need true WFQ for the top level CSZ, but using WFQ
	for best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits:
	- the maximal queue length per flow to 127 packets,
	- the max mtu to 2^18-1,
	- the max number of flows to 65408,
	- the number of hash buckets to 65536.

	It is easy to increase these values, but not while the qdisc
	is in use.  */

#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;
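
/* Sanity check (illustrative): sfq_index must be able to address both
 * index spaces.  SFQ_MAX_FLOWS = 0x10000 - 127 - 1 = 65408 slot indexes,
 * plus SFQ_MAX_DEPTH + 1 = 128 dep[] indexes, gives exactly 65536 = 2^16
 * values, so a u16 fits with no room to spare:
 *
 *	BUILD_BUG_ON(SFQ_MAX_FLOWS + SFQ_MAX_DEPTH + 1 > (1 << 16));
 */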

/*
 * We don't use pointers, to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to the slots[]
 * array, while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS +
 * SFQ_MAX_DEPTH] are 'pointers' to the dep[] array.
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq RR chain */
	struct sfq_head dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	int		allot; /* credit for this slot */

	unsigned int    backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	siphash_key_t 	perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	struct tcf_proto __rcu *filter_list;
	struct tcf_block *block;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
	struct Qdisc	*sch;
};

/*
 * sfq_head structures live either in an sfq_slot or in the dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}
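
/* Worked example (illustrative, using the constants above): with
 * SFQ_MAX_FLOWS = 65408, index 65408 + 3 refers to dep[3], the list of
 * flows currently holding exactly 3 packets, while a small index such
 * as 42 refers to slots[42].dep:
 *
 *	sfq_dep_head(q, 42);	// -> &q->slots[42].dep
 *	sfq_dep_head(q, 65411);	// -> &q->dep[3]
 */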

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
}
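
/* Note (illustrative): the masking above is why sfq_change() insists on
 * a power-of-2 divisor.  With the default divisor of 1024 the mask is
 * 0x3ff, so only the low 10 bits of the perturbed flow hash pick one of
 * the 1024 hash buckets:
 *
 *	hash & (1024 - 1);	// == hash % 1024, but cheaper
 */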

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tcf_classify(skb, NULL, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}
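
/* Example (illustrative): a packet can bypass both the hash and any
 * attached filters by carrying a matching priority.  If this qdisc has
 * handle 8001:, then a packet whose sender arranged
 *
 *	skb->priority = TC_H_MAKE(0x80010000, 5);
 *
 * is mapped by the first test in sfq_classify() to class minor 5, i.e.
 * hash bucket 4 after the "hash--" in sfq_enqueue().
 */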

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)


static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen;
	WRITE_ONCE(q->slots[x].qlen, d - 1);
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen + 1;
	WRITE_ONCE(q->slots[x].qlen, d);
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}
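
/* Worked example (illustrative): suppose slot x holds 2 packets, so it
 * sits on the dep[2] list.  sfq_inc(q, x) unlinks it, bumps qlen to 3
 * and relinks it at the head of dep[3]; if no other slot held 3 or more
 * packets, q->cur_depth rises to 3.  sfq_dec() is the mirror image, and
 * a slot whose qlen drops to 0 lands on dep[0], the free list that
 * sfq_enqueue() pulls new flows from.
 */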

/* helper functions: might change when/if skbs use a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}
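
/* Design note (illustrative): the casts above treat the sfq_slot itself
 * as the sentinel node of a circular doubly linked skb list, relying on
 * skblist_next/skblist_prev sitting at the same offsets as sk_buff's
 * next/prev pointers.  An empty queue therefore looks like:
 *
 *	slot->skblist_next == slot->skblist_prev == (struct sk_buff *)slot;
 *
 * exactly as slot_queue_init() sets it up.
 */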

static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		WRITE_ONCE(slot->backlog, slot->backlog - len);
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_backlog_dec(sch, skb);
		qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_OVERLIMIT);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		if (slot->next == x)
			q->tail = NULL; /* no more active slots */
		else
			q->tail->next = slot->next;
		WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
		goto drop;
	}

	return 0;
}
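
/* Drop policy sketch (illustrative): when the qdisc is full, the victim
 * flow is taken from dep[cur_depth], i.e. one of the fattest flows; with
 * q->headdrop set the oldest packet of that flow is dropped, otherwise
 * the newest.  Only when every active slot holds a single packet does
 * the "d == 1" path fire, dropping from the next slot in the
 * round-robin chain and retiring that slot entirely.
 */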

/* Is the ECN parameter configured? */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over the max threshold just be marked? */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int ret;
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_MAXFLOWS);
		WRITE_ONCE(q->ht[hash], x);
		slot = &q->slots[x];
		slot->hash = hash;
		WRITE_ONCE(slot->backlog, 0); /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop_reason(skb, sch, to_free, QDISC_DROP_FLOW_LIMIT);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		WRITE_ONCE(slot->backlog, slot->backlog - delta);
		qdisc_drop_reason(head, sch, to_free, QDISC_DROP_FLOW_LIMIT);

		slot_queue_add(slot, skb);
		qdisc_tree_reduce_backlog(sch, 0, delta);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb));
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * This might sound unfair for a new flow to wait behind old
		 * ones, but otherwise we could end up servicing new flows
		 * only, and freeze the old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		WRITE_ONCE(slot->allot, q->quantum);
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	dropped = sfq_drop(sch, to_free);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen) {
		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
		return NET_XMIT_CN;
	}

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}
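
/* Congestion-notification sketch (illustrative): if the enqueue pushed
 * sch->q.qlen past q->limit, sfq_drop() evicts one packet from the
 * longest flow.  Two outcomes are possible:
 *
 *	qlen != slot->qlen	// our own flow paid: report NET_XMIT_CN
 *	qlen == slot->qlen	// another flow paid: NET_XMIT_SUCCESS
 *
 * so a well-behaved flow is not told to back off just because an
 * aggressive one overflowed the qdisc.
 */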

static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		WRITE_ONCE(slot->allot, slot->allot + q->quantum);
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	WRITE_ONCE(slot->backlog, slot->backlog - qdisc_pkt_len(skb));
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		WRITE_ONCE(slot->allot, slot->allot - qdisc_pkt_len(skb));
	}
	return skb;
}
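
/* DRR accounting example (illustrative): with quantum = 1514, a slot
 * whose allot is 200 may still send one more packet; a 1000-byte skb
 * then leaves allot at -800.  On the next visit the "allot <= 0" branch
 * recharges it to 714 and the round moves on to the following slot, so
 * a flow's long-term share stays proportional to the quantum regardless
 * of its packet sizes.
 */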

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		rtnl_kfree_skbs(skb, skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to
 * change counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;
	unsigned int drop_len = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		WRITE_ONCE(slot->backlog, 0);
		red_set_vars(&slot->vars);
		WRITE_ONCE(q->ht[slot->hash], SFQ_EMPTY_SLOT);
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				drop_len += qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			WRITE_ONCE(q->ht[hash], x);
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		WRITE_ONCE(slot->backlog, slot->backlog + qdisc_pkt_len(skb));
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			WRITE_ONCE(slot->allot, q->quantum);
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}
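
/* Rehash sketch (illustrative): once the siphash key changes, two flows
 * that previously collided in ht[] may land in different buckets, and
 * vice versa.  Packets that no longer fit (no free slot, or the target
 * slot already at maxdepth) are dropped here rather than reordered,
 * which is why the function tallies dropped/drop_len and reports them
 * via qdisc_tree_reduce_backlog().
 */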

static void sfq_perturbation(struct timer_list *t)
{
	struct sfq_sched_data *q = timer_container_of(q, t, perturb_timer);
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock;
	siphash_key_t nkey;
	int period;

	get_random_bytes(&nkey, sizeof(nkey));
	rcu_read_lock();
	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
	spin_lock(root_lock);
	q->perturbation = nkey;
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	/* q->perturb_period can change under us from
	 * sfq_change() and sfq_destroy().
	 */
	period = READ_ONCE(q->perturb_period);
	if (period)
		mod_timer(&q->perturb_timer, jiffies + period);
	rcu_read_unlock();
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen, dropped = 0;
	struct red_parms *p = NULL;
	struct sk_buff *to_free = NULL;
	struct sk_buff *tail = NULL;
	unsigned int maxflows;
	unsigned int quantum;
	unsigned int divisor;
	int perturb_period;
	u8 headdrop;
	u8 maxdepth;
	int limit;
	u8 flags;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;

	if ((int)ctl->quantum < 0) {
		NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
		return -EINVAL;
	}

	if (ctl->perturb_period < 0 ||
	    ctl->perturb_period > INT_MAX / HZ) {
		NL_SET_ERR_MSG_MOD(extack, "invalid perturb period");
		return -EINVAL;
	}
	perturb_period = ctl->perturb_period * HZ;

	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc_obj(*p);
		if (!p)
			return -ENOMEM;
	}

	sch_tree_lock(sch);

	limit = q->limit;
	divisor = q->divisor;
	headdrop = q->headdrop;
	maxdepth = q->maxdepth;
	maxflows = q->maxflows;
	quantum = q->quantum;
	flags = q->flags;

	/* update and validate configuration */
	if (ctl->quantum)
		quantum = ctl->quantum;
	if (ctl->flows)
		maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		divisor = ctl->divisor;
		maxflows = min_t(u32, maxflows, divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			red_set_parms(p,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		flags = ctl_v1->flags;
		headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		limit = min_t(u32, ctl->limit, maxdepth * maxflows);
		maxflows = min_t(u32, maxflows, limit);
	}
	if (limit == 1) {
		sch_tree_unlock(sch);
		kfree(p);
		NL_SET_ERR_MSG_MOD(extack, "invalid limit");
		return -EINVAL;
	}

	/* commit configuration */
	q->limit = limit;
	q->divisor = divisor;
	q->headdrop = headdrop;
	q->maxdepth = maxdepth;
	q->maxflows = maxflows;
	WRITE_ONCE(q->perturb_period, perturb_period);
	q->quantum = quantum;
	q->flags = flags;
	if (p)
		swap(q->red_parms, p);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit) {
		dropped += sfq_drop(sch, &to_free);
		if (!tail)
			tail = to_free;
	}

	rtnl_kfree_skbs(to_free, tail);
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	timer_delete(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}
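
/* Clamping example (illustrative): a request of "limit 60000 depth 50
 * flows 1024" first caps limit to maxdepth * maxflows = 50 * 1024 =
 * 51200, then caps maxflows to min(1024, 51200) = 1024.  A request of
 * "limit 10" would instead shrink maxflows to 10, since more than 10
 * active flows could never hold packets at once.
 */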

static void *sfq_alloc(size_t sz)
{
	return kvmalloc(sz, GFP_KERNEL);
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_block_put(q->block);
	WRITE_ONCE(q->perturb_period, 0);
	timer_delete_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;
	int err;

	q->sch = sch;
	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);

	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
	if (err)
		return err;

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->perturb_period = 0;
	get_random_bytes(&q->perturbation, sizeof(q->perturbation));

	if (opt) {
		int err = sfq_change(sch, opt, extack);

		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		/* Note: sfq_destroy() will be called by our caller */
		return -ENOMEM;
	}

	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}
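
/* Default configuration sketch (illustrative): creating the qdisc with
 * no options reaches this point with opt == NULL, giving limit = depth
 * = 127 packets, 128 flows, a 1024-bucket hash table and a quantum of
 * one MTU-sized packet (psched_mtu() includes the hard header length).
 */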

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static void sfq_unbind(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
				       struct netlink_ext_ack *extack)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return q->block;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = READ_ONCE(q->ht[cl - 1]);
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = READ_ONCE(slot->allot);
		qs.qlen = READ_ONCE(slot->qlen);
		qs.backlog = READ_ONCE(slot->backlog);
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (READ_ONCE(q->ht[i]) == SFQ_EMPTY_SLOT) {
			arg->count++;
			continue;
		}
		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
			break;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.find		=	sfq_find,
	.tcf_block	=	sfq_tcf_block,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_unbind,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("sfq");

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Stochastic Fairness qdisc");