xref: /linux/net/sched/sch_sfq.c (revision 27605c8c0f69e319df156b471974e4e223035378)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  */
7 
8 #include <linux/module.h>
9 #include <linux/types.h>
10 #include <linux/kernel.h>
11 #include <linux/jiffies.h>
12 #include <linux/string.h>
13 #include <linux/in.h>
14 #include <linux/errno.h>
15 #include <linux/init.h>
16 #include <linux/skbuff.h>
17 #include <linux/siphash.h>
18 #include <linux/slab.h>
19 #include <linux/vmalloc.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 #include <net/pkt_cls.h>
23 #include <net/red.h>
24 
25 
26 /*	Stochastic Fairness Queuing algorithm.
27 	=======================================
28 
29 	Source:
30 	Paul E. McKenney "Stochastic Fairness Queuing",
31 	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
32 
33 	Paul E. McKenney "Stochastic Fairness Queuing",
34 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
35 
36 
37 	See also:
38 	M. Shreedhar and George Varghese "Efficient Fair
39 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
40 
41 
42 	This is not the thing that is usually called (W)FQ nowadays.
43 	It does not use any timestamp mechanism, but instead
44 	processes queues in round-robin order.
45 
46 	ADVANTAGE:
47 
48 	- It is very cheap. Both CPU and memory requirements are minimal.
49 
50 	DRAWBACKS:
51 
52 	- "Stochastic" -> It is not 100% fair.
53 	When hash collisions occur, several flows are considered as one.
54 
55 	- "Round-robin" -> It introduces larger delays than virtual clock
56 	based schemes, and should not be used for isolating interactive
57 	traffic from non-interactive. This means that this scheduler
58 	should be used as a leaf of CBQ or P3, which puts interactive traffic
59 	into a higher priority band.
60 
61 	We still need true WFQ for top level CSZ, but using WFQ
62 	for the best effort traffic is absolutely pointless:
63 	SFQ is superior for this purpose.
64 
65 	IMPLEMENTATION:
66 	This implementation limits:
67 	- maximal queue length per flow to 127 packets,
68 	- max mtu to 2^18-1,
69 	- max 65408 flows,
70 	- number of hash buckets to 65536.
71 
72 	It is easy to increase these values, but not in flight.  */
73 
74 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
75 #define SFQ_DEFAULT_FLOWS	128
76 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
77 #define SFQ_EMPTY_SLOT		0xffff
78 #define SFQ_DEFAULT_HASH_DIVISOR 1024
79 
80 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
81 typedef u16 sfq_index;
82 
83 /*
84  * To save space we don't use pointers.
85  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
86  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
87  * are 'pointers' to dep[] array
88  */
89 struct sfq_head {
90 	sfq_index	next;
91 	sfq_index	prev;
92 };
93 
94 struct sfq_slot {
95 	struct sk_buff	*skblist_next;
96 	struct sk_buff	*skblist_prev;
97 	sfq_index	qlen; /* number of skbs in skblist */
98 	sfq_index	next; /* next slot in sfq RR chain */
99 	struct sfq_head dep; /* anchor in dep[] chains */
100 	unsigned short	hash; /* hash value (index in ht[]) */
101 	int		allot; /* credit for this slot */
102 
103 	unsigned int    backlog;
104 	struct red_vars vars;
105 };
106 
107 struct sfq_sched_data {
108 /* frequently used fields */
109 	int		limit;		/* limit of total number of packets in this qdisc */
110 	unsigned int	divisor;	/* number of slots in hash table */
111 	u8		headdrop;
112 	u8		maxdepth;	/* limit of packets per flow */
113 
114 	siphash_key_t 	perturbation;
115 	u8		cur_depth;	/* depth of longest slot */
116 	u8		flags;
117 	struct tcf_proto __rcu *filter_list;
118 	struct tcf_block *block;
119 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
120 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
121 
122 	struct red_parms *red_parms;
123 	struct tc_sfqred_stats stats;
124 	struct sfq_slot *tail;		/* current slot in round */
125 
126 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
127 					/* Linked lists of slots, indexed by depth
128 					 * dep[0] : list of unused flows
129 					 * dep[1] : list of flows with 1 packet
130 					 * dep[X] : list of flows with X packets
131 					 */
132 
133 	unsigned int	maxflows;	/* number of flows in flows array */
134 	int		perturb_period;
135 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
136 	struct timer_list perturb_timer;
137 	struct Qdisc	*sch;
138 };
139 
140 /*
141  * sfq_head are either in a sfq_slot or in dep[] array
142  */
143 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
144 {
145 	if (val < SFQ_MAX_FLOWS)
146 		return &q->slots[val].dep;
147 	return &q->dep[val - SFQ_MAX_FLOWS];
148 }
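/* Editorial aside, not part of the original file: with the constants above,
 * the 16-bit sfq_index space splits as
 *   0     .. 65407  ->  slots[]   (SFQ_MAX_FLOWS = 0x10000 - 127 - 1 = 65408)
 *   65408 .. 65535  ->  dep[]     (SFQ_MAX_DEPTH + 1 = 128 depth buckets)
 * which is exactly what sfq_dep_head() decodes, and why a u16 is enough.
 * A minimal user-space sketch of the same decode, with hypothetical names
 * and private copies of the constants:
 */
#include <stdio.h>

#define SKETCH_MAX_DEPTH 127
#define SKETCH_MAX_FLOWS (0x10000 - SKETCH_MAX_DEPTH - 1)

static const char *sketch_decode(unsigned int val)
{
	return val < SKETCH_MAX_FLOWS ? "slots[]" : "dep[]";
}

int main(void)
{
	printf("%s\n", sketch_decode(0));                    /* slots[]              */
	printf("%s\n", sketch_decode(SKETCH_MAX_FLOWS - 1)); /* slots[]              */
	printf("%s\n", sketch_decode(SKETCH_MAX_FLOWS));     /* dep[], i.e. dep[0]   */
	printf("%s\n", sketch_decode(0xffff));               /* dep[], i.e. dep[127] */
	return 0;
}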
149 
150 static unsigned int sfq_hash(const struct sfq_sched_data *q,
151 			     const struct sk_buff *skb)
152 {
153 	return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1);
154 }
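/* Editorial aside, not part of the original file: the "& (q->divisor - 1)"
 * above only spreads packets evenly over the buckets because sfq_change()
 * rejects divisors that are not a power of two; masking then equals the
 * modulo reduction. A tiny hypothetical check of that equivalence:
 */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int divisor = 1024;	/* SFQ_DEFAULT_HASH_DIVISOR, a power of two */
	unsigned int hash;

	for (hash = 0; hash < 100000; hash++)
		assert((hash & (divisor - 1)) == hash % divisor);
	printf("mask == modulo for a power-of-two divisor\n");
	return 0;
}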
155 
156 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
157 				 int *qerr)
158 {
159 	struct sfq_sched_data *q = qdisc_priv(sch);
160 	struct tcf_result res;
161 	struct tcf_proto *fl;
162 	int result;
163 
164 	if (TC_H_MAJ(skb->priority) == sch->handle &&
165 	    TC_H_MIN(skb->priority) > 0 &&
166 	    TC_H_MIN(skb->priority) <= q->divisor)
167 		return TC_H_MIN(skb->priority);
168 
169 	fl = rcu_dereference_bh(q->filter_list);
170 	if (!fl)
171 		return sfq_hash(q, skb) + 1;
172 
173 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
174 	result = tcf_classify(skb, NULL, fl, &res, false);
175 	if (result >= 0) {
176 #ifdef CONFIG_NET_CLS_ACT
177 		switch (result) {
178 		case TC_ACT_STOLEN:
179 		case TC_ACT_QUEUED:
180 		case TC_ACT_TRAP:
181 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
182 			fallthrough;
183 		case TC_ACT_SHOT:
184 			return 0;
185 		}
186 #endif
187 		if (TC_H_MIN(res.classid) <= q->divisor)
188 			return TC_H_MIN(res.classid);
189 	}
190 	return 0;
191 }
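/* Editorial aside, not part of the original file: the fast path at the top
 * of sfq_classify() lets skb->priority select a bucket directly when its
 * major number matches this qdisc's handle and its minor number lies in
 * 1..divisor. TC_H_MAJ()/TC_H_MIN() simply split a 32-bit classid into its
 * upper and lower 16 bits, as in this hypothetical user-space sketch:
 */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_H_MAJ(h) ((h) & 0xFFFF0000U)
#define SKETCH_H_MIN(h) ((h) & 0x0000FFFFU)

int main(void)
{
	uint32_t prio = 0x00010005;	/* classid "1:5": handle 1:, bucket 5 */

	printf("major %#x, minor %u\n",
	       (unsigned int)(SKETCH_H_MAJ(prio) >> 16),
	       (unsigned int)SKETCH_H_MIN(prio));	/* prints "major 0x1, minor 5" */
	return 0;
}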
192 
193 /*
194  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
195  */
196 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
197 {
198 	sfq_index p, n;
199 	struct sfq_slot *slot = &q->slots[x];
200 	int qlen = slot->qlen;
201 
202 	p = qlen + SFQ_MAX_FLOWS;
203 	n = q->dep[qlen].next;
204 
205 	slot->dep.next = n;
206 	slot->dep.prev = p;
207 
208 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
209 	sfq_dep_head(q, n)->prev = x;
210 }
211 
212 #define sfq_unlink(q, x, n, p)			\
213 	do {					\
214 		n = q->slots[x].dep.next;	\
215 		p = q->slots[x].dep.prev;	\
216 		sfq_dep_head(q, p)->next = n;	\
217 		sfq_dep_head(q, n)->prev = p;	\
218 	} while (0)
219 
220 
221 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
222 {
223 	sfq_index p, n;
224 	int d;
225 
226 	sfq_unlink(q, x, n, p);
227 
228 	d = q->slots[x].qlen--;
229 	if (n == p && q->cur_depth == d)
230 		q->cur_depth--;
231 	sfq_link(q, x);
232 }
233 
234 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
235 {
236 	sfq_index p, n;
237 	int d;
238 
239 	sfq_unlink(q, x, n, p);
240 
241 	d = ++q->slots[x].qlen;
242 	if (q->cur_depth < d)
243 		q->cur_depth = d;
244 	sfq_link(q, x);
245 }
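/* Editorial aside, not part of the original file: a compact user-space mock
 * of the dep[] bookkeeping done by sfq_link()/sfq_inc() above, using much
 * smaller, hypothetical limits. Every flow always sits on the list matching
 * its current queue length and cur_depth tracks the longest flow, which is
 * what lets sfq_drop() pick a victim in O(1).
 */
#include <stdio.h>

#define MOCK_MAX_FLOWS 4
#define MOCK_MAX_DEPTH 3

struct mock_head { int next, prev; };

static struct { int qlen; struct mock_head dep; } slots[MOCK_MAX_FLOWS];
static struct mock_head dep[MOCK_MAX_DEPTH + 1];
static int cur_depth;

static struct mock_head *dep_head(int val)	/* mirrors sfq_dep_head() */
{
	return val < MOCK_MAX_FLOWS ? &slots[val].dep : &dep[val - MOCK_MAX_FLOWS];
}

static void link_slot(int x)			/* mirrors sfq_link() */
{
	int qlen = slots[x].qlen;
	int n = dep[qlen].next;

	slots[x].dep.next = n;
	slots[x].dep.prev = qlen + MOCK_MAX_FLOWS;
	dep[qlen].next = x;
	dep_head(n)->prev = x;
}

static void inc_slot(int x)			/* mirrors sfq_inc() */
{
	int n = slots[x].dep.next, p = slots[x].dep.prev;

	dep_head(p)->next = n;			/* sfq_unlink() */
	dep_head(n)->prev = p;
	if (++slots[x].qlen > cur_depth)
		cur_depth = slots[x].qlen;
	link_slot(x);
}

int main(void)
{
	int i;

	for (i = 0; i <= MOCK_MAX_DEPTH; i++)	/* same init as sfq_init() */
		dep[i].next = dep[i].prev = i + MOCK_MAX_FLOWS;
	for (i = 0; i < MOCK_MAX_FLOWS; i++)
		link_slot(i);			/* all flows start on dep[0] */

	inc_slot(3);				/* flow 3 now holds one packet */
	inc_slot(3);				/* ...then two */
	printf("cur_depth=%d, dep[2] head=%d\n", cur_depth, dep[2].next);
	return 0;				/* prints "cur_depth=2, dep[2] head=3" */
}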
246 
247 /* helper functions: might be changed when/if skb uses a standard list_head */
248 
249 /* remove one skb from tail of slot queue */
250 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
251 {
252 	struct sk_buff *skb = slot->skblist_prev;
253 
254 	slot->skblist_prev = skb->prev;
255 	skb->prev->next = (struct sk_buff *)slot;
256 	skb->next = skb->prev = NULL;
257 	return skb;
258 }
259 
260 /* remove one skb from head of slot queue */
261 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
262 {
263 	struct sk_buff *skb = slot->skblist_next;
264 
265 	slot->skblist_next = skb->next;
266 	skb->next->prev = (struct sk_buff *)slot;
267 	skb->next = skb->prev = NULL;
268 	return skb;
269 }
270 
271 static inline void slot_queue_init(struct sfq_slot *slot)
272 {
273 	memset(slot, 0, sizeof(*slot));
274 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
275 }
276 
277 /* add skb to slot queue (tail add) */
278 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
279 {
280 	skb->prev = slot->skblist_prev;
281 	skb->next = (struct sk_buff *)slot;
282 	slot->skblist_prev->next = skb;
283 	slot->skblist_prev = skb;
284 }
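/* Editorial aside, not part of the original file: the (struct sk_buff *)slot
 * casts in the helpers above work because skblist_next/skblist_prev are the
 * first two members of struct sfq_slot and line up with sk_buff's own
 * next/prev pointers, so the slot itself serves as the head sentinel of the
 * circular skb list. A hypothetical user-space sketch of the same trick:
 */
#include <stdio.h>

struct node {
	struct node *next;
	struct node *prev;
	int val;
};

struct ring {				/* stands in for struct sfq_slot */
	struct node *head_next;		/* must be first: aliases node->next */
	struct node *head_prev;		/* must be second: aliases node->prev */
};

static void ring_init(struct ring *r)
{
	r->head_next = r->head_prev = (struct node *)r;
}

static void ring_add_tail(struct ring *r, struct node *n)	/* like slot_queue_add() */
{
	n->prev = r->head_prev;
	n->next = (struct node *)r;
	r->head_prev->next = n;		/* on an empty ring this writes head_next */
	r->head_prev = n;
}

int main(void)
{
	struct ring r;
	struct node a = { .val = 1 }, b = { .val = 2 };

	ring_init(&r);
	ring_add_tail(&r, &a);
	ring_add_tail(&r, &b);
	printf("%d %d\n", r.head_next->val, r.head_next->next->val);	/* prints "1 2" */
	return 0;
}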
285 
286 static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
287 {
288 	struct sfq_sched_data *q = qdisc_priv(sch);
289 	sfq_index x, d = q->cur_depth;
290 	struct sk_buff *skb;
291 	unsigned int len;
292 	struct sfq_slot *slot;
293 
294 	/* Queue is full! Find the longest slot and drop tail packet from it */
295 	if (d > 1) {
296 		x = q->dep[d].next;
297 		slot = &q->slots[x];
298 drop:
299 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
300 		len = qdisc_pkt_len(skb);
301 		slot->backlog -= len;
302 		sfq_dec(q, x);
303 		sch->q.qlen--;
304 		qdisc_qstats_backlog_dec(sch, skb);
305 		qdisc_drop(skb, sch, to_free);
306 		return len;
307 	}
308 
309 	if (d == 1) {
310 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
311 		x = q->tail->next;
312 		slot = &q->slots[x];
313 		if (slot->next == x)
314 			q->tail = NULL; /* no more active slots */
315 		else
316 			q->tail->next = slot->next;
317 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
318 		goto drop;
319 	}
320 
321 	return 0;
322 }
323 
324 /* Is ECN parameter configured */
325 static int sfq_prob_mark(const struct sfq_sched_data *q)
326 {
327 	return q->flags & TC_RED_ECN;
328 }
329 
330 /* Should packets over max threshold just be marked */
331 static int sfq_hard_mark(const struct sfq_sched_data *q)
332 {
333 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
334 }
335 
336 static int sfq_headdrop(const struct sfq_sched_data *q)
337 {
338 	return q->headdrop;
339 }
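/* Editorial aside, not part of the original file: sfq_prob_mark() and
 * sfq_hard_mark() above combine into roughly this behaviour for a packet
 * hitting the RED thresholds in sfq_enqueue() below (marking falls back to
 * dropping when the packet cannot carry ECN):
 *
 *   q->flags                        qavg between qth_min/qth_max   qavg above qth_max
 *   0 (or TC_RED_HARDDROP alone)    drop                           drop
 *   TC_RED_ECN                      ECN mark                       ECN mark
 *   TC_RED_ECN | TC_RED_HARDDROP    ECN mark                       drop
 */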
340 
341 static int
342 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
343 {
344 	struct sfq_sched_data *q = qdisc_priv(sch);
345 	unsigned int hash, dropped;
346 	sfq_index x, qlen;
347 	struct sfq_slot *slot;
348 	int ret;
349 	struct sk_buff *head;
350 	int delta;
351 
352 	hash = sfq_classify(skb, sch, &ret);
353 	if (hash == 0) {
354 		if (ret & __NET_XMIT_BYPASS)
355 			qdisc_qstats_drop(sch);
356 		__qdisc_drop(skb, to_free);
357 		return ret;
358 	}
359 	hash--;
360 
361 	x = q->ht[hash];
362 	slot = &q->slots[x];
363 	if (x == SFQ_EMPTY_SLOT) {
364 		x = q->dep[0].next; /* get a free slot */
365 		if (x >= SFQ_MAX_FLOWS)
366 			return qdisc_drop(skb, sch, to_free);
367 		q->ht[hash] = x;
368 		slot = &q->slots[x];
369 		slot->hash = hash;
370 		slot->backlog = 0; /* should already be 0 anyway... */
371 		red_set_vars(&slot->vars);
372 		goto enqueue;
373 	}
374 	if (q->red_parms) {
375 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
376 							&slot->vars,
377 							slot->backlog);
378 		switch (red_action(q->red_parms,
379 				   &slot->vars,
380 				   slot->vars.qavg)) {
381 		case RED_DONT_MARK:
382 			break;
383 
384 		case RED_PROB_MARK:
385 			qdisc_qstats_overlimit(sch);
386 			if (sfq_prob_mark(q)) {
387 				/* We know we have at least one packet in queue */
388 				if (sfq_headdrop(q) &&
389 				    INET_ECN_set_ce(slot->skblist_next)) {
390 					q->stats.prob_mark_head++;
391 					break;
392 				}
393 				if (INET_ECN_set_ce(skb)) {
394 					q->stats.prob_mark++;
395 					break;
396 				}
397 			}
398 			q->stats.prob_drop++;
399 			goto congestion_drop;
400 
401 		case RED_HARD_MARK:
402 			qdisc_qstats_overlimit(sch);
403 			if (sfq_hard_mark(q)) {
404 				/* We know we have at least one packet in queue */
405 				if (sfq_headdrop(q) &&
406 				    INET_ECN_set_ce(slot->skblist_next)) {
407 					q->stats.forced_mark_head++;
408 					break;
409 				}
410 				if (INET_ECN_set_ce(skb)) {
411 					q->stats.forced_mark++;
412 					break;
413 				}
414 			}
415 			q->stats.forced_drop++;
416 			goto congestion_drop;
417 		}
418 	}
419 
420 	if (slot->qlen >= q->maxdepth) {
421 congestion_drop:
422 		if (!sfq_headdrop(q))
423 			return qdisc_drop(skb, sch, to_free);
424 
425 		/* We know we have at least one packet in queue */
426 		head = slot_dequeue_head(slot);
427 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
428 		sch->qstats.backlog -= delta;
429 		slot->backlog -= delta;
430 		qdisc_drop(head, sch, to_free);
431 
432 		slot_queue_add(slot, skb);
433 		qdisc_tree_reduce_backlog(sch, 0, delta);
434 		return NET_XMIT_CN;
435 	}
436 
437 enqueue:
438 	qdisc_qstats_backlog_inc(sch, skb);
439 	slot->backlog += qdisc_pkt_len(skb);
440 	slot_queue_add(slot, skb);
441 	sfq_inc(q, x);
442 	if (slot->qlen == 1) {		/* The flow is new */
443 		if (q->tail == NULL) {	/* It is the first flow */
444 			slot->next = x;
445 		} else {
446 			slot->next = q->tail->next;
447 			q->tail->next = x;
448 		}
449 		/* We put this flow at the end of our flow list.
450		 * It might seem unfair for a new flow to wait behind old ones,
451		 * but otherwise we could end up servicing only new flows and freezing old ones.
452 		 */
453 		q->tail = slot;
454 		/* We could use a bigger initial quantum for new flows */
455 		slot->allot = q->quantum;
456 	}
457 	if (++sch->q.qlen <= q->limit)
458 		return NET_XMIT_SUCCESS;
459 
460 	qlen = slot->qlen;
461 	dropped = sfq_drop(sch, to_free);
462 	/* Return Congestion Notification only if we dropped a packet
463 	 * from this flow.
464 	 */
465 	if (qlen != slot->qlen) {
466 		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
467 		return NET_XMIT_CN;
468 	}
469 
470 	/* As we dropped a packet, better let upper stack know this */
471 	qdisc_tree_reduce_backlog(sch, 1, dropped);
472 	return NET_XMIT_SUCCESS;
473 }
474 
475 static struct sk_buff *
476 sfq_dequeue(struct Qdisc *sch)
477 {
478 	struct sfq_sched_data *q = qdisc_priv(sch);
479 	struct sk_buff *skb;
480 	sfq_index a, next_a;
481 	struct sfq_slot *slot;
482 
483 	/* No active slots */
484 	if (q->tail == NULL)
485 		return NULL;
486 
487 next_slot:
488 	a = q->tail->next;
489 	slot = &q->slots[a];
490 	if (slot->allot <= 0) {
491 		q->tail = slot;
492 		slot->allot += q->quantum;
493 		goto next_slot;
494 	}
495 	skb = slot_dequeue_head(slot);
496 	sfq_dec(q, a);
497 	qdisc_bstats_update(sch, skb);
498 	sch->q.qlen--;
499 	qdisc_qstats_backlog_dec(sch, skb);
500 	slot->backlog -= qdisc_pkt_len(skb);
501 	/* Is the slot empty? */
502 	if (slot->qlen == 0) {
503 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
504 		next_a = slot->next;
505 		if (a == next_a) {
506 			q->tail = NULL; /* no more active slots */
507 			return skb;
508 		}
509 		q->tail->next = next_a;
510 	} else {
511 		slot->allot -= qdisc_pkt_len(skb);
512 	}
513 	return skb;
514 }
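/* Editorial aside, not part of the original file: a minimal user-space
 * sketch of the allot/quantum accounting used by sfq_dequeue() above. A
 * flow keeps sending while its allot is positive; once it goes non-positive
 * the flow's turn ends, it is recharged by quantum and service moves to the
 * next flow, so each backlogged flow gets roughly quantum bytes per turn
 * regardless of its packet size. All names below are hypothetical.
 */
#include <stdio.h>

#define QUANTUM 1500

struct mock_flow { int allot; int pkt_len; long sent; };

int main(void)
{
	struct mock_flow flows[2] = {
		{ .allot = QUANTUM, .pkt_len = 1500 },	/* bulk flow, big packets */
		{ .allot = QUANTUM, .pkt_len = 100  },	/* small-packet flow      */
	};
	int i, cur = 0;

	for (i = 0; i < 100000; i++) {
		struct mock_flow *f = &flows[cur];

		if (f->allot <= 0) {		/* same test as sfq_dequeue()   */
			f->allot += QUANTUM;	/* recharge...                  */
			cur ^= 1;		/* ...and move to the next flow */
			continue;
		}
		f->sent += f->pkt_len;		/* "dequeue" one packet */
		f->allot -= f->pkt_len;		/* stay on this flow    */
	}
	printf("flow0=%ld bytes, flow1=%ld bytes\n", flows[0].sent, flows[1].sent);
	return 0;	/* the two totals come out nearly equal */
}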
515 
516 static void
517 sfq_reset(struct Qdisc *sch)
518 {
519 	struct sk_buff *skb;
520 
521 	while ((skb = sfq_dequeue(sch)) != NULL)
522 		rtnl_kfree_skbs(skb, skb);
523 }
524 
525 /*
526  * When q->perturbation is changed, we rehash all queued skbs
527  * to avoid OOO (Out Of Order) effects.
528  * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
529  * counters.
530  */
531 static void sfq_rehash(struct Qdisc *sch)
532 {
533 	struct sfq_sched_data *q = qdisc_priv(sch);
534 	struct sk_buff *skb;
535 	int i;
536 	struct sfq_slot *slot;
537 	struct sk_buff_head list;
538 	int dropped = 0;
539 	unsigned int drop_len = 0;
540 
541 	__skb_queue_head_init(&list);
542 
543 	for (i = 0; i < q->maxflows; i++) {
544 		slot = &q->slots[i];
545 		if (!slot->qlen)
546 			continue;
547 		while (slot->qlen) {
548 			skb = slot_dequeue_head(slot);
549 			sfq_dec(q, i);
550 			__skb_queue_tail(&list, skb);
551 		}
552 		slot->backlog = 0;
553 		red_set_vars(&slot->vars);
554 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
555 	}
556 	q->tail = NULL;
557 
558 	while ((skb = __skb_dequeue(&list)) != NULL) {
559 		unsigned int hash = sfq_hash(q, skb);
560 		sfq_index x = q->ht[hash];
561 
562 		slot = &q->slots[x];
563 		if (x == SFQ_EMPTY_SLOT) {
564 			x = q->dep[0].next; /* get a free slot */
565 			if (x >= SFQ_MAX_FLOWS) {
566 drop:
567 				qdisc_qstats_backlog_dec(sch, skb);
568 				drop_len += qdisc_pkt_len(skb);
569 				kfree_skb(skb);
570 				dropped++;
571 				continue;
572 			}
573 			q->ht[hash] = x;
574 			slot = &q->slots[x];
575 			slot->hash = hash;
576 		}
577 		if (slot->qlen >= q->maxdepth)
578 			goto drop;
579 		slot_queue_add(slot, skb);
580 		if (q->red_parms)
581 			slot->vars.qavg = red_calc_qavg(q->red_parms,
582 							&slot->vars,
583 							slot->backlog);
584 		slot->backlog += qdisc_pkt_len(skb);
585 		sfq_inc(q, x);
586 		if (slot->qlen == 1) {		/* The flow is new */
587 			if (q->tail == NULL) {	/* It is the first flow */
588 				slot->next = x;
589 			} else {
590 				slot->next = q->tail->next;
591 				q->tail->next = x;
592 			}
593 			q->tail = slot;
594 			slot->allot = q->quantum;
595 		}
596 	}
597 	sch->q.qlen -= dropped;
598 	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
599 }
600 
601 static void sfq_perturbation(struct timer_list *t)
602 {
603 	struct sfq_sched_data *q = timer_container_of(q, t, perturb_timer);
604 	struct Qdisc *sch = q->sch;
605 	spinlock_t *root_lock;
606 	siphash_key_t nkey;
607 	int period;
608 
609 	get_random_bytes(&nkey, sizeof(nkey));
610 	rcu_read_lock();
611 	root_lock = qdisc_lock(qdisc_root_sleeping(sch));
612 	spin_lock(root_lock);
613 	q->perturbation = nkey;
614 	if (!q->filter_list && q->tail)
615 		sfq_rehash(sch);
616 	spin_unlock(root_lock);
617 
618 	/* q->perturb_period can change under us from
619 	 * sfq_change() and sfq_destroy().
620 	 */
621 	period = READ_ONCE(q->perturb_period);
622 	if (period)
623 		mod_timer(&q->perturb_timer, jiffies + period);
624 	rcu_read_unlock();
625 }
626 
627 static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
628 		      struct netlink_ext_ack *extack)
629 {
630 	struct sfq_sched_data *q = qdisc_priv(sch);
631 	struct tc_sfq_qopt *ctl = nla_data(opt);
632 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
633 	unsigned int qlen, dropped = 0;
634 	struct red_parms *p = NULL;
635 	struct sk_buff *to_free = NULL;
636 	struct sk_buff *tail = NULL;
637 	unsigned int maxflows;
638 	unsigned int quantum;
639 	unsigned int divisor;
640 	int perturb_period;
641 	u8 headdrop;
642 	u8 maxdepth;
643 	int limit;
644 	u8 flags;
645 
646 
647 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
648 		return -EINVAL;
649 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
650 		ctl_v1 = nla_data(opt);
651 	if (ctl->divisor &&
652 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
653 		return -EINVAL;
654 
655 	if ((int)ctl->quantum < 0) {
656 		NL_SET_ERR_MSG_MOD(extack, "invalid quantum");
657 		return -EINVAL;
658 	}
659 
660 	if (ctl->perturb_period < 0 ||
661 	    ctl->perturb_period > INT_MAX / HZ) {
662 		NL_SET_ERR_MSG_MOD(extack, "invalid perturb period");
663 		return -EINVAL;
664 	}
665 	perturb_period = ctl->perturb_period * HZ;
666 
667 	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
668 					ctl_v1->Wlog, ctl_v1->Scell_log, NULL))
669 		return -EINVAL;
670 	if (ctl_v1 && ctl_v1->qth_min) {
671 		p = kmalloc(sizeof(*p), GFP_KERNEL);
672 		if (!p)
673 			return -ENOMEM;
674 	}
675 
676 	sch_tree_lock(sch);
677 
678 	limit = q->limit;
679 	divisor = q->divisor;
680 	headdrop = q->headdrop;
681 	maxdepth = q->maxdepth;
682 	maxflows = q->maxflows;
683 	quantum = q->quantum;
684 	flags = q->flags;
685 
686 	/* update and validate configuration */
687 	if (ctl->quantum)
688 		quantum = ctl->quantum;
689 	if (ctl->flows)
690 		maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
691 	if (ctl->divisor) {
692 		divisor = ctl->divisor;
693 		maxflows = min_t(u32, maxflows, divisor);
694 	}
695 	if (ctl_v1) {
696 		if (ctl_v1->depth)
697 			maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
698 		if (p) {
699 			red_set_parms(p,
700 				      ctl_v1->qth_min, ctl_v1->qth_max,
701 				      ctl_v1->Wlog,
702 				      ctl_v1->Plog, ctl_v1->Scell_log,
703 				      NULL,
704 				      ctl_v1->max_P);
705 		}
706 		flags = ctl_v1->flags;
707 		headdrop = ctl_v1->headdrop;
708 	}
709 	if (ctl->limit) {
710 		limit = min_t(u32, ctl->limit, maxdepth * maxflows);
711 		maxflows = min_t(u32, maxflows, limit);
712 	}
713 	if (limit == 1) {
714 		sch_tree_unlock(sch);
715 		kfree(p);
716 		NL_SET_ERR_MSG_MOD(extack, "invalid limit");
717 		return -EINVAL;
718 	}
719 
720 	/* commit configuration */
721 	q->limit = limit;
722 	q->divisor = divisor;
723 	q->headdrop = headdrop;
724 	q->maxdepth = maxdepth;
725 	q->maxflows = maxflows;
726 	WRITE_ONCE(q->perturb_period, perturb_period);
727 	q->quantum = quantum;
728 	q->flags = flags;
729 	if (p)
730 		swap(q->red_parms, p);
731 
732 	qlen = sch->q.qlen;
733 	while (sch->q.qlen > q->limit) {
734 		dropped += sfq_drop(sch, &to_free);
735 		if (!tail)
736 			tail = to_free;
737 	}
738 
739 	rtnl_kfree_skbs(to_free, tail);
740 	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
741 
742 	timer_delete(&q->perturb_timer);
743 	if (q->perturb_period) {
744 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
745 		get_random_bytes(&q->perturbation, sizeof(q->perturbation));
746 	}
747 	sch_tree_unlock(sch);
748 	kfree(p);
749 	return 0;
750 }
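/* Editorial aside, not part of the original file: the knobs validated above
 * correspond to the tc-sfq(8) options; assuming a reasonably recent
 * iproute2, a configuration exercising most of them could look roughly like
 *
 *   tc qdisc add dev eth0 root sfq limit 200 depth 50 flows 256 \
 *           divisor 1024 quantum 1514 perturb 10 headdrop
 *
 * As the checks above enforce, limit is clamped to depth * flows and flows
 * to limit, and the divisor must be a power of two no larger than 65536.
 */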
751 
752 static void *sfq_alloc(size_t sz)
753 {
754 	return  kvmalloc(sz, GFP_KERNEL);
755 }
756 
757 static void sfq_free(void *addr)
758 {
759 	kvfree(addr);
760 }
761 
762 static void sfq_destroy(struct Qdisc *sch)
763 {
764 	struct sfq_sched_data *q = qdisc_priv(sch);
765 
766 	tcf_block_put(q->block);
767 	WRITE_ONCE(q->perturb_period, 0);
768 	timer_delete_sync(&q->perturb_timer);
769 	sfq_free(q->ht);
770 	sfq_free(q->slots);
771 	kfree(q->red_parms);
772 }
773 
774 static int sfq_init(struct Qdisc *sch, struct nlattr *opt,
775 		    struct netlink_ext_ack *extack)
776 {
777 	struct sfq_sched_data *q = qdisc_priv(sch);
778 	int i;
779 	int err;
780 
781 	q->sch = sch;
782 	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
783 
784 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
785 	if (err)
786 		return err;
787 
788 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
789 		q->dep[i].next = i + SFQ_MAX_FLOWS;
790 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
791 	}
792 
793 	q->limit = SFQ_MAX_DEPTH;
794 	q->maxdepth = SFQ_MAX_DEPTH;
795 	q->cur_depth = 0;
796 	q->tail = NULL;
797 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
798 	q->maxflows = SFQ_DEFAULT_FLOWS;
799 	q->quantum = psched_mtu(qdisc_dev(sch));
800 	q->perturb_period = 0;
801 	get_random_bytes(&q->perturbation, sizeof(q->perturbation));
802 
803 	if (opt) {
804 		int err = sfq_change(sch, opt, extack);
805 		if (err)
806 			return err;
807 	}
808 
809 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
810 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
811 	if (!q->ht || !q->slots) {
812 		/* Note: sfq_destroy() will be called by our caller */
813 		return -ENOMEM;
814 	}
815 
816 	for (i = 0; i < q->divisor; i++)
817 		q->ht[i] = SFQ_EMPTY_SLOT;
818 
819 	for (i = 0; i < q->maxflows; i++) {
820 		slot_queue_init(&q->slots[i]);
821 		sfq_link(q, i);
822 	}
823 	if (q->limit >= 1)
824 		sch->flags |= TCQ_F_CAN_BYPASS;
825 	else
826 		sch->flags &= ~TCQ_F_CAN_BYPASS;
827 	return 0;
828 }
829 
830 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
831 {
832 	struct sfq_sched_data *q = qdisc_priv(sch);
833 	unsigned char *b = skb_tail_pointer(skb);
834 	struct tc_sfq_qopt_v1 opt;
835 	struct red_parms *p = q->red_parms;
836 
837 	memset(&opt, 0, sizeof(opt));
838 	opt.v0.quantum	= q->quantum;
839 	opt.v0.perturb_period = q->perturb_period / HZ;
840 	opt.v0.limit	= q->limit;
841 	opt.v0.divisor	= q->divisor;
842 	opt.v0.flows	= q->maxflows;
843 	opt.depth	= q->maxdepth;
844 	opt.headdrop	= q->headdrop;
845 
846 	if (p) {
847 		opt.qth_min	= p->qth_min >> p->Wlog;
848 		opt.qth_max	= p->qth_max >> p->Wlog;
849 		opt.Wlog	= p->Wlog;
850 		opt.Plog	= p->Plog;
851 		opt.Scell_log	= p->Scell_log;
852 		opt.max_P	= p->max_P;
853 	}
854 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
855 	opt.flags	= q->flags;
856 
857 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
858 		goto nla_put_failure;
859 
860 	return skb->len;
861 
862 nla_put_failure:
863 	nlmsg_trim(skb, b);
864 	return -1;
865 }
866 
867 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
868 {
869 	return NULL;
870 }
871 
872 static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
873 {
874 	return 0;
875 }
876 
877 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
878 			      u32 classid)
879 {
880 	return 0;
881 }
882 
883 static void sfq_unbind(struct Qdisc *q, unsigned long cl)
884 {
885 }
886 
887 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl,
888 				       struct netlink_ext_ack *extack)
889 {
890 	struct sfq_sched_data *q = qdisc_priv(sch);
891 
892 	if (cl)
893 		return NULL;
894 	return q->block;
895 }
896 
897 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
898 			  struct sk_buff *skb, struct tcmsg *tcm)
899 {
900 	tcm->tcm_handle |= TC_H_MIN(cl);
901 	return 0;
902 }
903 
904 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
905 				struct gnet_dump *d)
906 {
907 	struct sfq_sched_data *q = qdisc_priv(sch);
908 	sfq_index idx = q->ht[cl - 1];
909 	struct gnet_stats_queue qs = { 0 };
910 	struct tc_sfq_xstats xstats = { 0 };
911 
912 	if (idx != SFQ_EMPTY_SLOT) {
913 		const struct sfq_slot *slot = &q->slots[idx];
914 
915 		xstats.allot = slot->allot;
916 		qs.qlen = slot->qlen;
917 		qs.backlog = slot->backlog;
918 	}
919 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
920 		return -1;
921 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
922 }
923 
924 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
925 {
926 	struct sfq_sched_data *q = qdisc_priv(sch);
927 	unsigned int i;
928 
929 	if (arg->stop)
930 		return;
931 
932 	for (i = 0; i < q->divisor; i++) {
933 		if (q->ht[i] == SFQ_EMPTY_SLOT) {
934 			arg->count++;
935 			continue;
936 		}
937 		if (!tc_qdisc_stats_dump(sch, i + 1, arg))
938 			break;
939 	}
940 }
941 
942 static const struct Qdisc_class_ops sfq_class_ops = {
943 	.leaf		=	sfq_leaf,
944 	.find		=	sfq_find,
945 	.tcf_block	=	sfq_tcf_block,
946 	.bind_tcf	=	sfq_bind,
947 	.unbind_tcf	=	sfq_unbind,
948 	.dump		=	sfq_dump_class,
949 	.dump_stats	=	sfq_dump_class_stats,
950 	.walk		=	sfq_walk,
951 };
952 
953 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
954 	.cl_ops		=	&sfq_class_ops,
955 	.id		=	"sfq",
956 	.priv_size	=	sizeof(struct sfq_sched_data),
957 	.enqueue	=	sfq_enqueue,
958 	.dequeue	=	sfq_dequeue,
959 	.peek		=	qdisc_peek_dequeued,
960 	.init		=	sfq_init,
961 	.reset		=	sfq_reset,
962 	.destroy	=	sfq_destroy,
963 	.change		=	NULL,
964 	.dump		=	sfq_dump,
965 	.owner		=	THIS_MODULE,
966 };
967 MODULE_ALIAS_NET_SCH("sfq");
968 
969 static int __init sfq_module_init(void)
970 {
971 	return register_qdisc(&sfq_qdisc_ops);
972 }
973 static void __exit sfq_module_exit(void)
974 {
975 	unregister_qdisc(&sfq_qdisc_ops);
976 }
977 module_init(sfq_module_init)
978 module_exit(sfq_module_exit)
979 MODULE_LICENSE("GPL");
980 MODULE_DESCRIPTION("Stochastic Fairness qdisc");
981