xref: /linux/net/sched/sch_sfq.c (revision fd639726bf15fca8ee1a00dce8e0096d0ad9bd18)
1 /*
2  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  */
11 
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
17 #include <linux/in.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/skbuff.h>
21 #include <linux/jhash.h>
22 #include <linux/slab.h>
23 #include <linux/vmalloc.h>
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/pkt_cls.h>
27 #include <net/red.h>
28 
29 
30 /*	Stochastic Fairness Queuing algorithm.
31 	=======================================
32 
33 	Source:
34 	Paul E. McKenney "Stochastic Fairness Queuing",
35 	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
36 
37 	Paul E. McKenney "Stochastic Fairness Queuing",
38 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
39 
40 
41 	See also:
42 	M. Shreedhar and George Varghese "Efficient Fair
43 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
44 
45 
46 	This is not the thing that is usually called (W)FQ nowadays.
47 	It does not use any timestamp mechanism, but instead
48 	processes queues in round-robin order.
49 
50 	ADVANTAGE:
51 
52 	- It is very cheap. Both CPU and memory requirements are minimal.
53 
54 	DRAWBACKS:
55 
56 	- "Stochastic" -> It is not 100% fair.
57 	When hash collisions occur, several flows are considered as one.
58 
59 	- "Round-robin" -> It introduces larger delays than virtual clock
60 	based schemes, and should not be used for isolating interactive
61 	traffic	from non-interactive. It means, that this scheduler
62 	should be used as leaf of CBQ or P3, which put interactive traffic
63 	to higher priority band.
64 
65 	We still need true WFQ for top level CSZ, but using WFQ
66 	for the best effort traffic is absolutely pointless:
67 	SFQ is superior for this purpose.
68 
69 	IMPLEMENTATION:
70 	This implementation limits :
71 	- maximal queue length per flow to 127 packets.
72 	- max mtu to 2^18-1;
73 	- max 65408 flows,
74 	- number of hash buckets to 65536.
75 
76 	It is easy to increase these values, but not in flight.  */
77 
78 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
79 #define SFQ_DEFAULT_FLOWS	128
80 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
81 #define SFQ_EMPTY_SLOT		0xffff
82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
83 
84 /* We use 16 bits to store allot, and want to handle packets up to 64K
85  * Scale allot by 8 (1<<3) so that no overflow occurs.
86  */
87 #define SFQ_ALLOT_SHIFT		3
88 #define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
89 
90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
91 typedef u16 sfq_index;
92 
93 /*
94  * We dont use pointers to save space.
95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
97  * are 'pointers' to dep[] array
98  */
99 struct sfq_head {
100 	sfq_index	next;
101 	sfq_index	prev;
102 };
103 
104 struct sfq_slot {
105 	struct sk_buff	*skblist_next;
106 	struct sk_buff	*skblist_prev;
107 	sfq_index	qlen; /* number of skbs in skblist */
108 	sfq_index	next; /* next slot in sfq RR chain */
109 	struct sfq_head dep; /* anchor in dep[] chains */
110 	unsigned short	hash; /* hash value (index in ht[]) */
111 	short		allot; /* credit for this slot */
112 
113 	unsigned int    backlog;
114 	struct red_vars vars;
115 };
116 
117 struct sfq_sched_data {
118 /* frequently used fields */
119 	int		limit;		/* limit of total number of packets in this qdisc */
120 	unsigned int	divisor;	/* number of slots in hash table */
121 	u8		headdrop;
122 	u8		maxdepth;	/* limit of packets per flow */
123 
124 	u32		perturbation;
125 	u8		cur_depth;	/* depth of longest slot */
126 	u8		flags;
127 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
128 	struct tcf_proto __rcu *filter_list;
129 	struct tcf_block *block;
130 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
131 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
132 
133 	struct red_parms *red_parms;
134 	struct tc_sfqred_stats stats;
135 	struct sfq_slot *tail;		/* current slot in round */
136 
137 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
138 					/* Linked lists of slots, indexed by depth
139 					 * dep[0] : list of unused flows
140 					 * dep[1] : list of flows with 1 packet
141 					 * dep[X] : list of flows with X packets
142 					 */
143 
144 	unsigned int	maxflows;	/* number of flows in flows array */
145 	int		perturb_period;
146 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
147 	struct timer_list perturb_timer;
148 	struct Qdisc	*sch;
149 };
150 
151 /*
152  * sfq_head are either in a sfq_slot or in dep[] array
153  */
154 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
155 {
156 	if (val < SFQ_MAX_FLOWS)
157 		return &q->slots[val].dep;
158 	return &q->dep[val - SFQ_MAX_FLOWS];
159 }
160 
161 static unsigned int sfq_hash(const struct sfq_sched_data *q,
162 			     const struct sk_buff *skb)
163 {
164 	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
165 }
166 
167 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
168 				 int *qerr)
169 {
170 	struct sfq_sched_data *q = qdisc_priv(sch);
171 	struct tcf_result res;
172 	struct tcf_proto *fl;
173 	int result;
174 
175 	if (TC_H_MAJ(skb->priority) == sch->handle &&
176 	    TC_H_MIN(skb->priority) > 0 &&
177 	    TC_H_MIN(skb->priority) <= q->divisor)
178 		return TC_H_MIN(skb->priority);
179 
180 	fl = rcu_dereference_bh(q->filter_list);
181 	if (!fl)
182 		return sfq_hash(q, skb) + 1;
183 
184 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
185 	result = tcf_classify(skb, fl, &res, false);
186 	if (result >= 0) {
187 #ifdef CONFIG_NET_CLS_ACT
188 		switch (result) {
189 		case TC_ACT_STOLEN:
190 		case TC_ACT_QUEUED:
191 		case TC_ACT_TRAP:
192 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
193 			/* fall through */
194 		case TC_ACT_SHOT:
195 			return 0;
196 		}
197 #endif
198 		if (TC_H_MIN(res.classid) <= q->divisor)
199 			return TC_H_MIN(res.classid);
200 	}
201 	return 0;
202 }
203 
204 /*
205  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
206  */
207 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
208 {
209 	sfq_index p, n;
210 	struct sfq_slot *slot = &q->slots[x];
211 	int qlen = slot->qlen;
212 
213 	p = qlen + SFQ_MAX_FLOWS;
214 	n = q->dep[qlen].next;
215 
216 	slot->dep.next = n;
217 	slot->dep.prev = p;
218 
219 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
220 	sfq_dep_head(q, n)->prev = x;
221 }
222 
223 #define sfq_unlink(q, x, n, p)			\
224 	do {					\
225 		n = q->slots[x].dep.next;	\
226 		p = q->slots[x].dep.prev;	\
227 		sfq_dep_head(q, p)->next = n;	\
228 		sfq_dep_head(q, n)->prev = p;	\
229 	} while (0)
230 
231 
232 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
233 {
234 	sfq_index p, n;
235 	int d;
236 
237 	sfq_unlink(q, x, n, p);
238 
239 	d = q->slots[x].qlen--;
240 	if (n == p && q->cur_depth == d)
241 		q->cur_depth--;
242 	sfq_link(q, x);
243 }
244 
245 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
246 {
247 	sfq_index p, n;
248 	int d;
249 
250 	sfq_unlink(q, x, n, p);
251 
252 	d = ++q->slots[x].qlen;
253 	if (q->cur_depth < d)
254 		q->cur_depth = d;
255 	sfq_link(q, x);
256 }
257 
258 /* helper functions : might be changed when/if skb use a standard list_head */
259 
260 /* remove one skb from tail of slot queue */
261 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
262 {
263 	struct sk_buff *skb = slot->skblist_prev;
264 
265 	slot->skblist_prev = skb->prev;
266 	skb->prev->next = (struct sk_buff *)slot;
267 	skb->next = skb->prev = NULL;
268 	return skb;
269 }
270 
271 /* remove one skb from head of slot queue */
272 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
273 {
274 	struct sk_buff *skb = slot->skblist_next;
275 
276 	slot->skblist_next = skb->next;
277 	skb->next->prev = (struct sk_buff *)slot;
278 	skb->next = skb->prev = NULL;
279 	return skb;
280 }
281 
282 static inline void slot_queue_init(struct sfq_slot *slot)
283 {
284 	memset(slot, 0, sizeof(*slot));
285 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
286 }
287 
288 /* add skb to slot queue (tail add) */
289 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
290 {
291 	skb->prev = slot->skblist_prev;
292 	skb->next = (struct sk_buff *)slot;
293 	slot->skblist_prev->next = skb;
294 	slot->skblist_prev = skb;
295 }
296 
297 static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free)
298 {
299 	struct sfq_sched_data *q = qdisc_priv(sch);
300 	sfq_index x, d = q->cur_depth;
301 	struct sk_buff *skb;
302 	unsigned int len;
303 	struct sfq_slot *slot;
304 
305 	/* Queue is full! Find the longest slot and drop tail packet from it */
306 	if (d > 1) {
307 		x = q->dep[d].next;
308 		slot = &q->slots[x];
309 drop:
310 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
311 		len = qdisc_pkt_len(skb);
312 		slot->backlog -= len;
313 		sfq_dec(q, x);
314 		sch->q.qlen--;
315 		qdisc_qstats_backlog_dec(sch, skb);
316 		qdisc_drop(skb, sch, to_free);
317 		return len;
318 	}
319 
320 	if (d == 1) {
321 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
322 		x = q->tail->next;
323 		slot = &q->slots[x];
324 		q->tail->next = slot->next;
325 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
326 		goto drop;
327 	}
328 
329 	return 0;
330 }
331 
332 /* Is ECN parameter configured */
333 static int sfq_prob_mark(const struct sfq_sched_data *q)
334 {
335 	return q->flags & TC_RED_ECN;
336 }
337 
338 /* Should packets over max threshold just be marked */
339 static int sfq_hard_mark(const struct sfq_sched_data *q)
340 {
341 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
342 }
343 
344 static int sfq_headdrop(const struct sfq_sched_data *q)
345 {
346 	return q->headdrop;
347 }
348 
349 static int
350 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
351 {
352 	struct sfq_sched_data *q = qdisc_priv(sch);
353 	unsigned int hash, dropped;
354 	sfq_index x, qlen;
355 	struct sfq_slot *slot;
356 	int uninitialized_var(ret);
357 	struct sk_buff *head;
358 	int delta;
359 
360 	hash = sfq_classify(skb, sch, &ret);
361 	if (hash == 0) {
362 		if (ret & __NET_XMIT_BYPASS)
363 			qdisc_qstats_drop(sch);
364 		__qdisc_drop(skb, to_free);
365 		return ret;
366 	}
367 	hash--;
368 
369 	x = q->ht[hash];
370 	slot = &q->slots[x];
371 	if (x == SFQ_EMPTY_SLOT) {
372 		x = q->dep[0].next; /* get a free slot */
373 		if (x >= SFQ_MAX_FLOWS)
374 			return qdisc_drop(skb, sch, to_free);
375 		q->ht[hash] = x;
376 		slot = &q->slots[x];
377 		slot->hash = hash;
378 		slot->backlog = 0; /* should already be 0 anyway... */
379 		red_set_vars(&slot->vars);
380 		goto enqueue;
381 	}
382 	if (q->red_parms) {
383 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
384 							&slot->vars,
385 							slot->backlog);
386 		switch (red_action(q->red_parms,
387 				   &slot->vars,
388 				   slot->vars.qavg)) {
389 		case RED_DONT_MARK:
390 			break;
391 
392 		case RED_PROB_MARK:
393 			qdisc_qstats_overlimit(sch);
394 			if (sfq_prob_mark(q)) {
395 				/* We know we have at least one packet in queue */
396 				if (sfq_headdrop(q) &&
397 				    INET_ECN_set_ce(slot->skblist_next)) {
398 					q->stats.prob_mark_head++;
399 					break;
400 				}
401 				if (INET_ECN_set_ce(skb)) {
402 					q->stats.prob_mark++;
403 					break;
404 				}
405 			}
406 			q->stats.prob_drop++;
407 			goto congestion_drop;
408 
409 		case RED_HARD_MARK:
410 			qdisc_qstats_overlimit(sch);
411 			if (sfq_hard_mark(q)) {
412 				/* We know we have at least one packet in queue */
413 				if (sfq_headdrop(q) &&
414 				    INET_ECN_set_ce(slot->skblist_next)) {
415 					q->stats.forced_mark_head++;
416 					break;
417 				}
418 				if (INET_ECN_set_ce(skb)) {
419 					q->stats.forced_mark++;
420 					break;
421 				}
422 			}
423 			q->stats.forced_drop++;
424 			goto congestion_drop;
425 		}
426 	}
427 
428 	if (slot->qlen >= q->maxdepth) {
429 congestion_drop:
430 		if (!sfq_headdrop(q))
431 			return qdisc_drop(skb, sch, to_free);
432 
433 		/* We know we have at least one packet in queue */
434 		head = slot_dequeue_head(slot);
435 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
436 		sch->qstats.backlog -= delta;
437 		slot->backlog -= delta;
438 		qdisc_drop(head, sch, to_free);
439 
440 		slot_queue_add(slot, skb);
441 		qdisc_tree_reduce_backlog(sch, 0, delta);
442 		return NET_XMIT_CN;
443 	}
444 
445 enqueue:
446 	qdisc_qstats_backlog_inc(sch, skb);
447 	slot->backlog += qdisc_pkt_len(skb);
448 	slot_queue_add(slot, skb);
449 	sfq_inc(q, x);
450 	if (slot->qlen == 1) {		/* The flow is new */
451 		if (q->tail == NULL) {	/* It is the first flow */
452 			slot->next = x;
453 		} else {
454 			slot->next = q->tail->next;
455 			q->tail->next = x;
456 		}
457 		/* We put this flow at the end of our flow list.
458 		 * This might sound unfair for a new flow to wait after old ones,
459 		 * but we could endup servicing new flows only, and freeze old ones.
460 		 */
461 		q->tail = slot;
462 		/* We could use a bigger initial quantum for new flows */
463 		slot->allot = q->scaled_quantum;
464 	}
465 	if (++sch->q.qlen <= q->limit)
466 		return NET_XMIT_SUCCESS;
467 
468 	qlen = slot->qlen;
469 	dropped = sfq_drop(sch, to_free);
470 	/* Return Congestion Notification only if we dropped a packet
471 	 * from this flow.
472 	 */
473 	if (qlen != slot->qlen) {
474 		qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
475 		return NET_XMIT_CN;
476 	}
477 
478 	/* As we dropped a packet, better let upper stack know this */
479 	qdisc_tree_reduce_backlog(sch, 1, dropped);
480 	return NET_XMIT_SUCCESS;
481 }
482 
483 static struct sk_buff *
484 sfq_dequeue(struct Qdisc *sch)
485 {
486 	struct sfq_sched_data *q = qdisc_priv(sch);
487 	struct sk_buff *skb;
488 	sfq_index a, next_a;
489 	struct sfq_slot *slot;
490 
491 	/* No active slots */
492 	if (q->tail == NULL)
493 		return NULL;
494 
495 next_slot:
496 	a = q->tail->next;
497 	slot = &q->slots[a];
498 	if (slot->allot <= 0) {
499 		q->tail = slot;
500 		slot->allot += q->scaled_quantum;
501 		goto next_slot;
502 	}
503 	skb = slot_dequeue_head(slot);
504 	sfq_dec(q, a);
505 	qdisc_bstats_update(sch, skb);
506 	sch->q.qlen--;
507 	qdisc_qstats_backlog_dec(sch, skb);
508 	slot->backlog -= qdisc_pkt_len(skb);
509 	/* Is the slot empty? */
510 	if (slot->qlen == 0) {
511 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
512 		next_a = slot->next;
513 		if (a == next_a) {
514 			q->tail = NULL; /* no more active slots */
515 			return skb;
516 		}
517 		q->tail->next = next_a;
518 	} else {
519 		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
520 	}
521 	return skb;
522 }
523 
524 static void
525 sfq_reset(struct Qdisc *sch)
526 {
527 	struct sk_buff *skb;
528 
529 	while ((skb = sfq_dequeue(sch)) != NULL)
530 		rtnl_kfree_skbs(skb, skb);
531 }
532 
533 /*
534  * When q->perturbation is changed, we rehash all queued skbs
535  * to avoid OOO (Out Of Order) effects.
536  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
537  * counters.
538  */
539 static void sfq_rehash(struct Qdisc *sch)
540 {
541 	struct sfq_sched_data *q = qdisc_priv(sch);
542 	struct sk_buff *skb;
543 	int i;
544 	struct sfq_slot *slot;
545 	struct sk_buff_head list;
546 	int dropped = 0;
547 	unsigned int drop_len = 0;
548 
549 	__skb_queue_head_init(&list);
550 
551 	for (i = 0; i < q->maxflows; i++) {
552 		slot = &q->slots[i];
553 		if (!slot->qlen)
554 			continue;
555 		while (slot->qlen) {
556 			skb = slot_dequeue_head(slot);
557 			sfq_dec(q, i);
558 			__skb_queue_tail(&list, skb);
559 		}
560 		slot->backlog = 0;
561 		red_set_vars(&slot->vars);
562 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
563 	}
564 	q->tail = NULL;
565 
566 	while ((skb = __skb_dequeue(&list)) != NULL) {
567 		unsigned int hash = sfq_hash(q, skb);
568 		sfq_index x = q->ht[hash];
569 
570 		slot = &q->slots[x];
571 		if (x == SFQ_EMPTY_SLOT) {
572 			x = q->dep[0].next; /* get a free slot */
573 			if (x >= SFQ_MAX_FLOWS) {
574 drop:
575 				qdisc_qstats_backlog_dec(sch, skb);
576 				drop_len += qdisc_pkt_len(skb);
577 				kfree_skb(skb);
578 				dropped++;
579 				continue;
580 			}
581 			q->ht[hash] = x;
582 			slot = &q->slots[x];
583 			slot->hash = hash;
584 		}
585 		if (slot->qlen >= q->maxdepth)
586 			goto drop;
587 		slot_queue_add(slot, skb);
588 		if (q->red_parms)
589 			slot->vars.qavg = red_calc_qavg(q->red_parms,
590 							&slot->vars,
591 							slot->backlog);
592 		slot->backlog += qdisc_pkt_len(skb);
593 		sfq_inc(q, x);
594 		if (slot->qlen == 1) {		/* The flow is new */
595 			if (q->tail == NULL) {	/* It is the first flow */
596 				slot->next = x;
597 			} else {
598 				slot->next = q->tail->next;
599 				q->tail->next = x;
600 			}
601 			q->tail = slot;
602 			slot->allot = q->scaled_quantum;
603 		}
604 	}
605 	sch->q.qlen -= dropped;
606 	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
607 }
608 
609 static void sfq_perturbation(struct timer_list *t)
610 {
611 	struct sfq_sched_data *q = from_timer(q, t, perturb_timer);
612 	struct Qdisc *sch = q->sch;
613 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
614 
615 	spin_lock(root_lock);
616 	q->perturbation = prandom_u32();
617 	if (!q->filter_list && q->tail)
618 		sfq_rehash(sch);
619 	spin_unlock(root_lock);
620 
621 	if (q->perturb_period)
622 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
623 }
624 
625 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
626 {
627 	struct sfq_sched_data *q = qdisc_priv(sch);
628 	struct tc_sfq_qopt *ctl = nla_data(opt);
629 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
630 	unsigned int qlen, dropped = 0;
631 	struct red_parms *p = NULL;
632 	struct sk_buff *to_free = NULL;
633 	struct sk_buff *tail = NULL;
634 
635 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
636 		return -EINVAL;
637 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
638 		ctl_v1 = nla_data(opt);
639 	if (ctl->divisor &&
640 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
641 		return -EINVAL;
642 	if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max,
643 					ctl_v1->Wlog))
644 		return -EINVAL;
645 	if (ctl_v1 && ctl_v1->qth_min) {
646 		p = kmalloc(sizeof(*p), GFP_KERNEL);
647 		if (!p)
648 			return -ENOMEM;
649 	}
650 	sch_tree_lock(sch);
651 	if (ctl->quantum) {
652 		q->quantum = ctl->quantum;
653 		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
654 	}
655 	q->perturb_period = ctl->perturb_period * HZ;
656 	if (ctl->flows)
657 		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
658 	if (ctl->divisor) {
659 		q->divisor = ctl->divisor;
660 		q->maxflows = min_t(u32, q->maxflows, q->divisor);
661 	}
662 	if (ctl_v1) {
663 		if (ctl_v1->depth)
664 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
665 		if (p) {
666 			swap(q->red_parms, p);
667 			red_set_parms(q->red_parms,
668 				      ctl_v1->qth_min, ctl_v1->qth_max,
669 				      ctl_v1->Wlog,
670 				      ctl_v1->Plog, ctl_v1->Scell_log,
671 				      NULL,
672 				      ctl_v1->max_P);
673 		}
674 		q->flags = ctl_v1->flags;
675 		q->headdrop = ctl_v1->headdrop;
676 	}
677 	if (ctl->limit) {
678 		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
679 		q->maxflows = min_t(u32, q->maxflows, q->limit);
680 	}
681 
682 	qlen = sch->q.qlen;
683 	while (sch->q.qlen > q->limit) {
684 		dropped += sfq_drop(sch, &to_free);
685 		if (!tail)
686 			tail = to_free;
687 	}
688 
689 	rtnl_kfree_skbs(to_free, tail);
690 	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
691 
692 	del_timer(&q->perturb_timer);
693 	if (q->perturb_period) {
694 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
695 		q->perturbation = prandom_u32();
696 	}
697 	sch_tree_unlock(sch);
698 	kfree(p);
699 	return 0;
700 }
701 
702 static void *sfq_alloc(size_t sz)
703 {
704 	return  kvmalloc(sz, GFP_KERNEL);
705 }
706 
707 static void sfq_free(void *addr)
708 {
709 	kvfree(addr);
710 }
711 
712 static void sfq_destroy(struct Qdisc *sch)
713 {
714 	struct sfq_sched_data *q = qdisc_priv(sch);
715 
716 	tcf_block_put(q->block);
717 	q->perturb_period = 0;
718 	del_timer_sync(&q->perturb_timer);
719 	sfq_free(q->ht);
720 	sfq_free(q->slots);
721 	kfree(q->red_parms);
722 }
723 
724 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
725 {
726 	struct sfq_sched_data *q = qdisc_priv(sch);
727 	int i;
728 	int err;
729 
730 	q->sch = sch;
731 	timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE);
732 
733 	err = tcf_block_get(&q->block, &q->filter_list, sch);
734 	if (err)
735 		return err;
736 
737 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
738 		q->dep[i].next = i + SFQ_MAX_FLOWS;
739 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
740 	}
741 
742 	q->limit = SFQ_MAX_DEPTH;
743 	q->maxdepth = SFQ_MAX_DEPTH;
744 	q->cur_depth = 0;
745 	q->tail = NULL;
746 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
747 	q->maxflows = SFQ_DEFAULT_FLOWS;
748 	q->quantum = psched_mtu(qdisc_dev(sch));
749 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
750 	q->perturb_period = 0;
751 	q->perturbation = prandom_u32();
752 
753 	if (opt) {
754 		int err = sfq_change(sch, opt);
755 		if (err)
756 			return err;
757 	}
758 
759 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
760 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
761 	if (!q->ht || !q->slots) {
762 		/* Note: sfq_destroy() will be called by our caller */
763 		return -ENOMEM;
764 	}
765 
766 	for (i = 0; i < q->divisor; i++)
767 		q->ht[i] = SFQ_EMPTY_SLOT;
768 
769 	for (i = 0; i < q->maxflows; i++) {
770 		slot_queue_init(&q->slots[i]);
771 		sfq_link(q, i);
772 	}
773 	if (q->limit >= 1)
774 		sch->flags |= TCQ_F_CAN_BYPASS;
775 	else
776 		sch->flags &= ~TCQ_F_CAN_BYPASS;
777 	return 0;
778 }
779 
780 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
781 {
782 	struct sfq_sched_data *q = qdisc_priv(sch);
783 	unsigned char *b = skb_tail_pointer(skb);
784 	struct tc_sfq_qopt_v1 opt;
785 	struct red_parms *p = q->red_parms;
786 
787 	memset(&opt, 0, sizeof(opt));
788 	opt.v0.quantum	= q->quantum;
789 	opt.v0.perturb_period = q->perturb_period / HZ;
790 	opt.v0.limit	= q->limit;
791 	opt.v0.divisor	= q->divisor;
792 	opt.v0.flows	= q->maxflows;
793 	opt.depth	= q->maxdepth;
794 	opt.headdrop	= q->headdrop;
795 
796 	if (p) {
797 		opt.qth_min	= p->qth_min >> p->Wlog;
798 		opt.qth_max	= p->qth_max >> p->Wlog;
799 		opt.Wlog	= p->Wlog;
800 		opt.Plog	= p->Plog;
801 		opt.Scell_log	= p->Scell_log;
802 		opt.max_P	= p->max_P;
803 	}
804 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
805 	opt.flags	= q->flags;
806 
807 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
808 		goto nla_put_failure;
809 
810 	return skb->len;
811 
812 nla_put_failure:
813 	nlmsg_trim(skb, b);
814 	return -1;
815 }
816 
817 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
818 {
819 	return NULL;
820 }
821 
822 static unsigned long sfq_find(struct Qdisc *sch, u32 classid)
823 {
824 	return 0;
825 }
826 
827 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
828 			      u32 classid)
829 {
830 	/* we cannot bypass queue discipline anymore */
831 	sch->flags &= ~TCQ_F_CAN_BYPASS;
832 	return 0;
833 }
834 
835 static void sfq_unbind(struct Qdisc *q, unsigned long cl)
836 {
837 }
838 
839 static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl)
840 {
841 	struct sfq_sched_data *q = qdisc_priv(sch);
842 
843 	if (cl)
844 		return NULL;
845 	return q->block;
846 }
847 
848 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
849 			  struct sk_buff *skb, struct tcmsg *tcm)
850 {
851 	tcm->tcm_handle |= TC_H_MIN(cl);
852 	return 0;
853 }
854 
855 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
856 				struct gnet_dump *d)
857 {
858 	struct sfq_sched_data *q = qdisc_priv(sch);
859 	sfq_index idx = q->ht[cl - 1];
860 	struct gnet_stats_queue qs = { 0 };
861 	struct tc_sfq_xstats xstats = { 0 };
862 
863 	if (idx != SFQ_EMPTY_SLOT) {
864 		const struct sfq_slot *slot = &q->slots[idx];
865 
866 		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
867 		qs.qlen = slot->qlen;
868 		qs.backlog = slot->backlog;
869 	}
870 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
871 		return -1;
872 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
873 }
874 
875 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
876 {
877 	struct sfq_sched_data *q = qdisc_priv(sch);
878 	unsigned int i;
879 
880 	if (arg->stop)
881 		return;
882 
883 	for (i = 0; i < q->divisor; i++) {
884 		if (q->ht[i] == SFQ_EMPTY_SLOT ||
885 		    arg->count < arg->skip) {
886 			arg->count++;
887 			continue;
888 		}
889 		if (arg->fn(sch, i + 1, arg) < 0) {
890 			arg->stop = 1;
891 			break;
892 		}
893 		arg->count++;
894 	}
895 }
896 
897 static const struct Qdisc_class_ops sfq_class_ops = {
898 	.leaf		=	sfq_leaf,
899 	.find		=	sfq_find,
900 	.tcf_block	=	sfq_tcf_block,
901 	.bind_tcf	=	sfq_bind,
902 	.unbind_tcf	=	sfq_unbind,
903 	.dump		=	sfq_dump_class,
904 	.dump_stats	=	sfq_dump_class_stats,
905 	.walk		=	sfq_walk,
906 };
907 
908 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
909 	.cl_ops		=	&sfq_class_ops,
910 	.id		=	"sfq",
911 	.priv_size	=	sizeof(struct sfq_sched_data),
912 	.enqueue	=	sfq_enqueue,
913 	.dequeue	=	sfq_dequeue,
914 	.peek		=	qdisc_peek_dequeued,
915 	.init		=	sfq_init,
916 	.reset		=	sfq_reset,
917 	.destroy	=	sfq_destroy,
918 	.change		=	NULL,
919 	.dump		=	sfq_dump,
920 	.owner		=	THIS_MODULE,
921 };
922 
923 static int __init sfq_module_init(void)
924 {
925 	return register_qdisc(&sfq_qdisc_ops);
926 }
927 static void __exit sfq_module_exit(void)
928 {
929 	unregister_qdisc(&sfq_qdisc_ops);
930 }
931 module_init(sfq_module_init)
932 module_exit(sfq_module_exit)
933 MODULE_LICENSE("GPL");
934