xref: /linux/net/sched/sch_sfq.c (revision 9cfc5c90ad38c8fc11bfd39de42a107da00871ba)
1 /*
2  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  */
11 
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
17 #include <linux/in.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/skbuff.h>
21 #include <linux/jhash.h>
22 #include <linux/slab.h>
23 #include <linux/vmalloc.h>
24 #include <net/netlink.h>
25 #include <net/pkt_sched.h>
26 #include <net/red.h>
27 
28 
29 /*	Stochastic Fairness Queuing algorithm.
30 	=======================================
31 
32 	Source:
33 	Paul E. McKenney "Stochastic Fairness Queuing",
34 	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
35 
36 	Paul E. McKenney "Stochastic Fairness Queuing",
37 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
38 
39 
40 	See also:
41 	M. Shreedhar and George Varghese "Efficient Fair
42 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
43 
44 
45 	This is not the thing that is usually called (W)FQ nowadays.
46 	It does not use any timestamp mechanism, but instead
47 	processes queues in round-robin order.
48 
49 	ADVANTAGE:
50 
51 	- It is very cheap. Both CPU and memory requirements are minimal.
52 
53 	DRAWBACKS:
54 
55 	- "Stochastic" -> It is not 100% fair.
56 	When hash collisions occur, several flows are considered as one.
57 
58 	- "Round-robin" -> It introduces larger delays than virtual clock
59 	based schemes, and should not be used for isolating interactive
60 	traffic	from non-interactive. It means, that this scheduler
61 	should be used as leaf of CBQ or P3, which put interactive traffic
62 	to higher priority band.
63 
64 	We still need true WFQ for top level CSZ, but using WFQ
65 	for the best effort traffic is absolutely pointless:
66 	SFQ is superior for this purpose.
67 
68 	IMPLEMENTATION:
69 	This implementation limits :
70 	- maximal queue length per flow to 127 packets.
71 	- max mtu to 2^18-1;
72 	- max 65408 flows,
73 	- number of hash buckets to 65536.
74 
75 	It is easy to increase these values, but not in flight.  */
76 
77 #define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
78 #define SFQ_DEFAULT_FLOWS	128
79 #define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
80 #define SFQ_EMPTY_SLOT		0xffff
81 #define SFQ_DEFAULT_HASH_DIVISOR 1024
82 
83 /* We use 16 bits to store allot, and want to handle packets up to 64K
84  * Scale allot by 8 (1<<3) so that no overflow occurs.
85  */
86 #define SFQ_ALLOT_SHIFT		3
87 #define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
88 
89 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
90 typedef u16 sfq_index;
91 
92 /*
93  * We dont use pointers to save space.
94  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
95  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
96  * are 'pointers' to dep[] array
97  */
98 struct sfq_head {
99 	sfq_index	next;
100 	sfq_index	prev;
101 };
102 
103 struct sfq_slot {
104 	struct sk_buff	*skblist_next;
105 	struct sk_buff	*skblist_prev;
106 	sfq_index	qlen; /* number of skbs in skblist */
107 	sfq_index	next; /* next slot in sfq RR chain */
108 	struct sfq_head dep; /* anchor in dep[] chains */
109 	unsigned short	hash; /* hash value (index in ht[]) */
110 	short		allot; /* credit for this slot */
111 
112 	unsigned int    backlog;
113 	struct red_vars vars;
114 };
115 
116 struct sfq_sched_data {
117 /* frequently used fields */
118 	int		limit;		/* limit of total number of packets in this qdisc */
119 	unsigned int	divisor;	/* number of slots in hash table */
120 	u8		headdrop;
121 	u8		maxdepth;	/* limit of packets per flow */
122 
123 	u32		perturbation;
124 	u8		cur_depth;	/* depth of longest slot */
125 	u8		flags;
126 	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
127 	struct tcf_proto __rcu *filter_list;
128 	sfq_index	*ht;		/* Hash table ('divisor' slots) */
129 	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */
130 
131 	struct red_parms *red_parms;
132 	struct tc_sfqred_stats stats;
133 	struct sfq_slot *tail;		/* current slot in round */
134 
135 	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
136 					/* Linked lists of slots, indexed by depth
137 					 * dep[0] : list of unused flows
138 					 * dep[1] : list of flows with 1 packet
139 					 * dep[X] : list of flows with X packets
140 					 */
141 
142 	unsigned int	maxflows;	/* number of flows in flows array */
143 	int		perturb_period;
144 	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
145 	struct timer_list perturb_timer;
146 };
147 
148 /*
149  * sfq_head are either in a sfq_slot or in dep[] array
150  */
151 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
152 {
153 	if (val < SFQ_MAX_FLOWS)
154 		return &q->slots[val].dep;
155 	return &q->dep[val - SFQ_MAX_FLOWS];
156 }
157 
158 static unsigned int sfq_hash(const struct sfq_sched_data *q,
159 			     const struct sk_buff *skb)
160 {
161 	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
162 }
163 
164 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
165 				 int *qerr)
166 {
167 	struct sfq_sched_data *q = qdisc_priv(sch);
168 	struct tcf_result res;
169 	struct tcf_proto *fl;
170 	int result;
171 
172 	if (TC_H_MAJ(skb->priority) == sch->handle &&
173 	    TC_H_MIN(skb->priority) > 0 &&
174 	    TC_H_MIN(skb->priority) <= q->divisor)
175 		return TC_H_MIN(skb->priority);
176 
177 	fl = rcu_dereference_bh(q->filter_list);
178 	if (!fl)
179 		return sfq_hash(q, skb) + 1;
180 
181 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
182 	result = tc_classify(skb, fl, &res, false);
183 	if (result >= 0) {
184 #ifdef CONFIG_NET_CLS_ACT
185 		switch (result) {
186 		case TC_ACT_STOLEN:
187 		case TC_ACT_QUEUED:
188 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
189 		case TC_ACT_SHOT:
190 			return 0;
191 		}
192 #endif
193 		if (TC_H_MIN(res.classid) <= q->divisor)
194 			return TC_H_MIN(res.classid);
195 	}
196 	return 0;
197 }
198 
199 /*
200  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
201  */
202 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
203 {
204 	sfq_index p, n;
205 	struct sfq_slot *slot = &q->slots[x];
206 	int qlen = slot->qlen;
207 
208 	p = qlen + SFQ_MAX_FLOWS;
209 	n = q->dep[qlen].next;
210 
211 	slot->dep.next = n;
212 	slot->dep.prev = p;
213 
214 	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
215 	sfq_dep_head(q, n)->prev = x;
216 }
217 
218 #define sfq_unlink(q, x, n, p)			\
219 	do {					\
220 		n = q->slots[x].dep.next;	\
221 		p = q->slots[x].dep.prev;	\
222 		sfq_dep_head(q, p)->next = n;	\
223 		sfq_dep_head(q, n)->prev = p;	\
224 	} while (0)
225 
226 
227 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
228 {
229 	sfq_index p, n;
230 	int d;
231 
232 	sfq_unlink(q, x, n, p);
233 
234 	d = q->slots[x].qlen--;
235 	if (n == p && q->cur_depth == d)
236 		q->cur_depth--;
237 	sfq_link(q, x);
238 }
239 
240 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
241 {
242 	sfq_index p, n;
243 	int d;
244 
245 	sfq_unlink(q, x, n, p);
246 
247 	d = ++q->slots[x].qlen;
248 	if (q->cur_depth < d)
249 		q->cur_depth = d;
250 	sfq_link(q, x);
251 }
252 
253 /* helper functions : might be changed when/if skb use a standard list_head */
254 
255 /* remove one skb from tail of slot queue */
256 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
257 {
258 	struct sk_buff *skb = slot->skblist_prev;
259 
260 	slot->skblist_prev = skb->prev;
261 	skb->prev->next = (struct sk_buff *)slot;
262 	skb->next = skb->prev = NULL;
263 	return skb;
264 }
265 
266 /* remove one skb from head of slot queue */
267 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
268 {
269 	struct sk_buff *skb = slot->skblist_next;
270 
271 	slot->skblist_next = skb->next;
272 	skb->next->prev = (struct sk_buff *)slot;
273 	skb->next = skb->prev = NULL;
274 	return skb;
275 }
276 
277 static inline void slot_queue_init(struct sfq_slot *slot)
278 {
279 	memset(slot, 0, sizeof(*slot));
280 	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
281 }
282 
283 /* add skb to slot queue (tail add) */
284 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
285 {
286 	skb->prev = slot->skblist_prev;
287 	skb->next = (struct sk_buff *)slot;
288 	slot->skblist_prev->next = skb;
289 	slot->skblist_prev = skb;
290 }
291 
292 static unsigned int sfq_drop(struct Qdisc *sch)
293 {
294 	struct sfq_sched_data *q = qdisc_priv(sch);
295 	sfq_index x, d = q->cur_depth;
296 	struct sk_buff *skb;
297 	unsigned int len;
298 	struct sfq_slot *slot;
299 
300 	/* Queue is full! Find the longest slot and drop tail packet from it */
301 	if (d > 1) {
302 		x = q->dep[d].next;
303 		slot = &q->slots[x];
304 drop:
305 		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
306 		len = qdisc_pkt_len(skb);
307 		slot->backlog -= len;
308 		sfq_dec(q, x);
309 		sch->q.qlen--;
310 		qdisc_qstats_drop(sch);
311 		qdisc_qstats_backlog_dec(sch, skb);
312 		kfree_skb(skb);
313 		return len;
314 	}
315 
316 	if (d == 1) {
317 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
318 		x = q->tail->next;
319 		slot = &q->slots[x];
320 		q->tail->next = slot->next;
321 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
322 		goto drop;
323 	}
324 
325 	return 0;
326 }
327 
328 /* Is ECN parameter configured */
329 static int sfq_prob_mark(const struct sfq_sched_data *q)
330 {
331 	return q->flags & TC_RED_ECN;
332 }
333 
334 /* Should packets over max threshold just be marked */
335 static int sfq_hard_mark(const struct sfq_sched_data *q)
336 {
337 	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
338 }
339 
340 static int sfq_headdrop(const struct sfq_sched_data *q)
341 {
342 	return q->headdrop;
343 }
344 
345 static int
346 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
347 {
348 	struct sfq_sched_data *q = qdisc_priv(sch);
349 	unsigned int hash;
350 	sfq_index x, qlen;
351 	struct sfq_slot *slot;
352 	int uninitialized_var(ret);
353 	struct sk_buff *head;
354 	int delta;
355 
356 	hash = sfq_classify(skb, sch, &ret);
357 	if (hash == 0) {
358 		if (ret & __NET_XMIT_BYPASS)
359 			qdisc_qstats_drop(sch);
360 		kfree_skb(skb);
361 		return ret;
362 	}
363 	hash--;
364 
365 	x = q->ht[hash];
366 	slot = &q->slots[x];
367 	if (x == SFQ_EMPTY_SLOT) {
368 		x = q->dep[0].next; /* get a free slot */
369 		if (x >= SFQ_MAX_FLOWS)
370 			return qdisc_drop(skb, sch);
371 		q->ht[hash] = x;
372 		slot = &q->slots[x];
373 		slot->hash = hash;
374 		slot->backlog = 0; /* should already be 0 anyway... */
375 		red_set_vars(&slot->vars);
376 		goto enqueue;
377 	}
378 	if (q->red_parms) {
379 		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
380 							&slot->vars,
381 							slot->backlog);
382 		switch (red_action(q->red_parms,
383 				   &slot->vars,
384 				   slot->vars.qavg)) {
385 		case RED_DONT_MARK:
386 			break;
387 
388 		case RED_PROB_MARK:
389 			qdisc_qstats_overlimit(sch);
390 			if (sfq_prob_mark(q)) {
391 				/* We know we have at least one packet in queue */
392 				if (sfq_headdrop(q) &&
393 				    INET_ECN_set_ce(slot->skblist_next)) {
394 					q->stats.prob_mark_head++;
395 					break;
396 				}
397 				if (INET_ECN_set_ce(skb)) {
398 					q->stats.prob_mark++;
399 					break;
400 				}
401 			}
402 			q->stats.prob_drop++;
403 			goto congestion_drop;
404 
405 		case RED_HARD_MARK:
406 			qdisc_qstats_overlimit(sch);
407 			if (sfq_hard_mark(q)) {
408 				/* We know we have at least one packet in queue */
409 				if (sfq_headdrop(q) &&
410 				    INET_ECN_set_ce(slot->skblist_next)) {
411 					q->stats.forced_mark_head++;
412 					break;
413 				}
414 				if (INET_ECN_set_ce(skb)) {
415 					q->stats.forced_mark++;
416 					break;
417 				}
418 			}
419 			q->stats.forced_drop++;
420 			goto congestion_drop;
421 		}
422 	}
423 
424 	if (slot->qlen >= q->maxdepth) {
425 congestion_drop:
426 		if (!sfq_headdrop(q))
427 			return qdisc_drop(skb, sch);
428 
429 		/* We know we have at least one packet in queue */
430 		head = slot_dequeue_head(slot);
431 		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
432 		sch->qstats.backlog -= delta;
433 		slot->backlog -= delta;
434 		qdisc_drop(head, sch);
435 
436 		slot_queue_add(slot, skb);
437 		return NET_XMIT_CN;
438 	}
439 
440 enqueue:
441 	qdisc_qstats_backlog_inc(sch, skb);
442 	slot->backlog += qdisc_pkt_len(skb);
443 	slot_queue_add(slot, skb);
444 	sfq_inc(q, x);
445 	if (slot->qlen == 1) {		/* The flow is new */
446 		if (q->tail == NULL) {	/* It is the first flow */
447 			slot->next = x;
448 		} else {
449 			slot->next = q->tail->next;
450 			q->tail->next = x;
451 		}
452 		/* We put this flow at the end of our flow list.
453 		 * This might sound unfair for a new flow to wait after old ones,
454 		 * but we could endup servicing new flows only, and freeze old ones.
455 		 */
456 		q->tail = slot;
457 		/* We could use a bigger initial quantum for new flows */
458 		slot->allot = q->scaled_quantum;
459 	}
460 	if (++sch->q.qlen <= q->limit)
461 		return NET_XMIT_SUCCESS;
462 
463 	qlen = slot->qlen;
464 	sfq_drop(sch);
465 	/* Return Congestion Notification only if we dropped a packet
466 	 * from this flow.
467 	 */
468 	if (qlen != slot->qlen)
469 		return NET_XMIT_CN;
470 
471 	/* As we dropped a packet, better let upper stack know this */
472 	qdisc_tree_decrease_qlen(sch, 1);
473 	return NET_XMIT_SUCCESS;
474 }
475 
476 static struct sk_buff *
477 sfq_dequeue(struct Qdisc *sch)
478 {
479 	struct sfq_sched_data *q = qdisc_priv(sch);
480 	struct sk_buff *skb;
481 	sfq_index a, next_a;
482 	struct sfq_slot *slot;
483 
484 	/* No active slots */
485 	if (q->tail == NULL)
486 		return NULL;
487 
488 next_slot:
489 	a = q->tail->next;
490 	slot = &q->slots[a];
491 	if (slot->allot <= 0) {
492 		q->tail = slot;
493 		slot->allot += q->scaled_quantum;
494 		goto next_slot;
495 	}
496 	skb = slot_dequeue_head(slot);
497 	sfq_dec(q, a);
498 	qdisc_bstats_update(sch, skb);
499 	sch->q.qlen--;
500 	qdisc_qstats_backlog_dec(sch, skb);
501 	slot->backlog -= qdisc_pkt_len(skb);
502 	/* Is the slot empty? */
503 	if (slot->qlen == 0) {
504 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
505 		next_a = slot->next;
506 		if (a == next_a) {
507 			q->tail = NULL; /* no more active slots */
508 			return skb;
509 		}
510 		q->tail->next = next_a;
511 	} else {
512 		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
513 	}
514 	return skb;
515 }
516 
517 static void
518 sfq_reset(struct Qdisc *sch)
519 {
520 	struct sk_buff *skb;
521 
522 	while ((skb = sfq_dequeue(sch)) != NULL)
523 		kfree_skb(skb);
524 }
525 
526 /*
527  * When q->perturbation is changed, we rehash all queued skbs
528  * to avoid OOO (Out Of Order) effects.
529  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
530  * counters.
531  */
532 static void sfq_rehash(struct Qdisc *sch)
533 {
534 	struct sfq_sched_data *q = qdisc_priv(sch);
535 	struct sk_buff *skb;
536 	int i;
537 	struct sfq_slot *slot;
538 	struct sk_buff_head list;
539 	int dropped = 0;
540 
541 	__skb_queue_head_init(&list);
542 
543 	for (i = 0; i < q->maxflows; i++) {
544 		slot = &q->slots[i];
545 		if (!slot->qlen)
546 			continue;
547 		while (slot->qlen) {
548 			skb = slot_dequeue_head(slot);
549 			sfq_dec(q, i);
550 			__skb_queue_tail(&list, skb);
551 		}
552 		slot->backlog = 0;
553 		red_set_vars(&slot->vars);
554 		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
555 	}
556 	q->tail = NULL;
557 
558 	while ((skb = __skb_dequeue(&list)) != NULL) {
559 		unsigned int hash = sfq_hash(q, skb);
560 		sfq_index x = q->ht[hash];
561 
562 		slot = &q->slots[x];
563 		if (x == SFQ_EMPTY_SLOT) {
564 			x = q->dep[0].next; /* get a free slot */
565 			if (x >= SFQ_MAX_FLOWS) {
566 drop:
567 				qdisc_qstats_backlog_dec(sch, skb);
568 				kfree_skb(skb);
569 				dropped++;
570 				continue;
571 			}
572 			q->ht[hash] = x;
573 			slot = &q->slots[x];
574 			slot->hash = hash;
575 		}
576 		if (slot->qlen >= q->maxdepth)
577 			goto drop;
578 		slot_queue_add(slot, skb);
579 		if (q->red_parms)
580 			slot->vars.qavg = red_calc_qavg(q->red_parms,
581 							&slot->vars,
582 							slot->backlog);
583 		slot->backlog += qdisc_pkt_len(skb);
584 		sfq_inc(q, x);
585 		if (slot->qlen == 1) {		/* The flow is new */
586 			if (q->tail == NULL) {	/* It is the first flow */
587 				slot->next = x;
588 			} else {
589 				slot->next = q->tail->next;
590 				q->tail->next = x;
591 			}
592 			q->tail = slot;
593 			slot->allot = q->scaled_quantum;
594 		}
595 	}
596 	sch->q.qlen -= dropped;
597 	qdisc_tree_decrease_qlen(sch, dropped);
598 }
599 
600 static void sfq_perturbation(unsigned long arg)
601 {
602 	struct Qdisc *sch = (struct Qdisc *)arg;
603 	struct sfq_sched_data *q = qdisc_priv(sch);
604 	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
605 
606 	spin_lock(root_lock);
607 	q->perturbation = prandom_u32();
608 	if (!q->filter_list && q->tail)
609 		sfq_rehash(sch);
610 	spin_unlock(root_lock);
611 
612 	if (q->perturb_period)
613 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
614 }
615 
616 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
617 {
618 	struct sfq_sched_data *q = qdisc_priv(sch);
619 	struct tc_sfq_qopt *ctl = nla_data(opt);
620 	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
621 	unsigned int qlen;
622 	struct red_parms *p = NULL;
623 
624 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
625 		return -EINVAL;
626 	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
627 		ctl_v1 = nla_data(opt);
628 	if (ctl->divisor &&
629 	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
630 		return -EINVAL;
631 	if (ctl_v1 && ctl_v1->qth_min) {
632 		p = kmalloc(sizeof(*p), GFP_KERNEL);
633 		if (!p)
634 			return -ENOMEM;
635 	}
636 	sch_tree_lock(sch);
637 	if (ctl->quantum) {
638 		q->quantum = ctl->quantum;
639 		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
640 	}
641 	q->perturb_period = ctl->perturb_period * HZ;
642 	if (ctl->flows)
643 		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
644 	if (ctl->divisor) {
645 		q->divisor = ctl->divisor;
646 		q->maxflows = min_t(u32, q->maxflows, q->divisor);
647 	}
648 	if (ctl_v1) {
649 		if (ctl_v1->depth)
650 			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
651 		if (p) {
652 			swap(q->red_parms, p);
653 			red_set_parms(q->red_parms,
654 				      ctl_v1->qth_min, ctl_v1->qth_max,
655 				      ctl_v1->Wlog,
656 				      ctl_v1->Plog, ctl_v1->Scell_log,
657 				      NULL,
658 				      ctl_v1->max_P);
659 		}
660 		q->flags = ctl_v1->flags;
661 		q->headdrop = ctl_v1->headdrop;
662 	}
663 	if (ctl->limit) {
664 		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
665 		q->maxflows = min_t(u32, q->maxflows, q->limit);
666 	}
667 
668 	qlen = sch->q.qlen;
669 	while (sch->q.qlen > q->limit)
670 		sfq_drop(sch);
671 	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
672 
673 	del_timer(&q->perturb_timer);
674 	if (q->perturb_period) {
675 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
676 		q->perturbation = prandom_u32();
677 	}
678 	sch_tree_unlock(sch);
679 	kfree(p);
680 	return 0;
681 }
682 
683 static void *sfq_alloc(size_t sz)
684 {
685 	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
686 
687 	if (!ptr)
688 		ptr = vmalloc(sz);
689 	return ptr;
690 }
691 
692 static void sfq_free(void *addr)
693 {
694 	kvfree(addr);
695 }
696 
697 static void sfq_destroy(struct Qdisc *sch)
698 {
699 	struct sfq_sched_data *q = qdisc_priv(sch);
700 
701 	tcf_destroy_chain(&q->filter_list);
702 	q->perturb_period = 0;
703 	del_timer_sync(&q->perturb_timer);
704 	sfq_free(q->ht);
705 	sfq_free(q->slots);
706 	kfree(q->red_parms);
707 }
708 
709 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
710 {
711 	struct sfq_sched_data *q = qdisc_priv(sch);
712 	int i;
713 
714 	q->perturb_timer.function = sfq_perturbation;
715 	q->perturb_timer.data = (unsigned long)sch;
716 	init_timer_deferrable(&q->perturb_timer);
717 
718 	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
719 		q->dep[i].next = i + SFQ_MAX_FLOWS;
720 		q->dep[i].prev = i + SFQ_MAX_FLOWS;
721 	}
722 
723 	q->limit = SFQ_MAX_DEPTH;
724 	q->maxdepth = SFQ_MAX_DEPTH;
725 	q->cur_depth = 0;
726 	q->tail = NULL;
727 	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
728 	q->maxflows = SFQ_DEFAULT_FLOWS;
729 	q->quantum = psched_mtu(qdisc_dev(sch));
730 	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
731 	q->perturb_period = 0;
732 	q->perturbation = prandom_u32();
733 
734 	if (opt) {
735 		int err = sfq_change(sch, opt);
736 		if (err)
737 			return err;
738 	}
739 
740 	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
741 	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
742 	if (!q->ht || !q->slots) {
743 		sfq_destroy(sch);
744 		return -ENOMEM;
745 	}
746 	for (i = 0; i < q->divisor; i++)
747 		q->ht[i] = SFQ_EMPTY_SLOT;
748 
749 	for (i = 0; i < q->maxflows; i++) {
750 		slot_queue_init(&q->slots[i]);
751 		sfq_link(q, i);
752 	}
753 	if (q->limit >= 1)
754 		sch->flags |= TCQ_F_CAN_BYPASS;
755 	else
756 		sch->flags &= ~TCQ_F_CAN_BYPASS;
757 	return 0;
758 }
759 
760 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
761 {
762 	struct sfq_sched_data *q = qdisc_priv(sch);
763 	unsigned char *b = skb_tail_pointer(skb);
764 	struct tc_sfq_qopt_v1 opt;
765 	struct red_parms *p = q->red_parms;
766 
767 	memset(&opt, 0, sizeof(opt));
768 	opt.v0.quantum	= q->quantum;
769 	opt.v0.perturb_period = q->perturb_period / HZ;
770 	opt.v0.limit	= q->limit;
771 	opt.v0.divisor	= q->divisor;
772 	opt.v0.flows	= q->maxflows;
773 	opt.depth	= q->maxdepth;
774 	opt.headdrop	= q->headdrop;
775 
776 	if (p) {
777 		opt.qth_min	= p->qth_min >> p->Wlog;
778 		opt.qth_max	= p->qth_max >> p->Wlog;
779 		opt.Wlog	= p->Wlog;
780 		opt.Plog	= p->Plog;
781 		opt.Scell_log	= p->Scell_log;
782 		opt.max_P	= p->max_P;
783 	}
784 	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
785 	opt.flags	= q->flags;
786 
787 	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
788 		goto nla_put_failure;
789 
790 	return skb->len;
791 
792 nla_put_failure:
793 	nlmsg_trim(skb, b);
794 	return -1;
795 }
796 
797 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
798 {
799 	return NULL;
800 }
801 
802 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
803 {
804 	return 0;
805 }
806 
807 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
808 			      u32 classid)
809 {
810 	/* we cannot bypass queue discipline anymore */
811 	sch->flags &= ~TCQ_F_CAN_BYPASS;
812 	return 0;
813 }
814 
815 static void sfq_put(struct Qdisc *q, unsigned long cl)
816 {
817 }
818 
819 static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
820 					     unsigned long cl)
821 {
822 	struct sfq_sched_data *q = qdisc_priv(sch);
823 
824 	if (cl)
825 		return NULL;
826 	return &q->filter_list;
827 }
828 
829 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
830 			  struct sk_buff *skb, struct tcmsg *tcm)
831 {
832 	tcm->tcm_handle |= TC_H_MIN(cl);
833 	return 0;
834 }
835 
836 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
837 				struct gnet_dump *d)
838 {
839 	struct sfq_sched_data *q = qdisc_priv(sch);
840 	sfq_index idx = q->ht[cl - 1];
841 	struct gnet_stats_queue qs = { 0 };
842 	struct tc_sfq_xstats xstats = { 0 };
843 
844 	if (idx != SFQ_EMPTY_SLOT) {
845 		const struct sfq_slot *slot = &q->slots[idx];
846 
847 		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
848 		qs.qlen = slot->qlen;
849 		qs.backlog = slot->backlog;
850 	}
851 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
852 		return -1;
853 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
854 }
855 
856 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
857 {
858 	struct sfq_sched_data *q = qdisc_priv(sch);
859 	unsigned int i;
860 
861 	if (arg->stop)
862 		return;
863 
864 	for (i = 0; i < q->divisor; i++) {
865 		if (q->ht[i] == SFQ_EMPTY_SLOT ||
866 		    arg->count < arg->skip) {
867 			arg->count++;
868 			continue;
869 		}
870 		if (arg->fn(sch, i + 1, arg) < 0) {
871 			arg->stop = 1;
872 			break;
873 		}
874 		arg->count++;
875 	}
876 }
877 
878 static const struct Qdisc_class_ops sfq_class_ops = {
879 	.leaf		=	sfq_leaf,
880 	.get		=	sfq_get,
881 	.put		=	sfq_put,
882 	.tcf_chain	=	sfq_find_tcf,
883 	.bind_tcf	=	sfq_bind,
884 	.unbind_tcf	=	sfq_put,
885 	.dump		=	sfq_dump_class,
886 	.dump_stats	=	sfq_dump_class_stats,
887 	.walk		=	sfq_walk,
888 };
889 
890 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
891 	.cl_ops		=	&sfq_class_ops,
892 	.id		=	"sfq",
893 	.priv_size	=	sizeof(struct sfq_sched_data),
894 	.enqueue	=	sfq_enqueue,
895 	.dequeue	=	sfq_dequeue,
896 	.peek		=	qdisc_peek_dequeued,
897 	.drop		=	sfq_drop,
898 	.init		=	sfq_init,
899 	.reset		=	sfq_reset,
900 	.destroy	=	sfq_destroy,
901 	.change		=	NULL,
902 	.dump		=	sfq_dump,
903 	.owner		=	THIS_MODULE,
904 };
905 
906 static int __init sfq_module_init(void)
907 {
908 	return register_qdisc(&sfq_qdisc_ops);
909 }
910 static void __exit sfq_module_exit(void)
911 {
912 	unregister_qdisc(&sfq_qdisc_ops);
913 }
914 module_init(sfq_module_init)
915 module_exit(sfq_module_exit)
916 MODULE_LICENSE("GPL");
917