/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/flow_keys.h>
#include <net/red.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM '90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not what is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive traffic
	into a higher priority band.

	We still need true WFQ for the top level CSZ, but using WFQ
	for best-effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits:
	- the maximal queue length per flow to 127 packets,
	- the maximal MTU to 2^18-1,
	- the number of flows to 65408,
	- the number of hash buckets to 65536.

	It is easy to increase these values, but not on the fly.  */
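/* Usage sketch (not part of this file): with the iproute2 "tc" tool,
 * attaching SFQ as a root qdisc typically looks something like
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * where "eth0" is a placeholder device name and "perturb 10" requests a
 * rehash of flows every 10 seconds (see perturb_period below).
 */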

#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K.
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
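/* For example, a 1514 byte Ethernet frame costs
 * SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8) = 190 allot units,
 * so a 16 bit allot comfortably covers 64K packets.
 */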

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * To save space we don't use pointers.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' into the slots[] array,
 * while the following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' into the dep[] array.
 */
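/* For example, with SFQ_MAX_FLOWS == 65408, the sfq_index value 65410
 * denotes dep[2], i.e. the list of flows currently holding two packets.
 */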
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

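/* Note: an active slot also acts as the head of its own skb list. The
 * slot_queue_*() helpers below cast the slot to (struct sk_buff *), so
 * skblist_next/skblist_prev must remain the first two members, mirroring
 * the next/prev pointers at the start of struct sk_buff.
 */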
struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq RR chain */
	struct sfq_head dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	short		allot; /* credit for this slot */

	unsigned int    backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	u32		perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
	struct tcf_proto __rcu *filter_list;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
};

/*
 * sfq_head entries live either in an sfq_slot or in the dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}

/*
 * In order to be able to quickly rehash our queue when the timer changes
 * q->perturbation, we store the flow_keys in skb->cb[]
 */
struct sfq_skb_cb {
       struct flow_keys        keys;
};

static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
	return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
}

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
	unsigned int hash;

	hash = jhash_3words((__force u32)keys->dst,
			    (__force u32)keys->src ^ keys->ip_proto,
			    (__force u32)keys->ports, q->perturbation);
	return hash & (q->divisor - 1);
}

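/* Returns 0 when the packet should be dropped (or was consumed by a
 * classifier action), otherwise a 1-based bucket index in [1 .. q->divisor].
 */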
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl) {
		skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
		return sfq_hash(q, skb) + 1;
	}

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, fl, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
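			/* fall through */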
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)


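/* sfq_dec()/sfq_inc() move slot x to the dep[] list that matches its new
 * queue length, and keep q->cur_depth in sync with the longest slot.
 */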
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}

/* helper functions: might be changed when/if skb uses a standard list_head */

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}

static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_drop(sch);
		/* update stats before freeing: backlog_dec reads the skb */
		qdisc_qstats_backlog_dec(sch, skb);
		kfree_skb(skb);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is the ECN parameter configured? */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over the max threshold just be marked? */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}

static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int uninitialized_var(ret);
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
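	/* Optional per-flow RED/ECN (the SFQRED extension): only active when
	 * RED parameters were supplied via tc_sfq_qopt_v1 in sfq_change().
	 */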
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch);

		slot_queue_add(slot, skb);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * It might sound unfair for a new flow to wait behind old ones,
		 * but otherwise we could end up servicing new flows only and
		 * freezing the old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	sfq_drop(sch);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen)
		return NET_XMIT_CN;

	/* As we dropped a packet, better let the upper stack know about it */
	qdisc_tree_decrease_qlen(sch, 1);
	return NET_XMIT_SUCCESS;
}

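/* Dequeue in deficit round-robin fashion: each active slot spends its
 * allot (counted in SFQ_ALLOT_SIZE() units) and, once exhausted, is
 * refilled with scaled_quantum and the round moves on to the next slot.
 */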
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->scaled_quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_decrease_qlen(sch, dropped);
}

static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

	spin_lock(root_lock);
	q->perturbation = prandom_u32();
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

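/* Accept either the legacy tc_sfq_qopt or the extended tc_sfq_qopt_v1
 * (which additionally carries depth, headdrop and RED parameters); the two
 * formats are distinguished purely by the netlink attribute length.
 */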
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen;
	struct red_parms *p = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum) {
		q->quantum = ctl->quantum;
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	}
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = prandom_u32();
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}

static void *sfq_alloc(size_t sz)
{
	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);

	if (!ptr)
		ptr = vmalloc(sz);
	return ptr;
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = 0;
	q->perturbation = prandom_u32();

	if (opt) {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		sfq_destroy(sch);
		return -ENOMEM;
	}
	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
					     unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

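		/* report allot back in bytes, undoing the SFQ_ALLOT_SIZE() scaling */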
		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");