xref: /linux/net/sched/sch_sfq.c (revision e5c86679d5e864947a52fb31e45a425dea3e7fa9)
/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/red.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM '90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Internetworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM '95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive traffic
	into a higher priority band.

	We still need true WFQ for top level CSZ, but using WFQ
	for best-effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits:
	- maximal queue length per flow to 127 packets;
	- max mtu to 2^18-1;
	- max 65408 flows;
	- number of hash buckets to 65536.

	It is easy to increase these values, but not in flight.  */
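
/*
 * Illustrative user space setup (not part of this file): the qdisc is
 * typically attached with iproute2, e.g.
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * which installs SFQ with default parameters and changes the hash
 * perturbation every 10 seconds (see perturb_period below).
 */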

#define SFQ_MAX_DEPTH		127 /* max number of packets per flow */
#define SFQ_DEFAULT_FLOWS	128
#define SFQ_MAX_FLOWS		(0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
#define SFQ_EMPTY_SLOT		0xffff
#define SFQ_DEFAULT_HASH_DIVISOR 1024

/* We use 16 bits to store allot, and want to handle packets up to 64K
 * Scale allot by 8 (1<<3) so that no overflow occurs.
 */
#define SFQ_ALLOT_SHIFT		3
#define SFQ_ALLOT_SIZE(X)	DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
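/*
 * Worked example (illustration only): a 1514 byte Ethernet frame costs
 * SFQ_ALLOT_SIZE(1514) = DIV_ROUND_UP(1514, 8) = 190 scaled units, so a
 * 16 bit allot field comfortably covers the 64KB worst case (8192 units).
 */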

/* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
typedef u16 sfq_index;

/*
 * We don't use pointers to save space.
 * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
 * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
 * are 'pointers' to dep[] array
 */
struct sfq_head {
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_slot {
	struct sk_buff	*skblist_next;
	struct sk_buff	*skblist_prev;
	sfq_index	qlen; /* number of skbs in skblist */
	sfq_index	next; /* next slot in sfq RR chain */
	struct sfq_head dep; /* anchor in dep[] chains */
	unsigned short	hash; /* hash value (index in ht[]) */
	short		allot; /* credit for this slot */

	unsigned int    backlog;
	struct red_vars vars;
};

struct sfq_sched_data {
/* frequently used fields */
	int		limit;		/* limit of total number of packets in this qdisc */
	unsigned int	divisor;	/* number of slots in hash table */
	u8		headdrop;
	u8		maxdepth;	/* limit of packets per flow */

	u32		perturbation;
	u8		cur_depth;	/* depth of longest slot */
	u8		flags;
	unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
	struct tcf_proto __rcu *filter_list;
	sfq_index	*ht;		/* Hash table ('divisor' slots) */
	struct sfq_slot	*slots;		/* Flows table ('maxflows' entries) */

	struct red_parms *red_parms;
	struct tc_sfqred_stats stats;
	struct sfq_slot *tail;		/* current slot in round */

	struct sfq_head	dep[SFQ_MAX_DEPTH + 1];
					/* Linked lists of slots, indexed by depth
					 * dep[0] : list of unused flows
					 * dep[1] : list of flows with 1 packet
					 * dep[X] : list of flows with X packets
					 */

	unsigned int	maxflows;	/* number of flows in flows array */
	int		perturb_period;
	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
	struct timer_list perturb_timer;
};

/*
 * sfq_head entries are either in an sfq_slot or in the dep[] array
 */
static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
{
	if (val < SFQ_MAX_FLOWS)
		return &q->slots[val].dep;
	return &q->dep[val - SFQ_MAX_FLOWS];
}
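
/*
 * Example of the index mapping above (illustration only): val == 3 refers
 * to q->slots[3].dep, while val == SFQ_MAX_FLOWS + 2 refers to q->dep[2],
 * i.e. the list of slots currently holding exactly two packets.
 */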

static unsigned int sfq_hash(const struct sfq_sched_data *q,
			     const struct sk_buff *skb)
{
	return skb_get_hash_perturb(skb, q->perturbation) & (q->divisor - 1);
}

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	struct tcf_proto *fl;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= q->divisor)
		return TC_H_MIN(skb->priority);

	fl = rcu_dereference_bh(q->filter_list);
	if (!fl)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, fl, &res, false);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
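			/* fall through */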
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= q->divisor)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/*
 * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	struct sfq_slot *slot = &q->slots[x];
	int qlen = slot->qlen;

	p = qlen + SFQ_MAX_FLOWS;
	n = q->dep[qlen].next;

	slot->dep.next = n;
	slot->dep.prev = p;

	q->dep[qlen].next = x;		/* sfq_dep_head(q, p)->next = x */
	sfq_dep_head(q, n)->prev = x;
}

#define sfq_unlink(q, x, n, p)			\
	do {					\
		n = q->slots[x].dep.next;	\
		p = q->slots[x].dep.prev;	\
		sfq_dep_head(q, p)->next = n;	\
		sfq_dep_head(q, n)->prev = p;	\
	} while (0)


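/*
 * sfq_dec()/sfq_inc() keep the dep[] bookkeeping consistent: the slot is
 * unlinked from the list matching its old queue length, qlen is adjusted,
 * and sfq_link() re-inserts it at the list for the new length. cur_depth
 * tracks the largest per-flow queue so sfq_drop() can find a victim fast.
 */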
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = q->slots[x].qlen--;
	if (n == p && q->cur_depth == d)
		q->cur_depth--;
	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	sfq_unlink(q, x, n, p);

	d = ++q->slots[x].qlen;
	if (q->cur_depth < d)
		q->cur_depth = d;
	sfq_link(q, x);
}

/* helper functions: might be changed when/if skb uses a standard list_head */
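/*
 * Each slot's skb list is a circular doubly linked list whose head is the
 * slot itself: casting the slot to (struct sk_buff *) works only because
 * skblist_next/skblist_prev sit where an sk_buff keeps its next/prev
 * pointers (sk_buff requires those two members to stay first).
 */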

/* remove one skb from tail of slot queue */
static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_prev;

	slot->skblist_prev = skb->prev;
	skb->prev->next = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

/* remove one skb from head of slot queue */
static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
{
	struct sk_buff *skb = slot->skblist_next;

	slot->skblist_next = skb->next;
	skb->next->prev = (struct sk_buff *)slot;
	skb->next = skb->prev = NULL;
	return skb;
}

static inline void slot_queue_init(struct sfq_slot *slot)
{
	memset(slot, 0, sizeof(*slot));
	slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
}

/* add skb to slot queue (tail add) */
static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
{
	skb->prev = slot->skblist_prev;
	skb->next = (struct sk_buff *)slot;
	slot->skblist_prev->next = skb;
	slot->skblist_prev = skb;
}

static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index x, d = q->cur_depth;
	struct sk_buff *skb;
	unsigned int len;
	struct sfq_slot *slot;

	/* Queue is full! Find the longest slot and drop tail packet from it */
	if (d > 1) {
		x = q->dep[d].next;
		slot = &q->slots[x];
drop:
		skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
		len = qdisc_pkt_len(skb);
		slot->backlog -= len;
		sfq_dec(q, x);
		sch->q.qlen--;
		qdisc_qstats_drop(sch);
		qdisc_qstats_backlog_dec(sch, skb);
		kfree_skb(skb);
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		x = q->tail->next;
		slot = &q->slots[x];
		q->tail->next = slot->next;
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		goto drop;
	}

	return 0;
}

/* Is the ECN marking parameter configured? */
static int sfq_prob_mark(const struct sfq_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over the max threshold just be marked? */
static int sfq_hard_mark(const struct sfq_sched_data *q)
{
	return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
}

static int sfq_headdrop(const struct sfq_sched_data *q)
{
	return q->headdrop;
}

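/*
 * When RED parameters are configured (SFQ-RED), each slot keeps its own
 * struct red_vars, so the average queue estimate and the marking/dropping
 * decisions below are made per flow on slot->backlog, not on the whole qdisc.
 */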
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash, dropped;
	sfq_index x, qlen;
	struct sfq_slot *slot;
	int uninitialized_var(ret);
	struct sk_buff *head;
	int delta;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	slot = &q->slots[x];
	if (x == SFQ_EMPTY_SLOT) {
		x = q->dep[0].next; /* get a free slot */
		if (x >= SFQ_MAX_FLOWS)
			return qdisc_drop(skb, sch, to_free);
		q->ht[hash] = x;
		slot = &q->slots[x];
		slot->hash = hash;
		slot->backlog = 0; /* should already be 0 anyway... */
		red_set_vars(&slot->vars);
		goto enqueue;
	}
	if (q->red_parms) {
		slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
							&slot->vars,
							slot->backlog);
		switch (red_action(q->red_parms,
				   &slot->vars,
				   slot->vars.qavg)) {
		case RED_DONT_MARK:
			break;

		case RED_PROB_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_prob_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.prob_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.prob_mark++;
					break;
				}
			}
			q->stats.prob_drop++;
			goto congestion_drop;

		case RED_HARD_MARK:
			qdisc_qstats_overlimit(sch);
			if (sfq_hard_mark(q)) {
				/* We know we have at least one packet in queue */
				if (sfq_headdrop(q) &&
				    INET_ECN_set_ce(slot->skblist_next)) {
					q->stats.forced_mark_head++;
					break;
				}
				if (INET_ECN_set_ce(skb)) {
					q->stats.forced_mark++;
					break;
				}
			}
			q->stats.forced_drop++;
			goto congestion_drop;
		}
	}

	if (slot->qlen >= q->maxdepth) {
congestion_drop:
		if (!sfq_headdrop(q))
			return qdisc_drop(skb, sch, to_free);

		/* We know we have at least one packet in queue */
		head = slot_dequeue_head(slot);
		delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
		sch->qstats.backlog -= delta;
		slot->backlog -= delta;
		qdisc_drop(head, sch, to_free);

		slot_queue_add(slot, skb);
		return NET_XMIT_CN;
	}

enqueue:
	qdisc_qstats_backlog_inc(sch, skb);
	slot->backlog += qdisc_pkt_len(skb);
	slot_queue_add(slot, skb);
	sfq_inc(q, x);
	if (slot->qlen == 1) {		/* The flow is new */
		if (q->tail == NULL) {	/* It is the first flow */
			slot->next = x;
		} else {
			slot->next = q->tail->next;
			q->tail->next = x;
		}
		/* We put this flow at the end of our flow list.
		 * It might sound unfair that a new flow has to wait behind old
		 * ones, but otherwise we could end up servicing new flows only
		 * and freezing old ones.
		 */
		q->tail = slot;
		/* We could use a bigger initial quantum for new flows */
		slot->allot = q->scaled_quantum;
	}
	if (++sch->q.qlen <= q->limit)
		return NET_XMIT_SUCCESS;

	qlen = slot->qlen;
	dropped = sfq_drop(sch);
	/* Return Congestion Notification only if we dropped a packet
	 * from this flow.
	 */
	if (qlen != slot->qlen)
		return NET_XMIT_CN;

	/* As we dropped a packet, better let upper stack know this */
	qdisc_tree_reduce_backlog(sch, 1, dropped);
	return NET_XMIT_SUCCESS;
}

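/*
 * Deficit round robin service: slot->allot is a credit counter expressed
 * in SFQ_ALLOT_SIZE() units. The slot at the head of the ring is served
 * while its allot stays positive; once it goes non-positive the slot
 * becomes the new tail and is recharged with scaled_quantum, so each flow
 * gets roughly 'quantum' bytes per round.
 */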
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, next_a;
	struct sfq_slot *slot;

	/* No active slots */
	if (q->tail == NULL)
		return NULL;

next_slot:
	a = q->tail->next;
	slot = &q->slots[a];
	if (slot->allot <= 0) {
		q->tail = slot;
		slot->allot += q->scaled_quantum;
		goto next_slot;
	}
	skb = slot_dequeue_head(slot);
	sfq_dec(q, a);
	qdisc_bstats_update(sch, skb);
	sch->q.qlen--;
	qdisc_qstats_backlog_dec(sch, skb);
	slot->backlog -= qdisc_pkt_len(skb);
	/* Is the slot empty? */
	if (slot->qlen == 0) {
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
		next_a = slot->next;
		if (a == next_a) {
			q->tail = NULL; /* no more active slots */
			return skb;
		}
		q->tail->next = next_a;
	} else {
		slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		rtnl_kfree_skbs(skb, skb);
}

/*
 * When q->perturbation is changed, we rehash all queued skbs
 * to avoid OOO (Out Of Order) effects.
 * We don't use sfq_dequeue()/sfq_enqueue() because we don't want to change
 * counters.
 */
static void sfq_rehash(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int i;
	struct sfq_slot *slot;
	struct sk_buff_head list;
	int dropped = 0;
	unsigned int drop_len = 0;

	__skb_queue_head_init(&list);

	for (i = 0; i < q->maxflows; i++) {
		slot = &q->slots[i];
		if (!slot->qlen)
			continue;
		while (slot->qlen) {
			skb = slot_dequeue_head(slot);
			sfq_dec(q, i);
			__skb_queue_tail(&list, skb);
		}
		slot->backlog = 0;
		red_set_vars(&slot->vars);
		q->ht[slot->hash] = SFQ_EMPTY_SLOT;
	}
	q->tail = NULL;

	while ((skb = __skb_dequeue(&list)) != NULL) {
		unsigned int hash = sfq_hash(q, skb);
		sfq_index x = q->ht[hash];

		slot = &q->slots[x];
		if (x == SFQ_EMPTY_SLOT) {
			x = q->dep[0].next; /* get a free slot */
			if (x >= SFQ_MAX_FLOWS) {
drop:
				qdisc_qstats_backlog_dec(sch, skb);
				drop_len += qdisc_pkt_len(skb);
				kfree_skb(skb);
				dropped++;
				continue;
			}
			q->ht[hash] = x;
			slot = &q->slots[x];
			slot->hash = hash;
		}
		if (slot->qlen >= q->maxdepth)
			goto drop;
		slot_queue_add(slot, skb);
		if (q->red_parms)
			slot->vars.qavg = red_calc_qavg(q->red_parms,
							&slot->vars,
							slot->backlog);
		slot->backlog += qdisc_pkt_len(skb);
		sfq_inc(q, x);
		if (slot->qlen == 1) {		/* The flow is new */
			if (q->tail == NULL) {	/* It is the first flow */
				slot->next = x;
			} else {
				slot->next = q->tail->next;
				q->tail->next = x;
			}
			q->tail = slot;
			slot->allot = q->scaled_quantum;
		}
	}
	sch->q.qlen -= dropped;
	qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}

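/*
 * Timer callback: periodically pick a fresh perturbation key so that flows
 * which happen to collide in the hash are separated over time. Queued skbs
 * are rehashed under the root qdisc lock; this is skipped when an external
 * classifier is attached (the hash is not used then) or when nothing is
 * queued.
 */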
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

	spin_lock(root_lock);
	q->perturbation = prandom_u32();
	if (!q->filter_list && q->tail)
		sfq_rehash(sch);
	spin_unlock(root_lock);

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

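/*
 * Configuration via netlink: the attribute carries either a bare
 * struct tc_sfq_qopt (the original format) or the larger
 * struct tc_sfq_qopt_v1, which embeds it and adds depth, headdrop and the
 * RED/ECN parameters. The two are told apart purely by attribute length.
 */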
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
	unsigned int qlen, dropped = 0;
	struct red_parms *p = NULL;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;
	if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
		ctl_v1 = nla_data(opt);
	if (ctl->divisor &&
	    (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
		return -EINVAL;
	if (ctl_v1 && ctl_v1->qth_min) {
		p = kmalloc(sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
	}
	sch_tree_lock(sch);
	if (ctl->quantum) {
		q->quantum = ctl->quantum;
		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	}
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->flows)
		q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
	if (ctl->divisor) {
		q->divisor = ctl->divisor;
		q->maxflows = min_t(u32, q->maxflows, q->divisor);
	}
	if (ctl_v1) {
		if (ctl_v1->depth)
			q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
		if (p) {
			swap(q->red_parms, p);
			red_set_parms(q->red_parms,
				      ctl_v1->qth_min, ctl_v1->qth_max,
				      ctl_v1->Wlog,
				      ctl_v1->Plog, ctl_v1->Scell_log,
				      NULL,
				      ctl_v1->max_P);
		}
		q->flags = ctl_v1->flags;
		q->headdrop = ctl_v1->headdrop;
	}
	if (ctl->limit) {
		q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
		q->maxflows = min_t(u32, q->maxflows, q->limit);
	}

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		dropped += sfq_drop(sch);
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = prandom_u32();
	}
	sch_tree_unlock(sch);
	kfree(p);
	return 0;
}

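/*
 * The hash table and slot array can be too large for kmalloc() when a big
 * divisor is configured, so we try kmalloc() quietly (__GFP_NOWARN) and
 * fall back to vmalloc(); kvfree() below handles both cases.
 */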
static void *sfq_alloc(size_t sz)
{
	void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);

	if (!ptr)
		ptr = vmalloc(sz);
	return ptr;
}

static void sfq_free(void *addr)
{
	kvfree(addr);
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
	sfq_free(q->ht);
	sfq_free(q->slots);
	kfree(q->red_parms);
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

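	/*
	 * Each dep[i] starts out as an empty circular list: index
	 * i + SFQ_MAX_FLOWS maps back to dep[i] itself via sfq_dep_head().
	 */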
	for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
		q->dep[i].next = i + SFQ_MAX_FLOWS;
		q->dep[i].prev = i + SFQ_MAX_FLOWS;
	}

	q->limit = SFQ_MAX_DEPTH;
	q->maxdepth = SFQ_MAX_DEPTH;
	q->cur_depth = 0;
	q->tail = NULL;
	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
	q->maxflows = SFQ_DEFAULT_FLOWS;
	q->quantum = psched_mtu(qdisc_dev(sch));
	q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
	q->perturb_period = 0;
	q->perturbation = prandom_u32();

	if (opt) {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
	q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
	if (!q->ht || !q->slots) {
		/* Note: sfq_destroy() will be called by our caller */
		return -ENOMEM;
	}

	for (i = 0; i < q->divisor; i++)
		q->ht[i] = SFQ_EMPTY_SLOT;

	for (i = 0; i < q->maxflows; i++) {
		slot_queue_init(&q->slots[i]);
		sfq_link(q, i);
	}
	if (q->limit >= 1)
		sch->flags |= TCQ_F_CAN_BYPASS;
	else
		sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

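/*
 * Dumping always emits struct tc_sfq_qopt_v1; the original tc_sfq_qopt is
 * its first member, so the layout stays backward compatible.
 */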
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt_v1 opt;
	struct red_parms *p = q->red_parms;

	memset(&opt, 0, sizeof(opt));
	opt.v0.quantum	= q->quantum;
	opt.v0.perturb_period = q->perturb_period / HZ;
	opt.v0.limit	= q->limit;
	opt.v0.divisor	= q->divisor;
	opt.v0.flows	= q->maxflows;
	opt.depth	= q->maxdepth;
	opt.headdrop	= q->headdrop;

	if (p) {
		opt.qth_min	= p->qth_min >> p->Wlog;
		opt.qth_max	= p->qth_max >> p->Wlog;
		opt.Wlog	= p->Wlog;
		opt.Plog	= p->Plog;
		opt.Scell_log	= p->Scell_log;
		opt.max_P	= p->max_P;
	}
	memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
	opt.flags	= q->flags;

	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

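/*
 * Class interface: SFQ exposes its hash buckets as read-only pseudo
 * classes (1 .. divisor) so that filters can be attached and per-bucket
 * statistics dumped; there are no real child qdiscs, hence the stubs below.
 */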
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	/* we cannot bypass queue discipline anymore */
	sch->flags &= ~TCQ_F_CAN_BYPASS;
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto __rcu **sfq_find_tcf(struct Qdisc *sch,
					     unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { 0 };
	struct tc_sfq_xstats xstats = { 0 };

	if (idx != SFQ_EMPTY_SLOT) {
		const struct sfq_slot *slot = &q->slots[idx];

		xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
		qs.qlen = slot->qlen;
		qs.backlog = slot->backlog;
	}
	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < q->divisor; i++) {
		if (q->ht[i] == SFQ_EMPTY_SLOT ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");