/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive traffic
	into a higher priority band.

	We still need true WFQ for top level CSZ, but using WFQ
	for best-effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits maximal queue length to 128;
	maximal mtu to 2^15-1; number of hash buckets to 1024.
	The only goal of these restrictions was that all data
	fit into one 4K page :-). Struct sfq_sched_data is
	organized in an anti-cache manner: all the data for a bucket
	are scattered over different locations. This is not good,
	but it allowed me to put it into 4K.

	It is easy to increase these values, but not on the fly.  */

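/*
 * Example use from userspace (illustrative only, not part of the original
 * source):
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * attaches SFQ as the root qdisc of eth0 and rehashes flows every 10
 * seconds; "quantum" and "limit" can be given as well (see sfq_change()).
 */
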
#define SFQ_DEPTH		128
#define SFQ_HASH_DIVISOR	1024

/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;
struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct tcf_proto *filter_list;
	struct timer_list perturb_timer;
	u32		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
};
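
/*
 * Index conventions used below:
 *
 * - ht[bucket] holds the slot currently serving that hash bucket, or
 *   SFQ_DEPTH if the bucket is idle.
 * - qs[slot] holds the packets of a slot; hash[slot] remembers which
 *   bucket the slot serves so ht[] can be cleared when the slot empties.
 * - next[] chains the active slots into a circular round-robin ring;
 *   tail == SFQ_DEPTH means the ring is empty.
 * - dep[0 .. SFQ_DEPTH-1] are per-slot list nodes and dep[SFQ_DEPTH + d]
 *   heads the list of slots whose queue length is d, so a free slot is
 *   found at dep[SFQ_DEPTH].next and the longest slots at
 *   dep[SFQ_DEPTH + max_depth].
 */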

static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}

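/*
 * Flow hash: IPv4 and IPv6 packets are keyed on their addresses (the low
 * 32 bits for IPv6) and the transport protocol; for TCP, UDP, UDP-Lite,
 * SCTP, DCCP and ESP (and, for IPv4, only when not fragmented) the first
 * 32 bits past the network header (the port pair or SPI) are folded in
 * too.  Everything else is keyed on the route and the socket.  The result
 * is mixed with the current perturbation before being reduced to a bucket.
 */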
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
	{
		const struct iphdr *iph = ip_hdr(skb);
		h = iph->daddr;
		h2 = iph->saddr ^ iph->protocol;
		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
		    (iph->protocol == IPPROTO_TCP ||
		     iph->protocol == IPPROTO_UDP ||
		     iph->protocol == IPPROTO_UDPLITE ||
		     iph->protocol == IPPROTO_SCTP ||
		     iph->protocol == IPPROTO_DCCP ||
		     iph->protocol == IPPROTO_ESP))
			h2 ^= *(((u32*)iph) + iph->ihl);
		break;
	}
	case __constant_htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph = ipv6_hdr(skb);
		h = iph->daddr.s6_addr32[3];
		h2 = iph->saddr.s6_addr32[3] ^ iph->nexthdr;
		if (iph->nexthdr == IPPROTO_TCP ||
		    iph->nexthdr == IPPROTO_UDP ||
		    iph->nexthdr == IPPROTO_UDPLITE ||
		    iph->nexthdr == IPPROTO_SCTP ||
		    iph->nexthdr == IPPROTO_DCCP ||
		    iph->nexthdr == IPPROTO_ESP)
			h2 ^= *(u32*)&iph[1];
		break;
	}
	default:
		h = (unsigned long)skb->dst ^ skb->protocol;
		h2 = (unsigned long)skb->sk;
	}

	return sfq_fold_hash(q, h, h2);
}

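/*
 * Classification returns a hash bucket number plus one: 0 means drop,
 * 1 .. SFQ_HASH_DIVISOR selects a bucket.  skb->priority may pre-select
 * the bucket, otherwise any attached filters are consulted, and finally
 * the flow hash above is used.
 */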
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
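			/* fall through */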
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

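/*
 * sfq_link/sfq_dec/sfq_inc keep every slot on the doubly linked list that
 * corresponds to its current queue length (head dep[qlen + SFQ_DEPTH]) and
 * maintain max_depth, so a free slot and the longest slot can both be
 * found in constant time.
 */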
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d = q->qs[x].qlen + SFQ_DEPTH;

	p = d;
	n = q->dep[d].next;
	q->dep[x].next = n;
	q->dep[x].prev = p;
	q->dep[p].next = q->dep[n].prev = x;
}

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;

	if (n == p && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;
	d = q->qs[x].qlen;
	if (q->max_depth < d)
		q->max_depth = d;

	sfq_link(q, x);
}

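/*
 * When the qdisc is over limit, a packet is dropped from the tail of the
 * longest slot.  If every active slot holds exactly one packet, the next
 * slot in the round-robin ring is emptied and unlinked instead.
 */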
static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full! Find the longest slot and
	   drop a packet from it */

	if (d > 1) {
		sfq_index x = q->dep[d + SFQ_DEPTH].next;
		skb = q->qs[x].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		d = q->next[q->tail];
		q->next[q->tail] = q->next[d];
		q->allot[q->next[d]] += q->quantum;
		skb = q->qs[d].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}

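/*
 * Enqueue: map the packet to a slot via its hash bucket (allocating a free
 * slot when the bucket is idle), tail-drop if that slot is already at the
 * limit, and splice slots that have just become active into the
 * round-robin ring.
 */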
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x;
	int ret;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret == NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}

	/* If the selected queue has length q->limit, this means that
	 * all other queues are empty and that we do a simple tail drop,
	 * i.e. drop _this_ packet.
	 */
	if (q->qs[x].qlen >= q->limit)
		return qdisc_drop(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);
	__skb_queue_tail(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen <= q->limit) {
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
		return 0;
	}

	sfq_drop(sch);
	return NET_XMIT_CN;
}

static int
sfq_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x;
	int ret;

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret == NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}

	sch->qstats.backlog += qdisc_pkt_len(skb);
	__skb_queue_head(&q->qs[x], skb);
	/* If the selected queue has length q->limit+1, this means that
	 * all other queues are empty and we do a simple tail drop.
	 * This packet is still requeued at the head of the queue; the
	 * tail packet is dropped.
	 */
	if (q->qs[x].qlen > q->limit) {
		skb = q->qs[x].prev;
		__skb_unlink(skb, &q->qs[x]);
		sch->qstats.drops++;
		sch->qstats.backlog -= qdisc_pkt_len(skb);
		kfree_skb(skb);
		return NET_XMIT_CN;
	}

	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}

	if (++sch->q.qlen <= q->limit) {
		sch->qstats.requeues++;
		return 0;
	}

	sch->qstats.drops++;
	sfq_drop(sch);
	return NET_XMIT_CN;
}

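/*
 * Dequeue: serve the slot at the head of the ring.  A slot keeps the head
 * position until its byte allotment is used up; it is then rotated to the
 * back and the next slot gets another quantum.  Slots that run empty are
 * unlinked from the ring and from their hash bucket.
 */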
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, old_a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = old_a = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[a]);
	sfq_dec(q, a);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);

	/* Is the slot empty? */
	if (q->qs[a].qlen == 0) {
		q->ht[q->hash[a]] = SFQ_DEPTH;
		a = q->next[a];
		if (a == old_a) {
			q->tail = SFQ_DEPTH;
			return skb;
		}
		q->next[q->tail] = a;
		q->allot[a] += q->quantum;
	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
		q->tail = a;
		a = q->next[a];
		q->allot[a] += q->quantum;
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

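/*
 * Periodically pick a new perturbation value so that flows which happen to
 * collide in the hash get separated again after a while.
 */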
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	q->perturbation = net_random();

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

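/*
 * tc_sfq_qopt carries the tc(8) parameters: quantum is in bytes (0 selects
 * the device MTU), perturb_period is in seconds (converted to jiffies
 * here), and limit caps the queue length, clamped to at most SFQ_DEPTH - 1.
 */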
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	unsigned int qlen;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}
	sch_tree_unlock(sch);
	return 0;
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;

	for (i = 0; i < SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
	}

	q->limit = SFQ_DEPTH - 1;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	if (opt == NULL) {
		q->quantum = psched_mtu(qdisc_dev(sch));
		q->perturb_period = 0;
		q->perturbation = net_random();
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	for (i = 0; i < SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period / HZ;

	opt.limit = q->limit;
	opt.divisor = SFQ_HASH_DIVISOR;
	opt.flows = q->limit;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int sfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct nlattr **tca, unsigned long *arg)
{
	return -EOPNOTSUPP;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl-1];
	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
		if (q->ht[i] == SFQ_DEPTH ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.get		=	sfq_get,
	.change		=	sfq_change_class,
	.tcf_chain	=	sfq_find_tcf,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.requeue	=	sfq_requeue,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");