xref: /linux/net/sched/sch_sfq.c (revision a234ca0faa65dcd5cc473915bd925130ebb7b74b)
/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are treated as one.

	- "Round-robin" -> It introduces larger delays than virtual-clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive traffic
	into a higher priority band.

	We still need true WFQ for the top level CSZ, but using WFQ
	for best-effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits the maximal queue length to 128,
	the maximal MTU to 2^15-1, and the number of hash buckets to 1024.
	The only goal of these restrictions was to make all the data
	fit into one 4K page :-). Struct sfq_sched_data is
	organized in an anti-cache manner: all the data for a bucket
	are scattered over different locations. This is not good,
	but it allowed me to put it into 4K.

	It is easy to increase these values, but not on the fly.  */

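/* Example, for orientation only (user-space side, not part of this file):
 * with iproute2 this qdisc is typically attached as, e.g.
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * where "perturb" ends up in perturb_period below (seconds between hash
 * re-keyings) and the optional "quantum" becomes the per-round byte
 * allotment; see the tc / tc-sfq documentation for the exact option set.
 */
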
#define SFQ_DEPTH		128
#define SFQ_HASH_DIVISOR	1024

/* This type must be able to hold at least SFQ_DEPTH*2 distinct values */
typedef unsigned char sfq_index;

struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct tcf_proto *filter_list;
	struct timer_list perturb_timer;
	u32		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
};

static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}

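/* Flow hash: for IPv4/IPv6 the key is (dst addr, src addr ^ protocol) plus,
 * when the transport header is available and the packet is not a fragment,
 * the first 32 bits past the network header (i.e. the source/destination
 * port pair for TCP/UDP-style protocols).  For IPv6 only the low 32 bits of
 * each address are used.  Everything else falls back to (dst entry,
 * protocol, socket).  The key is folded with jhash_2words() under
 * q->perturbation, so the bucket a flow maps to changes whenever the
 * perturbation timer re-keys the hash.
 */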
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
	{
		const struct iphdr *iph;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ip_hdr(skb);
		h = (__force u32)iph->daddr;
		h2 = (__force u32)iph->saddr ^ iph->protocol;
		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
		    (iph->protocol == IPPROTO_TCP ||
		     iph->protocol == IPPROTO_UDP ||
		     iph->protocol == IPPROTO_UDPLITE ||
		     iph->protocol == IPPROTO_SCTP ||
		     iph->protocol == IPPROTO_DCCP ||
		     iph->protocol == IPPROTO_ESP) &&
		     pskb_network_may_pull(skb, iph->ihl * 4 + 4))
			h2 ^= *(((u32*)iph) + iph->ihl);
		break;
	}
	case htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ipv6_hdr(skb);
		h = (__force u32)iph->daddr.s6_addr32[3];
		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
		if ((iph->nexthdr == IPPROTO_TCP ||
		     iph->nexthdr == IPPROTO_UDP ||
		     iph->nexthdr == IPPROTO_UDPLITE ||
		     iph->nexthdr == IPPROTO_SCTP ||
		     iph->nexthdr == IPPROTO_DCCP ||
		     iph->nexthdr == IPPROTO_ESP) &&
		    pskb_network_may_pull(skb, sizeof(*iph) + 4))
			h2 ^= *(u32*)&iph[1];
		break;
	}
	default:
err:
		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
		h2 = (unsigned long)skb->sk;
	}

	return sfq_fold_hash(q, h, h2);
}

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

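/* Slot depth bookkeeping, maintained by sfq_link()/sfq_dec()/sfq_inc() below.
 *
 * q->dep[] holds SFQ_DEPTH*2 entries: indices 0..SFQ_DEPTH-1 are the list
 * nodes of the slots themselves, while index (qlen + SFQ_DEPTH) is the head
 * of a circular doubly linked list of all slots currently holding exactly
 * qlen packets.  In terms of the fields defined above, the invariant is
 * roughly:
 *
 *	d = q->qs[x].qlen + SFQ_DEPTH;
 *	q->dep[d]		head of the list of slots with that qlen
 *	q->dep[x].next/prev	links of slot x inside that list
 *
 * sfq_drop() exploits this to find a longest slot in O(1): it simply looks
 * at q->dep[q->max_depth + SFQ_DEPTH].next.
 */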
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d = q->qs[x].qlen + SFQ_DEPTH;

	p = d;
	n = q->dep[d].next;
	q->dep[x].next = n;
	q->dep[x].prev = p;
	q->dep[p].next = q->dep[n].prev = x;
}

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;

	if (n == p && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;
	d = q->qs[x].qlen;
	if (q->max_depth < d)
		q->max_depth = d;

	sfq_link(q, x);
}

static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full! Find the longest slot and
	   drop a packet from it */

	if (d > 1) {
		sfq_index x = q->dep[d + SFQ_DEPTH].next;
		skb = q->qs[x].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		d = q->next[q->tail];
		q->next[q->tail] = q->next[d];
		q->allot[q->next[d]] += q->quantum;
		skb = q->qs[d].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}

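/* Enqueue path, in outline: classify the packet into a bucket, attach the
 * bucket to a free slot if it has none yet, tail-drop if that single slot is
 * already at q->limit, otherwise append the skb, link a newly active slot
 * into the round-robin ring behind q->tail, and finally, if the total queue
 * length has exceeded q->limit, evict one packet from a longest slot via
 * sfq_drop() and report congestion (NET_XMIT_CN).
 */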
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x;
	int uninitialized_var(ret);

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}

	/* If the selected queue has length q->limit, this means that
	 * all other queues are empty and that we do a simple tail drop,
	 * i.e. drop _this_ packet.
	 */
	if (q->qs[x].qlen >= q->limit)
		return qdisc_drop(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);
	__skb_queue_tail(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen <= q->limit) {
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
		return NET_XMIT_SUCCESS;
	}

	sfq_drop(sch);
	return NET_XMIT_CN;
}

static struct sk_buff *
sfq_peek(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = q->next[q->tail];
	return skb_peek(&q->qs[a]);
}

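/* Dequeue path: slots are visited in round-robin order via the q->next[]
 * ring, with q->tail pointing at the slot served last.  Each slot carries a
 * byte allotment, topped up by q->quantum when its turn comes around, in the
 * spirit of deficit round robin: packets are taken from the current slot
 * until its allotment is used up (or the slot empties), then the turn passes
 * to the next slot.  With quantum >= MTU at least one packet can always be
 * sent per turn, which is why the comment on "quantum" above insists on it.
 */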
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, old_a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = old_a = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[a]);
	sfq_dec(q, a);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);

	/* Is the slot empty? */
	if (q->qs[a].qlen == 0) {
		q->ht[q->hash[a]] = SFQ_DEPTH;
		a = q->next[a];
		if (a == old_a) {
			q->tail = SFQ_DEPTH;
			return skb;
		}
		q->next[q->tail] = a;
		q->allot[a] += q->quantum;
	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
		q->tail = a;
		a = q->next[a];
		q->allot[a] += q->quantum;
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

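/* Periodic re-keying of the flow hash.  Because sfq_hash() mixes
 * q->perturbation into the jhash, picking a fresh random value here makes
 * flows map to (usually) different buckets, so two flows that happened to
 * collide stop sharing a slot after at most perturb_period seconds.  The
 * price is possible momentary packet reordering at each re-keying, which is
 * why a period of 0 (never perturb) is the default.
 */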
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	q->perturbation = net_random();

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	unsigned int qlen;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}
	sch_tree_unlock(sch);
	return 0;
}

static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;

	for (i = 0; i < SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
	}

	q->limit = SFQ_DEPTH - 1;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	if (opt == NULL) {
		q->quantum = psched_mtu(qdisc_dev(sch));
		q->perturb_period = 0;
		q->perturbation = net_random();
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	for (i = 0; i < SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period / HZ;

	opt.limit = q->limit;
	opt.divisor = SFQ_HASH_DIVISOR;
	opt.flows = q->limit;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

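/* Class interface: SFQ is not really classful, but it exposes its hash
 * buckets as read-only pseudo-classes (minor numbers 1..SFQ_HASH_DIVISOR)
 * so that tc filters can be attached, per-bucket queue lengths and
 * allotments can be dumped, and the buckets can be walked.  Grafting child
 * qdiscs is not supported, hence the trivial leaf/get/put handlers below.
 */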
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl-1];
	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
		if (q->ht[i] == SFQ_DEPTH ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	sfq_peek,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");