xref: /linux/net/sched/sch_sfq.c (revision e190bfe56841551b1ad5abb42ebd0c4798cc8c01)
1 /*
2  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  */
11 
12 #include <linux/module.h>
13 #include <linux/types.h>
14 #include <linux/kernel.h>
15 #include <linux/jiffies.h>
16 #include <linux/string.h>
17 #include <linux/in.h>
18 #include <linux/errno.h>
19 #include <linux/init.h>
20 #include <linux/ipv6.h>
21 #include <linux/skbuff.h>
22 #include <linux/jhash.h>
23 #include <linux/slab.h>
24 #include <net/ip.h>
25 #include <net/netlink.h>
26 #include <net/pkt_sched.h>
27 
28 
29 /*	Stochastic Fairness Queuing algorithm.
30 	=======================================
31 
32 	Source:
33 	Paul E. McKenney "Stochastic Fairness Queuing",
34 	IEEE INFOCOM '90 Proceedings, San Francisco, 1990.
35 
36 	Paul E. McKenney "Stochastic Fairness Queuing",
37 	"Internetworking: Research and Experience", v.2, 1991, p.113-131.
38 
39 
40 	See also:
41 	M. Shreedhar and George Varghese "Efficient Fair
42 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
43 
44 
45 	This is not the thing that is usually called (W)FQ nowadays.
46 	It does not use any timestamp mechanism, but instead
47 	processes queues in round-robin order.
48 
49 	ADVANTAGE:
50 
51 	- It is very cheap. Both CPU and memory requirements are minimal.
52 
53 	DRAWBACKS:
54 
55 	- "Stochastic" -> It is not 100% fair.
56 	When hash collisions occur, several flows are considered as one.
57 
58 	- "Round-robin" -> It introduces larger delays than virtual clock
59 	based schemes, and should not be used for isolating interactive
60 	traffic from non-interactive traffic. This means that this scheduler
61 	should be used as a leaf of CBQ or P3, which put interactive traffic
62 	into a higher priority band.
63 
64 	We still need true WFQ for top level CSZ, but using WFQ
65 	for the best effort traffic is absolutely pointless:
66 	SFQ is superior for this purpose.
67 
68 	IMPLEMENTATION:
69 	This implementation limits the maximal queue length to 128,
70 	the maximal MTU to 2^15-1, and the number of hash buckets to 1024.
71 	The only goal of these restrictions was that all the data
72 	fit into one 4K page :-). Struct sfq_sched_data is
73 	organized in an anti-cache manner: all the data for a bucket
74 	are scattered over different locations. This is not good,
75 	but it allowed me to put it into 4K.
76 
77 	It is easy to increase these values, but not at run time.  */
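/* A concrete picture of the fixed sizing above: at most SFQ_DEPTH - 1 = 127
 * packets are queued in total (q->limit), spread over up to 128 slots, and
 * flows are hashed into 1024 buckets, each of which points at one slot (or
 * at SFQ_DEPTH while the bucket has nothing queued).
 */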
78 
79 #define SFQ_DEPTH		128
80 #define SFQ_HASH_DIVISOR	1024
81 
82 /* This type should contain at least SFQ_DEPTH*2 values */
83 typedef unsigned char sfq_index;
84 
85 struct sfq_head
86 {
87 	sfq_index	next;
88 	sfq_index	prev;
89 };
90 
91 struct sfq_sched_data
92 {
93 /* Parameters */
94 	int		perturb_period;
95 	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
96 	int		limit;
97 
98 /* Variables */
99 	struct tcf_proto *filter_list;
100 	struct timer_list perturb_timer;
101 	u32		perturbation;
102 	sfq_index	tail;		/* Last slot in round; next[tail] is served next */
103 	sfq_index	max_depth;	/* Maximal depth */
104 
105 	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
106 	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
107 	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
108 	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
109 	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */
110 	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
111 };
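/* Index conventions used throughout:
 *  - ht[bucket] is the slot currently owned by that hash bucket, or
 *    SFQ_DEPTH if the bucket has no queued packets.
 *  - dep[0..SFQ_DEPTH-1] are the per-slot link nodes; dep[SFQ_DEPTH + d]
 *    heads the doubly linked list of all slots whose queue length is d, so
 *    dep[SFQ_DEPTH] doubles as the free-slot pool and
 *    dep[SFQ_DEPTH + max_depth] yields a longest slot in O(1).
 *  - next[] chains the active slots into a circular round-robin ring;
 *    tail is the slot positioned last and next[tail] is served next.
 */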
112 
113 static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
114 {
115 	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
116 }
117 
118 static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
119 {
120 	u32 h, h2;
121 
122 	switch (skb->protocol) {
123 	case htons(ETH_P_IP):
124 	{
125 		const struct iphdr *iph = ip_hdr(skb);
126 		h = (__force u32)iph->daddr;
127 		h2 = (__force u32)iph->saddr ^ iph->protocol;
128 		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
129 		    (iph->protocol == IPPROTO_TCP ||
130 		     iph->protocol == IPPROTO_UDP ||
131 		     iph->protocol == IPPROTO_UDPLITE ||
132 		     iph->protocol == IPPROTO_SCTP ||
133 		     iph->protocol == IPPROTO_DCCP ||
134 		     iph->protocol == IPPROTO_ESP))
135 			h2 ^= *(((u32*)iph) + iph->ihl);
136 		break;
137 	}
138 	case htons(ETH_P_IPV6):
139 	{
140 		struct ipv6hdr *iph = ipv6_hdr(skb);
141 		h = (__force u32)iph->daddr.s6_addr32[3];
142 		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
143 		if (iph->nexthdr == IPPROTO_TCP ||
144 		    iph->nexthdr == IPPROTO_UDP ||
145 		    iph->nexthdr == IPPROTO_UDPLITE ||
146 		    iph->nexthdr == IPPROTO_SCTP ||
147 		    iph->nexthdr == IPPROTO_DCCP ||
148 		    iph->nexthdr == IPPROTO_ESP)
149 			h2 ^= *(u32*)&iph[1];
150 		break;
151 	}
152 	default:
153 		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
154 		h2 = (unsigned long)skb->sk;
155 	}
156 
157 	return sfq_fold_hash(q, h, h2);
158 }
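/* Flow key: for IPv4, (daddr, saddr, protocol), plus the 32 bits right after
 * the IP header (the port pair, or the SPI for ESP) when the packet is not a
 * fragment and carries TCP/UDP/UDP-Lite/SCTP/DCCP/ESP; for IPv6, the low 32
 * bits of both addresses and nexthdr, plus the 32 bits after the fixed
 * header for the same transports.  Everything else hashes on the dst entry,
 * protocol and socket.  q->perturbation salts jhash, so periodic re-seeding
 * breaks up persistent collisions.
 */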
159 
160 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
161 				 int *qerr)
162 {
163 	struct sfq_sched_data *q = qdisc_priv(sch);
164 	struct tcf_result res;
165 	int result;
166 
167 	if (TC_H_MAJ(skb->priority) == sch->handle &&
168 	    TC_H_MIN(skb->priority) > 0 &&
169 	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
170 		return TC_H_MIN(skb->priority);
171 
172 	if (!q->filter_list)
173 		return sfq_hash(q, skb) + 1;
174 
175 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
176 	result = tc_classify(skb, q->filter_list, &res);
177 	if (result >= 0) {
178 #ifdef CONFIG_NET_CLS_ACT
179 		switch (result) {
180 		case TC_ACT_STOLEN:
181 		case TC_ACT_QUEUED:
182 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
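			/* fall through */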
183 		case TC_ACT_SHOT:
184 			return 0;
185 		}
186 #endif
187 		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
188 			return TC_H_MIN(res.classid);
189 	}
190 	return 0;
191 }
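/* sfq_classify() returns a 1-based hash bucket number (taken from the
 * skb->priority minor, the filter's classid minor, or sfq_hash() + 1), or 0
 * when the packet should be dropped or was stolen/queued by an action;
 * *qerr tells the caller which of those cases applies.
 */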
192 
193 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
194 {
195 	sfq_index p, n;
196 	int d = q->qs[x].qlen + SFQ_DEPTH;
197 
198 	p = d;
199 	n = q->dep[d].next;
200 	q->dep[x].next = n;
201 	q->dep[x].prev = p;
202 	q->dep[p].next = q->dep[n].prev = x;
203 }
204 
205 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
206 {
207 	sfq_index p, n;
208 
209 	n = q->dep[x].next;
210 	p = q->dep[x].prev;
211 	q->dep[p].next = n;
212 	q->dep[n].prev = p;
213 
214 	if (n == p && q->max_depth == q->qs[x].qlen + 1)
215 		q->max_depth--;
216 
217 	sfq_link(q, x);
218 }
219 
220 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
221 {
222 	sfq_index p, n;
223 	int d;
224 
225 	n = q->dep[x].next;
226 	p = q->dep[x].prev;
227 	q->dep[p].next = n;
228 	q->dep[n].prev = p;
229 	d = q->qs[x].qlen;
230 	if (q->max_depth < d)
231 		q->max_depth = d;
232 
233 	sfq_link(q, x);
234 }
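/* sfq_link/sfq_dec/sfq_inc keep each slot on the dep[] list that matches its
 * current queue length.  Example: when a packet is appended to a slot whose
 * length grows from 2 to 3, sfq_inc() unlinks the slot from the list headed
 * at dep[SFQ_DEPTH + 2], updates max_depth, and sfq_link() reinserts it at
 * dep[SFQ_DEPTH + 3]; sfq_dec() does the reverse after a dequeue or drop.
 */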
235 
236 static unsigned int sfq_drop(struct Qdisc *sch)
237 {
238 	struct sfq_sched_data *q = qdisc_priv(sch);
239 	sfq_index d = q->max_depth;
240 	struct sk_buff *skb;
241 	unsigned int len;
242 
243 	/* Queue is full! Find the longest slot and
244 	   drop a packet from it */
245 
246 	if (d > 1) {
247 		sfq_index x = q->dep[d + SFQ_DEPTH].next;
248 		skb = q->qs[x].prev;
249 		len = qdisc_pkt_len(skb);
250 		__skb_unlink(skb, &q->qs[x]);
251 		kfree_skb(skb);
252 		sfq_dec(q, x);
253 		sch->q.qlen--;
254 		sch->qstats.drops++;
255 		sch->qstats.backlog -= len;
256 		return len;
257 	}
258 
259 	if (d == 1) {
260 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
261 		d = q->next[q->tail];
262 		q->next[q->tail] = q->next[d];
263 		q->allot[q->next[d]] += q->quantum;
264 		skb = q->qs[d].prev;
265 		len = qdisc_pkt_len(skb);
266 		__skb_unlink(skb, &q->qs[d]);
267 		kfree_skb(skb);
268 		sfq_dec(q, d);
269 		sch->q.qlen--;
270 		q->ht[q->hash[d]] = SFQ_DEPTH;
271 		sch->qstats.drops++;
272 		sch->qstats.backlog -= len;
273 		return len;
274 	}
275 
276 	return 0;
277 }
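/* Drop policy: while some slot holds more than one packet, drop from the
 * tail of a longest slot (found via dep[max_depth + SFQ_DEPTH]).  If every
 * active slot holds exactly one packet, drop the head-of-round slot instead
 * and unlink it from the round-robin ring.
 */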
278 
279 static int
280 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
281 {
282 	struct sfq_sched_data *q = qdisc_priv(sch);
283 	unsigned int hash;
284 	sfq_index x;
285 	int uninitialized_var(ret);
286 
287 	hash = sfq_classify(skb, sch, &ret);
288 	if (hash == 0) {
289 		if (ret & __NET_XMIT_BYPASS)
290 			sch->qstats.drops++;
291 		kfree_skb(skb);
292 		return ret;
293 	}
294 	hash--;
295 
296 	x = q->ht[hash];
297 	if (x == SFQ_DEPTH) {
298 		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
299 		q->hash[x] = hash;
300 	}
301 
302 	/* If selected queue has length q->limit, this means that
303 	 * all other queues are empty and that we do simple tail drop,
304 	 * i.e. drop _this_ packet.
305 	 */
306 	if (q->qs[x].qlen >= q->limit)
307 		return qdisc_drop(skb, sch);
308 
309 	sch->qstats.backlog += qdisc_pkt_len(skb);
310 	__skb_queue_tail(&q->qs[x], skb);
311 	sfq_inc(q, x);
312 	if (q->qs[x].qlen == 1) {		/* The flow is new */
313 		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
314 			q->tail = x;
315 			q->next[x] = x;
316 			q->allot[x] = q->quantum;
317 		} else {
318 			q->next[x] = q->next[q->tail];
319 			q->next[q->tail] = x;
320 			q->tail = x;
321 		}
322 	}
323 	if (++sch->q.qlen <= q->limit) {
324 		sch->bstats.bytes += qdisc_pkt_len(skb);
325 		sch->bstats.packets++;
326 		return 0;
327 	}
328 
329 	sfq_drop(sch);
330 	return NET_XMIT_CN;
331 }
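/* Enqueue in short: map the packet to a slot (allocating a free slot from
 * dep[SFQ_DEPTH] if its bucket was empty), tail-drop if that single slot is
 * already at q->limit, otherwise append and, for a newly active flow, splice
 * the slot into the round-robin ring right after q->tail.  If the global
 * queue length now exceeds q->limit, a packet is dropped from a longest
 * slot and NET_XMIT_CN is returned.
 */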
332 
333 static struct sk_buff *
334 sfq_peek(struct Qdisc *sch)
335 {
336 	struct sfq_sched_data *q = qdisc_priv(sch);
337 	sfq_index a;
338 
339 	/* No active slots */
340 	if (q->tail == SFQ_DEPTH)
341 		return NULL;
342 
343 	a = q->next[q->tail];
344 	return skb_peek(&q->qs[a]);
345 }
346 
347 static struct sk_buff *
348 sfq_dequeue(struct Qdisc *sch)
349 {
350 	struct sfq_sched_data *q = qdisc_priv(sch);
351 	struct sk_buff *skb;
352 	sfq_index a, old_a;
353 
354 	/* No active slots */
355 	if (q->tail == SFQ_DEPTH)
356 		return NULL;
357 
358 	a = old_a = q->next[q->tail];
359 
360 	/* Grab packet */
361 	skb = __skb_dequeue(&q->qs[a]);
362 	sfq_dec(q, a);
363 	sch->q.qlen--;
364 	sch->qstats.backlog -= qdisc_pkt_len(skb);
365 
366 	/* Is the slot empty? */
367 	if (q->qs[a].qlen == 0) {
368 		q->ht[q->hash[a]] = SFQ_DEPTH;
369 		a = q->next[a];
370 		if (a == old_a) {
371 			q->tail = SFQ_DEPTH;
372 			return skb;
373 		}
374 		q->next[q->tail] = a;
375 		q->allot[a] += q->quantum;
376 	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
377 		q->tail = a;
378 		a = q->next[a];
379 		q->allot[a] += q->quantum;
380 	}
381 	return skb;
382 }
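/* Dequeue implements the round robin: packets come from next[tail], and a
 * slot keeps the turn until its byte allotment is spent.  When the allotment
 * goes non-positive, or the slot runs empty, the turn moves to the next slot
 * in the ring and that slot's allotment is topped up by one quantum.
 */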
383 
384 static void
385 sfq_reset(struct Qdisc *sch)
386 {
387 	struct sk_buff *skb;
388 
389 	while ((skb = sfq_dequeue(sch)) != NULL)
390 		kfree_skb(skb);
391 }
392 
393 static void sfq_perturbation(unsigned long arg)
394 {
395 	struct Qdisc *sch = (struct Qdisc *)arg;
396 	struct sfq_sched_data *q = qdisc_priv(sch);
397 
398 	q->perturbation = net_random();
399 
400 	if (q->perturb_period)
401 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
402 }
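/* Re-seeding the hash every perturb_period jiffies moves flows to new
 * buckets, so two flows that happen to collide are only unfair to each
 * other until the next perturbation rather than for their whole lifetime.
 */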
403 
404 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
405 {
406 	struct sfq_sched_data *q = qdisc_priv(sch);
407 	struct tc_sfq_qopt *ctl = nla_data(opt);
408 	unsigned int qlen;
409 
410 	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
411 		return -EINVAL;
412 
413 	sch_tree_lock(sch);
414 	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
415 	q->perturb_period = ctl->perturb_period * HZ;
416 	if (ctl->limit)
417 		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
418 
419 	qlen = sch->q.qlen;
420 	while (sch->q.qlen > q->limit)
421 		sfq_drop(sch);
422 	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
423 
424 	del_timer(&q->perturb_timer);
425 	if (q->perturb_period) {
426 		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
427 		q->perturbation = net_random();
428 	}
429 	sch_tree_unlock(sch);
430 	return 0;
431 }
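/* In this version sfq_change() is reached only from sfq_init() (the qdisc's
 * ->change hook below is NULL), so parameters are fixed at creation time.
 * Illustrative iproute2 usage, assuming a standard tc build:
 *	tc qdisc add dev eth0 root sfq perturb 10
 * requests a new hash perturbation every 10 seconds and leaves quantum at
 * its MTU-derived default and limit at its default of 127 packets.
 */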
432 
433 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
434 {
435 	struct sfq_sched_data *q = qdisc_priv(sch);
436 	int i;
437 
438 	q->perturb_timer.function = sfq_perturbation;
439 	q->perturb_timer.data = (unsigned long)sch;
440 	init_timer_deferrable(&q->perturb_timer);
441 
442 	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
443 		q->ht[i] = SFQ_DEPTH;
444 
445 	for (i = 0; i < SFQ_DEPTH; i++) {
446 		skb_queue_head_init(&q->qs[i]);
447 		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
448 		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
449 	}
450 
451 	q->limit = SFQ_DEPTH - 1;
452 	q->max_depth = 0;
453 	q->tail = SFQ_DEPTH;
454 	if (opt == NULL) {
455 		q->quantum = psched_mtu(qdisc_dev(sch));
456 		q->perturb_period = 0;
457 		q->perturbation = net_random();
458 	} else {
459 		int err = sfq_change(sch, opt);
460 		if (err)
461 			return err;
462 	}
463 
464 	for (i = 0; i < SFQ_DEPTH; i++)
465 		sfq_link(q, i);
466 	return 0;
467 }
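/* After sfq_init() every hash bucket is marked empty (SFQ_DEPTH), each
 * dep[] list head points to itself, and the final sfq_link() loop puts all
 * 128 slots on the zero-length list at dep[SFQ_DEPTH], which is exactly the
 * free-slot pool that sfq_enqueue() allocates from.
 */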
468 
469 static void sfq_destroy(struct Qdisc *sch)
470 {
471 	struct sfq_sched_data *q = qdisc_priv(sch);
472 
473 	tcf_destroy_chain(&q->filter_list);
474 	q->perturb_period = 0;
475 	del_timer_sync(&q->perturb_timer);
476 }
477 
478 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
479 {
480 	struct sfq_sched_data *q = qdisc_priv(sch);
481 	unsigned char *b = skb_tail_pointer(skb);
482 	struct tc_sfq_qopt opt;
483 
484 	opt.quantum = q->quantum;
485 	opt.perturb_period = q->perturb_period / HZ;
486 
487 	opt.limit = q->limit;
488 	opt.divisor = SFQ_HASH_DIVISOR;
489 	opt.flows = q->limit;
490 
491 	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
492 
493 	return skb->len;
494 
495 nla_put_failure:
496 	nlmsg_trim(skb, b);
497 	return -1;
498 }
499 
500 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
501 {
502 	return 0;
503 }
504 
505 static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
506 {
507 	struct sfq_sched_data *q = qdisc_priv(sch);
508 
509 	if (cl)
510 		return NULL;
511 	return &q->filter_list;
512 }
513 
514 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
515 			  struct sk_buff *skb, struct tcmsg *tcm)
516 {
517 	tcm->tcm_handle |= TC_H_MIN(cl);
518 	return 0;
519 }
520 
521 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
522 				struct gnet_dump *d)
523 {
524 	struct sfq_sched_data *q = qdisc_priv(sch);
525 	sfq_index idx = q->ht[cl-1];
526 	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
527 	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };
528 
529 	if (gnet_stats_copy_queue(d, &qs) < 0)
530 		return -1;
531 	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
532 }
533 
534 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
535 {
536 	struct sfq_sched_data *q = qdisc_priv(sch);
537 	unsigned int i;
538 
539 	if (arg->stop)
540 		return;
541 
542 	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
543 		if (q->ht[i] == SFQ_DEPTH ||
544 		    arg->count < arg->skip) {
545 			arg->count++;
546 			continue;
547 		}
548 		if (arg->fn(sch, i + 1, arg) < 0) {
549 			arg->stop = 1;
550 			break;
551 		}
552 		arg->count++;
553 	}
554 }
555 
556 static const struct Qdisc_class_ops sfq_class_ops = {
557 	.get		=	sfq_get,
558 	.tcf_chain	=	sfq_find_tcf,
559 	.dump		=	sfq_dump_class,
560 	.dump_stats	=	sfq_dump_class_stats,
561 	.walk		=	sfq_walk,
562 };
563 
564 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
565 	.cl_ops		=	&sfq_class_ops,
566 	.id		=	"sfq",
567 	.priv_size	=	sizeof(struct sfq_sched_data),
568 	.enqueue	=	sfq_enqueue,
569 	.dequeue	=	sfq_dequeue,
570 	.peek		=	sfq_peek,
571 	.drop		=	sfq_drop,
572 	.init		=	sfq_init,
573 	.reset		=	sfq_reset,
574 	.destroy	=	sfq_destroy,
575 	.change		=	NULL,
576 	.dump		=	sfq_dump,
577 	.owner		=	THIS_MODULE,
578 };
579 
580 static int __init sfq_module_init(void)
581 {
582 	return register_qdisc(&sfq_qdisc_ops);
583 }
584 static void __exit sfq_module_exit(void)
585 {
586 	unregister_qdisc(&sfq_qdisc_ops);
587 }
588 module_init(sfq_module_init)
589 module_exit(sfq_module_exit)
590 MODULE_LICENSE("GPL");
591