/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM '90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Internetworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM '95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this
	scheduler should be used as a leaf of CBQ or P3, which put
	interactive traffic into a higher priority band.

	We still need true WFQ for the top-level CSZ, but using WFQ
	for best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits the maximal queue length to 128,
	the maximal MTU to 2^15-1, and the number of hash buckets to 1024.
	The only goal of these restrictions was that all of the data
	fit into one 4K page :-). Struct sfq_sched_data is
	organized in an anti-cache manner: all of the data for a bucket
	is scattered over different locations. This is not good,
	but it allowed me to fit it into 4K.

	It is easy to increase these values, but not on the fly.  */

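/* For illustration only: with the iproute2 "tc" tool, an SFQ instance
 * is typically attached along the lines of
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * where "perturb 10" requests a hash perturbation every 10 seconds
 * (see sfq_perturbation() below). The exact tc syntax depends on the
 * iproute2 version and is not dictated by this file.
 */
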
#define SFQ_DEPTH		128
#define SFQ_HASH_DIVISOR	1024

/* This type must be able to hold at least SFQ_DEPTH*2 distinct values */
typedef unsigned char sfq_index;
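/* With SFQ_DEPTH == 128 that is 256 values, which is exactly the range
 * of an unsigned char; growing SFQ_DEPTH beyond 128 therefore also
 * requires widening sfq_index. */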

struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct timer_list perturb_timer;
	int		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
};

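/* Fold the two 32-bit flow keys down to a bucket index. The shift pair
 * approximates a rotate of h1 by the current perturbation (note that it
 * uses 0x1F - pert rather than 32 - pert, hence the lament below), and
 * the final mask (0x3FF == SFQ_HASH_DIVISOR - 1) selects the bucket.
 */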
static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	int pert = q->perturbation;

	/* Have we any rotation primitives? If not, WHY? */
	h ^= (h1<<pert) ^ (h1>>(0x1F - pert));
	h ^= h>>10;
	return h & 0x3FF;
}

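/* Classify a packet into a hash bucket. For unfragmented IPv4, and for
 * IPv6 with a recognized transport protocol, the keys cover the address
 * pair, the protocol and the first 32 bits of the transport header
 * (the port pair for TCP/UDP, the SPI for ESP); anything else falls
 * back to the route and the originating socket, so unrecognized
 * traffic still spreads across buckets.
 */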
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
	{
		struct iphdr *iph = skb->nh.iph;
		h = iph->daddr;
		h2 = iph->saddr^iph->protocol;
		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
		    (iph->protocol == IPPROTO_TCP ||
		     iph->protocol == IPPROTO_UDP ||
		     iph->protocol == IPPROTO_UDPLITE ||
		     iph->protocol == IPPROTO_SCTP ||
		     iph->protocol == IPPROTO_DCCP ||
		     iph->protocol == IPPROTO_ESP))
			h2 ^= *(((u32*)iph) + iph->ihl);
		break;
	}
	case __constant_htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph = skb->nh.ipv6h;
		h = iph->daddr.s6_addr32[3];
		h2 = iph->saddr.s6_addr32[3]^iph->nexthdr;
		if (iph->nexthdr == IPPROTO_TCP ||
		    iph->nexthdr == IPPROTO_UDP ||
		    iph->nexthdr == IPPROTO_UDPLITE ||
		    iph->nexthdr == IPPROTO_SCTP ||
		    iph->nexthdr == IPPROTO_DCCP ||
		    iph->nexthdr == IPPROTO_ESP)
			h2 ^= *(u32*)&iph[1];
		break;
	}
	default:
		h = (u32)(unsigned long)skb->dst^skb->protocol;
		h2 = (u32)(unsigned long)skb->sk;
	}
	return sfq_fold_hash(q, h, h2);
}

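/* Slot depth bookkeeping. dep[] holds SFQ_DEPTH doubly linked lists,
 * one per possible queue length: entry SFQ_DEPTH + d is the head of the
 * list of slots currently holding d packets, and entries 0..SFQ_DEPTH-1
 * are the per-slot link fields. The depth-0 list therefore doubles as
 * the free slot list (see the use of q->dep[SFQ_DEPTH].next in
 * sfq_enqueue()), and together with max_depth the longest slot can be
 * found in O(1) when a packet has to be dropped. sfq_link() (re)inserts
 * slot x at the head of the list matching its current queue length.
 */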
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d = q->qs[x].qlen + SFQ_DEPTH;

	p = d;
	n = q->dep[d].next;
	q->dep[x].next = n;
	q->dep[x].prev = p;
	q->dep[p].next = q->dep[n].prev = x;
}

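/* One packet has just been removed from slot x: unlink the slot from
 * its old depth list and relink it one level down. n == p after the
 * unlink means x was the only slot at its old depth, so if that depth
 * was the maximum, max_depth shrinks by one.
 */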
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;

	if (n == p && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}

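/* Mirror image of sfq_dec(): one packet has just been queued to slot x,
 * so move the slot one depth list up and raise max_depth if this slot
 * is now the longest one.
 */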
static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;
	d = q->qs[x].qlen;
	if (q->max_depth < d)
		q->max_depth = d;

	sfq_link(q, x);
}

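/* Called when the qdisc is over limit: reclaim one packet. Packets are
 * dropped from the tail of the longest slot, which max_depth locates in
 * O(1); if every active slot holds exactly one packet, the slot that
 * would be served next is emptied and unlinked from the round instead.
 */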
static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full! Find the longest slot and
	   drop a packet from it */

	if (d > 1) {
		sfq_index x = q->dep[d+SFQ_DEPTH].next;
		skb = q->qs[x].prev;
		len = skb->len;
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		d = q->next[q->tail];
		q->next[q->tail] = q->next[d];
		q->allot[q->next[d]] += q->quantum;
		skb = q->qs[d].prev;
		len = skb->len;
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}

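/* Enqueue: hash the packet to a bucket; an empty bucket (marked with
 * SFQ_DEPTH) is pointed at a slot taken from the free list. A slot that
 * just became non-empty is linked in as the new tail of the round-robin
 * ring. Once the limit is reached, sfq_drop() reclaims a packet from
 * the longest flow and NET_XMIT_CN signals the congestion upstream.
 */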
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned hash = sfq_hash(q, skb);
	sfq_index x;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}
	sch->qstats.backlog += skb->len;
	__skb_queue_tail(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen < q->limit-1) {
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
		return 0;
	}

	sfq_drop(sch);
	return NET_XMIT_CN;
}

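/* Requeue: identical to sfq_enqueue() except that the packet goes back
 * to the head of its slot, so it is the next one dequeued from that
 * flow, and a requeue is counted instead of the byte/packet stats.
 */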
static int
sfq_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned hash = sfq_hash(q, skb);
	sfq_index x;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}
	sch->qstats.backlog += skb->len;
	__skb_queue_head(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen < q->limit - 1) {
		sch->qstats.requeues++;
		return 0;
	}

	sch->qstats.drops++;
	sfq_drop(sch);
	return NET_XMIT_CN;
}

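/* Dequeue one packet, round-robin over the active slots. The served
 * slot pays for the packet out of its allotment; when the allotment is
 * exhausted the next slot takes over and receives a fresh quantum, so
 * each flow gets roughly quantum bytes per round. A slot that runs
 * empty is unhashed and unlinked from the ring.
 */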
static struct sk_buff *
sfq_dequeue(struct Qdisc* sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, old_a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = old_a = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[a]);
	sfq_dec(q, a);
	sch->q.qlen--;
	sch->qstats.backlog -= skb->len;

	/* Is the slot empty? */
	if (q->qs[a].qlen == 0) {
		q->ht[q->hash[a]] = SFQ_DEPTH;
		a = q->next[a];
		if (a == old_a) {
			q->tail = SFQ_DEPTH;
			return skb;
		}
		q->next[q->tail] = a;
		q->allot[a] += q->quantum;
	} else if ((q->allot[a] -= skb->len) <= 0) {
		q->tail = a;
		a = q->next[a];
		q->allot[a] += q->quantum;
	}
	return skb;
}

static void
sfq_reset(struct Qdisc* sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

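/* Timer handler: pick a new hash perturbation so that flows which
 * collided in one bucket are likely to separate again. The 0x1F mask
 * keeps the value in 0..31, matching the shifts in sfq_fold_hash().
 * The timer re-arms only while a perturbation period is configured.
 */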
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc*)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	q->perturbation = net_random()&0x1F;

	if (q->perturb_period) {
		q->perturb_timer.expires = jiffies + q->perturb_period;
		add_timer(&q->perturb_timer);
	}
}

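/* Apply new parameters from userspace: a quantum of 0 means "use the
 * device MTU", the perturbation period arrives in seconds and is kept
 * in jiffies, and the limit is clamped to SFQ_DEPTH. Packets in excess
 * of the new limit are dropped right away, under the tree lock.
 */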
static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = RTA_DATA(opt);
	unsigned int qlen;

	if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(sch->dev);
	q->perturb_period = ctl->perturb_period*HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH);

	qlen = sch->q.qlen;
	while (sch->q.qlen >= q->limit-1)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		q->perturb_timer.expires = jiffies + q->perturb_period;
		add_timer(&q->perturb_timer);
	}
	sch_tree_unlock(sch);
	return 0;
}

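/* One-time setup: every hash bucket and the round-robin tail start out
 * as SFQ_DEPTH ("no slot"), every slot queue is empty, and the final
 * sfq_link() loop threads all SFQ_DEPTH slots onto the depth-0 list,
 * i.e. the free list.
 */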
static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	init_timer(&q->perturb_timer);
	q->perturb_timer.data = (unsigned long)sch;
	q->perturb_timer.function = sfq_perturbation;

	for (i=0; i<SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;
	for (i=0; i<SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH;
		q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH;
	}
	q->limit = SFQ_DEPTH;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	if (opt == NULL) {
		q->quantum = psched_mtu(sch->dev);
		q->perturb_period = 0;
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}
	for (i=0; i<SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	del_timer(&q->perturb_timer);
}

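/* Report the configuration back to userspace. This implementation has
 * no independently tunable flow count, so "flows" simply mirrors the
 * configured limit and the divisor is the compile-time constant.
 */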
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char	 *b = skb->tail;
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period/HZ;

	opt.limit = q->limit;
	opt.divisor = SFQ_HASH_DIVISOR;
	opt.flows = q->limit;

	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static struct Qdisc_ops sfq_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.requeue	=	sfq_requeue,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");