xref: /linux/net/sched/sch_sfq.c (revision f3d9478b2ce468c3115b02ecae7e975990697f15)
1 /*
2  * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  */
11 
12 #include <linux/config.h>
13 #include <linux/module.h>
14 #include <asm/uaccess.h>
15 #include <asm/system.h>
16 #include <linux/bitops.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/jiffies.h>
20 #include <linux/string.h>
21 #include <linux/mm.h>
22 #include <linux/socket.h>
23 #include <linux/sockios.h>
24 #include <linux/in.h>
25 #include <linux/errno.h>
26 #include <linux/interrupt.h>
27 #include <linux/if_ether.h>
28 #include <linux/inet.h>
29 #include <linux/netdevice.h>
30 #include <linux/etherdevice.h>
31 #include <linux/notifier.h>
32 #include <linux/init.h>
33 #include <net/ip.h>
34 #include <linux/ipv6.h>
35 #include <net/route.h>
36 #include <linux/skbuff.h>
37 #include <net/sock.h>
38 #include <net/pkt_sched.h>
39 
40 
41 /*	Stochastic Fairness Queuing algorithm.
42 	=======================================
43 
44 	Source:
45 	Paul E. McKenney "Stochastic Fairness Queuing",
46 	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
47 
48 	Paul E. McKenney "Stochastic Fairness Queuing",
49 	"Interworking: Research and Experience", v.2, 1991, p.113-131.
50 
51 
52 	See also:
53 	M. Shreedhar and George Varghese "Efficient Fair
54 	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
55 
56 
57 	This is not the thing that is usually called (W)FQ nowadays.
58 	It does not use any timestamp mechanism, but instead
59 	processes queues in round-robin order.
60 
61 	ADVANTAGE:
62 
63 	- It is very cheap. Both CPU and memory requirements are minimal.
64 
65 	DRAWBACKS:
66 
67 	- "Stochastic" -> It is not 100% fair.
68 	When hash collisions occur, several flows are considered as one.
69 
70 	- "Round-robin" -> It introduces larger delays than virtual clock
71 	based schemes, and should not be used for isolating interactive
72 	traffic	from non-interactive. It means, that this scheduler
73 	should be used as leaf of CBQ or P3, which put interactive traffic
74 	to higher priority band.
75 
76 	We still need true WFQ for top level CSZ, but using WFQ
77 	for the best effort traffic is absolutely pointless:
78 	SFQ is superior for this purpose.
79 
80 	IMPLEMENTATION:
81 	This implementation limits maximal queue length to 128;
82 	maximal mtu to 2^15-1; number of hash buckets to 1024.
83 	The only goal of this restrictions was that all data
84 	fit into one 4K page :-). Struct sfq_sched_data is
85 	organized in anti-cache manner: all the data for a bucket
86 	are scattered over different locations. This is not good,
87 	but it allowed me to put it into 4K.
88 
89 	It is easy to increase these values, but not in flight.  */
90 
91 #define SFQ_DEPTH		128
92 #define SFQ_HASH_DIVISOR	1024
93 
94 /* This type should contain at least SFQ_DEPTH*2 values */
95 typedef unsigned char sfq_index;
96 
97 struct sfq_head
98 {
99 	sfq_index	next;
100 	sfq_index	prev;
101 };
102 
103 struct sfq_sched_data
104 {
105 /* Parameters */
106 	int		perturb_period;
107 	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
108 	int		limit;
109 
110 /* Variables */
111 	struct timer_list perturb_timer;
112 	int		perturbation;
113 	sfq_index	tail;		/* Index of current slot in round */
114 	sfq_index	max_depth;	/* Maximal depth */
115 
116 	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
117 	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
118 	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
119 	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
120 	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */
121 	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
122 };
123 
124 static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
125 {
126 	int pert = q->perturbation;
127 
128 	/* Have we any rotation primitives? If not, WHY? */
129 	h ^= (h1<<pert) ^ (h1>>(0x1F - pert));
130 	h ^= h>>10;
131 	return h & 0x3FF;
132 }
133 
134 static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
135 {
136 	u32 h, h2;
137 
138 	switch (skb->protocol) {
139 	case __constant_htons(ETH_P_IP):
140 	{
141 		struct iphdr *iph = skb->nh.iph;
142 		h = iph->daddr;
143 		h2 = iph->saddr^iph->protocol;
144 		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
145 		    (iph->protocol == IPPROTO_TCP ||
146 		     iph->protocol == IPPROTO_UDP ||
147 		     iph->protocol == IPPROTO_SCTP ||
148 		     iph->protocol == IPPROTO_DCCP ||
149 		     iph->protocol == IPPROTO_ESP))
150 			h2 ^= *(((u32*)iph) + iph->ihl);
151 		break;
152 	}
153 	case __constant_htons(ETH_P_IPV6):
154 	{
155 		struct ipv6hdr *iph = skb->nh.ipv6h;
156 		h = iph->daddr.s6_addr32[3];
157 		h2 = iph->saddr.s6_addr32[3]^iph->nexthdr;
158 		if (iph->nexthdr == IPPROTO_TCP ||
159 		    iph->nexthdr == IPPROTO_UDP ||
160 		    iph->nexthdr == IPPROTO_SCTP ||
161 		    iph->nexthdr == IPPROTO_DCCP ||
162 		    iph->nexthdr == IPPROTO_ESP)
163 			h2 ^= *(u32*)&iph[1];
164 		break;
165 	}
166 	default:
167 		h = (u32)(unsigned long)skb->dst^skb->protocol;
168 		h2 = (u32)(unsigned long)skb->sk;
169 	}
170 	return sfq_fold_hash(q, h, h2);
171 }
172 
173 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
174 {
175 	sfq_index p, n;
176 	int d = q->qs[x].qlen + SFQ_DEPTH;
177 
178 	p = d;
179 	n = q->dep[d].next;
180 	q->dep[x].next = n;
181 	q->dep[x].prev = p;
182 	q->dep[p].next = q->dep[n].prev = x;
183 }
184 
185 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
186 {
187 	sfq_index p, n;
188 
189 	n = q->dep[x].next;
190 	p = q->dep[x].prev;
191 	q->dep[p].next = n;
192 	q->dep[n].prev = p;
193 
194 	if (n == p && q->max_depth == q->qs[x].qlen + 1)
195 		q->max_depth--;
196 
197 	sfq_link(q, x);
198 }
199 
200 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
201 {
202 	sfq_index p, n;
203 	int d;
204 
205 	n = q->dep[x].next;
206 	p = q->dep[x].prev;
207 	q->dep[p].next = n;
208 	q->dep[n].prev = p;
209 	d = q->qs[x].qlen;
210 	if (q->max_depth < d)
211 		q->max_depth = d;
212 
213 	sfq_link(q, x);
214 }
215 
216 static unsigned int sfq_drop(struct Qdisc *sch)
217 {
218 	struct sfq_sched_data *q = qdisc_priv(sch);
219 	sfq_index d = q->max_depth;
220 	struct sk_buff *skb;
221 	unsigned int len;
222 
223 	/* Queue is full! Find the longest slot and
224 	   drop a packet from it */
225 
226 	if (d > 1) {
227 		sfq_index x = q->dep[d+SFQ_DEPTH].next;
228 		skb = q->qs[x].prev;
229 		len = skb->len;
230 		__skb_unlink(skb, &q->qs[x]);
231 		kfree_skb(skb);
232 		sfq_dec(q, x);
233 		sch->q.qlen--;
234 		sch->qstats.drops++;
235 		sch->qstats.backlog -= len;
236 		return len;
237 	}
238 
239 	if (d == 1) {
240 		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
241 		d = q->next[q->tail];
242 		q->next[q->tail] = q->next[d];
243 		q->allot[q->next[d]] += q->quantum;
244 		skb = q->qs[d].prev;
245 		len = skb->len;
246 		__skb_unlink(skb, &q->qs[d]);
247 		kfree_skb(skb);
248 		sfq_dec(q, d);
249 		sch->q.qlen--;
250 		q->ht[q->hash[d]] = SFQ_DEPTH;
251 		sch->qstats.drops++;
252 		sch->qstats.backlog -= len;
253 		return len;
254 	}
255 
256 	return 0;
257 }
258 
259 static int
260 sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
261 {
262 	struct sfq_sched_data *q = qdisc_priv(sch);
263 	unsigned hash = sfq_hash(q, skb);
264 	sfq_index x;
265 
266 	x = q->ht[hash];
267 	if (x == SFQ_DEPTH) {
268 		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
269 		q->hash[x] = hash;
270 	}
271 	sch->qstats.backlog += skb->len;
272 	__skb_queue_tail(&q->qs[x], skb);
273 	sfq_inc(q, x);
274 	if (q->qs[x].qlen == 1) {		/* The flow is new */
275 		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
276 			q->tail = x;
277 			q->next[x] = x;
278 			q->allot[x] = q->quantum;
279 		} else {
280 			q->next[x] = q->next[q->tail];
281 			q->next[q->tail] = x;
282 			q->tail = x;
283 		}
284 	}
285 	if (++sch->q.qlen < q->limit-1) {
286 		sch->bstats.bytes += skb->len;
287 		sch->bstats.packets++;
288 		return 0;
289 	}
290 
291 	sfq_drop(sch);
292 	return NET_XMIT_CN;
293 }
294 
295 static int
296 sfq_requeue(struct sk_buff *skb, struct Qdisc* sch)
297 {
298 	struct sfq_sched_data *q = qdisc_priv(sch);
299 	unsigned hash = sfq_hash(q, skb);
300 	sfq_index x;
301 
302 	x = q->ht[hash];
303 	if (x == SFQ_DEPTH) {
304 		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
305 		q->hash[x] = hash;
306 	}
307 	sch->qstats.backlog += skb->len;
308 	__skb_queue_head(&q->qs[x], skb);
309 	sfq_inc(q, x);
310 	if (q->qs[x].qlen == 1) {		/* The flow is new */
311 		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
312 			q->tail = x;
313 			q->next[x] = x;
314 			q->allot[x] = q->quantum;
315 		} else {
316 			q->next[x] = q->next[q->tail];
317 			q->next[q->tail] = x;
318 			q->tail = x;
319 		}
320 	}
321 	if (++sch->q.qlen < q->limit - 1) {
322 		sch->qstats.requeues++;
323 		return 0;
324 	}
325 
326 	sch->qstats.drops++;
327 	sfq_drop(sch);
328 	return NET_XMIT_CN;
329 }
330 
331 
332 
333 
334 static struct sk_buff *
335 sfq_dequeue(struct Qdisc* sch)
336 {
337 	struct sfq_sched_data *q = qdisc_priv(sch);
338 	struct sk_buff *skb;
339 	sfq_index a, old_a;
340 
341 	/* No active slots */
342 	if (q->tail == SFQ_DEPTH)
343 		return NULL;
344 
345 	a = old_a = q->next[q->tail];
346 
347 	/* Grab packet */
348 	skb = __skb_dequeue(&q->qs[a]);
349 	sfq_dec(q, a);
350 	sch->q.qlen--;
351 	sch->qstats.backlog -= skb->len;
352 
353 	/* Is the slot empty? */
354 	if (q->qs[a].qlen == 0) {
355 		q->ht[q->hash[a]] = SFQ_DEPTH;
356 		a = q->next[a];
357 		if (a == old_a) {
358 			q->tail = SFQ_DEPTH;
359 			return skb;
360 		}
361 		q->next[q->tail] = a;
362 		q->allot[a] += q->quantum;
363 	} else if ((q->allot[a] -= skb->len) <= 0) {
364 		q->tail = a;
365 		a = q->next[a];
366 		q->allot[a] += q->quantum;
367 	}
368 	return skb;
369 }
370 
371 static void
372 sfq_reset(struct Qdisc* sch)
373 {
374 	struct sk_buff *skb;
375 
376 	while ((skb = sfq_dequeue(sch)) != NULL)
377 		kfree_skb(skb);
378 }
379 
380 static void sfq_perturbation(unsigned long arg)
381 {
382 	struct Qdisc *sch = (struct Qdisc*)arg;
383 	struct sfq_sched_data *q = qdisc_priv(sch);
384 
385 	q->perturbation = net_random()&0x1F;
386 
387 	if (q->perturb_period) {
388 		q->perturb_timer.expires = jiffies + q->perturb_period;
389 		add_timer(&q->perturb_timer);
390 	}
391 }
392 
393 static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
394 {
395 	struct sfq_sched_data *q = qdisc_priv(sch);
396 	struct tc_sfq_qopt *ctl = RTA_DATA(opt);
397 
398 	if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
399 		return -EINVAL;
400 
401 	sch_tree_lock(sch);
402 	q->quantum = ctl->quantum ? : psched_mtu(sch->dev);
403 	q->perturb_period = ctl->perturb_period*HZ;
404 	if (ctl->limit)
405 		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH);
406 
407 	while (sch->q.qlen >= q->limit-1)
408 		sfq_drop(sch);
409 
410 	del_timer(&q->perturb_timer);
411 	if (q->perturb_period) {
412 		q->perturb_timer.expires = jiffies + q->perturb_period;
413 		add_timer(&q->perturb_timer);
414 	}
415 	sch_tree_unlock(sch);
416 	return 0;
417 }
418 
419 static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
420 {
421 	struct sfq_sched_data *q = qdisc_priv(sch);
422 	int i;
423 
424 	init_timer(&q->perturb_timer);
425 	q->perturb_timer.data = (unsigned long)sch;
426 	q->perturb_timer.function = sfq_perturbation;
427 
428 	for (i=0; i<SFQ_HASH_DIVISOR; i++)
429 		q->ht[i] = SFQ_DEPTH;
430 	for (i=0; i<SFQ_DEPTH; i++) {
431 		skb_queue_head_init(&q->qs[i]);
432 		q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH;
433 		q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH;
434 	}
435 	q->limit = SFQ_DEPTH;
436 	q->max_depth = 0;
437 	q->tail = SFQ_DEPTH;
438 	if (opt == NULL) {
439 		q->quantum = psched_mtu(sch->dev);
440 		q->perturb_period = 0;
441 	} else {
442 		int err = sfq_change(sch, opt);
443 		if (err)
444 			return err;
445 	}
446 	for (i=0; i<SFQ_DEPTH; i++)
447 		sfq_link(q, i);
448 	return 0;
449 }
450 
451 static void sfq_destroy(struct Qdisc *sch)
452 {
453 	struct sfq_sched_data *q = qdisc_priv(sch);
454 	del_timer(&q->perturb_timer);
455 }
456 
457 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
458 {
459 	struct sfq_sched_data *q = qdisc_priv(sch);
460 	unsigned char	 *b = skb->tail;
461 	struct tc_sfq_qopt opt;
462 
463 	opt.quantum = q->quantum;
464 	opt.perturb_period = q->perturb_period/HZ;
465 
466 	opt.limit = q->limit;
467 	opt.divisor = SFQ_HASH_DIVISOR;
468 	opt.flows = q->limit;
469 
470 	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
471 
472 	return skb->len;
473 
474 rtattr_failure:
475 	skb_trim(skb, b - skb->data);
476 	return -1;
477 }
478 
479 static struct Qdisc_ops sfq_qdisc_ops = {
480 	.next		=	NULL,
481 	.cl_ops		=	NULL,
482 	.id		=	"sfq",
483 	.priv_size	=	sizeof(struct sfq_sched_data),
484 	.enqueue	=	sfq_enqueue,
485 	.dequeue	=	sfq_dequeue,
486 	.requeue	=	sfq_requeue,
487 	.drop		=	sfq_drop,
488 	.init		=	sfq_init,
489 	.reset		=	sfq_reset,
490 	.destroy	=	sfq_destroy,
491 	.change		=	NULL,
492 	.dump		=	sfq_dump,
493 	.owner		=	THIS_MODULE,
494 };
495 
496 static int __init sfq_module_init(void)
497 {
498 	return register_qdisc(&sfq_qdisc_ops);
499 }
500 static void __exit sfq_module_exit(void)
501 {
502 	unregister_qdisc(&sfq_qdisc_ops);
503 }
504 module_init(sfq_module_init)
505 module_exit(sfq_module_exit)
506 MODULE_LICENSE("GPL");
507