/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>


/*	Stochastic Fairness Queueing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this scheduler
	should be used as a leaf of CBQ or P3, which puts interactive traffic
	into a higher priority band.

	We still need true WFQ for the top level CSZ, but using WFQ
	for the best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits the maximal queue length to 128;
	the maximal mtu to 2^15-1; the number of hash buckets to 1024.
	The only goal of these restrictions was that all the data
	fit into one 4K page :-). Struct sfq_sched_data is
	organized in an anti-cache manner: all the data for a bucket
	are scattered over different locations. This is not good,
	but it allowed me to put it into 4K.

	It is easy to increase these values, but not on the fly.  */
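
/* Usage sketch: once this module is registered, the discipline is
 * normally attached from userspace with the tc utility, e.g. (the
 * device name here is only an example):
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * "perturb 10" re-seeds the hash every 10 seconds via the perturbation
 * timer below, so that colliding flows do not stay merged forever.
 */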

#define SFQ_DEPTH		128
#define SFQ_HASH_DIVISOR	1024

/* This type should contain at least SFQ_DEPTH*2 values: SFQ_DEPTH slot
   indices plus SFQ_DEPTH depth-list heads (see the dep[] array below). */
typedef unsigned char sfq_index;

struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct timer_list perturb_timer;
	int		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
};
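
/* Layout note: dep[0..SFQ_DEPTH-1] are the slots' own list links, while
 * dep[SFQ_DEPTH..2*SFQ_DEPTH-1] are list heads, one per possible queue
 * length.  A slot x always sits on the circular list headed at
 * dep[qs[x].qlen + SFQ_DEPTH]; this is what lets sfq_drop() find a
 * longest slot in O(1) through dep[max_depth + SFQ_DEPTH].next.
 */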

static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	int pert = q->perturbation;

	/* Have we any rotation primitives? If not, WHY?
	 * (This approximates a 32-bit rotation of h1 by pert bits; the
	 * 0x1F keeps the right shift in range even when pert == 0.)
	 */
	h ^= (h1<<pert) ^ (h1>>(0x1F - pert));
	h ^= h>>10;
	return h & 0x3FF;	/* fold down to SFQ_HASH_DIVISOR buckets */
}

static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
	{
		struct iphdr *iph = skb->nh.iph;
		h = iph->daddr;
		h2 = iph->saddr^iph->protocol;
		/* For unfragmented packets of known transports, fold in
		 * the first 32 bits past the IP header (the port pair,
		 * or the SPI in the ESP case).
		 */
		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
		    (iph->protocol == IPPROTO_TCP ||
		     iph->protocol == IPPROTO_UDP ||
		     iph->protocol == IPPROTO_SCTP ||
		     iph->protocol == IPPROTO_DCCP ||
		     iph->protocol == IPPROTO_ESP))
			h2 ^= *(((u32*)iph) + iph->ihl);
		break;
	}
	case __constant_htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph = skb->nh.ipv6h;
		h = iph->daddr.s6_addr32[3];
		h2 = iph->saddr.s6_addr32[3]^iph->nexthdr;
		if (iph->nexthdr == IPPROTO_TCP ||
		    iph->nexthdr == IPPROTO_UDP ||
		    iph->nexthdr == IPPROTO_SCTP ||
		    iph->nexthdr == IPPROTO_DCCP ||
		    iph->nexthdr == IPPROTO_ESP)
			h2 ^= *(u32*)&iph[1];
		break;
	}
	default:
		h = (u32)(unsigned long)skb->dst^skb->protocol;
		h2 = (u32)(unsigned long)skb->sk;
	}
	return sfq_fold_hash(q, h, h2);
}

/* (Re)insert slot x on the depth list matching its current queue length. */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d = q->qs[x].qlen + SFQ_DEPTH;

	p = d;
	n = q->dep[d].next;
	q->dep[x].next = n;
	q->dep[x].prev = p;
	q->dep[p].next = q->dep[n].prev = x;
}

/* Slot x just lost a packet: unlink it, lower max_depth if x was the
 * only slot on the old maximum-depth list, then relink it one list down.
 */
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;

	if (n == p && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}

/* Slot x just gained a packet: unlink it, raise max_depth if needed,
 * then relink it one depth list up.
 */
static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;
	d = q->qs[x].qlen;
	if (q->max_depth < d)
		q->max_depth = d;

	sfq_link(q, x);
}
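
/* Drop policy: take a longest slot (depth == max_depth) and discard its
 * most recently queued packet, i.e. tail-drop within the fattest flow.
 * This is what makes SFQ push loss onto the heaviest sender.
 */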
static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full! Find the longest slot and
	   drop a packet from it */

	if (d > 1) {
		sfq_index x = q->dep[d+SFQ_DEPTH].next;
		skb = q->qs[x].prev;
		len = skb->len;
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1.
		   Drop the slot after the tail and unlink it from both the
		   round-robin ring and the hash table. */
		d = q->next[q->tail];
		q->next[q->tail] = q->next[d];
		q->allot[q->next[d]] += q->quantum;
		skb = q->qs[d].prev;
		len = skb->len;
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}
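
/* Enqueue: hash the packet to a bucket; if the bucket has no slot yet,
 * grab a free one from the depth-0 list (dep[SFQ_DEPTH]).  A newly
 * active flow is spliced into the round-robin ring after the tail.
 */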
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned hash = sfq_hash(q, skb);
	sfq_index x;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}
	sch->qstats.backlog += skb->len;
	__skb_queue_tail(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen < q->limit-1) {
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
		return 0;
	}

	sfq_drop(sch);
	return NET_XMIT_CN;
}

/* Requeue differs from enqueue only in queueing at the head of the slot
 * and in the statistics it updates.
 */
static int
sfq_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned hash = sfq_hash(q, skb);
	sfq_index x;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}
	sch->qstats.backlog += skb->len;
	__skb_queue_head(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen < q->limit - 1) {
		sch->qstats.requeues++;
		return 0;
	}

	sch->qstats.drops++;
	sfq_drop(sch);
	return NET_XMIT_CN;
}
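
/* Dequeue: serve the slot after the tail.  Each slot spends its byte
 * allotment ("allot") and is refilled by one quantum per round, which
 * is the deficit round robin idea from the Shreedhar/Varghese paper
 * cited above.
 */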
static struct sk_buff *
sfq_dequeue(struct Qdisc* sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, old_a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = old_a = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[a]);
	sfq_dec(q, a);
	sch->q.qlen--;
	sch->qstats.backlog -= skb->len;

	/* Is the slot empty? */
	if (q->qs[a].qlen == 0) {
		q->ht[q->hash[a]] = SFQ_DEPTH;
		a = q->next[a];
		if (a == old_a) {
			q->tail = SFQ_DEPTH;
			return skb;
		}
		q->next[q->tail] = a;
		q->allot[a] += q->quantum;
	} else if ((q->allot[a] -= skb->len) <= 0) {
		q->tail = a;
		a = q->next[a];
		q->allot[a] += q->quantum;
	}
	return skb;
}

static void
sfq_reset(struct Qdisc* sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

/* Timer handler: pick a new 5-bit perturbation so sfq_fold_hash() maps
 * flows to different buckets, then re-arm if a period is configured.
 */
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc*)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	q->perturbation = net_random()&0x1F;

	if (q->perturb_period) {
		q->perturb_timer.expires = jiffies + q->perturb_period;
		add_timer(&q->perturb_timer);
	}
}
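
/* Runtime reconfiguration: quantum defaults to the device MTU, the
 * perturbation period is given in seconds, and the limit is clamped to
 * SFQ_DEPTH.  Excess packets are dropped under the tree lock before the
 * new limit takes effect.
 */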
static int sfq_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = RTA_DATA(opt);

	if (opt->rta_len < RTA_LENGTH(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(sch->dev);
	q->perturb_period = ctl->perturb_period*HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH);

	while (sch->q.qlen >= q->limit-1)
		sfq_drop(sch);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		q->perturb_timer.expires = jiffies + q->perturb_period;
		add_timer(&q->perturb_timer);
	}
	sch_tree_unlock(sch);
	return 0;
}

static int sfq_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	init_timer(&q->perturb_timer);
	q->perturb_timer.data = (unsigned long)sch;
	q->perturb_timer.function = sfq_perturbation;

	for (i=0; i<SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;	/* SFQ_DEPTH means "no slot" */
	for (i=0; i<SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH;
		q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH;
	}
	q->limit = SFQ_DEPTH;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	if (opt == NULL) {
		q->quantum = psched_mtu(sch->dev);
		q->perturb_period = 0;
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}
	for (i=0; i<SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	del_timer(&q->perturb_timer);
}
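
/* Dump the current configuration as a tc_sfq_qopt.  Note that flows is
 * reported as the configured limit, and divisor as the fixed hash
 * table size.
 */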
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char	 *b = skb->tail;
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period/HZ;

	opt.limit = q->limit;
	opt.divisor = SFQ_HASH_DIVISOR;
	opt.flows = q->limit;

	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static struct Qdisc_ops sfq_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.requeue	=	sfq_requeue,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}

static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}

module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");