xref: /linux/net/sched/sch_red.c (revision 20d0021394c1b070bf04b22c5bc8fdb437edd4c5)
1 /*
2  * net/sched/sch_red.c	Random Early Detection queue.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Changes:
12  * J Hadi Salim <hadi@nortel.com> 980914:	computation fixes
13  * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
14  * J Hadi Salim <hadi@nortelnetworks.com> 980816:  ECN support
15  */
16 
17 #include <linux/config.h>
18 #include <linux/module.h>
19 #include <asm/uaccess.h>
20 #include <asm/system.h>
21 #include <linux/bitops.h>
22 #include <linux/types.h>
23 #include <linux/kernel.h>
24 #include <linux/sched.h>
25 #include <linux/string.h>
26 #include <linux/mm.h>
27 #include <linux/socket.h>
28 #include <linux/sockios.h>
29 #include <linux/in.h>
30 #include <linux/errno.h>
31 #include <linux/interrupt.h>
32 #include <linux/if_ether.h>
33 #include <linux/inet.h>
34 #include <linux/netdevice.h>
35 #include <linux/etherdevice.h>
36 #include <linux/notifier.h>
37 #include <net/ip.h>
38 #include <net/route.h>
39 #include <linux/skbuff.h>
40 #include <net/sock.h>
41 #include <net/pkt_sched.h>
42 #include <net/inet_ecn.h>
43 #include <net/dsfield.h>
44 
45 
46 /*	Random Early Detection (RED) algorithm.
47 	=======================================
48 
49 	Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
50 	for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.
51 
52 	This file codes a "divisionless" version of RED algorithm
53 	as written down in Fig.17 of the paper.
54 
55 Short description.
56 ------------------
57 
58 	When a new packet arrives we calculate the average queue length:
59 
60 	avg = (1-W)*avg + W*current_queue_len,
61 
62 	W is the filter time constant (chosen as 2^(-Wlog)), it controls
63 	the inertia of the algorithm. To allow larger bursts, W should be
64 	decreased.
65 
66 	if (avg > th_max) -> packet marked (dropped).
67 	if (avg < th_min) -> packet passes.
68 	if (th_min < avg < th_max) we calculate probability:
69 
70 	Pb = max_P * (avg - th_min)/(th_max-th_min)
71 
72 	and mark (drop) packet with this probability.
73 	Pb changes from 0 (at avg==th_min) to max_P (avg==th_max).
74 	max_P should be small (not 1), usually 0.01..0.02 is good value.
75 
	max_P is chosen as a number such that max_P/(th_max-th_min)
	is a negative power of two, so that the whole computation
	can be carried out with shifts only.
79 
80 
81 	Parameters, settable by user:
82 	-----------------------------
83 
84 	limit		- bytes (must be > qth_max + burst)
85 
86 	Hard limit on queue length, should be chosen >qth_max
87 	to allow packet bursts. This parameter does not
88 	affect the algorithms behaviour and can be chosen
89 	arbitrarily high (well, less than ram size)
90 	Really, this limit will never be reached
91 	if RED works correctly.
92 
93 	qth_min		- bytes (should be < qth_max/2)
94 	qth_max		- bytes (should be at least 2*qth_min and less limit)
95 	Wlog	       	- bits (<32) log(1/W).
96 	Plog	       	- bits (<32)
97 
98 	Plog is related to max_P by formula:
99 
100 	max_P = (qth_max-qth_min)/2^Plog;
101 
102 	F.e. if qth_max=128K and qth_min=32K, then Plog=22
103 	corresponds to max_P=0.02
104 
105 	Scell_log
106 	Stab
107 
	Lookup table for log((1-W)^(t/t_ave)).
109 
110 
111 NOTES:
112 
113 Upper bound on W.
114 -----------------
115 
116 	If you want to allow bursts of L packets of size S,
117 	you should choose W:
118 
119 	L + 1 - th_min/S < (1-(1-W)^L)/W
120 
121 	th_min/S = 32         th_min/S = 4
122 
123 	log(W)	L
124 	-1	33
125 	-2	35
126 	-3	39
127 	-4	46
128 	-5	57
129 	-6	75
130 	-7	101
131 	-8	135
132 	-9	190
133 	etc.
134  */
135 
/*
 * Per-qdisc RED state.  "A scaled" means the value is kept in fixed
 * point with the binary point at bit Wlog (see the algorithm comment
 * above).  All parameters come from userspace via red_change().
 */
struct red_sched_data
{
/* Parameters */
	u32		limit;		/* HARD maximal queue length	*/
	u32		qth_min;	/* Min average length threshold: A scaled */
	u32		qth_max;	/* Max average length threshold: A scaled */
	u32		Rmask;		/* Mask for cached random value: 2^Plog - 1 */
	u32		Scell_max;	/* Cap on measured idle time (255 << Scell_log) */
	unsigned char	flags;		/* TC_RED_ECN, ... */
	char		Wlog;		/* log(W)		*/
	char		Plog;		/* random number bits	*/
	char		Scell_log;	/* log of idle-time cell size for Stab lookup */
	u8		Stab[256];	/* Idle-decay table: shift counts applied to qave */

/* Variables */
	unsigned long	qave;		/* Average queue length: A scaled */
	int		qcount;		/* Packets since last random number generation */
	u32		qR;		/* Cached random number */

	psched_time_t	qidlestart;	/* Start of idle period		*/
	struct tc_red_xstats st;	/* RED-specific counters (early/pdrop/other/marked) */
};
158 
159 static int red_ecn_mark(struct sk_buff *skb)
160 {
161 	if (skb->nh.raw + 20 > skb->tail)
162 		return 0;
163 
164 	switch (skb->protocol) {
165 	case __constant_htons(ETH_P_IP):
166 		if (INET_ECN_is_not_ect(skb->nh.iph->tos))
167 			return 0;
168 		IP_ECN_set_ce(skb->nh.iph);
169 		return 1;
170 	case __constant_htons(ETH_P_IPV6):
171 		if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
172 			return 0;
173 		IP6_ECN_set_ce(skb->nh.ipv6h);
174 		return 1;
175 	default:
176 		return 0;
177 	}
178 }
179 
/*
 * Enqueue: update the average queue length estimate (qave), then accept,
 * ECN-mark, or drop the packet according to the RED algorithm described
 * in the comment at the top of this file.  Returns NET_XMIT_SUCCESS on
 * accept, NET_XMIT_DROP on hard-limit tail drop, NET_XMIT_CN on an
 * early/forced drop.
 */
static int
red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct red_sched_data *q = qdisc_priv(sch);

	psched_time_t now;

	if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
		/* Queue was idle: decay qave for the idle time, then
		   clear the idle timestamp. */
		long us_idle;
		int  shift;

		PSCHED_GET_TIME(now);
		us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
		PSCHED_SET_PASTPERFECT(q->qidlestart);

/*
   The problem: ideally, average length queue recalculation should
   be done over constant clock intervals. This is too expensive, so that
   the calculation is driven by outgoing packets.
   When the queue is idle we have to model this clock by hand.

   SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
   dummy packets as a burst after idle time, i.e.

          q->qave *= (1-W)^m

   This is an apparently overcomplicated solution (f.e. we have to precompute
   a table to make this calculation in reasonable time)
   I believe that a simpler model may be used here,
   but it is field for experiments.
*/
		shift = q->Stab[us_idle>>q->Scell_log];

		if (shift) {
			q->qave >>= shift;
		} else {
			/* Approximate initial part of exponent
			   with linear function:
			   (1-W)^m ~= 1-mW + ...

			   Seems, it is the best solution to
			   problem of too coarse exponent tabulation.
			 */

			us_idle = (q->qave * us_idle)>>q->Scell_log;
			if (us_idle < q->qave/2)
				q->qave -= us_idle;
			else
				q->qave >>= 1;
		}
	} else {
		/* Normal case: exponentially weighted moving average of
		   the byte backlog. */
		q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
		/* NOTE:
		   q->qave is fixed point number with point at Wlog.
		   The formula above is equivalent to the floating point
		   version:

		   qave = qave*(1-W) + sch->qstats.backlog*W;
		                                           --ANK (980924)
		 */
	}

	if (q->qave < q->qth_min) {
		/* Below min threshold: always accept, reset the count. */
		q->qcount = -1;
enqueue:
		if (sch->qstats.backlog + skb->len <= q->limit) {
			__skb_queue_tail(&sch->q, skb);
			sch->qstats.backlog += skb->len;
			sch->bstats.bytes += skb->len;
			sch->bstats.packets++;
			return NET_XMIT_SUCCESS;
		} else {
			/* Hard byte limit exceeded: tail drop. */
			q->st.pdrop++;
		}
		kfree_skb(skb);
		sch->qstats.drops++;
		return NET_XMIT_DROP;
	}
	if (q->qave >= q->qth_max) {
		/* Above max threshold: mark (or drop) every packet. */
		q->qcount = -1;
		sch->qstats.overlimits++;
mark:
		if  (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
			q->st.early++;
			goto drop;
		}
		q->st.marked++;
		goto enqueue;
	}

	/* Between thresholds: mark with probability proportional to
	   (qave - qth_min), using only shifts and one multiply. */
	if (++q->qcount) {
		/* The formula used below causes questions.

		   OK. qR is random number in the interval 0..Rmask
		   i.e. 0..(2^Plog). If we used floating point
		   arithmetics, it would be: (2^Plog)*rnd_num,
		   where rnd_num is less 1.

		   Taking into account, that qave have fixed
		   point at Wlog, and Plog is related to max_P by
		   max_P = (qth_max-qth_min)/2^Plog; two lines
		   below have the following floating point equivalent:

		   max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount

		   Any questions? --ANK (980924)
		 */
		if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
			goto enqueue;
		q->qcount = 0;
		q->qR = net_random()&q->Rmask;
		sch->qstats.overlimits++;
		goto mark;
	}
	/* qcount wrapped from -1 to 0: draw a fresh random number. */
	q->qR = net_random()&q->Rmask;
	goto enqueue;

drop:
	kfree_skb(skb);
	sch->qstats.drops++;
	return NET_XMIT_CN;
}
302 
303 static int
304 red_requeue(struct sk_buff *skb, struct Qdisc* sch)
305 {
306 	struct red_sched_data *q = qdisc_priv(sch);
307 
308 	PSCHED_SET_PASTPERFECT(q->qidlestart);
309 
310 	__skb_queue_head(&sch->q, skb);
311 	sch->qstats.backlog += skb->len;
312 	sch->qstats.requeues++;
313 	return 0;
314 }
315 
316 static struct sk_buff *
317 red_dequeue(struct Qdisc* sch)
318 {
319 	struct sk_buff *skb;
320 	struct red_sched_data *q = qdisc_priv(sch);
321 
322 	skb = __skb_dequeue(&sch->q);
323 	if (skb) {
324 		sch->qstats.backlog -= skb->len;
325 		return skb;
326 	}
327 	PSCHED_GET_TIME(q->qidlestart);
328 	return NULL;
329 }
330 
331 static unsigned int red_drop(struct Qdisc* sch)
332 {
333 	struct sk_buff *skb;
334 	struct red_sched_data *q = qdisc_priv(sch);
335 
336 	skb = __skb_dequeue_tail(&sch->q);
337 	if (skb) {
338 		unsigned int len = skb->len;
339 		sch->qstats.backlog -= len;
340 		sch->qstats.drops++;
341 		q->st.other++;
342 		kfree_skb(skb);
343 		return len;
344 	}
345 	PSCHED_GET_TIME(q->qidlestart);
346 	return 0;
347 }
348 
349 static void red_reset(struct Qdisc* sch)
350 {
351 	struct red_sched_data *q = qdisc_priv(sch);
352 
353 	__skb_queue_purge(&sch->q);
354 	sch->qstats.backlog = 0;
355 	PSCHED_SET_PASTPERFECT(q->qidlestart);
356 	q->qave = 0;
357 	q->qcount = -1;
358 }
359 
360 static int red_change(struct Qdisc *sch, struct rtattr *opt)
361 {
362 	struct red_sched_data *q = qdisc_priv(sch);
363 	struct rtattr *tb[TCA_RED_STAB];
364 	struct tc_red_qopt *ctl;
365 
366 	if (opt == NULL ||
367 	    rtattr_parse_nested(tb, TCA_RED_STAB, opt) ||
368 	    tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
369 	    RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
370 	    RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
371 		return -EINVAL;
372 
373 	ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);
374 
375 	sch_tree_lock(sch);
376 	q->flags = ctl->flags;
377 	q->Wlog = ctl->Wlog;
378 	q->Plog = ctl->Plog;
379 	q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
380 	q->Scell_log = ctl->Scell_log;
381 	q->Scell_max = (255<<q->Scell_log);
382 	q->qth_min = ctl->qth_min<<ctl->Wlog;
383 	q->qth_max = ctl->qth_max<<ctl->Wlog;
384 	q->limit = ctl->limit;
385 	memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);
386 
387 	q->qcount = -1;
388 	if (skb_queue_empty(&sch->q))
389 		PSCHED_SET_PASTPERFECT(q->qidlestart);
390 	sch_tree_unlock(sch);
391 	return 0;
392 }
393 
/* Qdisc init: RED takes mandatory parameters, so setup is exactly the
 * same as a parameter change. */
static int red_init(struct Qdisc* sch, struct rtattr *opt)
{
	return red_change(sch, opt);
}
398 
399 static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
400 {
401 	struct red_sched_data *q = qdisc_priv(sch);
402 	unsigned char	 *b = skb->tail;
403 	struct rtattr *rta;
404 	struct tc_red_qopt opt;
405 
406 	rta = (struct rtattr*)b;
407 	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
408 	opt.limit = q->limit;
409 	opt.qth_min = q->qth_min>>q->Wlog;
410 	opt.qth_max = q->qth_max>>q->Wlog;
411 	opt.Wlog = q->Wlog;
412 	opt.Plog = q->Plog;
413 	opt.Scell_log = q->Scell_log;
414 	opt.flags = q->flags;
415 	RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
416 	rta->rta_len = skb->tail - b;
417 
418 	return skb->len;
419 
420 rtattr_failure:
421 	skb_trim(skb, b - skb->data);
422 	return -1;
423 }
424 
425 static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
426 {
427 	struct red_sched_data *q = qdisc_priv(sch);
428 
429 	return gnet_stats_copy_app(d, &q->st, sizeof(q->st));
430 }
431 
432 static struct Qdisc_ops red_qdisc_ops = {
433 	.next		=	NULL,
434 	.cl_ops		=	NULL,
435 	.id		=	"red",
436 	.priv_size	=	sizeof(struct red_sched_data),
437 	.enqueue	=	red_enqueue,
438 	.dequeue	=	red_dequeue,
439 	.requeue	=	red_requeue,
440 	.drop		=	red_drop,
441 	.init		=	red_init,
442 	.reset		=	red_reset,
443 	.change		=	red_change,
444 	.dump		=	red_dump,
445 	.dump_stats	=	red_dump_stats,
446 	.owner		=	THIS_MODULE,
447 };
448 
449 static int __init red_module_init(void)
450 {
451 	return register_qdisc(&red_qdisc_ops);
452 }
453 static void __exit red_module_exit(void)
454 {
455 	unregister_qdisc(&red_qdisc_ops);
456 }
457 module_init(red_module_init)
458 module_exit(red_module_exit)
459 MODULE_LICENSE("GPL");
460