/*
 * net/sched/sch_red.c	Random Early Detection queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Changes:
 * J Hadi Salim <hadi@nortel.com> 980914:	computation fixes
 * Alexey Makarenko <makar@phoenix.kharkov.ua> 990814: qave on idle link was calculated incorrectly.
 * J Hadi Salim <hadi@nortelnetworks.com> 980816:  ECN support
 */

#include <linux/config.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/dsfield.h>


/*	Random Early Detection (RED) algorithm.
	=======================================

	Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways
	for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking.

	This file codes a "divisionless" version of the RED algorithm
	as written down in Fig. 17 of the paper.

	Short description.
	------------------

	When a new packet arrives we calculate the average queue length:

	avg = (1-W)*avg + W*current_queue_len,

	W is the filter time constant (chosen as 2^(-Wlog)); it controls
	the inertia of the algorithm. To allow larger bursts, W should be
	decreased.

	if (avg > th_max) -> packet marked (dropped).
	if (avg < th_min) -> packet passes.
	if (th_min < avg < th_max) we calculate the probability:

	Pb = max_P * (avg - th_min)/(th_max - th_min)

	and mark (drop) the packet with this probability.
	Pb changes from 0 (at avg==th_min) to max_P (at avg==th_max).
	max_P should be small (not 1); usually 0.01..0.02 is a good value.

	max_P is chosen as a number such that max_P/(th_max - th_min)
	is a negative power of two, so that the arithmetic contains
	only shifts.


	Parameters, settable by user:
	-----------------------------

	limit		- bytes (must be > qth_max + burst)

	Hard limit on queue length, should be chosen > qth_max
	to allow packet bursts. This parameter does not
	affect the algorithm's behaviour and can be chosen
	arbitrarily high (well, less than ram size).
	Really, this limit will never be reached
	if RED works correctly.

	qth_min		- bytes (should be < qth_max/2)
	qth_max		- bytes (should be at least 2*qth_min and less than limit)
	Wlog		- bits (<32) log(1/W).
	Plog		- bits (<32)

	Plog is related to max_P by the formula:

	max_P = (qth_max - qth_min)/2^Plog;

	F.e. if qth_max=128K and qth_min=32K, then Plog=22
	corresponds to max_P=0.02

	Scell_log
	Stab

	Lookup table for log((1-W)^(t/t_ave)).


NOTES:

Upper bound on W.
-----------------

If you want to allow bursts of L packets of size S,
you should choose W:

L + 1 - th_min/S < (1-(1-W)^L)/W

th_min/S = 32		th_min/S = 4

log(W)	L
-1	33
-2	35
-3	39
-4	46
-5	57
-6	75
-7	101
-8	135
-9	190
etc.
 */

struct red_sched_data
{
/* Parameters */
	u32		limit;		/* HARD maximal queue length	*/
	u32		qth_min;	/* Min average length threshold: A scaled */
	u32		qth_max;	/* Max average length threshold: A scaled */
	u32		Rmask;
	u32		Scell_max;
	unsigned char	flags;
	char		Wlog;		/* log(W)		*/
	char		Plog;		/* random number bits	*/
	char		Scell_log;
	u8		Stab[256];

/* Variables */
	unsigned long	qave;		/* Average queue length: A scaled */
	int		qcount;		/* Packets since last random number generation */
	u32		qR;		/* Cached random number */

	psched_time_t	qidlestart;	/* Start of idle period	*/
	struct tc_red_xstats st;
};

static int red_ecn_mark(struct sk_buff *skb)
{
	if (skb->nh.raw + 20 > skb->tail)
		return 0;

	switch (skb->protocol) {
	case __constant_htons(ETH_P_IP):
		if (INET_ECN_is_not_ect(skb->nh.iph->tos))
			return 0;
		IP_ECN_set_ce(skb->nh.iph);
		return 1;
	case __constant_htons(ETH_P_IPV6):
		if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h)))
			return 0;
		IP6_ECN_set_ce(skb->nh.ipv6h);
		return 1;
	default:
		return 0;
	}
}

static int
red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct red_sched_data *q = qdisc_priv(sch);

	psched_time_t now;

	if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
		long us_idle;
		int  shift;

		PSCHED_GET_TIME(now);
		us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
		PSCHED_SET_PASTPERFECT(q->qidlestart);

/*
   The problem: ideally, the average queue length should be recalculated
   over constant clock intervals. This is too expensive, so the
   calculation is driven by outgoing packets.
   When the queue is idle we have to model this clock by hand.

   SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth)
   dummy packets as a burst after idle time, i.e.

          q->qave *= (1-W)^m

   This is an apparently overcomplicated solution (f.e. we have to precompute
   a table to make this calculation in reasonable time).
   I believe that a simpler model may be used here,
   but it is a field for experiments.
*/
		shift = q->Stab[us_idle>>q->Scell_log];

		if (shift) {
			q->qave >>= shift;
		} else {
			/* Approximate the initial part of the exponent
			   with a linear function:
			   (1-W)^m ~= 1-mW + ...

			   It seems to be the best solution to the
			   problem of too coarse exponent tabulation.
			 */

			us_idle = (q->qave * us_idle)>>q->Scell_log;
			if (us_idle < q->qave/2)
				q->qave -= us_idle;
			else
				q->qave >>= 1;
		}
	} else {
		q->qave += sch->qstats.backlog - (q->qave >> q->Wlog);
		/* NOTE:
		   q->qave is a fixed point number with the point at Wlog.
		   The formula above is equivalent to the floating point
		   version:

		   qave = qave*(1-W) + sch->qstats.backlog*W;
							--ANK (980924)
		 */
	}

	if (q->qave < q->qth_min) {
		q->qcount = -1;
enqueue:
		if (sch->qstats.backlog + skb->len <= q->limit) {
			__skb_queue_tail(&sch->q, skb);
			sch->qstats.backlog += skb->len;
			sch->bstats.bytes += skb->len;
			sch->bstats.packets++;
			return NET_XMIT_SUCCESS;
		} else {
			q->st.pdrop++;
		}
		kfree_skb(skb);
		sch->qstats.drops++;
		return NET_XMIT_DROP;
	}
	if (q->qave >= q->qth_max) {
		q->qcount = -1;
		sch->qstats.overlimits++;
mark:
		if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) {
			q->st.early++;
			goto drop;
		}
		q->st.marked++;
		goto enqueue;
	}

	if (++q->qcount) {
		/* The formula used below causes questions.

		   OK. qR is a random number in the interval 0..Rmask,
		   i.e. 0..(2^Plog). If we used floating point
		   arithmetic, it would be: (2^Plog)*rnd_num,
		   where rnd_num is less than 1.

		   Taking into account that qave has a fixed
		   point at Wlog, and Plog is related to max_P by
		   max_P = (qth_max-qth_min)/2^Plog, the two lines
		   below have the following floating point equivalent:

		   max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount

		   Any questions? --ANK (980924)
		 */
		if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR)
			goto enqueue;
		q->qcount = 0;
		q->qR = net_random()&q->Rmask;
		sch->qstats.overlimits++;
		goto mark;
	}
	q->qR = net_random()&q->Rmask;
	goto enqueue;

drop:
	kfree_skb(skb);
	sch->qstats.drops++;
	return NET_XMIT_CN;
}

static int
red_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct red_sched_data *q = qdisc_priv(sch);

	PSCHED_SET_PASTPERFECT(q->qidlestart);

	__skb_queue_head(&sch->q, skb);
	sch->qstats.backlog += skb->len;
	sch->qstats.requeues++;
	return 0;
}

static struct sk_buff *
red_dequeue(struct Qdisc* sch)
{
	struct sk_buff *skb;
	struct red_sched_data *q = qdisc_priv(sch);

	skb = __skb_dequeue(&sch->q);
	if (skb) {
		sch->qstats.backlog -= skb->len;
		return skb;
	}
	PSCHED_GET_TIME(q->qidlestart);
	return NULL;
}

static unsigned int red_drop(struct Qdisc* sch)
{
	struct sk_buff *skb;
	struct red_sched_data *q = qdisc_priv(sch);

	skb = __skb_dequeue_tail(&sch->q);
	if (skb) {
		unsigned int len = skb->len;
		sch->qstats.backlog -= len;
		sch->qstats.drops++;
		q->st.other++;
		kfree_skb(skb);
		return len;
	}
	PSCHED_GET_TIME(q->qidlestart);
	return 0;
}

static void red_reset(struct Qdisc* sch)
{
	struct red_sched_data *q = qdisc_priv(sch);

	__skb_queue_purge(&sch->q);
	sch->qstats.backlog = 0;
	PSCHED_SET_PASTPERFECT(q->qidlestart);
	q->qave = 0;
	q->qcount = -1;
}

static int red_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct red_sched_data *q = qdisc_priv(sch);
	struct rtattr *tb[TCA_RED_STAB];
	struct tc_red_qopt *ctl;

	if (opt == NULL ||
	    rtattr_parse_nested(tb, TCA_RED_STAB, opt) ||
	    tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 ||
	    RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) ||
	    RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256)
		return -EINVAL;

	ctl = RTA_DATA(tb[TCA_RED_PARMS-1]);

	sch_tree_lock(sch);
	q->flags = ctl->flags;
	q->Wlog = ctl->Wlog;
	q->Plog = ctl->Plog;
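	/* Rmask bounds the cached random number qR to 0..2^Plog-1
	 * (see the comment in red_enqueue); all-ones if Plog would
	 * overflow the shift. */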
	q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
	q->Scell_log = ctl->Scell_log;
	q->Scell_max = (255<<q->Scell_log);
	q->qth_min = ctl->qth_min<<ctl->Wlog;
	q->qth_max = ctl->qth_max<<ctl->Wlog;
	q->limit = ctl->limit;
	memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256);

	q->qcount = -1;
	if (skb_queue_empty(&sch->q))
		PSCHED_SET_PASTPERFECT(q->qidlestart);
	sch_tree_unlock(sch);
	return 0;
}

static int red_init(struct Qdisc* sch, struct rtattr *opt)
{
	return red_change(sch, opt);
}

static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct red_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb->tail;
	struct rtattr *rta;
	struct tc_red_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
	opt.limit = q->limit;
	opt.qth_min = q->qth_min>>q->Wlog;
	opt.qth_max = q->qth_max>>q->Wlog;
	opt.Wlog = q->Wlog;
	opt.Plog = q->Plog;
	opt.Scell_log = q->Scell_log;
	opt.flags = q->flags;
	RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}

static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct red_sched_data *q = qdisc_priv(sch);

	return gnet_stats_copy_app(d, &q->st, sizeof(q->st));
}

static struct Qdisc_ops red_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	NULL,
	.id		=	"red",
	.priv_size	=	sizeof(struct red_sched_data),
	.enqueue	=	red_enqueue,
	.dequeue	=	red_dequeue,
	.requeue	=	red_requeue,
	.drop		=	red_drop,
	.init		=	red_init,
	.reset		=	red_reset,
	.change		=	red_change,
	.dump		=	red_dump,
	.dump_stats	=	red_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init red_module_init(void)
{
	return register_qdisc(&red_qdisc_ops);
}
static void __exit red_module_exit(void)
{
	unregister_qdisc(&red_qdisc_ops);
}
module_init(red_module_init)
module_exit(red_module_exit)
MODULE_LICENSE("GPL");
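
/*
 * Illustrative configuration sketch (not part of the kernel interface defined
 * in this file): the parameters above are normally filled in from userspace by
 * the iproute2 "tc" tool, whose RED front end derives Wlog, Plog, Scell_log
 * and the Stab table from byte thresholds plus burst/bandwidth hints. Assuming
 * the usual tc-red option names, an invocation might look like:
 *
 *	tc qdisc add dev eth0 root red limit 400000 min 30000 max 90000 \
 *		avpkt 1000 burst 55 ecn bandwidth 10Mbit probability 0.02
 *
 * i.e. qth_min = 30 KB, qth_max = 90 KB, a 400 KB hard limit, max_P = 0.02
 * and ECN marking enabled (TC_RED_ECN).
 */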