1 /*- 2 * Copyright (C) 1997-2003 3 * Sony Computer Science Laboratories Inc. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 */ 27 /*- 28 * Copyright (c) 1990-1994 Regents of the University of California. 29 * All rights reserved. 30 * 31 * Redistribution and use in source and binary forms, with or without 32 * modification, are permitted provided that the following conditions 33 * are met: 34 * 1. Redistributions of source code must retain the above copyright 35 * notice, this list of conditions and the following disclaimer. 36 * 2. Redistributions in binary form must reproduce the above copyright 37 * notice, this list of conditions and the following disclaimer in the 38 * documentation and/or other materials provided with the distribution. 39 * 3. All advertising materials mentioning features or use of this software 40 * must display the following acknowledgement: 41 * This product includes software developed by the Computer Systems 42 * Engineering Group at Lawrence Berkeley Laboratory. 43 * 4. Neither the name of the University nor of the Laboratory may be used 44 * to endorse or promote products derived from this software without 45 * specific prior written permission. 46 * 47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * SUCH DAMAGE. 58 * 59 * $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $ 60 */ 61 62 #include "opt_altq.h" 63 #include "opt_inet.h" 64 #include "opt_inet6.h" 65 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */ 66 67 #include <sys/param.h> 68 #include <sys/malloc.h> 69 #include <sys/mbuf.h> 70 #include <sys/socket.h> 71 #include <sys/systm.h> 72 #include <sys/errno.h> 73 #if 1 /* ALTQ3_COMPAT */ 74 #include <sys/sockio.h> 75 #include <sys/proc.h> 76 #include <sys/kernel.h> 77 #ifdef ALTQ_FLOWVALVE 78 #include <sys/queue.h> 79 #include <sys/time.h> 80 #endif 81 #endif /* ALTQ3_COMPAT */ 82 83 #include <net/if.h> 84 #include <net/if_var.h> 85 86 #include <netinet/in.h> 87 #include <netinet/in_systm.h> 88 #include <netinet/ip.h> 89 #ifdef INET6 90 #include <netinet/ip6.h> 91 #endif 92 93 #include <netpfil/pf/pf.h> 94 #include <netpfil/pf/pf_altq.h> 95 #include <netpfil/pf/pf_mtag.h> 96 #include <net/altq/altq.h> 97 #include <net/altq/altq_red.h> 98 99 /* 100 * ALTQ/RED (Random Early Detection) implementation using 32-bit 101 * fixed-point calculation. 102 * 103 * written by kjc using the ns code as a reference. 104 * you can learn more about red and ns from Sally's home page at 105 * http://www-nrg.ee.lbl.gov/floyd/ 106 * 107 * most of the red parameter values are fixed in this implementation 108 * to prevent fixed-point overflow/underflow. 109 * if you change the parameters, watch out for overflow/underflow! 110 * 111 * the parameters used are recommended values by Sally. 112 * the corresponding ns config looks: 113 * q_weight=0.00195 114 * minthresh=5 maxthresh=15 queue-size=60 115 * linterm=30 116 * dropmech=drop-tail 117 * bytes=false (can't be handled by 32-bit fixed-point) 118 * doubleq=false dqthresh=false 119 * wait=true 120 */ 121 /* 122 * alternative red parameters for a slow link. 123 * 124 * assume the queue length becomes from zero to L and keeps L, it takes 125 * N packets for q_avg to reach 63% of L. 126 * when q_weight is 0.002, N is about 500 packets. 127 * for a slow link like dial-up, 500 packets takes more than 1 minute! 128 * when q_weight is 0.008, N is about 127 packets. 129 * when q_weight is 0.016, N is about 63 packets. 130 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets 131 * are allowed for 0.016. 132 * see Sally's paper for more details. 133 */ 134 /* normal red parameters */ 135 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ 136 /* q_weight = 0.00195 */ 137 138 /* red parameters for a slow link */ 139 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ 140 /* q_weight = 0.0078125 */ 141 142 /* red parameters for a very slow link (e.g., dialup) */ 143 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ 144 /* q_weight = 0.015625 */ 145 146 /* fixed-point uses 12-bit decimal places */ 147 #define FP_SHIFT 12 /* fixed-point shift */ 148 149 /* red parameters for drop probability */ 150 #define INV_P_MAX 10 /* inverse of max drop probability */ 151 #define TH_MIN 5 /* min threshold */ 152 #define TH_MAX 15 /* max threshold */ 153 154 #define RED_LIMIT 60 /* default max queue length */ 155 #define RED_STATS /* collect statistics */ 156 157 /* 158 * our default policy for forced-drop is drop-tail. 159 * (in altq-1.1.2 or earlier, the default was random-drop. 160 * but it makes more sense to punish the cause of the surge.) 161 * to switch to the random-drop policy, define "RED_RANDOM_DROP". 162 */ 163 164 /* default red parameter values */ 165 static int default_th_min = TH_MIN; 166 static int default_th_max = TH_MAX; 167 static int default_inv_pmax = INV_P_MAX; 168 169 /* 170 * red support routines 171 */ 172 red_t * 173 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags, 174 int pkttime) 175 { 176 red_t *rp; 177 int w, i; 178 int npkts_per_sec; 179 180 rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO); 181 if (rp == NULL) 182 return (NULL); 183 184 if (weight == 0) 185 rp->red_weight = W_WEIGHT; 186 else 187 rp->red_weight = weight; 188 189 /* allocate weight table */ 190 rp->red_wtab = wtab_alloc(rp->red_weight); 191 if (rp->red_wtab == NULL) { 192 free(rp, M_DEVBUF); 193 return (NULL); 194 } 195 196 rp->red_avg = 0; 197 rp->red_idle = 1; 198 199 if (inv_pmax == 0) 200 rp->red_inv_pmax = default_inv_pmax; 201 else 202 rp->red_inv_pmax = inv_pmax; 203 if (th_min == 0) 204 rp->red_thmin = default_th_min; 205 else 206 rp->red_thmin = th_min; 207 if (th_max == 0) 208 rp->red_thmax = default_th_max; 209 else 210 rp->red_thmax = th_max; 211 212 rp->red_flags = flags; 213 214 if (pkttime == 0) 215 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ 216 rp->red_pkttime = 800; 217 else 218 rp->red_pkttime = pkttime; 219 220 if (weight == 0) { 221 /* when the link is very slow, adjust red parameters */ 222 npkts_per_sec = 1000000 / rp->red_pkttime; 223 if (npkts_per_sec < 50) { 224 /* up to about 400Kbps */ 225 rp->red_weight = W_WEIGHT_2; 226 } else if (npkts_per_sec < 300) { 227 /* up to about 2.4Mbps */ 228 rp->red_weight = W_WEIGHT_1; 229 } 230 } 231 232 /* calculate wshift. weight must be power of 2 */ 233 w = rp->red_weight; 234 for (i = 0; w > 1; i++) 235 w = w >> 1; 236 rp->red_wshift = i; 237 w = 1 << rp->red_wshift; 238 if (w != rp->red_weight) { 239 printf("invalid weight value %d for red! use %d\n", 240 rp->red_weight, w); 241 rp->red_weight = w; 242 } 243 244 /* 245 * thmin_s and thmax_s are scaled versions of th_min and th_max 246 * to be compared with avg. 247 */ 248 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); 249 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); 250 251 /* 252 * precompute probability denominator 253 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point 254 */ 255 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) 256 * rp->red_inv_pmax) << FP_SHIFT; 257 258 microtime(&rp->red_last); 259 return (rp); 260 } 261 262 void 263 red_destroy(red_t *rp) 264 { 265 wtab_destroy(rp->red_wtab); 266 free(rp, M_DEVBUF); 267 } 268 269 void 270 red_getstats(red_t *rp, struct redstats *sp) 271 { 272 sp->q_avg = rp->red_avg >> rp->red_wshift; 273 sp->xmit_cnt = rp->red_stats.xmit_cnt; 274 sp->drop_cnt = rp->red_stats.drop_cnt; 275 sp->drop_forced = rp->red_stats.drop_forced; 276 sp->drop_unforced = rp->red_stats.drop_unforced; 277 sp->marked_packets = rp->red_stats.marked_packets; 278 } 279 280 int 281 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, 282 struct altq_pktattr *pktattr) 283 { 284 int avg, droptype; 285 int n; 286 287 avg = rp->red_avg; 288 289 /* 290 * if we were idle, we pretend that n packets arrived during 291 * the idle period. 292 */ 293 if (rp->red_idle) { 294 struct timeval now; 295 int t; 296 297 rp->red_idle = 0; 298 microtime(&now); 299 t = (now.tv_sec - rp->red_last.tv_sec); 300 if (t > 60) { 301 /* 302 * being idle for more than 1 minute, set avg to zero. 303 * this prevents t from overflow. 304 */ 305 avg = 0; 306 } else { 307 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); 308 n = t / rp->red_pkttime - 1; 309 310 /* the following line does (avg = (1 - Wq)^n * avg) */ 311 if (n > 0) 312 avg = (avg >> FP_SHIFT) * 313 pow_w(rp->red_wtab, n); 314 } 315 } 316 317 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ 318 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); 319 rp->red_avg = avg; /* save the new value */ 320 321 /* 322 * red_count keeps a tally of arriving traffic that has not 323 * been dropped. 324 */ 325 rp->red_count++; 326 327 /* see if we drop early */ 328 droptype = DTYPE_NODROP; 329 if (avg >= rp->red_thmin_s && qlen(q) > 1) { 330 if (avg >= rp->red_thmax_s) { 331 /* avg >= th_max: forced drop */ 332 droptype = DTYPE_FORCED; 333 } else if (rp->red_old == 0) { 334 /* first exceeds th_min */ 335 rp->red_count = 1; 336 rp->red_old = 1; 337 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, 338 rp->red_probd, rp->red_count)) { 339 /* mark or drop by red */ 340 if ((rp->red_flags & REDF_ECN) && 341 mark_ecn(m, pktattr, rp->red_flags)) { 342 /* successfully marked. do not drop. */ 343 rp->red_count = 0; 344 #ifdef RED_STATS 345 rp->red_stats.marked_packets++; 346 #endif 347 } else { 348 /* unforced drop by red */ 349 droptype = DTYPE_EARLY; 350 } 351 } 352 } else { 353 /* avg < th_min */ 354 rp->red_old = 0; 355 } 356 357 /* 358 * if the queue length hits the hard limit, it's a forced drop. 359 */ 360 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) 361 droptype = DTYPE_FORCED; 362 363 #ifdef RED_RANDOM_DROP 364 /* if successful or forced drop, enqueue this packet. */ 365 if (droptype != DTYPE_EARLY) 366 _addq(q, m); 367 #else 368 /* if successful, enqueue this packet. */ 369 if (droptype == DTYPE_NODROP) 370 _addq(q, m); 371 #endif 372 if (droptype != DTYPE_NODROP) { 373 if (droptype == DTYPE_EARLY) { 374 /* drop the incoming packet */ 375 #ifdef RED_STATS 376 rp->red_stats.drop_unforced++; 377 #endif 378 } else { 379 /* forced drop, select a victim packet in the queue. */ 380 #ifdef RED_RANDOM_DROP 381 m = _getq_random(q); 382 #endif 383 #ifdef RED_STATS 384 rp->red_stats.drop_forced++; 385 #endif 386 } 387 #ifdef RED_STATS 388 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m)); 389 #endif 390 rp->red_count = 0; 391 m_freem(m); 392 return (-1); 393 } 394 /* successfully queued */ 395 #ifdef RED_STATS 396 PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m)); 397 #endif 398 return (0); 399 } 400 401 /* 402 * early-drop probability is calculated as follows: 403 * prob = p_max * (avg - th_min) / (th_max - th_min) 404 * prob_a = prob / (2 - count*prob) 405 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) 406 * here prob_a increases as successive undrop count increases. 407 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), 408 * becomes 1 when (count >= (2 / prob))). 409 */ 410 int 411 drop_early(int fp_len, int fp_probd, int count) 412 { 413 int d; /* denominator of drop-probability */ 414 415 d = fp_probd - count * fp_len; 416 if (d <= 0) 417 /* count exceeds the hard limit: drop or mark */ 418 return (1); 419 420 /* 421 * now the range of d is [1..600] in fixed-point. (when 422 * th_max-th_min=10 and p_max=1/30) 423 * drop probability = (avg - TH_MIN) / d 424 */ 425 426 if ((arc4random() % d) < fp_len) { 427 /* drop or mark */ 428 return (1); 429 } 430 /* no drop/mark */ 431 return (0); 432 } 433 434 /* 435 * try to mark CE bit to the packet. 436 * returns 1 if successfully marked, 0 otherwise. 437 */ 438 int 439 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags) 440 { 441 struct mbuf *m0; 442 struct pf_mtag *at; 443 void *hdr; 444 445 at = pf_find_mtag(m); 446 if (at != NULL) { 447 hdr = at->hdr; 448 } else 449 return (0); 450 451 /* verify that pattr_hdr is within the mbuf data */ 452 for (m0 = m; m0 != NULL; m0 = m0->m_next) 453 if (((caddr_t)hdr >= m0->m_data) && 454 ((caddr_t)hdr < m0->m_data + m0->m_len)) 455 break; 456 if (m0 == NULL) { 457 /* ick, tag info is stale */ 458 return (0); 459 } 460 461 switch (((struct ip *)hdr)->ip_v) { 462 case IPVERSION: 463 if (flags & REDF_ECN4) { 464 struct ip *ip = hdr; 465 u_int8_t otos; 466 int sum; 467 468 if (ip->ip_v != 4) 469 return (0); /* version mismatch! */ 470 471 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) 472 return (0); /* not-ECT */ 473 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 474 return (1); /* already marked */ 475 476 /* 477 * ecn-capable but not marked, 478 * mark CE and update checksum 479 */ 480 otos = ip->ip_tos; 481 ip->ip_tos |= IPTOS_ECN_CE; 482 /* 483 * update checksum (from RFC1624) 484 * HC' = ~(~HC + ~m + m') 485 */ 486 sum = ~ntohs(ip->ip_sum) & 0xffff; 487 sum += (~otos & 0xffff) + ip->ip_tos; 488 sum = (sum >> 16) + (sum & 0xffff); 489 sum += (sum >> 16); /* add carry */ 490 ip->ip_sum = htons(~sum & 0xffff); 491 return (1); 492 } 493 break; 494 #ifdef INET6 495 case (IPV6_VERSION >> 4): 496 if (flags & REDF_ECN6) { 497 struct ip6_hdr *ip6 = hdr; 498 u_int32_t flowlabel; 499 500 flowlabel = ntohl(ip6->ip6_flow); 501 if ((flowlabel >> 28) != 6) 502 return (0); /* version mismatch! */ 503 if ((flowlabel & (IPTOS_ECN_MASK << 20)) == 504 (IPTOS_ECN_NOTECT << 20)) 505 return (0); /* not-ECT */ 506 if ((flowlabel & (IPTOS_ECN_MASK << 20)) == 507 (IPTOS_ECN_CE << 20)) 508 return (1); /* already marked */ 509 /* 510 * ecn-capable but not marked, mark CE 511 */ 512 flowlabel |= (IPTOS_ECN_CE << 20); 513 ip6->ip6_flow = htonl(flowlabel); 514 return (1); 515 } 516 break; 517 #endif /* INET6 */ 518 } 519 520 /* not marked */ 521 return (0); 522 } 523 524 struct mbuf * 525 red_getq(red_t *rp, class_queue_t *q) 526 { 527 struct mbuf *m; 528 529 if ((m = _getq(q)) == NULL) { 530 if (rp->red_idle == 0) { 531 rp->red_idle = 1; 532 microtime(&rp->red_last); 533 } 534 return NULL; 535 } 536 537 rp->red_idle = 0; 538 return (m); 539 } 540 541 /* 542 * helper routine to calibrate avg during idle. 543 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point 544 * here Wq = 1/weight and the code assumes Wq is close to zero. 545 * 546 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. 547 */ 548 static struct wtab *wtab_list = NULL; /* pointer to wtab list */ 549 550 struct wtab * 551 wtab_alloc(int weight) 552 { 553 struct wtab *w; 554 int i; 555 556 for (w = wtab_list; w != NULL; w = w->w_next) 557 if (w->w_weight == weight) { 558 w->w_refcount++; 559 return (w); 560 } 561 562 w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO); 563 if (w == NULL) 564 return (NULL); 565 w->w_weight = weight; 566 w->w_refcount = 1; 567 w->w_next = wtab_list; 568 wtab_list = w; 569 570 /* initialize the weight table */ 571 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; 572 for (i = 1; i < 32; i++) { 573 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; 574 if (w->w_tab[i] == 0 && w->w_param_max == 0) 575 w->w_param_max = 1 << i; 576 } 577 578 return (w); 579 } 580 581 int 582 wtab_destroy(struct wtab *w) 583 { 584 struct wtab *prev; 585 586 if (--w->w_refcount > 0) 587 return (0); 588 589 if (wtab_list == w) 590 wtab_list = w->w_next; 591 else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next) 592 if (prev->w_next == w) { 593 prev->w_next = w->w_next; 594 break; 595 } 596 597 free(w, M_DEVBUF); 598 return (0); 599 } 600 601 int32_t 602 pow_w(struct wtab *w, int n) 603 { 604 int i, bit; 605 int32_t val; 606 607 if (n >= w->w_param_max) 608 return (0); 609 610 val = 1 << FP_SHIFT; 611 if (n <= 0) 612 return (val); 613 614 bit = 1; 615 i = 0; 616 while (n) { 617 if (n & bit) { 618 val = (val * w->w_tab[i]) >> FP_SHIFT; 619 n &= ~bit; 620 } 621 i++; 622 bit <<= 1; 623 } 624 return (val); 625 } 626 627 #endif /* ALTQ_RED */ 628