1 /*- 2 * Copyright (C) 1997-2003 3 * Sony Computer Science Laboratories Inc. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 */ 27 /*- 28 * Copyright (c) 1990-1994 Regents of the University of California. 29 * All rights reserved. 30 * 31 * Redistribution and use in source and binary forms, with or without 32 * modification, are permitted provided that the following conditions 33 * are met: 34 * 1. Redistributions of source code must retain the above copyright 35 * notice, this list of conditions and the following disclaimer. 36 * 2. Redistributions in binary form must reproduce the above copyright 37 * notice, this list of conditions and the following disclaimer in the 38 * documentation and/or other materials provided with the distribution. 39 * 3. All advertising materials mentioning features or use of this software 40 * must display the following acknowledgement: 41 * This product includes software developed by the Computer Systems 42 * Engineering Group at Lawrence Berkeley Laboratory. 43 * 4. Neither the name of the University nor of the Laboratory may be used 44 * to endorse or promote products derived from this software without 45 * specific prior written permission. 46 * 47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * SUCH DAMAGE. 58 * 59 * $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $ 60 * $FreeBSD$ 61 */ 62 63 #include "opt_altq.h" 64 #include "opt_inet.h" 65 #include "opt_inet6.h" 66 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */ 67 68 #include <sys/param.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/socket.h> 72 #include <sys/systm.h> 73 #include <sys/errno.h> 74 #if 1 /* ALTQ3_COMPAT */ 75 #include <sys/sockio.h> 76 #include <sys/proc.h> 77 #include <sys/kernel.h> 78 #ifdef ALTQ_FLOWVALVE 79 #include <sys/queue.h> 80 #include <sys/time.h> 81 #endif 82 #endif /* ALTQ3_COMPAT */ 83 84 #include <net/if.h> 85 #include <net/if_var.h> 86 87 #include <netinet/in.h> 88 #include <netinet/in_systm.h> 89 #include <netinet/ip.h> 90 #ifdef INET6 91 #include <netinet/ip6.h> 92 #endif 93 94 #include <netpfil/pf/pf.h> 95 #include <netpfil/pf/pf_altq.h> 96 #include <netpfil/pf/pf_mtag.h> 97 #include <net/altq/altq.h> 98 #include <net/altq/altq_red.h> 99 100 /* 101 * ALTQ/RED (Random Early Detection) implementation using 32-bit 102 * fixed-point calculation. 103 * 104 * written by kjc using the ns code as a reference. 105 * you can learn more about red and ns from Sally's home page at 106 * http://www-nrg.ee.lbl.gov/floyd/ 107 * 108 * most of the red parameter values are fixed in this implementation 109 * to prevent fixed-point overflow/underflow. 110 * if you change the parameters, watch out for overflow/underflow! 111 * 112 * the parameters used are recommended values by Sally. 113 * the corresponding ns config looks: 114 * q_weight=0.00195 115 * minthresh=5 maxthresh=15 queue-size=60 116 * linterm=30 117 * dropmech=drop-tail 118 * bytes=false (can't be handled by 32-bit fixed-point) 119 * doubleq=false dqthresh=false 120 * wait=true 121 */ 122 /* 123 * alternative red parameters for a slow link. 124 * 125 * assume the queue length becomes from zero to L and keeps L, it takes 126 * N packets for q_avg to reach 63% of L. 127 * when q_weight is 0.002, N is about 500 packets. 128 * for a slow link like dial-up, 500 packets takes more than 1 minute! 129 * when q_weight is 0.008, N is about 127 packets. 130 * when q_weight is 0.016, N is about 63 packets. 131 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets 132 * are allowed for 0.016. 133 * see Sally's paper for more details. 134 */ 135 /* normal red parameters */ 136 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ 137 /* q_weight = 0.00195 */ 138 139 /* red parameters for a slow link */ 140 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ 141 /* q_weight = 0.0078125 */ 142 143 /* red parameters for a very slow link (e.g., dialup) */ 144 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ 145 /* q_weight = 0.015625 */ 146 147 /* fixed-point uses 12-bit decimal places */ 148 #define FP_SHIFT 12 /* fixed-point shift */ 149 150 /* red parameters for drop probability */ 151 #define INV_P_MAX 10 /* inverse of max drop probability */ 152 #define TH_MIN 5 /* min threshold */ 153 #define TH_MAX 15 /* max threshold */ 154 155 #define RED_LIMIT 60 /* default max queue length */ 156 #define RED_STATS /* collect statistics */ 157 158 /* 159 * our default policy for forced-drop is drop-tail. 160 * (in altq-1.1.2 or earlier, the default was random-drop. 161 * but it makes more sense to punish the cause of the surge.) 162 * to switch to the random-drop policy, define "RED_RANDOM_DROP". 163 */ 164 165 166 /* default red parameter values */ 167 static int default_th_min = TH_MIN; 168 static int default_th_max = TH_MAX; 169 static int default_inv_pmax = INV_P_MAX; 170 171 172 /* 173 * red support routines 174 */ 175 red_t * 176 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags, 177 int pkttime) 178 { 179 red_t *rp; 180 int w, i; 181 int npkts_per_sec; 182 183 rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO); 184 if (rp == NULL) 185 return (NULL); 186 187 if (weight == 0) 188 rp->red_weight = W_WEIGHT; 189 else 190 rp->red_weight = weight; 191 192 /* allocate weight table */ 193 rp->red_wtab = wtab_alloc(rp->red_weight); 194 if (rp->red_wtab == NULL) { 195 free(rp, M_DEVBUF); 196 return (NULL); 197 } 198 199 rp->red_avg = 0; 200 rp->red_idle = 1; 201 202 if (inv_pmax == 0) 203 rp->red_inv_pmax = default_inv_pmax; 204 else 205 rp->red_inv_pmax = inv_pmax; 206 if (th_min == 0) 207 rp->red_thmin = default_th_min; 208 else 209 rp->red_thmin = th_min; 210 if (th_max == 0) 211 rp->red_thmax = default_th_max; 212 else 213 rp->red_thmax = th_max; 214 215 rp->red_flags = flags; 216 217 if (pkttime == 0) 218 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ 219 rp->red_pkttime = 800; 220 else 221 rp->red_pkttime = pkttime; 222 223 if (weight == 0) { 224 /* when the link is very slow, adjust red parameters */ 225 npkts_per_sec = 1000000 / rp->red_pkttime; 226 if (npkts_per_sec < 50) { 227 /* up to about 400Kbps */ 228 rp->red_weight = W_WEIGHT_2; 229 } else if (npkts_per_sec < 300) { 230 /* up to about 2.4Mbps */ 231 rp->red_weight = W_WEIGHT_1; 232 } 233 } 234 235 /* calculate wshift. weight must be power of 2 */ 236 w = rp->red_weight; 237 for (i = 0; w > 1; i++) 238 w = w >> 1; 239 rp->red_wshift = i; 240 w = 1 << rp->red_wshift; 241 if (w != rp->red_weight) { 242 printf("invalid weight value %d for red! use %d\n", 243 rp->red_weight, w); 244 rp->red_weight = w; 245 } 246 247 /* 248 * thmin_s and thmax_s are scaled versions of th_min and th_max 249 * to be compared with avg. 250 */ 251 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); 252 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); 253 254 /* 255 * precompute probability denominator 256 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point 257 */ 258 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) 259 * rp->red_inv_pmax) << FP_SHIFT; 260 261 microtime(&rp->red_last); 262 return (rp); 263 } 264 265 void 266 red_destroy(red_t *rp) 267 { 268 wtab_destroy(rp->red_wtab); 269 free(rp, M_DEVBUF); 270 } 271 272 void 273 red_getstats(red_t *rp, struct redstats *sp) 274 { 275 sp->q_avg = rp->red_avg >> rp->red_wshift; 276 sp->xmit_cnt = rp->red_stats.xmit_cnt; 277 sp->drop_cnt = rp->red_stats.drop_cnt; 278 sp->drop_forced = rp->red_stats.drop_forced; 279 sp->drop_unforced = rp->red_stats.drop_unforced; 280 sp->marked_packets = rp->red_stats.marked_packets; 281 } 282 283 int 284 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, 285 struct altq_pktattr *pktattr) 286 { 287 int avg, droptype; 288 int n; 289 290 avg = rp->red_avg; 291 292 /* 293 * if we were idle, we pretend that n packets arrived during 294 * the idle period. 295 */ 296 if (rp->red_idle) { 297 struct timeval now; 298 int t; 299 300 rp->red_idle = 0; 301 microtime(&now); 302 t = (now.tv_sec - rp->red_last.tv_sec); 303 if (t > 60) { 304 /* 305 * being idle for more than 1 minute, set avg to zero. 306 * this prevents t from overflow. 307 */ 308 avg = 0; 309 } else { 310 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); 311 n = t / rp->red_pkttime - 1; 312 313 /* the following line does (avg = (1 - Wq)^n * avg) */ 314 if (n > 0) 315 avg = (avg >> FP_SHIFT) * 316 pow_w(rp->red_wtab, n); 317 } 318 } 319 320 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ 321 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); 322 rp->red_avg = avg; /* save the new value */ 323 324 /* 325 * red_count keeps a tally of arriving traffic that has not 326 * been dropped. 327 */ 328 rp->red_count++; 329 330 /* see if we drop early */ 331 droptype = DTYPE_NODROP; 332 if (avg >= rp->red_thmin_s && qlen(q) > 1) { 333 if (avg >= rp->red_thmax_s) { 334 /* avg >= th_max: forced drop */ 335 droptype = DTYPE_FORCED; 336 } else if (rp->red_old == 0) { 337 /* first exceeds th_min */ 338 rp->red_count = 1; 339 rp->red_old = 1; 340 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, 341 rp->red_probd, rp->red_count)) { 342 /* mark or drop by red */ 343 if ((rp->red_flags & REDF_ECN) && 344 mark_ecn(m, pktattr, rp->red_flags)) { 345 /* successfully marked. do not drop. */ 346 rp->red_count = 0; 347 #ifdef RED_STATS 348 rp->red_stats.marked_packets++; 349 #endif 350 } else { 351 /* unforced drop by red */ 352 droptype = DTYPE_EARLY; 353 } 354 } 355 } else { 356 /* avg < th_min */ 357 rp->red_old = 0; 358 } 359 360 /* 361 * if the queue length hits the hard limit, it's a forced drop. 362 */ 363 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) 364 droptype = DTYPE_FORCED; 365 366 #ifdef RED_RANDOM_DROP 367 /* if successful or forced drop, enqueue this packet. */ 368 if (droptype != DTYPE_EARLY) 369 _addq(q, m); 370 #else 371 /* if successful, enqueue this packet. */ 372 if (droptype == DTYPE_NODROP) 373 _addq(q, m); 374 #endif 375 if (droptype != DTYPE_NODROP) { 376 if (droptype == DTYPE_EARLY) { 377 /* drop the incoming packet */ 378 #ifdef RED_STATS 379 rp->red_stats.drop_unforced++; 380 #endif 381 } else { 382 /* forced drop, select a victim packet in the queue. */ 383 #ifdef RED_RANDOM_DROP 384 m = _getq_random(q); 385 #endif 386 #ifdef RED_STATS 387 rp->red_stats.drop_forced++; 388 #endif 389 } 390 #ifdef RED_STATS 391 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m)); 392 #endif 393 rp->red_count = 0; 394 m_freem(m); 395 return (-1); 396 } 397 /* successfully queued */ 398 #ifdef RED_STATS 399 PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m)); 400 #endif 401 return (0); 402 } 403 404 /* 405 * early-drop probability is calculated as follows: 406 * prob = p_max * (avg - th_min) / (th_max - th_min) 407 * prob_a = prob / (2 - count*prob) 408 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) 409 * here prob_a increases as successive undrop count increases. 410 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), 411 * becomes 1 when (count >= (2 / prob))). 412 */ 413 int 414 drop_early(int fp_len, int fp_probd, int count) 415 { 416 int d; /* denominator of drop-probability */ 417 418 d = fp_probd - count * fp_len; 419 if (d <= 0) 420 /* count exceeds the hard limit: drop or mark */ 421 return (1); 422 423 /* 424 * now the range of d is [1..600] in fixed-point. (when 425 * th_max-th_min=10 and p_max=1/30) 426 * drop probability = (avg - TH_MIN) / d 427 */ 428 429 if ((arc4random() % d) < fp_len) { 430 /* drop or mark */ 431 return (1); 432 } 433 /* no drop/mark */ 434 return (0); 435 } 436 437 /* 438 * try to mark CE bit to the packet. 439 * returns 1 if successfully marked, 0 otherwise. 440 */ 441 int 442 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags) 443 { 444 struct mbuf *m0; 445 struct pf_mtag *at; 446 void *hdr; 447 448 at = pf_find_mtag(m); 449 if (at != NULL) { 450 hdr = at->hdr; 451 } else 452 return (0); 453 454 /* verify that pattr_hdr is within the mbuf data */ 455 for (m0 = m; m0 != NULL; m0 = m0->m_next) 456 if (((caddr_t)hdr >= m0->m_data) && 457 ((caddr_t)hdr < m0->m_data + m0->m_len)) 458 break; 459 if (m0 == NULL) { 460 /* ick, tag info is stale */ 461 return (0); 462 } 463 464 switch (((struct ip *)hdr)->ip_v) { 465 case IPVERSION: 466 if (flags & REDF_ECN4) { 467 struct ip *ip = hdr; 468 u_int8_t otos; 469 int sum; 470 471 if (ip->ip_v != 4) 472 return (0); /* version mismatch! */ 473 474 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) 475 return (0); /* not-ECT */ 476 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 477 return (1); /* already marked */ 478 479 /* 480 * ecn-capable but not marked, 481 * mark CE and update checksum 482 */ 483 otos = ip->ip_tos; 484 ip->ip_tos |= IPTOS_ECN_CE; 485 /* 486 * update checksum (from RFC1624) 487 * HC' = ~(~HC + ~m + m') 488 */ 489 sum = ~ntohs(ip->ip_sum) & 0xffff; 490 sum += (~otos & 0xffff) + ip->ip_tos; 491 sum = (sum >> 16) + (sum & 0xffff); 492 sum += (sum >> 16); /* add carry */ 493 ip->ip_sum = htons(~sum & 0xffff); 494 return (1); 495 } 496 break; 497 #ifdef INET6 498 case (IPV6_VERSION >> 4): 499 if (flags & REDF_ECN6) { 500 struct ip6_hdr *ip6 = hdr; 501 u_int32_t flowlabel; 502 503 flowlabel = ntohl(ip6->ip6_flow); 504 if ((flowlabel >> 28) != 6) 505 return (0); /* version mismatch! */ 506 if ((flowlabel & (IPTOS_ECN_MASK << 20)) == 507 (IPTOS_ECN_NOTECT << 20)) 508 return (0); /* not-ECT */ 509 if ((flowlabel & (IPTOS_ECN_MASK << 20)) == 510 (IPTOS_ECN_CE << 20)) 511 return (1); /* already marked */ 512 /* 513 * ecn-capable but not marked, mark CE 514 */ 515 flowlabel |= (IPTOS_ECN_CE << 20); 516 ip6->ip6_flow = htonl(flowlabel); 517 return (1); 518 } 519 break; 520 #endif /* INET6 */ 521 } 522 523 /* not marked */ 524 return (0); 525 } 526 527 struct mbuf * 528 red_getq(rp, q) 529 red_t *rp; 530 class_queue_t *q; 531 { 532 struct mbuf *m; 533 534 if ((m = _getq(q)) == NULL) { 535 if (rp->red_idle == 0) { 536 rp->red_idle = 1; 537 microtime(&rp->red_last); 538 } 539 return NULL; 540 } 541 542 rp->red_idle = 0; 543 return (m); 544 } 545 546 /* 547 * helper routine to calibrate avg during idle. 548 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point 549 * here Wq = 1/weight and the code assumes Wq is close to zero. 550 * 551 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. 552 */ 553 static struct wtab *wtab_list = NULL; /* pointer to wtab list */ 554 555 struct wtab * 556 wtab_alloc(int weight) 557 { 558 struct wtab *w; 559 int i; 560 561 for (w = wtab_list; w != NULL; w = w->w_next) 562 if (w->w_weight == weight) { 563 w->w_refcount++; 564 return (w); 565 } 566 567 w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO); 568 if (w == NULL) 569 return (NULL); 570 w->w_weight = weight; 571 w->w_refcount = 1; 572 w->w_next = wtab_list; 573 wtab_list = w; 574 575 /* initialize the weight table */ 576 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; 577 for (i = 1; i < 32; i++) { 578 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; 579 if (w->w_tab[i] == 0 && w->w_param_max == 0) 580 w->w_param_max = 1 << i; 581 } 582 583 return (w); 584 } 585 586 int 587 wtab_destroy(struct wtab *w) 588 { 589 struct wtab *prev; 590 591 if (--w->w_refcount > 0) 592 return (0); 593 594 if (wtab_list == w) 595 wtab_list = w->w_next; 596 else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next) 597 if (prev->w_next == w) { 598 prev->w_next = w->w_next; 599 break; 600 } 601 602 free(w, M_DEVBUF); 603 return (0); 604 } 605 606 int32_t 607 pow_w(struct wtab *w, int n) 608 { 609 int i, bit; 610 int32_t val; 611 612 if (n >= w->w_param_max) 613 return (0); 614 615 val = 1 << FP_SHIFT; 616 if (n <= 0) 617 return (val); 618 619 bit = 1; 620 i = 0; 621 while (n) { 622 if (n & bit) { 623 val = (val * w->w_tab[i]) >> FP_SHIFT; 624 n &= ~bit; 625 } 626 i++; 627 bit <<= 1; 628 } 629 return (val); 630 } 631 632 633 #endif /* ALTQ_RED */ 634