1 /*- 2 * Copyright (C) 1997-2003 3 * Sony Computer Science Laboratories Inc. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 * 26 */ 27 /*- 28 * Copyright (c) 1990-1994 Regents of the University of California. 29 * All rights reserved. 30 * 31 * Redistribution and use in source and binary forms, with or without 32 * modification, are permitted provided that the following conditions 33 * are met: 34 * 1. Redistributions of source code must retain the above copyright 35 * notice, this list of conditions and the following disclaimer. 36 * 2. Redistributions in binary form must reproduce the above copyright 37 * notice, this list of conditions and the following disclaimer in the 38 * documentation and/or other materials provided with the distribution. 39 * 3. All advertising materials mentioning features or use of this software 40 * must display the following acknowledgement: 41 * This product includes software developed by the Computer Systems 42 * Engineering Group at Lawrence Berkeley Laboratory. 43 * 4. Neither the name of the University nor of the Laboratory may be used 44 * to endorse or promote products derived from this software without 45 * specific prior written permission. 46 * 47 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 48 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 49 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 50 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 51 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 52 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 53 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 54 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 55 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 56 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 57 * SUCH DAMAGE. 58 * 59 * $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $ 60 * $FreeBSD$ 61 */ 62 63 #include "opt_altq.h" 64 #include "opt_inet.h" 65 #include "opt_inet6.h" 66 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */ 67 68 #include <sys/param.h> 69 #include <sys/malloc.h> 70 #include <sys/mbuf.h> 71 #include <sys/socket.h> 72 #include <sys/systm.h> 73 #include <sys/errno.h> 74 #if 1 /* ALTQ3_COMPAT */ 75 #include <sys/sockio.h> 76 #include <sys/proc.h> 77 #include <sys/kernel.h> 78 #ifdef ALTQ_FLOWVALVE 79 #include <sys/queue.h> 80 #include <sys/time.h> 81 #endif 82 #endif /* ALTQ3_COMPAT */ 83 84 #include <net/if.h> 85 #include <net/if_var.h> 86 87 #include <netinet/in.h> 88 #include <netinet/in_systm.h> 89 #include <netinet/ip.h> 90 #ifdef INET6 91 #include <netinet/ip6.h> 92 #endif 93 94 #include <netpfil/pf/pf.h> 95 #include <netpfil/pf/pf_altq.h> 96 #include <netpfil/pf/pf_mtag.h> 97 #include <net/altq/altq.h> 98 #include <net/altq/altq_red.h> 99 100 /* 101 * ALTQ/RED (Random Early Detection) implementation using 32-bit 102 * fixed-point calculation. 103 * 104 * written by kjc using the ns code as a reference. 105 * you can learn more about red and ns from Sally's home page at 106 * http://www-nrg.ee.lbl.gov/floyd/ 107 * 108 * most of the red parameter values are fixed in this implementation 109 * to prevent fixed-point overflow/underflow. 110 * if you change the parameters, watch out for overflow/underflow! 111 * 112 * the parameters used are recommended values by Sally. 113 * the corresponding ns config looks: 114 * q_weight=0.00195 115 * minthresh=5 maxthresh=15 queue-size=60 116 * linterm=30 117 * dropmech=drop-tail 118 * bytes=false (can't be handled by 32-bit fixed-point) 119 * doubleq=false dqthresh=false 120 * wait=true 121 */ 122 /* 123 * alternative red parameters for a slow link. 124 * 125 * assume the queue length becomes from zero to L and keeps L, it takes 126 * N packets for q_avg to reach 63% of L. 127 * when q_weight is 0.002, N is about 500 packets. 128 * for a slow link like dial-up, 500 packets takes more than 1 minute! 129 * when q_weight is 0.008, N is about 127 packets. 130 * when q_weight is 0.016, N is about 63 packets. 131 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets 132 * are allowed for 0.016. 133 * see Sally's paper for more details. 134 */ 135 /* normal red parameters */ 136 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ 137 /* q_weight = 0.00195 */ 138 139 /* red parameters for a slow link */ 140 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ 141 /* q_weight = 0.0078125 */ 142 143 /* red parameters for a very slow link (e.g., dialup) */ 144 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ 145 /* q_weight = 0.015625 */ 146 147 /* fixed-point uses 12-bit decimal places */ 148 #define FP_SHIFT 12 /* fixed-point shift */ 149 150 /* red parameters for drop probability */ 151 #define INV_P_MAX 10 /* inverse of max drop probability */ 152 #define TH_MIN 5 /* min threshold */ 153 #define TH_MAX 15 /* max threshold */ 154 155 #define RED_LIMIT 60 /* default max queue length */ 156 #define RED_STATS /* collect statistics */ 157 158 /* 159 * our default policy for forced-drop is drop-tail. 160 * (in altq-1.1.2 or earlier, the default was random-drop. 161 * but it makes more sense to punish the cause of the surge.) 162 * to switch to the random-drop policy, define "RED_RANDOM_DROP". 163 */ 164 165 /* default red parameter values */ 166 static int default_th_min = TH_MIN; 167 static int default_th_max = TH_MAX; 168 static int default_inv_pmax = INV_P_MAX; 169 170 /* 171 * red support routines 172 */ 173 red_t * 174 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags, 175 int pkttime) 176 { 177 red_t *rp; 178 int w, i; 179 int npkts_per_sec; 180 181 rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO); 182 if (rp == NULL) 183 return (NULL); 184 185 if (weight == 0) 186 rp->red_weight = W_WEIGHT; 187 else 188 rp->red_weight = weight; 189 190 /* allocate weight table */ 191 rp->red_wtab = wtab_alloc(rp->red_weight); 192 if (rp->red_wtab == NULL) { 193 free(rp, M_DEVBUF); 194 return (NULL); 195 } 196 197 rp->red_avg = 0; 198 rp->red_idle = 1; 199 200 if (inv_pmax == 0) 201 rp->red_inv_pmax = default_inv_pmax; 202 else 203 rp->red_inv_pmax = inv_pmax; 204 if (th_min == 0) 205 rp->red_thmin = default_th_min; 206 else 207 rp->red_thmin = th_min; 208 if (th_max == 0) 209 rp->red_thmax = default_th_max; 210 else 211 rp->red_thmax = th_max; 212 213 rp->red_flags = flags; 214 215 if (pkttime == 0) 216 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ 217 rp->red_pkttime = 800; 218 else 219 rp->red_pkttime = pkttime; 220 221 if (weight == 0) { 222 /* when the link is very slow, adjust red parameters */ 223 npkts_per_sec = 1000000 / rp->red_pkttime; 224 if (npkts_per_sec < 50) { 225 /* up to about 400Kbps */ 226 rp->red_weight = W_WEIGHT_2; 227 } else if (npkts_per_sec < 300) { 228 /* up to about 2.4Mbps */ 229 rp->red_weight = W_WEIGHT_1; 230 } 231 } 232 233 /* calculate wshift. weight must be power of 2 */ 234 w = rp->red_weight; 235 for (i = 0; w > 1; i++) 236 w = w >> 1; 237 rp->red_wshift = i; 238 w = 1 << rp->red_wshift; 239 if (w != rp->red_weight) { 240 printf("invalid weight value %d for red! use %d\n", 241 rp->red_weight, w); 242 rp->red_weight = w; 243 } 244 245 /* 246 * thmin_s and thmax_s are scaled versions of th_min and th_max 247 * to be compared with avg. 248 */ 249 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); 250 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); 251 252 /* 253 * precompute probability denominator 254 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point 255 */ 256 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) 257 * rp->red_inv_pmax) << FP_SHIFT; 258 259 microtime(&rp->red_last); 260 return (rp); 261 } 262 263 void 264 red_destroy(red_t *rp) 265 { 266 wtab_destroy(rp->red_wtab); 267 free(rp, M_DEVBUF); 268 } 269 270 void 271 red_getstats(red_t *rp, struct redstats *sp) 272 { 273 sp->q_avg = rp->red_avg >> rp->red_wshift; 274 sp->xmit_cnt = rp->red_stats.xmit_cnt; 275 sp->drop_cnt = rp->red_stats.drop_cnt; 276 sp->drop_forced = rp->red_stats.drop_forced; 277 sp->drop_unforced = rp->red_stats.drop_unforced; 278 sp->marked_packets = rp->red_stats.marked_packets; 279 } 280 281 int 282 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, 283 struct altq_pktattr *pktattr) 284 { 285 int avg, droptype; 286 int n; 287 288 avg = rp->red_avg; 289 290 /* 291 * if we were idle, we pretend that n packets arrived during 292 * the idle period. 293 */ 294 if (rp->red_idle) { 295 struct timeval now; 296 int t; 297 298 rp->red_idle = 0; 299 microtime(&now); 300 t = (now.tv_sec - rp->red_last.tv_sec); 301 if (t > 60) { 302 /* 303 * being idle for more than 1 minute, set avg to zero. 304 * this prevents t from overflow. 305 */ 306 avg = 0; 307 } else { 308 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); 309 n = t / rp->red_pkttime - 1; 310 311 /* the following line does (avg = (1 - Wq)^n * avg) */ 312 if (n > 0) 313 avg = (avg >> FP_SHIFT) * 314 pow_w(rp->red_wtab, n); 315 } 316 } 317 318 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ 319 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); 320 rp->red_avg = avg; /* save the new value */ 321 322 /* 323 * red_count keeps a tally of arriving traffic that has not 324 * been dropped. 325 */ 326 rp->red_count++; 327 328 /* see if we drop early */ 329 droptype = DTYPE_NODROP; 330 if (avg >= rp->red_thmin_s && qlen(q) > 1) { 331 if (avg >= rp->red_thmax_s) { 332 /* avg >= th_max: forced drop */ 333 droptype = DTYPE_FORCED; 334 } else if (rp->red_old == 0) { 335 /* first exceeds th_min */ 336 rp->red_count = 1; 337 rp->red_old = 1; 338 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, 339 rp->red_probd, rp->red_count)) { 340 /* mark or drop by red */ 341 if ((rp->red_flags & REDF_ECN) && 342 mark_ecn(m, pktattr, rp->red_flags)) { 343 /* successfully marked. do not drop. */ 344 rp->red_count = 0; 345 #ifdef RED_STATS 346 rp->red_stats.marked_packets++; 347 #endif 348 } else { 349 /* unforced drop by red */ 350 droptype = DTYPE_EARLY; 351 } 352 } 353 } else { 354 /* avg < th_min */ 355 rp->red_old = 0; 356 } 357 358 /* 359 * if the queue length hits the hard limit, it's a forced drop. 360 */ 361 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) 362 droptype = DTYPE_FORCED; 363 364 #ifdef RED_RANDOM_DROP 365 /* if successful or forced drop, enqueue this packet. */ 366 if (droptype != DTYPE_EARLY) 367 _addq(q, m); 368 #else 369 /* if successful, enqueue this packet. */ 370 if (droptype == DTYPE_NODROP) 371 _addq(q, m); 372 #endif 373 if (droptype != DTYPE_NODROP) { 374 if (droptype == DTYPE_EARLY) { 375 /* drop the incoming packet */ 376 #ifdef RED_STATS 377 rp->red_stats.drop_unforced++; 378 #endif 379 } else { 380 /* forced drop, select a victim packet in the queue. */ 381 #ifdef RED_RANDOM_DROP 382 m = _getq_random(q); 383 #endif 384 #ifdef RED_STATS 385 rp->red_stats.drop_forced++; 386 #endif 387 } 388 #ifdef RED_STATS 389 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m)); 390 #endif 391 rp->red_count = 0; 392 m_freem(m); 393 return (-1); 394 } 395 /* successfully queued */ 396 #ifdef RED_STATS 397 PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m)); 398 #endif 399 return (0); 400 } 401 402 /* 403 * early-drop probability is calculated as follows: 404 * prob = p_max * (avg - th_min) / (th_max - th_min) 405 * prob_a = prob / (2 - count*prob) 406 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) 407 * here prob_a increases as successive undrop count increases. 408 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), 409 * becomes 1 when (count >= (2 / prob))). 410 */ 411 int 412 drop_early(int fp_len, int fp_probd, int count) 413 { 414 int d; /* denominator of drop-probability */ 415 416 d = fp_probd - count * fp_len; 417 if (d <= 0) 418 /* count exceeds the hard limit: drop or mark */ 419 return (1); 420 421 /* 422 * now the range of d is [1..600] in fixed-point. (when 423 * th_max-th_min=10 and p_max=1/30) 424 * drop probability = (avg - TH_MIN) / d 425 */ 426 427 if ((arc4random() % d) < fp_len) { 428 /* drop or mark */ 429 return (1); 430 } 431 /* no drop/mark */ 432 return (0); 433 } 434 435 /* 436 * try to mark CE bit to the packet. 437 * returns 1 if successfully marked, 0 otherwise. 438 */ 439 int 440 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags) 441 { 442 struct mbuf *m0; 443 struct pf_mtag *at; 444 void *hdr; 445 446 at = pf_find_mtag(m); 447 if (at != NULL) { 448 hdr = at->hdr; 449 } else 450 return (0); 451 452 /* verify that pattr_hdr is within the mbuf data */ 453 for (m0 = m; m0 != NULL; m0 = m0->m_next) 454 if (((caddr_t)hdr >= m0->m_data) && 455 ((caddr_t)hdr < m0->m_data + m0->m_len)) 456 break; 457 if (m0 == NULL) { 458 /* ick, tag info is stale */ 459 return (0); 460 } 461 462 switch (((struct ip *)hdr)->ip_v) { 463 case IPVERSION: 464 if (flags & REDF_ECN4) { 465 struct ip *ip = hdr; 466 u_int8_t otos; 467 int sum; 468 469 if (ip->ip_v != 4) 470 return (0); /* version mismatch! */ 471 472 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) 473 return (0); /* not-ECT */ 474 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 475 return (1); /* already marked */ 476 477 /* 478 * ecn-capable but not marked, 479 * mark CE and update checksum 480 */ 481 otos = ip->ip_tos; 482 ip->ip_tos |= IPTOS_ECN_CE; 483 /* 484 * update checksum (from RFC1624) 485 * HC' = ~(~HC + ~m + m') 486 */ 487 sum = ~ntohs(ip->ip_sum) & 0xffff; 488 sum += (~otos & 0xffff) + ip->ip_tos; 489 sum = (sum >> 16) + (sum & 0xffff); 490 sum += (sum >> 16); /* add carry */ 491 ip->ip_sum = htons(~sum & 0xffff); 492 return (1); 493 } 494 break; 495 #ifdef INET6 496 case (IPV6_VERSION >> 4): 497 if (flags & REDF_ECN6) { 498 struct ip6_hdr *ip6 = hdr; 499 u_int32_t flowlabel; 500 501 flowlabel = ntohl(ip6->ip6_flow); 502 if ((flowlabel >> 28) != 6) 503 return (0); /* version mismatch! */ 504 if ((flowlabel & (IPTOS_ECN_MASK << 20)) == 505 (IPTOS_ECN_NOTECT << 20)) 506 return (0); /* not-ECT */ 507 if ((flowlabel & (IPTOS_ECN_MASK << 20)) == 508 (IPTOS_ECN_CE << 20)) 509 return (1); /* already marked */ 510 /* 511 * ecn-capable but not marked, mark CE 512 */ 513 flowlabel |= (IPTOS_ECN_CE << 20); 514 ip6->ip6_flow = htonl(flowlabel); 515 return (1); 516 } 517 break; 518 #endif /* INET6 */ 519 } 520 521 /* not marked */ 522 return (0); 523 } 524 525 struct mbuf * 526 red_getq(rp, q) 527 red_t *rp; 528 class_queue_t *q; 529 { 530 struct mbuf *m; 531 532 if ((m = _getq(q)) == NULL) { 533 if (rp->red_idle == 0) { 534 rp->red_idle = 1; 535 microtime(&rp->red_last); 536 } 537 return NULL; 538 } 539 540 rp->red_idle = 0; 541 return (m); 542 } 543 544 /* 545 * helper routine to calibrate avg during idle. 546 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point 547 * here Wq = 1/weight and the code assumes Wq is close to zero. 548 * 549 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. 550 */ 551 static struct wtab *wtab_list = NULL; /* pointer to wtab list */ 552 553 struct wtab * 554 wtab_alloc(int weight) 555 { 556 struct wtab *w; 557 int i; 558 559 for (w = wtab_list; w != NULL; w = w->w_next) 560 if (w->w_weight == weight) { 561 w->w_refcount++; 562 return (w); 563 } 564 565 w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO); 566 if (w == NULL) 567 return (NULL); 568 w->w_weight = weight; 569 w->w_refcount = 1; 570 w->w_next = wtab_list; 571 wtab_list = w; 572 573 /* initialize the weight table */ 574 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; 575 for (i = 1; i < 32; i++) { 576 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; 577 if (w->w_tab[i] == 0 && w->w_param_max == 0) 578 w->w_param_max = 1 << i; 579 } 580 581 return (w); 582 } 583 584 int 585 wtab_destroy(struct wtab *w) 586 { 587 struct wtab *prev; 588 589 if (--w->w_refcount > 0) 590 return (0); 591 592 if (wtab_list == w) 593 wtab_list = w->w_next; 594 else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next) 595 if (prev->w_next == w) { 596 prev->w_next = w->w_next; 597 break; 598 } 599 600 free(w, M_DEVBUF); 601 return (0); 602 } 603 604 int32_t 605 pow_w(struct wtab *w, int n) 606 { 607 int i, bit; 608 int32_t val; 609 610 if (n >= w->w_param_max) 611 return (0); 612 613 val = 1 << FP_SHIFT; 614 if (n <= 0) 615 return (val); 616 617 bit = 1; 618 i = 0; 619 while (n) { 620 if (n & bit) { 621 val = (val * w->w_tab[i]) >> FP_SHIFT; 622 n &= ~bit; 623 } 624 i++; 625 bit <<= 1; 626 } 627 return (val); 628 } 629 630 #endif /* ALTQ_RED */ 631