/*-
 * Copyright (c) 1991-1997 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the Network Research
 *	Group at Lawrence Berkeley Laboratory.
 * 4. Neither the name of the University nor of the Laboratory may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * LBL code modified by speer@eng.sun.com, May 1997.
 * For questions and/or comments, please send mail to cbq@ee.lbl.gov
 *
 * @(#)rm_class.c  1.48  97/12/05 SMI
 * $KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $
 */
#include "opt_altq.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>

#include <net/altq/if_altq.h>
#include <net/altq/altq.h>
#include <net/altq/altq_codel.h>
#include <net/altq/altq_rmclass.h>
#include <net/altq/altq_rmclass_debug.h>
#include <net/altq/altq_red.h>
#include <net/altq/altq_rio.h>

/*
 * Local Macros
 */
#define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }
/*
 * Local routines.
 */

static int	rmc_satisfied(struct rm_class *, struct timeval *);
static void	rmc_wrr_set_weights(struct rm_ifdat *);
static void	rmc_depth_compute(struct rm_class *);
static void	rmc_depth_recompute(rm_class_t *);

static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);

static int	_rmc_addq(rm_class_t *, mbuf_t *);
static void	_rmc_dropq(rm_class_t *);
static mbuf_t	*_rmc_getq(rm_class_t *);
static mbuf_t	*_rmc_pollq(rm_class_t *);

static int	rmc_under_limit(struct rm_class *, struct timeval *);
static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
static void	rmc_drop_action(struct rm_class *);
static void	rmc_restart(void *);
static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);

#define	BORROW_OFFTIME
/*
 * BORROW_OFFTIME (experimental):
 * borrow the offtime of the class borrowing from.
 * the reason is that when its own offtime is set, the class is unable
 * to borrow much, especially when cutoff is taking effect.
 * but when the borrowed class is overloaded (avgidle is close to minidle),
 * use the borrowing class's offtime to avoid overload.
 */
#define	ADJUST_CUTOFF
/*
 * ADJUST_CUTOFF (experimental):
 * if no underlimit class is found due to cutoff, increase cutoff and
 * retry the scheduling loop.
 * also, don't invoke delay_actions while cutoff is taking effect,
 * since a sleeping class won't have a chance to be scheduled in the
 * next loop.
 *
 * now the heuristics for setting the top-level variable (cutoff_) become:
 *	1. if a packet arrives for a not-overlimit class, set cutoff
 *	   to the depth of the class.
 *	2. if cutoff is i, and a packet arrives for an overlimit class
 *	   with an underlimit ancestor at a lower level than i (say j),
 *	   then set cutoff to j.
 *	3. at scheduling a packet, if there is no underlimit class
 *	   due to the current cutoff level, increase cutoff by 1 and
 *	   then try to schedule again.
 */
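/*
 * Illustrative walk-through of the cutoff heuristics above (a sketch
 * with invented topology, not taken from the code): consider a
 * three-level tree with the root at depth 2, an interior class at
 * depth 1 and leaves at depth 0.  A packet arriving for an underlimit
 * leaf sets cutoff_ to 0 (rule 1), so rmc_under_limit() will refuse
 * any borrowing (an ancestor's depth is always > 0) and the leaf's
 * traffic is protected.  Once no underlimit class remains at cutoff
 * 0, the scheduler bumps cutoff_ to 1 (rule 3), which lets an
 * overlimit leaf borrow from its depth-1 parent, and so on up to the
 * root.
 */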
/*
 * rm_class_t *
 * rmc_newclass(...) - Create a new resource management class at priority
 * 'pri' on the interface given by 'ifd'.
 *
 * nsecPerByte	is the data rate of the interface in nanoseconds/byte.
 *		E.g., 800 for a 10Mb/s ethernet.  If the class gets less
 *		than 100% of the bandwidth, this number should be the
 *		'effective' rate for the class.  Let f be the
 *		bandwidth fraction allocated to this class, and let
 *		nsPerByte be the data rate of the output link in
 *		nanoseconds/byte.  Then nsecPerByte is set to
 *		nsPerByte / f.  E.g., 1600 (= 800 / .5)
 *		for a class that gets 50% of an ethernet's bandwidth.
 *
 * action	the routine to call when the class is over limit.
 *
 * maxq		max allowable queue size for class (in packets).
 *
 * parent	parent class pointer.
 *
 * borrow	class to borrow from (should be either 'parent' or null).
 *
 * maxidle	max value allowed for class 'idle' time estimate (this
 *		parameter determines how large an initial burst of packets
 *		can be before overlimit action is invoked).
 *
 * offtime	how long 'delay' action will delay when class goes over
 *		limit (this parameter determines the steady-state burst
 *		size when a class is running over its limit).
 *
 * Maxidle and offtime have to be computed from the following:  If the
 * average packet size is s, the bandwidth fraction allocated to this
 * class is f, we want to allow b packet bursts, and the gain of the
 * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
 *
 *	ptime = s * nsPerByte * (1 - f) / f
 *	maxidle = ptime * (1 - g^b) / g^b
 *	minidle = -ptime * (1 / (f - 1))
 *	offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
 *
 * Operationally, it's convenient to specify maxidle & offtime in units
 * independent of the link bandwidth so the maxidle & offtime passed to
 * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
 * (The constant factor is a scale factor needed to make the parameters
 * integers.  This scaling also means that the 'unscaled' values of
 * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
 * not nanoseconds.)  Also note that the 'idle' filter computation keeps
 * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
 * maxidle also must be scaled upward by this value.  Thus, the passed
 * values for maxidle and offtime can be computed as follows:
 *
 * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
 * offtime = offtime * 8 / (1000 * nsecPerByte)
 *
 * When USE_HRTIME is employed, then maxidle and offtime become:
 *	maxidle = maxidle * (8.0 / nsecPerByte);
 *	offtime = offtime * (8.0 / nsecPerByte);
 */
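/*
 * Worked example of the scaling above (illustrative numbers only):
 * take nsecPerByte = 1600 (a 50% share of 10Mb/s) and suppose the
 * desired unscaled maxidle is 2ms = 2,000,000 ns.  With
 * RM_FILTER_GAIN = 5 (2^5 = 32), the value passed in is
 *
 *	maxidle = 2000000 * 32 * 8 / (1000 * 1600) = 320
 *
 * and rmc_newclass() below stores
 *
 *	cl->maxidle_ = 320 * 1600 / 8 = 64000
 *
 * which is exactly 2000us scaled up by 2^RM_FILTER_GAIN, matching the
 * scaled-idle units used by the estimator in rmc_update_class_util().
 */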
struct rm_class *
rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
    void (*action)(rm_class_t *, rm_class_t *), int maxq,
    struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
    int minidle, u_int offtime, int pktsize, int flags)
{
	struct rm_class	*cl;
	struct rm_class	*peer;
	int		 s;

	if (pri >= RM_MAXPRIO)
		return (NULL);
#ifndef ALTQ_RED
	if (flags & RMCF_RED) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: RED not configured for CBQ!\n");
#endif
		return (NULL);
	}
#endif
#ifndef ALTQ_RIO
	if (flags & RMCF_RIO) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: RIO not configured for CBQ!\n");
#endif
		return (NULL);
	}
#endif
#ifndef ALTQ_CODEL
	if (flags & RMCF_CODEL) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: CODEL not configured for CBQ!\n");
#endif
		return (NULL);
	}
#endif

	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl == NULL)
		return (NULL);
	CALLOUT_INIT(&cl->callout_);
	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl->q_ == NULL) {
		free(cl, M_DEVBUF);
		return (NULL);
	}

	/*
	 * Class initialization.
	 */
	cl->children_ = NULL;
	cl->parent_ = parent;
	cl->borrow_ = borrow;
	cl->leaf_ = 1;
	cl->ifdat_ = ifd;
	cl->pri_ = pri;
	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
	cl->depth_ = 0;
	cl->qthresh_ = 0;
	cl->ns_per_byte_ = nsecPerByte;

	qlimit(cl->q_) = maxq;
	qtype(cl->q_) = Q_DROPHEAD;
	qlen(cl->q_) = 0;
	cl->flags_ = flags;

#if 1 /* minidle is also scaled in ALTQ */
	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;
#else
	cl->minidle_ = minidle;
#endif
	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;
#if 1 /* offtime is also scaled in ALTQ */
	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;
#else
	cl->avgidle_ = 0;
	cl->offtime_ = (offtime * nsecPerByte) / 8;
#endif
	cl->overlimit = action;

#ifdef ALTQ_RED
	if (flags & (RMCF_RED|RMCF_RIO)) {
		int red_flags, red_pkttime;

		red_flags = 0;
		if (flags & RMCF_ECN)
			red_flags |= REDF_ECN;
		if (flags & RMCF_FLOWVALVE)
			red_flags |= REDF_FLOWVALVE;
#ifdef ALTQ_RIO
		if (flags & RMCF_CLEARDSCP)
			red_flags |= RIOF_CLEARDSCP;
#endif
		red_pkttime = nsecPerByte * pktsize / 1000;

		if (flags & RMCF_RED) {
			cl->red_ = red_alloc(0, 0,
			    qlimit(cl->q_) * 10/100,
			    qlimit(cl->q_) * 30/100,
			    red_flags, red_pkttime);
			if (cl->red_ != NULL)
				qtype(cl->q_) = Q_RED;
		}
#ifdef ALTQ_RIO
		else {
			cl->red_ = (red_t *)rio_alloc(0, NULL,
			    red_flags, red_pkttime);
			if (cl->red_ != NULL)
				qtype(cl->q_) = Q_RIO;
		}
#endif
	}
#endif /* ALTQ_RED */
#ifdef ALTQ_CODEL
	if (flags & RMCF_CODEL) {
		cl->codel_ = codel_alloc(5, 100, 0);
		if (cl->codel_ != NULL)
			qtype(cl->q_) = Q_CODEL;
	}
#endif

	/*
	 * put the class into the class tree
	 */
	s = splnet();
	IFQ_LOCK(ifd->ifq_);
	if ((peer = ifd->active_[pri]) != NULL) {
		/* find the last class at this pri */
		cl->peer_ = peer;
		while (peer->peer_ != ifd->active_[pri])
			peer = peer->peer_;
		peer->peer_ = cl;
	} else {
		ifd->active_[pri] = cl;
		cl->peer_ = cl;
	}

	if (cl->parent_) {
		cl->next_ = parent->children_;
		parent->children_ = cl;
		parent->leaf_ = 0;
	}

	/*
	 * Compute the depth of this class and its ancestors in the class
	 * hierarchy.
	 */
	rmc_depth_compute(cl);

	/*
	 * If CBQ's WRR is enabled, then initialize the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->num_[pri]++;
		ifd->alloc_[pri] += cl->allotment_;
		rmc_wrr_set_weights(ifd);
	}
	IFQ_UNLOCK(ifd->ifq_);
	splx(s);
	return (cl);
}
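/*
 * Illustrative numbers for the WRR state set up above (invented for
 * this sketch): suppose two classes share priority 1 on a 10Mb/s link
 * with maxpkt_ = 1500.  A 50% class (nsecPerByte = 1600) gets
 * allotment_ = 10^9/1600 = 625000 B/s and a 25% class (nsecPerByte =
 * 3200) gets 312500 B/s, so alloc_[1] = 937500.  rmc_wrr_set_weights()
 * below then computes
 *
 *	M_[1] = 937500 / (2 * 1500) = 312
 *	w_allotment_ = 625000 / 312 = 2003	(50% class)
 *	w_allotment_ = 312500 / 312 = 1001	(25% class)
 *
 * i.e., per WRR round the classes may send roughly 2003 and 1001
 * bytes respectively, preserving the 2:1 allotment ratio.
 */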
int
rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
    int minidle, u_int offtime, int pktsize)
{
	struct rm_ifdat	*ifd;
	u_int		 old_allotment;
	int		 s;

	ifd = cl->ifdat_;
	old_allotment = cl->allotment_;

	s = splnet();
	IFQ_LOCK(ifd->ifq_);
	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
	cl->qthresh_ = 0;
	cl->ns_per_byte_ = nsecPerByte;

	qlimit(cl->q_) = maxq;

#if 1 /* minidle is also scaled in ALTQ */
	cl->minidle_ = (minidle * nsecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;
#else
	cl->minidle_ = minidle;
#endif
	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;
#if 1 /* offtime is also scaled in ALTQ */
	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;
#else
	cl->avgidle_ = 0;
	cl->offtime_ = (offtime * nsecPerByte) / 8;
#endif

	/*
	 * If CBQ's WRR is enabled, then initialize the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
		rmc_wrr_set_weights(ifd);
	}
	IFQ_UNLOCK(ifd->ifq_);
	splx(s);
	return (0);
}

/*
 * static void
 * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
 *	the appropriate round robin weights for the CBQ weighted round
 *	robin algorithm.
 *
 *	Returns: NONE
 */

static void
rmc_wrr_set_weights(struct rm_ifdat *ifd)
{
	int		 i;
	struct rm_class	*cl, *clh;

	for (i = 0; i < RM_MAXPRIO; i++) {
		/*
		 * This is inverted from that of the simulator to
		 * maintain precision.
		 */
		if (ifd->num_[i] == 0)
			ifd->M_[i] = 0;
		else
			ifd->M_[i] = ifd->alloc_[i] /
			    (ifd->num_[i] * ifd->maxpkt_);
		/*
		 * Compute the weighted allotment for each class.
		 * This takes the expensive div instruction out
		 * of the main loop for the wrr scheduling path.
		 * These only get recomputed when a class comes or
		 * goes.
		 */
		if (ifd->active_[i] != NULL) {
			clh = cl = ifd->active_[i];
			do {
				/* safe-guard for slow link or alloc_ == 0 */
				if (ifd->M_[i] == 0)
					cl->w_allotment_ = 0;
				else
					cl->w_allotment_ = cl->allotment_ /
					    ifd->M_[i];
				cl = cl->peer_;
			} while ((cl != NULL) && (cl != clh));
		}
	}
}

int
rmc_get_weight(struct rm_ifdat *ifd, int pri)
{
	if ((pri >= 0) && (pri < RM_MAXPRIO))
		return (ifd->M_[pri]);
	else
		return (0);
}

/*
 * static void
 * rmc_depth_compute(struct rm_class *cl) - This function computes the
 *	appropriate depth of class 'cl' and its ancestors.
 *
 *	Returns:	NONE
 */

static void
rmc_depth_compute(struct rm_class *cl)
{
	rm_class_t	*t = cl, *p;

	/*
	 * Recompute the depth for the branch of the tree.
	 */
	while (t != NULL) {
		p = t->parent_;
		if (p && (t->depth_ >= p->depth_)) {
			p->depth_ = t->depth_ + 1;
			t = p;
		} else
			t = NULL;
	}
}
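/*
 * Depth example (illustrative): leaves always have depth_ 0, and each
 * parent sits one level above its deepest child.  For the tree
 *
 *	root -+- agency -+- leaf A
 *	      |          +- leaf B
 *	      +- leaf C
 *
 * rmc_depth_compute() on leaf A pushes agency to depth 1 and root to
 * depth 2; adding leaf C directly under the root changes nothing,
 * since root's depth (2) already exceeds C's (0) and the walk stops.
 */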
/*
 * static void
 * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
 *	the depth of the tree after a class has been deleted.
 *
 *	Returns:	NONE
 */

static void
rmc_depth_recompute(rm_class_t *cl)
{
#if 1 /* ALTQ */
	rm_class_t	*p, *t;

	p = cl;
	while (p != NULL) {
		if ((t = p->children_) == NULL) {
			p->depth_ = 0;
		} else {
			int cdepth = 0;

			while (t != NULL) {
				if (t->depth_ > cdepth)
					cdepth = t->depth_;
				t = t->next_;
			}

			if (p->depth_ == cdepth + 1)
				/* no change to this parent */
				return;

			p->depth_ = cdepth + 1;
		}

		p = p->parent_;
	}
#else
	rm_class_t	*t;

	if (cl->depth_ >= 1) {
		if (cl->children_ == NULL) {
			cl->depth_ = 0;
		} else if ((t = cl->children_) != NULL) {
			while (t != NULL) {
				if (t->children_ != NULL)
					rmc_depth_recompute(t);
				t = t->next_;
			}
		} else
			rmc_depth_compute(cl);
	}
#endif
}

/*
 * void
 * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
 *	function deletes a class from the link-sharing structure and frees
 *	all resources associated with the class.
 *
 *	Returns: NONE
 */

void
rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
{
	struct rm_class	*p, *head, *previous;
	int		 s;

	ASSERT(cl->children_ == NULL);

	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);

	s = splnet();
	IFQ_LOCK(ifd->ifq_);
	/*
	 * Free packets in the packet queue.
	 * XXX - this may not be a desired behavior.  Packets should be
	 *	re-queued.
	 */
	rmc_dropall(cl);

	/*
	 * If the class has a parent, then remove the class from the
	 * parent's children chain.
	 */
	if (cl->parent_ != NULL) {
		head = cl->parent_->children_;
		p = previous = head;
		if (head->next_ == NULL) {
			ASSERT(head == cl);
			cl->parent_->children_ = NULL;
			cl->parent_->leaf_ = 1;
		} else while (p != NULL) {
			if (p == cl) {
				if (cl == head)
					cl->parent_->children_ = cl->next_;
				else
					previous->next_ = cl->next_;
				cl->next_ = NULL;
				p = NULL;
			} else {
				previous = p;
				p = p->next_;
			}
		}
	}

	/*
	 * Delete class from class priority peer list.
	 */
	if ((p = ifd->active_[cl->pri_]) != NULL) {
		/*
		 * If there is more than one member of this priority
		 * level, then look for class(cl) in the priority level.
		 */
		if (p != p->peer_) {
			while (p->peer_ != cl)
				p = p->peer_;
			p->peer_ = cl->peer_;

			if (ifd->active_[cl->pri_] == cl)
				ifd->active_[cl->pri_] = cl->peer_;
		} else {
			ASSERT(p == cl);
			ifd->active_[cl->pri_] = NULL;
		}
	}

	/*
	 * Recompute the WRR weights.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] -= cl->allotment_;
		ifd->num_[cl->pri_]--;
		rmc_wrr_set_weights(ifd);
	}

	/*
	 * Re-compute the depth of the tree.
	 */
#if 1 /* ALTQ */
	rmc_depth_recompute(cl->parent_);
#else
	rmc_depth_recompute(ifd->root_);
#endif

	IFQ_UNLOCK(ifd->ifq_);
	splx(s);

	/*
	 * Free the class structure.
	 */
	if (cl->red_ != NULL) {
#ifdef ALTQ_RIO
		if (q_is_rio(cl->q_))
			rio_destroy((rio_t *)cl->red_);
#endif
#ifdef ALTQ_RED
		if (q_is_red(cl->q_))
			red_destroy(cl->red_);
#endif
#ifdef ALTQ_CODEL
		if (q_is_codel(cl->q_))
			codel_destroy(cl->codel_);
#endif
	}
	free(cl->q_, M_DEVBUF);
	free(cl, M_DEVBUF);
}
/*
 * void
 * rmc_init(...) - Initialize the resource management data structures
 *	associated with the output portion of interface 'ifp'.  'ifd' is
 *	where the structures will be built (for backwards compatibility, the
 *	structures aren't kept in the ifnet struct).  'nsecPerByte'
 *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
 *	'restart' is the driver-specific routine that the generic 'delay
 *	until under limit' action will call to restart output.  `maxq'
 *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
 *	is the maximum number of packets that the resource management
 *	code will allow to be queued 'downstream' (this is typically 1).
 *
 *	Returns:	NONE
 */

void
rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
    void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
    int minidle, u_int offtime, int flags)
{
	int	i, mtu;

	/*
	 * Initialize the CBQ tracing/debug facility.
	 */
	CBQTRACEINIT();

	bzero((char *)ifd, sizeof (*ifd));
	mtu = ifq->altq_ifp->if_mtu;
	ifd->ifq_ = ifq;
	ifd->restart = restart;
	ifd->maxqueued_ = maxqueued;
	ifd->ns_per_byte_ = nsecPerByte;
	ifd->maxpkt_ = mtu;
	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
#if 1
	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
	if (mtu * nsecPerByte > 10 * 1000000)
		ifd->maxiftime_ /= 4;
#endif

	reset_cutoff(ifd);
	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);

	/*
	 * Initialize the CBQ's WRR state.
	 */
	for (i = 0; i < RM_MAXPRIO; i++) {
		ifd->alloc_[i] = 0;
		ifd->M_[i] = 0;
		ifd->num_[i] = 0;
		ifd->na_[i] = 0;
		ifd->active_[i] = NULL;
	}

	/*
	 * Initialize current packet state.
	 */
	ifd->qi_ = 0;
	ifd->qo_ = 0;
	for (i = 0; i < RM_MAXQUEUED; i++) {
		ifd->class_[i] = NULL;
		ifd->curlen_[i] = 0;
		ifd->borrowed_[i] = NULL;
	}

	/*
	 * Create the root class of the link-sharing structure.
	 */
	if ((ifd->root_ = rmc_newclass(0, ifd,
	    nsecPerByte, rmc_root_overlimit, maxq, 0, 0,
	    maxidle, minidle, offtime, 0, 0)) == NULL) {
		printf("rmc_init: root class not allocated\n");
		return;
	}
	ifd->root_->depth_ = 0;
}
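/*
 * Sketch of attach-time usage, assuming a hypothetical driver softc
 * 'sc' with hypothetical fields and helpers (illustrative only; error
 * handling omitted):
 *
 *	struct ifaltq *ifq = ...;	// the interface's ALTQ queue
 *
 *	rmc_init(ifq, &sc->sc_rmifd, 800, my_restart, 50, 1,
 *	    maxidle, minidle, offtime, RMCF_WRR);
 *	cl = rmc_newclass(1, &sc->sc_rmifd, 1600, rmc_delay_action,
 *	    50, sc->sc_rmifd.root_, sc->sc_rmifd.root_,
 *	    maxidle, minidle, offtime, 1000, 0);
 *
 * 'my_restart', 'sc_rmifd' and the numeric values are invented for
 * this example; 800 ns/byte corresponds to a 10Mb/s link and 1600 to
 * a 50% class, per the comment above rmc_newclass().
 */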
/*
 * void
 * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
 *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
 *	by a driver's if_output routine.  This routine must be called with
 *	output packet completion interrupts locked out (to avoid racing with
 *	rmc_dequeue_next).
 *
 *	Returns:	0 on successful queueing
 *			-1 when packet drop occurs
 */
int
rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
{
	struct timeval	 now;
	struct rm_ifdat	*ifd = cl->ifdat_;
	int		 cpri = cl->pri_;
	int		 is_empty = qempty(cl->q_);

	RM_GETTIME(now);
	if (ifd->cutoff_ > 0) {
		if (TV_LT(&cl->undertime_, &now)) {
			if (ifd->cutoff_ > cl->depth_)
				ifd->cutoff_ = cl->depth_;
			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
		}
#if 1 /* ALTQ */
		else {
			/*
			 * the class is overlimit. if the class has
			 * underlimit ancestors, set cutoff to the lowest
			 * depth among them.
			 */
			struct rm_class *borrow = cl->borrow_;

			while (borrow != NULL &&
			    borrow->depth_ < ifd->cutoff_) {
				if (TV_LT(&borrow->undertime_, &now)) {
					ifd->cutoff_ = borrow->depth_;
					CBQTRACE(rmc_queue_packet, 'ffob',
					    ifd->cutoff_);
					break;
				}
				borrow = borrow->borrow_;
			}
		}
#else /* !ALTQ */
		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
			if (TV_LT(&cl->borrow_->undertime_, &now)) {
				ifd->cutoff_ = cl->borrow_->depth_;
				CBQTRACE(rmc_queue_packet, 'ffob',
				    cl->borrow_->depth_);
			}
		}
#endif /* !ALTQ */
	}

	if (_rmc_addq(cl, m) < 0)
		/* failed */
		return (-1);

	if (is_empty) {
		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
		ifd->na_[cpri]++;
	}

	if (qlen(cl->q_) > qlimit(cl->q_)) {
		/* note: qlimit can be set to 0 or 1 */
		rmc_drop_action(cl);
		return (-1);
	}
	return (0);
}

/*
 * void
 * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
 *	classes to see if they are satisfied.
 */

static void
rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
{
	int		 i;
	rm_class_t	*p, *bp;

	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
		if ((bp = ifd->active_[i]) != NULL) {
			p = bp;
			do {
				if (!rmc_satisfied(p, now)) {
					ifd->cutoff_ = p->depth_;
					return;
				}
				p = p->peer_;
			} while (p != bp);
		}
	}

	reset_cutoff(ifd);
}

/*
 * rmc_satisfied - Return 1 if the class is satisfied.  0, otherwise.
 */

static int
rmc_satisfied(struct rm_class *cl, struct timeval *now)
{
	rm_class_t	*p;

	if (cl == NULL)
		return (1);
	if (TV_LT(now, &cl->undertime_))
		return (1);
	if (cl->depth_ == 0) {
		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
			return (0);
		else
			return (1);
	}
	if (cl->children_ != NULL) {
		p = cl->children_;
		while (p != NULL) {
			if (!rmc_satisfied(p, now))
				return (0);
			p = p->next_;
		}
	}

	return (1);
}

/*
 * Return 1 if class 'cl' is under limit or can borrow from a parent,
 * 0 if overlimit.  As a side-effect, this routine will invoke the
 * class overlimit action if the class is overlimit.
 */

static int
rmc_under_limit(struct rm_class *cl, struct timeval *now)
{
	rm_class_t	*p = cl;
	rm_class_t	*top;
	struct rm_ifdat	*ifd = cl->ifdat_;

	ifd->borrowed_[ifd->qi_] = NULL;
	/*
	 * If cl is the root class, then always return that it is
	 * underlimit.  Otherwise, check to see if the class is underlimit.
	 */
	if (cl->parent_ == NULL)
		return (1);

	if (cl->sleeping_) {
		if (TV_LT(now, &cl->undertime_))
			return (0);

		CALLOUT_STOP(&cl->callout_);
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;
		return (1);
	}

	top = NULL;
	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
		if (((cl = cl->borrow_) == NULL) ||
		    (cl->depth_ > ifd->cutoff_)) {
#ifdef ADJUST_CUTOFF
			if (cl != NULL)
				/* cutoff is taking effect, just
				   return false without calling
				   the delay action. */
				return (0);
#endif
#ifdef BORROW_OFFTIME
			/*
			 * check if the class can borrow offtime too.
			 * borrow offtime from the top of the borrow
			 * chain if the top class is not overloaded.
			 */
			if (cl != NULL) {
				/* cutoff is taking effect, use this
				   class as top. */
				top = cl;
				CBQTRACE(rmc_under_limit, 'ffou',
				    ifd->cutoff_);
			}
			if (top != NULL && top->avgidle_ == top->minidle_)
				top = NULL;
			p->overtime_ = *now;
			(p->overlimit)(p, top);
#else
			p->overtime_ = *now;
			(p->overlimit)(p, NULL);
#endif
			return (0);
		}
		top = cl;
	}

	if (cl != p)
		ifd->borrowed_[ifd->qi_] = cl;
	return (1);
}
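/*
 * Walk-through of the borrow loop above (illustrative): suppose a
 * leaf is overlimit (its undertime_ is in the future), its depth-1
 * parent is also overlimit, and cutoff_ is 2.  The parent is still
 * overlimit, so the walk continues to the root; the root's
 * undertime_.tv_sec is 0, the loop exits, and the leaf borrows from
 * the root.  borrowed_[qi_] records the root so that
 * rmc_update_class_util() can later charge the borrowing.  Had
 * cutoff_ been 1, the root (depth 2) would have exceeded cutoff and
 * the leaf would simply have been declared overlimit, without the
 * delay action (ADJUST_CUTOFF case above).
 */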
/*
 * _rmc_wrr_dequeue_next() - This is the scheduler for WRR as opposed to
 *	packet-by-packet round robin.
 *
 * The heart of the weighted round-robin scheduler, which decides which
 * class next gets to send a packet.  Highest priority first, then
 * weighted round-robin within priorities.
 *
 * Each able-to-send class gets to send until its byte allocation is
 * exhausted.  Thus, the active pointer is only changed after a class has
 * exhausted its allocation.
 *
 * If the scheduler finds no class that is underlimit or able to borrow,
 * then the first class found that had a nonzero queue and is allowed to
 * borrow gets to send.
 */

static mbuf_t *
_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
{
	struct rm_class	*cl = NULL, *first = NULL;
	u_int		 deficit;
	int		 cpri;
	mbuf_t		*m;
	struct timeval	 now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		if (ifd->efficient_) {
			/* check if this class is overlimit */
			if (cl->undertime_.tv_sec != 0 &&
			    rmc_under_limit(cl, &now) == 0)
				first = cl;
		}
		ifd->pollcache_ = NULL;
		goto _wrr_out;
	} else {
		/* mode == ALTDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
_again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		deficit = 0;
		/*
		 * Loop through twice for a priority level, if some class
		 * was unable to send a packet the first round because
		 * of the weighted round-robin mechanism.
		 * During the second loop at this level, deficit==2.
		 * (This second loop is not needed if for every class,
		 * "M[cl->pri_]" times "cl->allotment" is greater than
		 * the byte size for the largest packet in the class.)
		 */
_wrr_loop:
		cl = ifd->active_[cpri];
		ASSERT(cl != NULL);
		do {
			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
				cl->bytes_alloc_ += cl->w_allotment_;
			if (!qempty(cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now)) {
					if (cl->bytes_alloc_ > 0 ||
					    deficit > 1)
						goto _wrr_out;

					/* underlimit but no alloc */
					deficit = 1;
#if 1
					ifd->borrowed_[ifd->qi_] = NULL;
#endif
				}
				else if (first == NULL && cl->borrow_ != NULL)
					first = cl; /* borrowing candidate */
			}

			cl->bytes_alloc_ = 0;
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);

		if (deficit == 1) {
			/* first loop found an underlimit class with deficit */
			/* Loop on same priority level, with new deficit.  */
			deficit = 2;
			goto _wrr_loop;
		}
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect,
	 * increase cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);

	if (!ifd->efficient_ || first == NULL)
		return (NULL);

	cl = first;
	cpri = cl->pri_;
#if 0 /* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;

	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
_wrr_out:
	if (op == ALTDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			panic("_rmc_wrr_dequeue_next");
		if (qempty(cl->q_))
			ifd->na_[cpri]--;

		/*
		 * Update class statistics and link data.
		 */
		if (cl->bytes_alloc_ > 0)
			cl->bytes_alloc_ -= m_pktlen(m);

		if ((cl->bytes_alloc_ <= 0) || first == cl)
			ifd->active_[cl->pri_] = cl->peer_;
		else
			ifd->active_[cl->pri_] = cl;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == ALTDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return (m);
}
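/*
 * Worked example of the deficit logic above (invented numbers): a
 * class with w_allotment_ = 1000 sending 1500-byte packets starts a
 * round with bytes_alloc_ = 1000, transmits once, and is left at
 * -500; the next refill only brings it to 500 (transmit, -1000), then
 * to 0, so eventually a first pass finds it underlimit but with no
 * allocation (deficit = 1).  If no peer could send either, the second
 * pass runs with deficit == 2 and lets the class transmit anyway, so
 * a class whose per-round allocation is smaller than its largest
 * packet is never deadlocked.
 */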
/*
 * Dequeue & return next packet from the highest priority class that
 * has a packet to send & has enough allocation to send it.  This
 * routine is called by a driver whenever it needs a new packet to
 * output.
 */
static mbuf_t *
_rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
{
	mbuf_t		*m;
	int		 cpri;
	struct rm_class	*cl, *first = NULL;
	struct timeval	 now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		ifd->pollcache_ = NULL;
		goto _prr_out;
	} else {
		/* mode == ALTDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
_again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		cl = ifd->active_[cpri];
		ASSERT(cl != NULL);
		do {
			if (!qempty(cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now))
					goto _prr_out;
				if (first == NULL && cl->borrow_ != NULL)
					first = cl;
			}
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect, increase
	 * cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	if (!ifd->efficient_ || first == NULL)
		return (NULL);

	cl = first;
	cpri = cl->pri_;
#if 0 /* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;

	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
_prr_out:
	if (op == ALTDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			panic("_rmc_prr_dequeue_next");
		if (qempty(cl->q_))
			ifd->na_[cpri]--;

		ifd->active_[cpri] = cl->peer_;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == ALTDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return (m);
}
1170 */ 1171 reset_cutoff(ifd); 1172 if (!ifd->efficient_ || first == NULL) 1173 return (NULL); 1174 1175 cl = first; 1176 cpri = cl->pri_; 1177 #if 0 /* too time-consuming for nothing */ 1178 if (cl->sleeping_) 1179 CALLOUT_STOP(&cl->callout_); 1180 cl->sleeping_ = 0; 1181 cl->undertime_.tv_sec = 0; 1182 #endif 1183 ifd->borrowed_[ifd->qi_] = cl->borrow_; 1184 ifd->cutoff_ = cl->borrow_->depth_; 1185 1186 /* 1187 * Deque the packet and do the book keeping... 1188 */ 1189 _prr_out: 1190 if (op == ALTDQ_REMOVE) { 1191 m = _rmc_getq(cl); 1192 if (m == NULL) 1193 panic("_rmc_prr_dequeue_next"); 1194 if (qempty(cl->q_)) 1195 ifd->na_[cpri]--; 1196 1197 ifd->active_[cpri] = cl->peer_; 1198 1199 ifd->class_[ifd->qi_] = cl; 1200 ifd->curlen_[ifd->qi_] = m_pktlen(m); 1201 ifd->now_[ifd->qi_] = now; 1202 ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; 1203 ifd->queued_++; 1204 } else { 1205 /* mode == ALTDQ_POLL */ 1206 m = _rmc_pollq(cl); 1207 ifd->pollcache_ = cl; 1208 } 1209 return (m); 1210 } 1211 1212 /* 1213 * mbuf_t * 1214 * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function 1215 * is invoked by the packet driver to get the next packet to be 1216 * dequeued and output on the link. If WRR is enabled, then the 1217 * WRR dequeue next routine will determine the next packet to sent. 1218 * Otherwise, packet-by-packet round robin is invoked. 1219 * 1220 * Returns: NULL, if a packet is not available or if all 1221 * classes are overlimit. 1222 * 1223 * Otherwise, Pointer to the next packet. 1224 */ 1225 1226 mbuf_t * 1227 rmc_dequeue_next(struct rm_ifdat *ifd, int mode) 1228 { 1229 if (ifd->queued_ >= ifd->maxqueued_) 1230 return (NULL); 1231 else if (ifd->wrr_) 1232 return (_rmc_wrr_dequeue_next(ifd, mode)); 1233 else 1234 return (_rmc_prr_dequeue_next(ifd, mode)); 1235 } 1236 1237 /* 1238 * Update the utilization estimate for the packet that just completed. 1239 * The packet's class & the parent(s) of that class all get their 1240 * estimators updated. This routine is called by the driver's output- 1241 * packet-completion interrupt service routine. 1242 */ 1243 1244 /* 1245 * a macro to approximate "divide by 1000" that gives 0.000999, 1246 * if a value has enough effective digits. 1247 * (on pentium, mul takes 9 cycles but div takes 46!) 1248 */ 1249 #define NSEC_TO_USEC(t) (((t) >> 10) + ((t) >> 16) + ((t) >> 17)) 1250 void 1251 rmc_update_class_util(struct rm_ifdat *ifd) 1252 { 1253 int idle, avgidle, pktlen; 1254 int pkt_time, tidle; 1255 rm_class_t *cl, *borrowed; 1256 rm_class_t *borrows; 1257 struct timeval *nowp; 1258 1259 /* 1260 * Get the most recent completed class. 1261 */ 1262 if ((cl = ifd->class_[ifd->qo_]) == NULL) 1263 return; 1264 1265 pktlen = ifd->curlen_[ifd->qo_]; 1266 borrowed = ifd->borrowed_[ifd->qo_]; 1267 borrows = borrowed; 1268 1269 PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); 1270 1271 /* 1272 * Run estimator on class and its ancestors. 1273 */ 1274 /* 1275 * rm_update_class_util is designed to be called when the 1276 * transfer is completed from a xmit complete interrupt, 1277 * but most drivers don't implement an upcall for that. 1278 * so, just use estimated completion time. 1279 * as a result, ifd->qi_ and ifd->qo_ are always synced. 
1280 */ 1281 nowp = &ifd->now_[ifd->qo_]; 1282 /* get pkt_time (for link) in usec */ 1283 #if 1 /* use approximation */ 1284 pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_; 1285 pkt_time = NSEC_TO_USEC(pkt_time); 1286 #else 1287 pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000; 1288 #endif 1289 #if 1 /* ALTQ4PPP */ 1290 if (TV_LT(nowp, &ifd->ifnow_)) { 1291 int iftime; 1292 1293 /* 1294 * make sure the estimated completion time does not go 1295 * too far. it can happen when the link layer supports 1296 * data compression or the interface speed is set to 1297 * a much lower value. 1298 */ 1299 TV_DELTA(&ifd->ifnow_, nowp, iftime); 1300 if (iftime+pkt_time < ifd->maxiftime_) { 1301 TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); 1302 } else { 1303 TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_); 1304 } 1305 } else { 1306 TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); 1307 } 1308 #else 1309 if (TV_LT(nowp, &ifd->ifnow_)) { 1310 TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); 1311 } else { 1312 TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); 1313 } 1314 #endif 1315 1316 while (cl != NULL) { 1317 TV_DELTA(&ifd->ifnow_, &cl->last_, idle); 1318 if (idle >= 2000000) 1319 /* 1320 * this class is idle enough, reset avgidle. 1321 * (TV_DELTA returns 2000000 us when delta is large.) 1322 */ 1323 cl->avgidle_ = cl->maxidle_; 1324 1325 /* get pkt_time (for class) in usec */ 1326 #if 1 /* use approximation */ 1327 pkt_time = pktlen * cl->ns_per_byte_; 1328 pkt_time = NSEC_TO_USEC(pkt_time); 1329 #else 1330 pkt_time = pktlen * cl->ns_per_byte_ / 1000; 1331 #endif 1332 idle -= pkt_time; 1333 1334 avgidle = cl->avgidle_; 1335 avgidle += idle - (avgidle >> RM_FILTER_GAIN); 1336 cl->avgidle_ = avgidle; 1337 1338 /* Are we overlimit ? */ 1339 if (avgidle <= 0) { 1340 CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle); 1341 #if 1 /* ALTQ */ 1342 /* 1343 * need some lower bound for avgidle, otherwise 1344 * a borrowing class gets unbounded penalty. 1345 */ 1346 if (avgidle < cl->minidle_) 1347 avgidle = cl->avgidle_ = cl->minidle_; 1348 #endif 1349 /* set next idle to make avgidle 0 */ 1350 tidle = pkt_time + 1351 (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN); 1352 TV_ADD_DELTA(nowp, tidle, &cl->undertime_); 1353 ++cl->stats_.over; 1354 } else { 1355 cl->avgidle_ = 1356 (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle; 1357 cl->undertime_.tv_sec = 0; 1358 if (cl->sleeping_) { 1359 CALLOUT_STOP(&cl->callout_); 1360 cl->sleeping_ = 0; 1361 } 1362 } 1363 1364 if (borrows != NULL) { 1365 if (borrows != cl) 1366 ++cl->stats_.borrows; 1367 else 1368 borrows = NULL; 1369 } 1370 cl->last_ = ifd->ifnow_; 1371 cl->last_pkttime_ = pkt_time; 1372 1373 #if 1 1374 if (cl->parent_ == NULL) { 1375 /* take stats of root class */ 1376 PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); 1377 } 1378 #endif 1379 1380 cl = cl->parent_; 1381 } 1382 1383 /* 1384 * Check to see if cutoff needs to set to a new level. 
/*
 * void
 * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
 *	over-limit action routines.  These get invoked by rmc_under_limit()
 *	if a class with packets to send is over its bandwidth limit & can't
 *	borrow from a parent class.
 *
 *	Returns: NONE
 */

static void
rmc_drop_action(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;

	ASSERT(qlen(cl->q_) > 0);
	_rmc_dropq(cl);
	if (qempty(cl->q_))
		ifd->na_[cl->pri_]--;
}

void
rmc_dropall(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;

	if (!qempty(cl->q_)) {
		_flushq(cl->q_);

		ifd->na_[cl->pri_]--;
	}
}

static int
hzto(struct timeval *tv)
{
	struct timeval t2;

	getmicrotime(&t2);
	t2.tv_sec = tv->tv_sec - t2.tv_sec;
	t2.tv_usec = tv->tv_usec - t2.tv_usec;
	return (tvtohz(&t2));
}

/*
 * void
 * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
 *	delay action routine.  It is invoked via rmc_under_limit when the
 *	packet is discovered to be overlimit.
 *
 *	If the delay action is the result of the borrow class being
 *	overlimit, then delay for the offtime of the borrowing class that
 *	is overlimit.
 *
 *	Returns: NONE
 */

void
rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
{
	int	delay, t, extradelay;

	cl->stats_.overactions++;
	TV_DELTA(&cl->undertime_, &cl->overtime_, delay);
#ifndef BORROW_OFFTIME
	delay += cl->offtime_;
#endif

	if (!cl->sleeping_) {
		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
#ifdef BORROW_OFFTIME
		if (borrow != NULL)
			extradelay = borrow->offtime_;
		else
#endif
			extradelay = cl->offtime_;

#ifdef ALTQ
		/*
		 * XXX recalculate suspend time:
		 * current undertime is (tidle + pkt_time) calculated
		 * from the last transmission.
		 *	tidle: time required to bring avgidle back to 0
		 *	pkt_time: target waiting time for this class
		 * we need to replace pkt_time by offtime
		 */
		extradelay -= cl->last_pkttime_;
#endif
		if (extradelay > 0) {
			TV_ADD_DELTA(&cl->undertime_, extradelay,
			    &cl->undertime_);
			delay += extradelay;
		}

		cl->sleeping_ = 1;
		cl->stats_.delays++;

		/*
		 * Since packets are phased randomly with respect to the
		 * clock, 1 tick (the next clock tick) can be an arbitrarily
		 * short time so we have to wait for at least two ticks.
		 * NOTE:  If there's no other traffic, we need the timer as
		 * a 'backstop' to restart this class.
		 */
		if (delay > tick * 2) {
			/* FreeBSD rounds up the tick */
			t = hzto(&cl->undertime_);
		} else
			t = 2;
		CALLOUT_RESET(&cl->callout_, t, rmc_restart, cl);
	}
}
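/*
 * Timing example for rmc_delay_action() above (illustrative numbers):
 * with BORROW_OFFTIME defined, suppose the overlimit class borrows
 * from a parent whose offtime_ is 2000 (us) and the class's
 * last_pkttime_ was 800us.  Then extradelay = 2000 - 800 = 1200us is
 * added to both undertime_ and the total delay; assuming hz = 1000
 * (so 'tick' is 1000us), any delay above 2000us is converted with
 * hzto()/tvtohz(), otherwise the class sleeps for the 2-tick minimum
 * before rmc_restart() fires.
 */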
/*
 * void
 * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
 *	called by the system timer code & is responsible for checking if the
 *	class is still sleeping (it might have been restarted as a side
 *	effect of the queue scan on a packet arrival) and, if so, restarting
 *	output for the class.  Inspecting the class state & restarting output
 *	require locking the class structure.  In general the driver is
 *	responsible for locking but this is the only routine that is not
 *	called directly or indirectly from the interface driver so it has
 *	to know about system locking conventions.  Under bsd, locking is done
 *	by raising IPL to splimp so that's what's implemented here.  On a
 *	different system this would probably need to be changed.
 *
 *	Returns:	NONE
 */

static void
rmc_restart(void *arg)
{
	struct rm_class		*cl = arg;
	struct rm_ifdat		*ifd = cl->ifdat_;
	struct epoch_tracker	 et;
	int			 s;

	s = splnet();
	NET_EPOCH_ENTER(et);
	IFQ_LOCK(ifd->ifq_);
	CURVNET_SET(ifd->ifq_->altq_ifp->if_vnet);
	if (cl->sleeping_) {
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;

		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
			(ifd->restart)(ifd->ifq_);
		}
	}
	CURVNET_RESTORE();
	IFQ_UNLOCK(ifd->ifq_);
	NET_EPOCH_EXIT(et);
	splx(s);
}

/*
 * void
 * rmc_root_overlimit(struct rm_class *cl) - This is the generic overlimit
 *	handling routine for the root class of the link sharing structure.
 *
 *	Returns: NONE
 */

static void
rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow)
{
	panic("rmc_root_overlimit");
}
1591 */ 1592 1593 static int 1594 _rmc_addq(rm_class_t *cl, mbuf_t *m) 1595 { 1596 #ifdef ALTQ_RIO 1597 if (q_is_rio(cl->q_)) 1598 return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_); 1599 #endif 1600 #ifdef ALTQ_RED 1601 if (q_is_red(cl->q_)) 1602 return red_addq(cl->red_, cl->q_, m, cl->pktattr_); 1603 #endif /* ALTQ_RED */ 1604 #ifdef ALTQ_CODEL 1605 if (q_is_codel(cl->q_)) 1606 return codel_addq(cl->codel_, cl->q_, m); 1607 #endif 1608 1609 if (cl->flags_ & RMCF_CLEARDSCP) 1610 write_dsfield(m, cl->pktattr_, 0); 1611 1612 _addq(cl->q_, m); 1613 return (0); 1614 } 1615 1616 /* note: _rmc_dropq is not called for red */ 1617 static void 1618 _rmc_dropq(rm_class_t *cl) 1619 { 1620 mbuf_t *m; 1621 1622 if ((m = _getq(cl->q_)) != NULL) 1623 m_freem(m); 1624 } 1625 1626 static mbuf_t * 1627 _rmc_getq(rm_class_t *cl) 1628 { 1629 #ifdef ALTQ_RIO 1630 if (q_is_rio(cl->q_)) 1631 return rio_getq((rio_t *)cl->red_, cl->q_); 1632 #endif 1633 #ifdef ALTQ_RED 1634 if (q_is_red(cl->q_)) 1635 return red_getq(cl->red_, cl->q_); 1636 #endif 1637 #ifdef ALTQ_CODEL 1638 if (q_is_codel(cl->q_)) 1639 return codel_getq(cl->codel_, cl->q_); 1640 #endif 1641 return _getq(cl->q_); 1642 } 1643 1644 static mbuf_t * 1645 _rmc_pollq(rm_class_t *cl) 1646 { 1647 return qhead(cl->q_); 1648 } 1649 1650 #ifdef CBQ_TRACE 1651 1652 struct cbqtrace cbqtrace_buffer[NCBQTRACE+1]; 1653 struct cbqtrace *cbqtrace_ptr = NULL; 1654 int cbqtrace_count; 1655 1656 /* 1657 * DDB hook to trace cbq events: 1658 * the last 1024 events are held in a circular buffer. 1659 * use "call cbqtrace_dump(N)" to display 20 events from Nth event. 1660 */ 1661 void cbqtrace_dump(int); 1662 static char *rmc_funcname(void *); 1663 1664 static struct rmc_funcs { 1665 void *func; 1666 char *name; 1667 } rmc_funcs[] = 1668 { 1669 rmc_init, "rmc_init", 1670 rmc_queue_packet, "rmc_queue_packet", 1671 rmc_under_limit, "rmc_under_limit", 1672 rmc_update_class_util, "rmc_update_class_util", 1673 rmc_delay_action, "rmc_delay_action", 1674 rmc_restart, "rmc_restart", 1675 _rmc_wrr_dequeue_next, "_rmc_wrr_dequeue_next", 1676 NULL, NULL 1677 }; 1678 1679 static char *rmc_funcname(void *func) 1680 { 1681 struct rmc_funcs *fp; 1682 1683 for (fp = rmc_funcs; fp->func != NULL; fp++) 1684 if (fp->func == func) 1685 return (fp->name); 1686 return ("unknown"); 1687 } 1688 1689 void cbqtrace_dump(int counter) 1690 { 1691 int i, *p; 1692 char *cp; 1693 1694 counter = counter % NCBQTRACE; 1695 p = (int *)&cbqtrace_buffer[counter]; 1696 1697 for (i=0; i<20; i++) { 1698 printf("[0x%x] ", *p++); 1699 printf("%s: ", rmc_funcname((void *)*p++)); 1700 cp = (char *)p++; 1701 printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]); 1702 printf("%d\n",*p++); 1703 1704 if (p >= (int *)&cbqtrace_buffer[NCBQTRACE]) 1705 p = (int *)cbqtrace_buffer; 1706 } 1707 } 1708 #endif /* CBQ_TRACE */ 1709 #endif /* ALTQ_CBQ */ 1710 1711 #if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || \ 1712 defined(ALTQ_HFSC) || defined(ALTQ_PRIQ) || defined(ALTQ_CODEL) 1713 #if !defined(__GNUC__) || defined(ALTQ_DEBUG) 1714 1715 void 1716 _addq(class_queue_t *q, mbuf_t *m) 1717 { 1718 mbuf_t *m0; 1719 1720 if ((m0 = qtail(q)) != NULL) 1721 m->m_nextpkt = m0->m_nextpkt; 1722 else 1723 m0 = m; 1724 m0->m_nextpkt = m; 1725 qtail(q) = m; 1726 qlen(q)++; 1727 } 1728 1729 mbuf_t * 1730 _getq(class_queue_t *q) 1731 { 1732 mbuf_t *m, *m0; 1733 1734 if ((m = qtail(q)) == NULL) 1735 return (NULL); 1736 if ((m0 = m->m_nextpkt) != m) 1737 m->m_nextpkt = m0->m_nextpkt; 1738 else { 1739 ASSERT(qlen(q) == 
/* drop a packet at the tail of the queue */
mbuf_t *
_getq_tail(class_queue_t *q)
{
	mbuf_t	*m, *m0, *prev;

	if ((m = m0 = qtail(q)) == NULL)
		return NULL;
	do {
		prev = m0;
		m0 = m0->m_nextpkt;
	} while (m0 != m);
	prev->m_nextpkt = m->m_nextpkt;
	if (prev == m) {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	} else
		qtail(q) = prev;
	qlen(q)--;
	m->m_nextpkt = NULL;
	return (m);
}

/* randomly select a packet in the queue */
mbuf_t *
_getq_random(class_queue_t *q)
{
	struct mbuf	*m;
	int		 i, n;

	if ((m = qtail(q)) == NULL)
		return NULL;
	if (m->m_nextpkt == m) {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	} else {
		struct mbuf *prev = NULL;

		n = arc4random() % qlen(q) + 1;
		for (i = 0; i < n; i++) {
			prev = m;
			m = m->m_nextpkt;
		}
		prev->m_nextpkt = m->m_nextpkt;
		if (m == qtail(q))
			qtail(q) = prev;
	}
	qlen(q)--;
	m->m_nextpkt = NULL;
	return (m);
}

void
_removeq(class_queue_t *q, mbuf_t *m)
{
	mbuf_t	*m0, *prev;

	m0 = qtail(q);
	do {
		prev = m0;
		m0 = m0->m_nextpkt;
	} while (m0 != m);
	prev->m_nextpkt = m->m_nextpkt;
	if (prev == m)
		qtail(q) = NULL;
	else if (qtail(q) == m)
		qtail(q) = prev;
	qlen(q)--;
}

void
_flushq(class_queue_t *q)
{
	mbuf_t	*m;

	while ((m = _getq(q)) != NULL)
		m_freem(m);
	ASSERT(qlen(q) == 0);
}

#endif /* !__GNUC__ || ALTQ_DEBUG */
#endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ || ALTQ_CODEL */