xref: /freebsd/sys/net/altq/altq_rmclass.c (revision 994297b01b98816bea1abf45ae4bac1bc69ee7a0)
1 /*-
2  * Copyright (c) 1991-1997 Regents of the University of California.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by the Network Research
16  *      Group at Lawrence Berkeley Laboratory.
17  * 4. Neither the name of the University nor of the Laboratory may be used
18  *    to endorse or promote products derived from this software without
19  *    specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * LBL code modified by speer@eng.sun.com, May 1997.
34  * For questions and/or comments, please send mail to cbq@ee.lbl.gov
35  *
36  * @(#)rm_class.c  1.48     97/12/05 SMI
37  * $KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $
38  * $FreeBSD$
39  */
40 #include "opt_altq.h"
41 #include "opt_inet.h"
42 #include "opt_inet6.h"
43 #ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
44 
45 #include <sys/param.h>
46 #include <sys/malloc.h>
47 #include <sys/mbuf.h>
48 #include <sys/socket.h>
49 #include <sys/systm.h>
50 #include <sys/errno.h>
51 #include <sys/time.h>
52 
53 #include <net/if.h>
54 #include <net/if_var.h>
55 
56 #include <net/altq/if_altq.h>
57 #include <net/altq/altq.h>
58 #include <net/altq/altq_codel.h>
59 #include <net/altq/altq_rmclass.h>
60 #include <net/altq/altq_rmclass_debug.h>
61 #include <net/altq/altq_red.h>
62 #include <net/altq/altq_rio.h>
63 
64 /*
65  * Local Macros
66  */
67 #define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }
68 
69 /*
70  * Local routines.
71  */
72 
73 static int	rmc_satisfied(struct rm_class *, struct timeval *);
74 static void	rmc_wrr_set_weights(struct rm_ifdat *);
75 static void	rmc_depth_compute(struct rm_class *);
76 static void	rmc_depth_recompute(rm_class_t *);
77 
78 static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
79 static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);
80 
81 static int	_rmc_addq(rm_class_t *, mbuf_t *);
82 static void	_rmc_dropq(rm_class_t *);
83 static mbuf_t	*_rmc_getq(rm_class_t *);
84 static mbuf_t	*_rmc_pollq(rm_class_t *);
85 
86 static int	rmc_under_limit(struct rm_class *, struct timeval *);
87 static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
88 static void	rmc_drop_action(struct rm_class *);
89 static void	rmc_restart(void *);
90 static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);
91 
92 #define	BORROW_OFFTIME
93 /*
94  * BORROW_OFFTIME (experimental):
95  * borrow the offtime of the class borrowed from.
96  * the reason is that when its own offtime is set, the class is unable
97  * to borrow much, especially when cutoff is taking effect.
98  * but when the borrowed class is overloaded (avgidle is close to minidle),
99  * use the borrowing class's offtime to avoid overload.
100  */
101 #define	ADJUST_CUTOFF
102 /*
103  * ADJUST_CUTOFF (experimental):
104  * if no underlimit class is found due to cutoff, increase cutoff and
105  * retry the scheduling loop.
106  * also, don't invoke delay_actions while cutoff is taking effect,
107  * since a sleeping class won't have a chance to be scheduled in the
108  * next loop.
109  *
110  * now the heuristics for setting the top-level variable (cutoff_) become:
111  *	1. if a packet arrives for a not-overlimit class, set cutoff
112  *	   to the depth of the class.
113  *	2. if cutoff is i, and a packet arrives for an overlimit class
114  *	   with an underlimit ancestor at a lower level than i (say j),
115  *	   then set cutoff to j.
116  *	3. at scheduling a packet, if there is no underlimit class
117  *	   due to the current cutoff level, increase cutoff by 1 and
118  *	   then try to schedule again.
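 *
 * an illustrative walk-through of these heuristics (a hypothetical
 * three-level tree: root at depth 2, an agency class at depth 1,
 * leaf classes at depth 0), each case taken independently:
 *	- a packet arrives for an underlimit leaf: rule 1 sets cutoff_
 *	  to 0, so no class may borrow.
 *	- a packet arrives for an overlimit leaf whose agency (depth 1)
 *	  is underlimit: rule 2 sets cutoff_ to 1, so the leaf may
 *	  borrow from its agency but not from the root.
 *	- nothing is underlimit at dequeue time: rule 3 raises cutoff_
 *	  one level per pass until the root depth is reached.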
119  */
120 
121 /*
122  * rm_class_t *
123  * rmc_newclass(...) - Create a new resource management class at priority
124  * 'pri' on the interface given by 'ifd'.
125  *
126  * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
127  *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
128  *              than 100% of the bandwidth, this number should be the
129  *              'effective' rate for the class.  Let f be the
130  *              bandwidth fraction allocated to this class, and let
131  *              nsPerByte be the data rate of the output link in
132  *              nanoseconds/byte.  Then nsecPerByte is set to
133  *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
134  *              for a class that gets 50% of an ethernet's bandwidth.
135  *
136  * action       the routine to call when the class is over limit.
137  *
138  * maxq         max allowable queue size for class (in packets).
139  *
140  * parent       parent class pointer.
141  *
142  * borrow       class to borrow from (should be either 'parent' or null).
143  *
144  * maxidle      max value allowed for class 'idle' time estimate (this
145  *              parameter determines how large an initial burst of packets
146  *              can be before the overlimit action is invoked).
147  *
148  * offtime      how long 'delay' action will delay when class goes over
149  *              limit (this parameter determines the steady-state burst
150  *              size when a class is running over its limit).
151  *
152  * Maxidle and offtime have to be computed from the following:  If the
153  * average packet size is s, the bandwidth fraction allocated to this
154  * class is f, we want to allow b packet bursts, and the gain of the
155  * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
156  *
157  *   ptime = s * nsPerByte * (1 - f) / f
158  *   maxidle = ptime * (1 - g^b) / g^b
159  *   minidle = -ptime * (1 / (1 - f))
160  *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
161  *
162  * Operationally, it's convenient to specify maxidle & offtime in units
163  * independent of the link bandwidth so the maxidle & offtime passed to
164  * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
165  * (The constant factor is a scale factor needed to make the parameters
166  * integers.  This scaling also means that the 'unscaled' values of
167  * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
168  * not nanoseconds.)  Also note that the 'idle' filter computation keeps
169  * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
170  * maxidle also must be scaled upward by this value.  Thus, the passed
171  * values for maxidle and offtime can be computed as follows:
172  *
173  * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
174  * offtime = offtime * 8 / (1000 * nsecPerByte)
175  *
176  * When USE_HRTIME is employed, maxidle and offtime become:
177  * 	maxidle = maxidle * (8.0 / nsecPerByte);
178  * 	offtime = offtime * (8.0 / nsecPerByte);
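 *
 * Worked example (illustrative only): a class granted f = 50% of a
 * 10Mb/s ethernet has nsecPerByte = 800 / 0.5 = 1600, so its
 * allotment becomes RM_NS_PER_SEC / nsecPerByte = 10^9 / 1600 =
 * 625000 bytes/sec (i.e., 5Mb/s).  For s = 1000 byte packets,
 * ptime = 1000 * 800 * (1 - 0.5) / 0.5 = 800000 ns, from which
 * maxidle and offtime follow for a chosen burst size b as above.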
179  */
180 struct rm_class *
181 rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
182     void (*action)(rm_class_t *, rm_class_t *), int maxq,
183     struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
184     int minidle, u_int offtime, int pktsize, int flags)
185 {
186 	struct rm_class	*cl;
187 	struct rm_class	*peer;
188 	int		 s;
189 
190 	if (pri >= RM_MAXPRIO)
191 		return (NULL);
192 #ifndef ALTQ_RED
193 	if (flags & RMCF_RED) {
194 #ifdef ALTQ_DEBUG
195 		printf("rmc_newclass: RED not configured for CBQ!\n");
196 #endif
197 		return (NULL);
198 	}
199 #endif
200 #ifndef ALTQ_RIO
201 	if (flags & RMCF_RIO) {
202 #ifdef ALTQ_DEBUG
203 		printf("rmc_newclass: RIO not configured for CBQ!\n");
204 #endif
205 		return (NULL);
206 	}
207 #endif
208 #ifndef ALTQ_CODEL
209 	if (flags & RMCF_CODEL) {
210 #ifdef ALTQ_DEBUG
211 		printf("rmc_newclass: CODEL not configured for CBQ!\n");
212 #endif
213 		return (NULL);
214 	}
215 #endif
216 
217 	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_NOWAIT | M_ZERO);
218 	if (cl == NULL)
219 		return (NULL);
220 	CALLOUT_INIT(&cl->callout_);
221 	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO);
222 	if (cl->q_ == NULL) {
223 		free(cl, M_DEVBUF);
224 		return (NULL);
225 	}
226 
227 	/*
228 	 * Class initialization.
229 	 */
230 	cl->children_ = NULL;
231 	cl->parent_ = parent;
232 	cl->borrow_ = borrow;
233 	cl->leaf_ = 1;
234 	cl->ifdat_ = ifd;
235 	cl->pri_ = pri;
236 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
237 	cl->depth_ = 0;
238 	cl->qthresh_ = 0;
239 	cl->ns_per_byte_ = nsecPerByte;
240 
241 	qlimit(cl->q_) = maxq;
242 	qtype(cl->q_) = Q_DROPHEAD;
243 	qlen(cl->q_) = 0;
244 	cl->flags_ = flags;
245 
246 #if 1 /* minidle is also scaled in ALTQ */
247 	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
248 	if (cl->minidle_ > 0)
249 		cl->minidle_ = 0;
250 #else
251 	cl->minidle_ = minidle;
252 #endif
253 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
254 	if (cl->maxidle_ == 0)
255 		cl->maxidle_ = 1;
256 #if 1 /* offtime is also scaled in ALTQ */
257 	cl->avgidle_ = cl->maxidle_;
258 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
259 	if (cl->offtime_ == 0)
260 		cl->offtime_ = 1;
261 #else
262 	cl->avgidle_ = 0;
263 	cl->offtime_ = (offtime * nsecPerByte) / 8;
264 #endif
265 	cl->overlimit = action;
266 
267 #ifdef ALTQ_RED
268 	if (flags & (RMCF_RED|RMCF_RIO)) {
269 		int red_flags, red_pkttime;
270 
271 		red_flags = 0;
272 		if (flags & RMCF_ECN)
273 			red_flags |= REDF_ECN;
274 		if (flags & RMCF_FLOWVALVE)
275 			red_flags |= REDF_FLOWVALVE;
276 #ifdef ALTQ_RIO
277 		if (flags & RMCF_CLEARDSCP)
278 			red_flags |= RIOF_CLEARDSCP;
279 #endif
280 		red_pkttime = nsecPerByte * pktsize  / 1000;
281 
282 		if (flags & RMCF_RED) {
283 			cl->red_ = red_alloc(0, 0,
284 			    qlimit(cl->q_) * 10/100,
285 			    qlimit(cl->q_) * 30/100,
286 			    red_flags, red_pkttime);
287 			if (cl->red_ != NULL)
288 				qtype(cl->q_) = Q_RED;
289 		}
290 #ifdef ALTQ_RIO
291 		else {
292 			cl->red_ = (red_t *)rio_alloc(0, NULL,
293 						      red_flags, red_pkttime);
294 			if (cl->red_ != NULL)
295 				qtype(cl->q_) = Q_RIO;
296 		}
297 #endif
298 	}
299 #endif /* ALTQ_RED */
300 #ifdef ALTQ_CODEL
301 	if (flags & RMCF_CODEL) {
302 		cl->codel_ = codel_alloc(5, 100, 0);
303 		if (cl->codel_ != NULL)
304 			qtype(cl->q_) = Q_CODEL;
305 	}
306 #endif
307 
308 	/*
309 	 * put the class into the class tree
310 	 */
311 	s = splnet();
312 	IFQ_LOCK(ifd->ifq_);
313 	if ((peer = ifd->active_[pri]) != NULL) {
314 		/* find the last class at this pri */
315 		cl->peer_ = peer;
316 		while (peer->peer_ != ifd->active_[pri])
317 			peer = peer->peer_;
318 		peer->peer_ = cl;
319 	} else {
320 		ifd->active_[pri] = cl;
321 		cl->peer_ = cl;
322 	}
323 
324 	if (cl->parent_) {
325 		cl->next_ = parent->children_;
326 		parent->children_ = cl;
327 		parent->leaf_ = 0;
328 	}
329 
330 	/*
331 	 * Compute the depth of this class and its ancestors in the class
332 	 * hierarchy.
333 	 */
334 	rmc_depth_compute(cl);
335 
336 	/*
337 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
338 	 */
339 	if (ifd->wrr_) {
340 		ifd->num_[pri]++;
341 		ifd->alloc_[pri] += cl->allotment_;
342 		rmc_wrr_set_weights(ifd);
343 	}
344 	IFQ_UNLOCK(ifd->ifq_);
345 	splx(s);
346 	return (cl);
347 }
348 
349 int
350 rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
351     int minidle, u_int offtime, int pktsize)
352 {
353 	struct rm_ifdat	*ifd;
354 	u_int		 old_allotment;
355 	int		 s;
356 
357 	ifd = cl->ifdat_;
358 	old_allotment = cl->allotment_;
359 
360 	s = splnet();
361 	IFQ_LOCK(ifd->ifq_);
362 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
363 	cl->qthresh_ = 0;
364 	cl->ns_per_byte_ = nsecPerByte;
365 
366 	qlimit(cl->q_) = maxq;
367 
368 #if 1 /* minidle is also scaled in ALTQ */
369 	cl->minidle_ = (minidle * nsecPerByte) / 8;
370 	if (cl->minidle_ > 0)
371 		cl->minidle_ = 0;
372 #else
373 	cl->minidle_ = minidle;
374 #endif
375 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
376 	if (cl->maxidle_ == 0)
377 		cl->maxidle_ = 1;
378 #if 1 /* offtime is also scaled in ALTQ */
379 	cl->avgidle_ = cl->maxidle_;
380 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
381 	if (cl->offtime_ == 0)
382 		cl->offtime_ = 1;
383 #else
384 	cl->avgidle_ = 0;
385 	cl->offtime_ = (offtime * nsecPerByte) / 8;
386 #endif
387 
388 	/*
389 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
390 	 */
391 	if (ifd->wrr_) {
392 		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
393 		rmc_wrr_set_weights(ifd);
394 	}
395 	IFQ_UNLOCK(ifd->ifq_);
396 	splx(s);
397 	return (0);
398 }
399 
400 /*
401  * static void
402  * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
403  *	the appropriate round robin weights for the CBQ weighted round robin
404  *	algorithm.
405  *
406  *	Returns: NONE
407  */
408 
409 static void
410 rmc_wrr_set_weights(struct rm_ifdat *ifd)
411 {
412 	int		i;
413 	struct rm_class	*cl, *clh;
414 
415 	for (i = 0; i < RM_MAXPRIO; i++) {
416 		/*
417 		 * This is inverted from that of the simulator to
418 		 * maintain precision.
419 		 */
420 		if (ifd->num_[i] == 0)
421 			ifd->M_[i] = 0;
422 		else
423 			ifd->M_[i] = ifd->alloc_[i] /
424 				(ifd->num_[i] * ifd->maxpkt_);
425 		/*
426 		 * Compute the weighted allotment for each class.
427 		 * This takes the expensive div instruction out
428 		 * of the main loop for the wrr scheduling path.
429 		 * These only get recomputed when a class comes or
430 		 * goes.
431 		 */
432 		if (ifd->active_[i] != NULL) {
433 			clh = cl = ifd->active_[i];
434 			do {
435 				/* safe-guard for slow link or alloc_ == 0 */
436 				if (ifd->M_[i] == 0)
437 					cl->w_allotment_ = 0;
438 				else
439 					cl->w_allotment_ = cl->allotment_ /
440 						ifd->M_[i];
441 				cl = cl->peer_;
442 			} while ((cl != NULL) && (cl != clh));
443 		}
444 	}
445 }
446 
447 int
448 rmc_get_weight(struct rm_ifdat *ifd, int pri)
449 {
450 	if ((pri >= 0) && (pri < RM_MAXPRIO))
451 		return (ifd->M_[pri]);
452 	else
453 		return (0);
454 }
455 
456 /*
457  * static void
458  * rmc_depth_compute(struct rm_class *cl) - This function computes the
459  *	appropriate depth of class 'cl' and its ancestors.
460  *
461  *	Returns:	NONE
462  */
463 
464 static void
465 rmc_depth_compute(struct rm_class *cl)
466 {
467 	rm_class_t	*t = cl, *p;
468 
469 	/*
470 	 * Recompute the depth for the branch of the tree.
471 	 */
472 	while (t != NULL) {
473 		p = t->parent_;
474 		if (p && (t->depth_ >= p->depth_)) {
475 			p->depth_ = t->depth_ + 1;
476 			t = p;
477 		} else
478 			t = NULL;
479 	}
480 }
481 
482 /*
483  * static void
484  * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
485  *	the depth of the tree after a class has been deleted.
486  *
487  *	Returns:	NONE
488  */
489 
490 static void
491 rmc_depth_recompute(rm_class_t *cl)
492 {
493 #if 1 /* ALTQ */
494 	rm_class_t	*p, *t;
495 
496 	p = cl;
497 	while (p != NULL) {
498 		if ((t = p->children_) == NULL) {
499 			p->depth_ = 0;
500 		} else {
501 			int cdepth = 0;
502 
503 			while (t != NULL) {
504 				if (t->depth_ > cdepth)
505 					cdepth = t->depth_;
506 				t = t->next_;
507 			}
508 
509 			if (p->depth_ == cdepth + 1)
510 				/* no change to this parent */
511 				return;
512 
513 			p->depth_ = cdepth + 1;
514 		}
515 
516 		p = p->parent_;
517 	}
518 #else
519 	rm_class_t	*t;
520 
521 	if (cl->depth_ >= 1) {
522 		if (cl->children_ == NULL) {
523 			cl->depth_ = 0;
524 		} else if ((t = cl->children_) != NULL) {
525 			while (t != NULL) {
526 				if (t->children_ != NULL)
527 					rmc_depth_recompute(t);
528 				t = t->next_;
529 			}
530 		} else
531 			rmc_depth_compute(cl);
532 	}
533 #endif
534 }
535 
536 /*
537  * void
538  * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
539  *	function deletes a class from the link-sharing structure and frees
540  *	all resources associated with the class.
541  *
542  *	Returns: NONE
543  */
544 
545 void
546 rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
547 {
548 	struct rm_class	*p, *head, *previous;
549 	int		 s;
550 
551 	ASSERT(cl->children_ == NULL);
552 
553 	if (cl->sleeping_)
554 		CALLOUT_STOP(&cl->callout_);
555 
556 	s = splnet();
557 	IFQ_LOCK(ifd->ifq_);
558 	/*
559 	 * Free packets in the packet queue.
560 	 * XXX - this may not be a desired behavior.  Packets should be
561 	 *		re-queued.
562 	 */
563 	rmc_dropall(cl);
564 
565 	/*
566 	 * If the class has a parent, then remove the class from the
567 	 * parent's children chain.
568 	 */
569 	if (cl->parent_ != NULL) {
570 		head = cl->parent_->children_;
571 		p = previous = head;
572 		if (head->next_ == NULL) {
573 			ASSERT(head == cl);
574 			cl->parent_->children_ = NULL;
575 			cl->parent_->leaf_ = 1;
576 		} else while (p != NULL) {
577 			if (p == cl) {
578 				if (cl == head)
579 					cl->parent_->children_ = cl->next_;
580 				else
581 					previous->next_ = cl->next_;
582 				cl->next_ = NULL;
583 				p = NULL;
584 			} else {
585 				previous = p;
586 				p = p->next_;
587 			}
588 		}
589 	}
590 
591 	/*
592 	 * Delete class from class priority peer list.
593 	 */
594 	if ((p = ifd->active_[cl->pri_]) != NULL) {
595 		/*
596 		 * If there is more than one member of this priority
597 		 * level, then look for class(cl) in the priority level.
598 		 */
599 		if (p != p->peer_) {
600 			while (p->peer_ != cl)
601 				p = p->peer_;
602 			p->peer_ = cl->peer_;
603 
604 			if (ifd->active_[cl->pri_] == cl)
605 				ifd->active_[cl->pri_] = cl->peer_;
606 		} else {
607 			ASSERT(p == cl);
608 			ifd->active_[cl->pri_] = NULL;
609 		}
610 	}
611 
612 	/*
613 	 * Recompute the WRR weights.
614 	 */
615 	if (ifd->wrr_) {
616 		ifd->alloc_[cl->pri_] -= cl->allotment_;
617 		ifd->num_[cl->pri_]--;
618 		rmc_wrr_set_weights(ifd);
619 	}
620 
621 	/*
622 	 * Re-compute the depth of the tree.
623 	 */
624 #if 1 /* ALTQ */
625 	rmc_depth_recompute(cl->parent_);
626 #else
627 	rmc_depth_recompute(ifd->root_);
628 #endif
629 
630 	IFQ_UNLOCK(ifd->ifq_);
631 	splx(s);
632 
633 	/*
634 	 * Free the class structure.
635 	 */
636 	if (cl->red_ != NULL) {
637 #ifdef ALTQ_RIO
638 		if (q_is_rio(cl->q_))
639 			rio_destroy((rio_t *)cl->red_);
640 #endif
641 #ifdef ALTQ_RED
642 		if (q_is_red(cl->q_))
643 			red_destroy(cl->red_);
644 #endif
645 #ifdef ALTQ_CODEL
646 		if (q_is_codel(cl->q_))
647 			codel_destroy(cl->codel_);
648 #endif
649 	}
650 	free(cl->q_, M_DEVBUF);
651 	free(cl, M_DEVBUF);
652 }
653 
654 /*
655  * void
656  * rmc_init(...) - Initialize the resource management data structures
657  *	associated with the output portion of interface 'ifp'.  'ifd' is
658  *	where the structures will be built (for backwards compatibility, the
659  *	structures aren't kept in the ifnet struct).  'nsecPerByte'
660  *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
661  *	'restart' is the driver-specific routine that the generic 'delay
662  *	until under limit' action will call to restart output.  `maxq'
663  *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
664  *	is the maximum number of packets that the resource management
665  *	code will allow to be queued 'downstream' (this is typically 1).
666  *
667  *	Returns:	NONE
668  */
669 
670 void
671 rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
672     void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
673     int minidle, u_int offtime, int flags)
674 {
675 	int		i, mtu;
676 
677 	/*
678 	 * Initialize the CBQ tracing/debug facility.
679 	 */
680 	CBQTRACEINIT();
681 
682 	bzero((char *)ifd, sizeof (*ifd));
683 	mtu = ifq->altq_ifp->if_mtu;
684 	ifd->ifq_ = ifq;
685 	ifd->restart = restart;
686 	ifd->maxqueued_ = maxqueued;
687 	ifd->ns_per_byte_ = nsecPerByte;
688 	ifd->maxpkt_ = mtu;
689 	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
690 	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
691 #if 1
692 	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
693 	if (mtu * nsecPerByte > 10 * 1000000)
694 		ifd->maxiftime_ /= 4;
695 #endif
696 
697 	reset_cutoff(ifd);
698 	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);
699 
700 	/*
701 	 * Initialize the CBQ's WRR state.
702 	 */
703 	for (i = 0; i < RM_MAXPRIO; i++) {
704 		ifd->alloc_[i] = 0;
705 		ifd->M_[i] = 0;
706 		ifd->num_[i] = 0;
707 		ifd->na_[i] = 0;
708 		ifd->active_[i] = NULL;
709 	}
710 
711 	/*
712 	 * Initialize current packet state.
713 	 */
714 	ifd->qi_ = 0;
715 	ifd->qo_ = 0;
716 	for (i = 0; i < RM_MAXQUEUED; i++) {
717 		ifd->class_[i] = NULL;
718 		ifd->curlen_[i] = 0;
719 		ifd->borrowed_[i] = NULL;
720 	}
721 
722 	/*
723 	 * Create the root class of the link-sharing structure.
724 	 */
725 	if ((ifd->root_ = rmc_newclass(0, ifd,
726 				       nsecPerByte,
727 				       rmc_root_overlimit, maxq, 0, 0,
728 				       maxidle, minidle, offtime,
729 				       0, 0)) == NULL) {
730 		printf("rmc_init: root class not allocated\n");
731 		return;
732 	}
733 	ifd->root_->depth_ = 0;
734 }
735 
736 /*
737  * void
738  * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
739  *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
740  *	by a driver's if_output routine.  This routine must be called with
741  *	output packet completion interrupts locked out (to avoid racing with
742  *	rmc_dequeue_next).
743  *
744  *	Returns:	0 on successful queueing
745  *			-1 when packet drop occurs
746  */
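 *
 *	Hypothetical caller sketch (illustrative; a real caller first
 *	classifies the mbuf to a class).  On failure the mbuf has
 *	already been freed internally:
 *
 *		if (rmc_queue_packet(cl, m) != 0)
 *			return (ENOBUFS);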
747 int
748 rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
749 {
750 	struct timeval	 now;
751 	struct rm_ifdat *ifd = cl->ifdat_;
752 	int		 cpri = cl->pri_;
753 	int		 is_empty = qempty(cl->q_);
754 
755 	RM_GETTIME(now);
756 	if (ifd->cutoff_ > 0) {
757 		if (TV_LT(&cl->undertime_, &now)) {
758 			if (ifd->cutoff_ > cl->depth_)
759 				ifd->cutoff_ = cl->depth_;
760 			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
761 		}
762 #if 1 /* ALTQ */
763 		else {
764 			/*
765 			 * the class is overlimit. if the class has
766 			 * underlimit ancestors, set cutoff to the lowest
767 			 * depth among them.
768 			 */
769 			struct rm_class *borrow = cl->borrow_;
770 
771 			while (borrow != NULL &&
772 			       borrow->depth_ < ifd->cutoff_) {
773 				if (TV_LT(&borrow->undertime_, &now)) {
774 					ifd->cutoff_ = borrow->depth_;
775 					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
776 					break;
777 				}
778 				borrow = borrow->borrow_;
779 			}
780 		}
781 #else /* !ALTQ */
782 		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
783 			if (TV_LT(&cl->borrow_->undertime_, &now)) {
784 				ifd->cutoff_ = cl->borrow_->depth_;
785 				CBQTRACE(rmc_queue_packet, 'ffob',
786 					 cl->borrow_->depth_);
787 			}
788 		}
789 #endif /* !ALTQ */
790 	}
791 
792 	if (_rmc_addq(cl, m) < 0)
793 		/* failed */
794 		return (-1);
795 
796 	if (is_empty) {
797 		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
798 		ifd->na_[cpri]++;
799 	}
800 
801 	if (qlen(cl->q_) > qlimit(cl->q_)) {
802 		/* note: qlimit can be set to 0 or 1 */
803 		rmc_drop_action(cl);
804 		return (-1);
805 	}
806 	return (0);
807 }
808 
809 /*
810  * void
811  * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
812  *	classes to see if there are satified.
813  *	classes to see if they are satisfied.
814 
815 static void
816 rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
817 {
818 	int		 i;
819 	rm_class_t	*p, *bp;
820 
821 	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
822 		if ((bp = ifd->active_[i]) != NULL) {
823 			p = bp;
824 			do {
825 				if (!rmc_satisfied(p, now)) {
826 					ifd->cutoff_ = p->depth_;
827 					return;
828 				}
829 				p = p->peer_;
830 			} while (p != bp);
831 		}
832 	}
833 
834 	reset_cutoff(ifd);
835 }
836 
837 /*
838  * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
839  */
840 
841 static int
842 rmc_satisfied(struct rm_class *cl, struct timeval *now)
843 {
844 	rm_class_t	*p;
845 
846 	if (cl == NULL)
847 		return (1);
848 	if (TV_LT(now, &cl->undertime_))
849 		return (1);
850 	if (cl->depth_ == 0) {
851 		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
852 			return (0);
853 		else
854 			return (1);
855 	}
856 	if (cl->children_ != NULL) {
857 		p = cl->children_;
858 		while (p != NULL) {
859 			if (!rmc_satisfied(p, now))
860 				return (0);
861 			p = p->next_;
862 		}
863 	}
864 
865 	return (1);
866 }
867 
868 /*
869  * Return 1 if class 'cl' is under limit or can borrow from a parent,
870  * 0 if overlimit.  As a side-effect, this routine will invoke the
871  * class overlimit action if the class is overlimit.
872  */
873 
874 static int
875 rmc_under_limit(struct rm_class *cl, struct timeval *now)
876 {
877 	rm_class_t	*p = cl;
878 	rm_class_t	*top;
879 	struct rm_ifdat	*ifd = cl->ifdat_;
880 
881 	ifd->borrowed_[ifd->qi_] = NULL;
882 	/*
883 	 * If cl is the root class, then always return that it is
884 	 * underlimit.  Otherwise, check to see if the class is underlimit.
885 	 */
886 	if (cl->parent_ == NULL)
887 		return (1);
888 
889 	if (cl->sleeping_) {
890 		if (TV_LT(now, &cl->undertime_))
891 			return (0);
892 
893 		CALLOUT_STOP(&cl->callout_);
894 		cl->sleeping_ = 0;
895 		cl->undertime_.tv_sec = 0;
896 		return (1);
897 	}
898 
899 	top = NULL;
900 	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
901 		if (((cl = cl->borrow_) == NULL) ||
902 		    (cl->depth_ > ifd->cutoff_)) {
903 #ifdef ADJUST_CUTOFF
904 			if (cl != NULL)
905 				/* cutoff is taking effect, just
906 				   return false without calling
907 				   the delay action. */
908 				return (0);
909 #endif
910 #ifdef BORROW_OFFTIME
911 			/*
912 			 * check if the class can borrow offtime too.
913 			 * borrow offtime from the top of the borrow
914 			 * chain if the top class is not overloaded.
915 			 */
916 			if (cl != NULL) {
917 				/* cutoff is taking effect, use this class as top. */
918 				top = cl;
919 				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
920 			}
921 			if (top != NULL && top->avgidle_ == top->minidle_)
922 				top = NULL;
923 			p->overtime_ = *now;
924 			(p->overlimit)(p, top);
925 #else
926 			p->overtime_ = *now;
927 			(p->overlimit)(p, NULL);
928 #endif
929 			return (0);
930 		}
931 		top = cl;
932 	}
933 
934 	if (cl != p)
935 		ifd->borrowed_[ifd->qi_] = cl;
936 	return (1);
937 }
938 
939 /*
940  * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to
941  *	Packet-by-packet round robin.
942  *
943  * The heart of the weighted round-robin scheduler, which decides which
944  * class next gets to send a packet.  Highest priority first, then
945  * weighted round-robin within priorites.
946  *
947  * Each able-to-send class gets to send until its byte allocation is
948  * exhausted.  Thus, the active pointer is only changed after a class has
949  * exhausted its allocation.
950  *
951  * If the scheduler finds no class that is underlimit or able to borrow,
952  * then the first class found that had a nonzero queue and is allowed to
953  * borrow gets to send.
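 *
 * Illustrative example (hypothetical numbers): two classes at the same
 * priority with w_allotment_ values of 3000 and 1000 bytes, both
 * sending 1500-byte packets.  Per round the first class sends two
 * packets before its bytes_alloc_ is exhausted while the second sends
 * one, carrying its deficit forward, so the bandwidth converges to a
 * roughly 3:1 split over time.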
954  */
955 
956 static mbuf_t *
957 _rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
958 {
959 	struct rm_class	*cl = NULL, *first = NULL;
960 	u_int		 deficit;
961 	int		 cpri;
962 	mbuf_t		*m;
963 	struct timeval	 now;
964 
965 	RM_GETTIME(now);
966 
967 	/*
968 	 * if the driver polls the top of the queue and then removes
969 	 * the polled packet, we must return the same packet.
970 	 */
971 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
972 		cl = ifd->pollcache_;
973 		cpri = cl->pri_;
974 		if (ifd->efficient_) {
975 			/* check if this class is overlimit */
976 			if (cl->undertime_.tv_sec != 0 &&
977 			    rmc_under_limit(cl, &now) == 0)
978 				first = cl;
979 		}
980 		ifd->pollcache_ = NULL;
981 		goto _wrr_out;
982 	}
983 	else {
984 		/* mode == ALTDQ_POLL || pollcache == NULL */
985 		ifd->pollcache_ = NULL;
986 		ifd->borrowed_[ifd->qi_] = NULL;
987 	}
988 #ifdef ADJUST_CUTOFF
989  _again:
990 #endif
991 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
992 		if (ifd->na_[cpri] == 0)
993 			continue;
994 		deficit = 0;
995 		/*
996 		 * Loop through twice for a priority level, if some class
997 		 * was unable to send a packet the first round because
998 		 * of the weighted round-robin mechanism.
999 		 * During the second loop at this level, deficit==2.
1000 		 * (This second loop is not needed if for every class,
1001 		 * "M[cl->pri_]" times "cl->allotment" is greater than
1002 		 * the byte size for the largest packet in the class.)
1003 		 */
1004  _wrr_loop:
1005 		cl = ifd->active_[cpri];
1006 		ASSERT(cl != NULL);
1007 		do {
1008 			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
1009 				cl->bytes_alloc_ += cl->w_allotment_;
1010 			if (!qempty(cl->q_)) {
1011 				if ((cl->undertime_.tv_sec == 0) ||
1012 				    rmc_under_limit(cl, &now)) {
1013 					if (cl->bytes_alloc_ > 0 || deficit > 1)
1014 						goto _wrr_out;
1015 
1016 					/* underlimit but no alloc */
1017 					deficit = 1;
1018 #if 1
1019 					ifd->borrowed_[ifd->qi_] = NULL;
1020 #endif
1021 				}
1022 				else if (first == NULL && cl->borrow_ != NULL)
1023 					first = cl; /* borrowing candidate */
1024 			}
1025 
1026 			cl->bytes_alloc_ = 0;
1027 			cl = cl->peer_;
1028 		} while (cl != ifd->active_[cpri]);
1029 
1030 		if (deficit == 1) {
1031 			/* first loop found an underlimit class with deficit */
1032 			/* Loop on same priority level, with new deficit.  */
1033 			deficit = 2;
1034 			goto _wrr_loop;
1035 		}
1036 	}
1037 
1038 #ifdef ADJUST_CUTOFF
1039 	/*
1040 	 * no underlimit class found.  if cutoff is taking effect,
1041 	 * increase cutoff and try again.
1042 	 */
1043 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1044 		ifd->cutoff_++;
1045 		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
1046 		goto _again;
1047 	}
1048 #endif /* ADJUST_CUTOFF */
1049 	/*
1050 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
1051 	 * class we encounter will send a packet if all the classes
1052 	 * of the link-sharing structure are overlimit.
1053 	 */
1054 	reset_cutoff(ifd);
1055 	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);
1056 
1057 	if (!ifd->efficient_ || first == NULL)
1058 		return (NULL);
1059 
1060 	cl = first;
1061 	cpri = cl->pri_;
1062 #if 0	/* too time-consuming for nothing */
1063 	if (cl->sleeping_)
1064 		CALLOUT_STOP(&cl->callout_);
1065 	cl->sleeping_ = 0;
1066 	cl->undertime_.tv_sec = 0;
1067 #endif
1068 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
1069 	ifd->cutoff_ = cl->borrow_->depth_;
1070 
1071 	/*
1072 	 * Dequeue the packet and do the bookkeeping...
1073 	 */
1074  _wrr_out:
1075 	if (op == ALTDQ_REMOVE) {
1076 		m = _rmc_getq(cl);
1077 		if (m == NULL)
1078 			panic("_rmc_wrr_dequeue_next");
1079 		if (qempty(cl->q_))
1080 			ifd->na_[cpri]--;
1081 
1082 		/*
1083 		 * Update class statistics and link data.
1084 		 */
1085 		if (cl->bytes_alloc_ > 0)
1086 			cl->bytes_alloc_ -= m_pktlen(m);
1087 
1088 		if ((cl->bytes_alloc_ <= 0) || first == cl)
1089 			ifd->active_[cl->pri_] = cl->peer_;
1090 		else
1091 			ifd->active_[cl->pri_] = cl;
1092 
1093 		ifd->class_[ifd->qi_] = cl;
1094 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
1095 		ifd->now_[ifd->qi_] = now;
1096 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1097 		ifd->queued_++;
1098 	} else {
1099 		/* mode == ALTDQ_POLL */
1100 		m = _rmc_pollq(cl);
1101 		ifd->pollcache_ = cl;
1102 	}
1103 	return (m);
1104 }
1105 
1106 /*
1107  * Dequeue & return next packet from the highest priority class that
1108  * has a packet to send & has enough allocation to send it.  This
1109  * routine is called by a driver whenever it needs a new packet to
1110  * output.
1111  */
1112 static mbuf_t *
1113 _rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
1114 {
1115 	mbuf_t		*m;
1116 	int		 cpri;
1117 	struct rm_class	*cl, *first = NULL;
1118 	struct timeval	 now;
1119 
1120 	RM_GETTIME(now);
1121 
1122 	/*
1123 	 * if the driver polls the top of the queue and then removes
1124 	 * the polled packet, we must return the same packet.
1125 	 */
1126 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
1127 		cl = ifd->pollcache_;
1128 		cpri = cl->pri_;
1129 		ifd->pollcache_ = NULL;
1130 		goto _prr_out;
1131 	} else {
1132 		/* mode == ALTDQ_POLL || pollcache == NULL */
1133 		ifd->pollcache_ = NULL;
1134 		ifd->borrowed_[ifd->qi_] = NULL;
1135 	}
1136 #ifdef ADJUST_CUTOFF
1137  _again:
1138 #endif
1139 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
1140 		if (ifd->na_[cpri] == 0)
1141 			continue;
1142 		cl = ifd->active_[cpri];
1143 		ASSERT(cl != NULL);
1144 		do {
1145 			if (!qempty(cl->q_)) {
1146 				if ((cl->undertime_.tv_sec == 0) ||
1147 				    rmc_under_limit(cl, &now))
1148 					goto _prr_out;
1149 				if (first == NULL && cl->borrow_ != NULL)
1150 					first = cl;
1151 			}
1152 			cl = cl->peer_;
1153 		} while (cl != ifd->active_[cpri]);
1154 	}
1155 
1156 #ifdef ADJUST_CUTOFF
1157 	/*
1158 	 * no underlimit class found.  if cutoff is taking effect, increase
1159 	 * cutoff and try again.
1160 	 */
1161 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1162 		ifd->cutoff_++;
1163 		goto _again;
1164 	}
1165 #endif /* ADJUST_CUTOFF */
1166 	/*
1167 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
1168 	 * class we encounter will send a packet if all the classes
1169 	 * of the link-sharing structure are overlimit.
1170 	 */
1171 	reset_cutoff(ifd);
1172 	if (!ifd->efficient_ || first == NULL)
1173 		return (NULL);
1174 
1175 	cl = first;
1176 	cpri = cl->pri_;
1177 #if 0	/* too time-consuming for nothing */
1178 	if (cl->sleeping_)
1179 		CALLOUT_STOP(&cl->callout_);
1180 	cl->sleeping_ = 0;
1181 	cl->undertime_.tv_sec = 0;
1182 #endif
1183 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
1184 	ifd->cutoff_ = cl->borrow_->depth_;
1185 
1186 	/*
1187 	 * Dequeue the packet and do the bookkeeping...
1188 	 */
1189  _prr_out:
1190 	if (op == ALTDQ_REMOVE) {
1191 		m = _rmc_getq(cl);
1192 		if (m == NULL)
1193 			panic("_rmc_prr_dequeue_next");
1194 		if (qempty(cl->q_))
1195 			ifd->na_[cpri]--;
1196 
1197 		ifd->active_[cpri] = cl->peer_;
1198 
1199 		ifd->class_[ifd->qi_] = cl;
1200 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
1201 		ifd->now_[ifd->qi_] = now;
1202 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1203 		ifd->queued_++;
1204 	} else {
1205 		/* mode == ALTDQ_POLL */
1206 		m = _rmc_pollq(cl);
1207 		ifd->pollcache_ = cl;
1208 	}
1209 	return (m);
1210 }
1211 
1212 /*
1213  * mbuf_t *
1214  * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function
1215  *	is invoked by the packet driver to get the next packet to be
1216  *	dequeued and output on the link.  If WRR is enabled, then the
1217  *	WRR dequeue next routine will determine the next packet to be sent.
1218  *	Otherwise, packet-by-packet round robin is invoked.
1219  *
1220  *	Returns:	NULL, if a packet is not available or if all
1221  *			classes are overlimit.
1222  *
1223  *			Otherwise, a pointer to the next packet.
1224  */
1225 
1226 mbuf_t *
1227 rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
1228 {
1229 	if (ifd->queued_ >= ifd->maxqueued_)
1230 		return (NULL);
1231 	else if (ifd->wrr_)
1232 		return (_rmc_wrr_dequeue_next(ifd, mode));
1233 	else
1234 		return (_rmc_prr_dequeue_next(ifd, mode));
1235 }
1236 
1237 /*
1238  * Update the utilization estimate for the packet that just completed.
1239  * The packet's class & the parent(s) of that class all get their
1240  * estimators updated.  This routine is called by the driver's output-
1241  * packet-completion interrupt service routine.
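 *
 * The estimator is the usual exponentially weighted moving average
 * (shown here for illustration):
 *	avgidle <- (1 - 2^-RM_FILTER_GAIN) * avgidle + 2^-RM_FILTER_GAIN * idle
 * the code keeps avgidle scaled up by 2^RM_FILTER_GAIN, so the update
 * below reads: avgidle += idle - (avgidle >> RM_FILTER_GAIN).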
1242  */
1243 
1244 /*
1245  * a macro to approximate "divide by 1000" that gives 0.000999,
1246  * if a value has enough effective digits.
1247  * (on pentium, mul takes 9 cycles but div takes 46!)
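 * e.g., (t>>10) + (t>>16) + (t>>17) = t * (1/1024 + 1/65536 + 1/131072)
 * =~ t * 0.0009994; for t = 1000000 ns it gives 976 + 15 + 7 = 998,
 * vs. the exact 1000 usec (about 0.2% low).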
1248  */
1249 #define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
1250 void
1251 rmc_update_class_util(struct rm_ifdat *ifd)
1252 {
1253 	int		 idle, avgidle, pktlen;
1254 	int		 pkt_time, tidle;
1255 	rm_class_t	*cl, *borrowed;
1256 	rm_class_t	*borrows;
1257 	struct timeval	*nowp;
1258 
1259 	/*
1260 	 * Get the most recently completed class.
1261 	 */
1262 	if ((cl = ifd->class_[ifd->qo_]) == NULL)
1263 		return;
1264 
1265 	pktlen = ifd->curlen_[ifd->qo_];
1266 	borrowed = ifd->borrowed_[ifd->qo_];
1267 	borrows = borrowed;
1268 
1269 	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1270 
1271 	/*
1272 	 * Run estimator on class and its ancestors.
1273 	 */
1274 	/*
1275 	 * rmc_update_class_util is designed to be called when the
1276 	 * transfer is completed, from an xmit complete interrupt,
1277 	 * but most drivers don't implement an upcall for that.
1278 	 * so, just use estimated completion time.
1279 	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
1280 	 */
1281 	nowp = &ifd->now_[ifd->qo_];
1282 	/* get pkt_time (for link) in usec */
1283 #if 1  /* use approximation */
1284 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
1285 	pkt_time = NSEC_TO_USEC(pkt_time);
1286 #else
1287 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
1288 #endif
1289 #if 1 /* ALTQ4PPP */
1290 	if (TV_LT(nowp, &ifd->ifnow_)) {
1291 		int iftime;
1292 
1293 		/*
1294 		 * make sure the estimated completion time does not go
1295 		 * too far.  it can happen when the link layer supports
1296 		 * data compression or the interface speed is set to
1297 		 * a much lower value.
1298 		 */
1299 		TV_DELTA(&ifd->ifnow_, nowp, iftime);
1300 		if (iftime+pkt_time < ifd->maxiftime_) {
1301 			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1302 		} else {
1303 			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
1304 		}
1305 	} else {
1306 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1307 	}
1308 #else
1309 	if (TV_LT(nowp, &ifd->ifnow_)) {
1310 		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1311 	} else {
1312 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1313 	}
1314 #endif
1315 
1316 	while (cl != NULL) {
1317 		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
1318 		if (idle >= 2000000)
1319 			/*
1320 			 * this class is idle enough, reset avgidle.
1321 			 * (TV_DELTA returns 2000000 us when delta is large.)
1322 			 */
1323 			cl->avgidle_ = cl->maxidle_;
1324 
1325 		/* get pkt_time (for class) in usec */
1326 #if 1  /* use approximation */
1327 		pkt_time = pktlen * cl->ns_per_byte_;
1328 		pkt_time = NSEC_TO_USEC(pkt_time);
1329 #else
1330 		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
1331 #endif
1332 		idle -= pkt_time;
1333 
1334 		avgidle = cl->avgidle_;
1335 		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
1336 		cl->avgidle_ = avgidle;
1337 
1338 		/* Are we overlimit ? */
1339 		if (avgidle <= 0) {
1340 			CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
1341 #if 1 /* ALTQ */
1342 			/*
1343 			 * need some lower bound for avgidle, otherwise
1344 			 * a borrowing class gets unbounded penalty.
1345 			 */
1346 			if (avgidle < cl->minidle_)
1347 				avgidle = cl->avgidle_ = cl->minidle_;
1348 #endif
1349 			/* set next idle to make avgidle 0 */
1350 			tidle = pkt_time +
1351 				(((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
1352 			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
1353 			++cl->stats_.over;
1354 		} else {
1355 			cl->avgidle_ =
1356 			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
1357 			cl->undertime_.tv_sec = 0;
1358 			if (cl->sleeping_) {
1359 				CALLOUT_STOP(&cl->callout_);
1360 				cl->sleeping_ = 0;
1361 			}
1362 		}
1363 
1364 		if (borrows != NULL) {
1365 			if (borrows != cl)
1366 				++cl->stats_.borrows;
1367 			else
1368 				borrows = NULL;
1369 		}
1370 		cl->last_ = ifd->ifnow_;
1371 		cl->last_pkttime_ = pkt_time;
1372 
1373 #if 1
1374 		if (cl->parent_ == NULL) {
1375 			/* take stats of root class */
1376 			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1377 		}
1378 #endif
1379 
1380 		cl = cl->parent_;
1381 	}
1382 
1383 	/*
1384 	 * Check to see if cutoff needs to be set to a new level.
1385 	 */
1386 	cl = ifd->class_[ifd->qo_];
1387 	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
1388 #if 1 /* ALTQ */
1389 		if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
1390 			rmc_tl_satisfied(ifd, nowp);
1391 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1392 		} else {
1393 			ifd->cutoff_ = borrowed->depth_;
1394 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1395 		}
1396 #else /* !ALTQ */
1397 		if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
1398 			reset_cutoff(ifd);
1399 #ifdef notdef
1400 			rmc_tl_satisfied(ifd, &now);
1401 #endif
1402 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1403 		} else {
1404 			ifd->cutoff_ = borrowed->depth_;
1405 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1406 		}
1407 #endif /* !ALTQ */
1408 	}
1409 
1410 	/*
1411 	 * Release class slot
1412 	 */
1413 	ifd->borrowed_[ifd->qo_] = NULL;
1414 	ifd->class_[ifd->qo_] = NULL;
1415 	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
1416 	ifd->queued_--;
1417 }
1418 
1419 /*
1420  * void
1421  * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
1422  *	over-limit action routines.  These get invoked by rmc_under_limit()
1423  *	if a class with packets to send is over its bandwidth limit & can't
1424  *	borrow from a parent class.
1425  *
1426  *	Returns: NONE
1427  */
1428 
1429 static void
1430 rmc_drop_action(struct rm_class *cl)
1431 {
1432 	struct rm_ifdat	*ifd = cl->ifdat_;
1433 
1434 	ASSERT(qlen(cl->q_) > 0);
1435 	_rmc_dropq(cl);
1436 	if (qempty(cl->q_))
1437 		ifd->na_[cl->pri_]--;
1438 }
1439 
1440 void rmc_dropall(struct rm_class *cl)
1441 {
1442 	struct rm_ifdat	*ifd = cl->ifdat_;
1443 
1444 	if (!qempty(cl->q_)) {
1445 		_flushq(cl->q_);
1446 
1447 		ifd->na_[cl->pri_]--;
1448 	}
1449 }
1450 
1451 #if (__FreeBSD_version > 300000)
1452 /* hzto() was removed in FreeBSD-3.0 */
1453 static int hzto(struct timeval *);
1454 
1455 static int
1456 hzto(struct timeval *tv)
1458 {
1459 	struct timeval t2;
1460 
1461 	getmicrotime(&t2);
1462 	t2.tv_sec = tv->tv_sec - t2.tv_sec;
1463 	t2.tv_usec = tv->tv_usec - t2.tv_usec;
1464 	return (tvtohz(&t2));
1465 }
1466 #endif /* __FreeBSD_version > 300000 */
1467 
1468 /*
1469  * void
1470  * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
1471  *	delay action routine.  It is invoked via rmc_under_limit when the
1472  *	class is discovered to be overlimit.
1473  *
1474  *	If the delay action is the result of the borrow class being overlimit, then
1475  *	delay for the offtime of the borrowing class that is overlimit.
1476  *
1477  *	Returns: NONE
1478  */
1479 
1480 void
1481 rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
1482 {
1483 	int	delay, t, extradelay;
1484 
1485 	cl->stats_.overactions++;
1486 	TV_DELTA(&cl->undertime_, &cl->overtime_, delay);
1487 #ifndef BORROW_OFFTIME
1488 	delay += cl->offtime_;
1489 #endif
1490 
1491 	if (!cl->sleeping_) {
1492 		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
1493 #ifdef BORROW_OFFTIME
1494 		if (borrow != NULL)
1495 			extradelay = borrow->offtime_;
1496 		else
1497 #endif
1498 			extradelay = cl->offtime_;
1499 
1500 #ifdef ALTQ
1501 		/*
1502 		 * XXX recalculate suspend time:
1503 		 * current undertime is (tidle + pkt_time) calculated
1504 		 * from the last transmission.
1505 		 *	tidle: time required to bring avgidle back to 0
1506 		 *	pkt_time: target waiting time for this class
1507 		 * we need to replace pkt_time by offtime
1508 		 */
1509 		extradelay -= cl->last_pkttime_;
1510 #endif
1511 		if (extradelay > 0) {
1512 			TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
1513 			delay += extradelay;
1514 		}
1515 
1516 		cl->sleeping_ = 1;
1517 		cl->stats_.delays++;
1518 
1519 		/*
1520 		 * Since packets are phased randomly with respect to the
1521 		 * clock, 1 tick (the next clock tick) can be an arbitrarily
1522 		 * short time so we have to wait for at least two ticks.
1523 		 * NOTE:  If there's no other traffic, we need the timer as
1524 		 * a 'backstop' to restart this class.
1525 		 */
1526 		if (delay > tick * 2) {
1527 			/* FreeBSD rounds up the tick */
1528 			t = hzto(&cl->undertime_);
1529 		} else
1530 			t = 2;
1531 		CALLOUT_RESET(&cl->callout_, t, rmc_restart, cl);
1532 	}
1533 }
1534 
1535 /*
1536  * void
1537  * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
1538  *	called by the system timer code & is responsible for checking if the
1539  *	class is still sleeping (it might have been restarted as a side
1540  *	effect of the queue scan on a packet arrival) and, if so, restarting
1541  *	output for the class.  Inspecting the class state & restarting output
1542  *	require locking the class structure.  In general the driver is
1543  *	responsible for locking, but this is the only routine that is not
1544  *	called directly or indirectly from the interface driver, so it has
1545  *	to know about system locking conventions.  Under BSD, locking is done
1546  *	by raising the IPL with splnet, so that's what's implemented here.  On a
1547  *	different system this would probably need to be changed.
1548  *
1549  *	Returns:	NONE
1550  */
1551 
1552 static void
1553 rmc_restart(void *arg)
1554 {
1555 	struct rm_class *cl = arg;
1556 	struct rm_ifdat	*ifd = cl->ifdat_;
1557 	struct epoch_tracker et;
1558 	int		 s;
1559 
1560 	s = splnet();
1561 	NET_EPOCH_ENTER(et);
1562 	IFQ_LOCK(ifd->ifq_);
1563 	CURVNET_SET(ifd->ifq_->altq_ifp->if_vnet);
1564 	if (cl->sleeping_) {
1565 		cl->sleeping_ = 0;
1566 		cl->undertime_.tv_sec = 0;
1567 
1568 		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
1569 			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
1570 			(ifd->restart)(ifd->ifq_);
1571 		}
1572 	}
1573 	CURVNET_RESTORE();
1574 	IFQ_UNLOCK(ifd->ifq_);
1575 	NET_EPOCH_EXIT(et);
1576 	splx(s);
1577 }
1578 
1579 /*
1580  * void
1581  * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit
1582  *	handling routine for the root class of the link sharing structure.
1583  *
1584  *	Returns: NONE
1585  */
1586 
1587 static void
1588 rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow)
1589 {
1590 	panic("rmc_root_overlimit");
1591 }
1592 
1593 /*
1594  * Packet queue handling routines.  Eventually, this is intended to
1595  *	localize the effects on the code of whether queues are RED
1596  *	queues or drop-tail queues.
1597  */
1598 
1599 static int
1600 _rmc_addq(rm_class_t *cl, mbuf_t *m)
1601 {
1602 #ifdef ALTQ_RIO
1603 	if (q_is_rio(cl->q_))
1604 		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
1605 #endif
1606 #ifdef ALTQ_RED
1607 	if (q_is_red(cl->q_))
1608 		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
1609 #endif /* ALTQ_RED */
1610 #ifdef ALTQ_CODEL
1611 	if (q_is_codel(cl->q_))
1612 		return codel_addq(cl->codel_, cl->q_, m);
1613 #endif
1614 
1615 	if (cl->flags_ & RMCF_CLEARDSCP)
1616 		write_dsfield(m, cl->pktattr_, 0);
1617 
1618 	_addq(cl->q_, m);
1619 	return (0);
1620 }
1621 
1622 /* note: _rmc_dropq is not called for red */
1623 static void
1624 _rmc_dropq(rm_class_t *cl)
1625 {
1626 	mbuf_t	*m;
1627 
1628 	if ((m = _getq(cl->q_)) != NULL)
1629 		m_freem(m);
1630 }
1631 
1632 static mbuf_t *
1633 _rmc_getq(rm_class_t *cl)
1634 {
1635 #ifdef ALTQ_RIO
1636 	if (q_is_rio(cl->q_))
1637 		return rio_getq((rio_t *)cl->red_, cl->q_);
1638 #endif
1639 #ifdef ALTQ_RED
1640 	if (q_is_red(cl->q_))
1641 		return red_getq(cl->red_, cl->q_);
1642 #endif
1643 #ifdef ALTQ_CODEL
1644 	if (q_is_codel(cl->q_))
1645 		return codel_getq(cl->codel_, cl->q_);
1646 #endif
1647 	return _getq(cl->q_);
1648 }
1649 
1650 static mbuf_t *
1651 _rmc_pollq(rm_class_t *cl)
1652 {
1653 	return qhead(cl->q_);
1654 }
1655 
1656 #ifdef CBQ_TRACE
1657 
1658 struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
1659 struct cbqtrace		*cbqtrace_ptr = NULL;
1660 int			 cbqtrace_count;
1661 
1662 /*
1663  * DDB hook to trace cbq events:
1664  *  the last 1024 events are held in a circular buffer.
1665  *  use "call cbqtrace_dump(N)" to display 20 events from the Nth event.
1666  */
1667 void cbqtrace_dump(int);
1668 static char *rmc_funcname(void *);
1669 
1670 static struct rmc_funcs {
1671 	void	*func;
1672 	char	*name;
1673 } rmc_funcs[] =
1674 {
1675 	rmc_init,		"rmc_init",
1676 	rmc_queue_packet,	"rmc_queue_packet",
1677 	rmc_under_limit,	"rmc_under_limit",
1678 	rmc_update_class_util,	"rmc_update_class_util",
1679 	rmc_delay_action,	"rmc_delay_action",
1680 	rmc_restart,		"rmc_restart",
1681 	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
1682 	NULL,			NULL
1683 };
1684 
1685 static char *rmc_funcname(void *func)
1686 {
1687 	struct rmc_funcs *fp;
1688 
1689 	for (fp = rmc_funcs; fp->func != NULL; fp++)
1690 		if (fp->func == func)
1691 			return (fp->name);
1692 	return ("unknown");
1693 }
1694 
1695 void cbqtrace_dump(int counter)
1696 {
1697 	int	 i, *p;
1698 	char	*cp;
1699 
1700 	counter = counter % NCBQTRACE;
1701 	p = (int *)&cbqtrace_buffer[counter];
1702 
1703 	for (i=0; i<20; i++) {
1704 		printf("[0x%x] ", *p++);
1705 		printf("%s: ", rmc_funcname((void *)*p++));
1706 		cp = (char *)p++;
1707 		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
1708 		printf("%d\n",*p++);
1709 
1710 		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
1711 			p = (int *)cbqtrace_buffer;
1712 	}
1713 }
1714 #endif /* CBQ_TRACE */
1715 #endif /* ALTQ_CBQ */
1716 
1717 #if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || \
1718     defined(ALTQ_HFSC) || defined(ALTQ_PRIQ) || defined(ALTQ_CODEL)
1719 #if !defined(__GNUC__) || defined(ALTQ_DEBUG)
1720 
1721 void
1722 _addq(class_queue_t *q, mbuf_t *m)
1723 {
1724 	mbuf_t	*m0;
1725 
1726 	if ((m0 = qtail(q)) != NULL)
1727 		m->m_nextpkt = m0->m_nextpkt;
1728 	else
1729 		m0 = m;
1730 	m0->m_nextpkt = m;
1731 	qtail(q) = m;
1732 	qlen(q)++;
1733 }
1734 
1735 mbuf_t *
1736 _getq(class_queue_t *q)
1737 {
1738 	mbuf_t	*m, *m0;
1739 
1740 	if ((m = qtail(q)) == NULL)
1741 		return (NULL);
1742 	if ((m0 = m->m_nextpkt) != m)
1743 		m->m_nextpkt = m0->m_nextpkt;
1744 	else {
1745 		ASSERT(qlen(q) == 1);
1746 		qtail(q) = NULL;
1747 	}
1748 	qlen(q)--;
1749 	m0->m_nextpkt = NULL;
1750 	return (m0);
1751 }
1752 
1753 /* drop a packet at the tail of the queue */
1754 mbuf_t *
1755 _getq_tail(class_queue_t *q)
1756 {
1757 	mbuf_t	*m, *m0, *prev;
1758 
1759 	if ((m = m0 = qtail(q)) == NULL)
1760 		return NULL;
1761 	do {
1762 		prev = m0;
1763 		m0 = m0->m_nextpkt;
1764 	} while (m0 != m);
1765 	prev->m_nextpkt = m->m_nextpkt;
1766 	if (prev == m)  {
1767 		ASSERT(qlen(q) == 1);
1768 		qtail(q) = NULL;
1769 	} else
1770 		qtail(q) = prev;
1771 	qlen(q)--;
1772 	m->m_nextpkt = NULL;
1773 	return (m);
1774 }
1775 
1776 /* randomly select a packet in the queue */
1777 mbuf_t *
1778 _getq_random(class_queue_t *q)
1779 {
1780 	struct mbuf	*m;
1781 	int		 i, n;
1782 
1783 	if ((m = qtail(q)) == NULL)
1784 		return NULL;
1785 	if (m->m_nextpkt == m) {
1786 		ASSERT(qlen(q) == 1);
1787 		qtail(q) = NULL;
1788 	} else {
1789 		struct mbuf *prev = NULL;
1790 
1791 		n = arc4random() % qlen(q) + 1;
1792 		for (i = 0; i < n; i++) {
1793 			prev = m;
1794 			m = m->m_nextpkt;
1795 		}
1796 		prev->m_nextpkt = m->m_nextpkt;
1797 		if (m == qtail(q))
1798 			qtail(q) = prev;
1799 	}
1800 	qlen(q)--;
1801 	m->m_nextpkt = NULL;
1802 	return (m);
1803 }
1804 
1805 void
1806 _removeq(class_queue_t *q, mbuf_t *m)
1807 {
1808 	mbuf_t	*m0, *prev;
1809 
1810 	m0 = qtail(q);
1811 	do {
1812 		prev = m0;
1813 		m0 = m0->m_nextpkt;
1814 	} while (m0 != m);
1815 	prev->m_nextpkt = m->m_nextpkt;
1816 	if (prev == m)
1817 		qtail(q) = NULL;
1818 	else if (qtail(q) == m)
1819 		qtail(q) = prev;
1820 	qlen(q)--;
1821 }
1822 
1823 void
1824 _flushq(class_queue_t *q)
1825 {
1826 	mbuf_t *m;
1827 
1828 	while ((m = _getq(q)) != NULL)
1829 		m_freem(m);
1830 	ASSERT(qlen(q) == 0);
1831 }
1832 
1833 #endif /* !__GNUC__ || ALTQ_DEBUG */
1834 #endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */
1835