xref: /freebsd/sys/net/altq/altq_rmclass.c (revision 2f9966ff63d65bd474478888c9088eeae3f9c669)
1 /*-
2  * Copyright (c) 1991-1997 Regents of the University of California.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by the Network Research
16  *      Group at Lawrence Berkeley Laboratory.
17  * 4. Neither the name of the University nor of the Laboratory may be used
18  *    to endorse or promote products derived from this software without
19  *    specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * LBL code modified by speer@eng.sun.com, May 1977.
34  * For questions and/or comments, please send mail to cbq@ee.lbl.gov
35  * $KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $
36  */
37 #include "opt_altq.h"
38 #include "opt_inet.h"
39 #include "opt_inet6.h"
40 #ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
41 
42 #include <sys/param.h>
43 #include <sys/malloc.h>
44 #include <sys/mbuf.h>
45 #include <sys/socket.h>
46 #include <sys/systm.h>
47 #include <sys/errno.h>
48 #include <sys/time.h>
49 
50 #include <net/if.h>
51 #include <net/if_var.h>
52 #include <net/if_private.h>
53 
54 #include <net/altq/if_altq.h>
55 #include <net/altq/altq.h>
56 #include <net/altq/altq_codel.h>
57 #include <net/altq/altq_rmclass.h>
58 #include <net/altq/altq_rmclass_debug.h>
59 #include <net/altq/altq_red.h>
60 #include <net/altq/altq_rio.h>
61 
62 /*
63  * Local Macros
64  */
65 #define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }
66 
67 /*
68  * Local routines.
69  */
70 
71 static int	rmc_satisfied(struct rm_class *, struct timeval *);
72 static void	rmc_wrr_set_weights(struct rm_ifdat *);
73 static void	rmc_depth_compute(struct rm_class *);
74 static void	rmc_depth_recompute(rm_class_t *);
75 
76 static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
77 static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);
78 
79 static int	_rmc_addq(rm_class_t *, mbuf_t *);
80 static void	_rmc_dropq(rm_class_t *);
81 static mbuf_t	*_rmc_getq(rm_class_t *);
82 static mbuf_t	*_rmc_pollq(rm_class_t *);
83 
84 static int	rmc_under_limit(struct rm_class *, struct timeval *);
85 static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
86 static void	rmc_drop_action(struct rm_class *);
87 static void	rmc_restart(void *);
88 static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);
89 
90 #define	BORROW_OFFTIME
91 /*
92  * BORROW_OFFTIME (experimental):
93  * borrow the offtime of the class borrowing from.
94  * the reason is that when its own offtime is set, the class is unable
95  * to borrow much, especially when cutoff is taking effect.
96  * but when the borrowed class is overloaded (advidle is close to minidle),
97  * use the borrowing class's offtime to avoid overload.
98  */
99 #define	ADJUST_CUTOFF
100 /*
101  * ADJUST_CUTOFF (experimental):
102  * if no underlimit class is found due to cutoff, increase cutoff and
103  * retry the scheduling loop.
104  * also, don't invoke delay_actions while cutoff is taking effect,
105  * since a sleeping class won't have a chance to be scheduled in the
106  * next loop.
107  *
108  * now heuristics for setting the top-level variable (cutoff_) becomes:
109  *	1. if a packet arrives for a not-overlimit class, set cutoff
110  *	   to the depth of the class.
111  *	2. if cutoff is i, and a packet arrives for an overlimit class
112  *	   with an underlimit ancestor at a lower level than i (say j),
113  *	   then set cutoff to j.
114  *	3. at scheduling a packet, if there is no underlimit class
115  *	   due to the current cutoff level, increase cutoff by 1 and
116  *	   then try to schedule again.
117  */
118 
119 /*
120  * rm_class_t *
121  * rmc_newclass(...) - Create a new resource management class at priority
122  * 'pri' on the interface given by 'ifd'.
123  *
124  * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
125  *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
126  *              than 100% of the bandwidth, this number should be the
127  *              'effective' rate for the class.  Let f be the
128  *              bandwidth fraction allocated to this class, and let
129  *              nsPerByte be the data rate of the output link in
130  *              nanoseconds/byte.  Then nsecPerByte is set to
131  *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
132  *              for a class that gets 50% of an ethernet's bandwidth.
133  *
134  * action       the routine to call when the class is over limit.
135  *
136  * maxq         max allowable queue size for class (in packets).
137  *
138  * parent       parent class pointer.
139  *
140  * borrow       class to borrow from (should be either 'parent' or null).
141  *
142  * maxidle      max value allowed for class 'idle' time estimate (this
143  *              parameter determines how large an initial burst of packets
144  *              can be before overlimit action is invoked.
145  *
146  * offtime      how long 'delay' action will delay when class goes over
147  *              limit (this parameter determines the steady-state burst
148  *              size when a class is running over its limit).
149  *
150  * Maxidle and offtime have to be computed from the following:  If the
151  * average packet size is s, the bandwidth fraction allocated to this
152  * class is f, we want to allow b packet bursts, and the gain of the
153  * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
154  *
155  *   ptime = s * nsPerByte * (1 - f) / f
156  *   maxidle = ptime * (1 - g^b) / g^b
157  *   minidle = -ptime * (1 / (f - 1))
158  *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1)
159  *
160  * Operationally, it's convenient to specify maxidle & offtime in units
161  * independent of the link bandwidth so the maxidle & offtime passed to
162  * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
163  * (The constant factor is a scale factor needed to make the parameters
164  * integers.  This scaling also means that the 'unscaled' values of
165  * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
166  * not nanoseconds.)  Also note that the 'idle' filter computation keeps
167  * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
168  * maxidle also must be scaled upward by this value.  Thus, the passed
169  * values for maxidle and offtime can be computed as follows:
170  *
171  * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
172  * offtime = offtime * 8 / (1000 * nsecPerByte)
173  *
174  * When USE_HRTIME is employed, then maxidle and offtime become:
175  * 	maxidle = maxilde * (8.0 / nsecPerByte);
176  * 	offtime = offtime * (8.0 / nsecPerByte);
177  */
178 struct rm_class *
179 rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
180     void (*action)(rm_class_t *, rm_class_t *), int maxq,
181     struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
182     int minidle, u_int offtime, int pktsize, int flags)
183 {
184 	struct rm_class	*cl;
185 	struct rm_class	*peer;
186 	int		 s;
187 
188 	if (pri >= RM_MAXPRIO)
189 		return (NULL);
190 #ifndef ALTQ_RED
191 	if (flags & RMCF_RED) {
192 #ifdef ALTQ_DEBUG
193 		printf("rmc_newclass: RED not configured for CBQ!\n");
194 #endif
195 		return (NULL);
196 	}
197 #endif
198 #ifndef ALTQ_RIO
199 	if (flags & RMCF_RIO) {
200 #ifdef ALTQ_DEBUG
201 		printf("rmc_newclass: RIO not configured for CBQ!\n");
202 #endif
203 		return (NULL);
204 	}
205 #endif
206 #ifndef ALTQ_CODEL
207 	if (flags & RMCF_CODEL) {
208 #ifdef ALTQ_DEBUG
209 		printf("rmc_newclass: CODEL not configured for CBQ!\n");
210 #endif
211 		return (NULL);
212 	}
213 #endif
214 
215 	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_NOWAIT | M_ZERO);
216 	if (cl == NULL)
217 		return (NULL);
218 	CALLOUT_INIT(&cl->callout_);
219 	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO);
220 	if (cl->q_ == NULL) {
221 		free(cl, M_DEVBUF);
222 		return (NULL);
223 	}
224 
225 	/*
226 	 * Class initialization.
227 	 */
228 	cl->children_ = NULL;
229 	cl->parent_ = parent;
230 	cl->borrow_ = borrow;
231 	cl->leaf_ = 1;
232 	cl->ifdat_ = ifd;
233 	cl->pri_ = pri;
234 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
235 	cl->depth_ = 0;
236 	cl->qthresh_ = 0;
237 	cl->ns_per_byte_ = nsecPerByte;
238 
239 	qlimit(cl->q_) = maxq;
240 	qtype(cl->q_) = Q_DROPHEAD;
241 	qlen(cl->q_) = 0;
242 	cl->flags_ = flags;
243 
244 #if 1 /* minidle is also scaled in ALTQ */
245 	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
246 	if (cl->minidle_ > 0)
247 		cl->minidle_ = 0;
248 #else
249 	cl->minidle_ = minidle;
250 #endif
251 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
252 	if (cl->maxidle_ == 0)
253 		cl->maxidle_ = 1;
254 #if 1 /* offtime is also scaled in ALTQ */
255 	cl->avgidle_ = cl->maxidle_;
256 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
257 	if (cl->offtime_ == 0)
258 		cl->offtime_ = 1;
259 #else
260 	cl->avgidle_ = 0;
261 	cl->offtime_ = (offtime * nsecPerByte) / 8;
262 #endif
263 	cl->overlimit = action;
264 
265 #ifdef ALTQ_RED
266 	if (flags & (RMCF_RED|RMCF_RIO)) {
267 		int red_flags, red_pkttime;
268 
269 		red_flags = 0;
270 		if (flags & RMCF_ECN)
271 			red_flags |= REDF_ECN;
272 		if (flags & RMCF_FLOWVALVE)
273 			red_flags |= REDF_FLOWVALVE;
274 #ifdef ALTQ_RIO
275 		if (flags & RMCF_CLEARDSCP)
276 			red_flags |= RIOF_CLEARDSCP;
277 #endif
278 		red_pkttime = nsecPerByte * pktsize  / 1000;
279 
280 		if (flags & RMCF_RED) {
281 			cl->red_ = red_alloc(0, 0,
282 			    qlimit(cl->q_) * 10/100,
283 			    qlimit(cl->q_) * 30/100,
284 			    red_flags, red_pkttime);
285 			if (cl->red_ != NULL)
286 				qtype(cl->q_) = Q_RED;
287 		}
288 #ifdef ALTQ_RIO
289 		else {
290 			cl->red_ = (red_t *)rio_alloc(0, NULL,
291 						      red_flags, red_pkttime);
292 			if (cl->red_ != NULL)
293 				qtype(cl->q_) = Q_RIO;
294 		}
295 #endif
296 	}
297 #endif /* ALTQ_RED */
298 #ifdef ALTQ_CODEL
299 	if (flags & RMCF_CODEL) {
300 		cl->codel_ = codel_alloc(5, 100, 0);
301 		if (cl->codel_ != NULL)
302 			qtype(cl->q_) = Q_CODEL;
303 	}
304 #endif
305 
306 	/*
307 	 * put the class into the class tree
308 	 */
309 	s = splnet();
310 	IFQ_LOCK(ifd->ifq_);
311 	if ((peer = ifd->active_[pri]) != NULL) {
312 		/* find the last class at this pri */
313 		cl->peer_ = peer;
314 		while (peer->peer_ != ifd->active_[pri])
315 			peer = peer->peer_;
316 		peer->peer_ = cl;
317 	} else {
318 		ifd->active_[pri] = cl;
319 		cl->peer_ = cl;
320 	}
321 
322 	if (cl->parent_) {
323 		cl->next_ = parent->children_;
324 		parent->children_ = cl;
325 		parent->leaf_ = 0;
326 	}
327 
328 	/*
329 	 * Compute the depth of this class and its ancestors in the class
330 	 * hierarchy.
331 	 */
332 	rmc_depth_compute(cl);
333 
334 	/*
335 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
336 	 */
337 	if (ifd->wrr_) {
338 		ifd->num_[pri]++;
339 		ifd->alloc_[pri] += cl->allotment_;
340 		rmc_wrr_set_weights(ifd);
341 	}
342 	IFQ_UNLOCK(ifd->ifq_);
343 	splx(s);
344 	return (cl);
345 }
346 
347 int
348 rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
349     int minidle, u_int offtime, int pktsize)
350 {
351 	struct rm_ifdat	*ifd;
352 	u_int		 old_allotment;
353 	int		 s;
354 
355 	ifd = cl->ifdat_;
356 	old_allotment = cl->allotment_;
357 
358 	s = splnet();
359 	IFQ_LOCK(ifd->ifq_);
360 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
361 	cl->qthresh_ = 0;
362 	cl->ns_per_byte_ = nsecPerByte;
363 
364 	qlimit(cl->q_) = maxq;
365 
366 #if 1 /* minidle is also scaled in ALTQ */
367 	cl->minidle_ = (minidle * nsecPerByte) / 8;
368 	if (cl->minidle_ > 0)
369 		cl->minidle_ = 0;
370 #else
371 	cl->minidle_ = minidle;
372 #endif
373 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
374 	if (cl->maxidle_ == 0)
375 		cl->maxidle_ = 1;
376 #if 1 /* offtime is also scaled in ALTQ */
377 	cl->avgidle_ = cl->maxidle_;
378 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
379 	if (cl->offtime_ == 0)
380 		cl->offtime_ = 1;
381 #else
382 	cl->avgidle_ = 0;
383 	cl->offtime_ = (offtime * nsecPerByte) / 8;
384 #endif
385 
386 	/*
387 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
388 	 */
389 	if (ifd->wrr_) {
390 		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
391 		rmc_wrr_set_weights(ifd);
392 	}
393 	IFQ_UNLOCK(ifd->ifq_);
394 	splx(s);
395 	return (0);
396 }
397 
398 /*
399  * static void
400  * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
401  *	the appropriate run robin weights for the CBQ weighted round robin
402  *	algorithm.
403  *
404  *	Returns: NONE
405  */
406 
407 static void
408 rmc_wrr_set_weights(struct rm_ifdat *ifd)
409 {
410 	int		i;
411 	struct rm_class	*cl, *clh;
412 
413 	for (i = 0; i < RM_MAXPRIO; i++) {
414 		/*
415 		 * This is inverted from that of the simulator to
416 		 * maintain precision.
417 		 */
418 		if (ifd->num_[i] == 0)
419 			ifd->M_[i] = 0;
420 		else
421 			ifd->M_[i] = ifd->alloc_[i] /
422 				(ifd->num_[i] * ifd->maxpkt_);
423 		/*
424 		 * Compute the weighted allotment for each class.
425 		 * This takes the expensive div instruction out
426 		 * of the main loop for the wrr scheduling path.
427 		 * These only get recomputed when a class comes or
428 		 * goes.
429 		 */
430 		if (ifd->active_[i] != NULL) {
431 			clh = cl = ifd->active_[i];
432 			do {
433 				/* safe-guard for slow link or alloc_ == 0 */
434 				if (ifd->M_[i] == 0)
435 					cl->w_allotment_ = 0;
436 				else
437 					cl->w_allotment_ = cl->allotment_ /
438 						ifd->M_[i];
439 				cl = cl->peer_;
440 			} while ((cl != NULL) && (cl != clh));
441 		}
442 	}
443 }
444 
445 int
446 rmc_get_weight(struct rm_ifdat *ifd, int pri)
447 {
448 	if ((pri >= 0) && (pri < RM_MAXPRIO))
449 		return (ifd->M_[pri]);
450 	else
451 		return (0);
452 }
453 
454 /*
455  * static void
456  * rmc_depth_compute(struct rm_class *cl) - This function computes the
457  *	appropriate depth of class 'cl' and its ancestors.
458  *
459  *	Returns:	NONE
460  */
461 
462 static void
463 rmc_depth_compute(struct rm_class *cl)
464 {
465 	rm_class_t	*t = cl, *p;
466 
467 	/*
468 	 * Recompute the depth for the branch of the tree.
469 	 */
470 	while (t != NULL) {
471 		p = t->parent_;
472 		if (p && (t->depth_ >= p->depth_)) {
473 			p->depth_ = t->depth_ + 1;
474 			t = p;
475 		} else
476 			t = NULL;
477 	}
478 }
479 
480 /*
481  * static void
482  * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
483  *	the depth of the tree after a class has been deleted.
484  *
485  *	Returns:	NONE
486  */
487 
488 static void
489 rmc_depth_recompute(rm_class_t *cl)
490 {
491 #if 1 /* ALTQ */
492 	rm_class_t	*p, *t;
493 
494 	p = cl;
495 	while (p != NULL) {
496 		if ((t = p->children_) == NULL) {
497 			p->depth_ = 0;
498 		} else {
499 			int cdepth = 0;
500 
501 			while (t != NULL) {
502 				if (t->depth_ > cdepth)
503 					cdepth = t->depth_;
504 				t = t->next_;
505 			}
506 
507 			if (p->depth_ == cdepth + 1)
508 				/* no change to this parent */
509 				return;
510 
511 			p->depth_ = cdepth + 1;
512 		}
513 
514 		p = p->parent_;
515 	}
516 #else
517 	rm_class_t	*t;
518 
519 	if (cl->depth_ >= 1) {
520 		if (cl->children_ == NULL) {
521 			cl->depth_ = 0;
522 		} else if ((t = cl->children_) != NULL) {
523 			while (t != NULL) {
524 				if (t->children_ != NULL)
525 					rmc_depth_recompute(t);
526 				t = t->next_;
527 			}
528 		} else
529 			rmc_depth_compute(cl);
530 	}
531 #endif
532 }
533 
534 /*
535  * void
536  * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
537  *	function deletes a class from the link-sharing structure and frees
538  *	all resources associated with the class.
539  *
540  *	Returns: NONE
541  */
542 
543 void
544 rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
545 {
546 	struct rm_class	*p, *head, *previous;
547 	int		 s;
548 
549 	ASSERT(cl->children_ == NULL);
550 
551 	if (cl->sleeping_)
552 		CALLOUT_STOP(&cl->callout_);
553 
554 	s = splnet();
555 	IFQ_LOCK(ifd->ifq_);
556 	/*
557 	 * Free packets in the packet queue.
558 	 * XXX - this may not be a desired behavior.  Packets should be
559 	 *		re-queued.
560 	 */
561 	rmc_dropall(cl);
562 
563 	/*
564 	 * If the class has a parent, then remove the class from the
565 	 * class from the parent's children chain.
566 	 */
567 	if (cl->parent_ != NULL) {
568 		head = cl->parent_->children_;
569 		p = previous = head;
570 		if (head->next_ == NULL) {
571 			ASSERT(head == cl);
572 			cl->parent_->children_ = NULL;
573 			cl->parent_->leaf_ = 1;
574 		} else while (p != NULL) {
575 			if (p == cl) {
576 				if (cl == head)
577 					cl->parent_->children_ = cl->next_;
578 				else
579 					previous->next_ = cl->next_;
580 				cl->next_ = NULL;
581 				p = NULL;
582 			} else {
583 				previous = p;
584 				p = p->next_;
585 			}
586 		}
587 	}
588 
589 	/*
590 	 * Delete class from class priority peer list.
591 	 */
592 	if ((p = ifd->active_[cl->pri_]) != NULL) {
593 		/*
594 		 * If there is more than one member of this priority
595 		 * level, then look for class(cl) in the priority level.
596 		 */
597 		if (p != p->peer_) {
598 			while (p->peer_ != cl)
599 				p = p->peer_;
600 			p->peer_ = cl->peer_;
601 
602 			if (ifd->active_[cl->pri_] == cl)
603 				ifd->active_[cl->pri_] = cl->peer_;
604 		} else {
605 			ASSERT(p == cl);
606 			ifd->active_[cl->pri_] = NULL;
607 		}
608 	}
609 
610 	/*
611 	 * Recompute the WRR weights.
612 	 */
613 	if (ifd->wrr_) {
614 		ifd->alloc_[cl->pri_] -= cl->allotment_;
615 		ifd->num_[cl->pri_]--;
616 		rmc_wrr_set_weights(ifd);
617 	}
618 
619 	/*
620 	 * Re-compute the depth of the tree.
621 	 */
622 #if 1 /* ALTQ */
623 	rmc_depth_recompute(cl->parent_);
624 #else
625 	rmc_depth_recompute(ifd->root_);
626 #endif
627 
628 	IFQ_UNLOCK(ifd->ifq_);
629 	splx(s);
630 
631 	/*
632 	 * Free the class structure.
633 	 */
634 	if (cl->red_ != NULL) {
635 #ifdef ALTQ_RIO
636 		if (q_is_rio(cl->q_))
637 			rio_destroy((rio_t *)cl->red_);
638 #endif
639 #ifdef ALTQ_RED
640 		if (q_is_red(cl->q_))
641 			red_destroy(cl->red_);
642 #endif
643 #ifdef ALTQ_CODEL
644 		if (q_is_codel(cl->q_))
645 			codel_destroy(cl->codel_);
646 #endif
647 	}
648 	free(cl->q_, M_DEVBUF);
649 	free(cl, M_DEVBUF);
650 }
651 
652 /*
653  * void
654  * rmc_init(...) - Initialize the resource management data structures
655  *	associated with the output portion of interface 'ifp'.  'ifd' is
656  *	where the structures will be built (for backwards compatibility, the
657  *	structures aren't kept in the ifnet struct).  'nsecPerByte'
658  *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
659  *	'restart' is the driver-specific routine that the generic 'delay
660  *	until under limit' action will call to restart output.  `maxq'
661  *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
662  *	is the maximum number of packets that the resource management
663  *	code will allow to be queued 'downstream' (this is typically 1).
664  *
665  *	Returns:	NONE
666  */
667 
668 void
669 rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
670     void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
671     int minidle, u_int offtime, int flags)
672 {
673 	int		i, mtu;
674 
675 	/*
676 	 * Initialize the CBQ tracing/debug facility.
677 	 */
678 	CBQTRACEINIT();
679 
680 	bzero((char *)ifd, sizeof (*ifd));
681 	mtu = ifq->altq_ifp->if_mtu;
682 	ifd->ifq_ = ifq;
683 	ifd->restart = restart;
684 	ifd->maxqueued_ = maxqueued;
685 	ifd->ns_per_byte_ = nsecPerByte;
686 	ifd->maxpkt_ = mtu;
687 	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
688 	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
689 #if 1
690 	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
691 	if (mtu * nsecPerByte > 10 * 1000000)
692 		ifd->maxiftime_ /= 4;
693 #endif
694 
695 	reset_cutoff(ifd);
696 	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);
697 
698 	/*
699 	 * Initialize the CBQ's WRR state.
700 	 */
701 	for (i = 0; i < RM_MAXPRIO; i++) {
702 		ifd->alloc_[i] = 0;
703 		ifd->M_[i] = 0;
704 		ifd->num_[i] = 0;
705 		ifd->na_[i] = 0;
706 		ifd->active_[i] = NULL;
707 	}
708 
709 	/*
710 	 * Initialize current packet state.
711 	 */
712 	ifd->qi_ = 0;
713 	ifd->qo_ = 0;
714 	for (i = 0; i < RM_MAXQUEUED; i++) {
715 		ifd->class_[i] = NULL;
716 		ifd->curlen_[i] = 0;
717 		ifd->borrowed_[i] = NULL;
718 	}
719 
720 	/*
721 	 * Create the root class of the link-sharing structure.
722 	 */
723 	if ((ifd->root_ = rmc_newclass(0, ifd,
724 				       nsecPerByte,
725 				       rmc_root_overlimit, maxq, 0, 0,
726 				       maxidle, minidle, offtime,
727 				       0, 0)) == NULL) {
728 		printf("rmc_init: root class not allocated\n");
729 		return ;
730 	}
731 	ifd->root_->depth_ = 0;
732 }
733 
734 /*
735  * void
736  * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
737  *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
738  *	by a driver's if_output routine.  This routine must be called with
739  *	output packet completion interrupts locked out (to avoid racing with
740  *	rmc_dequeue_next).
741  *
742  *	Returns:	0 on successful queueing
743  *			-1 when packet drop occurs
744  */
745 int
746 rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
747 {
748 	struct timeval	 now;
749 	struct rm_ifdat *ifd = cl->ifdat_;
750 	int		 cpri = cl->pri_;
751 	int		 is_empty = qempty(cl->q_);
752 
753 	RM_GETTIME(now);
754 	if (ifd->cutoff_ > 0) {
755 		if (TV_LT(&cl->undertime_, &now)) {
756 			if (ifd->cutoff_ > cl->depth_)
757 				ifd->cutoff_ = cl->depth_;
758 			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
759 		}
760 #if 1 /* ALTQ */
761 		else {
762 			/*
763 			 * the class is overlimit. if the class has
764 			 * underlimit ancestors, set cutoff to the lowest
765 			 * depth among them.
766 			 */
767 			struct rm_class *borrow = cl->borrow_;
768 
769 			while (borrow != NULL &&
770 			       borrow->depth_ < ifd->cutoff_) {
771 				if (TV_LT(&borrow->undertime_, &now)) {
772 					ifd->cutoff_ = borrow->depth_;
773 					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
774 					break;
775 				}
776 				borrow = borrow->borrow_;
777 			}
778 		}
779 #else /* !ALTQ */
780 		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
781 			if (TV_LT(&cl->borrow_->undertime_, &now)) {
782 				ifd->cutoff_ = cl->borrow_->depth_;
783 				CBQTRACE(rmc_queue_packet, 'ffob',
784 					 cl->borrow_->depth_);
785 			}
786 		}
787 #endif /* !ALTQ */
788 	}
789 
790 	if (_rmc_addq(cl, m) < 0)
791 		/* failed */
792 		return (-1);
793 
794 	if (is_empty) {
795 		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
796 		ifd->na_[cpri]++;
797 	}
798 
799 	if (qlen(cl->q_) > qlimit(cl->q_)) {
800 		/* note: qlimit can be set to 0 or 1 */
801 		rmc_drop_action(cl);
802 		return (-1);
803 	}
804 	return (0);
805 }
806 
807 /*
808  * void
809  * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
810  *	classes to see if there are satified.
811  */
812 
813 static void
814 rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
815 {
816 	int		 i;
817 	rm_class_t	*p, *bp;
818 
819 	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
820 		if ((bp = ifd->active_[i]) != NULL) {
821 			p = bp;
822 			do {
823 				if (!rmc_satisfied(p, now)) {
824 					ifd->cutoff_ = p->depth_;
825 					return;
826 				}
827 				p = p->peer_;
828 			} while (p != bp);
829 		}
830 	}
831 
832 	reset_cutoff(ifd);
833 }
834 
835 /*
836  * rmc_satisfied - Return 1 of the class is satisfied.  O, otherwise.
837  */
838 
839 static int
840 rmc_satisfied(struct rm_class *cl, struct timeval *now)
841 {
842 	rm_class_t	*p;
843 
844 	if (cl == NULL)
845 		return (1);
846 	if (TV_LT(now, &cl->undertime_))
847 		return (1);
848 	if (cl->depth_ == 0) {
849 		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
850 			return (0);
851 		else
852 			return (1);
853 	}
854 	if (cl->children_ != NULL) {
855 		p = cl->children_;
856 		while (p != NULL) {
857 			if (!rmc_satisfied(p, now))
858 				return (0);
859 			p = p->next_;
860 		}
861 	}
862 
863 	return (1);
864 }
865 
866 /*
867  * Return 1 if class 'cl' is under limit or can borrow from a parent,
868  * 0 if overlimit.  As a side-effect, this routine will invoke the
869  * class overlimit action if the class if overlimit.
870  */
871 
872 static int
873 rmc_under_limit(struct rm_class *cl, struct timeval *now)
874 {
875 	rm_class_t	*p = cl;
876 	rm_class_t	*top;
877 	struct rm_ifdat	*ifd = cl->ifdat_;
878 
879 	ifd->borrowed_[ifd->qi_] = NULL;
880 	/*
881 	 * If cl is the root class, then always return that it is
882 	 * underlimit.  Otherwise, check to see if the class is underlimit.
883 	 */
884 	if (cl->parent_ == NULL)
885 		return (1);
886 
887 	if (cl->sleeping_) {
888 		if (TV_LT(now, &cl->undertime_))
889 			return (0);
890 
891 		CALLOUT_STOP(&cl->callout_);
892 		cl->sleeping_ = 0;
893 		cl->undertime_.tv_sec = 0;
894 		return (1);
895 	}
896 
897 	top = NULL;
898 	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
899 		if (((cl = cl->borrow_) == NULL) ||
900 		    (cl->depth_ > ifd->cutoff_)) {
901 #ifdef ADJUST_CUTOFF
902 			if (cl != NULL)
903 				/* cutoff is taking effect, just
904 				   return false without calling
905 				   the delay action. */
906 				return (0);
907 #endif
908 #ifdef BORROW_OFFTIME
909 			/*
910 			 * check if the class can borrow offtime too.
911 			 * borrow offtime from the top of the borrow
912 			 * chain if the top class is not overloaded.
913 			 */
914 			if (cl != NULL) {
915 				/* cutoff is taking effect, use this class as top. */
916 				top = cl;
917 				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
918 			}
919 			if (top != NULL && top->avgidle_ == top->minidle_)
920 				top = NULL;
921 			p->overtime_ = *now;
922 			(p->overlimit)(p, top);
923 #else
924 			p->overtime_ = *now;
925 			(p->overlimit)(p, NULL);
926 #endif
927 			return (0);
928 		}
929 		top = cl;
930 	}
931 
932 	if (cl != p)
933 		ifd->borrowed_[ifd->qi_] = cl;
934 	return (1);
935 }
936 
937 /*
938  * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to
939  *	Packet-by-packet round robin.
940  *
941  * The heart of the weighted round-robin scheduler, which decides which
942  * class next gets to send a packet.  Highest priority first, then
943  * weighted round-robin within priorites.
944  *
945  * Each able-to-send class gets to send until its byte allocation is
946  * exhausted.  Thus, the active pointer is only changed after a class has
947  * exhausted its allocation.
948  *
949  * If the scheduler finds no class that is underlimit or able to borrow,
950  * then the first class found that had a nonzero queue and is allowed to
951  * borrow gets to send.
952  */
953 
954 static mbuf_t *
955 _rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
956 {
957 	struct rm_class	*cl = NULL, *first = NULL;
958 	u_int		 deficit;
959 	int		 cpri;
960 	mbuf_t		*m;
961 	struct timeval	 now;
962 
963 	RM_GETTIME(now);
964 
965 	/*
966 	 * if the driver polls the top of the queue and then removes
967 	 * the polled packet, we must return the same packet.
968 	 */
969 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
970 		cl = ifd->pollcache_;
971 		cpri = cl->pri_;
972 		if (ifd->efficient_) {
973 			/* check if this class is overlimit */
974 			if (cl->undertime_.tv_sec != 0 &&
975 			    rmc_under_limit(cl, &now) == 0)
976 				first = cl;
977 		}
978 		ifd->pollcache_ = NULL;
979 		goto _wrr_out;
980 	}
981 	else {
982 		/* mode == ALTDQ_POLL || pollcache == NULL */
983 		ifd->pollcache_ = NULL;
984 		ifd->borrowed_[ifd->qi_] = NULL;
985 	}
986 #ifdef ADJUST_CUTOFF
987  _again:
988 #endif
989 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
990 		if (ifd->na_[cpri] == 0)
991 			continue;
992 		deficit = 0;
993 		/*
994 		 * Loop through twice for a priority level, if some class
995 		 * was unable to send a packet the first round because
996 		 * of the weighted round-robin mechanism.
997 		 * During the second loop at this level, deficit==2.
998 		 * (This second loop is not needed if for every class,
999 		 * "M[cl->pri_])" times "cl->allotment" is greater than
1000 		 * the byte size for the largest packet in the class.)
1001 		 */
1002  _wrr_loop:
1003 		cl = ifd->active_[cpri];
1004 		ASSERT(cl != NULL);
1005 		do {
1006 			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
1007 				cl->bytes_alloc_ += cl->w_allotment_;
1008 			if (!qempty(cl->q_)) {
1009 				if ((cl->undertime_.tv_sec == 0) ||
1010 				    rmc_under_limit(cl, &now)) {
1011 					if (cl->bytes_alloc_ > 0 || deficit > 1)
1012 						goto _wrr_out;
1013 
1014 					/* underlimit but no alloc */
1015 					deficit = 1;
1016 #if 1
1017 					ifd->borrowed_[ifd->qi_] = NULL;
1018 #endif
1019 				}
1020 				else if (first == NULL && cl->borrow_ != NULL)
1021 					first = cl; /* borrowing candidate */
1022 			}
1023 
1024 			cl->bytes_alloc_ = 0;
1025 			cl = cl->peer_;
1026 		} while (cl != ifd->active_[cpri]);
1027 
1028 		if (deficit == 1) {
1029 			/* first loop found an underlimit class with deficit */
1030 			/* Loop on same priority level, with new deficit.  */
1031 			deficit = 2;
1032 			goto _wrr_loop;
1033 		}
1034 	}
1035 
1036 #ifdef ADJUST_CUTOFF
1037 	/*
1038 	 * no underlimit class found.  if cutoff is taking effect,
1039 	 * increase cutoff and try again.
1040 	 */
1041 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1042 		ifd->cutoff_++;
1043 		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
1044 		goto _again;
1045 	}
1046 #endif /* ADJUST_CUTOFF */
1047 	/*
1048 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
1049 	 * class we encounter will send a packet if all the classes
1050 	 * of the link-sharing structure are overlimit.
1051 	 */
1052 	reset_cutoff(ifd);
1053 	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);
1054 
1055 	if (!ifd->efficient_ || first == NULL)
1056 		return (NULL);
1057 
1058 	cl = first;
1059 	cpri = cl->pri_;
1060 #if 0	/* too time-consuming for nothing */
1061 	if (cl->sleeping_)
1062 		CALLOUT_STOP(&cl->callout_);
1063 	cl->sleeping_ = 0;
1064 	cl->undertime_.tv_sec = 0;
1065 #endif
1066 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
1067 	ifd->cutoff_ = cl->borrow_->depth_;
1068 
1069 	/*
1070 	 * Deque the packet and do the book keeping...
1071 	 */
1072  _wrr_out:
1073 	if (op == ALTDQ_REMOVE) {
1074 		m = _rmc_getq(cl);
1075 		if (m == NULL)
1076 			panic("_rmc_wrr_dequeue_next");
1077 		if (qempty(cl->q_))
1078 			ifd->na_[cpri]--;
1079 
1080 		/*
1081 		 * Update class statistics and link data.
1082 		 */
1083 		if (cl->bytes_alloc_ > 0)
1084 			cl->bytes_alloc_ -= m_pktlen(m);
1085 
1086 		if ((cl->bytes_alloc_ <= 0) || first == cl)
1087 			ifd->active_[cl->pri_] = cl->peer_;
1088 		else
1089 			ifd->active_[cl->pri_] = cl;
1090 
1091 		ifd->class_[ifd->qi_] = cl;
1092 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
1093 		ifd->now_[ifd->qi_] = now;
1094 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1095 		ifd->queued_++;
1096 	} else {
1097 		/* mode == ALTDQ_PPOLL */
1098 		m = _rmc_pollq(cl);
1099 		ifd->pollcache_ = cl;
1100 	}
1101 	return (m);
1102 }
1103 
1104 /*
1105  * Dequeue & return next packet from the highest priority class that
1106  * has a packet to send & has enough allocation to send it.  This
1107  * routine is called by a driver whenever it needs a new packet to
1108  * output.
1109  */
1110 static mbuf_t *
1111 _rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
1112 {
1113 	mbuf_t		*m;
1114 	int		 cpri;
1115 	struct rm_class	*cl, *first = NULL;
1116 	struct timeval	 now;
1117 
1118 	RM_GETTIME(now);
1119 
1120 	/*
1121 	 * if the driver polls the top of the queue and then removes
1122 	 * the polled packet, we must return the same packet.
1123 	 */
1124 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
1125 		cl = ifd->pollcache_;
1126 		cpri = cl->pri_;
1127 		ifd->pollcache_ = NULL;
1128 		goto _prr_out;
1129 	} else {
1130 		/* mode == ALTDQ_POLL || pollcache == NULL */
1131 		ifd->pollcache_ = NULL;
1132 		ifd->borrowed_[ifd->qi_] = NULL;
1133 	}
1134 #ifdef ADJUST_CUTOFF
1135  _again:
1136 #endif
1137 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
1138 		if (ifd->na_[cpri] == 0)
1139 			continue;
1140 		cl = ifd->active_[cpri];
1141 		ASSERT(cl != NULL);
1142 		do {
1143 			if (!qempty(cl->q_)) {
1144 				if ((cl->undertime_.tv_sec == 0) ||
1145 				    rmc_under_limit(cl, &now))
1146 					goto _prr_out;
1147 				if (first == NULL && cl->borrow_ != NULL)
1148 					first = cl;
1149 			}
1150 			cl = cl->peer_;
1151 		} while (cl != ifd->active_[cpri]);
1152 	}
1153 
1154 #ifdef ADJUST_CUTOFF
1155 	/*
1156 	 * no underlimit class found.  if cutoff is taking effect, increase
1157 	 * cutoff and try again.
1158 	 */
1159 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1160 		ifd->cutoff_++;
1161 		goto _again;
1162 	}
1163 #endif /* ADJUST_CUTOFF */
1164 	/*
1165 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
1166 	 * class we encounter will send a packet if all the classes
1167 	 * of the link-sharing structure are overlimit.
1168 	 */
1169 	reset_cutoff(ifd);
1170 	if (!ifd->efficient_ || first == NULL)
1171 		return (NULL);
1172 
1173 	cl = first;
1174 	cpri = cl->pri_;
1175 #if 0	/* too time-consuming for nothing */
1176 	if (cl->sleeping_)
1177 		CALLOUT_STOP(&cl->callout_);
1178 	cl->sleeping_ = 0;
1179 	cl->undertime_.tv_sec = 0;
1180 #endif
1181 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
1182 	ifd->cutoff_ = cl->borrow_->depth_;
1183 
1184 	/*
1185 	 * Deque the packet and do the book keeping...
1186 	 */
1187  _prr_out:
1188 	if (op == ALTDQ_REMOVE) {
1189 		m = _rmc_getq(cl);
1190 		if (m == NULL)
1191 			panic("_rmc_prr_dequeue_next");
1192 		if (qempty(cl->q_))
1193 			ifd->na_[cpri]--;
1194 
1195 		ifd->active_[cpri] = cl->peer_;
1196 
1197 		ifd->class_[ifd->qi_] = cl;
1198 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
1199 		ifd->now_[ifd->qi_] = now;
1200 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1201 		ifd->queued_++;
1202 	} else {
1203 		/* mode == ALTDQ_POLL */
1204 		m = _rmc_pollq(cl);
1205 		ifd->pollcache_ = cl;
1206 	}
1207 	return (m);
1208 }
1209 
1210 /*
1211  * mbuf_t *
1212  * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function
1213  *	is invoked by the packet driver to get the next packet to be
1214  *	dequeued and output on the link.  If WRR is enabled, then the
1215  *	WRR dequeue next routine will determine the next packet to sent.
1216  *	Otherwise, packet-by-packet round robin is invoked.
1217  *
1218  *	Returns:	NULL, if a packet is not available or if all
1219  *			classes are overlimit.
1220  *
1221  *			Otherwise, Pointer to the next packet.
1222  */
1223 
1224 mbuf_t *
1225 rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
1226 {
1227 	if (ifd->queued_ >= ifd->maxqueued_)
1228 		return (NULL);
1229 	else if (ifd->wrr_)
1230 		return (_rmc_wrr_dequeue_next(ifd, mode));
1231 	else
1232 		return (_rmc_prr_dequeue_next(ifd, mode));
1233 }
1234 
1235 /*
1236  * Update the utilization estimate for the packet that just completed.
1237  * The packet's class & the parent(s) of that class all get their
1238  * estimators updated.  This routine is called by the driver's output-
1239  * packet-completion interrupt service routine.
1240  */
1241 
1242 /*
1243  * a macro to approximate "divide by 1000" that gives 0.000999,
1244  * if a value has enough effective digits.
1245  * (on pentium, mul takes 9 cycles but div takes 46!)
1246  */
1247 #define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
1248 void
1249 rmc_update_class_util(struct rm_ifdat *ifd)
1250 {
1251 	int		 idle, avgidle, pktlen;
1252 	int		 pkt_time, tidle;
1253 	rm_class_t	*cl, *borrowed;
1254 	rm_class_t	*borrows;
1255 	struct timeval	*nowp;
1256 
1257 	/*
1258 	 * Get the most recent completed class.
1259 	 */
1260 	if ((cl = ifd->class_[ifd->qo_]) == NULL)
1261 		return;
1262 
1263 	pktlen = ifd->curlen_[ifd->qo_];
1264 	borrowed = ifd->borrowed_[ifd->qo_];
1265 	borrows = borrowed;
1266 
1267 	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1268 
1269 	/*
1270 	 * Run estimator on class and its ancestors.
1271 	 */
1272 	/*
1273 	 * rm_update_class_util is designed to be called when the
1274 	 * transfer is completed from a xmit complete interrupt,
1275 	 * but most drivers don't implement an upcall for that.
1276 	 * so, just use estimated completion time.
1277 	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
1278 	 */
1279 	nowp = &ifd->now_[ifd->qo_];
1280 	/* get pkt_time (for link) in usec */
1281 #if 1  /* use approximation */
1282 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
1283 	pkt_time = NSEC_TO_USEC(pkt_time);
1284 #else
1285 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
1286 #endif
1287 #if 1 /* ALTQ4PPP */
1288 	if (TV_LT(nowp, &ifd->ifnow_)) {
1289 		int iftime;
1290 
1291 		/*
1292 		 * make sure the estimated completion time does not go
1293 		 * too far.  it can happen when the link layer supports
1294 		 * data compression or the interface speed is set to
1295 		 * a much lower value.
1296 		 */
1297 		TV_DELTA(&ifd->ifnow_, nowp, iftime);
1298 		if (iftime+pkt_time < ifd->maxiftime_) {
1299 			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1300 		} else {
1301 			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
1302 		}
1303 	} else {
1304 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1305 	}
1306 #else
1307 	if (TV_LT(nowp, &ifd->ifnow_)) {
1308 		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1309 	} else {
1310 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1311 	}
1312 #endif
1313 
1314 	while (cl != NULL) {
1315 		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
1316 		if (idle >= 2000000)
1317 			/*
1318 			 * this class is idle enough, reset avgidle.
1319 			 * (TV_DELTA returns 2000000 us when delta is large.)
1320 			 */
1321 			cl->avgidle_ = cl->maxidle_;
1322 
1323 		/* get pkt_time (for class) in usec */
1324 #if 1  /* use approximation */
1325 		pkt_time = pktlen * cl->ns_per_byte_;
1326 		pkt_time = NSEC_TO_USEC(pkt_time);
1327 #else
1328 		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
1329 #endif
1330 		idle -= pkt_time;
1331 
1332 		avgidle = cl->avgidle_;
1333 		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
1334 		cl->avgidle_ = avgidle;
1335 
1336 		/* Are we overlimit ? */
1337 		if (avgidle <= 0) {
1338 			CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
1339 #if 1 /* ALTQ */
1340 			/*
1341 			 * need some lower bound for avgidle, otherwise
1342 			 * a borrowing class gets unbounded penalty.
1343 			 */
1344 			if (avgidle < cl->minidle_)
1345 				avgidle = cl->avgidle_ = cl->minidle_;
1346 #endif
1347 			/* set next idle to make avgidle 0 */
1348 			tidle = pkt_time +
1349 				(((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
1350 			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
1351 			++cl->stats_.over;
1352 		} else {
1353 			cl->avgidle_ =
1354 			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
1355 			cl->undertime_.tv_sec = 0;
1356 			if (cl->sleeping_) {
1357 				CALLOUT_STOP(&cl->callout_);
1358 				cl->sleeping_ = 0;
1359 			}
1360 		}
1361 
1362 		if (borrows != NULL) {
1363 			if (borrows != cl)
1364 				++cl->stats_.borrows;
1365 			else
1366 				borrows = NULL;
1367 		}
1368 		cl->last_ = ifd->ifnow_;
1369 		cl->last_pkttime_ = pkt_time;
1370 
1371 #if 1
1372 		if (cl->parent_ == NULL) {
1373 			/* take stats of root class */
1374 			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1375 		}
1376 #endif
1377 
1378 		cl = cl->parent_;
1379 	}
1380 
1381 	/*
1382 	 * Check to see if cutoff needs to set to a new level.
1383 	 */
1384 	cl = ifd->class_[ifd->qo_];
1385 	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
1386 #if 1 /* ALTQ */
1387 		if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
1388 			rmc_tl_satisfied(ifd, nowp);
1389 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1390 		} else {
1391 			ifd->cutoff_ = borrowed->depth_;
1392 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1393 		}
1394 #else /* !ALTQ */
1395 		if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
1396 			reset_cutoff(ifd);
1397 #ifdef notdef
1398 			rmc_tl_satisfied(ifd, &now);
1399 #endif
1400 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1401 		} else {
1402 			ifd->cutoff_ = borrowed->depth_;
1403 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1404 		}
1405 #endif /* !ALTQ */
1406 	}
1407 
1408 	/*
1409 	 * Release class slot
1410 	 */
1411 	ifd->borrowed_[ifd->qo_] = NULL;
1412 	ifd->class_[ifd->qo_] = NULL;
1413 	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
1414 	ifd->queued_--;
1415 }
1416 
1417 /*
1418  * void
1419  * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
1420  *	over-limit action routines.  These get invoked by rmc_under_limit()
1421  *	if a class with packets to send if over its bandwidth limit & can't
1422  *	borrow from a parent class.
1423  *
1424  *	Returns: NONE
1425  */
1426 
1427 static void
1428 rmc_drop_action(struct rm_class *cl)
1429 {
1430 	struct rm_ifdat	*ifd = cl->ifdat_;
1431 
1432 	ASSERT(qlen(cl->q_) > 0);
1433 	_rmc_dropq(cl);
1434 	if (qempty(cl->q_))
1435 		ifd->na_[cl->pri_]--;
1436 }
1437 
1438 void rmc_dropall(struct rm_class *cl)
1439 {
1440 	struct rm_ifdat	*ifd = cl->ifdat_;
1441 
1442 	if (!qempty(cl->q_)) {
1443 		_flushq(cl->q_);
1444 
1445 		ifd->na_[cl->pri_]--;
1446 	}
1447 }
1448 
1449 static int
1450 hzto(struct timeval *tv)
1451 {
1452 	struct timeval t2;
1453 
1454 	getmicrotime(&t2);
1455 	t2.tv_sec = tv->tv_sec - t2.tv_sec;
1456 	t2.tv_usec = tv->tv_usec - t2.tv_usec;
1457 	return (tvtohz(&t2));
1458 }
1459 
1460 /*
1461  * void
1462  * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
1463  *	delay action routine.  It is invoked via rmc_under_limit when the
1464  *	packet is discoverd to be overlimit.
1465  *
1466  *	If the delay action is result of borrow class being overlimit, then
1467  *	delay for the offtime of the borrowing class that is overlimit.
1468  *
1469  *	Returns: NONE
1470  */
1471 
1472 void
1473 rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
1474 {
1475 	int	delay, t, extradelay;
1476 
1477 	cl->stats_.overactions++;
1478 	TV_DELTA(&cl->undertime_, &cl->overtime_, delay);
1479 #ifndef BORROW_OFFTIME
1480 	delay += cl->offtime_;
1481 #endif
1482 
1483 	if (!cl->sleeping_) {
1484 		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
1485 #ifdef BORROW_OFFTIME
1486 		if (borrow != NULL)
1487 			extradelay = borrow->offtime_;
1488 		else
1489 #endif
1490 			extradelay = cl->offtime_;
1491 
1492 #ifdef ALTQ
1493 		/*
1494 		 * XXX recalculate suspend time:
1495 		 * current undertime is (tidle + pkt_time) calculated
1496 		 * from the last transmission.
1497 		 *	tidle: time required to bring avgidle back to 0
1498 		 *	pkt_time: target waiting time for this class
1499 		 * we need to replace pkt_time by offtime
1500 		 */
1501 		extradelay -= cl->last_pkttime_;
1502 #endif
1503 		if (extradelay > 0) {
1504 			TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
1505 			delay += extradelay;
1506 		}
1507 
1508 		cl->sleeping_ = 1;
1509 		cl->stats_.delays++;
1510 
1511 		/*
1512 		 * Since packets are phased randomly with respect to the
1513 		 * clock, 1 tick (the next clock tick) can be an arbitrarily
1514 		 * short time so we have to wait for at least two ticks.
1515 		 * NOTE:  If there's no other traffic, we need the timer as
1516 		 * a 'backstop' to restart this class.
1517 		 */
1518 		if (delay > tick * 2) {
1519 			/* FreeBSD rounds up the tick */
1520 			t = hzto(&cl->undertime_);
1521 		} else
1522 			t = 2;
1523 		CALLOUT_RESET(&cl->callout_, t, rmc_restart, cl);
1524 	}
1525 }
1526 
1527 /*
1528  * void
1529  * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
1530  *	called by the system timer code & is responsible checking if the
1531  *	class is still sleeping (it might have been restarted as a side
1532  *	effect of the queue scan on a packet arrival) and, if so, restarting
1533  *	output for the class.  Inspecting the class state & restarting output
1534  *	require locking the class structure.  In general the driver is
1535  *	responsible for locking but this is the only routine that is not
1536  *	called directly or indirectly from the interface driver so it has
1537  *	know about system locking conventions.  Under bsd, locking is done
1538  *	by raising IPL to splimp so that's what's implemented here.  On a
1539  *	different system this would probably need to be changed.
1540  *
1541  *	Returns:	NONE
1542  */
1543 
1544 static void
1545 rmc_restart(void *arg)
1546 {
1547 	struct rm_class *cl = arg;
1548 	struct rm_ifdat	*ifd = cl->ifdat_;
1549 	struct epoch_tracker et;
1550 	int		 s;
1551 
1552 	s = splnet();
1553 	NET_EPOCH_ENTER(et);
1554 	IFQ_LOCK(ifd->ifq_);
1555 	CURVNET_SET(ifd->ifq_->altq_ifp->if_vnet);
1556 	if (cl->sleeping_) {
1557 		cl->sleeping_ = 0;
1558 		cl->undertime_.tv_sec = 0;
1559 
1560 		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
1561 			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
1562 			(ifd->restart)(ifd->ifq_);
1563 		}
1564 	}
1565 	CURVNET_RESTORE();
1566 	IFQ_UNLOCK(ifd->ifq_);
1567 	NET_EPOCH_EXIT(et);
1568 	splx(s);
1569 }
1570 
1571 /*
1572  * void
1573  * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit
1574  *	handling routine for the root class of the link sharing structure.
1575  *
1576  *	Returns: NONE
1577  */
1578 
1579 static void
1580 rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow)
1581 {
1582     panic("rmc_root_overlimit");
1583 }
1584 
1585 /*
1586  * Packet Queue handling routines.  Eventually, this is to localize the
1587  *	effects on the code whether queues are red queues or droptail
1588  *	queues.
1589  */
1590 
1591 static int
1592 _rmc_addq(rm_class_t *cl, mbuf_t *m)
1593 {
1594 #ifdef ALTQ_RIO
1595 	if (q_is_rio(cl->q_))
1596 		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
1597 #endif
1598 #ifdef ALTQ_RED
1599 	if (q_is_red(cl->q_))
1600 		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
1601 #endif /* ALTQ_RED */
1602 #ifdef ALTQ_CODEL
1603 	if (q_is_codel(cl->q_))
1604 		return codel_addq(cl->codel_, cl->q_, m);
1605 #endif
1606 
1607 	if (cl->flags_ & RMCF_CLEARDSCP)
1608 		write_dsfield(m, cl->pktattr_, 0);
1609 
1610 	_addq(cl->q_, m);
1611 	return (0);
1612 }
1613 
1614 /* note: _rmc_dropq is not called for red */
1615 static void
1616 _rmc_dropq(rm_class_t *cl)
1617 {
1618 	mbuf_t	*m;
1619 
1620 	if ((m = _getq(cl->q_)) != NULL)
1621 		m_freem(m);
1622 }
1623 
1624 static mbuf_t *
1625 _rmc_getq(rm_class_t *cl)
1626 {
1627 #ifdef ALTQ_RIO
1628 	if (q_is_rio(cl->q_))
1629 		return rio_getq((rio_t *)cl->red_, cl->q_);
1630 #endif
1631 #ifdef ALTQ_RED
1632 	if (q_is_red(cl->q_))
1633 		return red_getq(cl->red_, cl->q_);
1634 #endif
1635 #ifdef ALTQ_CODEL
1636 	if (q_is_codel(cl->q_))
1637 		return codel_getq(cl->codel_, cl->q_);
1638 #endif
1639 	return _getq(cl->q_);
1640 }
1641 
1642 static mbuf_t *
1643 _rmc_pollq(rm_class_t *cl)
1644 {
1645 	return qhead(cl->q_);
1646 }
1647 
1648 #ifdef CBQ_TRACE
1649 
1650 struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
1651 struct cbqtrace		*cbqtrace_ptr = NULL;
1652 int			 cbqtrace_count;
1653 
1654 /*
1655  * DDB hook to trace cbq events:
1656  *  the last 1024 events are held in a circular buffer.
1657  *  use "call cbqtrace_dump(N)" to display 20 events from Nth event.
1658  */
1659 void cbqtrace_dump(int);
1660 static char *rmc_funcname(void *);
1661 
1662 static struct rmc_funcs {
1663 	void	*func;
1664 	char	*name;
1665 } rmc_funcs[] =
1666 {
1667 	rmc_init,		"rmc_init",
1668 	rmc_queue_packet,	"rmc_queue_packet",
1669 	rmc_under_limit,	"rmc_under_limit",
1670 	rmc_update_class_util,	"rmc_update_class_util",
1671 	rmc_delay_action,	"rmc_delay_action",
1672 	rmc_restart,		"rmc_restart",
1673 	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
1674 	NULL,			NULL
1675 };
1676 
1677 static char *rmc_funcname(void *func)
1678 {
1679 	struct rmc_funcs *fp;
1680 
1681 	for (fp = rmc_funcs; fp->func != NULL; fp++)
1682 		if (fp->func == func)
1683 			return (fp->name);
1684 	return ("unknown");
1685 }
1686 
1687 void cbqtrace_dump(int counter)
1688 {
1689 	int	 i, *p;
1690 	char	*cp;
1691 
1692 	counter = counter % NCBQTRACE;
1693 	p = (int *)&cbqtrace_buffer[counter];
1694 
1695 	for (i=0; i<20; i++) {
1696 		printf("[0x%x] ", *p++);
1697 		printf("%s: ", rmc_funcname((void *)*p++));
1698 		cp = (char *)p++;
1699 		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
1700 		printf("%d\n",*p++);
1701 
1702 		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
1703 			p = (int *)cbqtrace_buffer;
1704 	}
1705 }
1706 #endif /* CBQ_TRACE */
1707 #endif /* ALTQ_CBQ */
1708 
1709 #if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || \
1710     defined(ALTQ_HFSC) || defined(ALTQ_PRIQ) || defined(ALTQ_CODEL)
1711 #if !defined(__GNUC__) || defined(ALTQ_DEBUG)
1712 
1713 void
1714 _addq(class_queue_t *q, mbuf_t *m)
1715 {
1716         mbuf_t	*m0;
1717 
1718 	if ((m0 = qtail(q)) != NULL)
1719 		m->m_nextpkt = m0->m_nextpkt;
1720 	else
1721 		m0 = m;
1722 	m0->m_nextpkt = m;
1723 	qtail(q) = m;
1724 	qlen(q)++;
1725 }
1726 
1727 mbuf_t *
1728 _getq(class_queue_t *q)
1729 {
1730 	mbuf_t	*m, *m0;
1731 
1732 	if ((m = qtail(q)) == NULL)
1733 		return (NULL);
1734 	if ((m0 = m->m_nextpkt) != m)
1735 		m->m_nextpkt = m0->m_nextpkt;
1736 	else {
1737 		ASSERT(qlen(q) == 1);
1738 		qtail(q) = NULL;
1739 	}
1740 	qlen(q)--;
1741 	m0->m_nextpkt = NULL;
1742 	return (m0);
1743 }
1744 
1745 /* drop a packet at the tail of the queue */
1746 mbuf_t *
1747 _getq_tail(class_queue_t *q)
1748 {
1749 	mbuf_t	*m, *m0, *prev;
1750 
1751 	if ((m = m0 = qtail(q)) == NULL)
1752 		return NULL;
1753 	do {
1754 		prev = m0;
1755 		m0 = m0->m_nextpkt;
1756 	} while (m0 != m);
1757 	prev->m_nextpkt = m->m_nextpkt;
1758 	if (prev == m)  {
1759 		ASSERT(qlen(q) == 1);
1760 		qtail(q) = NULL;
1761 	} else
1762 		qtail(q) = prev;
1763 	qlen(q)--;
1764 	m->m_nextpkt = NULL;
1765 	return (m);
1766 }
1767 
1768 /* randomly select a packet in the queue */
1769 mbuf_t *
1770 _getq_random(class_queue_t *q)
1771 {
1772 	struct mbuf	*m;
1773 	int		 i, n;
1774 
1775 	if ((m = qtail(q)) == NULL)
1776 		return NULL;
1777 	if (m->m_nextpkt == m) {
1778 		ASSERT(qlen(q) == 1);
1779 		qtail(q) = NULL;
1780 	} else {
1781 		struct mbuf *prev = NULL;
1782 
1783 		n = arc4random() % qlen(q) + 1;
1784 		for (i = 0; i < n; i++) {
1785 			prev = m;
1786 			m = m->m_nextpkt;
1787 		}
1788 		prev->m_nextpkt = m->m_nextpkt;
1789 		if (m == qtail(q))
1790 			qtail(q) = prev;
1791 	}
1792 	qlen(q)--;
1793 	m->m_nextpkt = NULL;
1794 	return (m);
1795 }
1796 
1797 void
1798 _removeq(class_queue_t *q, mbuf_t *m)
1799 {
1800 	mbuf_t	*m0, *prev;
1801 
1802 	m0 = qtail(q);
1803 	do {
1804 		prev = m0;
1805 		m0 = m0->m_nextpkt;
1806 	} while (m0 != m);
1807 	prev->m_nextpkt = m->m_nextpkt;
1808 	if (prev == m)
1809 		qtail(q) = NULL;
1810 	else if (qtail(q) == m)
1811 		qtail(q) = prev;
1812 	qlen(q)--;
1813 }
1814 
1815 void
1816 _flushq(class_queue_t *q)
1817 {
1818 	mbuf_t *m;
1819 
1820 	while ((m = _getq(q)) != NULL)
1821 		m_freem(m);
1822 	ASSERT(qlen(q) == 0);
1823 }
1824 
1825 #endif /* !__GNUC__ || ALTQ_DEBUG */
1826 #endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */
1827