xref: /freebsd/sys/net/altq/altq_rmclass.c (revision 079171874c9bf263b69e3af10784ad2bcd1fe699)
1 /*-
2  * Copyright (c) 1991-1997 Regents of the University of California.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *      This product includes software developed by the Network Research
16  *      Group at Lawrence Berkeley Laboratory.
17  * 4. Neither the name of the University nor of the Laboratory may be used
18  *    to endorse or promote products derived from this software without
19  *    specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  * LBL code modified by speer@eng.sun.com, May 1977.
34  * For questions and/or comments, please send mail to cbq@ee.lbl.gov
35  *
36  * @(#)rm_class.c  1.48     97/12/05 SMI
37  * $KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $
38  * $FreeBSD$
39  */
40 #include "opt_altq.h"
41 #include "opt_inet.h"
42 #include "opt_inet6.h"
43 #ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
44 
45 #include <sys/param.h>
46 #include <sys/malloc.h>
47 #include <sys/mbuf.h>
48 #include <sys/socket.h>
49 #include <sys/systm.h>
50 #include <sys/errno.h>
51 #include <sys/time.h>
52 #ifdef ALTQ3_COMPAT
53 #include <sys/kernel.h>
54 #endif
55 
56 #include <net/if.h>
57 #include <net/if_var.h>
58 #ifdef ALTQ3_COMPAT
59 #include <netinet/in.h>
60 #include <netinet/in_systm.h>
61 #include <netinet/ip.h>
62 #endif
63 
64 #include <net/altq/if_altq.h>
65 #include <net/altq/altq.h>
66 #include <net/altq/altq_codel.h>
67 #include <net/altq/altq_rmclass.h>
68 #include <net/altq/altq_rmclass_debug.h>
69 #include <net/altq/altq_red.h>
70 #include <net/altq/altq_rio.h>
71 
72 /*
73  * Local Macros
74  */
75 
76 #define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }
77 
78 /*
79  * Local routines.
80  */
81 
82 static int	rmc_satisfied(struct rm_class *, struct timeval *);
83 static void	rmc_wrr_set_weights(struct rm_ifdat *);
84 static void	rmc_depth_compute(struct rm_class *);
85 static void	rmc_depth_recompute(rm_class_t *);
86 
87 static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
88 static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);
89 
90 static int	_rmc_addq(rm_class_t *, mbuf_t *);
91 static void	_rmc_dropq(rm_class_t *);
92 static mbuf_t	*_rmc_getq(rm_class_t *);
93 static mbuf_t	*_rmc_pollq(rm_class_t *);
94 
95 static int	rmc_under_limit(struct rm_class *, struct timeval *);
96 static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
97 static void	rmc_drop_action(struct rm_class *);
98 static void	rmc_restart(struct rm_class *);
99 static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);
100 
101 #define	BORROW_OFFTIME
102 /*
103  * BORROW_OFFTIME (experimental):
104  * borrow the offtime of the class borrowing from.
105  * the reason is that when its own offtime is set, the class is unable
106  * to borrow much, especially when cutoff is taking effect.
107  * but when the borrowed class is overloaded (advidle is close to minidle),
108  * use the borrowing class's offtime to avoid overload.
109  */
110 #define	ADJUST_CUTOFF
111 /*
112  * ADJUST_CUTOFF (experimental):
113  * if no underlimit class is found due to cutoff, increase cutoff and
114  * retry the scheduling loop.
115  * also, don't invoke delay_actions while cutoff is taking effect,
116  * since a sleeping class won't have a chance to be scheduled in the
117  * next loop.
118  *
119  * now heuristics for setting the top-level variable (cutoff_) becomes:
120  *	1. if a packet arrives for a not-overlimit class, set cutoff
121  *	   to the depth of the class.
122  *	2. if cutoff is i, and a packet arrives for an overlimit class
123  *	   with an underlimit ancestor at a lower level than i (say j),
124  *	   then set cutoff to j.
125  *	3. at scheduling a packet, if there is no underlimit class
126  *	   due to the current cutoff level, increase cutoff by 1 and
127  *	   then try to schedule again.
128  */
129 
130 /*
131  * rm_class_t *
132  * rmc_newclass(...) - Create a new resource management class at priority
133  * 'pri' on the interface given by 'ifd'.
134  *
135  * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
136  *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
137  *              than 100% of the bandwidth, this number should be the
138  *              'effective' rate for the class.  Let f be the
139  *              bandwidth fraction allocated to this class, and let
140  *              nsPerByte be the data rate of the output link in
141  *              nanoseconds/byte.  Then nsecPerByte is set to
142  *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
143  *              for a class that gets 50% of an ethernet's bandwidth.
144  *
145  * action       the routine to call when the class is over limit.
146  *
147  * maxq         max allowable queue size for class (in packets).
148  *
149  * parent       parent class pointer.
150  *
151  * borrow       class to borrow from (should be either 'parent' or null).
152  *
153  * maxidle      max value allowed for class 'idle' time estimate (this
154  *              parameter determines how large an initial burst of packets
155  *              can be before overlimit action is invoked.
156  *
157  * offtime      how long 'delay' action will delay when class goes over
158  *              limit (this parameter determines the steady-state burst
159  *              size when a class is running over its limit).
160  *
161  * Maxidle and offtime have to be computed from the following:  If the
162  * average packet size is s, the bandwidth fraction allocated to this
163  * class is f, we want to allow b packet bursts, and the gain of the
164  * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
165  *
166  *   ptime = s * nsPerByte * (1 - f) / f
167  *   maxidle = ptime * (1 - g^b) / g^b
168  *   minidle = -ptime * (1 / (f - 1))
169  *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1)
170  *
171  * Operationally, it's convenient to specify maxidle & offtime in units
172  * independent of the link bandwidth so the maxidle & offtime passed to
173  * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
174  * (The constant factor is a scale factor needed to make the parameters
175  * integers.  This scaling also means that the 'unscaled' values of
176  * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
177  * not nanoseconds.)  Also note that the 'idle' filter computation keeps
178  * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
179  * maxidle also must be scaled upward by this value.  Thus, the passed
180  * values for maxidle and offtime can be computed as follows:
181  *
182  * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
183  * offtime = offtime * 8 / (1000 * nsecPerByte)
184  *
185  * When USE_HRTIME is employed, then maxidle and offtime become:
186  * 	maxidle = maxilde * (8.0 / nsecPerByte);
187  * 	offtime = offtime * (8.0 / nsecPerByte);
188  */
189 struct rm_class *
190 rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
191     void (*action)(rm_class_t *, rm_class_t *), int maxq,
192     struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
193     int minidle, u_int offtime, int pktsize, int flags)
194 {
195 	struct rm_class	*cl;
196 	struct rm_class	*peer;
197 	int		 s;
198 
199 	if (pri >= RM_MAXPRIO)
200 		return (NULL);
201 #ifndef ALTQ_RED
202 	if (flags & RMCF_RED) {
203 #ifdef ALTQ_DEBUG
204 		printf("rmc_newclass: RED not configured for CBQ!\n");
205 #endif
206 		return (NULL);
207 	}
208 #endif
209 #ifndef ALTQ_RIO
210 	if (flags & RMCF_RIO) {
211 #ifdef ALTQ_DEBUG
212 		printf("rmc_newclass: RIO not configured for CBQ!\n");
213 #endif
214 		return (NULL);
215 	}
216 #endif
217 #ifndef ALTQ_CODEL
218 	if (flags & RMCF_CODEL) {
219 #ifdef ALTQ_DEBUG
220 		printf("rmc_newclass: CODEL not configured for CBQ!\n");
221 #endif
222 		return (NULL);
223 	}
224 #endif
225 
226 	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_NOWAIT | M_ZERO);
227 	if (cl == NULL)
228 		return (NULL);
229 	CALLOUT_INIT(&cl->callout_);
230 	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO);
231 	if (cl->q_ == NULL) {
232 		free(cl, M_DEVBUF);
233 		return (NULL);
234 	}
235 
236 	/*
237 	 * Class initialization.
238 	 */
239 	cl->children_ = NULL;
240 	cl->parent_ = parent;
241 	cl->borrow_ = borrow;
242 	cl->leaf_ = 1;
243 	cl->ifdat_ = ifd;
244 	cl->pri_ = pri;
245 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
246 	cl->depth_ = 0;
247 	cl->qthresh_ = 0;
248 	cl->ns_per_byte_ = nsecPerByte;
249 
250 	qlimit(cl->q_) = maxq;
251 	qtype(cl->q_) = Q_DROPHEAD;
252 	qlen(cl->q_) = 0;
253 	cl->flags_ = flags;
254 
255 #if 1 /* minidle is also scaled in ALTQ */
256 	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
257 	if (cl->minidle_ > 0)
258 		cl->minidle_ = 0;
259 #else
260 	cl->minidle_ = minidle;
261 #endif
262 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
263 	if (cl->maxidle_ == 0)
264 		cl->maxidle_ = 1;
265 #if 1 /* offtime is also scaled in ALTQ */
266 	cl->avgidle_ = cl->maxidle_;
267 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
268 	if (cl->offtime_ == 0)
269 		cl->offtime_ = 1;
270 #else
271 	cl->avgidle_ = 0;
272 	cl->offtime_ = (offtime * nsecPerByte) / 8;
273 #endif
274 	cl->overlimit = action;
275 
276 #ifdef ALTQ_RED
277 	if (flags & (RMCF_RED|RMCF_RIO)) {
278 		int red_flags, red_pkttime;
279 
280 		red_flags = 0;
281 		if (flags & RMCF_ECN)
282 			red_flags |= REDF_ECN;
283 		if (flags & RMCF_FLOWVALVE)
284 			red_flags |= REDF_FLOWVALVE;
285 #ifdef ALTQ_RIO
286 		if (flags & RMCF_CLEARDSCP)
287 			red_flags |= RIOF_CLEARDSCP;
288 #endif
289 		red_pkttime = nsecPerByte * pktsize  / 1000;
290 
291 		if (flags & RMCF_RED) {
292 			cl->red_ = red_alloc(0, 0,
293 			    qlimit(cl->q_) * 10/100,
294 			    qlimit(cl->q_) * 30/100,
295 			    red_flags, red_pkttime);
296 			if (cl->red_ != NULL)
297 				qtype(cl->q_) = Q_RED;
298 		}
299 #ifdef ALTQ_RIO
300 		else {
301 			cl->red_ = (red_t *)rio_alloc(0, NULL,
302 						      red_flags, red_pkttime);
303 			if (cl->red_ != NULL)
304 				qtype(cl->q_) = Q_RIO;
305 		}
306 #endif
307 	}
308 #endif /* ALTQ_RED */
309 #ifdef ALTQ_CODEL
310 	if (flags & RMCF_CODEL) {
311 		cl->codel_ = codel_alloc(5, 100, 0);
312 		if (cl->codel_ != NULL)
313 			qtype(cl->q_) = Q_CODEL;
314 	}
315 #endif
316 
317 	/*
318 	 * put the class into the class tree
319 	 */
320 	s = splnet();
321 	IFQ_LOCK(ifd->ifq_);
322 	if ((peer = ifd->active_[pri]) != NULL) {
323 		/* find the last class at this pri */
324 		cl->peer_ = peer;
325 		while (peer->peer_ != ifd->active_[pri])
326 			peer = peer->peer_;
327 		peer->peer_ = cl;
328 	} else {
329 		ifd->active_[pri] = cl;
330 		cl->peer_ = cl;
331 	}
332 
333 	if (cl->parent_) {
334 		cl->next_ = parent->children_;
335 		parent->children_ = cl;
336 		parent->leaf_ = 0;
337 	}
338 
339 	/*
340 	 * Compute the depth of this class and its ancestors in the class
341 	 * hierarchy.
342 	 */
343 	rmc_depth_compute(cl);
344 
345 	/*
346 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
347 	 */
348 	if (ifd->wrr_) {
349 		ifd->num_[pri]++;
350 		ifd->alloc_[pri] += cl->allotment_;
351 		rmc_wrr_set_weights(ifd);
352 	}
353 	IFQ_UNLOCK(ifd->ifq_);
354 	splx(s);
355 	return (cl);
356 }
357 
358 int
359 rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
360     int minidle, u_int offtime, int pktsize)
361 {
362 	struct rm_ifdat	*ifd;
363 	u_int		 old_allotment;
364 	int		 s;
365 
366 	ifd = cl->ifdat_;
367 	old_allotment = cl->allotment_;
368 
369 	s = splnet();
370 	IFQ_LOCK(ifd->ifq_);
371 	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
372 	cl->qthresh_ = 0;
373 	cl->ns_per_byte_ = nsecPerByte;
374 
375 	qlimit(cl->q_) = maxq;
376 
377 #if 1 /* minidle is also scaled in ALTQ */
378 	cl->minidle_ = (minidle * nsecPerByte) / 8;
379 	if (cl->minidle_ > 0)
380 		cl->minidle_ = 0;
381 #else
382 	cl->minidle_ = minidle;
383 #endif
384 	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
385 	if (cl->maxidle_ == 0)
386 		cl->maxidle_ = 1;
387 #if 1 /* offtime is also scaled in ALTQ */
388 	cl->avgidle_ = cl->maxidle_;
389 	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
390 	if (cl->offtime_ == 0)
391 		cl->offtime_ = 1;
392 #else
393 	cl->avgidle_ = 0;
394 	cl->offtime_ = (offtime * nsecPerByte) / 8;
395 #endif
396 
397 	/*
398 	 * If CBQ's WRR is enabled, then initialize the class WRR state.
399 	 */
400 	if (ifd->wrr_) {
401 		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
402 		rmc_wrr_set_weights(ifd);
403 	}
404 	IFQ_UNLOCK(ifd->ifq_);
405 	splx(s);
406 	return (0);
407 }
408 
409 /*
410  * static void
411  * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
412  *	the appropriate run robin weights for the CBQ weighted round robin
413  *	algorithm.
414  *
415  *	Returns: NONE
416  */
417 
418 static void
419 rmc_wrr_set_weights(struct rm_ifdat *ifd)
420 {
421 	int		i;
422 	struct rm_class	*cl, *clh;
423 
424 	for (i = 0; i < RM_MAXPRIO; i++) {
425 		/*
426 		 * This is inverted from that of the simulator to
427 		 * maintain precision.
428 		 */
429 		if (ifd->num_[i] == 0)
430 			ifd->M_[i] = 0;
431 		else
432 			ifd->M_[i] = ifd->alloc_[i] /
433 				(ifd->num_[i] * ifd->maxpkt_);
434 		/*
435 		 * Compute the weighted allotment for each class.
436 		 * This takes the expensive div instruction out
437 		 * of the main loop for the wrr scheduling path.
438 		 * These only get recomputed when a class comes or
439 		 * goes.
440 		 */
441 		if (ifd->active_[i] != NULL) {
442 			clh = cl = ifd->active_[i];
443 			do {
444 				/* safe-guard for slow link or alloc_ == 0 */
445 				if (ifd->M_[i] == 0)
446 					cl->w_allotment_ = 0;
447 				else
448 					cl->w_allotment_ = cl->allotment_ /
449 						ifd->M_[i];
450 				cl = cl->peer_;
451 			} while ((cl != NULL) && (cl != clh));
452 		}
453 	}
454 }
455 
456 int
457 rmc_get_weight(struct rm_ifdat *ifd, int pri)
458 {
459 	if ((pri >= 0) && (pri < RM_MAXPRIO))
460 		return (ifd->M_[pri]);
461 	else
462 		return (0);
463 }
464 
465 /*
466  * static void
467  * rmc_depth_compute(struct rm_class *cl) - This function computes the
468  *	appropriate depth of class 'cl' and its ancestors.
469  *
470  *	Returns:	NONE
471  */
472 
473 static void
474 rmc_depth_compute(struct rm_class *cl)
475 {
476 	rm_class_t	*t = cl, *p;
477 
478 	/*
479 	 * Recompute the depth for the branch of the tree.
480 	 */
481 	while (t != NULL) {
482 		p = t->parent_;
483 		if (p && (t->depth_ >= p->depth_)) {
484 			p->depth_ = t->depth_ + 1;
485 			t = p;
486 		} else
487 			t = NULL;
488 	}
489 }
490 
491 /*
492  * static void
493  * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
494  *	the depth of the tree after a class has been deleted.
495  *
496  *	Returns:	NONE
497  */
498 
499 static void
500 rmc_depth_recompute(rm_class_t *cl)
501 {
502 #if 1 /* ALTQ */
503 	rm_class_t	*p, *t;
504 
505 	p = cl;
506 	while (p != NULL) {
507 		if ((t = p->children_) == NULL) {
508 			p->depth_ = 0;
509 		} else {
510 			int cdepth = 0;
511 
512 			while (t != NULL) {
513 				if (t->depth_ > cdepth)
514 					cdepth = t->depth_;
515 				t = t->next_;
516 			}
517 
518 			if (p->depth_ == cdepth + 1)
519 				/* no change to this parent */
520 				return;
521 
522 			p->depth_ = cdepth + 1;
523 		}
524 
525 		p = p->parent_;
526 	}
527 #else
528 	rm_class_t	*t;
529 
530 	if (cl->depth_ >= 1) {
531 		if (cl->children_ == NULL) {
532 			cl->depth_ = 0;
533 		} else if ((t = cl->children_) != NULL) {
534 			while (t != NULL) {
535 				if (t->children_ != NULL)
536 					rmc_depth_recompute(t);
537 				t = t->next_;
538 			}
539 		} else
540 			rmc_depth_compute(cl);
541 	}
542 #endif
543 }
544 
545 /*
546  * void
547  * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
548  *	function deletes a class from the link-sharing structure and frees
549  *	all resources associated with the class.
550  *
551  *	Returns: NONE
552  */
553 
554 void
555 rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
556 {
557 	struct rm_class	*p, *head, *previous;
558 	int		 s;
559 
560 	ASSERT(cl->children_ == NULL);
561 
562 	if (cl->sleeping_)
563 		CALLOUT_STOP(&cl->callout_);
564 
565 	s = splnet();
566 	IFQ_LOCK(ifd->ifq_);
567 	/*
568 	 * Free packets in the packet queue.
569 	 * XXX - this may not be a desired behavior.  Packets should be
570 	 *		re-queued.
571 	 */
572 	rmc_dropall(cl);
573 
574 	/*
575 	 * If the class has a parent, then remove the class from the
576 	 * class from the parent's children chain.
577 	 */
578 	if (cl->parent_ != NULL) {
579 		head = cl->parent_->children_;
580 		p = previous = head;
581 		if (head->next_ == NULL) {
582 			ASSERT(head == cl);
583 			cl->parent_->children_ = NULL;
584 			cl->parent_->leaf_ = 1;
585 		} else while (p != NULL) {
586 			if (p == cl) {
587 				if (cl == head)
588 					cl->parent_->children_ = cl->next_;
589 				else
590 					previous->next_ = cl->next_;
591 				cl->next_ = NULL;
592 				p = NULL;
593 			} else {
594 				previous = p;
595 				p = p->next_;
596 			}
597 		}
598 	}
599 
600 	/*
601 	 * Delete class from class priority peer list.
602 	 */
603 	if ((p = ifd->active_[cl->pri_]) != NULL) {
604 		/*
605 		 * If there is more than one member of this priority
606 		 * level, then look for class(cl) in the priority level.
607 		 */
608 		if (p != p->peer_) {
609 			while (p->peer_ != cl)
610 				p = p->peer_;
611 			p->peer_ = cl->peer_;
612 
613 			if (ifd->active_[cl->pri_] == cl)
614 				ifd->active_[cl->pri_] = cl->peer_;
615 		} else {
616 			ASSERT(p == cl);
617 			ifd->active_[cl->pri_] = NULL;
618 		}
619 	}
620 
621 	/*
622 	 * Recompute the WRR weights.
623 	 */
624 	if (ifd->wrr_) {
625 		ifd->alloc_[cl->pri_] -= cl->allotment_;
626 		ifd->num_[cl->pri_]--;
627 		rmc_wrr_set_weights(ifd);
628 	}
629 
630 	/*
631 	 * Re-compute the depth of the tree.
632 	 */
633 #if 1 /* ALTQ */
634 	rmc_depth_recompute(cl->parent_);
635 #else
636 	rmc_depth_recompute(ifd->root_);
637 #endif
638 
639 	IFQ_UNLOCK(ifd->ifq_);
640 	splx(s);
641 
642 	/*
643 	 * Free the class structure.
644 	 */
645 	if (cl->red_ != NULL) {
646 #ifdef ALTQ_RIO
647 		if (q_is_rio(cl->q_))
648 			rio_destroy((rio_t *)cl->red_);
649 #endif
650 #ifdef ALTQ_RED
651 		if (q_is_red(cl->q_))
652 			red_destroy(cl->red_);
653 #endif
654 #ifdef ALTQ_CODEL
655 		if (q_is_codel(cl->q_))
656 			codel_destroy(cl->codel_);
657 #endif
658 	}
659 	free(cl->q_, M_DEVBUF);
660 	free(cl, M_DEVBUF);
661 }
662 
663 
664 /*
665  * void
666  * rmc_init(...) - Initialize the resource management data structures
667  *	associated with the output portion of interface 'ifp'.  'ifd' is
668  *	where the structures will be built (for backwards compatibility, the
669  *	structures aren't kept in the ifnet struct).  'nsecPerByte'
670  *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
671  *	'restart' is the driver-specific routine that the generic 'delay
672  *	until under limit' action will call to restart output.  `maxq'
673  *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
674  *	is the maximum number of packets that the resource management
675  *	code will allow to be queued 'downstream' (this is typically 1).
676  *
677  *	Returns:	NONE
678  */
679 
680 void
681 rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
682     void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
683     int minidle, u_int offtime, int flags)
684 {
685 	int		i, mtu;
686 
687 	/*
688 	 * Initialize the CBQ tracing/debug facility.
689 	 */
690 	CBQTRACEINIT();
691 
692 	bzero((char *)ifd, sizeof (*ifd));
693 	mtu = ifq->altq_ifp->if_mtu;
694 	ifd->ifq_ = ifq;
695 	ifd->restart = restart;
696 	ifd->maxqueued_ = maxqueued;
697 	ifd->ns_per_byte_ = nsecPerByte;
698 	ifd->maxpkt_ = mtu;
699 	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
700 	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
701 #if 1
702 	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
703 	if (mtu * nsecPerByte > 10 * 1000000)
704 		ifd->maxiftime_ /= 4;
705 #endif
706 
707 	reset_cutoff(ifd);
708 	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);
709 
710 	/*
711 	 * Initialize the CBQ's WRR state.
712 	 */
713 	for (i = 0; i < RM_MAXPRIO; i++) {
714 		ifd->alloc_[i] = 0;
715 		ifd->M_[i] = 0;
716 		ifd->num_[i] = 0;
717 		ifd->na_[i] = 0;
718 		ifd->active_[i] = NULL;
719 	}
720 
721 	/*
722 	 * Initialize current packet state.
723 	 */
724 	ifd->qi_ = 0;
725 	ifd->qo_ = 0;
726 	for (i = 0; i < RM_MAXQUEUED; i++) {
727 		ifd->class_[i] = NULL;
728 		ifd->curlen_[i] = 0;
729 		ifd->borrowed_[i] = NULL;
730 	}
731 
732 	/*
733 	 * Create the root class of the link-sharing structure.
734 	 */
735 	if ((ifd->root_ = rmc_newclass(0, ifd,
736 				       nsecPerByte,
737 				       rmc_root_overlimit, maxq, 0, 0,
738 				       maxidle, minidle, offtime,
739 				       0, 0)) == NULL) {
740 		printf("rmc_init: root class not allocated\n");
741 		return ;
742 	}
743 	ifd->root_->depth_ = 0;
744 }
745 
746 /*
747  * void
748  * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
749  *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
750  *	by a driver's if_output routine.  This routine must be called with
751  *	output packet completion interrupts locked out (to avoid racing with
752  *	rmc_dequeue_next).
753  *
754  *	Returns:	0 on successful queueing
755  *			-1 when packet drop occurs
756  */
757 int
758 rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
759 {
760 	struct timeval	 now;
761 	struct rm_ifdat *ifd = cl->ifdat_;
762 	int		 cpri = cl->pri_;
763 	int		 is_empty = qempty(cl->q_);
764 
765 	RM_GETTIME(now);
766 	if (ifd->cutoff_ > 0) {
767 		if (TV_LT(&cl->undertime_, &now)) {
768 			if (ifd->cutoff_ > cl->depth_)
769 				ifd->cutoff_ = cl->depth_;
770 			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
771 		}
772 #if 1 /* ALTQ */
773 		else {
774 			/*
775 			 * the class is overlimit. if the class has
776 			 * underlimit ancestors, set cutoff to the lowest
777 			 * depth among them.
778 			 */
779 			struct rm_class *borrow = cl->borrow_;
780 
781 			while (borrow != NULL &&
782 			       borrow->depth_ < ifd->cutoff_) {
783 				if (TV_LT(&borrow->undertime_, &now)) {
784 					ifd->cutoff_ = borrow->depth_;
785 					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
786 					break;
787 				}
788 				borrow = borrow->borrow_;
789 			}
790 		}
791 #else /* !ALTQ */
792 		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
793 			if (TV_LT(&cl->borrow_->undertime_, &now)) {
794 				ifd->cutoff_ = cl->borrow_->depth_;
795 				CBQTRACE(rmc_queue_packet, 'ffob',
796 					 cl->borrow_->depth_);
797 			}
798 		}
799 #endif /* !ALTQ */
800 	}
801 
802 	if (_rmc_addq(cl, m) < 0)
803 		/* failed */
804 		return (-1);
805 
806 	if (is_empty) {
807 		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
808 		ifd->na_[cpri]++;
809 	}
810 
811 	if (qlen(cl->q_) > qlimit(cl->q_)) {
812 		/* note: qlimit can be set to 0 or 1 */
813 		rmc_drop_action(cl);
814 		return (-1);
815 	}
816 	return (0);
817 }
818 
819 /*
820  * void
821  * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
822  *	classes to see if there are satified.
823  */
824 
825 static void
826 rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
827 {
828 	int		 i;
829 	rm_class_t	*p, *bp;
830 
831 	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
832 		if ((bp = ifd->active_[i]) != NULL) {
833 			p = bp;
834 			do {
835 				if (!rmc_satisfied(p, now)) {
836 					ifd->cutoff_ = p->depth_;
837 					return;
838 				}
839 				p = p->peer_;
840 			} while (p != bp);
841 		}
842 	}
843 
844 	reset_cutoff(ifd);
845 }
846 
847 /*
848  * rmc_satisfied - Return 1 of the class is satisfied.  O, otherwise.
849  */
850 
851 static int
852 rmc_satisfied(struct rm_class *cl, struct timeval *now)
853 {
854 	rm_class_t	*p;
855 
856 	if (cl == NULL)
857 		return (1);
858 	if (TV_LT(now, &cl->undertime_))
859 		return (1);
860 	if (cl->depth_ == 0) {
861 		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
862 			return (0);
863 		else
864 			return (1);
865 	}
866 	if (cl->children_ != NULL) {
867 		p = cl->children_;
868 		while (p != NULL) {
869 			if (!rmc_satisfied(p, now))
870 				return (0);
871 			p = p->next_;
872 		}
873 	}
874 
875 	return (1);
876 }
877 
878 /*
879  * Return 1 if class 'cl' is under limit or can borrow from a parent,
880  * 0 if overlimit.  As a side-effect, this routine will invoke the
881  * class overlimit action if the class if overlimit.
882  */
883 
884 static int
885 rmc_under_limit(struct rm_class *cl, struct timeval *now)
886 {
887 	rm_class_t	*p = cl;
888 	rm_class_t	*top;
889 	struct rm_ifdat	*ifd = cl->ifdat_;
890 
891 	ifd->borrowed_[ifd->qi_] = NULL;
892 	/*
893 	 * If cl is the root class, then always return that it is
894 	 * underlimit.  Otherwise, check to see if the class is underlimit.
895 	 */
896 	if (cl->parent_ == NULL)
897 		return (1);
898 
899 	if (cl->sleeping_) {
900 		if (TV_LT(now, &cl->undertime_))
901 			return (0);
902 
903 		CALLOUT_STOP(&cl->callout_);
904 		cl->sleeping_ = 0;
905 		cl->undertime_.tv_sec = 0;
906 		return (1);
907 	}
908 
909 	top = NULL;
910 	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
911 		if (((cl = cl->borrow_) == NULL) ||
912 		    (cl->depth_ > ifd->cutoff_)) {
913 #ifdef ADJUST_CUTOFF
914 			if (cl != NULL)
915 				/* cutoff is taking effect, just
916 				   return false without calling
917 				   the delay action. */
918 				return (0);
919 #endif
920 #ifdef BORROW_OFFTIME
921 			/*
922 			 * check if the class can borrow offtime too.
923 			 * borrow offtime from the top of the borrow
924 			 * chain if the top class is not overloaded.
925 			 */
926 			if (cl != NULL) {
927 				/* cutoff is taking effect, use this class as top. */
928 				top = cl;
929 				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
930 			}
931 			if (top != NULL && top->avgidle_ == top->minidle_)
932 				top = NULL;
933 			p->overtime_ = *now;
934 			(p->overlimit)(p, top);
935 #else
936 			p->overtime_ = *now;
937 			(p->overlimit)(p, NULL);
938 #endif
939 			return (0);
940 		}
941 		top = cl;
942 	}
943 
944 	if (cl != p)
945 		ifd->borrowed_[ifd->qi_] = cl;
946 	return (1);
947 }
948 
949 /*
950  * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to
951  *	Packet-by-packet round robin.
952  *
953  * The heart of the weighted round-robin scheduler, which decides which
954  * class next gets to send a packet.  Highest priority first, then
955  * weighted round-robin within priorites.
956  *
957  * Each able-to-send class gets to send until its byte allocation is
958  * exhausted.  Thus, the active pointer is only changed after a class has
959  * exhausted its allocation.
960  *
961  * If the scheduler finds no class that is underlimit or able to borrow,
962  * then the first class found that had a nonzero queue and is allowed to
963  * borrow gets to send.
964  */
965 
966 static mbuf_t *
967 _rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
968 {
969 	struct rm_class	*cl = NULL, *first = NULL;
970 	u_int		 deficit;
971 	int		 cpri;
972 	mbuf_t		*m;
973 	struct timeval	 now;
974 
975 	RM_GETTIME(now);
976 
977 	/*
978 	 * if the driver polls the top of the queue and then removes
979 	 * the polled packet, we must return the same packet.
980 	 */
981 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
982 		cl = ifd->pollcache_;
983 		cpri = cl->pri_;
984 		if (ifd->efficient_) {
985 			/* check if this class is overlimit */
986 			if (cl->undertime_.tv_sec != 0 &&
987 			    rmc_under_limit(cl, &now) == 0)
988 				first = cl;
989 		}
990 		ifd->pollcache_ = NULL;
991 		goto _wrr_out;
992 	}
993 	else {
994 		/* mode == ALTDQ_POLL || pollcache == NULL */
995 		ifd->pollcache_ = NULL;
996 		ifd->borrowed_[ifd->qi_] = NULL;
997 	}
998 #ifdef ADJUST_CUTOFF
999  _again:
1000 #endif
1001 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
1002 		if (ifd->na_[cpri] == 0)
1003 			continue;
1004 		deficit = 0;
1005 		/*
1006 		 * Loop through twice for a priority level, if some class
1007 		 * was unable to send a packet the first round because
1008 		 * of the weighted round-robin mechanism.
1009 		 * During the second loop at this level, deficit==2.
1010 		 * (This second loop is not needed if for every class,
1011 		 * "M[cl->pri_])" times "cl->allotment" is greater than
1012 		 * the byte size for the largest packet in the class.)
1013 		 */
1014  _wrr_loop:
1015 		cl = ifd->active_[cpri];
1016 		ASSERT(cl != NULL);
1017 		do {
1018 			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
1019 				cl->bytes_alloc_ += cl->w_allotment_;
1020 			if (!qempty(cl->q_)) {
1021 				if ((cl->undertime_.tv_sec == 0) ||
1022 				    rmc_under_limit(cl, &now)) {
1023 					if (cl->bytes_alloc_ > 0 || deficit > 1)
1024 						goto _wrr_out;
1025 
1026 					/* underlimit but no alloc */
1027 					deficit = 1;
1028 #if 1
1029 					ifd->borrowed_[ifd->qi_] = NULL;
1030 #endif
1031 				}
1032 				else if (first == NULL && cl->borrow_ != NULL)
1033 					first = cl; /* borrowing candidate */
1034 			}
1035 
1036 			cl->bytes_alloc_ = 0;
1037 			cl = cl->peer_;
1038 		} while (cl != ifd->active_[cpri]);
1039 
1040 		if (deficit == 1) {
1041 			/* first loop found an underlimit class with deficit */
1042 			/* Loop on same priority level, with new deficit.  */
1043 			deficit = 2;
1044 			goto _wrr_loop;
1045 		}
1046 	}
1047 
1048 #ifdef ADJUST_CUTOFF
1049 	/*
1050 	 * no underlimit class found.  if cutoff is taking effect,
1051 	 * increase cutoff and try again.
1052 	 */
1053 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1054 		ifd->cutoff_++;
1055 		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
1056 		goto _again;
1057 	}
1058 #endif /* ADJUST_CUTOFF */
1059 	/*
1060 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
1061 	 * class we encounter will send a packet if all the classes
1062 	 * of the link-sharing structure are overlimit.
1063 	 */
1064 	reset_cutoff(ifd);
1065 	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);
1066 
1067 	if (!ifd->efficient_ || first == NULL)
1068 		return (NULL);
1069 
1070 	cl = first;
1071 	cpri = cl->pri_;
1072 #if 0	/* too time-consuming for nothing */
1073 	if (cl->sleeping_)
1074 		CALLOUT_STOP(&cl->callout_);
1075 	cl->sleeping_ = 0;
1076 	cl->undertime_.tv_sec = 0;
1077 #endif
1078 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
1079 	ifd->cutoff_ = cl->borrow_->depth_;
1080 
1081 	/*
1082 	 * Deque the packet and do the book keeping...
1083 	 */
1084  _wrr_out:
1085 	if (op == ALTDQ_REMOVE) {
1086 		m = _rmc_getq(cl);
1087 		if (m == NULL)
1088 			panic("_rmc_wrr_dequeue_next");
1089 		if (qempty(cl->q_))
1090 			ifd->na_[cpri]--;
1091 
1092 		/*
1093 		 * Update class statistics and link data.
1094 		 */
1095 		if (cl->bytes_alloc_ > 0)
1096 			cl->bytes_alloc_ -= m_pktlen(m);
1097 
1098 		if ((cl->bytes_alloc_ <= 0) || first == cl)
1099 			ifd->active_[cl->pri_] = cl->peer_;
1100 		else
1101 			ifd->active_[cl->pri_] = cl;
1102 
1103 		ifd->class_[ifd->qi_] = cl;
1104 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
1105 		ifd->now_[ifd->qi_] = now;
1106 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1107 		ifd->queued_++;
1108 	} else {
1109 		/* mode == ALTDQ_PPOLL */
1110 		m = _rmc_pollq(cl);
1111 		ifd->pollcache_ = cl;
1112 	}
1113 	return (m);
1114 }
1115 
1116 /*
1117  * Dequeue & return next packet from the highest priority class that
1118  * has a packet to send & has enough allocation to send it.  This
1119  * routine is called by a driver whenever it needs a new packet to
1120  * output.
1121  */
1122 static mbuf_t *
1123 _rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
1124 {
1125 	mbuf_t		*m;
1126 	int		 cpri;
1127 	struct rm_class	*cl, *first = NULL;
1128 	struct timeval	 now;
1129 
1130 	RM_GETTIME(now);
1131 
1132 	/*
1133 	 * if the driver polls the top of the queue and then removes
1134 	 * the polled packet, we must return the same packet.
1135 	 */
1136 	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
1137 		cl = ifd->pollcache_;
1138 		cpri = cl->pri_;
1139 		ifd->pollcache_ = NULL;
1140 		goto _prr_out;
1141 	} else {
1142 		/* mode == ALTDQ_POLL || pollcache == NULL */
1143 		ifd->pollcache_ = NULL;
1144 		ifd->borrowed_[ifd->qi_] = NULL;
1145 	}
1146 #ifdef ADJUST_CUTOFF
1147  _again:
1148 #endif
1149 	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
1150 		if (ifd->na_[cpri] == 0)
1151 			continue;
1152 		cl = ifd->active_[cpri];
1153 		ASSERT(cl != NULL);
1154 		do {
1155 			if (!qempty(cl->q_)) {
1156 				if ((cl->undertime_.tv_sec == 0) ||
1157 				    rmc_under_limit(cl, &now))
1158 					goto _prr_out;
1159 				if (first == NULL && cl->borrow_ != NULL)
1160 					first = cl;
1161 			}
1162 			cl = cl->peer_;
1163 		} while (cl != ifd->active_[cpri]);
1164 	}
1165 
1166 #ifdef ADJUST_CUTOFF
1167 	/*
1168 	 * no underlimit class found.  if cutoff is taking effect, increase
1169 	 * cutoff and try again.
1170 	 */
1171 	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
1172 		ifd->cutoff_++;
1173 		goto _again;
1174 	}
1175 #endif /* ADJUST_CUTOFF */
1176 	/*
1177 	 * If LINK_EFFICIENCY is turned on, then the first overlimit
1178 	 * class we encounter will send a packet if all the classes
1179 	 * of the link-sharing structure are overlimit.
1180 	 */
1181 	reset_cutoff(ifd);
1182 	if (!ifd->efficient_ || first == NULL)
1183 		return (NULL);
1184 
1185 	cl = first;
1186 	cpri = cl->pri_;
1187 #if 0	/* too time-consuming for nothing */
1188 	if (cl->sleeping_)
1189 		CALLOUT_STOP(&cl->callout_);
1190 	cl->sleeping_ = 0;
1191 	cl->undertime_.tv_sec = 0;
1192 #endif
1193 	ifd->borrowed_[ifd->qi_] = cl->borrow_;
1194 	ifd->cutoff_ = cl->borrow_->depth_;
1195 
1196 	/*
1197 	 * Deque the packet and do the book keeping...
1198 	 */
1199  _prr_out:
1200 	if (op == ALTDQ_REMOVE) {
1201 		m = _rmc_getq(cl);
1202 		if (m == NULL)
1203 			panic("_rmc_prr_dequeue_next");
1204 		if (qempty(cl->q_))
1205 			ifd->na_[cpri]--;
1206 
1207 		ifd->active_[cpri] = cl->peer_;
1208 
1209 		ifd->class_[ifd->qi_] = cl;
1210 		ifd->curlen_[ifd->qi_] = m_pktlen(m);
1211 		ifd->now_[ifd->qi_] = now;
1212 		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
1213 		ifd->queued_++;
1214 	} else {
1215 		/* mode == ALTDQ_POLL */
1216 		m = _rmc_pollq(cl);
1217 		ifd->pollcache_ = cl;
1218 	}
1219 	return (m);
1220 }
1221 
1222 /*
1223  * mbuf_t *
1224  * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function
1225  *	is invoked by the packet driver to get the next packet to be
1226  *	dequeued and output on the link.  If WRR is enabled, then the
1227  *	WRR dequeue next routine will determine the next packet to sent.
1228  *	Otherwise, packet-by-packet round robin is invoked.
1229  *
1230  *	Returns:	NULL, if a packet is not available or if all
1231  *			classes are overlimit.
1232  *
1233  *			Otherwise, Pointer to the next packet.
1234  */
1235 
1236 mbuf_t *
1237 rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
1238 {
1239 	if (ifd->queued_ >= ifd->maxqueued_)
1240 		return (NULL);
1241 	else if (ifd->wrr_)
1242 		return (_rmc_wrr_dequeue_next(ifd, mode));
1243 	else
1244 		return (_rmc_prr_dequeue_next(ifd, mode));
1245 }
1246 
1247 /*
1248  * Update the utilization estimate for the packet that just completed.
1249  * The packet's class & the parent(s) of that class all get their
1250  * estimators updated.  This routine is called by the driver's output-
1251  * packet-completion interrupt service routine.
1252  */
1253 
1254 /*
1255  * a macro to approximate "divide by 1000" that gives 0.000999,
1256  * if a value has enough effective digits.
1257  * (on pentium, mul takes 9 cycles but div takes 46!)
1258  */
1259 #define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
1260 void
1261 rmc_update_class_util(struct rm_ifdat *ifd)
1262 {
1263 	int		 idle, avgidle, pktlen;
1264 	int		 pkt_time, tidle;
1265 	rm_class_t	*cl, *borrowed;
1266 	rm_class_t	*borrows;
1267 	struct timeval	*nowp;
1268 
1269 	/*
1270 	 * Get the most recent completed class.
1271 	 */
1272 	if ((cl = ifd->class_[ifd->qo_]) == NULL)
1273 		return;
1274 
1275 	pktlen = ifd->curlen_[ifd->qo_];
1276 	borrowed = ifd->borrowed_[ifd->qo_];
1277 	borrows = borrowed;
1278 
1279 	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1280 
1281 	/*
1282 	 * Run estimator on class and its ancestors.
1283 	 */
1284 	/*
1285 	 * rm_update_class_util is designed to be called when the
1286 	 * transfer is completed from a xmit complete interrupt,
1287 	 * but most drivers don't implement an upcall for that.
1288 	 * so, just use estimated completion time.
1289 	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
1290 	 */
1291 	nowp = &ifd->now_[ifd->qo_];
1292 	/* get pkt_time (for link) in usec */
1293 #if 1  /* use approximation */
1294 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
1295 	pkt_time = NSEC_TO_USEC(pkt_time);
1296 #else
1297 	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
1298 #endif
1299 #if 1 /* ALTQ4PPP */
1300 	if (TV_LT(nowp, &ifd->ifnow_)) {
1301 		int iftime;
1302 
1303 		/*
1304 		 * make sure the estimated completion time does not go
1305 		 * too far.  it can happen when the link layer supports
1306 		 * data compression or the interface speed is set to
1307 		 * a much lower value.
1308 		 */
1309 		TV_DELTA(&ifd->ifnow_, nowp, iftime);
1310 		if (iftime+pkt_time < ifd->maxiftime_) {
1311 			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1312 		} else {
1313 			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
1314 		}
1315 	} else {
1316 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1317 	}
1318 #else
1319 	if (TV_LT(nowp, &ifd->ifnow_)) {
1320 		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
1321 	} else {
1322 		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
1323 	}
1324 #endif
1325 
1326 	while (cl != NULL) {
1327 		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
1328 		if (idle >= 2000000)
1329 			/*
1330 			 * this class is idle enough, reset avgidle.
1331 			 * (TV_DELTA returns 2000000 us when delta is large.)
1332 			 */
1333 			cl->avgidle_ = cl->maxidle_;
1334 
1335 		/* get pkt_time (for class) in usec */
1336 #if 1  /* use approximation */
1337 		pkt_time = pktlen * cl->ns_per_byte_;
1338 		pkt_time = NSEC_TO_USEC(pkt_time);
1339 #else
1340 		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
1341 #endif
1342 		idle -= pkt_time;
1343 
1344 		avgidle = cl->avgidle_;
1345 		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
1346 		cl->avgidle_ = avgidle;
1347 
1348 		/* Are we overlimit ? */
1349 		if (avgidle <= 0) {
1350 			CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
1351 #if 1 /* ALTQ */
1352 			/*
1353 			 * need some lower bound for avgidle, otherwise
1354 			 * a borrowing class gets unbounded penalty.
1355 			 */
1356 			if (avgidle < cl->minidle_)
1357 				avgidle = cl->avgidle_ = cl->minidle_;
1358 #endif
1359 			/* set next idle to make avgidle 0 */
1360 			tidle = pkt_time +
1361 				(((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
1362 			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
1363 			++cl->stats_.over;
1364 		} else {
1365 			cl->avgidle_ =
1366 			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
1367 			cl->undertime_.tv_sec = 0;
1368 			if (cl->sleeping_) {
1369 				CALLOUT_STOP(&cl->callout_);
1370 				cl->sleeping_ = 0;
1371 			}
1372 		}
1373 
1374 		if (borrows != NULL) {
1375 			if (borrows != cl)
1376 				++cl->stats_.borrows;
1377 			else
1378 				borrows = NULL;
1379 		}
1380 		cl->last_ = ifd->ifnow_;
1381 		cl->last_pkttime_ = pkt_time;
1382 
1383 #if 1
1384 		if (cl->parent_ == NULL) {
1385 			/* take stats of root class */
1386 			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
1387 		}
1388 #endif
1389 
1390 		cl = cl->parent_;
1391 	}
1392 
1393 	/*
1394 	 * Check to see if cutoff needs to set to a new level.
1395 	 */
1396 	cl = ifd->class_[ifd->qo_];
1397 	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
1398 #if 1 /* ALTQ */
1399 		if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
1400 			rmc_tl_satisfied(ifd, nowp);
1401 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1402 		} else {
1403 			ifd->cutoff_ = borrowed->depth_;
1404 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1405 		}
1406 #else /* !ALTQ */
1407 		if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
1408 			reset_cutoff(ifd);
1409 #ifdef notdef
1410 			rmc_tl_satisfied(ifd, &now);
1411 #endif
1412 			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
1413 		} else {
1414 			ifd->cutoff_ = borrowed->depth_;
1415 			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
1416 		}
1417 #endif /* !ALTQ */
1418 	}
1419 
1420 	/*
1421 	 * Release class slot
1422 	 */
1423 	ifd->borrowed_[ifd->qo_] = NULL;
1424 	ifd->class_[ifd->qo_] = NULL;
1425 	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
1426 	ifd->queued_--;
1427 }
1428 
1429 /*
1430  * void
1431  * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
1432  *	over-limit action routines.  These get invoked by rmc_under_limit()
1433  *	if a class with packets to send if over its bandwidth limit & can't
1434  *	borrow from a parent class.
1435  *
1436  *	Returns: NONE
1437  */
1438 
1439 static void
1440 rmc_drop_action(struct rm_class *cl)
1441 {
1442 	struct rm_ifdat	*ifd = cl->ifdat_;
1443 
1444 	ASSERT(qlen(cl->q_) > 0);
1445 	_rmc_dropq(cl);
1446 	if (qempty(cl->q_))
1447 		ifd->na_[cl->pri_]--;
1448 }
1449 
1450 void rmc_dropall(struct rm_class *cl)
1451 {
1452 	struct rm_ifdat	*ifd = cl->ifdat_;
1453 
1454 	if (!qempty(cl->q_)) {
1455 		_flushq(cl->q_);
1456 
1457 		ifd->na_[cl->pri_]--;
1458 	}
1459 }
1460 
1461 #if (__FreeBSD_version > 300000)
1462 /* hzto() is removed from FreeBSD-3.0 */
1463 static int hzto(struct timeval *);
1464 
1465 static int
1466 hzto(tv)
1467 	struct timeval *tv;
1468 {
1469 	struct timeval t2;
1470 
1471 	getmicrotime(&t2);
1472 	t2.tv_sec = tv->tv_sec - t2.tv_sec;
1473 	t2.tv_usec = tv->tv_usec - t2.tv_usec;
1474 	return (tvtohz(&t2));
1475 }
1476 #endif /* __FreeBSD_version > 300000 */
1477 
1478 /*
1479  * void
1480  * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
1481  *	delay action routine.  It is invoked via rmc_under_limit when the
1482  *	packet is discoverd to be overlimit.
1483  *
1484  *	If the delay action is result of borrow class being overlimit, then
1485  *	delay for the offtime of the borrowing class that is overlimit.
1486  *
1487  *	Returns: NONE
1488  */
1489 
1490 void
1491 rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
1492 {
1493 	int	delay, t, extradelay;
1494 
1495 	cl->stats_.overactions++;
1496 	TV_DELTA(&cl->undertime_, &cl->overtime_, delay);
1497 #ifndef BORROW_OFFTIME
1498 	delay += cl->offtime_;
1499 #endif
1500 
1501 	if (!cl->sleeping_) {
1502 		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
1503 #ifdef BORROW_OFFTIME
1504 		if (borrow != NULL)
1505 			extradelay = borrow->offtime_;
1506 		else
1507 #endif
1508 			extradelay = cl->offtime_;
1509 
1510 #ifdef ALTQ
1511 		/*
1512 		 * XXX recalculate suspend time:
1513 		 * current undertime is (tidle + pkt_time) calculated
1514 		 * from the last transmission.
1515 		 *	tidle: time required to bring avgidle back to 0
1516 		 *	pkt_time: target waiting time for this class
1517 		 * we need to replace pkt_time by offtime
1518 		 */
1519 		extradelay -= cl->last_pkttime_;
1520 #endif
1521 		if (extradelay > 0) {
1522 			TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
1523 			delay += extradelay;
1524 		}
1525 
1526 		cl->sleeping_ = 1;
1527 		cl->stats_.delays++;
1528 
1529 		/*
1530 		 * Since packets are phased randomly with respect to the
1531 		 * clock, 1 tick (the next clock tick) can be an arbitrarily
1532 		 * short time so we have to wait for at least two ticks.
1533 		 * NOTE:  If there's no other traffic, we need the timer as
1534 		 * a 'backstop' to restart this class.
1535 		 */
1536 		if (delay > tick * 2) {
1537 			/* FreeBSD rounds up the tick */
1538 			t = hzto(&cl->undertime_);
1539 		} else
1540 			t = 2;
1541 		CALLOUT_RESET(&cl->callout_, t,
1542 			      (timeout_t *)rmc_restart, (caddr_t)cl);
1543 	}
1544 }
1545 
1546 /*
1547  * void
1548  * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
1549  *	called by the system timer code & is responsible checking if the
1550  *	class is still sleeping (it might have been restarted as a side
1551  *	effect of the queue scan on a packet arrival) and, if so, restarting
1552  *	output for the class.  Inspecting the class state & restarting output
1553  *	require locking the class structure.  In general the driver is
1554  *	responsible for locking but this is the only routine that is not
1555  *	called directly or indirectly from the interface driver so it has
1556  *	know about system locking conventions.  Under bsd, locking is done
1557  *	by raising IPL to splimp so that's what's implemented here.  On a
1558  *	different system this would probably need to be changed.
1559  *
1560  *	Returns:	NONE
1561  */
1562 
1563 static void
1564 rmc_restart(struct rm_class *cl)
1565 {
1566 	struct rm_ifdat	*ifd = cl->ifdat_;
1567 	int		 s;
1568 
1569 	s = splnet();
1570 	IFQ_LOCK(ifd->ifq_);
1571 	if (cl->sleeping_) {
1572 		cl->sleeping_ = 0;
1573 		cl->undertime_.tv_sec = 0;
1574 
1575 		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
1576 			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
1577 			(ifd->restart)(ifd->ifq_);
1578 		}
1579 	}
1580 	IFQ_UNLOCK(ifd->ifq_);
1581 	splx(s);
1582 }
1583 
1584 /*
1585  * void
1586  * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit
1587  *	handling routine for the root class of the link sharing structure.
1588  *
1589  *	Returns: NONE
1590  */
1591 
1592 static void
1593 rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow)
1594 {
1595     panic("rmc_root_overlimit");
1596 }
1597 
1598 /*
1599  * Packet Queue handling routines.  Eventually, this is to localize the
1600  *	effects on the code whether queues are red queues or droptail
1601  *	queues.
1602  */
1603 
1604 static int
1605 _rmc_addq(rm_class_t *cl, mbuf_t *m)
1606 {
1607 #ifdef ALTQ_RIO
1608 	if (q_is_rio(cl->q_))
1609 		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
1610 #endif
1611 #ifdef ALTQ_RED
1612 	if (q_is_red(cl->q_))
1613 		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
1614 #endif /* ALTQ_RED */
1615 #ifdef ALTQ_CODEL
1616 	if (q_is_codel(cl->q_))
1617 		return codel_addq(cl->codel_, cl->q_, m);
1618 #endif
1619 
1620 	if (cl->flags_ & RMCF_CLEARDSCP)
1621 		write_dsfield(m, cl->pktattr_, 0);
1622 
1623 	_addq(cl->q_, m);
1624 	return (0);
1625 }
1626 
1627 /* note: _rmc_dropq is not called for red */
1628 static void
1629 _rmc_dropq(rm_class_t *cl)
1630 {
1631 	mbuf_t	*m;
1632 
1633 	if ((m = _getq(cl->q_)) != NULL)
1634 		m_freem(m);
1635 }
1636 
1637 static mbuf_t *
1638 _rmc_getq(rm_class_t *cl)
1639 {
1640 #ifdef ALTQ_RIO
1641 	if (q_is_rio(cl->q_))
1642 		return rio_getq((rio_t *)cl->red_, cl->q_);
1643 #endif
1644 #ifdef ALTQ_RED
1645 	if (q_is_red(cl->q_))
1646 		return red_getq(cl->red_, cl->q_);
1647 #endif
1648 #ifdef ALTQ_CODEL
1649 	if (q_is_codel(cl->q_))
1650 		return codel_getq(cl->codel_, cl->q_);
1651 #endif
1652 	return _getq(cl->q_);
1653 }
1654 
1655 static mbuf_t *
1656 _rmc_pollq(rm_class_t *cl)
1657 {
1658 	return qhead(cl->q_);
1659 }
1660 
1661 #ifdef CBQ_TRACE
1662 
1663 struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
1664 struct cbqtrace		*cbqtrace_ptr = NULL;
1665 int			 cbqtrace_count;
1666 
1667 /*
1668  * DDB hook to trace cbq events:
1669  *  the last 1024 events are held in a circular buffer.
1670  *  use "call cbqtrace_dump(N)" to display 20 events from Nth event.
1671  */
1672 void cbqtrace_dump(int);
1673 static char *rmc_funcname(void *);
1674 
1675 static struct rmc_funcs {
1676 	void	*func;
1677 	char	*name;
1678 } rmc_funcs[] =
1679 {
1680 	rmc_init,		"rmc_init",
1681 	rmc_queue_packet,	"rmc_queue_packet",
1682 	rmc_under_limit,	"rmc_under_limit",
1683 	rmc_update_class_util,	"rmc_update_class_util",
1684 	rmc_delay_action,	"rmc_delay_action",
1685 	rmc_restart,		"rmc_restart",
1686 	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
1687 	NULL,			NULL
1688 };
1689 
1690 static char *rmc_funcname(void *func)
1691 {
1692 	struct rmc_funcs *fp;
1693 
1694 	for (fp = rmc_funcs; fp->func != NULL; fp++)
1695 		if (fp->func == func)
1696 			return (fp->name);
1697 	return ("unknown");
1698 }
1699 
1700 void cbqtrace_dump(int counter)
1701 {
1702 	int	 i, *p;
1703 	char	*cp;
1704 
1705 	counter = counter % NCBQTRACE;
1706 	p = (int *)&cbqtrace_buffer[counter];
1707 
1708 	for (i=0; i<20; i++) {
1709 		printf("[0x%x] ", *p++);
1710 		printf("%s: ", rmc_funcname((void *)*p++));
1711 		cp = (char *)p++;
1712 		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
1713 		printf("%d\n",*p++);
1714 
1715 		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
1716 			p = (int *)cbqtrace_buffer;
1717 	}
1718 }
1719 #endif /* CBQ_TRACE */
1720 #endif /* ALTQ_CBQ */
1721 
1722 #if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || \
1723     defined(ALTQ_HFSC) || defined(ALTQ_PRIQ) || defined(ALTQ_CODEL)
1724 #if !defined(__GNUC__) || defined(ALTQ_DEBUG)
1725 
1726 void
1727 _addq(class_queue_t *q, mbuf_t *m)
1728 {
1729         mbuf_t	*m0;
1730 
1731 	if ((m0 = qtail(q)) != NULL)
1732 		m->m_nextpkt = m0->m_nextpkt;
1733 	else
1734 		m0 = m;
1735 	m0->m_nextpkt = m;
1736 	qtail(q) = m;
1737 	qlen(q)++;
1738 }
1739 
1740 mbuf_t *
1741 _getq(class_queue_t *q)
1742 {
1743 	mbuf_t	*m, *m0;
1744 
1745 	if ((m = qtail(q)) == NULL)
1746 		return (NULL);
1747 	if ((m0 = m->m_nextpkt) != m)
1748 		m->m_nextpkt = m0->m_nextpkt;
1749 	else {
1750 		ASSERT(qlen(q) == 1);
1751 		qtail(q) = NULL;
1752 	}
1753 	qlen(q)--;
1754 	m0->m_nextpkt = NULL;
1755 	return (m0);
1756 }
1757 
1758 /* drop a packet at the tail of the queue */
1759 mbuf_t *
1760 _getq_tail(class_queue_t *q)
1761 {
1762 	mbuf_t	*m, *m0, *prev;
1763 
1764 	if ((m = m0 = qtail(q)) == NULL)
1765 		return NULL;
1766 	do {
1767 		prev = m0;
1768 		m0 = m0->m_nextpkt;
1769 	} while (m0 != m);
1770 	prev->m_nextpkt = m->m_nextpkt;
1771 	if (prev == m)  {
1772 		ASSERT(qlen(q) == 1);
1773 		qtail(q) = NULL;
1774 	} else
1775 		qtail(q) = prev;
1776 	qlen(q)--;
1777 	m->m_nextpkt = NULL;
1778 	return (m);
1779 }
1780 
1781 /* randomly select a packet in the queue */
1782 mbuf_t *
1783 _getq_random(class_queue_t *q)
1784 {
1785 	struct mbuf	*m;
1786 	int		 i, n;
1787 
1788 	if ((m = qtail(q)) == NULL)
1789 		return NULL;
1790 	if (m->m_nextpkt == m) {
1791 		ASSERT(qlen(q) == 1);
1792 		qtail(q) = NULL;
1793 	} else {
1794 		struct mbuf *prev = NULL;
1795 
1796 		n = arc4random() % qlen(q) + 1;
1797 		for (i = 0; i < n; i++) {
1798 			prev = m;
1799 			m = m->m_nextpkt;
1800 		}
1801 		prev->m_nextpkt = m->m_nextpkt;
1802 		if (m == qtail(q))
1803 			qtail(q) = prev;
1804 	}
1805 	qlen(q)--;
1806 	m->m_nextpkt = NULL;
1807 	return (m);
1808 }
1809 
1810 void
1811 _removeq(class_queue_t *q, mbuf_t *m)
1812 {
1813 	mbuf_t	*m0, *prev;
1814 
1815 	m0 = qtail(q);
1816 	do {
1817 		prev = m0;
1818 		m0 = m0->m_nextpkt;
1819 	} while (m0 != m);
1820 	prev->m_nextpkt = m->m_nextpkt;
1821 	if (prev == m)
1822 		qtail(q) = NULL;
1823 	else if (qtail(q) == m)
1824 		qtail(q) = prev;
1825 	qlen(q)--;
1826 }
1827 
1828 void
1829 _flushq(class_queue_t *q)
1830 {
1831 	mbuf_t *m;
1832 
1833 	while ((m = _getq(q)) != NULL)
1834 		m_freem(m);
1835 	ASSERT(qlen(q) == 0);
1836 }
1837 
1838 #endif /* !__GNUC__ || ALTQ_DEBUG */
1839 #endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */
1840