xref: /freebsd/sys/net/altq/altq_red.c (revision d93a896ef95946b0bf1219866fcb324b78543444)
1 /*-
2  * Copyright (C) 1997-2003
3  *	Sony Computer Science Laboratories Inc.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  */
27 /*-
28  * Copyright (c) 1990-1994 Regents of the University of California.
29  * All rights reserved.
30  *
31  * Redistribution and use in source and binary forms, with or without
32  * modification, are permitted provided that the following conditions
33  * are met:
34  * 1. Redistributions of source code must retain the above copyright
35  *    notice, this list of conditions and the following disclaimer.
36  * 2. Redistributions in binary form must reproduce the above copyright
37  *    notice, this list of conditions and the following disclaimer in the
38  *    documentation and/or other materials provided with the distribution.
39  * 3. All advertising materials mentioning features or use of this software
40  *    must display the following acknowledgement:
41  *	This product includes software developed by the Computer Systems
42  *	Engineering Group at Lawrence Berkeley Laboratory.
43  * 4. Neither the name of the University nor of the Laboratory may be used
44  *    to endorse or promote products derived from this software without
45  *    specific prior written permission.
46  *
47  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
48  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
49  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
50  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
51  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
52  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
53  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
54  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
55  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
56  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57  * SUCH DAMAGE.
58  *
59  * $KAME: altq_red.c,v 1.18 2003/09/05 22:40:36 itojun Exp $
60  * $FreeBSD$
61  */
62 
63 #include "opt_altq.h"
64 #include "opt_inet.h"
65 #include "opt_inet6.h"
66 #ifdef ALTQ_RED	/* red is enabled by ALTQ_RED option in opt_altq.h */
67 
68 #include <sys/param.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/socket.h>
72 #include <sys/systm.h>
73 #include <sys/errno.h>
74 #if 1 /* ALTQ3_COMPAT */
75 #include <sys/sockio.h>
76 #include <sys/proc.h>
77 #include <sys/kernel.h>
78 #ifdef ALTQ_FLOWVALVE
79 #include <sys/queue.h>
80 #include <sys/time.h>
81 #endif
82 #endif /* ALTQ3_COMPAT */
83 
84 #include <net/if.h>
85 #include <net/if_var.h>
86 
87 #include <netinet/in.h>
88 #include <netinet/in_systm.h>
89 #include <netinet/ip.h>
90 #ifdef INET6
91 #include <netinet/ip6.h>
92 #endif
93 
94 #include <netpfil/pf/pf.h>
95 #include <netpfil/pf/pf_altq.h>
96 #include <netpfil/pf/pf_mtag.h>
97 #include <net/altq/altq.h>
98 #include <net/altq/altq_red.h>
99 #ifdef ALTQ3_COMPAT
100 #include <net/altq/altq_conf.h>
101 #ifdef ALTQ_FLOWVALVE
102 #include <net/altq/altq_flowvalve.h>
103 #endif
104 #endif
105 
106 /*
107  * ALTQ/RED (Random Early Detection) implementation using 32-bit
108  * fixed-point calculation.
109  *
110  * written by kjc using the ns code as a reference.
111  * you can learn more about red and ns from Sally's home page at
112  * http://www-nrg.ee.lbl.gov/floyd/
113  *
114  * most of the red parameter values are fixed in this implementation
115  * to prevent fixed-point overflow/underflow.
116  * if you change the parameters, watch out for overflow/underflow!
117  *
118  * the parameters used are recommended values by Sally.
119  * the corresponding ns config looks:
120  *	q_weight=0.00195
121  *	minthresh=5 maxthresh=15 queue-size=60
122  *	linterm=30
123  *	dropmech=drop-tail
124  *	bytes=false (can't be handled by 32-bit fixed-point)
125  *	doubleq=false dqthresh=false
126  *	wait=true
127  */
128 /*
129  * alternative red parameters for a slow link.
130  *
131  * assume the queue length becomes from zero to L and keeps L, it takes
132  * N packets for q_avg to reach 63% of L.
133  * when q_weight is 0.002, N is about 500 packets.
134  * for a slow link like dial-up, 500 packets takes more than 1 minute!
135  * when q_weight is 0.008, N is about 127 packets.
136  * when q_weight is 0.016, N is about 63 packets.
137  * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
138  * are allowed for 0.016.
139  * see Sally's paper for more details.
140  */
141 /* normal red parameters */
142 #define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
143 				/* q_weight = 0.00195 */
144 
145 /* red parameters for a slow link */
146 #define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
147 				/* q_weight = 0.0078125 */
148 
149 /* red parameters for a very slow link (e.g., dialup) */
150 #define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
151 				/* q_weight = 0.015625 */
152 
153 /* fixed-point uses 12-bit decimal places */
154 #define	FP_SHIFT	12	/* fixed-point shift */
155 
156 /* red parameters for drop probability */
157 #define	INV_P_MAX	10	/* inverse of max drop probability */
158 #define	TH_MIN		5	/* min threshold */
159 #define	TH_MAX		15	/* max threshold */
160 
161 #define	RED_LIMIT	60	/* default max queue length */
162 #define	RED_STATS		/* collect statistics */
163 
164 /*
165  * our default policy for forced-drop is drop-tail.
166  * (in altq-1.1.2 or earlier, the default was random-drop.
167  * but it makes more sense to punish the cause of the surge.)
168  * to switch to the random-drop policy, define "RED_RANDOM_DROP".
169  */
170 
171 #ifdef ALTQ3_COMPAT
172 #ifdef ALTQ_FLOWVALVE
173 /*
174  * flow-valve is an extension to protect red from unresponsive flows
175  * and to promote end-to-end congestion control.
176  * flow-valve observes the average drop rates of the flows that have
177  * experienced packet drops in the recent past.
178  * when the average drop rate exceeds the threshold, the flow is
179  * blocked by the flow-valve.  the trapped flow should back off
180  * exponentially to escape from the flow-valve.
181  */
182 #ifdef RED_RANDOM_DROP
183 #error "random-drop can't be used with flow-valve!"
184 #endif
185 #endif /* ALTQ_FLOWVALVE */
186 
187 /* red_list keeps all red_queue_t's allocated. */
188 static red_queue_t *red_list = NULL;
189 
190 #endif /* ALTQ3_COMPAT */
191 
192 /* default red parameter values */
193 static int default_th_min = TH_MIN;
194 static int default_th_max = TH_MAX;
195 static int default_inv_pmax = INV_P_MAX;
196 
197 #ifdef ALTQ3_COMPAT
198 /* internal function prototypes */
199 static int red_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
200 static struct mbuf *red_dequeue(struct ifaltq *, int);
201 static int red_request(struct ifaltq *, int, void *);
202 static void red_purgeq(red_queue_t *);
203 static int red_detach(red_queue_t *);
204 #ifdef ALTQ_FLOWVALVE
205 static __inline struct fve *flowlist_lookup(struct flowvalve *,
206 			 struct altq_pktattr *, struct timeval *);
207 static __inline struct fve *flowlist_reclaim(struct flowvalve *,
208 					     struct altq_pktattr *);
209 static __inline void flowlist_move_to_head(struct flowvalve *, struct fve *);
210 static __inline int fv_p2f(struct flowvalve *, int);
211 #if 0 /* XXX: make the compiler happy (fv_alloc unused) */
212 static struct flowvalve *fv_alloc(struct red *);
213 #endif
214 static void fv_destroy(struct flowvalve *);
215 static int fv_checkflow(struct flowvalve *, struct altq_pktattr *,
216 			struct fve **);
217 static void fv_dropbyred(struct flowvalve *fv, struct altq_pktattr *,
218 			 struct fve *);
219 #endif
220 #endif /* ALTQ3_COMPAT */
221 
222 /*
223  * red support routines
224  */
225 red_t *
226 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
227    int pkttime)
228 {
229 	red_t	*rp;
230 	int	 w, i;
231 	int	 npkts_per_sec;
232 
233 	rp = malloc(sizeof(red_t), M_DEVBUF, M_NOWAIT | M_ZERO);
234 	if (rp == NULL)
235 		return (NULL);
236 
237 	if (weight == 0)
238 		rp->red_weight = W_WEIGHT;
239 	else
240 		rp->red_weight = weight;
241 
242 	/* allocate weight table */
243 	rp->red_wtab = wtab_alloc(rp->red_weight);
244 	if (rp->red_wtab == NULL) {
245 		free(rp, M_DEVBUF);
246 		return (NULL);
247 	}
248 
249 	rp->red_avg = 0;
250 	rp->red_idle = 1;
251 
252 	if (inv_pmax == 0)
253 		rp->red_inv_pmax = default_inv_pmax;
254 	else
255 		rp->red_inv_pmax = inv_pmax;
256 	if (th_min == 0)
257 		rp->red_thmin = default_th_min;
258 	else
259 		rp->red_thmin = th_min;
260 	if (th_max == 0)
261 		rp->red_thmax = default_th_max;
262 	else
263 		rp->red_thmax = th_max;
264 
265 	rp->red_flags = flags;
266 
267 	if (pkttime == 0)
268 		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
269 		rp->red_pkttime = 800;
270 	else
271 		rp->red_pkttime = pkttime;
272 
273 	if (weight == 0) {
274 		/* when the link is very slow, adjust red parameters */
275 		npkts_per_sec = 1000000 / rp->red_pkttime;
276 		if (npkts_per_sec < 50) {
277 			/* up to about 400Kbps */
278 			rp->red_weight = W_WEIGHT_2;
279 		} else if (npkts_per_sec < 300) {
280 			/* up to about 2.4Mbps */
281 			rp->red_weight = W_WEIGHT_1;
282 		}
283 	}
284 
285 	/* calculate wshift.  weight must be power of 2 */
286 	w = rp->red_weight;
287 	for (i = 0; w > 1; i++)
288 		w = w >> 1;
289 	rp->red_wshift = i;
290 	w = 1 << rp->red_wshift;
291 	if (w != rp->red_weight) {
292 		printf("invalid weight value %d for red! use %d\n",
293 		       rp->red_weight, w);
294 		rp->red_weight = w;
295 	}
296 
297 	/*
298 	 * thmin_s and thmax_s are scaled versions of th_min and th_max
299 	 * to be compared with avg.
300 	 */
301 	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
302 	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
303 
304 	/*
305 	 * precompute probability denominator
306 	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
307 	 */
308 	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
309 			 * rp->red_inv_pmax) << FP_SHIFT;
310 
311 	microtime(&rp->red_last);
312 	return (rp);
313 }
314 
315 void
316 red_destroy(red_t *rp)
317 {
318 #ifdef ALTQ3_COMPAT
319 #ifdef ALTQ_FLOWVALVE
320 	if (rp->red_flowvalve != NULL)
321 		fv_destroy(rp->red_flowvalve);
322 #endif
323 #endif /* ALTQ3_COMPAT */
324 	wtab_destroy(rp->red_wtab);
325 	free(rp, M_DEVBUF);
326 }
327 
328 void
329 red_getstats(red_t *rp, struct redstats *sp)
330 {
331 	sp->q_avg		= rp->red_avg >> rp->red_wshift;
332 	sp->xmit_cnt		= rp->red_stats.xmit_cnt;
333 	sp->drop_cnt		= rp->red_stats.drop_cnt;
334 	sp->drop_forced		= rp->red_stats.drop_forced;
335 	sp->drop_unforced	= rp->red_stats.drop_unforced;
336 	sp->marked_packets	= rp->red_stats.marked_packets;
337 }
338 
339 int
340 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
341     struct altq_pktattr *pktattr)
342 {
343 	int avg, droptype;
344 	int n;
345 #ifdef ALTQ3_COMPAT
346 #ifdef ALTQ_FLOWVALVE
347 	struct fve *fve = NULL;
348 
349 	if (rp->red_flowvalve != NULL && rp->red_flowvalve->fv_flows > 0)
350 		if (fv_checkflow(rp->red_flowvalve, pktattr, &fve)) {
351 			m_freem(m);
352 			return (-1);
353 		}
354 #endif
355 #endif /* ALTQ3_COMPAT */
356 
357 	avg = rp->red_avg;
358 
359 	/*
360 	 * if we were idle, we pretend that n packets arrived during
361 	 * the idle period.
362 	 */
363 	if (rp->red_idle) {
364 		struct timeval now;
365 		int t;
366 
367 		rp->red_idle = 0;
368 		microtime(&now);
369 		t = (now.tv_sec - rp->red_last.tv_sec);
370 		if (t > 60) {
371 			/*
372 			 * being idle for more than 1 minute, set avg to zero.
373 			 * this prevents t from overflow.
374 			 */
375 			avg = 0;
376 		} else {
377 			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
378 			n = t / rp->red_pkttime - 1;
379 
380 			/* the following line does (avg = (1 - Wq)^n * avg) */
381 			if (n > 0)
382 				avg = (avg >> FP_SHIFT) *
383 				    pow_w(rp->red_wtab, n);
384 		}
385 	}
386 
387 	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
388 	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
389 	rp->red_avg = avg;		/* save the new value */
390 
391 	/*
392 	 * red_count keeps a tally of arriving traffic that has not
393 	 * been dropped.
394 	 */
395 	rp->red_count++;
396 
397 	/* see if we drop early */
398 	droptype = DTYPE_NODROP;
399 	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
400 		if (avg >= rp->red_thmax_s) {
401 			/* avg >= th_max: forced drop */
402 			droptype = DTYPE_FORCED;
403 		} else if (rp->red_old == 0) {
404 			/* first exceeds th_min */
405 			rp->red_count = 1;
406 			rp->red_old = 1;
407 		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
408 				      rp->red_probd, rp->red_count)) {
409 			/* mark or drop by red */
410 			if ((rp->red_flags & REDF_ECN) &&
411 			    mark_ecn(m, pktattr, rp->red_flags)) {
412 				/* successfully marked.  do not drop. */
413 				rp->red_count = 0;
414 #ifdef RED_STATS
415 				rp->red_stats.marked_packets++;
416 #endif
417 			} else {
418 				/* unforced drop by red */
419 				droptype = DTYPE_EARLY;
420 			}
421 		}
422 	} else {
423 		/* avg < th_min */
424 		rp->red_old = 0;
425 	}
426 
427 	/*
428 	 * if the queue length hits the hard limit, it's a forced drop.
429 	 */
430 	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
431 		droptype = DTYPE_FORCED;
432 
433 #ifdef RED_RANDOM_DROP
434 	/* if successful or forced drop, enqueue this packet. */
435 	if (droptype != DTYPE_EARLY)
436 		_addq(q, m);
437 #else
438 	/* if successful, enqueue this packet. */
439 	if (droptype == DTYPE_NODROP)
440 		_addq(q, m);
441 #endif
442 	if (droptype != DTYPE_NODROP) {
443 		if (droptype == DTYPE_EARLY) {
444 			/* drop the incoming packet */
445 #ifdef RED_STATS
446 			rp->red_stats.drop_unforced++;
447 #endif
448 		} else {
449 			/* forced drop, select a victim packet in the queue. */
450 #ifdef RED_RANDOM_DROP
451 			m = _getq_random(q);
452 #endif
453 #ifdef RED_STATS
454 			rp->red_stats.drop_forced++;
455 #endif
456 		}
457 #ifdef RED_STATS
458 		PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
459 #endif
460 		rp->red_count = 0;
461 #ifdef ALTQ3_COMPAT
462 #ifdef ALTQ_FLOWVALVE
463 		if (rp->red_flowvalve != NULL)
464 			fv_dropbyred(rp->red_flowvalve, pktattr, fve);
465 #endif
466 #endif /* ALTQ3_COMPAT */
467 		m_freem(m);
468 		return (-1);
469 	}
470 	/* successfully queued */
471 #ifdef RED_STATS
472 	PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
473 #endif
474 	return (0);
475 }
476 
477 /*
478  * early-drop probability is calculated as follows:
479  *   prob = p_max * (avg - th_min) / (th_max - th_min)
480  *   prob_a = prob / (2 - count*prob)
481  *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
482  * here prob_a increases as successive undrop count increases.
483  * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
484  * becomes 1 when (count >= (2 / prob))).
485  */
486 int
487 drop_early(int fp_len, int fp_probd, int count)
488 {
489 	int	d;		/* denominator of drop-probability */
490 
491 	d = fp_probd - count * fp_len;
492 	if (d <= 0)
493 		/* count exceeds the hard limit: drop or mark */
494 		return (1);
495 
496 	/*
497 	 * now the range of d is [1..600] in fixed-point. (when
498 	 * th_max-th_min=10 and p_max=1/30)
499 	 * drop probability = (avg - TH_MIN) / d
500 	 */
501 
502 	if ((arc4random() % d) < fp_len) {
503 		/* drop or mark */
504 		return (1);
505 	}
506 	/* no drop/mark */
507 	return (0);
508 }
509 
510 /*
511  * try to mark CE bit to the packet.
512  *    returns 1 if successfully marked, 0 otherwise.
513  */
514 int
515 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
516 {
517 	struct mbuf	*m0;
518 	struct pf_mtag	*at;
519 	void		*hdr;
520 
521 	at = pf_find_mtag(m);
522 	if (at != NULL) {
523 		hdr = at->hdr;
524 #ifdef ALTQ3_COMPAT
525 	} else if (pktattr != NULL) {
526 		af = pktattr->pattr_af;
527 		hdr = pktattr->pattr_hdr;
528 #endif /* ALTQ3_COMPAT */
529 	} else
530 		return (0);
531 
532 	/* verify that pattr_hdr is within the mbuf data */
533 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
534 		if (((caddr_t)hdr >= m0->m_data) &&
535 		    ((caddr_t)hdr < m0->m_data + m0->m_len))
536 			break;
537 	if (m0 == NULL) {
538 		/* ick, tag info is stale */
539 		return (0);
540 	}
541 
542 	switch (((struct ip *)hdr)->ip_v) {
543 	case IPVERSION:
544 		if (flags & REDF_ECN4) {
545 			struct ip *ip = hdr;
546 			u_int8_t otos;
547 			int sum;
548 
549 			if (ip->ip_v != 4)
550 				return (0);	/* version mismatch! */
551 
552 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
553 				return (0);	/* not-ECT */
554 			if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
555 				return (1);	/* already marked */
556 
557 			/*
558 			 * ecn-capable but not marked,
559 			 * mark CE and update checksum
560 			 */
561 			otos = ip->ip_tos;
562 			ip->ip_tos |= IPTOS_ECN_CE;
563 			/*
564 			 * update checksum (from RFC1624)
565 			 *	   HC' = ~(~HC + ~m + m')
566 			 */
567 			sum = ~ntohs(ip->ip_sum) & 0xffff;
568 			sum += (~otos & 0xffff) + ip->ip_tos;
569 			sum = (sum >> 16) + (sum & 0xffff);
570 			sum += (sum >> 16);  /* add carry */
571 			ip->ip_sum = htons(~sum & 0xffff);
572 			return (1);
573 		}
574 		break;
575 #ifdef INET6
576 	case (IPV6_VERSION >> 4):
577 		if (flags & REDF_ECN6) {
578 			struct ip6_hdr *ip6 = hdr;
579 			u_int32_t flowlabel;
580 
581 			flowlabel = ntohl(ip6->ip6_flow);
582 			if ((flowlabel >> 28) != 6)
583 				return (0);	/* version mismatch! */
584 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
585 			    (IPTOS_ECN_NOTECT << 20))
586 				return (0);	/* not-ECT */
587 			if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
588 			    (IPTOS_ECN_CE << 20))
589 				return (1);	/* already marked */
590 			/*
591 			 * ecn-capable but not marked,  mark CE
592 			 */
593 			flowlabel |= (IPTOS_ECN_CE << 20);
594 			ip6->ip6_flow = htonl(flowlabel);
595 			return (1);
596 		}
597 		break;
598 #endif  /* INET6 */
599 	}
600 
601 	/* not marked */
602 	return (0);
603 }
604 
605 struct mbuf *
606 red_getq(rp, q)
607 	red_t *rp;
608 	class_queue_t *q;
609 {
610 	struct mbuf *m;
611 
612 	if ((m = _getq(q)) == NULL) {
613 		if (rp->red_idle == 0) {
614 			rp->red_idle = 1;
615 			microtime(&rp->red_last);
616 		}
617 		return NULL;
618 	}
619 
620 	rp->red_idle = 0;
621 	return (m);
622 }
623 
624 /*
625  * helper routine to calibrate avg during idle.
626  * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
627  * here Wq = 1/weight and the code assumes Wq is close to zero.
628  *
629  * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
630  */
631 static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
632 
633 struct wtab *
634 wtab_alloc(int weight)
635 {
636 	struct wtab	*w;
637 	int		 i;
638 
639 	for (w = wtab_list; w != NULL; w = w->w_next)
640 		if (w->w_weight == weight) {
641 			w->w_refcount++;
642 			return (w);
643 		}
644 
645 	w = malloc(sizeof(struct wtab), M_DEVBUF, M_NOWAIT | M_ZERO);
646 	if (w == NULL)
647 		return (NULL);
648 	w->w_weight = weight;
649 	w->w_refcount = 1;
650 	w->w_next = wtab_list;
651 	wtab_list = w;
652 
653 	/* initialize the weight table */
654 	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
655 	for (i = 1; i < 32; i++) {
656 		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
657 		if (w->w_tab[i] == 0 && w->w_param_max == 0)
658 			w->w_param_max = 1 << i;
659 	}
660 
661 	return (w);
662 }
663 
664 int
665 wtab_destroy(struct wtab *w)
666 {
667 	struct wtab	*prev;
668 
669 	if (--w->w_refcount > 0)
670 		return (0);
671 
672 	if (wtab_list == w)
673 		wtab_list = w->w_next;
674 	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
675 		if (prev->w_next == w) {
676 			prev->w_next = w->w_next;
677 			break;
678 		}
679 
680 	free(w, M_DEVBUF);
681 	return (0);
682 }
683 
684 int32_t
685 pow_w(struct wtab *w, int n)
686 {
687 	int	i, bit;
688 	int32_t	val;
689 
690 	if (n >= w->w_param_max)
691 		return (0);
692 
693 	val = 1 << FP_SHIFT;
694 	if (n <= 0)
695 		return (val);
696 
697 	bit = 1;
698 	i = 0;
699 	while (n) {
700 		if (n & bit) {
701 			val = (val * w->w_tab[i]) >> FP_SHIFT;
702 			n &= ~bit;
703 		}
704 		i++;
705 		bit <<=  1;
706 	}
707 	return (val);
708 }
709 
710 #ifdef ALTQ3_COMPAT
711 /*
712  * red device interface
713  */
714 altqdev_decl(red);
715 
716 int
717 redopen(dev, flag, fmt, p)
718 	dev_t dev;
719 	int flag, fmt;
720 #if (__FreeBSD_version > 500000)
721 	struct thread *p;
722 #else
723 	struct proc *p;
724 #endif
725 {
726 	/* everything will be done when the queueing scheme is attached. */
727 	return 0;
728 }
729 
730 int
731 redclose(dev, flag, fmt, p)
732 	dev_t dev;
733 	int flag, fmt;
734 #if (__FreeBSD_version > 500000)
735 	struct thread *p;
736 #else
737 	struct proc *p;
738 #endif
739 {
740 	red_queue_t *rqp;
741 	int err, error = 0;
742 
743 	while ((rqp = red_list) != NULL) {
744 		/* destroy all */
745 		err = red_detach(rqp);
746 		if (err != 0 && error == 0)
747 			error = err;
748 	}
749 
750 	return error;
751 }
752 
753 int
754 redioctl(dev, cmd, addr, flag, p)
755 	dev_t dev;
756 	ioctlcmd_t cmd;
757 	caddr_t addr;
758 	int flag;
759 #if (__FreeBSD_version > 500000)
760 	struct thread *p;
761 #else
762 	struct proc *p;
763 #endif
764 {
765 	red_queue_t *rqp;
766 	struct red_interface *ifacep;
767 	struct ifnet *ifp;
768 	int	error = 0;
769 
770 	/* check super-user privilege */
771 	switch (cmd) {
772 	case RED_GETSTATS:
773 		break;
774 	default:
775 #if (__FreeBSD_version > 700000)
776 		if ((error = priv_check(p, PRIV_ALTQ_MANAGE)) != 0)
777 #elsif (__FreeBSD_version > 400000)
778 		if ((error = suser(p)) != 0)
779 #else
780 		if ((error = suser(p->p_ucred, &p->p_acflag)) != 0)
781 #endif
782 			return (error);
783 		break;
784 	}
785 
786 	switch (cmd) {
787 
788 	case RED_ENABLE:
789 		ifacep = (struct red_interface *)addr;
790 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
791 			error = EBADF;
792 			break;
793 		}
794 		error = altq_enable(rqp->rq_ifq);
795 		break;
796 
797 	case RED_DISABLE:
798 		ifacep = (struct red_interface *)addr;
799 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
800 			error = EBADF;
801 			break;
802 		}
803 		error = altq_disable(rqp->rq_ifq);
804 		break;
805 
806 	case RED_IF_ATTACH:
807 		ifp = ifunit(((struct red_interface *)addr)->red_ifname);
808 		if (ifp == NULL) {
809 			error = ENXIO;
810 			break;
811 		}
812 
813 		/* allocate and initialize red_queue_t */
814 		rqp = malloc(sizeof(red_queue_t), M_DEVBUF, M_WAITOK);
815 		if (rqp == NULL) {
816 			error = ENOMEM;
817 			break;
818 		}
819 		bzero(rqp, sizeof(red_queue_t));
820 
821 		rqp->rq_q = malloc(sizeof(class_queue_t),
822 		       M_DEVBUF, M_WAITOK);
823 		if (rqp->rq_q == NULL) {
824 			free(rqp, M_DEVBUF);
825 			error = ENOMEM;
826 			break;
827 		}
828 		bzero(rqp->rq_q, sizeof(class_queue_t));
829 
830 		rqp->rq_red = red_alloc(0, 0, 0, 0, 0, 0);
831 		if (rqp->rq_red == NULL) {
832 			free(rqp->rq_q, M_DEVBUF);
833 			free(rqp, M_DEVBUF);
834 			error = ENOMEM;
835 			break;
836 		}
837 
838 		rqp->rq_ifq = &ifp->if_snd;
839 		qtail(rqp->rq_q) = NULL;
840 		qlen(rqp->rq_q) = 0;
841 		qlimit(rqp->rq_q) = RED_LIMIT;
842 		qtype(rqp->rq_q) = Q_RED;
843 
844 		/*
845 		 * set RED to this ifnet structure.
846 		 */
847 		error = altq_attach(rqp->rq_ifq, ALTQT_RED, rqp,
848 				    red_enqueue, red_dequeue, red_request,
849 				    NULL, NULL);
850 		if (error) {
851 			red_destroy(rqp->rq_red);
852 			free(rqp->rq_q, M_DEVBUF);
853 			free(rqp, M_DEVBUF);
854 			break;
855 		}
856 
857 		/* add this state to the red list */
858 		rqp->rq_next = red_list;
859 		red_list = rqp;
860 		break;
861 
862 	case RED_IF_DETACH:
863 		ifacep = (struct red_interface *)addr;
864 		if ((rqp = altq_lookup(ifacep->red_ifname, ALTQT_RED)) == NULL) {
865 			error = EBADF;
866 			break;
867 		}
868 		error = red_detach(rqp);
869 		break;
870 
871 	case RED_GETSTATS:
872 		do {
873 			struct red_stats *q_stats;
874 			red_t *rp;
875 
876 			q_stats = (struct red_stats *)addr;
877 			if ((rqp = altq_lookup(q_stats->iface.red_ifname,
878 					     ALTQT_RED)) == NULL) {
879 				error = EBADF;
880 				break;
881 			}
882 
883 			q_stats->q_len 	   = qlen(rqp->rq_q);
884 			q_stats->q_limit   = qlimit(rqp->rq_q);
885 
886 			rp = rqp->rq_red;
887 			q_stats->q_avg 	   = rp->red_avg >> rp->red_wshift;
888 			q_stats->xmit_cnt  = rp->red_stats.xmit_cnt;
889 			q_stats->drop_cnt  = rp->red_stats.drop_cnt;
890 			q_stats->drop_forced   = rp->red_stats.drop_forced;
891 			q_stats->drop_unforced = rp->red_stats.drop_unforced;
892 			q_stats->marked_packets = rp->red_stats.marked_packets;
893 
894 			q_stats->weight		= rp->red_weight;
895 			q_stats->inv_pmax	= rp->red_inv_pmax;
896 			q_stats->th_min		= rp->red_thmin;
897 			q_stats->th_max		= rp->red_thmax;
898 
899 #ifdef ALTQ_FLOWVALVE
900 			if (rp->red_flowvalve != NULL) {
901 				struct flowvalve *fv = rp->red_flowvalve;
902 				q_stats->fv_flows    = fv->fv_flows;
903 				q_stats->fv_pass     = fv->fv_stats.pass;
904 				q_stats->fv_predrop  = fv->fv_stats.predrop;
905 				q_stats->fv_alloc    = fv->fv_stats.alloc;
906 				q_stats->fv_escape   = fv->fv_stats.escape;
907 			} else {
908 #endif /* ALTQ_FLOWVALVE */
909 				q_stats->fv_flows    = 0;
910 				q_stats->fv_pass     = 0;
911 				q_stats->fv_predrop  = 0;
912 				q_stats->fv_alloc    = 0;
913 				q_stats->fv_escape   = 0;
914 #ifdef ALTQ_FLOWVALVE
915 			}
916 #endif /* ALTQ_FLOWVALVE */
917 		} while (/*CONSTCOND*/ 0);
918 		break;
919 
920 	case RED_CONFIG:
921 		do {
922 			struct red_conf *fc;
923 			red_t *new;
924 			int s, limit;
925 
926 			fc = (struct red_conf *)addr;
927 			if ((rqp = altq_lookup(fc->iface.red_ifname,
928 					       ALTQT_RED)) == NULL) {
929 				error = EBADF;
930 				break;
931 			}
932 			new = red_alloc(fc->red_weight,
933 					fc->red_inv_pmax,
934 					fc->red_thmin,
935 					fc->red_thmax,
936 					fc->red_flags,
937 					fc->red_pkttime);
938 			if (new == NULL) {
939 				error = ENOMEM;
940 				break;
941 			}
942 
943 			s = splnet();
944 			red_purgeq(rqp);
945 			limit = fc->red_limit;
946 			if (limit < fc->red_thmax)
947 				limit = fc->red_thmax;
948 			qlimit(rqp->rq_q) = limit;
949 			fc->red_limit = limit;	/* write back the new value */
950 
951 			red_destroy(rqp->rq_red);
952 			rqp->rq_red = new;
953 
954 			splx(s);
955 
956 			/* write back new values */
957 			fc->red_limit = limit;
958 			fc->red_inv_pmax = rqp->rq_red->red_inv_pmax;
959 			fc->red_thmin = rqp->rq_red->red_thmin;
960 			fc->red_thmax = rqp->rq_red->red_thmax;
961 
962 		} while (/*CONSTCOND*/ 0);
963 		break;
964 
965 	case RED_SETDEFAULTS:
966 		do {
967 			struct redparams *rp;
968 
969 			rp = (struct redparams *)addr;
970 
971 			default_th_min = rp->th_min;
972 			default_th_max = rp->th_max;
973 			default_inv_pmax = rp->inv_pmax;
974 		} while (/*CONSTCOND*/ 0);
975 		break;
976 
977 	default:
978 		error = EINVAL;
979 		break;
980 	}
981 	return error;
982 }
983 
984 static int
985 red_detach(rqp)
986 	red_queue_t *rqp;
987 {
988 	red_queue_t *tmp;
989 	int error = 0;
990 
991 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
992 		altq_disable(rqp->rq_ifq);
993 
994 	if ((error = altq_detach(rqp->rq_ifq)))
995 		return (error);
996 
997 	if (red_list == rqp)
998 		red_list = rqp->rq_next;
999 	else {
1000 		for (tmp = red_list; tmp != NULL; tmp = tmp->rq_next)
1001 			if (tmp->rq_next == rqp) {
1002 				tmp->rq_next = rqp->rq_next;
1003 				break;
1004 			}
1005 		if (tmp == NULL)
1006 			printf("red_detach: no state found in red_list!\n");
1007 	}
1008 
1009 	red_destroy(rqp->rq_red);
1010 	free(rqp->rq_q, M_DEVBUF);
1011 	free(rqp, M_DEVBUF);
1012 	return (error);
1013 }
1014 
1015 /*
1016  * enqueue routine:
1017  *
1018  *	returns: 0 when successfully queued.
1019  *		 ENOBUFS when drop occurs.
1020  */
1021 static int
1022 red_enqueue(ifq, m, pktattr)
1023 	struct ifaltq *ifq;
1024 	struct mbuf *m;
1025 	struct altq_pktattr *pktattr;
1026 {
1027 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1028 
1029 	IFQ_LOCK_ASSERT(ifq);
1030 
1031 	if (red_addq(rqp->rq_red, rqp->rq_q, m, pktattr) < 0)
1032 		return ENOBUFS;
1033 	ifq->ifq_len++;
1034 	return 0;
1035 }
1036 
1037 /*
1038  * dequeue routine:
1039  *	must be called in splimp.
1040  *
1041  *	returns: mbuf dequeued.
1042  *		 NULL when no packet is available in the queue.
1043  */
1044 
1045 static struct mbuf *
1046 red_dequeue(ifq, op)
1047 	struct ifaltq *ifq;
1048 	int op;
1049 {
1050 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1051 	struct mbuf *m;
1052 
1053 	IFQ_LOCK_ASSERT(ifq);
1054 
1055 	if (op == ALTDQ_POLL)
1056 		return qhead(rqp->rq_q);
1057 
1058 	/* op == ALTDQ_REMOVE */
1059 	m =  red_getq(rqp->rq_red, rqp->rq_q);
1060 	if (m != NULL)
1061 		ifq->ifq_len--;
1062 	return (m);
1063 }
1064 
1065 static int
1066 red_request(ifq, req, arg)
1067 	struct ifaltq *ifq;
1068 	int req;
1069 	void *arg;
1070 {
1071 	red_queue_t *rqp = (red_queue_t *)ifq->altq_disc;
1072 
1073 	IFQ_LOCK_ASSERT(ifq);
1074 
1075 	switch (req) {
1076 	case ALTRQ_PURGE:
1077 		red_purgeq(rqp);
1078 		break;
1079 	}
1080 	return (0);
1081 }
1082 
1083 static void
1084 red_purgeq(rqp)
1085 	red_queue_t *rqp;
1086 {
1087 	_flushq(rqp->rq_q);
1088 	if (ALTQ_IS_ENABLED(rqp->rq_ifq))
1089 		rqp->rq_ifq->ifq_len = 0;
1090 }
1091 
1092 #ifdef ALTQ_FLOWVALVE
1093 
1094 #define	FV_PSHIFT	7	/* weight of average drop rate -- 1/128 */
1095 #define	FV_PSCALE(x)	((x) << FV_PSHIFT)
1096 #define	FV_PUNSCALE(x)	((x) >> FV_PSHIFT)
1097 #define	FV_FSHIFT	5	/* weight of average fraction -- 1/32 */
1098 #define	FV_FSCALE(x)	((x) << FV_FSHIFT)
1099 #define	FV_FUNSCALE(x)	((x) >> FV_FSHIFT)
1100 
1101 #define	FV_TIMER	(3 * hz)	/* timer value for garbage collector */
1102 #define	FV_FLOWLISTSIZE		64	/* how many flows in flowlist */
1103 
1104 #define	FV_N			10	/* update fve_f every FV_N packets */
1105 
1106 #define	FV_BACKOFFTHRESH	1  /* backoff threshold interval in second */
1107 #define	FV_TTHRESH		3  /* time threshold to delete fve */
1108 #define	FV_ALPHA		5  /* extra packet count */
1109 
1110 #define	FV_STATS
1111 
1112 #if (__FreeBSD_version > 300000)
1113 #define	FV_TIMESTAMP(tp)	getmicrotime(tp)
1114 #else
1115 #define	FV_TIMESTAMP(tp)	{ (*(tp)) = time; }
1116 #endif
1117 
1118 /*
1119  * Brtt table: 127 entry table to convert drop rate (p) to
1120  * the corresponding bandwidth fraction (f)
1121  * the following equation is implemented to use scaled values,
1122  * fve_p and fve_f, in the fixed point format.
1123  *
1124  *   Brtt(p) = 1 /(sqrt(4*p/3) + min(1,3*sqrt(p*6/8)) * p * (1+32 * p*p))
1125  *   f = Brtt(p) / (max_th + alpha)
1126  */
1127 #define	BRTT_SIZE	128
1128 #define	BRTT_SHIFT	12
1129 #define	BRTT_MASK	0x0007f000
1130 #define	BRTT_PMAX	(1 << (FV_PSHIFT + FP_SHIFT))
1131 
1132 const int brtt_tab[BRTT_SIZE] = {
1133 	0, 1262010, 877019, 703694, 598706, 525854, 471107, 427728,
1134 	392026, 361788, 335598, 312506, 291850, 273158, 256081, 240361,
1135 	225800, 212247, 199585, 187788, 178388, 169544, 161207, 153333,
1136 	145888, 138841, 132165, 125836, 119834, 114141, 108739, 103612,
1137 	98747, 94129, 89746, 85585, 81637, 77889, 74333, 70957,
1138 	67752, 64711, 61824, 59084, 56482, 54013, 51667, 49440,
1139 	47325, 45315, 43406, 41591, 39866, 38227, 36667, 35184,
1140 	33773, 32430, 31151, 29933, 28774, 27668, 26615, 25611,
1141 	24653, 23740, 22868, 22035, 21240, 20481, 19755, 19062,
1142 	18399, 17764, 17157, 16576, 16020, 15487, 14976, 14487,
1143 	14017, 13567, 13136, 12721, 12323, 11941, 11574, 11222,
1144 	10883, 10557, 10243, 9942, 9652, 9372, 9103, 8844,
1145 	8594, 8354, 8122, 7898, 7682, 7474, 7273, 7079,
1146 	6892, 6711, 6536, 6367, 6204, 6046, 5893, 5746,
1147 	5603, 5464, 5330, 5201, 5075, 4954, 4836, 4722,
1148 	4611, 4504, 4400, 4299, 4201, 4106, 4014, 3924
1149 };
1150 
1151 static __inline struct fve *
1152 flowlist_lookup(fv, pktattr, now)
1153 	struct flowvalve *fv;
1154 	struct altq_pktattr *pktattr;
1155 	struct timeval *now;
1156 {
1157 	struct fve *fve;
1158 	int flows;
1159 	struct ip *ip;
1160 #ifdef INET6
1161 	struct ip6_hdr *ip6;
1162 #endif
1163 	struct timeval tthresh;
1164 
1165 	if (pktattr == NULL)
1166 		return (NULL);
1167 
1168 	tthresh.tv_sec = now->tv_sec - FV_TTHRESH;
1169 	flows = 0;
1170 	/*
1171 	 * search the flow list
1172 	 */
1173 	switch (pktattr->pattr_af) {
1174 	case AF_INET:
1175 		ip = (struct ip *)pktattr->pattr_hdr;
1176 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1177 			if (fve->fve_lastdrop.tv_sec == 0)
1178 				break;
1179 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1180 				fve->fve_lastdrop.tv_sec = 0;
1181 				break;
1182 			}
1183 			if (fve->fve_flow.flow_af == AF_INET &&
1184 			    fve->fve_flow.flow_ip.ip_src.s_addr ==
1185 			    ip->ip_src.s_addr &&
1186 			    fve->fve_flow.flow_ip.ip_dst.s_addr ==
1187 			    ip->ip_dst.s_addr)
1188 				return (fve);
1189 			flows++;
1190 		}
1191 		break;
1192 #ifdef INET6
1193 	case AF_INET6:
1194 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1195 		TAILQ_FOREACH(fve, &fv->fv_flowlist, fve_lru){
1196 			if (fve->fve_lastdrop.tv_sec == 0)
1197 				break;
1198 			if (fve->fve_lastdrop.tv_sec < tthresh.tv_sec) {
1199 				fve->fve_lastdrop.tv_sec = 0;
1200 				break;
1201 			}
1202 			if (fve->fve_flow.flow_af == AF_INET6 &&
1203 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_src,
1204 					       &ip6->ip6_src) &&
1205 			    IN6_ARE_ADDR_EQUAL(&fve->fve_flow.flow_ip6.ip6_dst,
1206 					       &ip6->ip6_dst))
1207 				return (fve);
1208 			flows++;
1209 		}
1210 		break;
1211 #endif /* INET6 */
1212 
1213 	default:
1214 		/* unknown protocol.  no drop. */
1215 		return (NULL);
1216 	}
1217 	fv->fv_flows = flows;	/* save the number of active fve's */
1218 	return (NULL);
1219 }
1220 
1221 static __inline struct fve *
1222 flowlist_reclaim(fv, pktattr)
1223 	struct flowvalve *fv;
1224 	struct altq_pktattr *pktattr;
1225 {
1226 	struct fve *fve;
1227 	struct ip *ip;
1228 #ifdef INET6
1229 	struct ip6_hdr *ip6;
1230 #endif
1231 
1232 	/*
1233 	 * get an entry from the tail of the LRU list.
1234 	 */
1235 	fve = TAILQ_LAST(&fv->fv_flowlist, fv_flowhead);
1236 
1237 	switch (pktattr->pattr_af) {
1238 	case AF_INET:
1239 		ip = (struct ip *)pktattr->pattr_hdr;
1240 		fve->fve_flow.flow_af = AF_INET;
1241 		fve->fve_flow.flow_ip.ip_src = ip->ip_src;
1242 		fve->fve_flow.flow_ip.ip_dst = ip->ip_dst;
1243 		break;
1244 #ifdef INET6
1245 	case AF_INET6:
1246 		ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
1247 		fve->fve_flow.flow_af = AF_INET6;
1248 		fve->fve_flow.flow_ip6.ip6_src = ip6->ip6_src;
1249 		fve->fve_flow.flow_ip6.ip6_dst = ip6->ip6_dst;
1250 		break;
1251 #endif
1252 	}
1253 
1254 	fve->fve_state = Green;
1255 	fve->fve_p = 0.0;
1256 	fve->fve_f = 0.0;
1257 	fve->fve_ifseq = fv->fv_ifseq - 1;
1258 	fve->fve_count = 0;
1259 
1260 	fv->fv_flows++;
1261 #ifdef FV_STATS
1262 	fv->fv_stats.alloc++;
1263 #endif
1264 	return (fve);
1265 }
1266 
1267 static __inline void
1268 flowlist_move_to_head(fv, fve)
1269 	struct flowvalve *fv;
1270 	struct fve *fve;
1271 {
1272 	if (TAILQ_FIRST(&fv->fv_flowlist) != fve) {
1273 		TAILQ_REMOVE(&fv->fv_flowlist, fve, fve_lru);
1274 		TAILQ_INSERT_HEAD(&fv->fv_flowlist, fve, fve_lru);
1275 	}
1276 }
1277 
1278 #if 0 /* XXX: make the compiler happy (fv_alloc unused) */
1279 /*
1280  * allocate flowvalve structure
1281  */
1282 static struct flowvalve *
1283 fv_alloc(rp)
1284 	struct red *rp;
1285 {
1286 	struct flowvalve *fv;
1287 	struct fve *fve;
1288 	int i, num;
1289 
1290 	num = FV_FLOWLISTSIZE;
1291 	fv = malloc(sizeof(struct flowvalve),
1292 	       M_DEVBUF, M_WAITOK);
1293 	if (fv == NULL)
1294 		return (NULL);
1295 	bzero(fv, sizeof(struct flowvalve));
1296 
1297 	fv->fv_fves = malloc(sizeof(struct fve) * num,
1298 	       M_DEVBUF, M_WAITOK);
1299 	if (fv->fv_fves == NULL) {
1300 		free(fv, M_DEVBUF);
1301 		return (NULL);
1302 	}
1303 	bzero(fv->fv_fves, sizeof(struct fve) * num);
1304 
1305 	fv->fv_flows = 0;
1306 	TAILQ_INIT(&fv->fv_flowlist);
1307 	for (i = 0; i < num; i++) {
1308 		fve = &fv->fv_fves[i];
1309 		fve->fve_lastdrop.tv_sec = 0;
1310 		TAILQ_INSERT_TAIL(&fv->fv_flowlist, fve, fve_lru);
1311 	}
1312 
1313 	/* initialize drop rate threshold in scaled fixed-point */
1314 	fv->fv_pthresh = (FV_PSCALE(1) << FP_SHIFT) / rp->red_inv_pmax;
1315 
1316 	/* initialize drop rate to fraction table */
1317 	fv->fv_p2ftab = malloc(sizeof(int) * BRTT_SIZE,
1318 	       M_DEVBUF, M_WAITOK);
1319 	if (fv->fv_p2ftab == NULL) {
1320 		free(fv->fv_fves, M_DEVBUF);
1321 		free(fv, M_DEVBUF);
1322 		return (NULL);
1323 	}
1324 	/*
1325 	 * create the p2f table.
1326 	 * (shift is used to keep the precision)
1327 	 */
1328 	for (i = 1; i < BRTT_SIZE; i++) {
1329 		int f;
1330 
1331 		f = brtt_tab[i] << 8;
1332 		fv->fv_p2ftab[i] = (f / (rp->red_thmax + FV_ALPHA)) >> 8;
1333 	}
1334 
1335 	return (fv);
1336 }
1337 #endif
1338 
1339 static void fv_destroy(fv)
1340 	struct flowvalve *fv;
1341 {
1342 	free(fv->fv_p2ftab, M_DEVBUF);
1343 	free(fv->fv_fves, M_DEVBUF);
1344 	free(fv, M_DEVBUF);
1345 }
1346 
1347 static __inline int
1348 fv_p2f(fv, p)
1349 	struct flowvalve	*fv;
1350 	int	p;
1351 {
1352 	int val, f;
1353 
1354 	if (p >= BRTT_PMAX)
1355 		f = fv->fv_p2ftab[BRTT_SIZE-1];
1356 	else if ((val = (p & BRTT_MASK)))
1357 		f = fv->fv_p2ftab[(val >> BRTT_SHIFT)];
1358 	else
1359 		f = fv->fv_p2ftab[1];
1360 	return (f);
1361 }
1362 
1363 /*
1364  * check if an arriving packet should be pre-dropped.
1365  * called from red_addq() when a packet arrives.
1366  * returns 1 when the packet should be pre-dropped.
1367  * should be called in splimp.
1368  */
1369 static int
1370 fv_checkflow(fv, pktattr, fcache)
1371 	struct flowvalve *fv;
1372 	struct altq_pktattr *pktattr;
1373 	struct fve **fcache;
1374 {
1375 	struct fve *fve;
1376 	struct timeval now;
1377 
1378 	fv->fv_ifseq++;
1379 	FV_TIMESTAMP(&now);
1380 
1381 	if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1382 		/* no matching entry in the flowlist */
1383 		return (0);
1384 
1385 	*fcache = fve;
1386 
1387 	/* update fraction f for every FV_N packets */
1388 	if (++fve->fve_count == FV_N) {
1389 		/*
1390 		 * f = Wf * N / (fv_ifseq - fve_ifseq) + (1 - Wf) * f
1391 		 */
1392 		fve->fve_f =
1393 			(FV_N << FP_SHIFT) / (fv->fv_ifseq - fve->fve_ifseq)
1394 			+ fve->fve_f - FV_FUNSCALE(fve->fve_f);
1395 		fve->fve_ifseq = fv->fv_ifseq;
1396 		fve->fve_count = 0;
1397 	}
1398 
1399 	/*
1400 	 * overpumping test
1401 	 */
1402 	if (fve->fve_state == Green && fve->fve_p > fv->fv_pthresh) {
1403 		int fthresh;
1404 
1405 		/* calculate a threshold */
1406 		fthresh = fv_p2f(fv, fve->fve_p);
1407 		if (fve->fve_f > fthresh)
1408 			fve->fve_state = Red;
1409 	}
1410 
1411 	if (fve->fve_state == Red) {
1412 		/*
1413 		 * backoff test
1414 		 */
1415 		if (now.tv_sec - fve->fve_lastdrop.tv_sec > FV_BACKOFFTHRESH) {
1416 			/* no drop for at least FV_BACKOFFTHRESH sec */
1417 			fve->fve_p = 0;
1418 			fve->fve_state = Green;
1419 #ifdef FV_STATS
1420 			fv->fv_stats.escape++;
1421 #endif
1422 		} else {
1423 			/* block this flow */
1424 			flowlist_move_to_head(fv, fve);
1425 			fve->fve_lastdrop = now;
1426 #ifdef FV_STATS
1427 			fv->fv_stats.predrop++;
1428 #endif
1429 			return (1);
1430 		}
1431 	}
1432 
1433 	/*
1434 	 * p = (1 - Wp) * p
1435 	 */
1436 	fve->fve_p -= FV_PUNSCALE(fve->fve_p);
1437 	if (fve->fve_p < 0)
1438 		fve->fve_p = 0;
1439 #ifdef FV_STATS
1440 	fv->fv_stats.pass++;
1441 #endif
1442 	return (0);
1443 }
1444 
1445 /*
1446  * called from red_addq when a packet is dropped by red.
1447  * should be called in splimp.
1448  */
1449 static void fv_dropbyred(fv, pktattr, fcache)
1450 	struct flowvalve *fv;
1451 	struct altq_pktattr *pktattr;
1452 	struct fve *fcache;
1453 {
1454 	struct fve *fve;
1455 	struct timeval now;
1456 
1457 	if (pktattr == NULL)
1458 		return;
1459 	FV_TIMESTAMP(&now);
1460 
1461 	if (fcache != NULL)
1462 		/* the fve of this packet is already cached */
1463 		fve = fcache;
1464 	else if ((fve = flowlist_lookup(fv, pktattr, &now)) == NULL)
1465 		fve = flowlist_reclaim(fv, pktattr);
1466 
1467 	flowlist_move_to_head(fv, fve);
1468 
1469 	/*
1470 	 * update p:  the following line cancels the update
1471 	 *	      in fv_checkflow() and calculate
1472 	 *	p = Wp + (1 - Wp) * p
1473 	 */
1474 	fve->fve_p = (1 << FP_SHIFT) + fve->fve_p;
1475 
1476 	fve->fve_lastdrop = now;
1477 }
1478 
1479 #endif /* ALTQ_FLOWVALVE */
1480 
1481 #ifdef KLD_MODULE
1482 
1483 static struct altqsw red_sw =
1484 	{"red", redopen, redclose, redioctl};
1485 
1486 ALTQ_MODULE(altq_red, ALTQT_RED, &red_sw);
1487 MODULE_VERSION(altq_red, 1);
1488 
1489 #endif /* KLD_MODULE */
1490 #endif /* ALTQ3_COMPAT */
1491 
1492 #endif /* ALTQ_RED */
1493