xref: /freebsd/sys/net/altq/altq_subr.c (revision f374ba41f55c1a127303d92d830dd58eef2f5243)
1 /*-
2  * Copyright (C) 1997-2003
3  *	Sony Computer Science Laboratories Inc.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $KAME: altq_subr.c,v 1.21 2003/11/06 06:32:53 kjc Exp $
27  * $FreeBSD$
28  */
29 
30 #include "opt_altq.h"
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 
34 #include <sys/param.h>
35 #include <sys/malloc.h>
36 #include <sys/mbuf.h>
37 #include <sys/systm.h>
38 #include <sys/proc.h>
39 #include <sys/socket.h>
40 #include <sys/socketvar.h>
41 #include <sys/kernel.h>
42 #include <sys/errno.h>
43 #include <sys/syslog.h>
44 #include <sys/sysctl.h>
45 #include <sys/queue.h>
46 
47 #include <net/if.h>
48 #include <net/if_var.h>
49 #include <net/if_private.h>
50 #include <net/if_dl.h>
51 #include <net/if_types.h>
52 #include <net/vnet.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_systm.h>
56 #include <netinet/ip.h>
57 #ifdef INET6
58 #include <netinet/ip6.h>
59 #endif
60 #include <netinet/tcp.h>
61 #include <netinet/udp.h>
62 
63 #include <netpfil/pf/pf.h>
64 #include <netpfil/pf/pf_altq.h>
65 #include <net/altq/altq.h>
66 
67 /* machine dependent clock related includes */
68 #include <sys/bus.h>
69 #include <sys/cpu.h>
70 #include <sys/eventhandler.h>
71 #include <machine/clock.h>
72 #if defined(__amd64__) || defined(__i386__)
73 #include <machine/cpufunc.h>		/* for pentium tsc */
74 #include <machine/specialreg.h>		/* for CPUID_TSC */
75 #include <machine/md_var.h>		/* for cpu_feature */
76 #endif /* __amd64 || __i386__ */
77 
78 /*
79  * internal function prototypes
80  */
81 static void	tbr_timeout(void *);
82 static struct mbuf *tbr_dequeue(struct ifaltq *, int);
83 static int tbr_timer = 0;	/* token bucket regulator timer */
84 static struct callout tbr_callout;
85 
86 #ifdef ALTQ3_CLFIER_COMPAT
87 static int 	extract_ports4(struct mbuf *, struct ip *, struct flowinfo_in *);
88 #ifdef INET6
89 static int 	extract_ports6(struct mbuf *, struct ip6_hdr *,
90 			       struct flowinfo_in6 *);
91 #endif
92 static int	apply_filter4(u_int32_t, struct flow_filter *,
93 			      struct flowinfo_in *);
94 static int	apply_ppfilter4(u_int32_t, struct flow_filter *,
95 				struct flowinfo_in *);
96 #ifdef INET6
97 static int	apply_filter6(u_int32_t, struct flow_filter6 *,
98 			      struct flowinfo_in6 *);
99 #endif
100 static int	apply_tosfilter4(u_int32_t, struct flow_filter *,
101 				 struct flowinfo_in *);
102 static u_long	get_filt_handle(struct acc_classifier *, int);
103 static struct acc_filter *filth_to_filtp(struct acc_classifier *, u_long);
104 static u_int32_t filt2fibmask(struct flow_filter *);
105 
106 static void 	ip4f_cache(struct ip *, struct flowinfo_in *);
107 static int 	ip4f_lookup(struct ip *, struct flowinfo_in *);
108 static int 	ip4f_init(void);
109 static struct ip4_frag	*ip4f_alloc(void);
110 static void 	ip4f_free(struct ip4_frag *);
111 #endif /* ALTQ3_CLFIER_COMPAT */
112 
#ifdef ALTQ
/*
 * Advertise ALTQ under the kern.features sysctl tree: one "altq" node plus
 * one read-only integer (value 1) per discipline that is compiled in.
 */
SYSCTL_NODE(_kern_features, OID_AUTO, altq, CTLFLAG_RD | CTLFLAG_CAPRD, 0,
    "ALTQ packet queuing");

/* Declare the read-only feature knob kern.features.altq.<name>. */
#define	ALTQ_FEATURE(name, desc)					\
	SYSCTL_INT_WITH_LABEL(_kern_features_altq, OID_AUTO, name,	\
	    CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1,		\
	    desc, "feature")

#ifdef ALTQ_CBQ
ALTQ_FEATURE(cbq, "ALTQ Class Based Queuing discipline");
#endif
#ifdef ALTQ_CODEL
ALTQ_FEATURE(codel, "ALTQ Controlled Delay discipline");
#endif
#ifdef ALTQ_RED
ALTQ_FEATURE(red, "ALTQ Random Early Detection discipline");
#endif
#ifdef ALTQ_RIO
ALTQ_FEATURE(rio, "ALTQ Random Early Drop discipline");
#endif
#ifdef ALTQ_HFSC
ALTQ_FEATURE(hfsc, "ALTQ Hierarchical Packet Scheduler discipline");
#endif
#ifdef ALTQ_PRIQ
/* description previously misspelled the subsystem name as "ATLQ" */
ALTQ_FEATURE(priq, "ALTQ Priority Queuing discipline");
#endif
#ifdef ALTQ_FAIRQ
ALTQ_FEATURE(fairq, "ALTQ Fair Queuing discipline");
#endif
#endif /* ALTQ */
144 
145 /*
146  * alternate queueing support routines
147  */
148 
149 /* look up the queue state by the interface name and the queueing type. */
150 void *
151 altq_lookup(name, type)
152 	char *name;
153 	int type;
154 {
155 	struct ifnet *ifp;
156 
157 	if ((ifp = ifunit(name)) != NULL) {
158 		/* read if_snd unlocked */
159 		if (type != ALTQT_NONE && ifp->if_snd.altq_type == type)
160 			return (ifp->if_snd.altq_disc);
161 	}
162 
163 	return NULL;
164 }
165 
166 int
167 altq_attach(ifq, type, discipline, enqueue, dequeue, request)
168 	struct ifaltq *ifq;
169 	int type;
170 	void *discipline;
171 	int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
172 	struct mbuf *(*dequeue)(struct ifaltq *, int);
173 	int (*request)(struct ifaltq *, int, void *);
174 {
175 	IFQ_LOCK(ifq);
176 	if (!ALTQ_IS_READY(ifq)) {
177 		IFQ_UNLOCK(ifq);
178 		return ENXIO;
179 	}
180 
181 	ifq->altq_type     = type;
182 	ifq->altq_disc     = discipline;
183 	ifq->altq_enqueue  = enqueue;
184 	ifq->altq_dequeue  = dequeue;
185 	ifq->altq_request  = request;
186 	ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED);
187 	IFQ_UNLOCK(ifq);
188 	return 0;
189 }
190 
191 int
192 altq_detach(ifq)
193 	struct ifaltq *ifq;
194 {
195 	IFQ_LOCK(ifq);
196 
197 	if (!ALTQ_IS_READY(ifq)) {
198 		IFQ_UNLOCK(ifq);
199 		return ENXIO;
200 	}
201 	if (ALTQ_IS_ENABLED(ifq)) {
202 		IFQ_UNLOCK(ifq);
203 		return EBUSY;
204 	}
205 	if (!ALTQ_IS_ATTACHED(ifq)) {
206 		IFQ_UNLOCK(ifq);
207 		return (0);
208 	}
209 
210 	ifq->altq_type     = ALTQT_NONE;
211 	ifq->altq_disc     = NULL;
212 	ifq->altq_enqueue  = NULL;
213 	ifq->altq_dequeue  = NULL;
214 	ifq->altq_request  = NULL;
215 	ifq->altq_flags &= ALTQF_CANTCHANGE;
216 
217 	IFQ_UNLOCK(ifq);
218 	return 0;
219 }
220 
221 int
222 altq_enable(ifq)
223 	struct ifaltq *ifq;
224 {
225 	int s;
226 
227 	IFQ_LOCK(ifq);
228 
229 	if (!ALTQ_IS_READY(ifq)) {
230 		IFQ_UNLOCK(ifq);
231 		return ENXIO;
232 	}
233 	if (ALTQ_IS_ENABLED(ifq)) {
234 		IFQ_UNLOCK(ifq);
235 		return 0;
236 	}
237 
238 	s = splnet();
239 	IFQ_PURGE_NOLOCK(ifq);
240 	ASSERT(ifq->ifq_len == 0);
241 	ifq->ifq_drv_maxlen = 0;		/* disable bulk dequeue */
242 	ifq->altq_flags |= ALTQF_ENABLED;
243 	splx(s);
244 
245 	IFQ_UNLOCK(ifq);
246 	return 0;
247 }
248 
249 int
250 altq_disable(ifq)
251 	struct ifaltq *ifq;
252 {
253 	int s;
254 
255 	IFQ_LOCK(ifq);
256 	if (!ALTQ_IS_ENABLED(ifq)) {
257 		IFQ_UNLOCK(ifq);
258 		return 0;
259 	}
260 
261 	s = splnet();
262 	IFQ_PURGE_NOLOCK(ifq);
263 	ASSERT(ifq->ifq_len == 0);
264 	ifq->altq_flags &= ~(ALTQF_ENABLED);
265 	splx(s);
266 
267 	IFQ_UNLOCK(ifq);
268 	return 0;
269 }
270 
#ifdef ALTQ_DEBUG
/*
 * Report a failed ALTQ assertion and panic.
 *
 *	file:		source file containing the assertion
 *	line:		line number of the assertion
 *	failedexpr:	text of the expression that evaluated false
 */
void
altq_assert(const char *file, int line, const char *failedexpr)
{
	(void)printf("altq assertion \"%s\" failed: file \"%s\", line %d\n",
	    failedexpr, file, line);
	panic("altq assertion");
	/* NOTREACHED */
}
#endif /* ALTQ_DEBUG */
283 
/*
 * internal representation of token bucket parameters
 *	rate:	(byte_per_unittime << TBR_SHIFT)  / machclk_freq
 *		(((bits_per_sec) / 8) << TBR_SHIFT) / machclk_freq
 *	depth:	byte << TBR_SHIFT
 *
 * Token counts are kept as fixed-point values with TBR_SHIFT fractional
 * bits.  TBR_SCALE() widens its argument to int64_t before shifting left;
 * TBR_UNSCALE() shifts right by the same amount.
 */
#define	TBR_SHIFT	29
#define	TBR_SCALE(x)	((int64_t)(x) << TBR_SHIFT)
#define	TBR_UNSCALE(x)	((x) >> TBR_SHIFT)
294 
295 static struct mbuf *
296 tbr_dequeue(ifq, op)
297 	struct ifaltq *ifq;
298 	int op;
299 {
300 	struct tb_regulator *tbr;
301 	struct mbuf *m;
302 	int64_t interval;
303 	u_int64_t now;
304 
305 	IFQ_LOCK_ASSERT(ifq);
306 	tbr = ifq->altq_tbr;
307 	if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) {
308 		/* if this is a remove after poll, bypass tbr check */
309 	} else {
310 		/* update token only when it is negative */
311 		if (tbr->tbr_token <= 0) {
312 			now = read_machclk();
313 			interval = now - tbr->tbr_last;
314 			if (interval >= tbr->tbr_filluptime)
315 				tbr->tbr_token = tbr->tbr_depth;
316 			else {
317 				tbr->tbr_token += interval * tbr->tbr_rate;
318 				if (tbr->tbr_token > tbr->tbr_depth)
319 					tbr->tbr_token = tbr->tbr_depth;
320 			}
321 			tbr->tbr_last = now;
322 		}
323 		/* if token is still negative, don't allow dequeue */
324 		if (tbr->tbr_token <= 0)
325 			return (NULL);
326 	}
327 
328 	if (ALTQ_IS_ENABLED(ifq))
329 		m = (*ifq->altq_dequeue)(ifq, op);
330 	else {
331 		if (op == ALTDQ_POLL)
332 			_IF_POLL(ifq, m);
333 		else
334 			_IF_DEQUEUE(ifq, m);
335 	}
336 
337 	if (m != NULL && op == ALTDQ_REMOVE)
338 		tbr->tbr_token -= TBR_SCALE(m_pktlen(m));
339 	tbr->tbr_lastop = op;
340 	return (m);
341 }
342 
343 /*
344  * set a token bucket regulator.
345  * if the specified rate is zero, the token bucket regulator is deleted.
346  */
347 int
348 tbr_set(ifq, profile)
349 	struct ifaltq *ifq;
350 	struct tb_profile *profile;
351 {
352 	struct tb_regulator *tbr, *otbr;
353 
354 	if (tbr_dequeue_ptr == NULL)
355 		tbr_dequeue_ptr = tbr_dequeue;
356 
357 	if (machclk_freq == 0)
358 		init_machclk();
359 	if (machclk_freq == 0) {
360 		printf("tbr_set: no cpu clock available!\n");
361 		return (ENXIO);
362 	}
363 
364 	IFQ_LOCK(ifq);
365 	if (profile->rate == 0) {
366 		/* delete this tbr */
367 		if ((tbr = ifq->altq_tbr) == NULL) {
368 			IFQ_UNLOCK(ifq);
369 			return (ENOENT);
370 		}
371 		ifq->altq_tbr = NULL;
372 		free(tbr, M_DEVBUF);
373 		IFQ_UNLOCK(ifq);
374 		return (0);
375 	}
376 
377 	tbr = malloc(sizeof(struct tb_regulator), M_DEVBUF, M_NOWAIT | M_ZERO);
378 	if (tbr == NULL) {
379 		IFQ_UNLOCK(ifq);
380 		return (ENOMEM);
381 	}
382 
383 	tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq;
384 	tbr->tbr_depth = TBR_SCALE(profile->depth);
385 	if (tbr->tbr_rate > 0)
386 		tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate;
387 	else
388 		tbr->tbr_filluptime = LLONG_MAX;
389 	/*
390 	 *  The longest time between tbr_dequeue() calls will be about 1
391 	 *  system tick, as the callout that drives it is scheduled once per
392 	 *  tick.  The refill-time detection logic in tbr_dequeue() can only
393 	 *  properly detect the passage of up to LLONG_MAX machclk ticks.
394 	 *  Therefore, in order for this logic to function properly in the
395 	 *  extreme case, the maximum value of tbr_filluptime should be
396 	 *  LLONG_MAX less one system tick's worth of machclk ticks less
397 	 *  some additional slop factor (here one more system tick's worth
398 	 *  of machclk ticks).
399 	 */
400 	if (tbr->tbr_filluptime > (LLONG_MAX - 2 * machclk_per_tick))
401 		tbr->tbr_filluptime = LLONG_MAX - 2 * machclk_per_tick;
402 	tbr->tbr_token = tbr->tbr_depth;
403 	tbr->tbr_last = read_machclk();
404 	tbr->tbr_lastop = ALTDQ_REMOVE;
405 
406 	otbr = ifq->altq_tbr;
407 	ifq->altq_tbr = tbr;	/* set the new tbr */
408 
409 	if (otbr != NULL)
410 		free(otbr, M_DEVBUF);
411 	else {
412 		if (tbr_timer == 0) {
413 			CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0);
414 			tbr_timer = 1;
415 		}
416 	}
417 	IFQ_UNLOCK(ifq);
418 	return (0);
419 }
420 
421 /*
422  * tbr_timeout goes through the interface list, and kicks the drivers
423  * if necessary.
424  *
425  * MPSAFE
426  */
427 static void
428 tbr_timeout(arg)
429 	void *arg;
430 {
431 	VNET_ITERATOR_DECL(vnet_iter);
432 	struct ifnet *ifp;
433 	struct epoch_tracker et;
434 	int active;
435 
436 	active = 0;
437 	NET_EPOCH_ENTER(et);
438 	VNET_LIST_RLOCK_NOSLEEP();
439 	VNET_FOREACH(vnet_iter) {
440 		CURVNET_SET(vnet_iter);
441 		for (ifp = CK_STAILQ_FIRST(&V_ifnet); ifp;
442 		    ifp = CK_STAILQ_NEXT(ifp, if_link)) {
443 			/* read from if_snd unlocked */
444 			if (!TBR_IS_ENABLED(&ifp->if_snd))
445 				continue;
446 			active++;
447 			if (!IFQ_IS_EMPTY(&ifp->if_snd) &&
448 			    ifp->if_start != NULL)
449 				(*ifp->if_start)(ifp);
450 		}
451 		CURVNET_RESTORE();
452 	}
453 	VNET_LIST_RUNLOCK_NOSLEEP();
454 	NET_EPOCH_EXIT(et);
455 	if (active > 0)
456 		CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0);
457 	else
458 		tbr_timer = 0;	/* don't need tbr_timer anymore */
459 }
460 
461 /*
462  * attach a discipline to the interface.  if one already exists, it is
463  * overridden.
464  * Locking is done in the discipline specific attach functions. Basically
465  * they call back to altq_attach which takes care of the attach and locking.
466  */
467 int
468 altq_pfattach(struct pf_altq *a)
469 {
470 	int error = 0;
471 
472 	switch (a->scheduler) {
473 	case ALTQT_NONE:
474 		break;
475 #ifdef ALTQ_CBQ
476 	case ALTQT_CBQ:
477 		error = cbq_pfattach(a);
478 		break;
479 #endif
480 #ifdef ALTQ_PRIQ
481 	case ALTQT_PRIQ:
482 		error = priq_pfattach(a);
483 		break;
484 #endif
485 #ifdef ALTQ_HFSC
486 	case ALTQT_HFSC:
487 		error = hfsc_pfattach(a);
488 		break;
489 #endif
490 #ifdef ALTQ_FAIRQ
491 	case ALTQT_FAIRQ:
492 		error = fairq_pfattach(a);
493 		break;
494 #endif
495 #ifdef ALTQ_CODEL
496 	case ALTQT_CODEL:
497 		error = codel_pfattach(a);
498 		break;
499 #endif
500 	default:
501 		error = ENXIO;
502 	}
503 
504 	return (error);
505 }
506 
507 /*
508  * detach a discipline from the interface.
509  * it is possible that the discipline was already overridden by another
510  * discipline.
511  */
512 int
513 altq_pfdetach(struct pf_altq *a)
514 {
515 	struct ifnet *ifp;
516 	int s, error = 0;
517 
518 	if ((ifp = ifunit(a->ifname)) == NULL)
519 		return (EINVAL);
520 
521 	/* if this discipline is no longer referenced, just return */
522 	/* read unlocked from if_snd */
523 	if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc)
524 		return (0);
525 
526 	s = splnet();
527 	/* read unlocked from if_snd, _disable and _detach take care */
528 	if (ALTQ_IS_ENABLED(&ifp->if_snd))
529 		error = altq_disable(&ifp->if_snd);
530 	if (error == 0)
531 		error = altq_detach(&ifp->if_snd);
532 	splx(s);
533 
534 	return (error);
535 }
536 
537 /*
538  * add a discipline or a queue
539  * Locking is done in the discipline specific functions with regards to
540  * malloc with WAITOK, also it is not yet clear which lock to use.
541  */
542 int
543 altq_add(struct ifnet *ifp, struct pf_altq *a)
544 {
545 	int error = 0;
546 
547 	if (a->qname[0] != 0)
548 		return (altq_add_queue(a));
549 
550 	if (machclk_freq == 0)
551 		init_machclk();
552 	if (machclk_freq == 0)
553 		panic("altq_add: no cpu clock");
554 
555 	switch (a->scheduler) {
556 #ifdef ALTQ_CBQ
557 	case ALTQT_CBQ:
558 		error = cbq_add_altq(ifp, a);
559 		break;
560 #endif
561 #ifdef ALTQ_PRIQ
562 	case ALTQT_PRIQ:
563 		error = priq_add_altq(ifp, a);
564 		break;
565 #endif
566 #ifdef ALTQ_HFSC
567 	case ALTQT_HFSC:
568 		error = hfsc_add_altq(ifp, a);
569 		break;
570 #endif
571 #ifdef ALTQ_FAIRQ
572         case ALTQT_FAIRQ:
573                 error = fairq_add_altq(ifp, a);
574                 break;
575 #endif
576 #ifdef ALTQ_CODEL
577 	case ALTQT_CODEL:
578 		error = codel_add_altq(ifp, a);
579 		break;
580 #endif
581 	default:
582 		error = ENXIO;
583 	}
584 
585 	return (error);
586 }
587 
588 /*
589  * remove a discipline or a queue
590  * It is yet unclear what lock to use to protect this operation, the
591  * discipline specific functions will determine and grab it
592  */
593 int
594 altq_remove(struct pf_altq *a)
595 {
596 	int error = 0;
597 
598 	if (a->qname[0] != 0)
599 		return (altq_remove_queue(a));
600 
601 	switch (a->scheduler) {
602 #ifdef ALTQ_CBQ
603 	case ALTQT_CBQ:
604 		error = cbq_remove_altq(a);
605 		break;
606 #endif
607 #ifdef ALTQ_PRIQ
608 	case ALTQT_PRIQ:
609 		error = priq_remove_altq(a);
610 		break;
611 #endif
612 #ifdef ALTQ_HFSC
613 	case ALTQT_HFSC:
614 		error = hfsc_remove_altq(a);
615 		break;
616 #endif
617 #ifdef ALTQ_FAIRQ
618         case ALTQT_FAIRQ:
619                 error = fairq_remove_altq(a);
620                 break;
621 #endif
622 #ifdef ALTQ_CODEL
623 	case ALTQT_CODEL:
624 		error = codel_remove_altq(a);
625 		break;
626 #endif
627 	default:
628 		error = ENXIO;
629 	}
630 
631 	return (error);
632 }
633 
634 /*
635  * add a queue to the discipline
636  * It is yet unclear what lock to use to protect this operation, the
637  * discipline specific functions will determine and grab it
638  */
639 int
640 altq_add_queue(struct pf_altq *a)
641 {
642 	int error = 0;
643 
644 	switch (a->scheduler) {
645 #ifdef ALTQ_CBQ
646 	case ALTQT_CBQ:
647 		error = cbq_add_queue(a);
648 		break;
649 #endif
650 #ifdef ALTQ_PRIQ
651 	case ALTQT_PRIQ:
652 		error = priq_add_queue(a);
653 		break;
654 #endif
655 #ifdef ALTQ_HFSC
656 	case ALTQT_HFSC:
657 		error = hfsc_add_queue(a);
658 		break;
659 #endif
660 #ifdef ALTQ_FAIRQ
661         case ALTQT_FAIRQ:
662                 error = fairq_add_queue(a);
663                 break;
664 #endif
665 	default:
666 		error = ENXIO;
667 	}
668 
669 	return (error);
670 }
671 
672 /*
673  * remove a queue from the discipline
674  * It is yet unclear what lock to use to protect this operation, the
675  * discipline specific functions will determine and grab it
676  */
677 int
678 altq_remove_queue(struct pf_altq *a)
679 {
680 	int error = 0;
681 
682 	switch (a->scheduler) {
683 #ifdef ALTQ_CBQ
684 	case ALTQT_CBQ:
685 		error = cbq_remove_queue(a);
686 		break;
687 #endif
688 #ifdef ALTQ_PRIQ
689 	case ALTQT_PRIQ:
690 		error = priq_remove_queue(a);
691 		break;
692 #endif
693 #ifdef ALTQ_HFSC
694 	case ALTQT_HFSC:
695 		error = hfsc_remove_queue(a);
696 		break;
697 #endif
698 #ifdef ALTQ_FAIRQ
699         case ALTQT_FAIRQ:
700                 error = fairq_remove_queue(a);
701                 break;
702 #endif
703 	default:
704 		error = ENXIO;
705 	}
706 
707 	return (error);
708 }
709 
710 /*
711  * get queue statistics
712  * Locking is done in the discipline specific functions with regards to
713  * copyout operations, also it is not yet clear which lock to use.
714  */
715 int
716 altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes, int version)
717 {
718 	int error = 0;
719 
720 	switch (a->scheduler) {
721 #ifdef ALTQ_CBQ
722 	case ALTQT_CBQ:
723 		error = cbq_getqstats(a, ubuf, nbytes, version);
724 		break;
725 #endif
726 #ifdef ALTQ_PRIQ
727 	case ALTQT_PRIQ:
728 		error = priq_getqstats(a, ubuf, nbytes, version);
729 		break;
730 #endif
731 #ifdef ALTQ_HFSC
732 	case ALTQT_HFSC:
733 		error = hfsc_getqstats(a, ubuf, nbytes, version);
734 		break;
735 #endif
736 #ifdef ALTQ_FAIRQ
737         case ALTQT_FAIRQ:
738                 error = fairq_getqstats(a, ubuf, nbytes, version);
739                 break;
740 #endif
741 #ifdef ALTQ_CODEL
742 	case ALTQT_CODEL:
743 		error = codel_getqstats(a, ubuf, nbytes, version);
744 		break;
745 #endif
746 	default:
747 		error = ENXIO;
748 	}
749 
750 	return (error);
751 }
752 
753 /*
754  * read and write diffserv field in IPv4 or IPv6 header
755  */
756 u_int8_t
757 read_dsfield(m, pktattr)
758 	struct mbuf *m;
759 	struct altq_pktattr *pktattr;
760 {
761 	struct mbuf *m0;
762 	u_int8_t ds_field = 0;
763 
764 	if (pktattr == NULL ||
765 	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
766 		return ((u_int8_t)0);
767 
768 	/* verify that pattr_hdr is within the mbuf data */
769 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
770 		if ((pktattr->pattr_hdr >= m0->m_data) &&
771 		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
772 			break;
773 	if (m0 == NULL) {
774 		/* ick, pattr_hdr is stale */
775 		pktattr->pattr_af = AF_UNSPEC;
776 #ifdef ALTQ_DEBUG
777 		printf("read_dsfield: can't locate header!\n");
778 #endif
779 		return ((u_int8_t)0);
780 	}
781 
782 	if (pktattr->pattr_af == AF_INET) {
783 		struct ip *ip = (struct ip *)pktattr->pattr_hdr;
784 
785 		if (ip->ip_v != 4)
786 			return ((u_int8_t)0);	/* version mismatch! */
787 		ds_field = ip->ip_tos;
788 	}
789 #ifdef INET6
790 	else if (pktattr->pattr_af == AF_INET6) {
791 		struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
792 		u_int32_t flowlabel;
793 
794 		flowlabel = ntohl(ip6->ip6_flow);
795 		if ((flowlabel >> 28) != 6)
796 			return ((u_int8_t)0);	/* version mismatch! */
797 		ds_field = (flowlabel >> 20) & 0xff;
798 	}
799 #endif
800 	return (ds_field);
801 }
802 
803 void
804 write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, u_int8_t dsfield)
805 {
806 	struct mbuf *m0;
807 
808 	if (pktattr == NULL ||
809 	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
810 		return;
811 
812 	/* verify that pattr_hdr is within the mbuf data */
813 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
814 		if ((pktattr->pattr_hdr >= m0->m_data) &&
815 		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
816 			break;
817 	if (m0 == NULL) {
818 		/* ick, pattr_hdr is stale */
819 		pktattr->pattr_af = AF_UNSPEC;
820 #ifdef ALTQ_DEBUG
821 		printf("write_dsfield: can't locate header!\n");
822 #endif
823 		return;
824 	}
825 
826 	if (pktattr->pattr_af == AF_INET) {
827 		struct ip *ip = (struct ip *)pktattr->pattr_hdr;
828 		u_int8_t old;
829 		int32_t sum;
830 
831 		if (ip->ip_v != 4)
832 			return;		/* version mismatch! */
833 		old = ip->ip_tos;
834 		dsfield |= old & 3;	/* leave CU bits */
835 		if (old == dsfield)
836 			return;
837 		ip->ip_tos = dsfield;
838 		/*
839 		 * update checksum (from RFC1624)
840 		 *	   HC' = ~(~HC + ~m + m')
841 		 */
842 		sum = ~ntohs(ip->ip_sum) & 0xffff;
843 		sum += 0xff00 + (~old & 0xff) + dsfield;
844 		sum = (sum >> 16) + (sum & 0xffff);
845 		sum += (sum >> 16);  /* add carry */
846 
847 		ip->ip_sum = htons(~sum & 0xffff);
848 	}
849 #ifdef INET6
850 	else if (pktattr->pattr_af == AF_INET6) {
851 		struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
852 		u_int32_t flowlabel;
853 
854 		flowlabel = ntohl(ip6->ip6_flow);
855 		if ((flowlabel >> 28) != 6)
856 			return;		/* version mismatch! */
857 		flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20);
858 		ip6->ip6_flow = htonl(flowlabel);
859 	}
860 #endif
861 	return;
862 }
863 
/*
 * high resolution clock support taking advantage of a machine dependent
 * high resolution time counter (e.g., timestamp counter of intel pentium).
 * we assume
 *  - 64-bit-long monotonically-increasing counter
 *  - frequency range is 100M-4GHz (CPU speed)
 */
/*
 * if pcc is not available or disabled, emulate 256MHz using microtime():
 * microseconds shifted left by MACHCLK_SHIFT (1000000 << 8 = 256000000).
 */
#define	MACHCLK_SHIFT	8

/* non-zero: read_machclk() reads the cycle counter; 0: it uses microtime() */
int machclk_usepcc;
/* machine clock frequency in Hz; 0 until init_machclk() has set it */
u_int32_t machclk_freq;
/* machclk_freq / hz, i.e. machine clock ticks per system tick */
u_int32_t machclk_per_tick;

#if defined(__i386__) && defined(__NetBSD__)
extern u_int64_t cpu_tsc_freq;
#endif
881 
882 /* Update TSC freq with the value indicated by the caller. */
883 static void
884 tsc_freq_changed(void *arg, const struct cf_level *level, int status)
885 {
886 	/* If there was an error during the transition, don't do anything. */
887 	if (status != 0)
888 		return;
889 
890 #if defined(__amd64__) || defined(__i386__)
891 	/* If TSC is P-state invariant, don't do anything. */
892 	if (tsc_is_invariant)
893 		return;
894 #endif
895 
896 	/* Total setting for this level gives the new frequency in MHz. */
897 	init_machclk();
898 }
899 EVENTHANDLER_DEFINE(cpufreq_post_change, tsc_freq_changed, NULL,
900     EVENTHANDLER_PRI_LAST);
901 
902 static void
903 init_machclk_setup(void)
904 {
905 	callout_init(&tbr_callout, 1);
906 
907 	machclk_usepcc = 1;
908 
909 #if (!defined(__amd64__) && !defined(__i386__)) || defined(ALTQ_NOPCC)
910 	machclk_usepcc = 0;
911 #endif
912 #if defined(__FreeBSD__) && defined(SMP)
913 	machclk_usepcc = 0;
914 #endif
915 #if defined(__NetBSD__) && defined(MULTIPROCESSOR)
916 	machclk_usepcc = 0;
917 #endif
918 #if defined(__amd64__) || defined(__i386__)
919 	/* check if TSC is available */
920 	if ((cpu_feature & CPUID_TSC) == 0 ||
921 	    atomic_load_acq_64(&tsc_freq) == 0)
922 		machclk_usepcc = 0;
923 #endif
924 }
925 
926 void
927 init_machclk(void)
928 {
929 	static int called;
930 
931 	/* Call one-time initialization function. */
932 	if (!called) {
933 		init_machclk_setup();
934 		called = 1;
935 	}
936 
937 	if (machclk_usepcc == 0) {
938 		/* emulate 256MHz using microtime() */
939 		machclk_freq = 1000000 << MACHCLK_SHIFT;
940 		machclk_per_tick = machclk_freq / hz;
941 #ifdef ALTQ_DEBUG
942 		printf("altq: emulate %uHz cpu clock\n", machclk_freq);
943 #endif
944 		return;
945 	}
946 
947 	/*
948 	 * if the clock frequency (of Pentium TSC or Alpha PCC) is
949 	 * accessible, just use it.
950 	 */
951 #if defined(__amd64__) || defined(__i386__)
952 	machclk_freq = atomic_load_acq_64(&tsc_freq);
953 #endif
954 
955 	/*
956 	 * if we don't know the clock frequency, measure it.
957 	 */
958 	if (machclk_freq == 0) {
959 		static int	wait;
960 		struct timeval	tv_start, tv_end;
961 		u_int64_t	start, end, diff;
962 		int		timo;
963 
964 		microtime(&tv_start);
965 		start = read_machclk();
966 		timo = hz;	/* 1 sec */
967 		(void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo);
968 		microtime(&tv_end);
969 		end = read_machclk();
970 		diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000
971 		    + tv_end.tv_usec - tv_start.tv_usec;
972 		if (diff != 0)
973 			machclk_freq = (u_int)((end - start) * 1000000 / diff);
974 	}
975 
976 	machclk_per_tick = machclk_freq / hz;
977 
978 #ifdef ALTQ_DEBUG
979 	printf("altq: CPU clock: %uHz\n", machclk_freq);
980 #endif
981 }
982 
#if defined(__OpenBSD__) && defined(__i386__)
/*
 * Local rdtsc() for OpenBSD/i386: read the 64-bit time stamp counter.
 */
static __inline u_int64_t
rdtsc(void)
{
	u_int64_t rv;
	/* 0x0f 0x31 is the RDTSC opcode; "=A" binds rv to the edx:eax pair */
	__asm __volatile(".byte 0x0f, 0x31" : "=A" (rv));
	return (rv);
}
#endif /* __OpenBSD__ && __i386__ */
992 
993 u_int64_t
994 read_machclk(void)
995 {
996 	u_int64_t val;
997 
998 	if (machclk_usepcc) {
999 #if defined(__amd64__) || defined(__i386__)
1000 		val = rdtsc();
1001 #else
1002 		panic("read_machclk");
1003 #endif
1004 	} else {
1005 		struct timeval tv, boottime;
1006 
1007 		microtime(&tv);
1008 		getboottime(&boottime);
1009 		val = (((u_int64_t)(tv.tv_sec - boottime.tv_sec) * 1000000
1010 		    + tv.tv_usec) << MACHCLK_SHIFT);
1011 	}
1012 	return (val);
1013 }
1014 
1015 #ifdef ALTQ3_CLFIER_COMPAT
1016 
/* fallback IPsec protocol numbers for when the headers do not define them */
#ifndef IPPROTO_ESP
#define	IPPROTO_ESP	50		/* encapsulating security payload */
#endif
#ifndef IPPROTO_AH
#define	IPPROTO_AH	51		/* authentication header */
#endif
1023 
1024 /*
1025  * extract flow information from a given packet.
1026  * filt_mask shows flowinfo fields required.
1027  * we assume the ip header is in one mbuf, and addresses and ports are
1028  * in network byte order.
1029  */
1030 int
1031 altq_extractflow(m, af, flow, filt_bmask)
1032 	struct mbuf *m;
1033 	int af;
1034 	struct flowinfo *flow;
1035 	u_int32_t	filt_bmask;
1036 {
1037 
1038 	switch (af) {
1039 	case PF_INET: {
1040 		struct flowinfo_in *fin;
1041 		struct ip *ip;
1042 
1043 		ip = mtod(m, struct ip *);
1044 
1045 		if (ip->ip_v != 4)
1046 			break;
1047 
1048 		fin = (struct flowinfo_in *)flow;
1049 		fin->fi_len = sizeof(struct flowinfo_in);
1050 		fin->fi_family = AF_INET;
1051 
1052 		fin->fi_proto = ip->ip_p;
1053 		fin->fi_tos = ip->ip_tos;
1054 
1055 		fin->fi_src.s_addr = ip->ip_src.s_addr;
1056 		fin->fi_dst.s_addr = ip->ip_dst.s_addr;
1057 
1058 		if (filt_bmask & FIMB4_PORTS)
1059 			/* if port info is required, extract port numbers */
1060 			extract_ports4(m, ip, fin);
1061 		else {
1062 			fin->fi_sport = 0;
1063 			fin->fi_dport = 0;
1064 			fin->fi_gpi = 0;
1065 		}
1066 		return (1);
1067 	}
1068 
1069 #ifdef INET6
1070 	case PF_INET6: {
1071 		struct flowinfo_in6 *fin6;
1072 		struct ip6_hdr *ip6;
1073 
1074 		ip6 = mtod(m, struct ip6_hdr *);
1075 		/* should we check the ip version? */
1076 
1077 		fin6 = (struct flowinfo_in6 *)flow;
1078 		fin6->fi6_len = sizeof(struct flowinfo_in6);
1079 		fin6->fi6_family = AF_INET6;
1080 
1081 		fin6->fi6_proto = ip6->ip6_nxt;
1082 		fin6->fi6_tclass   = IPV6_TRAFFIC_CLASS(ip6);
1083 
1084 		fin6->fi6_flowlabel = ip6->ip6_flow & htonl(0x000fffff);
1085 		fin6->fi6_src = ip6->ip6_src;
1086 		fin6->fi6_dst = ip6->ip6_dst;
1087 
1088 		if ((filt_bmask & FIMB6_PORTS) ||
1089 		    ((filt_bmask & FIMB6_PROTO)
1090 		     && ip6->ip6_nxt > IPPROTO_IPV6))
1091 			/*
1092 			 * if port info is required, or proto is required
1093 			 * but there are option headers, extract port
1094 			 * and protocol numbers.
1095 			 */
1096 			extract_ports6(m, ip6, fin6);
1097 		else {
1098 			fin6->fi6_sport = 0;
1099 			fin6->fi6_dport = 0;
1100 			fin6->fi6_gpi = 0;
1101 		}
1102 		return (1);
1103 	}
1104 #endif /* INET6 */
1105 
1106 	default:
1107 		break;
1108 	}
1109 
1110 	/* failed */
1111 	flow->fi_len = sizeof(struct flowinfo);
1112 	flow->fi_family = AF_UNSPEC;
1113 	return (0);
1114 }
1115 
/*
 * helper routine to extract port numbers
 */
/*
 * structure for ipsec and ipv6 option header template: overlaid on the
 * start of a header to read its next-header and length bytes and, for an
 * authentication header, the SPI.
 */
struct _opt6 {
	u_int8_t	opt6_nxt;	/* next header */
	u_int8_t	opt6_hlen;	/* header extension length */
	u_int16_t	_pad;
	u_int32_t	ah_spi;		/* security parameter index
					   for authentication header */
};
1127 
1128 /*
1129  * extract port numbers from a ipv4 packet.
1130  */
1131 static int
1132 extract_ports4(m, ip, fin)
1133 	struct mbuf *m;
1134 	struct ip *ip;
1135 	struct flowinfo_in *fin;
1136 {
1137 	struct mbuf *m0;
1138 	u_short ip_off;
1139 	u_int8_t proto;
1140 	int 	off;
1141 
1142 	fin->fi_sport = 0;
1143 	fin->fi_dport = 0;
1144 	fin->fi_gpi = 0;
1145 
1146 	ip_off = ntohs(ip->ip_off);
1147 	/* if it is a fragment, try cached fragment info */
1148 	if (ip_off & IP_OFFMASK) {
1149 		ip4f_lookup(ip, fin);
1150 		return (1);
1151 	}
1152 
1153 	/* locate the mbuf containing the protocol header */
1154 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
1155 		if (((caddr_t)ip >= m0->m_data) &&
1156 		    ((caddr_t)ip < m0->m_data + m0->m_len))
1157 			break;
1158 	if (m0 == NULL) {
1159 #ifdef ALTQ_DEBUG
1160 		printf("extract_ports4: can't locate header! ip=%p\n", ip);
1161 #endif
1162 		return (0);
1163 	}
1164 	off = ((caddr_t)ip - m0->m_data) + (ip->ip_hl << 2);
1165 	proto = ip->ip_p;
1166 
1167 #ifdef ALTQ_IPSEC
1168  again:
1169 #endif
1170 	while (off >= m0->m_len) {
1171 		off -= m0->m_len;
1172 		m0 = m0->m_next;
1173 		if (m0 == NULL)
1174 			return (0);  /* bogus ip_hl! */
1175 	}
1176 	if (m0->m_len < off + 4)
1177 		return (0);
1178 
1179 	switch (proto) {
1180 	case IPPROTO_TCP:
1181 	case IPPROTO_UDP: {
1182 		struct udphdr *udp;
1183 
1184 		udp = (struct udphdr *)(mtod(m0, caddr_t) + off);
1185 		fin->fi_sport = udp->uh_sport;
1186 		fin->fi_dport = udp->uh_dport;
1187 		fin->fi_proto = proto;
1188 		}
1189 		break;
1190 
1191 #ifdef ALTQ_IPSEC
1192 	case IPPROTO_ESP:
1193 		if (fin->fi_gpi == 0){
1194 			u_int32_t *gpi;
1195 
1196 			gpi = (u_int32_t *)(mtod(m0, caddr_t) + off);
1197 			fin->fi_gpi   = *gpi;
1198 		}
1199 		fin->fi_proto = proto;
1200 		break;
1201 
1202 	case IPPROTO_AH: {
1203 			/* get next header and header length */
1204 			struct _opt6 *opt6;
1205 
1206 			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
1207 			proto = opt6->opt6_nxt;
1208 			off += 8 + (opt6->opt6_hlen * 4);
1209 			if (fin->fi_gpi == 0 && m0->m_len >= off + 8)
1210 				fin->fi_gpi = opt6->ah_spi;
1211 		}
1212 		/* goto the next header */
1213 		goto again;
1214 #endif  /* ALTQ_IPSEC */
1215 
1216 	default:
1217 		fin->fi_proto = proto;
1218 		return (0);
1219 	}
1220 
1221 	/* if this is a first fragment, cache it. */
1222 	if (ip_off & IP_MF)
1223 		ip4f_cache(ip, fin);
1224 
1225 	return (1);
1226 }
1227 
1228 #ifdef INET6
1229 static int
1230 extract_ports6(m, ip6, fin6)
1231 	struct mbuf *m;
1232 	struct ip6_hdr *ip6;
1233 	struct flowinfo_in6 *fin6;
1234 {
1235 	struct mbuf *m0;
1236 	int	off;
1237 	u_int8_t proto;
1238 
1239 	fin6->fi6_gpi   = 0;
1240 	fin6->fi6_sport = 0;
1241 	fin6->fi6_dport = 0;
1242 
1243 	/* locate the mbuf containing the protocol header */
1244 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
1245 		if (((caddr_t)ip6 >= m0->m_data) &&
1246 		    ((caddr_t)ip6 < m0->m_data + m0->m_len))
1247 			break;
1248 	if (m0 == NULL) {
1249 #ifdef ALTQ_DEBUG
1250 		printf("extract_ports6: can't locate header! ip6=%p\n", ip6);
1251 #endif
1252 		return (0);
1253 	}
1254 	off = ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr);
1255 
1256 	proto = ip6->ip6_nxt;
1257 	do {
1258 		while (off >= m0->m_len) {
1259 			off -= m0->m_len;
1260 			m0 = m0->m_next;
1261 			if (m0 == NULL)
1262 				return (0);
1263 		}
1264 		if (m0->m_len < off + 4)
1265 			return (0);
1266 
1267 		switch (proto) {
1268 		case IPPROTO_TCP:
1269 		case IPPROTO_UDP: {
1270 			struct udphdr *udp;
1271 
1272 			udp = (struct udphdr *)(mtod(m0, caddr_t) + off);
1273 			fin6->fi6_sport = udp->uh_sport;
1274 			fin6->fi6_dport = udp->uh_dport;
1275 			fin6->fi6_proto = proto;
1276 			}
1277 			return (1);
1278 
1279 		case IPPROTO_ESP:
1280 			if (fin6->fi6_gpi == 0) {
1281 				u_int32_t *gpi;
1282 
1283 				gpi = (u_int32_t *)(mtod(m0, caddr_t) + off);
1284 				fin6->fi6_gpi   = *gpi;
1285 			}
1286 			fin6->fi6_proto = proto;
1287 			return (1);
1288 
1289 		case IPPROTO_AH: {
1290 			/* get next header and header length */
1291 			struct _opt6 *opt6;
1292 
1293 			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
1294 			if (fin6->fi6_gpi == 0 && m0->m_len >= off + 8)
1295 				fin6->fi6_gpi = opt6->ah_spi;
1296 			proto = opt6->opt6_nxt;
1297 			off += 8 + (opt6->opt6_hlen * 4);
1298 			/* goto the next header */
1299 			break;
1300 			}
1301 
1302 		case IPPROTO_HOPOPTS:
1303 		case IPPROTO_ROUTING:
1304 		case IPPROTO_DSTOPTS: {
1305 			/* get next header and header length */
1306 			struct _opt6 *opt6;
1307 
1308 			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
1309 			proto = opt6->opt6_nxt;
1310 			off += (opt6->opt6_hlen + 1) * 8;
1311 			/* goto the next header */
1312 			break;
1313 			}
1314 
1315 		case IPPROTO_FRAGMENT:
1316 			/* ipv6 fragmentations are not supported yet */
1317 		default:
1318 			fin6->fi6_proto = proto;
1319 			return (0);
1320 		}
1321 	} while (1);
1322 	/*NOTREACHED*/
1323 }
1324 #endif /* INET6 */
1325 
1326 /*
1327  * altq common classifier
1328  */
1329 int
1330 acc_add_filter(classifier, filter, class, phandle)
1331 	struct acc_classifier *classifier;
1332 	struct flow_filter *filter;
1333 	void	*class;
1334 	u_long	*phandle;
1335 {
1336 	struct acc_filter *afp, *prev, *tmp;
1337 	int	i, s;
1338 
1339 #ifdef INET6
1340 	if (filter->ff_flow.fi_family != AF_INET &&
1341 	    filter->ff_flow.fi_family != AF_INET6)
1342 		return (EINVAL);
1343 #else
1344 	if (filter->ff_flow.fi_family != AF_INET)
1345 		return (EINVAL);
1346 #endif
1347 
1348 	afp = malloc(sizeof(struct acc_filter),
1349 	       M_DEVBUF, M_WAITOK);
1350 	if (afp == NULL)
1351 		return (ENOMEM);
1352 	bzero(afp, sizeof(struct acc_filter));
1353 
1354 	afp->f_filter = *filter;
1355 	afp->f_class = class;
1356 
1357 	i = ACC_WILDCARD_INDEX;
1358 	if (filter->ff_flow.fi_family == AF_INET) {
1359 		struct flow_filter *filter4 = &afp->f_filter;
1360 
1361 		/*
1362 		 * if address is 0, it's a wildcard.  if address mask
1363 		 * isn't set, use full mask.
1364 		 */
1365 		if (filter4->ff_flow.fi_dst.s_addr == 0)
1366 			filter4->ff_mask.mask_dst.s_addr = 0;
1367 		else if (filter4->ff_mask.mask_dst.s_addr == 0)
1368 			filter4->ff_mask.mask_dst.s_addr = 0xffffffff;
1369 		if (filter4->ff_flow.fi_src.s_addr == 0)
1370 			filter4->ff_mask.mask_src.s_addr = 0;
1371 		else if (filter4->ff_mask.mask_src.s_addr == 0)
1372 			filter4->ff_mask.mask_src.s_addr = 0xffffffff;
1373 
1374 		/* clear extra bits in addresses  */
1375 		   filter4->ff_flow.fi_dst.s_addr &=
1376 		       filter4->ff_mask.mask_dst.s_addr;
1377 		   filter4->ff_flow.fi_src.s_addr &=
1378 		       filter4->ff_mask.mask_src.s_addr;
1379 
1380 		/*
1381 		 * if dst address is a wildcard, use hash-entry
1382 		 * ACC_WILDCARD_INDEX.
1383 		 */
1384 		if (filter4->ff_mask.mask_dst.s_addr != 0xffffffff)
1385 			i = ACC_WILDCARD_INDEX;
1386 		else
1387 			i = ACC_GET_HASH_INDEX(filter4->ff_flow.fi_dst.s_addr);
1388 	}
1389 #ifdef INET6
1390 	else if (filter->ff_flow.fi_family == AF_INET6) {
1391 		struct flow_filter6 *filter6 =
1392 			(struct flow_filter6 *)&afp->f_filter;
1393 #ifndef IN6MASK0 /* taken from kame ipv6 */
1394 #define	IN6MASK0	{{{ 0, 0, 0, 0 }}}
1395 #define	IN6MASK128	{{{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }}}
1396 		const struct in6_addr in6mask0 = IN6MASK0;
1397 		const struct in6_addr in6mask128 = IN6MASK128;
1398 #endif
1399 
1400 		if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_dst))
1401 			filter6->ff_mask6.mask6_dst = in6mask0;
1402 		else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_dst))
1403 			filter6->ff_mask6.mask6_dst = in6mask128;
1404 		if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_src))
1405 			filter6->ff_mask6.mask6_src = in6mask0;
1406 		else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_src))
1407 			filter6->ff_mask6.mask6_src = in6mask128;
1408 
1409 		/* clear extra bits in addresses  */
1410 		for (i = 0; i < 16; i++)
1411 			filter6->ff_flow6.fi6_dst.s6_addr[i] &=
1412 			    filter6->ff_mask6.mask6_dst.s6_addr[i];
1413 		for (i = 0; i < 16; i++)
1414 			filter6->ff_flow6.fi6_src.s6_addr[i] &=
1415 			    filter6->ff_mask6.mask6_src.s6_addr[i];
1416 
1417 		if (filter6->ff_flow6.fi6_flowlabel == 0)
1418 			i = ACC_WILDCARD_INDEX;
1419 		else
1420 			i = ACC_GET_HASH_INDEX(filter6->ff_flow6.fi6_flowlabel);
1421 	}
1422 #endif /* INET6 */
1423 
1424 	afp->f_handle = get_filt_handle(classifier, i);
1425 
1426 	/* update filter bitmask */
1427 	afp->f_fbmask = filt2fibmask(filter);
1428 	classifier->acc_fbmask |= afp->f_fbmask;
1429 
1430 	/*
1431 	 * add this filter to the filter list.
1432 	 * filters are ordered from the highest rule number.
1433 	 */
1434 	s = splnet();
1435 	prev = NULL;
1436 	LIST_FOREACH(tmp, &classifier->acc_filters[i], f_chain) {
1437 		if (tmp->f_filter.ff_ruleno > afp->f_filter.ff_ruleno)
1438 			prev = tmp;
1439 		else
1440 			break;
1441 	}
1442 	if (prev == NULL)
1443 		LIST_INSERT_HEAD(&classifier->acc_filters[i], afp, f_chain);
1444 	else
1445 		LIST_INSERT_AFTER(prev, afp, f_chain);
1446 	splx(s);
1447 
1448 	*phandle = afp->f_handle;
1449 	return (0);
1450 }
1451 
1452 int
1453 acc_delete_filter(classifier, handle)
1454 	struct acc_classifier *classifier;
1455 	u_long handle;
1456 {
1457 	struct acc_filter *afp;
1458 	int	s;
1459 
1460 	if ((afp = filth_to_filtp(classifier, handle)) == NULL)
1461 		return (EINVAL);
1462 
1463 	s = splnet();
1464 	LIST_REMOVE(afp, f_chain);
1465 	splx(s);
1466 
1467 	free(afp, M_DEVBUF);
1468 
1469 	/* todo: update filt_bmask */
1470 
1471 	return (0);
1472 }
1473 
1474 /*
1475  * delete filters referencing to the specified class.
1476  * if the all flag is not 0, delete all the filters.
1477  */
1478 int
1479 acc_discard_filters(classifier, class, all)
1480 	struct acc_classifier *classifier;
1481 	void	*class;
1482 	int	all;
1483 {
1484 	struct acc_filter *afp;
1485 	int	i, s;
1486 
1487 	s = splnet();
1488 	for (i = 0; i < ACC_FILTER_TABLESIZE; i++) {
1489 		do {
1490 			LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
1491 				if (all || afp->f_class == class) {
1492 					LIST_REMOVE(afp, f_chain);
1493 					free(afp, M_DEVBUF);
1494 					/* start again from the head */
1495 					break;
1496 				}
1497 		} while (afp != NULL);
1498 	}
1499 	splx(s);
1500 
1501 	if (all)
1502 		classifier->acc_fbmask = 0;
1503 
1504 	return (0);
1505 }
1506 
1507 void *
1508 acc_classify(clfier, m, af)
1509 	void *clfier;
1510 	struct mbuf *m;
1511 	int af;
1512 {
1513 	struct acc_classifier *classifier;
1514 	struct flowinfo flow;
1515 	struct acc_filter *afp;
1516 	int	i;
1517 
1518 	classifier = (struct acc_classifier *)clfier;
1519 	altq_extractflow(m, af, &flow, classifier->acc_fbmask);
1520 
1521 	if (flow.fi_family == AF_INET) {
1522 		struct flowinfo_in *fp = (struct flowinfo_in *)&flow;
1523 
1524 		if ((classifier->acc_fbmask & FIMB4_ALL) == FIMB4_TOS) {
1525 			/* only tos is used */
1526 			LIST_FOREACH(afp,
1527 				 &classifier->acc_filters[ACC_WILDCARD_INDEX],
1528 				 f_chain)
1529 				if (apply_tosfilter4(afp->f_fbmask,
1530 						     &afp->f_filter, fp))
1531 					/* filter matched */
1532 					return (afp->f_class);
1533 		} else if ((classifier->acc_fbmask &
1534 			(~(FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) & FIMB4_ALL))
1535 		    == 0) {
1536 			/* only proto and ports are used */
1537 			LIST_FOREACH(afp,
1538 				 &classifier->acc_filters[ACC_WILDCARD_INDEX],
1539 				 f_chain)
1540 				if (apply_ppfilter4(afp->f_fbmask,
1541 						    &afp->f_filter, fp))
1542 					/* filter matched */
1543 					return (afp->f_class);
1544 		} else {
1545 			/* get the filter hash entry from its dest address */
1546 			i = ACC_GET_HASH_INDEX(fp->fi_dst.s_addr);
1547 			do {
1548 				/*
1549 				 * go through this loop twice.  first for dst
1550 				 * hash, second for wildcards.
1551 				 */
1552 				LIST_FOREACH(afp, &classifier->acc_filters[i],
1553 					     f_chain)
1554 					if (apply_filter4(afp->f_fbmask,
1555 							  &afp->f_filter, fp))
1556 						/* filter matched */
1557 						return (afp->f_class);
1558 
1559 				/*
1560 				 * check again for filters with a dst addr
1561 				 * wildcard.
1562 				 * (daddr == 0 || dmask != 0xffffffff).
1563 				 */
1564 				if (i != ACC_WILDCARD_INDEX)
1565 					i = ACC_WILDCARD_INDEX;
1566 				else
1567 					break;
1568 			} while (1);
1569 		}
1570 	}
1571 #ifdef INET6
1572 	else if (flow.fi_family == AF_INET6) {
1573 		struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)&flow;
1574 
1575 		/* get the filter hash entry from its flow ID */
1576 		if (fp6->fi6_flowlabel != 0)
1577 			i = ACC_GET_HASH_INDEX(fp6->fi6_flowlabel);
1578 		else
1579 			/* flowlable can be zero */
1580 			i = ACC_WILDCARD_INDEX;
1581 
1582 		/* go through this loop twice.  first for flow hash, second
1583 		   for wildcards. */
1584 		do {
1585 			LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
1586 				if (apply_filter6(afp->f_fbmask,
1587 					(struct flow_filter6 *)&afp->f_filter,
1588 					fp6))
1589 					/* filter matched */
1590 					return (afp->f_class);
1591 
1592 			/*
1593 			 * check again for filters with a wildcard.
1594 			 */
1595 			if (i != ACC_WILDCARD_INDEX)
1596 				i = ACC_WILDCARD_INDEX;
1597 			else
1598 				break;
1599 		} while (1);
1600 	}
1601 #endif /* INET6 */
1602 
1603 	/* no filter matched */
1604 	return (NULL);
1605 }
1606 
1607 static int
1608 apply_filter4(fbmask, filt, pkt)
1609 	u_int32_t	fbmask;
1610 	struct flow_filter *filt;
1611 	struct flowinfo_in *pkt;
1612 {
1613 	if (filt->ff_flow.fi_family != AF_INET)
1614 		return (0);
1615 	if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport)
1616 		return (0);
1617 	if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport)
1618 		return (0);
1619 	if ((fbmask & FIMB4_DADDR) &&
1620 	    filt->ff_flow.fi_dst.s_addr !=
1621 	    (pkt->fi_dst.s_addr & filt->ff_mask.mask_dst.s_addr))
1622 		return (0);
1623 	if ((fbmask & FIMB4_SADDR) &&
1624 	    filt->ff_flow.fi_src.s_addr !=
1625 	    (pkt->fi_src.s_addr & filt->ff_mask.mask_src.s_addr))
1626 		return (0);
1627 	if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto)
1628 		return (0);
1629 	if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos !=
1630 	    (pkt->fi_tos & filt->ff_mask.mask_tos))
1631 		return (0);
1632 	if ((fbmask & FIMB4_GPI) && filt->ff_flow.fi_gpi != (pkt->fi_gpi))
1633 		return (0);
1634 	/* match */
1635 	return (1);
1636 }
1637 
1638 /*
1639  * filter matching function optimized for a common case that checks
1640  * only protocol and port numbers
1641  */
1642 static int
1643 apply_ppfilter4(fbmask, filt, pkt)
1644 	u_int32_t	fbmask;
1645 	struct flow_filter *filt;
1646 	struct flowinfo_in *pkt;
1647 {
1648 	if (filt->ff_flow.fi_family != AF_INET)
1649 		return (0);
1650 	if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport)
1651 		return (0);
1652 	if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport)
1653 		return (0);
1654 	if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto)
1655 		return (0);
1656 	/* match */
1657 	return (1);
1658 }
1659 
1660 /*
1661  * filter matching function only for tos field.
1662  */
1663 static int
1664 apply_tosfilter4(fbmask, filt, pkt)
1665 	u_int32_t	fbmask;
1666 	struct flow_filter *filt;
1667 	struct flowinfo_in *pkt;
1668 {
1669 	if (filt->ff_flow.fi_family != AF_INET)
1670 		return (0);
1671 	if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos !=
1672 	    (pkt->fi_tos & filt->ff_mask.mask_tos))
1673 		return (0);
1674 	/* match */
1675 	return (1);
1676 }
1677 
1678 #ifdef INET6
1679 static int
1680 apply_filter6(fbmask, filt, pkt)
1681 	u_int32_t	fbmask;
1682 	struct flow_filter6 *filt;
1683 	struct flowinfo_in6 *pkt;
1684 {
1685 	int i;
1686 
1687 	if (filt->ff_flow6.fi6_family != AF_INET6)
1688 		return (0);
1689 	if ((fbmask & FIMB6_FLABEL) &&
1690 	    filt->ff_flow6.fi6_flowlabel != pkt->fi6_flowlabel)
1691 		return (0);
1692 	if ((fbmask & FIMB6_PROTO) &&
1693 	    filt->ff_flow6.fi6_proto != pkt->fi6_proto)
1694 		return (0);
1695 	if ((fbmask & FIMB6_SPORT) &&
1696 	    filt->ff_flow6.fi6_sport != pkt->fi6_sport)
1697 		return (0);
1698 	if ((fbmask & FIMB6_DPORT) &&
1699 	    filt->ff_flow6.fi6_dport != pkt->fi6_dport)
1700 		return (0);
1701 	if (fbmask & FIMB6_SADDR) {
1702 		for (i = 0; i < 4; i++)
1703 			if (filt->ff_flow6.fi6_src.s6_addr32[i] !=
1704 			    (pkt->fi6_src.s6_addr32[i] &
1705 			     filt->ff_mask6.mask6_src.s6_addr32[i]))
1706 				return (0);
1707 	}
1708 	if (fbmask & FIMB6_DADDR) {
1709 		for (i = 0; i < 4; i++)
1710 			if (filt->ff_flow6.fi6_dst.s6_addr32[i] !=
1711 			    (pkt->fi6_dst.s6_addr32[i] &
1712 			     filt->ff_mask6.mask6_dst.s6_addr32[i]))
1713 				return (0);
1714 	}
1715 	if ((fbmask & FIMB6_TCLASS) &&
1716 	    filt->ff_flow6.fi6_tclass !=
1717 	    (pkt->fi6_tclass & filt->ff_mask6.mask6_tclass))
1718 		return (0);
1719 	if ((fbmask & FIMB6_GPI) &&
1720 	    filt->ff_flow6.fi6_gpi != pkt->fi6_gpi)
1721 		return (0);
1722 	/* match */
1723 	return (1);
1724 }
1725 #endif /* INET6 */
1726 
1727 /*
1728  *  filter handle:
1729  *	bit 20-28: index to the filter hash table
1730  *	bit  0-19: unique id in the hash bucket.
1731  */
1732 static u_long
1733 get_filt_handle(classifier, i)
1734 	struct acc_classifier *classifier;
1735 	int	i;
1736 {
1737 	static u_long handle_number = 1;
1738 	u_long 	handle;
1739 	struct acc_filter *afp;
1740 
1741 	while (1) {
1742 		handle = handle_number++ & 0x000fffff;
1743 
1744 		if (LIST_EMPTY(&classifier->acc_filters[i]))
1745 			break;
1746 
1747 		LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
1748 			if ((afp->f_handle & 0x000fffff) == handle)
1749 				break;
1750 		if (afp == NULL)
1751 			break;
1752 		/* this handle is already used, try again */
1753 	}
1754 
1755 	return ((i << 20) | handle);
1756 }
1757 
1758 /* convert filter handle to filter pointer */
1759 static struct acc_filter *
1760 filth_to_filtp(classifier, handle)
1761 	struct acc_classifier *classifier;
1762 	u_long handle;
1763 {
1764 	struct acc_filter *afp;
1765 	int	i;
1766 
1767 	i = ACC_GET_HINDEX(handle);
1768 
1769 	LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
1770 		if (afp->f_handle == handle)
1771 			return (afp);
1772 
1773 	return (NULL);
1774 }
1775 
1776 /* create flowinfo bitmask */
1777 static u_int32_t
1778 filt2fibmask(filt)
1779 	struct flow_filter *filt;
1780 {
1781 	u_int32_t mask = 0;
1782 #ifdef INET6
1783 	struct flow_filter6 *filt6;
1784 #endif
1785 
1786 	switch (filt->ff_flow.fi_family) {
1787 	case AF_INET:
1788 		if (filt->ff_flow.fi_proto != 0)
1789 			mask |= FIMB4_PROTO;
1790 		if (filt->ff_flow.fi_tos != 0)
1791 			mask |= FIMB4_TOS;
1792 		if (filt->ff_flow.fi_dst.s_addr != 0)
1793 			mask |= FIMB4_DADDR;
1794 		if (filt->ff_flow.fi_src.s_addr != 0)
1795 			mask |= FIMB4_SADDR;
1796 		if (filt->ff_flow.fi_sport != 0)
1797 			mask |= FIMB4_SPORT;
1798 		if (filt->ff_flow.fi_dport != 0)
1799 			mask |= FIMB4_DPORT;
1800 		if (filt->ff_flow.fi_gpi != 0)
1801 			mask |= FIMB4_GPI;
1802 		break;
1803 #ifdef INET6
1804 	case AF_INET6:
1805 		filt6 = (struct flow_filter6 *)filt;
1806 
1807 		if (filt6->ff_flow6.fi6_proto != 0)
1808 			mask |= FIMB6_PROTO;
1809 		if (filt6->ff_flow6.fi6_tclass != 0)
1810 			mask |= FIMB6_TCLASS;
1811 		if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_dst))
1812 			mask |= FIMB6_DADDR;
1813 		if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_src))
1814 			mask |= FIMB6_SADDR;
1815 		if (filt6->ff_flow6.fi6_sport != 0)
1816 			mask |= FIMB6_SPORT;
1817 		if (filt6->ff_flow6.fi6_dport != 0)
1818 			mask |= FIMB6_DPORT;
1819 		if (filt6->ff_flow6.fi6_gpi != 0)
1820 			mask |= FIMB6_GPI;
1821 		if (filt6->ff_flow6.fi6_flowlabel != 0)
1822 			mask |= FIMB6_FLABEL;
1823 		break;
1824 #endif /* INET6 */
1825 	}
1826 	return (mask);
1827 }
1828 
/*
 * helper functions to handle IPv4 fragments.
 * currently only in-sequence fragments are handled.
 *	- fragment info is cached in an LRU list.
 *	- when a first fragment is found, cache its flow info.
 *	- when a non-first fragment is found, look up the cache.
 */

/*
 * one entry of the fragment cache.  entries in use are kept at the head
 * of ip4f_list (ip4f_alloc), released ones are moved to the tail
 * (ip4f_free).
 */
struct ip4_frag {
    TAILQ_ENTRY(ip4_frag) ip4f_chain;	/* linkage in ip4f_list */
    char    ip4f_valid;			/* 1 while the entry is in use */
    u_short ip4f_id;			/* ip_id, stored as found in the
					   header (no byte swap) */
    struct flowinfo_in ip4f_info;	/* proto, addresses, ports and gpi
					   saved from the first fragment */
};

static TAILQ_HEAD(ip4f_list, ip4_frag) ip4f_list; /* IPv4 fragment cache */

#define	IP4F_TABSIZE		16	/* IPv4 fragment cache size */
1847 
1848 static void
1849 ip4f_cache(ip, fin)
1850 	struct ip *ip;
1851 	struct flowinfo_in *fin;
1852 {
1853 	struct ip4_frag *fp;
1854 
1855 	if (TAILQ_EMPTY(&ip4f_list)) {
1856 		/* first time call, allocate fragment cache entries. */
1857 		if (ip4f_init() < 0)
1858 			/* allocation failed! */
1859 			return;
1860 	}
1861 
1862 	fp = ip4f_alloc();
1863 	fp->ip4f_id = ip->ip_id;
1864 	fp->ip4f_info.fi_proto = ip->ip_p;
1865 	fp->ip4f_info.fi_src.s_addr = ip->ip_src.s_addr;
1866 	fp->ip4f_info.fi_dst.s_addr = ip->ip_dst.s_addr;
1867 
1868 	/* save port numbers */
1869 	fp->ip4f_info.fi_sport = fin->fi_sport;
1870 	fp->ip4f_info.fi_dport = fin->fi_dport;
1871 	fp->ip4f_info.fi_gpi   = fin->fi_gpi;
1872 }
1873 
1874 static int
1875 ip4f_lookup(ip, fin)
1876 	struct ip *ip;
1877 	struct flowinfo_in *fin;
1878 {
1879 	struct ip4_frag *fp;
1880 
1881 	for (fp = TAILQ_FIRST(&ip4f_list); fp != NULL && fp->ip4f_valid;
1882 	     fp = TAILQ_NEXT(fp, ip4f_chain))
1883 		if (ip->ip_id == fp->ip4f_id &&
1884 		    ip->ip_src.s_addr == fp->ip4f_info.fi_src.s_addr &&
1885 		    ip->ip_dst.s_addr == fp->ip4f_info.fi_dst.s_addr &&
1886 		    ip->ip_p == fp->ip4f_info.fi_proto) {
1887 			/* found the matching entry */
1888 			fin->fi_sport = fp->ip4f_info.fi_sport;
1889 			fin->fi_dport = fp->ip4f_info.fi_dport;
1890 			fin->fi_gpi   = fp->ip4f_info.fi_gpi;
1891 
1892 			if ((ntohs(ip->ip_off) & IP_MF) == 0)
1893 				/* this is the last fragment,
1894 				   release the entry. */
1895 				ip4f_free(fp);
1896 
1897 			return (1);
1898 		}
1899 
1900 	/* no matching entry found */
1901 	return (0);
1902 }
1903 
1904 static int
1905 ip4f_init(void)
1906 {
1907 	struct ip4_frag *fp;
1908 	int i;
1909 
1910 	TAILQ_INIT(&ip4f_list);
1911 	for (i=0; i<IP4F_TABSIZE; i++) {
1912 		fp = malloc(sizeof(struct ip4_frag),
1913 		       M_DEVBUF, M_NOWAIT);
1914 		if (fp == NULL) {
1915 			printf("ip4f_init: can't alloc %dth entry!\n", i);
1916 			if (i == 0)
1917 				return (-1);
1918 			return (0);
1919 		}
1920 		fp->ip4f_valid = 0;
1921 		TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
1922 	}
1923 	return (0);
1924 }
1925 
1926 static struct ip4_frag *
1927 ip4f_alloc(void)
1928 {
1929 	struct ip4_frag *fp;
1930 
1931 	/* reclaim an entry at the tail, put it at the head */
1932 	fp = TAILQ_LAST(&ip4f_list, ip4f_list);
1933 	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
1934 	fp->ip4f_valid = 1;
1935 	TAILQ_INSERT_HEAD(&ip4f_list, fp, ip4f_chain);
1936 	return (fp);
1937 }
1938 
1939 static void
1940 ip4f_free(fp)
1941 	struct ip4_frag *fp;
1942 {
1943 	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
1944 	fp->ip4f_valid = 0;
1945 	TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
1946 }
1947 
1948 #endif /* ALTQ3_CLFIER_COMPAT */
1949