xref: /freebsd/sys/net/altq/altq_subr.c (revision 058ac3e8063366dafa634d9107642e12b038bf09)
1 /*-
2  * Copyright (C) 1997-2003
3  *	Sony Computer Science Laboratories Inc.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $KAME: altq_subr.c,v 1.21 2003/11/06 06:32:53 kjc Exp $
27  * $FreeBSD$
28  */
29 
30 #include "opt_altq.h"
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 
34 #include <sys/param.h>
35 #include <sys/malloc.h>
36 #include <sys/mbuf.h>
37 #include <sys/systm.h>
38 #include <sys/proc.h>
39 #include <sys/socket.h>
40 #include <sys/socketvar.h>
41 #include <sys/kernel.h>
42 #include <sys/errno.h>
43 #include <sys/syslog.h>
44 #include <sys/sysctl.h>
45 #include <sys/queue.h>
46 
47 #include <net/if.h>
48 #include <net/if_var.h>
49 #include <net/if_dl.h>
50 #include <net/if_types.h>
51 #include <net/vnet.h>
52 
53 #include <netinet/in.h>
54 #include <netinet/in_systm.h>
55 #include <netinet/ip.h>
56 #ifdef INET6
57 #include <netinet/ip6.h>
58 #endif
59 #include <netinet/tcp.h>
60 #include <netinet/udp.h>
61 
62 #include <netpfil/pf/pf.h>
63 #include <netpfil/pf/pf_altq.h>
64 #include <net/altq/altq.h>
65 
66 /* machine dependent clock related includes */
67 #include <sys/bus.h>
68 #include <sys/cpu.h>
69 #include <sys/eventhandler.h>
70 #include <machine/clock.h>
71 #if defined(__amd64__) || defined(__i386__)
72 #include <machine/cpufunc.h>		/* for pentium tsc */
73 #include <machine/specialreg.h>		/* for CPUID_TSC */
74 #include <machine/md_var.h>		/* for cpu_feature */
75 #endif /* __amd64 || __i386__ */
76 
77 /*
78  * internal function prototypes
79  */
/* callout handler; passed to CALLOUT_RESET() together with tbr_callout */
static void	tbr_timeout(void *);
/* dequeue routine published through tbr_dequeue_ptr by tbr_set() */
static struct mbuf *tbr_dequeue(struct ifaltq *, int);
/*
 * set to 1 by tbr_set() when it arms tbr_callout, and back to 0 by
 * tbr_timeout() when it decides not to re-arm the callout.
 */
static int tbr_timer = 0;	/* token bucket regulator timer */
#if !defined(__FreeBSD__) || (__FreeBSD_version < 600000)
static struct callout tbr_callout = CALLOUT_INITIALIZER;
#else
/* no static initializer here; callout_init() runs in init_machclk_setup() */
static struct callout tbr_callout;
#endif
88 
/*
 * Prototypes for the file-local classifier helpers.  They are only
 * declared (and compiled) when ALTQ3_CLFIER_COMPAT is defined.
 */
#ifdef ALTQ3_CLFIER_COMPAT
/* port extraction helpers called from altq_extractflow() */
static int 	extract_ports4(struct mbuf *, struct ip *, struct flowinfo_in *);
#ifdef INET6
static int 	extract_ports6(struct mbuf *, struct ip6_hdr *,
			       struct flowinfo_in6 *);
#endif
/* NOTE(review): the filter helpers below are defined outside this view */
static int	apply_filter4(u_int32_t, struct flow_filter *,
			      struct flowinfo_in *);
static int	apply_ppfilter4(u_int32_t, struct flow_filter *,
				struct flowinfo_in *);
#ifdef INET6
static int	apply_filter6(u_int32_t, struct flow_filter6 *,
			      struct flowinfo_in6 *);
#endif
static int	apply_tosfilter4(u_int32_t, struct flow_filter *,
				 struct flowinfo_in *);
static u_long	get_filt_handle(struct acc_classifier *, int);
static struct acc_filter *filth_to_filtp(struct acc_classifier *, u_long);
static u_int32_t filt2fibmask(struct flow_filter *);

/* ip4f_*: extract_ports4() calls ip4f_lookup() and ip4f_cache() for fragments */
static void 	ip4f_cache(struct ip *, struct flowinfo_in *);
static int 	ip4f_lookup(struct ip *, struct flowinfo_in *);
static int 	ip4f_init(void);
static struct ip4_frag	*ip4f_alloc(void);
static void 	ip4f_free(struct ip4_frag *);
#endif /* ALTQ3_CLFIER_COMPAT */
115 
#ifdef ALTQ
/*
 * Advertise ALTQ itself, and every queueing discipline that was compiled
 * in, as read-only integers (value 1) below the kern.features.altq node.
 */
SYSCTL_NODE(_kern_features, OID_AUTO, altq, CTLFLAG_RD | CTLFLAG_CAPRD, 0,
    "ALTQ packet queuing");

/* Declare one read-only feature flag named `name' with description `desc'. */
#define	ALTQ_FEATURE(name, desc)					\
	SYSCTL_INT_WITH_LABEL(_kern_features_altq, OID_AUTO, name,	\
	    CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, 1,		\
	    desc, "feature")

#ifdef ALTQ_CBQ
ALTQ_FEATURE(cbq, "ALTQ Class Based Queuing discipline");
#endif
#ifdef ALTQ_CODEL
ALTQ_FEATURE(codel, "ALTQ Controlled Delay discipline");
#endif
#ifdef ALTQ_RED
ALTQ_FEATURE(red, "ALTQ Random Early Detection discipline");
#endif
#ifdef ALTQ_RIO
ALTQ_FEATURE(rio, "ALTQ Random Early Drop discipline");
#endif
#ifdef ALTQ_HFSC
ALTQ_FEATURE(hfsc, "ALTQ Hierarchical Packet Scheduler discipline");
#endif
#ifdef ALTQ_PRIQ
/* description previously read "ATLQ"; the subsystem name is ALTQ */
ALTQ_FEATURE(priq, "ALTQ Priority Queuing discipline");
#endif
#ifdef ALTQ_FAIRQ
ALTQ_FEATURE(fairq, "ALTQ Fair Queuing discipline");
#endif
#endif
147 
148 /*
149  * alternate queueing support routines
150  */
151 
152 /* look up the queue state by the interface name and the queueing type. */
153 void *
154 altq_lookup(name, type)
155 	char *name;
156 	int type;
157 {
158 	struct ifnet *ifp;
159 
160 	if ((ifp = ifunit(name)) != NULL) {
161 		/* read if_snd unlocked */
162 		if (type != ALTQT_NONE && ifp->if_snd.altq_type == type)
163 			return (ifp->if_snd.altq_disc);
164 	}
165 
166 	return NULL;
167 }
168 
169 int
170 altq_attach(ifq, type, discipline, enqueue, dequeue, request)
171 	struct ifaltq *ifq;
172 	int type;
173 	void *discipline;
174 	int (*enqueue)(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
175 	struct mbuf *(*dequeue)(struct ifaltq *, int);
176 	int (*request)(struct ifaltq *, int, void *);
177 {
178 	IFQ_LOCK(ifq);
179 	if (!ALTQ_IS_READY(ifq)) {
180 		IFQ_UNLOCK(ifq);
181 		return ENXIO;
182 	}
183 
184 	ifq->altq_type     = type;
185 	ifq->altq_disc     = discipline;
186 	ifq->altq_enqueue  = enqueue;
187 	ifq->altq_dequeue  = dequeue;
188 	ifq->altq_request  = request;
189 	ifq->altq_flags &= (ALTQF_CANTCHANGE|ALTQF_ENABLED);
190 	IFQ_UNLOCK(ifq);
191 	return 0;
192 }
193 
194 int
195 altq_detach(ifq)
196 	struct ifaltq *ifq;
197 {
198 	IFQ_LOCK(ifq);
199 
200 	if (!ALTQ_IS_READY(ifq)) {
201 		IFQ_UNLOCK(ifq);
202 		return ENXIO;
203 	}
204 	if (ALTQ_IS_ENABLED(ifq)) {
205 		IFQ_UNLOCK(ifq);
206 		return EBUSY;
207 	}
208 	if (!ALTQ_IS_ATTACHED(ifq)) {
209 		IFQ_UNLOCK(ifq);
210 		return (0);
211 	}
212 
213 	ifq->altq_type     = ALTQT_NONE;
214 	ifq->altq_disc     = NULL;
215 	ifq->altq_enqueue  = NULL;
216 	ifq->altq_dequeue  = NULL;
217 	ifq->altq_request  = NULL;
218 	ifq->altq_flags &= ALTQF_CANTCHANGE;
219 
220 	IFQ_UNLOCK(ifq);
221 	return 0;
222 }
223 
224 int
225 altq_enable(ifq)
226 	struct ifaltq *ifq;
227 {
228 	int s;
229 
230 	IFQ_LOCK(ifq);
231 
232 	if (!ALTQ_IS_READY(ifq)) {
233 		IFQ_UNLOCK(ifq);
234 		return ENXIO;
235 	}
236 	if (ALTQ_IS_ENABLED(ifq)) {
237 		IFQ_UNLOCK(ifq);
238 		return 0;
239 	}
240 
241 	s = splnet();
242 	IFQ_PURGE_NOLOCK(ifq);
243 	ASSERT(ifq->ifq_len == 0);
244 	ifq->ifq_drv_maxlen = 0;		/* disable bulk dequeue */
245 	ifq->altq_flags |= ALTQF_ENABLED;
246 	splx(s);
247 
248 	IFQ_UNLOCK(ifq);
249 	return 0;
250 }
251 
252 int
253 altq_disable(ifq)
254 	struct ifaltq *ifq;
255 {
256 	int s;
257 
258 	IFQ_LOCK(ifq);
259 	if (!ALTQ_IS_ENABLED(ifq)) {
260 		IFQ_UNLOCK(ifq);
261 		return 0;
262 	}
263 
264 	s = splnet();
265 	IFQ_PURGE_NOLOCK(ifq);
266 	ASSERT(ifq->ifq_len == 0);
267 	ifq->altq_flags &= ~(ALTQF_ENABLED);
268 	splx(s);
269 
270 	IFQ_UNLOCK(ifq);
271 	return 0;
272 }
273 
#ifdef ALTQ_DEBUG
/*
 * Report a failed ALTQ assertion and panic.
 *
 * file:	source file name of the failed assertion
 * line:	line number of the failed assertion
 * failedexpr:	text of the expression that evaluated to false
 */
void
altq_assert(const char *file, int line, const char *failedexpr)
{
	(void)printf("altq assertion \"%s\" failed: file \"%s\", line %d\n",
	    failedexpr, file, line);
	panic("altq assertion");
	/* NOTREACHED */
}
#endif
286 
287 /*
288  * internal representation of token bucket parameters
289  *	rate:	(byte_per_unittime << TBR_SHIFT)  / machclk_freq
290  *		(((bits_per_sec) / 8) << TBR_SHIFT) / machclk_freq
291  *	depth:	byte << TBR_SHIFT
292  *
293  */
/* number of bits by which token bucket values are shifted left */
#define	TBR_SHIFT	29
/* convert x to the internal scaled form (widened to int64_t first) */
#define	TBR_SCALE(x)	((int64_t)(x) << TBR_SHIFT)
/* convert an internal scaled value back by shifting right */
#define	TBR_UNSCALE(x)	((x) >> TBR_SHIFT)
297 
298 static struct mbuf *
299 tbr_dequeue(ifq, op)
300 	struct ifaltq *ifq;
301 	int op;
302 {
303 	struct tb_regulator *tbr;
304 	struct mbuf *m;
305 	int64_t interval;
306 	u_int64_t now;
307 
308 	IFQ_LOCK_ASSERT(ifq);
309 	tbr = ifq->altq_tbr;
310 	if (op == ALTDQ_REMOVE && tbr->tbr_lastop == ALTDQ_POLL) {
311 		/* if this is a remove after poll, bypass tbr check */
312 	} else {
313 		/* update token only when it is negative */
314 		if (tbr->tbr_token <= 0) {
315 			now = read_machclk();
316 			interval = now - tbr->tbr_last;
317 			if (interval >= tbr->tbr_filluptime)
318 				tbr->tbr_token = tbr->tbr_depth;
319 			else {
320 				tbr->tbr_token += interval * tbr->tbr_rate;
321 				if (tbr->tbr_token > tbr->tbr_depth)
322 					tbr->tbr_token = tbr->tbr_depth;
323 			}
324 			tbr->tbr_last = now;
325 		}
326 		/* if token is still negative, don't allow dequeue */
327 		if (tbr->tbr_token <= 0)
328 			return (NULL);
329 	}
330 
331 	if (ALTQ_IS_ENABLED(ifq))
332 		m = (*ifq->altq_dequeue)(ifq, op);
333 	else {
334 		if (op == ALTDQ_POLL)
335 			_IF_POLL(ifq, m);
336 		else
337 			_IF_DEQUEUE(ifq, m);
338 	}
339 
340 	if (m != NULL && op == ALTDQ_REMOVE)
341 		tbr->tbr_token -= TBR_SCALE(m_pktlen(m));
342 	tbr->tbr_lastop = op;
343 	return (m);
344 }
345 
346 /*
347  * set a token bucket regulator.
348  * if the specified rate is zero, the token bucket regulator is deleted.
349  */
350 int
351 tbr_set(ifq, profile)
352 	struct ifaltq *ifq;
353 	struct tb_profile *profile;
354 {
355 	struct tb_regulator *tbr, *otbr;
356 
357 	if (tbr_dequeue_ptr == NULL)
358 		tbr_dequeue_ptr = tbr_dequeue;
359 
360 	if (machclk_freq == 0)
361 		init_machclk();
362 	if (machclk_freq == 0) {
363 		printf("tbr_set: no cpu clock available!\n");
364 		return (ENXIO);
365 	}
366 
367 	IFQ_LOCK(ifq);
368 	if (profile->rate == 0) {
369 		/* delete this tbr */
370 		if ((tbr = ifq->altq_tbr) == NULL) {
371 			IFQ_UNLOCK(ifq);
372 			return (ENOENT);
373 		}
374 		ifq->altq_tbr = NULL;
375 		free(tbr, M_DEVBUF);
376 		IFQ_UNLOCK(ifq);
377 		return (0);
378 	}
379 
380 	tbr = malloc(sizeof(struct tb_regulator), M_DEVBUF, M_NOWAIT | M_ZERO);
381 	if (tbr == NULL) {
382 		IFQ_UNLOCK(ifq);
383 		return (ENOMEM);
384 	}
385 
386 	tbr->tbr_rate = TBR_SCALE(profile->rate / 8) / machclk_freq;
387 	tbr->tbr_depth = TBR_SCALE(profile->depth);
388 	if (tbr->tbr_rate > 0)
389 		tbr->tbr_filluptime = tbr->tbr_depth / tbr->tbr_rate;
390 	else
391 		tbr->tbr_filluptime = LLONG_MAX;
392 	/*
393 	 *  The longest time between tbr_dequeue() calls will be about 1
394 	 *  system tick, as the callout that drives it is scheduled once per
395 	 *  tick.  The refill-time detection logic in tbr_dequeue() can only
396 	 *  properly detect the passage of up to LLONG_MAX machclk ticks.
397 	 *  Therefore, in order for this logic to function properly in the
398 	 *  extreme case, the maximum value of tbr_filluptime should be
399 	 *  LLONG_MAX less one system tick's worth of machclk ticks less
400 	 *  some additional slop factor (here one more system tick's worth
401 	 *  of machclk ticks).
402 	 */
403 	if (tbr->tbr_filluptime > (LLONG_MAX - 2 * machclk_per_tick))
404 		tbr->tbr_filluptime = LLONG_MAX - 2 * machclk_per_tick;
405 	tbr->tbr_token = tbr->tbr_depth;
406 	tbr->tbr_last = read_machclk();
407 	tbr->tbr_lastop = ALTDQ_REMOVE;
408 
409 	otbr = ifq->altq_tbr;
410 	ifq->altq_tbr = tbr;	/* set the new tbr */
411 
412 	if (otbr != NULL)
413 		free(otbr, M_DEVBUF);
414 	else {
415 		if (tbr_timer == 0) {
416 			CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0);
417 			tbr_timer = 1;
418 		}
419 	}
420 	IFQ_UNLOCK(ifq);
421 	return (0);
422 }
423 
424 /*
425  * tbr_timeout goes through the interface list, and kicks the drivers
426  * if necessary.
427  *
428  * MPSAFE
429  */
430 static void
431 tbr_timeout(arg)
432 	void *arg;
433 {
434 	VNET_ITERATOR_DECL(vnet_iter);
435 	struct ifnet *ifp;
436 	struct epoch_tracker et;
437 	int active;
438 
439 	active = 0;
440 	NET_EPOCH_ENTER(et);
441 	VNET_LIST_RLOCK_NOSLEEP();
442 	VNET_FOREACH(vnet_iter) {
443 		CURVNET_SET(vnet_iter);
444 		for (ifp = CK_STAILQ_FIRST(&V_ifnet); ifp;
445 		    ifp = CK_STAILQ_NEXT(ifp, if_link)) {
446 			/* read from if_snd unlocked */
447 			if (!TBR_IS_ENABLED(&ifp->if_snd))
448 				continue;
449 			active++;
450 			if (!IFQ_IS_EMPTY(&ifp->if_snd) &&
451 			    ifp->if_start != NULL)
452 				(*ifp->if_start)(ifp);
453 		}
454 		CURVNET_RESTORE();
455 	}
456 	VNET_LIST_RUNLOCK_NOSLEEP();
457 	NET_EPOCH_EXIT(et);
458 	if (active > 0)
459 		CALLOUT_RESET(&tbr_callout, 1, tbr_timeout, (void *)0);
460 	else
461 		tbr_timer = 0;	/* don't need tbr_timer anymore */
462 }
463 
464 /*
465  * attach a discipline to the interface.  if one already exists, it is
466  * overridden.
467  * Locking is done in the discipline specific attach functions. Basically
468  * they call back to altq_attach which takes care of the attach and locking.
469  */
470 int
471 altq_pfattach(struct pf_altq *a)
472 {
473 	int error = 0;
474 
475 	switch (a->scheduler) {
476 	case ALTQT_NONE:
477 		break;
478 #ifdef ALTQ_CBQ
479 	case ALTQT_CBQ:
480 		error = cbq_pfattach(a);
481 		break;
482 #endif
483 #ifdef ALTQ_PRIQ
484 	case ALTQT_PRIQ:
485 		error = priq_pfattach(a);
486 		break;
487 #endif
488 #ifdef ALTQ_HFSC
489 	case ALTQT_HFSC:
490 		error = hfsc_pfattach(a);
491 		break;
492 #endif
493 #ifdef ALTQ_FAIRQ
494 	case ALTQT_FAIRQ:
495 		error = fairq_pfattach(a);
496 		break;
497 #endif
498 #ifdef ALTQ_CODEL
499 	case ALTQT_CODEL:
500 		error = codel_pfattach(a);
501 		break;
502 #endif
503 	default:
504 		error = ENXIO;
505 	}
506 
507 	return (error);
508 }
509 
510 /*
511  * detach a discipline from the interface.
512  * it is possible that the discipline was already overridden by another
513  * discipline.
514  */
515 int
516 altq_pfdetach(struct pf_altq *a)
517 {
518 	struct ifnet *ifp;
519 	int s, error = 0;
520 
521 	if ((ifp = ifunit(a->ifname)) == NULL)
522 		return (EINVAL);
523 
524 	/* if this discipline is no longer referenced, just return */
525 	/* read unlocked from if_snd */
526 	if (a->altq_disc == NULL || a->altq_disc != ifp->if_snd.altq_disc)
527 		return (0);
528 
529 	s = splnet();
530 	/* read unlocked from if_snd, _disable and _detach take care */
531 	if (ALTQ_IS_ENABLED(&ifp->if_snd))
532 		error = altq_disable(&ifp->if_snd);
533 	if (error == 0)
534 		error = altq_detach(&ifp->if_snd);
535 	splx(s);
536 
537 	return (error);
538 }
539 
540 /*
541  * add a discipline or a queue
542  * Locking is done in the discipline specific functions with regards to
543  * malloc with WAITOK, also it is not yet clear which lock to use.
544  */
545 int
546 altq_add(struct ifnet *ifp, struct pf_altq *a)
547 {
548 	int error = 0;
549 
550 	if (a->qname[0] != 0)
551 		return (altq_add_queue(a));
552 
553 	if (machclk_freq == 0)
554 		init_machclk();
555 	if (machclk_freq == 0)
556 		panic("altq_add: no cpu clock");
557 
558 	switch (a->scheduler) {
559 #ifdef ALTQ_CBQ
560 	case ALTQT_CBQ:
561 		error = cbq_add_altq(ifp, a);
562 		break;
563 #endif
564 #ifdef ALTQ_PRIQ
565 	case ALTQT_PRIQ:
566 		error = priq_add_altq(ifp, a);
567 		break;
568 #endif
569 #ifdef ALTQ_HFSC
570 	case ALTQT_HFSC:
571 		error = hfsc_add_altq(ifp, a);
572 		break;
573 #endif
574 #ifdef ALTQ_FAIRQ
575         case ALTQT_FAIRQ:
576                 error = fairq_add_altq(ifp, a);
577                 break;
578 #endif
579 #ifdef ALTQ_CODEL
580 	case ALTQT_CODEL:
581 		error = codel_add_altq(ifp, a);
582 		break;
583 #endif
584 	default:
585 		error = ENXIO;
586 	}
587 
588 	return (error);
589 }
590 
591 /*
592  * remove a discipline or a queue
593  * It is yet unclear what lock to use to protect this operation, the
594  * discipline specific functions will determine and grab it
595  */
596 int
597 altq_remove(struct pf_altq *a)
598 {
599 	int error = 0;
600 
601 	if (a->qname[0] != 0)
602 		return (altq_remove_queue(a));
603 
604 	switch (a->scheduler) {
605 #ifdef ALTQ_CBQ
606 	case ALTQT_CBQ:
607 		error = cbq_remove_altq(a);
608 		break;
609 #endif
610 #ifdef ALTQ_PRIQ
611 	case ALTQT_PRIQ:
612 		error = priq_remove_altq(a);
613 		break;
614 #endif
615 #ifdef ALTQ_HFSC
616 	case ALTQT_HFSC:
617 		error = hfsc_remove_altq(a);
618 		break;
619 #endif
620 #ifdef ALTQ_FAIRQ
621         case ALTQT_FAIRQ:
622                 error = fairq_remove_altq(a);
623                 break;
624 #endif
625 #ifdef ALTQ_CODEL
626 	case ALTQT_CODEL:
627 		error = codel_remove_altq(a);
628 		break;
629 #endif
630 	default:
631 		error = ENXIO;
632 	}
633 
634 	return (error);
635 }
636 
637 /*
638  * add a queue to the discipline
639  * It is yet unclear what lock to use to protect this operation, the
640  * discipline specific functions will determine and grab it
641  */
642 int
643 altq_add_queue(struct pf_altq *a)
644 {
645 	int error = 0;
646 
647 	switch (a->scheduler) {
648 #ifdef ALTQ_CBQ
649 	case ALTQT_CBQ:
650 		error = cbq_add_queue(a);
651 		break;
652 #endif
653 #ifdef ALTQ_PRIQ
654 	case ALTQT_PRIQ:
655 		error = priq_add_queue(a);
656 		break;
657 #endif
658 #ifdef ALTQ_HFSC
659 	case ALTQT_HFSC:
660 		error = hfsc_add_queue(a);
661 		break;
662 #endif
663 #ifdef ALTQ_FAIRQ
664         case ALTQT_FAIRQ:
665                 error = fairq_add_queue(a);
666                 break;
667 #endif
668 	default:
669 		error = ENXIO;
670 	}
671 
672 	return (error);
673 }
674 
675 /*
676  * remove a queue from the discipline
677  * It is yet unclear what lock to use to protect this operation, the
678  * discipline specific functions will determine and grab it
679  */
680 int
681 altq_remove_queue(struct pf_altq *a)
682 {
683 	int error = 0;
684 
685 	switch (a->scheduler) {
686 #ifdef ALTQ_CBQ
687 	case ALTQT_CBQ:
688 		error = cbq_remove_queue(a);
689 		break;
690 #endif
691 #ifdef ALTQ_PRIQ
692 	case ALTQT_PRIQ:
693 		error = priq_remove_queue(a);
694 		break;
695 #endif
696 #ifdef ALTQ_HFSC
697 	case ALTQT_HFSC:
698 		error = hfsc_remove_queue(a);
699 		break;
700 #endif
701 #ifdef ALTQ_FAIRQ
702         case ALTQT_FAIRQ:
703                 error = fairq_remove_queue(a);
704                 break;
705 #endif
706 	default:
707 		error = ENXIO;
708 	}
709 
710 	return (error);
711 }
712 
713 /*
714  * get queue statistics
715  * Locking is done in the discipline specific functions with regards to
716  * copyout operations, also it is not yet clear which lock to use.
717  */
718 int
719 altq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes, int version)
720 {
721 	int error = 0;
722 
723 	switch (a->scheduler) {
724 #ifdef ALTQ_CBQ
725 	case ALTQT_CBQ:
726 		error = cbq_getqstats(a, ubuf, nbytes, version);
727 		break;
728 #endif
729 #ifdef ALTQ_PRIQ
730 	case ALTQT_PRIQ:
731 		error = priq_getqstats(a, ubuf, nbytes, version);
732 		break;
733 #endif
734 #ifdef ALTQ_HFSC
735 	case ALTQT_HFSC:
736 		error = hfsc_getqstats(a, ubuf, nbytes, version);
737 		break;
738 #endif
739 #ifdef ALTQ_FAIRQ
740         case ALTQT_FAIRQ:
741                 error = fairq_getqstats(a, ubuf, nbytes, version);
742                 break;
743 #endif
744 #ifdef ALTQ_CODEL
745 	case ALTQT_CODEL:
746 		error = codel_getqstats(a, ubuf, nbytes, version);
747 		break;
748 #endif
749 	default:
750 		error = ENXIO;
751 	}
752 
753 	return (error);
754 }
755 
756 /*
757  * read and write diffserv field in IPv4 or IPv6 header
758  */
759 u_int8_t
760 read_dsfield(m, pktattr)
761 	struct mbuf *m;
762 	struct altq_pktattr *pktattr;
763 {
764 	struct mbuf *m0;
765 	u_int8_t ds_field = 0;
766 
767 	if (pktattr == NULL ||
768 	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
769 		return ((u_int8_t)0);
770 
771 	/* verify that pattr_hdr is within the mbuf data */
772 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
773 		if ((pktattr->pattr_hdr >= m0->m_data) &&
774 		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
775 			break;
776 	if (m0 == NULL) {
777 		/* ick, pattr_hdr is stale */
778 		pktattr->pattr_af = AF_UNSPEC;
779 #ifdef ALTQ_DEBUG
780 		printf("read_dsfield: can't locate header!\n");
781 #endif
782 		return ((u_int8_t)0);
783 	}
784 
785 	if (pktattr->pattr_af == AF_INET) {
786 		struct ip *ip = (struct ip *)pktattr->pattr_hdr;
787 
788 		if (ip->ip_v != 4)
789 			return ((u_int8_t)0);	/* version mismatch! */
790 		ds_field = ip->ip_tos;
791 	}
792 #ifdef INET6
793 	else if (pktattr->pattr_af == AF_INET6) {
794 		struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
795 		u_int32_t flowlabel;
796 
797 		flowlabel = ntohl(ip6->ip6_flow);
798 		if ((flowlabel >> 28) != 6)
799 			return ((u_int8_t)0);	/* version mismatch! */
800 		ds_field = (flowlabel >> 20) & 0xff;
801 	}
802 #endif
803 	return (ds_field);
804 }
805 
806 void
807 write_dsfield(struct mbuf *m, struct altq_pktattr *pktattr, u_int8_t dsfield)
808 {
809 	struct mbuf *m0;
810 
811 	if (pktattr == NULL ||
812 	    (pktattr->pattr_af != AF_INET && pktattr->pattr_af != AF_INET6))
813 		return;
814 
815 	/* verify that pattr_hdr is within the mbuf data */
816 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
817 		if ((pktattr->pattr_hdr >= m0->m_data) &&
818 		    (pktattr->pattr_hdr < m0->m_data + m0->m_len))
819 			break;
820 	if (m0 == NULL) {
821 		/* ick, pattr_hdr is stale */
822 		pktattr->pattr_af = AF_UNSPEC;
823 #ifdef ALTQ_DEBUG
824 		printf("write_dsfield: can't locate header!\n");
825 #endif
826 		return;
827 	}
828 
829 	if (pktattr->pattr_af == AF_INET) {
830 		struct ip *ip = (struct ip *)pktattr->pattr_hdr;
831 		u_int8_t old;
832 		int32_t sum;
833 
834 		if (ip->ip_v != 4)
835 			return;		/* version mismatch! */
836 		old = ip->ip_tos;
837 		dsfield |= old & 3;	/* leave CU bits */
838 		if (old == dsfield)
839 			return;
840 		ip->ip_tos = dsfield;
841 		/*
842 		 * update checksum (from RFC1624)
843 		 *	   HC' = ~(~HC + ~m + m')
844 		 */
845 		sum = ~ntohs(ip->ip_sum) & 0xffff;
846 		sum += 0xff00 + (~old & 0xff) + dsfield;
847 		sum = (sum >> 16) + (sum & 0xffff);
848 		sum += (sum >> 16);  /* add carry */
849 
850 		ip->ip_sum = htons(~sum & 0xffff);
851 	}
852 #ifdef INET6
853 	else if (pktattr->pattr_af == AF_INET6) {
854 		struct ip6_hdr *ip6 = (struct ip6_hdr *)pktattr->pattr_hdr;
855 		u_int32_t flowlabel;
856 
857 		flowlabel = ntohl(ip6->ip6_flow);
858 		if ((flowlabel >> 28) != 6)
859 			return;		/* version mismatch! */
860 		flowlabel = (flowlabel & 0xf03fffff) | (dsfield << 20);
861 		ip6->ip6_flow = htonl(flowlabel);
862 	}
863 #endif
864 	return;
865 }
866 
867 /*
868  * high resolution clock support taking advantage of a machine dependent
869  * high resolution time counter (e.g., timestamp counter of intel pentium).
870  * we assume
871  *  - 64-bit-long monotonically-increasing counter
872  *  - frequency range is 100M-4GHz (CPU speed)
873  */
874 /* if pcc is not available or disabled, emulate 256MHz using microtime() */
/* shift applied to the microsecond count when the clock is emulated */
#define	MACHCLK_SHIFT	8

/* nonzero when read_machclk() reads the cpu counter instead of microtime() */
int machclk_usepcc;
/* machine clock frequency in Hz; 0 until init_machclk() has set it */
u_int32_t machclk_freq;
/* machclk_freq / hz, computed by init_machclk() */
u_int32_t machclk_per_tick;

#if defined(__i386__) && defined(__NetBSD__)
extern u_int64_t cpu_tsc_freq;
#endif
884 
#if (__FreeBSD_version >= 700035)
/*
 * cpufreq_post_change event handler: re-run init_machclk() after a
 * successful frequency transition.  `arg' and `level' are not used.
 */
static void
tsc_freq_changed(void *arg, const struct cf_level *level, int status)
{
	/* A zero status means the transition completed without error. */
	if (status == 0) {
#if (__FreeBSD_version >= 701102) && (defined(__amd64__) || defined(__i386__))
		/* A P-state invariant TSC is left alone. */
		if (tsc_is_invariant)
			return;
#endif
		init_machclk();
	}
}
EVENTHANDLER_DEFINE(cpufreq_post_change, tsc_freq_changed, NULL,
    EVENTHANDLER_PRI_LAST);
#endif /* __FreeBSD_version >= 700035 */
906 
907 static void
908 init_machclk_setup(void)
909 {
910 	callout_init(&tbr_callout, 1);
911 
912 	machclk_usepcc = 1;
913 
914 #if (!defined(__amd64__) && !defined(__i386__)) || defined(ALTQ_NOPCC)
915 	machclk_usepcc = 0;
916 #endif
917 #if defined(__FreeBSD__) && defined(SMP)
918 	machclk_usepcc = 0;
919 #endif
920 #if defined(__NetBSD__) && defined(MULTIPROCESSOR)
921 	machclk_usepcc = 0;
922 #endif
923 #if defined(__amd64__) || defined(__i386__)
924 	/* check if TSC is available */
925 	if ((cpu_feature & CPUID_TSC) == 0 ||
926 	    atomic_load_acq_64(&tsc_freq) == 0)
927 		machclk_usepcc = 0;
928 #endif
929 }
930 
931 void
932 init_machclk(void)
933 {
934 	static int called;
935 
936 	/* Call one-time initialization function. */
937 	if (!called) {
938 		init_machclk_setup();
939 		called = 1;
940 	}
941 
942 	if (machclk_usepcc == 0) {
943 		/* emulate 256MHz using microtime() */
944 		machclk_freq = 1000000 << MACHCLK_SHIFT;
945 		machclk_per_tick = machclk_freq / hz;
946 #ifdef ALTQ_DEBUG
947 		printf("altq: emulate %uHz cpu clock\n", machclk_freq);
948 #endif
949 		return;
950 	}
951 
952 	/*
953 	 * if the clock frequency (of Pentium TSC or Alpha PCC) is
954 	 * accessible, just use it.
955 	 */
956 #if defined(__amd64__) || defined(__i386__)
957 	machclk_freq = atomic_load_acq_64(&tsc_freq);
958 #endif
959 
960 	/*
961 	 * if we don't know the clock frequency, measure it.
962 	 */
963 	if (machclk_freq == 0) {
964 		static int	wait;
965 		struct timeval	tv_start, tv_end;
966 		u_int64_t	start, end, diff;
967 		int		timo;
968 
969 		microtime(&tv_start);
970 		start = read_machclk();
971 		timo = hz;	/* 1 sec */
972 		(void)tsleep(&wait, PWAIT | PCATCH, "init_machclk", timo);
973 		microtime(&tv_end);
974 		end = read_machclk();
975 		diff = (u_int64_t)(tv_end.tv_sec - tv_start.tv_sec) * 1000000
976 		    + tv_end.tv_usec - tv_start.tv_usec;
977 		if (diff != 0)
978 			machclk_freq = (u_int)((end - start) * 1000000 / diff);
979 	}
980 
981 	machclk_per_tick = machclk_freq / hz;
982 
983 #ifdef ALTQ_DEBUG
984 	printf("altq: CPU clock: %uHz\n", machclk_freq);
985 #endif
986 }
987 
#if defined(__OpenBSD__) && defined(__i386__)
/*
 * Local rdtsc() for OpenBSD/i386 builds only; read_machclk() calls
 * rdtsc() on x86 when machclk_usepcc is set.
 * Returns the 64-bit value produced by the instruction bytes below.
 */
static __inline u_int64_t
rdtsc(void)
{
	u_int64_t rv;
	/* the instruction is emitted as raw opcode bytes (0x0f 0x31) */
	__asm __volatile(".byte 0x0f, 0x31" : "=A" (rv));
	return (rv);
}
#endif /* __OpenBSD__ && __i386__ */
997 
998 u_int64_t
999 read_machclk(void)
1000 {
1001 	u_int64_t val;
1002 
1003 	if (machclk_usepcc) {
1004 #if defined(__amd64__) || defined(__i386__)
1005 		val = rdtsc();
1006 #else
1007 		panic("read_machclk");
1008 #endif
1009 	} else {
1010 		struct timeval tv, boottime;
1011 
1012 		microtime(&tv);
1013 		getboottime(&boottime);
1014 		val = (((u_int64_t)(tv.tv_sec - boottime.tv_sec) * 1000000
1015 		    + tv.tv_usec) << MACHCLK_SHIFT);
1016 	}
1017 	return (val);
1018 }
1019 
1020 #ifdef ALTQ3_CLFIER_COMPAT
1021 
/* fallback protocol numbers, used only when the headers do not define them */
#ifndef IPPROTO_ESP
#define	IPPROTO_ESP	50		/* encapsulating security payload */
#endif
#ifndef IPPROTO_AH
#define	IPPROTO_AH	51		/* authentication header */
#endif
1028 
1029 /*
1030  * extract flow information from a given packet.
1031  * filt_mask shows flowinfo fields required.
1032  * we assume the ip header is in one mbuf, and addresses and ports are
1033  * in network byte order.
1034  */
1035 int
1036 altq_extractflow(m, af, flow, filt_bmask)
1037 	struct mbuf *m;
1038 	int af;
1039 	struct flowinfo *flow;
1040 	u_int32_t	filt_bmask;
1041 {
1042 
1043 	switch (af) {
1044 	case PF_INET: {
1045 		struct flowinfo_in *fin;
1046 		struct ip *ip;
1047 
1048 		ip = mtod(m, struct ip *);
1049 
1050 		if (ip->ip_v != 4)
1051 			break;
1052 
1053 		fin = (struct flowinfo_in *)flow;
1054 		fin->fi_len = sizeof(struct flowinfo_in);
1055 		fin->fi_family = AF_INET;
1056 
1057 		fin->fi_proto = ip->ip_p;
1058 		fin->fi_tos = ip->ip_tos;
1059 
1060 		fin->fi_src.s_addr = ip->ip_src.s_addr;
1061 		fin->fi_dst.s_addr = ip->ip_dst.s_addr;
1062 
1063 		if (filt_bmask & FIMB4_PORTS)
1064 			/* if port info is required, extract port numbers */
1065 			extract_ports4(m, ip, fin);
1066 		else {
1067 			fin->fi_sport = 0;
1068 			fin->fi_dport = 0;
1069 			fin->fi_gpi = 0;
1070 		}
1071 		return (1);
1072 	}
1073 
1074 #ifdef INET6
1075 	case PF_INET6: {
1076 		struct flowinfo_in6 *fin6;
1077 		struct ip6_hdr *ip6;
1078 
1079 		ip6 = mtod(m, struct ip6_hdr *);
1080 		/* should we check the ip version? */
1081 
1082 		fin6 = (struct flowinfo_in6 *)flow;
1083 		fin6->fi6_len = sizeof(struct flowinfo_in6);
1084 		fin6->fi6_family = AF_INET6;
1085 
1086 		fin6->fi6_proto = ip6->ip6_nxt;
1087 		fin6->fi6_tclass   = IPV6_TRAFFIC_CLASS(ip6);
1088 
1089 		fin6->fi6_flowlabel = ip6->ip6_flow & htonl(0x000fffff);
1090 		fin6->fi6_src = ip6->ip6_src;
1091 		fin6->fi6_dst = ip6->ip6_dst;
1092 
1093 		if ((filt_bmask & FIMB6_PORTS) ||
1094 		    ((filt_bmask & FIMB6_PROTO)
1095 		     && ip6->ip6_nxt > IPPROTO_IPV6))
1096 			/*
1097 			 * if port info is required, or proto is required
1098 			 * but there are option headers, extract port
1099 			 * and protocol numbers.
1100 			 */
1101 			extract_ports6(m, ip6, fin6);
1102 		else {
1103 			fin6->fi6_sport = 0;
1104 			fin6->fi6_dport = 0;
1105 			fin6->fi6_gpi = 0;
1106 		}
1107 		return (1);
1108 	}
1109 #endif /* INET6 */
1110 
1111 	default:
1112 		break;
1113 	}
1114 
1115 	/* failed */
1116 	flow->fi_len = sizeof(struct flowinfo);
1117 	flow->fi_family = AF_UNSPEC;
1118 	return (0);
1119 }
1120 
1121 /*
1122  * helper routine to extract port numbers
1123  */
/*
 * structure for ipsec and ipv6 option header template.
 * extract_ports4() overlays it on an authentication header to read the
 * next-header value, the header length and the SPI.
 */
struct _opt6 {
	u_int8_t	opt6_nxt;	/* next header */
	u_int8_t	opt6_hlen;	/* header extension length */
	u_int16_t	_pad;
	u_int32_t	ah_spi;		/* security parameter index
					   for authentication header */
};
1132 
1133 /*
1134  * extract port numbers from a ipv4 packet.
1135  */
1136 static int
1137 extract_ports4(m, ip, fin)
1138 	struct mbuf *m;
1139 	struct ip *ip;
1140 	struct flowinfo_in *fin;
1141 {
1142 	struct mbuf *m0;
1143 	u_short ip_off;
1144 	u_int8_t proto;
1145 	int 	off;
1146 
1147 	fin->fi_sport = 0;
1148 	fin->fi_dport = 0;
1149 	fin->fi_gpi = 0;
1150 
1151 	ip_off = ntohs(ip->ip_off);
1152 	/* if it is a fragment, try cached fragment info */
1153 	if (ip_off & IP_OFFMASK) {
1154 		ip4f_lookup(ip, fin);
1155 		return (1);
1156 	}
1157 
1158 	/* locate the mbuf containing the protocol header */
1159 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
1160 		if (((caddr_t)ip >= m0->m_data) &&
1161 		    ((caddr_t)ip < m0->m_data + m0->m_len))
1162 			break;
1163 	if (m0 == NULL) {
1164 #ifdef ALTQ_DEBUG
1165 		printf("extract_ports4: can't locate header! ip=%p\n", ip);
1166 #endif
1167 		return (0);
1168 	}
1169 	off = ((caddr_t)ip - m0->m_data) + (ip->ip_hl << 2);
1170 	proto = ip->ip_p;
1171 
1172 #ifdef ALTQ_IPSEC
1173  again:
1174 #endif
1175 	while (off >= m0->m_len) {
1176 		off -= m0->m_len;
1177 		m0 = m0->m_next;
1178 		if (m0 == NULL)
1179 			return (0);  /* bogus ip_hl! */
1180 	}
1181 	if (m0->m_len < off + 4)
1182 		return (0);
1183 
1184 	switch (proto) {
1185 	case IPPROTO_TCP:
1186 	case IPPROTO_UDP: {
1187 		struct udphdr *udp;
1188 
1189 		udp = (struct udphdr *)(mtod(m0, caddr_t) + off);
1190 		fin->fi_sport = udp->uh_sport;
1191 		fin->fi_dport = udp->uh_dport;
1192 		fin->fi_proto = proto;
1193 		}
1194 		break;
1195 
1196 #ifdef ALTQ_IPSEC
1197 	case IPPROTO_ESP:
1198 		if (fin->fi_gpi == 0){
1199 			u_int32_t *gpi;
1200 
1201 			gpi = (u_int32_t *)(mtod(m0, caddr_t) + off);
1202 			fin->fi_gpi   = *gpi;
1203 		}
1204 		fin->fi_proto = proto;
1205 		break;
1206 
1207 	case IPPROTO_AH: {
1208 			/* get next header and header length */
1209 			struct _opt6 *opt6;
1210 
1211 			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
1212 			proto = opt6->opt6_nxt;
1213 			off += 8 + (opt6->opt6_hlen * 4);
1214 			if (fin->fi_gpi == 0 && m0->m_len >= off + 8)
1215 				fin->fi_gpi = opt6->ah_spi;
1216 		}
1217 		/* goto the next header */
1218 		goto again;
1219 #endif  /* ALTQ_IPSEC */
1220 
1221 	default:
1222 		fin->fi_proto = proto;
1223 		return (0);
1224 	}
1225 
1226 	/* if this is a first fragment, cache it. */
1227 	if (ip_off & IP_MF)
1228 		ip4f_cache(ip, fin);
1229 
1230 	return (1);
1231 }
1232 
1233 #ifdef INET6
1234 static int
1235 extract_ports6(m, ip6, fin6)
1236 	struct mbuf *m;
1237 	struct ip6_hdr *ip6;
1238 	struct flowinfo_in6 *fin6;
1239 {
1240 	struct mbuf *m0;
1241 	int	off;
1242 	u_int8_t proto;
1243 
1244 	fin6->fi6_gpi   = 0;
1245 	fin6->fi6_sport = 0;
1246 	fin6->fi6_dport = 0;
1247 
1248 	/* locate the mbuf containing the protocol header */
1249 	for (m0 = m; m0 != NULL; m0 = m0->m_next)
1250 		if (((caddr_t)ip6 >= m0->m_data) &&
1251 		    ((caddr_t)ip6 < m0->m_data + m0->m_len))
1252 			break;
1253 	if (m0 == NULL) {
1254 #ifdef ALTQ_DEBUG
1255 		printf("extract_ports6: can't locate header! ip6=%p\n", ip6);
1256 #endif
1257 		return (0);
1258 	}
1259 	off = ((caddr_t)ip6 - m0->m_data) + sizeof(struct ip6_hdr);
1260 
1261 	proto = ip6->ip6_nxt;
1262 	do {
1263 		while (off >= m0->m_len) {
1264 			off -= m0->m_len;
1265 			m0 = m0->m_next;
1266 			if (m0 == NULL)
1267 				return (0);
1268 		}
1269 		if (m0->m_len < off + 4)
1270 			return (0);
1271 
1272 		switch (proto) {
1273 		case IPPROTO_TCP:
1274 		case IPPROTO_UDP: {
1275 			struct udphdr *udp;
1276 
1277 			udp = (struct udphdr *)(mtod(m0, caddr_t) + off);
1278 			fin6->fi6_sport = udp->uh_sport;
1279 			fin6->fi6_dport = udp->uh_dport;
1280 			fin6->fi6_proto = proto;
1281 			}
1282 			return (1);
1283 
1284 		case IPPROTO_ESP:
1285 			if (fin6->fi6_gpi == 0) {
1286 				u_int32_t *gpi;
1287 
1288 				gpi = (u_int32_t *)(mtod(m0, caddr_t) + off);
1289 				fin6->fi6_gpi   = *gpi;
1290 			}
1291 			fin6->fi6_proto = proto;
1292 			return (1);
1293 
1294 		case IPPROTO_AH: {
1295 			/* get next header and header length */
1296 			struct _opt6 *opt6;
1297 
1298 			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
1299 			if (fin6->fi6_gpi == 0 && m0->m_len >= off + 8)
1300 				fin6->fi6_gpi = opt6->ah_spi;
1301 			proto = opt6->opt6_nxt;
1302 			off += 8 + (opt6->opt6_hlen * 4);
1303 			/* goto the next header */
1304 			break;
1305 			}
1306 
1307 		case IPPROTO_HOPOPTS:
1308 		case IPPROTO_ROUTING:
1309 		case IPPROTO_DSTOPTS: {
1310 			/* get next header and header length */
1311 			struct _opt6 *opt6;
1312 
1313 			opt6 = (struct _opt6 *)(mtod(m0, caddr_t) + off);
1314 			proto = opt6->opt6_nxt;
1315 			off += (opt6->opt6_hlen + 1) * 8;
1316 			/* goto the next header */
1317 			break;
1318 			}
1319 
1320 		case IPPROTO_FRAGMENT:
1321 			/* ipv6 fragmentations are not supported yet */
1322 		default:
1323 			fin6->fi6_proto = proto;
1324 			return (0);
1325 		}
1326 	} while (1);
1327 	/*NOTREACHED*/
1328 }
1329 #endif /* INET6 */
1330 
1331 /*
1332  * altq common classifier
1333  */
1334 int
1335 acc_add_filter(classifier, filter, class, phandle)
1336 	struct acc_classifier *classifier;
1337 	struct flow_filter *filter;
1338 	void	*class;
1339 	u_long	*phandle;
1340 {
1341 	struct acc_filter *afp, *prev, *tmp;
1342 	int	i, s;
1343 
1344 #ifdef INET6
1345 	if (filter->ff_flow.fi_family != AF_INET &&
1346 	    filter->ff_flow.fi_family != AF_INET6)
1347 		return (EINVAL);
1348 #else
1349 	if (filter->ff_flow.fi_family != AF_INET)
1350 		return (EINVAL);
1351 #endif
1352 
1353 	afp = malloc(sizeof(struct acc_filter),
1354 	       M_DEVBUF, M_WAITOK);
1355 	if (afp == NULL)
1356 		return (ENOMEM);
1357 	bzero(afp, sizeof(struct acc_filter));
1358 
1359 	afp->f_filter = *filter;
1360 	afp->f_class = class;
1361 
1362 	i = ACC_WILDCARD_INDEX;
1363 	if (filter->ff_flow.fi_family == AF_INET) {
1364 		struct flow_filter *filter4 = &afp->f_filter;
1365 
1366 		/*
1367 		 * if address is 0, it's a wildcard.  if address mask
1368 		 * isn't set, use full mask.
1369 		 */
1370 		if (filter4->ff_flow.fi_dst.s_addr == 0)
1371 			filter4->ff_mask.mask_dst.s_addr = 0;
1372 		else if (filter4->ff_mask.mask_dst.s_addr == 0)
1373 			filter4->ff_mask.mask_dst.s_addr = 0xffffffff;
1374 		if (filter4->ff_flow.fi_src.s_addr == 0)
1375 			filter4->ff_mask.mask_src.s_addr = 0;
1376 		else if (filter4->ff_mask.mask_src.s_addr == 0)
1377 			filter4->ff_mask.mask_src.s_addr = 0xffffffff;
1378 
1379 		/* clear extra bits in addresses  */
1380 		   filter4->ff_flow.fi_dst.s_addr &=
1381 		       filter4->ff_mask.mask_dst.s_addr;
1382 		   filter4->ff_flow.fi_src.s_addr &=
1383 		       filter4->ff_mask.mask_src.s_addr;
1384 
1385 		/*
1386 		 * if dst address is a wildcard, use hash-entry
1387 		 * ACC_WILDCARD_INDEX.
1388 		 */
1389 		if (filter4->ff_mask.mask_dst.s_addr != 0xffffffff)
1390 			i = ACC_WILDCARD_INDEX;
1391 		else
1392 			i = ACC_GET_HASH_INDEX(filter4->ff_flow.fi_dst.s_addr);
1393 	}
1394 #ifdef INET6
1395 	else if (filter->ff_flow.fi_family == AF_INET6) {
1396 		struct flow_filter6 *filter6 =
1397 			(struct flow_filter6 *)&afp->f_filter;
1398 #ifndef IN6MASK0 /* taken from kame ipv6 */
1399 #define	IN6MASK0	{{{ 0, 0, 0, 0 }}}
1400 #define	IN6MASK128	{{{ 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }}}
1401 		const struct in6_addr in6mask0 = IN6MASK0;
1402 		const struct in6_addr in6mask128 = IN6MASK128;
1403 #endif
1404 
1405 		if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_dst))
1406 			filter6->ff_mask6.mask6_dst = in6mask0;
1407 		else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_dst))
1408 			filter6->ff_mask6.mask6_dst = in6mask128;
1409 		if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_flow6.fi6_src))
1410 			filter6->ff_mask6.mask6_src = in6mask0;
1411 		else if (IN6_IS_ADDR_UNSPECIFIED(&filter6->ff_mask6.mask6_src))
1412 			filter6->ff_mask6.mask6_src = in6mask128;
1413 
1414 		/* clear extra bits in addresses  */
1415 		for (i = 0; i < 16; i++)
1416 			filter6->ff_flow6.fi6_dst.s6_addr[i] &=
1417 			    filter6->ff_mask6.mask6_dst.s6_addr[i];
1418 		for (i = 0; i < 16; i++)
1419 			filter6->ff_flow6.fi6_src.s6_addr[i] &=
1420 			    filter6->ff_mask6.mask6_src.s6_addr[i];
1421 
1422 		if (filter6->ff_flow6.fi6_flowlabel == 0)
1423 			i = ACC_WILDCARD_INDEX;
1424 		else
1425 			i = ACC_GET_HASH_INDEX(filter6->ff_flow6.fi6_flowlabel);
1426 	}
1427 #endif /* INET6 */
1428 
1429 	afp->f_handle = get_filt_handle(classifier, i);
1430 
1431 	/* update filter bitmask */
1432 	afp->f_fbmask = filt2fibmask(filter);
1433 	classifier->acc_fbmask |= afp->f_fbmask;
1434 
1435 	/*
1436 	 * add this filter to the filter list.
1437 	 * filters are ordered from the highest rule number.
1438 	 */
1439 	s = splnet();
1440 	prev = NULL;
1441 	LIST_FOREACH(tmp, &classifier->acc_filters[i], f_chain) {
1442 		if (tmp->f_filter.ff_ruleno > afp->f_filter.ff_ruleno)
1443 			prev = tmp;
1444 		else
1445 			break;
1446 	}
1447 	if (prev == NULL)
1448 		LIST_INSERT_HEAD(&classifier->acc_filters[i], afp, f_chain);
1449 	else
1450 		LIST_INSERT_AFTER(prev, afp, f_chain);
1451 	splx(s);
1452 
1453 	*phandle = afp->f_handle;
1454 	return (0);
1455 }
1456 
1457 int
1458 acc_delete_filter(classifier, handle)
1459 	struct acc_classifier *classifier;
1460 	u_long handle;
1461 {
1462 	struct acc_filter *afp;
1463 	int	s;
1464 
1465 	if ((afp = filth_to_filtp(classifier, handle)) == NULL)
1466 		return (EINVAL);
1467 
1468 	s = splnet();
1469 	LIST_REMOVE(afp, f_chain);
1470 	splx(s);
1471 
1472 	free(afp, M_DEVBUF);
1473 
1474 	/* todo: update filt_bmask */
1475 
1476 	return (0);
1477 }
1478 
1479 /*
1480  * delete filters referencing to the specified class.
1481  * if the all flag is not 0, delete all the filters.
1482  */
1483 int
1484 acc_discard_filters(classifier, class, all)
1485 	struct acc_classifier *classifier;
1486 	void	*class;
1487 	int	all;
1488 {
1489 	struct acc_filter *afp;
1490 	int	i, s;
1491 
1492 	s = splnet();
1493 	for (i = 0; i < ACC_FILTER_TABLESIZE; i++) {
1494 		do {
1495 			LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
1496 				if (all || afp->f_class == class) {
1497 					LIST_REMOVE(afp, f_chain);
1498 					free(afp, M_DEVBUF);
1499 					/* start again from the head */
1500 					break;
1501 				}
1502 		} while (afp != NULL);
1503 	}
1504 	splx(s);
1505 
1506 	if (all)
1507 		classifier->acc_fbmask = 0;
1508 
1509 	return (0);
1510 }
1511 
1512 void *
1513 acc_classify(clfier, m, af)
1514 	void *clfier;
1515 	struct mbuf *m;
1516 	int af;
1517 {
1518 	struct acc_classifier *classifier;
1519 	struct flowinfo flow;
1520 	struct acc_filter *afp;
1521 	int	i;
1522 
1523 	classifier = (struct acc_classifier *)clfier;
1524 	altq_extractflow(m, af, &flow, classifier->acc_fbmask);
1525 
1526 	if (flow.fi_family == AF_INET) {
1527 		struct flowinfo_in *fp = (struct flowinfo_in *)&flow;
1528 
1529 		if ((classifier->acc_fbmask & FIMB4_ALL) == FIMB4_TOS) {
1530 			/* only tos is used */
1531 			LIST_FOREACH(afp,
1532 				 &classifier->acc_filters[ACC_WILDCARD_INDEX],
1533 				 f_chain)
1534 				if (apply_tosfilter4(afp->f_fbmask,
1535 						     &afp->f_filter, fp))
1536 					/* filter matched */
1537 					return (afp->f_class);
1538 		} else if ((classifier->acc_fbmask &
1539 			(~(FIMB4_PROTO|FIMB4_SPORT|FIMB4_DPORT) & FIMB4_ALL))
1540 		    == 0) {
1541 			/* only proto and ports are used */
1542 			LIST_FOREACH(afp,
1543 				 &classifier->acc_filters[ACC_WILDCARD_INDEX],
1544 				 f_chain)
1545 				if (apply_ppfilter4(afp->f_fbmask,
1546 						    &afp->f_filter, fp))
1547 					/* filter matched */
1548 					return (afp->f_class);
1549 		} else {
1550 			/* get the filter hash entry from its dest address */
1551 			i = ACC_GET_HASH_INDEX(fp->fi_dst.s_addr);
1552 			do {
1553 				/*
1554 				 * go through this loop twice.  first for dst
1555 				 * hash, second for wildcards.
1556 				 */
1557 				LIST_FOREACH(afp, &classifier->acc_filters[i],
1558 					     f_chain)
1559 					if (apply_filter4(afp->f_fbmask,
1560 							  &afp->f_filter, fp))
1561 						/* filter matched */
1562 						return (afp->f_class);
1563 
1564 				/*
1565 				 * check again for filters with a dst addr
1566 				 * wildcard.
1567 				 * (daddr == 0 || dmask != 0xffffffff).
1568 				 */
1569 				if (i != ACC_WILDCARD_INDEX)
1570 					i = ACC_WILDCARD_INDEX;
1571 				else
1572 					break;
1573 			} while (1);
1574 		}
1575 	}
1576 #ifdef INET6
1577 	else if (flow.fi_family == AF_INET6) {
1578 		struct flowinfo_in6 *fp6 = (struct flowinfo_in6 *)&flow;
1579 
1580 		/* get the filter hash entry from its flow ID */
1581 		if (fp6->fi6_flowlabel != 0)
1582 			i = ACC_GET_HASH_INDEX(fp6->fi6_flowlabel);
1583 		else
1584 			/* flowlable can be zero */
1585 			i = ACC_WILDCARD_INDEX;
1586 
1587 		/* go through this loop twice.  first for flow hash, second
1588 		   for wildcards. */
1589 		do {
1590 			LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
1591 				if (apply_filter6(afp->f_fbmask,
1592 					(struct flow_filter6 *)&afp->f_filter,
1593 					fp6))
1594 					/* filter matched */
1595 					return (afp->f_class);
1596 
1597 			/*
1598 			 * check again for filters with a wildcard.
1599 			 */
1600 			if (i != ACC_WILDCARD_INDEX)
1601 				i = ACC_WILDCARD_INDEX;
1602 			else
1603 				break;
1604 		} while (1);
1605 	}
1606 #endif /* INET6 */
1607 
1608 	/* no filter matched */
1609 	return (NULL);
1610 }
1611 
1612 static int
1613 apply_filter4(fbmask, filt, pkt)
1614 	u_int32_t	fbmask;
1615 	struct flow_filter *filt;
1616 	struct flowinfo_in *pkt;
1617 {
1618 	if (filt->ff_flow.fi_family != AF_INET)
1619 		return (0);
1620 	if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport)
1621 		return (0);
1622 	if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport)
1623 		return (0);
1624 	if ((fbmask & FIMB4_DADDR) &&
1625 	    filt->ff_flow.fi_dst.s_addr !=
1626 	    (pkt->fi_dst.s_addr & filt->ff_mask.mask_dst.s_addr))
1627 		return (0);
1628 	if ((fbmask & FIMB4_SADDR) &&
1629 	    filt->ff_flow.fi_src.s_addr !=
1630 	    (pkt->fi_src.s_addr & filt->ff_mask.mask_src.s_addr))
1631 		return (0);
1632 	if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto)
1633 		return (0);
1634 	if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos !=
1635 	    (pkt->fi_tos & filt->ff_mask.mask_tos))
1636 		return (0);
1637 	if ((fbmask & FIMB4_GPI) && filt->ff_flow.fi_gpi != (pkt->fi_gpi))
1638 		return (0);
1639 	/* match */
1640 	return (1);
1641 }
1642 
1643 /*
1644  * filter matching function optimized for a common case that checks
1645  * only protocol and port numbers
1646  */
1647 static int
1648 apply_ppfilter4(fbmask, filt, pkt)
1649 	u_int32_t	fbmask;
1650 	struct flow_filter *filt;
1651 	struct flowinfo_in *pkt;
1652 {
1653 	if (filt->ff_flow.fi_family != AF_INET)
1654 		return (0);
1655 	if ((fbmask & FIMB4_SPORT) && filt->ff_flow.fi_sport != pkt->fi_sport)
1656 		return (0);
1657 	if ((fbmask & FIMB4_DPORT) && filt->ff_flow.fi_dport != pkt->fi_dport)
1658 		return (0);
1659 	if ((fbmask & FIMB4_PROTO) && filt->ff_flow.fi_proto != pkt->fi_proto)
1660 		return (0);
1661 	/* match */
1662 	return (1);
1663 }
1664 
1665 /*
1666  * filter matching function only for tos field.
1667  */
1668 static int
1669 apply_tosfilter4(fbmask, filt, pkt)
1670 	u_int32_t	fbmask;
1671 	struct flow_filter *filt;
1672 	struct flowinfo_in *pkt;
1673 {
1674 	if (filt->ff_flow.fi_family != AF_INET)
1675 		return (0);
1676 	if ((fbmask & FIMB4_TOS) && filt->ff_flow.fi_tos !=
1677 	    (pkt->fi_tos & filt->ff_mask.mask_tos))
1678 		return (0);
1679 	/* match */
1680 	return (1);
1681 }
1682 
1683 #ifdef INET6
1684 static int
1685 apply_filter6(fbmask, filt, pkt)
1686 	u_int32_t	fbmask;
1687 	struct flow_filter6 *filt;
1688 	struct flowinfo_in6 *pkt;
1689 {
1690 	int i;
1691 
1692 	if (filt->ff_flow6.fi6_family != AF_INET6)
1693 		return (0);
1694 	if ((fbmask & FIMB6_FLABEL) &&
1695 	    filt->ff_flow6.fi6_flowlabel != pkt->fi6_flowlabel)
1696 		return (0);
1697 	if ((fbmask & FIMB6_PROTO) &&
1698 	    filt->ff_flow6.fi6_proto != pkt->fi6_proto)
1699 		return (0);
1700 	if ((fbmask & FIMB6_SPORT) &&
1701 	    filt->ff_flow6.fi6_sport != pkt->fi6_sport)
1702 		return (0);
1703 	if ((fbmask & FIMB6_DPORT) &&
1704 	    filt->ff_flow6.fi6_dport != pkt->fi6_dport)
1705 		return (0);
1706 	if (fbmask & FIMB6_SADDR) {
1707 		for (i = 0; i < 4; i++)
1708 			if (filt->ff_flow6.fi6_src.s6_addr32[i] !=
1709 			    (pkt->fi6_src.s6_addr32[i] &
1710 			     filt->ff_mask6.mask6_src.s6_addr32[i]))
1711 				return (0);
1712 	}
1713 	if (fbmask & FIMB6_DADDR) {
1714 		for (i = 0; i < 4; i++)
1715 			if (filt->ff_flow6.fi6_dst.s6_addr32[i] !=
1716 			    (pkt->fi6_dst.s6_addr32[i] &
1717 			     filt->ff_mask6.mask6_dst.s6_addr32[i]))
1718 				return (0);
1719 	}
1720 	if ((fbmask & FIMB6_TCLASS) &&
1721 	    filt->ff_flow6.fi6_tclass !=
1722 	    (pkt->fi6_tclass & filt->ff_mask6.mask6_tclass))
1723 		return (0);
1724 	if ((fbmask & FIMB6_GPI) &&
1725 	    filt->ff_flow6.fi6_gpi != pkt->fi6_gpi)
1726 		return (0);
1727 	/* match */
1728 	return (1);
1729 }
1730 #endif /* INET6 */
1731 
1732 /*
1733  *  filter handle:
1734  *	bit 20-28: index to the filter hash table
1735  *	bit  0-19: unique id in the hash bucket.
1736  */
1737 static u_long
1738 get_filt_handle(classifier, i)
1739 	struct acc_classifier *classifier;
1740 	int	i;
1741 {
1742 	static u_long handle_number = 1;
1743 	u_long 	handle;
1744 	struct acc_filter *afp;
1745 
1746 	while (1) {
1747 		handle = handle_number++ & 0x000fffff;
1748 
1749 		if (LIST_EMPTY(&classifier->acc_filters[i]))
1750 			break;
1751 
1752 		LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
1753 			if ((afp->f_handle & 0x000fffff) == handle)
1754 				break;
1755 		if (afp == NULL)
1756 			break;
1757 		/* this handle is already used, try again */
1758 	}
1759 
1760 	return ((i << 20) | handle);
1761 }
1762 
1763 /* convert filter handle to filter pointer */
1764 static struct acc_filter *
1765 filth_to_filtp(classifier, handle)
1766 	struct acc_classifier *classifier;
1767 	u_long handle;
1768 {
1769 	struct acc_filter *afp;
1770 	int	i;
1771 
1772 	i = ACC_GET_HINDEX(handle);
1773 
1774 	LIST_FOREACH(afp, &classifier->acc_filters[i], f_chain)
1775 		if (afp->f_handle == handle)
1776 			return (afp);
1777 
1778 	return (NULL);
1779 }
1780 
1781 /* create flowinfo bitmask */
1782 static u_int32_t
1783 filt2fibmask(filt)
1784 	struct flow_filter *filt;
1785 {
1786 	u_int32_t mask = 0;
1787 #ifdef INET6
1788 	struct flow_filter6 *filt6;
1789 #endif
1790 
1791 	switch (filt->ff_flow.fi_family) {
1792 	case AF_INET:
1793 		if (filt->ff_flow.fi_proto != 0)
1794 			mask |= FIMB4_PROTO;
1795 		if (filt->ff_flow.fi_tos != 0)
1796 			mask |= FIMB4_TOS;
1797 		if (filt->ff_flow.fi_dst.s_addr != 0)
1798 			mask |= FIMB4_DADDR;
1799 		if (filt->ff_flow.fi_src.s_addr != 0)
1800 			mask |= FIMB4_SADDR;
1801 		if (filt->ff_flow.fi_sport != 0)
1802 			mask |= FIMB4_SPORT;
1803 		if (filt->ff_flow.fi_dport != 0)
1804 			mask |= FIMB4_DPORT;
1805 		if (filt->ff_flow.fi_gpi != 0)
1806 			mask |= FIMB4_GPI;
1807 		break;
1808 #ifdef INET6
1809 	case AF_INET6:
1810 		filt6 = (struct flow_filter6 *)filt;
1811 
1812 		if (filt6->ff_flow6.fi6_proto != 0)
1813 			mask |= FIMB6_PROTO;
1814 		if (filt6->ff_flow6.fi6_tclass != 0)
1815 			mask |= FIMB6_TCLASS;
1816 		if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_dst))
1817 			mask |= FIMB6_DADDR;
1818 		if (!IN6_IS_ADDR_UNSPECIFIED(&filt6->ff_flow6.fi6_src))
1819 			mask |= FIMB6_SADDR;
1820 		if (filt6->ff_flow6.fi6_sport != 0)
1821 			mask |= FIMB6_SPORT;
1822 		if (filt6->ff_flow6.fi6_dport != 0)
1823 			mask |= FIMB6_DPORT;
1824 		if (filt6->ff_flow6.fi6_gpi != 0)
1825 			mask |= FIMB6_GPI;
1826 		if (filt6->ff_flow6.fi6_flowlabel != 0)
1827 			mask |= FIMB6_FLABEL;
1828 		break;
1829 #endif /* INET6 */
1830 	}
1831 	return (mask);
1832 }
1833 
/*
 * helper functions to handle IPv4 fragments.
 * currently only in-sequence fragments are handled.
 *	- fragment info is cached in an LRU list.
 *	- when a first fragment is found, its flow info is cached.
 *	- when a non-first fragment is found, the cache is looked up.
 */
1841 
/*
 * One entry of the IPv4 fragment cache.  An entry remembers the flow
 * information of a first fragment so that it can be copied to the later
 * fragments carrying the same id, addresses and protocol.
 */
struct ip4_frag {
    TAILQ_ENTRY(ip4_frag) ip4f_chain;	/* linkage in ip4f_list */
    char    ip4f_valid;			/* non-zero while the entry is in use */
    u_short ip4f_id;			/* ip_id of the cached datagram */
    struct flowinfo_in ip4f_info;	/* cached addresses, proto, ports, gpi */
};

/*
 * Entries in use are kept at the head of the list and free ones at the
 * tail: allocation takes the tail entry and moves it to the head, release
 * moves an entry back to the tail.
 */
static TAILQ_HEAD(ip4f_list, ip4_frag) ip4f_list; /* IPv4 fragment cache */

#define	IP4F_TABSIZE		16	/* IPv4 fragment cache size */
1852 
1853 static void
1854 ip4f_cache(ip, fin)
1855 	struct ip *ip;
1856 	struct flowinfo_in *fin;
1857 {
1858 	struct ip4_frag *fp;
1859 
1860 	if (TAILQ_EMPTY(&ip4f_list)) {
1861 		/* first time call, allocate fragment cache entries. */
1862 		if (ip4f_init() < 0)
1863 			/* allocation failed! */
1864 			return;
1865 	}
1866 
1867 	fp = ip4f_alloc();
1868 	fp->ip4f_id = ip->ip_id;
1869 	fp->ip4f_info.fi_proto = ip->ip_p;
1870 	fp->ip4f_info.fi_src.s_addr = ip->ip_src.s_addr;
1871 	fp->ip4f_info.fi_dst.s_addr = ip->ip_dst.s_addr;
1872 
1873 	/* save port numbers */
1874 	fp->ip4f_info.fi_sport = fin->fi_sport;
1875 	fp->ip4f_info.fi_dport = fin->fi_dport;
1876 	fp->ip4f_info.fi_gpi   = fin->fi_gpi;
1877 }
1878 
1879 static int
1880 ip4f_lookup(ip, fin)
1881 	struct ip *ip;
1882 	struct flowinfo_in *fin;
1883 {
1884 	struct ip4_frag *fp;
1885 
1886 	for (fp = TAILQ_FIRST(&ip4f_list); fp != NULL && fp->ip4f_valid;
1887 	     fp = TAILQ_NEXT(fp, ip4f_chain))
1888 		if (ip->ip_id == fp->ip4f_id &&
1889 		    ip->ip_src.s_addr == fp->ip4f_info.fi_src.s_addr &&
1890 		    ip->ip_dst.s_addr == fp->ip4f_info.fi_dst.s_addr &&
1891 		    ip->ip_p == fp->ip4f_info.fi_proto) {
1892 			/* found the matching entry */
1893 			fin->fi_sport = fp->ip4f_info.fi_sport;
1894 			fin->fi_dport = fp->ip4f_info.fi_dport;
1895 			fin->fi_gpi   = fp->ip4f_info.fi_gpi;
1896 
1897 			if ((ntohs(ip->ip_off) & IP_MF) == 0)
1898 				/* this is the last fragment,
1899 				   release the entry. */
1900 				ip4f_free(fp);
1901 
1902 			return (1);
1903 		}
1904 
1905 	/* no matching entry found */
1906 	return (0);
1907 }
1908 
1909 static int
1910 ip4f_init(void)
1911 {
1912 	struct ip4_frag *fp;
1913 	int i;
1914 
1915 	TAILQ_INIT(&ip4f_list);
1916 	for (i=0; i<IP4F_TABSIZE; i++) {
1917 		fp = malloc(sizeof(struct ip4_frag),
1918 		       M_DEVBUF, M_NOWAIT);
1919 		if (fp == NULL) {
1920 			printf("ip4f_init: can't alloc %dth entry!\n", i);
1921 			if (i == 0)
1922 				return (-1);
1923 			return (0);
1924 		}
1925 		fp->ip4f_valid = 0;
1926 		TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
1927 	}
1928 	return (0);
1929 }
1930 
1931 static struct ip4_frag *
1932 ip4f_alloc(void)
1933 {
1934 	struct ip4_frag *fp;
1935 
1936 	/* reclaim an entry at the tail, put it at the head */
1937 	fp = TAILQ_LAST(&ip4f_list, ip4f_list);
1938 	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
1939 	fp->ip4f_valid = 1;
1940 	TAILQ_INSERT_HEAD(&ip4f_list, fp, ip4f_chain);
1941 	return (fp);
1942 }
1943 
1944 static void
1945 ip4f_free(fp)
1946 	struct ip4_frag *fp;
1947 {
1948 	TAILQ_REMOVE(&ip4f_list, fp, ip4f_chain);
1949 	fp->ip4f_valid = 0;
1950 	TAILQ_INSERT_TAIL(&ip4f_list, fp, ip4f_chain);
1951 }
1952 
1953 #endif /* ALTQ3_CLFIER_COMPAT */
1954