xref: /freebsd/sys/net/bpf.c (revision f37852c17391fdf0e8309bcf684384dd0d854e43)
1 /*-
2  * Copyright (c) 1990, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from the Stanford/CMU enet packet filter,
6  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
7  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
8  * Berkeley Laboratory.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_bpf.h"
41 #include "opt_compat.h"
42 #include "opt_ddb.h"
43 #include "opt_netgraph.h"
44 
45 #include <sys/types.h>
46 #include <sys/param.h>
47 #include <sys/lock.h>
48 #include <sys/rwlock.h>
49 #include <sys/systm.h>
50 #include <sys/conf.h>
51 #include <sys/fcntl.h>
52 #include <sys/jail.h>
53 #include <sys/malloc.h>
54 #include <sys/mbuf.h>
55 #include <sys/time.h>
56 #include <sys/priv.h>
57 #include <sys/proc.h>
58 #include <sys/signalvar.h>
59 #include <sys/filio.h>
60 #include <sys/sockio.h>
61 #include <sys/ttycom.h>
62 #include <sys/uio.h>
63 
64 #include <sys/event.h>
65 #include <sys/file.h>
66 #include <sys/poll.h>
67 #include <sys/proc.h>
68 
69 #include <sys/socket.h>
70 
71 #ifdef DDB
72 #include <ddb/ddb.h>
73 #endif
74 
75 #include <net/if.h>
76 #include <net/if_var.h>
77 #include <net/if_dl.h>
78 #include <net/bpf.h>
79 #include <net/bpf_buffer.h>
80 #ifdef BPF_JITTER
81 #include <net/bpf_jitter.h>
82 #endif
83 #include <net/bpf_zerocopy.h>
84 #include <net/bpfdesc.h>
85 #include <net/route.h>
86 #include <net/vnet.h>
87 
88 #include <netinet/in.h>
89 #include <netinet/if_ether.h>
90 #include <sys/kernel.h>
91 #include <sys/sysctl.h>
92 
93 #include <net80211/ieee80211_freebsd.h>
94 
95 #include <security/mac/mac_framework.h>
96 
97 MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
98 
99 struct bpf_if {
100 #define	bif_next	bif_ext.bif_next
101 #define	bif_dlist	bif_ext.bif_dlist
102 	struct bpf_if_ext bif_ext;	/* public members */
103 	u_int		bif_dlt;	/* link layer type */
104 	u_int		bif_hdrlen;	/* length of link header */
105 	struct ifnet	*bif_ifp;	/* corresponding interface */
106 	struct rwlock	bif_lock;	/* interface lock */
107 	LIST_HEAD(, bpf_d) bif_wlist;	/* writer-only list */
108 	int		bif_flags;	/* Interface flags */
109 };
110 
111 CTASSERT(offsetof(struct bpf_if, bif_ext) == 0);
112 
113 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
114 
/* Sleep priority handed to the sleep calls in this file (interruptible). */
#define	PRINET	26

/*
 * Number of bytes from the start of a header of the given type up to and
 * including its bh_hdrlen member, i.e. the header size without any padding
 * that may follow bh_hdrlen.
 */
#define	SIZEOF_BPF_HDR(type)						\
	(offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen))
119 
120 #ifdef COMPAT_FREEBSD32
121 #include <sys/mount.h>
122 #include <compat/freebsd32/freebsd32.h>
123 #define BPF_ALIGNMENT32 sizeof(int32_t)
124 #define	BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32)
125 
126 #ifndef BURN_BRIDGES
127 /*
128  * 32-bit version of structure prepended to each packet.  We use this header
129  * instead of the standard one for 32-bit streams.  We mark the a stream as
130  * 32-bit the first time we see a 32-bit compat ioctl request.
131  */
132 struct bpf_hdr32 {
133 	struct timeval32 bh_tstamp;	/* time stamp */
134 	uint32_t	bh_caplen;	/* length of captured portion */
135 	uint32_t	bh_datalen;	/* original length of packet */
136 	uint16_t	bh_hdrlen;	/* length of bpf header (this struct
137 					   plus alignment padding) */
138 };
139 #endif
140 
141 struct bpf_program32 {
142 	u_int bf_len;
143 	uint32_t bf_insns;
144 };
145 
146 struct bpf_dltlist32 {
147 	u_int	bfl_len;
148 	u_int	bfl_list;
149 };
150 
/*
 * 32-bit compat ioctl requests.  Each one takes a 32-bit argument structure
 * (struct bpf_program32, struct timeval32 or struct bpf_dltlist32).
 */
#define	BIOCSETF32		_IOW('B', 103, struct bpf_program32)
#define	BIOCSRTIMEOUT32		_IOW('B', 109, struct timeval32)
#define	BIOCGRTIMEOUT32		_IOR('B', 110, struct timeval32)
#define	BIOCGDLTLIST32		_IOWR('B', 121, struct bpf_dltlist32)
#define	BIOCSETWF32		_IOW('B', 123, struct bpf_program32)
#define	BIOCSETFNR32		_IOW('B', 130, struct bpf_program32)
157 #endif
158 
159 /*
160  * bpf_iflist is a list of BPF interface structures, each corresponding to a
161  * specific DLT.  The same network interface might have several BPF interface
162  * structures registered by different layers in the stack (i.e., 802.11
163  * frames, ethernet frames, etc).
164  */
165 static LIST_HEAD(, bpf_if)	bpf_iflist, bpf_freelist;
166 static struct mtx	bpf_mtx;		/* bpf global lock */
167 static int		bpf_bpfd_cnt;
168 
169 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
170 static void	bpf_detachd(struct bpf_d *);
171 static void	bpf_detachd_locked(struct bpf_d *);
172 static void	bpf_freed(struct bpf_d *);
173 static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
174 		    struct sockaddr *, int *, struct bpf_d *);
175 static int	bpf_setif(struct bpf_d *, struct ifreq *);
176 static void	bpf_timed_out(void *);
177 static __inline void
178 		bpf_wakeup(struct bpf_d *);
179 static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
180 		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
181 		    struct bintime *);
182 static void	reset_d(struct bpf_d *);
183 static int	bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
184 static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
185 static int	bpf_setdlt(struct bpf_d *, u_int);
186 static void	filt_bpfdetach(struct knote *);
187 static int	filt_bpfread(struct knote *, long);
188 static void	bpf_drvinit(void *);
189 static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
190 
191 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
192 int bpf_maxinsns = BPF_MAXINSNS;
193 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
194     &bpf_maxinsns, 0, "Maximum bpf program instructions");
195 static int bpf_zerocopy_enable = 0;
196 SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
197     &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
198 static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW,
199     bpf_stats_sysctl, "bpf statistics portal");
200 
201 static VNET_DEFINE(int, bpf_optimize_writers) = 0;
202 #define	V_bpf_optimize_writers VNET(bpf_optimize_writers)
203 SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RW,
204     &VNET_NAME(bpf_optimize_writers), 0,
205     "Do not send packets until BPF program is set");
206 
207 static	d_open_t	bpfopen;
208 static	d_read_t	bpfread;
209 static	d_write_t	bpfwrite;
210 static	d_ioctl_t	bpfioctl;
211 static	d_poll_t	bpfpoll;
212 static	d_kqfilter_t	bpfkqfilter;
213 
214 static struct cdevsw bpf_cdevsw = {
215 	.d_version =	D_VERSION,
216 	.d_open =	bpfopen,
217 	.d_read =	bpfread,
218 	.d_write =	bpfwrite,
219 	.d_ioctl =	bpfioctl,
220 	.d_poll =	bpfpoll,
221 	.d_name =	"bpf",
222 	.d_kqfilter =	bpfkqfilter,
223 };
224 
225 static struct filterops bpfread_filtops = {
226 	.f_isfd = 1,
227 	.f_detach = filt_bpfdetach,
228 	.f_event = filt_bpfread,
229 };
230 
231 eventhandler_tag	bpf_ifdetach_cookie = NULL;
232 
233 /*
234  * LOCKING MODEL USED BY BPF:
235  * Locks:
236  * 1) global lock (BPF_LOCK). Mutex, used to protect interface addition/removal,
237  * some global counters and every bpf_if reference.
238  * 2) Interface lock. Rwlock, used to protect list of BPF descriptors and their filters.
239  * 3) Descriptor lock. Mutex, used to protect BPF buffers and various structure fields
240  *   used by bpf_mtap code.
241  *
242  * Lock order:
243  *
244  * Global lock, interface lock, descriptor lock
245  *
246  * We have to acquire interface lock before descriptor main lock due to BPF_MTAP[2]
247  * working model. In many places (like bpf_detachd) we start with BPF descriptor
248  * (and we need to at least rlock it to get reliable interface pointer). This
249  * gives us potential LOR. As a result, we use global lock to protect from bpf_if
250  * change in every such place.
251  *
252  * Changing d->bd_bif is protected by 1) global lock, 2) interface lock and
253  * 3) descriptor main wlock.
254  * Reading bd_bif can be protected by any of these locks, typically global lock.
255  *
256  * Changing read/write BPF filter is protected by the same three locks,
257  * the same applies for reading.
258  *
259  * Sleeping in global lock is not allowed due to bpfdetach() using it.
260  */
261 
/*
 * Wrapper functions for various buffering methods.  If the set of buffer
 * modes expands, we will probably want to introduce a switch data structure
 * similar to protosw, etc.
 */
267 static void
268 bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
269     u_int len)
270 {
271 
272 	BPFD_LOCK_ASSERT(d);
273 
274 	switch (d->bd_bufmode) {
275 	case BPF_BUFMODE_BUFFER:
276 		return (bpf_buffer_append_bytes(d, buf, offset, src, len));
277 
278 	case BPF_BUFMODE_ZBUF:
279 		d->bd_zcopy++;
280 		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
281 
282 	default:
283 		panic("bpf_buf_append_bytes");
284 	}
285 }
286 
287 static void
288 bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
289     u_int len)
290 {
291 
292 	BPFD_LOCK_ASSERT(d);
293 
294 	switch (d->bd_bufmode) {
295 	case BPF_BUFMODE_BUFFER:
296 		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
297 
298 	case BPF_BUFMODE_ZBUF:
299 		d->bd_zcopy++;
300 		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
301 
302 	default:
303 		panic("bpf_buf_append_mbuf");
304 	}
305 }
306 
307 /*
308  * This function gets called when the free buffer is re-assigned.
309  */
310 static void
311 bpf_buf_reclaimed(struct bpf_d *d)
312 {
313 
314 	BPFD_LOCK_ASSERT(d);
315 
316 	switch (d->bd_bufmode) {
317 	case BPF_BUFMODE_BUFFER:
318 		return;
319 
320 	case BPF_BUFMODE_ZBUF:
321 		bpf_zerocopy_buf_reclaimed(d);
322 		return;
323 
324 	default:
325 		panic("bpf_buf_reclaimed");
326 	}
327 }
328 
329 /*
330  * If the buffer mechanism has a way to decide that a held buffer can be made
331  * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
332  * returned if the buffer can be discarded, (0) is returned if it cannot.
333  */
334 static int
335 bpf_canfreebuf(struct bpf_d *d)
336 {
337 
338 	BPFD_LOCK_ASSERT(d);
339 
340 	switch (d->bd_bufmode) {
341 	case BPF_BUFMODE_ZBUF:
342 		return (bpf_zerocopy_canfreebuf(d));
343 	}
344 	return (0);
345 }
346 
347 /*
348  * Allow the buffer model to indicate that the current store buffer is
349  * immutable, regardless of the appearance of space.  Return (1) if the
350  * buffer is writable, and (0) if not.
351  */
352 static int
353 bpf_canwritebuf(struct bpf_d *d)
354 {
355 	BPFD_LOCK_ASSERT(d);
356 
357 	switch (d->bd_bufmode) {
358 	case BPF_BUFMODE_ZBUF:
359 		return (bpf_zerocopy_canwritebuf(d));
360 	}
361 	return (1);
362 }
363 
364 /*
365  * Notify buffer model that an attempt to write to the store buffer has
366  * resulted in a dropped packet, in which case the buffer may be considered
367  * full.
368  */
369 static void
370 bpf_buffull(struct bpf_d *d)
371 {
372 
373 	BPFD_LOCK_ASSERT(d);
374 
375 	switch (d->bd_bufmode) {
376 	case BPF_BUFMODE_ZBUF:
377 		bpf_zerocopy_buffull(d);
378 		break;
379 	}
380 }
381 
382 /*
383  * Notify the buffer model that a buffer has moved into the hold position.
384  */
385 void
386 bpf_bufheld(struct bpf_d *d)
387 {
388 
389 	BPFD_LOCK_ASSERT(d);
390 
391 	switch (d->bd_bufmode) {
392 	case BPF_BUFMODE_ZBUF:
393 		bpf_zerocopy_bufheld(d);
394 		break;
395 	}
396 }
397 
398 static void
399 bpf_free(struct bpf_d *d)
400 {
401 
402 	switch (d->bd_bufmode) {
403 	case BPF_BUFMODE_BUFFER:
404 		return (bpf_buffer_free(d));
405 
406 	case BPF_BUFMODE_ZBUF:
407 		return (bpf_zerocopy_free(d));
408 
409 	default:
410 		panic("bpf_buf_free");
411 	}
412 }
413 
414 static int
415 bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
416 {
417 
418 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
419 		return (EOPNOTSUPP);
420 	return (bpf_buffer_uiomove(d, buf, len, uio));
421 }
422 
423 static int
424 bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
425 {
426 
427 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
428 		return (EOPNOTSUPP);
429 	return (bpf_buffer_ioctl_sblen(d, i));
430 }
431 
432 static int
433 bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
434 {
435 
436 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
437 		return (EOPNOTSUPP);
438 	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
439 }
440 
441 static int
442 bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
443 {
444 
445 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
446 		return (EOPNOTSUPP);
447 	return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
448 }
449 
450 static int
451 bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
452 {
453 
454 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
455 		return (EOPNOTSUPP);
456 	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
457 }
458 
459 /*
460  * General BPF functions.
461  */
462 static int
463 bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
464     struct sockaddr *sockp, int *hdrlen, struct bpf_d *d)
465 {
466 	const struct ieee80211_bpf_params *p;
467 	struct ether_header *eh;
468 	struct mbuf *m;
469 	int error;
470 	int len;
471 	int hlen;
472 	int slen;
473 
474 	/*
475 	 * Build a sockaddr based on the data link layer type.
476 	 * We do this at this level because the ethernet header
477 	 * is copied directly into the data field of the sockaddr.
478 	 * In the case of SLIP, there is no header and the packet
479 	 * is forwarded as is.
480 	 * Also, we are careful to leave room at the front of the mbuf
481 	 * for the link level header.
482 	 */
483 	switch (linktype) {
484 
485 	case DLT_SLIP:
486 		sockp->sa_family = AF_INET;
487 		hlen = 0;
488 		break;
489 
490 	case DLT_EN10MB:
491 		sockp->sa_family = AF_UNSPEC;
492 		/* XXX Would MAXLINKHDR be better? */
493 		hlen = ETHER_HDR_LEN;
494 		break;
495 
496 	case DLT_FDDI:
497 		sockp->sa_family = AF_IMPLINK;
498 		hlen = 0;
499 		break;
500 
501 	case DLT_RAW:
502 		sockp->sa_family = AF_UNSPEC;
503 		hlen = 0;
504 		break;
505 
506 	case DLT_NULL:
507 		/*
508 		 * null interface types require a 4 byte pseudo header which
509 		 * corresponds to the address family of the packet.
510 		 */
511 		sockp->sa_family = AF_UNSPEC;
512 		hlen = 4;
513 		break;
514 
515 	case DLT_ATM_RFC1483:
516 		/*
517 		 * en atm driver requires 4-byte atm pseudo header.
518 		 * though it isn't standard, vpi:vci needs to be
519 		 * specified anyway.
520 		 */
521 		sockp->sa_family = AF_UNSPEC;
522 		hlen = 12;	/* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
523 		break;
524 
525 	case DLT_PPP:
526 		sockp->sa_family = AF_UNSPEC;
527 		hlen = 4;	/* This should match PPP_HDRLEN */
528 		break;
529 
530 	case DLT_IEEE802_11:		/* IEEE 802.11 wireless */
531 		sockp->sa_family = AF_IEEE80211;
532 		hlen = 0;
533 		break;
534 
535 	case DLT_IEEE802_11_RADIO:	/* IEEE 802.11 wireless w/ phy params */
536 		sockp->sa_family = AF_IEEE80211;
537 		sockp->sa_len = 12;	/* XXX != 0 */
538 		hlen = sizeof(struct ieee80211_bpf_params);
539 		break;
540 
541 	default:
542 		return (EIO);
543 	}
544 
545 	len = uio->uio_resid;
546 	if (len < hlen || len - hlen > ifp->if_mtu)
547 		return (EMSGSIZE);
548 
549 	m = m_get2(len, M_WAITOK, MT_DATA, M_PKTHDR);
550 	if (m == NULL)
551 		return (EIO);
552 	m->m_pkthdr.len = m->m_len = len;
553 	*mp = m;
554 
555 	error = uiomove(mtod(m, u_char *), len, uio);
556 	if (error)
557 		goto bad;
558 
559 	slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len);
560 	if (slen == 0) {
561 		error = EPERM;
562 		goto bad;
563 	}
564 
565 	/* Check for multicast destination */
566 	switch (linktype) {
567 	case DLT_EN10MB:
568 		eh = mtod(m, struct ether_header *);
569 		if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
570 			if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost,
571 			    ETHER_ADDR_LEN) == 0)
572 				m->m_flags |= M_BCAST;
573 			else
574 				m->m_flags |= M_MCAST;
575 		}
576 		if (d->bd_hdrcmplt == 0) {
577 			memcpy(eh->ether_shost, IF_LLADDR(ifp),
578 			    sizeof(eh->ether_shost));
579 		}
580 		break;
581 	}
582 
583 	/*
584 	 * Make room for link header, and copy it to sockaddr
585 	 */
586 	if (hlen != 0) {
587 		if (sockp->sa_family == AF_IEEE80211) {
588 			/*
589 			 * Collect true length from the parameter header
590 			 * NB: sockp is known to be zero'd so if we do a
591 			 *     short copy unspecified parameters will be
592 			 *     zero.
593 			 * NB: packet may not be aligned after stripping
594 			 *     bpf params
595 			 * XXX check ibp_vers
596 			 */
597 			p = mtod(m, const struct ieee80211_bpf_params *);
598 			hlen = p->ibp_len;
599 			if (hlen > sizeof(sockp->sa_data)) {
600 				error = EINVAL;
601 				goto bad;
602 			}
603 		}
604 		bcopy(mtod(m, const void *), sockp->sa_data, hlen);
605 	}
606 	*hdrlen = hlen;
607 
608 	return (0);
609 bad:
610 	m_freem(m);
611 	return (error);
612 }
613 
614 /*
615  * Attach file to the bpf interface, i.e. make d listen on bp.
616  */
617 static void
618 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
619 {
620 	int op_w;
621 
622 	BPF_LOCK_ASSERT();
623 
624 	/*
625 	 * Save sysctl value to protect from sysctl change
626 	 * between reads
627 	 */
628 	op_w = V_bpf_optimize_writers || d->bd_writer;
629 
630 	if (d->bd_bif != NULL)
631 		bpf_detachd_locked(d);
632 	/*
633 	 * Point d at bp, and add d to the interface's list.
634 	 * Since there are many applications using BPF for
635 	 * sending raw packets only (dhcpd, cdpd are good examples)
636 	 * we can delay adding d to the list of active listeners until
637 	 * some filter is configured.
638 	 */
639 
640 	BPFIF_WLOCK(bp);
641 	BPFD_LOCK(d);
642 
643 	d->bd_bif = bp;
644 
645 	if (op_w != 0) {
646 		/* Add to writers-only list */
647 		LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
648 		/*
649 		 * We decrement bd_writer on every filter set operation.
650 		 * First BIOCSETF is done by pcap_open_live() to set up
651 		 * snap length. After that appliation usually sets its own filter
652 		 */
653 		d->bd_writer = 2;
654 	} else
655 		LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
656 
657 	BPFD_UNLOCK(d);
658 	BPFIF_WUNLOCK(bp);
659 
660 	bpf_bpfd_cnt++;
661 
662 	CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
663 	    __func__, d->bd_pid, d->bd_writer ? "writer" : "active");
664 
665 	if (op_w == 0)
666 		EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
667 }
668 
669 /*
670  * Check if we need to upgrade our descriptor @d from write-only mode.
671  */
672 static int
673 bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen)
674 {
675 	int is_snap, need_upgrade;
676 
677 	/*
678 	 * Check if we've already upgraded or new filter is empty.
679 	 */
680 	if (d->bd_writer == 0 || fcode == NULL)
681 		return (0);
682 
683 	need_upgrade = 0;
684 
685 	/*
686 	 * Check if cmd looks like snaplen setting from
687 	 * pcap_bpf.c:pcap_open_live().
688 	 * Note we're not checking .k value here:
689 	 * while pcap_open_live() definitely sets to to non-zero value,
690 	 * we'd prefer to treat k=0 (deny ALL) case the same way: e.g.
691 	 * do not consider upgrading immediately
692 	 */
693 	if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K))
694 		is_snap = 1;
695 	else
696 		is_snap = 0;
697 
698 	if (is_snap == 0) {
699 		/*
700 		 * We're setting first filter and it doesn't look like
701 		 * setting snaplen.  We're probably using bpf directly.
702 		 * Upgrade immediately.
703 		 */
704 		need_upgrade = 1;
705 	} else {
706 		/*
707 		 * Do not require upgrade by first BIOCSETF
708 		 * (used to set snaplen) by pcap_open_live().
709 		 */
710 
711 		if (--d->bd_writer == 0) {
712 			/*
713 			 * First snaplen filter has already
714 			 * been set. This is probably catch-all
715 			 * filter
716 			 */
717 			need_upgrade = 1;
718 		}
719 	}
720 
721 	CTR5(KTR_NET,
722 	    "%s: filter function set by pid %d, "
723 	    "bd_writer counter %d, snap %d upgrade %d",
724 	    __func__, d->bd_pid, d->bd_writer,
725 	    is_snap, need_upgrade);
726 
727 	return (need_upgrade);
728 }
729 
730 /*
731  * Add d to the list of active bp filters.
732  * Requires bpf_attachd() to be called before.
733  */
734 static void
735 bpf_upgraded(struct bpf_d *d)
736 {
737 	struct bpf_if *bp;
738 
739 	BPF_LOCK_ASSERT();
740 
741 	bp = d->bd_bif;
742 
743 	/*
744 	 * Filter can be set several times without specifying interface.
745 	 * Mark d as reader and exit.
746 	 */
747 	if (bp == NULL) {
748 		BPFD_LOCK(d);
749 		d->bd_writer = 0;
750 		BPFD_UNLOCK(d);
751 		return;
752 	}
753 
754 	BPFIF_WLOCK(bp);
755 	BPFD_LOCK(d);
756 
757 	/* Remove from writers-only list */
758 	LIST_REMOVE(d, bd_next);
759 	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
760 	/* Mark d as reader */
761 	d->bd_writer = 0;
762 
763 	BPFD_UNLOCK(d);
764 	BPFIF_WUNLOCK(bp);
765 
766 	CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid);
767 
768 	EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
769 }
770 
/*
 * Detach a file from its interface.  Takes the global BPF lock around
 * bpf_detachd_locked().
 */
static void
bpf_detachd(struct bpf_d *d)
{

	BPF_LOCK();
	bpf_detachd_locked(d);
	BPF_UNLOCK();
}
781 
782 static void
783 bpf_detachd_locked(struct bpf_d *d)
784 {
785 	int error;
786 	struct bpf_if *bp;
787 	struct ifnet *ifp;
788 
789 	CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
790 
791 	BPF_LOCK_ASSERT();
792 
793 	/* Check if descriptor is attached */
794 	if ((bp = d->bd_bif) == NULL)
795 		return;
796 
797 	BPFIF_WLOCK(bp);
798 	BPFD_LOCK(d);
799 
800 	/* Save bd_writer value */
801 	error = d->bd_writer;
802 
803 	/*
804 	 * Remove d from the interface's descriptor list.
805 	 */
806 	LIST_REMOVE(d, bd_next);
807 
808 	ifp = bp->bif_ifp;
809 	d->bd_bif = NULL;
810 	BPFD_UNLOCK(d);
811 	BPFIF_WUNLOCK(bp);
812 
813 	bpf_bpfd_cnt--;
814 
815 	/* Call event handler iff d is attached */
816 	if (error == 0)
817 		EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
818 
819 	/*
820 	 * Check if this descriptor had requested promiscuous mode.
821 	 * If so, turn it off.
822 	 */
823 	if (d->bd_promisc) {
824 		d->bd_promisc = 0;
825 		CURVNET_SET(ifp->if_vnet);
826 		error = ifpromisc(ifp, 0);
827 		CURVNET_RESTORE();
828 		if (error != 0 && error != ENXIO) {
829 			/*
830 			 * ENXIO can happen if a pccard is unplugged
831 			 * Something is really wrong if we were able to put
832 			 * the driver into promiscuous mode, but can't
833 			 * take it out.
834 			 */
835 			if_printf(bp->bif_ifp,
836 				"bpf_detach: ifpromisc failed (%d)\n", error);
837 		}
838 	}
839 }
840 
841 /*
842  * Close the descriptor by detaching it from its interface,
843  * deallocating its buffers, and marking it free.
844  */
845 static void
846 bpf_dtor(void *data)
847 {
848 	struct bpf_d *d = data;
849 
850 	BPFD_LOCK(d);
851 	if (d->bd_state == BPF_WAITING)
852 		callout_stop(&d->bd_callout);
853 	d->bd_state = BPF_IDLE;
854 	BPFD_UNLOCK(d);
855 	funsetown(&d->bd_sigio);
856 	bpf_detachd(d);
857 #ifdef MAC
858 	mac_bpfdesc_destroy(d);
859 #endif /* MAC */
860 	seldrain(&d->bd_sel);
861 	knlist_destroy(&d->bd_sel.si_note);
862 	callout_drain(&d->bd_callout);
863 	bpf_freed(d);
864 	free(d, M_BPF);
865 }
866 
867 /*
868  * Open ethernet device.  Returns ENXIO for illegal minor device number,
869  * EBUSY if file is open by another process.
870  */
871 /* ARGSUSED */
872 static	int
873 bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
874 {
875 	struct bpf_d *d;
876 	int error;
877 
878 	d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
879 	error = devfs_set_cdevpriv(d, bpf_dtor);
880 	if (error != 0) {
881 		free(d, M_BPF);
882 		return (error);
883 	}
884 
885 	/*
886 	 * For historical reasons, perform a one-time initialization call to
887 	 * the buffer routines, even though we're not yet committed to a
888 	 * particular buffer method.
889 	 */
890 	bpf_buffer_init(d);
891 	if ((flags & FREAD) == 0)
892 		d->bd_writer = 2;
893 	d->bd_hbuf_in_use = 0;
894 	d->bd_bufmode = BPF_BUFMODE_BUFFER;
895 	d->bd_sig = SIGIO;
896 	d->bd_direction = BPF_D_INOUT;
897 	BPF_PID_REFRESH(d, td);
898 #ifdef MAC
899 	mac_bpfdesc_init(d);
900 	mac_bpfdesc_create(td->td_ucred, d);
901 #endif
902 	mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF);
903 	callout_init_mtx(&d->bd_callout, &d->bd_lock, 0);
904 	knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock);
905 
906 	return (0);
907 }
908 
909 /*
910  *  bpfread - read next chunk of packets from buffers
911  */
912 static	int
913 bpfread(struct cdev *dev, struct uio *uio, int ioflag)
914 {
915 	struct bpf_d *d;
916 	int error;
917 	int non_block;
918 	int timed_out;
919 
920 	error = devfs_get_cdevpriv((void **)&d);
921 	if (error != 0)
922 		return (error);
923 
924 	/*
925 	 * Restrict application to use a buffer the same size as
926 	 * as kernel buffers.
927 	 */
928 	if (uio->uio_resid != d->bd_bufsize)
929 		return (EINVAL);
930 
931 	non_block = ((ioflag & O_NONBLOCK) != 0);
932 
933 	BPFD_LOCK(d);
934 	BPF_PID_REFRESH_CUR(d);
935 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
936 		BPFD_UNLOCK(d);
937 		return (EOPNOTSUPP);
938 	}
939 	if (d->bd_state == BPF_WAITING)
940 		callout_stop(&d->bd_callout);
941 	timed_out = (d->bd_state == BPF_TIMED_OUT);
942 	d->bd_state = BPF_IDLE;
943 	while (d->bd_hbuf_in_use) {
944 		error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
945 		    PRINET|PCATCH, "bd_hbuf", 0);
946 		if (error != 0) {
947 			BPFD_UNLOCK(d);
948 			return (error);
949 		}
950 	}
951 	/*
952 	 * If the hold buffer is empty, then do a timed sleep, which
953 	 * ends when the timeout expires or when enough packets
954 	 * have arrived to fill the store buffer.
955 	 */
956 	while (d->bd_hbuf == NULL) {
957 		if (d->bd_slen != 0) {
958 			/*
959 			 * A packet(s) either arrived since the previous
960 			 * read or arrived while we were asleep.
961 			 */
962 			if (d->bd_immediate || non_block || timed_out) {
963 				/*
964 				 * Rotate the buffers and return what's here
965 				 * if we are in immediate mode, non-blocking
966 				 * flag is set, or this descriptor timed out.
967 				 */
968 				ROTATE_BUFFERS(d);
969 				break;
970 			}
971 		}
972 
973 		/*
974 		 * No data is available, check to see if the bpf device
975 		 * is still pointed at a real interface.  If not, return
976 		 * ENXIO so that the userland process knows to rebind
977 		 * it before using it again.
978 		 */
979 		if (d->bd_bif == NULL) {
980 			BPFD_UNLOCK(d);
981 			return (ENXIO);
982 		}
983 
984 		if (non_block) {
985 			BPFD_UNLOCK(d);
986 			return (EWOULDBLOCK);
987 		}
988 		error = msleep(d, &d->bd_lock, PRINET|PCATCH,
989 		     "bpf", d->bd_rtout);
990 		if (error == EINTR || error == ERESTART) {
991 			BPFD_UNLOCK(d);
992 			return (error);
993 		}
994 		if (error == EWOULDBLOCK) {
995 			/*
996 			 * On a timeout, return what's in the buffer,
997 			 * which may be nothing.  If there is something
998 			 * in the store buffer, we can rotate the buffers.
999 			 */
1000 			if (d->bd_hbuf)
1001 				/*
1002 				 * We filled up the buffer in between
1003 				 * getting the timeout and arriving
1004 				 * here, so we don't need to rotate.
1005 				 */
1006 				break;
1007 
1008 			if (d->bd_slen == 0) {
1009 				BPFD_UNLOCK(d);
1010 				return (0);
1011 			}
1012 			ROTATE_BUFFERS(d);
1013 			break;
1014 		}
1015 	}
1016 	/*
1017 	 * At this point, we know we have something in the hold slot.
1018 	 */
1019 	d->bd_hbuf_in_use = 1;
1020 	BPFD_UNLOCK(d);
1021 
1022 	/*
1023 	 * Move data from hold buffer into user space.
1024 	 * We know the entire buffer is transferred since
1025 	 * we checked above that the read buffer is bpf_bufsize bytes.
1026   	 *
1027 	 * We do not have to worry about simultaneous reads because
1028 	 * we waited for sole access to the hold buffer above.
1029 	 */
1030 	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
1031 
1032 	BPFD_LOCK(d);
1033 	KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf"));
1034 	d->bd_fbuf = d->bd_hbuf;
1035 	d->bd_hbuf = NULL;
1036 	d->bd_hlen = 0;
1037 	bpf_buf_reclaimed(d);
1038 	d->bd_hbuf_in_use = 0;
1039 	wakeup(&d->bd_hbuf_in_use);
1040 	BPFD_UNLOCK(d);
1041 
1042 	return (error);
1043 }
1044 
1045 /*
1046  * If there are processes sleeping on this descriptor, wake them up.
1047  */
1048 static __inline void
1049 bpf_wakeup(struct bpf_d *d)
1050 {
1051 
1052 	BPFD_LOCK_ASSERT(d);
1053 	if (d->bd_state == BPF_WAITING) {
1054 		callout_stop(&d->bd_callout);
1055 		d->bd_state = BPF_IDLE;
1056 	}
1057 	wakeup(d);
1058 	if (d->bd_async && d->bd_sig && d->bd_sigio)
1059 		pgsigio(&d->bd_sigio, d->bd_sig, 0);
1060 
1061 	selwakeuppri(&d->bd_sel, PRINET);
1062 	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
1063 }
1064 
1065 static void
1066 bpf_timed_out(void *arg)
1067 {
1068 	struct bpf_d *d = (struct bpf_d *)arg;
1069 
1070 	BPFD_LOCK_ASSERT(d);
1071 
1072 	if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout))
1073 		return;
1074 	if (d->bd_state == BPF_WAITING) {
1075 		d->bd_state = BPF_TIMED_OUT;
1076 		if (d->bd_slen != 0)
1077 			bpf_wakeup(d);
1078 	}
1079 }
1080 
1081 static int
1082 bpf_ready(struct bpf_d *d)
1083 {
1084 
1085 	BPFD_LOCK_ASSERT(d);
1086 
1087 	if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
1088 		return (1);
1089 	if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
1090 	    d->bd_slen != 0)
1091 		return (1);
1092 	return (0);
1093 }
1094 
1095 static int
1096 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
1097 {
1098 	struct bpf_d *d;
1099 	struct ifnet *ifp;
1100 	struct mbuf *m, *mc;
1101 	struct sockaddr dst;
1102 	struct route ro;
1103 	int error, hlen;
1104 
1105 	error = devfs_get_cdevpriv((void **)&d);
1106 	if (error != 0)
1107 		return (error);
1108 
1109 	BPF_PID_REFRESH_CUR(d);
1110 	d->bd_wcount++;
1111 	/* XXX: locking required */
1112 	if (d->bd_bif == NULL) {
1113 		d->bd_wdcount++;
1114 		return (ENXIO);
1115 	}
1116 
1117 	ifp = d->bd_bif->bif_ifp;
1118 
1119 	if ((ifp->if_flags & IFF_UP) == 0) {
1120 		d->bd_wdcount++;
1121 		return (ENETDOWN);
1122 	}
1123 
1124 	if (uio->uio_resid == 0) {
1125 		d->bd_wdcount++;
1126 		return (0);
1127 	}
1128 
1129 	bzero(&dst, sizeof(dst));
1130 	m = NULL;
1131 	hlen = 0;
1132 	/* XXX: bpf_movein() can sleep */
1133 	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
1134 	    &m, &dst, &hlen, d);
1135 	if (error) {
1136 		d->bd_wdcount++;
1137 		return (error);
1138 	}
1139 	d->bd_wfcount++;
1140 	if (d->bd_hdrcmplt)
1141 		dst.sa_family = pseudo_AF_HDRCMPLT;
1142 
1143 	if (d->bd_feedback) {
1144 		mc = m_dup(m, M_NOWAIT);
1145 		if (mc != NULL)
1146 			mc->m_pkthdr.rcvif = ifp;
1147 		/* Set M_PROMISC for outgoing packets to be discarded. */
1148 		if (d->bd_direction == BPF_D_INOUT)
1149 			m->m_flags |= M_PROMISC;
1150 	} else
1151 		mc = NULL;
1152 
1153 	m->m_pkthdr.len -= hlen;
1154 	m->m_len -= hlen;
1155 	m->m_data += hlen;	/* XXX */
1156 
1157 	CURVNET_SET(ifp->if_vnet);
1158 #ifdef MAC
1159 	BPFD_LOCK(d);
1160 	mac_bpfdesc_create_mbuf(d, m);
1161 	if (mc != NULL)
1162 		mac_bpfdesc_create_mbuf(d, mc);
1163 	BPFD_UNLOCK(d);
1164 #endif
1165 
1166 	bzero(&ro, sizeof(ro));
1167 	if (hlen != 0) {
1168 		ro.ro_prepend = (u_char *)&dst.sa_data;
1169 		ro.ro_plen = hlen;
1170 		ro.ro_flags = RT_HAS_HEADER;
1171 	}
1172 
1173 	error = (*ifp->if_output)(ifp, m, &dst, &ro);
1174 	if (error)
1175 		d->bd_wdcount++;
1176 
1177 	if (mc != NULL) {
1178 		if (error == 0)
1179 			(*ifp->if_input)(ifp, mc);
1180 		else
1181 			m_freem(mc);
1182 	}
1183 	CURVNET_RESTORE();
1184 
1185 	return (error);
1186 }
1187 
1188 /*
1189  * Reset a descriptor by flushing its packet buffer and clearing the receive
1190  * and drop counts.  This is doable for kernel-only buffers, but with
1191  * zero-copy buffers, we can't write to (or rotate) buffers that are
1192  * currently owned by userspace.  It would be nice if we could encapsulate
1193  * this logic in the buffer code rather than here.
1194  */
1195 static void
1196 reset_d(struct bpf_d *d)
1197 {
1198 
1199 	BPFD_LOCK_ASSERT(d);
1200 
1201 	while (d->bd_hbuf_in_use)
1202 		mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET,
1203 		    "bd_hbuf", 0);
1204 	if ((d->bd_hbuf != NULL) &&
1205 	    (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) {
1206 		/* Free the hold buffer. */
1207 		d->bd_fbuf = d->bd_hbuf;
1208 		d->bd_hbuf = NULL;
1209 		d->bd_hlen = 0;
1210 		bpf_buf_reclaimed(d);
1211 	}
1212 	if (bpf_canwritebuf(d))
1213 		d->bd_slen = 0;
1214 	d->bd_rcount = 0;
1215 	d->bd_dcount = 0;
1216 	d->bd_fcount = 0;
1217 	d->bd_wcount = 0;
1218 	d->bd_wfcount = 0;
1219 	d->bd_wdcount = 0;
1220 	d->bd_zcopy = 0;
1221 }
1222 
1223 /*
1224  *  FIONREAD		Check for read packet available.
1225  *  BIOCGBLEN		Get buffer len [for read()].
1226  *  BIOCSETF		Set read filter.
1227  *  BIOCSETFNR		Set read filter without resetting descriptor.
1228  *  BIOCSETWF		Set write filter.
1229  *  BIOCFLUSH		Flush read packet buffer.
1230  *  BIOCPROMISC		Put interface into promiscuous mode.
1231  *  BIOCGDLT		Get link layer type.
1232  *  BIOCGETIF		Get interface name.
1233  *  BIOCSETIF		Set interface.
1234  *  BIOCSRTIMEOUT	Set read timeout.
1235  *  BIOCGRTIMEOUT	Get read timeout.
1236  *  BIOCGSTATS		Get packet stats.
1237  *  BIOCIMMEDIATE	Set immediate mode.
1238  *  BIOCVERSION		Get filter language version.
1239  *  BIOCGHDRCMPLT	Get "header already complete" flag
1240  *  BIOCSHDRCMPLT	Set "header already complete" flag
1241  *  BIOCGDIRECTION	Get packet direction flag
1242  *  BIOCSDIRECTION	Set packet direction flag
1243  *  BIOCGTSTAMP		Get time stamp format and resolution.
1244  *  BIOCSTSTAMP		Set time stamp format and resolution.
1245  *  BIOCLOCK		Set "locked" flag
1246  *  BIOCFEEDBACK	Set packet feedback mode.
1247  *  BIOCSETZBUF		Set current zero-copy buffer locations.
1248  *  BIOCGETZMAX		Get maximum zero-copy buffer size.
1249  *  BIOCROTZBUF		Force rotation of zero-copy buffer
1250  *  BIOCSETBUFMODE	Set buffer mode.
1251  *  BIOCGETBUFMODE	Get current buffer mode.
1252  */
1253 /* ARGSUSED */
1254 static	int
1255 bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
1256     struct thread *td)
1257 {
1258 	struct bpf_d *d;
1259 	int error;
1260 
1261 	error = devfs_get_cdevpriv((void **)&d);
1262 	if (error != 0)
1263 		return (error);
1264 
1265 	/*
1266 	 * Refresh PID associated with this descriptor.
1267 	 */
1268 	BPFD_LOCK(d);
1269 	BPF_PID_REFRESH(d, td);
1270 	if (d->bd_state == BPF_WAITING)
1271 		callout_stop(&d->bd_callout);
1272 	d->bd_state = BPF_IDLE;
1273 	BPFD_UNLOCK(d);
1274 
1275 	if (d->bd_locked == 1) {
1276 		switch (cmd) {
1277 		case BIOCGBLEN:
1278 		case BIOCFLUSH:
1279 		case BIOCGDLT:
1280 		case BIOCGDLTLIST:
1281 #ifdef COMPAT_FREEBSD32
1282 		case BIOCGDLTLIST32:
1283 #endif
1284 		case BIOCGETIF:
1285 		case BIOCGRTIMEOUT:
1286 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1287 		case BIOCGRTIMEOUT32:
1288 #endif
1289 		case BIOCGSTATS:
1290 		case BIOCVERSION:
1291 		case BIOCGRSIG:
1292 		case BIOCGHDRCMPLT:
1293 		case BIOCSTSTAMP:
1294 		case BIOCFEEDBACK:
1295 		case FIONREAD:
1296 		case BIOCLOCK:
1297 		case BIOCSRTIMEOUT:
1298 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1299 		case BIOCSRTIMEOUT32:
1300 #endif
1301 		case BIOCIMMEDIATE:
1302 		case TIOCGPGRP:
1303 		case BIOCROTZBUF:
1304 			break;
1305 		default:
1306 			return (EPERM);
1307 		}
1308 	}
1309 #ifdef COMPAT_FREEBSD32
1310 	/*
1311 	 * If we see a 32-bit compat ioctl, mark the stream as 32-bit so
1312 	 * that it will get 32-bit packet headers.
1313 	 */
1314 	switch (cmd) {
1315 	case BIOCSETF32:
1316 	case BIOCSETFNR32:
1317 	case BIOCSETWF32:
1318 	case BIOCGDLTLIST32:
1319 	case BIOCGRTIMEOUT32:
1320 	case BIOCSRTIMEOUT32:
1321 		BPFD_LOCK(d);
1322 		d->bd_compat32 = 1;
1323 		BPFD_UNLOCK(d);
1324 	}
1325 #endif
1326 
1327 	CURVNET_SET(TD_TO_VNET(td));
1328 	switch (cmd) {
1329 
1330 	default:
1331 		error = EINVAL;
1332 		break;
1333 
1334 	/*
1335 	 * Check for read packet available.
1336 	 */
1337 	case FIONREAD:
1338 		{
1339 			int n;
1340 
1341 			BPFD_LOCK(d);
1342 			n = d->bd_slen;
1343 			while (d->bd_hbuf_in_use)
1344 				mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
1345 				    PRINET, "bd_hbuf", 0);
1346 			if (d->bd_hbuf)
1347 				n += d->bd_hlen;
1348 			BPFD_UNLOCK(d);
1349 
1350 			*(int *)addr = n;
1351 			break;
1352 		}
1353 
1354 	/*
1355 	 * Get buffer len [for read()].
1356 	 */
1357 	case BIOCGBLEN:
1358 		BPFD_LOCK(d);
1359 		*(u_int *)addr = d->bd_bufsize;
1360 		BPFD_UNLOCK(d);
1361 		break;
1362 
1363 	/*
1364 	 * Set buffer length.
1365 	 */
1366 	case BIOCSBLEN:
1367 		error = bpf_ioctl_sblen(d, (u_int *)addr);
1368 		break;
1369 
1370 	/*
1371 	 * Set link layer read filter.
1372 	 */
1373 	case BIOCSETF:
1374 	case BIOCSETFNR:
1375 	case BIOCSETWF:
1376 #ifdef COMPAT_FREEBSD32
1377 	case BIOCSETF32:
1378 	case BIOCSETFNR32:
1379 	case BIOCSETWF32:
1380 #endif
1381 		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
1382 		break;
1383 
1384 	/*
1385 	 * Flush read packet buffer.
1386 	 */
1387 	case BIOCFLUSH:
1388 		BPFD_LOCK(d);
1389 		reset_d(d);
1390 		BPFD_UNLOCK(d);
1391 		break;
1392 
1393 	/*
1394 	 * Put interface into promiscuous mode.
1395 	 */
1396 	case BIOCPROMISC:
1397 		if (d->bd_bif == NULL) {
1398 			/*
1399 			 * No interface attached yet.
1400 			 */
1401 			error = EINVAL;
1402 			break;
1403 		}
1404 		if (d->bd_promisc == 0) {
1405 			error = ifpromisc(d->bd_bif->bif_ifp, 1);
1406 			if (error == 0)
1407 				d->bd_promisc = 1;
1408 		}
1409 		break;
1410 
1411 	/*
1412 	 * Get current data link type.
1413 	 */
1414 	case BIOCGDLT:
1415 		BPF_LOCK();
1416 		if (d->bd_bif == NULL)
1417 			error = EINVAL;
1418 		else
1419 			*(u_int *)addr = d->bd_bif->bif_dlt;
1420 		BPF_UNLOCK();
1421 		break;
1422 
1423 	/*
1424 	 * Get a list of supported data link types.
1425 	 */
1426 #ifdef COMPAT_FREEBSD32
1427 	case BIOCGDLTLIST32:
1428 		{
1429 			struct bpf_dltlist32 *list32;
1430 			struct bpf_dltlist dltlist;
1431 
1432 			list32 = (struct bpf_dltlist32 *)addr;
1433 			dltlist.bfl_len = list32->bfl_len;
1434 			dltlist.bfl_list = PTRIN(list32->bfl_list);
1435 			BPF_LOCK();
1436 			if (d->bd_bif == NULL)
1437 				error = EINVAL;
1438 			else {
1439 				error = bpf_getdltlist(d, &dltlist);
1440 				if (error == 0)
1441 					list32->bfl_len = dltlist.bfl_len;
1442 			}
1443 			BPF_UNLOCK();
1444 			break;
1445 		}
1446 #endif
1447 
1448 	case BIOCGDLTLIST:
1449 		BPF_LOCK();
1450 		if (d->bd_bif == NULL)
1451 			error = EINVAL;
1452 		else
1453 			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
1454 		BPF_UNLOCK();
1455 		break;
1456 
1457 	/*
1458 	 * Set data link type.
1459 	 */
1460 	case BIOCSDLT:
1461 		BPF_LOCK();
1462 		if (d->bd_bif == NULL)
1463 			error = EINVAL;
1464 		else
1465 			error = bpf_setdlt(d, *(u_int *)addr);
1466 		BPF_UNLOCK();
1467 		break;
1468 
1469 	/*
1470 	 * Get interface name.
1471 	 */
1472 	case BIOCGETIF:
1473 		BPF_LOCK();
1474 		if (d->bd_bif == NULL)
1475 			error = EINVAL;
1476 		else {
1477 			struct ifnet *const ifp = d->bd_bif->bif_ifp;
1478 			struct ifreq *const ifr = (struct ifreq *)addr;
1479 
1480 			strlcpy(ifr->ifr_name, ifp->if_xname,
1481 			    sizeof(ifr->ifr_name));
1482 		}
1483 		BPF_UNLOCK();
1484 		break;
1485 
1486 	/*
1487 	 * Set interface.
1488 	 */
1489 	case BIOCSETIF:
1490 		{
1491 			int alloc_buf, size;
1492 
1493 			/*
1494 			 * Behavior here depends on the buffering model.  If
1495 			 * we're using kernel memory buffers, then we can
1496 			 * allocate them here.  If we're using zero-copy,
1497 			 * then the user process must have registered buffers
1498 			 * by the time we get here.
1499 			 */
1500 			alloc_buf = 0;
1501 			BPFD_LOCK(d);
1502 			if (d->bd_bufmode == BPF_BUFMODE_BUFFER &&
1503 			    d->bd_sbuf == NULL)
1504 				alloc_buf = 1;
1505 			BPFD_UNLOCK(d);
1506 			if (alloc_buf) {
1507 				size = d->bd_bufsize;
1508 				error = bpf_buffer_ioctl_sblen(d, &size);
1509 				if (error != 0)
1510 					break;
1511 			}
1512 			BPF_LOCK();
1513 			error = bpf_setif(d, (struct ifreq *)addr);
1514 			BPF_UNLOCK();
1515 			break;
1516 		}
1517 
1518 	/*
1519 	 * Set read timeout.
1520 	 */
1521 	case BIOCSRTIMEOUT:
1522 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1523 	case BIOCSRTIMEOUT32:
1524 #endif
1525 		{
1526 			struct timeval *tv = (struct timeval *)addr;
1527 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
1528 			struct timeval32 *tv32;
1529 			struct timeval tv64;
1530 
1531 			if (cmd == BIOCSRTIMEOUT32) {
1532 				tv32 = (struct timeval32 *)addr;
1533 				tv = &tv64;
1534 				tv->tv_sec = tv32->tv_sec;
1535 				tv->tv_usec = tv32->tv_usec;
1536 			} else
1537 #endif
1538 				tv = (struct timeval *)addr;
1539 
1540 			/*
1541 			 * Subtract 1 tick from tvtohz() since this isn't
1542 			 * a one-shot timer.
1543 			 */
1544 			if ((error = itimerfix(tv)) == 0)
1545 				d->bd_rtout = tvtohz(tv) - 1;
1546 			break;
1547 		}
1548 
1549 	/*
1550 	 * Get read timeout.
1551 	 */
1552 	case BIOCGRTIMEOUT:
1553 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1554 	case BIOCGRTIMEOUT32:
1555 #endif
1556 		{
1557 			struct timeval *tv;
1558 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1559 			struct timeval32 *tv32;
1560 			struct timeval tv64;
1561 
1562 			if (cmd == BIOCGRTIMEOUT32)
1563 				tv = &tv64;
1564 			else
1565 #endif
1566 				tv = (struct timeval *)addr;
1567 
1568 			tv->tv_sec = d->bd_rtout / hz;
1569 			tv->tv_usec = (d->bd_rtout % hz) * tick;
1570 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1571 			if (cmd == BIOCGRTIMEOUT32) {
1572 				tv32 = (struct timeval32 *)addr;
1573 				tv32->tv_sec = tv->tv_sec;
1574 				tv32->tv_usec = tv->tv_usec;
1575 			}
1576 #endif
1577 
1578 			break;
1579 		}
1580 
1581 	/*
1582 	 * Get packet stats.
1583 	 */
1584 	case BIOCGSTATS:
1585 		{
1586 			struct bpf_stat *bs = (struct bpf_stat *)addr;
1587 
1588 			/* XXXCSJP overflow */
1589 			bs->bs_recv = d->bd_rcount;
1590 			bs->bs_drop = d->bd_dcount;
1591 			break;
1592 		}
1593 
1594 	/*
1595 	 * Set immediate mode.
1596 	 */
1597 	case BIOCIMMEDIATE:
1598 		BPFD_LOCK(d);
1599 		d->bd_immediate = *(u_int *)addr;
1600 		BPFD_UNLOCK(d);
1601 		break;
1602 
1603 	case BIOCVERSION:
1604 		{
1605 			struct bpf_version *bv = (struct bpf_version *)addr;
1606 
1607 			bv->bv_major = BPF_MAJOR_VERSION;
1608 			bv->bv_minor = BPF_MINOR_VERSION;
1609 			break;
1610 		}
1611 
1612 	/*
1613 	 * Get "header already complete" flag
1614 	 */
1615 	case BIOCGHDRCMPLT:
1616 		BPFD_LOCK(d);
1617 		*(u_int *)addr = d->bd_hdrcmplt;
1618 		BPFD_UNLOCK(d);
1619 		break;
1620 
1621 	/*
1622 	 * Set "header already complete" flag
1623 	 */
1624 	case BIOCSHDRCMPLT:
1625 		BPFD_LOCK(d);
1626 		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
1627 		BPFD_UNLOCK(d);
1628 		break;
1629 
1630 	/*
1631 	 * Get packet direction flag
1632 	 */
1633 	case BIOCGDIRECTION:
1634 		BPFD_LOCK(d);
1635 		*(u_int *)addr = d->bd_direction;
1636 		BPFD_UNLOCK(d);
1637 		break;
1638 
1639 	/*
1640 	 * Set packet direction flag
1641 	 */
1642 	case BIOCSDIRECTION:
1643 		{
1644 			u_int	direction;
1645 
1646 			direction = *(u_int *)addr;
1647 			switch (direction) {
1648 			case BPF_D_IN:
1649 			case BPF_D_INOUT:
1650 			case BPF_D_OUT:
1651 				BPFD_LOCK(d);
1652 				d->bd_direction = direction;
1653 				BPFD_UNLOCK(d);
1654 				break;
1655 			default:
1656 				error = EINVAL;
1657 			}
1658 		}
1659 		break;
1660 
1661 	/*
1662 	 * Get packet timestamp format and resolution.
1663 	 */
1664 	case BIOCGTSTAMP:
1665 		BPFD_LOCK(d);
1666 		*(u_int *)addr = d->bd_tstamp;
1667 		BPFD_UNLOCK(d);
1668 		break;
1669 
1670 	/*
1671 	 * Set packet timestamp format and resolution.
1672 	 */
1673 	case BIOCSTSTAMP:
1674 		{
1675 			u_int	func;
1676 
1677 			func = *(u_int *)addr;
1678 			if (BPF_T_VALID(func))
1679 				d->bd_tstamp = func;
1680 			else
1681 				error = EINVAL;
1682 		}
1683 		break;
1684 
1685 	case BIOCFEEDBACK:
1686 		BPFD_LOCK(d);
1687 		d->bd_feedback = *(u_int *)addr;
1688 		BPFD_UNLOCK(d);
1689 		break;
1690 
1691 	case BIOCLOCK:
1692 		BPFD_LOCK(d);
1693 		d->bd_locked = 1;
1694 		BPFD_UNLOCK(d);
1695 		break;
1696 
1697 	case FIONBIO:		/* Non-blocking I/O */
1698 		break;
1699 
1700 	case FIOASYNC:		/* Send signal on receive packets */
1701 		BPFD_LOCK(d);
1702 		d->bd_async = *(int *)addr;
1703 		BPFD_UNLOCK(d);
1704 		break;
1705 
1706 	case FIOSETOWN:
1707 		/*
1708 		 * XXX: Add some sort of locking here?
1709 		 * fsetown() can sleep.
1710 		 */
1711 		error = fsetown(*(int *)addr, &d->bd_sigio);
1712 		break;
1713 
1714 	case FIOGETOWN:
1715 		BPFD_LOCK(d);
1716 		*(int *)addr = fgetown(&d->bd_sigio);
1717 		BPFD_UNLOCK(d);
1718 		break;
1719 
1720 	/* This is deprecated, FIOSETOWN should be used instead. */
1721 	case TIOCSPGRP:
1722 		error = fsetown(-(*(int *)addr), &d->bd_sigio);
1723 		break;
1724 
1725 	/* This is deprecated, FIOGETOWN should be used instead. */
1726 	case TIOCGPGRP:
1727 		*(int *)addr = -fgetown(&d->bd_sigio);
1728 		break;
1729 
1730 	case BIOCSRSIG:		/* Set receive signal */
1731 		{
1732 			u_int sig;
1733 
1734 			sig = *(u_int *)addr;
1735 
1736 			if (sig >= NSIG)
1737 				error = EINVAL;
1738 			else {
1739 				BPFD_LOCK(d);
1740 				d->bd_sig = sig;
1741 				BPFD_UNLOCK(d);
1742 			}
1743 			break;
1744 		}
1745 	case BIOCGRSIG:
1746 		BPFD_LOCK(d);
1747 		*(u_int *)addr = d->bd_sig;
1748 		BPFD_UNLOCK(d);
1749 		break;
1750 
1751 	case BIOCGETBUFMODE:
1752 		BPFD_LOCK(d);
1753 		*(u_int *)addr = d->bd_bufmode;
1754 		BPFD_UNLOCK(d);
1755 		break;
1756 
1757 	case BIOCSETBUFMODE:
1758 		/*
1759 		 * Allow the buffering mode to be changed as long as we
1760 		 * haven't yet committed to a particular mode.  Our
1761 		 * definition of commitment, for now, is whether or not a
1762 		 * buffer has been allocated or an interface attached, since
1763 		 * that's the point where things get tricky.
1764 		 */
1765 		switch (*(u_int *)addr) {
1766 		case BPF_BUFMODE_BUFFER:
1767 			break;
1768 
1769 		case BPF_BUFMODE_ZBUF:
1770 			if (bpf_zerocopy_enable)
1771 				break;
1772 			/* FALLSTHROUGH */
1773 
1774 		default:
1775 			CURVNET_RESTORE();
1776 			return (EINVAL);
1777 		}
1778 
1779 		BPFD_LOCK(d);
1780 		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
1781 		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
1782 			BPFD_UNLOCK(d);
1783 			CURVNET_RESTORE();
1784 			return (EBUSY);
1785 		}
1786 		d->bd_bufmode = *(u_int *)addr;
1787 		BPFD_UNLOCK(d);
1788 		break;
1789 
1790 	case BIOCGETZMAX:
1791 		error = bpf_ioctl_getzmax(td, d, (size_t *)addr);
1792 		break;
1793 
1794 	case BIOCSETZBUF:
1795 		error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr);
1796 		break;
1797 
1798 	case BIOCROTZBUF:
1799 		error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr);
1800 		break;
1801 	}
1802 	CURVNET_RESTORE();
1803 	return (error);
1804 }
1805 
1806 /*
1807  * Set d's packet filter program to fp.  If this file already has a filter,
1808  * free it and replace it.  Returns EINVAL for bogus requests.
1809  *
1810  * Note we need global lock here to serialize bpf_setf() and bpf_setif() calls
1811  * since reading d->bd_bif can't be protected by d or interface lock due to
1812  * lock order.
1813  *
1814  * Additionally, we have to acquire interface write lock due to bpf_mtap() uses
1815  * interface read lock to read all filers.
1816  *
1817  */
1818 static int
1819 bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
1820 {
1821 #ifdef COMPAT_FREEBSD32
1822 	struct bpf_program fp_swab;
1823 	struct bpf_program32 *fp32;
1824 #endif
1825 	struct bpf_insn *fcode, *old;
1826 #ifdef BPF_JITTER
1827 	bpf_jit_filter *jfunc, *ofunc;
1828 #endif
1829 	size_t size;
1830 	u_int flen;
1831 	int need_upgrade;
1832 
1833 #ifdef COMPAT_FREEBSD32
1834 	switch (cmd) {
1835 	case BIOCSETF32:
1836 	case BIOCSETWF32:
1837 	case BIOCSETFNR32:
1838 		fp32 = (struct bpf_program32 *)fp;
1839 		fp_swab.bf_len = fp32->bf_len;
1840 		fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
1841 		fp = &fp_swab;
1842 		switch (cmd) {
1843 		case BIOCSETF32:
1844 			cmd = BIOCSETF;
1845 			break;
1846 		case BIOCSETWF32:
1847 			cmd = BIOCSETWF;
1848 			break;
1849 		}
1850 		break;
1851 	}
1852 #endif
1853 
1854 	fcode = NULL;
1855 #ifdef BPF_JITTER
1856 	jfunc = ofunc = NULL;
1857 #endif
1858 	need_upgrade = 0;
1859 
1860 	/*
1861 	 * Check new filter validness before acquiring any locks.
1862 	 * Allocate memory for new filter, if needed.
1863 	 */
1864 	flen = fp->bf_len;
1865 	if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0))
1866 		return (EINVAL);
1867 	size = flen * sizeof(*fp->bf_insns);
1868 	if (size > 0) {
1869 		/* We're setting up new filter.  Copy and check actual data. */
1870 		fcode = malloc(size, M_BPF, M_WAITOK);
1871 		if (copyin(fp->bf_insns, fcode, size) != 0 ||
1872 		    !bpf_validate(fcode, flen)) {
1873 			free(fcode, M_BPF);
1874 			return (EINVAL);
1875 		}
1876 #ifdef BPF_JITTER
1877 		/* Filter is copied inside fcode and is perfectly valid. */
1878 		jfunc = bpf_jitter(fcode, flen);
1879 #endif
1880 	}
1881 
1882 	BPF_LOCK();
1883 
1884 	/*
1885 	 * Set up new filter.
1886 	 * Protect filter change by interface lock.
1887 	 * Additionally, we are protected by global lock here.
1888 	 */
1889 	if (d->bd_bif != NULL)
1890 		BPFIF_WLOCK(d->bd_bif);
1891 	BPFD_LOCK(d);
1892 	if (cmd == BIOCSETWF) {
1893 		old = d->bd_wfilter;
1894 		d->bd_wfilter = fcode;
1895 	} else {
1896 		old = d->bd_rfilter;
1897 		d->bd_rfilter = fcode;
1898 #ifdef BPF_JITTER
1899 		ofunc = d->bd_bfilter;
1900 		d->bd_bfilter = jfunc;
1901 #endif
1902 		if (cmd == BIOCSETF)
1903 			reset_d(d);
1904 
1905 		need_upgrade = bpf_check_upgrade(cmd, d, fcode, flen);
1906 	}
1907 	BPFD_UNLOCK(d);
1908 	if (d->bd_bif != NULL)
1909 		BPFIF_WUNLOCK(d->bd_bif);
1910 	if (old != NULL)
1911 		free(old, M_BPF);
1912 #ifdef BPF_JITTER
1913 	if (ofunc != NULL)
1914 		bpf_destroy_jit_filter(ofunc);
1915 #endif
1916 
1917 	/* Move d to active readers list. */
1918 	if (need_upgrade != 0)
1919 		bpf_upgraded(d);
1920 
1921 	BPF_UNLOCK();
1922 	return (0);
1923 }
1924 
1925 /*
1926  * Detach a file from its current interface (if attached at all) and attach
1927  * to the interface indicated by the name stored in ifr.
1928  * Return an errno or 0.
1929  */
1930 static int
1931 bpf_setif(struct bpf_d *d, struct ifreq *ifr)
1932 {
1933 	struct bpf_if *bp;
1934 	struct ifnet *theywant;
1935 
1936 	BPF_LOCK_ASSERT();
1937 
1938 	theywant = ifunit(ifr->ifr_name);
1939 	if (theywant == NULL || theywant->if_bpf == NULL)
1940 		return (ENXIO);
1941 
1942 	bp = theywant->if_bpf;
1943 
1944 	/* Check if interface is not being detached from BPF */
1945 	BPFIF_RLOCK(bp);
1946 	if (bp->bif_flags & BPFIF_FLAG_DYING) {
1947 		BPFIF_RUNLOCK(bp);
1948 		return (ENXIO);
1949 	}
1950 	BPFIF_RUNLOCK(bp);
1951 
1952 	/*
1953 	 * At this point, we expect the buffer is already allocated.  If not,
1954 	 * return an error.
1955 	 */
1956 	switch (d->bd_bufmode) {
1957 	case BPF_BUFMODE_BUFFER:
1958 	case BPF_BUFMODE_ZBUF:
1959 		if (d->bd_sbuf == NULL)
1960 			return (EINVAL);
1961 		break;
1962 
1963 	default:
1964 		panic("bpf_setif: bufmode %d", d->bd_bufmode);
1965 	}
1966 	if (bp != d->bd_bif)
1967 		bpf_attachd(d, bp);
1968 	BPFD_LOCK(d);
1969 	reset_d(d);
1970 	BPFD_UNLOCK(d);
1971 	return (0);
1972 }
1973 
1974 /*
1975  * Support for select() and poll() system calls
1976  *
1977  * Return true iff the specific operation will not block indefinitely.
1978  * Otherwise, return false but make a note that a selwakeup() must be done.
1979  */
1980 static int
1981 bpfpoll(struct cdev *dev, int events, struct thread *td)
1982 {
1983 	struct bpf_d *d;
1984 	int revents;
1985 
1986 	if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL)
1987 		return (events &
1988 		    (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
1989 
1990 	/*
1991 	 * Refresh PID associated with this descriptor.
1992 	 */
1993 	revents = events & (POLLOUT | POLLWRNORM);
1994 	BPFD_LOCK(d);
1995 	BPF_PID_REFRESH(d, td);
1996 	if (events & (POLLIN | POLLRDNORM)) {
1997 		if (bpf_ready(d))
1998 			revents |= events & (POLLIN | POLLRDNORM);
1999 		else {
2000 			selrecord(td, &d->bd_sel);
2001 			/* Start the read timeout if necessary. */
2002 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2003 				callout_reset(&d->bd_callout, d->bd_rtout,
2004 				    bpf_timed_out, d);
2005 				d->bd_state = BPF_WAITING;
2006 			}
2007 		}
2008 	}
2009 	BPFD_UNLOCK(d);
2010 	return (revents);
2011 }
2012 
2013 /*
2014  * Support for kevent() system call.  Register EVFILT_READ filters and
2015  * reject all others.
2016  */
2017 int
2018 bpfkqfilter(struct cdev *dev, struct knote *kn)
2019 {
2020 	struct bpf_d *d;
2021 
2022 	if (devfs_get_cdevpriv((void **)&d) != 0 ||
2023 	    kn->kn_filter != EVFILT_READ)
2024 		return (1);
2025 
2026 	/*
2027 	 * Refresh PID associated with this descriptor.
2028 	 */
2029 	BPFD_LOCK(d);
2030 	BPF_PID_REFRESH_CUR(d);
2031 	kn->kn_fop = &bpfread_filtops;
2032 	kn->kn_hook = d;
2033 	knlist_add(&d->bd_sel.si_note, kn, 1);
2034 	BPFD_UNLOCK(d);
2035 
2036 	return (0);
2037 }
2038 
2039 static void
2040 filt_bpfdetach(struct knote *kn)
2041 {
2042 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2043 
2044 	knlist_remove(&d->bd_sel.si_note, kn, 0);
2045 }
2046 
2047 static int
2048 filt_bpfread(struct knote *kn, long hint)
2049 {
2050 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2051 	int ready;
2052 
2053 	BPFD_LOCK_ASSERT(d);
2054 	ready = bpf_ready(d);
2055 	if (ready) {
2056 		kn->kn_data = d->bd_slen;
2057 		/*
2058 		 * Ignore the hold buffer if it is being copied to user space.
2059 		 */
2060 		if (!d->bd_hbuf_in_use && d->bd_hbuf)
2061 			kn->kn_data += d->bd_hlen;
2062 	} else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2063 		callout_reset(&d->bd_callout, d->bd_rtout,
2064 		    bpf_timed_out, d);
2065 		d->bd_state = BPF_WAITING;
2066 	}
2067 
2068 	return (ready);
2069 }
2070 
/*
 * Timestamp quality levels returned by bpf_ts_quality() and bpf_gettime().
 * The tap routines compare them numerically and fetch a new timestamp
 * only when the one already obtained ranks lower than what a descriptor
 * asks for.  BPF_TSTAMP_EXTERN is reported when the timestamp was taken
 * from an mbuf tag.
 */
#define	BPF_TSTAMP_NONE		0
#define	BPF_TSTAMP_FAST		1
#define	BPF_TSTAMP_NORMAL	2
#define	BPF_TSTAMP_EXTERN	3
2075 
2076 static int
2077 bpf_ts_quality(int tstype)
2078 {
2079 
2080 	if (tstype == BPF_T_NONE)
2081 		return (BPF_TSTAMP_NONE);
2082 	if ((tstype & BPF_T_FAST) != 0)
2083 		return (BPF_TSTAMP_FAST);
2084 
2085 	return (BPF_TSTAMP_NORMAL);
2086 }
2087 
2088 static int
2089 bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m)
2090 {
2091 	struct m_tag *tag;
2092 	int quality;
2093 
2094 	quality = bpf_ts_quality(tstype);
2095 	if (quality == BPF_TSTAMP_NONE)
2096 		return (quality);
2097 
2098 	if (m != NULL) {
2099 		tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL);
2100 		if (tag != NULL) {
2101 			*bt = *(struct bintime *)(tag + 1);
2102 			return (BPF_TSTAMP_EXTERN);
2103 		}
2104 	}
2105 	if (quality == BPF_TSTAMP_NORMAL)
2106 		binuptime(bt);
2107 	else
2108 		getbinuptime(bt);
2109 
2110 	return (quality);
2111 }
2112 
2113 /*
2114  * Incoming linkage from device drivers.  Process the packet pkt, of length
2115  * pktlen, which is stored in a contiguous buffer.  The packet is parsed
2116  * by each process' filter, and if accepted, stashed into the corresponding
2117  * buffer.
2118  */
2119 void
2120 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
2121 {
2122 	struct bintime bt;
2123 	struct bpf_d *d;
2124 #ifdef BPF_JITTER
2125 	bpf_jit_filter *bf;
2126 #endif
2127 	u_int slen;
2128 	int gottime;
2129 
2130 	gottime = BPF_TSTAMP_NONE;
2131 
2132 	BPFIF_RLOCK(bp);
2133 
2134 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2135 		/*
2136 		 * We are not using any locks for d here because:
2137 		 * 1) any filter change is protected by interface
2138 		 * write lock
2139 		 * 2) destroying/detaching d is protected by interface
2140 		 * write lock, too
2141 		 */
2142 
2143 		/* XXX: Do not protect counter for the sake of performance. */
2144 		++d->bd_rcount;
2145 		/*
2146 		 * NB: We dont call BPF_CHECK_DIRECTION() here since there is no
2147 		 * way for the caller to indiciate to us whether this packet
2148 		 * is inbound or outbound.  In the bpf_mtap() routines, we use
2149 		 * the interface pointers on the mbuf to figure it out.
2150 		 */
2151 #ifdef BPF_JITTER
2152 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
2153 		if (bf != NULL)
2154 			slen = (*(bf->func))(pkt, pktlen, pktlen);
2155 		else
2156 #endif
2157 		slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
2158 		if (slen != 0) {
2159 			/*
2160 			 * Filter matches. Let's to acquire write lock.
2161 			 */
2162 			BPFD_LOCK(d);
2163 
2164 			d->bd_fcount++;
2165 			if (gottime < bpf_ts_quality(d->bd_tstamp))
2166 				gottime = bpf_gettime(&bt, d->bd_tstamp, NULL);
2167 #ifdef MAC
2168 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
2169 #endif
2170 				catchpacket(d, pkt, pktlen, slen,
2171 				    bpf_append_bytes, &bt);
2172 			BPFD_UNLOCK(d);
2173 		}
2174 	}
2175 	BPFIF_RUNLOCK(bp);
2176 }
2177 
/*
 * Evaluates to true when descriptor d must skip a packet because of its
 * direction setting: a BPF_D_IN descriptor skips packets whose receive
 * interface r differs from the tapped interface i, and a BPF_D_OUT
 * descriptor skips packets whose receive interface equals it.
 */
#define	BPF_CHECK_DIRECTION(d, r, i)				\
	    (((d)->bd_direction == BPF_D_IN && (r) != (i)) ||	\
	    ((d)->bd_direction == BPF_D_OUT && (r) == (i)))
2181 
2182 /*
2183  * Incoming linkage from device drivers, when packet is in an mbuf chain.
2184  * Locking model is explained in bpf_tap().
2185  */
2186 void
2187 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
2188 {
2189 	struct bintime bt;
2190 	struct bpf_d *d;
2191 #ifdef BPF_JITTER
2192 	bpf_jit_filter *bf;
2193 #endif
2194 	u_int pktlen, slen;
2195 	int gottime;
2196 
2197 	/* Skip outgoing duplicate packets. */
2198 	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
2199 		m->m_flags &= ~M_PROMISC;
2200 		return;
2201 	}
2202 
2203 	pktlen = m_length(m, NULL);
2204 	gottime = BPF_TSTAMP_NONE;
2205 
2206 	BPFIF_RLOCK(bp);
2207 
2208 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2209 		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
2210 			continue;
2211 		++d->bd_rcount;
2212 #ifdef BPF_JITTER
2213 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
2214 		/* XXX We cannot handle multiple mbufs. */
2215 		if (bf != NULL && m->m_next == NULL)
2216 			slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen);
2217 		else
2218 #endif
2219 		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
2220 		if (slen != 0) {
2221 			BPFD_LOCK(d);
2222 
2223 			d->bd_fcount++;
2224 			if (gottime < bpf_ts_quality(d->bd_tstamp))
2225 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
2226 #ifdef MAC
2227 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
2228 #endif
2229 				catchpacket(d, (u_char *)m, pktlen, slen,
2230 				    bpf_append_mbuf, &bt);
2231 			BPFD_UNLOCK(d);
2232 		}
2233 	}
2234 	BPFIF_RUNLOCK(bp);
2235 }
2236 
2237 /*
2238  * Incoming linkage from device drivers, when packet is in
2239  * an mbuf chain and to be prepended by a contiguous header.
2240  */
2241 void
2242 bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
2243 {
2244 	struct bintime bt;
2245 	struct mbuf mb;
2246 	struct bpf_d *d;
2247 	u_int pktlen, slen;
2248 	int gottime;
2249 
2250 	/* Skip outgoing duplicate packets. */
2251 	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
2252 		m->m_flags &= ~M_PROMISC;
2253 		return;
2254 	}
2255 
2256 	pktlen = m_length(m, NULL);
2257 	/*
2258 	 * Craft on-stack mbuf suitable for passing to bpf_filter.
2259 	 * Note that we cut corners here; we only setup what's
2260 	 * absolutely needed--this mbuf should never go anywhere else.
2261 	 */
2262 	mb.m_next = m;
2263 	mb.m_data = data;
2264 	mb.m_len = dlen;
2265 	pktlen += dlen;
2266 
2267 	gottime = BPF_TSTAMP_NONE;
2268 
2269 	BPFIF_RLOCK(bp);
2270 
2271 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2272 		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
2273 			continue;
2274 		++d->bd_rcount;
2275 		slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
2276 		if (slen != 0) {
2277 			BPFD_LOCK(d);
2278 
2279 			d->bd_fcount++;
2280 			if (gottime < bpf_ts_quality(d->bd_tstamp))
2281 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
2282 #ifdef MAC
2283 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
2284 #endif
2285 				catchpacket(d, (u_char *)&mb, pktlen, slen,
2286 				    bpf_append_mbuf, &bt);
2287 			BPFD_UNLOCK(d);
2288 		}
2289 	}
2290 	BPFIF_RUNLOCK(bp);
2291 }
2292 
/* Undefine the helper macros that were defined above for the tap routines. */
#undef	BPF_CHECK_DIRECTION

#undef	BPF_TSTAMP_NONE
#undef	BPF_TSTAMP_FAST
#undef	BPF_TSTAMP_NORMAL
#undef	BPF_TSTAMP_EXTERN
2299 
2300 static int
2301 bpf_hdrlen(struct bpf_d *d)
2302 {
2303 	int hdrlen;
2304 
2305 	hdrlen = d->bd_bif->bif_hdrlen;
2306 #ifndef BURN_BRIDGES
2307 	if (d->bd_tstamp == BPF_T_NONE ||
2308 	    BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME)
2309 #ifdef COMPAT_FREEBSD32
2310 		if (d->bd_compat32)
2311 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32);
2312 		else
2313 #endif
2314 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr);
2315 	else
2316 #endif
2317 		hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr);
2318 #ifdef COMPAT_FREEBSD32
2319 	if (d->bd_compat32)
2320 		hdrlen = BPF_WORDALIGN32(hdrlen);
2321 	else
2322 #endif
2323 		hdrlen = BPF_WORDALIGN(hdrlen);
2324 
2325 	return (hdrlen - d->bd_bif->bif_hdrlen);
2326 }
2327 
2328 static void
2329 bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype)
2330 {
2331 	struct bintime bt2, boottimebin;
2332 	struct timeval tsm;
2333 	struct timespec tsn;
2334 
2335 	if ((tstype & BPF_T_MONOTONIC) == 0) {
2336 		bt2 = *bt;
2337 		getboottimebin(&boottimebin);
2338 		bintime_add(&bt2, &boottimebin);
2339 		bt = &bt2;
2340 	}
2341 	switch (BPF_T_FORMAT(tstype)) {
2342 	case BPF_T_MICROTIME:
2343 		bintime2timeval(bt, &tsm);
2344 		ts->bt_sec = tsm.tv_sec;
2345 		ts->bt_frac = tsm.tv_usec;
2346 		break;
2347 	case BPF_T_NANOTIME:
2348 		bintime2timespec(bt, &tsn);
2349 		ts->bt_sec = tsn.tv_sec;
2350 		ts->bt_frac = tsn.tv_nsec;
2351 		break;
2352 	case BPF_T_BINTIME:
2353 		ts->bt_sec = bt->sec;
2354 		ts->bt_frac = bt->frac;
2355 		break;
2356 	}
2357 }
2358 
2359 /*
2360  * Move the packet data from interface memory (pkt) into the
2361  * store buffer.  "cpfn" is the routine called to do the actual data
2362  * transfer.  bcopy is passed in to copy contiguous chunks, while
2363  * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
2364  * pkt is really an mbuf.
2365  */
2366 static void
2367 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
2368     void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
2369     struct bintime *bt)
2370 {
2371 	struct bpf_xhdr hdr;
2372 #ifndef BURN_BRIDGES
2373 	struct bpf_hdr hdr_old;
2374 #ifdef COMPAT_FREEBSD32
2375 	struct bpf_hdr32 hdr32_old;
2376 #endif
2377 #endif
2378 	int caplen, curlen, hdrlen, totlen;
2379 	int do_wakeup = 0;
2380 	int do_timestamp;
2381 	int tstype;
2382 
2383 	BPFD_LOCK_ASSERT(d);
2384 
2385 	/*
2386 	 * Detect whether user space has released a buffer back to us, and if
2387 	 * so, move it from being a hold buffer to a free buffer.  This may
2388 	 * not be the best place to do it (for example, we might only want to
2389 	 * run this check if we need the space), but for now it's a reliable
2390 	 * spot to do it.
2391 	 */
2392 	if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
2393 		d->bd_fbuf = d->bd_hbuf;
2394 		d->bd_hbuf = NULL;
2395 		d->bd_hlen = 0;
2396 		bpf_buf_reclaimed(d);
2397 	}
2398 
2399 	/*
2400 	 * Figure out how many bytes to move.  If the packet is
2401 	 * greater or equal to the snapshot length, transfer that
2402 	 * much.  Otherwise, transfer the whole packet (unless
2403 	 * we hit the buffer size limit).
2404 	 */
2405 	hdrlen = bpf_hdrlen(d);
2406 	totlen = hdrlen + min(snaplen, pktlen);
2407 	if (totlen > d->bd_bufsize)
2408 		totlen = d->bd_bufsize;
2409 
2410 	/*
2411 	 * Round up the end of the previous packet to the next longword.
2412 	 *
2413 	 * Drop the packet if there's no room and no hope of room
2414 	 * If the packet would overflow the storage buffer or the storage
2415 	 * buffer is considered immutable by the buffer model, try to rotate
2416 	 * the buffer and wakeup pending processes.
2417 	 */
2418 #ifdef COMPAT_FREEBSD32
2419 	if (d->bd_compat32)
2420 		curlen = BPF_WORDALIGN32(d->bd_slen);
2421 	else
2422 #endif
2423 		curlen = BPF_WORDALIGN(d->bd_slen);
2424 	if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
2425 		if (d->bd_fbuf == NULL) {
2426 			/*
2427 			 * There's no room in the store buffer, and no
2428 			 * prospect of room, so drop the packet.  Notify the
2429 			 * buffer model.
2430 			 */
2431 			bpf_buffull(d);
2432 			++d->bd_dcount;
2433 			return;
2434 		}
2435 		KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use"));
2436 		ROTATE_BUFFERS(d);
2437 		do_wakeup = 1;
2438 		curlen = 0;
2439 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
2440 		/*
2441 		 * Immediate mode is set, or the read timeout has already
2442 		 * expired during a select call.  A packet arrived, so the
2443 		 * reader should be woken up.
2444 		 */
2445 		do_wakeup = 1;
2446 	caplen = totlen - hdrlen;
2447 	tstype = d->bd_tstamp;
2448 	do_timestamp = tstype != BPF_T_NONE;
2449 #ifndef BURN_BRIDGES
2450 	if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) {
2451 		struct bpf_ts ts;
2452 		if (do_timestamp)
2453 			bpf_bintime2ts(bt, &ts, tstype);
2454 #ifdef COMPAT_FREEBSD32
2455 		if (d->bd_compat32) {
2456 			bzero(&hdr32_old, sizeof(hdr32_old));
2457 			if (do_timestamp) {
2458 				hdr32_old.bh_tstamp.tv_sec = ts.bt_sec;
2459 				hdr32_old.bh_tstamp.tv_usec = ts.bt_frac;
2460 			}
2461 			hdr32_old.bh_datalen = pktlen;
2462 			hdr32_old.bh_hdrlen = hdrlen;
2463 			hdr32_old.bh_caplen = caplen;
2464 			bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old,
2465 			    sizeof(hdr32_old));
2466 			goto copy;
2467 		}
2468 #endif
2469 		bzero(&hdr_old, sizeof(hdr_old));
2470 		if (do_timestamp) {
2471 			hdr_old.bh_tstamp.tv_sec = ts.bt_sec;
2472 			hdr_old.bh_tstamp.tv_usec = ts.bt_frac;
2473 		}
2474 		hdr_old.bh_datalen = pktlen;
2475 		hdr_old.bh_hdrlen = hdrlen;
2476 		hdr_old.bh_caplen = caplen;
2477 		bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old,
2478 		    sizeof(hdr_old));
2479 		goto copy;
2480 	}
2481 #endif
2482 
2483 	/*
2484 	 * Append the bpf header.  Note we append the actual header size, but
2485 	 * move forward the length of the header plus padding.
2486 	 */
2487 	bzero(&hdr, sizeof(hdr));
2488 	if (do_timestamp)
2489 		bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype);
2490 	hdr.bh_datalen = pktlen;
2491 	hdr.bh_hdrlen = hdrlen;
2492 	hdr.bh_caplen = caplen;
2493 	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
2494 
2495 	/*
2496 	 * Copy the packet data into the store buffer and update its length.
2497 	 */
2498 #ifndef BURN_BRIDGES
2499 copy:
2500 #endif
2501 	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen);
2502 	d->bd_slen = curlen + totlen;
2503 
2504 	if (do_wakeup)
2505 		bpf_wakeup(d);
2506 }
2507 
2508 /*
2509  * Free buffers currently in use by a descriptor.
2510  * Called on close.
2511  */
2512 static void
2513 bpf_freed(struct bpf_d *d)
2514 {
2515 
2516 	/*
2517 	 * We don't need to lock out interrupts since this descriptor has
2518 	 * been detached from its interface and it yet hasn't been marked
2519 	 * free.
2520 	 */
2521 	bpf_free(d);
2522 	if (d->bd_rfilter != NULL) {
2523 		free((caddr_t)d->bd_rfilter, M_BPF);
2524 #ifdef BPF_JITTER
2525 		if (d->bd_bfilter != NULL)
2526 			bpf_destroy_jit_filter(d->bd_bfilter);
2527 #endif
2528 	}
2529 	if (d->bd_wfilter != NULL)
2530 		free((caddr_t)d->bd_wfilter, M_BPF);
2531 	mtx_destroy(&d->bd_lock);
2532 }
2533 
2534 /*
2535  * Attach an interface to bpf.  dlt is the link layer type; hdrlen is the
2536  * fixed size of the link header (variable length headers not yet supported).
2537  */
2538 void
2539 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
2540 {
2541 
2542 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
2543 }
2544 
2545 /*
2546  * Attach an interface to bpf.  ifp is a pointer to the structure
2547  * defining the interface to be attached, dlt is the link layer type,
2548  * and hdrlen is the fixed size of the link header (variable length
2549  * headers are not yet supporrted).
2550  */
2551 void
2552 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
2553 {
2554 	struct bpf_if *bp;
2555 
2556 	bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO);
2557 	if (bp == NULL)
2558 		panic("bpfattach");
2559 
2560 	LIST_INIT(&bp->bif_dlist);
2561 	LIST_INIT(&bp->bif_wlist);
2562 	bp->bif_ifp = ifp;
2563 	bp->bif_dlt = dlt;
2564 	rw_init(&bp->bif_lock, "bpf interface lock");
2565 	KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized"));
2566 	*driverp = bp;
2567 
2568 	BPF_LOCK();
2569 	LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
2570 	BPF_UNLOCK();
2571 
2572 	bp->bif_hdrlen = hdrlen;
2573 
2574 	if (bootverbose && IS_DEFAULT_VNET(curvnet))
2575 		if_printf(ifp, "bpf attached\n");
2576 }
2577 
2578 #ifdef VIMAGE
2579 /*
2580  * When moving interfaces between vnet instances we need a way to
2581  * query the dlt and hdrlen before detach so we can re-attch the if_bpf
2582  * after the vmove.  We unfortunately have no device driver infrastructure
2583  * to query the interface for these values after creation/attach, thus
2584  * add this as a workaround.
2585  */
2586 int
2587 bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen)
2588 {
2589 
2590 	if (bp == NULL)
2591 		return (ENXIO);
2592 	if (bif_dlt == NULL && bif_hdrlen == NULL)
2593 		return (0);
2594 
2595 	if (bif_dlt != NULL)
2596 		*bif_dlt = bp->bif_dlt;
2597 	if (bif_hdrlen != NULL)
2598 		*bif_hdrlen = bp->bif_hdrlen;
2599 
2600 	return (0);
2601 }
2602 #endif
2603 
2604 /*
2605  * Detach bpf from an interface. This involves detaching each descriptor
2606  * associated with the interface. Notify each descriptor as it's detached
2607  * so that any sleepers wake up and get ENXIO.
2608  */
2609 void
2610 bpfdetach(struct ifnet *ifp)
2611 {
2612 	struct bpf_if	*bp, *bp_temp;
2613 	struct bpf_d	*d;
2614 	int ndetached;
2615 
2616 	ndetached = 0;
2617 
2618 	BPF_LOCK();
2619 	/* Find all bpf_if struct's which reference ifp and detach them. */
2620 	LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) {
2621 		if (ifp != bp->bif_ifp)
2622 			continue;
2623 
2624 		LIST_REMOVE(bp, bif_next);
2625 		/* Add to to-be-freed list */
2626 		LIST_INSERT_HEAD(&bpf_freelist, bp, bif_next);
2627 
2628 		ndetached++;
2629 		/*
2630 		 * Delay freeing bp till interface is detached
2631 		 * and all routes through this interface are removed.
2632 		 * Mark bp as detached to restrict new consumers.
2633 		 */
2634 		BPFIF_WLOCK(bp);
2635 		bp->bif_flags |= BPFIF_FLAG_DYING;
2636 		BPFIF_WUNLOCK(bp);
2637 
2638 		CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p",
2639 		    __func__, bp->bif_dlt, bp, ifp);
2640 
2641 		/* Free common descriptors */
2642 		while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) {
2643 			bpf_detachd_locked(d);
2644 			BPFD_LOCK(d);
2645 			bpf_wakeup(d);
2646 			BPFD_UNLOCK(d);
2647 		}
2648 
2649 		/* Free writer-only descriptors */
2650 		while ((d = LIST_FIRST(&bp->bif_wlist)) != NULL) {
2651 			bpf_detachd_locked(d);
2652 			BPFD_LOCK(d);
2653 			bpf_wakeup(d);
2654 			BPFD_UNLOCK(d);
2655 		}
2656 	}
2657 	BPF_UNLOCK();
2658 
2659 #ifdef INVARIANTS
2660 	if (ndetached == 0)
2661 		printf("bpfdetach: %s was not attached\n", ifp->if_xname);
2662 #endif
2663 }
2664 
2665 /*
2666  * Interface departure handler.
2667  * Note departure event does not guarantee interface is going down.
2668  * Interface renaming is currently done via departure/arrival event set.
2669  *
2670  * Departure handled is called after all routes pointing to
2671  * given interface are removed and interface is in down state
2672  * restricting any packets to be sent/received. We assume it is now safe
2673  * to free data allocated by BPF.
2674  */
2675 static void
2676 bpf_ifdetach(void *arg __unused, struct ifnet *ifp)
2677 {
2678 	struct bpf_if *bp, *bp_temp;
2679 	int nmatched = 0;
2680 
2681 	/* Ignore ifnet renaming. */
2682 	if (ifp->if_flags & IFF_RENAMING)
2683 		return;
2684 
2685 	BPF_LOCK();
2686 	/*
2687 	 * Find matching entries in free list.
2688 	 * Nothing should be found if bpfdetach() was not called.
2689 	 */
2690 	LIST_FOREACH_SAFE(bp, &bpf_freelist, bif_next, bp_temp) {
2691 		if (ifp != bp->bif_ifp)
2692 			continue;
2693 
2694 		CTR3(KTR_NET, "%s: freeing BPF instance %p for interface %p",
2695 		    __func__, bp, ifp);
2696 
2697 		LIST_REMOVE(bp, bif_next);
2698 
2699 		rw_destroy(&bp->bif_lock);
2700 		free(bp, M_BPF);
2701 
2702 		nmatched++;
2703 	}
2704 	BPF_UNLOCK();
2705 
2706 	/*
2707 	 * Note that we cannot zero other pointers to
2708 	 * custom DLTs possibly used by given interface.
2709 	 */
2710 	if (nmatched != 0)
2711 		ifp->if_bpf = NULL;
2712 }
2713 
2714 /*
2715  * Get a list of available data link type of the interface.
2716  */
2717 static int
2718 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
2719 {
2720 	struct ifnet *ifp;
2721 	struct bpf_if *bp;
2722 	u_int *lst;
2723 	int error, n, n1;
2724 
2725 	BPF_LOCK_ASSERT();
2726 
2727 	ifp = d->bd_bif->bif_ifp;
2728 again:
2729 	n1 = 0;
2730 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2731 		if (bp->bif_ifp == ifp)
2732 			n1++;
2733 	}
2734 	if (bfl->bfl_list == NULL) {
2735 		bfl->bfl_len = n1;
2736 		return (0);
2737 	}
2738 	if (n1 > bfl->bfl_len)
2739 		return (ENOMEM);
2740 	BPF_UNLOCK();
2741 	lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK);
2742 	n = 0;
2743 	BPF_LOCK();
2744 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2745 		if (bp->bif_ifp != ifp)
2746 			continue;
2747 		if (n >= n1) {
2748 			free(lst, M_TEMP);
2749 			goto again;
2750 		}
2751 		lst[n] = bp->bif_dlt;
2752 		n++;
2753 	}
2754 	BPF_UNLOCK();
2755 	error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n);
2756 	free(lst, M_TEMP);
2757 	BPF_LOCK();
2758 	bfl->bfl_len = n;
2759 	return (error);
2760 }
2761 
2762 /*
2763  * Set the data link type of a BPF instance.
2764  */
2765 static int
2766 bpf_setdlt(struct bpf_d *d, u_int dlt)
2767 {
2768 	int error, opromisc;
2769 	struct ifnet *ifp;
2770 	struct bpf_if *bp;
2771 
2772 	BPF_LOCK_ASSERT();
2773 
2774 	if (d->bd_bif->bif_dlt == dlt)
2775 		return (0);
2776 	ifp = d->bd_bif->bif_ifp;
2777 
2778 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2779 		if (bp->bif_ifp == ifp && bp->bif_dlt == dlt)
2780 			break;
2781 	}
2782 
2783 	if (bp != NULL) {
2784 		opromisc = d->bd_promisc;
2785 		bpf_attachd(d, bp);
2786 		BPFD_LOCK(d);
2787 		reset_d(d);
2788 		BPFD_UNLOCK(d);
2789 		if (opromisc) {
2790 			error = ifpromisc(bp->bif_ifp, 1);
2791 			if (error)
2792 				if_printf(bp->bif_ifp,
2793 					"bpf_setdlt: ifpromisc failed (%d)\n",
2794 					error);
2795 			else
2796 				d->bd_promisc = 1;
2797 		}
2798 	}
2799 	return (bp == NULL ? EINVAL : 0);
2800 }
2801 
2802 static void
2803 bpf_drvinit(void *unused)
2804 {
2805 	struct cdev *dev;
2806 
2807 	mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF);
2808 	LIST_INIT(&bpf_iflist);
2809 	LIST_INIT(&bpf_freelist);
2810 
2811 	dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf");
2812 	/* For compatibility */
2813 	make_dev_alias(dev, "bpf0");
2814 
2815 	/* Register interface departure handler */
2816 	bpf_ifdetach_cookie = EVENTHANDLER_REGISTER(
2817 		    ifnet_departure_event, bpf_ifdetach, NULL,
2818 		    EVENTHANDLER_PRI_ANY);
2819 }
2820 
2821 /*
2822  * Zero out the various packet counters associated with all of the bpf
2823  * descriptors.  At some point, we will probably want to get a bit more
2824  * granular and allow the user to specify descriptors to be zeroed.
2825  */
2826 static void
2827 bpf_zero_counters(void)
2828 {
2829 	struct bpf_if *bp;
2830 	struct bpf_d *bd;
2831 
2832 	BPF_LOCK();
2833 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2834 		BPFIF_RLOCK(bp);
2835 		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2836 			BPFD_LOCK(bd);
2837 			bd->bd_rcount = 0;
2838 			bd->bd_dcount = 0;
2839 			bd->bd_fcount = 0;
2840 			bd->bd_wcount = 0;
2841 			bd->bd_wfcount = 0;
2842 			bd->bd_zcopy = 0;
2843 			BPFD_UNLOCK(bd);
2844 		}
2845 		BPFIF_RUNLOCK(bp);
2846 	}
2847 	BPF_UNLOCK();
2848 }
2849 
2850 /*
2851  * Fill filter statistics
2852  */
2853 static void
2854 bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
2855 {
2856 
2857 	bzero(d, sizeof(*d));
2858 	BPFD_LOCK_ASSERT(bd);
2859 	d->bd_structsize = sizeof(*d);
2860 	/* XXX: reading should be protected by global lock */
2861 	d->bd_immediate = bd->bd_immediate;
2862 	d->bd_promisc = bd->bd_promisc;
2863 	d->bd_hdrcmplt = bd->bd_hdrcmplt;
2864 	d->bd_direction = bd->bd_direction;
2865 	d->bd_feedback = bd->bd_feedback;
2866 	d->bd_async = bd->bd_async;
2867 	d->bd_rcount = bd->bd_rcount;
2868 	d->bd_dcount = bd->bd_dcount;
2869 	d->bd_fcount = bd->bd_fcount;
2870 	d->bd_sig = bd->bd_sig;
2871 	d->bd_slen = bd->bd_slen;
2872 	d->bd_hlen = bd->bd_hlen;
2873 	d->bd_bufsize = bd->bd_bufsize;
2874 	d->bd_pid = bd->bd_pid;
2875 	strlcpy(d->bd_ifname,
2876 	    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
2877 	d->bd_locked = bd->bd_locked;
2878 	d->bd_wcount = bd->bd_wcount;
2879 	d->bd_wdcount = bd->bd_wdcount;
2880 	d->bd_wfcount = bd->bd_wfcount;
2881 	d->bd_zcopy = bd->bd_zcopy;
2882 	d->bd_bufmode = bd->bd_bufmode;
2883 }
2884 
2885 /*
2886  * Handle `netstat -B' stats request
2887  */
2888 static int
2889 bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
2890 {
2891 	static const struct xbpf_d zerostats;
2892 	struct xbpf_d *xbdbuf, *xbd, tempstats;
2893 	int index, error;
2894 	struct bpf_if *bp;
2895 	struct bpf_d *bd;
2896 
2897 	/*
2898 	 * XXX This is not technically correct. It is possible for non
2899 	 * privileged users to open bpf devices. It would make sense
2900 	 * if the users who opened the devices were able to retrieve
2901 	 * the statistics for them, too.
2902 	 */
2903 	error = priv_check(req->td, PRIV_NET_BPF);
2904 	if (error)
2905 		return (error);
2906 	/*
2907 	 * Check to see if the user is requesting that the counters be
2908 	 * zeroed out.  Explicitly check that the supplied data is zeroed,
2909 	 * as we aren't allowing the user to set the counters currently.
2910 	 */
2911 	if (req->newptr != NULL) {
2912 		if (req->newlen != sizeof(tempstats))
2913 			return (EINVAL);
2914 		memset(&tempstats, 0, sizeof(tempstats));
2915 		error = SYSCTL_IN(req, &tempstats, sizeof(tempstats));
2916 		if (error)
2917 			return (error);
2918 		if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0)
2919 			return (EINVAL);
2920 		bpf_zero_counters();
2921 		return (0);
2922 	}
2923 	if (req->oldptr == NULL)
2924 		return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd)));
2925 	if (bpf_bpfd_cnt == 0)
2926 		return (SYSCTL_OUT(req, 0, 0));
2927 	xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK);
2928 	BPF_LOCK();
2929 	if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) {
2930 		BPF_UNLOCK();
2931 		free(xbdbuf, M_BPF);
2932 		return (ENOMEM);
2933 	}
2934 	index = 0;
2935 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2936 		BPFIF_RLOCK(bp);
2937 		/* Send writers-only first */
2938 		LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
2939 			xbd = &xbdbuf[index++];
2940 			BPFD_LOCK(bd);
2941 			bpfstats_fill_xbpf(xbd, bd);
2942 			BPFD_UNLOCK(bd);
2943 		}
2944 		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2945 			xbd = &xbdbuf[index++];
2946 			BPFD_LOCK(bd);
2947 			bpfstats_fill_xbpf(xbd, bd);
2948 			BPFD_UNLOCK(bd);
2949 		}
2950 		BPFIF_RUNLOCK(bp);
2951 	}
2952 	BPF_UNLOCK();
2953 	error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
2954 	free(xbdbuf, M_BPF);
2955 	return (error);
2956 }
2957 
/* Register bpf_drvinit() to run at SI_SUB_DRIVERS, SI_ORDER_MIDDLE. */
SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, bpf_drvinit, NULL);
2959 
2960 #else /* !DEV_BPF && !NETGRAPH_BPF */
2961 /*
2962  * NOP stubs to allow bpf-using drivers to load and function.
2963  *
2964  * A 'better' implementation would allow the core bpf functionality
2965  * to be loaded at runtime.
2966  */
2967 static struct bpf_if bp_null;
2968 
2969 void
2970 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
2971 {
2972 }
2973 
/* Stub: the tapped mbuf is ignored. */
void
bpf_mtap(struct bpf_if *bp, struct mbuf *m)
{

	(void)bp;
	(void)m;
}
2978 
2979 void
2980 bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
2981 {
2982 }
2983 
2984 void
2985 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
2986 {
2987 
2988 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
2989 }
2990 
2991 void
2992 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
2993 {
2994 
2995 	*driverp = &bp_null;
2996 }
2997 
/* Stub: nothing was attached, so there is nothing to detach. */
void
bpfdetach(struct ifnet *ifp)
{

	(void)ifp;
}
3002 
3003 u_int
3004 bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
3005 {
3006 	return -1;	/* "no filter" behaviour */
3007 }
3008 
/* Stub validator: every program is reported as invalid (returns 0). */
int
bpf_validate(const struct bpf_insn *f, int len)
{

	(void)f;
	(void)len;
	return (0);		/* false */
}
3014 
3015 #endif /* !DEV_BPF && !NETGRAPH_BPF */
3016 
3017 #ifdef DDB
3018 static void
3019 bpf_show_bpf_if(struct bpf_if *bpf_if)
3020 {
3021 
3022 	if (bpf_if == NULL)
3023 		return;
3024 	db_printf("%p:\n", bpf_if);
3025 #define	BPF_DB_PRINTF(f, e)	db_printf("   %s = " f "\n", #e, bpf_if->e);
3026 	/* bif_ext.bif_next */
3027 	/* bif_ext.bif_dlist */
3028 	BPF_DB_PRINTF("%#x", bif_dlt);
3029 	BPF_DB_PRINTF("%u", bif_hdrlen);
3030 	BPF_DB_PRINTF("%p", bif_ifp);
3031 	/* bif_lock */
3032 	/* bif_wlist */
3033 	BPF_DB_PRINTF("%#x", bif_flags);
3034 }
3035 
3036 DB_SHOW_COMMAND(bpf_if, db_show_bpf_if)
3037 {
3038 
3039 	if (!have_addr) {
3040 		db_printf("usage: show bpf_if <struct bpf_if *>\n");
3041 		return;
3042 	}
3043 
3044 	bpf_show_bpf_if((struct bpf_if *)addr);
3045 }
3046 #endif
3047