xref: /freebsd/sys/net/bpf.c (revision bd18fd57db1df29da1a3adf94d47924a977a29c2)
1 /*-
2  * Copyright (c) 1990, 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from the Stanford/CMU enet packet filter,
6  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
7  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
8  * Berkeley Laboratory.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include "opt_bpf.h"
41 #include "opt_compat.h"
42 #include "opt_netgraph.h"
43 
44 #include <sys/types.h>
45 #include <sys/param.h>
46 #include <sys/lock.h>
47 #include <sys/rwlock.h>
48 #include <sys/systm.h>
49 #include <sys/conf.h>
50 #include <sys/fcntl.h>
51 #include <sys/jail.h>
52 #include <sys/malloc.h>
53 #include <sys/mbuf.h>
54 #include <sys/time.h>
55 #include <sys/priv.h>
56 #include <sys/proc.h>
57 #include <sys/signalvar.h>
58 #include <sys/filio.h>
59 #include <sys/sockio.h>
60 #include <sys/ttycom.h>
61 #include <sys/uio.h>
62 
63 #include <sys/event.h>
64 #include <sys/file.h>
65 #include <sys/poll.h>
66 #include <sys/proc.h>
67 
68 #include <sys/socket.h>
69 
70 #include <net/if.h>
71 #include <net/if_var.h>
72 #include <net/if_dl.h>
73 #include <net/bpf.h>
74 #include <net/bpf_buffer.h>
75 #ifdef BPF_JITTER
76 #include <net/bpf_jitter.h>
77 #endif
78 #include <net/bpf_zerocopy.h>
79 #include <net/bpfdesc.h>
80 #include <net/route.h>
81 #include <net/vnet.h>
82 
83 #include <netinet/in.h>
84 #include <netinet/if_ether.h>
85 #include <sys/kernel.h>
86 #include <sys/sysctl.h>
87 
88 #include <net80211/ieee80211_freebsd.h>
89 
90 #include <security/mac/mac_framework.h>
91 
92 MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
93 
94 struct bpf_if {
95 #define	bif_next	bif_ext.bif_next
96 #define	bif_dlist	bif_ext.bif_dlist
97 	struct bpf_if_ext bif_ext;	/* public members */
98 	u_int		bif_dlt;	/* link layer type */
99 	u_int		bif_hdrlen;	/* length of link header */
100 	struct ifnet	*bif_ifp;	/* corresponding interface */
101 	struct rwlock	bif_lock;	/* interface lock */
102 	LIST_HEAD(, bpf_d) bif_wlist;	/* writer-only list */
103 	int		bif_flags;	/* Interface flags */
104 };
105 
106 CTASSERT(offsetof(struct bpf_if, bif_ext) == 0);
107 
108 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
109 
/* Priority value handed to the sleep/wakeup calls in this file. */
#define	PRINET	26			/* interruptible */

/*
 * Number of bytes of a BPF header structure `type' up to and including its
 * bh_hdrlen member; padding that follows bh_hdrlen is not counted.
 */
#define	SIZEOF_BPF_HDR(type)	\
    (sizeof(((type *)0)->bh_hdrlen) + offsetof(type, bh_hdrlen))
114 
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <compat/freebsd32/freebsd32.h>

/* Alignment unit for 32-bit consumers and the matching round-up helper. */
#define	BPF_ALIGNMENT32	sizeof(int32_t)
#define	BPF_WORDALIGN32(x)	\
    (((x) + (BPF_ALIGNMENT32 - 1)) & ~(BPF_ALIGNMENT32 - 1))

#ifndef BURN_BRIDGES
/*
 * 32-bit version of the structure prepended to each packet.  We use this
 * header instead of the standard one for 32-bit streams.  We mark a stream
 * as 32-bit the first time we see a 32-bit compat ioctl request.
 */
struct bpf_hdr32 {
	struct timeval32 bh_tstamp;	/* time stamp */
	uint32_t	bh_caplen;	/* length of captured portion */
	uint32_t	bh_datalen;	/* original length of packet */
	uint16_t	bh_hdrlen;	/* length of bpf header (this struct
					   plus alignment padding) */
};
#endif /* !BURN_BRIDGES */

/*
 * 32-bit layout of the filter program argument used by the *32 set-filter
 * ioctls below.
 * NOTE(review): bf_insns presumably carries a 32-bit user pointer to the
 * instruction array -- confirm against the ioctl handler.
 */
struct bpf_program32 {
	u_int		bf_len;
	uint32_t	bf_insns;
};

/*
 * 32-bit layout of the DLT list argument used by BIOCGDLTLIST32.
 * NOTE(review): bfl_list presumably carries a 32-bit user pointer --
 * confirm against the ioctl handler.
 */
struct bpf_dltlist32 {
	u_int		bfl_len;
	u_int		bfl_list;
};

/* ioctl request codes built on the 32-bit argument layouts. */
#define	BIOCSETF32	_IOW('B', 103, struct bpf_program32)
#define	BIOCSRTIMEOUT32	_IOW('B', 109, struct timeval32)
#define	BIOCGRTIMEOUT32	_IOR('B', 110, struct timeval32)
#define	BIOCGDLTLIST32	_IOWR('B', 121, struct bpf_dltlist32)
#define	BIOCSETWF32	_IOW('B', 123, struct bpf_program32)
#define	BIOCSETFNR32	_IOW('B', 130, struct bpf_program32)
#endif /* COMPAT_FREEBSD32 */
153 
/*
 * bpf_iflist is a list of BPF interface structures, each corresponding to a
 * specific DLT.  The same network interface might have several BPF interface
 * structures registered by different layers in the stack (e.g., 802.11
 * frames, ethernet frames, etc.).
 */
160 static LIST_HEAD(, bpf_if)	bpf_iflist, bpf_freelist;
161 static struct mtx	bpf_mtx;		/* bpf global lock */
162 static int		bpf_bpfd_cnt;
163 
164 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
165 static void	bpf_detachd(struct bpf_d *);
166 static void	bpf_detachd_locked(struct bpf_d *);
167 static void	bpf_freed(struct bpf_d *);
168 static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
169 		    struct sockaddr *, int *, struct bpf_d *);
170 static int	bpf_setif(struct bpf_d *, struct ifreq *);
171 static void	bpf_timed_out(void *);
172 static __inline void
173 		bpf_wakeup(struct bpf_d *);
174 static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
175 		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
176 		    struct bintime *);
177 static void	reset_d(struct bpf_d *);
178 static int	bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
179 static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
180 static int	bpf_setdlt(struct bpf_d *, u_int);
181 static void	filt_bpfdetach(struct knote *);
182 static int	filt_bpfread(struct knote *, long);
183 static void	bpf_drvinit(void *);
184 static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
185 
186 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
187 int bpf_maxinsns = BPF_MAXINSNS;
188 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
189     &bpf_maxinsns, 0, "Maximum bpf program instructions");
190 static int bpf_zerocopy_enable = 0;
191 SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
192     &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
193 static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW,
194     bpf_stats_sysctl, "bpf statistics portal");
195 
196 static VNET_DEFINE(int, bpf_optimize_writers) = 0;
197 #define	V_bpf_optimize_writers VNET(bpf_optimize_writers)
198 SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RW,
199     &VNET_NAME(bpf_optimize_writers), 0,
200     "Do not send packets until BPF program is set");
201 
202 static	d_open_t	bpfopen;
203 static	d_read_t	bpfread;
204 static	d_write_t	bpfwrite;
205 static	d_ioctl_t	bpfioctl;
206 static	d_poll_t	bpfpoll;
207 static	d_kqfilter_t	bpfkqfilter;
208 
209 static struct cdevsw bpf_cdevsw = {
210 	.d_version =	D_VERSION,
211 	.d_open =	bpfopen,
212 	.d_read =	bpfread,
213 	.d_write =	bpfwrite,
214 	.d_ioctl =	bpfioctl,
215 	.d_poll =	bpfpoll,
216 	.d_name =	"bpf",
217 	.d_kqfilter =	bpfkqfilter,
218 };
219 
220 static struct filterops bpfread_filtops = {
221 	.f_isfd = 1,
222 	.f_detach = filt_bpfdetach,
223 	.f_event = filt_bpfread,
224 };
225 
226 eventhandler_tag	bpf_ifdetach_cookie = NULL;
227 
228 /*
229  * LOCKING MODEL USED BY BPF:
230  * Locks:
231  * 1) global lock (BPF_LOCK). Mutex, used to protect interface addition/removal,
232  * some global counters and every bpf_if reference.
233  * 2) Interface lock. Rwlock, used to protect list of BPF descriptors and their filters.
234  * 3) Descriptor lock. Mutex, used to protect BPF buffers and various structure fields
235  *   used by bpf_mtap code.
236  *
237  * Lock order:
238  *
239  * Global lock, interface lock, descriptor lock
240  *
241  * We have to acquire interface lock before descriptor main lock due to BPF_MTAP[2]
242  * working model. In many places (like bpf_detachd) we start with BPF descriptor
243  * (and we need to at least rlock it to get reliable interface pointer). This
244  * gives us potential LOR. As a result, we use global lock to protect from bpf_if
245  * change in every such place.
246  *
 * Changing d->bd_bif is protected by 1) global lock, 2) interface lock and
 * 3) descriptor lock.
249  * Reading bd_bif can be protected by any of these locks, typically global lock.
250  *
251  * Changing read/write BPF filter is protected by the same three locks,
252  * the same applies for reading.
253  *
254  * Sleeping in global lock is not allowed due to bpfdetach() using it.
255  */
256 
/*
 * Wrapper functions for various buffering methods.  If the set of buffer
 * modes expands, we will probably want to introduce a switch data structure
 * similar to protosw, etc.
 */
262 static void
263 bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
264     u_int len)
265 {
266 
267 	BPFD_LOCK_ASSERT(d);
268 
269 	switch (d->bd_bufmode) {
270 	case BPF_BUFMODE_BUFFER:
271 		return (bpf_buffer_append_bytes(d, buf, offset, src, len));
272 
273 	case BPF_BUFMODE_ZBUF:
274 		d->bd_zcopy++;
275 		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
276 
277 	default:
278 		panic("bpf_buf_append_bytes");
279 	}
280 }
281 
282 static void
283 bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
284     u_int len)
285 {
286 
287 	BPFD_LOCK_ASSERT(d);
288 
289 	switch (d->bd_bufmode) {
290 	case BPF_BUFMODE_BUFFER:
291 		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
292 
293 	case BPF_BUFMODE_ZBUF:
294 		d->bd_zcopy++;
295 		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
296 
297 	default:
298 		panic("bpf_buf_append_mbuf");
299 	}
300 }
301 
302 /*
303  * This function gets called when the free buffer is re-assigned.
304  */
305 static void
306 bpf_buf_reclaimed(struct bpf_d *d)
307 {
308 
309 	BPFD_LOCK_ASSERT(d);
310 
311 	switch (d->bd_bufmode) {
312 	case BPF_BUFMODE_BUFFER:
313 		return;
314 
315 	case BPF_BUFMODE_ZBUF:
316 		bpf_zerocopy_buf_reclaimed(d);
317 		return;
318 
319 	default:
320 		panic("bpf_buf_reclaimed");
321 	}
322 }
323 
324 /*
325  * If the buffer mechanism has a way to decide that a held buffer can be made
326  * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
327  * returned if the buffer can be discarded, (0) is returned if it cannot.
328  */
329 static int
330 bpf_canfreebuf(struct bpf_d *d)
331 {
332 
333 	BPFD_LOCK_ASSERT(d);
334 
335 	switch (d->bd_bufmode) {
336 	case BPF_BUFMODE_ZBUF:
337 		return (bpf_zerocopy_canfreebuf(d));
338 	}
339 	return (0);
340 }
341 
342 /*
343  * Allow the buffer model to indicate that the current store buffer is
344  * immutable, regardless of the appearance of space.  Return (1) if the
345  * buffer is writable, and (0) if not.
346  */
347 static int
348 bpf_canwritebuf(struct bpf_d *d)
349 {
350 	BPFD_LOCK_ASSERT(d);
351 
352 	switch (d->bd_bufmode) {
353 	case BPF_BUFMODE_ZBUF:
354 		return (bpf_zerocopy_canwritebuf(d));
355 	}
356 	return (1);
357 }
358 
359 /*
360  * Notify buffer model that an attempt to write to the store buffer has
361  * resulted in a dropped packet, in which case the buffer may be considered
362  * full.
363  */
364 static void
365 bpf_buffull(struct bpf_d *d)
366 {
367 
368 	BPFD_LOCK_ASSERT(d);
369 
370 	switch (d->bd_bufmode) {
371 	case BPF_BUFMODE_ZBUF:
372 		bpf_zerocopy_buffull(d);
373 		break;
374 	}
375 }
376 
377 /*
378  * Notify the buffer model that a buffer has moved into the hold position.
379  */
380 void
381 bpf_bufheld(struct bpf_d *d)
382 {
383 
384 	BPFD_LOCK_ASSERT(d);
385 
386 	switch (d->bd_bufmode) {
387 	case BPF_BUFMODE_ZBUF:
388 		bpf_zerocopy_bufheld(d);
389 		break;
390 	}
391 }
392 
393 static void
394 bpf_free(struct bpf_d *d)
395 {
396 
397 	switch (d->bd_bufmode) {
398 	case BPF_BUFMODE_BUFFER:
399 		return (bpf_buffer_free(d));
400 
401 	case BPF_BUFMODE_ZBUF:
402 		return (bpf_zerocopy_free(d));
403 
404 	default:
405 		panic("bpf_buf_free");
406 	}
407 }
408 
409 static int
410 bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
411 {
412 
413 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
414 		return (EOPNOTSUPP);
415 	return (bpf_buffer_uiomove(d, buf, len, uio));
416 }
417 
418 static int
419 bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
420 {
421 
422 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
423 		return (EOPNOTSUPP);
424 	return (bpf_buffer_ioctl_sblen(d, i));
425 }
426 
427 static int
428 bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
429 {
430 
431 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
432 		return (EOPNOTSUPP);
433 	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
434 }
435 
436 static int
437 bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
438 {
439 
440 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
441 		return (EOPNOTSUPP);
442 	return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
443 }
444 
445 static int
446 bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
447 {
448 
449 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
450 		return (EOPNOTSUPP);
451 	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
452 }
453 
454 /*
455  * General BPF functions.
456  */
457 static int
458 bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
459     struct sockaddr *sockp, int *hdrlen, struct bpf_d *d)
460 {
461 	const struct ieee80211_bpf_params *p;
462 	struct ether_header *eh;
463 	struct mbuf *m;
464 	int error;
465 	int len;
466 	int hlen;
467 	int slen;
468 
469 	/*
470 	 * Build a sockaddr based on the data link layer type.
471 	 * We do this at this level because the ethernet header
472 	 * is copied directly into the data field of the sockaddr.
473 	 * In the case of SLIP, there is no header and the packet
474 	 * is forwarded as is.
475 	 * Also, we are careful to leave room at the front of the mbuf
476 	 * for the link level header.
477 	 */
478 	switch (linktype) {
479 
480 	case DLT_SLIP:
481 		sockp->sa_family = AF_INET;
482 		hlen = 0;
483 		break;
484 
485 	case DLT_EN10MB:
486 		sockp->sa_family = AF_UNSPEC;
487 		/* XXX Would MAXLINKHDR be better? */
488 		hlen = ETHER_HDR_LEN;
489 		break;
490 
491 	case DLT_FDDI:
492 		sockp->sa_family = AF_IMPLINK;
493 		hlen = 0;
494 		break;
495 
496 	case DLT_RAW:
497 		sockp->sa_family = AF_UNSPEC;
498 		hlen = 0;
499 		break;
500 
501 	case DLT_NULL:
502 		/*
503 		 * null interface types require a 4 byte pseudo header which
504 		 * corresponds to the address family of the packet.
505 		 */
506 		sockp->sa_family = AF_UNSPEC;
507 		hlen = 4;
508 		break;
509 
510 	case DLT_ATM_RFC1483:
511 		/*
512 		 * en atm driver requires 4-byte atm pseudo header.
513 		 * though it isn't standard, vpi:vci needs to be
514 		 * specified anyway.
515 		 */
516 		sockp->sa_family = AF_UNSPEC;
517 		hlen = 12;	/* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
518 		break;
519 
520 	case DLT_PPP:
521 		sockp->sa_family = AF_UNSPEC;
522 		hlen = 4;	/* This should match PPP_HDRLEN */
523 		break;
524 
525 	case DLT_IEEE802_11:		/* IEEE 802.11 wireless */
526 		sockp->sa_family = AF_IEEE80211;
527 		hlen = 0;
528 		break;
529 
530 	case DLT_IEEE802_11_RADIO:	/* IEEE 802.11 wireless w/ phy params */
531 		sockp->sa_family = AF_IEEE80211;
532 		sockp->sa_len = 12;	/* XXX != 0 */
533 		hlen = sizeof(struct ieee80211_bpf_params);
534 		break;
535 
536 	default:
537 		return (EIO);
538 	}
539 
540 	len = uio->uio_resid;
541 	if (len < hlen || len - hlen > ifp->if_mtu)
542 		return (EMSGSIZE);
543 
544 	m = m_get2(len, M_WAITOK, MT_DATA, M_PKTHDR);
545 	if (m == NULL)
546 		return (EIO);
547 	m->m_pkthdr.len = m->m_len = len;
548 	*mp = m;
549 
550 	error = uiomove(mtod(m, u_char *), len, uio);
551 	if (error)
552 		goto bad;
553 
554 	slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len);
555 	if (slen == 0) {
556 		error = EPERM;
557 		goto bad;
558 	}
559 
560 	/* Check for multicast destination */
561 	switch (linktype) {
562 	case DLT_EN10MB:
563 		eh = mtod(m, struct ether_header *);
564 		if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
565 			if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost,
566 			    ETHER_ADDR_LEN) == 0)
567 				m->m_flags |= M_BCAST;
568 			else
569 				m->m_flags |= M_MCAST;
570 		}
571 		if (d->bd_hdrcmplt == 0) {
572 			memcpy(eh->ether_shost, IF_LLADDR(ifp),
573 			    sizeof(eh->ether_shost));
574 		}
575 		break;
576 	}
577 
578 	/*
579 	 * Make room for link header, and copy it to sockaddr
580 	 */
581 	if (hlen != 0) {
582 		if (sockp->sa_family == AF_IEEE80211) {
583 			/*
584 			 * Collect true length from the parameter header
585 			 * NB: sockp is known to be zero'd so if we do a
586 			 *     short copy unspecified parameters will be
587 			 *     zero.
588 			 * NB: packet may not be aligned after stripping
589 			 *     bpf params
590 			 * XXX check ibp_vers
591 			 */
592 			p = mtod(m, const struct ieee80211_bpf_params *);
593 			hlen = p->ibp_len;
594 			if (hlen > sizeof(sockp->sa_data)) {
595 				error = EINVAL;
596 				goto bad;
597 			}
598 		}
599 		bcopy(mtod(m, const void *), sockp->sa_data, hlen);
600 	}
601 	*hdrlen = hlen;
602 
603 	return (0);
604 bad:
605 	m_freem(m);
606 	return (error);
607 }
608 
609 /*
610  * Attach file to the bpf interface, i.e. make d listen on bp.
611  */
612 static void
613 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
614 {
615 	int op_w;
616 
617 	BPF_LOCK_ASSERT();
618 
619 	/*
620 	 * Save sysctl value to protect from sysctl change
621 	 * between reads
622 	 */
623 	op_w = V_bpf_optimize_writers || d->bd_writer;
624 
625 	if (d->bd_bif != NULL)
626 		bpf_detachd_locked(d);
627 	/*
628 	 * Point d at bp, and add d to the interface's list.
629 	 * Since there are many applications using BPF for
630 	 * sending raw packets only (dhcpd, cdpd are good examples)
631 	 * we can delay adding d to the list of active listeners until
632 	 * some filter is configured.
633 	 */
634 
635 	BPFIF_WLOCK(bp);
636 	BPFD_LOCK(d);
637 
638 	d->bd_bif = bp;
639 
640 	if (op_w != 0) {
641 		/* Add to writers-only list */
642 		LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
643 		/*
644 		 * We decrement bd_writer on every filter set operation.
645 		 * First BIOCSETF is done by pcap_open_live() to set up
646 		 * snap length. After that appliation usually sets its own filter
647 		 */
648 		d->bd_writer = 2;
649 	} else
650 		LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
651 
652 	BPFD_UNLOCK(d);
653 	BPFIF_WUNLOCK(bp);
654 
655 	bpf_bpfd_cnt++;
656 
657 	CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
658 	    __func__, d->bd_pid, d->bd_writer ? "writer" : "active");
659 
660 	if (op_w == 0)
661 		EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
662 }
663 
664 /*
665  * Check if we need to upgrade our descriptor @d from write-only mode.
666  */
667 static int
668 bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen)
669 {
670 	int is_snap, need_upgrade;
671 
672 	/*
673 	 * Check if we've already upgraded or new filter is empty.
674 	 */
675 	if (d->bd_writer == 0 || fcode == NULL)
676 		return (0);
677 
678 	need_upgrade = 0;
679 
680 	/*
681 	 * Check if cmd looks like snaplen setting from
682 	 * pcap_bpf.c:pcap_open_live().
683 	 * Note we're not checking .k value here:
684 	 * while pcap_open_live() definitely sets to to non-zero value,
685 	 * we'd prefer to treat k=0 (deny ALL) case the same way: e.g.
686 	 * do not consider upgrading immediately
687 	 */
688 	if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K))
689 		is_snap = 1;
690 	else
691 		is_snap = 0;
692 
693 	if (is_snap == 0) {
694 		/*
695 		 * We're setting first filter and it doesn't look like
696 		 * setting snaplen.  We're probably using bpf directly.
697 		 * Upgrade immediately.
698 		 */
699 		need_upgrade = 1;
700 	} else {
701 		/*
702 		 * Do not require upgrade by first BIOCSETF
703 		 * (used to set snaplen) by pcap_open_live().
704 		 */
705 
706 		if (--d->bd_writer == 0) {
707 			/*
708 			 * First snaplen filter has already
709 			 * been set. This is probably catch-all
710 			 * filter
711 			 */
712 			need_upgrade = 1;
713 		}
714 	}
715 
716 	CTR5(KTR_NET,
717 	    "%s: filter function set by pid %d, "
718 	    "bd_writer counter %d, snap %d upgrade %d",
719 	    __func__, d->bd_pid, d->bd_writer,
720 	    is_snap, need_upgrade);
721 
722 	return (need_upgrade);
723 }
724 
725 /*
726  * Add d to the list of active bp filters.
727  * Requires bpf_attachd() to be called before.
728  */
729 static void
730 bpf_upgraded(struct bpf_d *d)
731 {
732 	struct bpf_if *bp;
733 
734 	BPF_LOCK_ASSERT();
735 
736 	bp = d->bd_bif;
737 
738 	/*
739 	 * Filter can be set several times without specifying interface.
740 	 * Mark d as reader and exit.
741 	 */
742 	if (bp == NULL) {
743 		BPFD_LOCK(d);
744 		d->bd_writer = 0;
745 		BPFD_UNLOCK(d);
746 		return;
747 	}
748 
749 	BPFIF_WLOCK(bp);
750 	BPFD_LOCK(d);
751 
752 	/* Remove from writers-only list */
753 	LIST_REMOVE(d, bd_next);
754 	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
755 	/* Mark d as reader */
756 	d->bd_writer = 0;
757 
758 	BPFD_UNLOCK(d);
759 	BPFIF_WUNLOCK(bp);
760 
761 	CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid);
762 
763 	EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
764 }
765 
/*
 * Detach a file from its interface.  Convenience wrapper that takes the
 * global BPF lock around bpf_detachd_locked().
 */
static void
bpf_detachd(struct bpf_d *d)
{

	BPF_LOCK();
	bpf_detachd_locked(d);
	BPF_UNLOCK();
}
776 
777 static void
778 bpf_detachd_locked(struct bpf_d *d)
779 {
780 	int error;
781 	struct bpf_if *bp;
782 	struct ifnet *ifp;
783 
784 	CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
785 
786 	BPF_LOCK_ASSERT();
787 
788 	/* Check if descriptor is attached */
789 	if ((bp = d->bd_bif) == NULL)
790 		return;
791 
792 	BPFIF_WLOCK(bp);
793 	BPFD_LOCK(d);
794 
795 	/* Save bd_writer value */
796 	error = d->bd_writer;
797 
798 	/*
799 	 * Remove d from the interface's descriptor list.
800 	 */
801 	LIST_REMOVE(d, bd_next);
802 
803 	ifp = bp->bif_ifp;
804 	d->bd_bif = NULL;
805 	BPFD_UNLOCK(d);
806 	BPFIF_WUNLOCK(bp);
807 
808 	bpf_bpfd_cnt--;
809 
810 	/* Call event handler iff d is attached */
811 	if (error == 0)
812 		EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
813 
814 	/*
815 	 * Check if this descriptor had requested promiscuous mode.
816 	 * If so, turn it off.
817 	 */
818 	if (d->bd_promisc) {
819 		d->bd_promisc = 0;
820 		CURVNET_SET(ifp->if_vnet);
821 		error = ifpromisc(ifp, 0);
822 		CURVNET_RESTORE();
823 		if (error != 0 && error != ENXIO) {
824 			/*
825 			 * ENXIO can happen if a pccard is unplugged
826 			 * Something is really wrong if we were able to put
827 			 * the driver into promiscuous mode, but can't
828 			 * take it out.
829 			 */
830 			if_printf(bp->bif_ifp,
831 				"bpf_detach: ifpromisc failed (%d)\n", error);
832 		}
833 	}
834 }
835 
836 /*
837  * Close the descriptor by detaching it from its interface,
838  * deallocating its buffers, and marking it free.
839  */
840 static void
841 bpf_dtor(void *data)
842 {
843 	struct bpf_d *d = data;
844 
845 	BPFD_LOCK(d);
846 	if (d->bd_state == BPF_WAITING)
847 		callout_stop(&d->bd_callout);
848 	d->bd_state = BPF_IDLE;
849 	BPFD_UNLOCK(d);
850 	funsetown(&d->bd_sigio);
851 	bpf_detachd(d);
852 #ifdef MAC
853 	mac_bpfdesc_destroy(d);
854 #endif /* MAC */
855 	seldrain(&d->bd_sel);
856 	knlist_destroy(&d->bd_sel.si_note);
857 	callout_drain(&d->bd_callout);
858 	bpf_freed(d);
859 	free(d, M_BPF);
860 }
861 
862 /*
863  * Open ethernet device.  Returns ENXIO for illegal minor device number,
864  * EBUSY if file is open by another process.
865  */
866 /* ARGSUSED */
867 static	int
868 bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
869 {
870 	struct bpf_d *d;
871 	int error;
872 
873 	d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
874 	error = devfs_set_cdevpriv(d, bpf_dtor);
875 	if (error != 0) {
876 		free(d, M_BPF);
877 		return (error);
878 	}
879 
880 	/*
881 	 * For historical reasons, perform a one-time initialization call to
882 	 * the buffer routines, even though we're not yet committed to a
883 	 * particular buffer method.
884 	 */
885 	bpf_buffer_init(d);
886 	if ((flags & FREAD) == 0)
887 		d->bd_writer = 2;
888 	d->bd_hbuf_in_use = 0;
889 	d->bd_bufmode = BPF_BUFMODE_BUFFER;
890 	d->bd_sig = SIGIO;
891 	d->bd_direction = BPF_D_INOUT;
892 	BPF_PID_REFRESH(d, td);
893 #ifdef MAC
894 	mac_bpfdesc_init(d);
895 	mac_bpfdesc_create(td->td_ucred, d);
896 #endif
897 	mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF);
898 	callout_init_mtx(&d->bd_callout, &d->bd_lock, 0);
899 	knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock);
900 
901 	return (0);
902 }
903 
904 /*
905  *  bpfread - read next chunk of packets from buffers
906  */
907 static	int
908 bpfread(struct cdev *dev, struct uio *uio, int ioflag)
909 {
910 	struct bpf_d *d;
911 	int error;
912 	int non_block;
913 	int timed_out;
914 
915 	error = devfs_get_cdevpriv((void **)&d);
916 	if (error != 0)
917 		return (error);
918 
919 	/*
920 	 * Restrict application to use a buffer the same size as
921 	 * as kernel buffers.
922 	 */
923 	if (uio->uio_resid != d->bd_bufsize)
924 		return (EINVAL);
925 
926 	non_block = ((ioflag & O_NONBLOCK) != 0);
927 
928 	BPFD_LOCK(d);
929 	BPF_PID_REFRESH_CUR(d);
930 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
931 		BPFD_UNLOCK(d);
932 		return (EOPNOTSUPP);
933 	}
934 	if (d->bd_state == BPF_WAITING)
935 		callout_stop(&d->bd_callout);
936 	timed_out = (d->bd_state == BPF_TIMED_OUT);
937 	d->bd_state = BPF_IDLE;
938 	while (d->bd_hbuf_in_use) {
939 		error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
940 		    PRINET|PCATCH, "bd_hbuf", 0);
941 		if (error != 0) {
942 			BPFD_UNLOCK(d);
943 			return (error);
944 		}
945 	}
946 	/*
947 	 * If the hold buffer is empty, then do a timed sleep, which
948 	 * ends when the timeout expires or when enough packets
949 	 * have arrived to fill the store buffer.
950 	 */
951 	while (d->bd_hbuf == NULL) {
952 		if (d->bd_slen != 0) {
953 			/*
954 			 * A packet(s) either arrived since the previous
955 			 * read or arrived while we were asleep.
956 			 */
957 			if (d->bd_immediate || non_block || timed_out) {
958 				/*
959 				 * Rotate the buffers and return what's here
960 				 * if we are in immediate mode, non-blocking
961 				 * flag is set, or this descriptor timed out.
962 				 */
963 				ROTATE_BUFFERS(d);
964 				break;
965 			}
966 		}
967 
968 		/*
969 		 * No data is available, check to see if the bpf device
970 		 * is still pointed at a real interface.  If not, return
971 		 * ENXIO so that the userland process knows to rebind
972 		 * it before using it again.
973 		 */
974 		if (d->bd_bif == NULL) {
975 			BPFD_UNLOCK(d);
976 			return (ENXIO);
977 		}
978 
979 		if (non_block) {
980 			BPFD_UNLOCK(d);
981 			return (EWOULDBLOCK);
982 		}
983 		error = msleep(d, &d->bd_lock, PRINET|PCATCH,
984 		     "bpf", d->bd_rtout);
985 		if (error == EINTR || error == ERESTART) {
986 			BPFD_UNLOCK(d);
987 			return (error);
988 		}
989 		if (error == EWOULDBLOCK) {
990 			/*
991 			 * On a timeout, return what's in the buffer,
992 			 * which may be nothing.  If there is something
993 			 * in the store buffer, we can rotate the buffers.
994 			 */
995 			if (d->bd_hbuf)
996 				/*
997 				 * We filled up the buffer in between
998 				 * getting the timeout and arriving
999 				 * here, so we don't need to rotate.
1000 				 */
1001 				break;
1002 
1003 			if (d->bd_slen == 0) {
1004 				BPFD_UNLOCK(d);
1005 				return (0);
1006 			}
1007 			ROTATE_BUFFERS(d);
1008 			break;
1009 		}
1010 	}
1011 	/*
1012 	 * At this point, we know we have something in the hold slot.
1013 	 */
1014 	d->bd_hbuf_in_use = 1;
1015 	BPFD_UNLOCK(d);
1016 
1017 	/*
1018 	 * Move data from hold buffer into user space.
1019 	 * We know the entire buffer is transferred since
1020 	 * we checked above that the read buffer is bpf_bufsize bytes.
1021   	 *
1022 	 * We do not have to worry about simultaneous reads because
1023 	 * we waited for sole access to the hold buffer above.
1024 	 */
1025 	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
1026 
1027 	BPFD_LOCK(d);
1028 	KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf"));
1029 	d->bd_fbuf = d->bd_hbuf;
1030 	d->bd_hbuf = NULL;
1031 	d->bd_hlen = 0;
1032 	bpf_buf_reclaimed(d);
1033 	d->bd_hbuf_in_use = 0;
1034 	wakeup(&d->bd_hbuf_in_use);
1035 	BPFD_UNLOCK(d);
1036 
1037 	return (error);
1038 }
1039 
1040 /*
1041  * If there are processes sleeping on this descriptor, wake them up.
1042  */
1043 static __inline void
1044 bpf_wakeup(struct bpf_d *d)
1045 {
1046 
1047 	BPFD_LOCK_ASSERT(d);
1048 	if (d->bd_state == BPF_WAITING) {
1049 		callout_stop(&d->bd_callout);
1050 		d->bd_state = BPF_IDLE;
1051 	}
1052 	wakeup(d);
1053 	if (d->bd_async && d->bd_sig && d->bd_sigio)
1054 		pgsigio(&d->bd_sigio, d->bd_sig, 0);
1055 
1056 	selwakeuppri(&d->bd_sel, PRINET);
1057 	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
1058 }
1059 
1060 static void
1061 bpf_timed_out(void *arg)
1062 {
1063 	struct bpf_d *d = (struct bpf_d *)arg;
1064 
1065 	BPFD_LOCK_ASSERT(d);
1066 
1067 	if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout))
1068 		return;
1069 	if (d->bd_state == BPF_WAITING) {
1070 		d->bd_state = BPF_TIMED_OUT;
1071 		if (d->bd_slen != 0)
1072 			bpf_wakeup(d);
1073 	}
1074 }
1075 
1076 static int
1077 bpf_ready(struct bpf_d *d)
1078 {
1079 
1080 	BPFD_LOCK_ASSERT(d);
1081 
1082 	if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
1083 		return (1);
1084 	if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
1085 	    d->bd_slen != 0)
1086 		return (1);
1087 	return (0);
1088 }
1089 
1090 static int
1091 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
1092 {
1093 	struct bpf_d *d;
1094 	struct ifnet *ifp;
1095 	struct mbuf *m, *mc;
1096 	struct sockaddr dst;
1097 	struct route ro;
1098 	int error, hlen;
1099 
1100 	error = devfs_get_cdevpriv((void **)&d);
1101 	if (error != 0)
1102 		return (error);
1103 
1104 	BPF_PID_REFRESH_CUR(d);
1105 	d->bd_wcount++;
1106 	/* XXX: locking required */
1107 	if (d->bd_bif == NULL) {
1108 		d->bd_wdcount++;
1109 		return (ENXIO);
1110 	}
1111 
1112 	ifp = d->bd_bif->bif_ifp;
1113 
1114 	if ((ifp->if_flags & IFF_UP) == 0) {
1115 		d->bd_wdcount++;
1116 		return (ENETDOWN);
1117 	}
1118 
1119 	if (uio->uio_resid == 0) {
1120 		d->bd_wdcount++;
1121 		return (0);
1122 	}
1123 
1124 	bzero(&dst, sizeof(dst));
1125 	m = NULL;
1126 	hlen = 0;
1127 	/* XXX: bpf_movein() can sleep */
1128 	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
1129 	    &m, &dst, &hlen, d);
1130 	if (error) {
1131 		d->bd_wdcount++;
1132 		return (error);
1133 	}
1134 	d->bd_wfcount++;
1135 	if (d->bd_hdrcmplt)
1136 		dst.sa_family = pseudo_AF_HDRCMPLT;
1137 
1138 	if (d->bd_feedback) {
1139 		mc = m_dup(m, M_NOWAIT);
1140 		if (mc != NULL)
1141 			mc->m_pkthdr.rcvif = ifp;
1142 		/* Set M_PROMISC for outgoing packets to be discarded. */
1143 		if (d->bd_direction == BPF_D_INOUT)
1144 			m->m_flags |= M_PROMISC;
1145 	} else
1146 		mc = NULL;
1147 
1148 	m->m_pkthdr.len -= hlen;
1149 	m->m_len -= hlen;
1150 	m->m_data += hlen;	/* XXX */
1151 
1152 	CURVNET_SET(ifp->if_vnet);
1153 #ifdef MAC
1154 	BPFD_LOCK(d);
1155 	mac_bpfdesc_create_mbuf(d, m);
1156 	if (mc != NULL)
1157 		mac_bpfdesc_create_mbuf(d, mc);
1158 	BPFD_UNLOCK(d);
1159 #endif
1160 
1161 	bzero(&ro, sizeof(ro));
1162 	if (hlen != 0) {
1163 		ro.ro_prepend = (u_char *)&dst.sa_data;
1164 		ro.ro_plen = hlen;
1165 		ro.ro_flags = RT_HAS_HEADER;
1166 	}
1167 
1168 	error = (*ifp->if_output)(ifp, m, &dst, &ro);
1169 	if (error)
1170 		d->bd_wdcount++;
1171 
1172 	if (mc != NULL) {
1173 		if (error == 0)
1174 			(*ifp->if_input)(ifp, mc);
1175 		else
1176 			m_freem(mc);
1177 	}
1178 	CURVNET_RESTORE();
1179 
1180 	return (error);
1181 }
1182 
1183 /*
1184  * Reset a descriptor by flushing its packet buffer and clearing the receive
1185  * and drop counts.  This is doable for kernel-only buffers, but with
1186  * zero-copy buffers, we can't write to (or rotate) buffers that are
1187  * currently owned by userspace.  It would be nice if we could encapsulate
1188  * this logic in the buffer code rather than here.
1189  */
1190 static void
1191 reset_d(struct bpf_d *d)
1192 {
1193 
1194 	BPFD_LOCK_ASSERT(d);
1195 
1196 	while (d->bd_hbuf_in_use)
1197 		mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET,
1198 		    "bd_hbuf", 0);
1199 	if ((d->bd_hbuf != NULL) &&
1200 	    (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) {
1201 		/* Free the hold buffer. */
1202 		d->bd_fbuf = d->bd_hbuf;
1203 		d->bd_hbuf = NULL;
1204 		d->bd_hlen = 0;
1205 		bpf_buf_reclaimed(d);
1206 	}
1207 	if (bpf_canwritebuf(d))
1208 		d->bd_slen = 0;
1209 	d->bd_rcount = 0;
1210 	d->bd_dcount = 0;
1211 	d->bd_fcount = 0;
1212 	d->bd_wcount = 0;
1213 	d->bd_wfcount = 0;
1214 	d->bd_wdcount = 0;
1215 	d->bd_zcopy = 0;
1216 }
1217 
1218 /*
1219  *  FIONREAD		Check for read packet available.
1220  *  BIOCGBLEN		Get buffer len [for read()].
1221  *  BIOCSETF		Set read filter.
1222  *  BIOCSETFNR		Set read filter without resetting descriptor.
1223  *  BIOCSETWF		Set write filter.
1224  *  BIOCFLUSH		Flush read packet buffer.
1225  *  BIOCPROMISC		Put interface into promiscuous mode.
1226  *  BIOCGDLT		Get link layer type.
1227  *  BIOCGETIF		Get interface name.
1228  *  BIOCSETIF		Set interface.
1229  *  BIOCSRTIMEOUT	Set read timeout.
1230  *  BIOCGRTIMEOUT	Get read timeout.
1231  *  BIOCGSTATS		Get packet stats.
1232  *  BIOCIMMEDIATE	Set immediate mode.
1233  *  BIOCVERSION		Get filter language version.
1234  *  BIOCGHDRCMPLT	Get "header already complete" flag
1235  *  BIOCSHDRCMPLT	Set "header already complete" flag
1236  *  BIOCGDIRECTION	Get packet direction flag
1237  *  BIOCSDIRECTION	Set packet direction flag
1238  *  BIOCGTSTAMP		Get time stamp format and resolution.
1239  *  BIOCSTSTAMP		Set time stamp format and resolution.
1240  *  BIOCLOCK		Set "locked" flag
1241  *  BIOCFEEDBACK	Set packet feedback mode.
1242  *  BIOCSETZBUF		Set current zero-copy buffer locations.
1243  *  BIOCGETZMAX		Get maximum zero-copy buffer size.
1244  *  BIOCROTZBUF		Force rotation of zero-copy buffer
1245  *  BIOCSETBUFMODE	Set buffer mode.
1246  *  BIOCGETBUFMODE	Get current buffer mode.
1247  */
1248 /* ARGSUSED */
1249 static	int
1250 bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
1251     struct thread *td)
1252 {
1253 	struct bpf_d *d;
1254 	int error;
1255 
1256 	error = devfs_get_cdevpriv((void **)&d);
1257 	if (error != 0)
1258 		return (error);
1259 
1260 	/*
1261 	 * Refresh PID associated with this descriptor.
1262 	 */
1263 	BPFD_LOCK(d);
1264 	BPF_PID_REFRESH(d, td);
1265 	if (d->bd_state == BPF_WAITING)
1266 		callout_stop(&d->bd_callout);
1267 	d->bd_state = BPF_IDLE;
1268 	BPFD_UNLOCK(d);
1269 
1270 	if (d->bd_locked == 1) {
1271 		switch (cmd) {
1272 		case BIOCGBLEN:
1273 		case BIOCFLUSH:
1274 		case BIOCGDLT:
1275 		case BIOCGDLTLIST:
1276 #ifdef COMPAT_FREEBSD32
1277 		case BIOCGDLTLIST32:
1278 #endif
1279 		case BIOCGETIF:
1280 		case BIOCGRTIMEOUT:
1281 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
1282 		case BIOCGRTIMEOUT32:
1283 #endif
1284 		case BIOCGSTATS:
1285 		case BIOCVERSION:
1286 		case BIOCGRSIG:
1287 		case BIOCGHDRCMPLT:
1288 		case BIOCSTSTAMP:
1289 		case BIOCFEEDBACK:
1290 		case FIONREAD:
1291 		case BIOCLOCK:
1292 		case BIOCSRTIMEOUT:
1293 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
1294 		case BIOCSRTIMEOUT32:
1295 #endif
1296 		case BIOCIMMEDIATE:
1297 		case TIOCGPGRP:
1298 		case BIOCROTZBUF:
1299 			break;
1300 		default:
1301 			return (EPERM);
1302 		}
1303 	}
1304 #ifdef COMPAT_FREEBSD32
1305 	/*
1306 	 * If we see a 32-bit compat ioctl, mark the stream as 32-bit so
1307 	 * that it will get 32-bit packet headers.
1308 	 */
1309 	switch (cmd) {
1310 	case BIOCSETF32:
1311 	case BIOCSETFNR32:
1312 	case BIOCSETWF32:
1313 	case BIOCGDLTLIST32:
1314 	case BIOCGRTIMEOUT32:
1315 	case BIOCSRTIMEOUT32:
1316 		BPFD_LOCK(d);
1317 		d->bd_compat32 = 1;
1318 		BPFD_UNLOCK(d);
1319 	}
1320 #endif
1321 
1322 	CURVNET_SET(TD_TO_VNET(td));
1323 	switch (cmd) {
1324 
1325 	default:
1326 		error = EINVAL;
1327 		break;
1328 
1329 	/*
1330 	 * Check for read packet available.
1331 	 */
1332 	case FIONREAD:
1333 		{
1334 			int n;
1335 
1336 			BPFD_LOCK(d);
1337 			n = d->bd_slen;
1338 			while (d->bd_hbuf_in_use)
1339 				mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
1340 				    PRINET, "bd_hbuf", 0);
1341 			if (d->bd_hbuf)
1342 				n += d->bd_hlen;
1343 			BPFD_UNLOCK(d);
1344 
1345 			*(int *)addr = n;
1346 			break;
1347 		}
1348 
1349 	/*
1350 	 * Get buffer len [for read()].
1351 	 */
1352 	case BIOCGBLEN:
1353 		BPFD_LOCK(d);
1354 		*(u_int *)addr = d->bd_bufsize;
1355 		BPFD_UNLOCK(d);
1356 		break;
1357 
1358 	/*
1359 	 * Set buffer length.
1360 	 */
1361 	case BIOCSBLEN:
1362 		error = bpf_ioctl_sblen(d, (u_int *)addr);
1363 		break;
1364 
1365 	/*
1366 	 * Set link layer read filter.
1367 	 */
1368 	case BIOCSETF:
1369 	case BIOCSETFNR:
1370 	case BIOCSETWF:
1371 #ifdef COMPAT_FREEBSD32
1372 	case BIOCSETF32:
1373 	case BIOCSETFNR32:
1374 	case BIOCSETWF32:
1375 #endif
1376 		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
1377 		break;
1378 
1379 	/*
1380 	 * Flush read packet buffer.
1381 	 */
1382 	case BIOCFLUSH:
1383 		BPFD_LOCK(d);
1384 		reset_d(d);
1385 		BPFD_UNLOCK(d);
1386 		break;
1387 
1388 	/*
1389 	 * Put interface into promiscuous mode.
1390 	 */
1391 	case BIOCPROMISC:
1392 		if (d->bd_bif == NULL) {
1393 			/*
1394 			 * No interface attached yet.
1395 			 */
1396 			error = EINVAL;
1397 			break;
1398 		}
1399 		if (d->bd_promisc == 0) {
1400 			error = ifpromisc(d->bd_bif->bif_ifp, 1);
1401 			if (error == 0)
1402 				d->bd_promisc = 1;
1403 		}
1404 		break;
1405 
1406 	/*
1407 	 * Get current data link type.
1408 	 */
1409 	case BIOCGDLT:
1410 		BPF_LOCK();
1411 		if (d->bd_bif == NULL)
1412 			error = EINVAL;
1413 		else
1414 			*(u_int *)addr = d->bd_bif->bif_dlt;
1415 		BPF_UNLOCK();
1416 		break;
1417 
1418 	/*
1419 	 * Get a list of supported data link types.
1420 	 */
1421 #ifdef COMPAT_FREEBSD32
1422 	case BIOCGDLTLIST32:
1423 		{
1424 			struct bpf_dltlist32 *list32;
1425 			struct bpf_dltlist dltlist;
1426 
1427 			list32 = (struct bpf_dltlist32 *)addr;
1428 			dltlist.bfl_len = list32->bfl_len;
1429 			dltlist.bfl_list = PTRIN(list32->bfl_list);
1430 			BPF_LOCK();
1431 			if (d->bd_bif == NULL)
1432 				error = EINVAL;
1433 			else {
1434 				error = bpf_getdltlist(d, &dltlist);
1435 				if (error == 0)
1436 					list32->bfl_len = dltlist.bfl_len;
1437 			}
1438 			BPF_UNLOCK();
1439 			break;
1440 		}
1441 #endif
1442 
1443 	case BIOCGDLTLIST:
1444 		BPF_LOCK();
1445 		if (d->bd_bif == NULL)
1446 			error = EINVAL;
1447 		else
1448 			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
1449 		BPF_UNLOCK();
1450 		break;
1451 
1452 	/*
1453 	 * Set data link type.
1454 	 */
1455 	case BIOCSDLT:
1456 		BPF_LOCK();
1457 		if (d->bd_bif == NULL)
1458 			error = EINVAL;
1459 		else
1460 			error = bpf_setdlt(d, *(u_int *)addr);
1461 		BPF_UNLOCK();
1462 		break;
1463 
1464 	/*
1465 	 * Get interface name.
1466 	 */
1467 	case BIOCGETIF:
1468 		BPF_LOCK();
1469 		if (d->bd_bif == NULL)
1470 			error = EINVAL;
1471 		else {
1472 			struct ifnet *const ifp = d->bd_bif->bif_ifp;
1473 			struct ifreq *const ifr = (struct ifreq *)addr;
1474 
1475 			strlcpy(ifr->ifr_name, ifp->if_xname,
1476 			    sizeof(ifr->ifr_name));
1477 		}
1478 		BPF_UNLOCK();
1479 		break;
1480 
1481 	/*
1482 	 * Set interface.
1483 	 */
1484 	case BIOCSETIF:
1485 		{
1486 			int alloc_buf, size;
1487 
1488 			/*
1489 			 * Behavior here depends on the buffering model.  If
1490 			 * we're using kernel memory buffers, then we can
1491 			 * allocate them here.  If we're using zero-copy,
1492 			 * then the user process must have registered buffers
1493 			 * by the time we get here.
1494 			 */
1495 			alloc_buf = 0;
1496 			BPFD_LOCK(d);
1497 			if (d->bd_bufmode == BPF_BUFMODE_BUFFER &&
1498 			    d->bd_sbuf == NULL)
1499 				alloc_buf = 1;
1500 			BPFD_UNLOCK(d);
1501 			if (alloc_buf) {
1502 				size = d->bd_bufsize;
1503 				error = bpf_buffer_ioctl_sblen(d, &size);
1504 				if (error != 0)
1505 					break;
1506 			}
1507 			BPF_LOCK();
1508 			error = bpf_setif(d, (struct ifreq *)addr);
1509 			BPF_UNLOCK();
1510 			break;
1511 		}
1512 
1513 	/*
1514 	 * Set read timeout.
1515 	 */
1516 	case BIOCSRTIMEOUT:
1517 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
1518 	case BIOCSRTIMEOUT32:
1519 #endif
1520 		{
1521 			struct timeval *tv = (struct timeval *)addr;
1522 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
1523 			struct timeval32 *tv32;
1524 			struct timeval tv64;
1525 
1526 			if (cmd == BIOCSRTIMEOUT32) {
1527 				tv32 = (struct timeval32 *)addr;
1528 				tv = &tv64;
1529 				tv->tv_sec = tv32->tv_sec;
1530 				tv->tv_usec = tv32->tv_usec;
1531 			} else
1532 #endif
1533 				tv = (struct timeval *)addr;
1534 
1535 			/*
1536 			 * Subtract 1 tick from tvtohz() since this isn't
1537 			 * a one-shot timer.
1538 			 */
1539 			if ((error = itimerfix(tv)) == 0)
1540 				d->bd_rtout = tvtohz(tv) - 1;
1541 			break;
1542 		}
1543 
1544 	/*
1545 	 * Get read timeout.
1546 	 */
1547 	case BIOCGRTIMEOUT:
1548 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
1549 	case BIOCGRTIMEOUT32:
1550 #endif
1551 		{
1552 			struct timeval *tv;
1553 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
1554 			struct timeval32 *tv32;
1555 			struct timeval tv64;
1556 
1557 			if (cmd == BIOCGRTIMEOUT32)
1558 				tv = &tv64;
1559 			else
1560 #endif
1561 				tv = (struct timeval *)addr;
1562 
1563 			tv->tv_sec = d->bd_rtout / hz;
1564 			tv->tv_usec = (d->bd_rtout % hz) * tick;
1565 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
1566 			if (cmd == BIOCGRTIMEOUT32) {
1567 				tv32 = (struct timeval32 *)addr;
1568 				tv32->tv_sec = tv->tv_sec;
1569 				tv32->tv_usec = tv->tv_usec;
1570 			}
1571 #endif
1572 
1573 			break;
1574 		}
1575 
1576 	/*
1577 	 * Get packet stats.
1578 	 */
1579 	case BIOCGSTATS:
1580 		{
1581 			struct bpf_stat *bs = (struct bpf_stat *)addr;
1582 
1583 			/* XXXCSJP overflow */
1584 			bs->bs_recv = d->bd_rcount;
1585 			bs->bs_drop = d->bd_dcount;
1586 			break;
1587 		}
1588 
1589 	/*
1590 	 * Set immediate mode.
1591 	 */
1592 	case BIOCIMMEDIATE:
1593 		BPFD_LOCK(d);
1594 		d->bd_immediate = *(u_int *)addr;
1595 		BPFD_UNLOCK(d);
1596 		break;
1597 
1598 	case BIOCVERSION:
1599 		{
1600 			struct bpf_version *bv = (struct bpf_version *)addr;
1601 
1602 			bv->bv_major = BPF_MAJOR_VERSION;
1603 			bv->bv_minor = BPF_MINOR_VERSION;
1604 			break;
1605 		}
1606 
1607 	/*
1608 	 * Get "header already complete" flag
1609 	 */
1610 	case BIOCGHDRCMPLT:
1611 		BPFD_LOCK(d);
1612 		*(u_int *)addr = d->bd_hdrcmplt;
1613 		BPFD_UNLOCK(d);
1614 		break;
1615 
1616 	/*
1617 	 * Set "header already complete" flag
1618 	 */
1619 	case BIOCSHDRCMPLT:
1620 		BPFD_LOCK(d);
1621 		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
1622 		BPFD_UNLOCK(d);
1623 		break;
1624 
1625 	/*
1626 	 * Get packet direction flag
1627 	 */
1628 	case BIOCGDIRECTION:
1629 		BPFD_LOCK(d);
1630 		*(u_int *)addr = d->bd_direction;
1631 		BPFD_UNLOCK(d);
1632 		break;
1633 
1634 	/*
1635 	 * Set packet direction flag
1636 	 */
1637 	case BIOCSDIRECTION:
1638 		{
1639 			u_int	direction;
1640 
1641 			direction = *(u_int *)addr;
1642 			switch (direction) {
1643 			case BPF_D_IN:
1644 			case BPF_D_INOUT:
1645 			case BPF_D_OUT:
1646 				BPFD_LOCK(d);
1647 				d->bd_direction = direction;
1648 				BPFD_UNLOCK(d);
1649 				break;
1650 			default:
1651 				error = EINVAL;
1652 			}
1653 		}
1654 		break;
1655 
1656 	/*
1657 	 * Get packet timestamp format and resolution.
1658 	 */
1659 	case BIOCGTSTAMP:
1660 		BPFD_LOCK(d);
1661 		*(u_int *)addr = d->bd_tstamp;
1662 		BPFD_UNLOCK(d);
1663 		break;
1664 
1665 	/*
1666 	 * Set packet timestamp format and resolution.
1667 	 */
1668 	case BIOCSTSTAMP:
1669 		{
1670 			u_int	func;
1671 
1672 			func = *(u_int *)addr;
1673 			if (BPF_T_VALID(func))
1674 				d->bd_tstamp = func;
1675 			else
1676 				error = EINVAL;
1677 		}
1678 		break;
1679 
1680 	case BIOCFEEDBACK:
1681 		BPFD_LOCK(d);
1682 		d->bd_feedback = *(u_int *)addr;
1683 		BPFD_UNLOCK(d);
1684 		break;
1685 
1686 	case BIOCLOCK:
1687 		BPFD_LOCK(d);
1688 		d->bd_locked = 1;
1689 		BPFD_UNLOCK(d);
1690 		break;
1691 
1692 	case FIONBIO:		/* Non-blocking I/O */
1693 		break;
1694 
1695 	case FIOASYNC:		/* Send signal on receive packets */
1696 		BPFD_LOCK(d);
1697 		d->bd_async = *(int *)addr;
1698 		BPFD_UNLOCK(d);
1699 		break;
1700 
1701 	case FIOSETOWN:
1702 		/*
1703 		 * XXX: Add some sort of locking here?
1704 		 * fsetown() can sleep.
1705 		 */
1706 		error = fsetown(*(int *)addr, &d->bd_sigio);
1707 		break;
1708 
1709 	case FIOGETOWN:
1710 		BPFD_LOCK(d);
1711 		*(int *)addr = fgetown(&d->bd_sigio);
1712 		BPFD_UNLOCK(d);
1713 		break;
1714 
1715 	/* This is deprecated, FIOSETOWN should be used instead. */
1716 	case TIOCSPGRP:
1717 		error = fsetown(-(*(int *)addr), &d->bd_sigio);
1718 		break;
1719 
1720 	/* This is deprecated, FIOGETOWN should be used instead. */
1721 	case TIOCGPGRP:
1722 		*(int *)addr = -fgetown(&d->bd_sigio);
1723 		break;
1724 
1725 	case BIOCSRSIG:		/* Set receive signal */
1726 		{
1727 			u_int sig;
1728 
1729 			sig = *(u_int *)addr;
1730 
1731 			if (sig >= NSIG)
1732 				error = EINVAL;
1733 			else {
1734 				BPFD_LOCK(d);
1735 				d->bd_sig = sig;
1736 				BPFD_UNLOCK(d);
1737 			}
1738 			break;
1739 		}
1740 	case BIOCGRSIG:
1741 		BPFD_LOCK(d);
1742 		*(u_int *)addr = d->bd_sig;
1743 		BPFD_UNLOCK(d);
1744 		break;
1745 
1746 	case BIOCGETBUFMODE:
1747 		BPFD_LOCK(d);
1748 		*(u_int *)addr = d->bd_bufmode;
1749 		BPFD_UNLOCK(d);
1750 		break;
1751 
1752 	case BIOCSETBUFMODE:
1753 		/*
1754 		 * Allow the buffering mode to be changed as long as we
1755 		 * haven't yet committed to a particular mode.  Our
1756 		 * definition of commitment, for now, is whether or not a
1757 		 * buffer has been allocated or an interface attached, since
1758 		 * that's the point where things get tricky.
1759 		 */
1760 		switch (*(u_int *)addr) {
1761 		case BPF_BUFMODE_BUFFER:
1762 			break;
1763 
1764 		case BPF_BUFMODE_ZBUF:
1765 			if (bpf_zerocopy_enable)
1766 				break;
1767 			/* FALLSTHROUGH */
1768 
1769 		default:
1770 			CURVNET_RESTORE();
1771 			return (EINVAL);
1772 		}
1773 
1774 		BPFD_LOCK(d);
1775 		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
1776 		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
1777 			BPFD_UNLOCK(d);
1778 			CURVNET_RESTORE();
1779 			return (EBUSY);
1780 		}
1781 		d->bd_bufmode = *(u_int *)addr;
1782 		BPFD_UNLOCK(d);
1783 		break;
1784 
1785 	case BIOCGETZMAX:
1786 		error = bpf_ioctl_getzmax(td, d, (size_t *)addr);
1787 		break;
1788 
1789 	case BIOCSETZBUF:
1790 		error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr);
1791 		break;
1792 
1793 	case BIOCROTZBUF:
1794 		error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr);
1795 		break;
1796 	}
1797 	CURVNET_RESTORE();
1798 	return (error);
1799 }
1800 
1801 /*
1802  * Set d's packet filter program to fp.  If this file already has a filter,
1803  * free it and replace it.  Returns EINVAL for bogus requests.
1804  *
1805  * Note we need global lock here to serialize bpf_setf() and bpf_setif() calls
1806  * since reading d->bd_bif can't be protected by d or interface lock due to
1807  * lock order.
1808  *
1809  * Additionally, we have to acquire interface write lock due to bpf_mtap() uses
1810  * interface read lock to read all filers.
1811  *
1812  */
1813 static int
1814 bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
1815 {
1816 #ifdef COMPAT_FREEBSD32
1817 	struct bpf_program fp_swab;
1818 	struct bpf_program32 *fp32;
1819 #endif
1820 	struct bpf_insn *fcode, *old;
1821 #ifdef BPF_JITTER
1822 	bpf_jit_filter *jfunc, *ofunc;
1823 #endif
1824 	size_t size;
1825 	u_int flen;
1826 	int need_upgrade;
1827 
1828 #ifdef COMPAT_FREEBSD32
1829 	switch (cmd) {
1830 	case BIOCSETF32:
1831 	case BIOCSETWF32:
1832 	case BIOCSETFNR32:
1833 		fp32 = (struct bpf_program32 *)fp;
1834 		fp_swab.bf_len = fp32->bf_len;
1835 		fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
1836 		fp = &fp_swab;
1837 		switch (cmd) {
1838 		case BIOCSETF32:
1839 			cmd = BIOCSETF;
1840 			break;
1841 		case BIOCSETWF32:
1842 			cmd = BIOCSETWF;
1843 			break;
1844 		}
1845 		break;
1846 	}
1847 #endif
1848 
1849 	fcode = NULL;
1850 #ifdef BPF_JITTER
1851 	jfunc = ofunc = NULL;
1852 #endif
1853 	need_upgrade = 0;
1854 
1855 	/*
1856 	 * Check new filter validness before acquiring any locks.
1857 	 * Allocate memory for new filter, if needed.
1858 	 */
1859 	flen = fp->bf_len;
1860 	if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0))
1861 		return (EINVAL);
1862 	size = flen * sizeof(*fp->bf_insns);
1863 	if (size > 0) {
1864 		/* We're setting up new filter.  Copy and check actual data. */
1865 		fcode = malloc(size, M_BPF, M_WAITOK);
1866 		if (copyin(fp->bf_insns, fcode, size) != 0 ||
1867 		    !bpf_validate(fcode, flen)) {
1868 			free(fcode, M_BPF);
1869 			return (EINVAL);
1870 		}
1871 #ifdef BPF_JITTER
1872 		/* Filter is copied inside fcode and is perfectly valid. */
1873 		jfunc = bpf_jitter(fcode, flen);
1874 #endif
1875 	}
1876 
1877 	BPF_LOCK();
1878 
1879 	/*
1880 	 * Set up new filter.
1881 	 * Protect filter change by interface lock.
1882 	 * Additionally, we are protected by global lock here.
1883 	 */
1884 	if (d->bd_bif != NULL)
1885 		BPFIF_WLOCK(d->bd_bif);
1886 	BPFD_LOCK(d);
1887 	if (cmd == BIOCSETWF) {
1888 		old = d->bd_wfilter;
1889 		d->bd_wfilter = fcode;
1890 	} else {
1891 		old = d->bd_rfilter;
1892 		d->bd_rfilter = fcode;
1893 #ifdef BPF_JITTER
1894 		ofunc = d->bd_bfilter;
1895 		d->bd_bfilter = jfunc;
1896 #endif
1897 		if (cmd == BIOCSETF)
1898 			reset_d(d);
1899 
1900 		need_upgrade = bpf_check_upgrade(cmd, d, fcode, flen);
1901 	}
1902 	BPFD_UNLOCK(d);
1903 	if (d->bd_bif != NULL)
1904 		BPFIF_WUNLOCK(d->bd_bif);
1905 	if (old != NULL)
1906 		free(old, M_BPF);
1907 #ifdef BPF_JITTER
1908 	if (ofunc != NULL)
1909 		bpf_destroy_jit_filter(ofunc);
1910 #endif
1911 
1912 	/* Move d to active readers list. */
1913 	if (need_upgrade != 0)
1914 		bpf_upgraded(d);
1915 
1916 	BPF_UNLOCK();
1917 	return (0);
1918 }
1919 
1920 /*
1921  * Detach a file from its current interface (if attached at all) and attach
1922  * to the interface indicated by the name stored in ifr.
1923  * Return an errno or 0.
1924  */
1925 static int
1926 bpf_setif(struct bpf_d *d, struct ifreq *ifr)
1927 {
1928 	struct bpf_if *bp;
1929 	struct ifnet *theywant;
1930 
1931 	BPF_LOCK_ASSERT();
1932 
1933 	theywant = ifunit(ifr->ifr_name);
1934 	if (theywant == NULL || theywant->if_bpf == NULL)
1935 		return (ENXIO);
1936 
1937 	bp = theywant->if_bpf;
1938 
1939 	/* Check if interface is not being detached from BPF */
1940 	BPFIF_RLOCK(bp);
1941 	if (bp->bif_flags & BPFIF_FLAG_DYING) {
1942 		BPFIF_RUNLOCK(bp);
1943 		return (ENXIO);
1944 	}
1945 	BPFIF_RUNLOCK(bp);
1946 
1947 	/*
1948 	 * At this point, we expect the buffer is already allocated.  If not,
1949 	 * return an error.
1950 	 */
1951 	switch (d->bd_bufmode) {
1952 	case BPF_BUFMODE_BUFFER:
1953 	case BPF_BUFMODE_ZBUF:
1954 		if (d->bd_sbuf == NULL)
1955 			return (EINVAL);
1956 		break;
1957 
1958 	default:
1959 		panic("bpf_setif: bufmode %d", d->bd_bufmode);
1960 	}
1961 	if (bp != d->bd_bif)
1962 		bpf_attachd(d, bp);
1963 	BPFD_LOCK(d);
1964 	reset_d(d);
1965 	BPFD_UNLOCK(d);
1966 	return (0);
1967 }
1968 
1969 /*
1970  * Support for select() and poll() system calls
1971  *
1972  * Return true iff the specific operation will not block indefinitely.
1973  * Otherwise, return false but make a note that a selwakeup() must be done.
1974  */
1975 static int
1976 bpfpoll(struct cdev *dev, int events, struct thread *td)
1977 {
1978 	struct bpf_d *d;
1979 	int revents;
1980 
1981 	if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL)
1982 		return (events &
1983 		    (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
1984 
1985 	/*
1986 	 * Refresh PID associated with this descriptor.
1987 	 */
1988 	revents = events & (POLLOUT | POLLWRNORM);
1989 	BPFD_LOCK(d);
1990 	BPF_PID_REFRESH(d, td);
1991 	if (events & (POLLIN | POLLRDNORM)) {
1992 		if (bpf_ready(d))
1993 			revents |= events & (POLLIN | POLLRDNORM);
1994 		else {
1995 			selrecord(td, &d->bd_sel);
1996 			/* Start the read timeout if necessary. */
1997 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
1998 				callout_reset(&d->bd_callout, d->bd_rtout,
1999 				    bpf_timed_out, d);
2000 				d->bd_state = BPF_WAITING;
2001 			}
2002 		}
2003 	}
2004 	BPFD_UNLOCK(d);
2005 	return (revents);
2006 }
2007 
2008 /*
2009  * Support for kevent() system call.  Register EVFILT_READ filters and
2010  * reject all others.
2011  */
2012 int
2013 bpfkqfilter(struct cdev *dev, struct knote *kn)
2014 {
2015 	struct bpf_d *d;
2016 
2017 	if (devfs_get_cdevpriv((void **)&d) != 0 ||
2018 	    kn->kn_filter != EVFILT_READ)
2019 		return (1);
2020 
2021 	/*
2022 	 * Refresh PID associated with this descriptor.
2023 	 */
2024 	BPFD_LOCK(d);
2025 	BPF_PID_REFRESH_CUR(d);
2026 	kn->kn_fop = &bpfread_filtops;
2027 	kn->kn_hook = d;
2028 	knlist_add(&d->bd_sel.si_note, kn, 1);
2029 	BPFD_UNLOCK(d);
2030 
2031 	return (0);
2032 }
2033 
2034 static void
2035 filt_bpfdetach(struct knote *kn)
2036 {
2037 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2038 
2039 	knlist_remove(&d->bd_sel.si_note, kn, 0);
2040 }
2041 
2042 static int
2043 filt_bpfread(struct knote *kn, long hint)
2044 {
2045 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2046 	int ready;
2047 
2048 	BPFD_LOCK_ASSERT(d);
2049 	ready = bpf_ready(d);
2050 	if (ready) {
2051 		kn->kn_data = d->bd_slen;
2052 		/*
2053 		 * Ignore the hold buffer if it is being copied to user space.
2054 		 */
2055 		if (!d->bd_hbuf_in_use && d->bd_hbuf)
2056 			kn->kn_data += d->bd_hlen;
2057 	} else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2058 		callout_reset(&d->bd_callout, d->bd_rtout,
2059 		    bpf_timed_out, d);
2060 		d->bd_state = BPF_WAITING;
2061 	}
2062 
2063 	return (ready);
2064 }
2065 
2066 #define	BPF_TSTAMP_NONE		0
2067 #define	BPF_TSTAMP_FAST		1
2068 #define	BPF_TSTAMP_NORMAL	2
2069 #define	BPF_TSTAMP_EXTERN	3
2070 
2071 static int
2072 bpf_ts_quality(int tstype)
2073 {
2074 
2075 	if (tstype == BPF_T_NONE)
2076 		return (BPF_TSTAMP_NONE);
2077 	if ((tstype & BPF_T_FAST) != 0)
2078 		return (BPF_TSTAMP_FAST);
2079 
2080 	return (BPF_TSTAMP_NORMAL);
2081 }
2082 
2083 static int
2084 bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m)
2085 {
2086 	struct m_tag *tag;
2087 	int quality;
2088 
2089 	quality = bpf_ts_quality(tstype);
2090 	if (quality == BPF_TSTAMP_NONE)
2091 		return (quality);
2092 
2093 	if (m != NULL) {
2094 		tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL);
2095 		if (tag != NULL) {
2096 			*bt = *(struct bintime *)(tag + 1);
2097 			return (BPF_TSTAMP_EXTERN);
2098 		}
2099 	}
2100 	if (quality == BPF_TSTAMP_NORMAL)
2101 		binuptime(bt);
2102 	else
2103 		getbinuptime(bt);
2104 
2105 	return (quality);
2106 }
2107 
2108 /*
2109  * Incoming linkage from device drivers.  Process the packet pkt, of length
2110  * pktlen, which is stored in a contiguous buffer.  The packet is parsed
2111  * by each process' filter, and if accepted, stashed into the corresponding
2112  * buffer.
2113  */
2114 void
2115 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
2116 {
2117 	struct bintime bt;
2118 	struct bpf_d *d;
2119 #ifdef BPF_JITTER
2120 	bpf_jit_filter *bf;
2121 #endif
2122 	u_int slen;
2123 	int gottime;
2124 
2125 	gottime = BPF_TSTAMP_NONE;
2126 
2127 	BPFIF_RLOCK(bp);
2128 
2129 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2130 		/*
2131 		 * We are not using any locks for d here because:
2132 		 * 1) any filter change is protected by interface
2133 		 * write lock
2134 		 * 2) destroying/detaching d is protected by interface
2135 		 * write lock, too
2136 		 */
2137 
2138 		/* XXX: Do not protect counter for the sake of performance. */
2139 		++d->bd_rcount;
2140 		/*
2141 		 * NB: We dont call BPF_CHECK_DIRECTION() here since there is no
2142 		 * way for the caller to indiciate to us whether this packet
2143 		 * is inbound or outbound.  In the bpf_mtap() routines, we use
2144 		 * the interface pointers on the mbuf to figure it out.
2145 		 */
2146 #ifdef BPF_JITTER
2147 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
2148 		if (bf != NULL)
2149 			slen = (*(bf->func))(pkt, pktlen, pktlen);
2150 		else
2151 #endif
2152 		slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
2153 		if (slen != 0) {
2154 			/*
2155 			 * Filter matches. Let's to acquire write lock.
2156 			 */
2157 			BPFD_LOCK(d);
2158 
2159 			d->bd_fcount++;
2160 			if (gottime < bpf_ts_quality(d->bd_tstamp))
2161 				gottime = bpf_gettime(&bt, d->bd_tstamp, NULL);
2162 #ifdef MAC
2163 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
2164 #endif
2165 				catchpacket(d, pkt, pktlen, slen,
2166 				    bpf_append_bytes, &bt);
2167 			BPFD_UNLOCK(d);
2168 		}
2169 	}
2170 	BPFIF_RUNLOCK(bp);
2171 }
2172 
2173 #define	BPF_CHECK_DIRECTION(d, r, i)				\
2174 	    (((d)->bd_direction == BPF_D_IN && (r) != (i)) ||	\
2175 	    ((d)->bd_direction == BPF_D_OUT && (r) == (i)))
2176 
2177 /*
2178  * Incoming linkage from device drivers, when packet is in an mbuf chain.
2179  * Locking model is explained in bpf_tap().
2180  */
2181 void
2182 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
2183 {
2184 	struct bintime bt;
2185 	struct bpf_d *d;
2186 #ifdef BPF_JITTER
2187 	bpf_jit_filter *bf;
2188 #endif
2189 	u_int pktlen, slen;
2190 	int gottime;
2191 
2192 	/* Skip outgoing duplicate packets. */
2193 	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
2194 		m->m_flags &= ~M_PROMISC;
2195 		return;
2196 	}
2197 
2198 	pktlen = m_length(m, NULL);
2199 	gottime = BPF_TSTAMP_NONE;
2200 
2201 	BPFIF_RLOCK(bp);
2202 
2203 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2204 		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
2205 			continue;
2206 		++d->bd_rcount;
2207 #ifdef BPF_JITTER
2208 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
2209 		/* XXX We cannot handle multiple mbufs. */
2210 		if (bf != NULL && m->m_next == NULL)
2211 			slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen);
2212 		else
2213 #endif
2214 		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
2215 		if (slen != 0) {
2216 			BPFD_LOCK(d);
2217 
2218 			d->bd_fcount++;
2219 			if (gottime < bpf_ts_quality(d->bd_tstamp))
2220 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
2221 #ifdef MAC
2222 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
2223 #endif
2224 				catchpacket(d, (u_char *)m, pktlen, slen,
2225 				    bpf_append_mbuf, &bt);
2226 			BPFD_UNLOCK(d);
2227 		}
2228 	}
2229 	BPFIF_RUNLOCK(bp);
2230 }
2231 
2232 /*
2233  * Incoming linkage from device drivers, when packet is in
2234  * an mbuf chain and to be prepended by a contiguous header.
2235  */
2236 void
2237 bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
2238 {
2239 	struct bintime bt;
2240 	struct mbuf mb;
2241 	struct bpf_d *d;
2242 	u_int pktlen, slen;
2243 	int gottime;
2244 
2245 	/* Skip outgoing duplicate packets. */
2246 	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
2247 		m->m_flags &= ~M_PROMISC;
2248 		return;
2249 	}
2250 
2251 	pktlen = m_length(m, NULL);
2252 	/*
2253 	 * Craft on-stack mbuf suitable for passing to bpf_filter.
2254 	 * Note that we cut corners here; we only setup what's
2255 	 * absolutely needed--this mbuf should never go anywhere else.
2256 	 */
2257 	mb.m_next = m;
2258 	mb.m_data = data;
2259 	mb.m_len = dlen;
2260 	pktlen += dlen;
2261 
2262 	gottime = BPF_TSTAMP_NONE;
2263 
2264 	BPFIF_RLOCK(bp);
2265 
2266 	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2267 		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
2268 			continue;
2269 		++d->bd_rcount;
2270 		slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
2271 		if (slen != 0) {
2272 			BPFD_LOCK(d);
2273 
2274 			d->bd_fcount++;
2275 			if (gottime < bpf_ts_quality(d->bd_tstamp))
2276 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
2277 #ifdef MAC
2278 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
2279 #endif
2280 				catchpacket(d, (u_char *)&mb, pktlen, slen,
2281 				    bpf_append_mbuf, &bt);
2282 			BPFD_UNLOCK(d);
2283 		}
2284 	}
2285 	BPFIF_RUNLOCK(bp);
2286 }
2287 
2288 #undef	BPF_CHECK_DIRECTION
2289 
2290 #undef	BPF_TSTAMP_NONE
2291 #undef	BPF_TSTAMP_FAST
2292 #undef	BPF_TSTAMP_NORMAL
2293 #undef	BPF_TSTAMP_EXTERN
2294 
2295 static int
2296 bpf_hdrlen(struct bpf_d *d)
2297 {
2298 	int hdrlen;
2299 
2300 	hdrlen = d->bd_bif->bif_hdrlen;
2301 #ifndef BURN_BRIDGES
2302 	if (d->bd_tstamp == BPF_T_NONE ||
2303 	    BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME)
2304 #ifdef COMPAT_FREEBSD32
2305 		if (d->bd_compat32)
2306 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32);
2307 		else
2308 #endif
2309 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr);
2310 	else
2311 #endif
2312 		hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr);
2313 #ifdef COMPAT_FREEBSD32
2314 	if (d->bd_compat32)
2315 		hdrlen = BPF_WORDALIGN32(hdrlen);
2316 	else
2317 #endif
2318 		hdrlen = BPF_WORDALIGN(hdrlen);
2319 
2320 	return (hdrlen - d->bd_bif->bif_hdrlen);
2321 }
2322 
2323 static void
2324 bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype)
2325 {
2326 	struct bintime bt2;
2327 	struct timeval tsm;
2328 	struct timespec tsn;
2329 
2330 	if ((tstype & BPF_T_MONOTONIC) == 0) {
2331 		bt2 = *bt;
2332 		bintime_add(&bt2, &boottimebin);
2333 		bt = &bt2;
2334 	}
2335 	switch (BPF_T_FORMAT(tstype)) {
2336 	case BPF_T_MICROTIME:
2337 		bintime2timeval(bt, &tsm);
2338 		ts->bt_sec = tsm.tv_sec;
2339 		ts->bt_frac = tsm.tv_usec;
2340 		break;
2341 	case BPF_T_NANOTIME:
2342 		bintime2timespec(bt, &tsn);
2343 		ts->bt_sec = tsn.tv_sec;
2344 		ts->bt_frac = tsn.tv_nsec;
2345 		break;
2346 	case BPF_T_BINTIME:
2347 		ts->bt_sec = bt->sec;
2348 		ts->bt_frac = bt->frac;
2349 		break;
2350 	}
2351 }
2352 
2353 /*
2354  * Move the packet data from interface memory (pkt) into the
2355  * store buffer.  "cpfn" is the routine called to do the actual data
2356  * transfer.  bcopy is passed in to copy contiguous chunks, while
2357  * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
2358  * pkt is really an mbuf.
2359  */
2360 static void
2361 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
2362     void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
2363     struct bintime *bt)
2364 {
2365 	struct bpf_xhdr hdr;
2366 #ifndef BURN_BRIDGES
2367 	struct bpf_hdr hdr_old;
2368 #ifdef COMPAT_FREEBSD32
2369 	struct bpf_hdr32 hdr32_old;
2370 #endif
2371 #endif
2372 	int caplen, curlen, hdrlen, totlen;
2373 	int do_wakeup = 0;
2374 	int do_timestamp;
2375 	int tstype;
2376 
2377 	BPFD_LOCK_ASSERT(d);
2378 
2379 	/*
2380 	 * Detect whether user space has released a buffer back to us, and if
2381 	 * so, move it from being a hold buffer to a free buffer.  This may
2382 	 * not be the best place to do it (for example, we might only want to
2383 	 * run this check if we need the space), but for now it's a reliable
2384 	 * spot to do it.
2385 	 */
2386 	if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
2387 		d->bd_fbuf = d->bd_hbuf;
2388 		d->bd_hbuf = NULL;
2389 		d->bd_hlen = 0;
2390 		bpf_buf_reclaimed(d);
2391 	}
2392 
2393 	/*
2394 	 * Figure out how many bytes to move.  If the packet is
2395 	 * greater or equal to the snapshot length, transfer that
2396 	 * much.  Otherwise, transfer the whole packet (unless
2397 	 * we hit the buffer size limit).
2398 	 */
2399 	hdrlen = bpf_hdrlen(d);
2400 	totlen = hdrlen + min(snaplen, pktlen);
2401 	if (totlen > d->bd_bufsize)
2402 		totlen = d->bd_bufsize;
2403 
2404 	/*
2405 	 * Round up the end of the previous packet to the next longword.
2406 	 *
2407 	 * Drop the packet if there's no room and no hope of room
2408 	 * If the packet would overflow the storage buffer or the storage
2409 	 * buffer is considered immutable by the buffer model, try to rotate
2410 	 * the buffer and wakeup pending processes.
2411 	 */
2412 #ifdef COMPAT_FREEBSD32
2413 	if (d->bd_compat32)
2414 		curlen = BPF_WORDALIGN32(d->bd_slen);
2415 	else
2416 #endif
2417 		curlen = BPF_WORDALIGN(d->bd_slen);
2418 	if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
2419 		if (d->bd_fbuf == NULL) {
2420 			/*
2421 			 * There's no room in the store buffer, and no
2422 			 * prospect of room, so drop the packet.  Notify the
2423 			 * buffer model.
2424 			 */
2425 			bpf_buffull(d);
2426 			++d->bd_dcount;
2427 			return;
2428 		}
2429 		KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use"));
2430 		ROTATE_BUFFERS(d);
2431 		do_wakeup = 1;
2432 		curlen = 0;
2433 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
2434 		/*
2435 		 * Immediate mode is set, or the read timeout has already
2436 		 * expired during a select call.  A packet arrived, so the
2437 		 * reader should be woken up.
2438 		 */
2439 		do_wakeup = 1;
2440 	caplen = totlen - hdrlen;
2441 	tstype = d->bd_tstamp;
2442 	do_timestamp = tstype != BPF_T_NONE;
2443 #ifndef BURN_BRIDGES
2444 	if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) {
2445 		struct bpf_ts ts;
2446 		if (do_timestamp)
2447 			bpf_bintime2ts(bt, &ts, tstype);
2448 #ifdef COMPAT_FREEBSD32
2449 		if (d->bd_compat32) {
2450 			bzero(&hdr32_old, sizeof(hdr32_old));
2451 			if (do_timestamp) {
2452 				hdr32_old.bh_tstamp.tv_sec = ts.bt_sec;
2453 				hdr32_old.bh_tstamp.tv_usec = ts.bt_frac;
2454 			}
2455 			hdr32_old.bh_datalen = pktlen;
2456 			hdr32_old.bh_hdrlen = hdrlen;
2457 			hdr32_old.bh_caplen = caplen;
2458 			bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old,
2459 			    sizeof(hdr32_old));
2460 			goto copy;
2461 		}
2462 #endif
2463 		bzero(&hdr_old, sizeof(hdr_old));
2464 		if (do_timestamp) {
2465 			hdr_old.bh_tstamp.tv_sec = ts.bt_sec;
2466 			hdr_old.bh_tstamp.tv_usec = ts.bt_frac;
2467 		}
2468 		hdr_old.bh_datalen = pktlen;
2469 		hdr_old.bh_hdrlen = hdrlen;
2470 		hdr_old.bh_caplen = caplen;
2471 		bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old,
2472 		    sizeof(hdr_old));
2473 		goto copy;
2474 	}
2475 #endif
2476 
2477 	/*
2478 	 * Append the bpf header.  Note we append the actual header size, but
2479 	 * move forward the length of the header plus padding.
2480 	 */
2481 	bzero(&hdr, sizeof(hdr));
2482 	if (do_timestamp)
2483 		bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype);
2484 	hdr.bh_datalen = pktlen;
2485 	hdr.bh_hdrlen = hdrlen;
2486 	hdr.bh_caplen = caplen;
2487 	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
2488 
2489 	/*
2490 	 * Copy the packet data into the store buffer and update its length.
2491 	 */
2492 #ifndef BURN_BRIDGES
2493 copy:
2494 #endif
2495 	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen);
2496 	d->bd_slen = curlen + totlen;
2497 
2498 	if (do_wakeup)
2499 		bpf_wakeup(d);
2500 }
2501 
2502 /*
2503  * Free buffers currently in use by a descriptor.
2504  * Called on close.
2505  */
2506 static void
2507 bpf_freed(struct bpf_d *d)
2508 {
2509 
2510 	/*
2511 	 * We don't need to lock out interrupts since this descriptor has
2512 	 * been detached from its interface and it yet hasn't been marked
2513 	 * free.
2514 	 */
2515 	bpf_free(d);
2516 	if (d->bd_rfilter != NULL) {
2517 		free((caddr_t)d->bd_rfilter, M_BPF);
2518 #ifdef BPF_JITTER
2519 		if (d->bd_bfilter != NULL)
2520 			bpf_destroy_jit_filter(d->bd_bfilter);
2521 #endif
2522 	}
2523 	if (d->bd_wfilter != NULL)
2524 		free((caddr_t)d->bd_wfilter, M_BPF);
2525 	mtx_destroy(&d->bd_lock);
2526 }
2527 
2528 /*
2529  * Attach an interface to bpf.  dlt is the link layer type; hdrlen is the
2530  * fixed size of the link header (variable length headers not yet supported).
2531  */
2532 void
2533 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
2534 {
2535 
2536 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
2537 }
2538 
2539 /*
2540  * Attach an interface to bpf.  ifp is a pointer to the structure
2541  * defining the interface to be attached, dlt is the link layer type,
2542  * and hdrlen is the fixed size of the link header (variable length
2543  * headers are not yet supporrted).
2544  */
2545 void
2546 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
2547 {
2548 	struct bpf_if *bp;
2549 
2550 	bp = malloc(sizeof(*bp), M_BPF, M_NOWAIT | M_ZERO);
2551 	if (bp == NULL)
2552 		panic("bpfattach");
2553 
2554 	LIST_INIT(&bp->bif_dlist);
2555 	LIST_INIT(&bp->bif_wlist);
2556 	bp->bif_ifp = ifp;
2557 	bp->bif_dlt = dlt;
2558 	rw_init(&bp->bif_lock, "bpf interface lock");
2559 	KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized"));
2560 	*driverp = bp;
2561 
2562 	BPF_LOCK();
2563 	LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
2564 	BPF_UNLOCK();
2565 
2566 	bp->bif_hdrlen = hdrlen;
2567 
2568 	if (bootverbose && IS_DEFAULT_VNET(curvnet))
2569 		if_printf(ifp, "bpf attached\n");
2570 }
2571 
2572 /*
2573  * Detach bpf from an interface. This involves detaching each descriptor
2574  * associated with the interface. Notify each descriptor as it's detached
2575  * so that any sleepers wake up and get ENXIO.
2576  */
2577 void
2578 bpfdetach(struct ifnet *ifp)
2579 {
2580 	struct bpf_if	*bp, *bp_temp;
2581 	struct bpf_d	*d;
2582 	int ndetached;
2583 
2584 	ndetached = 0;
2585 
2586 	BPF_LOCK();
2587 	/* Find all bpf_if struct's which reference ifp and detach them. */
2588 	LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) {
2589 		if (ifp != bp->bif_ifp)
2590 			continue;
2591 
2592 		LIST_REMOVE(bp, bif_next);
2593 		/* Add to to-be-freed list */
2594 		LIST_INSERT_HEAD(&bpf_freelist, bp, bif_next);
2595 
2596 		ndetached++;
2597 		/*
2598 		 * Delay freeing bp till interface is detached
2599 		 * and all routes through this interface are removed.
2600 		 * Mark bp as detached to restrict new consumers.
2601 		 */
2602 		BPFIF_WLOCK(bp);
2603 		bp->bif_flags |= BPFIF_FLAG_DYING;
2604 		BPFIF_WUNLOCK(bp);
2605 
2606 		CTR4(KTR_NET, "%s: sheduling free for encap %d (%p) for if %p",
2607 		    __func__, bp->bif_dlt, bp, ifp);
2608 
2609 		/* Free common descriptors */
2610 		while ((d = LIST_FIRST(&bp->bif_dlist)) != NULL) {
2611 			bpf_detachd_locked(d);
2612 			BPFD_LOCK(d);
2613 			bpf_wakeup(d);
2614 			BPFD_UNLOCK(d);
2615 		}
2616 
2617 		/* Free writer-only descriptors */
2618 		while ((d = LIST_FIRST(&bp->bif_wlist)) != NULL) {
2619 			bpf_detachd_locked(d);
2620 			BPFD_LOCK(d);
2621 			bpf_wakeup(d);
2622 			BPFD_UNLOCK(d);
2623 		}
2624 	}
2625 	BPF_UNLOCK();
2626 
2627 #ifdef INVARIANTS
2628 	if (ndetached == 0)
2629 		printf("bpfdetach: %s was not attached\n", ifp->if_xname);
2630 #endif
2631 }
2632 
2633 /*
2634  * Interface departure handler.
2635  * Note departure event does not guarantee interface is going down.
2636  * Interface renaming is currently done via departure/arrival event set.
2637  *
2638  * Departure handled is called after all routes pointing to
2639  * given interface are removed and interface is in down state
2640  * restricting any packets to be sent/received. We assume it is now safe
2641  * to free data allocated by BPF.
2642  */
2643 static void
2644 bpf_ifdetach(void *arg __unused, struct ifnet *ifp)
2645 {
2646 	struct bpf_if *bp, *bp_temp;
2647 	int nmatched = 0;
2648 
2649 	BPF_LOCK();
2650 	/*
2651 	 * Find matching entries in free list.
2652 	 * Nothing should be found if bpfdetach() was not called.
2653 	 */
2654 	LIST_FOREACH_SAFE(bp, &bpf_freelist, bif_next, bp_temp) {
2655 		if (ifp != bp->bif_ifp)
2656 			continue;
2657 
2658 		CTR3(KTR_NET, "%s: freeing BPF instance %p for interface %p",
2659 		    __func__, bp, ifp);
2660 
2661 		LIST_REMOVE(bp, bif_next);
2662 
2663 		rw_destroy(&bp->bif_lock);
2664 		free(bp, M_BPF);
2665 
2666 		nmatched++;
2667 	}
2668 	BPF_UNLOCK();
2669 
2670 	/*
2671 	 * Note that we cannot zero other pointers to
2672 	 * custom DLTs possibly used by given interface.
2673 	 */
2674 	if (nmatched != 0)
2675 		ifp->if_bpf = NULL;
2676 }
2677 
2678 /*
2679  * Get a list of available data link type of the interface.
2680  */
2681 static int
2682 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
2683 {
2684 	int n, error;
2685 	struct ifnet *ifp;
2686 	struct bpf_if *bp;
2687 
2688 	BPF_LOCK_ASSERT();
2689 
2690 	ifp = d->bd_bif->bif_ifp;
2691 	n = 0;
2692 	error = 0;
2693 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2694 		if (bp->bif_ifp != ifp)
2695 			continue;
2696 		if (bfl->bfl_list != NULL) {
2697 			if (n >= bfl->bfl_len)
2698 				return (ENOMEM);
2699 			error = copyout(&bp->bif_dlt,
2700 			    bfl->bfl_list + n, sizeof(u_int));
2701 		}
2702 		n++;
2703 	}
2704 	bfl->bfl_len = n;
2705 	return (error);
2706 }
2707 
2708 /*
2709  * Set the data link type of a BPF instance.
2710  */
2711 static int
2712 bpf_setdlt(struct bpf_d *d, u_int dlt)
2713 {
2714 	int error, opromisc;
2715 	struct ifnet *ifp;
2716 	struct bpf_if *bp;
2717 
2718 	BPF_LOCK_ASSERT();
2719 
2720 	if (d->bd_bif->bif_dlt == dlt)
2721 		return (0);
2722 	ifp = d->bd_bif->bif_ifp;
2723 
2724 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2725 		if (bp->bif_ifp == ifp && bp->bif_dlt == dlt)
2726 			break;
2727 	}
2728 
2729 	if (bp != NULL) {
2730 		opromisc = d->bd_promisc;
2731 		bpf_attachd(d, bp);
2732 		BPFD_LOCK(d);
2733 		reset_d(d);
2734 		BPFD_UNLOCK(d);
2735 		if (opromisc) {
2736 			error = ifpromisc(bp->bif_ifp, 1);
2737 			if (error)
2738 				if_printf(bp->bif_ifp,
2739 					"bpf_setdlt: ifpromisc failed (%d)\n",
2740 					error);
2741 			else
2742 				d->bd_promisc = 1;
2743 		}
2744 	}
2745 	return (bp == NULL ? EINVAL : 0);
2746 }
2747 
2748 static void
2749 bpf_drvinit(void *unused)
2750 {
2751 	struct cdev *dev;
2752 
2753 	mtx_init(&bpf_mtx, "bpf global lock", NULL, MTX_DEF);
2754 	LIST_INIT(&bpf_iflist);
2755 	LIST_INIT(&bpf_freelist);
2756 
2757 	dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf");
2758 	/* For compatibility */
2759 	make_dev_alias(dev, "bpf0");
2760 
2761 	/* Register interface departure handler */
2762 	bpf_ifdetach_cookie = EVENTHANDLER_REGISTER(
2763 		    ifnet_departure_event, bpf_ifdetach, NULL,
2764 		    EVENTHANDLER_PRI_ANY);
2765 }
2766 
2767 /*
2768  * Zero out the various packet counters associated with all of the bpf
2769  * descriptors.  At some point, we will probably want to get a bit more
2770  * granular and allow the user to specify descriptors to be zeroed.
2771  */
2772 static void
2773 bpf_zero_counters(void)
2774 {
2775 	struct bpf_if *bp;
2776 	struct bpf_d *bd;
2777 
2778 	BPF_LOCK();
2779 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2780 		BPFIF_RLOCK(bp);
2781 		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2782 			BPFD_LOCK(bd);
2783 			bd->bd_rcount = 0;
2784 			bd->bd_dcount = 0;
2785 			bd->bd_fcount = 0;
2786 			bd->bd_wcount = 0;
2787 			bd->bd_wfcount = 0;
2788 			bd->bd_zcopy = 0;
2789 			BPFD_UNLOCK(bd);
2790 		}
2791 		BPFIF_RUNLOCK(bp);
2792 	}
2793 	BPF_UNLOCK();
2794 }
2795 
2796 /*
2797  * Fill filter statistics
2798  */
2799 static void
2800 bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
2801 {
2802 
2803 	bzero(d, sizeof(*d));
2804 	BPFD_LOCK_ASSERT(bd);
2805 	d->bd_structsize = sizeof(*d);
2806 	/* XXX: reading should be protected by global lock */
2807 	d->bd_immediate = bd->bd_immediate;
2808 	d->bd_promisc = bd->bd_promisc;
2809 	d->bd_hdrcmplt = bd->bd_hdrcmplt;
2810 	d->bd_direction = bd->bd_direction;
2811 	d->bd_feedback = bd->bd_feedback;
2812 	d->bd_async = bd->bd_async;
2813 	d->bd_rcount = bd->bd_rcount;
2814 	d->bd_dcount = bd->bd_dcount;
2815 	d->bd_fcount = bd->bd_fcount;
2816 	d->bd_sig = bd->bd_sig;
2817 	d->bd_slen = bd->bd_slen;
2818 	d->bd_hlen = bd->bd_hlen;
2819 	d->bd_bufsize = bd->bd_bufsize;
2820 	d->bd_pid = bd->bd_pid;
2821 	strlcpy(d->bd_ifname,
2822 	    bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
2823 	d->bd_locked = bd->bd_locked;
2824 	d->bd_wcount = bd->bd_wcount;
2825 	d->bd_wdcount = bd->bd_wdcount;
2826 	d->bd_wfcount = bd->bd_wfcount;
2827 	d->bd_zcopy = bd->bd_zcopy;
2828 	d->bd_bufmode = bd->bd_bufmode;
2829 }
2830 
2831 /*
2832  * Handle `netstat -B' stats request
2833  */
2834 static int
2835 bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
2836 {
2837 	static const struct xbpf_d zerostats;
2838 	struct xbpf_d *xbdbuf, *xbd, tempstats;
2839 	int index, error;
2840 	struct bpf_if *bp;
2841 	struct bpf_d *bd;
2842 
2843 	/*
2844 	 * XXX This is not technically correct. It is possible for non
2845 	 * privileged users to open bpf devices. It would make sense
2846 	 * if the users who opened the devices were able to retrieve
2847 	 * the statistics for them, too.
2848 	 */
2849 	error = priv_check(req->td, PRIV_NET_BPF);
2850 	if (error)
2851 		return (error);
2852 	/*
2853 	 * Check to see if the user is requesting that the counters be
2854 	 * zeroed out.  Explicitly check that the supplied data is zeroed,
2855 	 * as we aren't allowing the user to set the counters currently.
2856 	 */
2857 	if (req->newptr != NULL) {
2858 		if (req->newlen != sizeof(tempstats))
2859 			return (EINVAL);
2860 		memset(&tempstats, 0, sizeof(tempstats));
2861 		error = SYSCTL_IN(req, &tempstats, sizeof(tempstats));
2862 		if (error)
2863 			return (error);
2864 		if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0)
2865 			return (EINVAL);
2866 		bpf_zero_counters();
2867 		return (0);
2868 	}
2869 	if (req->oldptr == NULL)
2870 		return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd)));
2871 	if (bpf_bpfd_cnt == 0)
2872 		return (SYSCTL_OUT(req, 0, 0));
2873 	xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK);
2874 	BPF_LOCK();
2875 	if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) {
2876 		BPF_UNLOCK();
2877 		free(xbdbuf, M_BPF);
2878 		return (ENOMEM);
2879 	}
2880 	index = 0;
2881 	LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2882 		BPFIF_RLOCK(bp);
2883 		/* Send writers-only first */
2884 		LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
2885 			xbd = &xbdbuf[index++];
2886 			BPFD_LOCK(bd);
2887 			bpfstats_fill_xbpf(xbd, bd);
2888 			BPFD_UNLOCK(bd);
2889 		}
2890 		LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2891 			xbd = &xbdbuf[index++];
2892 			BPFD_LOCK(bd);
2893 			bpfstats_fill_xbpf(xbd, bd);
2894 			BPFD_UNLOCK(bd);
2895 		}
2896 		BPFIF_RUNLOCK(bp);
2897 	}
2898 	BPF_UNLOCK();
2899 	error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
2900 	free(xbdbuf, M_BPF);
2901 	return (error);
2902 }
2903 
2904 SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,bpf_drvinit,NULL);
2905 
2906 #else /* !DEV_BPF && !NETGRAPH_BPF */
2907 /*
2908  * NOP stubs to allow bpf-using drivers to load and function.
2909  *
2910  * A 'better' implementation would allow the core bpf functionality
2911  * to be loaded at runtime.
2912  */
2913 static struct bpf_if bp_null;
2914 
2915 void
2916 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
2917 {
2918 }
2919 
/* Stub: without bpf support a tapped mbuf chain is ignored. */
void
bpf_mtap(struct bpf_if *bp, struct mbuf *m)
{

	/* Nothing to do. */
}
2924 
2925 void
2926 bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
2927 {
2928 }
2929 
2930 void
2931 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
2932 {
2933 
2934 	bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
2935 }
2936 
2937 void
2938 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
2939 {
2940 
2941 	*driverp = &bp_null;
2942 }
2943 
/* Stub: nothing was attached, so there is nothing to detach. */
void
bpfdetach(struct ifnet *ifp)
{

	/* Nothing to do. */
}
2948 
2949 u_int
2950 bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
2951 {
2952 	return -1;	/* "no filter" behaviour */
2953 }
2954 
/*
 * Stub validator: always returns 0 (false), whatever program is passed
 * in.
 */
int
bpf_validate(const struct bpf_insn *f, int len)
{
	const int valid = 0;	/* false */

	return (valid);
}
2960 
2961 #endif /* !DEV_BPF && !NETGRAPH_BPF */
2962