1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1990, 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 2019 Andrey V. Elsukov <ae@FreeBSD.org>
7  *
8  * This code is derived from the Stanford/CMU enet packet filter,
9  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
10  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
11  * Berkeley Laboratory.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 #include <sys/cdefs.h>
39 #include "opt_bpf.h"
40 #include "opt_netgraph.h"
41 
42 #include <sys/param.h>
43 #include <sys/conf.h>
44 #include <sys/fcntl.h>
45 #include <sys/jail.h>
46 #include <sys/ktr.h>
47 #include <sys/lock.h>
48 #include <sys/malloc.h>
49 #include <sys/mbuf.h>
50 #include <sys/mutex.h>
51 #include <sys/time.h>
52 #include <sys/priv.h>
53 #include <sys/proc.h>
54 #include <sys/signalvar.h>
55 #include <sys/filio.h>
56 #include <sys/sockio.h>
57 #include <sys/ttycom.h>
58 #include <sys/uio.h>
59 #include <sys/sysent.h>
60 #include <sys/systm.h>
61 
62 #include <sys/file.h>
63 #include <sys/poll.h>
64 #include <sys/proc.h>
65 
66 #include <sys/socket.h>
67 
68 #include <net/if.h>
69 #include <net/if_var.h>
70 #include <net/if_private.h>
71 #include <net/if_vlan_var.h>
72 #include <net/bpf.h>
73 #include <net/bpf_buffer.h>
74 #ifdef BPF_JITTER
75 #include <net/bpf_jitter.h>
76 #endif
77 #include <net/bpf_zerocopy.h>
78 #include <net/bpfdesc.h>
79 #include <net/vnet.h>
80 
81 #include <sys/kernel.h>
82 #include <sys/sysctl.h>
83 
84 #include <security/mac/mac_framework.h>
85 
86 MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
87 
88 struct bpf_if {
89 	struct bpfd_list	bif_dlist;	/* descriptor list */
90 	LIST_ENTRY(bpf_if)	bif_next;	/* list of all interfaces */
91 	u_int		bif_dlt;	/* link layer type */
92 	u_int		bif_hdrlen;	/* length of link header */
93 	volatile u_int	bif_refcnt;
94 	struct bpfd_list bif_wlist;	/* writer-only list */
95 	const struct bif_methods	*bif_methods;
96 	void		*bif_softc;
97 	const char	*bif_name;
98 	struct epoch_context epoch_ctx;
99 };
100 
101 /* See bpf_peers_present() in bpf.h. */
102 _Static_assert(offsetof(struct bpf_if, bif_dlist) == 0,
103     "bpf_if shall start with bif_dlist");
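
/*
 * Illustrative note (a sketch, not driver code): consumers outside this
 * file hold only an opaque struct bpf_if pointer, and bpf_peers_present()
 * tests for listeners by treating that pointer as the bif_dlist head it
 * points at, roughly:
 *
 *	if (!CK_LIST_EMPTY((struct bpfd_list *)bp))
 *		... at least one active reader is attached ...
 *
 * The assertion above pins bif_dlist at offset zero so that cast is
 * valid; see bpf.h for the real inline.
 */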
104 
105 static inline void
106 bif_attachd(struct bpf_if *bp)
107 {
108 	if (bp->bif_methods->bif_attachd != NULL)
109 		bp->bif_methods->bif_attachd(bp->bif_softc);
110 }
111 
112 static inline void
113 bif_detachd(struct bpf_if *bp)
114 {
115 	if (bp->bif_methods->bif_detachd != NULL)
116 		bp->bif_methods->bif_detachd(bp->bif_softc);
117 }
118 
119 static inline uint32_t
120 bif_wrsize(struct bpf_if *bp)
121 {
122 	if (bp->bif_methods->bif_wrsize != NULL)
123 		return (bp->bif_methods->bif_wrsize(bp->bif_softc));
124 	else
125 		return (0);
126 }
127 
128 static inline int
129 bif_promisc(struct bpf_if *bp, bool on)
130 {
131 	if (bp->bif_methods->bif_promisc != NULL)
132 		return (bp->bif_methods->bif_promisc(bp->bif_softc, on));
133 	else
134 		return (0);
135 }
136 
137 #ifdef MAC
138 static inline int
139 bif_mac_check_receive(struct bpf_if *bp, struct bpf_d *d)
140 {
141 	if (bp->bif_methods->bif_mac_check_receive != NULL)
142 		return (bp->bif_methods->bif_mac_check_receive(bp->bif_softc,
143 		    d));
144 	else
145 		return (0);
146 }
147 #endif
148 
149 /*
150  * XXXGL: Once we migrate to a tapping KPI that specifies packet direction,
151  * we will no longer need the bif_chkdir method.
152  */
153 static inline bool
154 bpf_chkdir(struct bpf_d *d, struct mbuf *m)
155 {
156 	return (d->bd_bif->bif_methods->bif_chkdir(d->bd_bif->bif_softc, m,
157 	    d->bd_direction));
158 }
159 
160 struct bpf_program_buffer {
161 	struct epoch_context	epoch_ctx;
162 #ifdef BPF_JITTER
163 	bpf_jit_filter		*func;
164 #endif
165 	void			*buffer[0];
166 };
167 
168 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
169 
170 #define PRINET  26			/* interruptible */
171 #define BPF_PRIO_MAX	7
172 
173 #define	SIZEOF_BPF_HDR(type)	\
174     (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen))
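
/*
 * For example, SIZEOF_BPF_HDR(struct bpf_hdr) evaluates to the offset of
 * bh_hdrlen plus sizeof(bh_hdrlen), i.e. the header size without any tail
 * padding the compiler may append, so captured data can be laid down
 * immediately after the last header field.
 */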
175 
176 #ifdef COMPAT_FREEBSD32
177 #include <sys/mount.h>
178 #include <compat/freebsd32/freebsd32.h>
179 #define BPF_ALIGNMENT32 sizeof(int32_t)
180 #define	BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32)
181 
182 #ifndef BURN_BRIDGES
183 /*
184  * 32-bit version of structure prepended to each packet.  We use this header
185  * instead of the standard one for 32-bit streams.  We mark a stream as
186  * 32-bit the first time we see a 32-bit compat ioctl request.
187  */
188 struct bpf_hdr32 {
189 	struct timeval32 bh_tstamp;	/* time stamp */
190 	uint32_t	bh_caplen;	/* length of captured portion */
191 	uint32_t	bh_datalen;	/* original length of packet */
192 	uint16_t	bh_hdrlen;	/* length of bpf header (this struct
193 					   plus alignment padding) */
194 };
195 #endif
196 
197 struct bpf_program32 {
198 	u_int bf_len;
199 	uint32_t bf_insns;
200 };
201 
202 struct bpf_dltlist32 {
203 	u_int	bfl_len;
204 	u_int	bfl_list;
205 };
206 
207 #define	BIOCSETF32	_IOW('B', 103, struct bpf_program32)
208 #define	BIOCSRTIMEOUT32	_IOW('B', 109, struct timeval32)
209 #define	BIOCGRTIMEOUT32	_IOR('B', 110, struct timeval32)
210 #define	BIOCGDLTLIST32	_IOWR('B', 121, struct bpf_dltlist32)
211 #define	BIOCSETWF32	_IOW('B', 123, struct bpf_program32)
212 #define	BIOCSETFNR32	_IOW('B', 130, struct bpf_program32)
213 #endif
214 
215 #define BPF_LOCK()		sx_xlock(&bpf_sx)
216 #define BPF_UNLOCK()		sx_xunlock(&bpf_sx)
217 #define BPF_LOCK_ASSERT()	sx_assert(&bpf_sx, SA_XLOCKED)
218 /*
219  * bpf_iflist is a list of BPF interface structures, each corresponding to a
220  * specific DLT. The same network interface might have several BPF interface
221  * structures registered by different layers in the stack (e.g., 802.11
222  * frames, Ethernet frames, etc.).
223  */
224 VNET_DEFINE_STATIC(LIST_HEAD(, bpf_if), bpf_iflist) = LIST_HEAD_INITIALIZER();
225 #define	V_bpf_iflist	VNET(bpf_iflist)
226 static struct sx	bpf_sx;		/* bpf global lock */
227 
228 static void	bpfif_ref(struct bpf_if *);
229 static void	bpfif_rele(struct bpf_if *);
230 
231 static void	bpfd_ref(struct bpf_d *);
232 static void	bpfd_rele(struct bpf_d *);
233 static int	bpf_attachd(struct bpf_d *d, struct bpf_if *);
234 static void	bpf_detachd(struct bpf_d *, bool);
235 static void	bpfd_free(epoch_context_t);
236 static void	bpf_timed_out(void *);
237 static __inline void
238 		bpf_wakeup(struct bpf_d *);
239 static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
240 		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
241 		    struct bintime *);
242 static void	reset_d(struct bpf_d *);
243 static int	bpf_getiflist(struct bpf_iflist *);
244 static int	bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
245 static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
246 static int	bpf_setdlt(struct bpf_d *, u_int);
247 static void	filt_bpfdetach(struct knote *);
248 static int	filt_bpfread(struct knote *, long);
249 static int	filt_bpfwrite(struct knote *, long);
250 static void	bpf_drvinit(void *);
251 static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
252 
253 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
254     "bpf sysctl");
255 int bpf_maxinsns = BPF_MAXINSNS;
256 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
257     &bpf_maxinsns, 0, "Maximum bpf program instructions");
258 static int bpf_zerocopy_enable = 0;
259 SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
260     &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
261 static SYSCTL_NODE(_net_bpf, OID_AUTO, stats,
262     CTLFLAG_VNET | CTLFLAG_MPSAFE | CTLFLAG_RW,
263     bpf_stats_sysctl, "bpf statistics portal");
264 
265 VNET_DEFINE_STATIC(int, bpf_optimize_writers) = 0;
266 #define	V_bpf_optimize_writers VNET(bpf_optimize_writers)
267 SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RWTUN,
268     &VNET_NAME(bpf_optimize_writers), 0,
269     "Do not send packets until BPF program is set");
270 
271 static	d_open_t	bpfopen;
272 static	d_read_t	bpfread;
273 static	d_write_t	bpfwrite;
274 static	d_ioctl_t	bpfioctl;
275 static	d_poll_t	bpfpoll;
276 static	d_kqfilter_t	bpfkqfilter;
277 
278 static struct cdevsw bpf_cdevsw = {
279 	.d_version =	D_VERSION,
280 	.d_open =	bpfopen,
281 	.d_read =	bpfread,
282 	.d_write =	bpfwrite,
283 	.d_ioctl =	bpfioctl,
284 	.d_poll =	bpfpoll,
285 	.d_name =	"bpf",
286 	.d_kqfilter =	bpfkqfilter,
287 };
288 
289 static const struct filterops bpfread_filtops = {
290 	.f_isfd = 1,
291 	.f_detach = filt_bpfdetach,
292 	.f_event = filt_bpfread,
293 	.f_copy = knote_triv_copy,
294 };
295 
296 static const struct filterops bpfwrite_filtops = {
297 	.f_isfd = 1,
298 	.f_detach = filt_bpfdetach,
299 	.f_event = filt_bpfwrite,
300 	.f_copy = knote_triv_copy,
301 };
302 
303 /*
304  * LOCKING MODEL USED BY BPF
305  *
306  * Locks:
307  * 1) Global lock (BPF_LOCK).  An sx lock used to protect some global
308  * counters, all bpf_iflist changes, and ioctl access to descriptors.
309  * 2) Descriptor lock.  A mutex used to protect the BPF buffers and
310  * various structure fields used by the bpf_*tap* code.
311  *
312  * Lock order: global lock, then descriptor lock.
313  *
314  * There are several possible consumers:
315  *
316  * 1. The kernel registers an interface pointer with bpfattach().
317  * Each call allocates a new bpf_if structure, references the ifnet
318  * pointer and links the bpf_if into the bpf_iflist chain.  This is
319  * protected with the global lock.
320  *
321  * 2. A userland application issues ioctl() calls on a bpf_d descriptor.
322  * All such calls are serialized with the global lock.  BPF filters can
323  * be changed, but the pointer to the old filter is freed using
324  * NET_EPOCH_CALL(), so it is safe for the bpf_tap/bpf_mtap* code to
325  * access filter pointers even if a change happens during bpf_tap()
326  * execution.  A bpf_d descriptor is likewise freed via NET_EPOCH_CALL().
327  *
328  * 3. A userland application can write packets into a bpf_d descriptor.
329  * Here we need to be sure that the ifnet won't disappear during bpfwrite().
330  *
331  * 4. The kernel invokes the bpf_tap/bpf_mtap* functions.  Access to
332  * bif_dlist is protected by a net_epoch_preempt section, so it is safe
333  * to access bpf_d descriptors inside the section.
334  *
335  * 5. The kernel invokes bpfdetach() on interface destruction.  All lists
336  * are modified with the global lock held, and the actual free() is done
337  * using NET_EPOCH_CALL().
338  */
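
/*
 * A minimal sketch of the reader side of this model (illustrative only):
 * the tap path iterates bif_dlist inside an epoch section and takes the
 * descriptor lock only to touch the buffers:
 *
 *	struct epoch_tracker et;
 *	struct bpf_d *d;
 *
 *	NET_EPOCH_ENTER(et);
 *	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 *		BPFD_LOCK(d);
 *		... run d->bd_rfilter, catchpacket() on a match ...
 *		BPFD_UNLOCK(d);
 *	}
 *	NET_EPOCH_EXIT(et);
 *
 * Writers unlink entries under BPF_LOCK() and defer the free with
 * NET_EPOCH_CALL(), so readers never dereference freed memory.
 */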
339 
340 static void
341 bpfif_free(epoch_context_t ctx)
342 {
343 	struct bpf_if *bp;
344 
345 	bp = __containerof(ctx, struct bpf_if, epoch_ctx);
346 	free(bp, M_BPF);
347 }
348 
349 static void
350 bpfif_ref(struct bpf_if *bp)
351 {
352 
353 	refcount_acquire(&bp->bif_refcnt);
354 }
355 
356 static void
357 bpfif_rele(struct bpf_if *bp)
358 {
359 
360 	if (!refcount_release(&bp->bif_refcnt))
361 		return;
362 	NET_EPOCH_CALL(bpfif_free, &bp->epoch_ctx);
363 }
364 
365 static void
366 bpfd_ref(struct bpf_d *d)
367 {
368 
369 	refcount_acquire(&d->bd_refcnt);
370 }
371 
372 static void
373 bpfd_rele(struct bpf_d *d)
374 {
375 
376 	if (!refcount_release(&d->bd_refcnt))
377 		return;
378 	NET_EPOCH_CALL(bpfd_free, &d->epoch_ctx);
379 }
380 
381 static struct bpf_program_buffer*
382 bpf_program_buffer_alloc(size_t size, int flags)
383 {
384 
385 	return (malloc(sizeof(struct bpf_program_buffer) + size,
386 	    M_BPF, flags));
387 }
388 
389 static void
390 bpf_program_buffer_free(epoch_context_t ctx)
391 {
392 	struct bpf_program_buffer *ptr;
393 
394 	ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx);
395 #ifdef BPF_JITTER
396 	if (ptr->func != NULL)
397 		bpf_destroy_jit_filter(ptr->func);
398 #endif
399 	free(ptr, M_BPF);
400 }
401 
402 /*
403  * Wrapper functions for various buffering methods.  If the set of buffer
404  * modes expands, we will probably want to introduce a switch data structure
405  * similar to protosw, etc.
406  */
407 static void
408 bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
409     u_int len)
410 {
411 
412 	BPFD_LOCK_ASSERT(d);
413 
414 	switch (d->bd_bufmode) {
415 	case BPF_BUFMODE_BUFFER:
416 		return (bpf_buffer_append_bytes(d, buf, offset, src, len));
417 
418 	case BPF_BUFMODE_ZBUF:
419 		counter_u64_add(d->bd_zcopy, 1);
420 		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
421 
422 	default:
423 		panic("bpf_buf_append_bytes");
424 	}
425 }
426 
427 static void
428 bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
429     u_int len)
430 {
431 
432 	BPFD_LOCK_ASSERT(d);
433 
434 	switch (d->bd_bufmode) {
435 	case BPF_BUFMODE_BUFFER:
436 		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
437 
438 	case BPF_BUFMODE_ZBUF:
439 		counter_u64_add(d->bd_zcopy, 1);
440 		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
441 
442 	default:
443 		panic("bpf_buf_append_mbuf");
444 	}
445 }
446 
447 /*
448  * This function gets called when the free buffer is re-assigned.
449  */
450 static void
451 bpf_buf_reclaimed(struct bpf_d *d)
452 {
453 
454 	BPFD_LOCK_ASSERT(d);
455 
456 	switch (d->bd_bufmode) {
457 	case BPF_BUFMODE_BUFFER:
458 		return;
459 
460 	case BPF_BUFMODE_ZBUF:
461 		bpf_zerocopy_buf_reclaimed(d);
462 		return;
463 
464 	default:
465 		panic("bpf_buf_reclaimed");
466 	}
467 }
468 
469 /*
470  * If the buffer mechanism has a way to decide that a held buffer can be made
471  * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
472  * returned if the buffer can be discarded, (0) is returned if it cannot.
473  */
474 static int
475 bpf_canfreebuf(struct bpf_d *d)
476 {
477 
478 	BPFD_LOCK_ASSERT(d);
479 
480 	switch (d->bd_bufmode) {
481 	case BPF_BUFMODE_ZBUF:
482 		return (bpf_zerocopy_canfreebuf(d));
483 	}
484 	return (0);
485 }
486 
487 /*
488  * Allow the buffer model to indicate that the current store buffer is
489  * immutable, regardless of the appearance of space.  Return (1) if the
490  * buffer is writable, and (0) if not.
491  */
492 static int
493 bpf_canwritebuf(struct bpf_d *d)
494 {
495 	BPFD_LOCK_ASSERT(d);
496 
497 	switch (d->bd_bufmode) {
498 	case BPF_BUFMODE_ZBUF:
499 		return (bpf_zerocopy_canwritebuf(d));
500 	}
501 	return (1);
502 }
503 
504 /*
505  * Notify buffer model that an attempt to write to the store buffer has
506  * resulted in a dropped packet, in which case the buffer may be considered
507  * full.
508  */
509 static void
510 bpf_buffull(struct bpf_d *d)
511 {
512 
513 	BPFD_LOCK_ASSERT(d);
514 
515 	switch (d->bd_bufmode) {
516 	case BPF_BUFMODE_ZBUF:
517 		bpf_zerocopy_buffull(d);
518 		break;
519 	}
520 }
521 
522 /*
523  * Notify the buffer model that a buffer has moved into the hold position.
524  */
525 void
526 bpf_bufheld(struct bpf_d *d)
527 {
528 
529 	BPFD_LOCK_ASSERT(d);
530 
531 	switch (d->bd_bufmode) {
532 	case BPF_BUFMODE_ZBUF:
533 		bpf_zerocopy_bufheld(d);
534 		break;
535 	}
536 }
537 
538 static void
539 bpf_free(struct bpf_d *d)
540 {
541 
542 	switch (d->bd_bufmode) {
543 	case BPF_BUFMODE_BUFFER:
544 		return (bpf_buffer_free(d));
545 
546 	case BPF_BUFMODE_ZBUF:
547 		return (bpf_zerocopy_free(d));
548 
549 	default:
550 		panic("bpf_buf_free");
551 	}
552 }
553 
554 static int
555 bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
556 {
557 
558 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
559 		return (EOPNOTSUPP);
560 	return (bpf_buffer_uiomove(d, buf, len, uio));
561 }
562 
563 static int
564 bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
565 {
566 
567 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
568 		return (EOPNOTSUPP);
569 	return (bpf_buffer_ioctl_sblen(d, i));
570 }
571 
572 static int
573 bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
574 {
575 
576 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
577 		return (EOPNOTSUPP);
578 	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
579 }
580 
581 static int
582 bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
583 {
584 
585 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
586 		return (EOPNOTSUPP);
587 	return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
588 }
589 
590 static int
591 bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
592 {
593 
594 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
595 		return (EOPNOTSUPP);
596 	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
597 }
598 
599 /*
600  * Check if we need to upgrade our descriptor @d from write-only mode.
601  */
602 static int
603 bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode,
604     int flen)
605 {
606 	int is_snap, need_upgrade;
607 
608 	/*
609 	 * Check if we've already upgraded or the new filter is empty.
610 	 */
611 	if (d->bd_writer == 0 || fcode == NULL)
612 		return (0);
613 
614 	need_upgrade = 0;
615 
616 	/*
617 	 * Check if cmd looks like snaplen setting from
618 	 * pcap_bpf.c:pcap_open_live().
619 	 * Note we're not checking the .k value here: while
620 	 * pcap_open_live() definitely sets it to a non-zero value,
621 	 * we'd prefer to treat the k=0 (deny ALL) case the same way,
622 	 * i.e. not consider upgrading immediately.
623 	 */
624 	if (cmd == BIOCSETF && flen == 1 &&
625 	    fcode[0].code == (BPF_RET | BPF_K))
626 		is_snap = 1;
627 	else
628 		is_snap = 0;
629 
630 	if (is_snap == 0) {
631 		/*
632 		 * We're setting the first filter and it doesn't look
633 		 * like a snaplen setting.  We're probably using bpf directly.
634 		 * Upgrade immediately.
635 		 */
636 		need_upgrade = 1;
637 	} else {
638 		/*
639 		 * Do not require an upgrade on the first BIOCSETF
640 		 * (used to set the snaplen) issued by pcap_open_live().
641 		 */
642 
643 		if (--d->bd_writer == 0) {
644 			/*
645 			 * The first snaplen filter has already
646 			 * been set.  This is probably a catch-all
647 			 * filter.
648 			 */
649 			need_upgrade = 1;
650 		}
651 	}
652 
653 	CTR5(KTR_NET,
654 	    "%s: filter function set by pid %d, "
655 	    "bd_writer counter %d, snap %d upgrade %d",
656 	    __func__, d->bd_pid, d->bd_writer,
657 	    is_snap, need_upgrade);
658 
659 	return (need_upgrade);
660 }
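
/*
 * For reference, a hedged sketch (mirroring typical libpcap behavior, not
 * quoting it): the pcap_open_live() snaplen program that the heuristic
 * above matches is a single "accept snaplen bytes" instruction:
 *
 *	struct bpf_insn snap = BPF_STMT(BPF_RET | BPF_K, snaplen);
 *	struct bpf_program fp = { 1, &snap };
 *
 *	ioctl(fd, BIOCSETF, &fp);
 *
 * Any other first program is treated as a real filter and upgrades the
 * descriptor out of writer-only mode immediately.
 */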
661 
662 /*
663  * Detach a file from its interface.
664  */
665 static void
666 bpf_detachd(struct bpf_d *d, bool detached_ifp)
667 {
668 	struct bpf_if *bp;
669 	bool writer;
670 
671 	BPF_LOCK_ASSERT();
672 	CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
673 
674 	/* Check if descriptor is attached */
675 	if ((bp = d->bd_bif) == NULL)
676 		return;
677 
678 	BPFD_LOCK(d);
679 	CK_LIST_REMOVE(d, bd_next);
680 	writer = (d->bd_writer > 0);
681 	if (detached_ifp) {
682 		d->bd_bif = NULL;
683 		/*
684 		 * Notify the descriptor that it's detached, so that any
685 		 * sleepers wake up and get ENXIO.
686 		 */
687 		bpf_wakeup(d);
688 	}
689 	BPFD_UNLOCK(d);
690 
691 	if (!writer)
692 		bif_detachd(bp);
693 
694 	if (d->bd_promisc && !detached_ifp) {
695 		d->bd_promisc = 0;
696 		(void)bif_promisc(bp, false);
697 	}
698 
699 	bpfif_rele(bp);
700 }
701 
702 /*
703  * Close the descriptor by detaching it from its interface,
704  * deallocating its buffers, and marking it free.
705  */
706 static void
707 bpf_dtor(void *data)
708 {
709 	struct bpf_d *d = data;
710 
711 	BPFD_LOCK(d);
712 	if (d->bd_state == BPF_WAITING)
713 		callout_stop(&d->bd_callout);
714 	d->bd_state = BPF_IDLE;
715 	BPFD_UNLOCK(d);
716 	funsetown(&d->bd_sigio);
717 	BPF_LOCK();
718 	bpf_detachd(d, false);
719 	BPF_UNLOCK();
720 #ifdef MAC
721 	mac_bpfdesc_destroy(d);
722 #endif /* MAC */
723 	seldrain(&d->bd_sel);
724 	knlist_destroy(&d->bd_sel.si_note);
725 	callout_drain(&d->bd_callout);
726 	bpfd_rele(d);
727 }
728 
729 /*
730  * Open the bpf device.  Each open() gets its own descriptor, allocated
731  * here and attached to the file via devfs_set_cdevpriv().
732  */
733 /* ARGSUSED */
734 static	int
735 bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
736 {
737 	struct bpf_d *d;
738 	int error;
739 
740 	d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
741 	error = devfs_set_cdevpriv(d, bpf_dtor);
742 	if (error != 0) {
743 		free(d, M_BPF);
744 		return (error);
745 	}
746 
747 	/* Setup counters */
748 	d->bd_rcount = counter_u64_alloc(M_WAITOK);
749 	d->bd_dcount = counter_u64_alloc(M_WAITOK);
750 	d->bd_fcount = counter_u64_alloc(M_WAITOK);
751 	d->bd_wcount = counter_u64_alloc(M_WAITOK);
752 	d->bd_wfcount = counter_u64_alloc(M_WAITOK);
753 	d->bd_wdcount = counter_u64_alloc(M_WAITOK);
754 	d->bd_zcopy = counter_u64_alloc(M_WAITOK);
755 
756 	/*
757 	 * For historical reasons, perform a one-time initialization call to
758 	 * the buffer routines, even though we're not yet committed to a
759 	 * particular buffer method.
760 	 */
761 	bpf_buffer_init(d);
762 	if ((flags & FREAD) == 0)
763 		d->bd_writer = 2;
764 	d->bd_bufmode = BPF_BUFMODE_BUFFER;
765 	d->bd_sig = SIGIO;
766 	d->bd_direction = BPF_D_INOUT;
767 	refcount_init(&d->bd_refcnt, 1);
768 	BPF_PID_REFRESH(d, td);
769 #ifdef MAC
770 	mac_bpfdesc_init(d);
771 	mac_bpfdesc_create(td->td_ucred, d);
772 #endif
773 	mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF);
774 	callout_init_mtx(&d->bd_callout, &d->bd_lock, 0);
775 	knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock);
776 
777 	/* Disable VLAN pcp tagging. */
778 	d->bd_pcp = 0;
779 
780 	return (0);
781 }
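
/*
 * Usage sketch (userland; the interface name is hypothetical): each
 * open() of the device yields a private descriptor, which is then bound
 * to a tap point:
 *
 *	int fd = open("/dev/bpf", O_RDWR);
 *	struct ifreq ifr;
 *
 *	strlcpy(ifr.ifr_name, "em0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);
 *
 * A descriptor opened without FREAD starts with bd_writer = 2, i.e. on
 * the writer-only list until a real filter is set (see
 * bpf_check_upgrade() above).
 */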
782 
783 /*
784  *  bpfread - read next chunk of packets from buffers
785  */
786 static	int
787 bpfread(struct cdev *dev, struct uio *uio, int ioflag)
788 {
789 	struct bpf_d *d;
790 	int error;
791 	int non_block;
792 	int timed_out;
793 
794 	error = devfs_get_cdevpriv((void **)&d);
795 	if (error != 0)
796 		return (error);
797 
798 	/*
799 	 * Restrict the application to use a buffer the same size
800 	 * as the kernel buffers.
801 	 */
802 	if (uio->uio_resid != d->bd_bufsize)
803 		return (EINVAL);
804 
805 	non_block = ((ioflag & O_NONBLOCK) != 0);
806 
807 	BPFD_LOCK(d);
808 	BPF_PID_REFRESH_CUR(d);
809 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
810 		BPFD_UNLOCK(d);
811 		return (EOPNOTSUPP);
812 	}
813 	if (d->bd_state == BPF_WAITING)
814 		callout_stop(&d->bd_callout);
815 	timed_out = (d->bd_state == BPF_TIMED_OUT);
816 	d->bd_state = BPF_IDLE;
817 	while (d->bd_flags & BPFD_HBUF_INUSE) {
818 		error = mtx_sleep(&d->bd_hbuf, &d->bd_lock, PRINET | PCATCH,
819 		    "bd_hbuf", 0);
820 		if (error != 0) {
821 			BPFD_UNLOCK(d);
822 			return (error);
823 		}
824 	}
825 	/*
826 	 * If the hold buffer is empty, then do a timed sleep, which
827 	 * ends when the timeout expires or when enough packets
828 	 * have arrived to fill the store buffer.
829 	 */
830 	while (d->bd_hbuf == NULL) {
831 		if (d->bd_slen != 0) {
832 			/*
833 			 * A packet(s) either arrived since the previous
834 			 * read or arrived while we were asleep.
835 			 */
836 			if ((d->bd_flags & BPFD_IMMEDIATE) || non_block ||
837 			    timed_out) {
838 				/*
839 				 * Rotate the buffers and return what's here
840 				 * if we are in immediate mode, non-blocking
841 				 * flag is set, or this descriptor timed out.
842 				 */
843 				ROTATE_BUFFERS(d);
844 				break;
845 			}
846 		}
847 
848 		/*
849 		 * No data is available, check to see if the bpf device
850 		 * is still pointed at a real interface.  If not, return
851 		 * ENXIO so that the userland process knows to rebind
852 		 * it before using it again.
853 		 */
854 		if (d->bd_bif == NULL) {
855 			BPFD_UNLOCK(d);
856 			return (ENXIO);
857 		}
858 
859 		if (non_block) {
860 			BPFD_UNLOCK(d);
861 			return (EWOULDBLOCK);
862 		}
863 		error = msleep(d, &d->bd_lock, PRINET | PCATCH,
864 		     "bpf", d->bd_rtout);
865 		if (error == EINTR || error == ERESTART) {
866 			BPFD_UNLOCK(d);
867 			return (error);
868 		}
869 		if (error == EWOULDBLOCK) {
870 			/*
871 			 * On a timeout, return what's in the buffer,
872 			 * which may be nothing.  If there is something
873 			 * in the store buffer, we can rotate the buffers.
874 			 */
875 			if (d->bd_hbuf)
876 				/*
877 				 * We filled up the buffer in between
878 				 * getting the timeout and arriving
879 				 * here, so we don't need to rotate.
880 				 */
881 				break;
882 
883 			if (d->bd_slen == 0) {
884 				BPFD_UNLOCK(d);
885 				return (0);
886 			}
887 			ROTATE_BUFFERS(d);
888 			break;
889 		}
890 	}
891 	/*
892 	 * At this point, we know we have something in the hold slot.
893 	 */
894 	d->bd_flags |= BPFD_HBUF_INUSE;
895 	BPFD_UNLOCK(d);
896 
897 	/*
898 	 * Move data from hold buffer into user space.
899 	 * We know the entire buffer is transferred since
900 	 * we checked above that the read buffer is bd_bufsize bytes.
901 	 *
902 	 * We do not have to worry about simultaneous reads because
903 	 * we waited for sole access to the hold buffer above.
904 	 */
905 	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
906 
907 	BPFD_LOCK(d);
908 	if (d->bd_flags & BPFD_HBUF_INUSE) {
909 		KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf"));
910 		d->bd_fbuf = d->bd_hbuf;
911 		d->bd_hbuf = NULL;
912 		d->bd_hlen = 0;
913 		bpf_buf_reclaimed(d);
914 		d->bd_flags &= ~BPFD_HBUF_INUSE;
915 		wakeup(&d->bd_hbuf);
916 	}
917 	BPFD_UNLOCK(d);
918 
919 	return (error);
920 }
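
/*
 * Usage sketch (userland): the read buffer must match the kernel buffer
 * size exactly, and a single read() may return several packets, each
 * prefixed by a word-aligned struct bpf_hdr:
 *
 *	u_int blen;
 *	ioctl(fd, BIOCGBLEN, &blen);
 *	char *buf = malloc(blen);
 *	ssize_t n = read(fd, buf, blen);
 *	for (char *p = buf; p < buf + n; ) {
 *		struct bpf_hdr *bh = (struct bpf_hdr *)p;
 *		... packet bytes start at p + bh->bh_hdrlen,
 *		    bh->bh_caplen of them ...
 *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
 *	}
 */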
921 
922 /*
923  * If there are processes sleeping on this descriptor, wake them up.
924  */
925 static __inline void
926 bpf_wakeup(struct bpf_d *d)
927 {
928 
929 	BPFD_LOCK_ASSERT(d);
930 	if (d->bd_state == BPF_WAITING) {
931 		callout_stop(&d->bd_callout);
932 		d->bd_state = BPF_IDLE;
933 	}
934 	wakeup(d);
935 	if ((d->bd_flags & BPFD_ASYNC) && d->bd_sig && d->bd_sigio)
936 		pgsigio(&d->bd_sigio, d->bd_sig, 0);
937 
938 	selwakeuppri(&d->bd_sel, PRINET);
939 	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
940 }
941 
942 static void
943 bpf_timed_out(void *arg)
944 {
945 	struct bpf_d *d = (struct bpf_d *)arg;
946 
947 	BPFD_LOCK_ASSERT(d);
948 
949 	if (callout_pending(&d->bd_callout) ||
950 	    !callout_active(&d->bd_callout))
951 		return;
952 	if (d->bd_state == BPF_WAITING) {
953 		d->bd_state = BPF_TIMED_OUT;
954 		if (d->bd_slen != 0)
955 			bpf_wakeup(d);
956 	}
957 }
958 
959 static int
960 bpf_ready(struct bpf_d *d)
961 {
962 
963 	BPFD_LOCK_ASSERT(d);
964 
965 	if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
966 		return (1);
967 	if (((d->bd_flags & BPFD_IMMEDIATE) || d->bd_state == BPF_TIMED_OUT) &&
968 	    d->bd_slen != 0)
969 		return (1);
970 	return (0);
971 }
972 
973 static int
974 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
975 {
976 	struct epoch_tracker et;
977 	struct bpf_if *bp;
978 	struct bpf_d *d;
979 	struct mbuf *m, *mc;
980 	ssize_t len;
981 	int error;
982 
983 	error = devfs_get_cdevpriv((void **)&d);
984 	if (error != 0)
985 		return (error);
986 
987 	if (uio->uio_resid == 0)
988 		return (0);
989 
990 	BPFD_LOCK(d);
991 	if ((bp = d->bd_bif) == NULL)
992 		error = ENXIO;
993 	else if (bp->bif_methods->bif_write == NULL)
994 		error = EOPNOTSUPP;
995 	if (error) {
996 		BPFD_UNLOCK(d);
997 		counter_u64_add(d->bd_wdcount, 1);
998 		return (error);
999 	}
1000 	bpfd_ref(d);
1001 	BPFD_UNLOCK(d);
1002 
1003 	len = uio->uio_resid;
1004 	/* Allocate an mbuf, up to MJUM16BYTES bytes, for our write. */
1005 	m = m_get3(len, M_WAITOK, MT_DATA, M_PKTHDR);
1006 	if (m == NULL) {
1007 		error = ENOMEM;
1008 		goto fail_wref;
1009 	}
1010 	m->m_pkthdr.len = m->m_len = len;
1011 
1012 	error = uiomove(mtod(m, u_char *), len, uio);
1013 	if (error)
1014 		goto fail_wref;
1015 
1016 	if (bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len) == 0) {
1017 		error = EPERM;
1018 		goto fail_wref;
1019 	}
1020 
1021 	if (d->bd_flags & BPFD_FEEDBACK) {
1022 		mc = m_dup(m, M_WAITOK);
1023 		/* Set M_PROMISC for outgoing packets to be discarded. */
1024 		if (d->bd_direction == BPF_D_INOUT)
1025 			m->m_flags |= M_PROMISC;
1026 	} else
1027 		mc = NULL;
1028 
1029 	/* XXXGL: should belong to bpf_ifnet.c */
1030 	if (d->bd_pcp != 0)
1031 		(void)vlan_set_pcp(m, d->bd_pcp);
1032 
1033 	BPFD_LOCK(d);
1034 #ifdef MAC
1035 	mac_bpfdesc_create_mbuf(d, m);
1036 	if (mc != NULL)
1037 		mac_bpfdesc_create_mbuf(d, mc);
1038 #endif
1039 	/*
1040 	 * Check that the descriptor is still attached to the interface.
1041 	 * It may not be, after bpfdetach() or if another thread did BIOCSDLT.
1042 	 */
1043 	if (__predict_false(d->bd_bif != bp)) {
1044 		BPFD_UNLOCK(d);
1045 		m_freem(mc);
1046 		error = ENXIO;
1047 		goto fail_wref;
1048 	}
1049 	BPFD_UNLOCK(d);
1050 
1051 	NET_EPOCH_ENTER(et);
1052 	error = bp->bif_methods->bif_write(bp->bif_softc, m, mc, d->bd_flags);
1053 	NET_EPOCH_EXIT(et);
1054 	if (error)
1055 		counter_u64_add(d->bd_wdcount, 1);
1056 	else
1057 		counter_u64_add(d->bd_wfcount, 1);
1058 	bpfd_rele(d);
1059 
1060 	return (error);
1061 
1062 fail_wref:
1063 	counter_u64_add(d->bd_wdcount, 1);
1064 	bpfd_rele(d);
1065 	m_freem(m);
1066 	return (error);
1067 }
1068 
1069 /*
1070  * Reset a descriptor by flushing its packet buffer and clearing the receive
1071  * and drop counts.  This is doable for kernel-only buffers, but with
1072  * zero-copy buffers, we can't write to (or rotate) buffers that are
1073  * currently owned by userspace.  It would be nice if we could encapsulate
1074  * this logic in the buffer code rather than here.
1075  */
1076 static void
1077 reset_d(struct bpf_d *d)
1078 {
1079 
1080 	BPFD_LOCK_ASSERT(d);
1081 
1082 	while (d->bd_flags & BPFD_HBUF_INUSE)
1083 		mtx_sleep(&d->bd_hbuf, &d->bd_lock, PRINET, "bd_hbuf", 0);
1084 	if ((d->bd_hbuf != NULL) &&
1085 	    (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) {
1086 		/* Free the hold buffer. */
1087 		d->bd_fbuf = d->bd_hbuf;
1088 		d->bd_hbuf = NULL;
1089 		d->bd_hlen = 0;
1090 		bpf_buf_reclaimed(d);
1091 	}
1092 	if (bpf_canwritebuf(d))
1093 		d->bd_slen = 0;
1094 	counter_u64_zero(d->bd_rcount);
1095 	counter_u64_zero(d->bd_dcount);
1096 	counter_u64_zero(d->bd_fcount);
1097 	counter_u64_zero(d->bd_wcount);
1098 	counter_u64_zero(d->bd_wfcount);
1099 	counter_u64_zero(d->bd_wdcount);
1100 	counter_u64_zero(d->bd_zcopy);
1101 }
1102 
1103 /*
1104  *  FIONREAD		Check for read packet available.
1105  *  BIOCGETIFLIST	Get list of all tap points.
1106  *  BIOCGBLEN		Get buffer len [for read()].
1107  *  BIOCSETF		Set read filter.
1108  *  BIOCSETFNR		Set read filter without resetting descriptor.
1109  *  BIOCSETWF		Set write filter.
1110  *  BIOCFLUSH		Flush read packet buffer.
1111  *  BIOCPROMISC		Put interface into promiscuous mode.
1112  *  BIOCGDLT		Get link layer type.
1113  *  BIOCGETIF		Get interface name.
1114  *  BIOCSETIF		Set interface.
1115  *  BIOCSRTIMEOUT	Set read timeout.
1116  *  BIOCGRTIMEOUT	Get read timeout.
1117  *  BIOCGSTATS		Get packet stats.
1118  *  BIOCIMMEDIATE	Set immediate mode.
1119  *  BIOCVERSION		Get filter language version.
1120  *  BIOCGHDRCMPLT	Get "header already complete" flag.
1121  *  BIOCSHDRCMPLT	Set "header already complete" flag.
1122  *  BIOCGDIRECTION	Get packet direction flag.
1123  *  BIOCSDIRECTION	Set packet direction flag.
1124  *  BIOCGTSTAMP		Get time stamp format and resolution.
1125  *  BIOCSTSTAMP		Set time stamp format and resolution.
1126  *  BIOCLOCK		Set "locked" flag.
1127  *  BIOCFEEDBACK	Set packet feedback mode.
1128  *  BIOCSETZBUF		Set current zero-copy buffer locations.
1129  *  BIOCGETZMAX		Get maximum zero-copy buffer size.
1130  *  BIOCROTZBUF		Force rotation of zero-copy buffer.
1131  *  BIOCSETBUFMODE	Set buffer mode.
1132  *  BIOCGETBUFMODE	Get current buffer mode.
1133  *  BIOCSETVLANPCP	Set VLAN PCP tag.
1134  */
1135 /* ARGSUSED */
1136 static	int
1137 bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
1138     struct thread *td)
1139 {
1140 	struct bpf_d *d;
1141 	int error;
1142 
1143 	error = devfs_get_cdevpriv((void **)&d);
1144 	if (error != 0)
1145 		return (error);
1146 
1147 	/*
1148 	 * Refresh PID associated with this descriptor.
1149 	 */
1150 	BPFD_LOCK(d);
1151 	BPF_PID_REFRESH(d, td);
1152 	if (d->bd_state == BPF_WAITING)
1153 		callout_stop(&d->bd_callout);
1154 	d->bd_state = BPF_IDLE;
1155 	BPFD_UNLOCK(d);
1156 
1157 	if (d->bd_flags & BPFD_LOCKED) {
1158 		switch (cmd) {
1159 		case BIOCGETIFLIST:
1160 		case BIOCGBLEN:
1161 		case BIOCFLUSH:
1162 		case BIOCGDLT:
1163 		case BIOCGDLTLIST:
1164 #ifdef COMPAT_FREEBSD32
1165 		case BIOCGDLTLIST32:
1166 #endif
1167 		case BIOCGETIF:
1168 		case BIOCGRTIMEOUT:
1169 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1170 		case BIOCGRTIMEOUT32:
1171 #endif
1172 		case BIOCGSTATS:
1173 		case BIOCVERSION:
1174 		case BIOCGRSIG:
1175 		case BIOCGHDRCMPLT:
1176 		case BIOCSTSTAMP:
1177 		case BIOCFEEDBACK:
1178 		case FIONREAD:
1179 		case BIOCLOCK:
1180 		case BIOCSRTIMEOUT:
1181 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1182 		case BIOCSRTIMEOUT32:
1183 #endif
1184 		case BIOCIMMEDIATE:
1185 		case TIOCGPGRP:
1186 		case BIOCROTZBUF:
1187 			break;
1188 		default:
1189 			return (EPERM);
1190 		}
1191 	}
1192 #ifdef COMPAT_FREEBSD32
1193 	/*
1194 	 * If we see a 32-bit compat ioctl, mark the stream as 32-bit so
1195 	 * that it will get 32-bit packet headers.
1196 	 */
1197 	switch (cmd) {
1198 	case BIOCSETF32:
1199 	case BIOCSETFNR32:
1200 	case BIOCSETWF32:
1201 	case BIOCGDLTLIST32:
1202 	case BIOCGRTIMEOUT32:
1203 	case BIOCSRTIMEOUT32:
1204 		if (SV_CURPROC_FLAG(SV_ILP32)) {
1205 			BPFD_LOCK(d);
1206 			d->bd_compat32 = 1;
1207 			BPFD_UNLOCK(d);
1208 		}
1209 	}
1210 #endif
1211 
1212 #if defined(COMPAT_FREEBSD32)
1213 	if (SV_CURPROC_FLAG(SV_ILP32)) {
1214 		/*
1215 		 * On platforms other than amd64, BIOC[GS]RTIMEOUT32 is equal to
1216 		 * BIOC[GS]RTIMEOUT.  Since this is difficult to handle in the
1217 		 * switch statement, map them here.
1218 		 */
1219 		if (cmd == BIOCSRTIMEOUT32)
1220 			cmd = BIOCSRTIMEOUT;
1221 		if (cmd == BIOCGRTIMEOUT32)
1222 			cmd = BIOCGRTIMEOUT;
1223 	}
1224 #endif
1225 	CURVNET_SET(TD_TO_VNET(td));
1226 	switch (cmd) {
1227 	default:
1228 		error = EINVAL;
1229 		break;
1230 
1231 	/*
1232 	 * Check for read packet available.
1233 	 */
1234 	case FIONREAD:
1235 		{
1236 			int n;
1237 
1238 			BPFD_LOCK(d);
1239 			n = d->bd_slen;
1240 			while (d->bd_flags & BPFD_HBUF_INUSE)
1241 				mtx_sleep(&d->bd_hbuf, &d->bd_lock,
1242 				    PRINET, "bd_hbuf", 0);
1243 			if (d->bd_hbuf)
1244 				n += d->bd_hlen;
1245 			BPFD_UNLOCK(d);
1246 
1247 			*(int *)addr = n;
1248 			break;
1249 		}
1250 	/*
1251 	 * Get list of all tap points.
1252 	 */
1253 	case BIOCGETIFLIST:
1254 		error = bpf_getiflist((struct bpf_iflist *)addr);
1255 		break;
1256 
1257 	/*
1258 	 * Get buffer len [for read()].
1259 	 */
1260 	case BIOCGBLEN:
1261 		BPFD_LOCK(d);
1262 		*(u_int *)addr = d->bd_bufsize;
1263 		BPFD_UNLOCK(d);
1264 		break;
1265 
1266 	/*
1267 	 * Set buffer length.
1268 	 */
1269 	case BIOCSBLEN:
1270 		error = bpf_ioctl_sblen(d, (u_int *)addr);
1271 		break;
1272 
1273 	/*
1274 	 * Set link layer read filter.
1275 	 */
1276 	case BIOCSETF:
1277 	case BIOCSETFNR:
1278 	case BIOCSETWF:
1279 #ifdef COMPAT_FREEBSD32
1280 	case BIOCSETF32:
1281 	case BIOCSETFNR32:
1282 	case BIOCSETWF32:
1283 #endif
1284 		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
1285 		break;
1286 
1287 	/*
1288 	 * Flush read packet buffer.
1289 	 */
1290 	case BIOCFLUSH:
1291 		BPFD_LOCK(d);
1292 		reset_d(d);
1293 		BPFD_UNLOCK(d);
1294 		break;
1295 
1296 	/*
1297 	 * Put interface into promiscuous mode.
1298 	 */
1299 	case BIOCPROMISC:
1300 		BPF_LOCK();
1301 		if (d->bd_bif == NULL) {
1302 			/*
1303 			 * No interface attached yet.
1304 			 */
1305 			error = EINVAL;
1306 		} else if (d->bd_promisc == 0) {
1307 			struct bpf_if *bp = d->bd_bif;
1308 
1309 			if ((error = bif_promisc(bp, true)) == 0)
1310 				d->bd_promisc = 1;
1311 		}
1312 		BPF_UNLOCK();
1313 		break;
1314 
1315 	/*
1316 	 * Get current data link type.
1317 	 */
1318 	case BIOCGDLT:
1319 		BPF_LOCK();
1320 		if (d->bd_bif == NULL)
1321 			error = EINVAL;
1322 		else
1323 			*(u_int *)addr = d->bd_bif->bif_dlt;
1324 		BPF_UNLOCK();
1325 		break;
1326 
1327 	/*
1328 	 * Get a list of supported data link types.
1329 	 */
1330 #ifdef COMPAT_FREEBSD32
1331 	case BIOCGDLTLIST32:
1332 		{
1333 			struct bpf_dltlist32 *list32;
1334 			struct bpf_dltlist dltlist;
1335 
1336 			list32 = (struct bpf_dltlist32 *)addr;
1337 			dltlist.bfl_len = list32->bfl_len;
1338 			dltlist.bfl_list = PTRIN(list32->bfl_list);
1339 			BPF_LOCK();
1340 			if (d->bd_bif == NULL)
1341 				error = EINVAL;
1342 			else {
1343 				error = bpf_getdltlist(d, &dltlist);
1344 				if (error == 0)
1345 					list32->bfl_len = dltlist.bfl_len;
1346 			}
1347 			BPF_UNLOCK();
1348 			break;
1349 		}
1350 #endif
1351 
1352 	case BIOCGDLTLIST:
1353 		BPF_LOCK();
1354 		if (d->bd_bif == NULL)
1355 			error = EINVAL;
1356 		else
1357 			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
1358 		BPF_UNLOCK();
1359 		break;
1360 
1361 	/*
1362 	 * Set data link type.
1363 	 */
1364 	case BIOCSDLT:
1365 		BPF_LOCK();
1366 		if (d->bd_bif == NULL)
1367 			error = EINVAL;
1368 		else
1369 			error = bpf_setdlt(d, *(u_int *)addr);
1370 		BPF_UNLOCK();
1371 		break;
1372 
1373 	/*
1374 	 * Get interface name.
1375 	 */
1376 	case BIOCGETIF:
1377 		BPF_LOCK();
1378 		if (d->bd_bif == NULL)
1379 			error = EINVAL;
1380 		else {
1381 			struct bpf_if *const bp = d->bd_bif;
1382 			struct ifreq *const ifr = (struct ifreq *)addr;
1383 
1384 			strlcpy(ifr->ifr_name, bp->bif_name,
1385 			    sizeof(ifr->ifr_name));
1386 		}
1387 		BPF_UNLOCK();
1388 		break;
1389 
1390 	/*
1391 	 * Set interface.
1392 	 */
1393 	case BIOCSETIF: {
1394 		struct ifreq *const ifr = (struct ifreq *)addr;
1395 		struct bpf_if *bp;
1396 
1397 		/*
1398 		 * Behavior here depends on the buffering model.  If we're
1399 		 * using kernel memory buffers, then we can allocate them here.
1400 		 * If we're using zero-copy, then the user process must have
1401 		 * registered buffers by the time we get here.
1402 		 */
1403 		BPFD_LOCK(d);
1404 		if (d->bd_bufmode == BPF_BUFMODE_BUFFER &&
1405 		    d->bd_sbuf == NULL) {
1406 			u_int size;
1407 
1408 			size = d->bd_bufsize;
1409 			BPFD_UNLOCK(d);
1410 			error = bpf_buffer_ioctl_sblen(d, &size);
1411 			if (error != 0)
1412 				break;
1413 		} else
1414 			BPFD_UNLOCK(d);
1415 		BPF_LOCK();
1416 		/*
1417 		 * Look through attached interfaces for the named one.
1418 		 */
1419 		LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
1420 			if (strncmp(ifr->ifr_name, bp->bif_name,
1421 			    sizeof(ifr->ifr_name)) == 0)
1422 				break;
1423 		}
1424 		if (bp != NULL)
1425 			error = bpf_attachd(d, bp);
1426 		else
1427 			error = ENXIO;
1428 		BPF_UNLOCK();
1429 		break;
1430 	}
1431 	/*
1432 	 * Set read timeout.
1433 	 */
1434 	case BIOCSRTIMEOUT:
1435 		{
1436 			struct timeval *tv = (struct timeval *)addr;
1437 #ifdef COMPAT_FREEBSD32
1438 			struct timeval32 *tv32;
1439 			struct timeval tv64;
1440 
1441 			if (SV_CURPROC_FLAG(SV_ILP32)) {
1442 				tv32 = (struct timeval32 *)addr;
1443 				tv = &tv64;
1444 				tv->tv_sec = tv32->tv_sec;
1445 				tv->tv_usec = tv32->tv_usec;
1446 			}
1447 #endif
1448 
1449 			/*
1450 			 * Subtract 1 tick from tvtohz() since this isn't
1451 			 * a one-shot timer.
1452 			 */
1453 			if ((error = itimerfix(tv)) == 0)
1454 				d->bd_rtout = tvtohz(tv) - 1;
1455 			break;
1456 		}
1457 
1458 	/*
1459 	 * Get read timeout.
1460 	 */
1461 	case BIOCGRTIMEOUT:
1462 		{
1463 			struct timeval *tv = (struct timeval *)addr;
1464 #ifdef COMPAT_FREEBSD32
1465 			struct timeval32 *tv32;
1466 			struct timeval tv64;
1467 
1468 			if (SV_CURPROC_FLAG(SV_ILP32))
1469 				tv = &tv64;
1470 #endif
1471 			tv->tv_sec = d->bd_rtout / hz;
1472 			tv->tv_usec = (d->bd_rtout % hz) * tick;
1473 #ifdef COMPAT_FREEBSD32
1474 			if (SV_CURPROC_FLAG(SV_ILP32)) {
1475 				tv32 = (struct timeval32 *)addr;
1476 				tv32->tv_sec = tv->tv_sec;
1477 				tv32->tv_usec = tv->tv_usec;
1478 			}
1479 #endif
1480 			break;
1481 		}
1482 
1483 	/*
1484 	 * Get packet stats.
1485 	 */
1486 	case BIOCGSTATS:
1487 		{
1488 			struct bpf_stat *bs = (struct bpf_stat *)addr;
1489 
1490 			/* XXXCSJP overflow */
1491 			bs->bs_recv = (u_int)counter_u64_fetch(d->bd_rcount);
1492 			bs->bs_drop = (u_int)counter_u64_fetch(d->bd_dcount);
1493 			break;
1494 		}
1495 
1496 	/*
1497 	 * Set immediate mode.
1498 	 */
1499 	case BIOCIMMEDIATE:
1500 		BPFD_LOCK(d);
1501 		d->bd_flags |= *(u_int *)addr ? BPFD_IMMEDIATE : 0;
1502 		BPFD_UNLOCK(d);
1503 		break;
1504 
1505 	case BIOCVERSION:
1506 		{
1507 			struct bpf_version *bv = (struct bpf_version *)addr;
1508 
1509 			bv->bv_major = BPF_MAJOR_VERSION;
1510 			bv->bv_minor = BPF_MINOR_VERSION;
1511 			break;
1512 		}
1513 
1514 	/*
1515 	 * Get "header already complete" flag
1516 	 */
1517 	case BIOCGHDRCMPLT:
1518 		BPFD_LOCK(d);
1519 		*(u_int *)addr = d->bd_flags & BPFD_HDRCMPLT ? 1 : 0;
1520 		BPFD_UNLOCK(d);
1521 		break;
1522 
1523 	/*
1524 	 * Set "header already complete" flag
1525 	 */
1526 	case BIOCSHDRCMPLT:
1527 		BPFD_LOCK(d);
1528 		d->bd_flags |= *(u_int *)addr ? BPFD_HDRCMPLT : 0;
1529 		BPFD_UNLOCK(d);
1530 		break;
1531 
1532 	/*
1533 	 * Get packet direction flag
1534 	 */
1535 	case BIOCGDIRECTION:
1536 		BPFD_LOCK(d);
1537 		*(u_int *)addr = d->bd_direction;
1538 		BPFD_UNLOCK(d);
1539 		break;
1540 
1541 	/*
1542 	 * Set packet direction flag
1543 	 */
1544 	case BIOCSDIRECTION:
1545 		{
1546 			u_int	direction;
1547 
1548 			direction = *(u_int *)addr;
1549 			switch (direction) {
1550 			case BPF_D_IN:
1551 			case BPF_D_INOUT:
1552 			case BPF_D_OUT:
1553 				BPFD_LOCK(d);
1554 				d->bd_direction = direction;
1555 				BPFD_UNLOCK(d);
1556 				break;
1557 			default:
1558 				error = EINVAL;
1559 			}
1560 		}
1561 		break;
1562 
1563 	/*
1564 	 * Get packet timestamp format and resolution.
1565 	 */
1566 	case BIOCGTSTAMP:
1567 		BPFD_LOCK(d);
1568 		*(u_int *)addr = d->bd_tstamp;
1569 		BPFD_UNLOCK(d);
1570 		break;
1571 
1572 	/*
1573 	 * Set packet timestamp format and resolution.
1574 	 */
1575 	case BIOCSTSTAMP:
1576 		{
1577 			u_int	func;
1578 
1579 			func = *(u_int *)addr;
1580 			if (BPF_T_VALID(func))
1581 				d->bd_tstamp = func;
1582 			else
1583 				error = EINVAL;
1584 		}
1585 		break;
1586 
1587 	case BIOCFEEDBACK:
1588 		BPFD_LOCK(d);
1589 		d->bd_flags |= *(u_int *)addr ? BPFD_FEEDBACK : 0;
1590 		BPFD_UNLOCK(d);
1591 		break;
1592 
1593 	case BIOCLOCK:
1594 		BPFD_LOCK(d);
1595 		d->bd_flags |= BPFD_LOCKED;
1596 		BPFD_UNLOCK(d);
1597 		break;
1598 
1599 	case FIONBIO:		/* Non-blocking I/O */
1600 		break;
1601 
1602 	case FIOASYNC:		/* Send signal on receive packets */
1603 		BPFD_LOCK(d);
1604 		d->bd_flags |= *(u_int *)addr ? BPFD_ASYNC : 0;
1605 		BPFD_UNLOCK(d);
1606 		break;
1607 
1608 	case FIOSETOWN:
1609 		/*
1610 		 * XXX: Add some sort of locking here?
1611 		 * fsetown() can sleep.
1612 		 */
1613 		error = fsetown(*(int *)addr, &d->bd_sigio);
1614 		break;
1615 
1616 	case FIOGETOWN:
1617 		BPFD_LOCK(d);
1618 		*(int *)addr = fgetown(&d->bd_sigio);
1619 		BPFD_UNLOCK(d);
1620 		break;
1621 
1622 	/* This is deprecated, FIOSETOWN should be used instead. */
1623 	case TIOCSPGRP:
1624 		error = fsetown(-(*(int *)addr), &d->bd_sigio);
1625 		break;
1626 
1627 	/* This is deprecated, FIOGETOWN should be used instead. */
1628 	case TIOCGPGRP:
1629 		*(int *)addr = -fgetown(&d->bd_sigio);
1630 		break;
1631 
1632 	case BIOCSRSIG:		/* Set receive signal */
1633 		{
1634 			u_int sig;
1635 
1636 			sig = *(u_int *)addr;
1637 
1638 			if (sig >= NSIG)
1639 				error = EINVAL;
1640 			else {
1641 				BPFD_LOCK(d);
1642 				d->bd_sig = sig;
1643 				BPFD_UNLOCK(d);
1644 			}
1645 			break;
1646 		}
1647 	case BIOCGRSIG:
1648 		BPFD_LOCK(d);
1649 		*(u_int *)addr = d->bd_sig;
1650 		BPFD_UNLOCK(d);
1651 		break;
1652 
1653 	case BIOCGETBUFMODE:
1654 		BPFD_LOCK(d);
1655 		*(u_int *)addr = d->bd_bufmode;
1656 		BPFD_UNLOCK(d);
1657 		break;
1658 
1659 	case BIOCSETBUFMODE:
1660 		/*
1661 		 * Allow the buffering mode to be changed as long as we
1662 		 * haven't yet committed to a particular mode.  Our
1663 		 * definition of commitment, for now, is whether or not a
1664 		 * buffer has been allocated or an interface attached, since
1665 		 * that's the point where things get tricky.
1666 		 */
1667 		switch (*(u_int *)addr) {
1668 		case BPF_BUFMODE_BUFFER:
1669 			break;
1670 
1671 		case BPF_BUFMODE_ZBUF:
1672 			if (bpf_zerocopy_enable)
1673 				break;
1674 			/* FALLTHROUGH */
1675 
1676 		default:
1677 			CURVNET_RESTORE();
1678 			return (EINVAL);
1679 		}
1680 
1681 		BPFD_LOCK(d);
1682 		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
1683 		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
1684 			BPFD_UNLOCK(d);
1685 			CURVNET_RESTORE();
1686 			return (EBUSY);
1687 		}
1688 		d->bd_bufmode = *(u_int *)addr;
1689 		BPFD_UNLOCK(d);
1690 		break;
1691 
1692 	case BIOCGETZMAX:
1693 		error = bpf_ioctl_getzmax(td, d, (size_t *)addr);
1694 		break;
1695 
1696 	case BIOCSETZBUF:
1697 		error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr);
1698 		break;
1699 
1700 	case BIOCROTZBUF:
1701 		error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr);
1702 		break;
1703 
1704 	case BIOCSETVLANPCP:
1705 		{
1706 			u_int pcp;
1707 
1708 			pcp = *(u_int *)addr;
1709 			if (pcp > BPF_PRIO_MAX) {	/* pcp is unsigned */
1710 				error = EINVAL;
1711 				break;
1712 			}
1713 			d->bd_pcp = pcp;
1714 			break;
1715 		}
1716 	}
1717 	CURVNET_RESTORE();
1718 	return (error);
1719 }
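
/*
 * Usage sketch: a typical capture consumer combines a few of the ioctls
 * above before entering its read loop, e.g. immediate mode, a one second
 * read timeout, and periodic statistics polling:
 *
 *	u_int on = 1;
 *	struct timeval tv = { 1, 0 };
 *	struct bpf_stat bs;
 *
 *	ioctl(fd, BIOCIMMEDIATE, &on);
 *	ioctl(fd, BIOCSRTIMEOUT, &tv);
 *	ioctl(fd, BIOCGSTATS, &bs);
 */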
1720 
1721 /*
1722  * Return list of available tapping points, or report how much space is
1723  * required for a successful return.
1724  */
1725 static int
1726 bpf_getiflist(struct bpf_iflist *bi)
1727 {
1728 	struct bpf_if *bp;
1729 	u_int allsize, size, cnt;
1730 	char *uaddr;
1731 
1732 	BPF_LOCK();
1733 
1734 	cnt = allsize = size = 0;
1735 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
1736 		allsize += strlen(bp->bif_name) + 1;
1737 		if (++cnt == bi->bi_count)
1738 			size = allsize;
1739 	}
1740 	if (size == 0)
1741 		size = allsize;
1742 
1743 	if (bi->bi_size == 0) {
1744 		BPF_UNLOCK();
1745 		bi->bi_size = size;
1746 		bi->bi_count = cnt;
1747 		return (0);
1748 	} else if (bi->bi_size < size) {
1749 		BPF_UNLOCK();
1750 		return (ENOSPC);
1751 	}
1752 
1753 	uaddr = bi->bi_ubuf;
1754 	cnt = 0;
1755 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
1756 		u_int len;
1757 		int error;
1758 
1759 		len = strlen(bp->bif_name) + 1;
1760 		if ((error = copyout(bp->bif_name, uaddr, len)) != 0) {
1761 			BPF_UNLOCK();
1762 			return (error);
1763 		}
1764 		if (++cnt == bi->bi_count)
1765 			break;
1766 		uaddr += len;
1767 	}
1768 	BPF_UNLOCK();
1769 	bi->bi_count = cnt;
1770 
1771 	return (0);
1772 }
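
/*
 * Usage sketch: like other variable-length ioctls, BIOCGETIFLIST is
 * normally issued twice, first with bi_size == 0 to learn the required
 * space and count, then with a buffer to receive the name strings:
 *
 *	struct bpf_iflist bi;
 *
 *	memset(&bi, 0, sizeof(bi));
 *	ioctl(fd, BIOCGETIFLIST, &bi);		(fills bi_size and bi_count)
 *	bi.bi_ubuf = malloc(bi.bi_size);
 *	ioctl(fd, BIOCGETIFLIST, &bi);		(copies out the names)
 */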
1773 
1774 /*
1775  * Set d's packet filter program to fp. If this file already has a filter,
1776  * free it and replace it. Returns EINVAL for bogus requests.
1777  *
1778  * Note we use global lock here to serialize bpf_setf() and bpf_setif()
1779  * calls.
1780  */
1781 static int
1782 bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
1783 {
1784 #ifdef COMPAT_FREEBSD32
1785 	struct bpf_program fp_swab;
1786 	struct bpf_program32 *fp32;
1787 #endif
1788 	struct bpf_program_buffer *fcode;
1789 	struct bpf_insn *filter;
1790 #ifdef BPF_JITTER
1791 	bpf_jit_filter *jfunc;
1792 #endif
1793 	size_t size;
1794 	u_int flen;
1795 	bool track_event;
1796 
1797 #ifdef COMPAT_FREEBSD32
1798 	switch (cmd) {
1799 	case BIOCSETF32:
1800 	case BIOCSETWF32:
1801 	case BIOCSETFNR32:
1802 		fp32 = (struct bpf_program32 *)fp;
1803 		fp_swab.bf_len = fp32->bf_len;
1804 		fp_swab.bf_insns =
1805 		    (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
1806 		fp = &fp_swab;
1807 		switch (cmd) {
1808 		case BIOCSETF32:
1809 			cmd = BIOCSETF;
1810 			break;
1811 		case BIOCSETWF32:
1812 			cmd = BIOCSETWF;
1813 			break;
1814 		}
1815 		break;
1816 	}
1817 #endif
1818 
1819 	filter = NULL;
1820 #ifdef BPF_JITTER
1821 	jfunc = NULL;
1822 #endif
1823 	/*
1824 	 * Check the new filter's validity before acquiring any locks.
1825 	 * Allocate memory for the new filter, if needed.
1826 	 */
1827 	flen = fp->bf_len;
1828 	if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0))
1829 		return (EINVAL);
1830 	size = flen * sizeof(*fp->bf_insns);
1831 	if (size > 0) {
1832 		/* We're setting up a new filter.  Copy and check the actual data. */
1833 		fcode = bpf_program_buffer_alloc(size, M_WAITOK);
1834 		filter = (struct bpf_insn *)fcode->buffer;
1835 		if (copyin(fp->bf_insns, filter, size) != 0 ||
1836 		    !bpf_validate(filter, flen)) {
1837 			free(fcode, M_BPF);
1838 			return (EINVAL);
1839 		}
1840 #ifdef BPF_JITTER
1841 		if (cmd != BIOCSETWF) {
1842 			/*
1843 			 * The filter is copied inside fcode and is
1844 			 * perfectly valid.
1845 			 */
1846 			jfunc = bpf_jitter(filter, flen);
1847 		}
1848 #endif
1849 	}
1850 
1851 	track_event = false;
1852 	fcode = NULL;
1853 
1854 	BPF_LOCK();
1855 	BPFD_LOCK(d);
1856 	/* Set up new filter. */
1857 	if (cmd == BIOCSETWF) {
1858 		if (d->bd_wfilter != NULL) {
1859 			fcode = __containerof((void *)d->bd_wfilter,
1860 			    struct bpf_program_buffer, buffer);
1861 #ifdef BPF_JITTER
1862 			fcode->func = NULL;
1863 #endif
1864 		}
1865 		d->bd_wfilter = filter;
1866 	} else {
1867 		if (d->bd_rfilter != NULL) {
1868 			fcode = __containerof((void *)d->bd_rfilter,
1869 			    struct bpf_program_buffer, buffer);
1870 #ifdef BPF_JITTER
1871 			fcode->func = d->bd_bfilter;
1872 #endif
1873 		}
1874 		d->bd_rfilter = filter;
1875 #ifdef BPF_JITTER
1876 		d->bd_bfilter = jfunc;
1877 #endif
1878 		if (cmd == BIOCSETF)
1879 			reset_d(d);
1880 
1881 		if (bpf_check_upgrade(cmd, d, filter, flen) != 0) {
1882 			/*
1883 			 * A filter can be set several times without
1884 			 * specifying an interface.  In this case just
1885 			 * mark d as a reader.
1886 			 */
1887 			d->bd_writer = 0;
1888 			if (d->bd_bif != NULL) {
1889 				/*
1890 				 * Remove descriptor from writers-only list
1891 				 * and add it to active readers list.
1892 				 */
1893 				CK_LIST_REMOVE(d, bd_next);
1894 				CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist,
1895 				    d, bd_next);
1896 				CTR2(KTR_NET,
1897 				    "%s: upgrade required by pid %d",
1898 				    __func__, d->bd_pid);
1899 				track_event = true;
1900 			}
1901 		}
1902 	}
1903 	BPFD_UNLOCK(d);
1904 
1905 	if (fcode != NULL)
1906 		NET_EPOCH_CALL(bpf_program_buffer_free, &fcode->epoch_ctx);
1907 
1908 	if (track_event)
1909 		bif_attachd(d->bd_bif);
1910 
1911 	BPF_UNLOCK();
1912 	return (0);
1913 }
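
/*
 * Usage sketch: installing a minimal "accept everything" read filter;
 * bpf_validate() has already vetted the program before it is swapped in
 * under the locks above:
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, (u_int)-1),
 *	};
 *	struct bpf_program fp = { nitems(insns), insns };
 *
 *	ioctl(fd, BIOCSETF, &fp);
 */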
1914 
1915 /*
1916  * Attach a descriptor to a tap point, possibly detaching it from the
1917  * old one, and reset the counters.
1918  * XXXGL: this KPI is subject to change
1919  */
1920 static int
1921 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
1922 {
1923 	bool writer;
1924 
1925 	BPF_LOCK_ASSERT();
1926 
1927 	/*
1928 	 * At this point, we expect the buffer is already allocated.  If not,
1929 	 * return an error.
1930 	 */
1931 	switch (d->bd_bufmode) {
1932 	case BPF_BUFMODE_BUFFER:
1933 	case BPF_BUFMODE_ZBUF:
1934 		if (d->bd_sbuf == NULL)
1935 			return (EINVAL);
1936 		break;
1937 
1938 	default:
1939 		panic("%s: bufmode %d", __func__, d->bd_bufmode);
1940 	}
1941 
1942 	if (bp == d->bd_bif) {
1943 		BPFD_LOCK(d);
1944 		reset_d(d);
1945 		BPFD_UNLOCK(d);
1946 		return (0);
1947 	} else if (d->bd_bif != NULL)
1948 		bpf_detachd(d, false);
1949 
1950 	/*
1951 	 * Snapshot the sysctl value to protect against it changing between reads.
1952 	 */
1953 	writer = V_bpf_optimize_writers || (d->bd_writer > 0);
1954 
1955 	/*
1956 	 * Point d at bp, and add d to the interface's list.
1957 	 * Since there are many applications using BPF for
1958 	 * sending raw packets only (dhcpd and cdpd are good examples),
1959 	 * we can delay adding d to the list of active listeners until
1960 	 * some filter is configured.
1961 	 */
1962 	BPFD_LOCK(d);
1963 	/*
1964 	 * Hold a reference to the bpf_if while the descriptor uses this interface.
1965 	 */
1966 	bpfif_ref(bp);
1967 	d->bd_bif = bp;
1968 	if (writer) {
1969 		/* Add to writers-only list */
1970 		CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
1971 		/*
1972 		 * We decrement bd_writer on every filter set operation.
1973 		 * The first BIOCSETF is done by pcap_open_live() to set up the
1974 		 * snap length.  After that the application usually sets its own
1975 		 * filter.
1976 		 */
1977 		d->bd_writer = 2;
1978 	} else
1979 		CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
1980 
1981 	reset_d(d);
1982 
1983 	/* Trigger EVFILT_WRITE events. */
1984 	bpf_wakeup(d);
1985 
1986 	BPFD_UNLOCK(d);
1987 
1988 	CTR3(KTR_NET, "%s: called by pid %d, adding to %s list",
1989 	    __func__, d->bd_pid, d->bd_writer ? "writer" : "active");
1990 
1991 	if (!writer)
1992 		bif_attachd(bp);
1993 
1994 	return (0);
1995 }
1996 
1997 /*
1998  * Support for select() and poll() system calls
1999  *
2000  * Return true iff the specific operation will not block indefinitely.
2001  * Otherwise, return false but make a note that a selwakeup() must be done.
2002  */
2003 static int
2004 bpfpoll(struct cdev *dev, int events, struct thread *td)
2005 {
2006 	struct bpf_d *d;
2007 	int revents;
2008 
2009 	if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL)
2010 		return (events &
2011 		    (POLLHUP | POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM));
2012 
2013 	/*
2014 	 * Refresh PID associated with this descriptor.
2015 	 */
2016 	revents = events & (POLLOUT | POLLWRNORM);
2017 	BPFD_LOCK(d);
2018 	BPF_PID_REFRESH(d, td);
2019 	if (events & (POLLIN | POLLRDNORM)) {
2020 		if (bpf_ready(d))
2021 			revents |= events & (POLLIN | POLLRDNORM);
2022 		else {
2023 			selrecord(td, &d->bd_sel);
2024 			/* Start the read timeout if necessary. */
2025 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2026 				callout_reset(&d->bd_callout, d->bd_rtout,
2027 				    bpf_timed_out, d);
2028 				d->bd_state = BPF_WAITING;
2029 			}
2030 		}
2031 	}
2032 	BPFD_UNLOCK(d);
2033 	return (revents);
2034 }
2035 
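/*
 * Userland counterpart of the poll support above; POLLIN fires once
 * bpf_ready() is true, or after the read timeout set with
 * BIOCSRTIMEOUT expires.  A minimal sketch, assuming fd/buf/buflen
 * were set up as in bpf(4):
 */
#if 0	/* illustrative userland code, not compiled into the kernel */
#include <poll.h>

	struct pollfd pfd = { .fd = fd, .events = POLLIN };

	if (poll(&pfd, 1, 1000) > 0 && (pfd.revents & POLLIN) != 0)
		n = read(fd, buf, buflen);	/* drains the hold buffer */
#endif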
2036 /*
2037  * Support for kevent() system call.  Register EVFILT_READ filters and
2038  * reject all others.
2039  */
2040 int
2041 bpfkqfilter(struct cdev *dev, struct knote *kn)
2042 {
2043 	struct bpf_d *d;
2044 
2045 	if (devfs_get_cdevpriv((void **)&d) != 0)
2046 		return (1);
2047 
2048 	switch (kn->kn_filter) {
2049 	case EVFILT_READ:
2050 		kn->kn_fop = &bpfread_filtops;
2051 		break;
2052 
2053 	case EVFILT_WRITE:
2054 		kn->kn_fop = &bpfwrite_filtops;
2055 		break;
2056 
2057 	default:
2058 		return (1);
2059 	}
2060 
2061 	/*
2062 	 * Refresh PID associated with this descriptor.
2063 	 */
2064 	BPFD_LOCK(d);
2065 	BPF_PID_REFRESH_CUR(d);
2066 	kn->kn_hook = d;
2067 	knlist_add(&d->bd_sel.si_note, kn, 1);
2068 	BPFD_UNLOCK(d);
2069 
2070 	return (0);
2071 }
2072 
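/*
 * kevent() counterpart: register EVFILT_READ (or EVFILT_WRITE) on the
 * descriptor; kn_data is filled in by filt_bpfread() below with the
 * number of readable bytes.  Minimal sketch, fd/buf/buflen as above:
 */
#if 0	/* illustrative userland code, not compiled into the kernel */
#include <sys/event.h>

	struct kevent kev, res;
	int kq = kqueue();

	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);	/* register */
	if (kevent(kq, NULL, 0, &res, 1, NULL) > 0)	/* wait */
		n = read(fd, buf, buflen);
#endif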
2073 static void
2074 filt_bpfdetach(struct knote *kn)
2075 {
2076 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2077 
2078 	knlist_remove(&d->bd_sel.si_note, kn, 0);
2079 }
2080 
2081 static int
2082 filt_bpfread(struct knote *kn, long hint)
2083 {
2084 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2085 	int ready;
2086 
2087 	BPFD_LOCK_ASSERT(d);
2088 	ready = bpf_ready(d);
2089 	if (ready) {
2090 		kn->kn_data = d->bd_slen;
2091 		/*
2092 		 * Ignore the hold buffer if it is being copied to user space.
2093 		 */
2094 		if (!(d->bd_flags & BPFD_HBUF_INUSE) && d->bd_hbuf)
2095 			kn->kn_data += d->bd_hlen;
2096 	} else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2097 		callout_reset(&d->bd_callout, d->bd_rtout,
2098 		    bpf_timed_out, d);
2099 		d->bd_state = BPF_WAITING;
2100 	}
2101 
2102 	return (ready);
2103 }
2104 
2105 static int
2106 filt_bpfwrite(struct knote *kn, long hint)
2107 {
2108 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2109 
2110 	BPFD_LOCK_ASSERT(d);
2111 
2112 	if (d->bd_bif == NULL) {
2113 		kn->kn_data = 0;
2114 		return (0);
2115 	} else {
2116 		kn->kn_data = bif_wrsize(d->bd_bif);
2117 		return (1);
2118 	}
2119 }
2120 
2121 #define	BPF_TSTAMP_NONE		0
2122 #define	BPF_TSTAMP_FAST		1
2123 #define	BPF_TSTAMP_NORMAL	2
2124 #define	BPF_TSTAMP_EXTERN	3
2125 
2126 static int
2127 bpf_ts_quality(int tstype)
2128 {
2129 
2130 	if (tstype == BPF_T_NONE)
2131 		return (BPF_TSTAMP_NONE);
2132 	if ((tstype & BPF_T_FAST) != 0)
2133 		return (BPF_TSTAMP_FAST);
2134 
2135 	return (BPF_TSTAMP_NORMAL);
2136 }
2137 
2138 static int
2139 bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m)
2140 {
2141 	struct timespec ts;
2142 	struct m_tag *tag;
2143 	int quality;
2144 
2145 	quality = bpf_ts_quality(tstype);
2146 	if (quality == BPF_TSTAMP_NONE)
2147 		return (quality);
2148 
2149 	if (m != NULL) {
2150 		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) {
2151 			mbuf_tstmp2timespec(m, &ts);
2152 			timespec2bintime(&ts, bt);
2153 			return (BPF_TSTAMP_EXTERN);
2154 		}
2155 		tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL);
2156 		if (tag != NULL) {
2157 			*bt = *(struct bintime *)(tag + 1);
2158 			return (BPF_TSTAMP_EXTERN);
2159 		}
2160 	}
2161 	if (quality == BPF_TSTAMP_NORMAL)
2162 		binuptime(bt);
2163 	else
2164 		getbinuptime(bt);
2165 
2166 	return (quality);
2167 }
2168 
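/*
 * The quality/tstype decisions above are driven by the per-descriptor
 * BIOCSTSTAMP setting.  For example, a consumer that is happy with a
 * coarse monotonic stamp (the getbinuptime() path) could request:
 */
#if 0	/* illustrative userland code, not compiled into the kernel */
	u_int tstype = BPF_T_NANOTIME | BPF_T_FAST | BPF_T_MONOTONIC;

	(void)ioctl(fd, BIOCSTSTAMP, &tstype);
#endif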
2169 /*
2170  * Incoming linkage from device drivers.  Process the packet pkt, of length
2171  * pktlen, which is stored in a contiguous buffer.  The packet is parsed
2172  * by each process' filter, and if accepted, stashed into the corresponding
2173  * buffer.
2174  */
2175 void
2176 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
2177 {
2178 	struct epoch_tracker et;
2179 	struct bintime bt;
2180 	struct bpf_d *d;
2181 #ifdef BPF_JITTER
2182 	bpf_jit_filter *bf;
2183 #endif
2184 	u_int slen;
2185 	int gottime;
2186 
2187 	gottime = BPF_TSTAMP_NONE;
2188 	NET_EPOCH_ENTER(et);
2189 	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2190 		counter_u64_add(d->bd_rcount, 1);
2191 		/*
2192 		 * NB: We don't check the direction here since there
2193 		 * is no way for the caller to indicate to us whether this
2194 		 * packet is inbound or outbound. In the bpf_mtap() routines,
2195 		 * we use the interface pointers on the mbuf to figure it out.
2196 		 */
2197 #ifdef BPF_JITTER
2198 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
2199 		if (bf != NULL)
2200 			slen = (*(bf->func))(pkt, pktlen, pktlen);
2201 		else
2202 #endif
2203 		slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
2204 		if (slen != 0) {
2205 			/*
2206 			 * Filter matches.  Acquire the write lock.
2207 			 */
2208 			BPFD_LOCK(d);
2209 			counter_u64_add(d->bd_fcount, 1);
2210 			if (gottime < bpf_ts_quality(d->bd_tstamp))
2211 				gottime = bpf_gettime(&bt, d->bd_tstamp,
2212 				    NULL);
2213 #ifdef MAC
2214 			if (bif_mac_check_receive(bp, d) == 0)
2215 #endif
2216 				catchpacket(d, pkt, pktlen, slen,
2217 				    bpf_append_bytes, &bt);
2218 			BPFD_UNLOCK(d);
2219 		}
2220 	}
2221 	NET_EPOCH_EXIT(et);
2222 }
2223 
2224 void
2225 bpf_tap_if(if_t ifp, u_char *pkt, u_int pktlen)
2226 {
2227 	if (bpf_peers_present(ifp->if_bpf))
2228 		bpf_tap(ifp->if_bpf, pkt, pktlen);
2229 }
2230 
2231 /*
2232  * Incoming linkage from device drivers, when packet is in an mbuf chain.
2233  * Locking model is explained in bpf_tap().
2234  */
2235 void
2236 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
2237 {
2238 	struct epoch_tracker et;
2239 	struct bintime bt;
2240 	struct bpf_d *d;
2241 #ifdef BPF_JITTER
2242 	bpf_jit_filter *bf;
2243 #endif
2244 	u_int pktlen, slen;
2245 	int gottime;
2246 
2247 	/* Skip outgoing duplicate packets. */
2248 	if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) {
2249 		m->m_flags &= ~M_PROMISC;
2250 		return;
2251 	}
2252 
2253 	pktlen = m_length(m, NULL);
2254 	gottime = BPF_TSTAMP_NONE;
2255 
2256 	NET_EPOCH_ENTER(et);
2257 	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2258 		if (bpf_chkdir(d, m))
2259 			continue;
2260 		counter_u64_add(d->bd_rcount, 1);
2261 #ifdef BPF_JITTER
2262 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
2263 		/* XXX We cannot handle multiple mbufs. */
2264 		if (bf != NULL && m->m_next == NULL)
2265 			slen = (*(bf->func))(mtod(m, u_char *), pktlen,
2266 			    pktlen);
2267 		else
2268 #endif
2269 		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
2270 		if (slen != 0) {
2271 			BPFD_LOCK(d);
2272 
2273 			counter_u64_add(d->bd_fcount, 1);
2274 			if (gottime < bpf_ts_quality(d->bd_tstamp))
2275 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
2276 #ifdef MAC
2277 			if (bif_mac_check_receive(bp, d) == 0)
2278 #endif
2279 				catchpacket(d, (u_char *)m, pktlen, slen,
2280 				    bpf_append_mbuf, &bt);
2281 			BPFD_UNLOCK(d);
2282 		}
2283 	}
2284 	NET_EPOCH_EXIT(et);
2285 }
2286 
2287 void
2288 bpf_mtap_if(if_t ifp, struct mbuf *m)
2289 {
2290 	if (bpf_peers_present(ifp->if_bpf)) {
2291 		M_ASSERTVALID(m);
2292 		bpf_mtap(ifp->if_bpf, m);
2293 	}
2294 }
2295 
2296 /*
2297  * Incoming linkage from device drivers, when packet is in
2298  * an mbuf chain and to be prepended by a contiguous header.
2299  */
2300 void
2301 bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
2302 {
2303 	struct epoch_tracker et;
2304 	struct bintime bt;
2305 	struct mbuf mb;
2306 	struct bpf_d *d;
2307 	u_int pktlen, slen;
2308 	int gottime;
2309 
2310 	/* Skip outgoing duplicate packets. */
2311 	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
2312 		m->m_flags &= ~M_PROMISC;
2313 		return;
2314 	}
2315 
2316 	pktlen = m_length(m, NULL);
2317 	/*
2318 	 * Craft on-stack mbuf suitable for passing to bpf_filter.
2319 	 * Note that we cut corners here; we only set up what's
2320 	 * absolutely needed--this mbuf should never go anywhere else.
2321 	 */
2322 	mb.m_flags = 0;
2323 	mb.m_next = m;
2324 	mb.m_data = data;
2325 	mb.m_len = dlen;
2326 	pktlen += dlen;
2327 
2328 	gottime = BPF_TSTAMP_NONE;
2329 
2330 	NET_EPOCH_ENTER(et);
2331 	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2332 		if (bpf_chkdir(d, m))
2333 			continue;
2334 		counter_u64_add(d->bd_rcount, 1);
2335 		slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
2336 		if (slen != 0) {
2337 			BPFD_LOCK(d);
2338 
2339 			counter_u64_add(d->bd_fcount, 1);
2340 			if (gottime < bpf_ts_quality(d->bd_tstamp))
2341 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
2342 #ifdef MAC
2343 			if (bif_mac_check_receive(bp, d) == 0)
2344 #endif
2345 				catchpacket(d, (u_char *)&mb, pktlen, slen,
2346 				    bpf_append_mbuf, &bt);
2347 			BPFD_UNLOCK(d);
2348 		}
2349 	}
2350 	NET_EPOCH_EXIT(et);
2351 }
2352 
2353 void
2354 bpf_mtap2_if(if_t ifp, void *data, u_int dlen, struct mbuf *m)
2355 {
2356 	if (bpf_peers_present(ifp->if_bpf)) {
2357 		M_ASSERTVALID(m);
2358 		bpf_mtap2(ifp->if_bpf, data, dlen, m);
2359 	}
2360 }
2361 
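/*
 * bpf_mtap2() exists for drivers that must present a pseudo-header
 * that is not part of the mbuf chain; the 802.11 radiotap code is the
 * classic consumer.  A sketch of the driver-side call (sc_drvbpf,
 * sc_rxtap and sc_rxtap_len are illustrative softc fields, not part of
 * this KPI):
 */
#if 0	/* illustrative driver code, not compiled */
	if (bpf_peers_present(sc->sc_drvbpf)) {
		/* sc_rxtap was filled in with per-packet radio info */
		bpf_mtap2(sc->sc_drvbpf, &sc->sc_rxtap, sc->sc_rxtap_len, m);
	}
#endif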
2362 #undef	BPF_TSTAMP_NONE
2363 #undef	BPF_TSTAMP_FAST
2364 #undef	BPF_TSTAMP_NORMAL
2365 #undef	BPF_TSTAMP_EXTERN
2366 
2367 static int
2368 bpf_hdrlen(struct bpf_d *d)
2369 {
2370 	int hdrlen;
2371 
2372 	hdrlen = d->bd_bif->bif_hdrlen;
2373 #ifndef BURN_BRIDGES
2374 	if (d->bd_tstamp == BPF_T_NONE ||
2375 	    BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME)
2376 #ifdef COMPAT_FREEBSD32
2377 		if (d->bd_compat32)
2378 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32);
2379 		else
2380 #endif
2381 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr);
2382 	else
2383 #endif
2384 		hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr);
2385 #ifdef COMPAT_FREEBSD32
2386 	if (d->bd_compat32)
2387 		hdrlen = BPF_WORDALIGN32(hdrlen);
2388 	else
2389 #endif
2390 		hdrlen = BPF_WORDALIGN(hdrlen);
2391 
2392 	return (hdrlen - d->bd_bif->bif_hdrlen);
2393 }
2394 
2395 static void
2396 bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype)
2397 {
2398 	struct bintime bt2, boottimebin;
2399 	struct timeval tsm;
2400 	struct timespec tsn;
2401 
2402 	if ((tstype & BPF_T_MONOTONIC) == 0) {
2403 		bt2 = *bt;
2404 		getboottimebin(&boottimebin);
2405 		bintime_add(&bt2, &boottimebin);
2406 		bt = &bt2;
2407 	}
2408 	switch (BPF_T_FORMAT(tstype)) {
2409 	case BPF_T_MICROTIME:
2410 		bintime2timeval(bt, &tsm);
2411 		ts->bt_sec = tsm.tv_sec;
2412 		ts->bt_frac = tsm.tv_usec;
2413 		break;
2414 	case BPF_T_NANOTIME:
2415 		bintime2timespec(bt, &tsn);
2416 		ts->bt_sec = tsn.tv_sec;
2417 		ts->bt_frac = tsn.tv_nsec;
2418 		break;
2419 	case BPF_T_BINTIME:
2420 		ts->bt_sec = bt->sec;
2421 		ts->bt_frac = bt->frac;
2422 		break;
2423 	}
2424 }
2425 
2426 /*
2427  * Move the packet data from interface memory (pkt) into the
2428  * store buffer.  "cpfn" is the routine called to do the actual data
2429  * transfer.  bcopy is passed in to copy contiguous chunks, while
2430  * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
2431  * pkt is really an mbuf.
2432  */
2433 static void
2434 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
2435     void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
2436     struct bintime *bt)
2437 {
2438 	static char zeroes[BPF_ALIGNMENT];
2439 	struct bpf_xhdr hdr;
2440 #ifndef BURN_BRIDGES
2441 	struct bpf_hdr hdr_old;
2442 #ifdef COMPAT_FREEBSD32
2443 	struct bpf_hdr32 hdr32_old;
2444 #endif
2445 #endif
2446 	int caplen, curlen, hdrlen, pad, totlen;
2447 	int do_wakeup = 0;
2448 	int do_timestamp;
2449 	int tstype;
2450 
2451 	BPFD_LOCK_ASSERT(d);
2452 	if (d->bd_bif == NULL) {
2453 		/* Descriptor was detached in concurrent thread */
2454 		counter_u64_add(d->bd_dcount, 1);
2455 		return;
2456 	}
2457 
2458 	/*
2459 	 * Detect whether user space has released a buffer back to us, and if
2460 	 * so, move it from being a hold buffer to a free buffer.  This may
2461 	 * not be the best place to do it (for example, we might only want to
2462 	 * run this check if we need the space), but for now it's a reliable
2463 	 * spot to do it.
2464 	 */
2465 	if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
2466 		d->bd_fbuf = d->bd_hbuf;
2467 		d->bd_hbuf = NULL;
2468 		d->bd_hlen = 0;
2469 		bpf_buf_reclaimed(d);
2470 	}
2471 
2472 	/*
2473 	 * Figure out how many bytes to move.  If the packet is
2474 	 * greater or equal to the snapshot length, transfer that
2475 	 * much.  Otherwise, transfer the whole packet (unless
2476 	 * we hit the buffer size limit).
2477 	 */
2478 	hdrlen = bpf_hdrlen(d);
2479 	totlen = hdrlen + min(snaplen, pktlen);
2480 	if (totlen > d->bd_bufsize)
2481 		totlen = d->bd_bufsize;
2482 
2483 	/*
2484 	 * Round up the end of the previous packet to the next longword.
2485 	 *
2486 	 * Drop the packet if there's no room and no hope of room.
2487 	 * If the packet would overflow the storage buffer, or the storage
2488 	 * buffer is considered immutable by the buffer model, try to rotate
2489 	 * the buffer and wake up pending processes.
2490 	 */
2491 #ifdef COMPAT_FREEBSD32
2492 	if (d->bd_compat32)
2493 		curlen = BPF_WORDALIGN32(d->bd_slen);
2494 	else
2495 #endif
2496 		curlen = BPF_WORDALIGN(d->bd_slen);
2497 	if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
2498 		if (d->bd_fbuf == NULL) {
2499 			/*
2500 			 * There's no room in the store buffer, and no
2501 			 * prospect of room, so drop the packet.  Notify the
2502 			 * buffer model.
2503 			 */
2504 			bpf_buffull(d);
2505 			counter_u64_add(d->bd_dcount, 1);
2506 			return;
2507 		}
2508 		KASSERT(!(d->bd_flags & BPFD_HBUF_INUSE),
2509 		    ("hold buffer is in use"));
2510 		ROTATE_BUFFERS(d);
2511 		do_wakeup = 1;
2512 		curlen = 0;
2513 	} else {
2514 		if ((d->bd_flags & BPFD_IMMEDIATE) ||
2515 		    d->bd_state == BPF_TIMED_OUT) {
2516 			/*
2517 			 * Immediate mode is set, or the read timeout has
2518 			 * already expired during a select call.  A packet
2519 			 * arrived, so the reader should be woken up.
2520 			 */
2521 			do_wakeup = 1;
2522 		}
2523 		pad = curlen - d->bd_slen;
2524 		KASSERT(pad >= 0 && pad <= sizeof(zeroes),
2525 		    ("%s: invalid pad byte count %d", __func__, pad));
2526 		if (pad > 0) {
2527 			/* Zero pad bytes. */
2528 			bpf_append_bytes(d, d->bd_sbuf, d->bd_slen, zeroes,
2529 			    pad);
2530 		}
2531 	}
2532 
2533 	caplen = totlen - hdrlen;
2534 	tstype = d->bd_tstamp;
2535 	do_timestamp = tstype != BPF_T_NONE;
2536 #ifndef BURN_BRIDGES
2537 	if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) {
2538 		struct bpf_ts ts;
2539 		if (do_timestamp)
2540 			bpf_bintime2ts(bt, &ts, tstype);
2541 #ifdef COMPAT_FREEBSD32
2542 		if (d->bd_compat32) {
2543 			bzero(&hdr32_old, sizeof(hdr32_old));
2544 			if (do_timestamp) {
2545 				hdr32_old.bh_tstamp.tv_sec = ts.bt_sec;
2546 				hdr32_old.bh_tstamp.tv_usec = ts.bt_frac;
2547 			}
2548 			hdr32_old.bh_datalen = pktlen;
2549 			hdr32_old.bh_hdrlen = hdrlen;
2550 			hdr32_old.bh_caplen = caplen;
2551 			bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old,
2552 			    sizeof(hdr32_old));
2553 			goto copy;
2554 		}
2555 #endif
2556 		bzero(&hdr_old, sizeof(hdr_old));
2557 		if (do_timestamp) {
2558 			hdr_old.bh_tstamp.tv_sec = ts.bt_sec;
2559 			hdr_old.bh_tstamp.tv_usec = ts.bt_frac;
2560 		}
2561 		hdr_old.bh_datalen = pktlen;
2562 		hdr_old.bh_hdrlen = hdrlen;
2563 		hdr_old.bh_caplen = caplen;
2564 		bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old,
2565 		    sizeof(hdr_old));
2566 		goto copy;
2567 	}
2568 #endif
2569 
2570 	/*
2571 	 * Append the bpf header.  Note we append the actual header size, but
2572 	 * move forward the length of the header plus padding.
2573 	 */
2574 	bzero(&hdr, sizeof(hdr));
2575 	if (do_timestamp)
2576 		bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype);
2577 	hdr.bh_datalen = pktlen;
2578 	hdr.bh_hdrlen = hdrlen;
2579 	hdr.bh_caplen = caplen;
2580 	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
2581 
2582 	/*
2583 	 * Copy the packet data into the store buffer and update its length.
2584 	 */
2585 #ifndef BURN_BRIDGES
2586 copy:
2587 #endif
2588 	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen);
2589 	d->bd_slen = curlen + totlen;
2590 
2591 	if (do_wakeup)
2592 		bpf_wakeup(d);
2593 }
2594 
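/*
 * The record layout written above is what userland unpacks after
 * read(2): a header whose bh_hdrlen already includes the alignment
 * padding, bh_caplen captured bytes, then the next record rounded up
 * with BPF_WORDALIGN().  Minimal sketch for the default (non-xhdr)
 * layout; handle_packet() is a hypothetical consumer:
 */
#if 0	/* illustrative userland code, not compiled into the kernel */
	u_char *p = buf;
	ssize_t n = read(fd, buf, buflen);

	while (p < buf + n) {
		const struct bpf_hdr *h = (const struct bpf_hdr *)p;

		handle_packet(p + h->bh_hdrlen, h->bh_caplen);
		p += BPF_WORDALIGN(h->bh_hdrlen + h->bh_caplen);
	}
#endif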
2595 /*
2596  * Free buffers currently in use by a descriptor.
2597  * Called on close.
2598  */
2599 static void
2600 bpfd_free(epoch_context_t ctx)
2601 {
2602 	struct bpf_d *d;
2603 	struct bpf_program_buffer *p;
2604 
2605 	/*
2606 	 * We don't need to lock out interrupts since this descriptor has
2607 	 * been detached from its interface and it hasn't yet been marked
2608 	 * free.
2609 	 */
2610 	d = __containerof(ctx, struct bpf_d, epoch_ctx);
2611 	bpf_free(d);
2612 	if (d->bd_rfilter != NULL) {
2613 		p = __containerof((void *)d->bd_rfilter,
2614 		    struct bpf_program_buffer, buffer);
2615 #ifdef BPF_JITTER
2616 		p->func = d->bd_bfilter;
2617 #endif
2618 		bpf_program_buffer_free(&p->epoch_ctx);
2619 	}
2620 	if (d->bd_wfilter != NULL) {
2621 		p = __containerof((void *)d->bd_wfilter,
2622 		    struct bpf_program_buffer, buffer);
2623 #ifdef BPF_JITTER
2624 		p->func = NULL;
2625 #endif
2626 		bpf_program_buffer_free(&p->epoch_ctx);
2627 	}
2628 
2629 	mtx_destroy(&d->bd_lock);
2630 	counter_u64_free(d->bd_rcount);
2631 	counter_u64_free(d->bd_dcount);
2632 	counter_u64_free(d->bd_fcount);
2633 	counter_u64_free(d->bd_wcount);
2634 	counter_u64_free(d->bd_wfcount);
2635 	counter_u64_free(d->bd_wdcount);
2636 	counter_u64_free(d->bd_zcopy);
2637 	free(d, M_BPF);
2638 }
2639 
2640 /*
2641  * Attach a tap point to bpf.
2642  * XXX: with current KPI it is consumer's responsibility to avoid duplicates.
2643  */
2644 struct bpf_if *
2645 bpf_attach(const char *name, u_int dlt, u_int hdrlen,
2646     const struct bif_methods *methods, void *sc)
2647 {
2648 	struct bpf_if *bp;
2649 
2650 	bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO);
2651 
2652 	CK_LIST_INIT(&bp->bif_dlist);
2653 	CK_LIST_INIT(&bp->bif_wlist);
2654 	bp->bif_dlt = dlt;
2655 	bp->bif_hdrlen = hdrlen;
2656 	bp->bif_softc = sc;
2657 	bp->bif_name = name;
2658 	bp->bif_methods = methods;
2659 	refcount_init(&bp->bif_refcnt, 1);
2660 	BPF_LOCK();
2661 	LIST_INSERT_HEAD(&V_bpf_iflist, bp, bif_next);
2662 	BPF_UNLOCK();
2663 
2664 	return (bp);
2665 }
2666 
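/*
 * Typical producer side: a network driver (normally indirectly, via
 * ether_ifattach()) creates its tap point at attach time with the
 * long-standing ifnet wrapper and tears it down before the ifnet goes
 * away.  Sketch:
 */
#if 0	/* illustrative driver code, not compiled */
	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);	/* in foo_attach() */
	/* ... */
	bpfdetach(ifp);					/* in foo_detach() */
#endif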
2667 #ifdef VIMAGE
2668 /*
2669  * Detach descriptors on interface's vmove event.
2670  * XXXGL: shouldn't be a special case, but a full detach.
2671  */
2672 void
2673 bpf_ifdetach(struct ifnet *ifp)
2674 {
2675 	struct bpf_if *bp;
2676 	struct bpf_d *d;
2677 
2678 	BPF_LOCK();
2679 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2680 		/* XXXGL: assuming softc is ifnet here */
2681 		if (bp->bif_softc != ifp)
2682 			continue;
2683 
2684 		/* Detach common descriptors */
2685 		while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) {
2686 			bpf_detachd(d, true);
2687 		}
2688 
2689 		/* Detach writer-only descriptors */
2690 		while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) {
2691 			bpf_detachd(d, true);
2692 		}
2693 	}
2694 	BPF_UNLOCK();
2695 }
2696 #endif
2697 
2698 /*
2699  * Detach bpf tap point.  This involves detaching each descriptor associated
2700  * with the interface.  Notify each descriptor as it's detached so that any
2701  * sleepers wake up and get ENXIO.
2702  */
2703 void
2704 bpf_detach(struct bpf_if *bp)
2705 {
2706 	struct bpf_d *d;
2707 
2708 	BPF_LOCK();
2709 	LIST_REMOVE(bp, bif_next);
2710 
2711 	CTR3(KTR_NET, "%s: scheduling free for encap %d for bp %p",
2712 	    __func__, bp->bif_dlt, bp);
2713 
2714 	/* Detach common descriptors */
2715 	while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) {
2716 		bpf_detachd(d, true);
2717 	}
2718 
2719 	/* Detach writer-only descriptors */
2720 	while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) {
2721 		bpf_detachd(d, true);
2722 	}
2723 	bpfif_rele(bp);
2724 	BPF_UNLOCK();
2725 }
2726 
2727 #ifdef VIMAGE
2728 /*
2729  * Move bpf to a different VNET.  This KPI is a crutch to support if_vmove
2730  * and is not supposed to be used anywhere else.
2731  */
2732 void
2733 bpf_vmove(struct bpf_if *bp)
2734 {
2735 
2736 	BPF_LOCK();
2737 	LIST_REMOVE(bp, bif_next);
2738 	LIST_INSERT_HEAD(&V_bpf_iflist, bp, bif_next);
2739 	BPF_UNLOCK();
2740 }
2741 #endif
2742 
2743 bool
2744 bpf_peers_present_if(struct ifnet *ifp)
2745 {
2746 	return (bpf_peers_present(ifp->if_bpf));
2747 }
2748 
2749 /*
2750  * Get a list of the available data link types of the tap point.  If a tap
2751  * point attaches more than once, it is supposed to attach with different DLTs
2752  * but with the same name pointer.
2753  */
2754 static int
2755 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
2756 {
2757 	const char *name;
2758 	struct bpf_if *bp;
2759 	u_int *lst;
2760 	int error, n, n1;
2761 
2762 	BPF_LOCK_ASSERT();
2763 
2764 	name = d->bd_bif->bif_name;
2765 	n1 = 0;
2766 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2767 		if (bp->bif_name == name)
2768 			n1++;
2769 	}
2770 	if (bfl->bfl_list == NULL) {
2771 		bfl->bfl_len = n1;
2772 		return (0);
2773 	}
2774 	if (n1 > bfl->bfl_len)
2775 		return (ENOMEM);
2776 
2777 	lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK);
2778 	n = 0;
2779 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2780 		if (bp->bif_name != name)
2781 			continue;
2782 		lst[n++] = bp->bif_dlt;
2783 	}
2784 	error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n);
2785 	free(lst, M_TEMP);
2786 	bfl->bfl_len = n;
2787 	return (error);
2788 }
2789 
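/*
 * Userland drives this with the usual two-pass BIOCGDLTLIST dance:
 * first call with bfl_list == NULL to learn the count, then allocate
 * and call again.  Minimal sketch, error handling elided:
 */
#if 0	/* illustrative userland code, not compiled into the kernel */
#include <stdlib.h>

	struct bpf_dltlist bfl;

	memset(&bfl, 0, sizeof(bfl));
	(void)ioctl(fd, BIOCGDLTLIST, &bfl);	/* sets bfl_len only */
	bfl.bfl_list = calloc(bfl.bfl_len, sizeof(u_int));
	(void)ioctl(fd, BIOCGDLTLIST, &bfl);	/* fills the array */
#endif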
2790 /*
2791  * Set the data link type of a BPF descriptor.  The convention is that
2792  * application first do BIOCSETIF and then BIOCSETDLT, thus the descriptor
2793  * is supposed to be already attached.  Only one kernel facility provides
2794  * tapping points with same name but different DLT - ieee80211_radiotap.
2795  *
2796  * XXXGL: this function definitely looks suspicious, e.g. it clearly doesn't
2797  * clear promisc on the old bpf_if.  The convention about reference counting
2798  * is also unclear.
2799  */
2800 static int
2801 bpf_setdlt(struct bpf_d *d, u_int dlt)
2802 {
2803 	int error, opromisc;
2804 	const char *name;
2805 	struct bpf_if *bp;
2806 
2807 	BPF_LOCK_ASSERT();
2808 	MPASS(d->bd_bif != NULL);
2809 
2810 	/*
2811 	 * It is safe to check bd_bif without BPFD_LOCK; it cannot be
2812 	 * changed while we hold the global lock.
2813 	 */
2814 	if (d->bd_bif->bif_dlt == dlt)
2815 		return (0);
2816 
2817 	name = d->bd_bif->bif_name;
2818 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2819 		if (bp->bif_name == name && bp->bif_dlt == dlt)
2820 			break;
2821 	}
2822 	if (bp == NULL)
2823 		return (EINVAL);
2824 
2825 	opromisc = d->bd_promisc;
2826 	bpf_attachd(d, bp);
2827 	if (opromisc) {
2828 		error = bp->bif_methods->bif_promisc(bp->bif_softc, true);
2829 		if (error)
2830 			printf("%s: bif_promisc on %s failed (%d)\n",
2831 			    __func__, bp->bif_name, error);
2832 		else
2833 			d->bd_promisc = 1;
2834 	}
2835 	return (0);
2836 }
2837 
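/*
 * Matching userland step after the BIOCGDLTLIST dance above: pick one
 * of the returned DLTs and switch the descriptor to it.
 */
#if 0	/* illustrative userland code, not compiled into the kernel */
	u_int dlt = DLT_IEEE802_11_RADIO;	/* e.g., radiotap on wlan */

	(void)ioctl(fd, BIOCSDLT, &dlt);
#endif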
2838 static void
2839 bpf_drvinit(void *unused)
2840 {
2841 	struct cdev *dev;
2842 
2843 	sx_init(&bpf_sx, "bpf global lock");
2844 	dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf");
2845 	/* For compatibility */
2846 	make_dev_alias(dev, "bpf0");
2847 }
2848 
2849 /*
2850  * Zero out the various packet counters associated with all of the bpf
2851  * descriptors.  At some point, we will probably want to get a bit more
2852  * granular and allow the user to specify descriptors to be zeroed.
2853  */
2854 static void
2855 bpf_zero_counters(void)
2856 {
2857 	struct bpf_if *bp;
2858 	struct bpf_d *bd;
2859 
2860 	BPF_LOCK();
2861 	/*
2862 	 * We are protected by the global lock here; interfaces and
2863 	 * descriptors cannot be deleted while we hold it.
2864 	 */
2865 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2866 		CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2867 			counter_u64_zero(bd->bd_rcount);
2868 			counter_u64_zero(bd->bd_dcount);
2869 			counter_u64_zero(bd->bd_fcount);
2870 			counter_u64_zero(bd->bd_wcount);
2871 			counter_u64_zero(bd->bd_wfcount);
2872 			counter_u64_zero(bd->bd_zcopy);
2873 		}
2874 	}
2875 	BPF_UNLOCK();
2876 }
2877 
2878 /*
2879  * Fill filter statistics
2880  */
2881 static void
2882 bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
2883 {
2884 
2885 	BPF_LOCK_ASSERT();
2886 	bzero(d, sizeof(*d));
2887 	d->bd_structsize = sizeof(*d);
2888 	d->bd_immediate = bd->bd_flags & BPFD_IMMEDIATE ? 1 : 0;
2889 	d->bd_promisc = bd->bd_promisc;
2890 	d->bd_hdrcmplt = bd->bd_flags & BPFD_HDRCMPLT ? 1 : 0;
2891 	d->bd_direction = bd->bd_direction;
2892 	d->bd_feedback = bd->bd_flags & BPFD_FEEDBACK ? 1 : 0;
2893 	d->bd_async = bd->bd_flags & BPFD_ASYNC ? 1 : 0;
2894 	d->bd_rcount = counter_u64_fetch(bd->bd_rcount);
2895 	d->bd_dcount = counter_u64_fetch(bd->bd_dcount);
2896 	d->bd_fcount = counter_u64_fetch(bd->bd_fcount);
2897 	d->bd_sig = bd->bd_sig;
2898 	d->bd_slen = bd->bd_slen;
2899 	d->bd_hlen = bd->bd_hlen;
2900 	d->bd_bufsize = bd->bd_bufsize;
2901 	d->bd_pid = bd->bd_pid;
2902 	strlcpy(d->bd_ifname, bd->bd_bif->bif_name, sizeof(d->bd_ifname));
2903 	d->bd_locked = bd->bd_flags & BPFD_LOCKED ? 1 : 0;
2904 	d->bd_wcount = counter_u64_fetch(bd->bd_wcount);
2905 	d->bd_wdcount = counter_u64_fetch(bd->bd_wdcount);
2906 	d->bd_wfcount = counter_u64_fetch(bd->bd_wfcount);
2907 	d->bd_zcopy = counter_u64_fetch(bd->bd_zcopy);
2908 	d->bd_bufmode = bd->bd_bufmode;
2909 }
2910 
2911 /*
2912  * Handle `netstat -B' stats request
2913  */
2914 static int
2915 bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
2916 {
2917 	static const struct xbpf_d zerostats;
2918 	struct xbpf_d *xbdbuf, *xbd, tempstats;
2919 	u_int bpfd_cnt, index;
2920 	int error;
2921 	struct bpf_if *bp;
2922 	struct bpf_d *bd;
2923 
2924 	/*
2925 	 * XXX This is not technically correct.  It is possible for
2926 	 * unprivileged users to open bpf devices.  It would make sense
2927 	 * if the users who opened the devices were able to retrieve
2928 	 * the statistics for them, too.
2929 	 */
2930 	error = priv_check(req->td, PRIV_NET_BPF);
2931 	if (error)
2932 		return (error);
2933 	/*
2934 	 * Check to see if the user is requesting that the counters be
2935 	 * zeroed out.  Explicitly check that the supplied data is zeroed,
2936 	 * as we aren't allowing the user to set the counters currently.
2937 	 */
2938 	if (req->newptr != NULL) {
2939 		if (req->newlen != sizeof(tempstats))
2940 			return (EINVAL);
2941 		memset(&tempstats, 0, sizeof(tempstats));
2942 		error = SYSCTL_IN(req, &tempstats, sizeof(tempstats));
2943 		if (error)
2944 			return (error);
2945 		if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0)
2946 			return (EINVAL);
2947 		bpf_zero_counters();
2948 		return (0);
2949 	}
2950 	bpfd_cnt = 0;
2951 	BPF_LOCK();
2952 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2953 		CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next)
2954 			bpfd_cnt++;
2955 		CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next)
2956 			bpfd_cnt++;
2957 	}
2958 	if (bpfd_cnt == 0 || req->oldptr == NULL) {
2959 		BPF_UNLOCK();
2960 		return (SYSCTL_OUT(req, 0, bpfd_cnt * sizeof(*xbd)));
2961 	}
2962 	if (req->oldlen < bpfd_cnt * sizeof(*xbd)) {
2963 		BPF_UNLOCK();
2964 		return (ENOMEM);
2965 	}
2966 	xbdbuf = malloc(bpfd_cnt * sizeof(*xbd), M_BPF, M_WAITOK);
2967 	index = 0;
2968 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2969 		/* Send writers-only first */
2970 		CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
2971 			MPASS(index <= bpfd_cnt);
2972 			xbd = &xbdbuf[index++];
2973 			bpfstats_fill_xbpf(xbd, bd);
2974 		}
2975 		CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2976 			MPASS(index <= bpfd_cnt);
2977 			xbd = &xbdbuf[index++];
2978 			bpfstats_fill_xbpf(xbd, bd);
2979 		}
2980 	}
2981 	BPF_UNLOCK();
2982 	error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
2983 	free(xbdbuf, M_BPF);
2984 	return (error);
2985 }
2986 
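/*
 * netstat -B consumes this handler through the net.bpf.stats sysctl;
 * the same two-pass pattern works from any program, and writing back a
 * single all-zeroes struct xbpf_d zeroes the counters (see
 * bpf_zero_counters() above).  Hedged sketch, error handling elided:
 */
#if 0	/* illustrative userland code, not compiled into the kernel */
#include <sys/sysctl.h>
#include <stdlib.h>

	size_t len = 0;
	struct xbpf_d *xbd;

	(void)sysctlbyname("net.bpf.stats", NULL, &len, NULL, 0);
	xbd = malloc(len);
	(void)sysctlbyname("net.bpf.stats", xbd, &len, NULL, 0);
	/* len / sizeof(*xbd) entries; writer-only descriptors come first */
#endif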
2987 SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, bpf_drvinit, NULL);
2988 
2989 #else /* !DEV_BPF && !NETGRAPH_BPF */
2990 
2991 /*
2992  * NOP stubs to allow bpf-using drivers to load and function.
2993  *
2994  * A 'better' implementation would allow the core bpf functionality
2995  * to be loaded at runtime.
2996  */
2997 
2998 void
2999 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
3000 {
3001 }
3002 
3003 void
3004 bpf_tap_if(if_t ifp, u_char *pkt, u_int pktlen)
3005 {
3006 }
3007 
3008 void
3009 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
3010 {
3011 }
3012 
3013 void
3014 bpf_mtap_if(if_t ifp, struct mbuf *m)
3015 {
3016 }
3017 
3018 void
3019 bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
3020 {
3021 }
3022 
3023 void
3024 bpf_mtap2_if(if_t ifp, void *data, u_int dlen, struct mbuf *m)
3025 {
3026 }
3027 
3028 void
3029 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
3030 {
3031 	static const struct bpfd_list dead_bpf_if = CK_LIST_HEAD_INITIALIZER();
3032 
3033 	ifp->if_bpf = __DECONST(struct bpf_if *, &dead_bpf_if);
3034 }
3035 
3036 void
3037 bpfdetach(struct ifnet *ifp)
3038 {
3039 }
3040 
3041 bool
3042 bpf_peers_present_if(struct ifnet *ifp)
3043 {
3044 	return (false);
3045 }
3046 
3047 u_int
3048 bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
3049 {
3050 	return (-1);	/* "no filter" behaviour */
3051 }
3052 
3053 int
3054 bpf_validate(const struct bpf_insn *f, int len)
3055 {
3056 	return (0);	/* false */
3057 }
3058 
3059 #endif /* !DEV_BPF && !NETGRAPH_BPF */
3060