xref: /freebsd/sys/net/bpf.c (revision a4d766caf7114392b80793c8306d5a712f31557c)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1990, 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  * Copyright (c) 2019 Andrey V. Elsukov <ae@FreeBSD.org>
7  *
8  * This code is derived from the Stanford/CMU enet packet filter,
9  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
10  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
11  * Berkeley Laboratory.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 #include <sys/cdefs.h>
39 #include "opt_bpf.h"
40 #include "opt_netgraph.h"
41 
42 #include <sys/param.h>
43 #include <sys/conf.h>
44 #include <sys/fcntl.h>
45 #include <sys/jail.h>
46 #include <sys/ktr.h>
47 #include <sys/lock.h>
48 #include <sys/malloc.h>
49 #include <sys/mbuf.h>
50 #include <sys/mutex.h>
51 #include <sys/time.h>
52 #include <sys/priv.h>
53 #include <sys/proc.h>
54 #include <sys/signalvar.h>
55 #include <sys/filio.h>
56 #include <sys/sockio.h>
57 #include <sys/ttycom.h>
58 #include <sys/uio.h>
59 #include <sys/sysent.h>
60 #include <sys/systm.h>
61 
62 #include <sys/file.h>
63 #include <sys/poll.h>
64 #include <sys/proc.h>
65 
66 #include <sys/socket.h>
67 
68 #include <net/if.h>
69 #include <net/if_var.h>
70 #include <net/if_private.h>
71 #include <net/if_vlan_var.h>
72 #include <net/bpf.h>
73 #include <net/bpf_buffer.h>
74 #ifdef BPF_JITTER
75 #include <net/bpf_jitter.h>
76 #endif
77 #include <net/bpf_zerocopy.h>
78 #include <net/bpfdesc.h>
79 #include <net/vnet.h>
80 
81 #include <sys/kernel.h>
82 #include <sys/sysctl.h>
83 
84 #include <security/mac/mac_framework.h>
85 
86 MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
87 
88 struct bpf_if {
89 	struct bpfd_list	bif_dlist;	/* descriptor list */
90 	LIST_ENTRY(bpf_if)	bif_next;	/* list of all interfaces */
91 	u_int		bif_dlt;	/* link layer type */
92 	u_int		bif_hdrlen;	/* length of link header */
93 	volatile u_int	bif_refcnt;
94 	struct bpfd_list bif_wlist;	/* writer-only list */
95 	const struct bif_methods	*bif_methods;
96 	void		*bif_softc;
97 	const char	*bif_name;
98 	struct epoch_context epoch_ctx;
99 };
100 
101 /* See bpf_peers_present() in bpf.h. */
102 _Static_assert(offsetof(struct bpf_if, bif_dlist) == 0,
103     "bpf_if shall start with bif_dlist");
104 
105 static inline void
106 bif_attachd(struct bpf_if *bp)
107 {
108 	if (bp->bif_methods->bif_attachd != NULL)
109 		bp->bif_methods->bif_attachd(bp->bif_softc);
110 }
111 
112 static inline void
113 bif_detachd(struct bpf_if *bp)
114 {
115 	if (bp->bif_methods->bif_detachd != NULL)
116 		bp->bif_methods->bif_detachd(bp->bif_softc);
117 }
118 
119 static inline uint32_t
120 bif_wrsize(struct bpf_if *bp)
121 {
122 	if (bp->bif_methods->bif_wrsize != NULL)
123 		return (bp->bif_methods->bif_wrsize(bp->bif_softc));
124 	else
125 		return (0);
126 }
127 
128 static inline int
129 bif_promisc(struct bpf_if *bp, bool on)
130 {
131 	if (bp->bif_methods->bif_promisc != NULL)
132 		return (bp->bif_methods->bif_promisc(bp->bif_softc, on));
133 	else
134 		return (0);
135 }
136 
137 #ifdef MAC
138 static inline int
139 bif_mac_check_receive(struct bpf_if *bp, struct bpf_d *d)
140 {
141 	if (bp->bif_methods->bif_mac_check_receive != NULL)
142 		return (bp->bif_methods->bif_mac_check_receive(bp->bif_softc,
143 		    d));
144 	else
145 		return (0);
146 }
147 #endif
148 
149 /*
150  * XXXGL: Once we migrate to a tapping KPI that specifies packet direction,
151  * we will no longer need the bif_chkdir method.
152  */
153 static inline bool
154 bpf_chkdir(struct bpf_d *d, struct mbuf *m)
155 {
156 	return (d->bd_bif->bif_methods->bif_chkdir(d->bd_bif->bif_softc, m,
157 	    d->bd_direction));
158 }
159 
160 struct bpf_program_buffer {
161 	struct epoch_context	epoch_ctx;
162 #ifdef BPF_JITTER
163 	bpf_jit_filter		*func;
164 #endif
165 	void			*buffer[0];
166 };
167 
168 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
169 
170 #define PRINET  26			/* interruptible */
171 #define BPF_PRIO_MAX	7
172 
173 #define	SIZEOF_BPF_HDR(type)	\
174     (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen))
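/*
 * SIZEOF_BPF_HDR(type) is the size of a BPF header structure up to and
 * including its bh_hdrlen member, i.e. without any trailing padding the
 * compiler may append.  Illustrative arithmetic (assuming a typical LP64
 * layout): for struct bpf_hdr, offsetof(struct bpf_hdr, bh_hdrlen) is 24
 * and sizeof(bh_hdrlen) is 2, so the macro yields 26, whereas
 * sizeof(struct bpf_hdr) would round up to 32.
 */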
175 
176 #ifdef COMPAT_FREEBSD32
177 #include <sys/mount.h>
178 #include <compat/freebsd32/freebsd32.h>
179 #define BPF_ALIGNMENT32 sizeof(int32_t)
180 #define	BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32)
181 
182 #ifndef BURN_BRIDGES
183 /*
184  * 32-bit version of structure prepended to each packet.  We use this header
185  * instead of the standard one for 32-bit streams.  We mark a stream as
186  * 32-bit the first time we see a 32-bit compat ioctl request.
187  */
188 struct bpf_hdr32 {
189 	struct timeval32 bh_tstamp;	/* time stamp */
190 	uint32_t	bh_caplen;	/* length of captured portion */
191 	uint32_t	bh_datalen;	/* original length of packet */
192 	uint16_t	bh_hdrlen;	/* length of bpf header (this struct
193 					   plus alignment padding) */
194 };
195 #endif
196 
197 struct bpf_program32 {
198 	u_int bf_len;
199 	uint32_t bf_insns;
200 };
201 
202 struct bpf_dltlist32 {
203 	u_int	bfl_len;
204 	u_int	bfl_list;
205 };
206 
207 #define	BIOCSETF32	_IOW('B', 103, struct bpf_program32)
208 #define	BIOCSRTIMEOUT32	_IOW('B', 109, struct timeval32)
209 #define	BIOCGRTIMEOUT32	_IOR('B', 110, struct timeval32)
210 #define	BIOCGDLTLIST32	_IOWR('B', 121, struct bpf_dltlist32)
211 #define	BIOCSETWF32	_IOW('B', 123, struct bpf_program32)
212 #define	BIOCSETFNR32	_IOW('B', 130, struct bpf_program32)
213 #endif
214 
215 #define BPF_LOCK()		sx_xlock(&bpf_sx)
216 #define BPF_UNLOCK()		sx_xunlock(&bpf_sx)
217 #define BPF_LOCK_ASSERT()	sx_assert(&bpf_sx, SA_XLOCKED)
218 /*
219  * bpf_iflist is a list of BPF interface structures, each corresponding to a
220  * specific DLT. The same network interface might have several BPF interface
221  * structures registered by different layers in the stack (e.g., 802.11
222  * frames, Ethernet frames, etc.).
223  */
224 VNET_DEFINE_STATIC(LIST_HEAD(, bpf_if), bpf_iflist) = LIST_HEAD_INITIALIZER();
225 #define	V_bpf_iflist	VNET(bpf_iflist)
226 static struct sx	bpf_sx;		/* bpf global lock */
227 
228 static void	bpfif_ref(struct bpf_if *);
229 static void	bpfif_rele(struct bpf_if *);
230 
231 static void	bpfd_ref(struct bpf_d *);
232 static void	bpfd_rele(struct bpf_d *);
233 static int	bpf_attachd(struct bpf_d *d, struct bpf_if *);
234 static void	bpf_detachd(struct bpf_d *, bool);
235 static void	bpfd_free(epoch_context_t);
236 static void	bpf_timed_out(void *);
237 static __inline void
238 		bpf_wakeup(struct bpf_d *);
239 static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
240 		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
241 		    struct bintime *);
242 static void	reset_d(struct bpf_d *);
243 static int	bpf_getiflist(struct bpf_iflist *);
244 static int	bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
245 static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
246 static int	bpf_setdlt(struct bpf_d *, u_int);
247 static void	filt_bpfdetach(struct knote *);
248 static int	filt_bpfread(struct knote *, long);
249 static int	filt_bpfwrite(struct knote *, long);
250 static void	bpf_drvinit(void *);
251 static int	bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
252 
253 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
254     "bpf sysctl");
255 int bpf_maxinsns = BPF_MAXINSNS;
256 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
257     &bpf_maxinsns, 0, "Maximum bpf program instructions");
258 static int bpf_zerocopy_enable = 0;
259 SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
260     &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
261 static SYSCTL_NODE(_net_bpf, OID_AUTO, stats,
262     CTLFLAG_VNET | CTLFLAG_MPSAFE | CTLFLAG_RW,
263     bpf_stats_sysctl, "bpf statistics portal");
264 
265 VNET_DEFINE_STATIC(int, bpf_optimize_writers) = 0;
266 #define	V_bpf_optimize_writers VNET(bpf_optimize_writers)
267 SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RWTUN,
268     &VNET_NAME(bpf_optimize_writers), 0,
269     "Do not send packets until BPF program is set");
270 
271 static	d_open_t	bpfopen;
272 static	d_read_t	bpfread;
273 static	d_write_t	bpfwrite;
274 static	d_ioctl_t	bpfioctl;
275 static	d_poll_t	bpfpoll;
276 static	d_kqfilter_t	bpfkqfilter;
277 
278 static struct cdevsw bpf_cdevsw = {
279 	.d_version =	D_VERSION,
280 	.d_open =	bpfopen,
281 	.d_read =	bpfread,
282 	.d_write =	bpfwrite,
283 	.d_ioctl =	bpfioctl,
284 	.d_poll =	bpfpoll,
285 	.d_name =	"bpf",
286 	.d_kqfilter =	bpfkqfilter,
287 };
288 
289 static const struct filterops bpfread_filtops = {
290 	.f_isfd = 1,
291 	.f_detach = filt_bpfdetach,
292 	.f_event = filt_bpfread,
293 	.f_copy = knote_triv_copy,
294 };
295 
296 static const struct filterops bpfwrite_filtops = {
297 	.f_isfd = 1,
298 	.f_detach = filt_bpfdetach,
299 	.f_event = filt_bpfwrite,
300 	.f_copy = knote_triv_copy,
301 };
302 
303 /*
304  * LOCKING MODEL USED BY BPF
305  *
306  * Locks:
307  * 1) Global lock (BPF_LOCK). An sx lock protecting some global counters
308  * and all bpf_iflist changes, and serializing ioctl access to bpf descriptors.
309  * 2) Descriptor lock. A mutex protecting BPF buffers and various
310  * structure fields used by bpf_*tap* code.
311  *
312  * Lock order: global lock, then descriptor lock.
313  *
314  * There are several possible consumers:
315  *
316  * 1. The kernel registers an interface pointer with bpfattach().
317  * Each call allocates a new bpf_if structure, references the ifnet pointer
318  * and links the bpf_if into the bpf_iflist chain. This is protected by the
319  * global lock.
320  *
321  * 2. A userland application issues ioctl() calls on a bpf_d descriptor.
322  * All such calls are serialized with the global lock. BPF filters can be
323  * changed, but the pointer to the old filter is freed using NET_EPOCH_CALL().
324  * Thus it should be safe for bpf_tap/bpf_mtap* code to access the
325  * filter pointers, even if a change happens during bpf_tap execution.
326  * Destruction of a bpf_d descriptor is also done using NET_EPOCH_CALL().
327  *
328  * 3. A userland application can write packets to a bpf_d descriptor.
329  * Here we need to be sure that the ifnet won't disappear during bpfwrite().
330  *
331  * 4. The kernel invokes the bpf_tap/bpf_mtap* functions. Access to
332  * bif_dlist is protected by a net_epoch_preempt section, so it should
333  * be safe to access the bpf_d descriptor inside the section.
334  *
335  * 5. The kernel invokes bpfdetach() when an interface is destroyed. All
336  * lists are modified with the global lock held and the actual free() is
337  * done using NET_EPOCH_CALL().
338  */
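/*
 * Illustrative sketch of the reader-side pattern described in (4) above
 * (a simplified example, not a verbatim copy of the tap code): the list
 * of attached descriptors is walked inside an epoch section, so memory
 * freed via NET_EPOCH_CALL() cannot be reclaimed while it is in use.
 *
 *	struct epoch_tracker et;
 *	struct bpf_d *d;
 *
 *	NET_EPOCH_ENTER(et);
 *	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 *		// run d->bd_rfilter, then catchpacket() under BPFD_LOCK()
 *	}
 *	NET_EPOCH_EXIT(et);
 */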
339 
340 static void
341 bpfif_free(epoch_context_t ctx)
342 {
343 	struct bpf_if *bp;
344 
345 	bp = __containerof(ctx, struct bpf_if, epoch_ctx);
346 	free(bp, M_BPF);
347 }
348 
349 static void
350 bpfif_ref(struct bpf_if *bp)
351 {
352 
353 	refcount_acquire(&bp->bif_refcnt);
354 }
355 
356 static void
357 bpfif_rele(struct bpf_if *bp)
358 {
359 
360 	if (!refcount_release(&bp->bif_refcnt))
361 		return;
362 	NET_EPOCH_CALL(bpfif_free, &bp->epoch_ctx);
363 }
364 
365 static void
366 bpfd_ref(struct bpf_d *d)
367 {
368 
369 	refcount_acquire(&d->bd_refcnt);
370 }
371 
372 static void
373 bpfd_rele(struct bpf_d *d)
374 {
375 
376 	if (!refcount_release(&d->bd_refcnt))
377 		return;
378 	NET_EPOCH_CALL(bpfd_free, &d->epoch_ctx);
379 }
380 
381 static struct bpf_program_buffer*
382 bpf_program_buffer_alloc(size_t size, int flags)
383 {
384 
385 	return (malloc(sizeof(struct bpf_program_buffer) + size,
386 	    M_BPF, flags));
387 }
388 
389 static void
390 bpf_program_buffer_free(epoch_context_t ctx)
391 {
392 	struct bpf_program_buffer *ptr;
393 
394 	ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx);
395 #ifdef BPF_JITTER
396 	if (ptr->func != NULL)
397 		bpf_destroy_jit_filter(ptr->func);
398 #endif
399 	free(ptr, M_BPF);
400 }
401 
402 /*
403  * Wrapper functions for various buffering methods.  If the set of buffer
404  * modes expands, we will probably want to introduce a switch data structure
405  * similar to protosw, etc.
406  */
407 static void
408 bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
409     u_int len)
410 {
411 
412 	BPFD_LOCK_ASSERT(d);
413 
414 	switch (d->bd_bufmode) {
415 	case BPF_BUFMODE_BUFFER:
416 		return (bpf_buffer_append_bytes(d, buf, offset, src, len));
417 
418 	case BPF_BUFMODE_ZBUF:
419 		counter_u64_add(d->bd_zcopy, 1);
420 		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
421 
422 	default:
423 		panic("bpf_buf_append_bytes");
424 	}
425 }
426 
427 static void
428 bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
429     u_int len)
430 {
431 
432 	BPFD_LOCK_ASSERT(d);
433 
434 	switch (d->bd_bufmode) {
435 	case BPF_BUFMODE_BUFFER:
436 		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
437 
438 	case BPF_BUFMODE_ZBUF:
439 		counter_u64_add(d->bd_zcopy, 1);
440 		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
441 
442 	default:
443 		panic("bpf_buf_append_mbuf");
444 	}
445 }
446 
447 /*
448  * This function gets called when the free buffer is re-assigned.
449  */
450 static void
451 bpf_buf_reclaimed(struct bpf_d *d)
452 {
453 
454 	BPFD_LOCK_ASSERT(d);
455 
456 	switch (d->bd_bufmode) {
457 	case BPF_BUFMODE_BUFFER:
458 		return;
459 
460 	case BPF_BUFMODE_ZBUF:
461 		bpf_zerocopy_buf_reclaimed(d);
462 		return;
463 
464 	default:
465 		panic("bpf_buf_reclaimed");
466 	}
467 }
468 
469 /*
470  * If the buffer mechanism has a way to decide that a held buffer can be made
471  * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
472  * returned if the buffer can be discarded, (0) is returned if it cannot.
473  */
474 static int
475 bpf_canfreebuf(struct bpf_d *d)
476 {
477 
478 	BPFD_LOCK_ASSERT(d);
479 
480 	switch (d->bd_bufmode) {
481 	case BPF_BUFMODE_ZBUF:
482 		return (bpf_zerocopy_canfreebuf(d));
483 	}
484 	return (0);
485 }
486 
487 /*
488  * Allow the buffer model to indicate that the current store buffer is
489  * immutable, regardless of the appearance of space.  Return (1) if the
490  * buffer is writable, and (0) if not.
491  */
492 static int
493 bpf_canwritebuf(struct bpf_d *d)
494 {
495 	BPFD_LOCK_ASSERT(d);
496 
497 	switch (d->bd_bufmode) {
498 	case BPF_BUFMODE_ZBUF:
499 		return (bpf_zerocopy_canwritebuf(d));
500 	}
501 	return (1);
502 }
503 
504 /*
505  * Notify buffer model that an attempt to write to the store buffer has
506  * resulted in a dropped packet, in which case the buffer may be considered
507  * full.
508  */
509 static void
510 bpf_buffull(struct bpf_d *d)
511 {
512 
513 	BPFD_LOCK_ASSERT(d);
514 
515 	switch (d->bd_bufmode) {
516 	case BPF_BUFMODE_ZBUF:
517 		bpf_zerocopy_buffull(d);
518 		break;
519 	}
520 }
521 
522 /*
523  * Notify the buffer model that a buffer has moved into the hold position.
524  */
525 void
526 bpf_bufheld(struct bpf_d *d)
527 {
528 
529 	BPFD_LOCK_ASSERT(d);
530 
531 	switch (d->bd_bufmode) {
532 	case BPF_BUFMODE_ZBUF:
533 		bpf_zerocopy_bufheld(d);
534 		break;
535 	}
536 }
537 
538 static void
539 bpf_free(struct bpf_d *d)
540 {
541 
542 	switch (d->bd_bufmode) {
543 	case BPF_BUFMODE_BUFFER:
544 		return (bpf_buffer_free(d));
545 
546 	case BPF_BUFMODE_ZBUF:
547 		return (bpf_zerocopy_free(d));
548 
549 	default:
550 		panic("bpf_buf_free");
551 	}
552 }
553 
554 static int
555 bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
556 {
557 
558 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
559 		return (EOPNOTSUPP);
560 	return (bpf_buffer_uiomove(d, buf, len, uio));
561 }
562 
563 static int
564 bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
565 {
566 
567 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
568 		return (EOPNOTSUPP);
569 	return (bpf_buffer_ioctl_sblen(d, i));
570 }
571 
572 static int
573 bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
574 {
575 
576 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
577 		return (EOPNOTSUPP);
578 	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
579 }
580 
581 static int
582 bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
583 {
584 
585 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
586 		return (EOPNOTSUPP);
587 	return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
588 }
589 
590 static int
591 bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
592 {
593 
594 	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
595 		return (EOPNOTSUPP);
596 	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
597 }
598 
599 /*
600  * Check if we need to upgrade our descriptor @d from write-only mode.
601  */
602 static int
603 bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode,
604     int flen)
605 {
606 	int is_snap, need_upgrade;
607 
608 	/*
609 	 * Check if we've already upgraded or the new filter is empty.
610 	 */
611 	if (d->bd_writer == 0 || fcode == NULL)
612 		return (0);
613 
614 	need_upgrade = 0;
615 
616 	/*
617 	 * Check if cmd looks like snaplen setting from
618 	 * pcap_bpf.c:pcap_open_live().
619 	 * Note we're not checking the .k value here:
620 	 * while pcap_open_live() definitely sets it to a non-zero value,
621 	 * we'd prefer to treat the k=0 (deny ALL) case the same way, i.e.
622 	 * do not consider upgrading immediately.
623 	 */
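	/*
	 * Illustrative example (an assumption about libpcap behaviour, not
	 * verified here): the initial snaplen program installed by
	 * pcap_open_live() is the single instruction
	 * BPF_STMT(BPF_RET | BPF_K, snaplen), i.e. "accept up to snaplen
	 * bytes of every packet", which is what the check below matches.
	 */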
624 	if (cmd == BIOCSETF && flen == 1 &&
625 	    fcode[0].code == (BPF_RET | BPF_K))
626 		is_snap = 1;
627 	else
628 		is_snap = 0;
629 
630 	if (is_snap == 0) {
631 		/*
632 		 * We're setting first filter and it doesn't look like
633 		 * setting snaplen.  We're probably using bpf directly.
634 		 * Upgrade immediately.
635 		 */
636 		need_upgrade = 1;
637 	} else {
638 		/*
639 		 * Do not require upgrade by first BIOCSETF
640 		 * (used to set snaplen) by pcap_open_live().
641 		 */
642 
643 		if (--d->bd_writer == 0) {
644 			/*
645 			 * The first snaplen filter has already
646 			 * been set. This is probably a catch-all
647 			 * filter.
648 			 */
649 			need_upgrade = 1;
650 		}
651 	}
652 
653 	CTR5(KTR_NET,
654 	    "%s: filter function set by pid %d, "
655 	    "bd_writer counter %d, snap %d upgrade %d",
656 	    __func__, d->bd_pid, d->bd_writer,
657 	    is_snap, need_upgrade);
658 
659 	return (need_upgrade);
660 }
661 
662 /*
663  * Detach a file from its interface.
664  */
665 static void
666 bpf_detachd(struct bpf_d *d, bool detached_ifp)
667 {
668 	struct bpf_if *bp;
669 	bool writer;
670 
671 	BPF_LOCK_ASSERT();
672 	CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
673 
674 	/* Check if descriptor is attached */
675 	if ((bp = d->bd_bif) == NULL)
676 		return;
677 
678 	BPFD_LOCK(d);
679 	CK_LIST_REMOVE(d, bd_next);
680 	writer = (d->bd_writer > 0);
681 	d->bd_bif = NULL;
682 	if (detached_ifp) {
683 		/*
684 		 * Notify the descriptor that it is detached, so that any
685 		 * sleepers wake up and get ENXIO.
686 		 */
687 		bpf_wakeup(d);
688 	}
689 	BPFD_UNLOCK(d);
690 
691 	if (!writer)
692 		bif_detachd(bp);
693 
694 	if (d->bd_promisc && !detached_ifp) {
695 		d->bd_promisc = 0;
696 		(void)bif_promisc(bp, false);
697 	}
698 
699 	bpfif_rele(bp);
700 }
701 
702 /*
703  * Close the descriptor by detaching it from its interface,
704  * deallocating its buffers, and marking it free.
705  */
706 static void
707 bpf_dtor(void *data)
708 {
709 	struct bpf_d *d = data;
710 
711 	BPFD_LOCK(d);
712 	if (d->bd_state == BPF_WAITING)
713 		callout_stop(&d->bd_callout);
714 	d->bd_state = BPF_IDLE;
715 	BPFD_UNLOCK(d);
716 	funsetown(&d->bd_sigio);
717 	BPF_LOCK();
718 	bpf_detachd(d, false);
719 	BPF_UNLOCK();
720 #ifdef MAC
721 	mac_bpfdesc_destroy(d);
722 #endif /* MAC */
723 	seldrain(&d->bd_sel);
724 	knlist_destroy(&d->bd_sel.si_note);
725 	callout_drain(&d->bd_callout);
726 	bpfd_rele(d);
727 }
728 
729 /*
730  * Open ethernet device.  Returns ENXIO for illegal minor device number,
731  * EBUSY if file is open by another process.
732  */
733 /* ARGSUSED */
734 static	int
735 bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
736 {
737 	struct bpf_d *d;
738 	int error;
739 
740 	d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
741 	error = devfs_set_cdevpriv(d, bpf_dtor);
742 	if (error != 0) {
743 		free(d, M_BPF);
744 		return (error);
745 	}
746 
747 	/* Setup counters */
748 	d->bd_rcount = counter_u64_alloc(M_WAITOK);
749 	d->bd_dcount = counter_u64_alloc(M_WAITOK);
750 	d->bd_fcount = counter_u64_alloc(M_WAITOK);
751 	d->bd_wcount = counter_u64_alloc(M_WAITOK);
752 	d->bd_wfcount = counter_u64_alloc(M_WAITOK);
753 	d->bd_wdcount = counter_u64_alloc(M_WAITOK);
754 	d->bd_zcopy = counter_u64_alloc(M_WAITOK);
755 
756 	/*
757 	 * For historical reasons, perform a one-time initialization call to
758 	 * the buffer routines, even though we're not yet committed to a
759 	 * particular buffer method.
760 	 */
761 	bpf_buffer_init(d);
762 	if ((flags & FREAD) == 0)
763 		d->bd_writer = 2;
764 	d->bd_bufmode = BPF_BUFMODE_BUFFER;
765 	d->bd_sig = SIGIO;
766 	d->bd_direction = BPF_D_INOUT;
767 	refcount_init(&d->bd_refcnt, 1);
768 	BPF_PID_REFRESH(d, td);
769 #ifdef MAC
770 	mac_bpfdesc_init(d);
771 	mac_bpfdesc_create(td->td_ucred, d);
772 #endif
773 	mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF);
774 	callout_init_mtx(&d->bd_callout, &d->bd_lock, 0);
775 	knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock);
776 
777 	/* Disable VLAN pcp tagging. */
778 	d->bd_pcp = 0;
779 
780 	return (0);
781 }
782 
783 /*
784  *  bpfread - read next chunk of packets from buffers
785  */
786 static	int
787 bpfread(struct cdev *dev, struct uio *uio, int ioflag)
788 {
789 	struct bpf_d *d;
790 	int error;
791 	int non_block;
792 	int timed_out;
793 
794 	error = devfs_get_cdevpriv((void **)&d);
795 	if (error != 0)
796 		return (error);
797 
798 	/*
799 	 * Restrict the application to use a buffer the same size as
800 	 * the kernel buffers.
801 	 */
802 	if (uio->uio_resid != d->bd_bufsize)
803 		return (EINVAL);
804 
805 	non_block = ((ioflag & O_NONBLOCK) != 0);
806 
807 	BPFD_LOCK(d);
808 	BPF_PID_REFRESH_CUR(d);
809 	if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
810 		BPFD_UNLOCK(d);
811 		return (EOPNOTSUPP);
812 	}
813 	if (d->bd_state == BPF_WAITING)
814 		callout_stop(&d->bd_callout);
815 	timed_out = (d->bd_state == BPF_TIMED_OUT);
816 	d->bd_state = BPF_IDLE;
817 	while (d->bd_flags & BPFD_HBUF_INUSE) {
818 		error = mtx_sleep(&d->bd_hbuf, &d->bd_lock, PRINET | PCATCH,
819 		    "bd_hbuf", 0);
820 		if (error != 0) {
821 			BPFD_UNLOCK(d);
822 			return (error);
823 		}
824 	}
825 	/*
826 	 * If the hold buffer is empty, then do a timed sleep, which
827 	 * ends when the timeout expires or when enough packets
828 	 * have arrived to fill the store buffer.
829 	 */
830 	while (d->bd_hbuf == NULL) {
831 		if (d->bd_slen != 0) {
832 			/*
833 			 * One or more packets either arrived since the
834 			 * previous read or arrived while we were asleep.
835 			 */
836 			if ((d->bd_flags & BPFD_IMMEDIATE) || non_block ||
837 			    timed_out) {
838 				/*
839 				 * Rotate the buffers and return what's here
840 				 * if we are in immediate mode, non-blocking
841 				 * flag is set, or this descriptor timed out.
842 				 */
843 				ROTATE_BUFFERS(d);
844 				break;
845 			}
846 		}
847 
848 		/*
849 		 * No data is available, check to see if the bpf device
850 		 * is still pointed at a real interface.  If not, return
851 		 * ENXIO so that the userland process knows to rebind
852 		 * it before using it again.
853 		 */
854 		if (d->bd_bif == NULL) {
855 			BPFD_UNLOCK(d);
856 			return (ENXIO);
857 		}
858 
859 		if (non_block) {
860 			BPFD_UNLOCK(d);
861 			return (EWOULDBLOCK);
862 		}
863 		error = msleep(d, &d->bd_lock, PRINET | PCATCH,
864 		     "bpf", d->bd_rtout);
865 		if (error == EINTR || error == ERESTART) {
866 			BPFD_UNLOCK(d);
867 			return (error);
868 		}
869 		if (error == EWOULDBLOCK) {
870 			/*
871 			 * On a timeout, return what's in the buffer,
872 			 * which may be nothing.  If there is something
873 			 * in the store buffer, we can rotate the buffers.
874 			 */
875 			if (d->bd_hbuf)
876 				/*
877 				 * We filled up the buffer in between
878 				 * getting the timeout and arriving
879 				 * here, so we don't need to rotate.
880 				 */
881 				break;
882 
883 			if (d->bd_slen == 0) {
884 				BPFD_UNLOCK(d);
885 				return (0);
886 			}
887 			ROTATE_BUFFERS(d);
888 			break;
889 		}
890 	}
891 	/*
892 	 * At this point, we know we have something in the hold slot.
893 	 */
894 	d->bd_flags |= BPFD_HBUF_INUSE;
895 	BPFD_UNLOCK(d);
896 
897 	/*
898 	 * Move data from hold buffer into user space.
899 	 * We know the entire buffer is transferred since
900 	 * we checked above that the read buffer is bpf_bufsize bytes.
901   	 *
902 	 * We do not have to worry about simultaneous reads because
903 	 * we waited for sole access to the hold buffer above.
904 	 */
905 	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
906 
907 	BPFD_LOCK(d);
908 	if (d->bd_flags & BPFD_HBUF_INUSE) {
909 		KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf"));
910 		d->bd_fbuf = d->bd_hbuf;
911 		d->bd_hbuf = NULL;
912 		d->bd_hlen = 0;
913 		bpf_buf_reclaimed(d);
914 		d->bd_flags &= ~BPFD_HBUF_INUSE;
915 		wakeup(&d->bd_hbuf);
916 	}
917 	BPFD_UNLOCK(d);
918 
919 	return (error);
920 }
921 
922 /*
923  * If there are processes sleeping on this descriptor, wake them up.
924  */
925 static __inline void
926 bpf_wakeup(struct bpf_d *d)
927 {
928 
929 	BPFD_LOCK_ASSERT(d);
930 	if (d->bd_state == BPF_WAITING) {
931 		callout_stop(&d->bd_callout);
932 		d->bd_state = BPF_IDLE;
933 	}
934 	wakeup(d);
935 	if ((d->bd_flags & BPFD_ASYNC) && d->bd_sig && d->bd_sigio)
936 		pgsigio(&d->bd_sigio, d->bd_sig, 0);
937 
938 	selwakeuppri(&d->bd_sel, PRINET);
939 	KNOTE_LOCKED(&d->bd_sel.si_note, 0);
940 }
941 
942 static void
943 bpf_timed_out(void *arg)
944 {
945 	struct bpf_d *d = (struct bpf_d *)arg;
946 
947 	BPFD_LOCK_ASSERT(d);
948 
949 	if (callout_pending(&d->bd_callout) ||
950 	    !callout_active(&d->bd_callout))
951 		return;
952 	if (d->bd_state == BPF_WAITING) {
953 		d->bd_state = BPF_TIMED_OUT;
954 		if (d->bd_slen != 0)
955 			bpf_wakeup(d);
956 	}
957 }
958 
959 static int
960 bpf_ready(struct bpf_d *d)
961 {
962 
963 	BPFD_LOCK_ASSERT(d);
964 
965 	if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
966 		return (1);
967 	if (((d->bd_flags & BPFD_IMMEDIATE) || d->bd_state == BPF_TIMED_OUT) &&
968 	    d->bd_slen != 0)
969 		return (1);
970 	return (0);
971 }
972 
973 static int
974 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
975 {
976 	struct epoch_tracker et;
977 	struct bpf_if *bp;
978 	struct bpf_d *d;
979 	struct mbuf *m, *mc;
980 	ssize_t len;
981 	int error;
982 
983 	error = devfs_get_cdevpriv((void **)&d);
984 	if (error != 0)
985 		return (error);
986 
987 	if (uio->uio_resid == 0)
988 		return (0);
989 
990 	BPFD_LOCK(d);
991 	if ((bp = d->bd_bif) == NULL)
992 		error = ENXIO;
993 	else if (bp->bif_methods->bif_write == NULL)
994 		error = EOPNOTSUPP;
995 	if (error) {
996 		BPFD_UNLOCK(d);
997 		counter_u64_add(d->bd_wdcount, 1);
998 		return (error);
999 	}
1000 	bpfd_ref(d);
1001 	BPFD_UNLOCK(d);
1002 
1003 	len = uio->uio_resid;
1004 	/* Allocate a mbuf, up to MJUM16BYTES bytes, for our write. */
1005 	m = m_get3(len, M_WAITOK, MT_DATA, M_PKTHDR);
1006 	if (m == NULL) {
1007 		error = ENOMEM;
1008 		goto fail_wref;
1009 	}
1010 	m->m_pkthdr.len = m->m_len = len;
1011 
1012 	error = uiomove(mtod(m, u_char *), len, uio);
1013 	if (error)
1014 		goto fail_wref;
1015 
1016 	if (bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len) == 0) {
1017 		error = EPERM;
1018 		goto fail_wref;
1019 	}
1020 
1021 	if (d->bd_flags & BPFD_FEEDBACK) {
1022 		mc = m_dup(m, M_WAITOK);
1023 		/* Set M_PROMISC for outgoing packets to be discarded. */
1024 		if (d->bd_direction == BPF_D_INOUT)
1025 			m->m_flags |= M_PROMISC;
1026 	} else
1027 		mc = NULL;
1028 
1029 	/* XXXGL: should belong to bpf_ifnet.c */
1030 	if (d->bd_pcp != 0)
1031 		(void)vlan_set_pcp(m, d->bd_pcp);
1032 
1033 	BPFD_LOCK(d);
1034 #ifdef MAC
1035 	mac_bpfdesc_create_mbuf(d, m);
1036 	if (mc != NULL)
1037 		mac_bpfdesc_create_mbuf(d, mc);
1038 #endif
1039 	/*
1040 	 * Check that descriptor is still attached to the interface.
1041 	 * This can happen on bpfdetach() or if other thread did BIOCSDLT.
1042 	 */
1043 	if (__predict_false(d->bd_bif != bp)) {
1044 		BPFD_UNLOCK(d);
1045 		m_freem(mc);
1046 		error = ENXIO;
1047 		goto fail_wref;
1048 	}
1049 	BPFD_UNLOCK(d);
1050 
1051 	NET_EPOCH_ENTER(et);
1052 	error = bp->bif_methods->bif_write(bp->bif_softc, m, mc, d->bd_flags);
1053 	NET_EPOCH_EXIT(et);
1054 	if (error)
1055 		counter_u64_add(d->bd_wdcount, 1);
1056 	else
1057 		counter_u64_add(d->bd_wfcount, 1);
1058 	bpfd_rele(d);
1059 
1060 	return (error);
1061 
1062 fail_wref:
1063 	counter_u64_add(d->bd_wdcount, 1);
1064 	bpfd_rele(d);
1065 	m_freem(m);
1066 	return (error);
1067 }
1068 
1069 /*
1070  * Reset a descriptor by flushing its packet buffer and clearing the receive
1071  * and drop counts.  This is doable for kernel-only buffers, but with
1072  * zero-copy buffers, we can't write to (or rotate) buffers that are
1073  * currently owned by userspace.  It would be nice if we could encapsulate
1074  * this logic in the buffer code rather than here.
1075  */
1076 static void
1077 reset_d(struct bpf_d *d)
1078 {
1079 
1080 	BPFD_LOCK_ASSERT(d);
1081 
1082 	while (d->bd_flags & BPFD_HBUF_INUSE)
1083 		mtx_sleep(&d->bd_hbuf, &d->bd_lock, PRINET, "bd_hbuf", 0);
1084 	if ((d->bd_hbuf != NULL) &&
1085 	    (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) {
1086 		/* Free the hold buffer. */
1087 		d->bd_fbuf = d->bd_hbuf;
1088 		d->bd_hbuf = NULL;
1089 		d->bd_hlen = 0;
1090 		bpf_buf_reclaimed(d);
1091 	}
1092 	if (bpf_canwritebuf(d))
1093 		d->bd_slen = 0;
1094 	counter_u64_zero(d->bd_rcount);
1095 	counter_u64_zero(d->bd_dcount);
1096 	counter_u64_zero(d->bd_fcount);
1097 	counter_u64_zero(d->bd_wcount);
1098 	counter_u64_zero(d->bd_wfcount);
1099 	counter_u64_zero(d->bd_wdcount);
1100 	counter_u64_zero(d->bd_zcopy);
1101 }
1102 
1103 /*
1104  *  FIONREAD		Check for read packet available.
1105  *  BIOCGETIFLIST	Get list of all tap points.
1106  *  BIOCGBLEN		Get buffer len [for read()].
1107  *  BIOCSETF		Set read filter.
1108  *  BIOCSETFNR		Set read filter without resetting descriptor.
1109  *  BIOCSETWF		Set write filter.
1110  *  BIOCFLUSH		Flush read packet buffer.
1111  *  BIOCPROMISC		Put interface into promiscuous mode.
1112  *  BIOCGDLT		Get link layer type.
1113  *  BIOCGETIF		Get interface name.
1114  *  BIOCSETIF		Set interface.
1115  *  BIOCSRTIMEOUT	Set read timeout.
1116  *  BIOCGRTIMEOUT	Get read timeout.
1117  *  BIOCGSTATS		Get packet stats.
1118  *  BIOCIMMEDIATE	Set immediate mode.
1119  *  BIOCVERSION		Get filter language version.
1120  *  BIOCGHDRCMPLT	Get "header already complete" flag
1121  *  BIOCSHDRCMPLT	Set "header already complete" flag
1122  *  BIOCGDIRECTION	Get packet direction flag
1123  *  BIOCSDIRECTION	Set packet direction flag
1124  *  BIOCGTSTAMP		Get time stamp format and resolution.
1125  *  BIOCSTSTAMP		Set time stamp format and resolution.
1126  *  BIOCLOCK		Set "locked" flag
1127  *  BIOCFEEDBACK	Set packet feedback mode.
1128  *  BIOCSETZBUF		Set current zero-copy buffer locations.
1129  *  BIOCGETZMAX		Get maximum zero-copy buffer size.
1130  *  BIOCROTZBUF		Force rotation of zero-copy buffer
1131  *  BIOCSETBUFMODE	Set buffer mode.
1132  *  BIOCGETBUFMODE	Get current buffer mode.
1133  *  BIOCSETVLANPCP	Set VLAN PCP tag.
1134  */
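/*
 * Illustrative userland usage of this ioctl interface (a sketch only;
 * the device path, interface name and lack of error handling are
 * examples, not mandated by this code):
 *
 *	int fd = open("/dev/bpf", O_RDWR);
 *	struct ifreq ifr = { .ifr_name = "em0" };
 *	u_int imm = 1, blen;
 *
 *	ioctl(fd, BIOCSETIF, &ifr);	// bind to a tap point
 *	ioctl(fd, BIOCIMMEDIATE, &imm);	// deliver packets as they arrive
 *	ioctl(fd, BIOCGBLEN, &blen);	// read() must use exactly this size
 *	char *buf = malloc(blen);
 *	ssize_t n = read(fd, buf, blen);
 */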
1135 /* ARGSUSED */
1136 static	int
1137 bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
1138     struct thread *td)
1139 {
1140 	struct bpf_d *d;
1141 	int error;
1142 
1143 	error = devfs_get_cdevpriv((void **)&d);
1144 	if (error != 0)
1145 		return (error);
1146 
1147 	/*
1148 	 * Refresh PID associated with this descriptor.
1149 	 */
1150 	BPFD_LOCK(d);
1151 	BPF_PID_REFRESH(d, td);
1152 	if (d->bd_state == BPF_WAITING)
1153 		callout_stop(&d->bd_callout);
1154 	d->bd_state = BPF_IDLE;
1155 	BPFD_UNLOCK(d);
1156 
1157 	if (d->bd_flags & BPFD_LOCKED) {
1158 		switch (cmd) {
1159 		case BIOCGETIFLIST:
1160 		case BIOCGBLEN:
1161 		case BIOCFLUSH:
1162 		case BIOCGDLT:
1163 		case BIOCGDLTLIST:
1164 #ifdef COMPAT_FREEBSD32
1165 		case BIOCGDLTLIST32:
1166 #endif
1167 		case BIOCGETIF:
1168 		case BIOCGRTIMEOUT:
1169 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1170 		case BIOCGRTIMEOUT32:
1171 #endif
1172 		case BIOCGSTATS:
1173 		case BIOCVERSION:
1174 		case BIOCGRSIG:
1175 		case BIOCGHDRCMPLT:
1176 		case BIOCSTSTAMP:
1177 		case BIOCFEEDBACK:
1178 		case FIONREAD:
1179 		case BIOCLOCK:
1180 		case BIOCSRTIMEOUT:
1181 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1182 		case BIOCSRTIMEOUT32:
1183 #endif
1184 		case BIOCIMMEDIATE:
1185 		case TIOCGPGRP:
1186 		case BIOCROTZBUF:
1187 			break;
1188 		default:
1189 			return (EPERM);
1190 		}
1191 	}
1192 #ifdef COMPAT_FREEBSD32
1193 	/*
1194 	 * If we see a 32-bit compat ioctl, mark the stream as 32-bit so
1195 	 * that it will get 32-bit packet headers.
1196 	 */
1197 	switch (cmd) {
1198 	case BIOCSETF32:
1199 	case BIOCSETFNR32:
1200 	case BIOCSETWF32:
1201 	case BIOCGDLTLIST32:
1202 	case BIOCGRTIMEOUT32:
1203 	case BIOCSRTIMEOUT32:
1204 		if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
1205 			BPFD_LOCK(d);
1206 			d->bd_compat32 = 1;
1207 			BPFD_UNLOCK(d);
1208 		}
1209 	}
1210 #endif
1211 
1212 	CURVNET_SET(TD_TO_VNET(td));
1213 	switch (cmd) {
1214 	default:
1215 		error = EINVAL;
1216 		break;
1217 
1218 	/*
1219 	 * Check for read packet available.
1220 	 */
1221 	case FIONREAD:
1222 		{
1223 			int n;
1224 
1225 			BPFD_LOCK(d);
1226 			n = d->bd_slen;
1227 			while (d->bd_flags & BPFD_HBUF_INUSE)
1228 				mtx_sleep(&d->bd_hbuf, &d->bd_lock,
1229 				    PRINET, "bd_hbuf", 0);
1230 			if (d->bd_hbuf)
1231 				n += d->bd_hlen;
1232 			BPFD_UNLOCK(d);
1233 
1234 			*(int *)addr = n;
1235 			break;
1236 		}
1237 	/*
1238 	 * Get list of all tap points.
1239 	 */
1240 	case BIOCGETIFLIST:
1241 		error = bpf_getiflist((struct bpf_iflist *)addr);
1242 		break;
1243 
1244 	/*
1245 	 * Get buffer len [for read()].
1246 	 */
1247 	case BIOCGBLEN:
1248 		BPFD_LOCK(d);
1249 		*(u_int *)addr = d->bd_bufsize;
1250 		BPFD_UNLOCK(d);
1251 		break;
1252 
1253 	/*
1254 	 * Set buffer length.
1255 	 */
1256 	case BIOCSBLEN:
1257 		error = bpf_ioctl_sblen(d, (u_int *)addr);
1258 		break;
1259 
1260 	/*
1261 	 * Set link layer read filter.
1262 	 */
1263 	case BIOCSETF:
1264 	case BIOCSETFNR:
1265 	case BIOCSETWF:
1266 #ifdef COMPAT_FREEBSD32
1267 	case BIOCSETF32:
1268 	case BIOCSETFNR32:
1269 	case BIOCSETWF32:
1270 #endif
1271 		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
1272 		break;
1273 
1274 	/*
1275 	 * Flush read packet buffer.
1276 	 */
1277 	case BIOCFLUSH:
1278 		BPFD_LOCK(d);
1279 		reset_d(d);
1280 		BPFD_UNLOCK(d);
1281 		break;
1282 
1283 	/*
1284 	 * Put interface into promiscuous mode.
1285 	 */
1286 	case BIOCPROMISC:
1287 		BPF_LOCK();
1288 		if (d->bd_bif == NULL) {
1289 			/*
1290 			 * No interface attached yet.
1291 			 */
1292 			error = EINVAL;
1293 		} else if (d->bd_promisc == 0) {
1294 			struct bpf_if *bp = d->bd_bif;
1295 
1296 			if ((error = bif_promisc(bp, true)) == 0)
1297 				d->bd_promisc = 1;
1298 		}
1299 		BPF_UNLOCK();
1300 		break;
1301 
1302 	/*
1303 	 * Get current data link type.
1304 	 */
1305 	case BIOCGDLT:
1306 		BPF_LOCK();
1307 		if (d->bd_bif == NULL)
1308 			error = EINVAL;
1309 		else
1310 			*(u_int *)addr = d->bd_bif->bif_dlt;
1311 		BPF_UNLOCK();
1312 		break;
1313 
1314 	/*
1315 	 * Get a list of supported data link types.
1316 	 */
1317 #ifdef COMPAT_FREEBSD32
1318 	case BIOCGDLTLIST32:
1319 		{
1320 			struct bpf_dltlist32 *list32;
1321 			struct bpf_dltlist dltlist;
1322 
1323 			list32 = (struct bpf_dltlist32 *)addr;
1324 			dltlist.bfl_len = list32->bfl_len;
1325 			dltlist.bfl_list = PTRIN(list32->bfl_list);
1326 			BPF_LOCK();
1327 			if (d->bd_bif == NULL)
1328 				error = EINVAL;
1329 			else {
1330 				error = bpf_getdltlist(d, &dltlist);
1331 				if (error == 0)
1332 					list32->bfl_len = dltlist.bfl_len;
1333 			}
1334 			BPF_UNLOCK();
1335 			break;
1336 		}
1337 #endif
1338 
1339 	case BIOCGDLTLIST:
1340 		BPF_LOCK();
1341 		if (d->bd_bif == NULL)
1342 			error = EINVAL;
1343 		else
1344 			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
1345 		BPF_UNLOCK();
1346 		break;
1347 
1348 	/*
1349 	 * Set data link type.
1350 	 */
1351 	case BIOCSDLT:
1352 		BPF_LOCK();
1353 		if (d->bd_bif == NULL)
1354 			error = EINVAL;
1355 		else
1356 			error = bpf_setdlt(d, *(u_int *)addr);
1357 		BPF_UNLOCK();
1358 		break;
1359 
1360 	/*
1361 	 * Get interface name.
1362 	 */
1363 	case BIOCGETIF:
1364 		BPF_LOCK();
1365 		if (d->bd_bif == NULL)
1366 			error = EINVAL;
1367 		else {
1368 			struct bpf_if *const bp = d->bd_bif;
1369 			struct ifreq *const ifr = (struct ifreq *)addr;
1370 
1371 			strlcpy(ifr->ifr_name, bp->bif_name,
1372 			    sizeof(ifr->ifr_name));
1373 		}
1374 		BPF_UNLOCK();
1375 		break;
1376 
1377 	/*
1378 	 * Set interface.
1379 	 */
1380 	case BIOCSETIF: {
1381 		struct ifreq *const ifr = (struct ifreq *)addr;
1382 		struct bpf_if *bp;
1383 
1384 		/*
1385 		 * Behavior here depends on the buffering model.  If we're
1386 		 * using kernel memory buffers, then we can allocate them here.
1387 		 * If we're using zero-copy, then the user process must have
1388 		 * registered buffers by the time we get here.
1389 		 */
1390 		BPFD_LOCK(d);
1391 		if (d->bd_bufmode == BPF_BUFMODE_BUFFER &&
1392 		    d->bd_sbuf == NULL) {
1393 			u_int size;
1394 
1395 			size = d->bd_bufsize;
1396 			BPFD_UNLOCK(d);
1397 			error = bpf_buffer_ioctl_sblen(d, &size);
1398 			if (error != 0)
1399 				break;
1400 		} else
1401 			BPFD_UNLOCK(d);
1402 		BPF_LOCK();
1403 		/*
1404 		 * Look through attached interfaces for the named one.
1405 		 */
1406 		LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
1407 			if (strncmp(ifr->ifr_name, bp->bif_name,
1408 			    sizeof(ifr->ifr_name)) == 0)
1409 				break;
1410 		}
1411 		if (bp != NULL)
1412 			error = bpf_attachd(d, bp);
1413 		else
1414 			error = ENXIO;
1415 		BPF_UNLOCK();
1416 		break;
1417 	}
1418 	/*
1419 	 * Set read timeout.
1420 	 */
1421 	case BIOCSRTIMEOUT:
1422 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1423 	case BIOCSRTIMEOUT32:
1424 #endif
1425 		{
1426 			struct timeval *tv = (struct timeval *)addr;
1427 #if defined(COMPAT_FREEBSD32)
1428 			struct timeval32 *tv32;
1429 			struct timeval tv64;
1430 
1431 			if (cmd == BIOCSRTIMEOUT32) {
1432 				tv32 = (struct timeval32 *)addr;
1433 				tv = &tv64;
1434 				tv->tv_sec = tv32->tv_sec;
1435 				tv->tv_usec = tv32->tv_usec;
1436 			} else
1437 #endif
1438 				tv = (struct timeval *)addr;
1439 
1440 			/*
1441 			 * Subtract 1 tick from tvtohz() since this isn't
1442 			 * a one-shot timer.
1443 			 */
1444 			if ((error = itimerfix(tv)) == 0)
1445 				d->bd_rtout = tvtohz(tv) - 1;
1446 			break;
1447 		}
1448 
1449 	/*
1450 	 * Get read timeout.
1451 	 */
1452 	case BIOCGRTIMEOUT:
1453 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1454 	case BIOCGRTIMEOUT32:
1455 #endif
1456 		{
1457 			struct timeval *tv;
1458 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1459 			struct timeval32 *tv32;
1460 			struct timeval tv64;
1461 
1462 			if (cmd == BIOCGRTIMEOUT32)
1463 				tv = &tv64;
1464 			else
1465 #endif
1466 				tv = (struct timeval *)addr;
1467 
1468 			tv->tv_sec = d->bd_rtout / hz;
1469 			tv->tv_usec = (d->bd_rtout % hz) * tick;
1470 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1471 			if (cmd == BIOCGRTIMEOUT32) {
1472 				tv32 = (struct timeval32 *)addr;
1473 				tv32->tv_sec = tv->tv_sec;
1474 				tv32->tv_usec = tv->tv_usec;
1475 			}
1476 #endif
1477 
1478 			break;
1479 		}
1480 
1481 	/*
1482 	 * Get packet stats.
1483 	 */
1484 	case BIOCGSTATS:
1485 		{
1486 			struct bpf_stat *bs = (struct bpf_stat *)addr;
1487 
1488 			/* XXXCSJP overflow */
1489 			bs->bs_recv = (u_int)counter_u64_fetch(d->bd_rcount);
1490 			bs->bs_drop = (u_int)counter_u64_fetch(d->bd_dcount);
1491 			break;
1492 		}
1493 
1494 	/*
1495 	 * Set immediate mode.
1496 	 */
1497 	case BIOCIMMEDIATE:
1498 		BPFD_LOCK(d);
1499 		d->bd_flags |= *(u_int *)addr ? BPFD_IMMEDIATE : 0;
1500 		BPFD_UNLOCK(d);
1501 		break;
1502 
1503 	case BIOCVERSION:
1504 		{
1505 			struct bpf_version *bv = (struct bpf_version *)addr;
1506 
1507 			bv->bv_major = BPF_MAJOR_VERSION;
1508 			bv->bv_minor = BPF_MINOR_VERSION;
1509 			break;
1510 		}
1511 
1512 	/*
1513 	 * Get "header already complete" flag
1514 	 */
1515 	case BIOCGHDRCMPLT:
1516 		BPFD_LOCK(d);
1517 		*(u_int *)addr = d->bd_flags & BPFD_HDRCMPLT ? 1 : 0;
1518 		BPFD_UNLOCK(d);
1519 		break;
1520 
1521 	/*
1522 	 * Set "header already complete" flag
1523 	 */
1524 	case BIOCSHDRCMPLT:
1525 		BPFD_LOCK(d);
1526 		d->bd_flags |= *(u_int *)addr ? BPFD_HDRCMPLT : 0;
1527 		BPFD_UNLOCK(d);
1528 		break;
1529 
1530 	/*
1531 	 * Get packet direction flag
1532 	 */
1533 	case BIOCGDIRECTION:
1534 		BPFD_LOCK(d);
1535 		*(u_int *)addr = d->bd_direction;
1536 		BPFD_UNLOCK(d);
1537 		break;
1538 
1539 	/*
1540 	 * Set packet direction flag
1541 	 */
1542 	case BIOCSDIRECTION:
1543 		{
1544 			u_int	direction;
1545 
1546 			direction = *(u_int *)addr;
1547 			switch (direction) {
1548 			case BPF_D_IN:
1549 			case BPF_D_INOUT:
1550 			case BPF_D_OUT:
1551 				BPFD_LOCK(d);
1552 				d->bd_direction = direction;
1553 				BPFD_UNLOCK(d);
1554 				break;
1555 			default:
1556 				error = EINVAL;
1557 			}
1558 		}
1559 		break;
1560 
1561 	/*
1562 	 * Get packet timestamp format and resolution.
1563 	 */
1564 	case BIOCGTSTAMP:
1565 		BPFD_LOCK(d);
1566 		*(u_int *)addr = d->bd_tstamp;
1567 		BPFD_UNLOCK(d);
1568 		break;
1569 
1570 	/*
1571 	 * Set packet timestamp format and resolution.
1572 	 */
1573 	case BIOCSTSTAMP:
1574 		{
1575 			u_int	func;
1576 
1577 			func = *(u_int *)addr;
1578 			if (BPF_T_VALID(func))
1579 				d->bd_tstamp = func;
1580 			else
1581 				error = EINVAL;
1582 		}
1583 		break;
1584 
1585 	case BIOCFEEDBACK:
1586 		BPFD_LOCK(d);
1587 		d->bd_flags |= *(u_int *)addr ? BPFD_FEEDBACK : 0;
1588 		BPFD_UNLOCK(d);
1589 		break;
1590 
1591 	case BIOCLOCK:
1592 		BPFD_LOCK(d);
1593 		d->bd_flags |= BPFD_LOCKED;
1594 		BPFD_UNLOCK(d);
1595 		break;
1596 
1597 	case FIONBIO:		/* Non-blocking I/O */
1598 		break;
1599 
1600 	case FIOASYNC:		/* Send signal on receive packets */
1601 		BPFD_LOCK(d);
1602 		d->bd_flags |= *(u_int *)addr ? BPFD_ASYNC : 0;
1603 		BPFD_UNLOCK(d);
1604 		break;
1605 
1606 	case FIOSETOWN:
1607 		/*
1608 		 * XXX: Add some sort of locking here?
1609 		 * fsetown() can sleep.
1610 		 */
1611 		error = fsetown(*(int *)addr, &d->bd_sigio);
1612 		break;
1613 
1614 	case FIOGETOWN:
1615 		BPFD_LOCK(d);
1616 		*(int *)addr = fgetown(&d->bd_sigio);
1617 		BPFD_UNLOCK(d);
1618 		break;
1619 
1620 	/* This is deprecated, FIOSETOWN should be used instead. */
1621 	case TIOCSPGRP:
1622 		error = fsetown(-(*(int *)addr), &d->bd_sigio);
1623 		break;
1624 
1625 	/* This is deprecated, FIOGETOWN should be used instead. */
1626 	case TIOCGPGRP:
1627 		*(int *)addr = -fgetown(&d->bd_sigio);
1628 		break;
1629 
1630 	case BIOCSRSIG:		/* Set receive signal */
1631 		{
1632 			u_int sig;
1633 
1634 			sig = *(u_int *)addr;
1635 
1636 			if (sig >= NSIG)
1637 				error = EINVAL;
1638 			else {
1639 				BPFD_LOCK(d);
1640 				d->bd_sig = sig;
1641 				BPFD_UNLOCK(d);
1642 			}
1643 			break;
1644 		}
1645 	case BIOCGRSIG:
1646 		BPFD_LOCK(d);
1647 		*(u_int *)addr = d->bd_sig;
1648 		BPFD_UNLOCK(d);
1649 		break;
1650 
1651 	case BIOCGETBUFMODE:
1652 		BPFD_LOCK(d);
1653 		*(u_int *)addr = d->bd_bufmode;
1654 		BPFD_UNLOCK(d);
1655 		break;
1656 
1657 	case BIOCSETBUFMODE:
1658 		/*
1659 		 * Allow the buffering mode to be changed as long as we
1660 		 * haven't yet committed to a particular mode.  Our
1661 		 * definition of commitment, for now, is whether or not a
1662 		 * buffer has been allocated or an interface attached, since
1663 		 * that's the point where things get tricky.
1664 		 */
1665 		switch (*(u_int *)addr) {
1666 		case BPF_BUFMODE_BUFFER:
1667 			break;
1668 
1669 		case BPF_BUFMODE_ZBUF:
1670 			if (bpf_zerocopy_enable)
1671 				break;
1672 			/* FALLTHROUGH */
1673 
1674 		default:
1675 			CURVNET_RESTORE();
1676 			return (EINVAL);
1677 		}
1678 
1679 		BPFD_LOCK(d);
1680 		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
1681 		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
1682 			BPFD_UNLOCK(d);
1683 			CURVNET_RESTORE();
1684 			return (EBUSY);
1685 		}
1686 		d->bd_bufmode = *(u_int *)addr;
1687 		BPFD_UNLOCK(d);
1688 		break;
1689 
1690 	case BIOCGETZMAX:
1691 		error = bpf_ioctl_getzmax(td, d, (size_t *)addr);
1692 		break;
1693 
1694 	case BIOCSETZBUF:
1695 		error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr);
1696 		break;
1697 
1698 	case BIOCROTZBUF:
1699 		error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr);
1700 		break;
1701 
1702 	case BIOCSETVLANPCP:
1703 		{
1704 			u_int pcp;
1705 
1706 			pcp = *(u_int *)addr;
1707 			if (pcp > BPF_PRIO_MAX || pcp < 0) {
1708 				error = EINVAL;
1709 				break;
1710 			}
1711 			d->bd_pcp = pcp;
1712 			break;
1713 		}
1714 	}
1715 	CURVNET_RESTORE();
1716 	return (error);
1717 }
1718 
1719 /*
1720  * Return list of available tapping points, or report how much space is
1721  * required for a successful return.
1722  */
1723 static int
1724 bpf_getiflist(struct bpf_iflist *bi)
1725 {
1726 	struct bpf_if *bp;
1727 	u_int allsize, size, cnt;
1728 	char *uaddr;
1729 
1730 	BPF_LOCK();
1731 
1732 	cnt = allsize = size = 0;
1733 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
1734 		allsize += strlen(bp->bif_name) + 1;
1735 		if (++cnt == bi->bi_count)
1736 			size = allsize;
1737 	}
1738 	if (size == 0)
1739 		size = allsize;
1740 
1741 	if (bi->bi_size == 0) {
1742 		BPF_UNLOCK();
1743 		bi->bi_size = size;
1744 		bi->bi_count = cnt;
1745 		return (0);
1746 	} else if (bi->bi_size < size) {
1747 		BPF_UNLOCK();
1748 		return (ENOSPC);
1749 	}
1750 
1751 	uaddr = bi->bi_ubuf;
1752 	cnt = 0;
1753 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
1754 		u_int len;
1755 		int error;
1756 
1757 		len = strlen(bp->bif_name) + 1;
1758 		if ((error = copyout(bp->bif_name, uaddr, len)) != 0) {
1759 			BPF_UNLOCK();
1760 			return (error);
1761 		}
1762 		if (++cnt == bi->bi_count)
1763 			break;
1764 		uaddr += len;
1765 	}
1766 	BPF_UNLOCK();
1767 	bi->bi_count = cnt;
1768 
1769 	return (0);
1770 }
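/*
 * Illustrative two-pass use of BIOCGETIFLIST from userland (a sketch;
 * the bi_size/bi_count/bi_ubuf fields are the ones consumed by the
 * function above): call once with bi_size == 0 to learn the required
 * buffer size and the number of tap points, then allocate and call
 * again to fetch the NUL-terminated names, one after another.
 *
 *	struct bpf_iflist bi = { 0 };
 *
 *	ioctl(fd, BIOCGETIFLIST, &bi);		// bi.bi_size, bi.bi_count set
 *	bi.bi_ubuf = malloc(bi.bi_size);
 *	ioctl(fd, BIOCGETIFLIST, &bi);		// names copied out
 */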
1771 
1772 /*
1773  * Set d's packet filter program to fp. If this file already has a filter,
1774  * free it and replace it. Returns EINVAL for bogus requests.
1775  *
1776  * Note we use global lock here to serialize bpf_setf() and bpf_setif()
1777  * calls.
1778  */
1779 static int
1780 bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
1781 {
1782 #ifdef COMPAT_FREEBSD32
1783 	struct bpf_program fp_swab;
1784 	struct bpf_program32 *fp32;
1785 #endif
1786 	struct bpf_program_buffer *fcode;
1787 	struct bpf_insn *filter;
1788 #ifdef BPF_JITTER
1789 	bpf_jit_filter *jfunc;
1790 #endif
1791 	size_t size;
1792 	u_int flen;
1793 	bool track_event;
1794 
1795 #ifdef COMPAT_FREEBSD32
1796 	switch (cmd) {
1797 	case BIOCSETF32:
1798 	case BIOCSETWF32:
1799 	case BIOCSETFNR32:
1800 		fp32 = (struct bpf_program32 *)fp;
1801 		fp_swab.bf_len = fp32->bf_len;
1802 		fp_swab.bf_insns =
1803 		    (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
1804 		fp = &fp_swab;
1805 		switch (cmd) {
1806 		case BIOCSETF32:
1807 			cmd = BIOCSETF;
1808 			break;
1809 		case BIOCSETWF32:
1810 			cmd = BIOCSETWF;
1811 			break;
1812 		}
1813 		break;
1814 	}
1815 #endif
1816 
1817 	filter = NULL;
1818 #ifdef BPF_JITTER
1819 	jfunc = NULL;
1820 #endif
1821 	/*
1822 	 * Check the new filter's validity before acquiring any locks.
1823 	 * Allocate memory for new filter, if needed.
1824 	 */
1825 	flen = fp->bf_len;
1826 	if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0))
1827 		return (EINVAL);
1828 	size = flen * sizeof(*fp->bf_insns);
1829 	if (size > 0) {
1830 		/* We're setting up new filter. Copy and check actual data. */
1831 		fcode = bpf_program_buffer_alloc(size, M_WAITOK);
1832 		filter = (struct bpf_insn *)fcode->buffer;
1833 		if (copyin(fp->bf_insns, filter, size) != 0 ||
1834 		    !bpf_validate(filter, flen)) {
1835 			free(fcode, M_BPF);
1836 			return (EINVAL);
1837 		}
1838 #ifdef BPF_JITTER
1839 		if (cmd != BIOCSETWF) {
1840 			/*
1841 			 * Filter is copied inside fcode and is
1842 			 * perfectly valid.
1843 			 */
1844 			jfunc = bpf_jitter(filter, flen);
1845 		}
1846 #endif
1847 	}
1848 
1849 	track_event = false;
1850 	fcode = NULL;
1851 
1852 	BPF_LOCK();
1853 	BPFD_LOCK(d);
1854 	/* Set up new filter. */
1855 	if (cmd == BIOCSETWF) {
1856 		if (d->bd_wfilter != NULL) {
1857 			fcode = __containerof((void *)d->bd_wfilter,
1858 			    struct bpf_program_buffer, buffer);
1859 #ifdef BPF_JITTER
1860 			fcode->func = NULL;
1861 #endif
1862 		}
1863 		d->bd_wfilter = filter;
1864 	} else {
1865 		if (d->bd_rfilter != NULL) {
1866 			fcode = __containerof((void *)d->bd_rfilter,
1867 			    struct bpf_program_buffer, buffer);
1868 #ifdef BPF_JITTER
1869 			fcode->func = d->bd_bfilter;
1870 #endif
1871 		}
1872 		d->bd_rfilter = filter;
1873 #ifdef BPF_JITTER
1874 		d->bd_bfilter = jfunc;
1875 #endif
1876 		if (cmd == BIOCSETF)
1877 			reset_d(d);
1878 
1879 		if (bpf_check_upgrade(cmd, d, filter, flen) != 0) {
1880 			/*
1881 			 * A filter can be set several times without
1882 			 * specifying an interface. In this case just mark
1883 			 * d as a reader.
1884 			 */
1885 			d->bd_writer = 0;
1886 			if (d->bd_bif != NULL) {
1887 				/*
1888 				 * Remove descriptor from writers-only list
1889 				 * and add it to active readers list.
1890 				 */
1891 				CK_LIST_REMOVE(d, bd_next);
1892 				CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist,
1893 				    d, bd_next);
1894 				CTR2(KTR_NET,
1895 				    "%s: upgrade required by pid %d",
1896 				    __func__, d->bd_pid);
1897 				track_event = true;
1898 			}
1899 		}
1900 	}
1901 	BPFD_UNLOCK(d);
1902 
1903 	if (fcode != NULL)
1904 		NET_EPOCH_CALL(bpf_program_buffer_free, &fcode->epoch_ctx);
1905 
1906 	if (track_event)
1907 		bif_attachd(d->bd_bif);
1908 
1909 	BPF_UNLOCK();
1910 	return (0);
1911 }
1912 
1913 /*
1914  * Attach descriptor to a tap point, possibly detaching from the old one,
1915  * and reset the counters.
1916  * XXXGL: this KPI is subject to change
1917  */
1918 static int
1919 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
1920 {
1921 	bool writer;
1922 
1923 	BPF_LOCK_ASSERT();
1924 
1925 	/*
1926 	 * At this point, we expect the buffer is already allocated.  If not,
1927 	 * return an error.
1928 	 */
1929 	switch (d->bd_bufmode) {
1930 	case BPF_BUFMODE_BUFFER:
1931 	case BPF_BUFMODE_ZBUF:
1932 		if (d->bd_sbuf == NULL)
1933 			return (EINVAL);
1934 		break;
1935 
1936 	default:
1937 		panic("%s: bufmode %d", __func__, d->bd_bufmode);
1938 	}
1939 
1940 	if (bp == d->bd_bif) {
1941 		BPFD_LOCK(d);
1942 		reset_d(d);
1943 		BPFD_UNLOCK(d);
1944 		return (0);
1945 	} else if (d->bd_bif != NULL)
1946 		bpf_detachd(d, false);
1947 
1948 	/*
1949 	 * Save sysctl value to protect from sysctl change between reads.
1950 	 */
1951 	writer = V_bpf_optimize_writers || (d->bd_writer > 0);
1952 
1953 	/*
1954 	 * Point d at bp, and add d to the interface's list.
1955 	 * Since there are many applications using BPF for
1956 	 * sending raw packets only (dhcpd, cdpd are good examples)
1957 	 * we can delay adding d to the list of active listeners until
1958 	 * some filter is configured.
1959 	 */
1960 	BPFD_LOCK(d);
1961 	/*
1962 	 * Hold a reference to the bpf_if while the descriptor uses this interface.
1963 	 */
1964 	bpfif_ref(bp);
1965 	d->bd_bif = bp;
1966 	if (writer) {
1967 		/* Add to writers-only list */
1968 		CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
1969 		/*
1970 		 * We decrement bd_writer on every filter set operation.
1971 		 * First BIOCSETF is done by pcap_open_live() to set up
1972 		 * snap length. After that application usually sets its own
1973 		 * snap length. After that the application usually sets its own
1974 		 */
1975 		d->bd_writer = 2;
1976 	} else
1977 		CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
1978 
1979 	reset_d(d);
1980 
1981 	/* Trigger EVFILT_WRITE events. */
1982 	bpf_wakeup(d);
1983 
1984 	BPFD_UNLOCK(d);
1985 
1986 	CTR3(KTR_NET, "%s: called by pid %d, adding to %s list",
1987 	    __func__, d->bd_pid, d->bd_writer ? "writer" : "active");
1988 
1989 	if (!writer)
1990 		bif_attachd(bp);
1991 
1992 	return (0);
1993 }
1994 
1995 /*
1996  * Support for select() and poll() system calls
1997  *
1998  * Return true iff the specific operation will not block indefinitely.
1999  * Otherwise, return false but make a note that a selwakeup() must be done.
2000  */
2001 static int
2002 bpfpoll(struct cdev *dev, int events, struct thread *td)
2003 {
2004 	struct bpf_d *d;
2005 	int revents;
2006 
2007 	if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL)
2008 		return (events &
2009 		    (POLLHUP | POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM));
2010 
2011 	/*
2012 	 * Refresh PID associated with this descriptor.
2013 	 */
2014 	revents = events & (POLLOUT | POLLWRNORM);
2015 	BPFD_LOCK(d);
2016 	BPF_PID_REFRESH(d, td);
2017 	if (events & (POLLIN | POLLRDNORM)) {
2018 		if (bpf_ready(d))
2019 			revents |= events & (POLLIN | POLLRDNORM);
2020 		else {
2021 			selrecord(td, &d->bd_sel);
2022 			/* Start the read timeout if necessary. */
2023 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2024 				callout_reset(&d->bd_callout, d->bd_rtout,
2025 				    bpf_timed_out, d);
2026 				d->bd_state = BPF_WAITING;
2027 			}
2028 		}
2029 	}
2030 	BPFD_UNLOCK(d);
2031 	return (revents);
2032 }
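
/*
 * Illustrative sketch (editorial addition, not kernel code): a consumer of
 * the bd_rtout path above typically arms the read timeout and then blocks
 * in poll(); "buf" and "buflen" are placeholders, and buflen should match
 * the size reported by BIOCGBLEN.
 *
 *	struct timeval tv = { 1, 0 };
 *	ioctl(fd, BIOCSRTIMEOUT, &tv);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN) != 0)
 *		n = read(fd, buf, buflen);
 */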
2033 
2034 /*
2035  * Support for kevent() system call.  Register EVFILT_READ and
2036  * EVFILT_WRITE filters and reject all others.
2037  */
2038 int
2039 bpfkqfilter(struct cdev *dev, struct knote *kn)
2040 {
2041 	struct bpf_d *d;
2042 
2043 	if (devfs_get_cdevpriv((void **)&d) != 0)
2044 		return (1);
2045 
2046 	switch (kn->kn_filter) {
2047 	case EVFILT_READ:
2048 		kn->kn_fop = &bpfread_filtops;
2049 		break;
2050 
2051 	case EVFILT_WRITE:
2052 		kn->kn_fop = &bpfwrite_filtops;
2053 		break;
2054 
2055 	default:
2056 		return (1);
2057 	}
2058 
2059 	/*
2060 	 * Refresh PID associated with this descriptor.
2061 	 */
2062 	BPFD_LOCK(d);
2063 	BPF_PID_REFRESH_CUR(d);
2064 	kn->kn_hook = d;
2065 	knlist_add(&d->bd_sel.si_note, kn, 1);
2066 	BPFD_UNLOCK(d);
2067 
2068 	return (0);
2069 }
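
/*
 * Illustrative sketch (editorial addition, not kernel code): registering
 * the EVFILT_READ filter accepted above; error handling omitted.
 *
 *	int kq = kqueue();
 *	struct kevent kev, ev;
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *	if (kevent(kq, NULL, 0, &ev, 1, NULL) > 0)
 *		n = read(fd, buf, buflen);	ev.data counts buffered bytes
 */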
2070 
2071 static void
2072 filt_bpfdetach(struct knote *kn)
2073 {
2074 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2075 
2076 	knlist_remove(&d->bd_sel.si_note, kn, 0);
2077 }
2078 
2079 static int
2080 filt_bpfread(struct knote *kn, long hint)
2081 {
2082 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2083 	int ready;
2084 
2085 	BPFD_LOCK_ASSERT(d);
2086 	ready = bpf_ready(d);
2087 	if (ready) {
2088 		kn->kn_data = d->bd_slen;
2089 		/*
2090 		 * Ignore the hold buffer if it is being copied to user space.
2091 		 */
2092 		if (!(d->bd_flags & BPFD_HBUF_INUSE) && d->bd_hbuf)
2093 			kn->kn_data += d->bd_hlen;
2094 	} else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2095 		callout_reset(&d->bd_callout, d->bd_rtout,
2096 		    bpf_timed_out, d);
2097 		d->bd_state = BPF_WAITING;
2098 	}
2099 
2100 	return (ready);
2101 }
2102 
2103 static int
2104 filt_bpfwrite(struct knote *kn, long hint)
2105 {
2106 	struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2107 
2108 	BPFD_LOCK_ASSERT(d);
2109 
2110 	if (d->bd_bif == NULL) {
2111 		kn->kn_data = 0;
2112 		return (0);
2113 	} else {
2114 		kn->kn_data = bif_wrsize(d->bd_bif);
2115 		return (1);
2116 	}
2117 }
2118 
2119 #define	BPF_TSTAMP_NONE		0
2120 #define	BPF_TSTAMP_FAST		1
2121 #define	BPF_TSTAMP_NORMAL	2
2122 #define	BPF_TSTAMP_EXTERN	3
2123 
2124 static int
2125 bpf_ts_quality(int tstype)
2126 {
2127 
2128 	if (tstype == BPF_T_NONE)
2129 		return (BPF_TSTAMP_NONE);
2130 	if ((tstype & BPF_T_FAST) != 0)
2131 		return (BPF_TSTAMP_FAST);
2132 
2133 	return (BPF_TSTAMP_NORMAL);
2134 }
2135 
2136 static int
2137 bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m)
2138 {
2139 	struct timespec ts;
2140 	struct m_tag *tag;
2141 	int quality;
2142 
2143 	quality = bpf_ts_quality(tstype);
2144 	if (quality == BPF_TSTAMP_NONE)
2145 		return (quality);
2146 
2147 	if (m != NULL) {
2148 		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) {
2149 			mbuf_tstmp2timespec(m, &ts);
2150 			timespec2bintime(&ts, bt);
2151 			return (BPF_TSTAMP_EXTERN);
2152 		}
2153 		tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL);
2154 		if (tag != NULL) {
2155 			*bt = *(struct bintime *)(tag + 1);
2156 			return (BPF_TSTAMP_EXTERN);
2157 		}
2158 	}
2159 	if (quality == BPF_TSTAMP_NORMAL)
2160 		binuptime(bt);
2161 	else
2162 		getbinuptime(bt);
2163 
2164 	return (quality);
2165 }
2166 
2167 /*
2168  * Incoming linkage from device drivers.  Process the packet pkt, of length
2169  * pktlen, which is stored in a contiguous buffer.  The packet is parsed
2170  * by each process' filter, and if accepted, stashed into the corresponding
2171  * buffer.
2172  */
2173 void
2174 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
2175 {
2176 	struct epoch_tracker et;
2177 	struct bintime bt;
2178 	struct bpf_d *d;
2179 #ifdef BPF_JITTER
2180 	bpf_jit_filter *bf;
2181 #endif
2182 	u_int slen;
2183 	int gottime;
2184 
2185 	gottime = BPF_TSTAMP_NONE;
2186 	NET_EPOCH_ENTER(et);
2187 	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2188 		counter_u64_add(d->bd_rcount, 1);
2189 		/*
2190 		 * NB: We don't check the direction here since there
2191 		 * is no way for the caller to indicate to us whether this
2192 		 * packet is inbound or outbound. In the bpf_mtap() routines,
2193 		 * we use the interface pointers on the mbuf to figure it out.
2194 		 */
2195 #ifdef BPF_JITTER
2196 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
2197 		if (bf != NULL)
2198 			slen = (*(bf->func))(pkt, pktlen, pktlen);
2199 		else
2200 #endif
2201 		slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
2202 		if (slen != 0) {
2203 			/*
2204 			 * The filter matches; acquire the write lock.
2205 			 */
2206 			BPFD_LOCK(d);
2207 			counter_u64_add(d->bd_fcount, 1);
2208 			if (gottime < bpf_ts_quality(d->bd_tstamp))
2209 				gottime = bpf_gettime(&bt, d->bd_tstamp,
2210 				    NULL);
2211 #ifdef MAC
2212 			if (bif_mac_check_receive(bp, d) == 0)
2213 #endif
2214 				catchpacket(d, pkt, pktlen, slen,
2215 				    bpf_append_bytes, &bt);
2216 			BPFD_UNLOCK(d);
2217 		}
2218 	}
2219 	NET_EPOCH_EXIT(et);
2220 }
2221 
2222 void
2223 bpf_tap_if(if_t ifp, u_char *pkt, u_int pktlen)
2224 {
2225 	if (bpf_peers_present(ifp->if_bpf))
2226 		bpf_tap(ifp->if_bpf, pkt, pktlen);
2227 }
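
/*
 * Illustrative sketch (editorial addition, not kernel code): a driver whose
 * receive path already has the frame in one contiguous buffer can hand it
 * to listeners with a single call; "sc", "frame" and "framelen" are
 * hypothetical driver variables.
 *
 *	bpf_tap_if(sc->sc_ifp, frame, framelen);
 */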
2228 
2229 /*
2230  * Incoming linkage from device drivers, when packet is in an mbuf chain.
2231  * Locking model is explained in bpf_tap().
2232  */
2233 void
2234 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
2235 {
2236 	struct epoch_tracker et;
2237 	struct bintime bt;
2238 	struct bpf_d *d;
2239 #ifdef BPF_JITTER
2240 	bpf_jit_filter *bf;
2241 #endif
2242 	u_int pktlen, slen;
2243 	int gottime;
2244 
2245 	/* Skip outgoing duplicate packets. */
2246 	if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) {
2247 		m->m_flags &= ~M_PROMISC;
2248 		return;
2249 	}
2250 
2251 	pktlen = m_length(m, NULL);
2252 	gottime = BPF_TSTAMP_NONE;
2253 
2254 	NET_EPOCH_ENTER(et);
2255 	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2256 		if (bpf_chkdir(d, m))
2257 			continue;
2258 		counter_u64_add(d->bd_rcount, 1);
2259 #ifdef BPF_JITTER
2260 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
2261 		/* XXX We cannot handle multiple mbufs. */
2262 		if (bf != NULL && m->m_next == NULL)
2263 			slen = (*(bf->func))(mtod(m, u_char *), pktlen,
2264 			    pktlen);
2265 		else
2266 #endif
2267 		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
2268 		if (slen != 0) {
2269 			BPFD_LOCK(d);
2270 
2271 			counter_u64_add(d->bd_fcount, 1);
2272 			if (gottime < bpf_ts_quality(d->bd_tstamp))
2273 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
2274 #ifdef MAC
2275 			if (bif_mac_check_receive(bp, d) == 0)
2276 #endif
2277 				catchpacket(d, (u_char *)m, pktlen, slen,
2278 				    bpf_append_mbuf, &bt);
2279 			BPFD_UNLOCK(d);
2280 		}
2281 	}
2282 	NET_EPOCH_EXIT(et);
2283 }
2284 
2285 void
2286 bpf_mtap_if(if_t ifp, struct mbuf *m)
2287 {
2288 	if (bpf_peers_present(ifp->if_bpf)) {
2289 		M_ASSERTVALID(m);
2290 		bpf_mtap(ifp->if_bpf, m);
2291 	}
2292 }
2293 
2294 /*
2295  * Incoming linkage from device drivers, when packet is in
2296  * an mbuf chain and to be prepended by a contiguous header.
2297  */
2298 void
2299 bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
2300 {
2301 	struct epoch_tracker et;
2302 	struct bintime bt;
2303 	struct mbuf mb;
2304 	struct bpf_d *d;
2305 	u_int pktlen, slen;
2306 	int gottime;
2307 
2308 	/* Skip outgoing duplicate packets. */
2309 	if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
2310 		m->m_flags &= ~M_PROMISC;
2311 		return;
2312 	}
2313 
2314 	pktlen = m_length(m, NULL);
2315 	/*
2316 	 * Craft an on-stack mbuf suitable for passing to bpf_filter.
2317 	 * Note that we cut corners here; we only set up what's
2318 	 * absolutely needed--this mbuf should never go anywhere else.
2319 	 */
2320 	mb.m_flags = 0;
2321 	mb.m_next = m;
2322 	mb.m_data = data;
2323 	mb.m_len = dlen;
2324 	pktlen += dlen;
2325 
2326 	gottime = BPF_TSTAMP_NONE;
2327 
2328 	NET_EPOCH_ENTER(et);
2329 	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2330 		if (bpf_chkdir(d, m))
2331 			continue;
2332 		counter_u64_add(d->bd_rcount, 1);
2333 		slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
2334 		if (slen != 0) {
2335 			BPFD_LOCK(d);
2336 
2337 			counter_u64_add(d->bd_fcount, 1);
2338 			if (gottime < bpf_ts_quality(d->bd_tstamp))
2339 				gottime = bpf_gettime(&bt, d->bd_tstamp, m);
2340 #ifdef MAC
2341 			if (bif_mac_check_receive(bp, d) == 0)
2342 #endif
2343 				catchpacket(d, (u_char *)&mb, pktlen, slen,
2344 				    bpf_append_mbuf, &bt);
2345 			BPFD_UNLOCK(d);
2346 		}
2347 	}
2348 	NET_EPOCH_EXIT(et);
2349 }
2350 
2351 void
2352 bpf_mtap2_if(if_t ifp, void *data, u_int dlen, struct mbuf *m)
2353 {
2354 	if (bpf_peers_present(ifp->if_bpf)) {
2355 		M_ASSERTVALID(m);
2356 		bpf_mtap2(ifp->if_bpf, data, dlen, m);
2357 	}
2358 }
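
/*
 * Illustrative sketch (editorial addition, not kernel code): bpf_mtap2()
 * suits drivers that build a capture pseudo-header separately from the
 * mbuf, radiotap being the classic case; "rth" is a hypothetical header
 * filled in by the driver.
 *
 *	struct my_radiotap_header rth;
 *	... fill in rth ...
 *	bpf_mtap2_if(ifp, &rth, sizeof(rth), m);
 */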
2359 
2360 #undef	BPF_TSTAMP_NONE
2361 #undef	BPF_TSTAMP_FAST
2362 #undef	BPF_TSTAMP_NORMAL
2363 #undef	BPF_TSTAMP_EXTERN
2364 
2365 static int
2366 bpf_hdrlen(struct bpf_d *d)
2367 {
2368 	int hdrlen;
2369 
2370 	hdrlen = d->bd_bif->bif_hdrlen;
2371 #ifndef BURN_BRIDGES
2372 	if (d->bd_tstamp == BPF_T_NONE ||
2373 	    BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME)
2374 #ifdef COMPAT_FREEBSD32
2375 		if (d->bd_compat32)
2376 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32);
2377 		else
2378 #endif
2379 			hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr);
2380 	else
2381 #endif
2382 		hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr);
2383 #ifdef COMPAT_FREEBSD32
2384 	if (d->bd_compat32)
2385 		hdrlen = BPF_WORDALIGN32(hdrlen);
2386 	else
2387 #endif
2388 		hdrlen = BPF_WORDALIGN(hdrlen);
2389 
2390 	return (hdrlen - d->bd_bif->bif_hdrlen);
2391 }
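
/*
 * Worked example (editorial addition, purely illustrative sizes): with a
 * 4-byte link header, an 18-byte capture header and BPF_ALIGNMENT of 8,
 * the computation above yields BPF_WORDALIGN(4 + 18) - 4 = 24 - 4 = 20,
 * i.e. 20 bytes are reserved in front of each captured packet so that the
 * capture header plus the link header together end on an alignment
 * boundary.
 */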
2392 
2393 static void
2394 bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype)
2395 {
2396 	struct bintime bt2, boottimebin;
2397 	struct timeval tsm;
2398 	struct timespec tsn;
2399 
2400 	if ((tstype & BPF_T_MONOTONIC) == 0) {
2401 		bt2 = *bt;
2402 		getboottimebin(&boottimebin);
2403 		bintime_add(&bt2, &boottimebin);
2404 		bt = &bt2;
2405 	}
2406 	switch (BPF_T_FORMAT(tstype)) {
2407 	case BPF_T_MICROTIME:
2408 		bintime2timeval(bt, &tsm);
2409 		ts->bt_sec = tsm.tv_sec;
2410 		ts->bt_frac = tsm.tv_usec;
2411 		break;
2412 	case BPF_T_NANOTIME:
2413 		bintime2timespec(bt, &tsn);
2414 		ts->bt_sec = tsn.tv_sec;
2415 		ts->bt_frac = tsn.tv_nsec;
2416 		break;
2417 	case BPF_T_BINTIME:
2418 		ts->bt_sec = bt->sec;
2419 		ts->bt_frac = bt->frac;
2420 		break;
2421 	}
2422 }
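
/*
 * Illustrative sketch (editorial addition, not kernel code): the format
 * consumed above is selected per descriptor from userland, e.g. monotonic
 * nanosecond stamps:
 *
 *	u_int tstype = BPF_T_NANOTIME | BPF_T_MONOTONIC;
 *	ioctl(fd, BIOCSTSTAMP, &tstype);
 */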
2423 
2424 /*
2425  * Move the packet data from interface memory (pkt) into the
2426  * store buffer.  "cpfn" is the routine called to do the actual data
2427  * transfer.  bcopy is passed in to copy contiguous chunks, while
2428  * transfer.  bpf_append_bytes is passed in to copy contiguous chunks, while
2429  * pkt is really an mbuf.
2430  */
2431 static void
2432 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
2433     void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
2434     struct bintime *bt)
2435 {
2436 	static char zeroes[BPF_ALIGNMENT];
2437 	struct bpf_xhdr hdr;
2438 #ifndef BURN_BRIDGES
2439 	struct bpf_hdr hdr_old;
2440 #ifdef COMPAT_FREEBSD32
2441 	struct bpf_hdr32 hdr32_old;
2442 #endif
2443 #endif
2444 	int caplen, curlen, hdrlen, pad, totlen;
2445 	int do_wakeup = 0;
2446 	int do_timestamp;
2447 	int tstype;
2448 
2449 	BPFD_LOCK_ASSERT(d);
2450 	if (d->bd_bif == NULL) {
2451 		/* Descriptor was detached in concurrent thread */
2452 		counter_u64_add(d->bd_dcount, 1);
2453 		return;
2454 	}
2455 
2456 	/*
2457 	 * Detect whether user space has released a buffer back to us, and if
2458 	 * so, move it from being a hold buffer to a free buffer.  This may
2459 	 * not be the best place to do it (for example, we might only want to
2460 	 * run this check if we need the space), but for now it's a reliable
2461 	 * spot to do it.
2462 	 */
2463 	if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
2464 		d->bd_fbuf = d->bd_hbuf;
2465 		d->bd_hbuf = NULL;
2466 		d->bd_hlen = 0;
2467 		bpf_buf_reclaimed(d);
2468 	}
2469 
2470 	/*
2471 	 * Figure out how many bytes to move.  If the packet is
2472 	 * greater than or equal to the snapshot length, transfer that
2473 	 * much.  Otherwise, transfer the whole packet (unless
2474 	 * we hit the buffer size limit).
2475 	 */
2476 	hdrlen = bpf_hdrlen(d);
2477 	totlen = hdrlen + min(snaplen, pktlen);
2478 	if (totlen > d->bd_bufsize)
2479 		totlen = d->bd_bufsize;
2480 
2481 	/*
2482 	 * Round up the end of the previous packet to the next longword.
2483 	 *
2484 	 * Drop the packet if there's no room and no hope of room.
2485 	 * If the packet would overflow the storage buffer or the storage
2486 	 * buffer is considered immutable by the buffer model, try to rotate
2487 	 * the buffer and wake up pending processes.
2488 	 */
2489 #ifdef COMPAT_FREEBSD32
2490 	if (d->bd_compat32)
2491 		curlen = BPF_WORDALIGN32(d->bd_slen);
2492 	else
2493 #endif
2494 		curlen = BPF_WORDALIGN(d->bd_slen);
2495 	if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
2496 		if (d->bd_fbuf == NULL) {
2497 			/*
2498 			 * There's no room in the store buffer, and no
2499 			 * prospect of room, so drop the packet.  Notify the
2500 			 * buffer model.
2501 			 */
2502 			bpf_buffull(d);
2503 			counter_u64_add(d->bd_dcount, 1);
2504 			return;
2505 		}
2506 		KASSERT(!(d->bd_flags & BPFD_HBUF_INUSE),
2507 		    ("hold buffer is in use"));
2508 		ROTATE_BUFFERS(d);
2509 		do_wakeup = 1;
2510 		curlen = 0;
2511 	} else {
2512 		if ((d->bd_flags & BPFD_IMMEDIATE) ||
2513 		    d->bd_state == BPF_TIMED_OUT) {
2514 			/*
2515 			 * Immediate mode is set, or the read timeout has
2516 			 * already expired during a select call.  A packet
2517 			 * arrived, so the reader should be woken up.
2518 			 */
2519 			do_wakeup = 1;
2520 		}
2521 		pad = curlen - d->bd_slen;
2522 		KASSERT(pad >= 0 && pad <= sizeof(zeroes),
2523 		    ("%s: invalid pad byte count %d", __func__, pad));
2524 		if (pad > 0) {
2525 			/* Zero pad bytes. */
2526 			bpf_append_bytes(d, d->bd_sbuf, d->bd_slen, zeroes,
2527 			    pad);
2528 		}
2529 	}
2530 
2531 	caplen = totlen - hdrlen;
2532 	tstype = d->bd_tstamp;
2533 	do_timestamp = tstype != BPF_T_NONE;
2534 #ifndef BURN_BRIDGES
2535 	if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) {
2536 		struct bpf_ts ts;
2537 		if (do_timestamp)
2538 			bpf_bintime2ts(bt, &ts, tstype);
2539 #ifdef COMPAT_FREEBSD32
2540 		if (d->bd_compat32) {
2541 			bzero(&hdr32_old, sizeof(hdr32_old));
2542 			if (do_timestamp) {
2543 				hdr32_old.bh_tstamp.tv_sec = ts.bt_sec;
2544 				hdr32_old.bh_tstamp.tv_usec = ts.bt_frac;
2545 			}
2546 			hdr32_old.bh_datalen = pktlen;
2547 			hdr32_old.bh_hdrlen = hdrlen;
2548 			hdr32_old.bh_caplen = caplen;
2549 			bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old,
2550 			    sizeof(hdr32_old));
2551 			goto copy;
2552 		}
2553 #endif
2554 		bzero(&hdr_old, sizeof(hdr_old));
2555 		if (do_timestamp) {
2556 			hdr_old.bh_tstamp.tv_sec = ts.bt_sec;
2557 			hdr_old.bh_tstamp.tv_usec = ts.bt_frac;
2558 		}
2559 		hdr_old.bh_datalen = pktlen;
2560 		hdr_old.bh_hdrlen = hdrlen;
2561 		hdr_old.bh_caplen = caplen;
2562 		bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old,
2563 		    sizeof(hdr_old));
2564 		goto copy;
2565 	}
2566 #endif
2567 
2568 	/*
2569 	 * Append the bpf header.  Note we append the actual header size, but
2570 	 * move forward the length of the header plus padding.
2571 	 */
2572 	bzero(&hdr, sizeof(hdr));
2573 	if (do_timestamp)
2574 		bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype);
2575 	hdr.bh_datalen = pktlen;
2576 	hdr.bh_hdrlen = hdrlen;
2577 	hdr.bh_caplen = caplen;
2578 	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
2579 
2580 	/*
2581 	 * Copy the packet data into the store buffer and update its length.
2582 	 */
2583 #ifndef BURN_BRIDGES
2584 copy:
2585 #endif
2586 	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen);
2587 	d->bd_slen = curlen + totlen;
2588 
2589 	if (do_wakeup)
2590 		bpf_wakeup(d);
2591 }
2592 
2593 /*
2594  * Free buffers currently in use by a descriptor.
2595  * Called on close.
2596  */
2597 static void
2598 bpfd_free(epoch_context_t ctx)
2599 {
2600 	struct bpf_d *d;
2601 	struct bpf_program_buffer *p;
2602 
2603 	/*
2604 	 * We don't need to lock out interrupts since this descriptor has
2605  * been detached from its interface and it hasn't yet been marked
2606  * free.
2607 	 */
2608 	d = __containerof(ctx, struct bpf_d, epoch_ctx);
2609 	bpf_free(d);
2610 	if (d->bd_rfilter != NULL) {
2611 		p = __containerof((void *)d->bd_rfilter,
2612 		    struct bpf_program_buffer, buffer);
2613 #ifdef BPF_JITTER
2614 		p->func = d->bd_bfilter;
2615 #endif
2616 		bpf_program_buffer_free(&p->epoch_ctx);
2617 	}
2618 	if (d->bd_wfilter != NULL) {
2619 		p = __containerof((void *)d->bd_wfilter,
2620 		    struct bpf_program_buffer, buffer);
2621 #ifdef BPF_JITTER
2622 		p->func = NULL;
2623 #endif
2624 		bpf_program_buffer_free(&p->epoch_ctx);
2625 	}
2626 
2627 	mtx_destroy(&d->bd_lock);
2628 	counter_u64_free(d->bd_rcount);
2629 	counter_u64_free(d->bd_dcount);
2630 	counter_u64_free(d->bd_fcount);
2631 	counter_u64_free(d->bd_wcount);
2632 	counter_u64_free(d->bd_wfcount);
2633 	counter_u64_free(d->bd_wdcount);
2634 	counter_u64_free(d->bd_zcopy);
2635 	free(d, M_BPF);
2636 }
2637 
2638 /*
2639  * Attach a tap point to bpf.
2640  * XXX: with the current KPI it is the consumer's responsibility to avoid duplicates.
2641  */
2642 struct bpf_if *
2643 bpf_attach(const char *name, u_int dlt, u_int hdrlen,
2644     const struct bif_methods *methods, void *sc)
2645 {
2646 	struct bpf_if *bp;
2647 
2648 	bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO);
2649 
2650 	CK_LIST_INIT(&bp->bif_dlist);
2651 	CK_LIST_INIT(&bp->bif_wlist);
2652 	bp->bif_dlt = dlt;
2653 	bp->bif_hdrlen = hdrlen;
2654 	bp->bif_softc = sc;
2655 	bp->bif_name = name;
2656 	bp->bif_methods = methods;
2657 	refcount_init(&bp->bif_refcnt, 1);
2658 	BPF_LOCK();
2659 	LIST_INSERT_HEAD(&V_bpf_iflist, bp, bif_next);
2660 	BPF_UNLOCK();
2661 
2662 	return (bp);
2663 }
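
/*
 * Illustrative sketch (editorial addition, not kernel code): most drivers
 * do not call bpf_attach() directly but go through the long-standing ifnet
 * wrapper during interface attach, roughly:
 *
 *	bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN);
 *
 * where the DLT names the link type and the last argument becomes
 * bif_hdrlen.
 */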
2664 
2665 #ifdef VIMAGE
2666 /*
2667  * Detach descriptors on interface's vmove event.
2668  * XXXGL: shouldn't be a special case, but a full detach.
2669  */
2670 void
2671 bpf_ifdetach(struct ifnet *ifp)
2672 {
2673 	struct bpf_if *bp;
2674 	struct bpf_d *d;
2675 
2676 	BPF_LOCK();
2677 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2678 		/* XXXGL: assuming softc is ifnet here */
2679 		if (bp->bif_softc != ifp)
2680 			continue;
2681 
2682 		/* Detach common descriptors */
2683 		while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) {
2684 			bpf_detachd(d, true);
2685 		}
2686 
2687 		/* Detach writer-only descriptors */
2688 		while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) {
2689 			bpf_detachd(d, true);
2690 		}
2691 	}
2692 	BPF_UNLOCK();
2693 }
2694 #endif
2695 
2696 /*
2697  * Detach bpf tap point.  This involves detaching each descriptor associated
2698  * with the interface.  Notify each descriptor as it's detached so that any
2699  * sleepers wake up and get ENXIO.
2700  */
2701 void
2702 bpf_detach(struct bpf_if *bp)
2703 {
2704 	struct bpf_d *d;
2705 
2706 	BPF_LOCK();
2707 	LIST_REMOVE(bp, bif_next);
2708 
2709 	CTR3(KTR_NET, "%s: scheduling free for encap %d for bp %p",
2710 	    __func__, bp->bif_dlt, bp);
2711 
2712 	/* Detach common descriptors */
2713 	while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) {
2714 		bpf_detachd(d, true);
2715 	}
2716 
2717 	/* Detach writer-only descriptors */
2718 	while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) {
2719 		bpf_detachd(d, true);
2720 	}
2721 	bpfif_rele(bp);
2722 	BPF_UNLOCK();
2723 }
2724 
2725 #ifdef VIMAGE
2726 /*
2727  * Move bpf to a different VNET.  This KPI is a crutch to support if_vmove
2728  * and is not supposed to be used anywhere else.
2729  */
2730 void
2731 bpf_vmove(struct bpf_if *bp)
2732 {
2733 
2734 	BPF_LOCK();
2735 	LIST_REMOVE(bp, bif_next);
2736 	LIST_INSERT_HEAD(&V_bpf_iflist, bp, bif_next);
2737 	BPF_UNLOCK();
2738 }
2739 #endif
2740 
2741 bool
2742 bpf_peers_present_if(struct ifnet *ifp)
2743 {
2744 	return (bpf_peers_present(ifp->if_bpf));
2745 }
2746 
2747 /*
2748  * Get a list of the available data link types of the tap point.  If a tap
2749  * point attaches more than once, it is supposed to attach with different DLTs
2750  * but with the same name pointer.
2751  */
2752 static int
2753 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
2754 {
2755 	const char *name;
2756 	struct bpf_if *bp;
2757 	u_int *lst;
2758 	int error, n, n1;
2759 
2760 	BPF_LOCK_ASSERT();
2761 
2762 	name = d->bd_bif->bif_name;
2763 	n1 = 0;
2764 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2765 		if (bp->bif_name == name)
2766 			n1++;
2767 	}
2768 	if (bfl->bfl_list == NULL) {
2769 		bfl->bfl_len = n1;
2770 		return (0);
2771 	}
2772 	if (n1 > bfl->bfl_len)
2773 		return (ENOMEM);
2774 
2775 	lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK);
2776 	n = 0;
2777 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2778 		if (bp->bif_name != name)
2779 			continue;
2780 		lst[n++] = bp->bif_dlt;
2781 	}
2782 	error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n);
2783 	free(lst, M_TEMP);
2784 	bfl->bfl_len = n;
2785 	return (error);
2786 }
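
/*
 * Illustrative sketch (editorial addition, not kernel code): userland uses
 * the two-pass convention implemented above, first passing a NULL list to
 * learn the count and then an allocated array; error handling omitted.
 *
 *	struct bpf_dltlist bfl;
 *	memset(&bfl, 0, sizeof(bfl));
 *	ioctl(fd, BIOCGDLTLIST, &bfl);
 *	bfl.bfl_list = calloc(bfl.bfl_len, sizeof(u_int));
 *	ioctl(fd, BIOCGDLTLIST, &bfl);	bfl.bfl_list[0..bfl_len-1] now valid
 */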
2787 
2788 /*
2789  * Set the data link type of a BPF descriptor.  The convention is that the
2790  * application first does BIOCSETIF and then BIOCSDLT, thus the descriptor
2791  * is supposed to already be attached.  Only one kernel facility provides
2792  * tapping points with the same name but different DLTs - ieee80211_radiotap.
2793  *
2794  * XXXGL: this function definitely looks suspicious, e.g. it clearly doesn't
2795  * clear promisc on the old bpf_if.  The convention about reference counting
2796  * is also unclear.
2797  */
2798 static int
2799 bpf_setdlt(struct bpf_d *d, u_int dlt)
2800 {
2801 	int error, opromisc;
2802 	const char *name;
2803 	struct bpf_if *bp;
2804 
2805 	BPF_LOCK_ASSERT();
2806 	MPASS(d->bd_bif != NULL);
2807 
2808 	/*
2809 	 * It is safe to check bd_bif without BPFD_LOCK; it cannot be
2810 	 * changed while we hold the global lock.
2811 	 */
2812 	if (d->bd_bif->bif_dlt == dlt)
2813 		return (0);
2814 
2815 	name = d->bd_bif->bif_name;
2816 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2817 		if (bp->bif_name == name && bp->bif_dlt == dlt)
2818 			break;
2819 	}
2820 	if (bp == NULL)
2821 		return (EINVAL);
2822 
2823 	opromisc = d->bd_promisc;
2824 	bpf_detachd(d, false);
2825 	bpf_attachd(d, bp);
2826 	if (opromisc) {
2827 		error = bp->bif_methods->bif_promisc(bp->bif_softc, true);
2828 		if (error)
2829 			printf("%s: bif_promisc on %s failed (%d)\n",
2830 			    __func__, bp->bif_name, error);
2831 		else
2832 			d->bd_promisc = 1;
2833 	}
2834 	return (0);
2835 }
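
/*
 * Illustrative sketch (editorial addition, not kernel code): switching an
 * already attached descriptor to the radiotap tap point mentioned above;
 * "wlan0" (set earlier with BIOCSETIF) is a placeholder interface.
 *
 *	u_int dlt = DLT_IEEE802_11_RADIO;
 *	ioctl(fd, BIOCSDLT, &dlt);
 */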
2836 
2837 static void
2838 bpf_drvinit(void *unused)
2839 {
2840 	struct cdev *dev;
2841 
2842 	sx_init(&bpf_sx, "bpf global lock");
2843 	dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf");
2844 	/* For compatibility */
2845 	make_dev_alias(dev, "bpf0");
2846 }
2847 
2848 /*
2849  * Zero out the various packet counters associated with all of the bpf
2850  * descriptors.  At some point, we will probably want to get a bit more
2851  * granular and allow the user to specify descriptors to be zeroed.
2852  */
2853 static void
2854 bpf_zero_counters(void)
2855 {
2856 	struct bpf_if *bp;
2857 	struct bpf_d *bd;
2858 
2859 	BPF_LOCK();
2860 	/*
2861 	 * We are protected by the global lock here; interfaces and
2862 	 * descriptors cannot be deleted while we hold it.
2863 	 */
2864 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2865 		CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2866 			counter_u64_zero(bd->bd_rcount);
2867 			counter_u64_zero(bd->bd_dcount);
2868 			counter_u64_zero(bd->bd_fcount);
2869 			counter_u64_zero(bd->bd_wcount);
2870 			counter_u64_zero(bd->bd_wfcount);
2871 			counter_u64_zero(bd->bd_zcopy);
2872 		}
2873 	}
2874 	BPF_UNLOCK();
2875 }
2876 
2877 /*
2878  * Fill filter statistics
2879  */
2880 static void
2881 bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
2882 {
2883 
2884 	BPF_LOCK_ASSERT();
2885 	bzero(d, sizeof(*d));
2886 	d->bd_structsize = sizeof(*d);
2887 	d->bd_immediate = bd->bd_flags & BPFD_IMMEDIATE ? 1 : 0;
2888 	d->bd_promisc = bd->bd_promisc;
2889 	d->bd_hdrcmplt = bd->bd_flags & BPFD_HDRCMPLT ? 1 : 0;
2890 	d->bd_direction = bd->bd_direction;
2891 	d->bd_feedback = bd->bd_flags & BPFD_FEEDBACK ? 1 : 0;
2892 	d->bd_async = bd->bd_flags & BPFD_ASYNC ? 1 : 0;
2893 	d->bd_rcount = counter_u64_fetch(bd->bd_rcount);
2894 	d->bd_dcount = counter_u64_fetch(bd->bd_dcount);
2895 	d->bd_fcount = counter_u64_fetch(bd->bd_fcount);
2896 	d->bd_sig = bd->bd_sig;
2897 	d->bd_slen = bd->bd_slen;
2898 	d->bd_hlen = bd->bd_hlen;
2899 	d->bd_bufsize = bd->bd_bufsize;
2900 	d->bd_pid = bd->bd_pid;
2901 	strlcpy(d->bd_ifname, bd->bd_bif->bif_name, sizeof(d->bd_ifname));
2902 	d->bd_locked = bd->bd_flags & BPFD_LOCKED ? 1 : 0;
2903 	d->bd_wcount = counter_u64_fetch(bd->bd_wcount);
2904 	d->bd_wdcount = counter_u64_fetch(bd->bd_wdcount);
2905 	d->bd_wfcount = counter_u64_fetch(bd->bd_wfcount);
2906 	d->bd_zcopy = counter_u64_fetch(bd->bd_zcopy);
2907 	d->bd_bufmode = bd->bd_bufmode;
2908 }
2909 
2910 /*
2911  * Handle `netstat -B' stats request
2912  */
2913 static int
2914 bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
2915 {
2916 	static const struct xbpf_d zerostats;
2917 	struct xbpf_d *xbdbuf, *xbd, tempstats;
2918 	u_int bpfd_cnt, index;
2919 	int error;
2920 	struct bpf_if *bp;
2921 	struct bpf_d *bd;
2922 
2923 	/*
2924 	 * XXX This is not technically correct. It is possible for
2925 	 * unprivileged users to open bpf devices. It would make sense
2926 	 * if the users who opened the devices were able to retrieve
2927 	 * the statistics for them, too.
2928 	 */
2929 	error = priv_check(req->td, PRIV_NET_BPF);
2930 	if (error)
2931 		return (error);
2932 	/*
2933 	 * Check to see if the user is requesting that the counters be
2934 	 * zeroed out.  Explicitly check that the supplied data is zeroed,
2935 	 * as we aren't allowing the user to set the counters currently.
2936 	 */
2937 	if (req->newptr != NULL) {
2938 		if (req->newlen != sizeof(tempstats))
2939 			return (EINVAL);
2940 		memset(&tempstats, 0, sizeof(tempstats));
2941 		error = SYSCTL_IN(req, &tempstats, sizeof(tempstats));
2942 		if (error)
2943 			return (error);
2944 		if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0)
2945 			return (EINVAL);
2946 		bpf_zero_counters();
2947 		return (0);
2948 	}
2949 	bpfd_cnt = 0;
2950 	BPF_LOCK();
2951 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2952 		CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next)
2953 			bpfd_cnt++;
2954 		CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next)
2955 			bpfd_cnt++;
2956 	}
2957 	if (bpfd_cnt == 0 || req->oldptr == NULL) {
2958 		BPF_UNLOCK();
2959 		return (SYSCTL_OUT(req, 0, bpfd_cnt * sizeof(*xbd)));
2960 	}
2961 	if (req->oldlen < bpfd_cnt * sizeof(*xbd)) {
2962 		BPF_UNLOCK();
2963 		return (ENOMEM);
2964 	}
2965 	xbdbuf = malloc(bpfd_cnt * sizeof(*xbd), M_BPF, M_WAITOK);
2966 	index = 0;
2967 	LIST_FOREACH(bp, &V_bpf_iflist, bif_next) {
2968 		/* Send writers-only first */
2969 		CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
2970 			MPASS(index <= bpfd_cnt);
2971 			xbd = &xbdbuf[index++];
2972 			bpfstats_fill_xbpf(xbd, bd);
2973 		}
2974 		CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2975 			MPASS(index <= bpfd_cnt);
2976 			xbd = &xbdbuf[index++];
2977 			bpfstats_fill_xbpf(xbd, bd);
2978 		}
2979 	}
2980 	BPF_UNLOCK();
2981 	error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
2982 	free(xbdbuf, M_BPF);
2983 	return (error);
2984 }
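
/*
 * Illustrative sketch (editorial addition, not kernel code): this handler
 * is what `netstat -B' talks to; assuming the usual net.bpf.stats OID and
 * sufficient privilege (see the priv_check above), a consumer can fetch
 * the array directly:
 *
 *	size_t len = 0;
 *	sysctlbyname("net.bpf.stats", NULL, &len, NULL, 0);
 *	struct xbpf_d *xbd = malloc(len);
 *	sysctlbyname("net.bpf.stats", xbd, &len, NULL, 0);
 *	n = len / sizeof(struct xbpf_d);
 */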
2985 
2986 SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, bpf_drvinit, NULL);
2987 
2988 #else /* !DEV_BPF && !NETGRAPH_BPF */
2989 
2990 /*
2991  * NOP stubs to allow bpf-using drivers to load and function.
2992  *
2993  * A 'better' implementation would allow the core bpf functionality
2994  * to be loaded at runtime.
2995  */
2996 
2997 void
2998 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
2999 {
3000 }
3001 
3002 void
3003 bpf_tap_if(if_t ifp, u_char *pkt, u_int pktlen)
3004 {
3005 }
3006 
3007 void
3008 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
3009 {
3010 }
3011 
3012 void
3013 bpf_mtap_if(if_t ifp, struct mbuf *m)
3014 {
3015 }
3016 
3017 void
3018 bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
3019 {
3020 }
3021 
3022 void
3023 bpf_mtap2_if(if_t ifp, void *data, u_int dlen, struct mbuf *m)
3024 {
3025 }
3026 
3027 void
3028 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
3029 {
3030 	static const struct bpfd_list dead_bpf_if = CK_LIST_HEAD_INITIALIZER();
3031 
3032 	ifp->if_bpf = __DECONST(struct bpf_if *, &dead_bpf_if);
3033 }
3034 
3035 void
3036 bpfdetach(struct ifnet *ifp)
3037 {
3038 }
3039 
3040 bool
3041 bpf_peers_present_if(struct ifnet *ifp)
3042 {
3043 	return (false);
3044 }
3045 
3046 u_int
3047 bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
3048 {
3049 	return (-1);	/* "no filter" behaviour */
3050 }
3051 
3052 int
3053 bpf_validate(const struct bpf_insn *f, int len)
3054 {
3055 	return (0);	/* false */
3056 }
3057 
3058 #endif /* !DEV_BPF && !NETGRAPH_BPF */
3059