1 /*	$NetBSD: bpf.c,v 1.143 2009/03/11 05:55:22 mrg Exp $	*/
2 
3 /*
4  * Copyright (c) 1990, 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from the Stanford/CMU enet packet filter,
8  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
9  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
10  * Berkeley Laboratory.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)bpf.c	8.4 (Berkeley) 1/9/95
37  * static char rcsid[] =
38  * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
39  */
40 /*
41  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
42  * Use is subject to license terms.
43  * Copyright 2017 Joyent, Inc.
44  */
45 
46 /*
 * The BPF driver implements the following access controls for zones
 * attempting to read and write data. Writing data requires that the
 * net_rawaccess privilege is held, whilst reading data requires either
 * net_rawaccess or net_observability.
51  *
52  *                              | Shared |  Exclusive |   Global
53  * -----------------------------+--------+------------+------------+
54  * DLT_IPNET in local zone      |  Read  |    Read    |    Read    |
55  * -----------------------------+--------+------------+------------+
56  * Raw access to local zone NIC |  None  | Read/Write | Read/Write |
57  * -----------------------------+--------+------------+------------+
58  * Raw access to all NICs       |  None  |    None    | Read/Write |
59  * -----------------------------+--------+------------+------------+
60  *
61  * The BPF driver is written as a cloning driver: each call to bpfopen()
62  * allocates a new minor number. This provides BPF with a 1:1 relationship
 * between opens and closes. There is some amount of "descriptor state"
 * that is kept per open. Pointers to this data are stored in a hash table
 * (bpf_hash) that is indexed by the minor device number for each open file.
66  */
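
/*
 * For illustration only (not part of the driver): a minimal userland
 * sketch of the access model above. The device path used here is an
 * assumption for the example, not a statement of how the node is
 * packaged.
 *
 *	int fd = open("/dev/bpf", O_RDWR);	(needs net_rawaccess)
 *	int rfd = open("/dev/bpf", O_RDONLY);	(needs net_rawaccess or
 *						 net_observability)
 */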
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/time.h>
70 #include <sys/ioctl.h>
71 #include <sys/queue.h>
72 #include <sys/filio.h>
73 #include <sys/policy.h>
74 #include <sys/cmn_err.h>
75 #include <sys/uio.h>
76 #include <sys/file.h>
77 #include <sys/sysmacros.h>
78 #include <sys/zone.h>
79 
80 #include <sys/socket.h>
81 #include <sys/errno.h>
82 #include <sys/poll.h>
83 #include <sys/dlpi.h>
84 #include <sys/neti.h>
85 
86 #include <net/if.h>
87 
88 #include <net/bpf.h>
89 #include <net/bpfdesc.h>
90 #include <net/dlt.h>
91 
92 #include <netinet/in.h>
93 #include <sys/mac.h>
94 #include <sys/mac_client.h>
95 #include <sys/mac_impl.h>
96 #include <sys/time_std_impl.h>
97 #include <sys/hook.h>
98 #include <sys/hook_event.h>
99 
100 
101 #define	mtod(_v, _t)	(_t)((_v)->b_rptr)
102 #define	M_LEN(_m)	((_m)->b_wptr - (_m)->b_rptr)
103 
104 /*
105  * 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet
106  * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k).
107  */
108 #define	BPF_BUFSIZE (32 * 1024)
109 
110 typedef void *(*cp_fn_t)(void *, const void *, size_t);
111 
112 /*
113  * The default read buffer size, and limit for BIOCSBLEN.
114  */
115 int bpf_bufsize = BPF_BUFSIZE;
116 int bpf_maxbufsize = (16 * 1024 * 1024);
117 static mod_hash_t *bpf_hash = NULL;
118 
119 /*
120  * Use a mutex to avoid a race condition between gathering the stats/peers
121  * and opening/closing the device.
122  */
123 static kcondvar_t bpf_dlt_waiter;
124 static kmutex_t bpf_mtx;
125 static bpf_kstats_t ks_stats;
126 static bpf_kstats_t bpf_kstats = {
127 	{ "readWait",		KSTAT_DATA_UINT64 },
128 	{ "writeOk",		KSTAT_DATA_UINT64 },
129 	{ "writeError",		KSTAT_DATA_UINT64 },
130 	{ "receive",		KSTAT_DATA_UINT64 },
131 	{ "captured",		KSTAT_DATA_UINT64 },
132 	{ "dropped",		KSTAT_DATA_UINT64 },
133 };
134 static kstat_t *bpf_ksp;
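
/*
 * These counters are exported as the "bpf:0:global" kstat (created in
 * bpfilterattach() below); e.g. "kstat -m bpf" displays them from
 * userland.
 */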
135 
136 /*
137  *  bpf_list is a list of the BPF descriptors currently open
138  */
139 LIST_HEAD(, bpf_d) bpf_list;
140 
141 static int	bpf_allocbufs(struct bpf_d *);
142 static void	bpf_clear_timeout(struct bpf_d *);
143 static void	bpf_deliver(struct bpf_d *, cp_fn_t,
144 		    void *, uint_t, uint_t, boolean_t);
145 static void	bpf_freed(struct bpf_d *);
146 static int	bpf_ifname(struct bpf_d *d, char *, int);
147 static void	*bpf_mcpy(void *, const void *, size_t);
148 static int	bpf_attachd(struct bpf_d *, const char *, int);
149 static void	bpf_detachd(struct bpf_d *);
150 static int	bpf_setif(struct bpf_d *, char *, int);
151 static void	bpf_timed_out(void *);
static inline void	bpf_wakeup(struct bpf_d *);
154 static void	catchpacket(struct bpf_d *, uchar_t *, uint_t, uint_t,
155 		    cp_fn_t, struct timeval *);
156 static void	reset_d(struct bpf_d *);
157 static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
158 static int	bpf_setdlt(struct bpf_d *, void *);
159 static void	bpf_dev_add(struct bpf_d *);
160 static struct bpf_d *bpf_dev_find(minor_t);
161 static struct bpf_d *bpf_dev_get(minor_t);
162 static void	bpf_dev_remove(struct bpf_d *);
163 
164 static int
165 bpf_movein(struct uio *uio, int linktype, int mtu, mblk_t **mp)
166 {
167 	mblk_t *m;
168 	int error;
169 	int len;
170 	int hlen;
171 	int align;
172 
173 	/*
	 * Determine the size of the link-level header from the data
	 * link type; the caller's data is expected to begin with that
	 * header. We are also careful to leave room at the front of
	 * the message block so that the data beyond the link-level
	 * header is properly aligned.
181 	 */
182 	switch (linktype) {
183 
184 	case DLT_EN10MB:
185 		hlen = sizeof (struct ether_header);
186 		break;
187 
188 	case DLT_FDDI:
189 		hlen = 16;
190 		break;
191 
192 	case DLT_NULL:
193 		hlen = 0;
194 		break;
195 
196 	case DLT_IPOIB:
197 		hlen = 44;
198 		break;
199 
200 	default:
201 		return (EIO);
202 	}
203 
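	/*
	 * Pad the front of the message so that the payload that follows
	 * the link-level header lands on a 4-byte boundary; e.g. for
	 * DLT_EN10MB, hlen is 14 and align is 2. (When hlen is already a
	 * multiple of four this pads by a full word, which is harmless.)
	 */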
204 	align = 4 - (hlen & 3);
205 
206 	len = uio->uio_resid;
207 	/*
208 	 * If there aren't enough bytes for a link level header or the
209 	 * packet length exceeds the interface mtu, return an error.
210 	 */
211 	if (len < hlen || len - hlen > mtu)
212 		return (EMSGSIZE);
213 
214 	m = allocb(len + align, BPRI_MED);
215 	if (m == NULL) {
216 		error = ENOBUFS;
217 		goto bad;
218 	}
219 
	/* Ensure the data is properly aligned */
221 	if (align > 0)
222 		m->b_rptr += align;
223 	m->b_wptr = m->b_rptr + len;
224 
225 	error = uiomove(mtod(m, void *), len, UIO_WRITE, uio);
226 	if (error)
227 		goto bad;
228 	*mp = m;
229 	return (0);
230 
231 bad:
232 	if (m != NULL)
233 		freemsg(m);
234 	return (error);
235 }
236 
237 
238 /*
239  * Attach file to the bpf interface, i.e. make d listen on bp.
240  */
241 static int
242 bpf_attachd(struct bpf_d *d, const char *ifname, int dlt)
243 {
244 	bpf_provider_list_t *bp;
245 	bpf_provider_t *bpr;
246 	boolean_t zonematch;
247 	zoneid_t niczone;
248 	uintptr_t mcip;
249 	zoneid_t zone;
250 	uint_t nicdlt;
251 	uintptr_t mh;
252 	int hdrlen;
253 	int error;
254 
255 	ASSERT(d->bd_bif == (uintptr_t)NULL);
256 	ASSERT(d->bd_mcip == (uintptr_t)NULL);
257 	zone = d->bd_zone;
258 	zonematch = B_TRUE;
259 	error = 0;
260 	bpr = NULL;
261 again:
262 	mh = 0;
263 	mcip = 0;
264 	LIST_FOREACH(bp, &bpf_providers, bpl_next) {
265 		bpr = bp->bpl_what;
266 		error = MBPF_OPEN(bpr, ifname, &mh, zone);
267 		if (error != 0)
268 			goto next;
269 		error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
270 		if (error != 0)
271 			goto next;
272 		error = MBPF_GET_DLT(bpr, mh, &nicdlt);
273 		if (error != 0)
274 			goto next;
275 
276 		nicdlt = bpf_dl_to_dlt(nicdlt);
277 		if (dlt != -1 && dlt != nicdlt) {
278 			error = ENOENT;
279 			goto next;
280 		}
281 
282 		error = MBPF_GET_ZONE(bpr, mh, &niczone);
283 		if (error != 0)
284 			goto next;
285 
286 		DTRACE_PROBE4(bpf__attach, struct bpf_provider_s *, bpr,
287 		    uintptr_t, mh, int, nicdlt, zoneid_t, niczone);
288 
289 		if (zonematch && niczone != zone) {
290 			error = ENOENT;
291 			goto next;
292 		}
293 		break;
294 next:
295 		if (mcip != 0) {
296 			MBPF_CLIENT_CLOSE(bpr, mcip);
297 			mcip = 0;
298 		}
299 		if (mh != 0) {
300 			MBPF_CLOSE(bpr, mh);
301 			mh = 0;
302 		}
303 	}
304 	if (error != 0) {
305 		if (zonematch && (zone == GLOBAL_ZONEID)) {
306 			/*
307 			 * If we failed to do an exact match for the global
308 			 * zone using the global zoneid, try again in case
309 			 * the network interface is owned by a local zone.
310 			 */
311 			zonematch = B_FALSE;
312 			goto again;
313 		}
314 		return (error);
315 	}
316 
317 	/* No providers? */
318 	if (bpr == NULL)
319 		return (ENOENT);
320 
321 	d->bd_mac = *bpr;
322 	d->bd_mcip = mcip;
323 	d->bd_bif = mh;
324 	d->bd_dlt = nicdlt;
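	/*
	 * bd_hdrlen is the padded size of the bpf_hdr placed in front of
	 * each captured packet, chosen so that the data beyond the
	 * link-level header ends up word aligned: e.g. assuming an
	 * 18-byte bpf_hdr and a 14-byte Ethernet header, bd_hdrlen is
	 * BPF_WORDALIGN(14 + 18) - 14 = 18.
	 */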
325 	hdrlen = bpf_dl_hdrsize(nicdlt);
326 	d->bd_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
327 
328 	(void) strlcpy(d->bd_ifname, MBPF_CLIENT_NAME(&d->bd_mac, mcip),
329 	    sizeof (d->bd_ifname));
330 
331 	(void) MBPF_GET_LINKID(&d->bd_mac, d->bd_ifname, &d->bd_linkid,
332 	    zone);
333 	(void) MBPF_PROMISC_ADD(&d->bd_mac, d->bd_mcip, 0, d,
334 	    &d->bd_promisc_handle, d->bd_promisc_flags);
335 	return (0);
336 }
337 
338 /*
339  * Detach a file from its interface.
340  */
341 static void
342 bpf_detachd(struct bpf_d *d)
343 {
344 	uintptr_t mph;
345 	uintptr_t mch;
346 	uintptr_t mh;
347 
348 	ASSERT(d->bd_inuse == -1);
349 	mch = d->bd_mcip;
350 	d->bd_mcip = 0;
351 	mh = d->bd_bif;
352 	d->bd_bif = 0;
353 
354 	/*
355 	 * Check if this descriptor had requested promiscuous mode.
356 	 * If so, turn it off. There's no need to take any action
357 	 * here, that is done when MBPF_PROMISC_REMOVE is used;
358 	 * bd_promisc is just a local flag to stop promiscuous mode
359 	 * from being set more than once.
360 	 */
361 	if (d->bd_promisc)
362 		d->bd_promisc = 0;
363 
364 	/*
365 	 * Take device out of "promiscuous" mode.  Since we were able to
366 	 * enter "promiscuous" mode, we should be able to turn it off.
367 	 * Note, this field stores a pointer used to support both
368 	 * promiscuous and non-promiscuous callbacks for packets.
369 	 */
370 	mph = d->bd_promisc_handle;
371 	d->bd_promisc_handle = 0;
372 
373 	/*
374 	 * The lock has to be dropped here because mac_promisc_remove may
375 	 * need to wait for mac_promisc_dispatch, which has called into
376 	 * bpf and catchpacket is waiting for bd_lock...
	 * i.e. mac_promisc_remove() needs to be called with none of the
378 	 * locks held that are part of the bpf_mtap() call path.
379 	 */
380 	mutex_exit(&d->bd_lock);
381 	if (mph != 0)
382 		MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
383 
384 	if (mch != 0)
385 		MBPF_CLIENT_CLOSE(&d->bd_mac, mch);
386 
387 	if (mh != 0)
388 		MBPF_CLOSE(&d->bd_mac, mh);
389 
390 	/*
	 * This function is called with bd_lock held, so it must exit
	 * with it held.
393 	 */
394 	mutex_enter(&d->bd_lock);
395 	*d->bd_ifname = '\0';
396 	(void) memset(&d->bd_mac, 0, sizeof (d->bd_mac));
397 }
398 
399 
400 /*
401  * bpfilterattach() is called at load time.
402  */
403 int
404 bpfilterattach(void)
405 {
406 
407 	bpf_hash = mod_hash_create_idhash("bpf_dev_tab", 31,
408 	    mod_hash_null_keydtor);
409 	if (bpf_hash == NULL)
410 		return (ENOMEM);
411 
412 	(void) memcpy(&ks_stats, &bpf_kstats, sizeof (bpf_kstats));
413 
414 	bpf_ksp = kstat_create("bpf", 0, "global", "misc",
415 	    KSTAT_TYPE_NAMED, sizeof (bpf_kstats) / sizeof (kstat_named_t),
416 	    KSTAT_FLAG_VIRTUAL);
417 	if (bpf_ksp != NULL) {
418 		bpf_ksp->ks_data = &ks_stats;
419 		kstat_install(bpf_ksp);
420 	} else {
421 		mod_hash_destroy_idhash(bpf_hash);
422 		bpf_hash = NULL;
423 		return (EEXIST);
424 	}
425 
426 	cv_init(&bpf_dlt_waiter, NULL, CV_DRIVER, NULL);
427 	mutex_init(&bpf_mtx, NULL, MUTEX_DRIVER, NULL);
428 
429 	LIST_INIT(&bpf_list);
430 
431 	return (0);
432 }
433 
434 
435 /*
436  * bpfilterdetach() is called at unload time.
437  */
438 int
439 bpfilterdetach(void)
440 {
441 
442 	if (bpf_ksp != NULL) {
443 		kstat_delete(bpf_ksp);
444 		bpf_ksp = NULL;
445 	}
446 
447 	mod_hash_destroy_idhash(bpf_hash);
448 	bpf_hash = NULL;
449 
450 	cv_destroy(&bpf_dlt_waiter);
451 	mutex_destroy(&bpf_mtx);
452 
453 	return (0);
454 }
455 
456 /*
457  * Open ethernet device. Clones.
458  */
459 /* ARGSUSED */
460 int
461 bpfopen(dev_t *devp, int flag, int mode, cred_t *cred)
462 {
463 	struct bpf_d *d;
464 	uint_t dmin;
465 
466 	/*
467 	 * The security policy described at the top of this file is
468 	 * enforced here.
469 	 */
470 	if ((flag & FWRITE) != 0) {
471 		if (secpolicy_net_rawaccess(cred) != 0)
472 			return (EACCES);
473 	}
474 
475 	if ((flag & FREAD) != 0) {
476 		if ((secpolicy_net_observability(cred) != 0) &&
477 		    (secpolicy_net_rawaccess(cred) != 0))
478 			return (EACCES);
479 	}
480 
481 	if ((flag & (FWRITE|FREAD)) == 0)
482 		return (ENXIO);
483 
484 	/*
485 	 * A structure is allocated per open file in BPF to store settings
486 	 * such as buffer capture size, provide private buffers, etc.
487 	 */
488 	d = (struct bpf_d *)kmem_zalloc(sizeof (*d), KM_SLEEP);
489 	d->bd_bufsize = bpf_bufsize;
490 	d->bd_fmode = flag;
491 	d->bd_zone = crgetzoneid(cred);
492 	d->bd_seesent = 1;
493 	d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_PHYS|
494 	    MAC_PROMISC_FLAGS_NO_COPY;
495 	mutex_init(&d->bd_lock, NULL, MUTEX_DRIVER, NULL);
496 	cv_init(&d->bd_wait, NULL, CV_DRIVER, NULL);
497 
498 	mutex_enter(&bpf_mtx);
499 	/*
500 	 * Find an unused minor number. Obviously this is an O(n) algorithm
	 * and doesn't scale particularly well, so if large numbers of
	 * descriptors are open in real use, this design may need to be
	 * revisited.
504 	 */
505 	for (dmin = 0; dmin < L_MAXMIN; dmin++)
506 		if (bpf_dev_find(dmin) == NULL)
507 			break;
508 	if (dmin == L_MAXMIN) {
509 		mutex_exit(&bpf_mtx);
510 		kmem_free(d, sizeof (*d));
511 		return (ENXIO);
512 	}
513 	d->bd_dev = dmin;
514 	LIST_INSERT_HEAD(&bpf_list, d, bd_list);
515 	bpf_dev_add(d);
516 	mutex_exit(&bpf_mtx);
517 
518 	*devp = makedevice(getmajor(*devp), dmin);
519 
520 	return (0);
521 }
522 
523 /*
524  * Close the descriptor by detaching it from its interface,
525  * deallocating its buffers, and marking it free.
526  *
 * Because each open of the device allocates a fresh minor number and
 * descriptor, there is always a 1:1 relationship between the opens and
 * closes that reach this function.
529  */
530 /* ARGSUSED */
531 int
532 bpfclose(dev_t dev, int flag, int otyp, cred_t *cred_p)
533 {
534 	struct bpf_d *d = bpf_dev_get(getminor(dev));
535 
536 	mutex_enter(&d->bd_lock);
537 
538 	while (d->bd_inuse != 0) {
539 		d->bd_waiting++;
540 		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
541 			d->bd_waiting--;
542 			mutex_exit(&d->bd_lock);
543 			return (EINTR);
544 		}
545 		d->bd_waiting--;
546 	}
547 
548 	d->bd_inuse = -1;
549 	if (d->bd_state == BPF_WAITING)
550 		bpf_clear_timeout(d);
551 	d->bd_state = BPF_IDLE;
552 	if (d->bd_bif)
553 		bpf_detachd(d);
554 	mutex_exit(&d->bd_lock);
555 
556 	mutex_enter(&bpf_mtx);
557 	LIST_REMOVE(d, bd_list);
558 	bpf_dev_remove(d);
559 	mutex_exit(&bpf_mtx);
560 
	mutex_enter(&d->bd_lock);
	mutex_exit(&d->bd_lock);
	mutex_destroy(&d->bd_lock);
563 	cv_destroy(&d->bd_wait);
564 
565 	bpf_freed(d);
566 	kmem_free(d, sizeof (*d));
567 
568 	return (0);
569 }
570 
571 /*
572  * Rotate the packet buffers in descriptor d.  Move the store buffer
573  * into the hold slot, and the free buffer into the store slot.
574  * Zero the length of the new store buffer.
575  */
576 #define	ROTATE_BUFFERS(d) \
577 	(d)->bd_hbuf = (d)->bd_sbuf; \
578 	(d)->bd_hlen = (d)->bd_slen; \
579 	(d)->bd_sbuf = (d)->bd_fbuf; \
580 	(d)->bd_slen = 0; \
581 	(d)->bd_fbuf = 0;
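
/*
 * Illustration of the rotation: with (hbuf, sbuf, fbuf) = (NULL, A, B),
 * ROTATE_BUFFERS() yields (A, B, NULL); read() then drains the hold
 * buffer A and recycles it as the new free buffer.
 */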
582 /*
583  *  bpfread - read next chunk of packets from buffers
584  */
585 /* ARGSUSED */
586 int
587 bpfread(dev_t dev, struct uio *uio, cred_t *cred)
588 {
589 	struct bpf_d *d = bpf_dev_get(getminor(dev));
590 	int timed_out;
591 	ulong_t delay;
592 	int error;
593 
594 	if ((d->bd_fmode & FREAD) == 0)
595 		return (EBADF);
596 
597 	/*
598 	 * Restrict application to use a buffer the same size as
599 	 * the kernel buffers.
600 	 */
601 	if (uio->uio_resid != d->bd_bufsize)
602 		return (EINVAL);
603 
604 	mutex_enter(&d->bd_lock);
605 	if (d->bd_state == BPF_WAITING)
606 		bpf_clear_timeout(d);
607 	timed_out = (d->bd_state == BPF_TIMED_OUT);
608 	d->bd_state = BPF_IDLE;
609 	/*
610 	 * If the hold buffer is empty, then do a timed sleep, which
611 	 * ends when the timeout expires or when enough packets
612 	 * have arrived to fill the store buffer.
613 	 */
614 	while (d->bd_hbuf == 0) {
615 		if (d->bd_nonblock) {
616 			if (d->bd_slen == 0) {
617 				mutex_exit(&d->bd_lock);
618 				return (EWOULDBLOCK);
619 			}
620 			ROTATE_BUFFERS(d);
621 			break;
622 		}
623 
624 		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
625 			/*
626 			 * A packet(s) either arrived since the previous
627 			 * read or arrived while we were asleep.
628 			 * Rotate the buffers and return what's here.
629 			 */
630 			ROTATE_BUFFERS(d);
631 			break;
632 		}
633 		ks_stats.kp_read_wait.value.ui64++;
634 		delay = ddi_get_lbolt() + d->bd_rtout;
635 		error = cv_timedwait_sig(&d->bd_wait, &d->bd_lock, delay);
636 		if (error == 0) {
637 			mutex_exit(&d->bd_lock);
638 			return (EINTR);
639 		}
640 		if (error == -1) {
641 			/*
642 			 * On a timeout, return what's in the buffer,
643 			 * which may be nothing.  If there is something
644 			 * in the store buffer, we can rotate the buffers.
645 			 */
646 			if (d->bd_hbuf)
647 				/*
648 				 * We filled up the buffer in between
649 				 * getting the timeout and arriving
650 				 * here, so we don't need to rotate.
651 				 */
652 				break;
653 
654 			if (d->bd_slen == 0) {
655 				mutex_exit(&d->bd_lock);
656 				return (0);
657 			}
658 			ROTATE_BUFFERS(d);
659 		}
660 	}
661 	/*
662 	 * At this point, we know we have something in the hold slot.
663 	 */
664 	mutex_exit(&d->bd_lock);
665 
666 	/*
667 	 * Move data from hold buffer into user space.
668 	 * We know the entire buffer is transferred since
669 	 * we checked above that the read buffer is bpf_bufsize bytes.
670 	 */
671 	error = uiomove(d->bd_hbuf, d->bd_hlen, UIO_READ, uio);
672 
673 	mutex_enter(&d->bd_lock);
674 	d->bd_fbuf = d->bd_hbuf;
675 	d->bd_hbuf = 0;
676 	d->bd_hlen = 0;
678 	mutex_exit(&d->bd_lock);
679 	return (error);
680 }
681 
682 
683 /*
684  * If there are processes sleeping on this descriptor, wake them up.
685  * NOTE: the lock for bd_wait is bd_lock and is held by bpf_deliver,
686  * so there is no code here grabbing it.
687  */
688 static inline void
689 bpf_wakeup(struct bpf_d *d)
690 {
691 	cv_signal(&d->bd_wait);
692 }
693 
694 static void
695 bpf_timed_out(void *arg)
696 {
697 	struct bpf_d *d = arg;
698 
699 	mutex_enter(&d->bd_lock);
700 	if (d->bd_state == BPF_WAITING) {
701 		d->bd_state = BPF_TIMED_OUT;
702 		if (d->bd_slen != 0)
703 			cv_signal(&d->bd_wait);
704 	}
705 	mutex_exit(&d->bd_lock);
706 }
707 
708 
709 /* ARGSUSED */
710 int
711 bpfwrite(dev_t dev, struct uio *uio, cred_t *cred)
712 {
713 	struct bpf_d *d = bpf_dev_get(getminor(dev));
714 	uintptr_t mch;
715 	uint_t mtu;
716 	mblk_t *m;
717 	int error;
718 	int dlt;
719 
720 	if ((d->bd_fmode & FWRITE) == 0)
721 		return (EBADF);
722 
723 	mutex_enter(&d->bd_lock);
	if (d->bd_bif == 0 || d->bd_mcip == 0) {
725 		mutex_exit(&d->bd_lock);
726 		return (EINTR);
727 	}
728 
729 	if (uio->uio_resid == 0) {
730 		mutex_exit(&d->bd_lock);
731 		return (0);
732 	}
733 
734 	while (d->bd_inuse < 0) {
735 		d->bd_waiting++;
736 		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
737 			d->bd_waiting--;
738 			mutex_exit(&d->bd_lock);
739 			return (EINTR);
740 		}
741 		d->bd_waiting--;
742 	}
743 
	d->bd_inuse++;
	mutex_exit(&d->bd_lock);

	dlt = d->bd_dlt;
	mch = d->bd_mcip;
	MBPF_SDU_GET(&d->bd_mac, d->bd_bif, &mtu);
750 
751 	m = NULL;
752 	if (dlt == DLT_IPNET) {
753 		error = EIO;
754 		goto done;
755 	}
756 
757 	error = bpf_movein(uio, dlt, mtu, &m);
758 	if (error)
759 		goto done;
760 
761 	DTRACE_PROBE4(bpf__tx, struct bpf_d *, d, int, dlt,
762 	    uint_t, mtu, mblk_t *, m);
763 
764 	if (M_LEN(m) > mtu) {
765 		error = EMSGSIZE;
766 		goto done;
767 	}
768 
769 	error = MBPF_TX(&d->bd_mac, mch, m);
770 	/*
771 	 * The "tx" action here is required to consume the mblk_t.
772 	 */
773 	m = NULL;
774 
775 done:
776 	if (error == 0)
777 		ks_stats.kp_write_ok.value.ui64++;
778 	else
779 		ks_stats.kp_write_error.value.ui64++;
780 	if (m != NULL)
781 		freemsg(m);
782 
783 	mutex_enter(&d->bd_lock);
784 	d->bd_inuse--;
785 	if ((d->bd_inuse == 0) && (d->bd_waiting != 0))
786 		cv_signal(&d->bd_wait);
787 	mutex_exit(&d->bd_lock);
788 
789 	/*
	 * The mblk_t has been consumed by the tx path or freed above.
791 	 */
792 	return (error);
793 }
794 
795 
796 /*
797  * Reset a descriptor by flushing its packet buffer and clearing the
 * receive and drop counts.  Should be called with bd_lock held.
799  */
800 static void
801 reset_d(struct bpf_d *d)
802 {
803 	if (d->bd_hbuf) {
804 		/* Free the hold buffer. */
805 		d->bd_fbuf = d->bd_hbuf;
806 		d->bd_hbuf = 0;
807 	}
808 	d->bd_slen = 0;
809 	d->bd_hlen = 0;
810 	d->bd_rcount = 0;
811 	d->bd_dcount = 0;
812 	d->bd_ccount = 0;
813 }
814 
815 /*
816  *  FIONREAD		Check for read packet available.
817  *  BIOCGBLEN		Get buffer len [for read()].
818  *  BIOCSETF		Set ethernet read filter.
819  *  BIOCFLUSH		Flush read packet buffer.
820  *  BIOCPROMISC		Put interface into promiscuous mode.
821  *  BIOCGDLT		Get link layer type.
822  *  BIOCGETIF		Get interface name.
823  *  BIOCSETIF		Set interface.
824  *  BIOCSRTIMEOUT	Set read timeout.
825  *  BIOCGRTIMEOUT	Get read timeout.
826  *  BIOCGSTATS		Get packet stats.
827  *  BIOCIMMEDIATE	Set immediate mode.
828  *  BIOCVERSION		Get filter language version.
829  *  BIOCGHDRCMPLT	Get "header already complete" flag.
830  *  BIOCSHDRCMPLT	Set "header already complete" flag.
831  */
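
/*
 * For illustration only (not part of the driver): a sketch of a typical
 * userland sequence against the ioctls above. Note that BIOCSBLEN must
 * precede BIOCSETIF (it fails with EINVAL once an interface is attached)
 * and that read() must be given a buffer of exactly the negotiated size.
 * The interface name "net0" is an assumption for the example, and
 * process() stands in for the consumer's own packet handler.
 *
 *	uint_t blen = 32 * 1024;
 *	struct ifreq ifr;
 *	char *buf, *p;
 *	ssize_t n;
 *
 *	(void) ioctl(fd, BIOCSBLEN, &blen);
 *	(void) strlcpy(ifr.ifr_name, "net0", sizeof (ifr.ifr_name));
 *	(void) ioctl(fd, BIOCSETIF, &ifr);
 *	buf = malloc(blen);
 *	n = read(fd, buf, blen);
 *	for (p = buf; p < buf + n; ) {
 *		struct bpf_hdr *bh = (struct bpf_hdr *)(void *)p;
 *		process(p + bh->bh_hdrlen, bh->bh_caplen);
 *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
 *	}
 */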
832 /* ARGSUSED */
833 int
834 bpfioctl(dev_t dev, int cmd, intptr_t addr, int mode, cred_t *cred, int *rval)
835 {
836 	struct bpf_d *d = bpf_dev_get(getminor(dev));
837 	struct bpf_program prog;
838 	struct lifreq lifreq;
839 	struct ifreq ifreq;
840 	int error = 0;
841 	uint_t size;
842 
843 	/*
844 	 * Refresh the PID associated with this bpf file.
845 	 */
846 	mutex_enter(&d->bd_lock);
847 	if (d->bd_state == BPF_WAITING)
848 		bpf_clear_timeout(d);
849 	d->bd_state = BPF_IDLE;
850 	mutex_exit(&d->bd_lock);
851 
852 	switch (cmd) {
853 
854 	default:
855 		error = EINVAL;
856 		break;
857 
858 	/*
859 	 * Check for read packet available.
860 	 */
861 	case FIONREAD:
862 		{
863 			int n;
864 
865 			mutex_enter(&d->bd_lock);
866 			n = d->bd_slen;
867 			if (d->bd_hbuf)
868 				n += d->bd_hlen;
869 			mutex_exit(&d->bd_lock);
870 
			if (copyout(&n, (void *)addr, sizeof (n)) != 0)
				error = EFAULT;
872 			break;
873 		}
874 
875 	/*
876 	 * Get buffer len [for read()].
877 	 */
878 	case BIOCGBLEN:
879 		error = copyout(&d->bd_bufsize, (void *)addr,
880 		    sizeof (d->bd_bufsize));
881 		break;
882 
883 	/*
884 	 * Set buffer length.
885 	 */
886 	case BIOCSBLEN:
887 		if (copyin((void *)addr, &size, sizeof (size)) != 0) {
888 			error = EFAULT;
889 			break;
890 		}
891 
892 		mutex_enter(&d->bd_lock);
893 		if (d->bd_bif != 0) {
894 			error = EINVAL;
895 		} else {
896 			if (size > bpf_maxbufsize)
897 				size = bpf_maxbufsize;
898 			else if (size < BPF_MINBUFSIZE)
899 				size = BPF_MINBUFSIZE;
900 
901 			d->bd_bufsize = size;
902 		}
903 		mutex_exit(&d->bd_lock);
904 
905 		if (error == 0)
906 			error = copyout(&size, (void *)addr, sizeof (size));
907 		break;
908 
909 	/*
910 	 * Set link layer read filter.
911 	 */
912 	case BIOCSETF:
913 		if (ddi_copyin((void *)addr, &prog, sizeof (prog), mode)) {
914 			error = EFAULT;
915 			break;
916 		}
917 		error = bpf_setf(d, &prog);
918 		break;
919 
920 	/*
921 	 * Flush read packet buffer.
922 	 */
923 	case BIOCFLUSH:
924 		mutex_enter(&d->bd_lock);
925 		reset_d(d);
926 		mutex_exit(&d->bd_lock);
927 		break;
928 
929 	/*
930 	 * Put interface into promiscuous mode.
931 	 * This is a one-way ioctl, it is not used to turn promiscuous
932 	 * mode off.
933 	 */
934 	case BIOCPROMISC:
935 		if (d->bd_bif == 0) {
936 			/*
937 			 * No interface attached yet.
938 			 */
939 			error = EINVAL;
940 			break;
941 		}
942 		mutex_enter(&d->bd_lock);
943 		if (d->bd_promisc == 0) {
944 
945 			if (d->bd_promisc_handle) {
946 				uintptr_t mph;
947 
948 				mph = d->bd_promisc_handle;
949 				d->bd_promisc_handle = 0;
950 
951 				mutex_exit(&d->bd_lock);
952 				MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
953 				mutex_enter(&d->bd_lock);
954 			}
955 
956 			d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_COPY;
957 			error = MBPF_PROMISC_ADD(&d->bd_mac,
958 			    d->bd_mcip, MAC_CLIENT_PROMISC_ALL, d,
959 			    &d->bd_promisc_handle, d->bd_promisc_flags);
960 			if (error == 0)
961 				d->bd_promisc = 1;
962 		}
963 		mutex_exit(&d->bd_lock);
964 		break;
965 
966 	/*
967 	 * Get device parameters.
968 	 */
969 	case BIOCGDLT:
970 		if (d->bd_bif == 0)
971 			error = EINVAL;
972 		else
973 			error = copyout(&d->bd_dlt, (void *)addr,
974 			    sizeof (d->bd_dlt));
975 		break;
976 
977 	/*
978 	 * Get a list of supported device parameters.
979 	 */
980 	case BIOCGDLTLIST:
981 		if (d->bd_bif == 0) {
982 			error = EINVAL;
983 		} else {
984 			struct bpf_dltlist list;
985 
986 			if (copyin((void *)addr, &list, sizeof (list)) != 0) {
987 				error = EFAULT;
988 				break;
989 			}
990 			error = bpf_getdltlist(d, &list);
991 			if ((error == 0) &&
992 			    copyout(&list, (void *)addr, sizeof (list)) != 0)
993 				error = EFAULT;
994 		}
995 		break;
996 
997 	/*
998 	 * Set device parameters.
999 	 */
1000 	case BIOCSDLT:
1001 		error = bpf_setdlt(d, (void *)addr);
1002 		break;
1003 
1004 	/*
1005 	 * Get interface name.
1006 	 */
1007 	case BIOCGETIF:
1008 		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1009 			error = EFAULT;
1010 			break;
1011 		}
1012 		error = bpf_ifname(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1013 		if ((error == 0) &&
1014 		    copyout(&ifreq, (void *)addr, sizeof (ifreq)) != 0) {
1015 			error = EFAULT;
1016 			break;
1017 		}
1018 		break;
1019 
1020 	/*
1021 	 * Set interface.
1022 	 */
1023 	case BIOCSETIF:
1024 		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1025 			error = EFAULT;
1026 			break;
1027 		}
1028 		error = bpf_setif(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1029 		break;
1030 
1031 	/*
1032 	 * Get interface name.
1033 	 */
1034 	case BIOCGETLIF:
1035 		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1036 			error = EFAULT;
1037 			break;
1038 		}
1039 		error = bpf_ifname(d, lifreq.lifr_name,
1040 		    sizeof (lifreq.lifr_name));
1041 		if ((error == 0) &&
1042 		    copyout(&lifreq, (void *)addr, sizeof (lifreq)) != 0) {
1043 			error = EFAULT;
1044 			break;
1045 		}
1046 		break;
1047 
1048 	/*
1049 	 * Set interface.
1050 	 */
1051 	case BIOCSETLIF:
1052 		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1053 			error = EFAULT;
1054 			break;
1055 		}
1056 		error = bpf_setif(d, lifreq.lifr_name,
1057 		    sizeof (lifreq.lifr_name));
1058 		break;
1059 
1060 #ifdef _SYSCALL32_IMPL
1061 	/*
1062 	 * Set read timeout.
1063 	 */
1064 	case BIOCSRTIMEOUT32:
1065 		{
1066 			struct timeval32 tv;
1067 
1068 			if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1069 				error = EFAULT;
1070 				break;
1071 			}
1072 
1073 			/* Convert the timeout in microseconds to ticks */
1074 			d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1075 			    tv.tv_usec);
1076 			if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1077 				d->bd_rtout = 1;
1078 			break;
1079 		}
1080 
1081 	/*
1082 	 * Get read timeout.
1083 	 */
1084 	case BIOCGRTIMEOUT32:
1085 		{
1086 			struct timeval32 tv;
1087 			clock_t ticks;
1088 
1089 			ticks = drv_hztousec(d->bd_rtout);
1090 			tv.tv_sec = ticks / 1000000;
1091 			tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1092 			error = copyout(&tv, (void *)addr, sizeof (tv));
1093 			break;
1094 		}
1095 
1096 	/*
1097 	 * Get a list of supported device parameters.
1098 	 */
1099 	case BIOCGDLTLIST32:
1100 		if (d->bd_bif == 0) {
1101 			error = EINVAL;
1102 		} else {
1103 			struct bpf_dltlist32 lst32;
1104 			struct bpf_dltlist list;
1105 
1106 			if (copyin((void *)addr, &lst32, sizeof (lst32)) != 0) {
1107 				error = EFAULT;
1108 				break;
1109 			}
1110 
1111 			list.bfl_len = lst32.bfl_len;
1112 			list.bfl_list = (void *)(uint64_t)lst32.bfl_list;
1113 			error = bpf_getdltlist(d, &list);
1114 			if (error == 0) {
1115 				lst32.bfl_len = list.bfl_len;
1116 
1117 				if (copyout(&lst32, (void *)addr,
1118 				    sizeof (lst32)) != 0)
1119 					error = EFAULT;
1120 			}
1121 		}
1122 		break;
1123 
1124 	/*
1125 	 * Set link layer read filter.
1126 	 */
1127 	case BIOCSETF32: {
1128 		struct bpf_program32 prog32;
1129 
		if (ddi_copyin((void *)addr, &prog32, sizeof (prog32), mode)) {
1131 			error = EFAULT;
1132 			break;
1133 		}
1134 		prog.bf_len = prog32.bf_len;
1135 		prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
1136 		error = bpf_setf(d, &prog);
1137 		break;
1138 	}
1139 #endif
1140 
1141 	/*
1142 	 * Set read timeout.
1143 	 */
1144 	case BIOCSRTIMEOUT:
1145 		{
1146 			struct timeval tv;
1147 
1148 			if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1149 				error = EFAULT;
1150 				break;
1151 			}
1152 
1153 			/* Convert the timeout in microseconds to ticks */
1154 			d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1155 			    tv.tv_usec);
1156 			if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1157 				d->bd_rtout = 1;
1158 			break;
1159 		}
1160 
1161 	/*
1162 	 * Get read timeout.
1163 	 */
1164 	case BIOCGRTIMEOUT:
1165 		{
1166 			struct timeval tv;
1167 			clock_t ticks;
1168 
1169 			ticks = drv_hztousec(d->bd_rtout);
1170 			tv.tv_sec = ticks / 1000000;
1171 			tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1172 			if (copyout(&tv, (void *)addr, sizeof (tv)) != 0)
1173 				error = EFAULT;
1174 			break;
1175 		}
1176 
1177 	/*
1178 	 * Get packet stats.
1179 	 */
1180 	case BIOCGSTATS:
1181 		{
1182 			struct bpf_stat bs;
1183 
1184 			bs.bs_recv = d->bd_rcount;
1185 			bs.bs_drop = d->bd_dcount;
1186 			bs.bs_capt = d->bd_ccount;
1187 			if (copyout(&bs, (void *)addr, sizeof (bs)) != 0)
1188 				error = EFAULT;
1189 			break;
1190 		}
1191 
1192 	/*
1193 	 * Set immediate mode.
1194 	 */
1195 	case BIOCIMMEDIATE:
1196 		if (copyin((void *)addr, &d->bd_immediate,
1197 		    sizeof (d->bd_immediate)) != 0)
1198 			error = EFAULT;
1199 		break;
1200 
1201 	case BIOCVERSION:
1202 		{
1203 			struct bpf_version bv;
1204 
1205 			bv.bv_major = BPF_MAJOR_VERSION;
1206 			bv.bv_minor = BPF_MINOR_VERSION;
1207 			if (copyout(&bv, (void *)addr, sizeof (bv)) != 0)
1208 				error = EFAULT;
1209 			break;
1210 		}
1211 
1212 	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
1213 		if (copyout(&d->bd_hdrcmplt, (void *)addr,
1214 		    sizeof (d->bd_hdrcmplt)) != 0)
1215 			error = EFAULT;
1216 		break;
1217 
1218 	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
1219 		if (copyin((void *)addr, &d->bd_hdrcmplt,
1220 		    sizeof (d->bd_hdrcmplt)) != 0)
1221 			error = EFAULT;
1222 		break;
1223 
1224 	/*
1225 	 * Get "see sent packets" flag
1226 	 */
1227 	case BIOCGSEESENT:
1228 		if (copyout(&d->bd_seesent, (void *)addr,
1229 		    sizeof (d->bd_seesent)) != 0)
1230 			error = EFAULT;
1231 		break;
1232 
1233 	/*
1234 	 * Set "see sent" packets flag
1235 	 */
1236 	case BIOCSSEESENT:
1237 		if (copyin((void *)addr, &d->bd_seesent,
1238 		    sizeof (d->bd_seesent)) != 0)
1239 			error = EFAULT;
1240 		break;
1241 
1242 	case FIONBIO:		/* Non-blocking I/O */
1243 		if (copyin((void *)addr, &d->bd_nonblock,
1244 		    sizeof (d->bd_nonblock)) != 0)
1245 			error = EFAULT;
1246 		break;
1247 	}
1248 	return (error);
1249 }
1250 
1251 /*
1252  * Set d's packet filter program to fp.  If this file already has a filter,
1253  * free it and replace it. If the new filter is "empty" (has a 0 size), then
1254  * the result is to just remove and free the existing filter.
1255  * Returns EINVAL for bogus requests.
1256  */
1257 int
1258 bpf_setf(struct bpf_d *d, struct bpf_program *fp)
1259 {
1260 	struct bpf_insn *fcode, *old;
1261 	uint_t flen, size;
1262 	size_t oldsize;
1263 
1264 	if (fp->bf_insns == 0) {
1265 		if (fp->bf_len != 0)
1266 			return (EINVAL);
1267 		mutex_enter(&d->bd_lock);
1268 		old = d->bd_filter;
1269 		oldsize = d->bd_filter_size;
1270 		d->bd_filter = 0;
1271 		d->bd_filter_size = 0;
1272 		reset_d(d);
1273 		mutex_exit(&d->bd_lock);
1274 		if (old != 0)
1275 			kmem_free(old, oldsize);
1276 		return (0);
1277 	}
1278 	flen = fp->bf_len;
1279 	if (flen > BPF_MAXINSNS)
1280 		return (EINVAL);
1281 
1282 	size = flen * sizeof (*fp->bf_insns);
1283 	fcode = kmem_alloc(size, KM_SLEEP);
	if (copyin(fp->bf_insns, fcode, size) != 0) {
		kmem_free(fcode, size);
		return (EFAULT);
	}
1286 
1287 	if (bpf_validate(fcode, (int)flen)) {
1288 		mutex_enter(&d->bd_lock);
1289 		old = d->bd_filter;
1290 		oldsize = d->bd_filter_size;
1291 		d->bd_filter = fcode;
1292 		d->bd_filter_size = size;
1293 		reset_d(d);
1294 		mutex_exit(&d->bd_lock);
1295 		if (old != 0)
1296 			kmem_free(old, oldsize);
1297 
1298 		return (0);
1299 	}
1300 	kmem_free(fcode, size);
1301 	return (EINVAL);
1302 }
1303 
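/*
 * For illustration only (not part of the driver): a minimal filter
 * program, as a userland caller might hand it to bpf_setf() via the
 * BIOCSETF ioctl. A single BPF_RET statement with a non-zero operand
 * accepts every packet, capturing up to that many bytes of each.
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, (uint_t)-1),
 *	};
 *	struct bpf_program prog = { 1, insns };
 *
 *	(void) ioctl(fd, BIOCSETF, &prog);
 */
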
1304 /*
1305  * Detach a file from its current interface (if attached at all) and attach
1306  * to the interface indicated by the name stored in ifname.
1307  * Return an errno or 0.
1308  */
1309 static int
1310 bpf_setif(struct bpf_d *d, char *ifname, int namesize)
1311 {
1312 	int unit_seen;
1313 	int error = 0;
1314 	char *cp;
1315 	int i;
1316 
1317 	/*
1318 	 * Make sure the provided name has a unit number, and default
1319 	 * it to '0' if not specified.
1320 	 * XXX This is ugly ... do this differently?
1321 	 */
1322 	unit_seen = 0;
1323 	cp = ifname;
1324 	cp[namesize - 1] = '\0';	/* sanity */
1325 	while (*cp++)
1326 		if (*cp >= '0' && *cp <= '9')
1327 			unit_seen = 1;
1328 	if (!unit_seen) {
1329 		/* Make sure to leave room for the '\0'. */
1330 		for (i = 0; i < (namesize - 1); ++i) {
1331 			if ((ifname[i] >= 'a' && ifname[i] <= 'z') ||
1332 			    (ifname[i] >= 'A' && ifname[i] <= 'Z'))
1333 				continue;
1334 			ifname[i] = '0';
1335 		}
1336 	}
1337 
1338 	/*
1339 	 * Make sure that only one call to this function happens at a time
1340 	 * and that we're not interleaving a read/write
1341 	 */
1342 	mutex_enter(&d->bd_lock);
1343 	while (d->bd_inuse != 0) {
1344 		d->bd_waiting++;
1345 		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
1346 			d->bd_waiting--;
1347 			mutex_exit(&d->bd_lock);
1348 			return (EINTR);
1349 		}
1350 		d->bd_waiting--;
1351 	}
1352 	d->bd_inuse = -1;
1353 	mutex_exit(&d->bd_lock);
1354 
1355 	if (d->bd_sbuf == 0)
1356 		error = bpf_allocbufs(d);
1357 
1358 	if (error == 0) {
1359 		mutex_enter(&d->bd_lock);
1360 		if (d->bd_bif)
1361 			/*
1362 			 * Detach if attached to something else.
1363 			 */
1364 			bpf_detachd(d);
1365 
1366 		error = bpf_attachd(d, ifname, -1);
1367 		reset_d(d);
1368 		d->bd_inuse = 0;
1369 		if (d->bd_waiting != 0)
1370 			cv_signal(&d->bd_wait);
1371 		mutex_exit(&d->bd_lock);
1372 		return (error);
1373 	}
1374 
1375 	mutex_enter(&d->bd_lock);
1376 	d->bd_inuse = 0;
1377 	if (d->bd_waiting != 0)
1378 		cv_signal(&d->bd_wait);
1379 	mutex_exit(&d->bd_lock);
1380 
1381 	/*
	 * Try to tickle the mac layer into attaching the device...
1383 	 */
1384 	return (bpf_provider_tickle(ifname, d->bd_zone));
1385 }
1386 
1387 /*
1388  * Copy the interface name to the ifreq.
1389  */
1390 static int
1391 bpf_ifname(struct bpf_d *d, char *buffer, int bufsize)
1392 {
1393 
1394 	mutex_enter(&d->bd_lock);
1395 	if (d->bd_bif == 0) {
1396 		mutex_exit(&d->bd_lock);
1397 		return (EINVAL);
1398 	}
1399 
1400 	(void) strlcpy(buffer, d->bd_ifname, bufsize);
1401 	mutex_exit(&d->bd_lock);
1402 
1403 	return (0);
1404 }
1405 
1406 /* ARGSUSED */
1407 int
1408 bpfchpoll(dev_t dev, short events, int anyyet, short *reventsp,
1409     struct pollhead **phpp)
1410 {
1411 	struct bpf_d *d = bpf_dev_get(getminor(dev));
1412 
1413 	/*
1414 	 * Until this driver is modified to issue proper pollwakeup() calls on
1415 	 * its pollhead, edge-triggered polling is not allowed.
1416 	 */
1417 	if (events & POLLET) {
1418 		return (EPERM);
1419 	}
1420 
1421 	if (events & (POLLIN | POLLRDNORM)) {
1422 		/*
1423 		 * An imitation of the FIONREAD ioctl code.
1424 		 */
1425 		mutex_enter(&d->bd_lock);
1426 		if (d->bd_hlen != 0 ||
1427 		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
1428 		    d->bd_slen != 0)) {
1429 			*reventsp |= events & (POLLIN | POLLRDNORM);
1430 		} else {
1431 			/*
1432 			 * Until the bpf driver has been updated to include
1433 			 * adequate pollwakeup() logic, no pollhead will be
1434 			 * emitted here, preventing the resource from being
1435 			 * cached by poll()/devpoll/epoll.
1436 			 */
1437 			*reventsp = 0;
1438 			/* Start the read timeout if necessary */
1439 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
1440 				bpf_clear_timeout(d);
1441 				/*
1442 				 * Only allow the timeout to be set once.
1443 				 */
1444 				if (d->bd_callout == 0)
1445 					d->bd_callout = timeout(bpf_timed_out,
1446 					    d, d->bd_rtout);
1447 				d->bd_state = BPF_WAITING;
1448 			}
1449 		}
1450 		mutex_exit(&d->bd_lock);
1451 	}
1452 
1453 	return (0);
1454 }
1455 
1456 /*
1457  * Copy data from an mblk_t chain into a buffer. This works for ipnet
1458  * because the dl_ipnetinfo_t is placed in an mblk_t that leads the
1459  * packet itself.
1460  */
1461 static void *
1462 bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)
1463 {
1464 	const mblk_t *m;
1465 	uint_t count;
1466 	uchar_t *dst;
1467 
1468 	m = src_arg;
1469 	dst = dst_arg;
1470 	while (len > 0) {
1471 		if (m == NULL)
1472 			panic("bpf_mcpy");
1473 		count = (uint_t)min(M_LEN(m), len);
1474 		(void) memcpy(dst, mtod(m, const void *), count);
1475 		m = m->b_cont;
1476 		dst += count;
1477 		len -= count;
1478 	}
1479 	return (dst_arg);
1480 }
1481 
1482 /*
1483  * Dispatch a packet to all the listeners on interface bp.
1484  *
1485  * marg    pointer to the packet, either a data buffer or an mbuf chain
1486  * buflen  buffer length, if marg is a data buffer
1487  * cpfn    a function that can copy marg into the listener's buffer
1488  * pktlen  length of the packet
1489  * issent  boolean indicating whether the packet was sent or receive
1490  */
1491 static inline void
1492 bpf_deliver(struct bpf_d *d, cp_fn_t cpfn, void *marg, uint_t pktlen,
1493     uint_t buflen, boolean_t issent)
1494 {
1495 	struct timeval tv;
1496 	uint_t slen;
1497 
1498 	if (!d->bd_seesent && issent)
1499 		return;
1500 
1501 	/*
1502 	 * Accuracy of the packet counters in BPF is vital so it
1503 	 * is important to protect even the outer ones.
1504 	 */
1505 	mutex_enter(&d->bd_lock);
1506 	slen = bpf_filter(d->bd_filter, marg, pktlen, buflen);
1507 	DTRACE_PROBE5(bpf__packet, struct bpf_if *, d->bd_bif,
1508 	    struct bpf_d *, d, void *, marg, uint_t, pktlen, uint_t, slen);
1509 	d->bd_rcount++;
1510 	ks_stats.kp_receive.value.ui64++;
1511 	if (slen != 0) {
1512 		uniqtime(&tv);
1513 		catchpacket(d, marg, pktlen, slen, cpfn, &tv);
1514 	}
1515 	mutex_exit(&d->bd_lock);
1516 }
1517 
1518 /*
1519  * Incoming linkage from device drivers.
1520  */
1521 /* ARGSUSED */
1522 void
1523 bpf_mtap(void *arg, mac_resource_handle_t mrh, mblk_t *m, boolean_t issent)
1524 {
1525 	cp_fn_t cpfn;
1526 	struct bpf_d *d = arg;
1527 	uint_t pktlen, buflen;
1528 	void *marg;
1529 
1530 	pktlen = msgdsize(m);
1531 
1532 	if (pktlen == M_LEN(m)) {
1533 		cpfn = (cp_fn_t)memcpy;
1534 		marg = mtod(m, void *);
1535 		buflen = pktlen;
1536 	} else {
1537 		cpfn = bpf_mcpy;
1538 		marg = m;
1539 		buflen = 0;
1540 	}
1541 
1542 	bpf_deliver(d, cpfn, marg, pktlen, buflen, issent);
1543 }
1544 
1545 /*
1546  * Incoming linkage from ipnet.
1547  * In ipnet, there is only one event, NH_OBSERVE, that delivers packets
1548  * from all network interfaces. Thus the tap function needs to apply a
 * filter using the interface index/id to imitate snooping on just the
1550  * specified interface.
1551  */
1552 /* ARGSUSED */
1553 void
1554 bpf_itap(void *arg, mblk_t *m, boolean_t issent, uint_t length)
1555 {
1556 	hook_pkt_observe_t *hdr;
1557 	struct bpf_d *d = arg;
1558 
1559 	hdr = (hook_pkt_observe_t *)m->b_rptr;
1560 	if (ntohl(hdr->hpo_ifindex) != d->bd_linkid)
1561 		return;
1562 	bpf_deliver(d, bpf_mcpy, m, length, 0, issent);
1564 }
1565 
1566 /*
1567  * Move the packet data from interface memory (pkt) into the
1568  * store buffer.  Return 1 if it's time to wakeup a listener (buffer full),
1569  * otherwise 0.  "copy" is the routine called to do the actual data
1570  * transfer.  memcpy is passed in to copy contiguous chunks, while
 * bpf_mcpy is passed in to copy mblk_t chains.  In the latter case,
 * pkt is really an mblk_t.
1573  */
1574 static void
1575 catchpacket(struct bpf_d *d, uchar_t *pkt, uint_t pktlen, uint_t snaplen,
1576     cp_fn_t cpfn, struct timeval *tv)
1577 {
1578 	struct bpf_hdr *hp;
1579 	int totlen, curlen;
1580 	int hdrlen = d->bd_hdrlen;
1581 	int do_wakeup = 0;
1582 
1583 	++d->bd_ccount;
1584 	ks_stats.kp_capture.value.ui64++;
1585 	/*
1586 	 * Figure out how many bytes to move.  If the packet is
1587 	 * greater or equal to the snapshot length, transfer that
1588 	 * much.  Otherwise, transfer the whole packet (unless
1589 	 * we hit the buffer size limit).
1590 	 */
1591 	totlen = hdrlen + min(snaplen, pktlen);
1592 	if (totlen > d->bd_bufsize)
1593 		totlen = d->bd_bufsize;
1594 
1595 	/*
1596 	 * Round up the end of the previous packet to the next longword.
1597 	 */
1598 	curlen = BPF_WORDALIGN(d->bd_slen);
1599 	if (curlen + totlen > d->bd_bufsize) {
1600 		/*
1601 		 * This packet will overflow the storage buffer.
1602 		 * Rotate the buffers if we can, then wakeup any
1603 		 * pending reads.
1604 		 */
1605 		if (d->bd_fbuf == 0) {
1606 			/*
1607 			 * We haven't completed the previous read yet,
1608 			 * so drop the packet.
1609 			 */
1610 			++d->bd_dcount;
1611 			ks_stats.kp_dropped.value.ui64++;
1612 			return;
1613 		}
1614 		ROTATE_BUFFERS(d);
1615 		do_wakeup = 1;
1616 		curlen = 0;
1617 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
1618 		/*
1619 		 * Immediate mode is set, or the read timeout has
1620 		 * already expired during a select call.  A packet
1621 		 * arrived, so the reader should be woken up.
1622 		 */
1623 		do_wakeup = 1;
1624 	}
1625 
1626 	/*
1627 	 * Append the bpf header to the existing buffer before we add
1628 	 * on the actual packet data.
1629 	 */
1630 	hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
1631 	hp->bh_tstamp.tv_sec = tv->tv_sec;
1632 	hp->bh_tstamp.tv_usec = tv->tv_usec;
1633 	hp->bh_datalen = pktlen;
1634 	hp->bh_hdrlen = (uint16_t)hdrlen;
1635 	/*
1636 	 * Copy the packet data into the store buffer and update its length.
1637 	 */
1638 	(*cpfn)((uchar_t *)hp + hdrlen, pkt,
1639 	    (hp->bh_caplen = totlen - hdrlen));
1640 	d->bd_slen = curlen + totlen;
1641 
1642 	/*
1643 	 * Call bpf_wakeup after bd_slen has been updated.
1644 	 */
1645 	if (do_wakeup)
1646 		bpf_wakeup(d);
1647 }
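
/*
 * For illustration, assuming bd_hdrlen is 18: a captured 60-byte packet
 * occupies 18 + 60 = 78 bytes in the store buffer, and the following
 * record begins BPF_WORDALIGN(78) = 80 bytes after it.
 */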
1648 
1649 /*
 * Allocate the store and free packet buffers for a descriptor.
1651  */
1652 static int
1653 bpf_allocbufs(struct bpf_d *d)
1654 {
1655 
1656 	d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1657 	if (!d->bd_fbuf)
1658 		return (ENOBUFS);
1659 	d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1660 	if (!d->bd_sbuf) {
1661 		kmem_free(d->bd_fbuf, d->bd_bufsize);
1662 		return (ENOBUFS);
1663 	}
1664 	d->bd_slen = 0;
1665 	d->bd_hlen = 0;
1666 	return (0);
1667 }
1668 
1669 /*
1670  * Free buffers currently in use by a descriptor.
1671  * Called on close.
1672  */
1673 static void
1674 bpf_freed(struct bpf_d *d)
1675 {
1676 	/*
1677 	 * At this point the descriptor has been detached from its
	 * interface and it has not yet been marked free.
1679 	 */
1680 	if (d->bd_sbuf != 0) {
1681 		kmem_free(d->bd_sbuf, d->bd_bufsize);
1682 		if (d->bd_hbuf != 0)
1683 			kmem_free(d->bd_hbuf, d->bd_bufsize);
1684 		if (d->bd_fbuf != 0)
1685 			kmem_free(d->bd_fbuf, d->bd_bufsize);
1686 	}
1687 	if (d->bd_filter)
1688 		kmem_free(d->bd_filter, d->bd_filter_size);
1689 }
1690 
1691 /*
1692  * Get a list of available data link type of the interface.
1693  */
1694 static int
1695 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *listp)
1696 {
1697 	bpf_provider_list_t *bp;
1698 	bpf_provider_t *bpr;
1699 	zoneid_t zoneid;
1700 	uintptr_t mcip;
1701 	uint_t nicdlt;
1702 	uintptr_t mh;
1703 	int error;
1704 	int n;
1705 
1706 	n = 0;
1707 	mh = 0;
1708 	mcip = 0;
1709 	error = 0;
1710 	mutex_enter(&d->bd_lock);
1711 	LIST_FOREACH(bp, &bpf_providers, bpl_next) {
1712 		bpr = bp->bpl_what;
1713 		error = MBPF_OPEN(bpr, d->bd_ifname, &mh, d->bd_zone);
1714 		if (error != 0)
1715 			goto next;
1716 		error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
1717 		if (error != 0)
1718 			goto next;
1719 		error = MBPF_GET_ZONE(bpr, mh, &zoneid);
1720 		if (error != 0)
1721 			goto next;
1722 		if (d->bd_zone != GLOBAL_ZONEID &&
1723 		    d->bd_zone != zoneid)
1724 			goto next;
1725 		error = MBPF_GET_DLT(bpr, mh, &nicdlt);
1726 		if (error != 0)
1727 			goto next;
1728 		nicdlt = bpf_dl_to_dlt(nicdlt);
1729 		if (listp->bfl_list != NULL) {
1730 			if (n >= listp->bfl_len) {
1731 				MBPF_CLIENT_CLOSE(bpr, mcip);
1732 				MBPF_CLOSE(bpr, mh);
1733 				break;
1734 			}
1735 			/*
1736 			 * Bumping of bd_inuse ensures the structure does not
1737 			 * disappear while the copyout runs and allows the for
1738 			 * loop to be continued.
1739 			 */
1740 			d->bd_inuse++;
1741 			mutex_exit(&d->bd_lock);
1742 			if (copyout(&nicdlt,
1743 			    listp->bfl_list + n, sizeof (uint_t)) != 0)
1744 				error = EFAULT;
			mutex_enter(&d->bd_lock);
			d->bd_inuse--;
			if (error != 0)
				break;
1749 		}
1750 		n++;
1751 next:
1752 		if (mcip != 0) {
1753 			MBPF_CLIENT_CLOSE(bpr, mcip);
1754 			mcip = 0;
1755 		}
1756 		if (mh != 0) {
1757 			MBPF_CLOSE(bpr, mh);
1758 			mh = 0;
1759 		}
1760 	}
1761 	mutex_exit(&d->bd_lock);
1762 
1763 	/*
 * It is quite possible that one or more providers to BPF may not
 * know about a link name whilst others do. In that case, so long
1766 	 * as we have one success, do not declare an error unless it was
1767 	 * an EFAULT as this indicates a problem that needs to be reported.
1768 	 */
1769 	if ((error != EFAULT) && (n > 0))
1770 		error = 0;
1771 
1772 	listp->bfl_len = n;
1773 	return (error);
1774 }
1775 
1776 /*
1777  * Set the data link type of a BPF instance.
1778  */
1779 static int
1780 bpf_setdlt(struct bpf_d *d, void *addr)
1781 {
1782 	char ifname[LIFNAMSIZ+1];
1783 	zoneid_t niczone;
1784 	int error;
1785 	int dlt;
1786 
1787 	if (copyin(addr, &dlt, sizeof (dlt)) != 0)
1788 		return (EFAULT);
1789 
1790 	mutex_enter(&d->bd_lock);
1791 
1792 	if (d->bd_bif == 0) {			/* Interface not set */
1793 		mutex_exit(&d->bd_lock);
1794 		return (EINVAL);
1795 	}
	if (d->bd_dlt == dlt) {	/* no-op */
1797 		mutex_exit(&d->bd_lock);
1798 		return (0);
1799 	}
1800 
1801 	error = MBPF_GET_ZONE(&d->bd_mac, d->bd_bif, &niczone);
1802 	if (error != 0) {
1803 		mutex_exit(&d->bd_lock);
1804 		return (error);
1805 	}
1806 
1807 	/*
1808 	 * See the matrix at the top of the file for the permissions table
1809 	 * enforced by this driver.
1810 	 */
1811 	if ((d->bd_zone != GLOBAL_ZONEID) && (dlt != DLT_IPNET) &&
1812 	    (niczone != d->bd_zone)) {
1813 		mutex_exit(&d->bd_lock);
1814 		return (EINVAL);
1815 	}
1816 
1817 	(void) strlcpy(ifname, d->bd_ifname, sizeof (ifname));
1818 	d->bd_inuse = -1;
1819 	bpf_detachd(d);
1820 	error = bpf_attachd(d, ifname, dlt);
1821 	reset_d(d);
1822 	d->bd_inuse = 0;
1823 
1824 	mutex_exit(&d->bd_lock);
1825 	return (error);
1826 }
1827 
1828 /*
1829  * bpf_clear_timeout is called with the bd_lock mutex held, providing it
1830  * with the necessary protection to retrieve and modify bd_callout but it
1831  * does not hold the lock for its entire duration... see below...
1832  */
1833 static void
1834 bpf_clear_timeout(struct bpf_d *d)
1835 {
1836 	timeout_id_t tid = d->bd_callout;
1837 	d->bd_callout = 0;
1838 	d->bd_inuse++;
1839 
1840 	/*
	 * If the timeout has fired and bpf_timed_out() is blocked on
	 * bd_lock, calling untimeout() with bd_lock held would deadlock:
	 * untimeout() waits for bpf_timed_out() to finish, which it never
	 * would. Hence the lock is dropped around the untimeout() call.
1844 	 */
1845 	if (tid != 0) {
1846 		mutex_exit(&d->bd_lock);
1847 		(void) untimeout(tid);
1848 		mutex_enter(&d->bd_lock);
1849 	}
1850 
1851 	d->bd_inuse--;
1852 }
1853 
1854 /*
1855  * As a cloning device driver, BPF needs to keep track of which device
1856  * numbers are in use and which ones are not. A hash table, indexed by
1857  * the minor device number, is used to store the pointers to the
1858  * individual descriptors that are allocated in bpfopen().
1859  * The functions below present the interface for that hash table to
1860  * the rest of the driver.
1861  */
1862 static struct bpf_d *
1863 bpf_dev_find(minor_t minor)
1864 {
1865 	struct bpf_d *d = NULL;
1866 
1867 	(void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1868 	    (mod_hash_val_t *)&d);
1869 
1870 	return (d);
1871 }
1872 
1873 static void
1874 bpf_dev_add(struct bpf_d *d)
1875 {
1876 	(void) mod_hash_insert(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1877 	    (mod_hash_val_t)d);
1878 }
1879 
1880 static void
1881 bpf_dev_remove(struct bpf_d *d)
1882 {
1883 	struct bpf_d *stor;
1884 
1885 	(void) mod_hash_remove(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1886 	    (mod_hash_val_t *)&stor);
1887 	ASSERT(stor == d);
1888 }
1889 
1890 /*
 * bpf_dev_get should only ever be called for a minor number that exists,
1892  * thus there should always be a pointer in the hash table that corresponds
1893  * to it.
1894  */
1895 static struct bpf_d *
1896 bpf_dev_get(minor_t minor)
1897 {
1898 	struct bpf_d *d = NULL;
1899 
1900 	(void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1901 	    (mod_hash_val_t *)&d);
1902 	ASSERT(d != NULL);
1903 
1904 	return (d);
1905 }
1906