xref: /illumos-gate/usr/src/uts/common/io/bpf/bpf.c (revision 08f1bbed5edd2a2e9c8be7b7424c32e67c2f3f2c)
1 /*	$NetBSD: bpf.c,v 1.143 2009/03/11 05:55:22 mrg Exp $	*/
2 
3 /*
4  * Copyright (c) 1990, 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from the Stanford/CMU enet packet filter,
8  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
9  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
10  * Berkeley Laboratory.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)bpf.c	8.4 (Berkeley) 1/9/95
37  * static char rcsid[] =
38  * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
39  */
40 /*
41  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
42  * Use is subject to license terms.
43  * Copyright 2017 Joyent, Inc.
44  */
45 
46 /*
47  * The BPF implements the following access controls for zones attempting
48  * to read and write data. Writing of data requires that the net_rawaccess
49  * privilege is held whilst reading data requires either net_rawaccess or
50  * net_observerability.
51  *
52  *                              | Shared |  Exclusive |   Global
53  * -----------------------------+--------+------------+------------+
54  * DLT_IPNET in local zone      |  Read  |    Read    |    Read    |
55  * -----------------------------+--------+------------+------------+
56  * Raw access to local zone NIC |  None  | Read/Write | Read/Write |
57  * -----------------------------+--------+------------+------------+
58  * Raw access to all NICs       |  None  |    None    | Read/Write |
59  * -----------------------------+--------+------------+------------+
60  *
61  * The BPF driver is written as a cloning driver: each call to bpfopen()
62  * allocates a new minor number. This provides BPF with a 1:1 relationship
63  * between open's and close's. There is some amount of "descriptor state"
64  * that is kept per open. Pointers to this data are stored in a hash table
65  * (bpf_hash) that is index'd by the minor device number for each open file.
66  */
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/time.h>
70 #include <sys/ioctl.h>
71 #include <sys/queue.h>
72 #include <sys/filio.h>
73 #include <sys/policy.h>
74 #include <sys/cmn_err.h>
75 #include <sys/uio.h>
76 #include <sys/file.h>
77 #include <sys/sysmacros.h>
78 #include <sys/zone.h>
79 
80 #include <sys/socket.h>
81 #include <sys/errno.h>
82 #include <sys/poll.h>
83 #include <sys/dlpi.h>
84 #include <sys/neti.h>
85 
86 #include <net/if.h>
87 
88 #include <net/bpf.h>
89 #include <net/bpfdesc.h>
90 #include <net/dlt.h>
91 
92 #include <netinet/in.h>
93 #include <sys/mac.h>
94 #include <sys/mac_client.h>
95 #include <sys/mac_impl.h>
96 #include <sys/time_std_impl.h>
97 #include <sys/hook.h>
98 #include <sys/hook_event.h>
99 
100 
/*
 * mblk_t accessors.
 *
 * mtod(_v, _t)	- the read pointer (b_rptr) of message block _v, cast to
 *		  type _t.
 * M_LEN(_m)	- the distance from b_rptr to b_wptr of message block _m.
 *
 * Both expansions are fully parenthesised so that each behaves as a single
 * operand inside a larger expression, e.g. mtod(m, uchar_t *)[0].
 */
#define	mtod(_v, _t)	((_t)((_v)->b_rptr))
#define	M_LEN(_m)	((_m)->b_wptr - (_m)->b_rptr)
103 
104 /*
105  * 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet
106  * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k).
107  */
108 #define	BPF_BUFSIZE (32 * 1024)
109 
110 typedef void *(*cp_fn_t)(void *, const void *, size_t);
111 
112 /*
113  * The default read buffer size, and limit for BIOCSBLEN.
114  */
115 int bpf_bufsize = BPF_BUFSIZE;
116 int bpf_maxbufsize = (16 * 1024 * 1024);
117 static mod_hash_t *bpf_hash = NULL;
118 
119 /*
120  * Use a mutex to avoid a race condition between gathering the stats/peers
121  * and opening/closing the device.
122  */
123 static kcondvar_t bpf_dlt_waiter;
124 static kmutex_t bpf_mtx;
125 static bpf_kstats_t ks_stats;
126 static bpf_kstats_t bpf_kstats = {
127 	{ "readWait",		KSTAT_DATA_UINT64 },
128 	{ "writeOk",		KSTAT_DATA_UINT64 },
129 	{ "writeError",		KSTAT_DATA_UINT64 },
130 	{ "receive",		KSTAT_DATA_UINT64 },
131 	{ "captured",		KSTAT_DATA_UINT64 },
132 	{ "dropped",		KSTAT_DATA_UINT64 },
133 };
134 static kstat_t *bpf_ksp;
135 
136 /*
137  *  bpf_list is a list of the BPF descriptors currently open
138  */
139 LIST_HEAD(, bpf_d) bpf_list;
140 
141 static int	bpf_allocbufs(struct bpf_d *);
142 static void	bpf_clear_timeout(struct bpf_d *);
143 static void	bpf_deliver(struct bpf_d *, cp_fn_t,
144 		    void *, uint_t, uint_t, boolean_t);
145 static void	bpf_freed(struct bpf_d *);
146 static int	bpf_ifname(struct bpf_d *d, char *, int);
147 static void	*bpf_mcpy(void *, const void *, size_t);
148 static int	bpf_attachd(struct bpf_d *, const char *, int);
149 static void	bpf_detachd(struct bpf_d *);
150 static int	bpf_setif(struct bpf_d *, char *, int);
151 static void	bpf_timed_out(void *);
152 static inline void
153 		bpf_wakeup(struct bpf_d *);
154 static void	catchpacket(struct bpf_d *, uchar_t *, uint_t, uint_t,
155 		    cp_fn_t, struct timeval *);
156 static void	reset_d(struct bpf_d *);
157 static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
158 static int	bpf_setdlt(struct bpf_d *, void *);
159 static void	bpf_dev_add(struct bpf_d *);
160 static struct bpf_d *bpf_dev_find(minor_t);
161 static struct bpf_d *bpf_dev_get(minor_t);
162 static void	bpf_dev_remove(struct bpf_d *);
163 
164 static int
165 bpf_movein(struct uio *uio, int linktype, int mtu, mblk_t **mp)
166 {
167 	mblk_t *m;
168 	int error;
169 	int len;
170 	int hlen;
171 	int align;
172 
173 	/*
174 	 * Build a sockaddr based on the data link layer type.
175 	 * We do this at this level because the ethernet header
176 	 * is copied directly into the data field of the sockaddr.
177 	 * In the case of SLIP, there is no header and the packet
178 	 * is forwarded as is.
179 	 * Also, we are careful to leave room at the front of the mbuf
180 	 * for the link level header.
181 	 */
182 	switch (linktype) {
183 
184 	case DLT_EN10MB:
185 		hlen = sizeof (struct ether_header);
186 		break;
187 
188 	case DLT_FDDI:
189 		hlen = 16;
190 		break;
191 
192 	case DLT_NULL:
193 		hlen = 0;
194 		break;
195 
196 	case DLT_IPOIB:
197 		hlen = 44;
198 		break;
199 
200 	default:
201 		return (EIO);
202 	}
203 
204 	align = 4 - (hlen & 3);
205 
206 	len = uio->uio_resid;
207 	/*
208 	 * If there aren't enough bytes for a link level header or the
209 	 * packet length exceeds the interface mtu, return an error.
210 	 */
211 	if (len < hlen || len - hlen > mtu)
212 		return (EMSGSIZE);
213 
214 	m = allocb(len + align, BPRI_MED);
215 	if (m == NULL) {
216 		error = ENOBUFS;
217 		goto bad;
218 	}
219 
220 	/* Insure the data is properly aligned */
221 	if (align > 0)
222 		m->b_rptr += align;
223 	m->b_wptr = m->b_rptr + len;
224 
225 	error = uiomove(mtod(m, void *), len, UIO_WRITE, uio);
226 	if (error)
227 		goto bad;
228 	*mp = m;
229 	return (0);
230 
231 bad:
232 	if (m != NULL)
233 		freemsg(m);
234 	return (error);
235 }
236 
237 
238 /*
239  * Attach file to the bpf interface, i.e. make d listen on bp.
240  */
241 static int
242 bpf_attachd(struct bpf_d *d, const char *ifname, int dlt)
243 {
244 	bpf_provider_list_t *bp;
245 	bpf_provider_t *bpr;
246 	boolean_t zonematch;
247 	zoneid_t niczone;
248 	uintptr_t mcip;
249 	zoneid_t zone;
250 	uint_t nicdlt;
251 	uintptr_t mh;
252 	int hdrlen;
253 	int error;
254 
255 	ASSERT(d->bd_bif == NULL);
256 	ASSERT(d->bd_mcip == NULL);
257 	zone = d->bd_zone;
258 	zonematch = B_TRUE;
259 again:
260 	mh = 0;
261 	mcip = 0;
262 	LIST_FOREACH(bp, &bpf_providers, bpl_next) {
263 		bpr = bp->bpl_what;
264 		error = MBPF_OPEN(bpr, ifname, &mh, zone);
265 		if (error != 0)
266 			goto next;
267 		error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
268 		if (error != 0)
269 			goto next;
270 		error = MBPF_GET_DLT(bpr, mh, &nicdlt);
271 		if (error != 0)
272 			goto next;
273 
274 		nicdlt = bpf_dl_to_dlt(nicdlt);
275 		if (dlt != -1 && dlt != nicdlt) {
276 			error = ENOENT;
277 			goto next;
278 		}
279 
280 		error = MBPF_GET_ZONE(bpr, mh, &niczone);
281 		if (error != 0)
282 			goto next;
283 
284 		DTRACE_PROBE4(bpf__attach, struct bpf_provider_s *, bpr,
285 		    uintptr_t, mh, int, nicdlt, zoneid_t, niczone);
286 
287 		if (zonematch && niczone != zone) {
288 			error = ENOENT;
289 			goto next;
290 		}
291 		break;
292 next:
293 		if (mcip != 0) {
294 			MBPF_CLIENT_CLOSE(bpr, mcip);
295 			mcip = 0;
296 		}
297 		if (mh != 0) {
298 			MBPF_CLOSE(bpr, mh);
299 			mh = 0;
300 		}
301 	}
302 	if (error != 0) {
303 		if (zonematch && (zone == GLOBAL_ZONEID)) {
304 			/*
305 			 * If we failed to do an exact match for the global
306 			 * zone using the global zoneid, try again in case
307 			 * the network interface is owned by a local zone.
308 			 */
309 			zonematch = B_FALSE;
310 			goto again;
311 		}
312 		return (error);
313 	}
314 
315 	d->bd_mac = *bpr;
316 	d->bd_mcip = mcip;
317 	d->bd_bif = mh;
318 	d->bd_dlt = nicdlt;
319 	hdrlen = bpf_dl_hdrsize(nicdlt);
320 	d->bd_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
321 
322 	(void) strlcpy(d->bd_ifname, MBPF_CLIENT_NAME(&d->bd_mac, mcip),
323 	    sizeof (d->bd_ifname));
324 
325 	(void) MBPF_GET_LINKID(&d->bd_mac, d->bd_ifname, &d->bd_linkid,
326 	    zone);
327 	(void) MBPF_PROMISC_ADD(&d->bd_mac, d->bd_mcip, 0, d,
328 	    &d->bd_promisc_handle, d->bd_promisc_flags);
329 	return (0);
330 }
331 
332 /*
333  * Detach a file from its interface.
334  */
335 static void
336 bpf_detachd(struct bpf_d *d)
337 {
338 	uintptr_t mph;
339 	uintptr_t mch;
340 	uintptr_t mh;
341 
342 	ASSERT(d->bd_inuse == -1);
343 	mch = d->bd_mcip;
344 	d->bd_mcip = 0;
345 	mh = d->bd_bif;
346 	d->bd_bif = 0;
347 
348 	/*
349 	 * Check if this descriptor had requested promiscuous mode.
350 	 * If so, turn it off. There's no need to take any action
351 	 * here, that is done when MBPF_PROMISC_REMOVE is used;
352 	 * bd_promisc is just a local flag to stop promiscuous mode
353 	 * from being set more than once.
354 	 */
355 	if (d->bd_promisc)
356 		d->bd_promisc = 0;
357 
358 	/*
359 	 * Take device out of "promiscuous" mode.  Since we were able to
360 	 * enter "promiscuous" mode, we should be able to turn it off.
361 	 * Note, this field stores a pointer used to support both
362 	 * promiscuous and non-promiscuous callbacks for packets.
363 	 */
364 	mph = d->bd_promisc_handle;
365 	d->bd_promisc_handle = 0;
366 
367 	/*
368 	 * The lock has to be dropped here because mac_promisc_remove may
369 	 * need to wait for mac_promisc_dispatch, which has called into
370 	 * bpf and catchpacket is waiting for bd_lock...
371 	 * i.e mac_promisc_remove() needs to be called with none of the
372 	 * locks held that are part of the bpf_mtap() call path.
373 	 */
374 	mutex_exit(&d->bd_lock);
375 	if (mph != 0)
376 		MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
377 
378 	if (mch != 0)
379 		MBPF_CLIENT_CLOSE(&d->bd_mac, mch);
380 
381 	if (mh != 0)
382 		MBPF_CLOSE(&d->bd_mac, mh);
383 
384 	/*
385 	 * Because this function is called with bd_lock held, so it must
386 	 * exit with it held.
387 	 */
388 	mutex_enter(&d->bd_lock);
389 	*d->bd_ifname = '\0';
390 	(void) memset(&d->bd_mac, 0, sizeof (d->bd_mac));
391 }
392 
393 
394 /*
395  * bpfilterattach() is called at load time.
396  */
397 int
398 bpfilterattach(void)
399 {
400 
401 	bpf_hash = mod_hash_create_idhash("bpf_dev_tab", 31,
402 	    mod_hash_null_keydtor);
403 	if (bpf_hash == NULL)
404 		return (ENOMEM);
405 
406 	(void) memcpy(&ks_stats, &bpf_kstats, sizeof (bpf_kstats));
407 
408 	bpf_ksp = kstat_create("bpf", 0, "global", "misc",
409 	    KSTAT_TYPE_NAMED, sizeof (bpf_kstats) / sizeof (kstat_named_t),
410 	    KSTAT_FLAG_VIRTUAL);
411 	if (bpf_ksp != NULL) {
412 		bpf_ksp->ks_data = &ks_stats;
413 		kstat_install(bpf_ksp);
414 	} else {
415 		mod_hash_destroy_idhash(bpf_hash);
416 		bpf_hash = NULL;
417 		return (EEXIST);
418 	}
419 
420 	cv_init(&bpf_dlt_waiter, NULL, CV_DRIVER, NULL);
421 	mutex_init(&bpf_mtx, NULL, MUTEX_DRIVER, NULL);
422 
423 	LIST_INIT(&bpf_list);
424 
425 	return (0);
426 }
427 
428 
429 /*
430  * bpfilterdetach() is called at unload time.
431  */
432 int
433 bpfilterdetach(void)
434 {
435 
436 	if (bpf_ksp != NULL) {
437 		kstat_delete(bpf_ksp);
438 		bpf_ksp = NULL;
439 	}
440 
441 	mod_hash_destroy_idhash(bpf_hash);
442 	bpf_hash = NULL;
443 
444 	cv_destroy(&bpf_dlt_waiter);
445 	mutex_destroy(&bpf_mtx);
446 
447 	return (0);
448 }
449 
450 /*
451  * Open ethernet device. Clones.
452  */
453 /* ARGSUSED */
454 int
455 bpfopen(dev_t *devp, int flag, int mode, cred_t *cred)
456 {
457 	struct bpf_d *d;
458 	uint_t dmin;
459 
460 	/*
461 	 * The security policy described at the top of this file is
462 	 * enforced here.
463 	 */
464 	if ((flag & FWRITE) != 0) {
465 		if (secpolicy_net_rawaccess(cred) != 0)
466 			return (EACCES);
467 	}
468 
469 	if ((flag & FREAD) != 0) {
470 		if ((secpolicy_net_observability(cred) != 0) &&
471 		    (secpolicy_net_rawaccess(cred) != 0))
472 			return (EACCES);
473 	}
474 
475 	if ((flag & (FWRITE|FREAD)) == 0)
476 		return (ENXIO);
477 
478 	/*
479 	 * A structure is allocated per open file in BPF to store settings
480 	 * such as buffer capture size, provide private buffers, etc.
481 	 */
482 	d = (struct bpf_d *)kmem_zalloc(sizeof (*d), KM_SLEEP);
483 	d->bd_bufsize = bpf_bufsize;
484 	d->bd_fmode = flag;
485 	d->bd_zone = crgetzoneid(cred);
486 	d->bd_seesent = 1;
487 	d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_PHYS|
488 	    MAC_PROMISC_FLAGS_NO_COPY;
489 	mutex_init(&d->bd_lock, NULL, MUTEX_DRIVER, NULL);
490 	cv_init(&d->bd_wait, NULL, CV_DRIVER, NULL);
491 
492 	mutex_enter(&bpf_mtx);
493 	/*
494 	 * Find an unused minor number. Obviously this is an O(n) algorithm
495 	 * and doesn't scale particularly well, so if there are large numbers
496 	 * of open file descriptors happening in real use, this design may
497 	 * need to be revisited.
498 	 */
499 	for (dmin = 0; dmin < L_MAXMIN; dmin++)
500 		if (bpf_dev_find(dmin) == NULL)
501 			break;
502 	if (dmin == L_MAXMIN) {
503 		mutex_exit(&bpf_mtx);
504 		kmem_free(d, sizeof (*d));
505 		return (ENXIO);
506 	}
507 	d->bd_dev = dmin;
508 	LIST_INSERT_HEAD(&bpf_list, d, bd_list);
509 	bpf_dev_add(d);
510 	mutex_exit(&bpf_mtx);
511 
512 	*devp = makedevice(getmajor(*devp), dmin);
513 
514 	return (0);
515 }
516 
517 /*
518  * Close the descriptor by detaching it from its interface,
519  * deallocating its buffers, and marking it free.
520  *
521  * Because we only allow a device to be opened once, there is always a
522  * 1 to 1 relationship between opens and closes supporting this function.
523  */
524 /* ARGSUSED */
525 int
526 bpfclose(dev_t dev, int flag, int otyp, cred_t *cred_p)
527 {
528 	struct bpf_d *d = bpf_dev_get(getminor(dev));
529 
530 	mutex_enter(&d->bd_lock);
531 
532 	while (d->bd_inuse != 0) {
533 		d->bd_waiting++;
534 		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
535 			d->bd_waiting--;
536 			mutex_exit(&d->bd_lock);
537 			return (EINTR);
538 		}
539 		d->bd_waiting--;
540 	}
541 
542 	d->bd_inuse = -1;
543 	if (d->bd_state == BPF_WAITING)
544 		bpf_clear_timeout(d);
545 	d->bd_state = BPF_IDLE;
546 	if (d->bd_bif)
547 		bpf_detachd(d);
548 	mutex_exit(&d->bd_lock);
549 
550 	mutex_enter(&bpf_mtx);
551 	LIST_REMOVE(d, bd_list);
552 	bpf_dev_remove(d);
553 	mutex_exit(&bpf_mtx);
554 
555 	mutex_enter(&d->bd_lock);
556 	mutex_destroy(&d->bd_lock);
557 	cv_destroy(&d->bd_wait);
558 
559 	bpf_freed(d);
560 	kmem_free(d, sizeof (*d));
561 
562 	return (0);
563 }
564 
565 /*
566  * Rotate the packet buffers in descriptor d.  Move the store buffer
567  * into the hold slot, and the free buffer into the store slot.
568  * Zero the length of the new store buffer.
569  */
570 #define	ROTATE_BUFFERS(d) \
571 	(d)->bd_hbuf = (d)->bd_sbuf; \
572 	(d)->bd_hlen = (d)->bd_slen; \
573 	(d)->bd_sbuf = (d)->bd_fbuf; \
574 	(d)->bd_slen = 0; \
575 	(d)->bd_fbuf = 0;
576 /*
577  *  bpfread - read next chunk of packets from buffers
578  */
579 /* ARGSUSED */
580 int
581 bpfread(dev_t dev, struct uio *uio, cred_t *cred)
582 {
583 	struct bpf_d *d = bpf_dev_get(getminor(dev));
584 	int timed_out;
585 	ulong_t delay;
586 	int error;
587 
588 	if ((d->bd_fmode & FREAD) == 0)
589 		return (EBADF);
590 
591 	/*
592 	 * Restrict application to use a buffer the same size as
593 	 * the kernel buffers.
594 	 */
595 	if (uio->uio_resid != d->bd_bufsize)
596 		return (EINVAL);
597 
598 	mutex_enter(&d->bd_lock);
599 	if (d->bd_state == BPF_WAITING)
600 		bpf_clear_timeout(d);
601 	timed_out = (d->bd_state == BPF_TIMED_OUT);
602 	d->bd_state = BPF_IDLE;
603 	/*
604 	 * If the hold buffer is empty, then do a timed sleep, which
605 	 * ends when the timeout expires or when enough packets
606 	 * have arrived to fill the store buffer.
607 	 */
608 	while (d->bd_hbuf == 0) {
609 		if (d->bd_nonblock) {
610 			if (d->bd_slen == 0) {
611 				mutex_exit(&d->bd_lock);
612 				return (EWOULDBLOCK);
613 			}
614 			ROTATE_BUFFERS(d);
615 			break;
616 		}
617 
618 		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
619 			/*
620 			 * A packet(s) either arrived since the previous
621 			 * read or arrived while we were asleep.
622 			 * Rotate the buffers and return what's here.
623 			 */
624 			ROTATE_BUFFERS(d);
625 			break;
626 		}
627 		ks_stats.kp_read_wait.value.ui64++;
628 		delay = ddi_get_lbolt() + d->bd_rtout;
629 		error = cv_timedwait_sig(&d->bd_wait, &d->bd_lock, delay);
630 		if (error == 0) {
631 			mutex_exit(&d->bd_lock);
632 			return (EINTR);
633 		}
634 		if (error == -1) {
635 			/*
636 			 * On a timeout, return what's in the buffer,
637 			 * which may be nothing.  If there is something
638 			 * in the store buffer, we can rotate the buffers.
639 			 */
640 			if (d->bd_hbuf)
641 				/*
642 				 * We filled up the buffer in between
643 				 * getting the timeout and arriving
644 				 * here, so we don't need to rotate.
645 				 */
646 				break;
647 
648 			if (d->bd_slen == 0) {
649 				mutex_exit(&d->bd_lock);
650 				return (0);
651 			}
652 			ROTATE_BUFFERS(d);
653 		}
654 	}
655 	/*
656 	 * At this point, we know we have something in the hold slot.
657 	 */
658 	mutex_exit(&d->bd_lock);
659 
660 	/*
661 	 * Move data from hold buffer into user space.
662 	 * We know the entire buffer is transferred since
663 	 * we checked above that the read buffer is bpf_bufsize bytes.
664 	 */
665 	error = uiomove(d->bd_hbuf, d->bd_hlen, UIO_READ, uio);
666 
667 	mutex_enter(&d->bd_lock);
668 	d->bd_fbuf = d->bd_hbuf;
669 	d->bd_hbuf = 0;
670 	d->bd_hlen = 0;
671 done:
672 	mutex_exit(&d->bd_lock);
673 	return (error);
674 }
675 
676 
677 /*
678  * If there are processes sleeping on this descriptor, wake them up.
679  * NOTE: the lock for bd_wait is bd_lock and is held by bpf_deliver,
680  * so there is no code here grabbing it.
681  */
682 static inline void
683 bpf_wakeup(struct bpf_d *d)
684 {
685 	cv_signal(&d->bd_wait);
686 }
687 
688 static void
689 bpf_timed_out(void *arg)
690 {
691 	struct bpf_d *d = arg;
692 
693 	mutex_enter(&d->bd_lock);
694 	if (d->bd_state == BPF_WAITING) {
695 		d->bd_state = BPF_TIMED_OUT;
696 		if (d->bd_slen != 0)
697 			cv_signal(&d->bd_wait);
698 	}
699 	mutex_exit(&d->bd_lock);
700 }
701 
702 
703 /* ARGSUSED */
704 int
705 bpfwrite(dev_t dev, struct uio *uio, cred_t *cred)
706 {
707 	struct bpf_d *d = bpf_dev_get(getminor(dev));
708 	uintptr_t mch;
709 	uint_t mtu;
710 	mblk_t *m;
711 	int error;
712 	int dlt;
713 
714 	if ((d->bd_fmode & FWRITE) == 0)
715 		return (EBADF);
716 
717 	mutex_enter(&d->bd_lock);
718 	if (d->bd_bif == 0 || d->bd_mcip == 0 || d->bd_bif == 0) {
719 		mutex_exit(&d->bd_lock);
720 		return (EINTR);
721 	}
722 
723 	if (uio->uio_resid == 0) {
724 		mutex_exit(&d->bd_lock);
725 		return (0);
726 	}
727 
728 	while (d->bd_inuse < 0) {
729 		d->bd_waiting++;
730 		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
731 			d->bd_waiting--;
732 			mutex_exit(&d->bd_lock);
733 			return (EINTR);
734 		}
735 		d->bd_waiting--;
736 	}
737 
738 	mutex_exit(&d->bd_lock);
739 
740 	dlt = d->bd_dlt;
741 	mch = d->bd_mcip;
742 	MBPF_SDU_GET(&d->bd_mac, d->bd_bif, &mtu);
743 	d->bd_inuse++;
744 
745 	m = NULL;
746 	if (dlt == DLT_IPNET) {
747 		error = EIO;
748 		goto done;
749 	}
750 
751 	error = bpf_movein(uio, dlt, mtu, &m);
752 	if (error)
753 		goto done;
754 
755 	DTRACE_PROBE4(bpf__tx, struct bpf_d *, d, int, dlt,
756 	    uint_t, mtu, mblk_t *, m);
757 
758 	if (M_LEN(m) > mtu) {
759 		error = EMSGSIZE;
760 		goto done;
761 	}
762 
763 	error = MBPF_TX(&d->bd_mac, mch, m);
764 	/*
765 	 * The "tx" action here is required to consume the mblk_t.
766 	 */
767 	m = NULL;
768 
769 done:
770 	if (error == 0)
771 		ks_stats.kp_write_ok.value.ui64++;
772 	else
773 		ks_stats.kp_write_error.value.ui64++;
774 	if (m != NULL)
775 		freemsg(m);
776 
777 	mutex_enter(&d->bd_lock);
778 	d->bd_inuse--;
779 	if ((d->bd_inuse == 0) && (d->bd_waiting != 0))
780 		cv_signal(&d->bd_wait);
781 	mutex_exit(&d->bd_lock);
782 
783 	/*
784 	 * The driver frees the mbuf.
785 	 */
786 	return (error);
787 }
788 
789 
790 /*
791  * Reset a descriptor by flushing its packet buffer and clearing the
792  * receive and drop counts.  Should be called at splnet.
793  */
794 static void
795 reset_d(struct bpf_d *d)
796 {
797 	if (d->bd_hbuf) {
798 		/* Free the hold buffer. */
799 		d->bd_fbuf = d->bd_hbuf;
800 		d->bd_hbuf = 0;
801 	}
802 	d->bd_slen = 0;
803 	d->bd_hlen = 0;
804 	d->bd_rcount = 0;
805 	d->bd_dcount = 0;
806 	d->bd_ccount = 0;
807 }
808 
809 /*
810  *  FIONREAD		Check for read packet available.
811  *  BIOCGBLEN		Get buffer len [for read()].
812  *  BIOCSETF		Set ethernet read filter.
813  *  BIOCFLUSH		Flush read packet buffer.
814  *  BIOCPROMISC		Put interface into promiscuous mode.
815  *  BIOCGDLT		Get link layer type.
816  *  BIOCGETIF		Get interface name.
817  *  BIOCSETIF		Set interface.
818  *  BIOCSRTIMEOUT	Set read timeout.
819  *  BIOCGRTIMEOUT	Get read timeout.
820  *  BIOCGSTATS		Get packet stats.
821  *  BIOCIMMEDIATE	Set immediate mode.
822  *  BIOCVERSION		Get filter language version.
823  *  BIOCGHDRCMPLT	Get "header already complete" flag.
824  *  BIOCSHDRCMPLT	Set "header already complete" flag.
825  */
826 /* ARGSUSED */
827 int
828 bpfioctl(dev_t dev, int cmd, intptr_t addr, int mode, cred_t *cred, int *rval)
829 {
830 	struct bpf_d *d = bpf_dev_get(getminor(dev));
831 	struct bpf_program prog;
832 	struct lifreq lifreq;
833 	struct ifreq ifreq;
834 	int error = 0;
835 	uint_t size;
836 
837 	/*
838 	 * Refresh the PID associated with this bpf file.
839 	 */
840 	mutex_enter(&d->bd_lock);
841 	if (d->bd_state == BPF_WAITING)
842 		bpf_clear_timeout(d);
843 	d->bd_state = BPF_IDLE;
844 	mutex_exit(&d->bd_lock);
845 
846 	switch (cmd) {
847 
848 	default:
849 		error = EINVAL;
850 		break;
851 
852 	/*
853 	 * Check for read packet available.
854 	 */
855 	case FIONREAD:
856 		{
857 			int n;
858 
859 			mutex_enter(&d->bd_lock);
860 			n = d->bd_slen;
861 			if (d->bd_hbuf)
862 				n += d->bd_hlen;
863 			mutex_exit(&d->bd_lock);
864 
865 			*(int *)addr = n;
866 			break;
867 		}
868 
869 	/*
870 	 * Get buffer len [for read()].
871 	 */
872 	case BIOCGBLEN:
873 		error = copyout(&d->bd_bufsize, (void *)addr,
874 		    sizeof (d->bd_bufsize));
875 		break;
876 
877 	/*
878 	 * Set buffer length.
879 	 */
880 	case BIOCSBLEN:
881 		if (copyin((void *)addr, &size, sizeof (size)) != 0) {
882 			error = EFAULT;
883 			break;
884 		}
885 
886 		mutex_enter(&d->bd_lock);
887 		if (d->bd_bif != 0) {
888 			error = EINVAL;
889 		} else {
890 			if (size > bpf_maxbufsize)
891 				size = bpf_maxbufsize;
892 			else if (size < BPF_MINBUFSIZE)
893 				size = BPF_MINBUFSIZE;
894 
895 			d->bd_bufsize = size;
896 		}
897 		mutex_exit(&d->bd_lock);
898 
899 		if (error == 0)
900 			error = copyout(&size, (void *)addr, sizeof (size));
901 		break;
902 
903 	/*
904 	 * Set link layer read filter.
905 	 */
906 	case BIOCSETF:
907 		if (ddi_copyin((void *)addr, &prog, sizeof (prog), mode)) {
908 			error = EFAULT;
909 			break;
910 		}
911 		error = bpf_setf(d, &prog);
912 		break;
913 
914 	/*
915 	 * Flush read packet buffer.
916 	 */
917 	case BIOCFLUSH:
918 		mutex_enter(&d->bd_lock);
919 		reset_d(d);
920 		mutex_exit(&d->bd_lock);
921 		break;
922 
923 	/*
924 	 * Put interface into promiscuous mode.
925 	 * This is a one-way ioctl, it is not used to turn promiscuous
926 	 * mode off.
927 	 */
928 	case BIOCPROMISC:
929 		if (d->bd_bif == 0) {
930 			/*
931 			 * No interface attached yet.
932 			 */
933 			error = EINVAL;
934 			break;
935 		}
936 		mutex_enter(&d->bd_lock);
937 		if (d->bd_promisc == 0) {
938 
939 			if (d->bd_promisc_handle) {
940 				uintptr_t mph;
941 
942 				mph = d->bd_promisc_handle;
943 				d->bd_promisc_handle = 0;
944 
945 				mutex_exit(&d->bd_lock);
946 				MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
947 				mutex_enter(&d->bd_lock);
948 			}
949 
950 			d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_COPY;
951 			error = MBPF_PROMISC_ADD(&d->bd_mac,
952 			    d->bd_mcip, MAC_CLIENT_PROMISC_ALL, d,
953 			    &d->bd_promisc_handle, d->bd_promisc_flags);
954 			if (error == 0)
955 				d->bd_promisc = 1;
956 		}
957 		mutex_exit(&d->bd_lock);
958 		break;
959 
960 	/*
961 	 * Get device parameters.
962 	 */
963 	case BIOCGDLT:
964 		if (d->bd_bif == 0)
965 			error = EINVAL;
966 		else
967 			error = copyout(&d->bd_dlt, (void *)addr,
968 			    sizeof (d->bd_dlt));
969 		break;
970 
971 	/*
972 	 * Get a list of supported device parameters.
973 	 */
974 	case BIOCGDLTLIST:
975 		if (d->bd_bif == 0) {
976 			error = EINVAL;
977 		} else {
978 			struct bpf_dltlist list;
979 
980 			if (copyin((void *)addr, &list, sizeof (list)) != 0) {
981 				error = EFAULT;
982 				break;
983 			}
984 			error = bpf_getdltlist(d, &list);
985 			if ((error == 0) &&
986 			    copyout(&list, (void *)addr, sizeof (list)) != 0)
987 				error = EFAULT;
988 		}
989 		break;
990 
991 	/*
992 	 * Set device parameters.
993 	 */
994 	case BIOCSDLT:
995 		error = bpf_setdlt(d, (void *)addr);
996 		break;
997 
998 	/*
999 	 * Get interface name.
1000 	 */
1001 	case BIOCGETIF:
1002 		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1003 			error = EFAULT;
1004 			break;
1005 		}
1006 		error = bpf_ifname(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1007 		if ((error == 0) &&
1008 		    copyout(&ifreq, (void *)addr, sizeof (ifreq)) != 0) {
1009 			error = EFAULT;
1010 			break;
1011 		}
1012 		break;
1013 
1014 	/*
1015 	 * Set interface.
1016 	 */
1017 	case BIOCSETIF:
1018 		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1019 			error = EFAULT;
1020 			break;
1021 		}
1022 		error = bpf_setif(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1023 		break;
1024 
1025 	/*
1026 	 * Get interface name.
1027 	 */
1028 	case BIOCGETLIF:
1029 		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1030 			error = EFAULT;
1031 			break;
1032 		}
1033 		error = bpf_ifname(d, lifreq.lifr_name,
1034 		    sizeof (lifreq.lifr_name));
1035 		if ((error == 0) &&
1036 		    copyout(&lifreq, (void *)addr, sizeof (lifreq)) != 0) {
1037 			error = EFAULT;
1038 			break;
1039 		}
1040 		break;
1041 
1042 	/*
1043 	 * Set interface.
1044 	 */
1045 	case BIOCSETLIF:
1046 		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1047 			error = EFAULT;
1048 			break;
1049 		}
1050 		error = bpf_setif(d, lifreq.lifr_name,
1051 		    sizeof (lifreq.lifr_name));
1052 		break;
1053 
1054 #ifdef _SYSCALL32_IMPL
1055 	/*
1056 	 * Set read timeout.
1057 	 */
1058 	case BIOCSRTIMEOUT32:
1059 		{
1060 			struct timeval32 tv;
1061 
1062 			if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1063 				error = EFAULT;
1064 				break;
1065 			}
1066 
1067 			/* Convert the timeout in microseconds to ticks */
1068 			d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1069 			    tv.tv_usec);
1070 			if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1071 				d->bd_rtout = 1;
1072 			break;
1073 		}
1074 
1075 	/*
1076 	 * Get read timeout.
1077 	 */
1078 	case BIOCGRTIMEOUT32:
1079 		{
1080 			struct timeval32 tv;
1081 			clock_t ticks;
1082 
1083 			ticks = drv_hztousec(d->bd_rtout);
1084 			tv.tv_sec = ticks / 1000000;
1085 			tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1086 			error = copyout(&tv, (void *)addr, sizeof (tv));
1087 			break;
1088 		}
1089 
1090 	/*
1091 	 * Get a list of supported device parameters.
1092 	 */
1093 	case BIOCGDLTLIST32:
1094 		if (d->bd_bif == 0) {
1095 			error = EINVAL;
1096 		} else {
1097 			struct bpf_dltlist32 lst32;
1098 			struct bpf_dltlist list;
1099 
1100 			if (copyin((void *)addr, &lst32, sizeof (lst32)) != 0) {
1101 				error = EFAULT;
1102 				break;
1103 			}
1104 
1105 			list.bfl_len = lst32.bfl_len;
1106 			list.bfl_list = (void *)(uint64_t)lst32.bfl_list;
1107 			error = bpf_getdltlist(d, &list);
1108 			if (error == 0) {
1109 				lst32.bfl_len = list.bfl_len;
1110 
1111 				if (copyout(&lst32, (void *)addr,
1112 				    sizeof (lst32)) != 0)
1113 					error = EFAULT;
1114 			}
1115 		}
1116 		break;
1117 
1118 	/*
1119 	 * Set link layer read filter.
1120 	 */
1121 	case BIOCSETF32: {
1122 		struct bpf_program32 prog32;
1123 
1124 		if (ddi_copyin((void *)addr, &prog32, sizeof (prog), mode)) {
1125 			error = EFAULT;
1126 			break;
1127 		}
1128 		prog.bf_len = prog32.bf_len;
1129 		prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
1130 		error = bpf_setf(d, &prog);
1131 		break;
1132 	}
1133 #endif
1134 
1135 	/*
1136 	 * Set read timeout.
1137 	 */
1138 	case BIOCSRTIMEOUT:
1139 		{
1140 			struct timeval tv;
1141 
1142 			if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1143 				error = EFAULT;
1144 				break;
1145 			}
1146 
1147 			/* Convert the timeout in microseconds to ticks */
1148 			d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1149 			    tv.tv_usec);
1150 			if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1151 				d->bd_rtout = 1;
1152 			break;
1153 		}
1154 
1155 	/*
1156 	 * Get read timeout.
1157 	 */
1158 	case BIOCGRTIMEOUT:
1159 		{
1160 			struct timeval tv;
1161 			clock_t ticks;
1162 
1163 			ticks = drv_hztousec(d->bd_rtout);
1164 			tv.tv_sec = ticks / 1000000;
1165 			tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1166 			if (copyout(&tv, (void *)addr, sizeof (tv)) != 0)
1167 				error = EFAULT;
1168 			break;
1169 		}
1170 
1171 	/*
1172 	 * Get packet stats.
1173 	 */
1174 	case BIOCGSTATS:
1175 		{
1176 			struct bpf_stat bs;
1177 
1178 			bs.bs_recv = d->bd_rcount;
1179 			bs.bs_drop = d->bd_dcount;
1180 			bs.bs_capt = d->bd_ccount;
1181 			if (copyout(&bs, (void *)addr, sizeof (bs)) != 0)
1182 				error = EFAULT;
1183 			break;
1184 		}
1185 
1186 	/*
1187 	 * Set immediate mode.
1188 	 */
1189 	case BIOCIMMEDIATE:
1190 		if (copyin((void *)addr, &d->bd_immediate,
1191 		    sizeof (d->bd_immediate)) != 0)
1192 			error = EFAULT;
1193 		break;
1194 
1195 	case BIOCVERSION:
1196 		{
1197 			struct bpf_version bv;
1198 
1199 			bv.bv_major = BPF_MAJOR_VERSION;
1200 			bv.bv_minor = BPF_MINOR_VERSION;
1201 			if (copyout(&bv, (void *)addr, sizeof (bv)) != 0)
1202 				error = EFAULT;
1203 			break;
1204 		}
1205 
1206 	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
1207 		if (copyout(&d->bd_hdrcmplt, (void *)addr,
1208 		    sizeof (d->bd_hdrcmplt)) != 0)
1209 			error = EFAULT;
1210 		break;
1211 
1212 	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
1213 		if (copyin((void *)addr, &d->bd_hdrcmplt,
1214 		    sizeof (d->bd_hdrcmplt)) != 0)
1215 			error = EFAULT;
1216 		break;
1217 
1218 	/*
1219 	 * Get "see sent packets" flag
1220 	 */
1221 	case BIOCGSEESENT:
1222 		if (copyout(&d->bd_seesent, (void *)addr,
1223 		    sizeof (d->bd_seesent)) != 0)
1224 			error = EFAULT;
1225 		break;
1226 
1227 	/*
1228 	 * Set "see sent" packets flag
1229 	 */
1230 	case BIOCSSEESENT:
1231 		if (copyin((void *)addr, &d->bd_seesent,
1232 		    sizeof (d->bd_seesent)) != 0)
1233 			error = EFAULT;
1234 		break;
1235 
1236 	case FIONBIO:		/* Non-blocking I/O */
1237 		if (copyin((void *)addr, &d->bd_nonblock,
1238 		    sizeof (d->bd_nonblock)) != 0)
1239 			error = EFAULT;
1240 		break;
1241 	}
1242 	return (error);
1243 }
1244 
1245 /*
1246  * Set d's packet filter program to fp.  If this file already has a filter,
1247  * free it and replace it. If the new filter is "empty" (has a 0 size), then
1248  * the result is to just remove and free the existing filter.
1249  * Returns EINVAL for bogus requests.
1250  */
1251 int
1252 bpf_setf(struct bpf_d *d, struct bpf_program *fp)
1253 {
1254 	struct bpf_insn *fcode, *old;
1255 	uint_t flen, size;
1256 	size_t oldsize;
1257 
1258 	if (fp->bf_insns == 0) {
1259 		if (fp->bf_len != 0)
1260 			return (EINVAL);
1261 		mutex_enter(&d->bd_lock);
1262 		old = d->bd_filter;
1263 		oldsize = d->bd_filter_size;
1264 		d->bd_filter = 0;
1265 		d->bd_filter_size = 0;
1266 		reset_d(d);
1267 		mutex_exit(&d->bd_lock);
1268 		if (old != 0)
1269 			kmem_free(old, oldsize);
1270 		return (0);
1271 	}
1272 	flen = fp->bf_len;
1273 	if (flen > BPF_MAXINSNS)
1274 		return (EINVAL);
1275 
1276 	size = flen * sizeof (*fp->bf_insns);
1277 	fcode = kmem_alloc(size, KM_SLEEP);
1278 	if (copyin(fp->bf_insns, fcode, size) != 0)
1279 		return (EFAULT);
1280 
1281 	if (bpf_validate(fcode, (int)flen)) {
1282 		mutex_enter(&d->bd_lock);
1283 		old = d->bd_filter;
1284 		oldsize = d->bd_filter_size;
1285 		d->bd_filter = fcode;
1286 		d->bd_filter_size = size;
1287 		reset_d(d);
1288 		mutex_exit(&d->bd_lock);
1289 		if (old != 0)
1290 			kmem_free(old, oldsize);
1291 
1292 		return (0);
1293 	}
1294 	kmem_free(fcode, size);
1295 	return (EINVAL);
1296 }
1297 
1298 /*
1299  * Detach a file from its current interface (if attached at all) and attach
1300  * to the interface indicated by the name stored in ifname.
1301  * Return an errno or 0.
1302  */
1303 static int
1304 bpf_setif(struct bpf_d *d, char *ifname, int namesize)
1305 {
1306 	int unit_seen;
1307 	int error = 0;
1308 	char *cp;
1309 	int i;
1310 
1311 	/*
1312 	 * Make sure the provided name has a unit number, and default
1313 	 * it to '0' if not specified.
1314 	 * XXX This is ugly ... do this differently?
1315 	 */
1316 	unit_seen = 0;
1317 	cp = ifname;
1318 	cp[namesize - 1] = '\0';	/* sanity */
1319 	while (*cp++)
1320 		if (*cp >= '0' && *cp <= '9')
1321 			unit_seen = 1;
1322 	if (!unit_seen) {
1323 		/* Make sure to leave room for the '\0'. */
1324 		for (i = 0; i < (namesize - 1); ++i) {
1325 			if ((ifname[i] >= 'a' && ifname[i] <= 'z') ||
1326 			    (ifname[i] >= 'A' && ifname[i] <= 'Z'))
1327 				continue;
1328 			ifname[i] = '0';
1329 		}
1330 	}
1331 
1332 	/*
1333 	 * Make sure that only one call to this function happens at a time
1334 	 * and that we're not interleaving a read/write
1335 	 */
1336 	mutex_enter(&d->bd_lock);
1337 	while (d->bd_inuse != 0) {
1338 		d->bd_waiting++;
1339 		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
1340 			d->bd_waiting--;
1341 			mutex_exit(&d->bd_lock);
1342 			return (EINTR);
1343 		}
1344 		d->bd_waiting--;
1345 	}
1346 	d->bd_inuse = -1;
1347 	mutex_exit(&d->bd_lock);
1348 
1349 	if (d->bd_sbuf == 0)
1350 		error = bpf_allocbufs(d);
1351 
1352 	if (error == 0) {
1353 		mutex_enter(&d->bd_lock);
1354 		if (d->bd_bif)
1355 			/*
1356 			 * Detach if attached to something else.
1357 			 */
1358 			bpf_detachd(d);
1359 
1360 		error = bpf_attachd(d, ifname, -1);
1361 		reset_d(d);
1362 		d->bd_inuse = 0;
1363 		if (d->bd_waiting != 0)
1364 			cv_signal(&d->bd_wait);
1365 		mutex_exit(&d->bd_lock);
1366 		return (error);
1367 	}
1368 
1369 	mutex_enter(&d->bd_lock);
1370 	d->bd_inuse = 0;
1371 	if (d->bd_waiting != 0)
1372 		cv_signal(&d->bd_wait);
1373 	mutex_exit(&d->bd_lock);
1374 
1375 	/*
1376 	 * Try tickle the mac layer into attaching the device...
1377 	 */
1378 	return (bpf_provider_tickle(ifname, d->bd_zone));
1379 }
1380 
1381 /*
1382  * Copy the interface name to the ifreq.
1383  */
1384 static int
1385 bpf_ifname(struct bpf_d *d, char *buffer, int bufsize)
1386 {
1387 
1388 	mutex_enter(&d->bd_lock);
1389 	if (d->bd_bif == 0) {
1390 		mutex_exit(&d->bd_lock);
1391 		return (EINVAL);
1392 	}
1393 
1394 	(void) strlcpy(buffer, d->bd_ifname, bufsize);
1395 	mutex_exit(&d->bd_lock);
1396 
1397 	return (0);
1398 }
1399 
1400 /* ARGSUSED */
1401 int
1402 bpfchpoll(dev_t dev, short events, int anyyet, short *reventsp,
1403     struct pollhead **phpp)
1404 {
1405 	struct bpf_d *d = bpf_dev_get(getminor(dev));
1406 
1407 	/*
1408 	 * Until this driver is modified to issue proper pollwakeup() calls on
1409 	 * its pollhead, edge-triggered polling is not allowed.
1410 	 */
1411 	if (events & POLLET) {
1412 		return (EPERM);
1413 	}
1414 
1415 	if (events & (POLLIN | POLLRDNORM)) {
1416 		/*
1417 		 * An imitation of the FIONREAD ioctl code.
1418 		 */
1419 		mutex_enter(&d->bd_lock);
1420 		if (d->bd_hlen != 0 ||
1421 		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
1422 		    d->bd_slen != 0)) {
1423 			*reventsp |= events & (POLLIN | POLLRDNORM);
1424 		} else {
1425 			/*
1426 			 * Until the bpf driver has been updated to include
1427 			 * adequate pollwakeup() logic, no pollhead will be
1428 			 * emitted here, preventing the resource from being
1429 			 * cached by poll()/devpoll/epoll.
1430 			 */
1431 			*reventsp = 0;
1432 			/* Start the read timeout if necessary */
1433 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
1434 				bpf_clear_timeout(d);
1435 				/*
1436 				 * Only allow the timeout to be set once.
1437 				 */
1438 				if (d->bd_callout == 0)
1439 					d->bd_callout = timeout(bpf_timed_out,
1440 					    d, d->bd_rtout);
1441 				d->bd_state = BPF_WAITING;
1442 			}
1443 		}
1444 		mutex_exit(&d->bd_lock);
1445 	}
1446 
1447 	return (0);
1448 }
1449 
1450 /*
1451  * Copy data from an mblk_t chain into a buffer. This works for ipnet
1452  * because the dl_ipnetinfo_t is placed in an mblk_t that leads the
1453  * packet itself.
1454  */
1455 static void *
1456 bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)
1457 {
1458 	const mblk_t *m;
1459 	uint_t count;
1460 	uchar_t *dst;
1461 
1462 	m = src_arg;
1463 	dst = dst_arg;
1464 	while (len > 0) {
1465 		if (m == NULL)
1466 			panic("bpf_mcpy");
1467 		count = (uint_t)min(M_LEN(m), len);
1468 		(void) memcpy(dst, mtod(m, const void *), count);
1469 		m = m->b_cont;
1470 		dst += count;
1471 		len -= count;
1472 	}
1473 	return (dst_arg);
1474 }
1475 
1476 /*
1477  * Dispatch a packet to all the listeners on interface bp.
1478  *
1479  * marg    pointer to the packet, either a data buffer or an mbuf chain
1480  * buflen  buffer length, if marg is a data buffer
1481  * cpfn    a function that can copy marg into the listener's buffer
1482  * pktlen  length of the packet
1483  * issent  boolean indicating whether the packet was sent or receive
1484  */
1485 static inline void
1486 bpf_deliver(struct bpf_d *d, cp_fn_t cpfn, void *marg, uint_t pktlen,
1487     uint_t buflen, boolean_t issent)
1488 {
1489 	struct timeval tv;
1490 	uint_t slen;
1491 
1492 	if (!d->bd_seesent && issent)
1493 		return;
1494 
1495 	/*
1496 	 * Accuracy of the packet counters in BPF is vital so it
1497 	 * is important to protect even the outer ones.
1498 	 */
1499 	mutex_enter(&d->bd_lock);
1500 	slen = bpf_filter(d->bd_filter, marg, pktlen, buflen);
1501 	DTRACE_PROBE5(bpf__packet, struct bpf_if *, d->bd_bif,
1502 	    struct bpf_d *, d, void *, marg, uint_t, pktlen, uint_t, slen);
1503 	d->bd_rcount++;
1504 	ks_stats.kp_receive.value.ui64++;
1505 	if (slen != 0) {
1506 		uniqtime(&tv);
1507 		catchpacket(d, marg, pktlen, slen, cpfn, &tv);
1508 	}
1509 	mutex_exit(&d->bd_lock);
1510 }
1511 
1512 /*
1513  * Incoming linkage from device drivers.
1514  */
1515 /* ARGSUSED */
1516 void
1517 bpf_mtap(void *arg, mac_resource_handle_t mrh, mblk_t *m, boolean_t issent)
1518 {
1519 	cp_fn_t cpfn;
1520 	struct bpf_d *d = arg;
1521 	uint_t pktlen, buflen;
1522 	void *marg;
1523 
1524 	pktlen = msgdsize(m);
1525 
1526 	if (pktlen == M_LEN(m)) {
1527 		cpfn = (cp_fn_t)memcpy;
1528 		marg = mtod(m, void *);
1529 		buflen = pktlen;
1530 	} else {
1531 		cpfn = bpf_mcpy;
1532 		marg = m;
1533 		buflen = 0;
1534 	}
1535 
1536 	bpf_deliver(d, cpfn, marg, pktlen, buflen, issent);
1537 }
1538 
1539 /*
1540  * Incoming linkage from ipnet.
1541  * In ipnet, there is only one event, NH_OBSERVE, that delivers packets
1542  * from all network interfaces. Thus the tap function needs to apply a
1543  * filter using the interface index/id to immitate snoop'ing on just the
1544  * specified interface.
1545  */
1546 /* ARGSUSED */
1547 void
1548 bpf_itap(void *arg, mblk_t *m, boolean_t issent, uint_t length)
1549 {
1550 	hook_pkt_observe_t *hdr;
1551 	struct bpf_d *d = arg;
1552 
1553 	hdr = (hook_pkt_observe_t *)m->b_rptr;
1554 	if (ntohl(hdr->hpo_ifindex) != d->bd_linkid)
1555 		return;
1556 	bpf_deliver(d, bpf_mcpy, m, length, 0, issent);
1557 
1558 }
1559 
1560 /*
1561  * Move the packet data from interface memory (pkt) into the
1562  * store buffer.  Return 1 if it's time to wakeup a listener (buffer full),
1563  * otherwise 0.  "copy" is the routine called to do the actual data
1564  * transfer.  memcpy is passed in to copy contiguous chunks, while
1565  * bpf_mcpy is passed in to copy mbuf chains.  In the latter case,
1566  * pkt is really an mbuf.
1567  */
1568 static void
1569 catchpacket(struct bpf_d *d, uchar_t *pkt, uint_t pktlen, uint_t snaplen,
1570     cp_fn_t cpfn, struct timeval *tv)
1571 {
1572 	struct bpf_hdr *hp;
1573 	int totlen, curlen;
1574 	int hdrlen = d->bd_hdrlen;
1575 	int do_wakeup = 0;
1576 
1577 	++d->bd_ccount;
1578 	ks_stats.kp_capture.value.ui64++;
1579 	/*
1580 	 * Figure out how many bytes to move.  If the packet is
1581 	 * greater or equal to the snapshot length, transfer that
1582 	 * much.  Otherwise, transfer the whole packet (unless
1583 	 * we hit the buffer size limit).
1584 	 */
1585 	totlen = hdrlen + min(snaplen, pktlen);
1586 	if (totlen > d->bd_bufsize)
1587 		totlen = d->bd_bufsize;
1588 
1589 	/*
1590 	 * Round up the end of the previous packet to the next longword.
1591 	 */
1592 	curlen = BPF_WORDALIGN(d->bd_slen);
1593 	if (curlen + totlen > d->bd_bufsize) {
1594 		/*
1595 		 * This packet will overflow the storage buffer.
1596 		 * Rotate the buffers if we can, then wakeup any
1597 		 * pending reads.
1598 		 */
1599 		if (d->bd_fbuf == 0) {
1600 			/*
1601 			 * We haven't completed the previous read yet,
1602 			 * so drop the packet.
1603 			 */
1604 			++d->bd_dcount;
1605 			ks_stats.kp_dropped.value.ui64++;
1606 			return;
1607 		}
1608 		ROTATE_BUFFERS(d);
1609 		do_wakeup = 1;
1610 		curlen = 0;
1611 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
1612 		/*
1613 		 * Immediate mode is set, or the read timeout has
1614 		 * already expired during a select call.  A packet
1615 		 * arrived, so the reader should be woken up.
1616 		 */
1617 		do_wakeup = 1;
1618 	}
1619 
1620 	/*
1621 	 * Append the bpf header to the existing buffer before we add
1622 	 * on the actual packet data.
1623 	 */
1624 	hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
1625 	hp->bh_tstamp.tv_sec = tv->tv_sec;
1626 	hp->bh_tstamp.tv_usec = tv->tv_usec;
1627 	hp->bh_datalen = pktlen;
1628 	hp->bh_hdrlen = (uint16_t)hdrlen;
1629 	/*
1630 	 * Copy the packet data into the store buffer and update its length.
1631 	 */
1632 	(*cpfn)((uchar_t *)hp + hdrlen, pkt,
1633 	    (hp->bh_caplen = totlen - hdrlen));
1634 	d->bd_slen = curlen + totlen;
1635 
1636 	/*
1637 	 * Call bpf_wakeup after bd_slen has been updated.
1638 	 */
1639 	if (do_wakeup)
1640 		bpf_wakeup(d);
1641 }
1642 
1643 /*
1644  * Initialize all nonzero fields of a descriptor.
1645  */
1646 static int
1647 bpf_allocbufs(struct bpf_d *d)
1648 {
1649 
1650 	d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1651 	if (!d->bd_fbuf)
1652 		return (ENOBUFS);
1653 	d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1654 	if (!d->bd_sbuf) {
1655 		kmem_free(d->bd_fbuf, d->bd_bufsize);
1656 		return (ENOBUFS);
1657 	}
1658 	d->bd_slen = 0;
1659 	d->bd_hlen = 0;
1660 	return (0);
1661 }
1662 
1663 /*
1664  * Free buffers currently in use by a descriptor.
1665  * Called on close.
1666  */
1667 static void
1668 bpf_freed(struct bpf_d *d)
1669 {
1670 	/*
1671 	 * At this point the descriptor has been detached from its
1672 	 * interface and it yet hasn't been marked free.
1673 	 */
1674 	if (d->bd_sbuf != 0) {
1675 		kmem_free(d->bd_sbuf, d->bd_bufsize);
1676 		if (d->bd_hbuf != 0)
1677 			kmem_free(d->bd_hbuf, d->bd_bufsize);
1678 		if (d->bd_fbuf != 0)
1679 			kmem_free(d->bd_fbuf, d->bd_bufsize);
1680 	}
1681 	if (d->bd_filter)
1682 		kmem_free(d->bd_filter, d->bd_filter_size);
1683 }
1684 
1685 /*
1686  * Get a list of available data link type of the interface.
1687  */
1688 static int
1689 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *listp)
1690 {
1691 	bpf_provider_list_t *bp;
1692 	bpf_provider_t *bpr;
1693 	zoneid_t zoneid;
1694 	uintptr_t mcip;
1695 	uint_t nicdlt;
1696 	uintptr_t mh;
1697 	int error;
1698 	int n;
1699 
1700 	n = 0;
1701 	mh = 0;
1702 	mcip = 0;
1703 	error = 0;
1704 	mutex_enter(&d->bd_lock);
1705 	LIST_FOREACH(bp, &bpf_providers, bpl_next) {
1706 		bpr = bp->bpl_what;
1707 		error = MBPF_OPEN(bpr, d->bd_ifname, &mh, d->bd_zone);
1708 		if (error != 0)
1709 			goto next;
1710 		error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
1711 		if (error != 0)
1712 			goto next;
1713 		error = MBPF_GET_ZONE(bpr, mh, &zoneid);
1714 		if (error != 0)
1715 			goto next;
1716 		if (d->bd_zone != GLOBAL_ZONEID &&
1717 		    d->bd_zone != zoneid)
1718 			goto next;
1719 		error = MBPF_GET_DLT(bpr, mh, &nicdlt);
1720 		if (error != 0)
1721 			goto next;
1722 		nicdlt = bpf_dl_to_dlt(nicdlt);
1723 		if (listp->bfl_list != NULL) {
1724 			if (n >= listp->bfl_len) {
1725 				MBPF_CLIENT_CLOSE(bpr, mcip);
1726 				MBPF_CLOSE(bpr, mh);
1727 				break;
1728 			}
1729 			/*
1730 			 * Bumping of bd_inuse ensures the structure does not
1731 			 * disappear while the copyout runs and allows the for
1732 			 * loop to be continued.
1733 			 */
1734 			d->bd_inuse++;
1735 			mutex_exit(&d->bd_lock);
1736 			if (copyout(&nicdlt,
1737 			    listp->bfl_list + n, sizeof (uint_t)) != 0)
1738 				error = EFAULT;
1739 			mutex_enter(&d->bd_lock);
1740 			if (error != 0)
1741 				break;
1742 			d->bd_inuse--;
1743 		}
1744 		n++;
1745 next:
1746 		if (mcip != 0) {
1747 			MBPF_CLIENT_CLOSE(bpr, mcip);
1748 			mcip = 0;
1749 		}
1750 		if (mh != 0) {
1751 			MBPF_CLOSE(bpr, mh);
1752 			mh = 0;
1753 		}
1754 	}
1755 	mutex_exit(&d->bd_lock);
1756 
1757 	/*
1758 	 * It is quite possible that one or more provider to BPF may not
1759 	 * know about a link name whlist others do. In that case, so long
1760 	 * as we have one success, do not declare an error unless it was
1761 	 * an EFAULT as this indicates a problem that needs to be reported.
1762 	 */
1763 	if ((error != EFAULT) && (n > 0))
1764 		error = 0;
1765 
1766 	listp->bfl_len = n;
1767 	return (error);
1768 }
1769 
1770 /*
1771  * Set the data link type of a BPF instance.
1772  */
1773 static int
1774 bpf_setdlt(struct bpf_d *d, void *addr)
1775 {
1776 	char ifname[LIFNAMSIZ+1];
1777 	zoneid_t niczone;
1778 	int error;
1779 	int dlt;
1780 
1781 	if (copyin(addr, &dlt, sizeof (dlt)) != 0)
1782 		return (EFAULT);
1783 
1784 	mutex_enter(&d->bd_lock);
1785 
1786 	if (d->bd_bif == 0) {			/* Interface not set */
1787 		mutex_exit(&d->bd_lock);
1788 		return (EINVAL);
1789 	}
1790 	if (d->bd_dlt == dlt) {	/* NULL-op */
1791 		mutex_exit(&d->bd_lock);
1792 		return (0);
1793 	}
1794 
1795 	error = MBPF_GET_ZONE(&d->bd_mac, d->bd_bif, &niczone);
1796 	if (error != 0) {
1797 		mutex_exit(&d->bd_lock);
1798 		return (error);
1799 	}
1800 
1801 	/*
1802 	 * See the matrix at the top of the file for the permissions table
1803 	 * enforced by this driver.
1804 	 */
1805 	if ((d->bd_zone != GLOBAL_ZONEID) && (dlt != DLT_IPNET) &&
1806 	    (niczone != d->bd_zone)) {
1807 		mutex_exit(&d->bd_lock);
1808 		return (EINVAL);
1809 	}
1810 
1811 	(void) strlcpy(ifname, d->bd_ifname, sizeof (ifname));
1812 	d->bd_inuse = -1;
1813 	bpf_detachd(d);
1814 	error = bpf_attachd(d, ifname, dlt);
1815 	reset_d(d);
1816 	d->bd_inuse = 0;
1817 
1818 	mutex_exit(&d->bd_lock);
1819 	return (error);
1820 }
1821 
1822 /*
1823  * bpf_clear_timeout is called with the bd_lock mutex held, providing it
1824  * with the necessary protection to retrieve and modify bd_callout but it
1825  * does not hold the lock for its entire duration... see below...
1826  */
1827 static void
1828 bpf_clear_timeout(struct bpf_d *d)
1829 {
1830 	timeout_id_t tid = d->bd_callout;
1831 	d->bd_callout = 0;
1832 	d->bd_inuse++;
1833 
1834 	/*
1835 	 * If the timeout has fired and is waiting on bd_lock, we could
1836 	 * deadlock here because untimeout if bd_lock is held and would
1837 	 * wait for bpf_timed_out to finish and it never would.
1838 	 */
1839 	if (tid != 0) {
1840 		mutex_exit(&d->bd_lock);
1841 		(void) untimeout(tid);
1842 		mutex_enter(&d->bd_lock);
1843 	}
1844 
1845 	d->bd_inuse--;
1846 }
1847 
1848 /*
1849  * As a cloning device driver, BPF needs to keep track of which device
1850  * numbers are in use and which ones are not. A hash table, indexed by
1851  * the minor device number, is used to store the pointers to the
1852  * individual descriptors that are allocated in bpfopen().
1853  * The functions below present the interface for that hash table to
1854  * the rest of the driver.
1855  */
1856 static struct bpf_d *
1857 bpf_dev_find(minor_t minor)
1858 {
1859 	struct bpf_d *d = NULL;
1860 
1861 	(void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1862 	    (mod_hash_val_t *)&d);
1863 
1864 	return (d);
1865 }
1866 
1867 static void
1868 bpf_dev_add(struct bpf_d *d)
1869 {
1870 	(void) mod_hash_insert(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1871 	    (mod_hash_val_t)d);
1872 }
1873 
1874 static void
1875 bpf_dev_remove(struct bpf_d *d)
1876 {
1877 	struct bpf_d *stor;
1878 
1879 	(void) mod_hash_remove(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1880 	    (mod_hash_val_t *)&stor);
1881 	ASSERT(stor == d);
1882 }
1883 
1884 /*
1885  * bpf_def_get should only ever be called for a minor number that exists,
1886  * thus there should always be a pointer in the hash table that corresponds
1887  * to it.
1888  */
1889 static struct bpf_d *
1890 bpf_dev_get(minor_t minor)
1891 {
1892 	struct bpf_d *d = NULL;
1893 
1894 	(void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1895 	    (mod_hash_val_t *)&d);
1896 	ASSERT(d != NULL);
1897 
1898 	return (d);
1899 }
1900