xref: /illumos-gate/usr/src/uts/common/io/bpf/bpf.c (revision 5422785d352a2bb398daceab3d1898a8aa64d006)
1 /*	$NetBSD: bpf.c,v 1.143 2009/03/11 05:55:22 mrg Exp $	*/
2 
3 /*
4  * Copyright (c) 1990, 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from the Stanford/CMU enet packet filter,
8  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
9  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
10  * Berkeley Laboratory.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)bpf.c	8.4 (Berkeley) 1/9/95
37  * static char rcsid[] =
38  * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
39  */
40 /*
41  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
42  * Use is subject to license terms.
43  */
44 
45 /*
 * The BPF implements the following access controls for zones attempting
 * to read and write data. Writing of data requires that the net_rawaccess
 * privilege is held whilst reading data requires either net_rawaccess or
 * net_observability.
50  *
51  *                              | Shared |  Exclusive |   Global
52  * -----------------------------+--------+------------+------------+
53  * DLT_IPNET in local zone      |  Read  |    Read    |    Read    |
54  * -----------------------------+--------+------------+------------+
55  * Raw access to local zone NIC |  None  | Read/Write | Read/Write |
56  * -----------------------------+--------+------------+------------+
57  * Raw access to all NICs       |  None  |    None    | Read/Write |
58  * -----------------------------+--------+------------+------------+
59  *
 * The BPF driver is written as a cloning driver: each call to bpfopen()
 * allocates a new minor number. This provides BPF with a 1:1 relationship
 * between opens and closes. There is some amount of "descriptor state"
 * that is kept per open. Pointers to this data are stored in a hash table
 * (bpf_hash) that is indexed by the minor device number for each open file.
65  */
66 #include <sys/param.h>
67 #include <sys/systm.h>
68 #include <sys/time.h>
69 #include <sys/ioctl.h>
70 #include <sys/queue.h>
71 #include <sys/filio.h>
72 #include <sys/policy.h>
73 #include <sys/cmn_err.h>
74 #include <sys/uio.h>
75 #include <sys/file.h>
76 #include <sys/sysmacros.h>
77 #include <sys/zone.h>
78 
79 #include <sys/socket.h>
80 #include <sys/errno.h>
81 #include <sys/poll.h>
82 #include <sys/dlpi.h>
83 #include <sys/neti.h>
84 
85 #include <net/if.h>
86 
87 #include <net/bpf.h>
88 #include <net/bpfdesc.h>
89 #include <net/dlt.h>
90 
91 #include <netinet/in.h>
92 #include <sys/mac.h>
93 #include <sys/mac_client.h>
94 #include <sys/mac_impl.h>
95 #include <sys/time_std_impl.h>
96 #include <sys/hook.h>
97 #include <sys/hook_event.h>
98 
99 
/*
 * Convenience accessors for a message block's read/write pointers.
 *
 * mtod(_v, _t)	- the b_rptr of block _v, cast to type _t.
 * M_LEN(_m)	- number of bytes between b_rptr and b_wptr of block _m.
 *
 * Both expansions are fully parenthesized so that the result can be used
 * as the operand of a postfix operator, e.g. mtod(m, uchar_t *)[0];
 * without the outer parentheses the subscript would bind before the cast.
 */
#define	mtod(_v, _t)	((_t)((_v)->b_rptr))
#define	M_LEN(_m)	((_m)->b_wptr - (_m)->b_rptr)
102 
/*
 * 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet
 * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k).
 */
#define	BPF_BUFSIZE (32 * 1024)

/*
 * Signature of a memcpy()-shaped copy routine (dst, src, len).  The
 * prototypes of bpf_deliver() and catchpacket() below take one of these.
 */
typedef void *(*cp_fn_t)(void *, const void *, size_t);

/*
 * The default read buffer size, and limit for BIOCSBLEN.
 *
 * bpf_bufsize is copied into every new descriptor by bpfopen();
 * bpf_maxbufsize is the upper clamp applied by the BIOCSBLEN ioctl.
 */
int bpf_bufsize = BPF_BUFSIZE;
int bpf_maxbufsize = (16 * 1024 * 1024);
/*
 * Hash table created in bpfilterattach() and destroyed in bpfilterdetach();
 * per the description at the top of this file it maps the minor number of
 * each open to its descriptor state.
 */
static mod_hash_t *bpf_hash = NULL;
117 
/*
 * Use a mutex to avoid a race condition between gathering the stats/peers
 * and opening/closing the device.
 *
 * bpf_mtx is held by bpfopen() and bpfclose() while they search for a
 * minor number and update bpf_list and the minor number hash.
 * NOTE(review): the waiters on bpf_dlt_waiter are not visible in this part
 * of the file; it is only initialized and destroyed here.
 */
static kcondvar_t bpf_dlt_waiter;
static kmutex_t bpf_mtx;
/* Live counters; exported as the ks_data of bpf_ksp. */
static bpf_kstats_t ks_stats;
/* Template (names and types) copied into ks_stats by bpfilterattach(). */
static bpf_kstats_t bpf_kstats = {
	{ "readWait",		KSTAT_DATA_UINT64 },
	{ "writeOk",		KSTAT_DATA_UINT64 },
	{ "writeError",		KSTAT_DATA_UINT64 },
	{ "receive",		KSTAT_DATA_UINT64 },
	{ "captured",		KSTAT_DATA_UINT64 },
	{ "dropped",		KSTAT_DATA_UINT64 },
};
/* Handle returned by kstat_create(); NULL when no kstat is installed. */
static kstat_t *bpf_ksp;

/*
 *  bpf_list is a list of the BPF descriptors currently open
 */
LIST_HEAD(, bpf_d) bpf_list;
139 
/*
 * Forward declarations for routines that are private to this file.
 */
static int	bpf_allocbufs(struct bpf_d *);
static void	bpf_clear_timeout(struct bpf_d *);
static void	bpf_deliver(struct bpf_d *, cp_fn_t,
		    void *, uint_t, uint_t, boolean_t);
static void	bpf_freed(struct bpf_d *);
static int	bpf_ifname(struct bpf_d *d, char *, int);
static void	*bpf_mcpy(void *, const void *, size_t);
static int	bpf_attachd(struct bpf_d *, const char *, int);
static void	bpf_detachd(struct bpf_d *);
static int	bpf_setif(struct bpf_d *, char *, int);
static void	bpf_timed_out(void *);
static inline void
		bpf_wakeup(struct bpf_d *);
static void	catchpacket(struct bpf_d *, uchar_t *, uint_t, uint_t,
		    cp_fn_t, struct timeval *);
static void	reset_d(struct bpf_d *);
static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
static int	bpf_setdlt(struct bpf_d *, void *);
/* Bookkeeping of open descriptors, keyed by minor number. */
static void	bpf_dev_add(struct bpf_d *);
static struct bpf_d *bpf_dev_find(minor_t);
static struct bpf_d *bpf_dev_get(minor_t);
static void	bpf_dev_remove(struct bpf_d *);
162 
163 static int
164 bpf_movein(struct uio *uio, int linktype, int mtu, mblk_t **mp)
165 {
166 	mblk_t *m;
167 	int error;
168 	int len;
169 	int hlen;
170 	int align;
171 
172 	/*
173 	 * Build a sockaddr based on the data link layer type.
174 	 * We do this at this level because the ethernet header
175 	 * is copied directly into the data field of the sockaddr.
176 	 * In the case of SLIP, there is no header and the packet
177 	 * is forwarded as is.
178 	 * Also, we are careful to leave room at the front of the mbuf
179 	 * for the link level header.
180 	 */
181 	switch (linktype) {
182 
183 	case DLT_EN10MB:
184 		hlen = sizeof (struct ether_header);
185 		break;
186 
187 	case DLT_FDDI:
188 		hlen = 16;
189 		break;
190 
191 	case DLT_NULL:
192 		hlen = 0;
193 		break;
194 
195 	case DLT_IPOIB:
196 		hlen = 44;
197 		break;
198 
199 	default:
200 		return (EIO);
201 	}
202 
203 	align = 4 - (hlen & 3);
204 
205 	len = uio->uio_resid;
206 	/*
207 	 * If there aren't enough bytes for a link level header or the
208 	 * packet length exceeds the interface mtu, return an error.
209 	 */
210 	if (len < hlen || len - hlen > mtu)
211 		return (EMSGSIZE);
212 
213 	m = allocb(len + align, BPRI_MED);
214 	if (m == NULL) {
215 		error = ENOBUFS;
216 		goto bad;
217 	}
218 
219 	/* Insure the data is properly aligned */
220 	if (align > 0)
221 		m->b_rptr += align;
222 	m->b_wptr = m->b_rptr + len;
223 
224 	error = uiomove(mtod(m, void *), len, UIO_WRITE, uio);
225 	if (error)
226 		goto bad;
227 	*mp = m;
228 	return (0);
229 
230 bad:
231 	if (m != NULL)
232 		freemsg(m);
233 	return (error);
234 }
235 
236 
237 /*
238  * Attach file to the bpf interface, i.e. make d listen on bp.
239  */
240 static int
241 bpf_attachd(struct bpf_d *d, const char *ifname, int dlt)
242 {
243 	bpf_provider_list_t *bp;
244 	bpf_provider_t *bpr;
245 	boolean_t zonematch;
246 	zoneid_t niczone;
247 	uintptr_t mcip;
248 	zoneid_t zone;
249 	uint_t nicdlt;
250 	uintptr_t mh;
251 	int hdrlen;
252 	int error;
253 
254 	ASSERT(d->bd_bif == NULL);
255 	ASSERT(d->bd_mcip == NULL);
256 	zone = d->bd_zone;
257 	zonematch = B_TRUE;
258 again:
259 	mh = 0;
260 	mcip = 0;
261 	LIST_FOREACH(bp, &bpf_providers, bpl_next) {
262 		bpr = bp->bpl_what;
263 		error = MBPF_OPEN(bpr, ifname, &mh, zone);
264 		if (error != 0)
265 			goto next;
266 		error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
267 		if (error != 0)
268 			goto next;
269 		error = MBPF_GET_DLT(bpr, mh, &nicdlt);
270 		if (error != 0)
271 			goto next;
272 
273 		nicdlt = bpf_dl_to_dlt(nicdlt);
274 		if (dlt != -1 && dlt != nicdlt) {
275 			error = ENOENT;
276 			goto next;
277 		}
278 
279 		error = MBPF_GET_ZONE(bpr, mh, &niczone);
280 		if (error != 0)
281 			goto next;
282 
283 		DTRACE_PROBE4(bpf__attach, struct bpf_provider_s *, bpr,
284 		    uintptr_t, mh, int, nicdlt, zoneid_t, niczone);
285 
286 		if (zonematch && niczone != zone) {
287 			error = ENOENT;
288 			goto next;
289 		}
290 		break;
291 next:
292 		if (mcip != 0) {
293 			MBPF_CLIENT_CLOSE(bpr, mcip);
294 			mcip = 0;
295 		}
296 		if (mh != NULL) {
297 			MBPF_CLOSE(bpr, mh);
298 			mh = 0;
299 		}
300 	}
301 	if (error != 0) {
302 		if (zonematch && (zone == GLOBAL_ZONEID)) {
303 			/*
304 			 * If we failed to do an exact match for the global
305 			 * zone using the global zoneid, try again in case
306 			 * the network interface is owned by a local zone.
307 			 */
308 			zonematch = B_FALSE;
309 			goto again;
310 		}
311 		return (error);
312 	}
313 
314 	d->bd_mac = *bpr;
315 	d->bd_mcip = mcip;
316 	d->bd_bif = mh;
317 	d->bd_dlt = nicdlt;
318 	hdrlen = bpf_dl_hdrsize(nicdlt);
319 	d->bd_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
320 
321 	(void) strlcpy(d->bd_ifname, MBPF_CLIENT_NAME(&d->bd_mac, mcip),
322 	    sizeof (d->bd_ifname));
323 
324 	(void) MBPF_GET_LINKID(&d->bd_mac, d->bd_ifname, &d->bd_linkid,
325 	    zone);
326 	(void) MBPF_PROMISC_ADD(&d->bd_mac, d->bd_mcip, 0, d,
327 	    &d->bd_promisc_handle, d->bd_promisc_flags);
328 	return (0);
329 }
330 
331 /*
332  * Detach a file from its interface.
333  */
334 static void
335 bpf_detachd(struct bpf_d *d)
336 {
337 	uintptr_t mph;
338 	uintptr_t mch;
339 	uintptr_t mh;
340 
341 	ASSERT(d->bd_inuse == -1);
342 	mch = d->bd_mcip;
343 	d->bd_mcip = 0;
344 	mh = d->bd_bif;
345 	d->bd_bif = 0;
346 
347 	/*
348 	 * Check if this descriptor had requested promiscuous mode.
349 	 * If so, turn it off. There's no need to take any action
350 	 * here, that is done when MBPF_PROMISC_REMOVE is used;
351 	 * bd_promisc is just a local flag to stop promiscuous mode
352 	 * from being set more than once.
353 	 */
354 	if (d->bd_promisc)
355 		d->bd_promisc = 0;
356 
357 	/*
358 	 * Take device out of "promiscuous" mode.  Since we were able to
359 	 * enter "promiscuous" mode, we should be able to turn it off.
360 	 * Note, this field stores a pointer used to support both
361 	 * promiscuous and non-promiscuous callbacks for packets.
362 	 */
363 	mph = d->bd_promisc_handle;
364 	d->bd_promisc_handle = 0;
365 
366 	/*
367 	 * The lock has to be dropped here because mac_promisc_remove may
368 	 * need to wait for mac_promisc_dispatch, which has called into
369 	 * bpf and catchpacket is waiting for bd_lock...
370 	 * i.e mac_promisc_remove() needs to be called with none of the
371 	 * locks held that are part of the bpf_mtap() call path.
372 	 */
373 	mutex_exit(&d->bd_lock);
374 	if (mph != 0)
375 		MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
376 
377 	if (mch != 0)
378 		MBPF_CLIENT_CLOSE(&d->bd_mac, mch);
379 
380 	if (mh != 0)
381 		MBPF_CLOSE(&d->bd_mac, mh);
382 
383 	/*
384 	 * Because this function is called with bd_lock held, so it must
385 	 * exit with it held.
386 	 */
387 	mutex_enter(&d->bd_lock);
388 	*d->bd_ifname = '\0';
389 	(void) memset(&d->bd_mac, 0, sizeof (d->bd_mac));
390 }
391 
392 
393 /*
394  * bpfilterattach() is called at load time.
395  */
396 int
397 bpfilterattach(void)
398 {
399 
400 	bpf_hash = mod_hash_create_idhash("bpf_dev_tab", 31,
401 	    mod_hash_null_keydtor);
402 	if (bpf_hash == NULL)
403 		return (ENOMEM);
404 
405 	(void) memcpy(&ks_stats, &bpf_kstats, sizeof (bpf_kstats));
406 
407 	bpf_ksp = kstat_create("bpf", 0, "global", "misc",
408 	    KSTAT_TYPE_NAMED, sizeof (bpf_kstats) / sizeof (kstat_named_t),
409 	    KSTAT_FLAG_VIRTUAL);
410 	if (bpf_ksp != NULL) {
411 		bpf_ksp->ks_data = &ks_stats;
412 		kstat_install(bpf_ksp);
413 	} else {
414 		mod_hash_destroy_idhash(bpf_hash);
415 		bpf_hash = NULL;
416 		return (EEXIST);
417 	}
418 
419 	cv_init(&bpf_dlt_waiter, NULL, CV_DRIVER, NULL);
420 	mutex_init(&bpf_mtx, NULL, MUTEX_DRIVER, NULL);
421 
422 	LIST_INIT(&bpf_list);
423 
424 	return (0);
425 }
426 
427 
428 /*
429  * bpfilterdetach() is called at unload time.
430  */
431 int
432 bpfilterdetach(void)
433 {
434 
435 	if (bpf_ksp != NULL) {
436 		kstat_delete(bpf_ksp);
437 		bpf_ksp = NULL;
438 	}
439 
440 	mod_hash_destroy_idhash(bpf_hash);
441 	bpf_hash = NULL;
442 
443 	cv_destroy(&bpf_dlt_waiter);
444 	mutex_destroy(&bpf_mtx);
445 
446 	return (0);
447 }
448 
449 /*
450  * Open ethernet device. Clones.
451  */
452 /* ARGSUSED */
453 int
454 bpfopen(dev_t *devp, int flag, int mode, cred_t *cred)
455 {
456 	struct bpf_d *d;
457 	uint_t dmin;
458 
459 	/*
460 	 * The security policy described at the top of this file is
461 	 * enforced here.
462 	 */
463 	if ((flag & FWRITE) != 0) {
464 		if (secpolicy_net_rawaccess(cred) != 0)
465 			return (EACCES);
466 	}
467 
468 	if ((flag & FREAD) != 0) {
469 		if ((secpolicy_net_observability(cred) != 0) &&
470 		    (secpolicy_net_rawaccess(cred) != 0))
471 			return (EACCES);
472 	}
473 
474 	if ((flag & (FWRITE|FREAD)) == 0)
475 		return (ENXIO);
476 
477 	/*
478 	 * A structure is allocated per open file in BPF to store settings
479 	 * such as buffer capture size, provide private buffers, etc.
480 	 */
481 	d = (struct bpf_d *)kmem_zalloc(sizeof (*d), KM_SLEEP);
482 	d->bd_bufsize = bpf_bufsize;
483 	d->bd_fmode = flag;
484 	d->bd_zone = crgetzoneid(cred);
485 	d->bd_seesent = 1;
486 	d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_PHYS|
487 	    MAC_PROMISC_FLAGS_NO_COPY;
488 	mutex_init(&d->bd_lock, NULL, MUTEX_DRIVER, NULL);
489 	cv_init(&d->bd_wait, NULL, CV_DRIVER, NULL);
490 
491 	mutex_enter(&bpf_mtx);
492 	/*
493 	 * Find an unused minor number. Obviously this is an O(n) algorithm
494 	 * and doesn't scale particularly well, so if there are large numbers
495 	 * of open file descriptors happening in real use, this design may
496 	 * need to be revisited.
497 	 */
498 	for (dmin = 0; dmin < L_MAXMIN; dmin++)
499 		if (bpf_dev_find(dmin) == NULL)
500 			break;
501 	if (dmin == L_MAXMIN) {
502 		mutex_exit(&bpf_mtx);
503 		kmem_free(d, sizeof (*d));
504 		return (ENXIO);
505 	}
506 	d->bd_dev = dmin;
507 	LIST_INSERT_HEAD(&bpf_list, d, bd_list);
508 	bpf_dev_add(d);
509 	mutex_exit(&bpf_mtx);
510 
511 	*devp = makedevice(getmajor(*devp), dmin);
512 
513 	return (0);
514 }
515 
516 /*
517  * Close the descriptor by detaching it from its interface,
518  * deallocating its buffers, and marking it free.
519  *
520  * Because we only allow a device to be opened once, there is always a
521  * 1 to 1 relationship between opens and closes supporting this function.
522  */
523 /* ARGSUSED */
524 int
525 bpfclose(dev_t dev, int flag, int otyp, cred_t *cred_p)
526 {
527 	struct bpf_d *d = bpf_dev_get(getminor(dev));
528 
529 	mutex_enter(&d->bd_lock);
530 
531 	while (d->bd_inuse != 0) {
532 		d->bd_waiting++;
533 		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
534 			d->bd_waiting--;
535 			mutex_exit(&d->bd_lock);
536 			return (EINTR);
537 		}
538 		d->bd_waiting--;
539 	}
540 
541 	d->bd_inuse = -1;
542 	if (d->bd_state == BPF_WAITING)
543 		bpf_clear_timeout(d);
544 	d->bd_state = BPF_IDLE;
545 	if (d->bd_bif)
546 		bpf_detachd(d);
547 	mutex_exit(&d->bd_lock);
548 
549 	mutex_enter(&bpf_mtx);
550 	LIST_REMOVE(d, bd_list);
551 	bpf_dev_remove(d);
552 	mutex_exit(&bpf_mtx);
553 
554 	mutex_enter(&d->bd_lock);
555 	mutex_destroy(&d->bd_lock);
556 	cv_destroy(&d->bd_wait);
557 
558 	bpf_freed(d);
559 	kmem_free(d, sizeof (*d));
560 
561 	return (0);
562 }
563 
/*
 * Rotate the packet buffers in descriptor d.  Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and is safe as the unbraced body of an if/else.  Use it
 * as "ROTATE_BUFFERS(d);" - the caller supplies the semicolon.
 */
#define	ROTATE_BUFFERS(d) \
	do { \
		(d)->bd_hbuf = (d)->bd_sbuf; \
		(d)->bd_hlen = (d)->bd_slen; \
		(d)->bd_sbuf = (d)->bd_fbuf; \
		(d)->bd_slen = 0; \
		(d)->bd_fbuf = 0; \
	} while (0)
575 /*
576  *  bpfread - read next chunk of packets from buffers
577  */
578 /* ARGSUSED */
579 int
580 bpfread(dev_t dev, struct uio *uio, cred_t *cred)
581 {
582 	struct bpf_d *d = bpf_dev_get(getminor(dev));
583 	int timed_out;
584 	ulong_t delay;
585 	int error;
586 
587 	if ((d->bd_fmode & FREAD) == 0)
588 		return (EBADF);
589 
590 	/*
591 	 * Restrict application to use a buffer the same size as
592 	 * the kernel buffers.
593 	 */
594 	if (uio->uio_resid != d->bd_bufsize)
595 		return (EINVAL);
596 
597 	mutex_enter(&d->bd_lock);
598 	if (d->bd_state == BPF_WAITING)
599 		bpf_clear_timeout(d);
600 	timed_out = (d->bd_state == BPF_TIMED_OUT);
601 	d->bd_state = BPF_IDLE;
602 	/*
603 	 * If the hold buffer is empty, then do a timed sleep, which
604 	 * ends when the timeout expires or when enough packets
605 	 * have arrived to fill the store buffer.
606 	 */
607 	while (d->bd_hbuf == 0) {
608 		if (d->bd_nonblock) {
609 			if (d->bd_slen == 0) {
610 				mutex_exit(&d->bd_lock);
611 				return (EWOULDBLOCK);
612 			}
613 			ROTATE_BUFFERS(d);
614 			break;
615 		}
616 
617 		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
618 			/*
619 			 * A packet(s) either arrived since the previous
620 			 * read or arrived while we were asleep.
621 			 * Rotate the buffers and return what's here.
622 			 */
623 			ROTATE_BUFFERS(d);
624 			break;
625 		}
626 		ks_stats.kp_read_wait.value.ui64++;
627 		delay = ddi_get_lbolt() + d->bd_rtout;
628 		error = cv_timedwait_sig(&d->bd_wait, &d->bd_lock, delay);
629 		if (error == 0) {
630 			mutex_exit(&d->bd_lock);
631 			return (EINTR);
632 		}
633 		if (error == -1) {
634 			/*
635 			 * On a timeout, return what's in the buffer,
636 			 * which may be nothing.  If there is something
637 			 * in the store buffer, we can rotate the buffers.
638 			 */
639 			if (d->bd_hbuf)
640 				/*
641 				 * We filled up the buffer in between
642 				 * getting the timeout and arriving
643 				 * here, so we don't need to rotate.
644 				 */
645 				break;
646 
647 			if (d->bd_slen == 0) {
648 				mutex_exit(&d->bd_lock);
649 				return (0);
650 			}
651 			ROTATE_BUFFERS(d);
652 		}
653 	}
654 	/*
655 	 * At this point, we know we have something in the hold slot.
656 	 */
657 	mutex_exit(&d->bd_lock);
658 
659 	/*
660 	 * Move data from hold buffer into user space.
661 	 * We know the entire buffer is transferred since
662 	 * we checked above that the read buffer is bpf_bufsize bytes.
663 	 */
664 	error = uiomove(d->bd_hbuf, d->bd_hlen, UIO_READ, uio);
665 
666 	mutex_enter(&d->bd_lock);
667 	d->bd_fbuf = d->bd_hbuf;
668 	d->bd_hbuf = 0;
669 	d->bd_hlen = 0;
670 done:
671 	mutex_exit(&d->bd_lock);
672 	return (error);
673 }
674 
675 
676 /*
677  * If there are processes sleeping on this descriptor, wake them up.
678  * NOTE: the lock for bd_wait is bd_lock and is held by bpf_deliver,
679  * so there is no code here grabbing it.
680  */
681 static inline void
682 bpf_wakeup(struct bpf_d *d)
683 {
684 	cv_signal(&d->bd_wait);
685 }
686 
687 static void
688 bpf_timed_out(void *arg)
689 {
690 	struct bpf_d *d = arg;
691 
692 	mutex_enter(&d->bd_lock);
693 	if (d->bd_state == BPF_WAITING) {
694 		d->bd_state = BPF_TIMED_OUT;
695 		if (d->bd_slen != 0)
696 			cv_signal(&d->bd_wait);
697 	}
698 	mutex_exit(&d->bd_lock);
699 }
700 
701 
702 /* ARGSUSED */
703 int
704 bpfwrite(dev_t dev, struct uio *uio, cred_t *cred)
705 {
706 	struct bpf_d *d = bpf_dev_get(getminor(dev));
707 	uintptr_t mch;
708 	uint_t mtu;
709 	mblk_t *m;
710 	int error;
711 	int dlt;
712 
713 	if ((d->bd_fmode & FWRITE) == 0)
714 		return (EBADF);
715 
716 	mutex_enter(&d->bd_lock);
717 	if (d->bd_bif == 0 || d->bd_mcip == 0 || d->bd_bif == 0) {
718 		mutex_exit(&d->bd_lock);
719 		return (EINTR);
720 	}
721 
722 	if (uio->uio_resid == 0) {
723 		mutex_exit(&d->bd_lock);
724 		return (0);
725 	}
726 
727 	while (d->bd_inuse < 0) {
728 		d->bd_waiting++;
729 		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
730 			d->bd_waiting--;
731 			mutex_exit(&d->bd_lock);
732 			return (EINTR);
733 		}
734 		d->bd_waiting--;
735 	}
736 
737 	mutex_exit(&d->bd_lock);
738 
739 	dlt = d->bd_dlt;
740 	mch = d->bd_mcip;
741 	MBPF_SDU_GET(&d->bd_mac, d->bd_bif, &mtu);
742 	d->bd_inuse++;
743 
744 	m = NULL;
745 	if (dlt == DLT_IPNET) {
746 		error = EIO;
747 		goto done;
748 	}
749 
750 	error = bpf_movein(uio, dlt, mtu, &m);
751 	if (error)
752 		goto done;
753 
754 	DTRACE_PROBE4(bpf__tx, struct bpf_d *, d, int, dlt,
755 	    uint_t, mtu, mblk_t *, m);
756 
757 	if (M_LEN(m) > mtu) {
758 		error = EMSGSIZE;
759 		goto done;
760 	}
761 
762 	error = MBPF_TX(&d->bd_mac, mch, m);
763 	/*
764 	 * The "tx" action here is required to consume the mblk_t.
765 	 */
766 	m = NULL;
767 
768 done:
769 	if (error == 0)
770 		ks_stats.kp_write_ok.value.ui64++;
771 	else
772 		ks_stats.kp_write_error.value.ui64++;
773 	if (m != NULL)
774 		freemsg(m);
775 
776 	mutex_enter(&d->bd_lock);
777 	d->bd_inuse--;
778 	if ((d->bd_inuse == 0) && (d->bd_waiting != 0))
779 		cv_signal(&d->bd_wait);
780 	mutex_exit(&d->bd_lock);
781 
782 	/*
783 	 * The driver frees the mbuf.
784 	 */
785 	return (error);
786 }
787 
788 
789 /*
790  * Reset a descriptor by flushing its packet buffer and clearing the
791  * receive and drop counts.  Should be called at splnet.
792  */
793 static void
794 reset_d(struct bpf_d *d)
795 {
796 	if (d->bd_hbuf) {
797 		/* Free the hold buffer. */
798 		d->bd_fbuf = d->bd_hbuf;
799 		d->bd_hbuf = 0;
800 	}
801 	d->bd_slen = 0;
802 	d->bd_hlen = 0;
803 	d->bd_rcount = 0;
804 	d->bd_dcount = 0;
805 	d->bd_ccount = 0;
806 }
807 
808 /*
809  *  FIONREAD		Check for read packet available.
810  *  BIOCGBLEN		Get buffer len [for read()].
811  *  BIOCSETF		Set ethernet read filter.
812  *  BIOCFLUSH		Flush read packet buffer.
813  *  BIOCPROMISC		Put interface into promiscuous mode.
814  *  BIOCGDLT		Get link layer type.
815  *  BIOCGETIF		Get interface name.
816  *  BIOCSETIF		Set interface.
817  *  BIOCSRTIMEOUT	Set read timeout.
818  *  BIOCGRTIMEOUT	Get read timeout.
819  *  BIOCGSTATS		Get packet stats.
820  *  BIOCIMMEDIATE	Set immediate mode.
821  *  BIOCVERSION		Get filter language version.
822  *  BIOCGHDRCMPLT	Get "header already complete" flag.
823  *  BIOCSHDRCMPLT	Set "header already complete" flag.
824  */
825 /* ARGSUSED */
826 int
827 bpfioctl(dev_t dev, int cmd, intptr_t addr, int mode, cred_t *cred, int *rval)
828 {
829 	struct bpf_d *d = bpf_dev_get(getminor(dev));
830 	struct bpf_program prog;
831 	struct lifreq lifreq;
832 	struct ifreq ifreq;
833 	int error = 0;
834 	uint_t size;
835 
836 	/*
837 	 * Refresh the PID associated with this bpf file.
838 	 */
839 	mutex_enter(&d->bd_lock);
840 	if (d->bd_state == BPF_WAITING)
841 		bpf_clear_timeout(d);
842 	d->bd_state = BPF_IDLE;
843 	mutex_exit(&d->bd_lock);
844 
845 	switch (cmd) {
846 
847 	default:
848 		error = EINVAL;
849 		break;
850 
851 	/*
852 	 * Check for read packet available.
853 	 */
854 	case FIONREAD:
855 		{
856 			int n;
857 
858 			mutex_enter(&d->bd_lock);
859 			n = d->bd_slen;
860 			if (d->bd_hbuf)
861 				n += d->bd_hlen;
862 			mutex_exit(&d->bd_lock);
863 
864 			*(int *)addr = n;
865 			break;
866 		}
867 
868 	/*
869 	 * Get buffer len [for read()].
870 	 */
871 	case BIOCGBLEN:
872 		error = copyout(&d->bd_bufsize, (void *)addr,
873 		    sizeof (d->bd_bufsize));
874 		break;
875 
876 	/*
877 	 * Set buffer length.
878 	 */
879 	case BIOCSBLEN:
880 		if (copyin((void *)addr, &size, sizeof (size)) != 0) {
881 			error = EFAULT;
882 			break;
883 		}
884 
885 		mutex_enter(&d->bd_lock);
886 		if (d->bd_bif != 0) {
887 			error = EINVAL;
888 		} else {
889 			if (size > bpf_maxbufsize)
890 				size = bpf_maxbufsize;
891 			else if (size < BPF_MINBUFSIZE)
892 				size = BPF_MINBUFSIZE;
893 
894 			d->bd_bufsize = size;
895 		}
896 		mutex_exit(&d->bd_lock);
897 
898 		if (error == 0)
899 			error = copyout(&size, (void *)addr, sizeof (size));
900 		break;
901 
902 	/*
903 	 * Set link layer read filter.
904 	 */
905 	case BIOCSETF:
906 		if (ddi_copyin((void *)addr, &prog, sizeof (prog), mode)) {
907 			error = EFAULT;
908 			break;
909 		}
910 		error = bpf_setf(d, &prog);
911 		break;
912 
913 	/*
914 	 * Flush read packet buffer.
915 	 */
916 	case BIOCFLUSH:
917 		mutex_enter(&d->bd_lock);
918 		reset_d(d);
919 		mutex_exit(&d->bd_lock);
920 		break;
921 
922 	/*
923 	 * Put interface into promiscuous mode.
924 	 * This is a one-way ioctl, it is not used to turn promiscuous
925 	 * mode off.
926 	 */
927 	case BIOCPROMISC:
928 		if (d->bd_bif == 0) {
929 			/*
930 			 * No interface attached yet.
931 			 */
932 			error = EINVAL;
933 			break;
934 		}
935 		mutex_enter(&d->bd_lock);
936 		if (d->bd_promisc == 0) {
937 
938 			if (d->bd_promisc_handle) {
939 				uintptr_t mph;
940 
941 				mph = d->bd_promisc_handle;
942 				d->bd_promisc_handle = 0;
943 
944 				mutex_exit(&d->bd_lock);
945 				MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
946 				mutex_enter(&d->bd_lock);
947 			}
948 
949 			d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_COPY;
950 			error = MBPF_PROMISC_ADD(&d->bd_mac,
951 			    d->bd_mcip, MAC_CLIENT_PROMISC_ALL, d,
952 			    &d->bd_promisc_handle, d->bd_promisc_flags);
953 			if (error == 0)
954 				d->bd_promisc = 1;
955 		}
956 		mutex_exit(&d->bd_lock);
957 		break;
958 
959 	/*
960 	 * Get device parameters.
961 	 */
962 	case BIOCGDLT:
963 		if (d->bd_bif == 0)
964 			error = EINVAL;
965 		else
966 			error = copyout(&d->bd_dlt, (void *)addr,
967 			    sizeof (d->bd_dlt));
968 		break;
969 
970 	/*
971 	 * Get a list of supported device parameters.
972 	 */
973 	case BIOCGDLTLIST:
974 		if (d->bd_bif == 0) {
975 			error = EINVAL;
976 		} else {
977 			struct bpf_dltlist list;
978 
979 			if (copyin((void *)addr, &list, sizeof (list)) != 0) {
980 				error = EFAULT;
981 				break;
982 			}
983 			error = bpf_getdltlist(d, &list);
984 			if ((error == 0) &&
985 			    copyout(&list, (void *)addr, sizeof (list)) != 0)
986 				error = EFAULT;
987 		}
988 		break;
989 
990 	/*
991 	 * Set device parameters.
992 	 */
993 	case BIOCSDLT:
994 		error = bpf_setdlt(d, (void *)addr);
995 		break;
996 
997 	/*
998 	 * Get interface name.
999 	 */
1000 	case BIOCGETIF:
1001 		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1002 			error = EFAULT;
1003 			break;
1004 		}
1005 		error = bpf_ifname(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1006 		if ((error == 0) &&
1007 		    copyout(&ifreq, (void *)addr, sizeof (ifreq)) != 0) {
1008 			error = EFAULT;
1009 			break;
1010 		}
1011 		break;
1012 
1013 	/*
1014 	 * Set interface.
1015 	 */
1016 	case BIOCSETIF:
1017 		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1018 			error = EFAULT;
1019 			break;
1020 		}
1021 		error = bpf_setif(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1022 		break;
1023 
1024 	/*
1025 	 * Get interface name.
1026 	 */
1027 	case BIOCGETLIF:
1028 		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1029 			error = EFAULT;
1030 			break;
1031 		}
1032 		error = bpf_ifname(d, lifreq.lifr_name,
1033 		    sizeof (lifreq.lifr_name));
1034 		if ((error == 0) &&
1035 		    copyout(&lifreq, (void *)addr, sizeof (lifreq)) != 0) {
1036 			error = EFAULT;
1037 			break;
1038 		}
1039 		break;
1040 
1041 	/*
1042 	 * Set interface.
1043 	 */
1044 	case BIOCSETLIF:
1045 		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1046 			error = EFAULT;
1047 			break;
1048 		}
1049 		error = bpf_setif(d, lifreq.lifr_name,
1050 		    sizeof (lifreq.lifr_name));
1051 		break;
1052 
1053 #ifdef _SYSCALL32_IMPL
1054 	/*
1055 	 * Set read timeout.
1056 	 */
1057 	case BIOCSRTIMEOUT32:
1058 		{
1059 			struct timeval32 tv;
1060 
1061 			if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1062 				error = EFAULT;
1063 				break;
1064 			}
1065 
1066 			/* Convert the timeout in microseconds to ticks */
1067 			d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1068 			    tv.tv_usec);
1069 			if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1070 				d->bd_rtout = 1;
1071 			break;
1072 		}
1073 
1074 	/*
1075 	 * Get read timeout.
1076 	 */
1077 	case BIOCGRTIMEOUT32:
1078 		{
1079 			struct timeval32 tv;
1080 			clock_t ticks;
1081 
1082 			ticks = drv_hztousec(d->bd_rtout);
1083 			tv.tv_sec = ticks / 1000000;
1084 			tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1085 			error = copyout(&tv, (void *)addr, sizeof (tv));
1086 			break;
1087 		}
1088 
1089 	/*
1090 	 * Get a list of supported device parameters.
1091 	 */
1092 	case BIOCGDLTLIST32:
1093 		if (d->bd_bif == 0) {
1094 			error = EINVAL;
1095 		} else {
1096 			struct bpf_dltlist32 lst32;
1097 			struct bpf_dltlist list;
1098 
1099 			if (copyin((void *)addr, &lst32, sizeof (lst32)) != 0) {
1100 				error = EFAULT;
1101 				break;
1102 			}
1103 
1104 			list.bfl_len = lst32.bfl_len;
1105 			list.bfl_list = (void *)(uint64_t)lst32.bfl_list;
1106 			error = bpf_getdltlist(d, &list);
1107 			if (error == 0) {
1108 				lst32.bfl_len = list.bfl_len;
1109 
1110 				if (copyout(&lst32, (void *)addr,
1111 				    sizeof (lst32)) != 0)
1112 					error = EFAULT;
1113 			}
1114 		}
1115 		break;
1116 
1117 	/*
1118 	 * Set link layer read filter.
1119 	 */
1120 	case BIOCSETF32: {
1121 		struct bpf_program32 prog32;
1122 
1123 		if (ddi_copyin((void *)addr, &prog32, sizeof (prog), mode)) {
1124 			error = EFAULT;
1125 			break;
1126 		}
1127 		prog.bf_len = prog32.bf_len;
1128 		prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
1129 		error = bpf_setf(d, &prog);
1130 		break;
1131 	}
1132 #endif
1133 
1134 	/*
1135 	 * Set read timeout.
1136 	 */
1137 	case BIOCSRTIMEOUT:
1138 		{
1139 			struct timeval tv;
1140 
1141 			if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1142 				error = EFAULT;
1143 				break;
1144 			}
1145 
1146 			/* Convert the timeout in microseconds to ticks */
1147 			d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1148 			    tv.tv_usec);
1149 			if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1150 				d->bd_rtout = 1;
1151 			break;
1152 		}
1153 
1154 	/*
1155 	 * Get read timeout.
1156 	 */
1157 	case BIOCGRTIMEOUT:
1158 		{
1159 			struct timeval tv;
1160 			clock_t ticks;
1161 
1162 			ticks = drv_hztousec(d->bd_rtout);
1163 			tv.tv_sec = ticks / 1000000;
1164 			tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1165 			if (copyout(&tv, (void *)addr, sizeof (tv)) != 0)
1166 				error = EFAULT;
1167 			break;
1168 		}
1169 
1170 	/*
1171 	 * Get packet stats.
1172 	 */
1173 	case BIOCGSTATS:
1174 		{
1175 			struct bpf_stat bs;
1176 
1177 			bs.bs_recv = d->bd_rcount;
1178 			bs.bs_drop = d->bd_dcount;
1179 			bs.bs_capt = d->bd_ccount;
1180 			if (copyout(&bs, (void *)addr, sizeof (bs)) != 0)
1181 				error = EFAULT;
1182 			break;
1183 		}
1184 
1185 	/*
1186 	 * Set immediate mode.
1187 	 */
1188 	case BIOCIMMEDIATE:
1189 		if (copyin((void *)addr, &d->bd_immediate,
1190 		    sizeof (d->bd_immediate)) != 0)
1191 			error = EFAULT;
1192 		break;
1193 
1194 	case BIOCVERSION:
1195 		{
1196 			struct bpf_version bv;
1197 
1198 			bv.bv_major = BPF_MAJOR_VERSION;
1199 			bv.bv_minor = BPF_MINOR_VERSION;
1200 			if (copyout(&bv, (void *)addr, sizeof (bv)) != 0)
1201 				error = EFAULT;
1202 			break;
1203 		}
1204 
1205 	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
1206 		if (copyout(&d->bd_hdrcmplt, (void *)addr,
1207 		    sizeof (d->bd_hdrcmplt)) != 0)
1208 			error = EFAULT;
1209 		break;
1210 
1211 	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
1212 		if (copyin((void *)addr, &d->bd_hdrcmplt,
1213 		    sizeof (d->bd_hdrcmplt)) != 0)
1214 			error = EFAULT;
1215 		break;
1216 
1217 	/*
1218 	 * Get "see sent packets" flag
1219 	 */
1220 	case BIOCGSEESENT:
1221 		if (copyout(&d->bd_seesent, (void *)addr,
1222 		    sizeof (d->bd_seesent)) != 0)
1223 			error = EFAULT;
1224 		break;
1225 
1226 	/*
1227 	 * Set "see sent" packets flag
1228 	 */
1229 	case BIOCSSEESENT:
1230 		if (copyin((void *)addr, &d->bd_seesent,
1231 		    sizeof (d->bd_seesent)) != 0)
1232 			error = EFAULT;
1233 		break;
1234 
1235 	case FIONBIO:		/* Non-blocking I/O */
1236 		if (copyin((void *)addr, &d->bd_nonblock,
1237 		    sizeof (d->bd_nonblock)) != 0)
1238 			error = EFAULT;
1239 		break;
1240 	}
1241 	return (error);
1242 }
1243 
1244 /*
1245  * Set d's packet filter program to fp.  If this file already has a filter,
1246  * free it and replace it. If the new filter is "empty" (has a 0 size), then
1247  * the result is to just remove and free the existing filter.
1248  * Returns EINVAL for bogus requests.
1249  */
1250 int
1251 bpf_setf(struct bpf_d *d, struct bpf_program *fp)
1252 {
1253 	struct bpf_insn *fcode, *old;
1254 	uint_t flen, size;
1255 	size_t oldsize;
1256 
1257 	if (fp->bf_insns == 0) {
1258 		if (fp->bf_len != 0)
1259 			return (EINVAL);
1260 		mutex_enter(&d->bd_lock);
1261 		old = d->bd_filter;
1262 		oldsize = d->bd_filter_size;
1263 		d->bd_filter = 0;
1264 		d->bd_filter_size = 0;
1265 		reset_d(d);
1266 		mutex_exit(&d->bd_lock);
1267 		if (old != 0)
1268 			kmem_free(old, oldsize);
1269 		return (0);
1270 	}
1271 	flen = fp->bf_len;
1272 	if (flen > BPF_MAXINSNS)
1273 		return (EINVAL);
1274 
1275 	size = flen * sizeof (*fp->bf_insns);
1276 	fcode = kmem_alloc(size, KM_SLEEP);
1277 	if (copyin(fp->bf_insns, fcode, size) != 0)
1278 		return (EFAULT);
1279 
1280 	if (bpf_validate(fcode, (int)flen)) {
1281 		mutex_enter(&d->bd_lock);
1282 		old = d->bd_filter;
1283 		oldsize = d->bd_filter_size;
1284 		d->bd_filter = fcode;
1285 		d->bd_filter_size = size;
1286 		reset_d(d);
1287 		mutex_exit(&d->bd_lock);
1288 		if (old != 0)
1289 			kmem_free(old, oldsize);
1290 
1291 		return (0);
1292 	}
1293 	kmem_free(fcode, size);
1294 	return (EINVAL);
1295 }
1296 
1297 /*
1298  * Detach a file from its current interface (if attached at all) and attach
1299  * to the interface indicated by the name stored in ifname.
1300  * Return an errno or 0.
1301  */
1302 static int
1303 bpf_setif(struct bpf_d *d, char *ifname, int namesize)
1304 {
1305 	int unit_seen;
1306 	int error = 0;
1307 	char *cp;
1308 	int i;
1309 
1310 	/*
1311 	 * Make sure the provided name has a unit number, and default
1312 	 * it to '0' if not specified.
1313 	 * XXX This is ugly ... do this differently?
1314 	 */
1315 	unit_seen = 0;
1316 	cp = ifname;
1317 	cp[namesize - 1] = '\0';	/* sanity */
1318 	while (*cp++)
1319 		if (*cp >= '0' && *cp <= '9')
1320 			unit_seen = 1;
1321 	if (!unit_seen) {
1322 		/* Make sure to leave room for the '\0'. */
1323 		for (i = 0; i < (namesize - 1); ++i) {
1324 			if ((ifname[i] >= 'a' && ifname[i] <= 'z') ||
1325 			    (ifname[i] >= 'A' && ifname[i] <= 'Z'))
1326 				continue;
1327 			ifname[i] = '0';
1328 		}
1329 	}
1330 
1331 	/*
1332 	 * Make sure that only one call to this function happens at a time
1333 	 * and that we're not interleaving a read/write
1334 	 */
1335 	mutex_enter(&d->bd_lock);
1336 	while (d->bd_inuse != 0) {
1337 		d->bd_waiting++;
1338 		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
1339 			d->bd_waiting--;
1340 			mutex_exit(&d->bd_lock);
1341 			return (EINTR);
1342 		}
1343 		d->bd_waiting--;
1344 	}
1345 	d->bd_inuse = -1;
1346 	mutex_exit(&d->bd_lock);
1347 
1348 	if (d->bd_sbuf == 0)
1349 		error = bpf_allocbufs(d);
1350 
1351 	if (error == 0) {
1352 		mutex_enter(&d->bd_lock);
1353 		if (d->bd_bif)
1354 			/*
1355 			 * Detach if attached to something else.
1356 			 */
1357 			bpf_detachd(d);
1358 
1359 		error = bpf_attachd(d, ifname, -1);
1360 		reset_d(d);
1361 		d->bd_inuse = 0;
1362 		if (d->bd_waiting != 0)
1363 			cv_signal(&d->bd_wait);
1364 		mutex_exit(&d->bd_lock);
1365 		return (error);
1366 	}
1367 
1368 	mutex_enter(&d->bd_lock);
1369 	d->bd_inuse = 0;
1370 	if (d->bd_waiting != 0)
1371 		cv_signal(&d->bd_wait);
1372 	mutex_exit(&d->bd_lock);
1373 
1374 	/*
1375 	 * Try tickle the mac layer into attaching the device...
1376 	 */
1377 	return (bpf_provider_tickle(ifname, d->bd_zone));
1378 }
1379 
1380 /*
1381  * Copy the interface name to the ifreq.
1382  */
1383 static int
1384 bpf_ifname(struct bpf_d *d, char *buffer, int bufsize)
1385 {
1386 
1387 	mutex_enter(&d->bd_lock);
1388 	if (d->bd_bif == NULL) {
1389 		mutex_exit(&d->bd_lock);
1390 		return (EINVAL);
1391 	}
1392 
1393 	(void) strlcpy(buffer, d->bd_ifname, bufsize);
1394 	mutex_exit(&d->bd_lock);
1395 
1396 	return (0);
1397 }
1398 
1399 /*
1400  * Support for poll() system call
1401  *
1402  * Return true iff the specific operation will not block indefinitely - with
1403  * the assumption that it is safe to positively acknowledge a request for the
1404  * ability to write to the BPF device.
1405  * Otherwise, return false but make a note that a selnotify() must be done.
1406  */
1407 int
1408 bpfchpoll(dev_t dev, short events, int anyyet, short *reventsp,
1409     struct pollhead **phpp)
1410 {
1411 	struct bpf_d *d = bpf_dev_get(getminor(dev));
1412 
1413 	if (events & (POLLIN | POLLRDNORM)) {
1414 		/*
1415 		 * An imitation of the FIONREAD ioctl code.
1416 		 */
1417 		mutex_enter(&d->bd_lock);
1418 		if (d->bd_hlen != 0 ||
1419 		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
1420 		    d->bd_slen != 0)) {
1421 			*reventsp |= events & (POLLIN | POLLRDNORM);
1422 		} else {
1423 			*reventsp = 0;
1424 			if (!anyyet)
1425 				*phpp = &d->bd_poll;
1426 			/* Start the read timeout if necessary */
1427 			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
1428 				bpf_clear_timeout(d);
1429 				/*
1430 				 * Only allow the timeout to be set once.
1431 				 */
1432 				if (d->bd_callout == 0)
1433 					d->bd_callout = timeout(bpf_timed_out,
1434 					    d, d->bd_rtout);
1435 				d->bd_state = BPF_WAITING;
1436 			}
1437 		}
1438 		mutex_exit(&d->bd_lock);
1439 	}
1440 
1441 	return (0);
1442 }
1443 
1444 /*
1445  * Copy data from an mblk_t chain into a buffer. This works for ipnet
1446  * because the dl_ipnetinfo_t is placed in an mblk_t that leads the
1447  * packet itself.
1448  */
1449 static void *
1450 bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)
1451 {
1452 	const mblk_t *m;
1453 	uint_t count;
1454 	uchar_t *dst;
1455 
1456 	m = src_arg;
1457 	dst = dst_arg;
1458 	while (len > 0) {
1459 		if (m == NULL)
1460 			panic("bpf_mcpy");
1461 		count = (uint_t)min(M_LEN(m), len);
1462 		(void) memcpy(dst, mtod(m, const void *), count);
1463 		m = m->b_cont;
1464 		dst += count;
1465 		len -= count;
1466 	}
1467 	return (dst_arg);
1468 }
1469 
1470 /*
1471  * Dispatch a packet to all the listeners on interface bp.
1472  *
1473  * marg    pointer to the packet, either a data buffer or an mbuf chain
1474  * buflen  buffer length, if marg is a data buffer
1475  * cpfn    a function that can copy marg into the listener's buffer
1476  * pktlen  length of the packet
1477  * issent  boolean indicating whether the packet was sent or receive
1478  */
1479 static inline void
1480 bpf_deliver(struct bpf_d *d, cp_fn_t cpfn, void *marg, uint_t pktlen,
1481     uint_t buflen, boolean_t issent)
1482 {
1483 	struct timeval tv;
1484 	uint_t slen;
1485 
1486 	if (!d->bd_seesent && issent)
1487 		return;
1488 
1489 	/*
1490 	 * Accuracy of the packet counters in BPF is vital so it
1491 	 * is important to protect even the outer ones.
1492 	 */
1493 	mutex_enter(&d->bd_lock);
1494 	slen = bpf_filter(d->bd_filter, marg, pktlen, buflen);
1495 	DTRACE_PROBE5(bpf__packet, struct bpf_if *, d->bd_bif,
1496 	    struct bpf_d *, d, void *, marg, uint_t, pktlen, uint_t, slen);
1497 	d->bd_rcount++;
1498 	ks_stats.kp_receive.value.ui64++;
1499 	if (slen != 0) {
1500 		uniqtime(&tv);
1501 		catchpacket(d, marg, pktlen, slen, cpfn, &tv);
1502 	}
1503 	mutex_exit(&d->bd_lock);
1504 }
1505 
1506 /*
1507  * Incoming linkage from device drivers.
1508  */
1509 /* ARGSUSED */
1510 void
1511 bpf_mtap(void *arg, mac_resource_handle_t mrh, mblk_t *m, boolean_t issent)
1512 {
1513 	cp_fn_t cpfn;
1514 	struct bpf_d *d = arg;
1515 	uint_t pktlen, buflen;
1516 	void *marg;
1517 
1518 	pktlen = msgdsize(m);
1519 
1520 	if (pktlen == M_LEN(m)) {
1521 		cpfn = (cp_fn_t)memcpy;
1522 		marg = mtod(m, void *);
1523 		buflen = pktlen;
1524 	} else {
1525 		cpfn = bpf_mcpy;
1526 		marg = m;
1527 		buflen = 0;
1528 	}
1529 
1530 	bpf_deliver(d, cpfn, marg, pktlen, buflen, issent);
1531 }
1532 
1533 /*
1534  * Incoming linkage from ipnet.
1535  * In ipnet, there is only one event, NH_OBSERVE, that delivers packets
1536  * from all network interfaces. Thus the tap function needs to apply a
1537  * filter using the interface index/id to immitate snoop'ing on just the
1538  * specified interface.
1539  */
1540 /* ARGSUSED */
1541 void
1542 bpf_itap(void *arg, mblk_t *m, boolean_t issent, uint_t length)
1543 {
1544 	hook_pkt_observe_t *hdr;
1545 	struct bpf_d *d = arg;
1546 
1547 	hdr = (hook_pkt_observe_t *)m->b_rptr;
1548 	if (ntohl(hdr->hpo_ifindex) != d->bd_linkid)
1549 		return;
1550 	bpf_deliver(d, bpf_mcpy, m, length, 0, issent);
1551 
1552 }
1553 
1554 /*
1555  * Move the packet data from interface memory (pkt) into the
1556  * store buffer.  Return 1 if it's time to wakeup a listener (buffer full),
1557  * otherwise 0.  "copy" is the routine called to do the actual data
1558  * transfer.  memcpy is passed in to copy contiguous chunks, while
1559  * bpf_mcpy is passed in to copy mbuf chains.  In the latter case,
1560  * pkt is really an mbuf.
1561  */
1562 static void
1563 catchpacket(struct bpf_d *d, uchar_t *pkt, uint_t pktlen, uint_t snaplen,
1564     cp_fn_t cpfn, struct timeval *tv)
1565 {
1566 	struct bpf_hdr *hp;
1567 	int totlen, curlen;
1568 	int hdrlen = d->bd_hdrlen;
1569 	int do_wakeup = 0;
1570 
1571 	++d->bd_ccount;
1572 	ks_stats.kp_capture.value.ui64++;
1573 	/*
1574 	 * Figure out how many bytes to move.  If the packet is
1575 	 * greater or equal to the snapshot length, transfer that
1576 	 * much.  Otherwise, transfer the whole packet (unless
1577 	 * we hit the buffer size limit).
1578 	 */
1579 	totlen = hdrlen + min(snaplen, pktlen);
1580 	if (totlen > d->bd_bufsize)
1581 		totlen = d->bd_bufsize;
1582 
1583 	/*
1584 	 * Round up the end of the previous packet to the next longword.
1585 	 */
1586 	curlen = BPF_WORDALIGN(d->bd_slen);
1587 	if (curlen + totlen > d->bd_bufsize) {
1588 		/*
1589 		 * This packet will overflow the storage buffer.
1590 		 * Rotate the buffers if we can, then wakeup any
1591 		 * pending reads.
1592 		 */
1593 		if (d->bd_fbuf == 0) {
1594 			/*
1595 			 * We haven't completed the previous read yet,
1596 			 * so drop the packet.
1597 			 */
1598 			++d->bd_dcount;
1599 			ks_stats.kp_dropped.value.ui64++;
1600 			return;
1601 		}
1602 		ROTATE_BUFFERS(d);
1603 		do_wakeup = 1;
1604 		curlen = 0;
1605 	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
1606 		/*
1607 		 * Immediate mode is set, or the read timeout has
1608 		 * already expired during a select call.  A packet
1609 		 * arrived, so the reader should be woken up.
1610 		 */
1611 		do_wakeup = 1;
1612 	}
1613 
1614 	/*
1615 	 * Append the bpf header to the existing buffer before we add
1616 	 * on the actual packet data.
1617 	 */
1618 	hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
1619 	hp->bh_tstamp.tv_sec = tv->tv_sec;
1620 	hp->bh_tstamp.tv_usec = tv->tv_usec;
1621 	hp->bh_datalen = pktlen;
1622 	hp->bh_hdrlen = (uint16_t)hdrlen;
1623 	/*
1624 	 * Copy the packet data into the store buffer and update its length.
1625 	 */
1626 	(*cpfn)((uchar_t *)hp + hdrlen, pkt,
1627 	    (hp->bh_caplen = totlen - hdrlen));
1628 	d->bd_slen = curlen + totlen;
1629 
1630 	/*
1631 	 * Call bpf_wakeup after bd_slen has been updated.
1632 	 */
1633 	if (do_wakeup)
1634 		bpf_wakeup(d);
1635 }
1636 
1637 /*
1638  * Initialize all nonzero fields of a descriptor.
1639  */
1640 static int
1641 bpf_allocbufs(struct bpf_d *d)
1642 {
1643 
1644 	d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1645 	if (!d->bd_fbuf)
1646 		return (ENOBUFS);
1647 	d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1648 	if (!d->bd_sbuf) {
1649 		kmem_free(d->bd_fbuf, d->bd_bufsize);
1650 		return (ENOBUFS);
1651 	}
1652 	d->bd_slen = 0;
1653 	d->bd_hlen = 0;
1654 	return (0);
1655 }
1656 
1657 /*
1658  * Free buffers currently in use by a descriptor.
1659  * Called on close.
1660  */
1661 static void
1662 bpf_freed(struct bpf_d *d)
1663 {
1664 	/*
1665 	 * At this point the descriptor has been detached from its
1666 	 * interface and it yet hasn't been marked free.
1667 	 */
1668 	if (d->bd_sbuf != 0) {
1669 		kmem_free(d->bd_sbuf, d->bd_bufsize);
1670 		if (d->bd_hbuf != 0)
1671 			kmem_free(d->bd_hbuf, d->bd_bufsize);
1672 		if (d->bd_fbuf != 0)
1673 			kmem_free(d->bd_fbuf, d->bd_bufsize);
1674 	}
1675 	if (d->bd_filter)
1676 		kmem_free(d->bd_filter, d->bd_filter_size);
1677 }
1678 
1679 /*
1680  * Get a list of available data link type of the interface.
1681  */
1682 static int
1683 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *listp)
1684 {
1685 	bpf_provider_list_t *bp;
1686 	bpf_provider_t *bpr;
1687 	zoneid_t zoneid;
1688 	uintptr_t mcip;
1689 	uint_t nicdlt;
1690 	uintptr_t mh;
1691 	int error;
1692 	int n;
1693 
1694 	n = 0;
1695 	mh = 0;
1696 	mcip = 0;
1697 	error = 0;
1698 	mutex_enter(&d->bd_lock);
1699 	LIST_FOREACH(bp, &bpf_providers, bpl_next) {
1700 		bpr = bp->bpl_what;
1701 		error = MBPF_OPEN(bpr, d->bd_ifname, &mh, d->bd_zone);
1702 		if (error != 0)
1703 			goto next;
1704 		error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
1705 		if (error != 0)
1706 			goto next;
1707 		error = MBPF_GET_ZONE(bpr, mh, &zoneid);
1708 		if (error != 0)
1709 			goto next;
1710 		if (d->bd_zone != GLOBAL_ZONEID &&
1711 		    d->bd_zone != zoneid)
1712 			goto next;
1713 		error = MBPF_GET_DLT(bpr, mh, &nicdlt);
1714 		if (error != 0)
1715 			goto next;
1716 		nicdlt = bpf_dl_to_dlt(nicdlt);
1717 		if (listp->bfl_list != NULL) {
1718 			if (n >= listp->bfl_len) {
1719 				MBPF_CLIENT_CLOSE(bpr, mcip);
1720 				MBPF_CLOSE(bpr, mh);
1721 				break;
1722 			}
1723 			/*
1724 			 * Bumping of bd_inuse ensures the structure does not
1725 			 * disappear while the copyout runs and allows the for
1726 			 * loop to be continued.
1727 			 */
1728 			d->bd_inuse++;
1729 			mutex_exit(&d->bd_lock);
1730 			if (copyout(&nicdlt,
1731 			    listp->bfl_list + n, sizeof (uint_t)) != 0)
1732 				error = EFAULT;
1733 			mutex_enter(&d->bd_lock);
1734 			if (error != 0)
1735 				break;
1736 			d->bd_inuse--;
1737 		}
1738 		n++;
1739 next:
1740 		if (mcip != 0) {
1741 			MBPF_CLIENT_CLOSE(bpr, mcip);
1742 			mcip = 0;
1743 		}
1744 		if (mh != 0) {
1745 			MBPF_CLOSE(bpr, mh);
1746 			mh = 0;
1747 		}
1748 	}
1749 	mutex_exit(&d->bd_lock);
1750 
1751 	/*
1752 	 * It is quite possible that one or more provider to BPF may not
1753 	 * know about a link name whlist others do. In that case, so long
1754 	 * as we have one success, do not declare an error unless it was
1755 	 * an EFAULT as this indicates a problem that needs to be reported.
1756 	 */
1757 	if ((error != EFAULT) && (n > 0))
1758 		error = 0;
1759 
1760 	listp->bfl_len = n;
1761 	return (error);
1762 }
1763 
1764 /*
1765  * Set the data link type of a BPF instance.
1766  */
1767 static int
1768 bpf_setdlt(struct bpf_d *d, void *addr)
1769 {
1770 	char ifname[LIFNAMSIZ+1];
1771 	zoneid_t niczone;
1772 	int error;
1773 	int dlt;
1774 
1775 	if (copyin(addr, &dlt, sizeof (dlt)) != 0)
1776 		return (EFAULT);
1777 
1778 	mutex_enter(&d->bd_lock);
1779 
1780 	if (d->bd_bif == 0) {			/* Interface not set */
1781 		mutex_exit(&d->bd_lock);
1782 		return (EINVAL);
1783 	}
1784 	if (d->bd_dlt == dlt) {	/* NULL-op */
1785 		mutex_exit(&d->bd_lock);
1786 		return (0);
1787 	}
1788 
1789 	error = MBPF_GET_ZONE(&d->bd_mac, d->bd_bif, &niczone);
1790 	if (error != 0) {
1791 		mutex_exit(&d->bd_lock);
1792 		return (error);
1793 	}
1794 
1795 	/*
1796 	 * See the matrix at the top of the file for the permissions table
1797 	 * enforced by this driver.
1798 	 */
1799 	if ((d->bd_zone != GLOBAL_ZONEID) && (dlt != DLT_IPNET) &&
1800 	    (niczone != d->bd_zone)) {
1801 		mutex_exit(&d->bd_lock);
1802 		return (EINVAL);
1803 	}
1804 
1805 	(void) strlcpy(ifname, d->bd_ifname, sizeof (ifname));
1806 	d->bd_inuse = -1;
1807 	bpf_detachd(d);
1808 	error = bpf_attachd(d, ifname, dlt);
1809 	reset_d(d);
1810 	d->bd_inuse = 0;
1811 
1812 	mutex_exit(&d->bd_lock);
1813 	return (error);
1814 }
1815 
1816 /*
1817  * bpf_clear_timeout is called with the bd_lock mutex held, providing it
1818  * with the necessary protection to retrieve and modify bd_callout but it
1819  * does not hold the lock for its entire duration... see below...
1820  */
1821 static void
1822 bpf_clear_timeout(struct bpf_d *d)
1823 {
1824 	timeout_id_t tid = d->bd_callout;
1825 	d->bd_callout = 0;
1826 	d->bd_inuse++;
1827 
1828 	/*
1829 	 * If the timeout has fired and is waiting on bd_lock, we could
1830 	 * deadlock here because untimeout if bd_lock is held and would
1831 	 * wait for bpf_timed_out to finish and it never would.
1832 	 */
1833 	if (tid != 0) {
1834 		mutex_exit(&d->bd_lock);
1835 		(void) untimeout(tid);
1836 		mutex_enter(&d->bd_lock);
1837 	}
1838 
1839 	d->bd_inuse--;
1840 }
1841 
1842 /*
1843  * As a cloning device driver, BPF needs to keep track of which device
1844  * numbers are in use and which ones are not. A hash table, indexed by
1845  * the minor device number, is used to store the pointers to the
1846  * individual descriptors that are allocated in bpfopen().
1847  * The functions below present the interface for that hash table to
1848  * the rest of the driver.
1849  */
1850 static struct bpf_d *
1851 bpf_dev_find(minor_t minor)
1852 {
1853 	struct bpf_d *d = NULL;
1854 
1855 	(void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1856 	    (mod_hash_val_t *)&d);
1857 
1858 	return (d);
1859 }
1860 
1861 static void
1862 bpf_dev_add(struct bpf_d *d)
1863 {
1864 	(void) mod_hash_insert(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1865 	    (mod_hash_val_t)d);
1866 }
1867 
1868 static void
1869 bpf_dev_remove(struct bpf_d *d)
1870 {
1871 	struct bpf_d *stor;
1872 
1873 	(void) mod_hash_remove(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1874 	    (mod_hash_val_t *)&stor);
1875 	ASSERT(stor == d);
1876 }
1877 
1878 /*
1879  * bpf_def_get should only ever be called for a minor number that exists,
1880  * thus there should always be a pointer in the hash table that corresponds
1881  * to it.
1882  */
1883 static struct bpf_d *
1884 bpf_dev_get(minor_t minor)
1885 {
1886 	struct bpf_d *d = NULL;
1887 
1888 	(void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1889 	    (mod_hash_val_t *)&d);
1890 	ASSERT(d != NULL);
1891 
1892 	return (d);
1893 }
1894