1 /* $NetBSD: bpf.c,v 1.143 2009/03/11 05:55:22 mrg Exp $ */
2
3 /*
4 * Copyright (c) 1990, 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from the Stanford/CMU enet packet filter,
8 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
9 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
10 * Berkeley Laboratory.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * @(#)bpf.c 8.4 (Berkeley) 1/9/95
37 * static char rcsid[] =
38 * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
39 */
40 /*
41 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
42 * Use is subject to license terms.
43 * Copyright 2017 Joyent, Inc.
44 */
45
46 /*
47 * The BPF implements the following access controls for zones attempting
48 * to read and write data. Writing data requires that the net_rawaccess
49 * privilege is held, whilst reading data requires either net_rawaccess or
50 * net_observability.
51 *
52 * | Shared | Exclusive | Global
53 * -----------------------------+--------+------------+------------+
54 * DLT_IPNET in local zone | Read | Read | Read |
55 * -----------------------------+--------+------------+------------+
56 * Raw access to local zone NIC | None | Read/Write | Read/Write |
57 * -----------------------------+--------+------------+------------+
58 * Raw access to all NICs | None | None | Read/Write |
59 * -----------------------------+--------+------------+------------+
60 *
61 * The BPF driver is written as a cloning driver: each call to bpfopen()
62 * allocates a new minor number. This provides BPF with a 1:1 relationship
63 * between opens and closes. There is some amount of "descriptor state"
64 * that is kept per open. Pointers to this data are stored in a hash table
65 * (bpf_hash) that is indexed by the minor device number of each open file.
66 */
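/*
 * A minimal userland sketch of the open/attach/read sequence described
 * above (illustrative only: error handling is omitted, the link name
 * "net0" is an assumption, and a real consumer should query the buffer
 * size with BIOCGBLEN rather than hard-coding it):
 *
 *	#include <sys/types.h>
 *	#include <sys/ioctl.h>
 *	#include <net/bpf.h>
 *	#include <net/if.h>
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *
 *	void
 *	capture_sketch(void)
 *	{
 *		struct ifreq ifr;
 *		char buf[32 * 1024];	(matches the default bd_bufsize)
 *		int fd;
 *
 *		fd = open("/dev/bpf", O_RDONLY);
 *		(void) memset(&ifr, 0, sizeof (ifr));
 *		(void) strlcpy(ifr.ifr_name, "net0", sizeof (ifr.ifr_name));
 *		(void) ioctl(fd, BIOCSETIF, &ifr);
 *		(void) read(fd, buf, sizeof (buf));
 *		(void) close(fd);
 *	}
 *
 * Each open() clones a fresh minor device, and read() must be issued
 * with a buffer exactly bd_bufsize bytes long (see bpfread() below).
 */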
67 #include <sys/param.h>
68 #include <sys/systm.h>
69 #include <sys/time.h>
70 #include <sys/ioctl.h>
71 #include <sys/queue.h>
72 #include <sys/filio.h>
73 #include <sys/policy.h>
74 #include <sys/cmn_err.h>
75 #include <sys/uio.h>
76 #include <sys/file.h>
77 #include <sys/sysmacros.h>
78 #include <sys/zone.h>
79
80 #include <sys/socket.h>
81 #include <sys/errno.h>
82 #include <sys/poll.h>
83 #include <sys/dlpi.h>
84 #include <sys/neti.h>
85
86 #include <net/if.h>
87
88 #include <net/bpf.h>
89 #include <net/bpfdesc.h>
90 #include <net/dlt.h>
91
92 #include <netinet/in.h>
93 #include <sys/mac.h>
94 #include <sys/mac_client.h>
95 #include <sys/mac_impl.h>
96 #include <sys/time_std_impl.h>
97 #include <sys/hook.h>
98 #include <sys/hook_event.h>
99
100
101 #define mtod(_v, _t) (_t)((_v)->b_rptr)
102 #define M_LEN(_m) ((_m)->b_wptr - (_m)->b_rptr)
103
104 /*
105 * 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet
106 * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k).
107 */
108 #define BPF_BUFSIZE (32 * 1024)
109
110 typedef void *(*cp_fn_t)(void *, const void *, size_t);
111
112 /*
113 * The default read buffer size, and limit for BIOCSBLEN.
114 */
115 int bpf_bufsize = BPF_BUFSIZE;
116 int bpf_maxbufsize = (16 * 1024 * 1024);
117 static mod_hash_t *bpf_hash = NULL;
118
119 /*
120 * Use a mutex to avoid a race condition between gathering the stats/peers
121 * and opening/closing the device.
122 */
123 static kcondvar_t bpf_dlt_waiter;
124 static kmutex_t bpf_mtx;
125 static bpf_kstats_t ks_stats;
126 static bpf_kstats_t bpf_kstats = {
127 { "readWait", KSTAT_DATA_UINT64 },
128 { "writeOk", KSTAT_DATA_UINT64 },
129 { "writeError", KSTAT_DATA_UINT64 },
130 { "receive", KSTAT_DATA_UINT64 },
131 { "captured", KSTAT_DATA_UINT64 },
132 { "dropped", KSTAT_DATA_UINT64 },
133 };
134 static kstat_t *bpf_ksp;
135
136 /*
137 * bpf_list is a list of the BPF descriptors currently open
138 */
139 LIST_HEAD(, bpf_d) bpf_list;
140
141 static int bpf_allocbufs(struct bpf_d *);
142 static void bpf_clear_timeout(struct bpf_d *);
143 static void bpf_deliver(struct bpf_d *, cp_fn_t,
144 void *, uint_t, uint_t, boolean_t);
145 static void bpf_freed(struct bpf_d *);
146 static int bpf_ifname(struct bpf_d *d, char *, int);
147 static void *bpf_mcpy(void *, const void *, size_t);
148 static int bpf_attachd(struct bpf_d *, const char *, int);
149 static void bpf_detachd(struct bpf_d *);
150 static int bpf_setif(struct bpf_d *, char *, int);
151 static void bpf_timed_out(void *);
152 static inline void
153 bpf_wakeup(struct bpf_d *);
154 static void catchpacket(struct bpf_d *, uchar_t *, uint_t, uint_t,
155 cp_fn_t, struct timeval *);
156 static void reset_d(struct bpf_d *);
157 static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
158 static int bpf_setdlt(struct bpf_d *, void *);
159 static void bpf_dev_add(struct bpf_d *);
160 static struct bpf_d *bpf_dev_find(minor_t);
161 static struct bpf_d *bpf_dev_get(minor_t);
162 static void bpf_dev_remove(struct bpf_d *);
163
164 static int
165 bpf_movein(struct uio *uio, int linktype, int mtu, mblk_t **mp)
166 {
167 mblk_t *m;
168 int error;
169 int len;
170 int hlen;
171 int align;
172
173 /*
174 * Determine the link-layer header length for this data link
175 * type. The caller supplies the link-layer header as part of
176 * the data being written, and the frame is transmitted as is,
177 * so hlen is only needed to validate the size of the write.
178 * We are also careful to leave room at the front of the mblk
179 * so that the payload following the link-layer header ends up
180 * properly aligned.
181 */
182 switch (linktype) {
183
184 case DLT_EN10MB:
185 hlen = sizeof (struct ether_header);
186 break;
187
188 case DLT_FDDI:
189 hlen = 16;
190 break;
191
192 case DLT_NULL:
193 hlen = 0;
194 break;
195
196 case DLT_IPOIB:
197 hlen = 44;
198 break;
199
200 default:
201 return (EIO);
202 }
203
204 align = 4 - (hlen & 3);
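/*
 * Worked example: for DLT_EN10MB, hlen is 14, so align is
 * 4 - (14 & 3) = 2.  Advancing b_rptr by 2 below leaves the payload
 * that follows the 14-byte link header on a 4-byte boundary, assuming
 * allocb() returns an aligned base.  When hlen is already a multiple
 * of 4 (e.g. DLT_NULL), align is 4 and four bytes are simply wasted.
 */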
205
206 len = uio->uio_resid;
207 /*
208 * If there aren't enough bytes for a link level header or the
209 * packet length exceeds the interface mtu, return an error.
210 */
211 if (len < hlen || len - hlen > mtu)
212 return (EMSGSIZE);
213
214 m = allocb(len + align, BPRI_MED);
215 if (m == NULL) {
216 error = ENOBUFS;
217 goto bad;
218 }
219
220 /* Ensure the data is properly aligned */
221 if (align > 0)
222 m->b_rptr += align;
223 m->b_wptr = m->b_rptr + len;
224
225 error = uiomove(mtod(m, void *), len, UIO_WRITE, uio);
226 if (error)
227 goto bad;
228 *mp = m;
229 return (0);
230
231 bad:
232 if (m != NULL)
233 freemsg(m);
234 return (error);
235 }
236
237
238 /*
239 * Attach file to the bpf interface, i.e. make d listen on bp.
240 */
241 static int
242 bpf_attachd(struct bpf_d *d, const char *ifname, int dlt)
243 {
244 bpf_provider_list_t *bp;
245 bpf_provider_t *bpr;
246 boolean_t zonematch;
247 zoneid_t niczone;
248 uintptr_t mcip;
249 zoneid_t zone;
250 uint_t nicdlt;
251 uintptr_t mh;
252 int hdrlen;
253 int error;
254
255 ASSERT(d->bd_bif == (uintptr_t)NULL);
256 ASSERT(d->bd_mcip == (uintptr_t)NULL);
257 zone = d->bd_zone;
258 zonematch = B_TRUE;
259 error = 0;
260 bpr = NULL;
261 again:
262 mh = 0;
263 mcip = 0;
264 LIST_FOREACH(bp, &bpf_providers, bpl_next) {
265 bpr = bp->bpl_what;
266 error = MBPF_OPEN(bpr, ifname, &mh, zone);
267 if (error != 0)
268 goto next;
269 error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
270 if (error != 0)
271 goto next;
272 error = MBPF_GET_DLT(bpr, mh, &nicdlt);
273 if (error != 0)
274 goto next;
275
276 nicdlt = bpf_dl_to_dlt(nicdlt);
277 if (dlt != -1 && dlt != nicdlt) {
278 error = ENOENT;
279 goto next;
280 }
281
282 error = MBPF_GET_ZONE(bpr, mh, &niczone);
283 if (error != 0)
284 goto next;
285
286 DTRACE_PROBE4(bpf__attach, struct bpf_provider_s *, bpr,
287 uintptr_t, mh, int, nicdlt, zoneid_t, niczone);
288
289 if (zonematch && niczone != zone) {
290 error = ENOENT;
291 goto next;
292 }
293 break;
294 next:
295 if (mcip != 0) {
296 MBPF_CLIENT_CLOSE(bpr, mcip);
297 mcip = 0;
298 }
299 if (mh != 0) {
300 MBPF_CLOSE(bpr, mh);
301 mh = 0;
302 }
303 }
304 if (error != 0) {
305 if (zonematch && (zone == GLOBAL_ZONEID)) {
306 /*
307 * If we failed to do an exact match for the global
308 * zone using the global zoneid, try again in case
309 * the network interface is owned by a local zone.
310 */
311 zonematch = B_FALSE;
312 goto again;
313 }
314 return (error);
315 }
316
317 /* No providers? */
318 if (bpr == NULL)
319 return (ENOENT);
320
321 d->bd_mac = *bpr;
322 d->bd_mcip = mcip;
323 d->bd_bif = mh;
324 d->bd_dlt = nicdlt;
325 hdrlen = bpf_dl_hdrsize(nicdlt);
326 d->bd_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
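/*
 * Worked example, assuming the classic 18-byte BSD bpf header
 * (SIZEOF_BPF_HDR == 18): for Ethernet, hdrlen is 14, the sum is 32
 * and already word aligned, so bd_hdrlen becomes 18 and the captured
 * payload that follows the link-layer header in the store buffer
 * starts on a word boundary.
 */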
327
328 (void) strlcpy(d->bd_ifname, MBPF_CLIENT_NAME(&d->bd_mac, mcip),
329 sizeof (d->bd_ifname));
330
331 (void) MBPF_GET_LINKID(&d->bd_mac, d->bd_ifname, &d->bd_linkid,
332 zone);
333 (void) MBPF_PROMISC_ADD(&d->bd_mac, d->bd_mcip, 0, d,
334 &d->bd_promisc_handle, d->bd_promisc_flags);
335 return (0);
336 }
337
338 /*
339 * Detach a file from its interface.
340 */
341 static void
342 bpf_detachd(struct bpf_d *d)
343 {
344 uintptr_t mph;
345 uintptr_t mch;
346 uintptr_t mh;
347
348 ASSERT(d->bd_inuse == -1);
349 mch = d->bd_mcip;
350 d->bd_mcip = 0;
351 mh = d->bd_bif;
352 d->bd_bif = 0;
353
354 /*
355 * Check if this descriptor had requested promiscuous mode.
356 * If so, turn it off. There's no need to take any action
357 * here, that is done when MBPF_PROMISC_REMOVE is used;
358 * bd_promisc is just a local flag to stop promiscuous mode
359 * from being set more than once.
360 */
361 if (d->bd_promisc)
362 d->bd_promisc = 0;
363
364 /*
365 * Take device out of "promiscuous" mode. Since we were able to
366 * enter "promiscuous" mode, we should be able to turn it off.
367 * Note, this field stores a pointer used to support both
368 * promiscuous and non-promiscuous callbacks for packets.
369 */
370 mph = d->bd_promisc_handle;
371 d->bd_promisc_handle = 0;
372
373 /*
374 * The lock has to be dropped here because mac_promisc_remove may
375 * need to wait for mac_promisc_dispatch, which has called into
376 * bpf and catchpacket is waiting for bd_lock...
377 * i.e. mac_promisc_remove() needs to be called with none of the
378 * locks held that are part of the bpf_mtap() call path.
379 */
380 mutex_exit(&d->bd_lock);
381 if (mph != 0)
382 MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
383
384 if (mch != 0)
385 MBPF_CLIENT_CLOSE(&d->bd_mac, mch);
386
387 if (mh != 0)
388 MBPF_CLOSE(&d->bd_mac, mh);
389
390 /*
391 * Because this function is called with bd_lock held, it must
392 * exit with it held.
393 */
394 mutex_enter(&d->bd_lock);
395 *d->bd_ifname = '\0';
396 (void) memset(&d->bd_mac, 0, sizeof (d->bd_mac));
397 }
398
399
400 /*
401 * bpfilterattach() is called at load time.
402 */
403 int
404 bpfilterattach(void)
405 {
406
407 bpf_hash = mod_hash_create_idhash("bpf_dev_tab", 31,
408 mod_hash_null_keydtor);
409 if (bpf_hash == NULL)
410 return (ENOMEM);
411
412 (void) memcpy(&ks_stats, &bpf_kstats, sizeof (bpf_kstats));
413
414 bpf_ksp = kstat_create("bpf", 0, "global", "misc",
415 KSTAT_TYPE_NAMED, sizeof (bpf_kstats) / sizeof (kstat_named_t),
416 KSTAT_FLAG_VIRTUAL);
417 if (bpf_ksp != NULL) {
418 bpf_ksp->ks_data = &ks_stats;
419 kstat_install(bpf_ksp);
420 } else {
421 mod_hash_destroy_idhash(bpf_hash);
422 bpf_hash = NULL;
423 return (EEXIST);
424 }
425
426 cv_init(&bpf_dlt_waiter, NULL, CV_DRIVER, NULL);
427 mutex_init(&bpf_mtx, NULL, MUTEX_DRIVER, NULL);
428
429 LIST_INIT(&bpf_list);
430
431 return (0);
432 }
433
434
435 /*
436 * bpfilterdetach() is called at unload time.
437 */
438 int
439 bpfilterdetach(void)
440 {
441
442 if (bpf_ksp != NULL) {
443 kstat_delete(bpf_ksp);
444 bpf_ksp = NULL;
445 }
446
447 mod_hash_destroy_idhash(bpf_hash);
448 bpf_hash = NULL;
449
450 cv_destroy(&bpf_dlt_waiter);
451 mutex_destroy(&bpf_mtx);
452
453 return (0);
454 }
455
456 /*
457 * Open ethernet device. Clones.
458 */
459 /* ARGSUSED */
460 int
461 bpfopen(dev_t *devp, int flag, int mode, cred_t *cred)
462 {
463 struct bpf_d *d;
464 uint_t dmin;
465
466 /*
467 * The security policy described at the top of this file is
468 * enforced here.
469 */
470 if ((flag & FWRITE) != 0) {
471 if (secpolicy_net_rawaccess(cred) != 0)
472 return (EACCES);
473 }
474
475 if ((flag & FREAD) != 0) {
476 if ((secpolicy_net_observability(cred) != 0) &&
477 (secpolicy_net_rawaccess(cred) != 0))
478 return (EACCES);
479 }
480
481 if ((flag & (FWRITE|FREAD)) == 0)
482 return (ENXIO);
483
484 /*
485 * A structure is allocated per open file in BPF to store settings
486 * such as buffer capture size, provide private buffers, etc.
487 */
488 d = (struct bpf_d *)kmem_zalloc(sizeof (*d), KM_SLEEP);
489 d->bd_bufsize = bpf_bufsize;
490 d->bd_fmode = flag;
491 d->bd_zone = crgetzoneid(cred);
492 d->bd_seesent = 1;
493 d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_PHYS|
494 MAC_PROMISC_FLAGS_NO_COPY;
495 mutex_init(&d->bd_lock, NULL, MUTEX_DRIVER, NULL);
496 cv_init(&d->bd_wait, NULL, CV_DRIVER, NULL);
497
498 mutex_enter(&bpf_mtx);
499 /*
500 * Find an unused minor number. Obviously this is an O(n) algorithm
501 * and doesn't scale particularly well, so if large numbers of BPF
502 * descriptors are open at once in real use, this design may
503 * need to be revisited.
504 */
505 for (dmin = 0; dmin < L_MAXMIN; dmin++)
506 if (bpf_dev_find(dmin) == NULL)
507 break;
508 if (dmin == L_MAXMIN) {
509 mutex_exit(&bpf_mtx);
510 kmem_free(d, sizeof (*d));
511 return (ENXIO);
512 }
513 d->bd_dev = dmin;
514 LIST_INSERT_HEAD(&bpf_list, d, bd_list);
515 bpf_dev_add(d);
516 mutex_exit(&bpf_mtx);
517
518 *devp = makedevice(getmajor(*devp), dmin);
519
520 return (0);
521 }
522
523 /*
524 * Close the descriptor by detaching it from its interface,
525 * deallocating its buffers, and marking it free.
526 *
527 * Because each open is given its own minor device, there is always a
528 * 1:1 relationship between the opens and closes seen by this function.
529 */
530 /* ARGSUSED */
531 int
532 bpfclose(dev_t dev, int flag, int otyp, cred_t *cred_p)
533 {
534 struct bpf_d *d = bpf_dev_get(getminor(dev));
535
536 mutex_enter(&d->bd_lock);
537
538 while (d->bd_inuse != 0) {
539 d->bd_waiting++;
540 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
541 d->bd_waiting--;
542 mutex_exit(&d->bd_lock);
543 return (EINTR);
544 }
545 d->bd_waiting--;
546 }
547
548 d->bd_inuse = -1;
549 if (d->bd_state == BPF_WAITING)
550 bpf_clear_timeout(d);
551 d->bd_state = BPF_IDLE;
552 if (d->bd_bif)
553 bpf_detachd(d);
554 mutex_exit(&d->bd_lock);
555
556 mutex_enter(&bpf_mtx);
557 LIST_REMOVE(d, bd_list);
558 bpf_dev_remove(d);
559 mutex_exit(&bpf_mtx);
560
561 mutex_enter(&d->bd_lock);
562 mutex_destroy(&d->bd_lock);
563 cv_destroy(&d->bd_wait);
564
565 bpf_freed(d);
566 kmem_free(d, sizeof (*d));
567
568 return (0);
569 }
570
571 /*
572 * Rotate the packet buffers in descriptor d. Move the store buffer
573 * into the hold slot, and the free buffer into the store slot.
574 * Zero the length of the new store buffer.
575 */
576 #define ROTATE_BUFFERS(d) \
577 (d)->bd_hbuf = (d)->bd_sbuf; \
578 (d)->bd_hlen = (d)->bd_slen; \
579 (d)->bd_sbuf = (d)->bd_fbuf; \
580 (d)->bd_slen = 0; \
581 (d)->bd_fbuf = 0;
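/*
 * The three buffers form a simple pipeline: packets accumulate in the
 * store buffer (bd_sbuf), a filled buffer is parked in the hold slot
 * (bd_hbuf) until bpfread() drains it, and the free buffer (bd_fbuf)
 * is the spare that becomes the next store buffer.  Rotation is only
 * legal while bd_fbuf is non-NULL; catchpacket() drops packets when no
 * free buffer is available.
 */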
582 /*
583 * bpfread - read next chunk of packets from buffers
584 */
585 /* ARGSUSED */
586 int
587 bpfread(dev_t dev, struct uio *uio, cred_t *cred)
588 {
589 struct bpf_d *d = bpf_dev_get(getminor(dev));
590 int timed_out;
591 ulong_t delay;
592 int error;
593
594 if ((d->bd_fmode & FREAD) == 0)
595 return (EBADF);
596
597 /*
598 * Require the application to use a buffer the same size as
599 * the kernel buffers.
600 */
601 if (uio->uio_resid != d->bd_bufsize)
602 return (EINVAL);
603
604 mutex_enter(&d->bd_lock);
605 if (d->bd_state == BPF_WAITING)
606 bpf_clear_timeout(d);
607 timed_out = (d->bd_state == BPF_TIMED_OUT);
608 d->bd_state = BPF_IDLE;
609 /*
610 * If the hold buffer is empty, then do a timed sleep, which
611 * ends when the timeout expires or when enough packets
612 * have arrived to fill the store buffer.
613 */
614 while (d->bd_hbuf == 0) {
615 if (d->bd_nonblock) {
616 if (d->bd_slen == 0) {
617 mutex_exit(&d->bd_lock);
618 return (EWOULDBLOCK);
619 }
620 ROTATE_BUFFERS(d);
621 break;
622 }
623
624 if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
625 /*
626 * One or more packets either arrived since the previous
627 * read or arrived while we were asleep.
628 * Rotate the buffers and return what's here.
629 */
630 ROTATE_BUFFERS(d);
631 break;
632 }
633 ks_stats.kp_read_wait.value.ui64++;
634 delay = ddi_get_lbolt() + d->bd_rtout;
635 error = cv_timedwait_sig(&d->bd_wait, &d->bd_lock, delay);
636 if (error == 0) {
637 mutex_exit(&d->bd_lock);
638 return (EINTR);
639 }
640 if (error == -1) {
641 /*
642 * On a timeout, return what's in the buffer,
643 * which may be nothing. If there is something
644 * in the store buffer, we can rotate the buffers.
645 */
646 if (d->bd_hbuf)
647 /*
648 * We filled up the buffer in between
649 * getting the timeout and arriving
650 * here, so we don't need to rotate.
651 */
652 break;
653
654 if (d->bd_slen == 0) {
655 mutex_exit(&d->bd_lock);
656 return (0);
657 }
658 ROTATE_BUFFERS(d);
659 }
660 }
661 /*
662 * At this point, we know we have something in the hold slot.
663 */
664 mutex_exit(&d->bd_lock);
665
666 /*
667 * Move data from hold buffer into user space.
668 * We know the entire buffer is transferred since
669 * we checked above that the read buffer is bpf_bufsize bytes.
670 */
671 error = uiomove(d->bd_hbuf, d->bd_hlen, UIO_READ, uio);
672
673 mutex_enter(&d->bd_lock);
674 d->bd_fbuf = d->bd_hbuf;
675 d->bd_hbuf = 0;
676 d->bd_hlen = 0;
677 done:
678 mutex_exit(&d->bd_lock);
679 return (error);
680 }
681
682
683 /*
684 * If there are processes sleeping on this descriptor, wake them up.
685 * NOTE: the lock for bd_wait is bd_lock and is held by bpf_deliver,
686 * so there is no code here grabbing it.
687 */
688 static inline void
689 bpf_wakeup(struct bpf_d *d)
690 {
691 cv_signal(&d->bd_wait);
692 }
693
694 static void
695 bpf_timed_out(void *arg)
696 {
697 struct bpf_d *d = arg;
698
699 mutex_enter(&d->bd_lock);
700 if (d->bd_state == BPF_WAITING) {
701 d->bd_state = BPF_TIMED_OUT;
702 if (d->bd_slen != 0)
703 cv_signal(&d->bd_wait);
704 }
705 mutex_exit(&d->bd_lock);
706 }
707
708
709 /* ARGSUSED */
710 int
711 bpfwrite(dev_t dev, struct uio *uio, cred_t *cred)
712 {
713 struct bpf_d *d = bpf_dev_get(getminor(dev));
714 uintptr_t mch;
715 uint_t mtu;
716 mblk_t *m;
717 int error;
718 int dlt;
719
720 if ((d->bd_fmode & FWRITE) == 0)
721 return (EBADF);
722
723 mutex_enter(&d->bd_lock);
724 if (d->bd_bif == 0 || d->bd_mcip == 0) {
725 mutex_exit(&d->bd_lock);
726 return (EINTR);
727 }
728
729 if (uio->uio_resid == 0) {
730 mutex_exit(&d->bd_lock);
731 return (0);
732 }
733
734 while (d->bd_inuse < 0) {
735 d->bd_waiting++;
736 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
737 d->bd_waiting--;
738 mutex_exit(&d->bd_lock);
739 return (EINTR);
740 }
741 d->bd_waiting--;
742 }
743
744 mutex_exit(&d->bd_lock);
745
746 dlt = d->bd_dlt;
747 mch = d->bd_mcip;
748 MBPF_SDU_GET(&d->bd_mac, d->bd_bif, &mtu);
749 d->bd_inuse++;
750
751 m = NULL;
752 if (dlt == DLT_IPNET) {
753 error = EIO;
754 goto done;
755 }
756
757 error = bpf_movein(uio, dlt, mtu, &m);
758 if (error)
759 goto done;
760
761 DTRACE_PROBE4(bpf__tx, struct bpf_d *, d, int, dlt,
762 uint_t, mtu, mblk_t *, m);
763
764 if (M_LEN(m) > mtu) {
765 error = EMSGSIZE;
766 goto done;
767 }
768
769 error = MBPF_TX(&d->bd_mac, mch, m);
770 /*
771 * The "tx" action here is required to consume the mblk_t.
772 */
773 m = NULL;
774
775 done:
776 if (error == 0)
777 ks_stats.kp_write_ok.value.ui64++;
778 else
779 ks_stats.kp_write_error.value.ui64++;
780 if (m != NULL)
781 freemsg(m);
782
783 mutex_enter(&d->bd_lock);
784 d->bd_inuse--;
785 if ((d->bd_inuse == 0) && (d->bd_waiting != 0))
786 cv_signal(&d->bd_wait);
787 mutex_exit(&d->bd_lock);
788
789 /*
790 * The driver frees the mblk_t.
791 */
792 return (error);
793 }
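/*
 * A hedged userland sketch of the write path above: the caller must
 * supply a complete link-layer frame, since bpf_movein() only checks
 * the header length and the MTU and the frame is transmitted as-is.
 * The 14-byte Ethernet layout below is an assumption for DLT_EN10MB
 * links:
 *
 *	struct frame {
 *		uchar_t dst[6], src[6];
 *		ushort_t etype;
 *		uchar_t payload[46];
 *	} f;
 *
 *	(fill in addresses, an htons()'d ethertype and payload, then:)
 *	(void) write(fd, &f, sizeof (f));
 *
 * The write fails with EMSGSIZE if the frame exceeds the link MTU and
 * with EIO on DLT_IPNET descriptors, which are read-only.
 */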
794
795
796 /*
797 * Reset a descriptor by flushing its packet buffer and clearing the
798 * receive and drop counts. Must be called with bd_lock held.
799 */
800 static void
801 reset_d(struct bpf_d *d)
802 {
803 if (d->bd_hbuf) {
804 /* Free the hold buffer. */
805 d->bd_fbuf = d->bd_hbuf;
806 d->bd_hbuf = 0;
807 }
808 d->bd_slen = 0;
809 d->bd_hlen = 0;
810 d->bd_rcount = 0;
811 d->bd_dcount = 0;
812 d->bd_ccount = 0;
813 }
814
815 /*
816 * FIONREAD Check for read packet available.
817 * BIOCGBLEN Get buffer len [for read()].
818 * BIOCSETF Set ethernet read filter.
819 * BIOCFLUSH Flush read packet buffer.
820 * BIOCPROMISC Put interface into promiscuous mode.
821 * BIOCGDLT Get link layer type.
822 * BIOCGETIF Get interface name.
823 * BIOCSETIF Set interface.
824 * BIOCSRTIMEOUT Set read timeout.
825 * BIOCGRTIMEOUT Get read timeout.
826 * BIOCGSTATS Get packet stats.
827 * BIOCIMMEDIATE Set immediate mode.
828 * BIOCVERSION Get filter language version.
829 * BIOCGHDRCMPLT Get "header already complete" flag.
830 * BIOCSHDRCMPLT Set "header already complete" flag.
831 */
832 /* ARGSUSED */
833 int
834 bpfioctl(dev_t dev, int cmd, intptr_t addr, int mode, cred_t *cred, int *rval)
835 {
836 struct bpf_d *d = bpf_dev_get(getminor(dev));
837 struct bpf_program prog;
838 struct lifreq lifreq;
839 struct ifreq ifreq;
840 int error = 0;
841 uint_t size;
842
843 /*
844 * Clear any pending read timeout before processing the ioctl.
845 */
846 mutex_enter(&d->bd_lock);
847 if (d->bd_state == BPF_WAITING)
848 bpf_clear_timeout(d);
849 d->bd_state = BPF_IDLE;
850 mutex_exit(&d->bd_lock);
851
852 switch (cmd) {
853
854 default:
855 error = EINVAL;
856 break;
857
858 /*
859 * Check for read packet available.
860 */
861 case FIONREAD:
862 {
863 int n;
864
865 mutex_enter(&d->bd_lock);
866 n = d->bd_slen;
867 if (d->bd_hbuf)
868 n += d->bd_hlen;
869 mutex_exit(&d->bd_lock);
870
871 *(int *)addr = n;
872 break;
873 }
874
875 /*
876 * Get buffer len [for read()].
877 */
878 case BIOCGBLEN:
879 error = copyout(&d->bd_bufsize, (void *)addr,
880 sizeof (d->bd_bufsize));
881 break;
882
883 /*
884 * Set buffer length.
885 */
886 case BIOCSBLEN:
887 if (copyin((void *)addr, &size, sizeof (size)) != 0) {
888 error = EFAULT;
889 break;
890 }
891
892 mutex_enter(&d->bd_lock);
893 if (d->bd_bif != 0) {
894 error = EINVAL;
895 } else {
896 if (size > bpf_maxbufsize)
897 size = bpf_maxbufsize;
898 else if (size < BPF_MINBUFSIZE)
899 size = BPF_MINBUFSIZE;
900
901 d->bd_bufsize = size;
902 }
903 mutex_exit(&d->bd_lock);
904
905 if (error == 0)
906 error = copyout(&size, (void *)addr, sizeof (size));
907 break;
908
909 /*
910 * Set link layer read filter.
911 */
912 case BIOCSETF:
913 if (ddi_copyin((void *)addr, &prog, sizeof (prog), mode)) {
914 error = EFAULT;
915 break;
916 }
917 error = bpf_setf(d, &prog);
918 break;
919
920 /*
921 * Flush read packet buffer.
922 */
923 case BIOCFLUSH:
924 mutex_enter(&d->bd_lock);
925 reset_d(d);
926 mutex_exit(&d->bd_lock);
927 break;
928
929 /*
930 * Put interface into promiscuous mode.
931 * This is a one-way ioctl; it is not used to turn promiscuous
932 * mode off.
933 */
934 case BIOCPROMISC:
935 if (d->bd_bif == 0) {
936 /*
937 * No interface attached yet.
938 */
939 error = EINVAL;
940 break;
941 }
942 mutex_enter(&d->bd_lock);
943 if (d->bd_promisc == 0) {
944
945 if (d->bd_promisc_handle) {
946 uintptr_t mph;
947
948 mph = d->bd_promisc_handle;
949 d->bd_promisc_handle = 0;
950
951 mutex_exit(&d->bd_lock);
952 MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
953 mutex_enter(&d->bd_lock);
954 }
955
956 d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_COPY;
957 error = MBPF_PROMISC_ADD(&d->bd_mac,
958 d->bd_mcip, MAC_CLIENT_PROMISC_ALL, d,
959 &d->bd_promisc_handle, d->bd_promisc_flags);
960 if (error == 0)
961 d->bd_promisc = 1;
962 }
963 mutex_exit(&d->bd_lock);
964 break;
965
966 /*
967 * Get device parameters.
968 */
969 case BIOCGDLT:
970 if (d->bd_bif == 0)
971 error = EINVAL;
972 else
973 error = copyout(&d->bd_dlt, (void *)addr,
974 sizeof (d->bd_dlt));
975 break;
976
977 /*
978 * Get a list of supported device parameters.
979 */
980 case BIOCGDLTLIST:
981 if (d->bd_bif == 0) {
982 error = EINVAL;
983 } else {
984 struct bpf_dltlist list;
985
986 if (copyin((void *)addr, &list, sizeof (list)) != 0) {
987 error = EFAULT;
988 break;
989 }
990 error = bpf_getdltlist(d, &list);
991 if ((error == 0) &&
992 copyout(&list, (void *)addr, sizeof (list)) != 0)
993 error = EFAULT;
994 }
995 break;
996
997 /*
998 * Set device parameters.
999 */
1000 case BIOCSDLT:
1001 error = bpf_setdlt(d, (void *)addr);
1002 break;
1003
1004 /*
1005 * Get interface name.
1006 */
1007 case BIOCGETIF:
1008 if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1009 error = EFAULT;
1010 break;
1011 }
1012 error = bpf_ifname(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1013 if ((error == 0) &&
1014 copyout(&ifreq, (void *)addr, sizeof (ifreq)) != 0) {
1015 error = EFAULT;
1016 break;
1017 }
1018 break;
1019
1020 /*
1021 * Set interface.
1022 */
1023 case BIOCSETIF:
1024 if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
1025 error = EFAULT;
1026 break;
1027 }
1028 error = bpf_setif(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
1029 break;
1030
1031 /*
1032 * Get interface name.
1033 */
1034 case BIOCGETLIF:
1035 if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1036 error = EFAULT;
1037 break;
1038 }
1039 error = bpf_ifname(d, lifreq.lifr_name,
1040 sizeof (lifreq.lifr_name));
1041 if ((error == 0) &&
1042 copyout(&lifreq, (void *)addr, sizeof (lifreq)) != 0) {
1043 error = EFAULT;
1044 break;
1045 }
1046 break;
1047
1048 /*
1049 * Set interface.
1050 */
1051 case BIOCSETLIF:
1052 if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
1053 error = EFAULT;
1054 break;
1055 }
1056 error = bpf_setif(d, lifreq.lifr_name,
1057 sizeof (lifreq.lifr_name));
1058 break;
1059
1060 #ifdef _SYSCALL32_IMPL
1061 /*
1062 * Set read timeout.
1063 */
1064 case BIOCSRTIMEOUT32:
1065 {
1066 struct timeval32 tv;
1067
1068 if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1069 error = EFAULT;
1070 break;
1071 }
1072
1073 /* Convert the timeout in microseconds to ticks */
1074 d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1075 tv.tv_usec);
1076 if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1077 d->bd_rtout = 1;
1078 break;
1079 }
1080
1081 /*
1082 * Get read timeout.
1083 */
1084 case BIOCGRTIMEOUT32:
1085 {
1086 struct timeval32 tv;
1087 clock_t ticks;
1088
1089 ticks = drv_hztousec(d->bd_rtout);
1090 tv.tv_sec = ticks / 1000000;
1091 tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1092 error = copyout(&tv, (void *)addr, sizeof (tv));
1093 break;
1094 }
1095
1096 /*
1097 * Get a list of supported device parameters.
1098 */
1099 case BIOCGDLTLIST32:
1100 if (d->bd_bif == 0) {
1101 error = EINVAL;
1102 } else {
1103 struct bpf_dltlist32 lst32;
1104 struct bpf_dltlist list;
1105
1106 if (copyin((void *)addr, &lst32, sizeof (lst32)) != 0) {
1107 error = EFAULT;
1108 break;
1109 }
1110
1111 list.bfl_len = lst32.bfl_len;
1112 list.bfl_list = (void *)(uint64_t)lst32.bfl_list;
1113 error = bpf_getdltlist(d, &list);
1114 if (error == 0) {
1115 lst32.bfl_len = list.bfl_len;
1116
1117 if (copyout(&lst32, (void *)addr,
1118 sizeof (lst32)) != 0)
1119 error = EFAULT;
1120 }
1121 }
1122 break;
1123
1124 /*
1125 * Set link layer read filter.
1126 */
1127 case BIOCSETF32: {
1128 struct bpf_program32 prog32;
1129
1130 if (ddi_copyin((void *)addr, &prog32, sizeof (prog32), mode)) {
1131 error = EFAULT;
1132 break;
1133 }
1134 prog.bf_len = prog32.bf_len;
1135 prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
1136 error = bpf_setf(d, &prog);
1137 break;
1138 }
1139 #endif
1140
1141 /*
1142 * Set read timeout.
1143 */
1144 case BIOCSRTIMEOUT:
1145 {
1146 struct timeval tv;
1147
1148 if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
1149 error = EFAULT;
1150 break;
1151 }
1152
1153 /* Convert the timeout in microseconds to ticks */
1154 d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
1155 tv.tv_usec);
1156 if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
1157 d->bd_rtout = 1;
1158 break;
1159 }
1160
1161 /*
1162 * Get read timeout.
1163 */
1164 case BIOCGRTIMEOUT:
1165 {
1166 struct timeval tv;
1167 clock_t ticks;
1168
1169 ticks = drv_hztousec(d->bd_rtout);
1170 tv.tv_sec = ticks / 1000000;
1171 tv.tv_usec = ticks - (tv.tv_sec * 1000000);
1172 if (copyout(&tv, (void *)addr, sizeof (tv)) != 0)
1173 error = EFAULT;
1174 break;
1175 }
1176
1177 /*
1178 * Get packet stats.
1179 */
1180 case BIOCGSTATS:
1181 {
1182 struct bpf_stat bs;
1183
1184 bs.bs_recv = d->bd_rcount;
1185 bs.bs_drop = d->bd_dcount;
1186 bs.bs_capt = d->bd_ccount;
1187 if (copyout(&bs, (void *)addr, sizeof (bs)) != 0)
1188 error = EFAULT;
1189 break;
1190 }
1191
1192 /*
1193 * Set immediate mode.
1194 */
1195 case BIOCIMMEDIATE:
1196 if (copyin((void *)addr, &d->bd_immediate,
1197 sizeof (d->bd_immediate)) != 0)
1198 error = EFAULT;
1199 break;
1200
1201 case BIOCVERSION:
1202 {
1203 struct bpf_version bv;
1204
1205 bv.bv_major = BPF_MAJOR_VERSION;
1206 bv.bv_minor = BPF_MINOR_VERSION;
1207 if (copyout(&bv, (void *)addr, sizeof (bv)) != 0)
1208 error = EFAULT;
1209 break;
1210 }
1211
1212 case BIOCGHDRCMPLT: /* get "header already complete" flag */
1213 if (copyout(&d->bd_hdrcmplt, (void *)addr,
1214 sizeof (d->bd_hdrcmplt)) != 0)
1215 error = EFAULT;
1216 break;
1217
1218 case BIOCSHDRCMPLT: /* set "header already complete" flag */
1219 if (copyin((void *)addr, &d->bd_hdrcmplt,
1220 sizeof (d->bd_hdrcmplt)) != 0)
1221 error = EFAULT;
1222 break;
1223
1224 /*
1225 * Get "see sent packets" flag
1226 */
1227 case BIOCGSEESENT:
1228 if (copyout(&d->bd_seesent, (void *)addr,
1229 sizeof (d->bd_seesent)) != 0)
1230 error = EFAULT;
1231 break;
1232
1233 /*
1234 * Set "see sent" packets flag
1235 */
1236 case BIOCSSEESENT:
1237 if (copyin((void *)addr, &d->bd_seesent,
1238 sizeof (d->bd_seesent)) != 0)
1239 error = EFAULT;
1240 break;
1241
1242 case FIONBIO: /* Non-blocking I/O */
1243 if (copyin((void *)addr, &d->bd_nonblock,
1244 sizeof (d->bd_nonblock)) != 0)
1245 error = EFAULT;
1246 break;
1247 }
1248 return (error);
1249 }
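/*
 * A short sketch of typical ioctl usage from userland (illustrative;
 * fd is assumed to be an open /dev/bpf descriptor):
 *
 *	struct timeval tv = { 1, 0 };
 *	int on = 1;
 *
 *	(void) ioctl(fd, BIOCIMMEDIATE, &on);	deliver packets as they
 *						arrive instead of when a
 *						buffer fills
 *	(void) ioctl(fd, BIOCSRTIMEOUT, &tv);	bound read() waits to
 *						one second
 */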
1250
1251 /*
1252 * Set d's packet filter program to fp. If this file already has a filter,
1253 * free it and replace it. If the new filter is "empty" (has a 0 size), then
1254 * the result is to just remove and free the existing filter.
1255 * Returns EINVAL for bogus requests.
1256 */
1257 int
1258 bpf_setf(struct bpf_d *d, struct bpf_program *fp)
1259 {
1260 struct bpf_insn *fcode, *old;
1261 uint_t flen, size;
1262 size_t oldsize;
1263
1264 if (fp->bf_insns == 0) {
1265 if (fp->bf_len != 0)
1266 return (EINVAL);
1267 mutex_enter(&d->bd_lock);
1268 old = d->bd_filter;
1269 oldsize = d->bd_filter_size;
1270 d->bd_filter = 0;
1271 d->bd_filter_size = 0;
1272 reset_d(d);
1273 mutex_exit(&d->bd_lock);
1274 if (old != 0)
1275 kmem_free(old, oldsize);
1276 return (0);
1277 }
1278 flen = fp->bf_len;
1279 if (flen > BPF_MAXINSNS)
1280 return (EINVAL);
1281
1282 size = flen * sizeof (*fp->bf_insns);
1283 fcode = kmem_alloc(size, KM_SLEEP);
1284 if (copyin(fp->bf_insns, fcode, size) != 0) {
kmem_free(fcode, size);
1285 return (EFAULT);
}
1286
1287 if (bpf_validate(fcode, (int)flen)) {
1288 mutex_enter(&d->bd_lock);
1289 old = d->bd_filter;
1290 oldsize = d->bd_filter_size;
1291 d->bd_filter = fcode;
1292 d->bd_filter_size = size;
1293 reset_d(d);
1294 mutex_exit(&d->bd_lock);
1295 if (old != 0)
1296 kmem_free(old, oldsize);
1297
1298 return (0);
1299 }
1300 kmem_free(fcode, size);
1301 return (EINVAL);
1302 }
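/*
 * A minimal sketch of installing a filter from userland via BIOCSETF,
 * using the classic one-instruction "accept everything" program (the
 * return value of a bpf program is the snap length, so (uint_t)-1
 * means "capture the whole packet"):
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, (uint_t)-1)
 *	};
 *	struct bpf_program prog;
 *
 *	prog.bf_len = 1;
 *	prog.bf_insns = insns;
 *	(void) ioctl(fd, BIOCSETF, &prog);
 *
 * Passing bf_insns == NULL with bf_len == 0 removes the current filter,
 * as handled at the top of bpf_setf() above.
 */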
1303
1304 /*
1305 * Detach a file from its current interface (if attached at all) and attach
1306 * to the interface indicated by the name stored in ifname.
1307 * Return an errno or 0.
1308 */
1309 static int
1310 bpf_setif(struct bpf_d *d, char *ifname, int namesize)
1311 {
1312 int unit_seen;
1313 int error = 0;
1314 char *cp;
1315 int i;
1316
1317 /*
1318 * Make sure the provided name has a unit number, and default
1319 * it to '0' if not specified.
1320 * XXX This is ugly ... do this differently?
1321 */
1322 unit_seen = 0;
1323 cp = ifname;
1324 cp[namesize - 1] = '\0'; /* sanity */
1325 while (*cp++)
1326 if (*cp >= '0' && *cp <= '9')
1327 unit_seen = 1;
1328 if (!unit_seen) {
1329 /* Make sure to leave room for the '\0'. */
1330 for (i = 0; i < (namesize - 1); ++i) {
1331 if ((ifname[i] >= 'a' && ifname[i] <= 'z') ||
1332 (ifname[i] >= 'A' && ifname[i] <= 'Z'))
1333 continue;
1334 ifname[i] = '0';
1335 }
1336 }
1337
1338 /*
1339 * Make sure that only one call to this function happens at a time
1340 * and that we're not interleaving a read/write
1341 */
1342 mutex_enter(&d->bd_lock);
1343 while (d->bd_inuse != 0) {
1344 d->bd_waiting++;
1345 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
1346 d->bd_waiting--;
1347 mutex_exit(&d->bd_lock);
1348 return (EINTR);
1349 }
1350 d->bd_waiting--;
1351 }
1352 d->bd_inuse = -1;
1353 mutex_exit(&d->bd_lock);
1354
1355 if (d->bd_sbuf == 0)
1356 error = bpf_allocbufs(d);
1357
1358 if (error == 0) {
1359 mutex_enter(&d->bd_lock);
1360 if (d->bd_bif)
1361 /*
1362 * Detach if attached to something else.
1363 */
1364 bpf_detachd(d);
1365
1366 error = bpf_attachd(d, ifname, -1);
1367 reset_d(d);
1368 d->bd_inuse = 0;
1369 if (d->bd_waiting != 0)
1370 cv_signal(&d->bd_wait);
1371 mutex_exit(&d->bd_lock);
1372 return (error);
1373 }
1374
1375 mutex_enter(&d->bd_lock);
1376 d->bd_inuse = 0;
1377 if (d->bd_waiting != 0)
1378 cv_signal(&d->bd_wait);
1379 mutex_exit(&d->bd_lock);
1380
1381 /*
1382 * Try to tickle the mac layer into attaching the device...
1383 */
1384 return (bpf_provider_tickle(ifname, d->bd_zone));
1385 }
1386
1387 /*
1388 * Copy the interface name to the ifreq.
1389 */
1390 static int
1391 bpf_ifname(struct bpf_d *d, char *buffer, int bufsize)
1392 {
1393
1394 mutex_enter(&d->bd_lock);
1395 if (d->bd_bif == 0) {
1396 mutex_exit(&d->bd_lock);
1397 return (EINVAL);
1398 }
1399
1400 (void) strlcpy(buffer, d->bd_ifname, bufsize);
1401 mutex_exit(&d->bd_lock);
1402
1403 return (0);
1404 }
1405
1406 /* ARGSUSED */
1407 int
1408 bpfchpoll(dev_t dev, short events, int anyyet, short *reventsp,
1409 struct pollhead **phpp)
1410 {
1411 struct bpf_d *d = bpf_dev_get(getminor(dev));
1412
1413 /*
1414 * Until this driver is modified to issue proper pollwakeup() calls on
1415 * its pollhead, edge-triggered polling is not allowed.
1416 */
1417 if (events & POLLET) {
1418 return (EPERM);
1419 }
1420
1421 if (events & (POLLIN | POLLRDNORM)) {
1422 /*
1423 * An imitation of the FIONREAD ioctl code.
1424 */
1425 mutex_enter(&d->bd_lock);
1426 if (d->bd_hlen != 0 ||
1427 ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
1428 d->bd_slen != 0)) {
1429 *reventsp |= events & (POLLIN | POLLRDNORM);
1430 } else {
1431 /*
1432 * Until the bpf driver has been updated to include
1433 * adequate pollwakeup() logic, no pollhead will be
1434 * emitted here, preventing the resource from being
1435 * cached by poll()/devpoll/epoll.
1436 */
1437 *reventsp = 0;
1438 /* Start the read timeout if necessary */
1439 if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
1440 bpf_clear_timeout(d);
1441 /*
1442 * Only allow the timeout to be set once.
1443 */
1444 if (d->bd_callout == 0)
1445 d->bd_callout = timeout(bpf_timed_out,
1446 d, d->bd_rtout);
1447 d->bd_state = BPF_WAITING;
1448 }
1449 }
1450 mutex_exit(&d->bd_lock);
1451 }
1452
1453 return (0);
1454 }
1455
1456 /*
1457 * Copy data from an mblk_t chain into a buffer. This works for ipnet
1458 * because the dl_ipnetinfo_t is placed in an mblk_t that leads the
1459 * packet itself.
1460 */
1461 static void *
1462 bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)
1463 {
1464 const mblk_t *m;
1465 uint_t count;
1466 uchar_t *dst;
1467
1468 m = src_arg;
1469 dst = dst_arg;
1470 while (len > 0) {
1471 if (m == NULL)
1472 panic("bpf_mcpy");
1473 count = (uint_t)min(M_LEN(m), len);
1474 (void) memcpy(dst, mtod(m, const void *), count);
1475 m = m->b_cont;
1476 dst += count;
1477 len -= count;
1478 }
1479 return (dst_arg);
1480 }
1481
1482 /*
1483 * Dispatch a packet to the listener d.
1484 *
1485 * marg pointer to the packet, either a data buffer or an mblk chain
1486 * buflen buffer length, if marg is a data buffer
1487 * cpfn a function that can copy marg into the listener's buffer
1488 * pktlen length of the packet
1489 * issent boolean indicating whether the packet was sent or received
1490 */
1491 static inline void
1492 bpf_deliver(struct bpf_d *d, cp_fn_t cpfn, void *marg, uint_t pktlen,
1493 uint_t buflen, boolean_t issent)
1494 {
1495 struct timeval tv;
1496 uint_t slen;
1497
1498 if (!d->bd_seesent && issent)
1499 return;
1500
1501 /*
1502 * Accuracy of the packet counters in BPF is vital so it
1503 * is important to protect even the outer ones.
1504 */
1505 mutex_enter(&d->bd_lock);
1506 slen = bpf_filter(d->bd_filter, marg, pktlen, buflen);
1507 DTRACE_PROBE5(bpf__packet, struct bpf_if *, d->bd_bif,
1508 struct bpf_d *, d, void *, marg, uint_t, pktlen, uint_t, slen);
1509 d->bd_rcount++;
1510 ks_stats.kp_receive.value.ui64++;
1511 if (slen != 0) {
1512 uniqtime(&tv);
1513 catchpacket(d, marg, pktlen, slen, cpfn, &tv);
1514 }
1515 mutex_exit(&d->bd_lock);
1516 }
1517
1518 /*
1519 * Incoming linkage from device drivers.
1520 */
1521 /* ARGSUSED */
1522 void
1523 bpf_mtap(void *arg, mac_resource_handle_t mrh, mblk_t *m, boolean_t issent)
1524 {
1525 cp_fn_t cpfn;
1526 struct bpf_d *d = arg;
1527 uint_t pktlen, buflen;
1528 void *marg;
1529
1530 pktlen = msgdsize(m);
1531
1532 if (pktlen == M_LEN(m)) {
1533 cpfn = (cp_fn_t)memcpy;
1534 marg = mtod(m, void *);
1535 buflen = pktlen;
1536 } else {
1537 cpfn = bpf_mcpy;
1538 marg = m;
1539 buflen = 0;
1540 }
1541
1542 bpf_deliver(d, cpfn, marg, pktlen, buflen, issent);
1543 }
1544
1545 /*
1546 * Incoming linkage from ipnet.
1547 * In ipnet, there is only one event, NH_OBSERVE, that delivers packets
1548 * from all network interfaces. Thus the tap function needs to apply a
1549 * filter using the interface index/id to imitate snooping on just the
1550 * specified interface.
1551 */
1552 /* ARGSUSED */
1553 void
1554 bpf_itap(void *arg, mblk_t *m, boolean_t issent, uint_t length)
1555 {
1556 hook_pkt_observe_t *hdr;
1557 struct bpf_d *d = arg;
1558
1559 hdr = (hook_pkt_observe_t *)m->b_rptr;
1560 if (ntohl(hdr->hpo_ifindex) != d->bd_linkid)
1561 return;
1562 bpf_deliver(d, bpf_mcpy, m, length, 0, issent);
1563
1564 }
1565
1566 /*
1567 * Move the packet data from interface memory (pkt) into the
1568 * store buffer. Return 1 if it's time to wakeup a listener (buffer full),
1569 * otherwise 0. "copy" is the routine called to do the actual data
1570 * transfer. memcpy is passed in to copy contiguous chunks, while
1571 * bpf_mcpy is passed in to copy mblk chains. In the latter case,
1572 * pkt is really an mblk_t.
1573 */
1574 static void
1575 catchpacket(struct bpf_d *d, uchar_t *pkt, uint_t pktlen, uint_t snaplen,
1576 cp_fn_t cpfn, struct timeval *tv)
1577 {
1578 struct bpf_hdr *hp;
1579 int totlen, curlen;
1580 int hdrlen = d->bd_hdrlen;
1581 int do_wakeup = 0;
1582
1583 ++d->bd_ccount;
1584 ks_stats.kp_capture.value.ui64++;
1585 /*
1586 * Figure out how many bytes to move. If the packet is
1587 * greater or equal to the snapshot length, transfer that
1588 * much. Otherwise, transfer the whole packet (unless
1589 * we hit the buffer size limit).
1590 */
1591 totlen = hdrlen + min(snaplen, pktlen);
1592 if (totlen > d->bd_bufsize)
1593 totlen = d->bd_bufsize;
1594
1595 /*
1596 * Round up the end of the previous packet to the next longword.
1597 */
1598 curlen = BPF_WORDALIGN(d->bd_slen);
1599 if (curlen + totlen > d->bd_bufsize) {
1600 /*
1601 * This packet will overflow the storage buffer.
1602 * Rotate the buffers if we can, then wakeup any
1603 * pending reads.
1604 */
1605 if (d->bd_fbuf == 0) {
1606 /*
1607 * We haven't completed the previous read yet,
1608 * so drop the packet.
1609 */
1610 ++d->bd_dcount;
1611 ks_stats.kp_dropped.value.ui64++;
1612 return;
1613 }
1614 ROTATE_BUFFERS(d);
1615 do_wakeup = 1;
1616 curlen = 0;
1617 } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
1618 /*
1619 * Immediate mode is set, or the read timeout has
1620 * already expired during a select call. A packet
1621 * arrived, so the reader should be woken up.
1622 */
1623 do_wakeup = 1;
1624 }
1625
1626 /*
1627 * Append the bpf header to the existing buffer before we add
1628 * on the actual packet data.
1629 */
1630 hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
1631 hp->bh_tstamp.tv_sec = tv->tv_sec;
1632 hp->bh_tstamp.tv_usec = tv->tv_usec;
1633 hp->bh_datalen = pktlen;
1634 hp->bh_hdrlen = (uint16_t)hdrlen;
1635 /*
1636 * Copy the packet data into the store buffer and update its length.
1637 */
1638 (*cpfn)((uchar_t *)hp + hdrlen, pkt,
1639 (hp->bh_caplen = totlen - hdrlen));
1640 d->bd_slen = curlen + totlen;
1641
1642 /*
1643 * Call bpf_wakeup after bd_slen has been updated.
1644 */
1645 if (do_wakeup)
1646 bpf_wakeup(d);
1647 }
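/*
 * Given the layout produced above, a reader walks the buffer returned
 * by read() with the classic idiom (a sketch; process() is a
 * placeholder):
 *
 *	char *p = buf;
 *	while (p < buf + nread) {
 *		struct bpf_hdr *bh = (struct bpf_hdr *)(void *)p;
 *		process(p + bh->bh_hdrlen, bh->bh_caplen);
 *		p += BPF_WORDALIGN(bh->bh_hdrlen + bh->bh_caplen);
 *	}
 *
 * The BPF_WORDALIGN() step matches the rounding applied to curlen
 * above, so each bpf_hdr starts on a word boundary.
 */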
1648
1649 /*
1650 * Allocate the store and free buffers for a descriptor.
1651 */
1652 static int
1653 bpf_allocbufs(struct bpf_d *d)
1654 {
1655
1656 d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1657 if (!d->bd_fbuf)
1658 return (ENOBUFS);
1659 d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
1660 if (!d->bd_sbuf) {
1661 kmem_free(d->bd_fbuf, d->bd_bufsize);
1662 return (ENOBUFS);
1663 }
1664 d->bd_slen = 0;
1665 d->bd_hlen = 0;
1666 return (0);
1667 }
1668
1669 /*
1670 * Free buffers currently in use by a descriptor.
1671 * Called on close.
1672 */
1673 static void
1674 bpf_freed(struct bpf_d *d)
1675 {
1676 /*
1677 * At this point the descriptor has been detached from its
1678 * interface and has not yet been marked free.
1679 */
1680 if (d->bd_sbuf != 0) {
1681 kmem_free(d->bd_sbuf, d->bd_bufsize);
1682 if (d->bd_hbuf != 0)
1683 kmem_free(d->bd_hbuf, d->bd_bufsize);
1684 if (d->bd_fbuf != 0)
1685 kmem_free(d->bd_fbuf, d->bd_bufsize);
1686 }
1687 if (d->bd_filter)
1688 kmem_free(d->bd_filter, d->bd_filter_size);
1689 }
1690
1691 /*
1692 * Get a list of the available data link types for the interface.
1693 */
1694 static int
1695 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *listp)
1696 {
1697 bpf_provider_list_t *bp;
1698 bpf_provider_t *bpr;
1699 zoneid_t zoneid;
1700 uintptr_t mcip;
1701 uint_t nicdlt;
1702 uintptr_t mh;
1703 int error;
1704 int n;
1705
1706 n = 0;
1707 mh = 0;
1708 mcip = 0;
1709 error = 0;
1710 mutex_enter(&d->bd_lock);
1711 LIST_FOREACH(bp, &bpf_providers, bpl_next) {
1712 bpr = bp->bpl_what;
1713 error = MBPF_OPEN(bpr, d->bd_ifname, &mh, d->bd_zone);
1714 if (error != 0)
1715 goto next;
1716 error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
1717 if (error != 0)
1718 goto next;
1719 error = MBPF_GET_ZONE(bpr, mh, &zoneid);
1720 if (error != 0)
1721 goto next;
1722 if (d->bd_zone != GLOBAL_ZONEID &&
1723 d->bd_zone != zoneid)
1724 goto next;
1725 error = MBPF_GET_DLT(bpr, mh, &nicdlt);
1726 if (error != 0)
1727 goto next;
1728 nicdlt = bpf_dl_to_dlt(nicdlt);
1729 if (listp->bfl_list != NULL) {
1730 if (n >= listp->bfl_len) {
1731 MBPF_CLIENT_CLOSE(bpr, mcip);
1732 MBPF_CLOSE(bpr, mh);
1733 break;
1734 }
1735 /*
1736 * Bumping of bd_inuse ensures the structure does not
1737 * disappear while the copyout runs and allows the for
1738 * loop to be continued.
1739 */
1740 d->bd_inuse++;
1741 mutex_exit(&d->bd_lock);
1742 if (copyout(&nicdlt,
1743 listp->bfl_list + n, sizeof (uint_t)) != 0)
1744 error = EFAULT;
1745 mutex_enter(&d->bd_lock);
1746 d->bd_inuse--;
1747 if (error != 0)
1748 break;
1749 }
1750 n++;
1751 next:
1752 if (mcip != 0) {
1753 MBPF_CLIENT_CLOSE(bpr, mcip);
1754 mcip = 0;
1755 }
1756 if (mh != 0) {
1757 MBPF_CLOSE(bpr, mh);
1758 mh = 0;
1759 }
1760 }
1761 mutex_exit(&d->bd_lock);
1762
1763 /*
1764 * It is quite possible that one or more providers to BPF may not
1765 * know about a link name whilst others do. In that case, so long
1766 * as we have one success, do not declare an error unless it was
1767 * an EFAULT as this indicates a problem that needs to be reported.
1768 */
1769 if ((error != EFAULT) && (n > 0))
1770 error = 0;
1771
1772 listp->bfl_len = n;
1773 return (error);
1774 }
1775
1776 /*
1777 * Set the data link type of a BPF instance.
1778 */
1779 static int
1780 bpf_setdlt(struct bpf_d *d, void *addr)
1781 {
1782 char ifname[LIFNAMSIZ+1];
1783 zoneid_t niczone;
1784 int error;
1785 int dlt;
1786
1787 if (copyin(addr, &dlt, sizeof (dlt)) != 0)
1788 return (EFAULT);
1789
1790 mutex_enter(&d->bd_lock);
1791
1792 if (d->bd_bif == 0) { /* Interface not set */
1793 mutex_exit(&d->bd_lock);
1794 return (EINVAL);
1795 }
1796 if (d->bd_dlt == dlt) { /* no-op */
1797 mutex_exit(&d->bd_lock);
1798 return (0);
1799 }
1800
1801 error = MBPF_GET_ZONE(&d->bd_mac, d->bd_bif, &niczone);
1802 if (error != 0) {
1803 mutex_exit(&d->bd_lock);
1804 return (error);
1805 }
1806
1807 /*
1808 * See the matrix at the top of the file for the permissions table
1809 * enforced by this driver.
1810 */
1811 if ((d->bd_zone != GLOBAL_ZONEID) && (dlt != DLT_IPNET) &&
1812 (niczone != d->bd_zone)) {
1813 mutex_exit(&d->bd_lock);
1814 return (EINVAL);
1815 }
1816
1817 (void) strlcpy(ifname, d->bd_ifname, sizeof (ifname));
1818 d->bd_inuse = -1;
1819 bpf_detachd(d);
1820 error = bpf_attachd(d, ifname, dlt);
1821 reset_d(d);
1822 d->bd_inuse = 0;
1823
1824 mutex_exit(&d->bd_lock);
1825 return (error);
1826 }
1827
1828 /*
1829 * bpf_clear_timeout is called with the bd_lock mutex held, providing it
1830 * with the necessary protection to retrieve and modify bd_callout but it
1831 * does not hold the lock for its entire duration... see below...
1832 */
1833 static void
1834 bpf_clear_timeout(struct bpf_d *d)
1835 {
1836 timeout_id_t tid = d->bd_callout;
1837 d->bd_callout = 0;
1838 d->bd_inuse++;
1839
1840 /*
1841 * If the timeout has fired and is waiting on bd_lock, we could
1842 * deadlock here: untimeout() would wait for bpf_timed_out() to
1843 * finish, which it never could while we hold bd_lock.
1844 */
1845 if (tid != 0) {
1846 mutex_exit(&d->bd_lock);
1847 (void) untimeout(tid);
1848 mutex_enter(&d->bd_lock);
1849 }
1850
1851 d->bd_inuse--;
1852 }
1853
1854 /*
1855 * As a cloning device driver, BPF needs to keep track of which device
1856 * numbers are in use and which ones are not. A hash table, indexed by
1857 * the minor device number, is used to store the pointers to the
1858 * individual descriptors that are allocated in bpfopen().
1859 * The functions below present the interface for that hash table to
1860 * the rest of the driver.
1861 */
1862 static struct bpf_d *
1863 bpf_dev_find(minor_t minor)
1864 {
1865 struct bpf_d *d = NULL;
1866
1867 (void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1868 (mod_hash_val_t *)&d);
1869
1870 return (d);
1871 }
1872
1873 static void
1874 bpf_dev_add(struct bpf_d *d)
1875 {
1876 (void) mod_hash_insert(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1877 (mod_hash_val_t)d);
1878 }
1879
1880 static void
1881 bpf_dev_remove(struct bpf_d *d)
1882 {
1883 struct bpf_d *stor;
1884
1885 (void) mod_hash_remove(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
1886 (mod_hash_val_t *)&stor);
1887 ASSERT(stor == d);
1888 }
1889
1890 /*
1891 * bpf_dev_get() should only ever be called for a minor number that exists,
1892 * thus there should always be a pointer in the hash table that corresponds
1893 * to it.
1894 */
1895 static struct bpf_d *
1896 bpf_dev_get(minor_t minor)
1897 {
1898 struct bpf_d *d = NULL;
1899
1900 (void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
1901 (mod_hash_val_t *)&d);
1902 ASSERT(d != NULL);
1903
1904 return (d);
1905 }
1906