/*	$NetBSD: bpf.c,v 1.143 2009/03/11 05:55:22 mrg Exp $	*/

/*
 * Copyright (c) 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from the Stanford/CMU enet packet filter,
 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
 * Berkeley Laboratory.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)bpf.c	8.4 (Berkeley) 1/9/95
 * static char rcsid[] =
 * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * BPF implements the following access controls for zones attempting
 * to read and write data.  Writing of data requires that the
 * net_rawaccess privilege is held, whilst reading data requires
 * either net_rawaccess or net_observability.
 *
 *                              | Shared |  Exclusive |   Global
 * -----------------------------+--------+------------+------------+
 * DLT_IPNET in local zone      |  Read  |    Read    |    Read    |
 * -----------------------------+--------+------------+------------+
 * Raw access to local zone NIC |  None  | Read/Write | Read/Write |
 * -----------------------------+--------+------------+------------+
 * Raw access to all NICs       |  None  |    None    | Read/Write |
 * -----------------------------+--------+------------+------------+
 *
 * The BPF driver is written as a cloning driver: each call to bpfopen()
 * allocates a new minor number.  This provides BPF with a 1:1 relationship
 * between opens and closes.  There is some amount of "descriptor state"
 * that is kept per open.  Pointers to this data are stored in a hash table
 * (bpf_hash) that is indexed by the minor device number for each open file.
 */
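
/*
 * As a rough illustration of the life cycle described above, a
 * user-level consumer might drive the clone/attach/read sequence as
 * sketched below.  This is a minimal, hypothetical sketch: the device
 * path, link name and error handling are assumptions, not part of
 * this driver.
 *
 *	int fd = open("/dev/bpf", O_RDWR);	(clones a new minor)
 *	struct ifreq ifr;
 *	(void) strlcpy(ifr.ifr_name, "net0", sizeof (ifr.ifr_name));
 *	(void) ioctl(fd, BIOCSETIF, &ifr);	(attach to a NIC)
 *	uint_t blen;
 *	(void) ioctl(fd, BIOCGBLEN, &blen);
 *	void *buf = malloc(blen);
 *	(void) read(fd, buf, blen);		(size must equal blen)
 *	(void) close(fd);			(releases the minor)
 */
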
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/filio.h>
#include <sys/policy.h>
#include <sys/cmn_err.h>
#include <sys/uio.h>
#include <sys/file.h>
#include <sys/sysmacros.h>
#include <sys/zone.h>

#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/poll.h>
#include <sys/dlpi.h>
#include <sys/neti.h>

#include <net/if.h>

#include <net/bpf.h>
#include <net/bpfdesc.h>
#include <net/dlt.h>

#include <netinet/in.h>
#include <sys/mac.h>
#include <sys/mac_client.h>
#include <sys/mac_impl.h>
#include <sys/time_std_impl.h>
#include <sys/hook.h>
#include <sys/hook_event.h>


#define	mtod(_v, _t)	(_t)((_v)->b_rptr)
#define	M_LEN(_m)	((_m)->b_wptr - (_m)->b_rptr)

/*
 * 4096 is too small for FDDI frames.  8192 is too small for gigabit
 * Ethernet jumbos (circa 9k), ATM, or Intel gig/10gig Ethernet jumbos (16k).
 */
#define	BPF_BUFSIZE	(32 * 1024)

typedef void *(*cp_fn_t)(void *, const void *, size_t);

/*
 * The default read buffer size, and limit for BIOCSBLEN.
 */
int bpf_bufsize = BPF_BUFSIZE;
int bpf_maxbufsize = (16 * 1024 * 1024);
static mod_hash_t *bpf_hash = NULL;

/*
 * Use a mutex to avoid a race condition between gathering the stats/peers
 * and opening/closing the device.
 */
static kcondvar_t bpf_dlt_waiter;
static kmutex_t bpf_mtx;
static bpf_kstats_t ks_stats;
static bpf_kstats_t bpf_kstats = {
	{ "readWait",		KSTAT_DATA_UINT64 },
	{ "writeOk",		KSTAT_DATA_UINT64 },
	{ "writeError",		KSTAT_DATA_UINT64 },
	{ "receive",		KSTAT_DATA_UINT64 },
	{ "captured",		KSTAT_DATA_UINT64 },
	{ "dropped",		KSTAT_DATA_UINT64 },
};
static kstat_t *bpf_ksp;

/*
 * bpf_list is a list of the BPF descriptors currently open.
 */
LIST_HEAD(, bpf_d) bpf_list;

static int	bpf_allocbufs(struct bpf_d *);
static void	bpf_clear_timeout(struct bpf_d *);
static void	bpf_deliver(struct bpf_d *, cp_fn_t,
		    void *, uint_t, uint_t, boolean_t);
static void	bpf_freed(struct bpf_d *);
static int	bpf_ifname(struct bpf_d *d, char *, int);
static void	*bpf_mcpy(void *, const void *, size_t);
static int	bpf_attachd(struct bpf_d *, const char *, int);
static void	bpf_detachd(struct bpf_d *);
static int	bpf_setif(struct bpf_d *, char *, int);
static void	bpf_timed_out(void *);
static inline void	bpf_wakeup(struct bpf_d *);
static void	catchpacket(struct bpf_d *, uchar_t *, uint_t, uint_t,
		    cp_fn_t, struct timeval *);
static void	reset_d(struct bpf_d *);
static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
static int	bpf_setdlt(struct bpf_d *, void *);
static void	bpf_dev_add(struct bpf_d *);
static struct bpf_d *bpf_dev_find(minor_t);
static struct bpf_d *bpf_dev_get(minor_t);
static void	bpf_dev_remove(struct bpf_d *);

static int
bpf_movein(struct uio *uio, int linktype, int mtu, mblk_t **mp)
{
	mblk_t *m;
	int error;
	int len;
	int hlen;
	int align;

	/*
	 * Determine the link-layer header length for this data link
	 * type.  We do this at this level because the caller hands us
	 * a complete frame: the link-layer header is copied in along
	 * with the payload.  In the case of DLT_NULL, there is no
	 * header and the packet is forwarded as is.
	 * Also, we are careful to leave room at the front of the mblk
	 * so that the link-level header ends up aligned.
	 */
	switch (linktype) {

	case DLT_EN10MB:
		hlen = sizeof (struct ether_header);
		break;

	case DLT_FDDI:
		hlen = 16;
		break;

	case DLT_NULL:
		hlen = 0;
		break;

	case DLT_IPOIB:
		hlen = 44;
		break;

	default:
		return (EIO);
	}

	align = 4 - (hlen & 3);

	len = uio->uio_resid;
	/*
	 * If there aren't enough bytes for a link level header or the
	 * packet length exceeds the interface mtu, return an error.
	 */
	if (len < hlen || len - hlen > mtu)
		return (EMSGSIZE);

	m = allocb(len + align, BPRI_MED);
	if (m == NULL) {
		error = ENOBUFS;
		goto bad;
	}

	/* Ensure the data is properly aligned */
	if (align > 0)
		m->b_rptr += align;
	m->b_wptr = m->b_rptr + len;

	error = uiomove(mtod(m, void *), len, UIO_WRITE, uio);
	if (error)
		goto bad;
	*mp = m;
	return (0);

bad:
	if (m != NULL)
		freemsg(m);
	return (error);
}


/*
 * Attach file to the bpf interface, i.e. make d listen on bp.
 */
static int
bpf_attachd(struct bpf_d *d, const char *ifname, int dlt)
{
	bpf_provider_list_t *bp;
	bpf_provider_t *bpr;
	boolean_t zonematch;
	zoneid_t niczone;
	uintptr_t mcip;
	zoneid_t zone;
	uint_t nicdlt;
	uintptr_t mh;
	int hdrlen;
	int error;

	ASSERT(d->bd_bif == NULL);
	ASSERT(d->bd_mcip == NULL);
	zone = d->bd_zone;
	zonematch = B_TRUE;
again:
	mh = 0;
	mcip = 0;
	LIST_FOREACH(bp, &bpf_providers, bpl_next) {
		bpr = bp->bpl_what;
		error = MBPF_OPEN(bpr, ifname, &mh, zone);
		if (error != 0)
			goto next;
		error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
		if (error != 0)
			goto next;
		error = MBPF_GET_DLT(bpr, mh, &nicdlt);
		if (error != 0)
			goto next;

		nicdlt = bpf_dl_to_dlt(nicdlt);
		if (dlt != -1 && dlt != nicdlt) {
			error = ENOENT;
			goto next;
		}

		error = MBPF_GET_ZONE(bpr, mh, &niczone);
		if (error != 0)
			goto next;

		DTRACE_PROBE4(bpf__attach, struct bpf_provider_s *, bpr,
		    uintptr_t, mh, int, nicdlt, zoneid_t, niczone);

		if (zonematch && niczone != zone) {
			error = ENOENT;
			goto next;
		}
		break;
next:
		if (mcip != 0) {
			MBPF_CLIENT_CLOSE(bpr, mcip);
			mcip = 0;
		}
		if (mh != 0) {
			MBPF_CLOSE(bpr, mh);
			mh = 0;
		}
	}
	if (error != 0) {
		if (zonematch && (zone == GLOBAL_ZONEID)) {
			/*
			 * If we failed to do an exact match for the global
			 * zone using the global zoneid, try again in case
			 * the network interface is owned by a local zone.
			 */
			zonematch = B_FALSE;
			goto again;
		}
		return (error);
	}

	d->bd_mac = *bpr;
	d->bd_mcip = mcip;
	d->bd_bif = mh;
	d->bd_dlt = nicdlt;
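	/*
	 * bd_hdrlen is the amount of header space to reserve in front of
	 * each captured packet so that the bpf_hdr plus the link-layer
	 * header ends up word aligned.  As a worked example (numbers
	 * illustrative only, assuming a 14 byte Ethernet header and a
	 * SIZEOF_BPF_HDR of 18): BPF_WORDALIGN(14 + 18) = 32, giving
	 * bd_hdrlen = 32 - 14 = 18, so that the packet data following
	 * both headers begins on a word boundary.
	 */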
	hdrlen = bpf_dl_hdrsize(nicdlt);
	d->bd_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;

	(void) strlcpy(d->bd_ifname, MBPF_CLIENT_NAME(&d->bd_mac, mcip),
	    sizeof (d->bd_ifname));

	(void) MBPF_GET_LINKID(&d->bd_mac, d->bd_ifname, &d->bd_linkid,
	    zone);
	(void) MBPF_PROMISC_ADD(&d->bd_mac, d->bd_mcip, 0, d,
	    &d->bd_promisc_handle, d->bd_promisc_flags);
	return (0);
}

/*
 * Detach a file from its interface.
 */
static void
bpf_detachd(struct bpf_d *d)
{
	uintptr_t mph;
	uintptr_t mch;
	uintptr_t mh;

	ASSERT(d->bd_inuse == -1);
	mch = d->bd_mcip;
	d->bd_mcip = 0;
	mh = d->bd_bif;
	d->bd_bif = 0;

	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.  There's no need to take any action
	 * here, that is done when MBPF_PROMISC_REMOVE is used;
	 * bd_promisc is just a local flag to stop promiscuous mode
	 * from being set more than once.
	 */
	if (d->bd_promisc)
		d->bd_promisc = 0;

	/*
	 * Take device out of "promiscuous" mode.  Since we were able to
	 * enter "promiscuous" mode, we should be able to turn it off.
	 * Note, this field stores a pointer used to support both
	 * promiscuous and non-promiscuous callbacks for packets.
	 */
	mph = d->bd_promisc_handle;
	d->bd_promisc_handle = 0;

	/*
	 * The lock has to be dropped here because mac_promisc_remove may
	 * need to wait for mac_promisc_dispatch, which has called into
	 * bpf and catchpacket is waiting for bd_lock...
	 * i.e. mac_promisc_remove() needs to be called with none of the
	 * locks held that are part of the bpf_mtap() call path.
	 */
	mutex_exit(&d->bd_lock);
	if (mph != 0)
		MBPF_PROMISC_REMOVE(&d->bd_mac, mph);

	if (mch != 0)
		MBPF_CLIENT_CLOSE(&d->bd_mac, mch);

	if (mh != 0)
		MBPF_CLOSE(&d->bd_mac, mh);

	/*
	 * This function is called with bd_lock held, so it must exit
	 * with it held, too.
	 */
	mutex_enter(&d->bd_lock);
	*d->bd_ifname = '\0';
	(void) memset(&d->bd_mac, 0, sizeof (d->bd_mac));
}


/*
 * bpfilterattach() is called at load time.
 */
int
bpfilterattach(void)
{

	bpf_hash = mod_hash_create_idhash("bpf_dev_tab", 31,
	    mod_hash_null_keydtor);
	if (bpf_hash == NULL)
		return (ENOMEM);

	(void) memcpy(&ks_stats, &bpf_kstats, sizeof (bpf_kstats));

	bpf_ksp = kstat_create("bpf", 0, "global", "misc",
	    KSTAT_TYPE_NAMED, sizeof (bpf_kstats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (bpf_ksp != NULL) {
		bpf_ksp->ks_data = &ks_stats;
		kstat_install(bpf_ksp);
	} else {
		mod_hash_destroy_idhash(bpf_hash);
		bpf_hash = NULL;
		return (EEXIST);
	}

	cv_init(&bpf_dlt_waiter, NULL, CV_DRIVER, NULL);
	mutex_init(&bpf_mtx, NULL, MUTEX_DRIVER, NULL);

	LIST_INIT(&bpf_list);

	return (0);
}


/*
 * bpfilterdetach() is called at unload time.
 */
int
bpfilterdetach(void)
{

	if (bpf_ksp != NULL) {
		kstat_delete(bpf_ksp);
		bpf_ksp = NULL;
	}

	mod_hash_destroy_idhash(bpf_hash);
	bpf_hash = NULL;

	cv_destroy(&bpf_dlt_waiter);
	mutex_destroy(&bpf_mtx);

	return (0);
}

/*
 * Open the BPF device.  The driver clones: each open allocates a new
 * minor number with its own descriptor.
 */
/* ARGSUSED */
int
bpfopen(dev_t *devp, int flag, int mode, cred_t *cred)
{
	struct bpf_d *d;
	uint_t dmin;

	/*
	 * The security policy described at the top of this file is
	 * enforced here.
	 */
	if ((flag & FWRITE) != 0) {
		if (secpolicy_net_rawaccess(cred) != 0)
			return (EACCES);
	}

	if ((flag & FREAD) != 0) {
		if ((secpolicy_net_observability(cred) != 0) &&
		    (secpolicy_net_rawaccess(cred) != 0))
			return (EACCES);
	}

	if ((flag & (FWRITE|FREAD)) == 0)
		return (ENXIO);

	/*
	 * A structure is allocated per open file in BPF to store settings
	 * such as buffer capture size, provide private buffers, etc.
	 */
	d = kmem_zalloc(sizeof (*d), KM_SLEEP);
	d->bd_bufsize = bpf_bufsize;
	d->bd_fmode = flag;
	d->bd_zone = crgetzoneid(cred);
	d->bd_seesent = 1;
	d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_PHYS|
	    MAC_PROMISC_FLAGS_NO_COPY;
	mutex_init(&d->bd_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&d->bd_wait, NULL, CV_DRIVER, NULL);

	mutex_enter(&bpf_mtx);
	/*
	 * Find an unused minor number.  This is an O(n) algorithm
	 * and doesn't scale particularly well, so if large numbers of
	 * BPF devices end up open at once in real use, this design may
	 * need to be revisited.
	 */
	for (dmin = 0; dmin < L_MAXMIN; dmin++)
		if (bpf_dev_find(dmin) == NULL)
			break;
	if (dmin == L_MAXMIN) {
		mutex_exit(&bpf_mtx);
		mutex_destroy(&d->bd_lock);
		cv_destroy(&d->bd_wait);
		kmem_free(d, sizeof (*d));
		return (ENXIO);
	}
	d->bd_dev = dmin;
	LIST_INSERT_HEAD(&bpf_list, d, bd_list);
	bpf_dev_add(d);
	mutex_exit(&bpf_mtx);

	*devp = makedevice(getmajor(*devp), dmin);

	return (0);
}

/*
 * Close the descriptor by detaching it from its interface,
 * deallocating its buffers, and marking it free.
 *
 * Because we only allow a device to be opened once, there is always a
 * 1:1 relationship between opens and closes supporting this function.
 */
/* ARGSUSED */
int
bpfclose(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));

	mutex_enter(&d->bd_lock);

	while (d->bd_inuse != 0) {
		d->bd_waiting++;
		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
			d->bd_waiting--;
			mutex_exit(&d->bd_lock);
			return (EINTR);
		}
		d->bd_waiting--;
	}

	d->bd_inuse = -1;
	if (d->bd_state == BPF_WAITING)
		bpf_clear_timeout(d);
	d->bd_state = BPF_IDLE;
	if (d->bd_bif)
		bpf_detachd(d);
	mutex_exit(&d->bd_lock);

	mutex_enter(&bpf_mtx);
	LIST_REMOVE(d, bd_list);
	bpf_dev_remove(d);
	mutex_exit(&bpf_mtx);

	/*
	 * The descriptor is no longer reachable, so its lock and
	 * condition variable can safely be destroyed.
	 */
	mutex_destroy(&d->bd_lock);
	cv_destroy(&d->bd_wait);

	bpf_freed(d);
	kmem_free(d, sizeof (*d));

	return (0);
}

/*
 * Rotate the packet buffers in descriptor d.  Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 */
#define	ROTATE_BUFFERS(d) \
	(d)->bd_hbuf = (d)->bd_sbuf; \
	(d)->bd_hlen = (d)->bd_slen; \
	(d)->bd_sbuf = (d)->bd_fbuf; \
	(d)->bd_slen = 0; \
	(d)->bd_fbuf = 0;
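
/*
 * To make the rotation above concrete: each descriptor owns three
 * equally sized buffers.  The store buffer (bd_sbuf) is where
 * catchpacket() appends new records, the hold buffer (bd_hbuf) is
 * what bpfread() drains to user space, and the free buffer (bd_fbuf)
 * is the spare that becomes the next store buffer.  Rotation is legal
 * only while the hold slot is empty and a free buffer exists;
 * bpfread() returns the hold buffer to the free slot once uiomove()
 * completes.
 */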
/*
 * bpfread - read next chunk of packets from buffers
 */
/* ARGSUSED */
int
bpfread(dev_t dev, struct uio *uio, cred_t *cred)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));
	int timed_out;
	ulong_t delay;
	int error;

	if ((d->bd_fmode & FREAD) == 0)
		return (EBADF);

	/*
	 * Restrict the application to using a buffer the same size as
	 * the kernel buffers.
	 */
	if (uio->uio_resid != d->bd_bufsize)
		return (EINVAL);

	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING)
		bpf_clear_timeout(d);
	timed_out = (d->bd_state == BPF_TIMED_OUT);
	d->bd_state = BPF_IDLE;
	/*
	 * If the hold buffer is empty, then do a timed sleep, which
	 * ends when the timeout expires or when enough packets
	 * have arrived to fill the store buffer.
	 */
	while (d->bd_hbuf == 0) {
		if (d->bd_nonblock) {
			if (d->bd_slen == 0) {
				mutex_exit(&d->bd_lock);
				return (EWOULDBLOCK);
			}
			ROTATE_BUFFERS(d);
			break;
		}

		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
			/*
			 * One or more packets arrived since the previous
			 * read or while we were asleep.
			 * Rotate the buffers and return what's here.
			 */
			ROTATE_BUFFERS(d);
			break;
		}
		ks_stats.kp_read_wait.value.ui64++;
		delay = ddi_get_lbolt() + d->bd_rtout;
		error = cv_timedwait_sig(&d->bd_wait, &d->bd_lock, delay);
		if (error == 0) {
			mutex_exit(&d->bd_lock);
			return (EINTR);
		}
		if (error == -1) {
			/*
			 * On a timeout, return what's in the buffer,
			 * which may be nothing.  If there is something
			 * in the store buffer, we can rotate the buffers.
			 */
			if (d->bd_hbuf)
				/*
				 * We filled up the buffer in between
				 * getting the timeout and arriving
				 * here, so we don't need to rotate.
				 */
				break;

			if (d->bd_slen == 0) {
				mutex_exit(&d->bd_lock);
				return (0);
			}
			ROTATE_BUFFERS(d);
		}
	}
	/*
	 * At this point, we know we have something in the hold slot.
	 */
	mutex_exit(&d->bd_lock);

	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bpf_bufsize bytes.
	 */
	error = uiomove(d->bd_hbuf, d->bd_hlen, UIO_READ, uio);

	mutex_enter(&d->bd_lock);
	d->bd_fbuf = d->bd_hbuf;
	d->bd_hbuf = 0;
	d->bd_hlen = 0;
	mutex_exit(&d->bd_lock);
	return (error);
}


/*
 * If there are processes sleeping on this descriptor, wake them up.
 * NOTE: the lock for bd_wait is bd_lock and is held by bpf_deliver,
 * so there is no code here grabbing it.
 */
static inline void
bpf_wakeup(struct bpf_d *d)
{
	cv_signal(&d->bd_wait);
}

static void
bpf_timed_out(void *arg)
{
	struct bpf_d *d = arg;

	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING) {
		d->bd_state = BPF_TIMED_OUT;
		if (d->bd_slen != 0)
			cv_signal(&d->bd_wait);
	}
	mutex_exit(&d->bd_lock);
}


/* ARGSUSED */
int
bpfwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));
	uintptr_t mch;
	uint_t mtu;
	mblk_t *m;
	int error;
	int dlt;

	if ((d->bd_fmode & FWRITE) == 0)
		return (EBADF);

	mutex_enter(&d->bd_lock);
	if (d->bd_bif == 0 || d->bd_mcip == 0) {
		mutex_exit(&d->bd_lock);
		return (EINTR);
	}

	if (uio->uio_resid == 0) {
		mutex_exit(&d->bd_lock);
		return (0);
	}

	while (d->bd_inuse < 0) {
		d->bd_waiting++;
		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
			d->bd_waiting--;
			mutex_exit(&d->bd_lock);
			return (EINTR);
		}
		d->bd_waiting--;
	}

	mutex_exit(&d->bd_lock);

	dlt = d->bd_dlt;
	mch = d->bd_mcip;
	MBPF_SDU_GET(&d->bd_mac, d->bd_bif, &mtu);
	d->bd_inuse++;

	m = NULL;
	if (dlt == DLT_IPNET) {
		error = EIO;
		goto done;
	}

	error = bpf_movein(uio, dlt, mtu, &m);
	if (error)
		goto done;

	DTRACE_PROBE4(bpf__tx, struct bpf_d *, d, int, dlt,
	    uint_t, mtu, mblk_t *, m);

	if (M_LEN(m) > mtu) {
		error = EMSGSIZE;
		goto done;
	}

	error = MBPF_TX(&d->bd_mac, mch, m);
	/*
	 * The "tx" action here is required to consume the mblk_t.
	 */
	m = NULL;

done:
	if (error == 0)
		ks_stats.kp_write_ok.value.ui64++;
	else
		ks_stats.kp_write_error.value.ui64++;
	if (m != NULL)
		freemsg(m);

	mutex_enter(&d->bd_lock);
	d->bd_inuse--;
	if ((d->bd_inuse == 0) && (d->bd_waiting != 0))
		cv_signal(&d->bd_wait);
	mutex_exit(&d->bd_lock);

	/*
	 * The MAC layer has consumed (and will free) the mblk.
	 */
	return (error);
}


/*
 * Reset a descriptor by flushing its packet buffer and clearing the
 * receive, drop and capture counts.  Called with bd_lock held.
 */
static void
reset_d(struct bpf_d *d)
{
	if (d->bd_hbuf) {
		/* Free the hold buffer. */
		d->bd_fbuf = d->bd_hbuf;
		d->bd_hbuf = 0;
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	d->bd_rcount = 0;
	d->bd_dcount = 0;
	d->bd_ccount = 0;
}

/*
 *  FIONREAD		Check for read packet available.
 *  FIONBIO		Set/clear non-blocking I/O.
 *  BIOCGBLEN		Get buffer len [for read()].
 *  BIOCSBLEN		Set buffer len (before an interface is attached).
 *  BIOCSETF		Set link layer read filter.
 *  BIOCFLUSH		Flush read packet buffer.
 *  BIOCPROMISC		Put interface into promiscuous mode.
 *  BIOCGDLT		Get link layer type.
 *  BIOCGDLTLIST	Get a list of supported link layer types.
 *  BIOCSDLT		Set link layer type.
 *  BIOCGETIF/BIOCGETLIF Get interface name.
 *  BIOCSETIF/BIOCSETLIF Set interface.
 *  BIOCSRTIMEOUT	Set read timeout.
 *  BIOCGRTIMEOUT	Get read timeout.
 *  BIOCGSTATS		Get packet stats.
 *  BIOCIMMEDIATE	Set immediate mode.
 *  BIOCVERSION		Get filter language version.
 *  BIOCGHDRCMPLT	Get "header already complete" flag.
 *  BIOCSHDRCMPLT	Set "header already complete" flag.
 *  BIOCGSEESENT	Get "see sent packets" flag.
 *  BIOCSSEESENT	Set "see sent packets" flag.
 */
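/*
 * A plausible user-level sequence over these ioctls, for orientation
 * only (the link name and device path are hypothetical, and errors
 * are ignored for brevity):
 *
 *	int fd = open("/dev/bpf", O_RDWR);
 *	uint_t blen = 65536;
 *	(void) ioctl(fd, BIOCSBLEN, &blen);	(only before BIOCSETIF)
 *	struct ifreq ifr;
 *	(void) strlcpy(ifr.ifr_name, "net0", sizeof (ifr.ifr_name));
 *	(void) ioctl(fd, BIOCSETIF, &ifr);
 *	uint_t on = 1;
 *	(void) ioctl(fd, BIOCIMMEDIATE, &on);
 *	(void) ioctl(fd, BIOCPROMISC, NULL);
 *
 * Note that BIOCSBLEN fails with EINVAL once an interface is attached,
 * which is why it appears before BIOCSETIF above.
 */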
/* ARGSUSED */
int
bpfioctl(dev_t dev, int cmd, intptr_t addr, int mode, cred_t *cred, int *rval)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));
	struct bpf_program prog;
	struct lifreq lifreq;
	struct ifreq ifreq;
	int error = 0;
	uint_t size;

	/*
	 * Cancel any pending read timeout and return this descriptor
	 * to the idle state before acting on the ioctl.
	 */
	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING)
		bpf_clear_timeout(d);
	d->bd_state = BPF_IDLE;
	mutex_exit(&d->bd_lock);

	switch (cmd) {

	default:
		error = EINVAL;
		break;

	/*
	 * Check for read packet available.
	 */
	case FIONREAD:
	{
		int n;

		mutex_enter(&d->bd_lock);
		n = d->bd_slen;
		if (d->bd_hbuf)
			n += d->bd_hlen;
		mutex_exit(&d->bd_lock);

		*(int *)addr = n;
		break;
	}

	/*
	 * Get buffer len [for read()].
	 */
	case BIOCGBLEN:
		error = copyout(&d->bd_bufsize, (void *)addr,
		    sizeof (d->bd_bufsize));
		break;

	/*
	 * Set buffer length.
	 */
	case BIOCSBLEN:
		if (copyin((void *)addr, &size, sizeof (size)) != 0) {
			error = EFAULT;
			break;
		}

		mutex_enter(&d->bd_lock);
		if (d->bd_bif != 0) {
			error = EINVAL;
		} else {
			if (size > bpf_maxbufsize)
				size = bpf_maxbufsize;
			else if (size < BPF_MINBUFSIZE)
				size = BPF_MINBUFSIZE;

			d->bd_bufsize = size;
		}
		mutex_exit(&d->bd_lock);

		if (error == 0)
			error = copyout(&size, (void *)addr, sizeof (size));
		break;

	/*
	 * Set link layer read filter.
	 */
	case BIOCSETF:
		if (ddi_copyin((void *)addr, &prog, sizeof (prog), mode)) {
			error = EFAULT;
			break;
		}
		error = bpf_setf(d, &prog);
		break;

	/*
	 * Flush read packet buffer.
	 */
	case BIOCFLUSH:
		mutex_enter(&d->bd_lock);
		reset_d(d);
		mutex_exit(&d->bd_lock);
		break;

	/*
	 * Put interface into promiscuous mode.
	 * This is a one-way ioctl; it is not used to turn promiscuous
	 * mode off.
	 */
	case BIOCPROMISC:
		if (d->bd_bif == 0) {
			/*
			 * No interface attached yet.
			 */
			error = EINVAL;
			break;
		}
		mutex_enter(&d->bd_lock);
		if (d->bd_promisc == 0) {

			if (d->bd_promisc_handle) {
				uintptr_t mph;

				mph = d->bd_promisc_handle;
				d->bd_promisc_handle = 0;

				mutex_exit(&d->bd_lock);
				MBPF_PROMISC_REMOVE(&d->bd_mac, mph);
				mutex_enter(&d->bd_lock);
			}

			d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_COPY;
			error = MBPF_PROMISC_ADD(&d->bd_mac,
			    d->bd_mcip, MAC_CLIENT_PROMISC_ALL, d,
			    &d->bd_promisc_handle, d->bd_promisc_flags);
			if (error == 0)
				d->bd_promisc = 1;
		}
		mutex_exit(&d->bd_lock);
		break;

	/*
	 * Get device parameters.
	 */
	case BIOCGDLT:
		if (d->bd_bif == 0)
			error = EINVAL;
		else
			error = copyout(&d->bd_dlt, (void *)addr,
			    sizeof (d->bd_dlt));
		break;

	/*
	 * Get a list of supported device parameters.
	 */
	case BIOCGDLTLIST:
		if (d->bd_bif == 0) {
			error = EINVAL;
		} else {
			struct bpf_dltlist list;

			if (copyin((void *)addr, &list, sizeof (list)) != 0) {
				error = EFAULT;
				break;
			}
			error = bpf_getdltlist(d, &list);
			if ((error == 0) &&
			    copyout(&list, (void *)addr, sizeof (list)) != 0)
				error = EFAULT;
		}
		break;

	/*
	 * Set device parameters.
	 */
	case BIOCSDLT:
		error = bpf_setdlt(d, (void *)addr);
		break;

	/*
	 * Get interface name.
	 */
	case BIOCGETIF:
		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
			error = EFAULT;
			break;
		}
		error = bpf_ifname(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
		if ((error == 0) &&
		    copyout(&ifreq, (void *)addr, sizeof (ifreq)) != 0) {
			error = EFAULT;
			break;
		}
		break;

	/*
	 * Set interface.
	 */
	case BIOCSETIF:
		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
			error = EFAULT;
			break;
		}
		error = bpf_setif(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
		break;

	/*
	 * Get interface name (long form).
	 */
	case BIOCGETLIF:
		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
			error = EFAULT;
			break;
		}
		error = bpf_ifname(d, lifreq.lifr_name,
		    sizeof (lifreq.lifr_name));
		if ((error == 0) &&
		    copyout(&lifreq, (void *)addr, sizeof (lifreq)) != 0) {
			error = EFAULT;
			break;
		}
		break;

	/*
	 * Set interface (long form).
	 */
	case BIOCSETLIF:
		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
			error = EFAULT;
			break;
		}
		error = bpf_setif(d, lifreq.lifr_name,
		    sizeof (lifreq.lifr_name));
		break;

#ifdef _SYSCALL32_IMPL
	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT32:
	{
		struct timeval32 tv;

		if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
			error = EFAULT;
			break;
		}

		/* Convert the timeout in microseconds to ticks */
		d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
		    tv.tv_usec);
		if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
			d->bd_rtout = 1;
		break;
	}

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT32:
	{
		struct timeval32 tv;
		clock_t usec;

		usec = drv_hztousec(d->bd_rtout);
		tv.tv_sec = usec / 1000000;
		tv.tv_usec = usec - (tv.tv_sec * 1000000);
		error = copyout(&tv, (void *)addr, sizeof (tv));
		break;
	}

	/*
	 * Get a list of supported device parameters.
	 */
	case BIOCGDLTLIST32:
		if (d->bd_bif == 0) {
			error = EINVAL;
		} else {
			struct bpf_dltlist32 lst32;
			struct bpf_dltlist list;

			if (copyin((void *)addr, &lst32, sizeof (lst32)) != 0) {
				error = EFAULT;
				break;
			}

			list.bfl_len = lst32.bfl_len;
			list.bfl_list = (void *)(uint64_t)lst32.bfl_list;
			error = bpf_getdltlist(d, &list);
			if (error == 0) {
				lst32.bfl_len = list.bfl_len;

				if (copyout(&lst32, (void *)addr,
				    sizeof (lst32)) != 0)
					error = EFAULT;
			}
		}
		break;

	/*
	 * Set link layer read filter.
	 */
	case BIOCSETF32: {
		struct bpf_program32 prog32;

		if (ddi_copyin((void *)addr, &prog32, sizeof (prog32), mode)) {
			error = EFAULT;
			break;
		}
		prog.bf_len = prog32.bf_len;
		prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
		error = bpf_setf(d, &prog);
		break;
	}
#endif

	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT:
	{
		struct timeval tv;

		if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
			error = EFAULT;
			break;
		}

		/* Convert the timeout in microseconds to ticks */
		d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
		    tv.tv_usec);
		if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
			d->bd_rtout = 1;
		break;
	}

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT:
	{
		struct timeval tv;
		clock_t usec;

		usec = drv_hztousec(d->bd_rtout);
		tv.tv_sec = usec / 1000000;
		tv.tv_usec = usec - (tv.tv_sec * 1000000);
		if (copyout(&tv, (void *)addr, sizeof (tv)) != 0)
			error = EFAULT;
		break;
	}

	/*
	 * Get packet stats.
	 */
	case BIOCGSTATS:
	{
		struct bpf_stat bs;

		bs.bs_recv = d->bd_rcount;
		bs.bs_drop = d->bd_dcount;
		bs.bs_capt = d->bd_ccount;
		if (copyout(&bs, (void *)addr, sizeof (bs)) != 0)
			error = EFAULT;
		break;
	}

	/*
	 * Set immediate mode.
	 */
	case BIOCIMMEDIATE:
		if (copyin((void *)addr, &d->bd_immediate,
		    sizeof (d->bd_immediate)) != 0)
			error = EFAULT;
		break;

	case BIOCVERSION:
	{
		struct bpf_version bv;

		bv.bv_major = BPF_MAJOR_VERSION;
		bv.bv_minor = BPF_MINOR_VERSION;
		if (copyout(&bv, (void *)addr, sizeof (bv)) != 0)
			error = EFAULT;
		break;
	}

	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
		if (copyout(&d->bd_hdrcmplt, (void *)addr,
		    sizeof (d->bd_hdrcmplt)) != 0)
			error = EFAULT;
		break;

	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
		if (copyin((void *)addr, &d->bd_hdrcmplt,
		    sizeof (d->bd_hdrcmplt)) != 0)
			error = EFAULT;
		break;

	/*
	 * Get "see sent packets" flag.
	 */
	case BIOCGSEESENT:
		if (copyout(&d->bd_seesent, (void *)addr,
		    sizeof (d->bd_seesent)) != 0)
			error = EFAULT;
		break;

	/*
	 * Set "see sent packets" flag.
	 */
	case BIOCSSEESENT:
		if (copyin((void *)addr, &d->bd_seesent,
		    sizeof (d->bd_seesent)) != 0)
			error = EFAULT;
		break;

	case FIONBIO:		/* Non-blocking I/O */
		if (copyin((void *)addr, &d->bd_nonblock,
		    sizeof (d->bd_nonblock)) != 0)
			error = EFAULT;
		break;
	}
	return (error);
}

/*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  If the new filter is "empty" (has a 0 size),
 * then the result is to just remove and free the existing filter.
 * Returns EINVAL for bogus requests.
 */
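/*
 * For orientation, a filter program is an array of struct bpf_insn
 * built with the classic BPF_STMT()/BPF_JUMP() macros from <net/bpf.h>.
 * A minimal, illustrative user-level sketch that accepts only IPv4
 * over Ethernet (offsets assume DLT_EN10MB) might look like:
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 12),	   (load ethertype)
 *		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0x0800, 0, 1),
 *		BPF_STMT(BPF_RET+BPF_K, (uint_t)-1),	   (accept packet)
 *		BPF_STMT(BPF_RET+BPF_K, 0),		   (reject packet)
 *	};
 *	struct bpf_program prog = {
 *		sizeof (insns) / sizeof (insns[0]), insns
 *	};
 *	(void) ioctl(fd, BIOCSETF, &prog);
 *
 * The program must pass bpf_validate() below before it is installed.
 */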
int
bpf_setf(struct bpf_d *d, struct bpf_program *fp)
{
	struct bpf_insn *fcode, *old;
	uint_t flen, size;
	size_t oldsize;

	if (fp->bf_insns == 0) {
		if (fp->bf_len != 0)
			return (EINVAL);
		mutex_enter(&d->bd_lock);
		old = d->bd_filter;
		oldsize = d->bd_filter_size;
		d->bd_filter = 0;
		d->bd_filter_size = 0;
		reset_d(d);
		mutex_exit(&d->bd_lock);
		if (old != 0)
			kmem_free(old, oldsize);
		return (0);
	}
	flen = fp->bf_len;
	if (flen > BPF_MAXINSNS)
		return (EINVAL);

	size = flen * sizeof (*fp->bf_insns);
	fcode = kmem_alloc(size, KM_SLEEP);
	if (copyin(fp->bf_insns, fcode, size) != 0) {
		kmem_free(fcode, size);
		return (EFAULT);
	}

	if (bpf_validate(fcode, (int)flen)) {
		mutex_enter(&d->bd_lock);
		old = d->bd_filter;
		oldsize = d->bd_filter_size;
		d->bd_filter = fcode;
		d->bd_filter_size = size;
		reset_d(d);
		mutex_exit(&d->bd_lock);
		if (old != 0)
			kmem_free(old, oldsize);

		return (0);
	}
	kmem_free(fcode, size);
	return (EINVAL);
}

/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifname.
 * Return an errno or 0.
 */
static int
bpf_setif(struct bpf_d *d, char *ifname, int namesize)
{
	int unit_seen;
	int error = 0;
	char *cp;
	int i;

	/*
	 * Make sure the provided name has a unit number, and default
	 * it to '0' if not specified.
	 * XXX This is ugly ... do this differently?
	 */
	unit_seen = 0;
	cp = ifname;
	cp[namesize - 1] = '\0';	/* sanity */
	while (*cp++)
		if (*cp >= '0' && *cp <= '9')
			unit_seen = 1;
	if (!unit_seen) {
		/* Make sure to leave room for the '\0'. */
		for (i = 0; i < (namesize - 1); ++i) {
			if ((ifname[i] >= 'a' && ifname[i] <= 'z') ||
			    (ifname[i] >= 'A' && ifname[i] <= 'Z'))
				continue;
			ifname[i] = '0';
		}
	}

	/*
	 * Make sure that only one call to this function happens at a time
	 * and that we're not interleaving a read/write.
	 */
	mutex_enter(&d->bd_lock);
	while (d->bd_inuse != 0) {
		d->bd_waiting++;
		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
			d->bd_waiting--;
			mutex_exit(&d->bd_lock);
			return (EINTR);
		}
		d->bd_waiting--;
	}
	d->bd_inuse = -1;
	mutex_exit(&d->bd_lock);

	if (d->bd_sbuf == 0)
		error = bpf_allocbufs(d);

	if (error == 0) {
		mutex_enter(&d->bd_lock);
		if (d->bd_bif)
			/*
			 * Detach if attached to something else.
			 */
			bpf_detachd(d);

		error = bpf_attachd(d, ifname, -1);
		reset_d(d);
		d->bd_inuse = 0;
		if (d->bd_waiting != 0)
			cv_signal(&d->bd_wait);
		mutex_exit(&d->bd_lock);
		return (error);
	}

	mutex_enter(&d->bd_lock);
	d->bd_inuse = 0;
	if (d->bd_waiting != 0)
		cv_signal(&d->bd_wait);
	mutex_exit(&d->bd_lock);

	/*
	 * Try to tickle the mac layer into attaching the device...
	 */
	return (bpf_provider_tickle(ifname, d->bd_zone));
}

/*
 * Copy the interface name to the ifreq.
 */
static int
bpf_ifname(struct bpf_d *d, char *buffer, int bufsize)
{
	mutex_enter(&d->bd_lock);
	if (d->bd_bif == NULL) {
		mutex_exit(&d->bd_lock);
		return (EINVAL);
	}

	(void) strlcpy(buffer, d->bd_ifname, bufsize);
	mutex_exit(&d->bd_lock);

	return (0);
}

/*
 * Support for the poll() system call.
 *
 * Return true iff the specific operation will not block indefinitely, with
 * the assumption that it is safe to positively acknowledge a request for
 * the ability to write to the BPF device.
 * Otherwise, return false but record that a wakeup must be delivered on
 * the pollhead later, when data arrives or the read timeout fires.
 */
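/*
 * A hypothetical user-level consumer pairing this entry point with a
 * read timeout (names and values illustrative only):
 *
 *	struct timeval tv = { 1, 0 };
 *	(void) ioctl(fd, BIOCSRTIMEOUT, &tv);
 *	struct pollfd pfd = { fd, POLLIN, 0 };
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		(void) read(fd, buf, blen);
 *
 * With a non-zero timeout set, the poll arms the bpf_timed_out()
 * callout below, so a reader wakes up even on a partly filled
 * store buffer.
 */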
int
bpfchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));

	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * An imitation of the FIONREAD ioctl code.
		 */
		mutex_enter(&d->bd_lock);
		if (d->bd_hlen != 0 ||
		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
		    d->bd_slen != 0)) {
			*reventsp |= events & (POLLIN | POLLRDNORM);
		} else {
			*reventsp = 0;
			if (!anyyet)
				*phpp = &d->bd_poll;
			/* Start the read timeout if necessary */
			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
				bpf_clear_timeout(d);
				/*
				 * Only allow the timeout to be set once.
				 */
				if (d->bd_callout == 0)
					d->bd_callout = timeout(bpf_timed_out,
					    d, d->bd_rtout);
				d->bd_state = BPF_WAITING;
			}
		}
		mutex_exit(&d->bd_lock);
	}

	return (0);
}

/*
 * Copy data from an mblk_t chain into a buffer.  This works for ipnet
 * because the dl_ipnetinfo_t is placed in an mblk_t that leads the
 * packet itself.
 */
static void *
bpf_mcpy(void *dst_arg, const void *src_arg, size_t len)
{
	const mblk_t *m;
	uint_t count;
	uchar_t *dst;

	m = src_arg;
	dst = dst_arg;
	while (len > 0) {
		if (m == NULL)
			panic("bpf_mcpy");
		count = (uint_t)min(M_LEN(m), len);
		(void) memcpy(dst, mtod(m, const void *), count);
		m = m->b_cont;
		dst += count;
		len -= count;
	}
	return (dst_arg);
}

/*
 * Dispatch a packet to the listener descriptor d.
 *
 * marg		pointer to the packet, either a data buffer or an mblk chain
 * buflen	buffer length, if marg is a data buffer
 * cpfn		a function that can copy marg into the listener's buffer
 * pktlen	length of the packet
 * issent	boolean indicating whether the packet was sent or received
 */
static inline void
bpf_deliver(struct bpf_d *d, cp_fn_t cpfn, void *marg, uint_t pktlen,
    uint_t buflen, boolean_t issent)
{
	struct timeval tv;
	uint_t slen;

	if (!d->bd_seesent && issent)
		return;

	/*
	 * Accuracy of the packet counters in BPF is vital so it
	 * is important to protect even the outer ones.
	 */
	mutex_enter(&d->bd_lock);
	slen = bpf_filter(d->bd_filter, marg, pktlen, buflen);
	DTRACE_PROBE5(bpf__packet, struct bpf_if *, d->bd_bif,
	    struct bpf_d *, d, void *, marg, uint_t, pktlen, uint_t, slen);
	d->bd_rcount++;
	ks_stats.kp_receive.value.ui64++;
	if (slen != 0) {
		uniqtime(&tv);
		catchpacket(d, marg, pktlen, slen, cpfn, &tv);
	}
	mutex_exit(&d->bd_lock);
}

/*
 * Incoming linkage from device drivers.
 */
/* ARGSUSED */
void
bpf_mtap(void *arg, mac_resource_handle_t mrh, mblk_t *m, boolean_t issent)
{
	cp_fn_t cpfn;
	struct bpf_d *d = arg;
	uint_t pktlen, buflen;
	void *marg;

	pktlen = msgdsize(m);

	if (pktlen == M_LEN(m)) {
		cpfn = (cp_fn_t)memcpy;
		marg = mtod(m, void *);
		buflen = pktlen;
	} else {
		cpfn = bpf_mcpy;
		marg = m;
		buflen = 0;
	}

	bpf_deliver(d, cpfn, marg, pktlen, buflen, issent);
}

/*
 * Incoming linkage from ipnet.
 * In ipnet, there is only one event, NH_OBSERVE, that delivers packets
 * from all network interfaces.  Thus the tap function needs to apply a
 * filter using the interface index/id to imitate snooping on just the
 * specified interface.
 */
/* ARGSUSED */
void
bpf_itap(void *arg, mblk_t *m, boolean_t issent, uint_t length)
{
	hook_pkt_observe_t *hdr;
	struct bpf_d *d = arg;

	hdr = (hook_pkt_observe_t *)m->b_rptr;
	if (ntohl(hdr->hpo_ifindex) != d->bd_linkid)
		return;
	bpf_deliver(d, bpf_mcpy, m, length, 0, issent);
}

/*
 * Move the packet data from interface memory (pkt) into the
 * store buffer, waking up a listener if the buffer had to be rotated
 * (i.e. it was full).  "cpfn" is the routine called to do the actual
 * data transfer: memcpy is passed in to copy contiguous chunks, while
 * bpf_mcpy is passed in to copy mblk chains.  In the latter case,
 * pkt is really an mblk_t.
 */
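/*
 * For reference, each record appended to the store buffer has the
 * following layout (lengths from the bpf_hdr fields, padding from
 * BPF_WORDALIGN); the picture is illustrative, not literal:
 *
 *	+----------------+---------------------+---------+
 *	| struct bpf_hdr | captured packet     | padding |
 *	| (bh_hdrlen)    | data (bh_caplen)    |         |
 *	+----------------+---------------------+---------+
 *	^ curlen					  next record starts
 *							  at BPF_WORDALIGN(bd_slen)
 *
 * A reader walks the buffer by advancing
 * BPF_WORDALIGN(bh_hdrlen + bh_caplen) bytes per record.
 */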
static void
catchpacket(struct bpf_d *d, uchar_t *pkt, uint_t pktlen, uint_t snaplen,
    cp_fn_t cpfn, struct timeval *tv)
{
	struct bpf_hdr *hp;
	int totlen, curlen;
	int hdrlen = d->bd_hdrlen;
	int do_wakeup = 0;

	++d->bd_ccount;
	ks_stats.kp_capture.value.ui64++;
	/*
	 * Figure out how many bytes to move.  At most snaplen bytes of
	 * packet data are transferred, capped by the packet length and
	 * by the buffer size limit.
	 */
	totlen = hdrlen + min(snaplen, pktlen);
	if (totlen > d->bd_bufsize)
		totlen = d->bd_bufsize;

	/*
	 * Round up the end of the previous packet to the next longword.
	 */
	curlen = BPF_WORDALIGN(d->bd_slen);
	if (curlen + totlen > d->bd_bufsize) {
		/*
		 * This packet will overflow the storage buffer.
		 * Rotate the buffers if we can, then wakeup any
		 * pending reads.
		 */
		if (d->bd_fbuf == 0) {
			/*
			 * We haven't completed the previous read yet,
			 * so drop the packet.
			 */
			++d->bd_dcount;
			ks_stats.kp_dropped.value.ui64++;
			return;
		}
		ROTATE_BUFFERS(d);
		do_wakeup = 1;
		curlen = 0;
	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
		/*
		 * Immediate mode is set, or the read timeout has
		 * already expired during a select call.  A packet
		 * arrived, so the reader should be woken up.
		 */
		do_wakeup = 1;
	}

	/*
	 * Append the bpf header to the existing buffer before we add
	 * on the actual packet data.
	 */
	hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
	hp->bh_tstamp.tv_sec = tv->tv_sec;
	hp->bh_tstamp.tv_usec = tv->tv_usec;
	hp->bh_datalen = pktlen;
	hp->bh_hdrlen = (uint16_t)hdrlen;
	/*
	 * Copy the packet data into the store buffer and update its length.
	 */
	(*cpfn)((uchar_t *)hp + hdrlen, pkt,
	    (hp->bh_caplen = totlen - hdrlen));
	d->bd_slen = curlen + totlen;

	/*
	 * Call bpf_wakeup after bd_slen has been updated.
	 */
	if (do_wakeup)
		bpf_wakeup(d);
}

/*
 * Allocate the store and free buffers for a descriptor; the hold slot
 * starts out empty.
 */
static int
bpf_allocbufs(struct bpf_d *d)
{
	d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
	if (!d->bd_fbuf)
		return (ENOBUFS);
	d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
	if (!d->bd_sbuf) {
		kmem_free(d->bd_fbuf, d->bd_bufsize);
		return (ENOBUFS);
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	return (0);
}

/*
 * Free buffers currently in use by a descriptor.
 * Called on close.
 */
static void
bpf_freed(struct bpf_d *d)
{
	/*
	 * At this point the descriptor has been detached from its
	 * interface but has not yet been marked free.
	 */
	if (d->bd_sbuf != 0) {
		kmem_free(d->bd_sbuf, d->bd_bufsize);
		if (d->bd_hbuf != 0)
			kmem_free(d->bd_hbuf, d->bd_bufsize);
		if (d->bd_fbuf != 0)
			kmem_free(d->bd_fbuf, d->bd_bufsize);
	}
	if (d->bd_filter)
		kmem_free(d->bd_filter, d->bd_filter_size);
}

/*
 * Get a list of the available data link types for the interface.
 */
static int
bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *listp)
{
	bpf_provider_list_t *bp;
	bpf_provider_t *bpr;
	zoneid_t zoneid;
	uintptr_t mcip;
	uint_t nicdlt;
	uintptr_t mh;
	int error;
	int n;

	n = 0;
	mh = 0;
	mcip = 0;
	error = 0;
	mutex_enter(&d->bd_lock);
	LIST_FOREACH(bp, &bpf_providers, bpl_next) {
		bpr = bp->bpl_what;
		error = MBPF_OPEN(bpr, d->bd_ifname, &mh, d->bd_zone);
		if (error != 0)
			goto next;
		error = MBPF_CLIENT_OPEN(bpr, mh, &mcip);
		if (error != 0)
			goto next;
		error = MBPF_GET_ZONE(bpr, mh, &zoneid);
		if (error != 0)
			goto next;
		if (d->bd_zone != GLOBAL_ZONEID &&
		    d->bd_zone != zoneid)
			goto next;
		error = MBPF_GET_DLT(bpr, mh, &nicdlt);
		if (error != 0)
			goto next;
		nicdlt = bpf_dl_to_dlt(nicdlt);
		if (listp->bfl_list != NULL) {
			if (n >= listp->bfl_len) {
				MBPF_CLIENT_CLOSE(bpr, mcip);
				MBPF_CLOSE(bpr, mh);
				break;
			}
			/*
			 * Bumping bd_inuse ensures the structure does not
			 * disappear while the copyout runs and allows the
			 * loop to be continued afterwards.
			 */
			d->bd_inuse++;
			mutex_exit(&d->bd_lock);
			if (copyout(&nicdlt,
			    listp->bfl_list + n, sizeof (uint_t)) != 0)
				error = EFAULT;
			mutex_enter(&d->bd_lock);
			d->bd_inuse--;
			if (error != 0)
				break;
		}
		n++;
next:
		if (mcip != 0) {
			MBPF_CLIENT_CLOSE(bpr, mcip);
			mcip = 0;
		}
		if (mh != 0) {
			MBPF_CLOSE(bpr, mh);
			mh = 0;
		}
	}
	mutex_exit(&d->bd_lock);

	/*
	 * It is quite possible that one or more providers to BPF may not
	 * know about a link name whilst others do.  In that case, so long
	 * as we have one success, do not declare an error unless it was
	 * an EFAULT, as this indicates a problem that needs to be reported.
	 */
	if ((error != EFAULT) && (n > 0))
		error = 0;

	listp->bfl_len = n;
	return (error);
}

/*
 * Set the data link type of a BPF instance.
 */
static int
bpf_setdlt(struct bpf_d *d, void *addr)
{
	char ifname[LIFNAMSIZ+1];
	zoneid_t niczone;
	int error;
	int dlt;

	if (copyin(addr, &dlt, sizeof (dlt)) != 0)
		return (EFAULT);

	mutex_enter(&d->bd_lock);

	if (d->bd_bif == 0) {			/* Interface not set */
		mutex_exit(&d->bd_lock);
		return (EINVAL);
	}
	if (d->bd_dlt == dlt) {			/* no-op */
		mutex_exit(&d->bd_lock);
		return (0);
	}

	error = MBPF_GET_ZONE(&d->bd_mac, d->bd_bif, &niczone);
	if (error != 0) {
		mutex_exit(&d->bd_lock);
		return (error);
	}

	/*
	 * See the matrix at the top of the file for the permissions table
	 * enforced by this driver.
	 */
	if ((d->bd_zone != GLOBAL_ZONEID) && (dlt != DLT_IPNET) &&
	    (niczone != d->bd_zone)) {
		mutex_exit(&d->bd_lock);
		return (EINVAL);
	}

	(void) strlcpy(ifname, d->bd_ifname, sizeof (ifname));
	d->bd_inuse = -1;
	bpf_detachd(d);
	error = bpf_attachd(d, ifname, dlt);
	reset_d(d);
	d->bd_inuse = 0;

	mutex_exit(&d->bd_lock);
	return (error);
}

/*
 * bpf_clear_timeout is called with the bd_lock mutex held, providing it
 * with the necessary protection to retrieve and modify bd_callout, but it
 * does not hold the lock for its entire duration... see below...
 */
static void
bpf_clear_timeout(struct bpf_d *d)
{
	timeout_id_t tid = d->bd_callout;
	d->bd_callout = 0;
	d->bd_inuse++;

	/*
	 * If the timeout has fired and is blocked on bd_lock, we would
	 * deadlock by calling untimeout() with bd_lock held: untimeout()
	 * waits for bpf_timed_out() to finish, and bpf_timed_out() can
	 * never finish without acquiring bd_lock.  So drop the lock
	 * around the untimeout() call.
	 */
	if (tid != 0) {
		mutex_exit(&d->bd_lock);
		(void) untimeout(tid);
		mutex_enter(&d->bd_lock);
	}

	d->bd_inuse--;
}

/*
 * As a cloning device driver, BPF needs to keep track of which device
 * numbers are in use and which ones are not.  A hash table, indexed by
 * the minor device number, is used to store the pointers to the
 * individual descriptors that are allocated in bpfopen().
 * The functions below present the interface for that hash table to
 * the rest of the driver.
 */
static struct bpf_d *
bpf_dev_find(minor_t minor)
{
	struct bpf_d *d = NULL;

	(void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
	    (mod_hash_val_t *)&d);

	return (d);
}

static void
bpf_dev_add(struct bpf_d *d)
{
	(void) mod_hash_insert(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
	    (mod_hash_val_t)d);
}

static void
bpf_dev_remove(struct bpf_d *d)
{
	struct bpf_d *stor;

	(void) mod_hash_remove(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev,
	    (mod_hash_val_t *)&stor);
	ASSERT(stor == d);
}

/*
 * bpf_dev_get should only ever be called for a minor number that exists,
 * thus there should always be a pointer in the hash table that corresponds
 * to it.
 */
static struct bpf_d *
bpf_dev_get(minor_t minor)
{
	struct bpf_d *d = NULL;

	(void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor,
	    (mod_hash_val_t *)&d);
	ASSERT(d != NULL);

	return (d);
}