/* $NetBSD: bpf.c,v 1.143 2009/03/11 05:55:22 mrg Exp $ */

/*
 * Copyright (c) 1990, 1991, 1993
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from the Stanford/CMU enet packet filter,
 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
 * Berkeley Laboratory.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)bpf.c	8.4 (Berkeley) 1/9/95
 * static char rcsid[] =
 * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * BPF implements the following access controls for zones attempting
 * to read and write data.  Writing of data requires that the
 * net_rawaccess privilege is held, whilst reading data requires
 * either net_rawaccess or net_observability.
 *
 *                              | Shared |  Exclusive |   Global
 * -----------------------------+--------+------------+------------+
 * DLT_IPNET in local zone      |  Read  |    Read    |    Read    |
 * -----------------------------+--------+------------+------------+
 * Raw access to local zone NIC |  None  | Read/Write | Read/Write |
 * -----------------------------+--------+------------+------------+
 * Raw access to all NICs       |  None  |    None    | Read/Write |
 * -----------------------------+--------+------------+------------+
 *
 * The BPF driver is written as a cloning driver: each call to bpfopen()
 * allocates a new minor number.  This provides BPF with a 1:1 relationship
 * between opens and closes.  There is some amount of "descriptor state"
 * that is kept per open.  Pointers to this data are stored in a hash table
 * (bpf_hash) that is indexed by the minor device number for each open file.
 */
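/*
 * For orientation, a minimal sketch of how a userland consumer drives
 * this driver (illustrative only, not part of the driver; error handling
 * is omitted, and the clone device path and interface name are assumed):
 *
 *	int fd = open("/dev/bpf", O_RDWR);	(clones a new minor)
 *	struct ifreq ifr;
 *	(void) strlcpy(ifr.ifr_name, "net0", sizeof (ifr.ifr_name));
 *	(void) ioctl(fd, BIOCSETIF, &ifr);	(bind to an interface)
 *	uint_t blen;
 *	(void) ioctl(fd, BIOCGBLEN, &blen);	(learn the buffer size)
 *	char *buf = malloc(blen);
 *	ssize_t n = read(fd, buf, blen);	(must be exactly blen)
 *
 * Each read() returns zero or more BPF_WORDALIGN()'ed records, each a
 * struct bpf_hdr followed by captured packet bytes; see catchpacket().
 */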
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/filio.h>
#include <sys/policy.h>
#include <sys/cmn_err.h>
#include <sys/uio.h>
#include <sys/file.h>
#include <sys/sysmacros.h>
#include <sys/zone.h>

#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/poll.h>
#include <sys/dlpi.h>
#include <sys/neti.h>

#include <net/if.h>

#include <net/bpf.h>
#include <net/bpfdesc.h>
#include <net/dlt.h>

#include <netinet/in.h>
#include <sys/mac.h>
#include <sys/mac_client.h>
#include <sys/mac_impl.h>
#include <sys/time_std_impl.h>
#include <sys/hook.h>
#include <sys/hook_event.h>


#define	mtod(_v, _t)	(_t)((_v)->b_rptr)
#define	M_LEN(_m)	((_m)->b_wptr - (_m)->b_rptr)

/*
 * 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet
 * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k).
 */
#define	BPF_BUFSIZE (32 * 1024)

typedef void *(*cp_fn_t)(void *, const void *, size_t);

/*
 * The default read buffer size, and limit for BIOCSBLEN.
 */
int bpf_bufsize = BPF_BUFSIZE;
int bpf_maxbufsize = (16 * 1024 * 1024);
int bpf_debug = 0;
mod_hash_t *bpf_hash = NULL;

/*
 * Use a mutex to avoid a race condition between gathering the stats/peers
 * and opening/closing the device.
 */
static kcondvar_t bpf_dlt_waiter;
static kmutex_t bpf_mtx;
static bpf_kstats_t ks_stats;
static bpf_kstats_t bpf_kstats = {
	{ "readWait",		KSTAT_DATA_UINT64 },
	{ "writeOk",		KSTAT_DATA_UINT64 },
	{ "writeError",		KSTAT_DATA_UINT64 },
	{ "receive",		KSTAT_DATA_UINT64 },
	{ "captured",		KSTAT_DATA_UINT64 },
	{ "dropped",		KSTAT_DATA_UINT64 },
};
static kstat_t *bpf_ksp;

/*
 * bpf_iflist is the list of interfaces; each corresponds to an ifnet
 * bpf_dtab holds the descriptors, indexed by minor device #
 */
TAILQ_HEAD(, bpf_if) bpf_iflist;
LIST_HEAD(, bpf_d) bpf_list;

static int	bpf_allocbufs(struct bpf_d *);
static void	bpf_clear_timeout(struct bpf_d *);
static void	bpf_debug_nic_action(char *, struct bpf_if *);
static void	bpf_deliver(struct bpf_d *, cp_fn_t,
		    void *, uint_t, uint_t, boolean_t);
static struct bpf_if *
		bpf_findif(struct bpf_d *, char *, int);
static void	bpf_freed(struct bpf_d *);
static int	bpf_ifname(struct bpf_d *d, char *, int);
static void	*bpf_mcpy(void *, const void *, size_t);
static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
static void	bpf_detachd(struct bpf_d *);
static int	bpf_setif(struct bpf_d *, char *, int);
static void	bpf_timed_out(void *);
static inline void
		bpf_wakeup(struct bpf_d *);
static void	catchpacket(struct bpf_d *, uchar_t *, uint_t, uint_t,
		    cp_fn_t, struct timeval *);
static void	reset_d(struct bpf_d *);
static int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
static int	bpf_setdlt(struct bpf_d *, void *);
static void	bpf_dev_add(struct bpf_d *);
static struct bpf_d *bpf_dev_find(minor_t);
static struct bpf_d *bpf_dev_get(minor_t);
static void	bpf_dev_remove(struct bpf_d *);

static int
bpf_movein(struct uio *uio, int linktype, int mtu, mblk_t **mp)
{
	mblk_t *m;
	int error;
	int len;
	int hlen;
	int align;

	/*
	 * Determine the link-level header length for this data link
	 * type so that the payload that follows the header can be
	 * aligned on a 4-byte boundary.  The caller supplies the
	 * complete frame, link-level header included, in the uio.
	 */
	switch (linktype) {

	case DLT_EN10MB:
		hlen = sizeof (struct ether_header);
		break;

	case DLT_FDDI:
		hlen = 16;
		break;

	case DLT_NULL:
		hlen = 0;
		break;

	case DLT_IPOIB:
		hlen = 44;
		break;

	default:
		return (EIO);
	}

	align = 4 - (hlen & 3);

	len = uio->uio_resid;
	/*
	 * If there aren't enough bytes for a link level header or the
	 * packet length exceeds the interface mtu, return an error.
	 */
	if (len < hlen || len - hlen > mtu)
		return (EMSGSIZE);

	m = allocb(len + align, BPRI_MED);
	if (m == NULL) {
		error = ENOBUFS;
		goto bad;
	}

	/* Ensure the data is properly aligned */
	if (align > 0)
		m->b_rptr += align;
	m->b_wptr = m->b_rptr + len;

	error = uiomove(mtod(m, void *), len, UIO_WRITE, uio);
	if (error)
		goto bad;
	*mp = m;
	return (0);

bad:
	if (m != NULL)
		freemsg(m);
	return (error);
}


/*
 * Attach file to the bpf interface, i.e. make d listen on bp.
 */
static void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
	uintptr_t mh = bp->bif_ifp;

	ASSERT(bp != NULL);
	ASSERT(d->bd_bif == NULL);
	/*
	 * Point d at bp, and add d to the interface's list of listeners.
	 * Finally, point the driver's bpf cookie at the interface so
	 * it will divert packets to bpf.
	 *
	 * Note: Although this results in what looks like a lock order
	 * reversal (bd_lock is held), the deadlock threat is not present
	 * because the descriptor is not attached to any interface and
	 * therefore there cannot be a packet waiting on bd_lock in
	 * catchpacket.
	 */
	mutex_enter(&bp->bif_lock);
	d->bd_bif = bp;
	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
	mutex_exit(&bp->bif_lock);

	if (MBPF_CLIENT_OPEN(&bp->bif_mac, mh, &d->bd_mcip) == 0)
		(void) MBPF_PROMISC_ADD(&bp->bif_mac, d->bd_mcip, 0, d,
		    &d->bd_promisc_handle, d->bd_promisc_flags);
}
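/*
 * A summary of the locking used in this file, gathered from the comments
 * in the surrounding functions (a sketch for the reader, not new rules):
 * the established order is bpf_mtx before bd_lock, and bd_lock must not
 * be held across calls that can re-enter bpf via mac or the callout
 * subsystem (MBPF_PROMISC_REMOVE, untimeout), since bpf_deliver() and
 * catchpacket() take bd_lock from those paths.  bif_lock only protects
 * bif_dlist and is always taken last.
 */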
/*
 * Detach a file from its interface.
 */
static void
bpf_detachd(struct bpf_d *d)
{
	struct bpf_if *bp;
	uintptr_t mph;
	uintptr_t mch;

	mch = d->bd_mcip;
	d->bd_mcip = 0;
	bp = d->bd_bif;
	ASSERT(bp != NULL);

	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.  There's no need to take any action
	 * here, that is done when MBPF_PROMISC_REMOVE is used;
	 * bd_promisc is just a local flag to stop promiscuous mode
	 * from being set more than once.
	 */
	if (d->bd_promisc)
		d->bd_promisc = 0;

	/*
	 * Take device out of "promiscuous" mode.  Since we were able to
	 * enter "promiscuous" mode, we should be able to turn it off.
	 * Note, this field stores a pointer used to support both
	 * promiscuous and non-promiscuous callbacks for packets.
	 */
	mph = d->bd_promisc_handle;
	d->bd_promisc_handle = 0;

	/*
	 * The lock has to be dropped here because mac_promisc_remove may
	 * need to wait for mac_promisc_dispatch, which has called into
	 * bpf and catchpacket is waiting for bd_lock...
	 * i.e. mac_promisc_remove() needs to be called with none of the
	 * locks held that are part of the bpf_mtap() call path.
	 */
	mutex_exit(&d->bd_lock);
	if (mph != 0)
		MBPF_PROMISC_REMOVE(&bp->bif_mac, mph);

	if (mch != 0)
		MBPF_CLIENT_CLOSE(&bp->bif_mac, mch);

	/*
	 * bd_lock must remain dropped until this function has finished
	 * with bif_lock, otherwise there's a lock order reversal with
	 * bpf_deliver and the system can deadlock.
	 *
	 * Remove d from the interface's descriptor list.
	 */
	mutex_enter(&bp->bif_lock);
	LIST_REMOVE(d, bd_next);
	mutex_exit(&bp->bif_lock);

	/*
	 * This function is called with bd_lock held, so it must exit
	 * with it held.
	 */
	mutex_enter(&d->bd_lock);
	/*
	 * bd_bif cannot be cleared until after the promisc callback has been
	 * removed.
	 */
	d->bd_bif = 0;
}


/*
 * bpfilterattach() is called at load time.
 */
int
bpfilterattach(void)
{

	bpf_hash = mod_hash_create_idhash("bpf_dev_tab", 31,
	    mod_hash_null_keydtor);
	if (bpf_hash == NULL)
		return (ENOMEM);

	(void) memcpy(&ks_stats, &bpf_kstats, sizeof (bpf_kstats));

	bpf_ksp = kstat_create("bpf", 0, "global", "misc",
	    KSTAT_TYPE_NAMED, sizeof (bpf_kstats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (bpf_ksp != NULL) {
		bpf_ksp->ks_data = &ks_stats;
		kstat_install(bpf_ksp);
	} else {
		mod_hash_destroy_idhash(bpf_hash);
		bpf_hash = NULL;
		return (EEXIST);
	}

	cv_init(&bpf_dlt_waiter, NULL, CV_DRIVER, NULL);
	mutex_init(&bpf_mtx, NULL, MUTEX_DRIVER, NULL);

	LIST_INIT(&bpf_list);
	TAILQ_INIT(&bpf_iflist);

	return (0);
}


/*
 * bpfilterdetach() is called at unload time.
 */
int
bpfilterdetach(void)
{
	struct bpf_if *bp;

	if (bpf_ksp != NULL) {
		kstat_delete(bpf_ksp);
		bpf_ksp = NULL;
	}

	/*
	 * No attach/detach callbacks can arrive from mac at this point,
	 * so it is safe to walk bpf_iflist without a lock.
	 */
	while ((bp = TAILQ_FIRST(&bpf_iflist)) != NULL)
		bpfdetach(bp->bif_ifp);

	mutex_enter(&bpf_mtx);
	if (!LIST_EMPTY(&bpf_list)) {
		mutex_exit(&bpf_mtx);
		return (EBUSY);
	}
	mutex_exit(&bpf_mtx);

	mod_hash_destroy_idhash(bpf_hash);
	bpf_hash = NULL;

	cv_destroy(&bpf_dlt_waiter);
	mutex_destroy(&bpf_mtx);

	return (0);
}

/*
 * Open the bpf device.  Each open clones a new minor device.
 */
/* ARGSUSED */
int
bpfopen(dev_t *devp, int flag, int mode, cred_t *cred)
{
	struct bpf_d *d;
	uint_t dmin;

	/*
	 * The security policy described at the top of this file is
	 * enforced here.
	 */
	if ((flag & FWRITE) != 0) {
		if (secpolicy_net_rawaccess(cred) != 0)
			return (EACCES);
	}

	if ((flag & FREAD) != 0) {
		if ((secpolicy_net_observability(cred) != 0) &&
		    (secpolicy_net_rawaccess(cred) != 0))
			return (EACCES);
	}

	if ((flag & (FWRITE|FREAD)) == 0)
		return (ENXIO);

	/*
	 * If BPF is being opened from a non-global zone, trigger a call
	 * back into the driver to see if it needs to initialise local
	 * state in a zone.
	 */
	if (crgetzoneid(cred) != GLOBAL_ZONEID)
		bpf_open_zone(crgetzoneid(cred));

	/*
	 * A structure is allocated per open file in BPF to store settings
	 * such as buffer capture size, provide private buffers, etc.
	 */
	d = (struct bpf_d *)kmem_zalloc(sizeof (*d), KM_SLEEP);
	d->bd_bufsize = bpf_bufsize;
	d->bd_fmode = flag;
	d->bd_zone = crgetzoneid(cred);
	d->bd_seesent = 1;
	d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_PHYS|
	    MAC_PROMISC_FLAGS_NO_COPY;
	mutex_init(&d->bd_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&d->bd_wait, NULL, CV_DRIVER, NULL);

	mutex_enter(&bpf_mtx);
	/*
	 * Find an unused minor number.  Obviously this is an O(n)
	 * algorithm and doesn't scale particularly well, so if there
	 * are large numbers of open file descriptors happening in
	 * real use, this design may need to be revisited.
	 */
	for (dmin = 0; dmin < L_MAXMIN; dmin++)
		if (bpf_dev_find(dmin) == NULL)
			break;
	if (dmin == L_MAXMIN) {
		mutex_exit(&bpf_mtx);
		kmem_free(d, sizeof (*d));
		return (ENXIO);
	}
	d->bd_dev = dmin;
	LIST_INSERT_HEAD(&bpf_list, d, bd_list);
	bpf_dev_add(d);
	mutex_exit(&bpf_mtx);

	*devp = makedevice(getmajor(*devp), dmin);

	return (0);
}

/*
 * Close the descriptor by detaching it from its interface,
 * deallocating its buffers, and marking it free.
 *
 * Because a device can only be opened once per minor, there is always
 * a 1 to 1 relationship between opens and closes supporting this
 * function.
 */
/* ARGSUSED */
int
bpfclose(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));

	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING)
		bpf_clear_timeout(d);
	d->bd_state = BPF_IDLE;
	if (d->bd_bif)
		bpf_detachd(d);
	mutex_exit(&d->bd_lock);

	mutex_enter(&bpf_mtx);
	LIST_REMOVE(d, bd_list);
	bpf_dev_remove(d);
	mutex_exit(&bpf_mtx);

	mutex_enter(&d->bd_lock);
	mutex_destroy(&d->bd_lock);
	cv_destroy(&d->bd_wait);

	bpf_freed(d);
	kmem_free(d, sizeof (*d));

	return (0);
}

/*
 * Rotate the packet buffers in descriptor d.  Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 */
#define	ROTATE_BUFFERS(d) \
	(d)->bd_hbuf = (d)->bd_sbuf; \
	(d)->bd_hlen = (d)->bd_slen; \
	(d)->bd_sbuf = (d)->bd_fbuf; \
	(d)->bd_slen = 0; \
	(d)->bd_fbuf = 0;
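/*
 * For reference (a descriptive note, not a behavioural change): each
 * descriptor cycles three equally sized buffers.  Packets are appended
 * to the store buffer (bd_sbuf/bd_slen); a completed buffer is parked
 * in the hold slot (bd_hbuf/bd_hlen) until read() drains it; and the
 * free slot (bd_fbuf) holds the buffer that will be stored into next.
 * If both the hold and store buffers are full, bd_fbuf is 0 and new
 * packets are dropped in catchpacket().
 */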
/*
 * bpfread - read next chunk of packets from buffers
 */
/* ARGSUSED */
int
bpfread(dev_t dev, struct uio *uio, cred_t *cred)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));
	int timed_out;
	ulong_t delay;
	int error;

	if ((d->bd_fmode & FREAD) == 0)
		return (EBADF);

	/*
	 * Restrict application to use a buffer the same size as
	 * the kernel buffers.
	 */
	if (uio->uio_resid != d->bd_bufsize)
		return (EINVAL);

	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING)
		bpf_clear_timeout(d);
	timed_out = (d->bd_state == BPF_TIMED_OUT);
	d->bd_state = BPF_IDLE;
	/*
	 * If the hold buffer is empty, then do a timed sleep, which
	 * ends when the timeout expires or when enough packets
	 * have arrived to fill the store buffer.
	 */
	while (d->bd_hbuf == 0) {
		if (d->bd_nonblock) {
			if (d->bd_slen == 0) {
				mutex_exit(&d->bd_lock);
				return (EWOULDBLOCK);
			}
			ROTATE_BUFFERS(d);
			break;
		}

		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
			/*
			 * One or more packets arrived either since the
			 * previous read or while we were asleep.
			 * Rotate the buffers and return what's here.
			 */
			ROTATE_BUFFERS(d);
			break;
		}
		ks_stats.kp_read_wait.value.ui64++;
		delay = ddi_get_lbolt() + d->bd_rtout;
		error = cv_timedwait_sig(&d->bd_wait, &d->bd_lock, delay);
		if (error == 0) {
			mutex_exit(&d->bd_lock);
			return (EINTR);
		}
		if (error == -1) {
			/*
			 * On a timeout, return what's in the buffer,
			 * which may be nothing.  If there is something
			 * in the store buffer, we can rotate the buffers.
			 */
			if (d->bd_hbuf)
				/*
				 * We filled up the buffer in between
				 * getting the timeout and arriving
				 * here, so we don't need to rotate.
				 */
				break;

			if (d->bd_slen == 0) {
				mutex_exit(&d->bd_lock);
				return (0);
			}
			ROTATE_BUFFERS(d);
		}
	}
	/*
	 * At this point, we know we have something in the hold slot.
	 */
	mutex_exit(&d->bd_lock);

	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bpf_bufsize bytes.
	 */
	error = uiomove(d->bd_hbuf, d->bd_hlen, UIO_READ, uio);

	mutex_enter(&d->bd_lock);
	d->bd_fbuf = d->bd_hbuf;
	d->bd_hbuf = 0;
	d->bd_hlen = 0;
done:
	mutex_exit(&d->bd_lock);
	return (error);
}


/*
 * If there are processes sleeping on this descriptor, wake them up.
 * NOTE: the lock for bd_wait is bd_lock and is held by bpf_deliver,
 * so there is no code here grabbing it.
 */
static inline void
bpf_wakeup(struct bpf_d *d)
{
	cv_signal(&d->bd_wait);
}

static void
bpf_timed_out(void *arg)
{
	struct bpf_d *d = arg;

	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING) {
		d->bd_state = BPF_TIMED_OUT;
		if (d->bd_slen != 0)
			cv_signal(&d->bd_wait);
	}
	mutex_exit(&d->bd_lock);
}
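/*
 * A note on bd_inuse/bd_waiting (descriptive, inferred from the uses
 * below): bd_inuse counts threads actively using the descriptor's
 * binding.  Writers increment it (> 0) while transmitting so that the
 * binding cannot change mid-write; bpf_setif() sets it to -1 to claim
 * exclusive use while rebinding.  Any thread that finds the value
 * unsuitable bumps bd_waiting and sleeps on bd_wait until signalled.
 */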
/* ARGSUSED */
int
bpfwrite(dev_t dev, struct uio *uio, cred_t *cred)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));
	struct bpf_if *bp;
	uintptr_t mch;
	uintptr_t ifp;
	uint_t mtu;
	mblk_t *m;
	int error;
	int dlt;

	if ((d->bd_fmode & FWRITE) == 0)
		return (EBADF);

	mutex_enter(&d->bd_lock);
	if (d->bd_bif == 0 || d->bd_mcip == 0 || d->bd_bif->bif_ifp == 0) {
		mutex_exit(&d->bd_lock);
		return (EINTR);
	}

	if (uio->uio_resid == 0) {
		mutex_exit(&d->bd_lock);
		return (0);
	}

	while (d->bd_inuse < 0) {
		d->bd_waiting++;
		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
			d->bd_waiting--;
			mutex_exit(&d->bd_lock);
			return (EINTR);
		}
		d->bd_waiting--;
	}

	mutex_exit(&d->bd_lock);

	bp = d->bd_bif;
	dlt = bp->bif_dlt;
	mch = d->bd_mcip;
	ifp = bp->bif_ifp;
	MBPF_SDU_GET(&bp->bif_mac, ifp, &mtu);
	d->bd_inuse++;

	m = NULL;
	if (dlt == DLT_IPNET) {
		error = EIO;
		goto done;
	}

	error = bpf_movein(uio, dlt, mtu, &m);
	if (error)
		goto done;

	DTRACE_PROBE5(bpf__tx, struct bpf_d *, d, struct bpf_if *, bp,
	    int, dlt, uint_t, mtu, mblk_t *, m);

	if (M_LEN(m) > mtu) {
		error = EMSGSIZE;
		goto done;
	}

	error = MBPF_TX(&bp->bif_mac, mch, m);
	/*
	 * The "tx" action here is required to consume the mblk_t.
	 */
	m = NULL;

done:
	if (error == 0)
		ks_stats.kp_write_ok.value.ui64++;
	else
		ks_stats.kp_write_error.value.ui64++;
	if (m != NULL)
		freemsg(m);

	mutex_enter(&d->bd_lock);
	d->bd_inuse--;
	if ((d->bd_inuse == 0) && (d->bd_waiting != 0))
		cv_signal(&d->bd_wait);
	mutex_exit(&d->bd_lock);

	/*
	 * The driver frees the mblk_t.
	 */
	return (error);
}


/*
 * Reset a descriptor by flushing its packet buffer and clearing the
 * receive and drop counts.  The caller must hold bd_lock.
 */
static void
reset_d(struct bpf_d *d)
{
	if (d->bd_hbuf) {
		/* Free the hold buffer. */
		d->bd_fbuf = d->bd_hbuf;
		d->bd_hbuf = 0;
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	d->bd_rcount = 0;
	d->bd_dcount = 0;
	d->bd_ccount = 0;
}

/*
 * FIONREAD		Check for read packet available.
 * FIONBIO		Set/clear non-blocking I/O.
 * BIOCGBLEN		Get buffer len [for read()].
 * BIOCSBLEN		Set buffer len.
 * BIOCSETF		Set ethernet read filter.
 * BIOCFLUSH		Flush read packet buffer.
 * BIOCPROMISC		Put interface into promiscuous mode.
 * BIOCGDLT		Get link layer type.
 * BIOCGDLTLIST		Get list of supported link layer types.
 * BIOCSDLT		Set link layer type.
 * BIOCGETIF		Get interface name.
 * BIOCSETIF		Set interface.
 * BIOCSRTIMEOUT	Set read timeout.
 * BIOCGRTIMEOUT	Get read timeout.
 * BIOCGSTATS		Get packet stats.
 * BIOCIMMEDIATE	Set immediate mode.
 * BIOCVERSION		Get filter language version.
 * BIOCGHDRCMPLT	Get "header already complete" flag.
 * BIOCSHDRCMPLT	Set "header already complete" flag.
 * BIOCGSEESENT		Get "see sent packets" flag.
 * BIOCSSEESENT		Set "see sent packets" flag.
 */
/* ARGSUSED */
int
bpfioctl(dev_t dev, int cmd, intptr_t addr, int mode, cred_t *cred, int *rval)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));
	struct bpf_program prog;
	struct lifreq lifreq;
	struct ifreq ifreq;
	int error = 0;
	uint_t size;

	/*
	 * Refresh the PID associated with this bpf file.
	 */
	mutex_enter(&d->bd_lock);
	if (d->bd_state == BPF_WAITING)
		bpf_clear_timeout(d);
	d->bd_state = BPF_IDLE;
	mutex_exit(&d->bd_lock);

	switch (cmd) {

	default:
		error = EINVAL;
		break;

	/*
	 * Check for read packet available.
	 */
	case FIONREAD:
	{
		int n;

		mutex_enter(&d->bd_lock);
		n = d->bd_slen;
		if (d->bd_hbuf)
			n += d->bd_hlen;
		mutex_exit(&d->bd_lock);

		*(int *)addr = n;
		break;
	}

	/*
	 * Get buffer len [for read()].
	 */
	case BIOCGBLEN:
		error = copyout(&d->bd_bufsize, (void *)addr,
		    sizeof (d->bd_bufsize));
		break;

	/*
	 * Set buffer length.
	 */
	case BIOCSBLEN:
		if (copyin((void *)addr, &size, sizeof (size)) != 0) {
			error = EFAULT;
			break;
		}

		mutex_enter(&d->bd_lock);
		if (d->bd_bif != 0) {
			error = EINVAL;
		} else {
			if (size > bpf_maxbufsize)
				size = bpf_maxbufsize;
			else if (size < BPF_MINBUFSIZE)
				size = BPF_MINBUFSIZE;

			d->bd_bufsize = size;
		}
		mutex_exit(&d->bd_lock);

		if (error == 0)
			error = copyout(&size, (void *)addr, sizeof (size));
		break;

	/*
	 * Set link layer read filter.
	 */
	case BIOCSETF:
		if (ddi_copyin((void *)addr, &prog, sizeof (prog), mode)) {
			error = EFAULT;
			break;
		}
		error = bpf_setf(d, &prog);
		break;

	/*
	 * Flush read packet buffer.
	 */
	case BIOCFLUSH:
		mutex_enter(&d->bd_lock);
		reset_d(d);
		mutex_exit(&d->bd_lock);
		break;

	/*
	 * Put interface into promiscuous mode.
	 * This is a one-way ioctl, it is not used to turn promiscuous
	 * mode off.
	 */
	case BIOCPROMISC:
		if (d->bd_bif == 0) {
			/*
			 * No interface attached yet.
			 */
			error = EINVAL;
			break;
		}
		mutex_enter(&d->bd_lock);
		if (d->bd_promisc == 0) {

			if (d->bd_promisc_handle) {
				uintptr_t mph;

				mph = d->bd_promisc_handle;
				d->bd_promisc_handle = 0;

				mutex_exit(&d->bd_lock);
				MBPF_PROMISC_REMOVE(&d->bd_bif->bif_mac, mph);
				mutex_enter(&d->bd_lock);
			}

			d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_COPY;
			error = MBPF_PROMISC_ADD(&d->bd_bif->bif_mac,
			    d->bd_mcip, MAC_CLIENT_PROMISC_ALL, d,
			    &d->bd_promisc_handle, d->bd_promisc_flags);
			if (error == 0)
				d->bd_promisc = 1;
		}
		mutex_exit(&d->bd_lock);
		break;

	/*
	 * Get device parameters.
	 */
	case BIOCGDLT:
		if (d->bd_bif == 0)
			error = EINVAL;
		else
			error = copyout(&d->bd_bif->bif_dlt, (void *)addr,
			    sizeof (d->bd_bif->bif_dlt));
		break;

	/*
	 * Get a list of supported device parameters.
	 */
	case BIOCGDLTLIST:
		if (d->bd_bif == 0) {
			error = EINVAL;
		} else {
			struct bpf_dltlist list;

			if (copyin((void *)addr, &list, sizeof (list)) != 0) {
				error = EFAULT;
				break;
			}
			error = bpf_getdltlist(d, &list);
			if ((error == 0) &&
			    copyout(&list, (void *)addr, sizeof (list)) != 0)
				error = EFAULT;
		}
		break;

	/*
	 * Set device parameters.
	 */
	case BIOCSDLT:
		error = bpf_setdlt(d, (void *)addr);
		break;

	/*
	 * Get interface name.
	 */
	case BIOCGETIF:
		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
			error = EFAULT;
			break;
		}
		error = bpf_ifname(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
		if ((error == 0) &&
		    copyout(&ifreq, (void *)addr, sizeof (ifreq)) != 0) {
			error = EFAULT;
			break;
		}
		break;

	/*
	 * Set interface.
	 */
	case BIOCSETIF:
		if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) {
			error = EFAULT;
			break;
		}
		error = bpf_setif(d, ifreq.ifr_name, sizeof (ifreq.ifr_name));
		break;

	/*
	 * Get interface name (long form, using struct lifreq).
	 */
	case BIOCGETLIF:
		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
			error = EFAULT;
			break;
		}
		error = bpf_ifname(d, lifreq.lifr_name,
		    sizeof (lifreq.lifr_name));
		if ((error == 0) &&
		    copyout(&lifreq, (void *)addr, sizeof (lifreq)) != 0) {
			error = EFAULT;
			break;
		}
		break;

	/*
	 * Set interface (long form, using struct lifreq).
	 */
	case BIOCSETLIF:
		if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) {
			error = EFAULT;
			break;
		}
		error = bpf_setif(d, lifreq.lifr_name,
		    sizeof (lifreq.lifr_name));
		break;

#ifdef _SYSCALL32_IMPL
	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT32:
	{
		struct timeval32 tv;

		if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
			error = EFAULT;
			break;
		}

		/* Convert the timeout in microseconds to ticks */
		d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
		    tv.tv_usec);
		if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
			d->bd_rtout = 1;
		break;
	}

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT32:
	{
		struct timeval32 tv;
		clock_t ticks;

		ticks = drv_hztousec(d->bd_rtout);
		tv.tv_sec = ticks / 1000000;
		tv.tv_usec = ticks - (tv.tv_sec * 1000000);
		error = copyout(&tv, (void *)addr, sizeof (tv));
		break;
	}

	/*
	 * Get a list of supported device parameters.
	 */
	case BIOCGDLTLIST32:
		if (d->bd_bif == 0) {
			error = EINVAL;
		} else {
			struct bpf_dltlist32 lst32;
			struct bpf_dltlist list;

			if (copyin((void *)addr, &lst32, sizeof (lst32)) != 0) {
				error = EFAULT;
				break;
			}

			list.bfl_len = lst32.bfl_len;
			list.bfl_list = (void *)(uint64_t)lst32.bfl_list;
			error = bpf_getdltlist(d, &list);
			if (error == 0) {
				lst32.bfl_len = list.bfl_len;

				if (copyout(&lst32, (void *)addr,
				    sizeof (lst32)) != 0)
					error = EFAULT;
			}
		}
		break;

	/*
	 * Set link layer read filter.
	 */
	case BIOCSETF32: {
		struct bpf_program32 prog32;

		if (ddi_copyin((void *)addr, &prog32, sizeof (prog32), mode)) {
			error = EFAULT;
			break;
		}
		prog.bf_len = prog32.bf_len;
		prog.bf_insns = (void *)(uint64_t)prog32.bf_insns;
		error = bpf_setf(d, &prog);
		break;
	}
#endif

	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT:
	{
		struct timeval tv;

		if (copyin((void *)addr, &tv, sizeof (tv)) != 0) {
			error = EFAULT;
			break;
		}

		/* Convert the timeout in microseconds to ticks */
		d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 +
		    tv.tv_usec);
		if ((d->bd_rtout == 0) && (tv.tv_usec != 0))
			d->bd_rtout = 1;
		break;
	}

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT:
	{
		struct timeval tv;
		clock_t ticks;

		ticks = drv_hztousec(d->bd_rtout);
		tv.tv_sec = ticks / 1000000;
		tv.tv_usec = ticks - (tv.tv_sec * 1000000);
		if (copyout(&tv, (void *)addr, sizeof (tv)) != 0)
			error = EFAULT;
		break;
	}

	/*
	 * Get packet stats.
	 */
	case BIOCGSTATS:
	{
		struct bpf_stat bs;

		bs.bs_recv = d->bd_rcount;
		bs.bs_drop = d->bd_dcount;
		bs.bs_capt = d->bd_ccount;
		if (copyout(&bs, (void *)addr, sizeof (bs)) != 0)
			error = EFAULT;
		break;
	}

	/*
	 * Set immediate mode.
	 */
	case BIOCIMMEDIATE:
		if (copyin((void *)addr, &d->bd_immediate,
		    sizeof (d->bd_immediate)) != 0)
			error = EFAULT;
		break;

	case BIOCVERSION:
	{
		struct bpf_version bv;

		bv.bv_major = BPF_MAJOR_VERSION;
		bv.bv_minor = BPF_MINOR_VERSION;
		if (copyout(&bv, (void *)addr, sizeof (bv)) != 0)
			error = EFAULT;
		break;
	}

	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
		if (copyout(&d->bd_hdrcmplt, (void *)addr,
		    sizeof (d->bd_hdrcmplt)) != 0)
			error = EFAULT;
		break;

	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
		if (copyin((void *)addr, &d->bd_hdrcmplt,
		    sizeof (d->bd_hdrcmplt)) != 0)
			error = EFAULT;
		break;

	/*
	 * Get "see sent packets" flag
	 */
	case BIOCGSEESENT:
		if (copyout(&d->bd_seesent, (void *)addr,
		    sizeof (d->bd_seesent)) != 0)
			error = EFAULT;
		break;

	/*
	 * Set "see sent packets" flag
	 */
	case BIOCSSEESENT:
		if (copyin((void *)addr, &d->bd_seesent,
		    sizeof (d->bd_seesent)) != 0)
			error = EFAULT;
		break;

	case FIONBIO:		/* Non-blocking I/O */
		if (copyin((void *)addr, &d->bd_nonblock,
		    sizeof (d->bd_nonblock)) != 0)
			error = EFAULT;
		break;
	}
	return (error);
}
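/*
 * For illustration, the smallest useful filter a caller might hand to
 * bpf_setf() via BIOCSETF is the classic "accept the whole packet"
 * program (a sketch of the caller's side, not driver code):
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET|BPF_K, (uint_t)-1),
 *	};
 *	struct bpf_program prog = { 1, insns };
 *	(void) ioctl(fd, BIOCSETF, &prog);
 *
 * A return value of 0 from the filter means "drop"; any other value is
 * the number of bytes of the packet to capture (see bpf_filter()).
 */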
/*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  If the new filter is "empty" (has a 0 size), then
 * the result is to just remove and free the existing filter.
 * Returns EINVAL for bogus requests.
 */
int
bpf_setf(struct bpf_d *d, struct bpf_program *fp)
{
	struct bpf_insn *fcode, *old;
	uint_t flen, size;
	size_t oldsize;

	if (fp->bf_insns == 0) {
		if (fp->bf_len != 0)
			return (EINVAL);
		mutex_enter(&d->bd_lock);
		old = d->bd_filter;
		oldsize = d->bd_filter_size;
		d->bd_filter = 0;
		d->bd_filter_size = 0;
		reset_d(d);
		mutex_exit(&d->bd_lock);
		if (old != 0)
			kmem_free(old, oldsize);
		return (0);
	}
	flen = fp->bf_len;
	if (flen > BPF_MAXINSNS)
		return (EINVAL);

	size = flen * sizeof (*fp->bf_insns);
	fcode = kmem_alloc(size, KM_SLEEP);
	if (copyin(fp->bf_insns, fcode, size) != 0) {
		kmem_free(fcode, size);
		return (EFAULT);
	}

	if (bpf_validate(fcode, (int)flen)) {
		mutex_enter(&d->bd_lock);
		old = d->bd_filter;
		oldsize = d->bd_filter_size;
		d->bd_filter = fcode;
		d->bd_filter_size = size;
		reset_d(d);
		mutex_exit(&d->bd_lock);
		if (old != 0)
			kmem_free(old, oldsize);

		return (0);
	}
	kmem_free(fcode, size);
	return (EINVAL);
}

/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifr.
 * Return an errno or 0.
 */
static int
bpf_setif(struct bpf_d *d, char *ifname, int namesize)
{
	struct bpf_if *bp;
	int unit_seen;
	char *cp;
	int i;

	/*
	 * Make sure the provided name has a unit number, and default
	 * it to '0' if not specified.
	 * XXX This is ugly ... do this differently?
	 */
	unit_seen = 0;
	cp = ifname;
	cp[namesize - 1] = '\0';	/* sanity */
	while (*cp++)
		if (*cp >= '0' && *cp <= '9')
			unit_seen = 1;
	if (!unit_seen) {
		/* Make sure to leave room for the '\0'. */
		for (i = 0; i < (namesize - 1); ++i) {
			if ((ifname[i] >= 'a' && ifname[i] <= 'z') ||
			    (ifname[i] >= 'A' && ifname[i] <= 'Z'))
				continue;
			ifname[i] = '0';
		}
	}

	/*
	 * Make sure that only one call to this function happens at a time
	 * and that we're not interleaving a read/write
	 */
	mutex_enter(&d->bd_lock);
	while (d->bd_inuse != 0) {
		d->bd_waiting++;
		if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) {
			d->bd_waiting--;
			mutex_exit(&d->bd_lock);
			return (EINTR);
		}
		d->bd_waiting--;
	}
	d->bd_inuse = -1;
	mutex_exit(&d->bd_lock);

	/*
	 * Look through attached interfaces for the named one.
	 *
	 * The search is done twice - once to match on the descriptor's
	 * zone, the name and the DLT, and a second time to let global
	 * zone snoopers find interfaces owned by other zones (see
	 * bpf_findif()).
	 */
	mutex_enter(&bpf_mtx);

	bp = bpf_findif(d, ifname, -1);

	if (bp != NULL) {
		int error = 0;

		if (d->bd_sbuf == 0)
			error = bpf_allocbufs(d);

		/*
		 * We found the requested interface.
		 * If we're already attached to requested interface,
		 * just flush the buffer.
		 */
		mutex_enter(&d->bd_lock);
		if (error == 0 && bp != d->bd_bif) {
			if (d->bd_bif)
				/*
				 * Detach if attached to something else.
				 */
				bpf_detachd(d);

			bpf_attachd(d, bp);
		}
		reset_d(d);
		d->bd_inuse = 0;
		if (d->bd_waiting != 0)
			cv_signal(&d->bd_wait);
		mutex_exit(&d->bd_lock);
		mutex_exit(&bpf_mtx);
		return (error);
	}

	mutex_enter(&d->bd_lock);
	d->bd_inuse = 0;
	if (d->bd_waiting != 0)
		cv_signal(&d->bd_wait);
	mutex_exit(&d->bd_lock);
	mutex_exit(&bpf_mtx);

	/*
	 * Try to tickle the mac layer into attaching the device...
	 */
	return (bpf_provider_tickle(ifname, d->bd_zone));
}

/*
 * Copy the interface name to the ifreq.
 */
static int
bpf_ifname(struct bpf_d *d, char *buffer, int bufsize)
{
	struct bpf_if *bp;

	mutex_enter(&d->bd_lock);
	bp = d->bd_bif;
	if (bp == NULL) {
		mutex_exit(&d->bd_lock);
		return (EINVAL);
	}

	(void) strlcpy(buffer, bp->bif_ifname, bufsize);
	mutex_exit(&d->bd_lock);

	return (0);
}

/*
 * Support for poll() system call
 *
 * Return true iff the specific operation will not block indefinitely - with
 * the assumption that it is safe to positively acknowledge a request for the
 * ability to write to the BPF device.
 * Otherwise, return false but make a note that a selnotify() must be done.
 */
int
bpfchpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	struct bpf_d *d = bpf_dev_get(getminor(dev));

	if (events & (POLLIN | POLLRDNORM)) {
		/*
		 * An imitation of the FIONREAD ioctl code.
		 */
		mutex_enter(&d->bd_lock);
		if (d->bd_hlen != 0 ||
		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
		    d->bd_slen != 0)) {
			*reventsp |= events & (POLLIN | POLLRDNORM);
		} else {
			*reventsp = 0;
			if (!anyyet)
				*phpp = &d->bd_poll;
			/* Start the read timeout if necessary */
			if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
				bpf_clear_timeout(d);
				/*
				 * Only allow the timeout to be set once.
				 */
1422 */ 1423 if (d->bd_callout == 0) 1424 d->bd_callout = timeout(bpf_timed_out, 1425 d, d->bd_rtout); 1426 d->bd_state = BPF_WAITING; 1427 } 1428 } 1429 mutex_exit(&d->bd_lock); 1430 } 1431 1432 return (0); 1433 } 1434 1435 /* 1436 * Copy data from an mblk_t chain into a buffer. This works for ipnet 1437 * because the dl_ipnetinfo_t is placed in an mblk_t that leads the 1438 * packet itself. 1439 */ 1440 static void * 1441 bpf_mcpy(void *dst_arg, const void *src_arg, size_t len) 1442 { 1443 const mblk_t *m; 1444 uint_t count; 1445 uchar_t *dst; 1446 1447 m = src_arg; 1448 dst = dst_arg; 1449 while (len > 0) { 1450 if (m == NULL) 1451 panic("bpf_mcpy"); 1452 count = (uint_t)min(M_LEN(m), len); 1453 (void) memcpy(dst, mtod(m, const void *), count); 1454 m = m->b_cont; 1455 dst += count; 1456 len -= count; 1457 } 1458 return (dst_arg); 1459 } 1460 1461 /* 1462 * Dispatch a packet to all the listeners on interface bp. 1463 * 1464 * marg pointer to the packet, either a data buffer or an mbuf chain 1465 * buflen buffer length, if marg is a data buffer 1466 * cpfn a function that can copy marg into the listener's buffer 1467 * pktlen length of the packet 1468 * issent boolean indicating whether the packet was sent or receive 1469 */ 1470 static inline void 1471 bpf_deliver(struct bpf_d *d, cp_fn_t cpfn, void *marg, uint_t pktlen, 1472 uint_t buflen, boolean_t issent) 1473 { 1474 struct timeval tv; 1475 uint_t slen; 1476 1477 if (!d->bd_seesent && issent) 1478 return; 1479 1480 /* 1481 * Accuracy of the packet counters in BPF is vital so it 1482 * is important to protect even the outer ones. 1483 */ 1484 mutex_enter(&d->bd_lock); 1485 slen = bpf_filter(d->bd_filter, marg, pktlen, buflen); 1486 DTRACE_PROBE5(bpf__packet, struct bpf_if *, d->bd_bif, 1487 struct bpf_d *, d, void *, marg, uint_t, pktlen, uint_t, slen); 1488 d->bd_rcount++; 1489 ks_stats.kp_receive.value.ui64++; 1490 if (slen != 0) { 1491 uniqtime(&tv); 1492 catchpacket(d, marg, pktlen, slen, cpfn, &tv); 1493 } 1494 mutex_exit(&d->bd_lock); 1495 } 1496 1497 /* 1498 * Incoming linkage from device drivers. 1499 */ 1500 /* ARGSUSED */ 1501 void 1502 bpf_mtap(void *arg, mac_resource_handle_t mrh, mblk_t *m, boolean_t issent) 1503 { 1504 cp_fn_t cpfn; 1505 struct bpf_d *d = arg; 1506 uint_t pktlen, buflen; 1507 void *marg; 1508 1509 pktlen = msgdsize(m); 1510 1511 if (pktlen == M_LEN(m)) { 1512 cpfn = (cp_fn_t)memcpy; 1513 marg = mtod(m, void *); 1514 buflen = pktlen; 1515 } else { 1516 cpfn = bpf_mcpy; 1517 marg = m; 1518 buflen = 0; 1519 } 1520 1521 bpf_deliver(d, cpfn, marg, pktlen, buflen, issent); 1522 } 1523 1524 /* 1525 * Incoming linkage from ipnet. 1526 * In ipnet, there is only one event, NH_OBSERVE, that delivers packets 1527 * from all network interfaces. Thus the tap function needs to apply a 1528 * filter using the interface index/id to immitate snoop'ing on just the 1529 * specified interface. 1530 */ 1531 /* ARGSUSED */ 1532 void 1533 bpf_itap(void *arg, mblk_t *m, boolean_t issent, uint_t length) 1534 { 1535 hook_pkt_observe_t *hdr; 1536 struct bpf_d *d = arg; 1537 1538 hdr = (hook_pkt_observe_t *)m->b_rptr; 1539 if (ntohl(hdr->hpo_ifindex) != d->bd_bif->bif_linkid) 1540 return; 1541 bpf_deliver(d, bpf_mcpy, m, length, 0, issent); 1542 1543 } 1544 1545 /* 1546 * Move the packet data from interface memory (pkt) into the 1547 * store buffer. Return 1 if it's time to wakeup a listener (buffer full), 1548 * otherwise 0. "copy" is the routine called to do the actual data 1549 * transfer. 
/*
 * Move the packet data from interface memory (pkt) into the store
 * buffer.  Wake up a pending read if a buffer fills.  "cpfn" is the
 * routine called to do the actual data transfer: memcpy is passed in
 * to copy contiguous chunks, while bpf_mcpy is passed in to copy
 * mblk_t chains.  In the latter case, pkt is really an mblk_t.
 */
static void
catchpacket(struct bpf_d *d, uchar_t *pkt, uint_t pktlen, uint_t snaplen,
    cp_fn_t cpfn, struct timeval *tv)
{
	struct bpf_hdr *hp;
	int totlen, curlen;
	int hdrlen = d->bd_bif->bif_hdrlen;
	int do_wakeup = 0;

	++d->bd_ccount;
	ks_stats.kp_capture.value.ui64++;
	/*
	 * Figure out how many bytes to move.  If the packet is
	 * greater or equal to the snapshot length, transfer that
	 * much.  Otherwise, transfer the whole packet (unless
	 * we hit the buffer size limit).
	 */
	totlen = hdrlen + min(snaplen, pktlen);
	if (totlen > d->bd_bufsize)
		totlen = d->bd_bufsize;

	/*
	 * Round up the end of the previous packet to the next longword.
	 */
	curlen = BPF_WORDALIGN(d->bd_slen);
	if (curlen + totlen > d->bd_bufsize) {
		/*
		 * This packet will overflow the storage buffer.
		 * Rotate the buffers if we can, then wakeup any
		 * pending reads.
		 */
		if (d->bd_fbuf == 0) {
			/*
			 * We haven't completed the previous read yet,
			 * so drop the packet.
			 */
			++d->bd_dcount;
			ks_stats.kp_dropped.value.ui64++;
			return;
		}
		ROTATE_BUFFERS(d);
		do_wakeup = 1;
		curlen = 0;
	} else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
		/*
		 * Immediate mode is set, or the read timeout has
		 * already expired during a select call.  A packet
		 * arrived, so the reader should be woken up.
		 */
		do_wakeup = 1;
	}

	/*
	 * Append the bpf header to the existing buffer before we add
	 * on the actual packet data.
	 */
	hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen);
	hp->bh_tstamp.tv_sec = tv->tv_sec;
	hp->bh_tstamp.tv_usec = tv->tv_usec;
	hp->bh_datalen = pktlen;
	hp->bh_hdrlen = (uint16_t)hdrlen;
	/*
	 * Copy the packet data into the store buffer and update its length.
	 */
	(*cpfn)((uchar_t *)hp + hdrlen, pkt,
	    (hp->bh_caplen = totlen - hdrlen));
	d->bd_slen = curlen + totlen;

	/*
	 * Call bpf_wakeup after bd_slen has been updated.
	 */
	if (do_wakeup)
		bpf_wakeup(d);
}

/*
 * Allocate the packet buffers used by a descriptor.
 */
static int
bpf_allocbufs(struct bpf_d *d)
{

	d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
	if (!d->bd_fbuf)
		return (ENOBUFS);
	d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP);
	if (!d->bd_sbuf) {
		kmem_free(d->bd_fbuf, d->bd_bufsize);
		return (ENOBUFS);
	}
	d->bd_slen = 0;
	d->bd_hlen = 0;
	return (0);
}

/*
 * Free buffers currently in use by a descriptor.
 * Called on close.
 */
static void
bpf_freed(struct bpf_d *d)
{
	/*
	 * At this point the descriptor has been detached from its
	 * interface and it hasn't yet been marked free.
	 */
	if (d->bd_sbuf != 0) {
		kmem_free(d->bd_sbuf, d->bd_bufsize);
		if (d->bd_hbuf != 0)
			kmem_free(d->bd_hbuf, d->bd_bufsize);
		if (d->bd_fbuf != 0)
			kmem_free(d->bd_fbuf, d->bd_bufsize);
	}
	if (d->bd_filter)
		kmem_free(d->bd_filter, d->bd_filter_size);
}
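/*
 * A worked example of the header padding computed in bpfattach() below
 * (the numbers assume SIZEOF_BPF_HDR is 18, which holds when struct
 * bpf_hdr has no trailing padding): for DLT_EN10MB the link header is
 * 14 bytes, so BPF_WORDALIGN(14 + 18) - 14 = 32 - 14 = 18.  A captured
 * record then starts with 18 bytes of bpf_hdr followed by the 14-byte
 * Ethernet header, leaving the IP header at offset 32 - longword
 * aligned, as catchpacket() requires.
 */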
1673 * 1674 * The zoneid is passed in explicitly to prevent the need to 1675 * do a lookup in dls using the linkid. Such a lookup would need 1676 * to use the same hash table that gets used for walking when 1677 * dls_set_bpfattach() is called. 1678 */ 1679 void 1680 bpfattach(uintptr_t ifp, int dlt, zoneid_t zoneid, int provider) 1681 { 1682 bpf_provider_t *bpr; 1683 struct bpf_if *bp; 1684 uintptr_t client; 1685 int hdrlen; 1686 1687 bpr = bpf_find_provider_by_id(provider); 1688 if (bpr == NULL) { 1689 if (bpf_debug) 1690 cmn_err(CE_WARN, "bpfattach: unknown provider %d", 1691 provider); 1692 return; 1693 } 1694 1695 bp = kmem_zalloc(sizeof (*bp), KM_NOSLEEP); 1696 if (bp == NULL) { 1697 if (bpf_debug) 1698 cmn_err(CE_WARN, "bpfattach: no memory for bpf_if"); 1699 return; 1700 } 1701 bp->bif_mac = *bpr; 1702 1703 /* 1704 * To get the user-visible name, it is necessary to get the mac 1705 * client name of an interface and for this, we need to do the 1706 * mac_client_open. Leaving it open is undesirable because it 1707 * creates an open reference that is hard to see from outside 1708 * of bpf, potentially leading to data structures not being 1709 * cleaned up when they should. 1710 */ 1711 if (MBPF_CLIENT_OPEN(&bp->bif_mac, ifp, &client) != 0) { 1712 if (bpf_debug) 1713 cmn_err(CE_WARN, 1714 "bpfattach: mac_client_open fail for %s", 1715 MBPF_NAME(&bp->bif_mac, ifp)); 1716 kmem_free(bp, sizeof (*bp)); 1717 return; 1718 } 1719 (void) strlcpy(bp->bif_ifname, MBPF_CLIENT_NAME(&bp->bif_mac, client), 1720 sizeof (bp->bif_ifname)); 1721 MBPF_CLIENT_CLOSE(&bp->bif_mac, client); 1722 1723 bp->bif_ifp = ifp; 1724 bp->bif_dlt = bpf_dl_to_dlt(dlt); 1725 bp->bif_zoneid = zoneid; 1726 LIST_INIT(&bp->bif_dlist); 1727 1728 /* 1729 * Compute the length of the bpf header. This is not necessarily 1730 * equal to SIZEOF_BPF_HDR because we want to insert spacing such 1731 * that the network layer header begins on a longword boundary (for 1732 * performance reasons and to alleviate alignment restrictions). 1733 */ 1734 hdrlen = bpf_dl_hdrsize(dlt); 1735 bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen; 1736 1737 if (MBPF_GET_LINKID(&bp->bif_mac, MBPF_NAME(&bp->bif_mac, ifp), 1738 &bp->bif_linkid, zoneid) != 0) { 1739 if (bpf_debug) { 1740 cmn_err(CE_WARN, 1741 "bpfattach: linkid resolution fail for %s/%s", 1742 MBPF_NAME(&bp->bif_mac, ifp), bp->bif_ifname); 1743 } 1744 kmem_free(bp, sizeof (*bp)); 1745 return; 1746 } 1747 mutex_init(&bp->bif_lock, NULL, MUTEX_DRIVER, NULL); 1748 1749 bpf_debug_nic_action("attached to", bp); 1750 1751 mutex_enter(&bpf_mtx); 1752 TAILQ_INSERT_TAIL(&bpf_iflist, bp, bif_next); 1753 mutex_exit(&bpf_mtx); 1754 } 1755 1756 /* 1757 * Remove an interface from bpf. 1758 */ 1759 void 1760 bpfdetach(uintptr_t ifp) 1761 { 1762 struct bpf_if *bp; 1763 struct bpf_d *d; 1764 int removed = 0; 1765 1766 mutex_enter(&bpf_mtx); 1767 /* 1768 * Loop through all of the known descriptors to find any that are 1769 * using the interface that wants to be detached. 1770 */ 1771 LIST_FOREACH(d, &bpf_list, bd_list) { 1772 mutex_enter(&d->bd_lock); 1773 bp = d->bd_bif; 1774 if (bp != NULL && bp->bif_ifp == ifp) { 1775 /* 1776 * Detach the descriptor from an interface now. 1777 * It will be free'ed later by close routine. 
1778 */ 1779 bpf_detachd(d); 1780 } 1781 mutex_exit(&d->bd_lock); 1782 } 1783 1784 again: 1785 TAILQ_FOREACH(bp, &bpf_iflist, bif_next) { 1786 if (bp->bif_ifp == ifp) { 1787 TAILQ_REMOVE(&bpf_iflist, bp, bif_next); 1788 bpf_debug_nic_action("detached from", bp); 1789 while (bp->bif_inuse != 0) 1790 cv_wait(&bpf_dlt_waiter, &bpf_mtx); 1791 kmem_free(bp, sizeof (*bp)); 1792 removed++; 1793 goto again; 1794 } 1795 } 1796 mutex_exit(&bpf_mtx); 1797 1798 ASSERT(removed > 0); 1799 } 1800 1801 /* 1802 * Get a list of available data link type of the interface. 1803 */ 1804 static int 1805 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *listp) 1806 { 1807 char ifname[LIFNAMSIZ+1]; 1808 struct bpf_if *bp; 1809 uintptr_t ifp; 1810 int n, error; 1811 1812 mutex_enter(&bpf_mtx); 1813 ifp = d->bd_bif->bif_ifp; 1814 (void) strlcpy(ifname, MBPF_NAME(&d->bd_bif->bif_mac, ifp), 1815 sizeof (ifname)); 1816 n = 0; 1817 error = 0; 1818 TAILQ_FOREACH(bp, &bpf_iflist, bif_next) { 1819 if (strcmp(bp->bif_ifname, ifname) != 0) 1820 continue; 1821 if (d->bd_zone != GLOBAL_ZONEID && 1822 d->bd_zone != bp->bif_zoneid) 1823 continue; 1824 if (listp->bfl_list != NULL) { 1825 if (n >= listp->bfl_len) 1826 return (ENOMEM); 1827 /* 1828 * Bumping of bif_inuse ensures the structure does not 1829 * disappear while the copyout runs and allows the for 1830 * loop to be continued. 1831 */ 1832 bp->bif_inuse++; 1833 mutex_exit(&bpf_mtx); 1834 if (copyout(&bp->bif_dlt, 1835 listp->bfl_list + n, sizeof (uint_t)) != 0) 1836 error = EFAULT; 1837 mutex_enter(&bpf_mtx); 1838 bp->bif_inuse--; 1839 } 1840 n++; 1841 } 1842 cv_signal(&bpf_dlt_waiter); 1843 mutex_exit(&bpf_mtx); 1844 listp->bfl_len = n; 1845 return (error); 1846 } 1847 1848 /* 1849 * Set the data link type of a BPF instance. 1850 */ 1851 static int 1852 bpf_setdlt(struct bpf_d *d, void *addr) 1853 { 1854 char ifname[LIFNAMSIZ+1]; 1855 struct bpf_if *bp; 1856 int error; 1857 int dlt; 1858 1859 if (copyin(addr, &dlt, sizeof (dlt)) != 0) 1860 return (EFAULT); 1861 /* 1862 * The established order is get bpf_mtx before bd_lock, even 1863 * though bpf_mtx is not needed until the loop... 1864 */ 1865 mutex_enter(&bpf_mtx); 1866 mutex_enter(&d->bd_lock); 1867 1868 if (d->bd_bif == 0) { /* Interface not set */ 1869 mutex_exit(&d->bd_lock); 1870 mutex_exit(&bpf_mtx); 1871 return (EINVAL); 1872 } 1873 if (d->bd_bif->bif_dlt == dlt) { /* NULL-op */ 1874 mutex_exit(&d->bd_lock); 1875 mutex_exit(&bpf_mtx); 1876 return (0); 1877 } 1878 1879 /* 1880 * See the matrix at the top of the file for the permissions table 1881 * enforced by this driver. 1882 */ 1883 if ((d->bd_zone != GLOBAL_ZONEID) && (dlt != DLT_IPNET) && 1884 (d->bd_bif->bif_zoneid != d->bd_zone)) { 1885 mutex_exit(&d->bd_lock); 1886 mutex_exit(&bpf_mtx); 1887 return (EINVAL); 1888 } 1889 1890 (void) strlcpy(ifname, 1891 MBPF_NAME(&d->bd_bif->bif_mac, d->bd_bif->bif_ifp), 1892 sizeof (ifname)); 1893 1894 bp = bpf_findif(d, ifname, dlt); 1895 1896 mutex_exit(&bpf_mtx); 1897 /* 1898 * Now only bd_lock is held. 1899 * 1900 * If there was no matching interface that supports the requested 1901 * DLT, return an error and leave the current binding alone. 
1902 */ 1903 if (bp == NULL) { 1904 mutex_exit(&d->bd_lock); 1905 return (EINVAL); 1906 } 1907 1908 error = 0; 1909 bpf_detachd(d); 1910 bpf_attachd(d, bp); 1911 reset_d(d); 1912 1913 mutex_exit(&d->bd_lock); 1914 return (error); 1915 } 1916 1917 /* 1918 * bpf_clear_timeout is called with the bd_lock mutex held, providing it 1919 * with the necessary protection to retrieve and modify bd_callout but it 1920 * does not hold the lock for its entire duration... see below... 1921 */ 1922 static void 1923 bpf_clear_timeout(struct bpf_d *d) 1924 { 1925 timeout_id_t tid = d->bd_callout; 1926 d->bd_callout = 0; 1927 d->bd_inuse++; 1928 1929 /* 1930 * If the timeout has fired and is waiting on bd_lock, we could 1931 * deadlock here because untimeout if bd_lock is held and would 1932 * wait for bpf_timed_out to finish and it never would. 1933 */ 1934 if (tid != 0) { 1935 mutex_exit(&d->bd_lock); 1936 (void) untimeout(tid); 1937 mutex_enter(&d->bd_lock); 1938 } 1939 1940 d->bd_inuse--; 1941 } 1942 1943 /* 1944 * As a cloning device driver, BPF needs to keep track of which device 1945 * numbers are in use and which ones are not. A hash table, indexed by 1946 * the minor device number, is used to store the pointers to the 1947 * individual descriptors that are allocated in bpfopen(). 1948 * The functions below present the interface for that hash table to 1949 * the rest of the driver. 1950 */ 1951 static struct bpf_d * 1952 bpf_dev_find(minor_t minor) 1953 { 1954 struct bpf_d *d = NULL; 1955 1956 (void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor, 1957 (mod_hash_val_t *)&d); 1958 1959 return (d); 1960 } 1961 1962 static void 1963 bpf_dev_add(struct bpf_d *d) 1964 { 1965 (void) mod_hash_insert(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev, 1966 (mod_hash_val_t)d); 1967 } 1968 1969 static void 1970 bpf_dev_remove(struct bpf_d *d) 1971 { 1972 struct bpf_d *stor; 1973 1974 (void) mod_hash_remove(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev, 1975 (mod_hash_val_t *)&stor); 1976 ASSERT(stor == d); 1977 } 1978 1979 /* 1980 * bpf_def_get should only ever be called for a minor number that exists, 1981 * thus there should always be a pointer in the hash table that corresponds 1982 * to it. 1983 */ 1984 static struct bpf_d * 1985 bpf_dev_get(minor_t minor) 1986 { 1987 struct bpf_d *d = NULL; 1988 1989 (void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor, 1990 (mod_hash_val_t *)&d); 1991 ASSERT(d != NULL); 1992 1993 return (d); 1994 } 1995 1996 static void 1997 bpf_debug_nic_action(char *txt, struct bpf_if *bp) 1998 { 1999 if (bpf_debug) { 2000 cmn_err(CE_CONT, "%s %s %s/%d/%d/%d\n", bp->bif_ifname, txt, 2001 MBPF_NAME(&bp->bif_mac, bp->bif_ifp), bp->bif_linkid, 2002 bp->bif_zoneid, bp->bif_dlt); 2003 } 2004 } 2005 2006 /* 2007 * Finding a BPF network interface is a two pass job. 2008 * In the first pass, the best possible match is made on zone, DLT and 2009 * interface name. 2010 * In the second pass, we allow global zone snoopers to attach to interfaces 2011 * that are reserved for other zones. 2012 * This ensures that the global zone will always see its own interfaces first 2013 * before attaching to those that belong to a shared IP instance zone. 
2014 */ 2015 static struct bpf_if * 2016 bpf_findif(struct bpf_d *d, char *ifname, int dlt) 2017 { 2018 struct bpf_if *bp; 2019 2020 TAILQ_FOREACH(bp, &bpf_iflist, bif_next) { 2021 if ((bp->bif_ifp == 0) || 2022 (strcmp(ifname, bp->bif_ifname) != 0)) 2023 continue; 2024 2025 if (bp->bif_zoneid != d->bd_zone) 2026 continue; 2027 2028 if ((dlt != -1) && (dlt != bp->bif_dlt)) 2029 continue; 2030 2031 return (bp); 2032 } 2033 2034 if (d->bd_zone == GLOBAL_ZONEID) { 2035 TAILQ_FOREACH(bp, &bpf_iflist, bif_next) { 2036 if ((bp->bif_ifp == 0) || 2037 (strcmp(ifname, bp->bif_ifname) != 0)) 2038 continue; 2039 2040 if ((dlt != -1) && (dlt != bp->bif_dlt)) 2041 continue; 2042 return (bp); 2043 } 2044 } 2045 2046 return (NULL); 2047 } 2048