1 /* $NetBSD: bpf.c,v 1.143 2009/03/11 05:55:22 mrg Exp $ */ 2 3 /* 4 * Copyright (c) 1990, 1991, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from the Stanford/CMU enet packet filter, 8 * (net/enet.c) distributed as part of 4.3BSD, and code contributed 9 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence 10 * Berkeley Laboratory. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 
 *
 *	@(#)bpf.c	8.4 (Berkeley) 1/9/95
 * static char rcsid[] =
 * "Header: bpf.c,v 1.67 96/09/26 22:00:52 leres Exp ";
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2017 Joyent, Inc.
 */

/*
 * The BPF implements the following access controls for zones attempting
 * to read and write data. Writing of data requires that the net_rawaccess
 * privilege is held whilst reading data requires either net_rawaccess or
 * net_observability.
 *
 *                              | Shared |  Exclusive |   Global   |
 * -----------------------------+--------+------------+------------+
 * DLT_IPNET in local zone      |  Read  |    Read    |    Read    |
 * -----------------------------+--------+------------+------------+
 * Raw access to local zone NIC |  None  | Read/Write | Read/Write |
 * -----------------------------+--------+------------+------------+
 * Raw access to all NICs       |  None  |    None    | Read/Write |
 * -----------------------------+--------+------------+------------+
 *
 * The BPF driver is written as a cloning driver: each call to bpfopen()
 * allocates a new minor number. This provides BPF with a 1:1 relationship
 * between opens and closes. There is some amount of "descriptor state"
 * that is kept per open. Pointers to this data are stored in a hash table
 * (bpf_hash) that is indexed by the minor device number for each open file.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/filio.h>
#include <sys/policy.h>
#include <sys/cmn_err.h>
#include <sys/uio.h>
#include <sys/file.h>
#include <sys/sysmacros.h>
#include <sys/zone.h>

#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/poll.h>
#include <sys/dlpi.h>
#include <sys/neti.h>

#include <net/if.h>

#include <net/bpf.h>
#include <net/bpfdesc.h>
#include <net/dlt.h>

#include <netinet/in.h>
#include <sys/mac.h>
#include <sys/mac_client.h>
#include <sys/mac_impl.h>
#include <sys/time_std_impl.h>
#include <sys/hook.h>
#include <sys/hook_event.h>


/*
 * mtod(): yield the read pointer (b_rptr) of message block _v cast to
 * type _t.
 * M_LEN(): number of bytes between the read and write pointers of
 * message block _m.
 */
#define	mtod(_v, _t)	(_t)((_v)->b_rptr)
#define	M_LEN(_m)	((_m)->b_wptr - (_m)->b_rptr)

/*
 * 4096 is too small for FDDI frames. 8192 is too small for gigabit Ethernet
 * jumbos (circa 9k), ATM, or Intel gig/10gig ethernet jumbos (16k).
 */
#define	BPF_BUFSIZE (32 * 1024)

/*
 * Signature of a copy routine taking (destination, source, length) and
 * returning a pointer; it has the same shape as memcpy().
 */
typedef void *(*cp_fn_t)(void *, const void *, size_t);

/*
 * The default read buffer size, and limit for BIOCSBLEN.
 */
int bpf_bufsize = BPF_BUFSIZE;
int bpf_maxbufsize = (16 * 1024 * 1024);
/* Hash of open descriptors; created at attach and destroyed at detach. */
static mod_hash_t *bpf_hash = NULL;

/*
 * Use a mutex to avoid a race condition between gathering the stats/peers
 * and opening/closing the device.
122 */ 123 static kcondvar_t bpf_dlt_waiter; 124 static kmutex_t bpf_mtx; 125 static bpf_kstats_t ks_stats; 126 static bpf_kstats_t bpf_kstats = { 127 { "readWait", KSTAT_DATA_UINT64 }, 128 { "writeOk", KSTAT_DATA_UINT64 }, 129 { "writeError", KSTAT_DATA_UINT64 }, 130 { "receive", KSTAT_DATA_UINT64 }, 131 { "captured", KSTAT_DATA_UINT64 }, 132 { "dropped", KSTAT_DATA_UINT64 }, 133 }; 134 static kstat_t *bpf_ksp; 135 136 /* 137 * bpf_list is a list of the BPF descriptors currently open 138 */ 139 LIST_HEAD(, bpf_d) bpf_list; 140 141 static int bpf_allocbufs(struct bpf_d *); 142 static void bpf_clear_timeout(struct bpf_d *); 143 static void bpf_deliver(struct bpf_d *, cp_fn_t, 144 void *, uint_t, uint_t, boolean_t); 145 static void bpf_freed(struct bpf_d *); 146 static int bpf_ifname(struct bpf_d *d, char *, int); 147 static void *bpf_mcpy(void *, const void *, size_t); 148 static int bpf_attachd(struct bpf_d *, const char *, int); 149 static void bpf_detachd(struct bpf_d *); 150 static int bpf_setif(struct bpf_d *, char *, int); 151 static void bpf_timed_out(void *); 152 static inline void 153 bpf_wakeup(struct bpf_d *); 154 static void catchpacket(struct bpf_d *, uchar_t *, uint_t, uint_t, 155 cp_fn_t, struct timeval *); 156 static void reset_d(struct bpf_d *); 157 static int bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *); 158 static int bpf_setdlt(struct bpf_d *, void *); 159 static void bpf_dev_add(struct bpf_d *); 160 static struct bpf_d *bpf_dev_find(minor_t); 161 static struct bpf_d *bpf_dev_get(minor_t); 162 static void bpf_dev_remove(struct bpf_d *); 163 164 static int 165 bpf_movein(struct uio *uio, int linktype, int mtu, mblk_t **mp) 166 { 167 mblk_t *m; 168 int error; 169 int len; 170 int hlen; 171 int align; 172 173 /* 174 * Build a sockaddr based on the data link layer type. 175 * We do this at this level because the ethernet header 176 * is copied directly into the data field of the sockaddr. 
177 * In the case of SLIP, there is no header and the packet 178 * is forwarded as is. 179 * Also, we are careful to leave room at the front of the mbuf 180 * for the link level header. 181 */ 182 switch (linktype) { 183 184 case DLT_EN10MB: 185 hlen = sizeof (struct ether_header); 186 break; 187 188 case DLT_FDDI: 189 hlen = 16; 190 break; 191 192 case DLT_NULL: 193 hlen = 0; 194 break; 195 196 case DLT_IPOIB: 197 hlen = 44; 198 break; 199 200 default: 201 return (EIO); 202 } 203 204 align = 4 - (hlen & 3); 205 206 len = uio->uio_resid; 207 /* 208 * If there aren't enough bytes for a link level header or the 209 * packet length exceeds the interface mtu, return an error. 210 */ 211 if (len < hlen || len - hlen > mtu) 212 return (EMSGSIZE); 213 214 m = allocb(len + align, BPRI_MED); 215 if (m == NULL) { 216 error = ENOBUFS; 217 goto bad; 218 } 219 220 /* Insure the data is properly aligned */ 221 if (align > 0) 222 m->b_rptr += align; 223 m->b_wptr = m->b_rptr + len; 224 225 error = uiomove(mtod(m, void *), len, UIO_WRITE, uio); 226 if (error) 227 goto bad; 228 *mp = m; 229 return (0); 230 231 bad: 232 if (m != NULL) 233 freemsg(m); 234 return (error); 235 } 236 237 238 /* 239 * Attach file to the bpf interface, i.e. make d listen on bp. 
240 */ 241 static int 242 bpf_attachd(struct bpf_d *d, const char *ifname, int dlt) 243 { 244 bpf_provider_list_t *bp; 245 bpf_provider_t *bpr; 246 boolean_t zonematch; 247 zoneid_t niczone; 248 uintptr_t mcip; 249 zoneid_t zone; 250 uint_t nicdlt; 251 uintptr_t mh; 252 int hdrlen; 253 int error; 254 255 ASSERT(d->bd_bif == NULL); 256 ASSERT(d->bd_mcip == NULL); 257 zone = d->bd_zone; 258 zonematch = B_TRUE; 259 again: 260 mh = 0; 261 mcip = 0; 262 LIST_FOREACH(bp, &bpf_providers, bpl_next) { 263 bpr = bp->bpl_what; 264 error = MBPF_OPEN(bpr, ifname, &mh, zone); 265 if (error != 0) 266 goto next; 267 error = MBPF_CLIENT_OPEN(bpr, mh, &mcip); 268 if (error != 0) 269 goto next; 270 error = MBPF_GET_DLT(bpr, mh, &nicdlt); 271 if (error != 0) 272 goto next; 273 274 nicdlt = bpf_dl_to_dlt(nicdlt); 275 if (dlt != -1 && dlt != nicdlt) { 276 error = ENOENT; 277 goto next; 278 } 279 280 error = MBPF_GET_ZONE(bpr, mh, &niczone); 281 if (error != 0) 282 goto next; 283 284 DTRACE_PROBE4(bpf__attach, struct bpf_provider_s *, bpr, 285 uintptr_t, mh, int, nicdlt, zoneid_t, niczone); 286 287 if (zonematch && niczone != zone) { 288 error = ENOENT; 289 goto next; 290 } 291 break; 292 next: 293 if (mcip != 0) { 294 MBPF_CLIENT_CLOSE(bpr, mcip); 295 mcip = 0; 296 } 297 if (mh != NULL) { 298 MBPF_CLOSE(bpr, mh); 299 mh = 0; 300 } 301 } 302 if (error != 0) { 303 if (zonematch && (zone == GLOBAL_ZONEID)) { 304 /* 305 * If we failed to do an exact match for the global 306 * zone using the global zoneid, try again in case 307 * the network interface is owned by a local zone. 
308 */ 309 zonematch = B_FALSE; 310 goto again; 311 } 312 return (error); 313 } 314 315 d->bd_mac = *bpr; 316 d->bd_mcip = mcip; 317 d->bd_bif = mh; 318 d->bd_dlt = nicdlt; 319 hdrlen = bpf_dl_hdrsize(nicdlt); 320 d->bd_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen; 321 322 (void) strlcpy(d->bd_ifname, MBPF_CLIENT_NAME(&d->bd_mac, mcip), 323 sizeof (d->bd_ifname)); 324 325 (void) MBPF_GET_LINKID(&d->bd_mac, d->bd_ifname, &d->bd_linkid, 326 zone); 327 (void) MBPF_PROMISC_ADD(&d->bd_mac, d->bd_mcip, 0, d, 328 &d->bd_promisc_handle, d->bd_promisc_flags); 329 return (0); 330 } 331 332 /* 333 * Detach a file from its interface. 334 */ 335 static void 336 bpf_detachd(struct bpf_d *d) 337 { 338 uintptr_t mph; 339 uintptr_t mch; 340 uintptr_t mh; 341 342 ASSERT(d->bd_inuse == -1); 343 mch = d->bd_mcip; 344 d->bd_mcip = 0; 345 mh = d->bd_bif; 346 d->bd_bif = 0; 347 348 /* 349 * Check if this descriptor had requested promiscuous mode. 350 * If so, turn it off. There's no need to take any action 351 * here, that is done when MBPF_PROMISC_REMOVE is used; 352 * bd_promisc is just a local flag to stop promiscuous mode 353 * from being set more than once. 354 */ 355 if (d->bd_promisc) 356 d->bd_promisc = 0; 357 358 /* 359 * Take device out of "promiscuous" mode. Since we were able to 360 * enter "promiscuous" mode, we should be able to turn it off. 361 * Note, this field stores a pointer used to support both 362 * promiscuous and non-promiscuous callbacks for packets. 363 */ 364 mph = d->bd_promisc_handle; 365 d->bd_promisc_handle = 0; 366 367 /* 368 * The lock has to be dropped here because mac_promisc_remove may 369 * need to wait for mac_promisc_dispatch, which has called into 370 * bpf and catchpacket is waiting for bd_lock... 371 * i.e mac_promisc_remove() needs to be called with none of the 372 * locks held that are part of the bpf_mtap() call path. 
373 */ 374 mutex_exit(&d->bd_lock); 375 if (mph != 0) 376 MBPF_PROMISC_REMOVE(&d->bd_mac, mph); 377 378 if (mch != 0) 379 MBPF_CLIENT_CLOSE(&d->bd_mac, mch); 380 381 if (mh != 0) 382 MBPF_CLOSE(&d->bd_mac, mh); 383 384 /* 385 * Because this function is called with bd_lock held, so it must 386 * exit with it held. 387 */ 388 mutex_enter(&d->bd_lock); 389 *d->bd_ifname = '\0'; 390 (void) memset(&d->bd_mac, 0, sizeof (d->bd_mac)); 391 } 392 393 394 /* 395 * bpfilterattach() is called at load time. 396 */ 397 int 398 bpfilterattach(void) 399 { 400 401 bpf_hash = mod_hash_create_idhash("bpf_dev_tab", 31, 402 mod_hash_null_keydtor); 403 if (bpf_hash == NULL) 404 return (ENOMEM); 405 406 (void) memcpy(&ks_stats, &bpf_kstats, sizeof (bpf_kstats)); 407 408 bpf_ksp = kstat_create("bpf", 0, "global", "misc", 409 KSTAT_TYPE_NAMED, sizeof (bpf_kstats) / sizeof (kstat_named_t), 410 KSTAT_FLAG_VIRTUAL); 411 if (bpf_ksp != NULL) { 412 bpf_ksp->ks_data = &ks_stats; 413 kstat_install(bpf_ksp); 414 } else { 415 mod_hash_destroy_idhash(bpf_hash); 416 bpf_hash = NULL; 417 return (EEXIST); 418 } 419 420 cv_init(&bpf_dlt_waiter, NULL, CV_DRIVER, NULL); 421 mutex_init(&bpf_mtx, NULL, MUTEX_DRIVER, NULL); 422 423 LIST_INIT(&bpf_list); 424 425 return (0); 426 } 427 428 429 /* 430 * bpfilterdetach() is called at unload time. 431 */ 432 int 433 bpfilterdetach(void) 434 { 435 436 if (bpf_ksp != NULL) { 437 kstat_delete(bpf_ksp); 438 bpf_ksp = NULL; 439 } 440 441 mod_hash_destroy_idhash(bpf_hash); 442 bpf_hash = NULL; 443 444 cv_destroy(&bpf_dlt_waiter); 445 mutex_destroy(&bpf_mtx); 446 447 return (0); 448 } 449 450 /* 451 * Open ethernet device. Clones. 452 */ 453 /* ARGSUSED */ 454 int 455 bpfopen(dev_t *devp, int flag, int mode, cred_t *cred) 456 { 457 struct bpf_d *d; 458 uint_t dmin; 459 460 /* 461 * The security policy described at the top of this file is 462 * enforced here. 
463 */ 464 if ((flag & FWRITE) != 0) { 465 if (secpolicy_net_rawaccess(cred) != 0) 466 return (EACCES); 467 } 468 469 if ((flag & FREAD) != 0) { 470 if ((secpolicy_net_observability(cred) != 0) && 471 (secpolicy_net_rawaccess(cred) != 0)) 472 return (EACCES); 473 } 474 475 if ((flag & (FWRITE|FREAD)) == 0) 476 return (ENXIO); 477 478 /* 479 * A structure is allocated per open file in BPF to store settings 480 * such as buffer capture size, provide private buffers, etc. 481 */ 482 d = (struct bpf_d *)kmem_zalloc(sizeof (*d), KM_SLEEP); 483 d->bd_bufsize = bpf_bufsize; 484 d->bd_fmode = flag; 485 d->bd_zone = crgetzoneid(cred); 486 d->bd_seesent = 1; 487 d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_PHYS| 488 MAC_PROMISC_FLAGS_NO_COPY; 489 mutex_init(&d->bd_lock, NULL, MUTEX_DRIVER, NULL); 490 cv_init(&d->bd_wait, NULL, CV_DRIVER, NULL); 491 492 mutex_enter(&bpf_mtx); 493 /* 494 * Find an unused minor number. Obviously this is an O(n) algorithm 495 * and doesn't scale particularly well, so if there are large numbers 496 * of open file descriptors happening in real use, this design may 497 * need to be revisited. 498 */ 499 for (dmin = 0; dmin < L_MAXMIN; dmin++) 500 if (bpf_dev_find(dmin) == NULL) 501 break; 502 if (dmin == L_MAXMIN) { 503 mutex_exit(&bpf_mtx); 504 kmem_free(d, sizeof (*d)); 505 return (ENXIO); 506 } 507 d->bd_dev = dmin; 508 LIST_INSERT_HEAD(&bpf_list, d, bd_list); 509 bpf_dev_add(d); 510 mutex_exit(&bpf_mtx); 511 512 *devp = makedevice(getmajor(*devp), dmin); 513 514 return (0); 515 } 516 517 /* 518 * Close the descriptor by detaching it from its interface, 519 * deallocating its buffers, and marking it free. 520 * 521 * Because we only allow a device to be opened once, there is always a 522 * 1 to 1 relationship between opens and closes supporting this function. 
523 */ 524 /* ARGSUSED */ 525 int 526 bpfclose(dev_t dev, int flag, int otyp, cred_t *cred_p) 527 { 528 struct bpf_d *d = bpf_dev_get(getminor(dev)); 529 530 mutex_enter(&d->bd_lock); 531 532 while (d->bd_inuse != 0) { 533 d->bd_waiting++; 534 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) { 535 d->bd_waiting--; 536 mutex_exit(&d->bd_lock); 537 return (EINTR); 538 } 539 d->bd_waiting--; 540 } 541 542 d->bd_inuse = -1; 543 if (d->bd_state == BPF_WAITING) 544 bpf_clear_timeout(d); 545 d->bd_state = BPF_IDLE; 546 if (d->bd_bif) 547 bpf_detachd(d); 548 mutex_exit(&d->bd_lock); 549 550 mutex_enter(&bpf_mtx); 551 LIST_REMOVE(d, bd_list); 552 bpf_dev_remove(d); 553 mutex_exit(&bpf_mtx); 554 555 mutex_enter(&d->bd_lock); 556 mutex_destroy(&d->bd_lock); 557 cv_destroy(&d->bd_wait); 558 559 bpf_freed(d); 560 kmem_free(d, sizeof (*d)); 561 562 return (0); 563 } 564 565 /* 566 * Rotate the packet buffers in descriptor d. Move the store buffer 567 * into the hold slot, and the free buffer into the store slot. 568 * Zero the length of the new store buffer. 569 */ 570 #define ROTATE_BUFFERS(d) \ 571 (d)->bd_hbuf = (d)->bd_sbuf; \ 572 (d)->bd_hlen = (d)->bd_slen; \ 573 (d)->bd_sbuf = (d)->bd_fbuf; \ 574 (d)->bd_slen = 0; \ 575 (d)->bd_fbuf = 0; 576 /* 577 * bpfread - read next chunk of packets from buffers 578 */ 579 /* ARGSUSED */ 580 int 581 bpfread(dev_t dev, struct uio *uio, cred_t *cred) 582 { 583 struct bpf_d *d = bpf_dev_get(getminor(dev)); 584 int timed_out; 585 ulong_t delay; 586 int error; 587 588 if ((d->bd_fmode & FREAD) == 0) 589 return (EBADF); 590 591 /* 592 * Restrict application to use a buffer the same size as 593 * the kernel buffers. 
594 */ 595 if (uio->uio_resid != d->bd_bufsize) 596 return (EINVAL); 597 598 mutex_enter(&d->bd_lock); 599 if (d->bd_state == BPF_WAITING) 600 bpf_clear_timeout(d); 601 timed_out = (d->bd_state == BPF_TIMED_OUT); 602 d->bd_state = BPF_IDLE; 603 /* 604 * If the hold buffer is empty, then do a timed sleep, which 605 * ends when the timeout expires or when enough packets 606 * have arrived to fill the store buffer. 607 */ 608 while (d->bd_hbuf == 0) { 609 if (d->bd_nonblock) { 610 if (d->bd_slen == 0) { 611 mutex_exit(&d->bd_lock); 612 return (EWOULDBLOCK); 613 } 614 ROTATE_BUFFERS(d); 615 break; 616 } 617 618 if ((d->bd_immediate || timed_out) && d->bd_slen != 0) { 619 /* 620 * A packet(s) either arrived since the previous 621 * read or arrived while we were asleep. 622 * Rotate the buffers and return what's here. 623 */ 624 ROTATE_BUFFERS(d); 625 break; 626 } 627 ks_stats.kp_read_wait.value.ui64++; 628 delay = ddi_get_lbolt() + d->bd_rtout; 629 error = cv_timedwait_sig(&d->bd_wait, &d->bd_lock, delay); 630 if (error == 0) { 631 mutex_exit(&d->bd_lock); 632 return (EINTR); 633 } 634 if (error == -1) { 635 /* 636 * On a timeout, return what's in the buffer, 637 * which may be nothing. If there is something 638 * in the store buffer, we can rotate the buffers. 639 */ 640 if (d->bd_hbuf) 641 /* 642 * We filled up the buffer in between 643 * getting the timeout and arriving 644 * here, so we don't need to rotate. 645 */ 646 break; 647 648 if (d->bd_slen == 0) { 649 mutex_exit(&d->bd_lock); 650 return (0); 651 } 652 ROTATE_BUFFERS(d); 653 } 654 } 655 /* 656 * At this point, we know we have something in the hold slot. 657 */ 658 mutex_exit(&d->bd_lock); 659 660 /* 661 * Move data from hold buffer into user space. 662 * We know the entire buffer is transferred since 663 * we checked above that the read buffer is bpf_bufsize bytes. 
664 */ 665 error = uiomove(d->bd_hbuf, d->bd_hlen, UIO_READ, uio); 666 667 mutex_enter(&d->bd_lock); 668 d->bd_fbuf = d->bd_hbuf; 669 d->bd_hbuf = 0; 670 d->bd_hlen = 0; 671 done: 672 mutex_exit(&d->bd_lock); 673 return (error); 674 } 675 676 677 /* 678 * If there are processes sleeping on this descriptor, wake them up. 679 * NOTE: the lock for bd_wait is bd_lock and is held by bpf_deliver, 680 * so there is no code here grabbing it. 681 */ 682 static inline void 683 bpf_wakeup(struct bpf_d *d) 684 { 685 cv_signal(&d->bd_wait); 686 } 687 688 static void 689 bpf_timed_out(void *arg) 690 { 691 struct bpf_d *d = arg; 692 693 mutex_enter(&d->bd_lock); 694 if (d->bd_state == BPF_WAITING) { 695 d->bd_state = BPF_TIMED_OUT; 696 if (d->bd_slen != 0) 697 cv_signal(&d->bd_wait); 698 } 699 mutex_exit(&d->bd_lock); 700 } 701 702 703 /* ARGSUSED */ 704 int 705 bpfwrite(dev_t dev, struct uio *uio, cred_t *cred) 706 { 707 struct bpf_d *d = bpf_dev_get(getminor(dev)); 708 uintptr_t mch; 709 uint_t mtu; 710 mblk_t *m; 711 int error; 712 int dlt; 713 714 if ((d->bd_fmode & FWRITE) == 0) 715 return (EBADF); 716 717 mutex_enter(&d->bd_lock); 718 if (d->bd_bif == 0 || d->bd_mcip == 0 || d->bd_bif == 0) { 719 mutex_exit(&d->bd_lock); 720 return (EINTR); 721 } 722 723 if (uio->uio_resid == 0) { 724 mutex_exit(&d->bd_lock); 725 return (0); 726 } 727 728 while (d->bd_inuse < 0) { 729 d->bd_waiting++; 730 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) { 731 d->bd_waiting--; 732 mutex_exit(&d->bd_lock); 733 return (EINTR); 734 } 735 d->bd_waiting--; 736 } 737 738 mutex_exit(&d->bd_lock); 739 740 dlt = d->bd_dlt; 741 mch = d->bd_mcip; 742 MBPF_SDU_GET(&d->bd_mac, d->bd_bif, &mtu); 743 d->bd_inuse++; 744 745 m = NULL; 746 if (dlt == DLT_IPNET) { 747 error = EIO; 748 goto done; 749 } 750 751 error = bpf_movein(uio, dlt, mtu, &m); 752 if (error) 753 goto done; 754 755 DTRACE_PROBE4(bpf__tx, struct bpf_d *, d, int, dlt, 756 uint_t, mtu, mblk_t *, m); 757 758 if (M_LEN(m) > mtu) { 759 error = 
EMSGSIZE; 760 goto done; 761 } 762 763 error = MBPF_TX(&d->bd_mac, mch, m); 764 /* 765 * The "tx" action here is required to consume the mblk_t. 766 */ 767 m = NULL; 768 769 done: 770 if (error == 0) 771 ks_stats.kp_write_ok.value.ui64++; 772 else 773 ks_stats.kp_write_error.value.ui64++; 774 if (m != NULL) 775 freemsg(m); 776 777 mutex_enter(&d->bd_lock); 778 d->bd_inuse--; 779 if ((d->bd_inuse == 0) && (d->bd_waiting != 0)) 780 cv_signal(&d->bd_wait); 781 mutex_exit(&d->bd_lock); 782 783 /* 784 * The driver frees the mbuf. 785 */ 786 return (error); 787 } 788 789 790 /* 791 * Reset a descriptor by flushing its packet buffer and clearing the 792 * receive and drop counts. Should be called at splnet. 793 */ 794 static void 795 reset_d(struct bpf_d *d) 796 { 797 if (d->bd_hbuf) { 798 /* Free the hold buffer. */ 799 d->bd_fbuf = d->bd_hbuf; 800 d->bd_hbuf = 0; 801 } 802 d->bd_slen = 0; 803 d->bd_hlen = 0; 804 d->bd_rcount = 0; 805 d->bd_dcount = 0; 806 d->bd_ccount = 0; 807 } 808 809 /* 810 * FIONREAD Check for read packet available. 811 * BIOCGBLEN Get buffer len [for read()]. 812 * BIOCSETF Set ethernet read filter. 813 * BIOCFLUSH Flush read packet buffer. 814 * BIOCPROMISC Put interface into promiscuous mode. 815 * BIOCGDLT Get link layer type. 816 * BIOCGETIF Get interface name. 817 * BIOCSETIF Set interface. 818 * BIOCSRTIMEOUT Set read timeout. 819 * BIOCGRTIMEOUT Get read timeout. 820 * BIOCGSTATS Get packet stats. 821 * BIOCIMMEDIATE Set immediate mode. 822 * BIOCVERSION Get filter language version. 823 * BIOCGHDRCMPLT Get "header already complete" flag. 824 * BIOCSHDRCMPLT Set "header already complete" flag. 825 */ 826 /* ARGSUSED */ 827 int 828 bpfioctl(dev_t dev, int cmd, intptr_t addr, int mode, cred_t *cred, int *rval) 829 { 830 struct bpf_d *d = bpf_dev_get(getminor(dev)); 831 struct bpf_program prog; 832 struct lifreq lifreq; 833 struct ifreq ifreq; 834 int error = 0; 835 uint_t size; 836 837 /* 838 * Refresh the PID associated with this bpf file. 
839 */ 840 mutex_enter(&d->bd_lock); 841 if (d->bd_state == BPF_WAITING) 842 bpf_clear_timeout(d); 843 d->bd_state = BPF_IDLE; 844 mutex_exit(&d->bd_lock); 845 846 switch (cmd) { 847 848 default: 849 error = EINVAL; 850 break; 851 852 /* 853 * Check for read packet available. 854 */ 855 case FIONREAD: 856 { 857 int n; 858 859 mutex_enter(&d->bd_lock); 860 n = d->bd_slen; 861 if (d->bd_hbuf) 862 n += d->bd_hlen; 863 mutex_exit(&d->bd_lock); 864 865 *(int *)addr = n; 866 break; 867 } 868 869 /* 870 * Get buffer len [for read()]. 871 */ 872 case BIOCGBLEN: 873 error = copyout(&d->bd_bufsize, (void *)addr, 874 sizeof (d->bd_bufsize)); 875 break; 876 877 /* 878 * Set buffer length. 879 */ 880 case BIOCSBLEN: 881 if (copyin((void *)addr, &size, sizeof (size)) != 0) { 882 error = EFAULT; 883 break; 884 } 885 886 mutex_enter(&d->bd_lock); 887 if (d->bd_bif != 0) { 888 error = EINVAL; 889 } else { 890 if (size > bpf_maxbufsize) 891 size = bpf_maxbufsize; 892 else if (size < BPF_MINBUFSIZE) 893 size = BPF_MINBUFSIZE; 894 895 d->bd_bufsize = size; 896 } 897 mutex_exit(&d->bd_lock); 898 899 if (error == 0) 900 error = copyout(&size, (void *)addr, sizeof (size)); 901 break; 902 903 /* 904 * Set link layer read filter. 905 */ 906 case BIOCSETF: 907 if (ddi_copyin((void *)addr, &prog, sizeof (prog), mode)) { 908 error = EFAULT; 909 break; 910 } 911 error = bpf_setf(d, &prog); 912 break; 913 914 /* 915 * Flush read packet buffer. 916 */ 917 case BIOCFLUSH: 918 mutex_enter(&d->bd_lock); 919 reset_d(d); 920 mutex_exit(&d->bd_lock); 921 break; 922 923 /* 924 * Put interface into promiscuous mode. 925 * This is a one-way ioctl, it is not used to turn promiscuous 926 * mode off. 927 */ 928 case BIOCPROMISC: 929 if (d->bd_bif == 0) { 930 /* 931 * No interface attached yet. 
932 */ 933 error = EINVAL; 934 break; 935 } 936 mutex_enter(&d->bd_lock); 937 if (d->bd_promisc == 0) { 938 939 if (d->bd_promisc_handle) { 940 uintptr_t mph; 941 942 mph = d->bd_promisc_handle; 943 d->bd_promisc_handle = 0; 944 945 mutex_exit(&d->bd_lock); 946 MBPF_PROMISC_REMOVE(&d->bd_mac, mph); 947 mutex_enter(&d->bd_lock); 948 } 949 950 d->bd_promisc_flags = MAC_PROMISC_FLAGS_NO_COPY; 951 error = MBPF_PROMISC_ADD(&d->bd_mac, 952 d->bd_mcip, MAC_CLIENT_PROMISC_ALL, d, 953 &d->bd_promisc_handle, d->bd_promisc_flags); 954 if (error == 0) 955 d->bd_promisc = 1; 956 } 957 mutex_exit(&d->bd_lock); 958 break; 959 960 /* 961 * Get device parameters. 962 */ 963 case BIOCGDLT: 964 if (d->bd_bif == 0) 965 error = EINVAL; 966 else 967 error = copyout(&d->bd_dlt, (void *)addr, 968 sizeof (d->bd_dlt)); 969 break; 970 971 /* 972 * Get a list of supported device parameters. 973 */ 974 case BIOCGDLTLIST: 975 if (d->bd_bif == 0) { 976 error = EINVAL; 977 } else { 978 struct bpf_dltlist list; 979 980 if (copyin((void *)addr, &list, sizeof (list)) != 0) { 981 error = EFAULT; 982 break; 983 } 984 error = bpf_getdltlist(d, &list); 985 if ((error == 0) && 986 copyout(&list, (void *)addr, sizeof (list)) != 0) 987 error = EFAULT; 988 } 989 break; 990 991 /* 992 * Set device parameters. 993 */ 994 case BIOCSDLT: 995 error = bpf_setdlt(d, (void *)addr); 996 break; 997 998 /* 999 * Get interface name. 1000 */ 1001 case BIOCGETIF: 1002 if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) { 1003 error = EFAULT; 1004 break; 1005 } 1006 error = bpf_ifname(d, ifreq.ifr_name, sizeof (ifreq.ifr_name)); 1007 if ((error == 0) && 1008 copyout(&ifreq, (void *)addr, sizeof (ifreq)) != 0) { 1009 error = EFAULT; 1010 break; 1011 } 1012 break; 1013 1014 /* 1015 * Set interface. 
1016 */ 1017 case BIOCSETIF: 1018 if (copyin((void *)addr, &ifreq, sizeof (ifreq)) != 0) { 1019 error = EFAULT; 1020 break; 1021 } 1022 error = bpf_setif(d, ifreq.ifr_name, sizeof (ifreq.ifr_name)); 1023 break; 1024 1025 /* 1026 * Get interface name. 1027 */ 1028 case BIOCGETLIF: 1029 if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) { 1030 error = EFAULT; 1031 break; 1032 } 1033 error = bpf_ifname(d, lifreq.lifr_name, 1034 sizeof (lifreq.lifr_name)); 1035 if ((error == 0) && 1036 copyout(&lifreq, (void *)addr, sizeof (lifreq)) != 0) { 1037 error = EFAULT; 1038 break; 1039 } 1040 break; 1041 1042 /* 1043 * Set interface. 1044 */ 1045 case BIOCSETLIF: 1046 if (copyin((void *)addr, &lifreq, sizeof (lifreq)) != 0) { 1047 error = EFAULT; 1048 break; 1049 } 1050 error = bpf_setif(d, lifreq.lifr_name, 1051 sizeof (lifreq.lifr_name)); 1052 break; 1053 1054 #ifdef _SYSCALL32_IMPL 1055 /* 1056 * Set read timeout. 1057 */ 1058 case BIOCSRTIMEOUT32: 1059 { 1060 struct timeval32 tv; 1061 1062 if (copyin((void *)addr, &tv, sizeof (tv)) != 0) { 1063 error = EFAULT; 1064 break; 1065 } 1066 1067 /* Convert the timeout in microseconds to ticks */ 1068 d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 + 1069 tv.tv_usec); 1070 if ((d->bd_rtout == 0) && (tv.tv_usec != 0)) 1071 d->bd_rtout = 1; 1072 break; 1073 } 1074 1075 /* 1076 * Get read timeout. 1077 */ 1078 case BIOCGRTIMEOUT32: 1079 { 1080 struct timeval32 tv; 1081 clock_t ticks; 1082 1083 ticks = drv_hztousec(d->bd_rtout); 1084 tv.tv_sec = ticks / 1000000; 1085 tv.tv_usec = ticks - (tv.tv_sec * 1000000); 1086 error = copyout(&tv, (void *)addr, sizeof (tv)); 1087 break; 1088 } 1089 1090 /* 1091 * Get a list of supported device parameters. 
1092 */ 1093 case BIOCGDLTLIST32: 1094 if (d->bd_bif == 0) { 1095 error = EINVAL; 1096 } else { 1097 struct bpf_dltlist32 lst32; 1098 struct bpf_dltlist list; 1099 1100 if (copyin((void *)addr, &lst32, sizeof (lst32)) != 0) { 1101 error = EFAULT; 1102 break; 1103 } 1104 1105 list.bfl_len = lst32.bfl_len; 1106 list.bfl_list = (void *)(uint64_t)lst32.bfl_list; 1107 error = bpf_getdltlist(d, &list); 1108 if (error == 0) { 1109 lst32.bfl_len = list.bfl_len; 1110 1111 if (copyout(&lst32, (void *)addr, 1112 sizeof (lst32)) != 0) 1113 error = EFAULT; 1114 } 1115 } 1116 break; 1117 1118 /* 1119 * Set link layer read filter. 1120 */ 1121 case BIOCSETF32: { 1122 struct bpf_program32 prog32; 1123 1124 if (ddi_copyin((void *)addr, &prog32, sizeof (prog), mode)) { 1125 error = EFAULT; 1126 break; 1127 } 1128 prog.bf_len = prog32.bf_len; 1129 prog.bf_insns = (void *)(uint64_t)prog32.bf_insns; 1130 error = bpf_setf(d, &prog); 1131 break; 1132 } 1133 #endif 1134 1135 /* 1136 * Set read timeout. 1137 */ 1138 case BIOCSRTIMEOUT: 1139 { 1140 struct timeval tv; 1141 1142 if (copyin((void *)addr, &tv, sizeof (tv)) != 0) { 1143 error = EFAULT; 1144 break; 1145 } 1146 1147 /* Convert the timeout in microseconds to ticks */ 1148 d->bd_rtout = drv_usectohz(tv.tv_sec * 1000000 + 1149 tv.tv_usec); 1150 if ((d->bd_rtout == 0) && (tv.tv_usec != 0)) 1151 d->bd_rtout = 1; 1152 break; 1153 } 1154 1155 /* 1156 * Get read timeout. 1157 */ 1158 case BIOCGRTIMEOUT: 1159 { 1160 struct timeval tv; 1161 clock_t ticks; 1162 1163 ticks = drv_hztousec(d->bd_rtout); 1164 tv.tv_sec = ticks / 1000000; 1165 tv.tv_usec = ticks - (tv.tv_sec * 1000000); 1166 if (copyout(&tv, (void *)addr, sizeof (tv)) != 0) 1167 error = EFAULT; 1168 break; 1169 } 1170 1171 /* 1172 * Get packet stats. 
1173 */ 1174 case BIOCGSTATS: 1175 { 1176 struct bpf_stat bs; 1177 1178 bs.bs_recv = d->bd_rcount; 1179 bs.bs_drop = d->bd_dcount; 1180 bs.bs_capt = d->bd_ccount; 1181 if (copyout(&bs, (void *)addr, sizeof (bs)) != 0) 1182 error = EFAULT; 1183 break; 1184 } 1185 1186 /* 1187 * Set immediate mode. 1188 */ 1189 case BIOCIMMEDIATE: 1190 if (copyin((void *)addr, &d->bd_immediate, 1191 sizeof (d->bd_immediate)) != 0) 1192 error = EFAULT; 1193 break; 1194 1195 case BIOCVERSION: 1196 { 1197 struct bpf_version bv; 1198 1199 bv.bv_major = BPF_MAJOR_VERSION; 1200 bv.bv_minor = BPF_MINOR_VERSION; 1201 if (copyout(&bv, (void *)addr, sizeof (bv)) != 0) 1202 error = EFAULT; 1203 break; 1204 } 1205 1206 case BIOCGHDRCMPLT: /* get "header already complete" flag */ 1207 if (copyout(&d->bd_hdrcmplt, (void *)addr, 1208 sizeof (d->bd_hdrcmplt)) != 0) 1209 error = EFAULT; 1210 break; 1211 1212 case BIOCSHDRCMPLT: /* set "header already complete" flag */ 1213 if (copyin((void *)addr, &d->bd_hdrcmplt, 1214 sizeof (d->bd_hdrcmplt)) != 0) 1215 error = EFAULT; 1216 break; 1217 1218 /* 1219 * Get "see sent packets" flag 1220 */ 1221 case BIOCGSEESENT: 1222 if (copyout(&d->bd_seesent, (void *)addr, 1223 sizeof (d->bd_seesent)) != 0) 1224 error = EFAULT; 1225 break; 1226 1227 /* 1228 * Set "see sent" packets flag 1229 */ 1230 case BIOCSSEESENT: 1231 if (copyin((void *)addr, &d->bd_seesent, 1232 sizeof (d->bd_seesent)) != 0) 1233 error = EFAULT; 1234 break; 1235 1236 case FIONBIO: /* Non-blocking I/O */ 1237 if (copyin((void *)addr, &d->bd_nonblock, 1238 sizeof (d->bd_nonblock)) != 0) 1239 error = EFAULT; 1240 break; 1241 } 1242 return (error); 1243 } 1244 1245 /* 1246 * Set d's packet filter program to fp. If this file already has a filter, 1247 * free it and replace it. If the new filter is "empty" (has a 0 size), then 1248 * the result is to just remove and free the existing filter. 1249 * Returns EINVAL for bogus requests. 
1250 */ 1251 int 1252 bpf_setf(struct bpf_d *d, struct bpf_program *fp) 1253 { 1254 struct bpf_insn *fcode, *old; 1255 uint_t flen, size; 1256 size_t oldsize; 1257 1258 if (fp->bf_insns == 0) { 1259 if (fp->bf_len != 0) 1260 return (EINVAL); 1261 mutex_enter(&d->bd_lock); 1262 old = d->bd_filter; 1263 oldsize = d->bd_filter_size; 1264 d->bd_filter = 0; 1265 d->bd_filter_size = 0; 1266 reset_d(d); 1267 mutex_exit(&d->bd_lock); 1268 if (old != 0) 1269 kmem_free(old, oldsize); 1270 return (0); 1271 } 1272 flen = fp->bf_len; 1273 if (flen > BPF_MAXINSNS) 1274 return (EINVAL); 1275 1276 size = flen * sizeof (*fp->bf_insns); 1277 fcode = kmem_alloc(size, KM_SLEEP); 1278 if (copyin(fp->bf_insns, fcode, size) != 0) 1279 return (EFAULT); 1280 1281 if (bpf_validate(fcode, (int)flen)) { 1282 mutex_enter(&d->bd_lock); 1283 old = d->bd_filter; 1284 oldsize = d->bd_filter_size; 1285 d->bd_filter = fcode; 1286 d->bd_filter_size = size; 1287 reset_d(d); 1288 mutex_exit(&d->bd_lock); 1289 if (old != 0) 1290 kmem_free(old, oldsize); 1291 1292 return (0); 1293 } 1294 kmem_free(fcode, size); 1295 return (EINVAL); 1296 } 1297 1298 /* 1299 * Detach a file from its current interface (if attached at all) and attach 1300 * to the interface indicated by the name stored in ifname. 1301 * Return an errno or 0. 1302 */ 1303 static int 1304 bpf_setif(struct bpf_d *d, char *ifname, int namesize) 1305 { 1306 int unit_seen; 1307 int error = 0; 1308 char *cp; 1309 int i; 1310 1311 /* 1312 * Make sure the provided name has a unit number, and default 1313 * it to '0' if not specified. 1314 * XXX This is ugly ... do this differently? 1315 */ 1316 unit_seen = 0; 1317 cp = ifname; 1318 cp[namesize - 1] = '\0'; /* sanity */ 1319 while (*cp++) 1320 if (*cp >= '0' && *cp <= '9') 1321 unit_seen = 1; 1322 if (!unit_seen) { 1323 /* Make sure to leave room for the '\0'. 
*/ 1324 for (i = 0; i < (namesize - 1); ++i) { 1325 if ((ifname[i] >= 'a' && ifname[i] <= 'z') || 1326 (ifname[i] >= 'A' && ifname[i] <= 'Z')) 1327 continue; 1328 ifname[i] = '0'; 1329 } 1330 } 1331 1332 /* 1333 * Make sure that only one call to this function happens at a time 1334 * and that we're not interleaving a read/write 1335 */ 1336 mutex_enter(&d->bd_lock); 1337 while (d->bd_inuse != 0) { 1338 d->bd_waiting++; 1339 if (cv_wait_sig(&d->bd_wait, &d->bd_lock) <= 0) { 1340 d->bd_waiting--; 1341 mutex_exit(&d->bd_lock); 1342 return (EINTR); 1343 } 1344 d->bd_waiting--; 1345 } 1346 d->bd_inuse = -1; 1347 mutex_exit(&d->bd_lock); 1348 1349 if (d->bd_sbuf == 0) 1350 error = bpf_allocbufs(d); 1351 1352 if (error == 0) { 1353 mutex_enter(&d->bd_lock); 1354 if (d->bd_bif) 1355 /* 1356 * Detach if attached to something else. 1357 */ 1358 bpf_detachd(d); 1359 1360 error = bpf_attachd(d, ifname, -1); 1361 reset_d(d); 1362 d->bd_inuse = 0; 1363 if (d->bd_waiting != 0) 1364 cv_signal(&d->bd_wait); 1365 mutex_exit(&d->bd_lock); 1366 return (error); 1367 } 1368 1369 mutex_enter(&d->bd_lock); 1370 d->bd_inuse = 0; 1371 if (d->bd_waiting != 0) 1372 cv_signal(&d->bd_wait); 1373 mutex_exit(&d->bd_lock); 1374 1375 /* 1376 * Try tickle the mac layer into attaching the device... 1377 */ 1378 return (bpf_provider_tickle(ifname, d->bd_zone)); 1379 } 1380 1381 /* 1382 * Copy the interface name to the ifreq. 
1383 */ 1384 static int 1385 bpf_ifname(struct bpf_d *d, char *buffer, int bufsize) 1386 { 1387 1388 mutex_enter(&d->bd_lock); 1389 if (d->bd_bif == NULL) { 1390 mutex_exit(&d->bd_lock); 1391 return (EINVAL); 1392 } 1393 1394 (void) strlcpy(buffer, d->bd_ifname, bufsize); 1395 mutex_exit(&d->bd_lock); 1396 1397 return (0); 1398 } 1399 1400 /* ARGSUSED */ 1401 int 1402 bpfchpoll(dev_t dev, short events, int anyyet, short *reventsp, 1403 struct pollhead **phpp) 1404 { 1405 struct bpf_d *d = bpf_dev_get(getminor(dev)); 1406 1407 /* 1408 * Until this driver is modified to issue proper pollwakeup() calls on 1409 * its pollhead, edge-triggered polling is not allowed. 1410 */ 1411 if (events & POLLET) { 1412 return (EPERM); 1413 } 1414 1415 if (events & (POLLIN | POLLRDNORM)) { 1416 /* 1417 * An imitation of the FIONREAD ioctl code. 1418 */ 1419 mutex_enter(&d->bd_lock); 1420 if (d->bd_hlen != 0 || 1421 ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) && 1422 d->bd_slen != 0)) { 1423 *reventsp |= events & (POLLIN | POLLRDNORM); 1424 } else { 1425 /* 1426 * Until the bpf driver has been updated to include 1427 * adequate pollwakeup() logic, no pollhead will be 1428 * emitted here, preventing the resource from being 1429 * cached by poll()/devpoll/epoll. 1430 */ 1431 *reventsp = 0; 1432 /* Start the read timeout if necessary */ 1433 if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) { 1434 bpf_clear_timeout(d); 1435 /* 1436 * Only allow the timeout to be set once. 1437 */ 1438 if (d->bd_callout == 0) 1439 d->bd_callout = timeout(bpf_timed_out, 1440 d, d->bd_rtout); 1441 d->bd_state = BPF_WAITING; 1442 } 1443 } 1444 mutex_exit(&d->bd_lock); 1445 } 1446 1447 return (0); 1448 } 1449 1450 /* 1451 * Copy data from an mblk_t chain into a buffer. This works for ipnet 1452 * because the dl_ipnetinfo_t is placed in an mblk_t that leads the 1453 * packet itself. 
1454 */ 1455 static void * 1456 bpf_mcpy(void *dst_arg, const void *src_arg, size_t len) 1457 { 1458 const mblk_t *m; 1459 uint_t count; 1460 uchar_t *dst; 1461 1462 m = src_arg; 1463 dst = dst_arg; 1464 while (len > 0) { 1465 if (m == NULL) 1466 panic("bpf_mcpy"); 1467 count = (uint_t)min(M_LEN(m), len); 1468 (void) memcpy(dst, mtod(m, const void *), count); 1469 m = m->b_cont; 1470 dst += count; 1471 len -= count; 1472 } 1473 return (dst_arg); 1474 } 1475 1476 /* 1477 * Dispatch a packet to all the listeners on interface bp. 1478 * 1479 * marg pointer to the packet, either a data buffer or an mbuf chain 1480 * buflen buffer length, if marg is a data buffer 1481 * cpfn a function that can copy marg into the listener's buffer 1482 * pktlen length of the packet 1483 * issent boolean indicating whether the packet was sent or receive 1484 */ 1485 static inline void 1486 bpf_deliver(struct bpf_d *d, cp_fn_t cpfn, void *marg, uint_t pktlen, 1487 uint_t buflen, boolean_t issent) 1488 { 1489 struct timeval tv; 1490 uint_t slen; 1491 1492 if (!d->bd_seesent && issent) 1493 return; 1494 1495 /* 1496 * Accuracy of the packet counters in BPF is vital so it 1497 * is important to protect even the outer ones. 1498 */ 1499 mutex_enter(&d->bd_lock); 1500 slen = bpf_filter(d->bd_filter, marg, pktlen, buflen); 1501 DTRACE_PROBE5(bpf__packet, struct bpf_if *, d->bd_bif, 1502 struct bpf_d *, d, void *, marg, uint_t, pktlen, uint_t, slen); 1503 d->bd_rcount++; 1504 ks_stats.kp_receive.value.ui64++; 1505 if (slen != 0) { 1506 uniqtime(&tv); 1507 catchpacket(d, marg, pktlen, slen, cpfn, &tv); 1508 } 1509 mutex_exit(&d->bd_lock); 1510 } 1511 1512 /* 1513 * Incoming linkage from device drivers. 
1514 */ 1515 /* ARGSUSED */ 1516 void 1517 bpf_mtap(void *arg, mac_resource_handle_t mrh, mblk_t *m, boolean_t issent) 1518 { 1519 cp_fn_t cpfn; 1520 struct bpf_d *d = arg; 1521 uint_t pktlen, buflen; 1522 void *marg; 1523 1524 pktlen = msgdsize(m); 1525 1526 if (pktlen == M_LEN(m)) { 1527 cpfn = (cp_fn_t)memcpy; 1528 marg = mtod(m, void *); 1529 buflen = pktlen; 1530 } else { 1531 cpfn = bpf_mcpy; 1532 marg = m; 1533 buflen = 0; 1534 } 1535 1536 bpf_deliver(d, cpfn, marg, pktlen, buflen, issent); 1537 } 1538 1539 /* 1540 * Incoming linkage from ipnet. 1541 * In ipnet, there is only one event, NH_OBSERVE, that delivers packets 1542 * from all network interfaces. Thus the tap function needs to apply a 1543 * filter using the interface index/id to immitate snoop'ing on just the 1544 * specified interface. 1545 */ 1546 /* ARGSUSED */ 1547 void 1548 bpf_itap(void *arg, mblk_t *m, boolean_t issent, uint_t length) 1549 { 1550 hook_pkt_observe_t *hdr; 1551 struct bpf_d *d = arg; 1552 1553 hdr = (hook_pkt_observe_t *)m->b_rptr; 1554 if (ntohl(hdr->hpo_ifindex) != d->bd_linkid) 1555 return; 1556 bpf_deliver(d, bpf_mcpy, m, length, 0, issent); 1557 1558 } 1559 1560 /* 1561 * Move the packet data from interface memory (pkt) into the 1562 * store buffer. Return 1 if it's time to wakeup a listener (buffer full), 1563 * otherwise 0. "copy" is the routine called to do the actual data 1564 * transfer. memcpy is passed in to copy contiguous chunks, while 1565 * bpf_mcpy is passed in to copy mbuf chains. In the latter case, 1566 * pkt is really an mbuf. 1567 */ 1568 static void 1569 catchpacket(struct bpf_d *d, uchar_t *pkt, uint_t pktlen, uint_t snaplen, 1570 cp_fn_t cpfn, struct timeval *tv) 1571 { 1572 struct bpf_hdr *hp; 1573 int totlen, curlen; 1574 int hdrlen = d->bd_hdrlen; 1575 int do_wakeup = 0; 1576 1577 ++d->bd_ccount; 1578 ks_stats.kp_capture.value.ui64++; 1579 /* 1580 * Figure out how many bytes to move. 
If the packet is 1581 * greater or equal to the snapshot length, transfer that 1582 * much. Otherwise, transfer the whole packet (unless 1583 * we hit the buffer size limit). 1584 */ 1585 totlen = hdrlen + min(snaplen, pktlen); 1586 if (totlen > d->bd_bufsize) 1587 totlen = d->bd_bufsize; 1588 1589 /* 1590 * Round up the end of the previous packet to the next longword. 1591 */ 1592 curlen = BPF_WORDALIGN(d->bd_slen); 1593 if (curlen + totlen > d->bd_bufsize) { 1594 /* 1595 * This packet will overflow the storage buffer. 1596 * Rotate the buffers if we can, then wakeup any 1597 * pending reads. 1598 */ 1599 if (d->bd_fbuf == 0) { 1600 /* 1601 * We haven't completed the previous read yet, 1602 * so drop the packet. 1603 */ 1604 ++d->bd_dcount; 1605 ks_stats.kp_dropped.value.ui64++; 1606 return; 1607 } 1608 ROTATE_BUFFERS(d); 1609 do_wakeup = 1; 1610 curlen = 0; 1611 } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) { 1612 /* 1613 * Immediate mode is set, or the read timeout has 1614 * already expired during a select call. A packet 1615 * arrived, so the reader should be woken up. 1616 */ 1617 do_wakeup = 1; 1618 } 1619 1620 /* 1621 * Append the bpf header to the existing buffer before we add 1622 * on the actual packet data. 1623 */ 1624 hp = (struct bpf_hdr *)((char *)d->bd_sbuf + curlen); 1625 hp->bh_tstamp.tv_sec = tv->tv_sec; 1626 hp->bh_tstamp.tv_usec = tv->tv_usec; 1627 hp->bh_datalen = pktlen; 1628 hp->bh_hdrlen = (uint16_t)hdrlen; 1629 /* 1630 * Copy the packet data into the store buffer and update its length. 1631 */ 1632 (*cpfn)((uchar_t *)hp + hdrlen, pkt, 1633 (hp->bh_caplen = totlen - hdrlen)); 1634 d->bd_slen = curlen + totlen; 1635 1636 /* 1637 * Call bpf_wakeup after bd_slen has been updated. 1638 */ 1639 if (do_wakeup) 1640 bpf_wakeup(d); 1641 } 1642 1643 /* 1644 * Initialize all nonzero fields of a descriptor. 
1645 */ 1646 static int 1647 bpf_allocbufs(struct bpf_d *d) 1648 { 1649 1650 d->bd_fbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP); 1651 if (!d->bd_fbuf) 1652 return (ENOBUFS); 1653 d->bd_sbuf = kmem_zalloc(d->bd_bufsize, KM_NOSLEEP); 1654 if (!d->bd_sbuf) { 1655 kmem_free(d->bd_fbuf, d->bd_bufsize); 1656 return (ENOBUFS); 1657 } 1658 d->bd_slen = 0; 1659 d->bd_hlen = 0; 1660 return (0); 1661 } 1662 1663 /* 1664 * Free buffers currently in use by a descriptor. 1665 * Called on close. 1666 */ 1667 static void 1668 bpf_freed(struct bpf_d *d) 1669 { 1670 /* 1671 * At this point the descriptor has been detached from its 1672 * interface and it yet hasn't been marked free. 1673 */ 1674 if (d->bd_sbuf != 0) { 1675 kmem_free(d->bd_sbuf, d->bd_bufsize); 1676 if (d->bd_hbuf != 0) 1677 kmem_free(d->bd_hbuf, d->bd_bufsize); 1678 if (d->bd_fbuf != 0) 1679 kmem_free(d->bd_fbuf, d->bd_bufsize); 1680 } 1681 if (d->bd_filter) 1682 kmem_free(d->bd_filter, d->bd_filter_size); 1683 } 1684 1685 /* 1686 * Get a list of available data link type of the interface. 
1687 */ 1688 static int 1689 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *listp) 1690 { 1691 bpf_provider_list_t *bp; 1692 bpf_provider_t *bpr; 1693 zoneid_t zoneid; 1694 uintptr_t mcip; 1695 uint_t nicdlt; 1696 uintptr_t mh; 1697 int error; 1698 int n; 1699 1700 n = 0; 1701 mh = 0; 1702 mcip = 0; 1703 error = 0; 1704 mutex_enter(&d->bd_lock); 1705 LIST_FOREACH(bp, &bpf_providers, bpl_next) { 1706 bpr = bp->bpl_what; 1707 error = MBPF_OPEN(bpr, d->bd_ifname, &mh, d->bd_zone); 1708 if (error != 0) 1709 goto next; 1710 error = MBPF_CLIENT_OPEN(bpr, mh, &mcip); 1711 if (error != 0) 1712 goto next; 1713 error = MBPF_GET_ZONE(bpr, mh, &zoneid); 1714 if (error != 0) 1715 goto next; 1716 if (d->bd_zone != GLOBAL_ZONEID && 1717 d->bd_zone != zoneid) 1718 goto next; 1719 error = MBPF_GET_DLT(bpr, mh, &nicdlt); 1720 if (error != 0) 1721 goto next; 1722 nicdlt = bpf_dl_to_dlt(nicdlt); 1723 if (listp->bfl_list != NULL) { 1724 if (n >= listp->bfl_len) { 1725 MBPF_CLIENT_CLOSE(bpr, mcip); 1726 MBPF_CLOSE(bpr, mh); 1727 break; 1728 } 1729 /* 1730 * Bumping of bd_inuse ensures the structure does not 1731 * disappear while the copyout runs and allows the for 1732 * loop to be continued. 1733 */ 1734 d->bd_inuse++; 1735 mutex_exit(&d->bd_lock); 1736 if (copyout(&nicdlt, 1737 listp->bfl_list + n, sizeof (uint_t)) != 0) 1738 error = EFAULT; 1739 mutex_enter(&d->bd_lock); 1740 if (error != 0) 1741 break; 1742 d->bd_inuse--; 1743 } 1744 n++; 1745 next: 1746 if (mcip != 0) { 1747 MBPF_CLIENT_CLOSE(bpr, mcip); 1748 mcip = 0; 1749 } 1750 if (mh != 0) { 1751 MBPF_CLOSE(bpr, mh); 1752 mh = 0; 1753 } 1754 } 1755 mutex_exit(&d->bd_lock); 1756 1757 /* 1758 * It is quite possible that one or more provider to BPF may not 1759 * know about a link name whlist others do. In that case, so long 1760 * as we have one success, do not declare an error unless it was 1761 * an EFAULT as this indicates a problem that needs to be reported. 
1762 */ 1763 if ((error != EFAULT) && (n > 0)) 1764 error = 0; 1765 1766 listp->bfl_len = n; 1767 return (error); 1768 } 1769 1770 /* 1771 * Set the data link type of a BPF instance. 1772 */ 1773 static int 1774 bpf_setdlt(struct bpf_d *d, void *addr) 1775 { 1776 char ifname[LIFNAMSIZ+1]; 1777 zoneid_t niczone; 1778 int error; 1779 int dlt; 1780 1781 if (copyin(addr, &dlt, sizeof (dlt)) != 0) 1782 return (EFAULT); 1783 1784 mutex_enter(&d->bd_lock); 1785 1786 if (d->bd_bif == 0) { /* Interface not set */ 1787 mutex_exit(&d->bd_lock); 1788 return (EINVAL); 1789 } 1790 if (d->bd_dlt == dlt) { /* NULL-op */ 1791 mutex_exit(&d->bd_lock); 1792 return (0); 1793 } 1794 1795 error = MBPF_GET_ZONE(&d->bd_mac, d->bd_bif, &niczone); 1796 if (error != 0) { 1797 mutex_exit(&d->bd_lock); 1798 return (error); 1799 } 1800 1801 /* 1802 * See the matrix at the top of the file for the permissions table 1803 * enforced by this driver. 1804 */ 1805 if ((d->bd_zone != GLOBAL_ZONEID) && (dlt != DLT_IPNET) && 1806 (niczone != d->bd_zone)) { 1807 mutex_exit(&d->bd_lock); 1808 return (EINVAL); 1809 } 1810 1811 (void) strlcpy(ifname, d->bd_ifname, sizeof (ifname)); 1812 d->bd_inuse = -1; 1813 bpf_detachd(d); 1814 error = bpf_attachd(d, ifname, dlt); 1815 reset_d(d); 1816 d->bd_inuse = 0; 1817 1818 mutex_exit(&d->bd_lock); 1819 return (error); 1820 } 1821 1822 /* 1823 * bpf_clear_timeout is called with the bd_lock mutex held, providing it 1824 * with the necessary protection to retrieve and modify bd_callout but it 1825 * does not hold the lock for its entire duration... see below... 1826 */ 1827 static void 1828 bpf_clear_timeout(struct bpf_d *d) 1829 { 1830 timeout_id_t tid = d->bd_callout; 1831 d->bd_callout = 0; 1832 d->bd_inuse++; 1833 1834 /* 1835 * If the timeout has fired and is waiting on bd_lock, we could 1836 * deadlock here because untimeout if bd_lock is held and would 1837 * wait for bpf_timed_out to finish and it never would. 
1838 */ 1839 if (tid != 0) { 1840 mutex_exit(&d->bd_lock); 1841 (void) untimeout(tid); 1842 mutex_enter(&d->bd_lock); 1843 } 1844 1845 d->bd_inuse--; 1846 } 1847 1848 /* 1849 * As a cloning device driver, BPF needs to keep track of which device 1850 * numbers are in use and which ones are not. A hash table, indexed by 1851 * the minor device number, is used to store the pointers to the 1852 * individual descriptors that are allocated in bpfopen(). 1853 * The functions below present the interface for that hash table to 1854 * the rest of the driver. 1855 */ 1856 static struct bpf_d * 1857 bpf_dev_find(minor_t minor) 1858 { 1859 struct bpf_d *d = NULL; 1860 1861 (void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor, 1862 (mod_hash_val_t *)&d); 1863 1864 return (d); 1865 } 1866 1867 static void 1868 bpf_dev_add(struct bpf_d *d) 1869 { 1870 (void) mod_hash_insert(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev, 1871 (mod_hash_val_t)d); 1872 } 1873 1874 static void 1875 bpf_dev_remove(struct bpf_d *d) 1876 { 1877 struct bpf_d *stor; 1878 1879 (void) mod_hash_remove(bpf_hash, (mod_hash_key_t)(uintptr_t)d->bd_dev, 1880 (mod_hash_val_t *)&stor); 1881 ASSERT(stor == d); 1882 } 1883 1884 /* 1885 * bpf_def_get should only ever be called for a minor number that exists, 1886 * thus there should always be a pointer in the hash table that corresponds 1887 * to it. 1888 */ 1889 static struct bpf_d * 1890 bpf_dev_get(minor_t minor) 1891 { 1892 struct bpf_d *d = NULL; 1893 1894 (void) mod_hash_find(bpf_hash, (mod_hash_key_t)(uintptr_t)minor, 1895 (mod_hash_val_t *)&d); 1896 ASSERT(d != NULL); 1897 1898 return (d); 1899 } 1900