1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2012 by Delphix. All rights reserved. 28 * Copyright 2019 Joyent, Inc. 29 * Copyright 2022 Oxide Computer Company 30 */ 31 32 #include <sys/types.h> 33 #include <sys/devops.h> 34 #include <sys/conf.h> 35 #include <sys/modctl.h> 36 #include <sys/sunddi.h> 37 #include <sys/stat.h> 38 #include <sys/poll_impl.h> 39 #include <sys/errno.h> 40 #include <sys/kmem.h> 41 #include <sys/mkdev.h> 42 #include <sys/debug.h> 43 #include <sys/file.h> 44 #include <sys/sysmacros.h> 45 #include <sys/systm.h> 46 #include <sys/bitmap.h> 47 #include <sys/devpoll.h> 48 #include <sys/rctl.h> 49 #include <sys/resource.h> 50 #include <sys/schedctl.h> 51 #include <sys/epoll.h> 52 53 #define RESERVED 1 54 55 /* local data struct */ 56 static dp_entry_t **devpolltbl; /* dev poll entries */ 57 static size_t dptblsize; 58 59 static kmutex_t devpoll_lock; /* lock protecting dev tbl */ 60 int devpoll_init; /* is /dev/poll initialized already */ 61 62 /* device local functions */ 63 64 static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp); 65 static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp); 66 static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, 67 int *rvalp); 68 static int dppoll(dev_t dev, short events, int anyyet, short *reventsp, 69 struct pollhead **phpp); 70 static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp); 71 static dev_info_t *dpdevi; 72 73 74 static struct cb_ops dp_cb_ops = { 75 dpopen, /* open */ 76 dpclose, /* close */ 77 nodev, /* strategy */ 78 nodev, /* print */ 79 nodev, /* dump */ 80 nodev, /* read */ 81 dpwrite, /* write */ 82 dpioctl, /* ioctl */ 83 nodev, /* devmap */ 84 nodev, /* mmap */ 85 nodev, /* segmap */ 86 dppoll, /* poll */ 87 ddi_prop_op, /* prop_op */ 88 (struct streamtab *)0, /* streamtab */ 89 D_MP, /* flags */ 90 CB_REV, /* cb_ops revision */ 91 nodev, /* aread */ 92 nodev /* awrite */ 93 }; 94 95 static int dpattach(dev_info_t *, ddi_attach_cmd_t); 96 static int dpdetach(dev_info_t *, ddi_detach_cmd_t); 97 static int dpinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); 98 99 static struct dev_ops dp_ops = { 100 DEVO_REV, /* devo_rev */ 101 0, /* refcnt */ 102 dpinfo, /* info */ 103 nulldev, /* identify */ 104 nulldev, /* probe */ 105 dpattach, /* attach */ 106 dpdetach, /* detach */ 107 nodev, /* reset */ 108 &dp_cb_ops, /* driver operations */ 109 (struct bus_ops *)NULL, /* bus operations */ 110 nulldev, /* power */ 111 ddi_quiesce_not_needed, /* quiesce */ 112 }; 113 114 115 static struct modldrv modldrv = { 116 &mod_driverops, /* type of 
module - a driver */ 117 "/dev/poll driver", 118 &dp_ops, 119 }; 120 121 static struct modlinkage modlinkage = { 122 MODREV_1, 123 (void *)&modldrv, 124 NULL 125 }; 126 127 static void pcachelink_assoc(pollcache_t *, pollcache_t *); 128 static void pcachelink_mark_stale(pollcache_t *); 129 static void pcachelink_purge_stale(pollcache_t *); 130 static void pcachelink_purge_all(pollcache_t *); 131 132 133 /* 134 * Locking Design 135 * 136 * The /dev/poll driver shares most of its code with poll sys call whose 137 * code is in common/syscall/poll.c. In poll(2) design, the pollcache 138 * structure is per lwp. An implicit assumption is made there that some 139 * portion of pollcache will never be touched by other lwps. E.g., in 140 * poll(2) design, no lwp will ever need to grow bitmap of other lwp. 141 * This assumption is not true for /dev/poll; hence the need for extra 142 * locking. 143 * 144 * To allow more parallelism, each /dev/poll file descriptor (indexed by 145 * minor number) has its own lock. Since read (dpioctl) is a much more 146 * frequent operation than write, we want to allow multiple reads on same 147 * /dev/poll fd. However, we prevent writes from being starved by giving 148 * priority to write operation. Theoretically writes can starve reads as 149 * well. But in practical sense this is not important because (1) writes 150 * happens less often than reads, and (2) write operation defines the 151 * content of poll fd a cache set. If writes happens so often that they 152 * can starve reads, that means the cached set is very unstable. It may 153 * not make sense to read an unstable cache set anyway. Therefore, the 154 * writers starving readers case is not handled in this design. 155 */ 156 157 int 158 _init() 159 { 160 int error; 161 162 dptblsize = DEVPOLLSIZE; 163 devpolltbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP); 164 mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL); 165 devpoll_init = 1; 166 if ((error = mod_install(&modlinkage)) != 0) { 167 kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize); 168 devpoll_init = 0; 169 } 170 return (error); 171 } 172 173 int 174 _fini() 175 { 176 int error; 177 178 if ((error = mod_remove(&modlinkage)) != 0) { 179 return (error); 180 } 181 mutex_destroy(&devpoll_lock); 182 kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize); 183 return (0); 184 } 185 186 int 187 _info(struct modinfo *modinfop) 188 { 189 return (mod_info(&modlinkage, modinfop)); 190 } 191 192 /*ARGSUSED*/ 193 static int 194 dpattach(dev_info_t *devi, ddi_attach_cmd_t cmd) 195 { 196 if (ddi_create_minor_node(devi, "poll", S_IFCHR, 0, DDI_PSEUDO, 0) 197 == DDI_FAILURE) { 198 ddi_remove_minor_node(devi, NULL); 199 return (DDI_FAILURE); 200 } 201 dpdevi = devi; 202 return (DDI_SUCCESS); 203 } 204 205 static int 206 dpdetach(dev_info_t *devi, ddi_detach_cmd_t cmd) 207 { 208 if (cmd != DDI_DETACH) 209 return (DDI_FAILURE); 210 211 ddi_remove_minor_node(devi, NULL); 212 return (DDI_SUCCESS); 213 } 214 215 /* ARGSUSED */ 216 static int 217 dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 218 { 219 int error; 220 221 switch (infocmd) { 222 case DDI_INFO_DEVT2DEVINFO: 223 *result = (void *)dpdevi; 224 error = DDI_SUCCESS; 225 break; 226 case DDI_INFO_DEVT2INSTANCE: 227 *result = (void *)0; 228 error = DDI_SUCCESS; 229 break; 230 default: 231 error = DDI_FAILURE; 232 } 233 return (error); 234 } 235 236 /* 237 * dp_pcache_poll has similar logic to pcache_poll() in poll.c. 
The major 238 * differences are: (1) /dev/poll requires scanning the bitmap starting at 239 * where it was stopped last time, instead of always starting from 0, 240 * (2) since user may not have cleaned up the cached fds when they are 241 * closed, some polldats in cache may refer to closed or reused fds. We 242 * need to check for those cases. 243 * 244 * NOTE: Upon closing an fd, automatic poll cache cleanup is done for 245 * poll(2) caches but NOT for /dev/poll caches. So expect some 246 * stale entries! 247 */ 248 static int 249 dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, pollcache_t *pcp, nfds_t nfds, 250 int *fdcntp) 251 { 252 int start, ostart, end, fdcnt, error = 0; 253 boolean_t done, no_wrap; 254 pollfd_t *pfdp; 255 epoll_event_t *epoll; 256 const short mask = POLLRDHUP | POLLWRBAND; 257 const boolean_t is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; 258 259 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 260 if (pcp->pc_bitmap == NULL) { 261 /* No Need to search because no poll fd has been cached. */ 262 return (0); 263 } 264 265 if (is_epoll) { 266 pfdp = NULL; 267 epoll = (epoll_event_t *)dpbuf; 268 } else { 269 pfdp = (pollfd_t *)dpbuf; 270 epoll = NULL; 271 } 272 retry: 273 start = ostart = pcp->pc_mapstart; 274 end = pcp->pc_mapend; 275 276 if (start == 0) { 277 /* 278 * started from every begining, no need to wrap around. 279 */ 280 no_wrap = B_TRUE; 281 } else { 282 no_wrap = B_FALSE; 283 } 284 done = B_FALSE; 285 fdcnt = 0; 286 while ((fdcnt < nfds) && !done) { 287 pollhead_t *php = NULL; 288 short revent = 0; 289 uf_entry_gen_t gen; 290 int fd; 291 292 /* 293 * Examine the bit map in a circular fashion 294 * to avoid starvation. Always resume from 295 * last stop. Scan till end of the map. Then 296 * wrap around. 297 */ 298 fd = bt_getlowbit(pcp->pc_bitmap, start, end); 299 ASSERT(fd <= end); 300 if (fd >= 0) { 301 file_t *fp; 302 polldat_t *pdp; 303 304 if (fd == end) { 305 if (no_wrap) { 306 done = B_TRUE; 307 } else { 308 start = 0; 309 end = ostart - 1; 310 no_wrap = B_TRUE; 311 } 312 } else { 313 start = fd + 1; 314 } 315 pdp = pcache_lookup_fd(pcp, fd); 316 repoll: 317 ASSERT(pdp != NULL); 318 ASSERT(pdp->pd_fd == fd); 319 if (pdp->pd_fp == NULL) { 320 /* 321 * The fd is POLLREMOVed. This fd is 322 * logically no longer cached. So move 323 * on to the next one. 324 */ 325 continue; 326 } 327 if ((fp = getf_gen(fd, &gen)) == NULL) { 328 if (is_epoll) { 329 /* 330 * In the epoll compatibility case, we 331 * actually perform the implicit 332 * removal to remain closer to the 333 * epoll semantics. 334 */ 335 pdp->pd_fp = NULL; 336 pdp->pd_events = 0; 337 338 polldat_disassociate(pdp); 339 340 BT_CLEAR(pcp->pc_bitmap, fd); 341 } else if (pfdp != NULL) { 342 /* 343 * The fd has been closed, but user has 344 * not done a POLLREMOVE on this fd 345 * yet. Instead of cleaning it here 346 * implicitly, we return POLLNVAL. This 347 * is consistent with poll(2) polling a 348 * closed fd. Hope this will remind 349 * user to do a POLLREMOVE. 350 */ 351 pfdp[fdcnt].fd = fd; 352 pfdp[fdcnt].revents = POLLNVAL; 353 fdcnt++; 354 } 355 continue; 356 } 357 358 /* 359 * Detect a change to the resource underlying a cached 360 * file descriptor. While the fd generation comparison 361 * will catch nearly all cases, the file_t comparison 362 * is maintained as a failsafe as well. 363 */ 364 if (gen != pdp->pd_gen || fp != pdp->pd_fp) { 365 /* 366 * The user is polling on a cached fd which was 367 * closed and then reused. 
				 * Unfortunately there is no good way to
				 * communicate this fact to the consumer.
				 *
				 * When this situation has been detected, it's
				 * likely that any existing pollhead is
				 * ill-suited to perform proper wake-ups.
				 *
				 * Clean up the old entry under the expectation
				 * that a valid one will be provided as part of
				 * the later VOP_POLL.
				 */
				polldat_disassociate(pdp);

				/*
				 * Since epoll is expected to act on the
				 * underlying 'struct file' (in Linux terms,
				 * our vnode_t would be a closer analog) rather
				 * than the fd itself, an implicit remove
				 * is necessary under these circumstances to
				 * suppress any results (or errors) from the
				 * new resource occupying the fd.
				 */
				if (is_epoll) {
					pdp->pd_fp = NULL;
					pdp->pd_events = 0;
					BT_CLEAR(pcp->pc_bitmap, fd);
					releasef(fd);
					continue;
				} else {
					/*
					 * Regular /dev/poll is unbothered by
					 * the fd reassignment.
					 */
					pdp->pd_fp = fp;
					pdp->pd_gen = gen;
				}
			}

			/*
			 * Skip entries marked with the sentinel value for
			 * having already fired under oneshot conditions.
			 */
			if (pdp->pd_events == POLLONESHOT) {
				releasef(fd);
				BT_CLEAR(pcp->pc_bitmap, fd);
				continue;
			}

			/*
			 * XXX - pollrelock() logic needs to know which
			 * pollcache lock to grab. It'd be a cleaner solution
			 * if we could pass pcp as an argument in the VOP_POLL
			 * interface instead of implicitly passing it via the
			 * thread_t struct. On the other hand, changing the
			 * VOP_POLL interface would require every driver and
			 * file system poll routine to change. May want to
			 * revisit the tradeoff later.
			 */
			curthread->t_pollcache = pcp;
			error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
			    &revent, &php, NULL);

			/*
			 * Recheck edge-triggered descriptors which lack a
			 * pollhead. While this check is performed when an fd
			 * is added to the pollcache in dpwrite(), subsequent
			 * descriptor manipulation could cause a different
			 * resource to be present now.
			 */
			if ((pdp->pd_events & POLLET) && error == 0 &&
			    pdp->pd_php == NULL && php == NULL && revent != 0) {
				short levent = 0;

				/*
				 * The same POLLET-only VOP_POLL is used in an
				 * attempt to coax a pollhead from older
				 * driver logic.
				 */
				error = VOP_POLL(fp->f_vnode, POLLET,
				    0, &levent, &php, NULL);
			}

			curthread->t_pollcache = NULL;
			releasef(fd);
			if (error != 0) {
				break;
			}

			/*
			 * Layered devices (e.g. the console driver) may
			 * change the vnode, and thus the pollhead pointer,
			 * out from underneath us.
			 */
			if (php != NULL && pdp->pd_php != NULL &&
			    php != pdp->pd_php) {
				polldat_disassociate(pdp);
				polldat_associate(pdp, php);
				/*
				 * The bit should still be set.
				 */
				ASSERT(BT_TEST(pcp->pc_bitmap, fd));
				goto retry;
			}

			if (revent != 0) {
				if (pfdp != NULL) {
					pfdp[fdcnt].fd = fd;
					pfdp[fdcnt].events = pdp->pd_events;
					pfdp[fdcnt].revents = revent;
				} else if (epoll != NULL) {
					epoll_event_t *ep = &epoll[fdcnt];

					ASSERT(epoll != NULL);
					ep->data.u64 = pdp->pd_epolldata;

					/*
					 * Since POLLNVAL is a legal event for
					 * VOP_POLL handlers to emit, it must
					 * be translated to an epoll-legal
					 * value.
488 */ 489 if (revent & POLLNVAL) { 490 revent &= ~POLLNVAL; 491 revent |= POLLERR; 492 } 493 494 /* 495 * If any of the event bits are set for 496 * which poll and epoll representations 497 * differ, swizzle in the native epoll 498 * values. 499 */ 500 if (revent & mask) { 501 ep->events = (revent & ~mask) | 502 ((revent & POLLRDHUP) ? 503 EPOLLRDHUP : 0) | 504 ((revent & POLLWRBAND) ? 505 EPOLLWRBAND : 0); 506 } else { 507 ep->events = revent; 508 } 509 510 /* 511 * We define POLLWRNORM to be POLLOUT, 512 * but epoll has separate definitions 513 * for them; if POLLOUT is set and the 514 * user has asked for EPOLLWRNORM, set 515 * that as well. 516 */ 517 if ((revent & POLLOUT) && 518 (pdp->pd_events & EPOLLWRNORM)) { 519 ep->events |= EPOLLWRNORM; 520 } 521 } else { 522 pollstate_t *ps = 523 curthread->t_pollstate; 524 /* 525 * The devpoll handle itself is being 526 * polled. Notify the caller of any 527 * readable event(s), leaving as much 528 * state as possible untouched. 529 */ 530 VERIFY(fdcnt == 0); 531 VERIFY(ps != NULL); 532 533 /* 534 * If a call to pollunlock() fails 535 * during VOP_POLL, skip over the fd 536 * and continue polling. 537 * 538 * Otherwise, report that there is an 539 * event pending. 540 */ 541 if ((ps->ps_flags & POLLSTATE_ULFAIL) 542 != 0) { 543 ps->ps_flags &= 544 ~POLLSTATE_ULFAIL; 545 continue; 546 } else { 547 fdcnt++; 548 break; 549 } 550 } 551 552 /* Handle special polling modes. */ 553 if (pdp->pd_events & POLLONESHOT) { 554 /* 555 * Entries operating under POLLONESHOT 556 * will be marked with a sentinel value 557 * to indicate that they have "fired" 558 * when emitting an event. This will 559 * disable them from polling until a 560 * later add/modify event rearms them. 561 */ 562 pdp->pd_events = POLLONESHOT; 563 polldat_disassociate(pdp); 564 BT_CLEAR(pcp->pc_bitmap, fd); 565 } else if (pdp->pd_events & POLLET) { 566 /* 567 * Wire up the pollhead which should 568 * have been provided. Edge-triggered 569 * polling cannot function properly 570 * with drivers which do not emit one. 571 */ 572 if (php != NULL && 573 pdp->pd_php == NULL) { 574 polldat_associate(pdp, php); 575 } 576 577 /* 578 * If the driver has emitted a pollhead, 579 * clear the bit in the bitmap which 580 * effectively latches the edge on a 581 * pollwakeup() from the driver. 582 */ 583 if (pdp->pd_php != NULL) { 584 BT_CLEAR(pcp->pc_bitmap, fd); 585 } 586 } 587 588 fdcnt++; 589 } else if (php != NULL) { 590 /* 591 * We clear a bit or cache a poll fd if 592 * the driver returns a poll head ptr, 593 * which is expected in the case of 0 594 * revents. Some buggy driver may return 595 * NULL php pointer with 0 revents. In 596 * this case, we just treat the driver as 597 * "noncachable" and not clearing the bit 598 * in bitmap. 599 */ 600 if ((pdp->pd_php != NULL) && 601 ((pcp->pc_flag & PC_POLLWAKE) == 0)) { 602 BT_CLEAR(pcp->pc_bitmap, fd); 603 } 604 if (pdp->pd_php == NULL) { 605 polldat_associate(pdp, php); 606 /* 607 * An event of interest may have 608 * arrived between the VOP_POLL() and 609 * the polldat_associate(), so we 610 * must check again. 611 */ 612 goto repoll; 613 } 614 } 615 } else { 616 /* 617 * No bit set in the range. Check for wrap around. 
618 */ 619 if (!no_wrap) { 620 start = 0; 621 end = ostart - 1; 622 no_wrap = B_TRUE; 623 } else { 624 done = B_TRUE; 625 } 626 } 627 } 628 629 if (!done) { 630 pcp->pc_mapstart = start; 631 } 632 ASSERT(*fdcntp == 0); 633 *fdcntp = fdcnt; 634 return (error); 635 } 636 637 /*ARGSUSED*/ 638 static int 639 dpopen(dev_t *devp, int flag, int otyp, cred_t *credp) 640 { 641 minor_t minordev; 642 dp_entry_t *dpep; 643 pollcache_t *pcp; 644 645 ASSERT(devpoll_init); 646 ASSERT(dptblsize <= MAXMIN); 647 mutex_enter(&devpoll_lock); 648 for (minordev = 0; minordev < dptblsize; minordev++) { 649 if (devpolltbl[minordev] == NULL) { 650 devpolltbl[minordev] = (dp_entry_t *)RESERVED; 651 break; 652 } 653 } 654 if (minordev == dptblsize) { 655 dp_entry_t **newtbl; 656 size_t oldsize; 657 658 /* 659 * Used up every entry in the existing devpoll table. 660 * Grow the table by DEVPOLLSIZE. 661 */ 662 if ((oldsize = dptblsize) >= MAXMIN) { 663 mutex_exit(&devpoll_lock); 664 return (ENXIO); 665 } 666 dptblsize += DEVPOLLSIZE; 667 if (dptblsize > MAXMIN) { 668 dptblsize = MAXMIN; 669 } 670 newtbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP); 671 bcopy(devpolltbl, newtbl, sizeof (caddr_t) * oldsize); 672 kmem_free(devpolltbl, sizeof (caddr_t) * oldsize); 673 devpolltbl = newtbl; 674 devpolltbl[minordev] = (dp_entry_t *)RESERVED; 675 } 676 mutex_exit(&devpoll_lock); 677 678 dpep = kmem_zalloc(sizeof (dp_entry_t), KM_SLEEP); 679 /* 680 * allocate a pollcache skeleton here. Delay allocating bitmap 681 * structures until dpwrite() time, since we don't know the 682 * optimal size yet. We also delay setting the pid until either 683 * dpwrite() or attempt to poll on the instance, allowing parents 684 * to create instances of /dev/poll for their children. (In the 685 * epoll compatibility case, this check isn't performed to maintain 686 * semantic compatibility.) 687 */ 688 pcp = pcache_alloc(); 689 dpep->dpe_pcache = pcp; 690 pcp->pc_pid = -1; 691 *devp = makedevice(getmajor(*devp), minordev); /* clone the driver */ 692 mutex_enter(&devpoll_lock); 693 ASSERT(minordev < dptblsize); 694 ASSERT(devpolltbl[minordev] == (dp_entry_t *)RESERVED); 695 devpolltbl[minordev] = dpep; 696 mutex_exit(&devpoll_lock); 697 return (0); 698 } 699 700 /* 701 * Write to dev/poll add/remove fd's to/from a cached poll fd set, 702 * or change poll events for a watched fd. 703 */ 704 /*ARGSUSED*/ 705 static int 706 dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) 707 { 708 minor_t minor; 709 dp_entry_t *dpep; 710 pollcache_t *pcp; 711 pollfd_t *pollfdp, *pfdp; 712 dvpoll_epollfd_t *epfdp; 713 uintptr_t limit; 714 int error; 715 uint_t size; 716 size_t copysize, uiosize; 717 nfds_t pollfdnum; 718 boolean_t is_epoll, fds_added = B_FALSE; 719 720 minor = getminor(dev); 721 722 mutex_enter(&devpoll_lock); 723 ASSERT(minor < dptblsize); 724 dpep = devpolltbl[minor]; 725 ASSERT(dpep != NULL); 726 mutex_exit(&devpoll_lock); 727 728 mutex_enter(&dpep->dpe_lock); 729 pcp = dpep->dpe_pcache; 730 is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; 731 size = (is_epoll) ? sizeof (dvpoll_epollfd_t) : sizeof (pollfd_t); 732 mutex_exit(&dpep->dpe_lock); 733 734 if (!is_epoll && curproc->p_pid != pcp->pc_pid) { 735 if (pcp->pc_pid != -1) { 736 return (EACCES); 737 } 738 739 pcp->pc_pid = curproc->p_pid; 740 } 741 742 if (uiop->uio_resid < 0) { 743 /* No one else is this careful, but maybe they should be. 
		 */
		return (EINVAL);
	}

	uiosize = (size_t)uiop->uio_resid;
	pollfdnum = uiosize / size;

	/*
	 * For epoll-enabled handles, restrict the allowed write size to 2.
	 * This corresponds to an epoll_ctl(3C) performing an EPOLL_CTL_MOD
	 * operation which is expanded into two operations (DEL and ADD).
	 *
	 * All other operations performed through epoll_ctl(3C) will consist of
	 * a single entry.
	 */
	if (is_epoll && pollfdnum > 2) {
		return (EINVAL);
	}

	/*
	 * We want to make sure that pollfdnum isn't large enough to DoS us,
	 * but we also don't want to grab p_lock unnecessarily -- so we
	 * perform the full check against our resource limits if and only if
	 * pollfdnum is larger than the known-to-be-sane value of UINT8_MAX.
	 */
	if (pollfdnum > UINT8_MAX) {
		mutex_enter(&curproc->p_lock);
		if (pollfdnum >
		    (uint_t)rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
		    curproc->p_rctls, curproc)) {
			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
			    curproc->p_rctls, curproc, RCA_SAFE);
			mutex_exit(&curproc->p_lock);
			return (EINVAL);
		}
		mutex_exit(&curproc->p_lock);
	}

	/*
	 * Copy in the pollfd array. Walk through the array and add
	 * each polled fd to the cached set.
	 */
	pollfdp = kmem_alloc(uiosize, KM_SLEEP);
	limit = (uintptr_t)pollfdp + (pollfdnum * size);

	/*
	 * Although /dev/poll uses the write(2) interface to cache fds, it's
	 * not supposed to function as a seekable device. To prevent the
	 * offset from growing and eventually exceeding the maximum, reset it
	 * here for every call.
	 */
	uiop->uio_loffset = 0;

	/*
	 * Use uiocopy instead of uiomove when populating pollfdp, keeping
	 * uio_resid untouched for now. Write syscalls will translate EINTR
	 * into a success if they detect "successfully transferred" data via
	 * an updated uio_resid. Falsely suppressing such errors is disastrous.
	 */
	if ((error = uiocopy((caddr_t)pollfdp, uiosize, UIO_WRITE, uiop,
	    &copysize)) != 0) {
		kmem_free(pollfdp, uiosize);
		return (error);
	}

	/*
	 * We are about to enter the core portion of dpwrite(). Make sure this
	 * write has exclusive access in this portion of the code, i.e., no
	 * other writers in this code.
	 *
	 * Waiting for all readers to drop their references to the dpe is
	 * unnecessary since the pollcache itself is protected by pc_lock.
	 */
	mutex_enter(&dpep->dpe_lock);
	dpep->dpe_writerwait++;
	while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) {
		ASSERT(dpep->dpe_refcnt != 0);

		/*
		 * The epoll API does not allow EINTR as a result when making
		 * modifications to the set of polled fds. Given that write
		 * activity is relatively quick and the size of accepted writes
		 * is limited above to two entries, a signal-ignorant wait is
		 * used here to avoid the EINTR.
		 */
		if (is_epoll) {
			cv_wait(&dpep->dpe_cv, &dpep->dpe_lock);
			continue;
		}

		/*
		 * Non-epoll writers to /dev/poll handles can tolerate EINTR.
835 */ 836 if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { 837 dpep->dpe_writerwait--; 838 mutex_exit(&dpep->dpe_lock); 839 kmem_free(pollfdp, uiosize); 840 return (EINTR); 841 } 842 } 843 dpep->dpe_writerwait--; 844 dpep->dpe_flag |= DP_WRITER_PRESENT; 845 dpep->dpe_refcnt++; 846 847 if (!is_epoll && (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0) { 848 /* 849 * The epoll compat mode was enabled while we were waiting to 850 * establish write access. It is not safe to continue since 851 * state was prepared for non-epoll operation. 852 */ 853 error = EBUSY; 854 goto bypass; 855 } 856 mutex_exit(&dpep->dpe_lock); 857 858 /* 859 * Since the dpwrite() may recursively walk an added /dev/poll handle, 860 * pollstate_enter() deadlock and loop detection must be used. 861 */ 862 (void) pollstate_create(); 863 VERIFY(pollstate_enter(pcp) == PSE_SUCCESS); 864 865 if (pcp->pc_bitmap == NULL) { 866 pcache_create(pcp, pollfdnum); 867 } 868 for (pfdp = pollfdp; (uintptr_t)pfdp < limit; 869 pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) { 870 int fd = pfdp->fd; 871 polldat_t *pdp; 872 873 if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) { 874 /* 875 * epoll semantics demand that we return EBADF if our 876 * specified fd is invalid. 877 */ 878 if (is_epoll) { 879 error = EBADF; 880 break; 881 } 882 883 continue; 884 } 885 886 pdp = pcache_lookup_fd(pcp, fd); 887 if (pfdp->events != POLLREMOVE) { 888 uf_entry_gen_t gen; 889 file_t *fp = NULL; 890 struct pollhead *php = NULL; 891 892 /* 893 * If we're in epoll compatibility mode, check that the 894 * fd is valid before allocating anything for it; epoll 895 * semantics demand that we return EBADF if our 896 * specified fd is invalid. 897 */ 898 if (is_epoll) { 899 if ((fp = getf_gen(fd, &gen)) == NULL) { 900 error = EBADF; 901 break; 902 } 903 } 904 if (pdp == NULL) { 905 pdp = pcache_alloc_fd(0); 906 pdp->pd_fd = fd; 907 pdp->pd_pcache = pcp; 908 pcache_insert_fd(pcp, pdp, pollfdnum); 909 } 910 911 if (is_epoll) { 912 /* 913 * If the fd is already a member of the epoll 914 * set, error emission is needed only when the 915 * fd assignment generation matches the one 916 * recorded in the polldat_t. Absence of such 917 * a generation match indicates that a new 918 * resource has been assigned at that fd. 919 * 920 * Caveat: It is possible to force a generation 921 * update while keeping the same backing 922 * resource. This is possible via dup2, but 923 * does not represent real-world use cases, 924 * making the lack of error acceptable. 925 */ 926 if (pdp->pd_fp != NULL && pdp->pd_gen == gen) { 927 error = EEXIST; 928 releasef(fd); 929 break; 930 } 931 932 /* 933 * We have decided that the cached information 934 * was stale. Reset pd_events to assure that 935 * we don't mistakenly operate on cached event 936 * disposition. This configures the implicit 937 * subscription to HUP and ERR events which 938 * epoll features. 939 */ 940 pdp->pd_events = POLLERR|POLLHUP; 941 942 epfdp = (dvpoll_epollfd_t *)pfdp; 943 pdp->pd_epolldata = epfdp->dpep_data; 944 } 945 946 ASSERT(pdp->pd_fd == fd); 947 ASSERT(pdp->pd_pcache == pcp); 948 if (fd >= pcp->pc_mapsize) { 949 mutex_exit(&pcp->pc_lock); 950 pcache_grow_map(pcp, fd); 951 mutex_enter(&pcp->pc_lock); 952 } 953 if (fd > pcp->pc_mapend) { 954 pcp->pc_mapend = fd; 955 } 956 957 if (!is_epoll) { 958 ASSERT(fp == NULL); 959 960 if ((fp = getf_gen(fd, &gen)) == NULL) { 961 /* 962 * The fd is not valid. 
					 * Since we can't pass this error back
					 * in the write() call, set the bit in
					 * the bitmap to force the DP_POLL
					 * ioctl to examine it.
					 */
					BT_SET(pcp->pc_bitmap, fd);
					pdp->pd_events |= pfdp->events;
					continue;
				}

				/*
				 * Don't do VOP_POLL for an already cached fd
				 * with the same poll events.
				 */
				if ((pdp->pd_events == pfdp->events) &&
				    (pdp->pd_fp == fp)) {
					/*
					 * The events are already cached.
					 */
					releasef(fd);
					continue;
				}
			}

			/*
			 * Do VOP_POLL and cache this poll fd.
			 *
			 * XXX - pollrelock() logic needs to know which
			 * pollcache lock to grab. It'd be a cleaner solution
			 * if we could pass pcp as an argument in the VOP_POLL
			 * interface instead of implicitly passing it via the
			 * thread_t struct. On the other hand, changing the
			 * VOP_POLL interface would require every driver and
			 * file system poll routine to change. May want to
			 * revisit the tradeoff later.
			 */
			curthread->t_pollcache = pcp;
			error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
			    &pfdp->revents, &php, NULL);

			/*
			 * Edge-triggered polling requires a pollhead in order
			 * to initiate wake-ups properly. Drivers which are
			 * savvy to POLLET presence, which should include
			 * everything in-gate, will always emit one, regardless
			 * of revent status. Older drivers which only emit a
			 * pollhead if 'revents == 0' are given a second chance
			 * here via a second VOP_POLL, with only POLLET set in
			 * the events of interest. These circumstances should
			 * induce any cacheable drivers to emit a pollhead for
			 * wake-ups.
			 *
			 * Drivers which never emit a pollhead will simply
			 * disobey the expectation of edge-triggered behavior.
			 * This includes recursive epoll which, even on Linux,
			 * yields its events in a level-triggered fashion only.
			 */
			if ((pfdp->events & POLLET) != 0 && error == 0 &&
			    php == NULL) {
				short levent = 0;

				error = VOP_POLL(fp->f_vnode, POLLET, 0,
				    &levent, &php, NULL);
			}

			curthread->t_pollcache = NULL;
			/*
			 * We always set the bit when this fd is cached;
			 * this forces the first DP_POLL to poll this fd.
			 * Real performance gain comes from subsequent
			 * DP_POLL. We also attempt a polldat_associate();
			 * if it's not possible, we'll do it in dpioctl().
			 */
			BT_SET(pcp->pc_bitmap, fd);
			if (error != 0) {
				releasef(fd);
				break;
			}
			pdp->pd_fp = fp;
			pdp->pd_gen = gen;
			pdp->pd_events |= pfdp->events;
			if (php != NULL) {
				if (pdp->pd_php == NULL) {
					polldat_associate(pdp, php);
				} else {
					if (pdp->pd_php != php) {
						polldat_disassociate(pdp);
						polldat_associate(pdp, php);
					}
				}
			}
			fds_added = B_TRUE;
			releasef(fd);
		} else {
			if (pdp == NULL || pdp->pd_fp == NULL) {
				if (is_epoll) {
					/*
					 * As with the add case (above), epoll
					 * semantics demand that we error out
					 * in this case.
					 */
					error = ENOENT;
					break;
				}

				continue;
			}
			ASSERT(pdp->pd_fd == fd);
			pdp->pd_fp = NULL;
			pdp->pd_events = 0;
			ASSERT(pdp->pd_thread == NULL);
			polldat_disassociate(pdp);
			BT_CLEAR(pcp->pc_bitmap, fd);
		}
	}
	/*
	 * Wake any pollcache waiters so they can check the new descriptors.
	 *
	 * Any fds added to a recursion-capable pollcache could themselves be
	 * /dev/poll handles.
To ensure that proper event propagation occurs, 1084 * parent pollcaches are woken too, so that they can create any needed 1085 * pollcache links. 1086 */ 1087 if (fds_added) { 1088 cv_broadcast(&pcp->pc_cv); 1089 pcache_wake_parents(pcp); 1090 } 1091 pollstate_exit(pcp); 1092 mutex_enter(&dpep->dpe_lock); 1093 bypass: 1094 dpep->dpe_flag &= ~DP_WRITER_PRESENT; 1095 dpep->dpe_refcnt--; 1096 cv_broadcast(&dpep->dpe_cv); 1097 mutex_exit(&dpep->dpe_lock); 1098 kmem_free(pollfdp, uiosize); 1099 if (error == 0) { 1100 /* 1101 * The state of uio_resid is updated only after the pollcache 1102 * is successfully modified. 1103 */ 1104 uioskip(uiop, copysize); 1105 } 1106 return (error); 1107 } 1108 1109 #define DP_SIGMASK_RESTORE(ksetp) { \ 1110 if (ksetp != NULL) { \ 1111 mutex_enter(&p->p_lock); \ 1112 if (lwp->lwp_cursig == 0) { \ 1113 t->t_hold = lwp->lwp_sigoldmask; \ 1114 t->t_flag &= ~T_TOMASK; \ 1115 } \ 1116 mutex_exit(&p->p_lock); \ 1117 } \ 1118 } 1119 1120 /*ARGSUSED*/ 1121 static int 1122 dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) 1123 { 1124 minor_t minor; 1125 dp_entry_t *dpep; 1126 pollcache_t *pcp; 1127 hrtime_t now; 1128 int error = 0; 1129 boolean_t is_epoll; 1130 STRUCT_DECL(dvpoll, dvpoll); 1131 1132 if (cmd == DP_POLL || cmd == DP_PPOLL) { 1133 /* do this now, before we sleep on DP_WRITER_PRESENT */ 1134 now = gethrtime(); 1135 } 1136 1137 minor = getminor(dev); 1138 mutex_enter(&devpoll_lock); 1139 ASSERT(minor < dptblsize); 1140 dpep = devpolltbl[minor]; 1141 mutex_exit(&devpoll_lock); 1142 ASSERT(dpep != NULL); 1143 pcp = dpep->dpe_pcache; 1144 1145 mutex_enter(&dpep->dpe_lock); 1146 is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0; 1147 1148 if (cmd == DP_EPOLLCOMPAT) { 1149 if (dpep->dpe_refcnt != 0) { 1150 /* 1151 * We can't turn on epoll compatibility while there 1152 * are outstanding operations. 1153 */ 1154 mutex_exit(&dpep->dpe_lock); 1155 return (EBUSY); 1156 } 1157 1158 /* 1159 * epoll compatibility is a one-way street: there's no way 1160 * to turn it off for a particular open. 1161 */ 1162 dpep->dpe_flag |= DP_ISEPOLLCOMPAT; 1163 1164 /* Record the epoll-enabled nature in the pollcache too */ 1165 mutex_enter(&pcp->pc_lock); 1166 pcp->pc_flag |= PC_EPOLL; 1167 mutex_exit(&pcp->pc_lock); 1168 1169 mutex_exit(&dpep->dpe_lock); 1170 return (0); 1171 } 1172 1173 if (!is_epoll && curproc->p_pid != pcp->pc_pid) { 1174 if (pcp->pc_pid != -1) { 1175 mutex_exit(&dpep->dpe_lock); 1176 return (EACCES); 1177 } 1178 1179 pcp->pc_pid = curproc->p_pid; 1180 } 1181 1182 /* Wait until all writers have cleared the handle before continuing */ 1183 while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0 || 1184 (dpep->dpe_writerwait != 0)) { 1185 if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { 1186 mutex_exit(&dpep->dpe_lock); 1187 return (EINTR); 1188 } 1189 } 1190 dpep->dpe_refcnt++; 1191 mutex_exit(&dpep->dpe_lock); 1192 1193 switch (cmd) { 1194 case DP_POLL: 1195 case DP_PPOLL: 1196 { 1197 pollstate_t *ps; 1198 nfds_t nfds; 1199 int fdcnt = 0; 1200 size_t size, fdsize, dpsize; 1201 hrtime_t deadline = 0; 1202 k_sigset_t *ksetp = NULL; 1203 k_sigset_t kset; 1204 sigset_t set; 1205 kthread_t *t = curthread; 1206 klwp_t *lwp = ttolwp(t); 1207 struct proc *p = ttoproc(curthread); 1208 1209 STRUCT_INIT(dvpoll, mode); 1210 1211 /* 1212 * The dp_setp member is only required/consumed for DP_PPOLL, 1213 * which otherwise uses the same structure as DP_POLL. 
1214 */ 1215 if (cmd == DP_POLL) { 1216 dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) - 1217 (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds); 1218 } else { 1219 ASSERT(cmd == DP_PPOLL); 1220 dpsize = STRUCT_SIZE(dvpoll); 1221 } 1222 1223 if ((mode & FKIOCTL) != 0) { 1224 /* Kernel-internal ioctl call */ 1225 bcopy((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize); 1226 error = 0; 1227 } else { 1228 error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll), 1229 dpsize); 1230 } 1231 1232 if (error) { 1233 DP_REFRELE(dpep); 1234 return (EFAULT); 1235 } 1236 1237 deadline = STRUCT_FGET(dvpoll, dp_timeout); 1238 if (deadline > 0) { 1239 /* 1240 * Convert the deadline from relative milliseconds 1241 * to absolute nanoseconds. They must wait for at 1242 * least a tick. 1243 */ 1244 deadline = MSEC2NSEC(deadline); 1245 deadline = MAX(deadline, nsec_per_tick); 1246 deadline += now; 1247 } 1248 1249 if (cmd == DP_PPOLL) { 1250 void *setp = STRUCT_FGETP(dvpoll, dp_setp); 1251 1252 if (setp != NULL) { 1253 if ((mode & FKIOCTL) != 0) { 1254 /* Use the signal set directly */ 1255 ksetp = (k_sigset_t *)setp; 1256 } else { 1257 if (copyin(setp, &set, sizeof (set))) { 1258 DP_REFRELE(dpep); 1259 return (EFAULT); 1260 } 1261 sigutok(&set, &kset); 1262 ksetp = &kset; 1263 } 1264 1265 mutex_enter(&p->p_lock); 1266 schedctl_finish_sigblock(t); 1267 lwp->lwp_sigoldmask = t->t_hold; 1268 t->t_hold = *ksetp; 1269 t->t_flag |= T_TOMASK; 1270 1271 /* 1272 * Like ppoll() with a non-NULL sigset, we'll 1273 * call cv_reltimedwait_sig() just to check for 1274 * signals. This call will return immediately 1275 * with either 0 (signalled) or -1 (no signal). 1276 * There are some conditions whereby we can 1277 * get 0 from cv_reltimedwait_sig() without 1278 * a true signal (e.g., a directed stop), so 1279 * we restore our signal mask in the unlikely 1280 * event that lwp_cursig is 0. 1281 */ 1282 if (!cv_reltimedwait_sig(&t->t_delay_cv, 1283 &p->p_lock, 0, TR_CLOCK_TICK)) { 1284 if (lwp->lwp_cursig == 0) { 1285 t->t_hold = lwp->lwp_sigoldmask; 1286 t->t_flag &= ~T_TOMASK; 1287 } 1288 1289 mutex_exit(&p->p_lock); 1290 1291 DP_REFRELE(dpep); 1292 return (EINTR); 1293 } 1294 1295 mutex_exit(&p->p_lock); 1296 } 1297 } 1298 1299 if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) { 1300 /* 1301 * We are just using DP_POLL to sleep, so 1302 * we don't any of the devpoll apparatus. 1303 * Do not check for signals if we have a zero timeout. 1304 */ 1305 DP_REFRELE(dpep); 1306 if (deadline == 0) { 1307 DP_SIGMASK_RESTORE(ksetp); 1308 return (0); 1309 } 1310 1311 mutex_enter(&curthread->t_delay_lock); 1312 while ((error = 1313 cv_timedwait_sig_hrtime(&curthread->t_delay_cv, 1314 &curthread->t_delay_lock, deadline)) > 0) 1315 continue; 1316 mutex_exit(&curthread->t_delay_lock); 1317 1318 DP_SIGMASK_RESTORE(ksetp); 1319 1320 return (error == 0 ? EINTR : 0); 1321 } 1322 1323 if (is_epoll) { 1324 size = nfds * (fdsize = sizeof (epoll_event_t)); 1325 } else { 1326 size = nfds * (fdsize = sizeof (pollfd_t)); 1327 } 1328 1329 /* 1330 * XXX It would be nice not to have to alloc each time, but it 1331 * requires another per thread structure hook. This can be 1332 * implemented later if data suggests that it's necessary. 1333 */ 1334 ps = pollstate_create(); 1335 1336 if (ps->ps_dpbufsize < size) { 1337 /* 1338 * If nfds is larger than twice the current maximum 1339 * open file count, we'll silently clamp it. This 1340 * only limits our exposure to allocating an 1341 * inordinate amount of kernel memory; it doesn't 1342 * otherwise affect the semantics. 
(We have this 1343 * check at twice the maximum instead of merely the 1344 * maximum because some applications pass an nfds that 1345 * is only slightly larger than their limit.) 1346 */ 1347 mutex_enter(&p->p_lock); 1348 if ((nfds >> 1) > p->p_fno_ctl) { 1349 nfds = p->p_fno_ctl; 1350 size = nfds * fdsize; 1351 } 1352 mutex_exit(&p->p_lock); 1353 1354 if (ps->ps_dpbufsize < size) { 1355 kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); 1356 ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP); 1357 ps->ps_dpbufsize = size; 1358 } 1359 } 1360 1361 VERIFY(pollstate_enter(pcp) == PSE_SUCCESS); 1362 for (;;) { 1363 pcp->pc_flag &= ~PC_POLLWAKE; 1364 1365 /* 1366 * Mark all child pcachelinks as stale. 1367 * Those which are still part of the tree will be 1368 * marked as valid during the poll. 1369 */ 1370 pcachelink_mark_stale(pcp); 1371 1372 error = dp_pcache_poll(dpep, ps->ps_dpbuf, 1373 pcp, nfds, &fdcnt); 1374 if (fdcnt > 0 || error != 0) 1375 break; 1376 1377 /* Purge still-stale child pcachelinks */ 1378 pcachelink_purge_stale(pcp); 1379 1380 /* 1381 * A pollwake has happened since we polled cache. 1382 */ 1383 if (pcp->pc_flag & PC_POLLWAKE) 1384 continue; 1385 1386 /* 1387 * Sleep until we are notified, signaled, or timed out. 1388 */ 1389 if (deadline == 0) { 1390 /* immediate timeout; do not check signals */ 1391 break; 1392 } 1393 1394 error = cv_timedwait_sig_hrtime(&pcp->pc_cv, 1395 &pcp->pc_lock, deadline); 1396 1397 /* 1398 * If we were awakened by a signal or timeout then 1399 * break the loop, else poll again. 1400 */ 1401 if (error <= 0) { 1402 error = (error == 0) ? EINTR : 0; 1403 break; 1404 } else { 1405 error = 0; 1406 } 1407 } 1408 pollstate_exit(pcp); 1409 1410 DP_SIGMASK_RESTORE(ksetp); 1411 1412 if (error == 0 && fdcnt > 0) { 1413 /* 1414 * It should be noted that FKIOCTL does not influence 1415 * the copyout (vs bcopy) of dp_fds at this time. 1416 */ 1417 if (copyout(ps->ps_dpbuf, 1418 STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) { 1419 DP_REFRELE(dpep); 1420 return (EFAULT); 1421 } 1422 *rvalp = fdcnt; 1423 } 1424 break; 1425 } 1426 1427 case DP_ISPOLLED: 1428 { 1429 pollfd_t pollfd; 1430 polldat_t *pdp; 1431 1432 STRUCT_INIT(dvpoll, mode); 1433 error = copyin((caddr_t)arg, &pollfd, sizeof (pollfd_t)); 1434 if (error) { 1435 DP_REFRELE(dpep); 1436 return (EFAULT); 1437 } 1438 mutex_enter(&pcp->pc_lock); 1439 if (pcp->pc_hash == NULL) { 1440 /* 1441 * No Need to search because no poll fd 1442 * has been cached. 1443 */ 1444 mutex_exit(&pcp->pc_lock); 1445 DP_REFRELE(dpep); 1446 return (0); 1447 } 1448 if (pollfd.fd < 0) { 1449 mutex_exit(&pcp->pc_lock); 1450 break; 1451 } 1452 pdp = pcache_lookup_fd(pcp, pollfd.fd); 1453 if ((pdp != NULL) && (pdp->pd_fd == pollfd.fd) && 1454 (pdp->pd_fp != NULL)) { 1455 pollfd.revents = pdp->pd_events; 1456 if (copyout(&pollfd, (caddr_t)arg, sizeof (pollfd_t))) { 1457 mutex_exit(&pcp->pc_lock); 1458 DP_REFRELE(dpep); 1459 return (EFAULT); 1460 } 1461 *rvalp = 1; 1462 } 1463 mutex_exit(&pcp->pc_lock); 1464 break; 1465 } 1466 1467 default: 1468 DP_REFRELE(dpep); 1469 return (EINVAL); 1470 } 1471 DP_REFRELE(dpep); 1472 return (error); 1473 } 1474 1475 /* 1476 * Overview of Recursive Polling 1477 * 1478 * It is possible for /dev/poll to poll for events on file descriptors which 1479 * themselves are /dev/poll handles. Pending events in the child handle are 1480 * represented as readable data via the POLLIN flag. 
 * To limit surface area, this recursion is presently allowed only on
 * /dev/poll handles which have been placed in epoll mode via the
 * DP_EPOLLCOMPAT ioctl. Recursion depth is limited to 5 in order to be
 * consistent with Linux epoll.
 *
 * Extending dppoll() for VOP_POLL:
 *
 * The recursive /dev/poll implementation begins by extending dppoll() to
 * report when resources contained in the pollcache have relevant event state.
 * At the highest level, this means calling dp_pcache_poll() so that it
 * indicates whether fd events are present without consuming them or altering
 * the pollcache bitmap. This ensures that a subsequent DP_POLL operation on
 * the bitmap will yield the initiating event. Additionally, the VOP_POLL
 * should return in such a way that dp_pcache_poll() does not clear the parent
 * bitmap entry which corresponds to the child /dev/poll fd. This means that
 * child pollcaches will be checked during every poll, which facilitates the
 * wake-up behavior detailed below.
 *
 * Pollcache Links and Wake Events:
 *
 * Recursive /dev/poll avoids complicated pollcache locking constraints during
 * pollwakeup events by eschewing the traditional pollhead mechanism in favor
 * of a different approach. For each pollcache at the root of a recursive
 * /dev/poll "tree", pcachelink_t structures are established to all child
 * /dev/poll pollcaches. During pollnotify() in a child pollcache, the
 * linked list of pcachelink_t entries is walked, where those marked as valid
 * incur a cv_broadcast to their parent pollcache. Most notably, these
 * pcachelink_t cv wakeups are performed without acquiring pc_lock on the
 * parent pollcache (which would require careful deadlock avoidance). This
 * still allows the woken poll on the parent to discover the pertinent events
 * due to the fact that bitmap entries for the child pollcache are always
 * maintained by the dppoll() logic above.
 *
 * Depth Limiting and Loop Prevention:
 *
 * As each pollcache is encountered (either via DP_POLL or dppoll()), depth and
 * loop constraints are enforced via pollstate_enter(). The pollcache_t
 * pointer is compared against any existing entries in ps_pc_stack and is added
 * to the end if no match (and therefore loop) is found. Once poll operations
 * for a given pollcache_t are complete, pollstate_exit() clears the pointer
 * from the list. The pollstate_enter() and pollstate_exit() functions are
 * responsible for acquiring and releasing pc_lock, respectively.
 *
 * Deadlock Safety:
 *
 * Descending through a tree of recursive /dev/poll handles involves the tricky
 * business of sequentially entering multiple pollcache locks. This tree
 * topology cannot define a lock acquisition order in such a way that it is
 * immune to deadlocks between threads. The pollstate_enter() and
 * pollstate_exit() functions provide an interface for recursive /dev/poll
 * operations to safely lock pollcaches while failing gracefully in the face of
 * deadlocking topologies. (See pollstate_contend() for more detail about how
 * deadlocks are detected and resolved.)
1533 */ 1534 1535 /*ARGSUSED*/ 1536 static int 1537 dppoll(dev_t dev, short events, int anyyet, short *reventsp, 1538 struct pollhead **phpp) 1539 { 1540 minor_t minor; 1541 dp_entry_t *dpep; 1542 pollcache_t *pcp; 1543 int res, rc = 0; 1544 1545 minor = getminor(dev); 1546 mutex_enter(&devpoll_lock); 1547 ASSERT(minor < dptblsize); 1548 dpep = devpolltbl[minor]; 1549 ASSERT(dpep != NULL); 1550 mutex_exit(&devpoll_lock); 1551 1552 mutex_enter(&dpep->dpe_lock); 1553 if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) == 0) { 1554 /* Poll recursion is not yet supported for non-epoll handles */ 1555 *reventsp = POLLERR; 1556 mutex_exit(&dpep->dpe_lock); 1557 return (0); 1558 } else { 1559 dpep->dpe_refcnt++; 1560 pcp = dpep->dpe_pcache; 1561 mutex_exit(&dpep->dpe_lock); 1562 } 1563 1564 res = pollstate_enter(pcp); 1565 if (res == PSE_SUCCESS) { 1566 nfds_t nfds = 1; 1567 int fdcnt = 0; 1568 pollstate_t *ps = curthread->t_pollstate; 1569 1570 /* 1571 * Recursive polling will only emit certain events. Skip a 1572 * scan of the pollcache if those events are not of interest. 1573 */ 1574 if (events & (POLLIN|POLLRDNORM)) { 1575 rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt); 1576 } else { 1577 rc = 0; 1578 fdcnt = 0; 1579 } 1580 1581 if (rc == 0 && fdcnt > 0) { 1582 *reventsp = POLLIN|POLLRDNORM; 1583 } else { 1584 *reventsp = 0; 1585 } 1586 pcachelink_assoc(pcp, ps->ps_pc_stack[0]); 1587 pollstate_exit(pcp); 1588 } else { 1589 switch (res) { 1590 case PSE_FAIL_DEPTH: 1591 rc = EINVAL; 1592 break; 1593 case PSE_FAIL_LOOP: 1594 case PSE_FAIL_DEADLOCK: 1595 rc = ELOOP; 1596 break; 1597 default: 1598 /* 1599 * If anything else has gone awry, such as being polled 1600 * from an unexpected context, fall back to the 1601 * recursion-intolerant response. 1602 */ 1603 *reventsp = POLLERR; 1604 rc = 0; 1605 break; 1606 } 1607 } 1608 1609 DP_REFRELE(dpep); 1610 return (rc); 1611 } 1612 1613 /* 1614 * devpoll close should do enough clean up before the pollcache is deleted, 1615 * i.e., it should ensure no one still references the pollcache later. 1616 * There is no "permission" check in here. Any process having the last 1617 * reference of this /dev/poll fd can close. 1618 */ 1619 /*ARGSUSED*/ 1620 static int 1621 dpclose(dev_t dev, int flag, int otyp, cred_t *credp) 1622 { 1623 minor_t minor; 1624 dp_entry_t *dpep; 1625 pollcache_t *pcp; 1626 int i; 1627 polldat_t **hashtbl; 1628 polldat_t *pdp; 1629 1630 minor = getminor(dev); 1631 1632 mutex_enter(&devpoll_lock); 1633 dpep = devpolltbl[minor]; 1634 ASSERT(dpep != NULL); 1635 devpolltbl[minor] = NULL; 1636 mutex_exit(&devpoll_lock); 1637 pcp = dpep->dpe_pcache; 1638 ASSERT(pcp != NULL); 1639 /* 1640 * At this point, no other lwp can access this pollcache via the 1641 * /dev/poll fd. This pollcache is going away, so do the clean 1642 * up without the pc_lock. 1643 */ 1644 hashtbl = pcp->pc_hash; 1645 for (i = 0; i < pcp->pc_hashsize; i++) { 1646 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 1647 polldat_disassociate(pdp); 1648 pdp->pd_fp = NULL; 1649 } 1650 } 1651 /* 1652 * pollwakeup() may still interact with this pollcache. Wait until 1653 * it is done. 
1654 */ 1655 mutex_enter(&pcp->pc_no_exit); 1656 ASSERT(pcp->pc_busy >= 0); 1657 while (pcp->pc_busy > 0) 1658 cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit); 1659 mutex_exit(&pcp->pc_no_exit); 1660 1661 /* Clean up any pollcache links created via recursive /dev/poll */ 1662 if (pcp->pc_parents != NULL || pcp->pc_children != NULL) { 1663 /* 1664 * Because of the locking rules for pcachelink manipulation, 1665 * acquring pc_lock is required for this step. 1666 */ 1667 mutex_enter(&pcp->pc_lock); 1668 pcachelink_purge_all(pcp); 1669 mutex_exit(&pcp->pc_lock); 1670 } 1671 1672 pcache_destroy(pcp); 1673 ASSERT(dpep->dpe_refcnt == 0); 1674 kmem_free(dpep, sizeof (dp_entry_t)); 1675 return (0); 1676 } 1677 1678 static void 1679 pcachelink_locked_rele(pcachelink_t *pl) 1680 { 1681 ASSERT(MUTEX_HELD(&pl->pcl_lock)); 1682 VERIFY(pl->pcl_refcnt >= 1); 1683 1684 pl->pcl_refcnt--; 1685 if (pl->pcl_refcnt == 0) { 1686 VERIFY(pl->pcl_state == PCL_INVALID); 1687 ASSERT(pl->pcl_parent_pc == NULL); 1688 ASSERT(pl->pcl_child_pc == NULL); 1689 ASSERT(pl->pcl_parent_next == NULL); 1690 ASSERT(pl->pcl_child_next == NULL); 1691 1692 pl->pcl_state = PCL_FREE; 1693 mutex_destroy(&pl->pcl_lock); 1694 kmem_free(pl, sizeof (pcachelink_t)); 1695 } else { 1696 mutex_exit(&pl->pcl_lock); 1697 } 1698 } 1699 1700 /* 1701 * Associate parent and child pollcaches via a pcachelink_t. If an existing 1702 * link (stale or valid) between the two is found, it will be reused. If a 1703 * suitable link is not found for reuse, a new one will be allocated. 1704 */ 1705 static void 1706 pcachelink_assoc(pollcache_t *child, pollcache_t *parent) 1707 { 1708 pcachelink_t *pl, **plpn; 1709 1710 ASSERT(MUTEX_HELD(&child->pc_lock)); 1711 ASSERT(MUTEX_HELD(&parent->pc_lock)); 1712 1713 /* Search for an existing link we can reuse. */ 1714 plpn = &child->pc_parents; 1715 for (pl = child->pc_parents; pl != NULL; pl = *plpn) { 1716 mutex_enter(&pl->pcl_lock); 1717 if (pl->pcl_state == PCL_INVALID) { 1718 /* Clean any invalid links while walking the list */ 1719 *plpn = pl->pcl_parent_next; 1720 pl->pcl_child_pc = NULL; 1721 pl->pcl_parent_next = NULL; 1722 pcachelink_locked_rele(pl); 1723 } else if (pl->pcl_parent_pc == parent) { 1724 /* Successfully found parent link */ 1725 ASSERT(pl->pcl_state == PCL_VALID || 1726 pl->pcl_state == PCL_STALE); 1727 pl->pcl_state = PCL_VALID; 1728 mutex_exit(&pl->pcl_lock); 1729 return; 1730 } else { 1731 plpn = &pl->pcl_parent_next; 1732 mutex_exit(&pl->pcl_lock); 1733 } 1734 } 1735 1736 /* No existing link to the parent was found. Create a fresh one. */ 1737 pl = kmem_zalloc(sizeof (pcachelink_t), KM_SLEEP); 1738 mutex_init(&pl->pcl_lock, NULL, MUTEX_DEFAULT, NULL); 1739 1740 pl->pcl_parent_pc = parent; 1741 pl->pcl_child_next = parent->pc_children; 1742 parent->pc_children = pl; 1743 pl->pcl_refcnt++; 1744 1745 pl->pcl_child_pc = child; 1746 pl->pcl_parent_next = child->pc_parents; 1747 child->pc_parents = pl; 1748 pl->pcl_refcnt++; 1749 1750 pl->pcl_state = PCL_VALID; 1751 } 1752 1753 /* 1754 * Mark all child links in a pollcache as stale. Any invalid child links found 1755 * during iteration are purged. 
1756 */ 1757 static void 1758 pcachelink_mark_stale(pollcache_t *pcp) 1759 { 1760 pcachelink_t *pl, **plpn; 1761 1762 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 1763 1764 plpn = &pcp->pc_children; 1765 for (pl = pcp->pc_children; pl != NULL; pl = *plpn) { 1766 mutex_enter(&pl->pcl_lock); 1767 if (pl->pcl_state == PCL_INVALID) { 1768 /* 1769 * Remove any invalid links while we are going to the 1770 * trouble of walking the list. 1771 */ 1772 *plpn = pl->pcl_child_next; 1773 pl->pcl_parent_pc = NULL; 1774 pl->pcl_child_next = NULL; 1775 pcachelink_locked_rele(pl); 1776 } else { 1777 pl->pcl_state = PCL_STALE; 1778 plpn = &pl->pcl_child_next; 1779 mutex_exit(&pl->pcl_lock); 1780 } 1781 } 1782 } 1783 1784 /* 1785 * Purge all stale (or invalid) child links from a pollcache. 1786 */ 1787 static void 1788 pcachelink_purge_stale(pollcache_t *pcp) 1789 { 1790 pcachelink_t *pl, **plpn; 1791 1792 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 1793 1794 plpn = &pcp->pc_children; 1795 for (pl = pcp->pc_children; pl != NULL; pl = *plpn) { 1796 mutex_enter(&pl->pcl_lock); 1797 switch (pl->pcl_state) { 1798 case PCL_STALE: 1799 pl->pcl_state = PCL_INVALID; 1800 /* FALLTHROUGH */ 1801 case PCL_INVALID: 1802 *plpn = pl->pcl_child_next; 1803 pl->pcl_parent_pc = NULL; 1804 pl->pcl_child_next = NULL; 1805 pcachelink_locked_rele(pl); 1806 break; 1807 default: 1808 plpn = &pl->pcl_child_next; 1809 mutex_exit(&pl->pcl_lock); 1810 } 1811 } 1812 } 1813 1814 /* 1815 * Purge all child and parent links from a pollcache, regardless of status. 1816 */ 1817 static void 1818 pcachelink_purge_all(pollcache_t *pcp) 1819 { 1820 pcachelink_t *pl, **plpn; 1821 1822 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 1823 1824 plpn = &pcp->pc_parents; 1825 for (pl = pcp->pc_parents; pl != NULL; pl = *plpn) { 1826 mutex_enter(&pl->pcl_lock); 1827 pl->pcl_state = PCL_INVALID; 1828 *plpn = pl->pcl_parent_next; 1829 pl->pcl_child_pc = NULL; 1830 pl->pcl_parent_next = NULL; 1831 pcachelink_locked_rele(pl); 1832 } 1833 1834 plpn = &pcp->pc_children; 1835 for (pl = pcp->pc_children; pl != NULL; pl = *plpn) { 1836 mutex_enter(&pl->pcl_lock); 1837 pl->pcl_state = PCL_INVALID; 1838 *plpn = pl->pcl_child_next; 1839 pl->pcl_parent_pc = NULL; 1840 pl->pcl_child_next = NULL; 1841 pcachelink_locked_rele(pl); 1842 } 1843 1844 ASSERT(pcp->pc_parents == NULL); 1845 ASSERT(pcp->pc_children == NULL); 1846 } 1847
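
/*
 * For reference, a minimal user-space sketch of the consumer-side flow that
 * the dpwrite()/dpioctl() logic above serves. This is illustrative only and
 * is not compiled here; `sockfd' is a hypothetical descriptor supplied by
 * the caller, and error handling is omitted:
 *
 *	#include <sys/devpoll.h>
 *	#include <poll.h>
 *	#include <fcntl.h>
 *
 *	int dpfd = open("/dev/poll", O_RDWR);
 *	pollfd_t add = { .fd = sockfd, .events = POLLIN };
 *	(void) write(dpfd, &add, sizeof (add));		// cache the fd
 *
 *	pollfd_t results[8];
 *	struct dvpoll dvp = {
 *		.dp_fds = results,	// filled in with ready fds
 *		.dp_nfds = 8,
 *		.dp_timeout = -1,	// block until an event arrives
 *	};
 *	int ready = ioctl(dpfd, DP_POLL, &dvp);		// count of ready fds
 *
 * An fd is dropped from the cached set by writing it again with events set
 * to POLLREMOVE; closing a cached fd without doing so leaves a stale entry,
 * as described in the dp_pcache_poll() comments above.
 */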