1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2012 by Delphix. All rights reserved. 28 * Copyright (c) 2015, Joyent, Inc. All rights reserved. 29 */ 30 31 #include <sys/types.h> 32 #include <sys/devops.h> 33 #include <sys/conf.h> 34 #include <sys/modctl.h> 35 #include <sys/sunddi.h> 36 #include <sys/stat.h> 37 #include <sys/poll_impl.h> 38 #include <sys/errno.h> 39 #include <sys/kmem.h> 40 #include <sys/mkdev.h> 41 #include <sys/debug.h> 42 #include <sys/file.h> 43 #include <sys/sysmacros.h> 44 #include <sys/systm.h> 45 #include <sys/bitmap.h> 46 #include <sys/devpoll.h> 47 #include <sys/rctl.h> 48 #include <sys/resource.h> 49 #include <sys/schedctl.h> 50 #include <sys/epoll.h> 51 52 #define RESERVED 1 53 54 /* local data struct */ 55 static dp_entry_t **devpolltbl; /* dev poll entries */ 56 static size_t dptblsize; 57 58 static kmutex_t devpoll_lock; /* lock protecting dev tbl */ 59 int devpoll_init; /* is /dev/poll initialized already */ 60 61 /* device local functions */ 62 63 static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp); 64 static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp); 65 static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, 66 int *rvalp); 67 static int dppoll(dev_t dev, short events, int anyyet, short *reventsp, 68 struct pollhead **phpp); 69 static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp); 70 static dev_info_t *dpdevi; 71 72 73 static struct cb_ops dp_cb_ops = { 74 dpopen, /* open */ 75 dpclose, /* close */ 76 nodev, /* strategy */ 77 nodev, /* print */ 78 nodev, /* dump */ 79 nodev, /* read */ 80 dpwrite, /* write */ 81 dpioctl, /* ioctl */ 82 nodev, /* devmap */ 83 nodev, /* mmap */ 84 nodev, /* segmap */ 85 dppoll, /* poll */ 86 ddi_prop_op, /* prop_op */ 87 (struct streamtab *)0, /* streamtab */ 88 D_MP, /* flags */ 89 CB_REV, /* cb_ops revision */ 90 nodev, /* aread */ 91 nodev /* awrite */ 92 }; 93 94 static int dpattach(dev_info_t *, ddi_attach_cmd_t); 95 static int dpdetach(dev_info_t *, ddi_detach_cmd_t); 96 static int dpinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); 97 98 static struct dev_ops dp_ops = { 99 DEVO_REV, /* devo_rev */ 100 0, /* refcnt */ 101 dpinfo, /* info */ 102 nulldev, /* identify */ 103 nulldev, /* probe */ 104 dpattach, /* attach */ 105 dpdetach, /* detach */ 106 nodev, /* reset */ 107 &dp_cb_ops, /* driver operations */ 108 (struct bus_ops *)NULL, /* bus operations */ 109 nulldev, /* power */ 110 ddi_quiesce_not_needed, /* quiesce */ 111 }; 112 113 114 static struct modldrv modldrv = { 115 &mod_driverops, /* type of module - a driver */ 116 "/dev/poll driver", 117 &dp_ops, 118 }; 119 120 static struct modlinkage modlinkage = { 121 MODREV_1, 122 (void *)&modldrv, 123 NULL 124 }; 125 126 /* 127 * Locking Design 128 * 129 * The /dev/poll driver shares most of its code with poll sys call whose 130 * code is in common/syscall/poll.c. In poll(2) design, the pollcache 131 * structure is per lwp. An implicit assumption is made there that some 132 * portion of pollcache will never be touched by other lwps. E.g., in 133 * poll(2) design, no lwp will ever need to grow bitmap of other lwp. 134 * This assumption is not true for /dev/poll; hence the need for extra 135 * locking. 136 * 137 * To allow more parallelism, each /dev/poll file descriptor (indexed by 138 * minor number) has its own lock. Since read (dpioctl) is a much more 139 * frequent operation than write, we want to allow multiple reads on same 140 * /dev/poll fd. However, we prevent writes from being starved by giving 141 * priority to write operation. Theoretically writes can starve reads as 142 * well. But in practical sense this is not important because (1) writes 143 * happens less often than reads, and (2) write operation defines the 144 * content of poll fd a cache set. If writes happens so often that they 145 * can starve reads, that means the cached set is very unstable. It may 146 * not make sense to read an unstable cache set anyway. Therefore, the 147 * writers starving readers case is not handled in this design. 148 */ 149 150 int 151 _init() 152 { 153 int error; 154 155 dptblsize = DEVPOLLSIZE; 156 devpolltbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP); 157 mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL); 158 devpoll_init = 1; 159 if ((error = mod_install(&modlinkage)) != 0) { 160 mutex_destroy(&devpoll_lock); 161 kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize); 162 devpoll_init = 0; 163 } 164 return (error); 165 } 166 167 int 168 _fini() 169 { 170 int error; 171 172 if ((error = mod_remove(&modlinkage)) != 0) { 173 return (error); 174 } 175 mutex_destroy(&devpoll_lock); 176 kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize); 177 return (0); 178 } 179 180 int 181 _info(struct modinfo *modinfop) 182 { 183 return (mod_info(&modlinkage, modinfop)); 184 } 185 186 /*ARGSUSED*/ 187 static int 188 dpattach(dev_info_t *devi, ddi_attach_cmd_t cmd) 189 { 190 if (ddi_create_minor_node(devi, "poll", S_IFCHR, 0, DDI_PSEUDO, NULL) 191 == DDI_FAILURE) { 192 ddi_remove_minor_node(devi, NULL); 193 return (DDI_FAILURE); 194 } 195 dpdevi = devi; 196 return (DDI_SUCCESS); 197 } 198 199 static int 200 dpdetach(dev_info_t *devi, ddi_detach_cmd_t cmd) 201 { 202 if (cmd != DDI_DETACH) 203 return (DDI_FAILURE); 204 205 ddi_remove_minor_node(devi, NULL); 206 return (DDI_SUCCESS); 207 } 208 209 /* ARGSUSED */ 210 static int 211 dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 212 { 213 int error; 214 215 switch (infocmd) { 216 case DDI_INFO_DEVT2DEVINFO: 217 *result = (void *)dpdevi; 218 error = DDI_SUCCESS; 219 break; 220 case DDI_INFO_DEVT2INSTANCE: 221 *result = (void *)0; 222 error = DDI_SUCCESS; 223 break; 224 default: 225 error = DDI_FAILURE; 226 } 227 return (error); 228 } 229 230 /* 231 * dp_pcache_poll has similar logic to pcache_poll() in poll.c. The major 232 * differences are: (1) /dev/poll requires scanning the bitmap starting at 233 * where it was stopped last time, instead of always starting from 0, 234 * (2) since user may not have cleaned up the cached fds when they are 235 * closed, some polldats in cache may refer to closed or reused fds. We 236 * need to check for those cases. 237 * 238 * NOTE: Upon closing an fd, automatic poll cache cleanup is done for 239 * poll(2) caches but NOT for /dev/poll caches. So expect some 240 * stale entries! 241 */ 242 static int 243 dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, 244 pollcache_t *pcp, nfds_t nfds, int *fdcntp) 245 { 246 int start, ostart, end; 247 int fdcnt, fd; 248 boolean_t done; 249 file_t *fp; 250 short revent; 251 boolean_t no_wrap; 252 pollhead_t *php; 253 polldat_t *pdp; 254 pollfd_t *pfdp; 255 epoll_event_t *epoll; 256 int error = 0; 257 short mask = POLLRDHUP | POLLWRBAND; 258 259 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 260 if (pcp->pc_bitmap == NULL) { 261 /* 262 * No Need to search because no poll fd 263 * has been cached. 264 */ 265 return (error); 266 } 267 268 if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { 269 pfdp = NULL; 270 epoll = (epoll_event_t *)dpbuf; 271 } else { 272 pfdp = (pollfd_t *)dpbuf; 273 epoll = NULL; 274 } 275 retry: 276 start = ostart = pcp->pc_mapstart; 277 end = pcp->pc_mapend; 278 php = NULL; 279 280 if (start == 0) { 281 /* 282 * started from every begining, no need to wrap around. 283 */ 284 no_wrap = B_TRUE; 285 } else { 286 no_wrap = B_FALSE; 287 } 288 done = B_FALSE; 289 fdcnt = 0; 290 while ((fdcnt < nfds) && !done) { 291 php = NULL; 292 revent = 0; 293 /* 294 * Examine the bit map in a circular fashion 295 * to avoid starvation. Always resume from 296 * last stop. Scan till end of the map. Then 297 * wrap around. 298 */ 299 fd = bt_getlowbit(pcp->pc_bitmap, start, end); 300 ASSERT(fd <= end); 301 if (fd >= 0) { 302 if (fd == end) { 303 if (no_wrap) { 304 done = B_TRUE; 305 } else { 306 start = 0; 307 end = ostart - 1; 308 no_wrap = B_TRUE; 309 } 310 } else { 311 start = fd + 1; 312 } 313 pdp = pcache_lookup_fd(pcp, fd); 314 repoll: 315 ASSERT(pdp != NULL); 316 ASSERT(pdp->pd_fd == fd); 317 if (pdp->pd_fp == NULL) { 318 /* 319 * The fd is POLLREMOVed. This fd is 320 * logically no longer cached. So move 321 * on to the next one. 322 */ 323 continue; 324 } 325 if ((fp = getf(fd)) == NULL) { 326 /* 327 * The fd has been closed, but user has not 328 * done a POLLREMOVE on this fd yet. Instead 329 * of cleaning it here implicitly, we return 330 * POLLNVAL. This is consistent with poll(2) 331 * polling a closed fd. Hope this will remind 332 * user to do a POLLREMOVE. 333 */ 334 if (pfdp != NULL) { 335 pfdp[fdcnt].fd = fd; 336 pfdp[fdcnt].revents = POLLNVAL; 337 fdcnt++; 338 continue; 339 } 340 341 /* 342 * In the epoll compatibility case, we actually 343 * perform the implicit removal to remain 344 * closer to the epoll semantics. 345 */ 346 ASSERT(epoll != NULL); 347 348 pdp->pd_fp = NULL; 349 pdp->pd_events = 0; 350 351 if (php != NULL) { 352 pollhead_delete(php, pdp); 353 pdp->pd_php = NULL; 354 } 355 356 BT_CLEAR(pcp->pc_bitmap, fd); 357 continue; 358 } 359 360 if (fp != pdp->pd_fp) { 361 /* 362 * user is polling on a cached fd which was 363 * closed and then reused. Unfortunately 364 * there is no good way to inform user. 365 * If the file struct is also reused, we 366 * may not be able to detect the fd reuse 367 * at all. As long as this does not 368 * cause system failure and/or memory leak, 369 * we will play along. Man page states if 370 * user does not clean up closed fds, polling 371 * results will be indeterministic. 372 * 373 * XXX - perhaps log the detection of fd 374 * reuse? 375 */ 376 pdp->pd_fp = fp; 377 } 378 /* 379 * XXX - pollrelock() logic needs to know which 380 * which pollcache lock to grab. It'd be a 381 * cleaner solution if we could pass pcp as 382 * an arguement in VOP_POLL interface instead 383 * of implicitly passing it using thread_t 384 * struct. On the other hand, changing VOP_POLL 385 * interface will require all driver/file system 386 * poll routine to change. May want to revisit 387 * the tradeoff later. 388 */ 389 curthread->t_pollcache = pcp; 390 error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0, 391 &revent, &php, NULL); 392 curthread->t_pollcache = NULL; 393 releasef(fd); 394 if (error != 0) { 395 break; 396 } 397 /* 398 * layered devices (e.g. console driver) 399 * may change the vnode and thus the pollhead 400 * pointer out from underneath us. 401 */ 402 if (php != NULL && pdp->pd_php != NULL && 403 php != pdp->pd_php) { 404 pollhead_delete(pdp->pd_php, pdp); 405 pdp->pd_php = php; 406 pollhead_insert(php, pdp); 407 /* 408 * The bit should still be set. 409 */ 410 ASSERT(BT_TEST(pcp->pc_bitmap, fd)); 411 goto retry; 412 } 413 414 if (revent != 0) { 415 if (pfdp != NULL) { 416 pfdp[fdcnt].fd = fd; 417 pfdp[fdcnt].events = pdp->pd_events; 418 pfdp[fdcnt].revents = revent; 419 } else { 420 epoll_event_t *ep = &epoll[fdcnt]; 421 422 ASSERT(epoll != NULL); 423 ep->data.u64 = pdp->pd_epolldata; 424 425 /* 426 * If any of the event bits are set for 427 * which poll and epoll representations 428 * differ, swizzle in the native epoll 429 * values. 430 */ 431 if (revent & mask) { 432 ep->events = (revent & ~mask) | 433 ((revent & POLLRDHUP) ? 434 EPOLLRDHUP : 0) | 435 ((revent & POLLWRBAND) ? 436 EPOLLWRBAND : 0); 437 } else { 438 ep->events = revent; 439 } 440 441 /* 442 * We define POLLWRNORM to be POLLOUT, 443 * but epoll has separate definitions 444 * for them; if POLLOUT is set and the 445 * user has asked for EPOLLWRNORM, set 446 * that as well. 447 */ 448 if ((revent & POLLOUT) && 449 (pdp->pd_events & EPOLLWRNORM)) { 450 ep->events |= EPOLLWRNORM; 451 } 452 } 453 454 /* 455 * If POLLET is set, clear the bit in the 456 * bitmap -- which effectively latches the 457 * edge on a pollwakeup() from the driver. 458 */ 459 if (pdp->pd_events & POLLET) 460 BT_CLEAR(pcp->pc_bitmap, fd); 461 462 /* 463 * If POLLONESHOT is set, perform the implicit 464 * POLLREMOVE. 465 */ 466 if (pdp->pd_events & POLLONESHOT) { 467 pdp->pd_fp = NULL; 468 pdp->pd_events = 0; 469 470 if (php != NULL) { 471 pollhead_delete(php, pdp); 472 pdp->pd_php = NULL; 473 } 474 475 BT_CLEAR(pcp->pc_bitmap, fd); 476 } 477 478 fdcnt++; 479 } else if (php != NULL) { 480 /* 481 * We clear a bit or cache a poll fd if 482 * the driver returns a poll head ptr, 483 * which is expected in the case of 0 484 * revents. Some buggy driver may return 485 * NULL php pointer with 0 revents. In 486 * this case, we just treat the driver as 487 * "noncachable" and not clearing the bit 488 * in bitmap. 489 */ 490 if ((pdp->pd_php != NULL) && 491 ((pcp->pc_flag & PC_POLLWAKE) == 0)) { 492 BT_CLEAR(pcp->pc_bitmap, fd); 493 } 494 if (pdp->pd_php == NULL) { 495 pollhead_insert(php, pdp); 496 pdp->pd_php = php; 497 /* 498 * An event of interest may have 499 * arrived between the VOP_POLL() and 500 * the pollhead_insert(); check again. 501 */ 502 goto repoll; 503 } 504 } 505 } else { 506 /* 507 * No bit set in the range. Check for wrap around. 508 */ 509 if (!no_wrap) { 510 start = 0; 511 end = ostart - 1; 512 no_wrap = B_TRUE; 513 } else { 514 done = B_TRUE; 515 } 516 } 517 } 518 519 if (!done) { 520 pcp->pc_mapstart = start; 521 } 522 ASSERT(*fdcntp == 0); 523 *fdcntp = fdcnt; 524 return (error); 525 } 526 527 /*ARGSUSED*/ 528 static int 529 dpopen(dev_t *devp, int flag, int otyp, cred_t *credp) 530 { 531 minor_t minordev; 532 dp_entry_t *dpep; 533 pollcache_t *pcp; 534 535 ASSERT(devpoll_init); 536 ASSERT(dptblsize <= MAXMIN); 537 mutex_enter(&devpoll_lock); 538 for (minordev = 0; minordev < dptblsize; minordev++) { 539 if (devpolltbl[minordev] == NULL) { 540 devpolltbl[minordev] = (dp_entry_t *)RESERVED; 541 break; 542 } 543 } 544 if (minordev == dptblsize) { 545 dp_entry_t **newtbl; 546 size_t oldsize; 547 548 /* 549 * Used up every entry in the existing devpoll table. 550 * Grow the table by DEVPOLLSIZE. 551 */ 552 if ((oldsize = dptblsize) >= MAXMIN) { 553 mutex_exit(&devpoll_lock); 554 return (ENXIO); 555 } 556 dptblsize += DEVPOLLSIZE; 557 if (dptblsize > MAXMIN) { 558 dptblsize = MAXMIN; 559 } 560 newtbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP); 561 bcopy(devpolltbl, newtbl, sizeof (caddr_t) * oldsize); 562 kmem_free(devpolltbl, sizeof (caddr_t) * oldsize); 563 devpolltbl = newtbl; 564 devpolltbl[minordev] = (dp_entry_t *)RESERVED; 565 } 566 mutex_exit(&devpoll_lock); 567 568 dpep = kmem_zalloc(sizeof (dp_entry_t), KM_SLEEP); 569 /* 570 * allocate a pollcache skeleton here. Delay allocating bitmap 571 * structures until dpwrite() time, since we don't know the 572 * optimal size yet. We also delay setting the pid until either 573 * dpwrite() or attempt to poll on the instance, allowing parents 574 * to create instances of /dev/poll for their children. (In the 575 * epoll compatibility case, this check isn't performed to maintain 576 * semantic compatibility.) 577 */ 578 pcp = pcache_alloc(); 579 dpep->dpe_pcache = pcp; 580 pcp->pc_pid = -1; 581 *devp = makedevice(getmajor(*devp), minordev); /* clone the driver */ 582 mutex_enter(&devpoll_lock); 583 ASSERT(minordev < dptblsize); 584 ASSERT(devpolltbl[minordev] == (dp_entry_t *)RESERVED); 585 devpolltbl[minordev] = dpep; 586 mutex_exit(&devpoll_lock); 587 return (0); 588 } 589 590 /* 591 * Write to dev/poll add/remove fd's to/from a cached poll fd set, 592 * or change poll events for a watched fd. 593 */ 594 /*ARGSUSED*/ 595 static int 596 dpwrite(dev_t dev, struct uio *uiop, cred_t *credp) 597 { 598 minor_t minor; 599 dp_entry_t *dpep; 600 pollcache_t *pcp; 601 pollfd_t *pollfdp, *pfdp; 602 dvpoll_epollfd_t *epfdp; 603 uintptr_t limit; 604 int error, size; 605 ssize_t uiosize; 606 nfds_t pollfdnum; 607 struct pollhead *php = NULL; 608 polldat_t *pdp; 609 int fd; 610 file_t *fp; 611 612 minor = getminor(dev); 613 614 mutex_enter(&devpoll_lock); 615 ASSERT(minor < dptblsize); 616 dpep = devpolltbl[minor]; 617 ASSERT(dpep != NULL); 618 mutex_exit(&devpoll_lock); 619 pcp = dpep->dpe_pcache; 620 621 if (!(dpep->dpe_flag & DP_ISEPOLLCOMPAT) && 622 curproc->p_pid != pcp->pc_pid) { 623 if (pcp->pc_pid != -1) 624 return (EACCES); 625 626 pcp->pc_pid = curproc->p_pid; 627 } 628 629 if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { 630 size = sizeof (dvpoll_epollfd_t); 631 } else { 632 size = sizeof (pollfd_t); 633 } 634 635 uiosize = uiop->uio_resid; 636 pollfdnum = uiosize / size; 637 mutex_enter(&curproc->p_lock); 638 if (pollfdnum > (uint_t)rctl_enforced_value( 639 rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc)) { 640 (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], 641 curproc->p_rctls, curproc, RCA_SAFE); 642 mutex_exit(&curproc->p_lock); 643 return (set_errno(EINVAL)); 644 } 645 mutex_exit(&curproc->p_lock); 646 /* 647 * Copy in the pollfd array. Walk through the array and add 648 * each polled fd to the cached set. 649 */ 650 pollfdp = kmem_alloc(uiosize, KM_SLEEP); 651 limit = (uintptr_t)pollfdp + (pollfdnum * size); 652 653 /* 654 * Although /dev/poll uses the write(2) interface to cache fds, it's 655 * not supposed to function as a seekable device. To prevent offset 656 * from growing and eventually exceed the maximum, reset the offset 657 * here for every call. 658 */ 659 uiop->uio_loffset = 0; 660 if ((error = uiomove((caddr_t)pollfdp, uiosize, UIO_WRITE, uiop)) 661 != 0) { 662 kmem_free(pollfdp, uiosize); 663 return (error); 664 } 665 /* 666 * We are about to enter the core portion of dpwrite(). Make sure this 667 * write has exclusive access in this portion of the code, i.e., no 668 * other writers in this code and no other readers in dpioctl. 669 */ 670 mutex_enter(&dpep->dpe_lock); 671 dpep->dpe_writerwait++; 672 while (dpep->dpe_refcnt != 0) { 673 /* 674 * We need to do a bit of a dance here: we need to drop 675 * our dpe_lock and grab the pc_lock to broadcast the pc_cv to 676 * kick any DP_POLL/DP_PPOLL sleepers. 677 */ 678 mutex_exit(&dpep->dpe_lock); 679 mutex_enter(&pcp->pc_lock); 680 pcp->pc_flag |= PC_WRITEWANTED; 681 cv_broadcast(&pcp->pc_cv); 682 mutex_exit(&pcp->pc_lock); 683 mutex_enter(&dpep->dpe_lock); 684 685 if (dpep->dpe_refcnt == 0) 686 break; 687 688 if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { 689 dpep->dpe_writerwait--; 690 mutex_exit(&dpep->dpe_lock); 691 mutex_enter(&pcp->pc_lock); 692 pcp->pc_flag &= ~PC_WRITEWANTED; 693 mutex_exit(&pcp->pc_lock); 694 kmem_free(pollfdp, uiosize); 695 return (set_errno(EINTR)); 696 } 697 } 698 dpep->dpe_writerwait--; 699 dpep->dpe_flag |= DP_WRITER_PRESENT; 700 dpep->dpe_refcnt++; 701 702 mutex_exit(&dpep->dpe_lock); 703 704 mutex_enter(&pcp->pc_lock); 705 pcp->pc_flag &= ~PC_WRITEWANTED; 706 707 if (pcp->pc_bitmap == NULL) { 708 pcache_create(pcp, pollfdnum); 709 } 710 for (pfdp = pollfdp; (uintptr_t)pfdp < limit; 711 pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) { 712 fd = pfdp->fd; 713 if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) { 714 /* 715 * epoll semantics demand that we return EBADF if our 716 * specified fd is invalid. 717 */ 718 if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { 719 error = EBADF; 720 break; 721 } 722 723 continue; 724 } 725 726 pdp = pcache_lookup_fd(pcp, fd); 727 if (pfdp->events != POLLREMOVE) { 728 729 fp = NULL; 730 731 if (pdp == NULL) { 732 /* 733 * If we're in epoll compatibility mode, check 734 * that the fd is valid before allocating 735 * anything for it; epoll semantics demand that 736 * we return EBADF if our specified fd is 737 * invalid. 738 */ 739 if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { 740 if ((fp = getf(fd)) == NULL) { 741 error = EBADF; 742 break; 743 } 744 } 745 746 pdp = pcache_alloc_fd(0); 747 pdp->pd_fd = fd; 748 pdp->pd_pcache = pcp; 749 pcache_insert_fd(pcp, pdp, pollfdnum); 750 } else { 751 /* 752 * epoll semantics demand that we error out if 753 * a file descriptor is added twice, which we 754 * check (imperfectly) by checking if we both 755 * have the file descriptor cached and the 756 * file pointer that correponds to the file 757 * descriptor matches our cached value. If 758 * there is a pointer mismatch, the file 759 * descriptor was closed without being removed. 760 * The converse is clearly not true, however, 761 * so to narrow the window by which a spurious 762 * EEXIST may be returned, we also check if 763 * this fp has been added to an epoll control 764 * descriptor in the past; if it hasn't, we 765 * know that this is due to fp reuse -- it's 766 * not a true EEXIST case. (By performing this 767 * additional check, we limit the window of 768 * spurious EEXIST to situations where a single 769 * file descriptor is being used across two or 770 * more epoll control descriptors -- and even 771 * then, the file descriptor must be closed and 772 * reused in a relatively tight time span.) 773 */ 774 if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { 775 if (pdp->pd_fp != NULL && 776 (fp = getf(fd)) != NULL && 777 fp == pdp->pd_fp && 778 (fp->f_flag2 & FEPOLLED)) { 779 error = EEXIST; 780 releasef(fd); 781 break; 782 } 783 784 /* 785 * We have decided that the cached 786 * information was stale: it either 787 * didn't match, or the fp had never 788 * actually been epoll()'d on before. 789 * We need to now clear our pd_events 790 * to assure that we don't mistakenly 791 * operate on cached event disposition. 792 */ 793 pdp->pd_events = 0; 794 } 795 } 796 797 if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { 798 epfdp = (dvpoll_epollfd_t *)pfdp; 799 pdp->pd_epolldata = epfdp->dpep_data; 800 } 801 802 ASSERT(pdp->pd_fd == fd); 803 ASSERT(pdp->pd_pcache == pcp); 804 if (fd >= pcp->pc_mapsize) { 805 mutex_exit(&pcp->pc_lock); 806 pcache_grow_map(pcp, fd); 807 mutex_enter(&pcp->pc_lock); 808 } 809 if (fd > pcp->pc_mapend) { 810 pcp->pc_mapend = fd; 811 } 812 if (fp == NULL && (fp = getf(fd)) == NULL) { 813 /* 814 * The fd is not valid. Since we can't pass 815 * this error back in the write() call, set 816 * the bit in bitmap to force DP_POLL ioctl 817 * to examine it. 818 */ 819 BT_SET(pcp->pc_bitmap, fd); 820 pdp->pd_events |= pfdp->events; 821 continue; 822 } 823 824 /* 825 * To (greatly) reduce EEXIST false positives, we 826 * denote that this fp has been epoll()'d. We do this 827 * regardless of epoll compatibility mode, as the flag 828 * is harmless if not in epoll compatibility mode. 829 */ 830 fp->f_flag2 |= FEPOLLED; 831 832 /* 833 * Don't do VOP_POLL for an already cached fd with 834 * same poll events. 835 */ 836 if ((pdp->pd_events == pfdp->events) && 837 (pdp->pd_fp == fp)) { 838 /* 839 * the events are already cached 840 */ 841 releasef(fd); 842 continue; 843 } 844 845 /* 846 * do VOP_POLL and cache this poll fd. 847 */ 848 /* 849 * XXX - pollrelock() logic needs to know which 850 * which pollcache lock to grab. It'd be a 851 * cleaner solution if we could pass pcp as 852 * an arguement in VOP_POLL interface instead 853 * of implicitly passing it using thread_t 854 * struct. On the other hand, changing VOP_POLL 855 * interface will require all driver/file system 856 * poll routine to change. May want to revisit 857 * the tradeoff later. 858 */ 859 curthread->t_pollcache = pcp; 860 error = VOP_POLL(fp->f_vnode, pfdp->events, 0, 861 &pfdp->revents, &php, NULL); 862 curthread->t_pollcache = NULL; 863 /* 864 * We always set the bit when this fd is cached; 865 * this forces the first DP_POLL to poll this fd. 866 * Real performance gain comes from subsequent 867 * DP_POLL. We also attempt a pollhead_insert(); 868 * if it's not possible, we'll do it in dpioctl(). 869 */ 870 BT_SET(pcp->pc_bitmap, fd); 871 if (error != 0) { 872 releasef(fd); 873 break; 874 } 875 pdp->pd_fp = fp; 876 pdp->pd_events |= pfdp->events; 877 if (php != NULL) { 878 if (pdp->pd_php == NULL) { 879 pollhead_insert(php, pdp); 880 pdp->pd_php = php; 881 } else { 882 if (pdp->pd_php != php) { 883 pollhead_delete(pdp->pd_php, 884 pdp); 885 pollhead_insert(php, pdp); 886 pdp->pd_php = php; 887 } 888 } 889 890 } 891 releasef(fd); 892 } else { 893 if (pdp == NULL || pdp->pd_fp == NULL) { 894 if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { 895 /* 896 * As with the add case (above), epoll 897 * semantics demand that we error out 898 * in this case. 899 */ 900 error = ENOENT; 901 break; 902 } 903 904 continue; 905 } 906 ASSERT(pdp->pd_fd == fd); 907 pdp->pd_fp = NULL; 908 pdp->pd_events = 0; 909 ASSERT(pdp->pd_thread == NULL); 910 if (pdp->pd_php != NULL) { 911 pollhead_delete(pdp->pd_php, pdp); 912 pdp->pd_php = NULL; 913 } 914 BT_CLEAR(pcp->pc_bitmap, fd); 915 } 916 } 917 mutex_exit(&pcp->pc_lock); 918 mutex_enter(&dpep->dpe_lock); 919 dpep->dpe_flag &= ~DP_WRITER_PRESENT; 920 ASSERT(dpep->dpe_refcnt == 1); 921 dpep->dpe_refcnt--; 922 cv_broadcast(&dpep->dpe_cv); 923 mutex_exit(&dpep->dpe_lock); 924 kmem_free(pollfdp, uiosize); 925 return (error); 926 } 927 928 #define DP_SIGMASK_RESTORE(ksetp) { \ 929 if (ksetp != NULL) { \ 930 mutex_enter(&p->p_lock); \ 931 if (lwp->lwp_cursig == 0) { \ 932 t->t_hold = lwp->lwp_sigoldmask; \ 933 t->t_flag &= ~T_TOMASK; \ 934 } \ 935 mutex_exit(&p->p_lock); \ 936 } \ 937 } 938 939 /*ARGSUSED*/ 940 static int 941 dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) 942 { 943 minor_t minor; 944 dp_entry_t *dpep; 945 pollcache_t *pcp; 946 hrtime_t now; 947 int error = 0; 948 STRUCT_DECL(dvpoll, dvpoll); 949 950 if (cmd == DP_POLL || cmd == DP_PPOLL) { 951 /* do this now, before we sleep on DP_WRITER_PRESENT */ 952 now = gethrtime(); 953 } 954 955 minor = getminor(dev); 956 mutex_enter(&devpoll_lock); 957 ASSERT(minor < dptblsize); 958 dpep = devpolltbl[minor]; 959 mutex_exit(&devpoll_lock); 960 ASSERT(dpep != NULL); 961 pcp = dpep->dpe_pcache; 962 963 mutex_enter(&dpep->dpe_lock); 964 965 if (cmd == DP_EPOLLCOMPAT) { 966 if (dpep->dpe_refcnt != 0) { 967 /* 968 * We can't turn on epoll compatibility while there 969 * are outstanding operations. 970 */ 971 mutex_exit(&dpep->dpe_lock); 972 return (EBUSY); 973 } 974 975 /* 976 * epoll compatibility is a one-way street: there's no way 977 * to turn it off for a particular open. 978 */ 979 dpep->dpe_flag |= DP_ISEPOLLCOMPAT; 980 mutex_exit(&dpep->dpe_lock); 981 982 return (0); 983 } 984 985 if (!(dpep->dpe_flag & DP_ISEPOLLCOMPAT) && 986 curproc->p_pid != pcp->pc_pid) { 987 if (pcp->pc_pid != -1) { 988 mutex_exit(&dpep->dpe_lock); 989 return (EACCES); 990 } 991 992 pcp->pc_pid = curproc->p_pid; 993 } 994 995 while ((dpep->dpe_flag & DP_WRITER_PRESENT) || 996 (dpep->dpe_writerwait != 0)) { 997 if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) { 998 mutex_exit(&dpep->dpe_lock); 999 return (EINTR); 1000 } 1001 } 1002 dpep->dpe_refcnt++; 1003 mutex_exit(&dpep->dpe_lock); 1004 1005 switch (cmd) { 1006 case DP_POLL: 1007 case DP_PPOLL: 1008 { 1009 pollstate_t *ps; 1010 nfds_t nfds; 1011 int fdcnt = 0; 1012 size_t size, fdsize, dpsize; 1013 hrtime_t deadline = 0; 1014 k_sigset_t *ksetp = NULL; 1015 k_sigset_t kset; 1016 sigset_t set; 1017 kthread_t *t = curthread; 1018 klwp_t *lwp = ttolwp(t); 1019 struct proc *p = ttoproc(curthread); 1020 1021 STRUCT_INIT(dvpoll, mode); 1022 1023 /* 1024 * The dp_setp member is only required/consumed for DP_PPOLL, 1025 * which otherwise uses the same structure as DP_POLL. 1026 */ 1027 if (cmd == DP_POLL) { 1028 dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) - 1029 (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds); 1030 } else { 1031 ASSERT(cmd == DP_PPOLL); 1032 dpsize = STRUCT_SIZE(dvpoll); 1033 } 1034 1035 if ((mode & FKIOCTL) != 0) { 1036 /* Kernel-internal ioctl call */ 1037 bcopy((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize); 1038 error = 0; 1039 } else { 1040 error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll), 1041 dpsize); 1042 } 1043 1044 if (error) { 1045 DP_REFRELE(dpep); 1046 return (EFAULT); 1047 } 1048 1049 deadline = STRUCT_FGET(dvpoll, dp_timeout); 1050 if (deadline > 0) { 1051 /* 1052 * Convert the deadline from relative milliseconds 1053 * to absolute nanoseconds. They must wait for at 1054 * least a tick. 1055 */ 1056 deadline = MSEC2NSEC(deadline); 1057 deadline = MAX(deadline, nsec_per_tick); 1058 deadline += now; 1059 } 1060 1061 if (cmd == DP_PPOLL) { 1062 void *setp = STRUCT_FGETP(dvpoll, dp_setp); 1063 1064 if (setp != NULL) { 1065 if (copyin(setp, &set, sizeof (set))) { 1066 DP_REFRELE(dpep); 1067 return (EFAULT); 1068 } 1069 1070 sigutok(&set, &kset); 1071 ksetp = &kset; 1072 1073 mutex_enter(&p->p_lock); 1074 schedctl_finish_sigblock(t); 1075 lwp->lwp_sigoldmask = t->t_hold; 1076 t->t_hold = *ksetp; 1077 t->t_flag |= T_TOMASK; 1078 1079 /* 1080 * Like ppoll() with a non-NULL sigset, we'll 1081 * call cv_reltimedwait_sig() just to check for 1082 * signals. This call will return immediately 1083 * with either 0 (signalled) or -1 (no signal). 1084 * There are some conditions whereby we can 1085 * get 0 from cv_reltimedwait_sig() without 1086 * a true signal (e.g., a directed stop), so 1087 * we restore our signal mask in the unlikely 1088 * event that lwp_cursig is 0. 1089 */ 1090 if (!cv_reltimedwait_sig(&t->t_delay_cv, 1091 &p->p_lock, 0, TR_CLOCK_TICK)) { 1092 if (lwp->lwp_cursig == 0) { 1093 t->t_hold = lwp->lwp_sigoldmask; 1094 t->t_flag &= ~T_TOMASK; 1095 } 1096 1097 mutex_exit(&p->p_lock); 1098 1099 DP_REFRELE(dpep); 1100 return (EINTR); 1101 } 1102 1103 mutex_exit(&p->p_lock); 1104 } 1105 } 1106 1107 if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) { 1108 /* 1109 * We are just using DP_POLL to sleep, so 1110 * we don't any of the devpoll apparatus. 1111 * Do not check for signals if we have a zero timeout. 1112 */ 1113 DP_REFRELE(dpep); 1114 if (deadline == 0) { 1115 DP_SIGMASK_RESTORE(ksetp); 1116 return (0); 1117 } 1118 1119 mutex_enter(&curthread->t_delay_lock); 1120 while ((error = 1121 cv_timedwait_sig_hrtime(&curthread->t_delay_cv, 1122 &curthread->t_delay_lock, deadline)) > 0) 1123 continue; 1124 mutex_exit(&curthread->t_delay_lock); 1125 1126 DP_SIGMASK_RESTORE(ksetp); 1127 1128 return (error == 0 ? EINTR : 0); 1129 } 1130 1131 if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { 1132 size = nfds * (fdsize = sizeof (epoll_event_t)); 1133 } else { 1134 size = nfds * (fdsize = sizeof (pollfd_t)); 1135 } 1136 1137 /* 1138 * XXX It would be nice not to have to alloc each time, but it 1139 * requires another per thread structure hook. This can be 1140 * implemented later if data suggests that it's necessary. 1141 */ 1142 if ((ps = curthread->t_pollstate) == NULL) { 1143 curthread->t_pollstate = pollstate_create(); 1144 ps = curthread->t_pollstate; 1145 } 1146 1147 if (ps->ps_dpbufsize < size) { 1148 /* 1149 * If nfds is larger than twice the current maximum 1150 * open file count, we'll silently clamp it. This 1151 * only limits our exposure to allocating an 1152 * inordinate amount of kernel memory; it doesn't 1153 * otherwise affect the semantics. (We have this 1154 * check at twice the maximum instead of merely the 1155 * maximum because some applications pass an nfds that 1156 * is only slightly larger than their limit.) 1157 */ 1158 mutex_enter(&p->p_lock); 1159 if ((nfds >> 1) > p->p_fno_ctl) { 1160 nfds = p->p_fno_ctl; 1161 size = nfds * fdsize; 1162 } 1163 mutex_exit(&p->p_lock); 1164 1165 if (ps->ps_dpbufsize < size) { 1166 kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); 1167 ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP); 1168 ps->ps_dpbufsize = size; 1169 } 1170 } 1171 1172 mutex_enter(&pcp->pc_lock); 1173 for (;;) { 1174 pcp->pc_flag &= ~PC_POLLWAKE; 1175 1176 error = dp_pcache_poll(dpep, ps->ps_dpbuf, 1177 pcp, nfds, &fdcnt); 1178 if (fdcnt > 0 || error != 0) 1179 break; 1180 1181 /* 1182 * A pollwake has happened since we polled cache. 1183 */ 1184 if (pcp->pc_flag & PC_POLLWAKE) 1185 continue; 1186 1187 /* 1188 * Sleep until we are notified, signaled, or timed out. 1189 */ 1190 if (deadline == 0) { 1191 /* immediate timeout; do not check signals */ 1192 break; 1193 } 1194 1195 if (!(pcp->pc_flag & PC_WRITEWANTED)) { 1196 error = cv_timedwait_sig_hrtime(&pcp->pc_cv, 1197 &pcp->pc_lock, deadline); 1198 } else { 1199 error = 1; 1200 } 1201 1202 if (error > 0 && (pcp->pc_flag & PC_WRITEWANTED)) { 1203 /* 1204 * We've been kicked off of our cv because a 1205 * writer wants in. We're going to drop our 1206 * reference count and then wait until the 1207 * writer is gone -- at which point we'll 1208 * reacquire the pc_lock and call into 1209 * dp_pcache_poll() to get the updated state. 1210 */ 1211 mutex_exit(&pcp->pc_lock); 1212 1213 mutex_enter(&dpep->dpe_lock); 1214 dpep->dpe_refcnt--; 1215 cv_broadcast(&dpep->dpe_cv); 1216 1217 while ((dpep->dpe_flag & DP_WRITER_PRESENT) || 1218 (dpep->dpe_writerwait != 0)) { 1219 error = cv_wait_sig_swap(&dpep->dpe_cv, 1220 &dpep->dpe_lock); 1221 } 1222 1223 dpep->dpe_refcnt++; 1224 mutex_exit(&dpep->dpe_lock); 1225 mutex_enter(&pcp->pc_lock); 1226 } 1227 1228 /* 1229 * If we were awakened by a signal or timeout 1230 * then break the loop, else poll again. 1231 */ 1232 if (error <= 0) { 1233 error = (error == 0) ? EINTR : 0; 1234 break; 1235 } else { 1236 error = 0; 1237 } 1238 } 1239 mutex_exit(&pcp->pc_lock); 1240 1241 DP_SIGMASK_RESTORE(ksetp); 1242 1243 if (error == 0 && fdcnt > 0) { 1244 if (copyout(ps->ps_dpbuf, 1245 STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) { 1246 DP_REFRELE(dpep); 1247 return (EFAULT); 1248 } 1249 *rvalp = fdcnt; 1250 } 1251 break; 1252 } 1253 1254 case DP_ISPOLLED: 1255 { 1256 pollfd_t pollfd; 1257 polldat_t *pdp; 1258 1259 STRUCT_INIT(dvpoll, mode); 1260 error = copyin((caddr_t)arg, &pollfd, sizeof (pollfd_t)); 1261 if (error) { 1262 DP_REFRELE(dpep); 1263 return (EFAULT); 1264 } 1265 mutex_enter(&pcp->pc_lock); 1266 if (pcp->pc_hash == NULL) { 1267 /* 1268 * No Need to search because no poll fd 1269 * has been cached. 1270 */ 1271 mutex_exit(&pcp->pc_lock); 1272 DP_REFRELE(dpep); 1273 return (0); 1274 } 1275 if (pollfd.fd < 0) { 1276 mutex_exit(&pcp->pc_lock); 1277 break; 1278 } 1279 pdp = pcache_lookup_fd(pcp, pollfd.fd); 1280 if ((pdp != NULL) && (pdp->pd_fd == pollfd.fd) && 1281 (pdp->pd_fp != NULL)) { 1282 pollfd.revents = pdp->pd_events; 1283 if (copyout(&pollfd, (caddr_t)arg, sizeof (pollfd_t))) { 1284 mutex_exit(&pcp->pc_lock); 1285 DP_REFRELE(dpep); 1286 return (EFAULT); 1287 } 1288 *rvalp = 1; 1289 } 1290 mutex_exit(&pcp->pc_lock); 1291 break; 1292 } 1293 1294 default: 1295 DP_REFRELE(dpep); 1296 return (EINVAL); 1297 } 1298 DP_REFRELE(dpep); 1299 return (error); 1300 } 1301 1302 /*ARGSUSED*/ 1303 static int 1304 dppoll(dev_t dev, short events, int anyyet, short *reventsp, 1305 struct pollhead **phpp) 1306 { 1307 minor_t minor; 1308 dp_entry_t *dpep; 1309 1310 minor = getminor(dev); 1311 1312 mutex_enter(&devpoll_lock); 1313 dpep = devpolltbl[minor]; 1314 ASSERT(dpep != NULL); 1315 mutex_exit(&devpoll_lock); 1316 1317 /* 1318 * Polling on a /dev/poll fd is not fully supported yet. 1319 */ 1320 if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) { 1321 /* no error in epoll compat. mode */ 1322 *reventsp = 0; 1323 } else { 1324 *reventsp = POLLERR; 1325 } 1326 return (0); 1327 } 1328 1329 /* 1330 * devpoll close should do enough clean up before the pollcache is deleted, 1331 * i.e., it should ensure no one still references the pollcache later. 1332 * There is no "permission" check in here. Any process having the last 1333 * reference of this /dev/poll fd can close. 1334 */ 1335 /*ARGSUSED*/ 1336 static int 1337 dpclose(dev_t dev, int flag, int otyp, cred_t *credp) 1338 { 1339 minor_t minor; 1340 dp_entry_t *dpep; 1341 pollcache_t *pcp; 1342 int i; 1343 polldat_t **hashtbl; 1344 polldat_t *pdp; 1345 1346 minor = getminor(dev); 1347 1348 mutex_enter(&devpoll_lock); 1349 dpep = devpolltbl[minor]; 1350 ASSERT(dpep != NULL); 1351 devpolltbl[minor] = NULL; 1352 mutex_exit(&devpoll_lock); 1353 pcp = dpep->dpe_pcache; 1354 ASSERT(pcp != NULL); 1355 /* 1356 * At this point, no other lwp can access this pollcache via the 1357 * /dev/poll fd. This pollcache is going away, so do the clean 1358 * up without the pc_lock. 1359 */ 1360 hashtbl = pcp->pc_hash; 1361 for (i = 0; i < pcp->pc_hashsize; i++) { 1362 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 1363 if (pdp->pd_php != NULL) { 1364 pollhead_delete(pdp->pd_php, pdp); 1365 pdp->pd_php = NULL; 1366 pdp->pd_fp = NULL; 1367 } 1368 } 1369 } 1370 /* 1371 * pollwakeup() may still interact with this pollcache. Wait until 1372 * it is done. 1373 */ 1374 mutex_enter(&pcp->pc_no_exit); 1375 ASSERT(pcp->pc_busy >= 0); 1376 while (pcp->pc_busy > 0) 1377 cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit); 1378 mutex_exit(&pcp->pc_no_exit); 1379 pcache_destroy(pcp); 1380 ASSERT(dpep->dpe_refcnt == 0); 1381 kmem_free(dpep, sizeof (dp_entry_t)); 1382 return (0); 1383 } 1384