1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 2011 by Delphix. All rights reserved. */ 28 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 29 /* All Rights Reserved */ 30 31 /* 32 * Portions of this source code were derived from Berkeley 4.3 BSD 33 * under license from the Regents of the University of California. 34 */ 35 36 #include <sys/param.h> 37 #include <sys/isa_defs.h> 38 #include <sys/types.h> 39 #include <sys/sysmacros.h> 40 #include <sys/user.h> 41 #include <sys/systm.h> 42 #include <sys/errno.h> 43 #include <sys/time.h> 44 #include <sys/vnode.h> 45 #include <sys/file.h> 46 #include <sys/mode.h> 47 #include <sys/proc.h> 48 #include <sys/uio.h> 49 #include <sys/poll_impl.h> 50 #include <sys/kmem.h> 51 #include <sys/cmn_err.h> 52 #include <sys/debug.h> 53 #include <sys/bitmap.h> 54 #include <sys/kstat.h> 55 #include <sys/rctl.h> 56 #include <sys/port_impl.h> 57 #include <sys/schedctl.h> 58 #include <sys/cpu.h> 59 60 #define NPHLOCKS 64 /* Number of locks; must be power of 2 */ 61 #define PHLOCKADDR(php) &plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)] 62 #define PHLOCK(php) PHLOCKADDR(php).pp_lock 63 #define PH_ENTER(php) mutex_enter(PHLOCK(php)) 64 #define PH_EXIT(php) mutex_exit(PHLOCK(php)) 65 #define VALID_POLL_EVENTS (POLLIN | POLLPRI | POLLOUT | POLLRDNORM \ 66 | POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL) 67 68 /* 69 * global counters to collect some stats 70 */ 71 static struct { 72 kstat_named_t polllistmiss; /* failed to find a cached poll list */ 73 kstat_named_t pollcachehit; /* list matched 100% w/ cached one */ 74 kstat_named_t pollcachephit; /* list matched < 100% w/ cached one */ 75 kstat_named_t pollcachemiss; /* every list entry is dif from cache */ 76 } pollstats = { 77 { "polllistmiss", KSTAT_DATA_UINT64 }, 78 { "pollcachehit", KSTAT_DATA_UINT64 }, 79 { "pollcachephit", KSTAT_DATA_UINT64 }, 80 { "pollcachemiss", KSTAT_DATA_UINT64 } 81 }; 82 83 kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats; 84 uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t); 85 86 struct pplock { 87 kmutex_t pp_lock; 88 short pp_flag; 89 kcondvar_t pp_wait_cv; 90 int32_t pp_pad; /* to a nice round 16 bytes */ 91 }; 92 93 static struct pplock plocks[NPHLOCKS]; /* Hash array of pollhead locks */ 94 95 #ifdef DEBUG 96 static int pollchecksanity(pollstate_t *, nfds_t); 97 static int pollcheckxref(pollstate_t *, int); 98 static void pollcheckphlist(void); 99 static int pollcheckrevents(pollstate_t *, int, int, int); 100 static void checkpolldat(pollstate_t *); 101 #endif /* DEBUG */ 102 static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int, 103 int *); 104 105 /* 106 * Data structure overview: 107 * The per-thread poll state consists of 108 * one pollstate_t 109 * one pollcache_t 110 * one bitmap with one event bit per fd 111 * a (two-dimensional) hashed array of polldat_t structures - one entry 112 * per fd 113 * 114 * This conglomerate of data structures interact with 115 * the pollhead which is used by VOP_POLL and pollwakeup 116 * (protected by the PHLOCK, cached array of plocks), and 117 * the fpollinfo list hanging off the fi_list which is used to notify 118 * poll when a cached fd is closed. This is protected by uf_lock. 119 * 120 * Invariants: 121 * pd_php (pollhead pointer) is set iff (if and only if) the polldat 122 * is on that pollhead. This is modified atomically under pc_lock. 123 * 124 * pd_fp (file_t pointer) is set iff the thread is on the fpollinfo 125 * list for that open file. 126 * This is modified atomically under pc_lock. 127 * 128 * pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt. 129 * Iff pd_ref[i].xf_refcnt >= 1 then 130 * ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd 131 * Iff pd_ref[i].xf_refcnt > 1 then 132 * In ps_pcacheset[i].pcs_pollfd between index 133 * pd_ref[i].xf_position] and the end of the list 134 * there are xf_refcnt entries with .fd == pd_fd 135 * 136 * Locking design: 137 * Whenever possible the design relies on the fact that the poll cache state 138 * is per thread thus for both poll and exit it is self-synchronizing. 139 * Thus the key interactions where other threads access the state are: 140 * pollwakeup (and polltime), and 141 * close cleaning up the cached references to an open file 142 * 143 * The two key locks in poll proper is ps_lock and pc_lock. 144 * 145 * The ps_lock is used for synchronization between poll, (lwp_)exit and close 146 * to ensure that modifications to pollcacheset structure are serialized. 147 * This lock is held through most of poll() except where poll sleeps 148 * since there is little need to handle closes concurrently with the execution 149 * of poll. 150 * The pc_lock protects most of the fields in pollcache structure and polldat 151 * structures (which are accessed by poll, pollwakeup, and polltime) 152 * with the exception of fields that are only modified when only one thread 153 * can access this per-thread state. 154 * Those exceptions occur in poll when first allocating the per-thread state, 155 * when poll grows the number of polldat (never shrinks), and when 156 * exit/pollcleanup has ensured that there are no references from either 157 * pollheads or fpollinfo to the threads poll state. 158 * 159 * Poll(2) system call is the only path which ps_lock and pc_lock are both 160 * held, in that order. It needs ps_lock to synchronize with close and 161 * lwp_exit; and pc_lock with pollwakeup. 162 * 163 * The locking interaction between pc_lock and PHLOCK take into account 164 * that poll acquires these locks in the order of pc_lock and then PHLOCK 165 * while pollwakeup does it in the reverse order. Thus pollwakeup implements 166 * deadlock avoidance by dropping the locks and reacquiring them in the 167 * reverse order. For this to work pollwakeup needs to prevent the thread 168 * from exiting and freeing all of the poll related state. Thus is done 169 * using 170 * the pc_no_exit lock 171 * the pc_busy counter 172 * the pc_busy_cv condition variable 173 * 174 * The locking interaction between pc_lock and uf_lock has similar 175 * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef 176 * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock 177 * to prevent poll or exit from doing a delfpollinfo after which the thread 178 * might exit. But the cleanup needs to acquire pc_lock when modifying 179 * the poll cache state. The solution is to use pc_busy and do the close 180 * cleanup in two phases: 181 * First close calls pollblockexit which increments pc_busy. 182 * This prevents the per-thread poll related state from being freed. 183 * Then close drops uf_lock and calls pollcacheclean. 184 * This routine can then acquire pc_lock and remove any references 185 * to the closing fd (as well as recording that it has been closed 186 * so that a POLLNVAL can be generated even if the fd is reused before 187 * poll has been woken up and checked getf() again). 188 * 189 * When removing a polled fd from poll cache, the fd is always removed 190 * from pollhead list first and then from fpollinfo list, i.e., 191 * pollhead_delete() is called before delfpollinfo(). 192 * 193 * 194 * Locking hierarchy: 195 * pc_no_exit is a leaf level lock. 196 * ps_lock is held when acquiring pc_lock (except when pollwakeup 197 * acquires pc_lock). 198 * pc_lock might be held when acquiring PHLOCK (pollhead_insert/ 199 * pollhead_delete) 200 * pc_lock is always held (but this is not required) 201 * when acquiring PHLOCK (in polladd/pollhead_delete and pollwakeup called 202 * from pcache_clean_entry). 203 * pc_lock is held across addfpollinfo/delfpollinfo which acquire 204 * uf_lock. 205 * pc_lock is held across getf/releasef which acquire uf_lock. 206 * ps_lock might be held across getf/releasef which acquire uf_lock. 207 * pollwakeup tries to acquire pc_lock while holding PHLOCK 208 * but drops the locks and reacquire them in reverse order to avoid 209 * deadlock. 210 * 211 * Note also that there is deadlock avoidance support for VOP_POLL routines 212 * and pollwakeup involving a file system or driver lock. 213 * See below. 214 */ 215 216 /* 217 * Deadlock avoidance support for VOP_POLL() routines. This is 218 * sometimes necessary to prevent deadlock between polling threads 219 * (which hold poll locks on entry to xx_poll(), then acquire foo) 220 * and pollwakeup() threads (which hold foo, then acquire poll locks). 221 * 222 * pollunlock(void) releases whatever poll locks the current thread holds, 223 * returning a cookie for use by pollrelock(); 224 * 225 * pollrelock(cookie) reacquires previously dropped poll locks; 226 * 227 * polllock(php, mutex) does the common case: pollunlock(), 228 * acquire the problematic mutex, pollrelock(). 229 */ 230 int 231 pollunlock(void) 232 { 233 pollcache_t *pcp; 234 int lockstate = 0; 235 236 /* 237 * t_pollcache is set by /dev/poll and event ports (port_fd.c). 238 * If the pollrelock/pollunlock is called as a result of poll(2), 239 * the t_pollcache should be NULL. 240 */ 241 if (curthread->t_pollcache == NULL) 242 pcp = curthread->t_pollstate->ps_pcache; 243 else 244 pcp = curthread->t_pollcache; 245 246 if (mutex_owned(&pcp->pc_lock)) { 247 lockstate = 1; 248 mutex_exit(&pcp->pc_lock); 249 } 250 return (lockstate); 251 } 252 253 void 254 pollrelock(int lockstate) 255 { 256 pollcache_t *pcp; 257 258 /* 259 * t_pollcache is set by /dev/poll and event ports (port_fd.c). 260 * If the pollrelock/pollunlock is called as a result of poll(2), 261 * the t_pollcache should be NULL. 262 */ 263 if (curthread->t_pollcache == NULL) 264 pcp = curthread->t_pollstate->ps_pcache; 265 else 266 pcp = curthread->t_pollcache; 267 268 if (lockstate > 0) 269 mutex_enter(&pcp->pc_lock); 270 } 271 272 /* ARGSUSED */ 273 void 274 polllock(pollhead_t *php, kmutex_t *lp) 275 { 276 if (!mutex_tryenter(lp)) { 277 int lockstate = pollunlock(); 278 mutex_enter(lp); 279 pollrelock(lockstate); 280 } 281 } 282 283 static int 284 poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) 285 { 286 kthread_t *t = curthread; 287 klwp_t *lwp = ttolwp(t); 288 proc_t *p = ttoproc(t); 289 int fdcnt = 0; 290 int i; 291 int imm_timeout = 0; 292 clock_t *deltap = NULL; 293 clock_t delta; 294 pollfd_t *pollfdp; 295 pollstate_t *ps; 296 pollcache_t *pcp; 297 int error = 0; 298 nfds_t old_nfds; 299 int cacheindex = 0; /* which cache set is used */ 300 301 /* 302 * Determine the precise future time of the requested timeout, if any. 303 */ 304 if (tsp != NULL) { 305 if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { 306 imm_timeout = 1; 307 } else { 308 /* 309 * cv_relwaituntil_sig operates at 310 * the tick granularity, which by default is 10 ms. 311 * Convert the specified timespec to ticks, rounding 312 * up to at least 1 tick to avoid flooding the 313 * system with small high resolution timers. 314 */ 315 delta = SEC_TO_TICK(tsp->tv_sec) + 316 NSEC_TO_TICK(tsp->tv_nsec); 317 if (delta < 1) { 318 delta = 1; 319 } 320 deltap = δ 321 } 322 } 323 324 /* 325 * Reset our signal mask, if requested. 326 */ 327 if (ksetp != NULL) { 328 mutex_enter(&p->p_lock); 329 schedctl_finish_sigblock(t); 330 lwp->lwp_sigoldmask = t->t_hold; 331 t->t_hold = *ksetp; 332 t->t_flag |= T_TOMASK; 333 /* 334 * Call cv_reltimedwait_sig() just to check for signals. 335 * We will return immediately with either 0 or -1. 336 */ 337 if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, 338 TR_CLOCK_TICK)) { 339 mutex_exit(&p->p_lock); 340 error = EINTR; 341 goto pollout; 342 } 343 mutex_exit(&p->p_lock); 344 } 345 346 /* 347 * Check to see if this guy just wants to use poll() as a timeout. 348 * If yes then bypass all the other stuff and make him sleep. 349 */ 350 if (nfds == 0) { 351 /* 352 * Sleep until we have passed the requested future 353 * time or until interrupted by a signal. 354 * Do not check for signals if we have a zero timeout. 355 */ 356 if (!imm_timeout) { 357 mutex_enter(&t->t_delay_lock); 358 while ((delta = cv_relwaituntil_sig(&t->t_delay_cv, 359 &t->t_delay_lock, deltap, TR_MILLISEC)) > 0) 360 continue; 361 mutex_exit(&t->t_delay_lock); 362 if (delta == 0) 363 error = EINTR; 364 } 365 goto pollout; 366 } 367 368 if (nfds > p->p_fno_ctl) { 369 mutex_enter(&p->p_lock); 370 (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], 371 p->p_rctls, p, RCA_SAFE); 372 mutex_exit(&p->p_lock); 373 error = EINVAL; 374 goto pollout; 375 } 376 377 /* 378 * Need to allocate memory for pollstate before anything because 379 * the mutex and cv are created in this space 380 */ 381 if ((ps = t->t_pollstate) == NULL) { 382 t->t_pollstate = pollstate_create(); 383 ps = t->t_pollstate; 384 } 385 386 if (ps->ps_pcache == NULL) 387 ps->ps_pcache = pcache_alloc(); 388 pcp = ps->ps_pcache; 389 390 /* 391 * NOTE: for performance, buffers are saved across poll() calls. 392 * The theory is that if a process polls heavily, it tends to poll 393 * on the same set of descriptors. Therefore, we only reallocate 394 * buffers when nfds changes. There is no hysteresis control, 395 * because there is no data to suggest that this is necessary; 396 * the penalty of reallocating is not *that* great in any event. 397 */ 398 old_nfds = ps->ps_nfds; 399 if (nfds != old_nfds) { 400 401 kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); 402 pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); 403 ps->ps_pollfd = pollfdp; 404 ps->ps_nfds = nfds; 405 } 406 407 pollfdp = ps->ps_pollfd; 408 if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { 409 error = EFAULT; 410 goto pollout; 411 } 412 413 if (fds == NULL) { 414 /* 415 * If the process has page 0 mapped, then the copyin() above 416 * will succeed even if fds is NULL. However, our cached 417 * poll lists are keyed by the address of the passed-in fds 418 * structure, and we use the value NULL to indicate an unused 419 * poll cache list entry. As such, we elect not to support 420 * NULL as a valid (user) memory address and fail the poll() 421 * call. 422 */ 423 error = EINVAL; 424 goto pollout; 425 } 426 427 /* 428 * If this thread polls for the first time, allocate ALL poll 429 * cache data structures and cache the poll fd list. This 430 * allocation is delayed till now because lwp's polling 0 fd 431 * (i.e. using poll as timeout()) don't need this memory. 432 */ 433 mutex_enter(&ps->ps_lock); 434 pcp = ps->ps_pcache; 435 ASSERT(pcp != NULL); 436 if (pcp->pc_bitmap == NULL) { 437 pcache_create(pcp, nfds); 438 /* 439 * poll and cache this poll fd list in ps_pcacheset[0]. 440 */ 441 error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex); 442 if (fdcnt || error) { 443 mutex_exit(&ps->ps_lock); 444 goto pollout; 445 } 446 } else { 447 pollcacheset_t *pcset = ps->ps_pcacheset; 448 449 /* 450 * Not first time polling. Select a cached poll list by 451 * matching user pollfd list buffer address. 452 */ 453 for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) { 454 if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) { 455 if ((++pcset[cacheindex].pcs_count) == 0) { 456 /* 457 * counter is wrapping around. 458 */ 459 pcacheset_reset_count(ps, cacheindex); 460 } 461 /* 462 * examine and resolve possible 463 * difference of the current poll 464 * list and previously cached one. 465 * If there is an error during resolve(), 466 * the callee will guarantee the consistency 467 * of cached poll list and cache content. 468 */ 469 error = pcacheset_resolve(ps, nfds, &fdcnt, 470 cacheindex); 471 if (error) { 472 mutex_exit(&ps->ps_lock); 473 goto pollout; 474 } 475 break; 476 } 477 478 /* 479 * Note that pcs_usradr field of an used entry won't be 480 * NULL because it stores the address of passed-in fds, 481 * and NULL fds will not be cached (Then it is either 482 * the special timeout case when nfds is 0 or it returns 483 * failure directly). 484 */ 485 if (pcset[cacheindex].pcs_usradr == NULL) { 486 /* 487 * found an unused entry. Use it to cache 488 * this poll list. 489 */ 490 error = pcacheset_cache_list(ps, fds, &fdcnt, 491 cacheindex); 492 if (fdcnt || error) { 493 mutex_exit(&ps->ps_lock); 494 goto pollout; 495 } 496 break; 497 } 498 } 499 if (cacheindex == ps->ps_nsets) { 500 /* 501 * We failed to find a matching cached poll fd list. 502 * replace an old list. 503 */ 504 pollstats.polllistmiss.value.ui64++; 505 cacheindex = pcacheset_replace(ps); 506 ASSERT(cacheindex < ps->ps_nsets); 507 pcset[cacheindex].pcs_usradr = (uintptr_t)fds; 508 error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex); 509 if (error) { 510 mutex_exit(&ps->ps_lock); 511 goto pollout; 512 } 513 } 514 } 515 516 /* 517 * Always scan the bitmap with the lock on the pollcache held. 518 * This is to make sure that a wakeup does not come undetected. 519 * If the lock is not held, a pollwakeup could have come for an 520 * fd we already checked but before this thread sleeps, in which 521 * case the wakeup is missed. Now we hold the pcache lock and 522 * check the bitmap again. This will prevent wakeup from happening 523 * while we hold pcache lock since pollwakeup() will also lock 524 * the pcache before updating poll bitmap. 525 */ 526 mutex_enter(&pcp->pc_lock); 527 for (;;) { 528 pcp->pc_flag = 0; 529 error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex); 530 if (fdcnt || error) { 531 mutex_exit(&pcp->pc_lock); 532 mutex_exit(&ps->ps_lock); 533 break; 534 } 535 536 /* 537 * If T_POLLWAKE is set, a pollwakeup() was performed on 538 * one of the file descriptors. This can happen only if 539 * one of the VOP_POLL() functions dropped pcp->pc_lock. 540 * The only current cases of this is in procfs (prpoll()) 541 * and STREAMS (strpoll()). 542 */ 543 if (pcp->pc_flag & T_POLLWAKE) 544 continue; 545 546 /* 547 * If you get here, the poll of fds was unsuccessful. 548 * Wait until some fd becomes readable, writable, or gets 549 * an exception, or until a signal or a timeout occurs. 550 * Do not check for signals if we have a zero timeout. 551 */ 552 mutex_exit(&ps->ps_lock); 553 if (imm_timeout) { 554 delta = -1; 555 } else { 556 delta = cv_relwaituntil_sig(&pcp->pc_cv, &pcp->pc_lock, 557 deltap, TR_MILLISEC); 558 } 559 mutex_exit(&pcp->pc_lock); 560 /* 561 * If we have received a signal or timed out 562 * then break out and return. 563 */ 564 if (delta <= 0) { 565 if (delta == 0) 566 error = EINTR; 567 break; 568 } 569 /* 570 * We have not received a signal or timed out. 571 * Continue around and poll fds again. 572 */ 573 mutex_enter(&ps->ps_lock); 574 mutex_enter(&pcp->pc_lock); 575 } 576 577 pollout: 578 /* 579 * If we changed the signal mask but we received 580 * no signal then restore the signal mask. 581 * Otherwise psig() will deal with the signal mask. 582 */ 583 if (ksetp != NULL) { 584 mutex_enter(&p->p_lock); 585 if (lwp->lwp_cursig == 0) { 586 t->t_hold = lwp->lwp_sigoldmask; 587 t->t_flag &= ~T_TOMASK; 588 } 589 mutex_exit(&p->p_lock); 590 } 591 592 if (error) 593 return (set_errno(error)); 594 595 /* 596 * Copy out the events and return the fdcnt to the user. 597 */ 598 if (nfds != 0 && 599 copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) 600 return (set_errno(EFAULT)); 601 602 #ifdef DEBUG 603 /* 604 * Another sanity check: 605 */ 606 if (fdcnt) { 607 int reventcnt = 0; 608 609 for (i = 0; i < nfds; i++) { 610 if (pollfdp[i].fd < 0) { 611 ASSERT(pollfdp[i].revents == 0); 612 continue; 613 } 614 if (pollfdp[i].revents) { 615 reventcnt++; 616 } 617 } 618 ASSERT(fdcnt == reventcnt); 619 } else { 620 for (i = 0; i < nfds; i++) { 621 ASSERT(pollfdp[i].revents == 0); 622 } 623 } 624 #endif /* DEBUG */ 625 626 return (fdcnt); 627 } 628 629 /* 630 * This is the system call trap that poll(), 631 * select() and pselect() are built upon. 632 * It is a private interface between libc and the kernel. 633 */ 634 int 635 pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) 636 { 637 timespec_t ts; 638 timespec_t *tsp; 639 sigset_t set; 640 k_sigset_t kset; 641 k_sigset_t *ksetp; 642 model_t datamodel = get_udatamodel(); 643 644 if (timeoutp == NULL) 645 tsp = NULL; 646 else { 647 if (datamodel == DATAMODEL_NATIVE) { 648 if (copyin(timeoutp, &ts, sizeof (ts))) 649 return (set_errno(EFAULT)); 650 } else { 651 timespec32_t ts32; 652 653 if (copyin(timeoutp, &ts32, sizeof (ts32))) 654 return (set_errno(EFAULT)); 655 TIMESPEC32_TO_TIMESPEC(&ts, &ts32) 656 } 657 658 if (itimerspecfix(&ts)) 659 return (set_errno(EINVAL)); 660 tsp = &ts; 661 } 662 663 if (setp == NULL) 664 ksetp = NULL; 665 else { 666 if (copyin(setp, &set, sizeof (set))) 667 return (set_errno(EFAULT)); 668 sigutok(&set, &kset); 669 ksetp = &kset; 670 } 671 672 return (poll_common(fds, nfds, tsp, ksetp)); 673 } 674 675 /* 676 * Clean up any state left around by poll(2). Called when a thread exits. 677 */ 678 void 679 pollcleanup() 680 { 681 pollstate_t *ps = curthread->t_pollstate; 682 pollcache_t *pcp; 683 684 if (ps == NULL) 685 return; 686 pcp = ps->ps_pcache; 687 /* 688 * free up all cached poll fds 689 */ 690 if (pcp == NULL) { 691 /* this pollstate is used by /dev/poll */ 692 goto pollcleanout; 693 } 694 695 if (pcp->pc_bitmap != NULL) { 696 ASSERT(MUTEX_NOT_HELD(&ps->ps_lock)); 697 /* 698 * a close lwp can race with us when cleaning up a polldat 699 * entry. We hold the ps_lock when cleaning hash table. 700 * Since this pollcache is going away anyway, there is no 701 * need to hold the pc_lock. 702 */ 703 mutex_enter(&ps->ps_lock); 704 pcache_clean(pcp); 705 mutex_exit(&ps->ps_lock); 706 #ifdef DEBUG 707 /* 708 * At this point, all fds cached by this lwp should be 709 * cleaned up. There should be no fd in fi_list still 710 * reference this thread. 711 */ 712 checkfpollinfo(); /* sanity check */ 713 pollcheckphlist(); /* sanity check */ 714 #endif /* DEBUG */ 715 } 716 /* 717 * Be sure no one is referencing thread before exiting 718 */ 719 mutex_enter(&pcp->pc_no_exit); 720 ASSERT(pcp->pc_busy >= 0); 721 while (pcp->pc_busy > 0) 722 cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit); 723 mutex_exit(&pcp->pc_no_exit); 724 pollcleanout: 725 pollstate_destroy(ps); 726 curthread->t_pollstate = NULL; 727 } 728 729 /* 730 * pollwakeup() - poke threads waiting in poll() for some event 731 * on a particular object. 732 * 733 * The threads hanging off of the specified pollhead structure are scanned. 734 * If their event mask matches the specified event(s), then pollnotify() is 735 * called to poke the thread. 736 * 737 * Multiple events may be specified. When POLLHUP or POLLERR are specified, 738 * all waiting threads are poked. 739 * 740 * It is important that pollnotify() not drop the lock protecting the list 741 * of threads. 742 */ 743 void 744 pollwakeup(pollhead_t *php, short events_arg) 745 { 746 polldat_t *pdp; 747 int events = (ushort_t)events_arg; 748 struct plist { 749 port_t *pp; 750 int pevents; 751 struct plist *next; 752 }; 753 struct plist *plhead = NULL, *pltail = NULL; 754 755 retry: 756 PH_ENTER(php); 757 758 for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) { 759 if ((pdp->pd_events & events) || 760 (events & (POLLHUP | POLLERR))) { 761 762 pollcache_t *pcp; 763 764 if (pdp->pd_portev != NULL) { 765 port_kevent_t *pkevp = pdp->pd_portev; 766 /* 767 * Object (fd) is associated with an event port, 768 * => send event notification to the port. 769 */ 770 ASSERT(pkevp->portkev_source == PORT_SOURCE_FD); 771 mutex_enter(&pkevp->portkev_lock); 772 if (pkevp->portkev_flags & PORT_KEV_VALID) { 773 int pevents; 774 775 pkevp->portkev_flags &= ~PORT_KEV_VALID; 776 pkevp->portkev_events |= events & 777 (pdp->pd_events | POLLHUP | 778 POLLERR); 779 /* 780 * portkev_lock mutex will be released 781 * by port_send_event(). 782 */ 783 port_send_event(pkevp); 784 785 /* 786 * If we have some thread polling the 787 * port's fd, add it to the list. They 788 * will be notified later. 789 * The port_pollwkup() will flag the 790 * port_t so that it will not disappear 791 * till port_pollwkdone() is called. 792 */ 793 pevents = 794 port_pollwkup(pkevp->portkev_port); 795 if (pevents) { 796 struct plist *t; 797 t = kmem_zalloc( 798 sizeof (struct plist), 799 KM_SLEEP); 800 t->pp = pkevp->portkev_port; 801 t->pevents = pevents; 802 if (plhead == NULL) { 803 plhead = t; 804 } else { 805 pltail->next = t; 806 } 807 pltail = t; 808 } 809 } else { 810 mutex_exit(&pkevp->portkev_lock); 811 } 812 continue; 813 } 814 815 pcp = pdp->pd_pcache; 816 817 /* 818 * Try to grab the lock for this thread. If 819 * we don't get it then we may deadlock so 820 * back out and restart all over again. Note 821 * that the failure rate is very very low. 822 */ 823 if (mutex_tryenter(&pcp->pc_lock)) { 824 pollnotify(pcp, pdp->pd_fd); 825 mutex_exit(&pcp->pc_lock); 826 } else { 827 /* 828 * We are here because: 829 * 1) This thread has been woke up 830 * and is trying to get out of poll(). 831 * 2) Some other thread is also here 832 * but with a different pollhead lock. 833 * 834 * So, we need to drop the lock on pollhead 835 * because of (1) but we want to prevent 836 * that thread from doing lwp_exit() or 837 * devpoll close. We want to ensure that 838 * the pollcache pointer is still invalid. 839 * 840 * Solution: Grab the pcp->pc_no_exit lock, 841 * increment the pc_busy counter, drop every 842 * lock in sight. Get out of the way and wait 843 * for type (2) threads to finish. 844 */ 845 846 mutex_enter(&pcp->pc_no_exit); 847 pcp->pc_busy++; /* prevents exit()'s */ 848 mutex_exit(&pcp->pc_no_exit); 849 850 PH_EXIT(php); 851 mutex_enter(&pcp->pc_lock); 852 mutex_exit(&pcp->pc_lock); 853 mutex_enter(&pcp->pc_no_exit); 854 pcp->pc_busy--; 855 if (pcp->pc_busy == 0) { 856 /* 857 * Wakeup the thread waiting in 858 * thread_exit(). 859 */ 860 cv_signal(&pcp->pc_busy_cv); 861 } 862 mutex_exit(&pcp->pc_no_exit); 863 goto retry; 864 } 865 } 866 } 867 868 869 /* 870 * Event ports - If this php is of the port on the list, 871 * call port_pollwkdone() to release it. The port_pollwkdone() 872 * needs to be called before dropping the PH lock so that any new 873 * thread attempting to poll this port are blocked. There can be 874 * only one thread here in pollwakeup notifying this port's fd. 875 */ 876 if (plhead != NULL && &plhead->pp->port_pollhd == php) { 877 struct plist *t; 878 port_pollwkdone(plhead->pp); 879 t = plhead; 880 plhead = plhead->next; 881 kmem_free(t, sizeof (struct plist)); 882 } 883 PH_EXIT(php); 884 885 /* 886 * Event ports - Notify threads polling the event port's fd. 887 * This is normally done in port_send_event() where it calls 888 * pollwakeup() on the port. But, for PORT_SOURCE_FD source alone, 889 * we do it here in pollwakeup() to avoid a recursive call. 890 */ 891 if (plhead != NULL) { 892 php = &plhead->pp->port_pollhd; 893 events = plhead->pevents; 894 goto retry; 895 } 896 } 897 898 /* 899 * This function is called to inform a thread that 900 * an event being polled for has occurred. 901 * The pollstate lock on the thread should be held on entry. 902 */ 903 void 904 pollnotify(pollcache_t *pcp, int fd) 905 { 906 ASSERT(fd < pcp->pc_mapsize); 907 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 908 BT_SET(pcp->pc_bitmap, fd); 909 pcp->pc_flag |= T_POLLWAKE; 910 cv_signal(&pcp->pc_cv); 911 } 912 913 /* 914 * add a polldat entry to pollhead ph_list. The polldat struct is used 915 * by pollwakeup to wake sleeping pollers when polled events has happened. 916 */ 917 void 918 pollhead_insert(pollhead_t *php, polldat_t *pdp) 919 { 920 PH_ENTER(php); 921 ASSERT(pdp->pd_next == NULL); 922 #ifdef DEBUG 923 { 924 /* 925 * the polldat should not be already on the list 926 */ 927 polldat_t *wp; 928 for (wp = php->ph_list; wp; wp = wp->pd_next) { 929 ASSERT(wp != pdp); 930 } 931 } 932 #endif /* DEBUG */ 933 pdp->pd_next = php->ph_list; 934 php->ph_list = pdp; 935 PH_EXIT(php); 936 } 937 938 /* 939 * Delete the polldat entry from ph_list. 940 */ 941 void 942 pollhead_delete(pollhead_t *php, polldat_t *pdp) 943 { 944 polldat_t *wp; 945 polldat_t **wpp; 946 947 PH_ENTER(php); 948 for (wpp = &php->ph_list; (wp = *wpp) != NULL; wpp = &wp->pd_next) { 949 if (wp == pdp) { 950 *wpp = pdp->pd_next; 951 pdp->pd_next = NULL; 952 break; 953 } 954 } 955 #ifdef DEBUG 956 /* assert that pdp is no longer in the list */ 957 for (wp = *wpp; wp; wp = wp->pd_next) { 958 ASSERT(wp != pdp); 959 } 960 #endif /* DEBUG */ 961 PH_EXIT(php); 962 } 963 964 /* 965 * walk through the poll fd lists to see if they are identical. This is an 966 * expensive operation and should not be done more than once for each poll() 967 * call. 968 * 969 * As an optimization (i.e., not having to go through the lists more than 970 * once), this routine also clear the revents field of pollfd in 'current'. 971 * Zeroing out the revents field of each entry in current poll list is 972 * required by poll man page. 973 * 974 * Since the events field of cached list has illegal poll events filtered 975 * out, the current list applies the same filtering before comparison. 976 * 977 * The routine stops when it detects a meaningful difference, or when it 978 * exhausts the lists. 979 */ 980 int 981 pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n) 982 { 983 int ix; 984 985 for (ix = 0; ix < n; ix++) { 986 /* Prefetch 64 bytes worth of 8-byte elements */ 987 if ((ix & 0x7) == 0) { 988 prefetch_write_many((caddr_t)¤t[ix + 8]); 989 prefetch_write_many((caddr_t)&cached[ix + 8]); 990 } 991 if (current[ix].fd == cached[ix].fd) { 992 /* 993 * Filter out invalid poll events while we are in 994 * inside the loop. 995 */ 996 if (current[ix].events & ~VALID_POLL_EVENTS) { 997 current[ix].events &= VALID_POLL_EVENTS; 998 if (newlist != NULL) 999 newlist[ix].events = current[ix].events; 1000 } 1001 if (current[ix].events == cached[ix].events) { 1002 current[ix].revents = 0; 1003 continue; 1004 } 1005 } 1006 if ((current[ix].fd < 0) && (cached[ix].fd < 0)) { 1007 current[ix].revents = 0; 1008 continue; 1009 } 1010 return (ix); 1011 } 1012 return (ix); 1013 } 1014 1015 /* 1016 * This routine returns a pointer to a cached poll fd entry, or NULL if it 1017 * does not find it in the hash table. 1018 */ 1019 polldat_t * 1020 pcache_lookup_fd(pollcache_t *pcp, int fd) 1021 { 1022 int hashindex; 1023 polldat_t *pdp; 1024 1025 hashindex = POLLHASH(pcp->pc_hashsize, fd); 1026 pdp = pcp->pc_hash[hashindex]; 1027 while (pdp != NULL) { 1028 if (pdp->pd_fd == fd) 1029 break; 1030 pdp = pdp->pd_hashnext; 1031 } 1032 return (pdp); 1033 } 1034 1035 polldat_t * 1036 pcache_alloc_fd(int nsets) 1037 { 1038 polldat_t *pdp; 1039 1040 pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP); 1041 if (nsets > 0) { 1042 pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP); 1043 pdp->pd_nsets = nsets; 1044 } 1045 return (pdp); 1046 } 1047 1048 /* 1049 * This routine inserts a polldat into the pollcache's hash table. It 1050 * may be necessary to grow the size of the hash table. 1051 */ 1052 void 1053 pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds) 1054 { 1055 int hashindex; 1056 int fd; 1057 1058 if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) || 1059 (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) { 1060 pcache_grow_hashtbl(pcp, nfds); 1061 } 1062 fd = pdp->pd_fd; 1063 hashindex = POLLHASH(pcp->pc_hashsize, fd); 1064 pdp->pd_hashnext = pcp->pc_hash[hashindex]; 1065 pcp->pc_hash[hashindex] = pdp; 1066 pcp->pc_fdcount++; 1067 1068 #ifdef DEBUG 1069 { 1070 /* 1071 * same fd should not appear on a hash list twice 1072 */ 1073 polldat_t *pdp1; 1074 for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) { 1075 ASSERT(pdp->pd_fd != pdp1->pd_fd); 1076 } 1077 } 1078 #endif /* DEBUG */ 1079 } 1080 1081 /* 1082 * Grow the hash table -- either double the table size or round it to the 1083 * nearest multiples of POLLHASHCHUNKSZ, whichever is bigger. Rehash all the 1084 * elements on the hash table. 1085 */ 1086 void 1087 pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds) 1088 { 1089 int oldsize; 1090 polldat_t **oldtbl; 1091 polldat_t *pdp, *pdp1; 1092 int i; 1093 #ifdef DEBUG 1094 int count = 0; 1095 #endif 1096 1097 ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0); 1098 oldsize = pcp->pc_hashsize; 1099 oldtbl = pcp->pc_hash; 1100 if (nfds > pcp->pc_hashsize * POLLHASHINC) { 1101 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 1102 ~(POLLHASHCHUNKSZ - 1); 1103 } else { 1104 pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC; 1105 } 1106 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 1107 KM_SLEEP); 1108 /* 1109 * rehash existing elements 1110 */ 1111 pcp->pc_fdcount = 0; 1112 for (i = 0; i < oldsize; i++) { 1113 pdp = oldtbl[i]; 1114 while (pdp != NULL) { 1115 pdp1 = pdp->pd_hashnext; 1116 pcache_insert_fd(pcp, pdp, nfds); 1117 pdp = pdp1; 1118 #ifdef DEBUG 1119 count++; 1120 #endif 1121 } 1122 } 1123 kmem_free(oldtbl, oldsize * sizeof (polldat_t *)); 1124 ASSERT(pcp->pc_fdcount == count); 1125 } 1126 1127 void 1128 pcache_grow_map(pollcache_t *pcp, int fd) 1129 { 1130 int newsize; 1131 ulong_t *newmap; 1132 1133 /* 1134 * grow to nearest multiple of POLLMAPCHUNK, assuming POLLMAPCHUNK is 1135 * power of 2. 1136 */ 1137 newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1); 1138 newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t), 1139 KM_SLEEP); 1140 /* 1141 * don't want pollwakeup to set a bit while growing the bitmap. 1142 */ 1143 ASSERT(mutex_owned(&pcp->pc_lock) == 0); 1144 mutex_enter(&pcp->pc_lock); 1145 bcopy(pcp->pc_bitmap, newmap, 1146 (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t)); 1147 kmem_free(pcp->pc_bitmap, 1148 (pcp->pc_mapsize /BT_NBIPUL) * sizeof (ulong_t)); 1149 pcp->pc_bitmap = newmap; 1150 pcp->pc_mapsize = newsize; 1151 mutex_exit(&pcp->pc_lock); 1152 } 1153 1154 /* 1155 * remove all the reference from pollhead list and fpollinfo lists. 1156 */ 1157 void 1158 pcache_clean(pollcache_t *pcp) 1159 { 1160 int i; 1161 polldat_t **hashtbl; 1162 polldat_t *pdp; 1163 1164 ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock)); 1165 hashtbl = pcp->pc_hash; 1166 for (i = 0; i < pcp->pc_hashsize; i++) { 1167 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 1168 if (pdp->pd_php != NULL) { 1169 pollhead_delete(pdp->pd_php, pdp); 1170 pdp->pd_php = NULL; 1171 } 1172 if (pdp->pd_fp != NULL) { 1173 delfpollinfo(pdp->pd_fd); 1174 pdp->pd_fp = NULL; 1175 } 1176 } 1177 } 1178 } 1179 1180 void 1181 pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp) 1182 { 1183 int i; 1184 int fd = pdp->pd_fd; 1185 1186 /* 1187 * we come here because an earlier close() on this cached poll fd. 1188 */ 1189 ASSERT(pdp->pd_fp == NULL); 1190 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1191 pdp->pd_events = 0; 1192 for (i = 0; i < ps->ps_nsets; i++) { 1193 xref_t *refp; 1194 pollcacheset_t *pcsp; 1195 1196 ASSERT(pdp->pd_ref != NULL); 1197 refp = &pdp->pd_ref[i]; 1198 if (refp->xf_refcnt) { 1199 ASSERT(refp->xf_position >= 0); 1200 pcsp = &ps->ps_pcacheset[i]; 1201 if (refp->xf_refcnt == 1) { 1202 pcsp->pcs_pollfd[refp->xf_position].fd = -1; 1203 refp->xf_refcnt = 0; 1204 pdp->pd_count--; 1205 } else if (refp->xf_refcnt > 1) { 1206 int j; 1207 1208 /* 1209 * turn off every appearance in pcs_pollfd list 1210 */ 1211 for (j = refp->xf_position; 1212 j < pcsp->pcs_nfds; j++) { 1213 if (pcsp->pcs_pollfd[j].fd == fd) { 1214 pcsp->pcs_pollfd[j].fd = -1; 1215 refp->xf_refcnt--; 1216 pdp->pd_count--; 1217 } 1218 } 1219 } 1220 ASSERT(refp->xf_refcnt == 0); 1221 refp->xf_position = POLLPOSINVAL; 1222 } 1223 } 1224 ASSERT(pdp->pd_count == 0); 1225 } 1226 1227 /* 1228 * Insert poll fd into the pollcache, and add poll registration. 1229 * This routine is called after getf() and before releasef(). So the vnode 1230 * can not disappear even if we block here. 1231 * If there is an error, the polled fd is not cached. 1232 */ 1233 int 1234 pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, 1235 ssize_t pos, int which) 1236 { 1237 pollcache_t *pcp = ps->ps_pcache; 1238 polldat_t *pdp; 1239 int error; 1240 int fd; 1241 pollhead_t *memphp = NULL; 1242 xref_t *refp; 1243 int newpollfd = 0; 1244 1245 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1246 /* 1247 * The poll caching uses the existing VOP_POLL interface. If there 1248 * is no polled events, we want the polled device to set its "some 1249 * one is sleeping in poll" flag. When the polled events happen 1250 * later, the driver will call pollwakeup(). We achieve this by 1251 * always passing 0 in the third parameter ("anyyet") when calling 1252 * VOP_POLL. This parameter is not looked at by drivers when the 1253 * polled events exist. If a driver chooses to ignore this parameter 1254 * and call pollwakeup whenever the polled events happen, that will 1255 * be OK too. 1256 */ 1257 ASSERT(curthread->t_pollcache == NULL); 1258 error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents, 1259 &memphp, NULL); 1260 if (error) { 1261 return (error); 1262 } 1263 if (pollfdp->revents) { 1264 (*fdcntp)++; 1265 } 1266 /* 1267 * polling the underlying device succeeded. Now we can cache it. 1268 * A close can't come in here because we have not done a releasef() 1269 * yet. 1270 */ 1271 fd = pollfdp->fd; 1272 pdp = pcache_lookup_fd(pcp, fd); 1273 if (pdp == NULL) { 1274 ASSERT(ps->ps_nsets > 0); 1275 pdp = pcache_alloc_fd(ps->ps_nsets); 1276 newpollfd = 1; 1277 } 1278 /* 1279 * If this entry was used to cache a poll fd which was closed, and 1280 * this entry has not been cleaned, do it now. 1281 */ 1282 if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) { 1283 pcacheset_invalidate(ps, pdp); 1284 ASSERT(pdp->pd_next == NULL); 1285 } 1286 if (pdp->pd_count == 0) { 1287 pdp->pd_fd = fd; 1288 pdp->pd_fp = fp; 1289 addfpollinfo(fd); 1290 pdp->pd_thread = curthread; 1291 pdp->pd_pcache = pcp; 1292 /* 1293 * the entry is never used or cleared by removing a cached 1294 * pollfd (pcache_delete_fd). So all the fields should be clear. 1295 */ 1296 ASSERT(pdp->pd_next == NULL); 1297 } 1298 1299 /* 1300 * A polled fd is considered cached. So there should be a fpollinfo 1301 * entry on uf_fpollinfo list. 1302 */ 1303 ASSERT(infpollinfo(fd)); 1304 /* 1305 * If there is an inconsistency, we want to know it here. 1306 */ 1307 ASSERT(pdp->pd_fp == fp); 1308 1309 /* 1310 * XXX pd_events is a union of all polled events on this fd, possibly 1311 * by different threads. Unless this is a new first poll(), pd_events 1312 * never shrinks. If an event is no longer polled by a process, there 1313 * is no way to cancel that event. In that case, poll degrade to its 1314 * old form -- polling on this fd every time poll() is called. The 1315 * assumption is an app always polls the same type of events. 1316 */ 1317 pdp->pd_events |= pollfdp->events; 1318 1319 pdp->pd_count++; 1320 /* 1321 * There is not much special handling for multiple appearances of 1322 * same fd other than xf_position always recording the first 1323 * appearance in poll list. If this is called from pcacheset_cache_list, 1324 * a VOP_POLL is called on every pollfd entry; therefore each 1325 * revents and fdcnt should be set correctly. If this is called from 1326 * pcacheset_resolve, we don't care about fdcnt here. Pollreadmap will 1327 * pick up the right count and handle revents field of each pollfd 1328 * entry. 1329 */ 1330 ASSERT(pdp->pd_ref != NULL); 1331 refp = &pdp->pd_ref[which]; 1332 if (refp->xf_refcnt == 0) { 1333 refp->xf_position = pos; 1334 } else { 1335 /* 1336 * xf_position records the fd's first appearance in poll list 1337 */ 1338 if (pos < refp->xf_position) { 1339 refp->xf_position = pos; 1340 } 1341 } 1342 ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd); 1343 refp->xf_refcnt++; 1344 if (fd >= pcp->pc_mapsize) { 1345 pcache_grow_map(pcp, fd); 1346 } 1347 if (fd > pcp->pc_mapend) { 1348 pcp->pc_mapend = fd; 1349 } 1350 if (newpollfd != 0) { 1351 pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds); 1352 } 1353 if (memphp) { 1354 if (pdp->pd_php == NULL) { 1355 pollhead_insert(memphp, pdp); 1356 pdp->pd_php = memphp; 1357 } else { 1358 if (memphp != pdp->pd_php) { 1359 /* 1360 * layered devices (e.g. console driver) 1361 * may change the vnode and thus the pollhead 1362 * pointer out from underneath us. 1363 */ 1364 pollhead_delete(pdp->pd_php, pdp); 1365 pollhead_insert(memphp, pdp); 1366 pdp->pd_php = memphp; 1367 } 1368 } 1369 } 1370 /* 1371 * Since there is a considerable window between VOP_POLL and when 1372 * we actually put the polldat struct on the pollhead list, we could 1373 * miss a pollwakeup. In the case of polling additional events, we 1374 * don't update the events until after VOP_POLL. So we could miss 1375 * pollwakeup there too. So we always set the bit here just to be 1376 * safe. The real performance gain is in subsequent pcache_poll. 1377 */ 1378 mutex_enter(&pcp->pc_lock); 1379 BT_SET(pcp->pc_bitmap, fd); 1380 mutex_exit(&pcp->pc_lock); 1381 return (0); 1382 } 1383 1384 /* 1385 * The entry is not really deleted. The fields are cleared so that the 1386 * entry is no longer useful, but it will remain in the hash table for reuse 1387 * later. It will be freed when the polling lwp exits. 1388 */ 1389 int 1390 pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent) 1391 { 1392 pollcache_t *pcp = ps->ps_pcache; 1393 polldat_t *pdp; 1394 xref_t *refp; 1395 1396 ASSERT(fd < pcp->pc_mapsize); 1397 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1398 1399 pdp = pcache_lookup_fd(pcp, fd); 1400 ASSERT(pdp != NULL); 1401 ASSERT(pdp->pd_count > 0); 1402 ASSERT(pdp->pd_ref != NULL); 1403 refp = &pdp->pd_ref[which]; 1404 if (pdp->pd_count == 1) { 1405 pdp->pd_events = 0; 1406 refp->xf_position = POLLPOSINVAL; 1407 ASSERT(refp->xf_refcnt == 1); 1408 refp->xf_refcnt = 0; 1409 if (pdp->pd_php) { 1410 /* 1411 * It is possible for a wakeup thread to get ahead 1412 * of the following pollhead_delete and set the bit in 1413 * bitmap. It is OK because the bit will be cleared 1414 * here anyway. 1415 */ 1416 pollhead_delete(pdp->pd_php, pdp); 1417 pdp->pd_php = NULL; 1418 } 1419 pdp->pd_count = 0; 1420 if (pdp->pd_fp != NULL) { 1421 pdp->pd_fp = NULL; 1422 delfpollinfo(fd); 1423 } 1424 mutex_enter(&pcp->pc_lock); 1425 BT_CLEAR(pcp->pc_bitmap, fd); 1426 mutex_exit(&pcp->pc_lock); 1427 return (0); 1428 } 1429 if ((cevent & POLLCLOSED) == POLLCLOSED) { 1430 /* 1431 * fd cached here has been closed. This is the first 1432 * pcache_delete_fd called after the close. Clean up the 1433 * entire entry. 1434 */ 1435 pcacheset_invalidate(ps, pdp); 1436 ASSERT(pdp->pd_php == NULL); 1437 mutex_enter(&pcp->pc_lock); 1438 BT_CLEAR(pcp->pc_bitmap, fd); 1439 mutex_exit(&pcp->pc_lock); 1440 return (0); 1441 } 1442 #ifdef DEBUG 1443 if (getf(fd) != NULL) { 1444 ASSERT(infpollinfo(fd)); 1445 releasef(fd); 1446 } 1447 #endif /* DEBUG */ 1448 pdp->pd_count--; 1449 ASSERT(refp->xf_refcnt > 0); 1450 if (--refp->xf_refcnt == 0) { 1451 refp->xf_position = POLLPOSINVAL; 1452 } else { 1453 ASSERT(pos >= refp->xf_position); 1454 if (pos == refp->xf_position) { 1455 /* 1456 * The xref position is no longer valid. 1457 * Reset it to a special value and let 1458 * caller know it needs to updatexref() 1459 * with a new xf_position value. 1460 */ 1461 refp->xf_position = POLLPOSTRANS; 1462 return (1); 1463 } 1464 } 1465 return (0); 1466 } 1467 1468 void 1469 pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which) 1470 { 1471 polldat_t *pdp; 1472 1473 pdp = pcache_lookup_fd(pcp, fd); 1474 ASSERT(pdp != NULL); 1475 ASSERT(pdp->pd_ref != NULL); 1476 pdp->pd_ref[which].xf_position = pos; 1477 } 1478 1479 #ifdef DEBUG 1480 /* 1481 * For each polled fd, it's either in the bitmap or cached in 1482 * pcache hash table. If this routine returns 0, something is wrong. 1483 */ 1484 static int 1485 pollchecksanity(pollstate_t *ps, nfds_t nfds) 1486 { 1487 int i; 1488 int fd; 1489 pollcache_t *pcp = ps->ps_pcache; 1490 polldat_t *pdp; 1491 pollfd_t *pollfdp = ps->ps_pollfd; 1492 file_t *fp; 1493 1494 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1495 for (i = 0; i < nfds; i++) { 1496 fd = pollfdp[i].fd; 1497 if (fd < 0) { 1498 ASSERT(pollfdp[i].revents == 0); 1499 continue; 1500 } 1501 if (pollfdp[i].revents == POLLNVAL) 1502 continue; 1503 if ((fp = getf(fd)) == NULL) 1504 continue; 1505 pdp = pcache_lookup_fd(pcp, fd); 1506 ASSERT(pdp != NULL); 1507 ASSERT(infpollinfo(fd)); 1508 ASSERT(pdp->pd_fp == fp); 1509 releasef(fd); 1510 if (BT_TEST(pcp->pc_bitmap, fd)) 1511 continue; 1512 if (pdp->pd_php == NULL) 1513 return (0); 1514 } 1515 return (1); 1516 } 1517 #endif /* DEBUG */ 1518 1519 /* 1520 * resolve the difference between the current poll list and a cached one. 1521 */ 1522 int 1523 pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which) 1524 { 1525 int i; 1526 pollcache_t *pcp = ps->ps_pcache; 1527 pollfd_t *newlist = NULL; 1528 pollfd_t *current = ps->ps_pollfd; 1529 pollfd_t *cached; 1530 pollcacheset_t *pcsp; 1531 int common; 1532 int count = 0; 1533 int offset; 1534 int remain; 1535 int fd; 1536 file_t *fp; 1537 int fdcnt = 0; 1538 int cnt = 0; 1539 nfds_t old_nfds; 1540 int error = 0; 1541 int mismatch = 0; 1542 1543 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1544 #ifdef DEBUG 1545 checkpolldat(ps); 1546 #endif 1547 pcsp = &ps->ps_pcacheset[which]; 1548 old_nfds = pcsp->pcs_nfds; 1549 common = (nfds > old_nfds) ? old_nfds : nfds; 1550 if (nfds != old_nfds) { 1551 /* 1552 * the length of poll list has changed. allocate a new 1553 * pollfd list. 1554 */ 1555 newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); 1556 bcopy(current, newlist, sizeof (pollfd_t) * nfds); 1557 } 1558 /* 1559 * Compare the overlapping part of the current fd list with the 1560 * cached one. Whenever a difference is found, resolve it. 1561 * The comparison is done on the current poll list and the 1562 * cached list. But we may be setting up the newlist to be the 1563 * cached list for next poll. 1564 */ 1565 cached = pcsp->pcs_pollfd; 1566 remain = common; 1567 1568 while (count < common) { 1569 int tmpfd; 1570 pollfd_t *np; 1571 1572 np = (newlist != NULL) ? &newlist[count] : NULL; 1573 offset = pcacheset_cmp(¤t[count], &cached[count], np, 1574 remain); 1575 /* 1576 * Collect stats. If lists are completed the first time, 1577 * it's a hit. Otherwise, it's a partial hit or miss. 1578 */ 1579 if ((count == 0) && (offset == common)) { 1580 pollstats.pollcachehit.value.ui64++; 1581 } else { 1582 mismatch++; 1583 } 1584 count += offset; 1585 if (offset < remain) { 1586 ASSERT(count < common); 1587 ASSERT((current[count].fd != cached[count].fd) || 1588 (current[count].events != cached[count].events)); 1589 /* 1590 * Filter out invalid events. 1591 */ 1592 if (current[count].events & ~VALID_POLL_EVENTS) { 1593 if (newlist != NULL) { 1594 newlist[count].events = 1595 current[count].events &= 1596 VALID_POLL_EVENTS; 1597 } else { 1598 current[count].events &= 1599 VALID_POLL_EVENTS; 1600 } 1601 } 1602 /* 1603 * when resolving a difference, we always remove the 1604 * fd from cache before inserting one into cache. 1605 */ 1606 if (cached[count].fd >= 0) { 1607 tmpfd = cached[count].fd; 1608 if (pcache_delete_fd(ps, tmpfd, count, which, 1609 (uint_t)cached[count].events)) { 1610 /* 1611 * This should be rare but needed for 1612 * correctness. 1613 * 1614 * The first appearance in cached list 1615 * is being "turned off". The same fd 1616 * appear more than once in the cached 1617 * poll list. Find the next one on the 1618 * list and update the cached 1619 * xf_position field. 1620 */ 1621 for (i = count + 1; i < old_nfds; i++) { 1622 if (cached[i].fd == tmpfd) { 1623 pcache_update_xref(pcp, 1624 tmpfd, (ssize_t)i, 1625 which); 1626 break; 1627 } 1628 } 1629 ASSERT(i <= old_nfds); 1630 } 1631 /* 1632 * In case a new cache list is allocated, 1633 * need to keep both cache lists in sync 1634 * b/c the new one can be freed if we have 1635 * an error later. 1636 */ 1637 cached[count].fd = -1; 1638 if (newlist != NULL) { 1639 newlist[count].fd = -1; 1640 } 1641 } 1642 if ((tmpfd = current[count].fd) >= 0) { 1643 /* 1644 * add to the cached fd tbl and bitmap. 1645 */ 1646 if ((fp = getf(tmpfd)) == NULL) { 1647 current[count].revents = POLLNVAL; 1648 if (newlist != NULL) { 1649 newlist[count].fd = -1; 1650 } 1651 cached[count].fd = -1; 1652 fdcnt++; 1653 } else { 1654 /* 1655 * Here we don't care about the 1656 * fdcnt. We will examine the bitmap 1657 * later and pick up the correct 1658 * fdcnt there. So we never bother 1659 * to check value of 'cnt'. 1660 */ 1661 error = pcache_insert(ps, fp, 1662 ¤t[count], &cnt, 1663 (ssize_t)count, which); 1664 /* 1665 * if no error, we want to do releasef 1666 * after we updated cache poll list 1667 * entry so that close() won't race 1668 * us. 1669 */ 1670 if (error) { 1671 /* 1672 * If we encountered an error, 1673 * we have invalidated an 1674 * entry in cached poll list 1675 * (in pcache_delete_fd() above) 1676 * but failed to add one here. 1677 * This is OK b/c what's in the 1678 * cached list is consistent 1679 * with content of cache. 1680 * It will not have any ill 1681 * effect on next poll(). 1682 */ 1683 releasef(tmpfd); 1684 if (newlist != NULL) { 1685 kmem_free(newlist, 1686 nfds * 1687 sizeof (pollfd_t)); 1688 } 1689 return (error); 1690 } 1691 /* 1692 * If we have allocated a new(temp) 1693 * cache list, we need to keep both 1694 * in sync b/c the new one can be freed 1695 * if we have an error later. 1696 */ 1697 if (newlist != NULL) { 1698 newlist[count].fd = 1699 current[count].fd; 1700 newlist[count].events = 1701 current[count].events; 1702 } 1703 cached[count].fd = current[count].fd; 1704 cached[count].events = 1705 current[count].events; 1706 releasef(tmpfd); 1707 } 1708 } else { 1709 current[count].revents = 0; 1710 } 1711 count++; 1712 remain = common - count; 1713 } 1714 } 1715 if (mismatch != 0) { 1716 if (mismatch == common) { 1717 pollstats.pollcachemiss.value.ui64++; 1718 } else { 1719 pollstats.pollcachephit.value.ui64++; 1720 } 1721 } 1722 /* 1723 * take care of the non overlapping part of a list 1724 */ 1725 if (nfds > old_nfds) { 1726 ASSERT(newlist != NULL); 1727 for (i = old_nfds; i < nfds; i++) { 1728 /* filter out invalid events */ 1729 if (current[i].events & ~VALID_POLL_EVENTS) { 1730 newlist[i].events = current[i].events = 1731 current[i].events & VALID_POLL_EVENTS; 1732 } 1733 if ((fd = current[i].fd) < 0) { 1734 current[i].revents = 0; 1735 continue; 1736 } 1737 /* 1738 * add to the cached fd tbl and bitmap. 1739 */ 1740 if ((fp = getf(fd)) == NULL) { 1741 current[i].revents = POLLNVAL; 1742 newlist[i].fd = -1; 1743 fdcnt++; 1744 continue; 1745 } 1746 /* 1747 * Here we don't care about the 1748 * fdcnt. We will examine the bitmap 1749 * later and pick up the correct 1750 * fdcnt there. So we never bother to 1751 * check 'cnt'. 1752 */ 1753 error = pcache_insert(ps, fp, ¤t[i], &cnt, 1754 (ssize_t)i, which); 1755 releasef(fd); 1756 if (error) { 1757 /* 1758 * Here we are half way through adding newly 1759 * polled fd. Undo enough to keep the cache 1760 * list consistent with the cache content. 1761 */ 1762 pcacheset_remove_list(ps, current, old_nfds, 1763 i, which, 0); 1764 kmem_free(newlist, nfds * sizeof (pollfd_t)); 1765 return (error); 1766 } 1767 } 1768 } 1769 if (old_nfds > nfds) { 1770 /* 1771 * remove the fd's which are no longer polled. 1772 */ 1773 pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds, 1774 which, 1); 1775 } 1776 /* 1777 * set difference resolved. update nfds and cachedlist 1778 * in pollstate struct. 1779 */ 1780 if (newlist != NULL) { 1781 kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t)); 1782 /* 1783 * By now, the pollfd.revents field should 1784 * all be zeroed. 1785 */ 1786 pcsp->pcs_pollfd = newlist; 1787 pcsp->pcs_nfds = nfds; 1788 } 1789 ASSERT(*fdcntp == 0); 1790 *fdcntp = fdcnt; 1791 /* 1792 * By now for every fd in pollfdp, one of the following should be 1793 * true. Otherwise we will miss a polled event. 1794 * 1795 * 1. the bit corresponding to the fd in bitmap is set. So VOP_POLL 1796 * will be called on this fd in next poll. 1797 * 2. the fd is cached in the pcache (i.e. pd_php is set). So 1798 * pollnotify will happen. 1799 */ 1800 ASSERT(pollchecksanity(ps, nfds)); 1801 /* 1802 * make sure cross reference between cached poll lists and cached 1803 * poll fds are correct. 1804 */ 1805 ASSERT(pollcheckxref(ps, which)); 1806 /* 1807 * ensure each polldat in pollcache reference a polled fd in 1808 * pollcacheset. 1809 */ 1810 #ifdef DEBUG 1811 checkpolldat(ps); 1812 #endif 1813 return (0); 1814 } 1815 1816 #ifdef DEBUG 1817 static int 1818 pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds) 1819 { 1820 int i; 1821 int reventcnt = 0; 1822 1823 for (i = 0; i < nfds; i++) { 1824 if (pollfdp[i].fd < 0) { 1825 ASSERT(pollfdp[i].revents == 0); 1826 continue; 1827 } 1828 if (pollfdp[i].revents) { 1829 reventcnt++; 1830 } 1831 if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) { 1832 ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd)); 1833 } 1834 } 1835 return (reventcnt); 1836 } 1837 #endif /* DEBUG */ 1838 1839 /* 1840 * read the bitmap and poll on fds corresponding to the '1' bits. The ps_lock 1841 * is held upon entry. 1842 */ 1843 int 1844 pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp, 1845 int which) 1846 { 1847 int i; 1848 pollcache_t *pcp; 1849 int fd; 1850 int begin, end, done; 1851 pollhead_t *php; 1852 int fdcnt; 1853 int error = 0; 1854 file_t *fp; 1855 polldat_t *pdp; 1856 xref_t *refp; 1857 int entry; 1858 1859 pcp = ps->ps_pcache; 1860 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1861 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 1862 retry: 1863 done = 0; 1864 begin = 0; 1865 fdcnt = 0; 1866 end = pcp->pc_mapend; 1867 while ((fdcnt < nfds) && !done) { 1868 php = NULL; 1869 /* 1870 * only poll fds which may have events 1871 */ 1872 fd = bt_getlowbit(pcp->pc_bitmap, begin, end); 1873 ASSERT(fd <= end); 1874 if (fd >= 0) { 1875 ASSERT(pollcheckrevents(ps, begin, fd, which)); 1876 /* 1877 * adjust map pointers for next round 1878 */ 1879 if (fd == end) { 1880 done = 1; 1881 } else { 1882 begin = fd + 1; 1883 } 1884 /* 1885 * A bitmap caches poll state information of 1886 * multiple poll lists. Call VOP_POLL only if 1887 * the bit corresponds to an fd in this poll 1888 * list. 1889 */ 1890 pdp = pcache_lookup_fd(pcp, fd); 1891 ASSERT(pdp != NULL); 1892 ASSERT(pdp->pd_ref != NULL); 1893 refp = &pdp->pd_ref[which]; 1894 if (refp->xf_refcnt == 0) 1895 continue; 1896 entry = refp->xf_position; 1897 ASSERT((entry >= 0) && (entry < nfds)); 1898 ASSERT(pollfdp[entry].fd == fd); 1899 /* 1900 * we are in this routine implies that we have 1901 * successfully polled this fd in the past. 1902 * Check to see this fd is closed while we are 1903 * blocked in poll. This ensures that we don't 1904 * miss a close on the fd in the case this fd is 1905 * reused. 1906 */ 1907 if (pdp->pd_fp == NULL) { 1908 ASSERT(pdp->pd_count > 0); 1909 pollfdp[entry].revents = POLLNVAL; 1910 fdcnt++; 1911 if (refp->xf_refcnt > 1) { 1912 /* 1913 * this fd appeared multiple time 1914 * in the poll list. Find all of them. 1915 */ 1916 for (i = entry + 1; i < nfds; i++) { 1917 if (pollfdp[i].fd == fd) { 1918 pollfdp[i].revents = 1919 POLLNVAL; 1920 fdcnt++; 1921 } 1922 } 1923 } 1924 pcacheset_invalidate(ps, pdp); 1925 continue; 1926 } 1927 /* 1928 * We can be here polling a device that is being 1929 * closed (i.e. the file pointer is set to NULL, 1930 * but pollcacheclean has not happened yet). 1931 */ 1932 if ((fp = getf(fd)) == NULL) { 1933 pollfdp[entry].revents = POLLNVAL; 1934 fdcnt++; 1935 if (refp->xf_refcnt > 1) { 1936 /* 1937 * this fd appeared multiple time 1938 * in the poll list. Find all of them. 1939 */ 1940 for (i = entry + 1; i < nfds; i++) { 1941 if (pollfdp[i].fd == fd) { 1942 pollfdp[i].revents = 1943 POLLNVAL; 1944 fdcnt++; 1945 } 1946 } 1947 } 1948 continue; 1949 } 1950 ASSERT(pdp->pd_fp == fp); 1951 ASSERT(infpollinfo(fd)); 1952 /* 1953 * Since we no longer hold poll head lock across 1954 * VOP_POLL, pollunlock logic can be simplifed. 1955 */ 1956 ASSERT(pdp->pd_php == NULL || 1957 MUTEX_NOT_HELD(PHLOCK(pdp->pd_php))); 1958 /* 1959 * underlying file systems may set a "pollpending" 1960 * flag when it sees the poll may block. Pollwakeup() 1961 * is called by wakeup thread if pollpending is set. 1962 * Pass a 0 fdcnt so that the underlying file system 1963 * will set the "pollpending" flag set when there is 1964 * no polled events. 1965 * 1966 * Use pollfdp[].events for actual polling because 1967 * the pd_events is union of all cached poll events 1968 * on this fd. The events parameter also affects 1969 * how the polled device sets the "poll pending" 1970 * flag. 1971 */ 1972 ASSERT(curthread->t_pollcache == NULL); 1973 error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0, 1974 &pollfdp[entry].revents, &php, NULL); 1975 /* 1976 * releasef after completely done with this cached 1977 * poll entry. To prevent close() coming in to clear 1978 * this entry. 1979 */ 1980 if (error) { 1981 releasef(fd); 1982 break; 1983 } 1984 /* 1985 * layered devices (e.g. console driver) 1986 * may change the vnode and thus the pollhead 1987 * pointer out from underneath us. 1988 */ 1989 if (php != NULL && pdp->pd_php != NULL && 1990 php != pdp->pd_php) { 1991 releasef(fd); 1992 pollhead_delete(pdp->pd_php, pdp); 1993 pdp->pd_php = php; 1994 pollhead_insert(php, pdp); 1995 /* 1996 * We could have missed a wakeup on the new 1997 * target device. Make sure the new target 1998 * gets polled once. 1999 */ 2000 BT_SET(pcp->pc_bitmap, fd); 2001 goto retry; 2002 } 2003 2004 if (pollfdp[entry].revents) { 2005 ASSERT(refp->xf_refcnt >= 1); 2006 fdcnt++; 2007 if (refp->xf_refcnt > 1) { 2008 /* 2009 * this fd appeared multiple time 2010 * in the poll list. This is rare but 2011 * we have to look at all of them for 2012 * correctness. 2013 */ 2014 error = plist_chkdupfd(fp, pdp, ps, 2015 pollfdp, entry, &fdcnt); 2016 if (error > 0) { 2017 releasef(fd); 2018 break; 2019 } 2020 if (error < 0) { 2021 goto retry; 2022 } 2023 } 2024 releasef(fd); 2025 } else { 2026 /* 2027 * VOP_POLL didn't return any revents. We can 2028 * clear the bit in bitmap only if we have the 2029 * pollhead ptr cached and no other cached 2030 * entry is polling different events on this fd. 2031 * VOP_POLL may have dropped the ps_lock. Make 2032 * sure pollwakeup has not happened before clear 2033 * the bit. 2034 */ 2035 if ((pdp->pd_php != NULL) && 2036 (pollfdp[entry].events == pdp->pd_events) && 2037 ((pcp->pc_flag & T_POLLWAKE) == 0)) { 2038 BT_CLEAR(pcp->pc_bitmap, fd); 2039 } 2040 /* 2041 * if the fd can be cached now but not before, 2042 * do it now. 2043 */ 2044 if ((pdp->pd_php == NULL) && (php != NULL)) { 2045 pdp->pd_php = php; 2046 pollhead_insert(php, pdp); 2047 /* 2048 * We are inserting a polldat struct for 2049 * the first time. We may have missed a 2050 * wakeup on this device. Re-poll once. 2051 * This should be a rare event. 2052 */ 2053 releasef(fd); 2054 goto retry; 2055 } 2056 if (refp->xf_refcnt > 1) { 2057 /* 2058 * this fd appeared multiple time 2059 * in the poll list. This is rare but 2060 * we have to look at all of them for 2061 * correctness. 2062 */ 2063 error = plist_chkdupfd(fp, pdp, ps, 2064 pollfdp, entry, &fdcnt); 2065 if (error > 0) { 2066 releasef(fd); 2067 break; 2068 } 2069 if (error < 0) { 2070 goto retry; 2071 } 2072 } 2073 releasef(fd); 2074 } 2075 } else { 2076 done = 1; 2077 ASSERT(pollcheckrevents(ps, begin, end + 1, which)); 2078 } 2079 } 2080 if (!error) { 2081 ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds)); 2082 *fdcntp += fdcnt; 2083 } 2084 return (error); 2085 } 2086 2087 /* 2088 * Going through the poll list without much locking. Poll all fds and 2089 * cache all valid fds in the pollcache. 2090 */ 2091 int 2092 pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which) 2093 { 2094 pollfd_t *pollfdp = ps->ps_pollfd; 2095 pollcacheset_t *pcacheset = ps->ps_pcacheset; 2096 pollfd_t *newfdlist; 2097 int i; 2098 int fd; 2099 file_t *fp; 2100 int error = 0; 2101 2102 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2103 ASSERT(which < ps->ps_nsets); 2104 ASSERT(pcacheset != NULL); 2105 ASSERT(pcacheset[which].pcs_pollfd == NULL); 2106 newfdlist = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP); 2107 /* 2108 * cache the new poll list in pollcachset. 2109 */ 2110 bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds); 2111 2112 pcacheset[which].pcs_pollfd = newfdlist; 2113 pcacheset[which].pcs_nfds = ps->ps_nfds; 2114 pcacheset[which].pcs_usradr = (uintptr_t)fds; 2115 2116 /* 2117 * We have saved a copy of current poll fd list in one pollcacheset. 2118 * The 'revents' field of the new list is not yet set to 0. Loop 2119 * through the new list just to do that is expensive. We do that 2120 * while polling the list. 2121 */ 2122 for (i = 0; i < ps->ps_nfds; i++) { 2123 fd = pollfdp[i].fd; 2124 /* 2125 * We also filter out the illegal poll events in the event 2126 * field for the cached poll list/set. 2127 */ 2128 if (pollfdp[i].events & ~VALID_POLL_EVENTS) { 2129 newfdlist[i].events = pollfdp[i].events = 2130 pollfdp[i].events & VALID_POLL_EVENTS; 2131 } 2132 if (fd < 0) { 2133 pollfdp[i].revents = 0; 2134 continue; 2135 } 2136 if ((fp = getf(fd)) == NULL) { 2137 pollfdp[i].revents = POLLNVAL; 2138 /* 2139 * invalidate this cache entry in the cached poll list 2140 */ 2141 newfdlist[i].fd = -1; 2142 (*fdcntp)++; 2143 continue; 2144 } 2145 /* 2146 * cache this fd. 2147 */ 2148 error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i, 2149 which); 2150 releasef(fd); 2151 if (error) { 2152 /* 2153 * Here we are half way through caching a new 2154 * poll list. Undo every thing. 2155 */ 2156 pcacheset_remove_list(ps, pollfdp, 0, i, which, 0); 2157 kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t)); 2158 pcacheset[which].pcs_pollfd = NULL; 2159 pcacheset[which].pcs_usradr = NULL; 2160 break; 2161 } 2162 } 2163 return (error); 2164 } 2165 2166 /* 2167 * called by pollcacheclean() to set the fp NULL. It also sets polled events 2168 * in pcacheset entries to a special events 'POLLCLOSED'. Do a pollwakeup to 2169 * wake any sleeping poller, then remove the polldat from the driver. 2170 * The routine is called with ps_pcachelock held. 2171 */ 2172 void 2173 pcache_clean_entry(pollstate_t *ps, int fd) 2174 { 2175 pollcache_t *pcp; 2176 polldat_t *pdp; 2177 int i; 2178 2179 ASSERT(ps != NULL); 2180 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2181 pcp = ps->ps_pcache; 2182 ASSERT(pcp); 2183 pdp = pcache_lookup_fd(pcp, fd); 2184 ASSERT(pdp != NULL); 2185 /* 2186 * the corresponding fpollinfo in fi_list has been removed by 2187 * a close on this fd. Reset the cached fp ptr here. 2188 */ 2189 pdp->pd_fp = NULL; 2190 /* 2191 * XXX - This routine also touches data in pcacheset struct. 2192 * 2193 * set the event in cached poll lists to POLLCLOSED. This invalidate 2194 * the cached poll fd entry in that poll list, which will force a 2195 * removal of this cached entry in next poll(). The cleanup is done 2196 * at the removal time. 2197 */ 2198 ASSERT(pdp->pd_ref != NULL); 2199 for (i = 0; i < ps->ps_nsets; i++) { 2200 xref_t *refp; 2201 pollcacheset_t *pcsp; 2202 2203 refp = &pdp->pd_ref[i]; 2204 if (refp->xf_refcnt) { 2205 ASSERT(refp->xf_position >= 0); 2206 pcsp = &ps->ps_pcacheset[i]; 2207 if (refp->xf_refcnt == 1) { 2208 pcsp->pcs_pollfd[refp->xf_position].events = 2209 (short)POLLCLOSED; 2210 } 2211 if (refp->xf_refcnt > 1) { 2212 int j; 2213 /* 2214 * mark every matching entry in pcs_pollfd 2215 */ 2216 for (j = refp->xf_position; 2217 j < pcsp->pcs_nfds; j++) { 2218 if (pcsp->pcs_pollfd[j].fd == fd) { 2219 pcsp->pcs_pollfd[j].events = 2220 (short)POLLCLOSED; 2221 } 2222 } 2223 } 2224 } 2225 } 2226 if (pdp->pd_php) { 2227 pollwakeup(pdp->pd_php, POLLHUP); 2228 pollhead_delete(pdp->pd_php, pdp); 2229 pdp->pd_php = NULL; 2230 } 2231 } 2232 2233 /* 2234 * This is the first time this thread has ever polled, 2235 * so we have to create its pollstate structure. 2236 * This will persist for the life of the thread, 2237 * until it calls pollcleanup(). 2238 */ 2239 pollstate_t * 2240 pollstate_create(void) 2241 { 2242 pollstate_t *ps; 2243 2244 ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP); 2245 ps->ps_nsets = POLLFDSETS; 2246 ps->ps_pcacheset = pcacheset_create(ps->ps_nsets); 2247 return (ps); 2248 } 2249 2250 void 2251 pollstate_destroy(pollstate_t *ps) 2252 { 2253 if (ps->ps_pollfd != NULL) { 2254 kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t)); 2255 ps->ps_pollfd = NULL; 2256 } 2257 if (ps->ps_pcache != NULL) { 2258 pcache_destroy(ps->ps_pcache); 2259 ps->ps_pcache = NULL; 2260 } 2261 pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets); 2262 ps->ps_pcacheset = NULL; 2263 if (ps->ps_dpbuf != NULL) { 2264 kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize * sizeof (pollfd_t)); 2265 ps->ps_dpbuf = NULL; 2266 } 2267 mutex_destroy(&ps->ps_lock); 2268 kmem_free(ps, sizeof (pollstate_t)); 2269 } 2270 2271 /* 2272 * We are holding the appropriate uf_lock entering this routine. 2273 * Bump up the ps_busy count to prevent the thread from exiting. 2274 */ 2275 void 2276 pollblockexit(fpollinfo_t *fpip) 2277 { 2278 for (; fpip; fpip = fpip->fp_next) { 2279 pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache; 2280 2281 mutex_enter(&pcp->pc_no_exit); 2282 pcp->pc_busy++; /* prevents exit()'s */ 2283 mutex_exit(&pcp->pc_no_exit); 2284 } 2285 } 2286 2287 /* 2288 * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to mark 2289 * the pcacheset events field POLLCLOSED to force the next poll() to remove 2290 * this cache entry. We can't clean the polldat entry clean up here because 2291 * lwp block in poll() needs the info to return. Wakeup anyone blocked in 2292 * poll and let exiting lwp go. No lock is help upon entry. So it's OK for 2293 * pcache_clean_entry to call pollwakeup(). 2294 */ 2295 void 2296 pollcacheclean(fpollinfo_t *fip, int fd) 2297 { 2298 struct fpollinfo *fpip, *fpip2; 2299 2300 fpip = fip; 2301 while (fpip) { 2302 pollstate_t *ps = fpip->fp_thread->t_pollstate; 2303 pollcache_t *pcp = ps->ps_pcache; 2304 2305 mutex_enter(&ps->ps_lock); 2306 pcache_clean_entry(ps, fd); 2307 mutex_exit(&ps->ps_lock); 2308 mutex_enter(&pcp->pc_no_exit); 2309 pcp->pc_busy--; 2310 if (pcp->pc_busy == 0) { 2311 /* 2312 * Wakeup the thread waiting in 2313 * thread_exit(). 2314 */ 2315 cv_signal(&pcp->pc_busy_cv); 2316 } 2317 mutex_exit(&pcp->pc_no_exit); 2318 2319 fpip2 = fpip; 2320 fpip = fpip->fp_next; 2321 kmem_free(fpip2, sizeof (fpollinfo_t)); 2322 } 2323 } 2324 2325 /* 2326 * one of the cache line's counter is wrapping around. Reset all cache line 2327 * counters to zero except one. This is simplistic, but probably works 2328 * effectively. 2329 */ 2330 void 2331 pcacheset_reset_count(pollstate_t *ps, int index) 2332 { 2333 int i; 2334 2335 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2336 for (i = 0; i < ps->ps_nsets; i++) { 2337 if (ps->ps_pcacheset[i].pcs_pollfd != NULL) { 2338 ps->ps_pcacheset[i].pcs_count = 0; 2339 } 2340 } 2341 ps->ps_pcacheset[index].pcs_count = 1; 2342 } 2343 2344 /* 2345 * this routine implements poll cache list replacement policy. 2346 * It is currently choose the "least used". 2347 */ 2348 int 2349 pcacheset_replace(pollstate_t *ps) 2350 { 2351 int i; 2352 int index = 0; 2353 2354 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2355 for (i = 1; i < ps->ps_nsets; i++) { 2356 if (ps->ps_pcacheset[index].pcs_count > 2357 ps->ps_pcacheset[i].pcs_count) { 2358 index = i; 2359 } 2360 } 2361 ps->ps_pcacheset[index].pcs_count = 0; 2362 return (index); 2363 } 2364 2365 /* 2366 * this routine is called by strclose to remove remaining polldat struct on 2367 * the pollhead list of the device being closed. There are two reasons as why 2368 * the polldat structures still remain on the pollhead list: 2369 * 2370 * (1) The layered device(e.g.the console driver). 2371 * In this case, the existence of a polldat implies that the thread putting 2372 * the polldat on this list has not exited yet. Before the thread exits, it 2373 * will have to hold this pollhead lock to remove the polldat. So holding the 2374 * pollhead lock here effectively prevents the thread which put the polldat 2375 * on this list from exiting. 2376 * 2377 * (2) /dev/poll. 2378 * When a polled fd is cached in /dev/poll, its polldat will remain on the 2379 * pollhead list if the process has not done a POLLREMOVE before closing the 2380 * polled fd. We just unlink it here. 2381 */ 2382 void 2383 pollhead_clean(pollhead_t *php) 2384 { 2385 polldat_t *pdp; 2386 2387 /* 2388 * In case(1), while we must prevent the thread in question from 2389 * exiting, we must also obey the proper locking order, i.e. 2390 * (ps_lock -> phlock). 2391 */ 2392 PH_ENTER(php); 2393 while (php->ph_list != NULL) { 2394 pollstate_t *ps; 2395 pollcache_t *pcp; 2396 2397 pdp = php->ph_list; 2398 ASSERT(pdp->pd_php == php); 2399 if (pdp->pd_thread == NULL) { 2400 /* 2401 * This is case(2). Since the ph_lock is sufficient 2402 * to synchronize this lwp with any other /dev/poll 2403 * lwp, just unlink the polldat. 2404 */ 2405 php->ph_list = pdp->pd_next; 2406 pdp->pd_php = NULL; 2407 pdp->pd_next = NULL; 2408 continue; 2409 } 2410 ps = pdp->pd_thread->t_pollstate; 2411 ASSERT(ps != NULL); 2412 pcp = pdp->pd_pcache; 2413 ASSERT(pcp != NULL); 2414 mutex_enter(&pcp->pc_no_exit); 2415 pcp->pc_busy++; /* prevents exit()'s */ 2416 mutex_exit(&pcp->pc_no_exit); 2417 /* 2418 * Now get the locks in proper order to avoid deadlock. 2419 */ 2420 PH_EXIT(php); 2421 mutex_enter(&ps->ps_lock); 2422 /* 2423 * while we dropped the pollhead lock, the element could be 2424 * taken off the list already. 2425 */ 2426 PH_ENTER(php); 2427 if (pdp->pd_php == php) { 2428 ASSERT(pdp == php->ph_list); 2429 php->ph_list = pdp->pd_next; 2430 pdp->pd_php = NULL; 2431 pdp->pd_next = NULL; 2432 } 2433 PH_EXIT(php); 2434 mutex_exit(&ps->ps_lock); 2435 mutex_enter(&pcp->pc_no_exit); 2436 pcp->pc_busy--; 2437 if (pcp->pc_busy == 0) { 2438 /* 2439 * Wakeup the thread waiting in 2440 * thread_exit(). 2441 */ 2442 cv_signal(&pcp->pc_busy_cv); 2443 } 2444 mutex_exit(&pcp->pc_no_exit); 2445 PH_ENTER(php); 2446 } 2447 PH_EXIT(php); 2448 } 2449 2450 /* 2451 * The remove_list is called to cleanup a partially cached 'current' list or 2452 * to remove a partial list which is no longer cached. The flag value of 1 2453 * indicates the second case. 2454 */ 2455 void 2456 pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end, 2457 int cacheindex, int flag) 2458 { 2459 int i; 2460 2461 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2462 for (i = start; i < end; i++) { 2463 if ((pollfdp[i].fd >= 0) && 2464 (flag || !(pollfdp[i].revents & POLLNVAL))) { 2465 if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex, 2466 (uint_t)pollfdp[i].events)) { 2467 int j; 2468 int fd = pollfdp[i].fd; 2469 2470 for (j = i + 1; j < end; j++) { 2471 if (pollfdp[j].fd == fd) { 2472 pcache_update_xref( 2473 ps->ps_pcache, fd, 2474 (ssize_t)j, cacheindex); 2475 break; 2476 } 2477 } 2478 ASSERT(j <= end); 2479 } 2480 } 2481 } 2482 } 2483 2484 #ifdef DEBUG 2485 2486 #include<sys/strsubr.h> 2487 /* 2488 * make sure curthread is not on anyone's pollhead list any more. 2489 */ 2490 static void 2491 pollcheckphlist() 2492 { 2493 int i; 2494 file_t *fp; 2495 uf_entry_t *ufp; 2496 uf_info_t *fip = P_FINFO(curproc); 2497 struct stdata *stp; 2498 polldat_t *pdp; 2499 2500 mutex_enter(&fip->fi_lock); 2501 for (i = 0; i < fip->fi_nfiles; i++) { 2502 UF_ENTER(ufp, fip, i); 2503 if ((fp = ufp->uf_file) != NULL) { 2504 if ((stp = fp->f_vnode->v_stream) != NULL) { 2505 PH_ENTER(&stp->sd_pollist); 2506 pdp = stp->sd_pollist.ph_list; 2507 while (pdp) { 2508 ASSERT(pdp->pd_thread != curthread); 2509 pdp = pdp->pd_next; 2510 } 2511 PH_EXIT(&stp->sd_pollist); 2512 } 2513 } 2514 UF_EXIT(ufp); 2515 } 2516 mutex_exit(&fip->fi_lock); 2517 } 2518 2519 /* 2520 * for resolved set poll list, the xref info in the pcache should be 2521 * consistent with this poll list. 2522 */ 2523 static int 2524 pollcheckxref(pollstate_t *ps, int cacheindex) 2525 { 2526 pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd; 2527 pollcache_t *pcp = ps->ps_pcache; 2528 polldat_t *pdp; 2529 int i; 2530 xref_t *refp; 2531 2532 for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) { 2533 if (pollfdp[i].fd < 0) { 2534 continue; 2535 } 2536 pdp = pcache_lookup_fd(pcp, pollfdp[i].fd); 2537 ASSERT(pdp != NULL); 2538 ASSERT(pdp->pd_ref != NULL); 2539 refp = &pdp->pd_ref[cacheindex]; 2540 if (refp->xf_position >= 0) { 2541 ASSERT(refp->xf_refcnt >= 1); 2542 ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd); 2543 if (refp->xf_refcnt > 1) { 2544 int j; 2545 int count = 0; 2546 2547 for (j = refp->xf_position; 2548 j < ps->ps_pcacheset[cacheindex].pcs_nfds; 2549 j++) { 2550 if (pollfdp[j].fd == pdp->pd_fd) { 2551 count++; 2552 } 2553 } 2554 ASSERT(count == refp->xf_refcnt); 2555 } 2556 } 2557 } 2558 return (1); 2559 } 2560 2561 /* 2562 * For every cached pollfd, its polldat struct should be consistent with 2563 * what is in the pcacheset lists. 2564 */ 2565 static void 2566 checkpolldat(pollstate_t *ps) 2567 { 2568 pollcache_t *pcp = ps->ps_pcache; 2569 polldat_t **hashtbl; 2570 int i; 2571 2572 hashtbl = pcp->pc_hash; 2573 for (i = 0; i < pcp->pc_hashsize; i++) { 2574 polldat_t *pdp; 2575 2576 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 2577 ASSERT(pdp->pd_ref != NULL); 2578 if (pdp->pd_count > 0) { 2579 xref_t *refp; 2580 int j; 2581 pollcacheset_t *pcsp; 2582 pollfd_t *pollfd; 2583 2584 for (j = 0; j < ps->ps_nsets; j++) { 2585 refp = &pdp->pd_ref[j]; 2586 if (refp->xf_refcnt > 0) { 2587 pcsp = &ps->ps_pcacheset[j]; 2588 ASSERT(refp->xf_position < pcsp->pcs_nfds); 2589 pollfd = pcsp->pcs_pollfd; 2590 ASSERT(pdp->pd_fd == pollfd[refp->xf_position].fd); 2591 } 2592 } 2593 } 2594 } 2595 } 2596 } 2597 2598 /* 2599 * every wfd element on ph_list must have a corresponding fpollinfo on the 2600 * uf_fpollinfo list. This is a variation of infpollinfo() w/o holding locks. 2601 */ 2602 void 2603 checkwfdlist(vnode_t *vp, fpollinfo_t *fpip) 2604 { 2605 stdata_t *stp; 2606 polldat_t *pdp; 2607 fpollinfo_t *fpip2; 2608 2609 if ((stp = vp->v_stream) == NULL) { 2610 return; 2611 } 2612 PH_ENTER(&stp->sd_pollist); 2613 for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) { 2614 if (pdp->pd_thread != NULL && 2615 pdp->pd_thread->t_procp == curthread->t_procp) { 2616 for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) { 2617 if (pdp->pd_thread == fpip2->fp_thread) { 2618 break; 2619 } 2620 } 2621 ASSERT(fpip2 != NULL); 2622 } 2623 } 2624 PH_EXIT(&stp->sd_pollist); 2625 } 2626 2627 /* 2628 * For each cached fd whose bit is not set in bitmap, its revents field in 2629 * current poll list should be 0. 2630 */ 2631 static int 2632 pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex) 2633 { 2634 pollcache_t *pcp = ps->ps_pcache; 2635 pollfd_t *pollfdp = ps->ps_pollfd; 2636 int i; 2637 2638 for (i = begin; i < end; i++) { 2639 polldat_t *pdp; 2640 2641 ASSERT(!BT_TEST(pcp->pc_bitmap, i)); 2642 pdp = pcache_lookup_fd(pcp, i); 2643 if (pdp && pdp->pd_fp != NULL) { 2644 xref_t *refp; 2645 int entry; 2646 2647 ASSERT(pdp->pd_ref != NULL); 2648 refp = &pdp->pd_ref[cacheindex]; 2649 if (refp->xf_refcnt == 0) { 2650 continue; 2651 } 2652 entry = refp->xf_position; 2653 ASSERT(entry >= 0); 2654 ASSERT(pollfdp[entry].revents == 0); 2655 if (refp->xf_refcnt > 1) { 2656 int j; 2657 2658 for (j = entry + 1; j < ps->ps_nfds; j++) { 2659 if (pollfdp[j].fd == i) { 2660 ASSERT(pollfdp[j].revents == 0); 2661 } 2662 } 2663 } 2664 } 2665 } 2666 return (1); 2667 } 2668 2669 #endif /* DEBUG */ 2670 2671 pollcache_t * 2672 pcache_alloc() 2673 { 2674 return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP)); 2675 } 2676 2677 void 2678 pcache_create(pollcache_t *pcp, nfds_t nfds) 2679 { 2680 size_t mapsize; 2681 2682 /* 2683 * allocate enough bits for the poll fd list 2684 */ 2685 if ((mapsize = POLLMAPCHUNK) <= nfds) { 2686 mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1); 2687 } 2688 pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t), 2689 KM_SLEEP); 2690 pcp->pc_mapsize = mapsize; 2691 /* 2692 * The hash size is at least POLLHASHCHUNKSZ. If user polls a large 2693 * number of fd to start with, allocate a bigger hash table (to the 2694 * nearest multiple of POLLHASHCHUNKSZ) because dynamically growing a 2695 * hash table is expensive. 2696 */ 2697 if (nfds < POLLHASHCHUNKSZ) { 2698 pcp->pc_hashsize = POLLHASHCHUNKSZ; 2699 } else { 2700 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 2701 ~(POLLHASHCHUNKSZ - 1); 2702 } 2703 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 2704 KM_SLEEP); 2705 } 2706 2707 void 2708 pcache_destroy(pollcache_t *pcp) 2709 { 2710 polldat_t **hashtbl; 2711 int i; 2712 2713 hashtbl = pcp->pc_hash; 2714 for (i = 0; i < pcp->pc_hashsize; i++) { 2715 if (hashtbl[i] != NULL) { 2716 polldat_t *pdp, *pdp2; 2717 2718 pdp = hashtbl[i]; 2719 while (pdp != NULL) { 2720 pdp2 = pdp->pd_hashnext; 2721 if (pdp->pd_ref != NULL) { 2722 kmem_free(pdp->pd_ref, sizeof (xref_t) * 2723 pdp->pd_nsets); 2724 } 2725 kmem_free(pdp, sizeof (polldat_t)); 2726 pdp = pdp2; 2727 pcp->pc_fdcount--; 2728 } 2729 } 2730 } 2731 ASSERT(pcp->pc_fdcount == 0); 2732 kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize); 2733 kmem_free(pcp->pc_bitmap, 2734 sizeof (ulong_t) * (pcp->pc_mapsize/BT_NBIPUL)); 2735 mutex_destroy(&pcp->pc_no_exit); 2736 mutex_destroy(&pcp->pc_lock); 2737 cv_destroy(&pcp->pc_cv); 2738 cv_destroy(&pcp->pc_busy_cv); 2739 kmem_free(pcp, sizeof (pollcache_t)); 2740 } 2741 2742 pollcacheset_t * 2743 pcacheset_create(int nsets) 2744 { 2745 return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP)); 2746 } 2747 2748 void 2749 pcacheset_destroy(pollcacheset_t *pcsp, int nsets) 2750 { 2751 int i; 2752 2753 for (i = 0; i < nsets; i++) { 2754 if (pcsp[i].pcs_pollfd != NULL) { 2755 kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds * 2756 sizeof (pollfd_t)); 2757 } 2758 } 2759 kmem_free(pcsp, sizeof (pollcacheset_t) * nsets); 2760 } 2761 2762 /* 2763 * Check each duplicated poll fd in the poll list. It may be necessary to 2764 * VOP_POLL the same fd again using different poll events. getf() has been 2765 * done by caller. This routine returns 0 if it can sucessfully process the 2766 * entire poll fd list. It returns -1 if underlying vnode has changed during 2767 * a VOP_POLL, in which case the caller has to repoll. It returns a positive 2768 * value if VOP_POLL failed. 2769 */ 2770 static int 2771 plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp, 2772 int entry, int *fdcntp) 2773 { 2774 int i; 2775 int fd; 2776 nfds_t nfds = psp->ps_nfds; 2777 2778 fd = pollfdp[entry].fd; 2779 for (i = entry + 1; i < nfds; i++) { 2780 if (pollfdp[i].fd == fd) { 2781 if (pollfdp[i].events == pollfdp[entry].events) { 2782 if ((pollfdp[i].revents = 2783 pollfdp[entry].revents) != 0) { 2784 (*fdcntp)++; 2785 } 2786 } else { 2787 2788 int error; 2789 pollhead_t *php; 2790 pollcache_t *pcp = psp->ps_pcache; 2791 2792 /* 2793 * the events are different. VOP_POLL on this 2794 * fd so that we don't miss any revents. 2795 */ 2796 php = NULL; 2797 ASSERT(curthread->t_pollcache == NULL); 2798 error = VOP_POLL(fp->f_vnode, 2799 pollfdp[i].events, 0, 2800 &pollfdp[i].revents, &php, NULL); 2801 if (error) { 2802 return (error); 2803 } 2804 /* 2805 * layered devices(e.g. console driver) 2806 * may change the vnode and thus the pollhead 2807 * pointer out from underneath us. 2808 */ 2809 if (php != NULL && pdp->pd_php != NULL && 2810 php != pdp->pd_php) { 2811 pollhead_delete(pdp->pd_php, pdp); 2812 pdp->pd_php = php; 2813 pollhead_insert(php, pdp); 2814 /* 2815 * We could have missed a wakeup on the 2816 * new target device. Make sure the new 2817 * target gets polled once. 2818 */ 2819 BT_SET(pcp->pc_bitmap, fd); 2820 return (-1); 2821 } 2822 if (pollfdp[i].revents) { 2823 (*fdcntp)++; 2824 } 2825 } 2826 } 2827 } 2828 return (0); 2829 } 2830