/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/param.h>
#include <sys/isa_defs.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/mode.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/poll_impl.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/bitmap.h>
#include <sys/kstat.h>
#include <sys/rctl.h>
#include <sys/port_impl.h>
#include <sys/schedctl.h>
#include <sys/cpu.h>

#define	NPHLOCKS	64	/* Number of locks; must be power of 2 */
#define	PHLOCKADDR(php)	&plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)]
#define	PHLOCK(php)	PHLOCKADDR(php).pp_lock
#define	PH_ENTER(php)	mutex_enter(PHLOCK(php))
#define	PH_EXIT(php)	mutex_exit(PHLOCK(php))
#define	VALID_POLL_EVENTS	(POLLIN | POLLPRI | POLLOUT | POLLRDNORM \
	| POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL)

/*
 * global counters to collect some stats
 */
static struct {
	kstat_named_t	polllistmiss;	/* failed to find a cached poll list */
	kstat_named_t	pollcachehit;	/* list matched 100% w/ cached one */
	kstat_named_t	pollcachephit;	/* list matched < 100% w/ cached one */
	kstat_named_t	pollcachemiss;	/* every list entry is dif from cache */
} pollstats = {
	{ "polllistmiss",	KSTAT_DATA_UINT64 },
	{ "pollcachehit",	KSTAT_DATA_UINT64 },
	{ "pollcachephit",	KSTAT_DATA_UINT64 },
	{ "pollcachemiss",	KSTAT_DATA_UINT64 }
};

kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats;
uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t);

struct pplock	{
	kmutex_t	pp_lock;
	short		pp_flag;
	kcondvar_t	pp_wait_cv;
	int32_t		pp_pad;		/* to a nice round 16 bytes */
};

static struct pplock plocks[NPHLOCKS];	/* Hash array of pollhead locks */

#ifdef DEBUG
static int pollchecksanity(pollstate_t *, nfds_t);
static int pollcheckxref(pollstate_t *, int);
static void pollcheckphlist(void);
static int pollcheckrevents(pollstate_t *, int, int, int);
static void checkpolldat(pollstate_t *);
#endif	/* DEBUG */
static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *,
    int, int *);
/*
 * Data structure overview:
 * The per-thread poll state consists of
 *	one pollstate_t
 *	one pollcache_t
 *	one bitmap with one event bit per fd
 *	a (two-dimensional) hashed array of polldat_t structures - one entry
 *	per fd
 *
 * This conglomerate of data structures interacts with
 *	the pollhead which is used by VOP_POLL and pollwakeup
 *	(protected by the PHLOCK, cached array of plocks), and
 *	the fpollinfo list hanging off the fi_list which is used to notify
 *	poll when a cached fd is closed. This is protected by uf_lock.
 *
 * Invariants:
 *	pd_php (pollhead pointer) is set iff (if and only if) the polldat
 *	is on that pollhead. This is modified atomically under pc_lock.
 *
 *	pd_fp (file_t pointer) is set iff the thread is on the fpollinfo
 *	list for that open file.
 *	This is modified atomically under pc_lock.
 *
 *	pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt.
 *	Iff pd_ref[i].xf_refcnt >= 1 then
 *		ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd
 *	Iff pd_ref[i].xf_refcnt > 1 then
 *		In ps_pcacheset[i].pcs_pollfd between index
 *		pd_ref[i].xf_position and the end of the list
 *		there are xf_refcnt entries with .fd == pd_fd
 *
 * Locking design:
 * Whenever possible the design relies on the fact that the poll cache state
 * is per thread, thus for both poll and exit it is self-synchronizing.
 * Thus the key interactions where other threads access the state are:
 *	pollwakeup (and polltime), and
 *	close cleaning up the cached references to an open file
 *
 * The two key locks in poll proper are ps_lock and pc_lock.
 *
 * The ps_lock is used for synchronization between poll, (lwp_)exit and close
 * to ensure that modifications to pollcacheset structure are serialized.
 * This lock is held through most of poll() except where poll sleeps
 * since there is little need to handle closes concurrently with the execution
 * of poll.
 * The pc_lock protects most of the fields in pollcache structure and polldat
 * structures (which are accessed by poll, pollwakeup, and polltime)
 * with the exception of fields that are only modified when only one thread
 * can access this per-thread state.
 * Those exceptions occur in poll when first allocating the per-thread state,
 * when poll grows the number of polldat (never shrinks), and when
 * exit/pollcleanup has ensured that there are no references from either
 * pollheads or fpollinfo to the thread's poll state.
 *
 * The poll(2) system call is the only path in which ps_lock and pc_lock are
 * both held, in that order. It needs ps_lock to synchronize with close and
 * lwp_exit; and pc_lock with pollwakeup.
 *
 * The locking interaction between pc_lock and PHLOCK takes into account
 * that poll acquires these locks in the order of pc_lock and then PHLOCK
 * while pollwakeup does it in the reverse order. Thus pollwakeup implements
 * deadlock avoidance by dropping the locks and reacquiring them in the
 * reverse order. For this to work pollwakeup needs to prevent the thread
 * from exiting and freeing all of the poll related state. This is done
 * using
 *	the pc_no_exit lock
 *	the pc_busy counter
 *	the pc_busy_cv condition variable
 *
 * The locking interaction between pc_lock and uf_lock has similar
 * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef
 * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock
 * to prevent poll or exit from doing a delfpollinfo after which the thread
 * might exit. But the cleanup needs to acquire pc_lock when modifying
 * the poll cache state. The solution is to use pc_busy and do the close
 * cleanup in two phases:
 *	First close calls pollblockexit which increments pc_busy.
 *	This prevents the per-thread poll related state from being freed.
 *	Then close drops uf_lock and calls pollcacheclean.
 *	This routine can then acquire pc_lock and remove any references
 *	to the closing fd (as well as recording that it has been closed
 *	so that a POLLNVAL can be generated even if the fd is reused before
 *	poll has been woken up and checked getf() again).
 *
 * When removing a polled fd from poll cache, the fd is always removed
 * from pollhead list first and then from fpollinfo list, i.e.,
 * pollhead_delete() is called before delfpollinfo().
 *
 *
 * Locking hierarchy:
 *	pc_no_exit is a leaf level lock.
 *	ps_lock is held when acquiring pc_lock (except when pollwakeup
 *	acquires pc_lock).
 *	pc_lock might be held when acquiring PHLOCK (in pollhead_insert/
 *	pollhead_delete and pollwakeup called from pcache_clean_entry);
 *	holding it there is not required.
 *	pc_lock is held across addfpollinfo/delfpollinfo which acquire
 *	uf_lock.
 *	pc_lock is held across getf/releasef which acquire uf_lock.
 *	ps_lock might be held across getf/releasef which acquire uf_lock.
 *	pollwakeup tries to acquire pc_lock while holding PHLOCK
 *	but drops the locks and reacquires them in reverse order to avoid
 *	deadlock.
 *
 * Note also that there is deadlock avoidance support for VOP_POLL routines
 * and pollwakeup involving a file system or driver lock.
 * See below.
 */

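/*
 * Worked illustration of the xref invariant above (hypothetical values,
 * not part of the original source): suppose cache set i holds the poll
 * list { fd 4, fd 7, fd 4 }. Then the polldat for fd 4 has
 * pd_ref[i].xf_refcnt == 2 and pd_ref[i].xf_position == 0 (its first
 * appearance), the polldat for fd 7 has xf_refcnt == 1 and
 * xf_position == 1, and pd_count on each polldat is the sum of its
 * xf_refcnt values across all cache sets.
 */
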
/*
 * Deadlock avoidance support for VOP_POLL() routines. This is
 * sometimes necessary to prevent deadlock between polling threads
 * (which hold poll locks on entry to xx_poll(), then acquire foo)
 * and pollwakeup() threads (which hold foo, then acquire poll locks).
 *
 * pollunlock(void) releases whatever poll locks the current thread holds,
 *	returning a cookie for use by pollrelock();
 *
 * pollrelock(cookie) reacquires previously dropped poll locks;
 *
 * polllock(php, mutex) does the common case: pollunlock(),
 *	acquire the problematic mutex, pollrelock().
 */
int
pollunlock(void)
{
	pollcache_t *pcp;
	int lockstate = 0;

	/*
	 * t_pollcache is set by /dev/poll and event ports (port_fd.c).
	 * If pollunlock/pollrelock is called as a result of poll(2),
	 * t_pollcache should be NULL.
	 */
	if (curthread->t_pollcache == NULL)
		pcp = curthread->t_pollstate->ps_pcache;
	else
		pcp = curthread->t_pollcache;

	if (mutex_owned(&pcp->pc_lock)) {
		lockstate = 1;
		mutex_exit(&pcp->pc_lock);
	}
	return (lockstate);
}

void
pollrelock(int lockstate)
{
	pollcache_t *pcp;

	/*
	 * t_pollcache is set by /dev/poll and event ports (port_fd.c).
	 * If pollunlock/pollrelock is called as a result of poll(2),
	 * t_pollcache should be NULL.
	 */
	if (curthread->t_pollcache == NULL)
		pcp = curthread->t_pollstate->ps_pcache;
	else
		pcp = curthread->t_pollcache;

	if (lockstate > 0)
		mutex_enter(&pcp->pc_lock);
}

/* ARGSUSED */
void
polllock(pollhead_t *php, kmutex_t *lp)
{
	if (!mutex_tryenter(lp)) {
		int lockstate = pollunlock();
		mutex_enter(lp);
		pollrelock(lockstate);
	}
}

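/*
 * Illustrative sketch (not part of the original source): a driver's
 * chpoll(9E) entry point typically uses polllock() to take its own
 * mutex, avoiding the lock-order reversal described above when a
 * pollwakeup() thread holds that mutex. The "foo_softc" names and
 * fields below are hypothetical.
 */
#if 0
static int
foo_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	struct foo_softc *sc = foo_getsoftc(dev);	/* hypothetical */
	short revents = 0;

	/*
	 * polllock() drops any poll locks this thread holds before
	 * blocking on sc_mutex, then reacquires them, so a concurrent
	 * pollwakeup(&sc->sc_php, ...) path that already owns sc_mutex
	 * cannot deadlock against us. On return we hold sc_mutex.
	 */
	polllock(&sc->sc_php, &sc->sc_mutex);
	if ((events & POLLIN) && sc->sc_rxready)
		revents |= POLLIN;
	if (revents == 0 && !anyyet) {
		/* no events pending; arm the pollhead for pollwakeup() */
		*phpp = &sc->sc_php;
	}
	*reventsp = revents;
	mutex_exit(&sc->sc_mutex);
	return (0);
}
#endif
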
static int
poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	int fdcnt = 0;
	int i;
	hrtime_t deadline;	/* hrtime value when we want to return */
	pollfd_t *pollfdp;
	pollstate_t *ps;
	pollcache_t *pcp;
	int error = 0;
	nfds_t old_nfds;
	int cacheindex = 0;	/* which cache set is used */

	/*
	 * Determine the precise future time of the requested timeout, if any.
	 */
	if (tsp == NULL) {
		deadline = -1;
	} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
		deadline = 0;
	} else {
		/* They must wait at least a tick. */
		deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec;
		deadline = MAX(deadline, nsec_per_tick);
		deadline += gethrtime();
	}

	/*
	 * Reset our signal mask, if requested.
	 */
	if (ksetp != NULL) {
		mutex_enter(&p->p_lock);
		schedctl_finish_sigblock(t);
		lwp->lwp_sigoldmask = t->t_hold;
		t->t_hold = *ksetp;
		t->t_flag |= T_TOMASK;
		/*
		 * Call cv_reltimedwait_sig() just to check for signals.
		 * We will return immediately with either 0 or -1.
		 */
		if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
		    TR_CLOCK_TICK)) {
			mutex_exit(&p->p_lock);
			error = EINTR;
			goto pollout;
		}
		mutex_exit(&p->p_lock);
	}

	/*
	 * Check to see if this guy just wants to use poll() as a timeout.
	 * If yes then bypass all the other stuff and make him sleep.
	 */
	if (nfds == 0) {
		/*
		 * Sleep until we have passed the requested future
		 * time or until interrupted by a signal.
		 * Do not check for signals if we do not want to wait.
		 */
		if (deadline != 0) {
			mutex_enter(&t->t_delay_lock);
			while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv,
			    &t->t_delay_lock, deadline)) > 0)
				continue;
			mutex_exit(&t->t_delay_lock);
			error = (error == 0) ? EINTR : 0;
		}
		goto pollout;
	}

	if (nfds > p->p_fno_ctl) {
		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
		    p->p_rctls, p, RCA_SAFE);
		mutex_exit(&p->p_lock);
		error = EINVAL;
		goto pollout;
	}

	/*
	 * Need to allocate memory for pollstate before anything because
	 * the mutex and cv are created in this space
	 */
	if ((ps = t->t_pollstate) == NULL) {
		t->t_pollstate = pollstate_create();
		ps = t->t_pollstate;
	}

	if (ps->ps_pcache == NULL)
		ps->ps_pcache = pcache_alloc();
	pcp = ps->ps_pcache;

	/*
	 * NOTE: for performance, buffers are saved across poll() calls.
	 * The theory is that if a process polls heavily, it tends to poll
	 * on the same set of descriptors. Therefore, we only reallocate
	 * buffers when nfds changes. There is no hysteresis control,
	 * because there is no data to suggest that this is necessary;
	 * the penalty of reallocating is not *that* great in any event.
	 */
	old_nfds = ps->ps_nfds;
	if (nfds != old_nfds) {
		kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
		pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
		ps->ps_pollfd = pollfdp;
		ps->ps_nfds = nfds;
	}

	pollfdp = ps->ps_pollfd;
	if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
		error = EFAULT;
		goto pollout;
	}

	if (fds == NULL) {
		/*
		 * If the process has page 0 mapped, then the copyin() above
		 * will succeed even if fds is NULL. However, our cached
		 * poll lists are keyed by the address of the passed-in fds
		 * structure, and we use the value NULL to indicate an unused
		 * poll cache list entry. As such, we elect not to support
		 * NULL as a valid (user) memory address and fail the poll()
		 * call.
		 */
		error = EINVAL;
		goto pollout;
	}

	/*
	 * If this thread polls for the first time, allocate ALL poll
	 * cache data structures and cache the poll fd list. This
	 * allocation is delayed till now because lwps polling zero fds
	 * (i.e., using poll() as a timeout) don't need this memory.
	 */
	mutex_enter(&ps->ps_lock);
	pcp = ps->ps_pcache;
	ASSERT(pcp != NULL);
	if (pcp->pc_bitmap == NULL) {
		pcache_create(pcp, nfds);
		/*
		 * poll and cache this poll fd list in ps_pcacheset[0].
		 */
		error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex);
		if (fdcnt || error) {
			mutex_exit(&ps->ps_lock);
			goto pollout;
		}
	} else {
		pollcacheset_t *pcset = ps->ps_pcacheset;

		/*
		 * Not first time polling. Select a cached poll list by
		 * matching user pollfd list buffer address.
		 */
		for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) {
			if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) {
				if ((++pcset[cacheindex].pcs_count) == 0) {
					/*
					 * counter is wrapping around.
					 */
					pcacheset_reset_count(ps, cacheindex);
				}
				/*
				 * examine and resolve possible
				 * differences between the current poll
				 * list and the previously cached one.
				 * If there is an error during resolve(),
				 * the callee will guarantee the consistency
				 * of cached poll list and cache content.
				 */
				error = pcacheset_resolve(ps, nfds, &fdcnt,
				    cacheindex);
				if (error) {
					mutex_exit(&ps->ps_lock);
					goto pollout;
				}
				break;
			}

			/*
			 * Note that the pcs_usradr field of a used entry
			 * won't be NULL, because it stores the address of
			 * the passed-in fds and NULL fds will not be cached
			 * (that is either the special timeout case when nfds
			 * is 0, or poll() fails directly).
			 */
			if (pcset[cacheindex].pcs_usradr == NULL) {
				/*
				 * found an unused entry. Use it to cache
				 * this poll list.
				 */
				error = pcacheset_cache_list(ps, fds, &fdcnt,
				    cacheindex);
				if (fdcnt || error) {
					mutex_exit(&ps->ps_lock);
					goto pollout;
				}
				break;
			}
		}
		if (cacheindex == ps->ps_nsets) {
			/*
			 * We failed to find a matching cached poll fd list.
			 * Replace an old list.
			 */
			pollstats.polllistmiss.value.ui64++;
			cacheindex = pcacheset_replace(ps);
			ASSERT(cacheindex < ps->ps_nsets);
			pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
			error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex);
			if (error) {
				mutex_exit(&ps->ps_lock);
				goto pollout;
			}
		}
	}

	/*
	 * Always scan the bitmap with the lock on the pollcache held.
	 * This is to make sure that a wakeup does not come undetected.
	 * If the lock is not held, a pollwakeup could have come for an
	 * fd we already checked but before this thread sleeps, in which
	 * case the wakeup is missed. Now we hold the pcache lock and
	 * check the bitmap again. This will prevent wakeup from happening
	 * while we hold pcache lock since pollwakeup() will also lock
	 * the pcache before updating poll bitmap.
	 */
	mutex_enter(&pcp->pc_lock);
	for (;;) {
		pcp->pc_flag = 0;
		error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex);
		if (fdcnt || error) {
			mutex_exit(&pcp->pc_lock);
			mutex_exit(&ps->ps_lock);
			break;
		}

		/*
		 * If T_POLLWAKE is set, a pollwakeup() was performed on
		 * one of the file descriptors. This can happen only if
		 * one of the VOP_POLL() functions dropped pcp->pc_lock.
		 * The only current cases of this are in procfs (prpoll())
		 * and STREAMS (strpoll()).
		 */
		if (pcp->pc_flag & T_POLLWAKE)
			continue;

		/*
		 * If you get here, the poll of fds was unsuccessful.
		 * Wait until some fd becomes readable, writable, or gets
		 * an exception, or until a signal or a timeout occurs.
		 * Do not check for signals if we have a zero timeout.
		 */
		mutex_exit(&ps->ps_lock);
		if (deadline == 0) {
			error = -1;
		} else {
			error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
			    &pcp->pc_lock, deadline);
		}
		mutex_exit(&pcp->pc_lock);
		/*
		 * If we have received a signal or timed out
		 * then break out and return.
		 */
		if (error <= 0) {
			error = (error == 0) ? EINTR : 0;
			break;
		}
		/*
		 * We have not received a signal or timed out.
		 * Continue around and poll fds again.
		 */
		mutex_enter(&ps->ps_lock);
		mutex_enter(&pcp->pc_lock);
	}

pollout:
	/*
	 * If we changed the signal mask but we received
	 * no signal then restore the signal mask.
	 * Otherwise psig() will deal with the signal mask.
	 */
	if (ksetp != NULL) {
		mutex_enter(&p->p_lock);
		if (lwp->lwp_cursig == 0) {
			t->t_hold = lwp->lwp_sigoldmask;
			t->t_flag &= ~T_TOMASK;
		}
		mutex_exit(&p->p_lock);
	}

	if (error)
		return (set_errno(error));

	/*
	 * Copy out the events and return the fdcnt to the user.
	 */
	if (nfds != 0 &&
	    copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
		return (set_errno(EFAULT));

#ifdef DEBUG
	/*
	 * Another sanity check:
	 */
	if (fdcnt) {
		int	reventcnt = 0;

		for (i = 0; i < nfds; i++) {
			if (pollfdp[i].fd < 0) {
				ASSERT(pollfdp[i].revents == 0);
				continue;
			}
			if (pollfdp[i].revents) {
				reventcnt++;
			}
		}
		ASSERT(fdcnt == reventcnt);
	} else {
		for (i = 0; i < nfds; i++) {
			ASSERT(pollfdp[i].revents == 0);
		}
	}
#endif	/* DEBUG */

	return (fdcnt);
}

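/*
 * Hypothetical userland illustration (not part of the original source)
 * of the nfds == 0 special case handled above: poll() with no fds is
 * simply a signal-interruptible sleep with millisecond resolution, and
 * never reaches the fd-cache machinery.
 */
#if 0
	/* sleep for roughly 250 milliseconds, or until a signal arrives */
	(void) poll(NULL, 0, 250);
#endif
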
/*
 * This is the system call trap that poll(),
 * select() and pselect() are built upon.
 * It is a private interface between libc and the kernel.
 */
int
pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
{
	timespec_t ts;
	timespec_t *tsp;
	sigset_t set;
	k_sigset_t kset;
	k_sigset_t *ksetp;
	model_t datamodel = get_udatamodel();

	if (timeoutp == NULL)
		tsp = NULL;
	else {
		if (datamodel == DATAMODEL_NATIVE) {
			if (copyin(timeoutp, &ts, sizeof (ts)))
				return (set_errno(EFAULT));
		} else {
			timespec32_t ts32;

			if (copyin(timeoutp, &ts32, sizeof (ts32)))
				return (set_errno(EFAULT));
			TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
		}

		if (itimerspecfix(&ts))
			return (set_errno(EINVAL));
		tsp = &ts;
	}

	if (setp == NULL)
		ksetp = NULL;
	else {
		if (copyin(setp, &set, sizeof (set)))
			return (set_errno(EFAULT));
		sigutok(&set, &kset);
		ksetp = &kset;
	}

	return (poll_common(fds, nfds, tsp, ksetp));
}

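/*
 * Hypothetical userland illustration (not part of the original source)
 * of the sigmask argument that pollsys() accepts on behalf of pselect()-
 * style wrappers: the mask is installed and restored around the wait as
 * one atomic operation, closing the usual sigprocmask()/poll() race.
 */
#if 0
/* userland fragment */
static void
wait_for_sigusr1(void)
{
	sigset_t waitmask;
	struct timespec ts = { 1, 0 };		/* one second */

	(void) sigfillset(&waitmask);
	(void) sigdelset(&waitmask, SIGUSR1);	/* only SIGUSR1 may wake us */
	if (pselect(0, NULL, NULL, NULL, &ts, &waitmask) < 0 &&
	    errno == EINTR) {
		/* SIGUSR1 arrived while we were blocked */
	}
}
#endif
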
/*
 * Clean up any state left around by poll(2). Called when a thread exits.
 */
void
pollcleanup()
{
	pollstate_t *ps = curthread->t_pollstate;
	pollcache_t *pcp;

	if (ps == NULL)
		return;
	pcp = ps->ps_pcache;
	/*
	 * free up all cached poll fds
	 */
	if (pcp == NULL) {
		/* this pollstate is used by /dev/poll */
		goto pollcleanout;
	}

	if (pcp->pc_bitmap != NULL) {
		ASSERT(MUTEX_NOT_HELD(&ps->ps_lock));
		/*
		 * a closing lwp can race with us when cleaning up a polldat
		 * entry. We hold the ps_lock when cleaning the hash table.
		 * Since this pollcache is going away anyway, there is no
		 * need to hold the pc_lock.
		 */
		mutex_enter(&ps->ps_lock);
		pcache_clean(pcp);
		mutex_exit(&ps->ps_lock);
#ifdef DEBUG
		/*
		 * At this point, all fds cached by this lwp should be
		 * cleaned up. There should be no fd in fi_list still
		 * referencing this thread.
		 */
		checkfpollinfo();	/* sanity check */
		pollcheckphlist();	/* sanity check */
#endif	/* DEBUG */
	}
	/*
	 * Be sure no one is referencing thread before exiting
	 */
	mutex_enter(&pcp->pc_no_exit);
	ASSERT(pcp->pc_busy >= 0);
	while (pcp->pc_busy > 0)
		cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
	mutex_exit(&pcp->pc_no_exit);
pollcleanout:
	pollstate_destroy(ps);
	curthread->t_pollstate = NULL;
}

/*
 * pollwakeup() - poke threads waiting in poll() for some event
 * on a particular object.
 *
 * The threads hanging off of the specified pollhead structure are scanned.
 * If their event mask matches the specified event(s), then pollnotify() is
 * called to poke the thread.
 *
 * Multiple events may be specified. When POLLHUP or POLLERR are specified,
 * all waiting threads are poked.
 *
 * It is important that pollnotify() not drop the lock protecting the list
 * of threads.
 */
void
pollwakeup(pollhead_t *php, short events_arg)
{
	polldat_t	*pdp;
	int		events = (ushort_t)events_arg;
	struct plist {
		port_t *pp;
		int	pevents;
		struct plist *next;
	};
	struct plist *plhead = NULL, *pltail = NULL;

retry:
	PH_ENTER(php);

	for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) {
		if ((pdp->pd_events & events) ||
		    (events & (POLLHUP | POLLERR))) {

			pollcache_t	*pcp;

			if (pdp->pd_portev != NULL) {
				port_kevent_t	*pkevp = pdp->pd_portev;
				/*
				 * Object (fd) is associated with an event
				 * port, => send event notification to the
				 * port.
				 */
				ASSERT(pkevp->portkev_source == PORT_SOURCE_FD);
				mutex_enter(&pkevp->portkev_lock);
				if (pkevp->portkev_flags & PORT_KEV_VALID) {
					int pevents;

					pkevp->portkev_flags &= ~PORT_KEV_VALID;
					pkevp->portkev_events |= events &
					    (pdp->pd_events | POLLHUP |
					    POLLERR);
					/*
					 * portkev_lock mutex will be released
					 * by port_send_event().
					 */
					port_send_event(pkevp);

					/*
					 * If we have some thread polling the
					 * port's fd, add it to the list. They
					 * will be notified later.
					 * The port_pollwkup() will flag the
					 * port_t so that it will not disappear
					 * till port_pollwkdone() is called.
					 */
					pevents =
					    port_pollwkup(pkevp->portkev_port);
					if (pevents) {
						struct plist *t;
						t = kmem_zalloc(
						    sizeof (struct plist),
						    KM_SLEEP);
						t->pp = pkevp->portkev_port;
						t->pevents = pevents;
						if (plhead == NULL) {
							plhead = t;
						} else {
							pltail->next = t;
						}
						pltail = t;
					}
				} else {
					mutex_exit(&pkevp->portkev_lock);
				}
				continue;
			}

			pcp = pdp->pd_pcache;

			/*
			 * Try to grab the lock for this thread. If
			 * we don't get it then we may deadlock so
			 * back out and restart all over again. Note
			 * that the failure rate is very very low.
			 */
			if (mutex_tryenter(&pcp->pc_lock)) {
				pollnotify(pcp, pdp->pd_fd);
				mutex_exit(&pcp->pc_lock);
			} else {
				/*
				 * We are here because:
				 *	1) This thread has been woken up
				 *	   and is trying to get out of poll().
				 *	2) Some other thread is also here
				 *	   but with a different pollhead lock.
				 *
				 * So, we need to drop the lock on pollhead
				 * because of (1) but we want to prevent
				 * that thread from doing lwp_exit() or
				 * devpoll close. We want to ensure that
				 * the pollcache pointer is still valid.
				 *
				 * Solution: Grab the pcp->pc_no_exit lock,
				 * increment the pc_busy counter, drop every
				 * lock in sight. Get out of the way and wait
				 * for type (2) threads to finish.
				 */

				mutex_enter(&pcp->pc_no_exit);
				pcp->pc_busy++;	/* prevents exit()'s */
				mutex_exit(&pcp->pc_no_exit);

				PH_EXIT(php);
				mutex_enter(&pcp->pc_lock);
				mutex_exit(&pcp->pc_lock);
				mutex_enter(&pcp->pc_no_exit);
				pcp->pc_busy--;
				if (pcp->pc_busy == 0) {
					/*
					 * Wakeup the thread waiting in
					 * thread_exit().
					 */
					cv_signal(&pcp->pc_busy_cv);
				}
				mutex_exit(&pcp->pc_no_exit);
				goto retry;
			}
		}
	}


	/*
	 * Event ports - If this php belongs to the port at the head of the
	 * list, call port_pollwkdone() to release it. port_pollwkdone()
	 * needs to be called before dropping the PH lock so that any new
	 * thread attempting to poll this port is blocked. There can be
	 * only one thread here in pollwakeup notifying this port's fd.
	 */
	if (plhead != NULL && &plhead->pp->port_pollhd == php) {
		struct plist *t;
		port_pollwkdone(plhead->pp);
		t = plhead;
		plhead = plhead->next;
		kmem_free(t, sizeof (struct plist));
	}
	PH_EXIT(php);

	/*
	 * Event ports - Notify threads polling the event port's fd.
	 * This is normally done in port_send_event() where it calls
	 * pollwakeup() on the port. But, for PORT_SOURCE_FD source alone,
	 * we do it here in pollwakeup() to avoid a recursive call.
	 */
	if (plhead != NULL) {
		php = &plhead->pp->port_pollhd;
		events = plhead->pevents;
		goto retry;
	}
}

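/*
 * Illustrative sketch (not part of the original source): a driver's
 * wakeup path typically calls pollwakeup() once data arrives. The
 * "foo_softc" names are hypothetical; polllock() in the chpoll sketch
 * earlier is what makes this safe against lock-order reversal.
 */
#if 0
static void
foo_rx_intr(struct foo_softc *sc)
{
	mutex_enter(&sc->sc_mutex);
	sc->sc_rxready = B_TRUE;
	mutex_exit(&sc->sc_mutex);

	/* poke any threads blocked in poll(2) on our minor node */
	pollwakeup(&sc->sc_php, POLLIN | POLLRDNORM);
}
#endif
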
/*
 * This function is called to inform a thread that
 * an event being polled for has occurred.
 * The pollstate lock on the thread should be held on entry.
 */
void
pollnotify(pollcache_t *pcp, int fd)
{
	ASSERT(fd < pcp->pc_mapsize);
	ASSERT(MUTEX_HELD(&pcp->pc_lock));
	BT_SET(pcp->pc_bitmap, fd);
	pcp->pc_flag |= T_POLLWAKE;
	cv_signal(&pcp->pc_cv);
}

/*
 * add a polldat entry to pollhead ph_list. The polldat struct is used
 * by pollwakeup to wake sleeping pollers when polled events have happened.
 */
void
pollhead_insert(pollhead_t *php, polldat_t *pdp)
{
	PH_ENTER(php);
	ASSERT(pdp->pd_next == NULL);
#ifdef DEBUG
	{
		/*
		 * the polldat should not be already on the list
		 */
		polldat_t *wp;
		for (wp = php->ph_list; wp; wp = wp->pd_next) {
			ASSERT(wp != pdp);
		}
	}
#endif	/* DEBUG */
	pdp->pd_next = php->ph_list;
	php->ph_list = pdp;
	PH_EXIT(php);
}

/*
 * Delete the polldat entry from ph_list.
 */
void
pollhead_delete(pollhead_t *php, polldat_t *pdp)
{
	polldat_t *wp;
	polldat_t **wpp;

	PH_ENTER(php);
	for (wpp = &php->ph_list; (wp = *wpp) != NULL; wpp = &wp->pd_next) {
		if (wp == pdp) {
			*wpp = pdp->pd_next;
			pdp->pd_next = NULL;
			break;
		}
	}
#ifdef DEBUG
	/* assert that pdp is no longer in the list */
	for (wp = *wpp; wp; wp = wp->pd_next) {
		ASSERT(wp != pdp);
	}
#endif	/* DEBUG */
	PH_EXIT(php);
}

/*
 * walk through the poll fd lists to see if they are identical. This is an
 * expensive operation and should not be done more than once for each poll()
 * call.
 *
 * As an optimization (i.e., not having to go through the lists more than
 * once), this routine also clears the revents field of pollfd in 'current'.
 * Zeroing out the revents field of each entry in the current poll list is
 * required by the poll(2) man page.
 *
 * Since the events field of the cached list has illegal poll events filtered
 * out, the current list applies the same filtering before comparison.
 *
 * The routine stops when it detects a meaningful difference, or when it
 * exhausts the lists.
 */
int
pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n)
{
	int ix;

	for (ix = 0; ix < n; ix++) {
		/* Prefetch 64 bytes worth of 8-byte elements */
		if ((ix & 0x7) == 0) {
			prefetch_write_many((caddr_t)&current[ix + 8]);
			prefetch_write_many((caddr_t)&cached[ix + 8]);
		}
		if (current[ix].fd == cached[ix].fd) {
			/*
			 * Filter out invalid poll events while we are
			 * inside the loop.
			 */
			if (current[ix].events & ~VALID_POLL_EVENTS) {
				current[ix].events &= VALID_POLL_EVENTS;
				if (newlist != NULL)
					newlist[ix].events = current[ix].events;
			}
			if (current[ix].events == cached[ix].events) {
				current[ix].revents = 0;
				continue;
			}
		}
		if ((current[ix].fd < 0) && (cached[ix].fd < 0)) {
			current[ix].revents = 0;
			continue;
		}
		return (ix);
	}
	return (ix);
}

/*
 * This routine returns a pointer to a cached poll fd entry, or NULL if it
 * does not find it in the hash table.
 */
polldat_t *
pcache_lookup_fd(pollcache_t *pcp, int fd)
{
	int hashindex;
	polldat_t *pdp;

	hashindex = POLLHASH(pcp->pc_hashsize, fd);
	pdp = pcp->pc_hash[hashindex];
	while (pdp != NULL) {
		if (pdp->pd_fd == fd)
			break;
		pdp = pdp->pd_hashnext;
	}
	return (pdp);
}

polldat_t *
pcache_alloc_fd(int nsets)
{
	polldat_t *pdp;

	pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP);
	if (nsets > 0) {
		pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP);
		pdp->pd_nsets = nsets;
	}
	return (pdp);
}

/*
 * This routine inserts a polldat into the pollcache's hash table. It
 * may be necessary to grow the size of the hash table.
 */
void
pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds)
{
	int hashindex;
	int fd;

	if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) ||
	    (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) {
		pcache_grow_hashtbl(pcp, nfds);
	}
	fd = pdp->pd_fd;
	hashindex = POLLHASH(pcp->pc_hashsize, fd);
	pdp->pd_hashnext = pcp->pc_hash[hashindex];
	pcp->pc_hash[hashindex] = pdp;
	pcp->pc_fdcount++;

#ifdef DEBUG
	{
		/*
		 * same fd should not appear on a hash list twice
		 */
		polldat_t *pdp1;
		for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) {
			ASSERT(pdp->pd_fd != pdp1->pd_fd);
		}
	}
#endif	/* DEBUG */
}

/*
 * Grow the hash table -- either double the table size or round it up to the
 * nearest multiple of POLLHASHCHUNKSZ, whichever is bigger. Rehash all the
 * elements on the hash table.
 */
void
pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds)
{
	int	oldsize;
	polldat_t **oldtbl;
	polldat_t *pdp, *pdp1;
	int	i;
#ifdef DEBUG
	int	count = 0;
#endif

	ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0);
	oldsize = pcp->pc_hashsize;
	oldtbl = pcp->pc_hash;
	if (nfds > pcp->pc_hashsize * POLLHASHINC) {
		pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) &
		    ~(POLLHASHCHUNKSZ - 1);
	} else {
		pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC;
	}
	pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *),
	    KM_SLEEP);
	/*
	 * rehash existing elements
	 */
	pcp->pc_fdcount = 0;
	for (i = 0; i < oldsize; i++) {
		pdp = oldtbl[i];
		while (pdp != NULL) {
			pdp1 = pdp->pd_hashnext;
			pcache_insert_fd(pcp, pdp, nfds);
			pdp = pdp1;
#ifdef DEBUG
			count++;
#endif
		}
	}
	kmem_free(oldtbl, oldsize * sizeof (polldat_t *));
	ASSERT(pcp->pc_fdcount == count);
}

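/*
 * Worked example of the rounding above (hypothetical sizes, assuming
 * POLLHASHCHUNKSZ is a power of 2, e.g. 128): with nfds = 300,
 * (300 + 128 - 1) & ~(128 - 1) == 427 & ~127 == 384, i.e. nfds rounded
 * up to the next multiple of 128. pcache_grow_map() below uses the same
 * trick with POLLMAPCHUNK to size the event bitmap.
 */
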
void
pcache_grow_map(pollcache_t *pcp, int fd)
{
	int	newsize;
	ulong_t	*newmap;

	/*
	 * grow to nearest multiple of POLLMAPCHUNK, assuming POLLMAPCHUNK is
	 * power of 2.
	 */
	newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1);
	newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t),
	    KM_SLEEP);
	/*
	 * don't want pollwakeup to set a bit while growing the bitmap.
	 */
	ASSERT(mutex_owned(&pcp->pc_lock) == 0);
	mutex_enter(&pcp->pc_lock);
	bcopy(pcp->pc_bitmap, newmap,
	    (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t));
	kmem_free(pcp->pc_bitmap,
	    (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t));
	pcp->pc_bitmap = newmap;
	pcp->pc_mapsize = newsize;
	mutex_exit(&pcp->pc_lock);
}

/*
 * remove all the references from the pollhead and fpollinfo lists.
 */
void
pcache_clean(pollcache_t *pcp)
{
	int i;
	polldat_t **hashtbl;
	polldat_t *pdp;

	ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock));
	hashtbl = pcp->pc_hash;
	for (i = 0; i < pcp->pc_hashsize; i++) {
		for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
			if (pdp->pd_php != NULL) {
				pollhead_delete(pdp->pd_php, pdp);
				pdp->pd_php = NULL;
			}
			if (pdp->pd_fp != NULL) {
				delfpollinfo(pdp->pd_fd);
				pdp->pd_fp = NULL;
			}
		}
	}
}

void
pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp)
{
	int	i;
	int	fd = pdp->pd_fd;

	/*
	 * we come here because of an earlier close() on this cached poll fd.
	 */
	ASSERT(pdp->pd_fp == NULL);
	ASSERT(MUTEX_HELD(&ps->ps_lock));
	pdp->pd_events = 0;
	for (i = 0; i < ps->ps_nsets; i++) {
		xref_t		*refp;
		pollcacheset_t	*pcsp;

		ASSERT(pdp->pd_ref != NULL);
		refp = &pdp->pd_ref[i];
		if (refp->xf_refcnt) {
			ASSERT(refp->xf_position >= 0);
			pcsp = &ps->ps_pcacheset[i];
			if (refp->xf_refcnt == 1) {
				pcsp->pcs_pollfd[refp->xf_position].fd = -1;
				refp->xf_refcnt = 0;
				pdp->pd_count--;
			} else if (refp->xf_refcnt > 1) {
				int	j;

				/*
				 * turn off every appearance in pcs_pollfd list
				 */
				for (j = refp->xf_position;
				    j < pcsp->pcs_nfds; j++) {
					if (pcsp->pcs_pollfd[j].fd == fd) {
						pcsp->pcs_pollfd[j].fd = -1;
						refp->xf_refcnt--;
						pdp->pd_count--;
					}
				}
			}
			ASSERT(refp->xf_refcnt == 0);
			refp->xf_position = POLLPOSINVAL;
		}
	}
	ASSERT(pdp->pd_count == 0);
}

/*
 * Insert poll fd into the pollcache, and add poll registration.
 * This routine is called after getf() and before releasef(). So the vnode
 * can not disappear even if we block here.
 * If there is an error, the polled fd is not cached.
 */
int
pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp,
    ssize_t pos, int which)
{
	pollcache_t	*pcp = ps->ps_pcache;
	polldat_t	*pdp;
	int		error;
	int		fd;
	pollhead_t	*memphp = NULL;
	xref_t		*refp;
	int		newpollfd = 0;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	/*
	 * The poll caching uses the existing VOP_POLL interface. If there
	 * are no polled events, we want the polled device to set its "someone
	 * is sleeping in poll" flag. When the polled events happen
	 * later, the driver will call pollwakeup(). We achieve this by
	 * always passing 0 in the third parameter ("anyyet") when calling
	 * VOP_POLL. This parameter is not looked at by drivers when the
	 * polled events exist. If a driver chooses to ignore this parameter
	 * and call pollwakeup whenever the polled events happen, that will
	 * be OK too.
	 */
	ASSERT(curthread->t_pollcache == NULL);
	error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents,
	    &memphp, NULL);
	if (error) {
		return (error);
	}
	if (pollfdp->revents) {
		(*fdcntp)++;
	}
	/*
	 * polling the underlying device succeeded. Now we can cache it.
	 * A close can't come in here because we have not done a releasef()
	 * yet.
	 */
	fd = pollfdp->fd;
	pdp = pcache_lookup_fd(pcp, fd);
	if (pdp == NULL) {
		ASSERT(ps->ps_nsets > 0);
		pdp = pcache_alloc_fd(ps->ps_nsets);
		newpollfd = 1;
	}
	/*
	 * If this entry was used to cache a poll fd which was closed, and
	 * this entry has not been cleaned, do it now.
	 */
	if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) {
		pcacheset_invalidate(ps, pdp);
		ASSERT(pdp->pd_next == NULL);
	}
	if (pdp->pd_count == 0) {
		pdp->pd_fd = fd;
		pdp->pd_fp = fp;
		addfpollinfo(fd);
		pdp->pd_thread = curthread;
		pdp->pd_pcache = pcp;
		/*
		 * the entry is never used or cleared by removing a cached
		 * pollfd (pcache_delete_fd). So all the fields should be
		 * clear.
		 */
		ASSERT(pdp->pd_next == NULL);
	}

	/*
	 * A polled fd is considered cached. So there should be a fpollinfo
	 * entry on uf_fpollinfo list.
	 */
	ASSERT(infpollinfo(fd));
	/*
	 * If there is an inconsistency, we want to know it here.
	 */
	ASSERT(pdp->pd_fp == fp);

	/*
	 * XXX pd_events is a union of all polled events on this fd, possibly
	 * by different threads. Unless this is a new first poll(), pd_events
	 * never shrinks. If an event is no longer polled by a process, there
	 * is no way to cancel that event. In that case, poll degrades to its
	 * old form -- polling on this fd every time poll() is called. The
	 * assumption is an app always polls the same type of events.
	 */
	pdp->pd_events |= pollfdp->events;

	pdp->pd_count++;
	/*
	 * There is not much special handling for multiple appearances of
	 * the same fd other than xf_position always recording the first
	 * appearance in poll list. If this is called from
	 * pcacheset_cache_list, a VOP_POLL is called on every pollfd entry;
	 * therefore each revents and fdcnt should be set correctly. If this
	 * is called from pcacheset_resolve, we don't care about fdcnt here.
	 * Pollreadmap will pick up the right count and handle revents field
	 * of each pollfd entry.
	 */
	ASSERT(pdp->pd_ref != NULL);
	refp = &pdp->pd_ref[which];
	if (refp->xf_refcnt == 0) {
		refp->xf_position = pos;
	} else {
		/*
		 * xf_position records the fd's first appearance in poll list
		 */
		if (pos < refp->xf_position) {
			refp->xf_position = pos;
		}
	}
	ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd);
	refp->xf_refcnt++;
	if (fd >= pcp->pc_mapsize) {
		pcache_grow_map(pcp, fd);
	}
	if (fd > pcp->pc_mapend) {
		pcp->pc_mapend = fd;
	}
	if (newpollfd != 0) {
		pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds);
	}
	if (memphp) {
		if (pdp->pd_php == NULL) {
			pollhead_insert(memphp, pdp);
			pdp->pd_php = memphp;
		} else {
			if (memphp != pdp->pd_php) {
				/*
				 * layered devices (e.g. console driver)
				 * may change the vnode and thus the pollhead
				 * pointer out from underneath us.
				 */
				pollhead_delete(pdp->pd_php, pdp);
				pollhead_insert(memphp, pdp);
				pdp->pd_php = memphp;
			}
		}
	}
	/*
	 * Since there is a considerable window between VOP_POLL and when
	 * we actually put the polldat struct on the pollhead list, we could
	 * miss a pollwakeup. In the case of polling additional events, we
	 * don't update the events until after VOP_POLL. So we could miss
	 * pollwakeup there too. So we always set the bit here just to be
	 * safe. The real performance gain is in subsequent pcache_poll.
	 */
	mutex_enter(&pcp->pc_lock);
	BT_SET(pcp->pc_bitmap, fd);
	mutex_exit(&pcp->pc_lock);
	return (0);
}

/*
 * The entry is not really deleted. The fields are cleared so that the
 * entry is no longer useful, but it will remain in the hash table for reuse
 * later. It will be freed when the polling lwp exits.
 */
int
pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent)
{
	pollcache_t	*pcp = ps->ps_pcache;
	polldat_t	*pdp;
	xref_t		*refp;

	ASSERT(fd < pcp->pc_mapsize);
	ASSERT(MUTEX_HELD(&ps->ps_lock));

	pdp = pcache_lookup_fd(pcp, fd);
	ASSERT(pdp != NULL);
	ASSERT(pdp->pd_count > 0);
	ASSERT(pdp->pd_ref != NULL);
	refp = &pdp->pd_ref[which];
	if (pdp->pd_count == 1) {
		pdp->pd_events = 0;
		refp->xf_position = POLLPOSINVAL;
		ASSERT(refp->xf_refcnt == 1);
		refp->xf_refcnt = 0;
		if (pdp->pd_php) {
			/*
			 * It is possible for a wakeup thread to get ahead
			 * of the following pollhead_delete and set the bit in
			 * bitmap. It is OK because the bit will be cleared
			 * here anyway.
			 */
			pollhead_delete(pdp->pd_php, pdp);
			pdp->pd_php = NULL;
		}
		pdp->pd_count = 0;
		if (pdp->pd_fp != NULL) {
			pdp->pd_fp = NULL;
			delfpollinfo(fd);
		}
		mutex_enter(&pcp->pc_lock);
		BT_CLEAR(pcp->pc_bitmap, fd);
		mutex_exit(&pcp->pc_lock);
		return (0);
	}
	if ((cevent & POLLCLOSED) == POLLCLOSED) {
		/*
		 * fd cached here has been closed. This is the first
		 * pcache_delete_fd called after the close. Clean up the
		 * entire entry.
		 */
		pcacheset_invalidate(ps, pdp);
		ASSERT(pdp->pd_php == NULL);
		mutex_enter(&pcp->pc_lock);
		BT_CLEAR(pcp->pc_bitmap, fd);
		mutex_exit(&pcp->pc_lock);
		return (0);
	}
#ifdef DEBUG
	if (getf(fd) != NULL) {
		ASSERT(infpollinfo(fd));
		releasef(fd);
	}
#endif	/* DEBUG */
	pdp->pd_count--;
	ASSERT(refp->xf_refcnt > 0);
	if (--refp->xf_refcnt == 0) {
		refp->xf_position = POLLPOSINVAL;
	} else {
		ASSERT(pos >= refp->xf_position);
		if (pos == refp->xf_position) {
			/*
			 * The xref position is no longer valid.
			 * Reset it to a special value and let the
			 * caller know it needs to call pcache_update_xref()
			 * with a new xf_position value.
			 */
			refp->xf_position = POLLPOSTRANS;
			return (1);
		}
	}
	return (0);
}

void
pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which)
{
	polldat_t *pdp;

	pdp = pcache_lookup_fd(pcp, fd);
	ASSERT(pdp != NULL);
	ASSERT(pdp->pd_ref != NULL);
	pdp->pd_ref[which].xf_position = pos;
}

#ifdef DEBUG
/*
 * For each polled fd, it's either in the bitmap or cached in
 * the pcache hash table. If this routine returns 0, something is wrong.
 */
static int
pollchecksanity(pollstate_t *ps, nfds_t nfds)
{
	int		i;
	int		fd;
	pollcache_t	*pcp = ps->ps_pcache;
	polldat_t	*pdp;
	pollfd_t	*pollfdp = ps->ps_pollfd;
	file_t		*fp;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	for (i = 0; i < nfds; i++) {
		fd = pollfdp[i].fd;
		if (fd < 0) {
			ASSERT(pollfdp[i].revents == 0);
			continue;
		}
		if (pollfdp[i].revents == POLLNVAL)
			continue;
		if ((fp = getf(fd)) == NULL)
			continue;
		pdp = pcache_lookup_fd(pcp, fd);
		ASSERT(pdp != NULL);
		ASSERT(infpollinfo(fd));
		ASSERT(pdp->pd_fp == fp);
		releasef(fd);
		if (BT_TEST(pcp->pc_bitmap, fd))
			continue;
		if (pdp->pd_php == NULL)
			return (0);
	}
	return (1);
}
#endif	/* DEBUG */

/*
 * resolve the difference between the current poll list and a cached one.
 */
int
pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which)
{
	int		i;
	pollcache_t	*pcp = ps->ps_pcache;
	pollfd_t	*newlist = NULL;
	pollfd_t	*current = ps->ps_pollfd;
	pollfd_t	*cached;
	pollcacheset_t	*pcsp;
	int		common;
	int		count = 0;
	int		offset;
	int		remain;
	int		fd;
	file_t		*fp;
	int		fdcnt = 0;
	int		cnt = 0;
	nfds_t		old_nfds;
	int		error = 0;
	int		mismatch = 0;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
#ifdef DEBUG
	checkpolldat(ps);
#endif
	pcsp = &ps->ps_pcacheset[which];
	old_nfds = pcsp->pcs_nfds;
	common = (nfds > old_nfds) ? old_nfds : nfds;
	if (nfds != old_nfds) {
		/*
		 * the length of poll list has changed. allocate a new
		 * pollfd list.
		 */
		newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
		bcopy(current, newlist, sizeof (pollfd_t) * nfds);
	}
	/*
	 * Compare the overlapping part of the current fd list with the
	 * cached one. Whenever a difference is found, resolve it.
	 * The comparison is done on the current poll list and the
	 * cached list. But we may be setting up the newlist to be the
	 * cached list for next poll.
	 */
	cached = pcsp->pcs_pollfd;
	remain = common;

	while (count < common) {
		int	tmpfd;
		pollfd_t *np;

		np = (newlist != NULL) ? &newlist[count] : NULL;
		offset = pcacheset_cmp(&current[count], &cached[count], np,
		    remain);
		/*
		 * Collect stats. If the lists match completely on the first
		 * pass, it's a hit. Otherwise, it's a partial hit or a miss.
		 */
		if ((count == 0) && (offset == common)) {
			pollstats.pollcachehit.value.ui64++;
		} else {
			mismatch++;
		}
		count += offset;
		if (offset < remain) {
			ASSERT(count < common);
			ASSERT((current[count].fd != cached[count].fd) ||
			    (current[count].events != cached[count].events));
			/*
			 * Filter out invalid events.
			 */
			if (current[count].events & ~VALID_POLL_EVENTS) {
				if (newlist != NULL) {
					newlist[count].events =
					    current[count].events &=
					    VALID_POLL_EVENTS;
				} else {
					current[count].events &=
					    VALID_POLL_EVENTS;
				}
			}
			/*
			 * when resolving a difference, we always remove the
			 * fd from cache before inserting one into cache.
			 */
			if (cached[count].fd >= 0) {
				tmpfd = cached[count].fd;
				if (pcache_delete_fd(ps, tmpfd, count, which,
				    (uint_t)cached[count].events)) {
					/*
					 * This should be rare but needed for
					 * correctness.
1603 * 1604 * The first appearance in cached list 1605 * is being "turned off". The same fd 1606 * appear more than once in the cached 1607 * poll list. Find the next one on the 1608 * list and update the cached 1609 * xf_position field. 1610 */ 1611 for (i = count + 1; i < old_nfds; i++) { 1612 if (cached[i].fd == tmpfd) { 1613 pcache_update_xref(pcp, 1614 tmpfd, (ssize_t)i, 1615 which); 1616 break; 1617 } 1618 } 1619 ASSERT(i <= old_nfds); 1620 } 1621 /* 1622 * In case a new cache list is allocated, 1623 * need to keep both cache lists in sync 1624 * b/c the new one can be freed if we have 1625 * an error later. 1626 */ 1627 cached[count].fd = -1; 1628 if (newlist != NULL) { 1629 newlist[count].fd = -1; 1630 } 1631 } 1632 if ((tmpfd = current[count].fd) >= 0) { 1633 /* 1634 * add to the cached fd tbl and bitmap. 1635 */ 1636 if ((fp = getf(tmpfd)) == NULL) { 1637 current[count].revents = POLLNVAL; 1638 if (newlist != NULL) { 1639 newlist[count].fd = -1; 1640 } 1641 cached[count].fd = -1; 1642 fdcnt++; 1643 } else { 1644 /* 1645 * Here we don't care about the 1646 * fdcnt. We will examine the bitmap 1647 * later and pick up the correct 1648 * fdcnt there. So we never bother 1649 * to check value of 'cnt'. 1650 */ 1651 error = pcache_insert(ps, fp, 1652 ¤t[count], &cnt, 1653 (ssize_t)count, which); 1654 /* 1655 * if no error, we want to do releasef 1656 * after we updated cache poll list 1657 * entry so that close() won't race 1658 * us. 1659 */ 1660 if (error) { 1661 /* 1662 * If we encountered an error, 1663 * we have invalidated an 1664 * entry in cached poll list 1665 * (in pcache_delete_fd() above) 1666 * but failed to add one here. 1667 * This is OK b/c what's in the 1668 * cached list is consistent 1669 * with content of cache. 1670 * It will not have any ill 1671 * effect on next poll(). 1672 */ 1673 releasef(tmpfd); 1674 if (newlist != NULL) { 1675 kmem_free(newlist, 1676 nfds * 1677 sizeof (pollfd_t)); 1678 } 1679 return (error); 1680 } 1681 /* 1682 * If we have allocated a new(temp) 1683 * cache list, we need to keep both 1684 * in sync b/c the new one can be freed 1685 * if we have an error later. 1686 */ 1687 if (newlist != NULL) { 1688 newlist[count].fd = 1689 current[count].fd; 1690 newlist[count].events = 1691 current[count].events; 1692 } 1693 cached[count].fd = current[count].fd; 1694 cached[count].events = 1695 current[count].events; 1696 releasef(tmpfd); 1697 } 1698 } else { 1699 current[count].revents = 0; 1700 } 1701 count++; 1702 remain = common - count; 1703 } 1704 } 1705 if (mismatch != 0) { 1706 if (mismatch == common) { 1707 pollstats.pollcachemiss.value.ui64++; 1708 } else { 1709 pollstats.pollcachephit.value.ui64++; 1710 } 1711 } 1712 /* 1713 * take care of the non overlapping part of a list 1714 */ 1715 if (nfds > old_nfds) { 1716 ASSERT(newlist != NULL); 1717 for (i = old_nfds; i < nfds; i++) { 1718 /* filter out invalid events */ 1719 if (current[i].events & ~VALID_POLL_EVENTS) { 1720 newlist[i].events = current[i].events = 1721 current[i].events & VALID_POLL_EVENTS; 1722 } 1723 if ((fd = current[i].fd) < 0) { 1724 current[i].revents = 0; 1725 continue; 1726 } 1727 /* 1728 * add to the cached fd tbl and bitmap. 1729 */ 1730 if ((fp = getf(fd)) == NULL) { 1731 current[i].revents = POLLNVAL; 1732 newlist[i].fd = -1; 1733 fdcnt++; 1734 continue; 1735 } 1736 /* 1737 * Here we don't care about the 1738 * fdcnt. We will examine the bitmap 1739 * later and pick up the correct 1740 * fdcnt there. So we never bother to 1741 * check 'cnt'. 
1742 */ 1743 error = pcache_insert(ps, fp, ¤t[i], &cnt, 1744 (ssize_t)i, which); 1745 releasef(fd); 1746 if (error) { 1747 /* 1748 * Here we are half way through adding newly 1749 * polled fd. Undo enough to keep the cache 1750 * list consistent with the cache content. 1751 */ 1752 pcacheset_remove_list(ps, current, old_nfds, 1753 i, which, 0); 1754 kmem_free(newlist, nfds * sizeof (pollfd_t)); 1755 return (error); 1756 } 1757 } 1758 } 1759 if (old_nfds > nfds) { 1760 /* 1761 * remove the fd's which are no longer polled. 1762 */ 1763 pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds, 1764 which, 1); 1765 } 1766 /* 1767 * set difference resolved. update nfds and cachedlist 1768 * in pollstate struct. 1769 */ 1770 if (newlist != NULL) { 1771 kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t)); 1772 /* 1773 * By now, the pollfd.revents field should 1774 * all be zeroed. 1775 */ 1776 pcsp->pcs_pollfd = newlist; 1777 pcsp->pcs_nfds = nfds; 1778 } 1779 ASSERT(*fdcntp == 0); 1780 *fdcntp = fdcnt; 1781 /* 1782 * By now for every fd in pollfdp, one of the following should be 1783 * true. Otherwise we will miss a polled event. 1784 * 1785 * 1. the bit corresponding to the fd in bitmap is set. So VOP_POLL 1786 * will be called on this fd in next poll. 1787 * 2. the fd is cached in the pcache (i.e. pd_php is set). So 1788 * pollnotify will happen. 1789 */ 1790 ASSERT(pollchecksanity(ps, nfds)); 1791 /* 1792 * make sure cross reference between cached poll lists and cached 1793 * poll fds are correct. 1794 */ 1795 ASSERT(pollcheckxref(ps, which)); 1796 /* 1797 * ensure each polldat in pollcache reference a polled fd in 1798 * pollcacheset. 1799 */ 1800 #ifdef DEBUG 1801 checkpolldat(ps); 1802 #endif 1803 return (0); 1804 } 1805 1806 #ifdef DEBUG 1807 static int 1808 pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds) 1809 { 1810 int i; 1811 int reventcnt = 0; 1812 1813 for (i = 0; i < nfds; i++) { 1814 if (pollfdp[i].fd < 0) { 1815 ASSERT(pollfdp[i].revents == 0); 1816 continue; 1817 } 1818 if (pollfdp[i].revents) { 1819 reventcnt++; 1820 } 1821 if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) { 1822 ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd)); 1823 } 1824 } 1825 return (reventcnt); 1826 } 1827 #endif /* DEBUG */ 1828 1829 /* 1830 * read the bitmap and poll on fds corresponding to the '1' bits. The ps_lock 1831 * is held upon entry. 1832 */ 1833 int 1834 pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp, 1835 int which) 1836 { 1837 int i; 1838 pollcache_t *pcp; 1839 int fd; 1840 int begin, end, done; 1841 pollhead_t *php; 1842 int fdcnt; 1843 int error = 0; 1844 file_t *fp; 1845 polldat_t *pdp; 1846 xref_t *refp; 1847 int entry; 1848 1849 pcp = ps->ps_pcache; 1850 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1851 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 1852 retry: 1853 done = 0; 1854 begin = 0; 1855 fdcnt = 0; 1856 end = pcp->pc_mapend; 1857 while ((fdcnt < nfds) && !done) { 1858 php = NULL; 1859 /* 1860 * only poll fds which may have events 1861 */ 1862 fd = bt_getlowbit(pcp->pc_bitmap, begin, end); 1863 ASSERT(fd <= end); 1864 if (fd >= 0) { 1865 ASSERT(pollcheckrevents(ps, begin, fd, which)); 1866 /* 1867 * adjust map pointers for next round 1868 */ 1869 if (fd == end) { 1870 done = 1; 1871 } else { 1872 begin = fd + 1; 1873 } 1874 /* 1875 * A bitmap caches poll state information of 1876 * multiple poll lists. Call VOP_POLL only if 1877 * the bit corresponds to an fd in this poll 1878 * list. 
			pdp = pcache_lookup_fd(pcp, fd);
			ASSERT(pdp != NULL);
			ASSERT(pdp->pd_ref != NULL);
			refp = &pdp->pd_ref[which];
			if (refp->xf_refcnt == 0)
				continue;
			entry = refp->xf_position;
			ASSERT((entry >= 0) && (entry < nfds));
			ASSERT(pollfdp[entry].fd == fd);
			/*
			 * Being in this routine implies that we have
			 * successfully polled this fd in the past.
			 * Check to see if this fd was closed while we were
			 * blocked in poll. This ensures that we don't
			 * miss a close on the fd in the case this fd is
			 * reused.
			 */
			if (pdp->pd_fp == NULL) {
				ASSERT(pdp->pd_count > 0);
				pollfdp[entry].revents = POLLNVAL;
				fdcnt++;
				if (refp->xf_refcnt > 1) {
					/*
					 * this fd appeared multiple times
					 * in the poll list. Find all of them.
					 */
					for (i = entry + 1; i < nfds; i++) {
						if (pollfdp[i].fd == fd) {
							pollfdp[i].revents =
							    POLLNVAL;
							fdcnt++;
						}
					}
				}
				pcacheset_invalidate(ps, pdp);
				continue;
			}
			/*
			 * We can be here polling a device that is being
			 * closed (i.e. the file pointer is set to NULL,
			 * but pollcacheclean has not happened yet).
			 */
			if ((fp = getf(fd)) == NULL) {
				pollfdp[entry].revents = POLLNVAL;
				fdcnt++;
				if (refp->xf_refcnt > 1) {
					/*
					 * this fd appeared multiple times
					 * in the poll list. Find all of them.
					 */
					for (i = entry + 1; i < nfds; i++) {
						if (pollfdp[i].fd == fd) {
							pollfdp[i].revents =
							    POLLNVAL;
							fdcnt++;
						}
					}
				}
				continue;
			}
			ASSERT(pdp->pd_fp == fp);
			ASSERT(infpollinfo(fd));
			/*
			 * Since we no longer hold poll head lock across
			 * VOP_POLL, pollunlock logic can be simplified.
			 */
			ASSERT(pdp->pd_php == NULL ||
			    MUTEX_NOT_HELD(PHLOCK(pdp->pd_php)));
			/*
			 * underlying file systems may set a "pollpending"
			 * flag when they see that the poll may block.
			 * Pollwakeup() is called by the wakeup thread if
			 * pollpending is set. Pass a 0 fdcnt so that the
			 * underlying file system will set the "pollpending"
			 * flag when there are no polled events.
			 *
			 * Use pollfdp[].events for actual polling because
			 * the pd_events is union of all cached poll events
			 * on this fd. The events parameter also affects
			 * how the polled device sets the "poll pending"
			 * flag.
			 */
			ASSERT(curthread->t_pollcache == NULL);
			error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0,
			    &pollfdp[entry].revents, &php, NULL);
			/*
			 * releasef() only after we are completely done with
			 * this cached poll entry, to prevent close() from
			 * coming in and clearing it.
			 */
			if (error) {
				releasef(fd);
				break;
			}
			/*
			 * layered devices (e.g. console driver)
			 * may change the vnode and thus the pollhead
			 * pointer out from underneath us.
			 */
			if (php != NULL && pdp->pd_php != NULL &&
			    php != pdp->pd_php) {
				releasef(fd);
				pollhead_delete(pdp->pd_php, pdp);
				pdp->pd_php = php;
				pollhead_insert(php, pdp);
				/*
				 * We could have missed a wakeup on the new
				 * target device. Make sure the new target
				 * gets polled once.
				 */
				BT_SET(pcp->pc_bitmap, fd);
				goto retry;
			}

			if (pollfdp[entry].revents) {
				ASSERT(refp->xf_refcnt >= 1);
				fdcnt++;
				if (refp->xf_refcnt > 1) {
					/*
					 * this fd appeared multiple times
					 * in the poll list. This is rare but
					 * we have to look at all of them for
					 * correctness.
					 */
					 * in the poll list. This is rare but
					 * we have to look at all of them for
					 * correctness.
					 */
					error = plist_chkdupfd(fp, pdp, ps,
					    pollfdp, entry, &fdcnt);
					if (error > 0) {
						releasef(fd);
						break;
					}
					if (error < 0) {
						goto retry;
					}
				}
				releasef(fd);
			} else {
				/*
				 * VOP_POLL didn't return any revents. We can
				 * clear the bit in the bitmap only if we have
				 * the pollhead ptr cached and no other cached
				 * entry is polling different events on this
				 * fd. VOP_POLL may have dropped the ps_lock.
				 * Make sure pollwakeup has not happened
				 * before clearing the bit.
				 */
				if ((pdp->pd_php != NULL) &&
				    (pollfdp[entry].events == pdp->pd_events) &&
				    ((pcp->pc_flag & T_POLLWAKE) == 0)) {
					BT_CLEAR(pcp->pc_bitmap, fd);
				}
				/*
				 * if the fd can be cached now but couldn't be
				 * before, do it now.
				 */
				if ((pdp->pd_php == NULL) && (php != NULL)) {
					pdp->pd_php = php;
					pollhead_insert(php, pdp);
					/*
					 * We are inserting a polldat struct
					 * for the first time. We may have
					 * missed a wakeup on this device.
					 * Re-poll once. This should be a rare
					 * event.
					 */
					releasef(fd);
					goto retry;
				}
				if (refp->xf_refcnt > 1) {
					/*
					 * this fd appeared multiple times
					 * in the poll list. This is rare but
					 * we have to look at all of them for
					 * correctness.
					 */
					error = plist_chkdupfd(fp, pdp, ps,
					    pollfdp, entry, &fdcnt);
					if (error > 0) {
						releasef(fd);
						break;
					}
					if (error < 0) {
						goto retry;
					}
				}
				releasef(fd);
			}
		} else {
			done = 1;
			ASSERT(pollcheckrevents(ps, begin, end + 1, which));
		}
	}
	if (!error) {
		ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds));
		*fdcntp += fdcnt;
	}
	return (error);
}
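/*
 * Illustrative sketch (not part of the original file): the scan loop in
 * pcache_poll() above repeatedly asks bt_getlowbit() for the lowest set
 * bit in [begin, end] and treats each hit as a candidate fd. A simple
 * bit-at-a-time rendition of that primitive; the real bt_getlowbit() in
 * the bitmap code works a word at a time:
 */
#if 0	/* example only; compiled out */
static int
lowbit_linear(ulong_t *map, int begin, int end)
{
	int i;

	for (i = begin; i <= end; i++) {
		if (map[i / BT_NBIPUL] & (1UL << (i % BT_NBIPUL)))
			return (i);	/* first fd that may have events */
	}
	return (-1);	/* no candidate fds left in the range */
}
#endif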
/*
 * Going through the poll list without much locking. Poll all fds and
 * cache all valid fds in the pollcache.
 */
int
pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which)
{
	pollfd_t *pollfdp = ps->ps_pollfd;
	pollcacheset_t *pcacheset = ps->ps_pcacheset;
	pollfd_t *newfdlist;
	int i;
	int fd;
	file_t *fp;
	int error = 0;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	ASSERT(which < ps->ps_nsets);
	ASSERT(pcacheset != NULL);
	ASSERT(pcacheset[which].pcs_pollfd == NULL);
	newfdlist = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP);
	/*
	 * cache the new poll list in the pollcacheset.
	 */
	bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds);

	pcacheset[which].pcs_pollfd = newfdlist;
	pcacheset[which].pcs_nfds = ps->ps_nfds;
	pcacheset[which].pcs_usradr = (uintptr_t)fds;

	/*
	 * We have saved a copy of the current poll fd list in one
	 * pollcacheset. The 'revents' field of the new list is not yet set
	 * to 0. Looping through the new list just to do that would be
	 * expensive, so we do it while polling the list.
	 */
	for (i = 0; i < ps->ps_nfds; i++) {
		fd = pollfdp[i].fd;
		/*
		 * We also filter out the illegal poll events in the event
		 * field for the cached poll list/set.
		 */
		if (pollfdp[i].events & ~VALID_POLL_EVENTS) {
			newfdlist[i].events = pollfdp[i].events =
			    pollfdp[i].events & VALID_POLL_EVENTS;
		}
		if (fd < 0) {
			pollfdp[i].revents = 0;
			continue;
		}
		if ((fp = getf(fd)) == NULL) {
			pollfdp[i].revents = POLLNVAL;
			/*
			 * invalidate this cache entry in the cached poll list
			 */
			newfdlist[i].fd = -1;
			(*fdcntp)++;
			continue;
		}
		/*
		 * cache this fd.
		 */
		error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i,
		    which);
		releasef(fd);
		if (error) {
			/*
			 * Here we are halfway through caching a new
			 * poll list. Undo everything.
			 */
			pcacheset_remove_list(ps, pollfdp, 0, i, which, 0);
			kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t));
			pcacheset[which].pcs_pollfd = NULL;
			pcacheset[which].pcs_usradr = NULL;
			break;
		}
	}
	return (error);
}
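/*
 * Illustrative sketch (not part of the original file): the event filtering
 * done by pcacheset_cache_list() above, in isolation. Bits outside
 * VALID_POLL_EVENTS are silently dropped from both the caller's list and
 * the cached copy:
 */
#if 0	/* example only; compiled out */
static short
sanitize_events(short events)
{
	/* e.g. (POLLIN | 0x1000) becomes plain POLLIN */
	return (events & VALID_POLL_EVENTS);
}
#endif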
/*
 * called by pollcacheclean() to set the fp to NULL. It also sets the polled
 * events in the pcacheset entries to a special event, POLLCLOSED. Do a
 * pollwakeup to wake any sleeping poller, then remove the polldat from the
 * driver. The routine is called with the ps_lock held.
 */
void
pcache_clean_entry(pollstate_t *ps, int fd)
{
	pollcache_t *pcp;
	polldat_t *pdp;
	int i;

	ASSERT(ps != NULL);
	ASSERT(MUTEX_HELD(&ps->ps_lock));
	pcp = ps->ps_pcache;
	ASSERT(pcp);
	pdp = pcache_lookup_fd(pcp, fd);
	ASSERT(pdp != NULL);
	/*
	 * the corresponding fpollinfo in fi_list has been removed by
	 * a close on this fd. Reset the cached fp ptr here.
	 */
	pdp->pd_fp = NULL;
	/*
	 * XXX - This routine also touches data in the pcacheset struct.
	 *
	 * set the event in the cached poll lists to POLLCLOSED. This
	 * invalidates the cached poll fd entry in that poll list, which will
	 * force a removal of this cached entry in the next poll(). The
	 * cleanup is done at removal time.
	 */
	ASSERT(pdp->pd_ref != NULL);
	for (i = 0; i < ps->ps_nsets; i++) {
		xref_t *refp;
		pollcacheset_t *pcsp;

		refp = &pdp->pd_ref[i];
		if (refp->xf_refcnt) {
			ASSERT(refp->xf_position >= 0);
			pcsp = &ps->ps_pcacheset[i];
			if (refp->xf_refcnt == 1) {
				pcsp->pcs_pollfd[refp->xf_position].events =
				    (short)POLLCLOSED;
			}
			if (refp->xf_refcnt > 1) {
				int j;
				/*
				 * mark every matching entry in pcs_pollfd
				 */
				for (j = refp->xf_position;
				    j < pcsp->pcs_nfds; j++) {
					if (pcsp->pcs_pollfd[j].fd == fd) {
						pcsp->pcs_pollfd[j].events =
						    (short)POLLCLOSED;
					}
				}
			}
		}
	}
	if (pdp->pd_php) {
		pollwakeup(pdp->pd_php, POLLHUP);
		pollhead_delete(pdp->pd_php, pdp);
		pdp->pd_php = NULL;
	}
}

/*
 * This is the first time this thread has ever polled, so we have to
 * create its pollstate structure. This will persist for the life of the
 * thread, until it calls pollcleanup().
 */
pollstate_t *
pollstate_create(void)
{
	pollstate_t *ps;

	ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP);
	ps->ps_nsets = POLLFDSETS;
	ps->ps_pcacheset = pcacheset_create(ps->ps_nsets);
	return (ps);
}

void
pollstate_destroy(pollstate_t *ps)
{
	if (ps->ps_pollfd != NULL) {
		kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t));
		ps->ps_pollfd = NULL;
	}
	if (ps->ps_pcache != NULL) {
		pcache_destroy(ps->ps_pcache);
		ps->ps_pcache = NULL;
	}
	pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets);
	ps->ps_pcacheset = NULL;
	if (ps->ps_dpbuf != NULL) {
		kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize * sizeof (pollfd_t));
		ps->ps_dpbuf = NULL;
	}
	mutex_destroy(&ps->ps_lock);
	kmem_free(ps, sizeof (pollstate_t));
}

/*
 * We are holding the appropriate uf_lock entering this routine.
 * Bump up the pc_busy count to prevent the thread from exiting.
 */
void
pollblockexit(fpollinfo_t *fpip)
{
	for (; fpip; fpip = fpip->fp_next) {
		pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache;

		mutex_enter(&pcp->pc_no_exit);
		pcp->pc_busy++;	/* prevents exit()'s */
		mutex_exit(&pcp->pc_no_exit);
	}
}

/*
 * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to
 * mark the pcacheset events field POLLCLOSED to force the next poll() to
 * remove this cache entry. We can't clean up the polldat entry here because
 * an lwp blocked in poll() needs the info to return. Wake up anyone blocked
 * in poll and let the exiting lwp go. No lock is held upon entry, so it's
 * OK for pcache_clean_entry to call pollwakeup().
 */
void
pollcacheclean(fpollinfo_t *fip, int fd)
{
	struct fpollinfo *fpip, *fpip2;

	fpip = fip;
	while (fpip) {
		pollstate_t *ps = fpip->fp_thread->t_pollstate;
		pollcache_t *pcp = ps->ps_pcache;

		mutex_enter(&ps->ps_lock);
		pcache_clean_entry(ps, fd);
		mutex_exit(&ps->ps_lock);
		mutex_enter(&pcp->pc_no_exit);
		pcp->pc_busy--;
		if (pcp->pc_busy == 0) {
			/*
			 * Wakeup the thread waiting in
			 * thread_exit().
			 */
			cv_signal(&pcp->pc_busy_cv);
		}
		mutex_exit(&pcp->pc_no_exit);

		fpip2 = fpip;
		fpip = fpip->fp_next;
		kmem_free(fpip2, sizeof (fpollinfo_t));
	}
}
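/*
 * Illustrative sketch (not part of the original file): the
 * pc_no_exit/pc_busy/pc_busy_cv trio used by pollblockexit() and
 * pollcacheclean() above is a plain counted gate: closers bump the count
 * to pin the poller's state, and the exiting thread waits for the count
 * to drain. A userland pthread rendition (all names invented):
 */
#if 0	/* example only; compiled out */
#include <pthread.h>

typedef struct busy_gate {
	pthread_mutex_t	bg_lock;	/* cf. pc_no_exit */
	pthread_cond_t	bg_cv;		/* cf. pc_busy_cv */
	int		bg_busy;	/* cf. pc_busy */
} busy_gate_t;

static void
gate_enter(busy_gate_t *g)		/* cf. pollblockexit() */
{
	pthread_mutex_lock(&g->bg_lock);
	g->bg_busy++;
	pthread_mutex_unlock(&g->bg_lock);
}

static void
gate_exit(busy_gate_t *g)		/* cf. pollcacheclean() */
{
	pthread_mutex_lock(&g->bg_lock);
	if (--g->bg_busy == 0)
		pthread_cond_signal(&g->bg_cv);
	pthread_mutex_unlock(&g->bg_lock);
}

static void
gate_drain(busy_gate_t *g)		/* cf. the wait in thread_exit() */
{
	pthread_mutex_lock(&g->bg_lock);
	while (g->bg_busy != 0)
		pthread_cond_wait(&g->bg_cv, &g->bg_lock);
	pthread_mutex_unlock(&g->bg_lock);
}
#endif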
/*
 * One of the cache lines' counters is wrapping around. Reset all cache line
 * counters to zero except one. This is simplistic, but probably works
 * effectively.
 */
void
pcacheset_reset_count(pollstate_t *ps, int index)
{
	int i;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	for (i = 0; i < ps->ps_nsets; i++) {
		if (ps->ps_pcacheset[i].pcs_pollfd != NULL) {
			ps->ps_pcacheset[i].pcs_count = 0;
		}
	}
	ps->ps_pcacheset[index].pcs_count = 1;
}

/*
 * this routine implements the poll cache list replacement policy.
 * It currently chooses the "least used" entry.
 */
int
pcacheset_replace(pollstate_t *ps)
{
	int i;
	int index = 0;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	for (i = 1; i < ps->ps_nsets; i++) {
		if (ps->ps_pcacheset[index].pcs_count >
		    ps->ps_pcacheset[i].pcs_count) {
			index = i;
		}
	}
	ps->ps_pcacheset[index].pcs_count = 0;
	return (index);
}

/*
 * this routine is called by strclose to remove any remaining polldat
 * structs on the pollhead list of the device being closed. There are two
 * reasons why the polldat structures may still remain on the pollhead list:
 *
 * (1) A layered device (e.g. the console driver).
 * In this case, the existence of a polldat implies that the thread putting
 * the polldat on this list has not exited yet. Before the thread exits, it
 * will have to hold this pollhead lock to remove the polldat. So holding
 * the pollhead lock here effectively prevents the thread which put the
 * polldat on this list from exiting.
 *
 * (2) /dev/poll.
 * When a polled fd is cached in /dev/poll, its polldat will remain on the
 * pollhead list if the process has not done a POLLREMOVE before closing the
 * polled fd. We just unlink it here.
 */
void
pollhead_clean(pollhead_t *php)
{
	polldat_t *pdp;

	/*
	 * In case (1), while we must prevent the thread in question from
	 * exiting, we must also obey the proper locking order, i.e.
	 * (ps_lock -> phlock).
	 */
	PH_ENTER(php);
	while (php->ph_list != NULL) {
		pollstate_t *ps;
		pollcache_t *pcp;

		pdp = php->ph_list;
		ASSERT(pdp->pd_php == php);
		if (pdp->pd_thread == NULL) {
			/*
			 * This is case (2). Since the ph_lock is sufficient
			 * to synchronize this lwp with any other /dev/poll
			 * lwp, just unlink the polldat.
			 */
			php->ph_list = pdp->pd_next;
			pdp->pd_php = NULL;
			pdp->pd_next = NULL;
			continue;
		}
		ps = pdp->pd_thread->t_pollstate;
		ASSERT(ps != NULL);
		pcp = pdp->pd_pcache;
		ASSERT(pcp != NULL);
		mutex_enter(&pcp->pc_no_exit);
		pcp->pc_busy++;	/* prevents exit()'s */
		mutex_exit(&pcp->pc_no_exit);
		/*
		 * Now get the locks in the proper order to avoid deadlock.
		 */
		PH_EXIT(php);
		mutex_enter(&ps->ps_lock);
		/*
		 * while we dropped the pollhead lock, the element could
		 * already have been taken off the list.
		 */
		PH_ENTER(php);
		if (pdp->pd_php == php) {
			ASSERT(pdp == php->ph_list);
			php->ph_list = pdp->pd_next;
			pdp->pd_php = NULL;
			pdp->pd_next = NULL;
		}
		PH_EXIT(php);
		mutex_exit(&ps->ps_lock);
		mutex_enter(&pcp->pc_no_exit);
		pcp->pc_busy--;
		if (pcp->pc_busy == 0) {
			/*
			 * Wakeup the thread waiting in
			 * thread_exit().
			 */
			cv_signal(&pcp->pc_busy_cv);
		}
		mutex_exit(&pcp->pc_no_exit);
		PH_ENTER(php);
	}
	PH_EXIT(php);
}
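/*
 * Illustrative sketch (not part of the original file): pollhead_clean()
 * above is an instance of a generic pattern for taking locks ordered
 * "A before B" when B is already held: pin the object so it cannot be
 * freed, drop B, take A then B, and revalidate before acting. A userland
 * pthread rendition, where pin()/unpin()/still_on_list()/unlink_obj() are
 * invented stand-ins for pc_busy++, pc_busy--/cv_signal, the
 * pd_php == php recheck, and the list unlink:
 */
#if 0	/* example only; compiled out */
#include <pthread.h>

extern pthread_mutex_t A, B;		/* lock order: A before B */
extern void pin(void *), unpin(void *), unlink_obj(void *);
extern int still_on_list(void *);

static void
ordered_unlink(void *obj)
{
	/* entered holding B, as pollhead_clean() enters holding PHLOCK */
	pin(obj);			/* cf. pc_busy++ under pc_no_exit */
	pthread_mutex_unlock(&B);
	pthread_mutex_lock(&A);		/* cf. mutex_enter(&ps->ps_lock) */
	pthread_mutex_lock(&B);		/* cf. PH_ENTER(php) */
	if (still_on_list(obj))		/* state may have changed meanwhile */
		unlink_obj(obj);
	pthread_mutex_unlock(&B);
	pthread_mutex_unlock(&A);
	unpin(obj);			/* cf. pc_busy-- and cv_signal */
}
#endif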
/*
 * pcacheset_remove_list() is called to clean up a partially cached
 * 'current' list, or to remove a partial list which is no longer cached.
 * A flag value of 1 indicates the second case.
 */
void
pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end,
    int cacheindex, int flag)
{
	int i;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	for (i = start; i < end; i++) {
		if ((pollfdp[i].fd >= 0) &&
		    (flag || !(pollfdp[i].revents & POLLNVAL))) {
			if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex,
			    (uint_t)pollfdp[i].events)) {
				int j;
				int fd = pollfdp[i].fd;

				for (j = i + 1; j < end; j++) {
					if (pollfdp[j].fd == fd) {
						pcache_update_xref(
						    ps->ps_pcache, fd,
						    (ssize_t)j, cacheindex);
						break;
					}
				}
				ASSERT(j <= end);
			}
		}
	}
}

#ifdef DEBUG

#include <sys/strsubr.h>
/*
 * make sure curthread is not on anyone's pollhead list any more.
 */
static void
pollcheckphlist()
{
	int i;
	file_t *fp;
	uf_entry_t *ufp;
	uf_info_t *fip = P_FINFO(curproc);
	struct stdata *stp;
	polldat_t *pdp;

	mutex_enter(&fip->fi_lock);
	for (i = 0; i < fip->fi_nfiles; i++) {
		UF_ENTER(ufp, fip, i);
		if ((fp = ufp->uf_file) != NULL) {
			if ((stp = fp->f_vnode->v_stream) != NULL) {
				PH_ENTER(&stp->sd_pollist);
				pdp = stp->sd_pollist.ph_list;
				while (pdp) {
					ASSERT(pdp->pd_thread != curthread);
					pdp = pdp->pd_next;
				}
				PH_EXIT(&stp->sd_pollist);
			}
		}
		UF_EXIT(ufp);
	}
	mutex_exit(&fip->fi_lock);
}

/*
 * for a resolved poll list in a set, the xref info in the pcache should be
 * consistent with this poll list.
 */
static int
pollcheckxref(pollstate_t *ps, int cacheindex)
{
	pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd;
	pollcache_t *pcp = ps->ps_pcache;
	polldat_t *pdp;
	int i;
	xref_t *refp;

	for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) {
		if (pollfdp[i].fd < 0) {
			continue;
		}
		pdp = pcache_lookup_fd(pcp, pollfdp[i].fd);
		ASSERT(pdp != NULL);
		ASSERT(pdp->pd_ref != NULL);
		refp = &pdp->pd_ref[cacheindex];
		if (refp->xf_position >= 0) {
			ASSERT(refp->xf_refcnt >= 1);
			ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd);
			if (refp->xf_refcnt > 1) {
				int j;
				int count = 0;

				for (j = refp->xf_position;
				    j < ps->ps_pcacheset[cacheindex].pcs_nfds;
				    j++) {
					if (pollfdp[j].fd == pdp->pd_fd) {
						count++;
					}
				}
				ASSERT(count == refp->xf_refcnt);
			}
		}
	}
	return (1);
}
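/*
 * Illustrative sketch (not part of the original file): the invariant that
 * pollcheckxref() above asserts is that xf_position records the first
 * occurrence of an fd and, when xf_refcnt > 1, exactly xf_refcnt entries
 * from that position to the end of the list carry the same fd. A
 * stand-alone counter for that property:
 */
#if 0	/* example only; compiled out */
static int
dupcount_from(pollfd_t *list, int nfds, int pos)
{
	int j;
	int count = 0;

	for (j = pos; j < nfds; j++) {
		if (list[j].fd == list[pos].fd)
			count++;
	}
	return (count);		/* should equal xf_refcnt for list[pos].fd */
}
#endif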
/*
 * For every cached pollfd, its polldat struct should be consistent with
 * what is in the pcacheset lists.
 */
static void
checkpolldat(pollstate_t *ps)
{
	pollcache_t *pcp = ps->ps_pcache;
	polldat_t **hashtbl;
	int i;

	hashtbl = pcp->pc_hash;
	for (i = 0; i < pcp->pc_hashsize; i++) {
		polldat_t *pdp;

		for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
			ASSERT(pdp->pd_ref != NULL);
			if (pdp->pd_count > 0) {
				xref_t *refp;
				int j;
				pollcacheset_t *pcsp;
				pollfd_t *pollfd;

				for (j = 0; j < ps->ps_nsets; j++) {
					refp = &pdp->pd_ref[j];
					if (refp->xf_refcnt > 0) {
						pcsp = &ps->ps_pcacheset[j];
						ASSERT(refp->xf_position <
						    pcsp->pcs_nfds);
						pollfd = pcsp->pcs_pollfd;
						ASSERT(pdp->pd_fd ==
						    pollfd[refp->xf_position].fd);
					}
				}
			}
		}
	}
}

/*
 * every wfd element on ph_list must have a corresponding fpollinfo on the
 * uf_fpollinfo list. This is a variation of infpollinfo() w/o holding locks.
 */
void
checkwfdlist(vnode_t *vp, fpollinfo_t *fpip)
{
	stdata_t *stp;
	polldat_t *pdp;
	fpollinfo_t *fpip2;

	if ((stp = vp->v_stream) == NULL) {
		return;
	}
	PH_ENTER(&stp->sd_pollist);
	for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) {
		if (pdp->pd_thread != NULL &&
		    pdp->pd_thread->t_procp == curthread->t_procp) {
			for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) {
				if (pdp->pd_thread == fpip2->fp_thread) {
					break;
				}
			}
			ASSERT(fpip2 != NULL);
		}
	}
	PH_EXIT(&stp->sd_pollist);
}

/*
 * For each cached fd whose bit is not set in the bitmap, its revents field
 * in the current poll list should be 0.
 */
static int
pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex)
{
	pollcache_t *pcp = ps->ps_pcache;
	pollfd_t *pollfdp = ps->ps_pollfd;
	int i;

	for (i = begin; i < end; i++) {
		polldat_t *pdp;

		ASSERT(!BT_TEST(pcp->pc_bitmap, i));
		pdp = pcache_lookup_fd(pcp, i);
		if (pdp && pdp->pd_fp != NULL) {
			xref_t *refp;
			int entry;

			ASSERT(pdp->pd_ref != NULL);
			refp = &pdp->pd_ref[cacheindex];
			if (refp->xf_refcnt == 0) {
				continue;
			}
			entry = refp->xf_position;
			ASSERT(entry >= 0);
			ASSERT(pollfdp[entry].revents == 0);
			if (refp->xf_refcnt > 1) {
				int j;

				for (j = entry + 1; j < ps->ps_nfds; j++) {
					if (pollfdp[j].fd == i) {
						ASSERT(pollfdp[j].revents ==
						    0);
					}
				}
			}
		}
	}
	return (1);
}

#endif	/* DEBUG */
pollcache_t *
pcache_alloc()
{
	return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP));
}

void
pcache_create(pollcache_t *pcp, nfds_t nfds)
{
	size_t mapsize;

	/*
	 * allocate enough bits for the poll fd list
	 */
	if ((mapsize = POLLMAPCHUNK) <= nfds) {
		mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1);
	}
	pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t),
	    KM_SLEEP);
	pcp->pc_mapsize = mapsize;
	/*
	 * The hash size is at least POLLHASHCHUNKSZ. If the user polls a
	 * large number of fds to start with, allocate a bigger hash table
	 * (to the nearest multiple of POLLHASHCHUNKSZ) because dynamically
	 * growing a hash table is expensive.
	 */
	if (nfds < POLLHASHCHUNKSZ) {
		pcp->pc_hashsize = POLLHASHCHUNKSZ;
	} else {
		pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) &
		    ~(POLLHASHCHUNKSZ - 1);
	}
	pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *),
	    KM_SLEEP);
}

void
pcache_destroy(pollcache_t *pcp)
{
	polldat_t **hashtbl;
	int i;

	hashtbl = pcp->pc_hash;
	for (i = 0; i < pcp->pc_hashsize; i++) {
		if (hashtbl[i] != NULL) {
			polldat_t *pdp, *pdp2;

			pdp = hashtbl[i];
			while (pdp != NULL) {
				pdp2 = pdp->pd_hashnext;
				if (pdp->pd_ref != NULL) {
					kmem_free(pdp->pd_ref,
					    sizeof (xref_t) * pdp->pd_nsets);
				}
				kmem_free(pdp, sizeof (polldat_t));
				pdp = pdp2;
				pcp->pc_fdcount--;
			}
		}
	}
	ASSERT(pcp->pc_fdcount == 0);
	kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize);
	kmem_free(pcp->pc_bitmap,
	    sizeof (ulong_t) * (pcp->pc_mapsize / BT_NBIPUL));
	mutex_destroy(&pcp->pc_no_exit);
	mutex_destroy(&pcp->pc_lock);
	cv_destroy(&pcp->pc_cv);
	cv_destroy(&pcp->pc_busy_cv);
	kmem_free(pcp, sizeof (pollcache_t));
}

pollcacheset_t *
pcacheset_create(int nsets)
{
	return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP));
}

void
pcacheset_destroy(pollcacheset_t *pcsp, int nsets)
{
	int i;

	for (i = 0; i < nsets; i++) {
		if (pcsp[i].pcs_pollfd != NULL) {
			kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds *
			    sizeof (pollfd_t));
		}
	}
	kmem_free(pcsp, sizeof (pollcacheset_t) * nsets);
}
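/*
 * Illustrative sketch (not part of the original file): both sizing
 * computations in pcache_create() above use the usual mask trick to round
 * nfds up to the next multiple of a chunk size; the trick is only correct
 * when the chunk is a power of two, which POLLMAPCHUNK and POLLHASHCHUNKSZ
 * are assumed to be here:
 */
#if 0	/* example only; compiled out */
static size_t
roundup_pow2(size_t n, size_t chunk)	/* chunk must be a power of two */
{
	/* e.g. roundup_pow2(300, 256) == 512, roundup_pow2(256, 256) == 256 */
	return ((n + chunk - 1) & ~(chunk - 1));
}
#endif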
/*
 * Check each duplicated poll fd in the poll list. It may be necessary to
 * VOP_POLL the same fd again using different poll events. getf() has been
 * done by the caller. This routine returns 0 if it can successfully process
 * the entire poll fd list. It returns -1 if the underlying vnode has
 * changed during a VOP_POLL, in which case the caller has to repoll. It
 * returns a positive value if VOP_POLL failed.
 */
static int
plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp,
    int entry, int *fdcntp)
{
	int i;
	int fd;
	nfds_t nfds = psp->ps_nfds;

	fd = pollfdp[entry].fd;
	for (i = entry + 1; i < nfds; i++) {
		if (pollfdp[i].fd == fd) {
			if (pollfdp[i].events == pollfdp[entry].events) {
				if ((pollfdp[i].revents =
				    pollfdp[entry].revents) != 0) {
					(*fdcntp)++;
				}
			} else {
				int error;
				pollhead_t *php;
				pollcache_t *pcp = psp->ps_pcache;

				/*
				 * the events are different. VOP_POLL on this
				 * fd so that we don't miss any revents.
				 */
				php = NULL;
				ASSERT(curthread->t_pollcache == NULL);
				error = VOP_POLL(fp->f_vnode,
				    pollfdp[i].events, 0,
				    &pollfdp[i].revents, &php, NULL);
				if (error) {
					return (error);
				}
				/*
				 * layered devices (e.g. the console driver)
				 * may change the vnode and thus the pollhead
				 * pointer out from underneath us.
				 */
				if (php != NULL && pdp->pd_php != NULL &&
				    php != pdp->pd_php) {
					pollhead_delete(pdp->pd_php, pdp);
					pdp->pd_php = php;
					pollhead_insert(php, pdp);
					/*
					 * We could have missed a wakeup on
					 * the new target device. Make sure
					 * the new target gets polled once.
					 */
					BT_SET(pcp->pc_bitmap, fd);
					return (-1);
				}
				if (pollfdp[i].revents) {
					(*fdcntp)++;
				}
			}
		}
	}
	return (0);
}
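/*
 * Illustrative sketch (not part of the original file): the fast path of
 * plist_chkdupfd() above, in isolation. When a duplicated fd requests the
 * same events as the first occurrence, the already-computed revents can be
 * copied instead of calling the driver's poll entry point again:
 */
#if 0	/* example only; compiled out */
static void
copy_dup_revents(pollfd_t *list, int nfds, int entry, int *fdcntp)
{
	int i;

	for (i = entry + 1; i < nfds; i++) {
		if (list[i].fd == list[entry].fd &&
		    list[i].events == list[entry].events) {
			if ((list[i].revents = list[entry].revents) != 0)
				(*fdcntp)++;	/* each ready slot counts */
		}
	}
}
#endif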