1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * Copyright (c) 2012 by Delphix. All rights reserved. 32 * Copyright (c) 2014, Joyent, Inc. All rights reserved. 33 */ 34 35 /* 36 * Portions of this source code were derived from Berkeley 4.3 BSD 37 * under license from the Regents of the University of California. 38 */ 39 40 #include <sys/param.h> 41 #include <sys/isa_defs.h> 42 #include <sys/types.h> 43 #include <sys/sysmacros.h> 44 #include <sys/user.h> 45 #include <sys/systm.h> 46 #include <sys/errno.h> 47 #include <sys/time.h> 48 #include <sys/vnode.h> 49 #include <sys/file.h> 50 #include <sys/mode.h> 51 #include <sys/proc.h> 52 #include <sys/uio.h> 53 #include <sys/poll_impl.h> 54 #include <sys/kmem.h> 55 #include <sys/cmn_err.h> 56 #include <sys/debug.h> 57 #include <sys/bitmap.h> 58 #include <sys/kstat.h> 59 #include <sys/rctl.h> 60 #include <sys/port_impl.h> 61 #include <sys/schedctl.h> 62 #include <sys/cpu.h> 63 64 #define NPHLOCKS 64 /* Number of locks; must be power of 2 */ 65 #define PHLOCKADDR(php) &plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)] 66 #define PHLOCK(php) PHLOCKADDR(php).pp_lock 67 #define PH_ENTER(php) mutex_enter(PHLOCK(php)) 68 #define PH_EXIT(php) mutex_exit(PHLOCK(php)) 69 #define VALID_POLL_EVENTS (POLLIN | POLLPRI | POLLOUT | POLLRDNORM \ 70 | POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL) 71 72 /* 73 * global counters to collect some stats 74 */ 75 static struct { 76 kstat_named_t polllistmiss; /* failed to find a cached poll list */ 77 kstat_named_t pollcachehit; /* list matched 100% w/ cached one */ 78 kstat_named_t pollcachephit; /* list matched < 100% w/ cached one */ 79 kstat_named_t pollcachemiss; /* every list entry is dif from cache */ 80 } pollstats = { 81 { "polllistmiss", KSTAT_DATA_UINT64 }, 82 { "pollcachehit", KSTAT_DATA_UINT64 }, 83 { "pollcachephit", KSTAT_DATA_UINT64 }, 84 { "pollcachemiss", KSTAT_DATA_UINT64 } 85 }; 86 87 kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats; 88 uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t); 89 90 struct pplock { 91 kmutex_t pp_lock; 92 short pp_flag; 93 kcondvar_t pp_wait_cv; 94 int32_t pp_pad; /* to a nice round 16 bytes */ 95 }; 96 97 static struct pplock plocks[NPHLOCKS]; /* Hash array of pollhead locks */ 98 99 #ifdef DEBUG 100 static int pollchecksanity(pollstate_t *, nfds_t); 101 static int pollcheckxref(pollstate_t *, int); 102 static void pollcheckphlist(void); 103 static int pollcheckrevents(pollstate_t *, int, int, int); 104 static void checkpolldat(pollstate_t *); 105 #endif /* DEBUG */ 106 static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int, 107 int *); 108 109 /* 110 * Data structure overview: 111 * The per-thread poll state consists of 112 * one pollstate_t 113 * one pollcache_t 114 * one bitmap with one event bit per fd 115 * a (two-dimensional) hashed array of polldat_t structures - one entry 116 * per fd 117 * 118 * This conglomerate of data structures interact with 119 * the pollhead which is used by VOP_POLL and pollwakeup 120 * (protected by the PHLOCK, cached array of plocks), and 121 * the fpollinfo list hanging off the fi_list which is used to notify 122 * poll when a cached fd is closed. This is protected by uf_lock. 123 * 124 * Invariants: 125 * pd_php (pollhead pointer) is set iff (if and only if) the polldat 126 * is on that pollhead. This is modified atomically under pc_lock. 127 * 128 * pd_fp (file_t pointer) is set iff the thread is on the fpollinfo 129 * list for that open file. 130 * This is modified atomically under pc_lock. 131 * 132 * pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt. 133 * Iff pd_ref[i].xf_refcnt >= 1 then 134 * ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd 135 * Iff pd_ref[i].xf_refcnt > 1 then 136 * In ps_pcacheset[i].pcs_pollfd between index 137 * pd_ref[i].xf_position] and the end of the list 138 * there are xf_refcnt entries with .fd == pd_fd 139 * 140 * Locking design: 141 * Whenever possible the design relies on the fact that the poll cache state 142 * is per thread thus for both poll and exit it is self-synchronizing. 143 * Thus the key interactions where other threads access the state are: 144 * pollwakeup (and polltime), and 145 * close cleaning up the cached references to an open file 146 * 147 * The two key locks in poll proper is ps_lock and pc_lock. 148 * 149 * The ps_lock is used for synchronization between poll, (lwp_)exit and close 150 * to ensure that modifications to pollcacheset structure are serialized. 151 * This lock is held through most of poll() except where poll sleeps 152 * since there is little need to handle closes concurrently with the execution 153 * of poll. 154 * The pc_lock protects most of the fields in pollcache structure and polldat 155 * structures (which are accessed by poll, pollwakeup, and polltime) 156 * with the exception of fields that are only modified when only one thread 157 * can access this per-thread state. 158 * Those exceptions occur in poll when first allocating the per-thread state, 159 * when poll grows the number of polldat (never shrinks), and when 160 * exit/pollcleanup has ensured that there are no references from either 161 * pollheads or fpollinfo to the threads poll state. 162 * 163 * Poll(2) system call is the only path which ps_lock and pc_lock are both 164 * held, in that order. It needs ps_lock to synchronize with close and 165 * lwp_exit; and pc_lock with pollwakeup. 166 * 167 * The locking interaction between pc_lock and PHLOCK take into account 168 * that poll acquires these locks in the order of pc_lock and then PHLOCK 169 * while pollwakeup does it in the reverse order. Thus pollwakeup implements 170 * deadlock avoidance by dropping the locks and reacquiring them in the 171 * reverse order. For this to work pollwakeup needs to prevent the thread 172 * from exiting and freeing all of the poll related state. Thus is done 173 * using 174 * the pc_no_exit lock 175 * the pc_busy counter 176 * the pc_busy_cv condition variable 177 * 178 * The locking interaction between pc_lock and uf_lock has similar 179 * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef 180 * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock 181 * to prevent poll or exit from doing a delfpollinfo after which the thread 182 * might exit. But the cleanup needs to acquire pc_lock when modifying 183 * the poll cache state. The solution is to use pc_busy and do the close 184 * cleanup in two phases: 185 * First close calls pollblockexit which increments pc_busy. 186 * This prevents the per-thread poll related state from being freed. 187 * Then close drops uf_lock and calls pollcacheclean. 188 * This routine can then acquire pc_lock and remove any references 189 * to the closing fd (as well as recording that it has been closed 190 * so that a POLLNVAL can be generated even if the fd is reused before 191 * poll has been woken up and checked getf() again). 192 * 193 * When removing a polled fd from poll cache, the fd is always removed 194 * from pollhead list first and then from fpollinfo list, i.e., 195 * pollhead_delete() is called before delfpollinfo(). 196 * 197 * 198 * Locking hierarchy: 199 * pc_no_exit is a leaf level lock. 200 * ps_lock is held when acquiring pc_lock (except when pollwakeup 201 * acquires pc_lock). 202 * pc_lock might be held when acquiring PHLOCK (pollhead_insert/ 203 * pollhead_delete) 204 * pc_lock is always held (but this is not required) 205 * when acquiring PHLOCK (in polladd/pollhead_delete and pollwakeup called 206 * from pcache_clean_entry). 207 * pc_lock is held across addfpollinfo/delfpollinfo which acquire 208 * uf_lock. 209 * pc_lock is held across getf/releasef which acquire uf_lock. 210 * ps_lock might be held across getf/releasef which acquire uf_lock. 211 * pollwakeup tries to acquire pc_lock while holding PHLOCK 212 * but drops the locks and reacquire them in reverse order to avoid 213 * deadlock. 214 * 215 * Note also that there is deadlock avoidance support for VOP_POLL routines 216 * and pollwakeup involving a file system or driver lock. 217 * See below. 218 */ 219 220 /* 221 * Deadlock avoidance support for VOP_POLL() routines. This is 222 * sometimes necessary to prevent deadlock between polling threads 223 * (which hold poll locks on entry to xx_poll(), then acquire foo) 224 * and pollwakeup() threads (which hold foo, then acquire poll locks). 225 * 226 * pollunlock(void) releases whatever poll locks the current thread holds, 227 * returning a cookie for use by pollrelock(); 228 * 229 * pollrelock(cookie) reacquires previously dropped poll locks; 230 * 231 * polllock(php, mutex) does the common case: pollunlock(), 232 * acquire the problematic mutex, pollrelock(). 233 */ 234 int 235 pollunlock(void) 236 { 237 pollcache_t *pcp; 238 int lockstate = 0; 239 240 /* 241 * t_pollcache is set by /dev/poll and event ports (port_fd.c). 242 * If the pollrelock/pollunlock is called as a result of poll(2), 243 * the t_pollcache should be NULL. 244 */ 245 if (curthread->t_pollcache == NULL) 246 pcp = curthread->t_pollstate->ps_pcache; 247 else 248 pcp = curthread->t_pollcache; 249 250 if (mutex_owned(&pcp->pc_lock)) { 251 lockstate = 1; 252 mutex_exit(&pcp->pc_lock); 253 } 254 return (lockstate); 255 } 256 257 void 258 pollrelock(int lockstate) 259 { 260 pollcache_t *pcp; 261 262 /* 263 * t_pollcache is set by /dev/poll and event ports (port_fd.c). 264 * If the pollrelock/pollunlock is called as a result of poll(2), 265 * the t_pollcache should be NULL. 266 */ 267 if (curthread->t_pollcache == NULL) 268 pcp = curthread->t_pollstate->ps_pcache; 269 else 270 pcp = curthread->t_pollcache; 271 272 if (lockstate > 0) 273 mutex_enter(&pcp->pc_lock); 274 } 275 276 /* ARGSUSED */ 277 void 278 polllock(pollhead_t *php, kmutex_t *lp) 279 { 280 if (!mutex_tryenter(lp)) { 281 int lockstate = pollunlock(); 282 mutex_enter(lp); 283 pollrelock(lockstate); 284 } 285 } 286 287 static int 288 poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) 289 { 290 kthread_t *t = curthread; 291 klwp_t *lwp = ttolwp(t); 292 proc_t *p = ttoproc(t); 293 int fdcnt = 0; 294 int i; 295 hrtime_t deadline; /* hrtime value when we want to return */ 296 pollfd_t *pollfdp; 297 pollstate_t *ps; 298 pollcache_t *pcp; 299 int error = 0; 300 nfds_t old_nfds; 301 int cacheindex = 0; /* which cache set is used */ 302 303 /* 304 * Determine the precise future time of the requested timeout, if any. 305 */ 306 if (tsp == NULL) { 307 deadline = -1; 308 } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { 309 deadline = 0; 310 } else { 311 /* They must wait at least a tick. */ 312 deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec; 313 deadline = MAX(deadline, nsec_per_tick); 314 deadline += gethrtime(); 315 } 316 317 /* 318 * Reset our signal mask, if requested. 319 */ 320 if (ksetp != NULL) { 321 mutex_enter(&p->p_lock); 322 schedctl_finish_sigblock(t); 323 lwp->lwp_sigoldmask = t->t_hold; 324 t->t_hold = *ksetp; 325 t->t_flag |= T_TOMASK; 326 /* 327 * Call cv_reltimedwait_sig() just to check for signals. 328 * We will return immediately with either 0 or -1. 329 */ 330 if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, 331 TR_CLOCK_TICK)) { 332 mutex_exit(&p->p_lock); 333 error = EINTR; 334 goto pollout; 335 } 336 mutex_exit(&p->p_lock); 337 } 338 339 /* 340 * Check to see if this guy just wants to use poll() as a timeout. 341 * If yes then bypass all the other stuff and make him sleep. 342 */ 343 if (nfds == 0) { 344 /* 345 * Sleep until we have passed the requested future 346 * time or until interrupted by a signal. 347 * Do not check for signals if we do not want to wait. 348 */ 349 if (deadline != 0) { 350 mutex_enter(&t->t_delay_lock); 351 while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv, 352 &t->t_delay_lock, deadline)) > 0) 353 continue; 354 mutex_exit(&t->t_delay_lock); 355 error = (error == 0) ? EINTR : 0; 356 } 357 goto pollout; 358 } 359 360 if (nfds > p->p_fno_ctl) { 361 mutex_enter(&p->p_lock); 362 (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], 363 p->p_rctls, p, RCA_SAFE); 364 mutex_exit(&p->p_lock); 365 error = EINVAL; 366 goto pollout; 367 } 368 369 /* 370 * Need to allocate memory for pollstate before anything because 371 * the mutex and cv are created in this space 372 */ 373 if ((ps = t->t_pollstate) == NULL) { 374 t->t_pollstate = pollstate_create(); 375 ps = t->t_pollstate; 376 } 377 378 if (ps->ps_pcache == NULL) 379 ps->ps_pcache = pcache_alloc(); 380 pcp = ps->ps_pcache; 381 382 /* 383 * NOTE: for performance, buffers are saved across poll() calls. 384 * The theory is that if a process polls heavily, it tends to poll 385 * on the same set of descriptors. Therefore, we only reallocate 386 * buffers when nfds changes. There is no hysteresis control, 387 * because there is no data to suggest that this is necessary; 388 * the penalty of reallocating is not *that* great in any event. 389 */ 390 old_nfds = ps->ps_nfds; 391 if (nfds != old_nfds) { 392 393 kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); 394 pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); 395 ps->ps_pollfd = pollfdp; 396 ps->ps_nfds = nfds; 397 } 398 399 pollfdp = ps->ps_pollfd; 400 if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { 401 error = EFAULT; 402 goto pollout; 403 } 404 405 if (fds == NULL) { 406 /* 407 * If the process has page 0 mapped, then the copyin() above 408 * will succeed even if fds is NULL. However, our cached 409 * poll lists are keyed by the address of the passed-in fds 410 * structure, and we use the value NULL to indicate an unused 411 * poll cache list entry. As such, we elect not to support 412 * NULL as a valid (user) memory address and fail the poll() 413 * call. 414 */ 415 error = EINVAL; 416 goto pollout; 417 } 418 419 /* 420 * If this thread polls for the first time, allocate ALL poll 421 * cache data structures and cache the poll fd list. This 422 * allocation is delayed till now because lwp's polling 0 fd 423 * (i.e. using poll as timeout()) don't need this memory. 424 */ 425 mutex_enter(&ps->ps_lock); 426 pcp = ps->ps_pcache; 427 ASSERT(pcp != NULL); 428 if (pcp->pc_bitmap == NULL) { 429 pcache_create(pcp, nfds); 430 /* 431 * poll and cache this poll fd list in ps_pcacheset[0]. 432 */ 433 error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex); 434 if (fdcnt || error) { 435 mutex_exit(&ps->ps_lock); 436 goto pollout; 437 } 438 } else { 439 pollcacheset_t *pcset = ps->ps_pcacheset; 440 441 /* 442 * Not first time polling. Select a cached poll list by 443 * matching user pollfd list buffer address. 444 */ 445 for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) { 446 if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) { 447 if ((++pcset[cacheindex].pcs_count) == 0) { 448 /* 449 * counter is wrapping around. 450 */ 451 pcacheset_reset_count(ps, cacheindex); 452 } 453 /* 454 * examine and resolve possible 455 * difference of the current poll 456 * list and previously cached one. 457 * If there is an error during resolve(), 458 * the callee will guarantee the consistency 459 * of cached poll list and cache content. 460 */ 461 error = pcacheset_resolve(ps, nfds, &fdcnt, 462 cacheindex); 463 if (error) { 464 mutex_exit(&ps->ps_lock); 465 goto pollout; 466 } 467 break; 468 } 469 470 /* 471 * Note that pcs_usradr field of an used entry won't be 472 * NULL because it stores the address of passed-in fds, 473 * and NULL fds will not be cached (Then it is either 474 * the special timeout case when nfds is 0 or it returns 475 * failure directly). 476 */ 477 if (pcset[cacheindex].pcs_usradr == NULL) { 478 /* 479 * found an unused entry. Use it to cache 480 * this poll list. 481 */ 482 error = pcacheset_cache_list(ps, fds, &fdcnt, 483 cacheindex); 484 if (fdcnt || error) { 485 mutex_exit(&ps->ps_lock); 486 goto pollout; 487 } 488 break; 489 } 490 } 491 if (cacheindex == ps->ps_nsets) { 492 /* 493 * We failed to find a matching cached poll fd list. 494 * replace an old list. 495 */ 496 pollstats.polllistmiss.value.ui64++; 497 cacheindex = pcacheset_replace(ps); 498 ASSERT(cacheindex < ps->ps_nsets); 499 pcset[cacheindex].pcs_usradr = (uintptr_t)fds; 500 error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex); 501 if (error) { 502 mutex_exit(&ps->ps_lock); 503 goto pollout; 504 } 505 } 506 } 507 508 /* 509 * Always scan the bitmap with the lock on the pollcache held. 510 * This is to make sure that a wakeup does not come undetected. 511 * If the lock is not held, a pollwakeup could have come for an 512 * fd we already checked but before this thread sleeps, in which 513 * case the wakeup is missed. Now we hold the pcache lock and 514 * check the bitmap again. This will prevent wakeup from happening 515 * while we hold pcache lock since pollwakeup() will also lock 516 * the pcache before updating poll bitmap. 517 */ 518 mutex_enter(&pcp->pc_lock); 519 for (;;) { 520 pcp->pc_flag = 0; 521 error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex); 522 if (fdcnt || error) { 523 mutex_exit(&pcp->pc_lock); 524 mutex_exit(&ps->ps_lock); 525 break; 526 } 527 528 /* 529 * If PC_POLLWAKE is set, a pollwakeup() was performed on 530 * one of the file descriptors. This can happen only if 531 * one of the VOP_POLL() functions dropped pcp->pc_lock. 532 * The only current cases of this is in procfs (prpoll()) 533 * and STREAMS (strpoll()). 534 */ 535 if (pcp->pc_flag & PC_POLLWAKE) 536 continue; 537 538 /* 539 * If you get here, the poll of fds was unsuccessful. 540 * Wait until some fd becomes readable, writable, or gets 541 * an exception, or until a signal or a timeout occurs. 542 * Do not check for signals if we have a zero timeout. 543 */ 544 mutex_exit(&ps->ps_lock); 545 if (deadline == 0) { 546 error = -1; 547 } else { 548 error = cv_timedwait_sig_hrtime(&pcp->pc_cv, 549 &pcp->pc_lock, deadline); 550 } 551 mutex_exit(&pcp->pc_lock); 552 /* 553 * If we have received a signal or timed out 554 * then break out and return. 555 */ 556 if (error <= 0) { 557 error = (error == 0) ? EINTR : 0; 558 break; 559 } 560 /* 561 * We have not received a signal or timed out. 562 * Continue around and poll fds again. 563 */ 564 mutex_enter(&ps->ps_lock); 565 mutex_enter(&pcp->pc_lock); 566 } 567 568 pollout: 569 /* 570 * If we changed the signal mask but we received 571 * no signal then restore the signal mask. 572 * Otherwise psig() will deal with the signal mask. 573 */ 574 if (ksetp != NULL) { 575 mutex_enter(&p->p_lock); 576 if (lwp->lwp_cursig == 0) { 577 t->t_hold = lwp->lwp_sigoldmask; 578 t->t_flag &= ~T_TOMASK; 579 } 580 mutex_exit(&p->p_lock); 581 } 582 583 if (error) 584 return (set_errno(error)); 585 586 /* 587 * Copy out the events and return the fdcnt to the user. 588 */ 589 if (nfds != 0 && 590 copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) 591 return (set_errno(EFAULT)); 592 593 #ifdef DEBUG 594 /* 595 * Another sanity check: 596 */ 597 if (fdcnt) { 598 int reventcnt = 0; 599 600 for (i = 0; i < nfds; i++) { 601 if (pollfdp[i].fd < 0) { 602 ASSERT(pollfdp[i].revents == 0); 603 continue; 604 } 605 if (pollfdp[i].revents) { 606 reventcnt++; 607 } 608 } 609 ASSERT(fdcnt == reventcnt); 610 } else { 611 for (i = 0; i < nfds; i++) { 612 ASSERT(pollfdp[i].revents == 0); 613 } 614 } 615 #endif /* DEBUG */ 616 617 return (fdcnt); 618 } 619 620 /* 621 * This is the system call trap that poll(), 622 * select() and pselect() are built upon. 623 * It is a private interface between libc and the kernel. 624 */ 625 int 626 pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) 627 { 628 timespec_t ts; 629 timespec_t *tsp; 630 sigset_t set; 631 k_sigset_t kset; 632 k_sigset_t *ksetp; 633 model_t datamodel = get_udatamodel(); 634 635 if (timeoutp == NULL) 636 tsp = NULL; 637 else { 638 if (datamodel == DATAMODEL_NATIVE) { 639 if (copyin(timeoutp, &ts, sizeof (ts))) 640 return (set_errno(EFAULT)); 641 } else { 642 timespec32_t ts32; 643 644 if (copyin(timeoutp, &ts32, sizeof (ts32))) 645 return (set_errno(EFAULT)); 646 TIMESPEC32_TO_TIMESPEC(&ts, &ts32) 647 } 648 649 if (itimerspecfix(&ts)) 650 return (set_errno(EINVAL)); 651 tsp = &ts; 652 } 653 654 if (setp == NULL) 655 ksetp = NULL; 656 else { 657 if (copyin(setp, &set, sizeof (set))) 658 return (set_errno(EFAULT)); 659 sigutok(&set, &kset); 660 ksetp = &kset; 661 } 662 663 return (poll_common(fds, nfds, tsp, ksetp)); 664 } 665 666 /* 667 * Clean up any state left around by poll(2). Called when a thread exits. 668 */ 669 void 670 pollcleanup() 671 { 672 pollstate_t *ps = curthread->t_pollstate; 673 pollcache_t *pcp; 674 675 if (ps == NULL) 676 return; 677 pcp = ps->ps_pcache; 678 /* 679 * free up all cached poll fds 680 */ 681 if (pcp == NULL) { 682 /* this pollstate is used by /dev/poll */ 683 goto pollcleanout; 684 } 685 686 if (pcp->pc_bitmap != NULL) { 687 ASSERT(MUTEX_NOT_HELD(&ps->ps_lock)); 688 /* 689 * a close lwp can race with us when cleaning up a polldat 690 * entry. We hold the ps_lock when cleaning hash table. 691 * Since this pollcache is going away anyway, there is no 692 * need to hold the pc_lock. 693 */ 694 mutex_enter(&ps->ps_lock); 695 pcache_clean(pcp); 696 mutex_exit(&ps->ps_lock); 697 #ifdef DEBUG 698 /* 699 * At this point, all fds cached by this lwp should be 700 * cleaned up. There should be no fd in fi_list still 701 * reference this thread. 702 */ 703 checkfpollinfo(); /* sanity check */ 704 pollcheckphlist(); /* sanity check */ 705 #endif /* DEBUG */ 706 } 707 /* 708 * Be sure no one is referencing thread before exiting 709 */ 710 mutex_enter(&pcp->pc_no_exit); 711 ASSERT(pcp->pc_busy >= 0); 712 while (pcp->pc_busy > 0) 713 cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit); 714 mutex_exit(&pcp->pc_no_exit); 715 pollcleanout: 716 pollstate_destroy(ps); 717 curthread->t_pollstate = NULL; 718 } 719 720 /* 721 * pollwakeup() - poke threads waiting in poll() for some event 722 * on a particular object. 723 * 724 * The threads hanging off of the specified pollhead structure are scanned. 725 * If their event mask matches the specified event(s), then pollnotify() is 726 * called to poke the thread. 727 * 728 * Multiple events may be specified. When POLLHUP or POLLERR are specified, 729 * all waiting threads are poked. 730 * 731 * It is important that pollnotify() not drop the lock protecting the list 732 * of threads. 733 */ 734 void 735 pollwakeup(pollhead_t *php, short events_arg) 736 { 737 polldat_t *pdp; 738 int events = (ushort_t)events_arg; 739 struct plist { 740 port_t *pp; 741 int pevents; 742 struct plist *next; 743 }; 744 struct plist *plhead = NULL, *pltail = NULL; 745 746 retry: 747 PH_ENTER(php); 748 749 for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) { 750 if ((pdp->pd_events & events) || 751 (events & (POLLHUP | POLLERR))) { 752 753 pollcache_t *pcp; 754 755 if (pdp->pd_portev != NULL) { 756 port_kevent_t *pkevp = pdp->pd_portev; 757 /* 758 * Object (fd) is associated with an event port, 759 * => send event notification to the port. 760 */ 761 ASSERT(pkevp->portkev_source == PORT_SOURCE_FD); 762 mutex_enter(&pkevp->portkev_lock); 763 if (pkevp->portkev_flags & PORT_KEV_VALID) { 764 int pevents; 765 766 pkevp->portkev_flags &= ~PORT_KEV_VALID; 767 pkevp->portkev_events |= events & 768 (pdp->pd_events | POLLHUP | 769 POLLERR); 770 /* 771 * portkev_lock mutex will be released 772 * by port_send_event(). 773 */ 774 port_send_event(pkevp); 775 776 /* 777 * If we have some thread polling the 778 * port's fd, add it to the list. They 779 * will be notified later. 780 * The port_pollwkup() will flag the 781 * port_t so that it will not disappear 782 * till port_pollwkdone() is called. 783 */ 784 pevents = 785 port_pollwkup(pkevp->portkev_port); 786 if (pevents) { 787 struct plist *t; 788 t = kmem_zalloc( 789 sizeof (struct plist), 790 KM_SLEEP); 791 t->pp = pkevp->portkev_port; 792 t->pevents = pevents; 793 if (plhead == NULL) { 794 plhead = t; 795 } else { 796 pltail->next = t; 797 } 798 pltail = t; 799 } 800 } else { 801 mutex_exit(&pkevp->portkev_lock); 802 } 803 continue; 804 } 805 806 pcp = pdp->pd_pcache; 807 808 /* 809 * Try to grab the lock for this thread. If 810 * we don't get it then we may deadlock so 811 * back out and restart all over again. Note 812 * that the failure rate is very very low. 813 */ 814 if (mutex_tryenter(&pcp->pc_lock)) { 815 pollnotify(pcp, pdp->pd_fd); 816 mutex_exit(&pcp->pc_lock); 817 } else { 818 /* 819 * We are here because: 820 * 1) This thread has been woke up 821 * and is trying to get out of poll(). 822 * 2) Some other thread is also here 823 * but with a different pollhead lock. 824 * 825 * So, we need to drop the lock on pollhead 826 * because of (1) but we want to prevent 827 * that thread from doing lwp_exit() or 828 * devpoll close. We want to ensure that 829 * the pollcache pointer is still invalid. 830 * 831 * Solution: Grab the pcp->pc_no_exit lock, 832 * increment the pc_busy counter, drop every 833 * lock in sight. Get out of the way and wait 834 * for type (2) threads to finish. 835 */ 836 837 mutex_enter(&pcp->pc_no_exit); 838 pcp->pc_busy++; /* prevents exit()'s */ 839 mutex_exit(&pcp->pc_no_exit); 840 841 PH_EXIT(php); 842 mutex_enter(&pcp->pc_lock); 843 mutex_exit(&pcp->pc_lock); 844 mutex_enter(&pcp->pc_no_exit); 845 pcp->pc_busy--; 846 if (pcp->pc_busy == 0) { 847 /* 848 * Wakeup the thread waiting in 849 * thread_exit(). 850 */ 851 cv_signal(&pcp->pc_busy_cv); 852 } 853 mutex_exit(&pcp->pc_no_exit); 854 goto retry; 855 } 856 } 857 } 858 859 860 /* 861 * Event ports - If this php is of the port on the list, 862 * call port_pollwkdone() to release it. The port_pollwkdone() 863 * needs to be called before dropping the PH lock so that any new 864 * thread attempting to poll this port are blocked. There can be 865 * only one thread here in pollwakeup notifying this port's fd. 866 */ 867 if (plhead != NULL && &plhead->pp->port_pollhd == php) { 868 struct plist *t; 869 port_pollwkdone(plhead->pp); 870 t = plhead; 871 plhead = plhead->next; 872 kmem_free(t, sizeof (struct plist)); 873 } 874 PH_EXIT(php); 875 876 /* 877 * Event ports - Notify threads polling the event port's fd. 878 * This is normally done in port_send_event() where it calls 879 * pollwakeup() on the port. But, for PORT_SOURCE_FD source alone, 880 * we do it here in pollwakeup() to avoid a recursive call. 881 */ 882 if (plhead != NULL) { 883 php = &plhead->pp->port_pollhd; 884 events = plhead->pevents; 885 goto retry; 886 } 887 } 888 889 /* 890 * This function is called to inform a thread (or threads) that an event being 891 * polled on has occurred. The pollstate lock on the thread should be held 892 * on entry. 893 */ 894 void 895 pollnotify(pollcache_t *pcp, int fd) 896 { 897 ASSERT(fd < pcp->pc_mapsize); 898 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 899 BT_SET(pcp->pc_bitmap, fd); 900 pcp->pc_flag |= PC_POLLWAKE; 901 cv_broadcast(&pcp->pc_cv); 902 } 903 904 /* 905 * add a polldat entry to pollhead ph_list. The polldat struct is used 906 * by pollwakeup to wake sleeping pollers when polled events has happened. 907 */ 908 void 909 pollhead_insert(pollhead_t *php, polldat_t *pdp) 910 { 911 PH_ENTER(php); 912 ASSERT(pdp->pd_next == NULL); 913 #ifdef DEBUG 914 { 915 /* 916 * the polldat should not be already on the list 917 */ 918 polldat_t *wp; 919 for (wp = php->ph_list; wp; wp = wp->pd_next) { 920 ASSERT(wp != pdp); 921 } 922 } 923 #endif /* DEBUG */ 924 pdp->pd_next = php->ph_list; 925 php->ph_list = pdp; 926 PH_EXIT(php); 927 } 928 929 /* 930 * Delete the polldat entry from ph_list. 931 */ 932 void 933 pollhead_delete(pollhead_t *php, polldat_t *pdp) 934 { 935 polldat_t *wp; 936 polldat_t **wpp; 937 938 PH_ENTER(php); 939 for (wpp = &php->ph_list; (wp = *wpp) != NULL; wpp = &wp->pd_next) { 940 if (wp == pdp) { 941 *wpp = pdp->pd_next; 942 pdp->pd_next = NULL; 943 break; 944 } 945 } 946 #ifdef DEBUG 947 /* assert that pdp is no longer in the list */ 948 for (wp = *wpp; wp; wp = wp->pd_next) { 949 ASSERT(wp != pdp); 950 } 951 #endif /* DEBUG */ 952 PH_EXIT(php); 953 } 954 955 /* 956 * walk through the poll fd lists to see if they are identical. This is an 957 * expensive operation and should not be done more than once for each poll() 958 * call. 959 * 960 * As an optimization (i.e., not having to go through the lists more than 961 * once), this routine also clear the revents field of pollfd in 'current'. 962 * Zeroing out the revents field of each entry in current poll list is 963 * required by poll man page. 964 * 965 * Since the events field of cached list has illegal poll events filtered 966 * out, the current list applies the same filtering before comparison. 967 * 968 * The routine stops when it detects a meaningful difference, or when it 969 * exhausts the lists. 970 */ 971 int 972 pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n) 973 { 974 int ix; 975 976 for (ix = 0; ix < n; ix++) { 977 /* Prefetch 64 bytes worth of 8-byte elements */ 978 if ((ix & 0x7) == 0) { 979 prefetch_write_many((caddr_t)¤t[ix + 8]); 980 prefetch_write_many((caddr_t)&cached[ix + 8]); 981 } 982 if (current[ix].fd == cached[ix].fd) { 983 /* 984 * Filter out invalid poll events while we are in 985 * inside the loop. 986 */ 987 if (current[ix].events & ~VALID_POLL_EVENTS) { 988 current[ix].events &= VALID_POLL_EVENTS; 989 if (newlist != NULL) 990 newlist[ix].events = current[ix].events; 991 } 992 if (current[ix].events == cached[ix].events) { 993 current[ix].revents = 0; 994 continue; 995 } 996 } 997 if ((current[ix].fd < 0) && (cached[ix].fd < 0)) { 998 current[ix].revents = 0; 999 continue; 1000 } 1001 return (ix); 1002 } 1003 return (ix); 1004 } 1005 1006 /* 1007 * This routine returns a pointer to a cached poll fd entry, or NULL if it 1008 * does not find it in the hash table. 1009 */ 1010 polldat_t * 1011 pcache_lookup_fd(pollcache_t *pcp, int fd) 1012 { 1013 int hashindex; 1014 polldat_t *pdp; 1015 1016 hashindex = POLLHASH(pcp->pc_hashsize, fd); 1017 pdp = pcp->pc_hash[hashindex]; 1018 while (pdp != NULL) { 1019 if (pdp->pd_fd == fd) 1020 break; 1021 pdp = pdp->pd_hashnext; 1022 } 1023 return (pdp); 1024 } 1025 1026 polldat_t * 1027 pcache_alloc_fd(int nsets) 1028 { 1029 polldat_t *pdp; 1030 1031 pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP); 1032 if (nsets > 0) { 1033 pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP); 1034 pdp->pd_nsets = nsets; 1035 } 1036 return (pdp); 1037 } 1038 1039 /* 1040 * This routine inserts a polldat into the pollcache's hash table. It 1041 * may be necessary to grow the size of the hash table. 1042 */ 1043 void 1044 pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds) 1045 { 1046 int hashindex; 1047 int fd; 1048 1049 if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) || 1050 (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) { 1051 pcache_grow_hashtbl(pcp, nfds); 1052 } 1053 fd = pdp->pd_fd; 1054 hashindex = POLLHASH(pcp->pc_hashsize, fd); 1055 pdp->pd_hashnext = pcp->pc_hash[hashindex]; 1056 pcp->pc_hash[hashindex] = pdp; 1057 pcp->pc_fdcount++; 1058 1059 #ifdef DEBUG 1060 { 1061 /* 1062 * same fd should not appear on a hash list twice 1063 */ 1064 polldat_t *pdp1; 1065 for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) { 1066 ASSERT(pdp->pd_fd != pdp1->pd_fd); 1067 } 1068 } 1069 #endif /* DEBUG */ 1070 } 1071 1072 /* 1073 * Grow the hash table -- either double the table size or round it to the 1074 * nearest multiples of POLLHASHCHUNKSZ, whichever is bigger. Rehash all the 1075 * elements on the hash table. 1076 */ 1077 void 1078 pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds) 1079 { 1080 int oldsize; 1081 polldat_t **oldtbl; 1082 polldat_t *pdp, *pdp1; 1083 int i; 1084 #ifdef DEBUG 1085 int count = 0; 1086 #endif 1087 1088 ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0); 1089 oldsize = pcp->pc_hashsize; 1090 oldtbl = pcp->pc_hash; 1091 if (nfds > pcp->pc_hashsize * POLLHASHINC) { 1092 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 1093 ~(POLLHASHCHUNKSZ - 1); 1094 } else { 1095 pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC; 1096 } 1097 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 1098 KM_SLEEP); 1099 /* 1100 * rehash existing elements 1101 */ 1102 pcp->pc_fdcount = 0; 1103 for (i = 0; i < oldsize; i++) { 1104 pdp = oldtbl[i]; 1105 while (pdp != NULL) { 1106 pdp1 = pdp->pd_hashnext; 1107 pcache_insert_fd(pcp, pdp, nfds); 1108 pdp = pdp1; 1109 #ifdef DEBUG 1110 count++; 1111 #endif 1112 } 1113 } 1114 kmem_free(oldtbl, oldsize * sizeof (polldat_t *)); 1115 ASSERT(pcp->pc_fdcount == count); 1116 } 1117 1118 void 1119 pcache_grow_map(pollcache_t *pcp, int fd) 1120 { 1121 int newsize; 1122 ulong_t *newmap; 1123 1124 /* 1125 * grow to nearest multiple of POLLMAPCHUNK, assuming POLLMAPCHUNK is 1126 * power of 2. 1127 */ 1128 newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1); 1129 newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t), 1130 KM_SLEEP); 1131 /* 1132 * don't want pollwakeup to set a bit while growing the bitmap. 1133 */ 1134 ASSERT(mutex_owned(&pcp->pc_lock) == 0); 1135 mutex_enter(&pcp->pc_lock); 1136 bcopy(pcp->pc_bitmap, newmap, 1137 (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t)); 1138 kmem_free(pcp->pc_bitmap, 1139 (pcp->pc_mapsize /BT_NBIPUL) * sizeof (ulong_t)); 1140 pcp->pc_bitmap = newmap; 1141 pcp->pc_mapsize = newsize; 1142 mutex_exit(&pcp->pc_lock); 1143 } 1144 1145 /* 1146 * remove all the reference from pollhead list and fpollinfo lists. 1147 */ 1148 void 1149 pcache_clean(pollcache_t *pcp) 1150 { 1151 int i; 1152 polldat_t **hashtbl; 1153 polldat_t *pdp; 1154 1155 ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock)); 1156 hashtbl = pcp->pc_hash; 1157 for (i = 0; i < pcp->pc_hashsize; i++) { 1158 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 1159 if (pdp->pd_php != NULL) { 1160 pollhead_delete(pdp->pd_php, pdp); 1161 pdp->pd_php = NULL; 1162 } 1163 if (pdp->pd_fp != NULL) { 1164 delfpollinfo(pdp->pd_fd); 1165 pdp->pd_fp = NULL; 1166 } 1167 } 1168 } 1169 } 1170 1171 void 1172 pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp) 1173 { 1174 int i; 1175 int fd = pdp->pd_fd; 1176 1177 /* 1178 * we come here because an earlier close() on this cached poll fd. 1179 */ 1180 ASSERT(pdp->pd_fp == NULL); 1181 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1182 pdp->pd_events = 0; 1183 for (i = 0; i < ps->ps_nsets; i++) { 1184 xref_t *refp; 1185 pollcacheset_t *pcsp; 1186 1187 ASSERT(pdp->pd_ref != NULL); 1188 refp = &pdp->pd_ref[i]; 1189 if (refp->xf_refcnt) { 1190 ASSERT(refp->xf_position >= 0); 1191 pcsp = &ps->ps_pcacheset[i]; 1192 if (refp->xf_refcnt == 1) { 1193 pcsp->pcs_pollfd[refp->xf_position].fd = -1; 1194 refp->xf_refcnt = 0; 1195 pdp->pd_count--; 1196 } else if (refp->xf_refcnt > 1) { 1197 int j; 1198 1199 /* 1200 * turn off every appearance in pcs_pollfd list 1201 */ 1202 for (j = refp->xf_position; 1203 j < pcsp->pcs_nfds; j++) { 1204 if (pcsp->pcs_pollfd[j].fd == fd) { 1205 pcsp->pcs_pollfd[j].fd = -1; 1206 refp->xf_refcnt--; 1207 pdp->pd_count--; 1208 } 1209 } 1210 } 1211 ASSERT(refp->xf_refcnt == 0); 1212 refp->xf_position = POLLPOSINVAL; 1213 } 1214 } 1215 ASSERT(pdp->pd_count == 0); 1216 } 1217 1218 /* 1219 * Insert poll fd into the pollcache, and add poll registration. 1220 * This routine is called after getf() and before releasef(). So the vnode 1221 * can not disappear even if we block here. 1222 * If there is an error, the polled fd is not cached. 1223 */ 1224 int 1225 pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, 1226 ssize_t pos, int which) 1227 { 1228 pollcache_t *pcp = ps->ps_pcache; 1229 polldat_t *pdp; 1230 int error; 1231 int fd; 1232 pollhead_t *memphp = NULL; 1233 xref_t *refp; 1234 int newpollfd = 0; 1235 1236 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1237 /* 1238 * The poll caching uses the existing VOP_POLL interface. If there 1239 * is no polled events, we want the polled device to set its "some 1240 * one is sleeping in poll" flag. When the polled events happen 1241 * later, the driver will call pollwakeup(). We achieve this by 1242 * always passing 0 in the third parameter ("anyyet") when calling 1243 * VOP_POLL. This parameter is not looked at by drivers when the 1244 * polled events exist. If a driver chooses to ignore this parameter 1245 * and call pollwakeup whenever the polled events happen, that will 1246 * be OK too. 1247 */ 1248 ASSERT(curthread->t_pollcache == NULL); 1249 error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents, 1250 &memphp, NULL); 1251 if (error) { 1252 return (error); 1253 } 1254 if (pollfdp->revents) { 1255 (*fdcntp)++; 1256 } 1257 /* 1258 * polling the underlying device succeeded. Now we can cache it. 1259 * A close can't come in here because we have not done a releasef() 1260 * yet. 1261 */ 1262 fd = pollfdp->fd; 1263 pdp = pcache_lookup_fd(pcp, fd); 1264 if (pdp == NULL) { 1265 ASSERT(ps->ps_nsets > 0); 1266 pdp = pcache_alloc_fd(ps->ps_nsets); 1267 newpollfd = 1; 1268 } 1269 /* 1270 * If this entry was used to cache a poll fd which was closed, and 1271 * this entry has not been cleaned, do it now. 1272 */ 1273 if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) { 1274 pcacheset_invalidate(ps, pdp); 1275 ASSERT(pdp->pd_next == NULL); 1276 } 1277 if (pdp->pd_count == 0) { 1278 pdp->pd_fd = fd; 1279 pdp->pd_fp = fp; 1280 addfpollinfo(fd); 1281 pdp->pd_thread = curthread; 1282 pdp->pd_pcache = pcp; 1283 /* 1284 * the entry is never used or cleared by removing a cached 1285 * pollfd (pcache_delete_fd). So all the fields should be clear. 1286 */ 1287 ASSERT(pdp->pd_next == NULL); 1288 } 1289 1290 /* 1291 * A polled fd is considered cached. So there should be a fpollinfo 1292 * entry on uf_fpollinfo list. 1293 */ 1294 ASSERT(infpollinfo(fd)); 1295 /* 1296 * If there is an inconsistency, we want to know it here. 1297 */ 1298 ASSERT(pdp->pd_fp == fp); 1299 1300 /* 1301 * XXX pd_events is a union of all polled events on this fd, possibly 1302 * by different threads. Unless this is a new first poll(), pd_events 1303 * never shrinks. If an event is no longer polled by a process, there 1304 * is no way to cancel that event. In that case, poll degrade to its 1305 * old form -- polling on this fd every time poll() is called. The 1306 * assumption is an app always polls the same type of events. 1307 */ 1308 pdp->pd_events |= pollfdp->events; 1309 1310 pdp->pd_count++; 1311 /* 1312 * There is not much special handling for multiple appearances of 1313 * same fd other than xf_position always recording the first 1314 * appearance in poll list. If this is called from pcacheset_cache_list, 1315 * a VOP_POLL is called on every pollfd entry; therefore each 1316 * revents and fdcnt should be set correctly. If this is called from 1317 * pcacheset_resolve, we don't care about fdcnt here. Pollreadmap will 1318 * pick up the right count and handle revents field of each pollfd 1319 * entry. 1320 */ 1321 ASSERT(pdp->pd_ref != NULL); 1322 refp = &pdp->pd_ref[which]; 1323 if (refp->xf_refcnt == 0) { 1324 refp->xf_position = pos; 1325 } else { 1326 /* 1327 * xf_position records the fd's first appearance in poll list 1328 */ 1329 if (pos < refp->xf_position) { 1330 refp->xf_position = pos; 1331 } 1332 } 1333 ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd); 1334 refp->xf_refcnt++; 1335 if (fd >= pcp->pc_mapsize) { 1336 pcache_grow_map(pcp, fd); 1337 } 1338 if (fd > pcp->pc_mapend) { 1339 pcp->pc_mapend = fd; 1340 } 1341 if (newpollfd != 0) { 1342 pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds); 1343 } 1344 if (memphp) { 1345 if (pdp->pd_php == NULL) { 1346 pollhead_insert(memphp, pdp); 1347 pdp->pd_php = memphp; 1348 } else { 1349 if (memphp != pdp->pd_php) { 1350 /* 1351 * layered devices (e.g. console driver) 1352 * may change the vnode and thus the pollhead 1353 * pointer out from underneath us. 1354 */ 1355 pollhead_delete(pdp->pd_php, pdp); 1356 pollhead_insert(memphp, pdp); 1357 pdp->pd_php = memphp; 1358 } 1359 } 1360 } 1361 /* 1362 * Since there is a considerable window between VOP_POLL and when 1363 * we actually put the polldat struct on the pollhead list, we could 1364 * miss a pollwakeup. In the case of polling additional events, we 1365 * don't update the events until after VOP_POLL. So we could miss 1366 * pollwakeup there too. So we always set the bit here just to be 1367 * safe. The real performance gain is in subsequent pcache_poll. 1368 */ 1369 mutex_enter(&pcp->pc_lock); 1370 BT_SET(pcp->pc_bitmap, fd); 1371 mutex_exit(&pcp->pc_lock); 1372 return (0); 1373 } 1374 1375 /* 1376 * The entry is not really deleted. The fields are cleared so that the 1377 * entry is no longer useful, but it will remain in the hash table for reuse 1378 * later. It will be freed when the polling lwp exits. 1379 */ 1380 int 1381 pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent) 1382 { 1383 pollcache_t *pcp = ps->ps_pcache; 1384 polldat_t *pdp; 1385 xref_t *refp; 1386 1387 ASSERT(fd < pcp->pc_mapsize); 1388 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1389 1390 pdp = pcache_lookup_fd(pcp, fd); 1391 ASSERT(pdp != NULL); 1392 ASSERT(pdp->pd_count > 0); 1393 ASSERT(pdp->pd_ref != NULL); 1394 refp = &pdp->pd_ref[which]; 1395 if (pdp->pd_count == 1) { 1396 pdp->pd_events = 0; 1397 refp->xf_position = POLLPOSINVAL; 1398 ASSERT(refp->xf_refcnt == 1); 1399 refp->xf_refcnt = 0; 1400 if (pdp->pd_php) { 1401 /* 1402 * It is possible for a wakeup thread to get ahead 1403 * of the following pollhead_delete and set the bit in 1404 * bitmap. It is OK because the bit will be cleared 1405 * here anyway. 1406 */ 1407 pollhead_delete(pdp->pd_php, pdp); 1408 pdp->pd_php = NULL; 1409 } 1410 pdp->pd_count = 0; 1411 if (pdp->pd_fp != NULL) { 1412 pdp->pd_fp = NULL; 1413 delfpollinfo(fd); 1414 } 1415 mutex_enter(&pcp->pc_lock); 1416 BT_CLEAR(pcp->pc_bitmap, fd); 1417 mutex_exit(&pcp->pc_lock); 1418 return (0); 1419 } 1420 if ((cevent & POLLCLOSED) == POLLCLOSED) { 1421 /* 1422 * fd cached here has been closed. This is the first 1423 * pcache_delete_fd called after the close. Clean up the 1424 * entire entry. 1425 */ 1426 pcacheset_invalidate(ps, pdp); 1427 ASSERT(pdp->pd_php == NULL); 1428 mutex_enter(&pcp->pc_lock); 1429 BT_CLEAR(pcp->pc_bitmap, fd); 1430 mutex_exit(&pcp->pc_lock); 1431 return (0); 1432 } 1433 #ifdef DEBUG 1434 if (getf(fd) != NULL) { 1435 ASSERT(infpollinfo(fd)); 1436 releasef(fd); 1437 } 1438 #endif /* DEBUG */ 1439 pdp->pd_count--; 1440 ASSERT(refp->xf_refcnt > 0); 1441 if (--refp->xf_refcnt == 0) { 1442 refp->xf_position = POLLPOSINVAL; 1443 } else { 1444 ASSERT(pos >= refp->xf_position); 1445 if (pos == refp->xf_position) { 1446 /* 1447 * The xref position is no longer valid. 1448 * Reset it to a special value and let 1449 * caller know it needs to updatexref() 1450 * with a new xf_position value. 1451 */ 1452 refp->xf_position = POLLPOSTRANS; 1453 return (1); 1454 } 1455 } 1456 return (0); 1457 } 1458 1459 void 1460 pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which) 1461 { 1462 polldat_t *pdp; 1463 1464 pdp = pcache_lookup_fd(pcp, fd); 1465 ASSERT(pdp != NULL); 1466 ASSERT(pdp->pd_ref != NULL); 1467 pdp->pd_ref[which].xf_position = pos; 1468 } 1469 1470 #ifdef DEBUG 1471 /* 1472 * For each polled fd, it's either in the bitmap or cached in 1473 * pcache hash table. If this routine returns 0, something is wrong. 1474 */ 1475 static int 1476 pollchecksanity(pollstate_t *ps, nfds_t nfds) 1477 { 1478 int i; 1479 int fd; 1480 pollcache_t *pcp = ps->ps_pcache; 1481 polldat_t *pdp; 1482 pollfd_t *pollfdp = ps->ps_pollfd; 1483 file_t *fp; 1484 1485 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1486 for (i = 0; i < nfds; i++) { 1487 fd = pollfdp[i].fd; 1488 if (fd < 0) { 1489 ASSERT(pollfdp[i].revents == 0); 1490 continue; 1491 } 1492 if (pollfdp[i].revents == POLLNVAL) 1493 continue; 1494 if ((fp = getf(fd)) == NULL) 1495 continue; 1496 pdp = pcache_lookup_fd(pcp, fd); 1497 ASSERT(pdp != NULL); 1498 ASSERT(infpollinfo(fd)); 1499 ASSERT(pdp->pd_fp == fp); 1500 releasef(fd); 1501 if (BT_TEST(pcp->pc_bitmap, fd)) 1502 continue; 1503 if (pdp->pd_php == NULL) 1504 return (0); 1505 } 1506 return (1); 1507 } 1508 #endif /* DEBUG */ 1509 1510 /* 1511 * resolve the difference between the current poll list and a cached one. 1512 */ 1513 int 1514 pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which) 1515 { 1516 int i; 1517 pollcache_t *pcp = ps->ps_pcache; 1518 pollfd_t *newlist = NULL; 1519 pollfd_t *current = ps->ps_pollfd; 1520 pollfd_t *cached; 1521 pollcacheset_t *pcsp; 1522 int common; 1523 int count = 0; 1524 int offset; 1525 int remain; 1526 int fd; 1527 file_t *fp; 1528 int fdcnt = 0; 1529 int cnt = 0; 1530 nfds_t old_nfds; 1531 int error = 0; 1532 int mismatch = 0; 1533 1534 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1535 #ifdef DEBUG 1536 checkpolldat(ps); 1537 #endif 1538 pcsp = &ps->ps_pcacheset[which]; 1539 old_nfds = pcsp->pcs_nfds; 1540 common = (nfds > old_nfds) ? old_nfds : nfds; 1541 if (nfds != old_nfds) { 1542 /* 1543 * the length of poll list has changed. allocate a new 1544 * pollfd list. 1545 */ 1546 newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); 1547 bcopy(current, newlist, sizeof (pollfd_t) * nfds); 1548 } 1549 /* 1550 * Compare the overlapping part of the current fd list with the 1551 * cached one. Whenever a difference is found, resolve it. 1552 * The comparison is done on the current poll list and the 1553 * cached list. But we may be setting up the newlist to be the 1554 * cached list for next poll. 1555 */ 1556 cached = pcsp->pcs_pollfd; 1557 remain = common; 1558 1559 while (count < common) { 1560 int tmpfd; 1561 pollfd_t *np; 1562 1563 np = (newlist != NULL) ? &newlist[count] : NULL; 1564 offset = pcacheset_cmp(¤t[count], &cached[count], np, 1565 remain); 1566 /* 1567 * Collect stats. If lists are completed the first time, 1568 * it's a hit. Otherwise, it's a partial hit or miss. 1569 */ 1570 if ((count == 0) && (offset == common)) { 1571 pollstats.pollcachehit.value.ui64++; 1572 } else { 1573 mismatch++; 1574 } 1575 count += offset; 1576 if (offset < remain) { 1577 ASSERT(count < common); 1578 ASSERT((current[count].fd != cached[count].fd) || 1579 (current[count].events != cached[count].events)); 1580 /* 1581 * Filter out invalid events. 1582 */ 1583 if (current[count].events & ~VALID_POLL_EVENTS) { 1584 if (newlist != NULL) { 1585 newlist[count].events = 1586 current[count].events &= 1587 VALID_POLL_EVENTS; 1588 } else { 1589 current[count].events &= 1590 VALID_POLL_EVENTS; 1591 } 1592 } 1593 /* 1594 * when resolving a difference, we always remove the 1595 * fd from cache before inserting one into cache. 1596 */ 1597 if (cached[count].fd >= 0) { 1598 tmpfd = cached[count].fd; 1599 if (pcache_delete_fd(ps, tmpfd, count, which, 1600 (uint_t)cached[count].events)) { 1601 /* 1602 * This should be rare but needed for 1603 * correctness. 1604 * 1605 * The first appearance in cached list 1606 * is being "turned off". The same fd 1607 * appear more than once in the cached 1608 * poll list. Find the next one on the 1609 * list and update the cached 1610 * xf_position field. 1611 */ 1612 for (i = count + 1; i < old_nfds; i++) { 1613 if (cached[i].fd == tmpfd) { 1614 pcache_update_xref(pcp, 1615 tmpfd, (ssize_t)i, 1616 which); 1617 break; 1618 } 1619 } 1620 ASSERT(i <= old_nfds); 1621 } 1622 /* 1623 * In case a new cache list is allocated, 1624 * need to keep both cache lists in sync 1625 * b/c the new one can be freed if we have 1626 * an error later. 1627 */ 1628 cached[count].fd = -1; 1629 if (newlist != NULL) { 1630 newlist[count].fd = -1; 1631 } 1632 } 1633 if ((tmpfd = current[count].fd) >= 0) { 1634 /* 1635 * add to the cached fd tbl and bitmap. 1636 */ 1637 if ((fp = getf(tmpfd)) == NULL) { 1638 current[count].revents = POLLNVAL; 1639 if (newlist != NULL) { 1640 newlist[count].fd = -1; 1641 } 1642 cached[count].fd = -1; 1643 fdcnt++; 1644 } else { 1645 /* 1646 * Here we don't care about the 1647 * fdcnt. We will examine the bitmap 1648 * later and pick up the correct 1649 * fdcnt there. So we never bother 1650 * to check value of 'cnt'. 1651 */ 1652 error = pcache_insert(ps, fp, 1653 ¤t[count], &cnt, 1654 (ssize_t)count, which); 1655 /* 1656 * if no error, we want to do releasef 1657 * after we updated cache poll list 1658 * entry so that close() won't race 1659 * us. 1660 */ 1661 if (error) { 1662 /* 1663 * If we encountered an error, 1664 * we have invalidated an 1665 * entry in cached poll list 1666 * (in pcache_delete_fd() above) 1667 * but failed to add one here. 1668 * This is OK b/c what's in the 1669 * cached list is consistent 1670 * with content of cache. 1671 * It will not have any ill 1672 * effect on next poll(). 1673 */ 1674 releasef(tmpfd); 1675 if (newlist != NULL) { 1676 kmem_free(newlist, 1677 nfds * 1678 sizeof (pollfd_t)); 1679 } 1680 return (error); 1681 } 1682 /* 1683 * If we have allocated a new(temp) 1684 * cache list, we need to keep both 1685 * in sync b/c the new one can be freed 1686 * if we have an error later. 1687 */ 1688 if (newlist != NULL) { 1689 newlist[count].fd = 1690 current[count].fd; 1691 newlist[count].events = 1692 current[count].events; 1693 } 1694 cached[count].fd = current[count].fd; 1695 cached[count].events = 1696 current[count].events; 1697 releasef(tmpfd); 1698 } 1699 } else { 1700 current[count].revents = 0; 1701 } 1702 count++; 1703 remain = common - count; 1704 } 1705 } 1706 if (mismatch != 0) { 1707 if (mismatch == common) { 1708 pollstats.pollcachemiss.value.ui64++; 1709 } else { 1710 pollstats.pollcachephit.value.ui64++; 1711 } 1712 } 1713 /* 1714 * take care of the non overlapping part of a list 1715 */ 1716 if (nfds > old_nfds) { 1717 ASSERT(newlist != NULL); 1718 for (i = old_nfds; i < nfds; i++) { 1719 /* filter out invalid events */ 1720 if (current[i].events & ~VALID_POLL_EVENTS) { 1721 newlist[i].events = current[i].events = 1722 current[i].events & VALID_POLL_EVENTS; 1723 } 1724 if ((fd = current[i].fd) < 0) { 1725 current[i].revents = 0; 1726 continue; 1727 } 1728 /* 1729 * add to the cached fd tbl and bitmap. 1730 */ 1731 if ((fp = getf(fd)) == NULL) { 1732 current[i].revents = POLLNVAL; 1733 newlist[i].fd = -1; 1734 fdcnt++; 1735 continue; 1736 } 1737 /* 1738 * Here we don't care about the 1739 * fdcnt. We will examine the bitmap 1740 * later and pick up the correct 1741 * fdcnt there. So we never bother to 1742 * check 'cnt'. 1743 */ 1744 error = pcache_insert(ps, fp, ¤t[i], &cnt, 1745 (ssize_t)i, which); 1746 releasef(fd); 1747 if (error) { 1748 /* 1749 * Here we are half way through adding newly 1750 * polled fd. Undo enough to keep the cache 1751 * list consistent with the cache content. 1752 */ 1753 pcacheset_remove_list(ps, current, old_nfds, 1754 i, which, 0); 1755 kmem_free(newlist, nfds * sizeof (pollfd_t)); 1756 return (error); 1757 } 1758 } 1759 } 1760 if (old_nfds > nfds) { 1761 /* 1762 * remove the fd's which are no longer polled. 1763 */ 1764 pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds, 1765 which, 1); 1766 } 1767 /* 1768 * set difference resolved. update nfds and cachedlist 1769 * in pollstate struct. 1770 */ 1771 if (newlist != NULL) { 1772 kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t)); 1773 /* 1774 * By now, the pollfd.revents field should 1775 * all be zeroed. 1776 */ 1777 pcsp->pcs_pollfd = newlist; 1778 pcsp->pcs_nfds = nfds; 1779 } 1780 ASSERT(*fdcntp == 0); 1781 *fdcntp = fdcnt; 1782 /* 1783 * By now for every fd in pollfdp, one of the following should be 1784 * true. Otherwise we will miss a polled event. 1785 * 1786 * 1. the bit corresponding to the fd in bitmap is set. So VOP_POLL 1787 * will be called on this fd in next poll. 1788 * 2. the fd is cached in the pcache (i.e. pd_php is set). So 1789 * pollnotify will happen. 1790 */ 1791 ASSERT(pollchecksanity(ps, nfds)); 1792 /* 1793 * make sure cross reference between cached poll lists and cached 1794 * poll fds are correct. 1795 */ 1796 ASSERT(pollcheckxref(ps, which)); 1797 /* 1798 * ensure each polldat in pollcache reference a polled fd in 1799 * pollcacheset. 1800 */ 1801 #ifdef DEBUG 1802 checkpolldat(ps); 1803 #endif 1804 return (0); 1805 } 1806 1807 #ifdef DEBUG 1808 static int 1809 pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds) 1810 { 1811 int i; 1812 int reventcnt = 0; 1813 1814 for (i = 0; i < nfds; i++) { 1815 if (pollfdp[i].fd < 0) { 1816 ASSERT(pollfdp[i].revents == 0); 1817 continue; 1818 } 1819 if (pollfdp[i].revents) { 1820 reventcnt++; 1821 } 1822 if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) { 1823 ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd)); 1824 } 1825 } 1826 return (reventcnt); 1827 } 1828 #endif /* DEBUG */ 1829 1830 /* 1831 * read the bitmap and poll on fds corresponding to the '1' bits. The ps_lock 1832 * is held upon entry. 1833 */ 1834 int 1835 pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp, 1836 int which) 1837 { 1838 int i; 1839 pollcache_t *pcp; 1840 int fd; 1841 int begin, end, done; 1842 pollhead_t *php; 1843 int fdcnt; 1844 int error = 0; 1845 file_t *fp; 1846 polldat_t *pdp; 1847 xref_t *refp; 1848 int entry; 1849 1850 pcp = ps->ps_pcache; 1851 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1852 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 1853 retry: 1854 done = 0; 1855 begin = 0; 1856 fdcnt = 0; 1857 end = pcp->pc_mapend; 1858 while ((fdcnt < nfds) && !done) { 1859 php = NULL; 1860 /* 1861 * only poll fds which may have events 1862 */ 1863 fd = bt_getlowbit(pcp->pc_bitmap, begin, end); 1864 ASSERT(fd <= end); 1865 if (fd >= 0) { 1866 ASSERT(pollcheckrevents(ps, begin, fd, which)); 1867 /* 1868 * adjust map pointers for next round 1869 */ 1870 if (fd == end) { 1871 done = 1; 1872 } else { 1873 begin = fd + 1; 1874 } 1875 /* 1876 * A bitmap caches poll state information of 1877 * multiple poll lists. Call VOP_POLL only if 1878 * the bit corresponds to an fd in this poll 1879 * list. 1880 */ 1881 pdp = pcache_lookup_fd(pcp, fd); 1882 ASSERT(pdp != NULL); 1883 ASSERT(pdp->pd_ref != NULL); 1884 refp = &pdp->pd_ref[which]; 1885 if (refp->xf_refcnt == 0) 1886 continue; 1887 entry = refp->xf_position; 1888 ASSERT((entry >= 0) && (entry < nfds)); 1889 ASSERT(pollfdp[entry].fd == fd); 1890 /* 1891 * we are in this routine implies that we have 1892 * successfully polled this fd in the past. 1893 * Check to see this fd is closed while we are 1894 * blocked in poll. This ensures that we don't 1895 * miss a close on the fd in the case this fd is 1896 * reused. 1897 */ 1898 if (pdp->pd_fp == NULL) { 1899 ASSERT(pdp->pd_count > 0); 1900 pollfdp[entry].revents = POLLNVAL; 1901 fdcnt++; 1902 if (refp->xf_refcnt > 1) { 1903 /* 1904 * this fd appeared multiple time 1905 * in the poll list. Find all of them. 1906 */ 1907 for (i = entry + 1; i < nfds; i++) { 1908 if (pollfdp[i].fd == fd) { 1909 pollfdp[i].revents = 1910 POLLNVAL; 1911 fdcnt++; 1912 } 1913 } 1914 } 1915 pcacheset_invalidate(ps, pdp); 1916 continue; 1917 } 1918 /* 1919 * We can be here polling a device that is being 1920 * closed (i.e. the file pointer is set to NULL, 1921 * but pollcacheclean has not happened yet). 1922 */ 1923 if ((fp = getf(fd)) == NULL) { 1924 pollfdp[entry].revents = POLLNVAL; 1925 fdcnt++; 1926 if (refp->xf_refcnt > 1) { 1927 /* 1928 * this fd appeared multiple time 1929 * in the poll list. Find all of them. 1930 */ 1931 for (i = entry + 1; i < nfds; i++) { 1932 if (pollfdp[i].fd == fd) { 1933 pollfdp[i].revents = 1934 POLLNVAL; 1935 fdcnt++; 1936 } 1937 } 1938 } 1939 continue; 1940 } 1941 ASSERT(pdp->pd_fp == fp); 1942 ASSERT(infpollinfo(fd)); 1943 /* 1944 * Since we no longer hold poll head lock across 1945 * VOP_POLL, pollunlock logic can be simplifed. 1946 */ 1947 ASSERT(pdp->pd_php == NULL || 1948 MUTEX_NOT_HELD(PHLOCK(pdp->pd_php))); 1949 /* 1950 * underlying file systems may set a "pollpending" 1951 * flag when it sees the poll may block. Pollwakeup() 1952 * is called by wakeup thread if pollpending is set. 1953 * Pass a 0 fdcnt so that the underlying file system 1954 * will set the "pollpending" flag set when there is 1955 * no polled events. 1956 * 1957 * Use pollfdp[].events for actual polling because 1958 * the pd_events is union of all cached poll events 1959 * on this fd. The events parameter also affects 1960 * how the polled device sets the "poll pending" 1961 * flag. 1962 */ 1963 ASSERT(curthread->t_pollcache == NULL); 1964 error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0, 1965 &pollfdp[entry].revents, &php, NULL); 1966 /* 1967 * releasef after completely done with this cached 1968 * poll entry. To prevent close() coming in to clear 1969 * this entry. 1970 */ 1971 if (error) { 1972 releasef(fd); 1973 break; 1974 } 1975 /* 1976 * layered devices (e.g. console driver) 1977 * may change the vnode and thus the pollhead 1978 * pointer out from underneath us. 1979 */ 1980 if (php != NULL && pdp->pd_php != NULL && 1981 php != pdp->pd_php) { 1982 releasef(fd); 1983 pollhead_delete(pdp->pd_php, pdp); 1984 pdp->pd_php = php; 1985 pollhead_insert(php, pdp); 1986 /* 1987 * We could have missed a wakeup on the new 1988 * target device. Make sure the new target 1989 * gets polled once. 1990 */ 1991 BT_SET(pcp->pc_bitmap, fd); 1992 goto retry; 1993 } 1994 1995 if (pollfdp[entry].revents) { 1996 ASSERT(refp->xf_refcnt >= 1); 1997 fdcnt++; 1998 if (refp->xf_refcnt > 1) { 1999 /* 2000 * this fd appeared multiple time 2001 * in the poll list. This is rare but 2002 * we have to look at all of them for 2003 * correctness. 2004 */ 2005 error = plist_chkdupfd(fp, pdp, ps, 2006 pollfdp, entry, &fdcnt); 2007 if (error > 0) { 2008 releasef(fd); 2009 break; 2010 } 2011 if (error < 0) { 2012 goto retry; 2013 } 2014 } 2015 releasef(fd); 2016 } else { 2017 /* 2018 * VOP_POLL didn't return any revents. We can 2019 * clear the bit in bitmap only if we have the 2020 * pollhead ptr cached and no other cached 2021 * entry is polling different events on this fd. 2022 * VOP_POLL may have dropped the ps_lock. Make 2023 * sure pollwakeup has not happened before clear 2024 * the bit. 2025 */ 2026 if ((pdp->pd_php != NULL) && 2027 (pollfdp[entry].events == pdp->pd_events) && 2028 ((pcp->pc_flag & PC_POLLWAKE) == 0)) { 2029 BT_CLEAR(pcp->pc_bitmap, fd); 2030 } 2031 /* 2032 * if the fd can be cached now but not before, 2033 * do it now. 2034 */ 2035 if ((pdp->pd_php == NULL) && (php != NULL)) { 2036 pdp->pd_php = php; 2037 pollhead_insert(php, pdp); 2038 /* 2039 * We are inserting a polldat struct for 2040 * the first time. We may have missed a 2041 * wakeup on this device. Re-poll once. 2042 * This should be a rare event. 2043 */ 2044 releasef(fd); 2045 goto retry; 2046 } 2047 if (refp->xf_refcnt > 1) { 2048 /* 2049 * this fd appeared multiple time 2050 * in the poll list. This is rare but 2051 * we have to look at all of them for 2052 * correctness. 2053 */ 2054 error = plist_chkdupfd(fp, pdp, ps, 2055 pollfdp, entry, &fdcnt); 2056 if (error > 0) { 2057 releasef(fd); 2058 break; 2059 } 2060 if (error < 0) { 2061 goto retry; 2062 } 2063 } 2064 releasef(fd); 2065 } 2066 } else { 2067 done = 1; 2068 ASSERT(pollcheckrevents(ps, begin, end + 1, which)); 2069 } 2070 } 2071 if (!error) { 2072 ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds)); 2073 *fdcntp += fdcnt; 2074 } 2075 return (error); 2076 } 2077 2078 /* 2079 * Going through the poll list without much locking. Poll all fds and 2080 * cache all valid fds in the pollcache. 2081 */ 2082 int 2083 pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which) 2084 { 2085 pollfd_t *pollfdp = ps->ps_pollfd; 2086 pollcacheset_t *pcacheset = ps->ps_pcacheset; 2087 pollfd_t *newfdlist; 2088 int i; 2089 int fd; 2090 file_t *fp; 2091 int error = 0; 2092 2093 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2094 ASSERT(which < ps->ps_nsets); 2095 ASSERT(pcacheset != NULL); 2096 ASSERT(pcacheset[which].pcs_pollfd == NULL); 2097 newfdlist = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP); 2098 /* 2099 * cache the new poll list in pollcachset. 2100 */ 2101 bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds); 2102 2103 pcacheset[which].pcs_pollfd = newfdlist; 2104 pcacheset[which].pcs_nfds = ps->ps_nfds; 2105 pcacheset[which].pcs_usradr = (uintptr_t)fds; 2106 2107 /* 2108 * We have saved a copy of current poll fd list in one pollcacheset. 2109 * The 'revents' field of the new list is not yet set to 0. Loop 2110 * through the new list just to do that is expensive. We do that 2111 * while polling the list. 2112 */ 2113 for (i = 0; i < ps->ps_nfds; i++) { 2114 fd = pollfdp[i].fd; 2115 /* 2116 * We also filter out the illegal poll events in the event 2117 * field for the cached poll list/set. 2118 */ 2119 if (pollfdp[i].events & ~VALID_POLL_EVENTS) { 2120 newfdlist[i].events = pollfdp[i].events = 2121 pollfdp[i].events & VALID_POLL_EVENTS; 2122 } 2123 if (fd < 0) { 2124 pollfdp[i].revents = 0; 2125 continue; 2126 } 2127 if ((fp = getf(fd)) == NULL) { 2128 pollfdp[i].revents = POLLNVAL; 2129 /* 2130 * invalidate this cache entry in the cached poll list 2131 */ 2132 newfdlist[i].fd = -1; 2133 (*fdcntp)++; 2134 continue; 2135 } 2136 /* 2137 * cache this fd. 2138 */ 2139 error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i, 2140 which); 2141 releasef(fd); 2142 if (error) { 2143 /* 2144 * Here we are half way through caching a new 2145 * poll list. Undo every thing. 2146 */ 2147 pcacheset_remove_list(ps, pollfdp, 0, i, which, 0); 2148 kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t)); 2149 pcacheset[which].pcs_pollfd = NULL; 2150 pcacheset[which].pcs_usradr = NULL; 2151 break; 2152 } 2153 } 2154 return (error); 2155 } 2156 2157 /* 2158 * called by pollcacheclean() to set the fp NULL. It also sets polled events 2159 * in pcacheset entries to a special events 'POLLCLOSED'. Do a pollwakeup to 2160 * wake any sleeping poller, then remove the polldat from the driver. 2161 * The routine is called with ps_pcachelock held. 2162 */ 2163 void 2164 pcache_clean_entry(pollstate_t *ps, int fd) 2165 { 2166 pollcache_t *pcp; 2167 polldat_t *pdp; 2168 int i; 2169 2170 ASSERT(ps != NULL); 2171 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2172 pcp = ps->ps_pcache; 2173 ASSERT(pcp); 2174 pdp = pcache_lookup_fd(pcp, fd); 2175 ASSERT(pdp != NULL); 2176 /* 2177 * the corresponding fpollinfo in fi_list has been removed by 2178 * a close on this fd. Reset the cached fp ptr here. 2179 */ 2180 pdp->pd_fp = NULL; 2181 /* 2182 * XXX - This routine also touches data in pcacheset struct. 2183 * 2184 * set the event in cached poll lists to POLLCLOSED. This invalidate 2185 * the cached poll fd entry in that poll list, which will force a 2186 * removal of this cached entry in next poll(). The cleanup is done 2187 * at the removal time. 2188 */ 2189 ASSERT(pdp->pd_ref != NULL); 2190 for (i = 0; i < ps->ps_nsets; i++) { 2191 xref_t *refp; 2192 pollcacheset_t *pcsp; 2193 2194 refp = &pdp->pd_ref[i]; 2195 if (refp->xf_refcnt) { 2196 ASSERT(refp->xf_position >= 0); 2197 pcsp = &ps->ps_pcacheset[i]; 2198 if (refp->xf_refcnt == 1) { 2199 pcsp->pcs_pollfd[refp->xf_position].events = 2200 (short)POLLCLOSED; 2201 } 2202 if (refp->xf_refcnt > 1) { 2203 int j; 2204 /* 2205 * mark every matching entry in pcs_pollfd 2206 */ 2207 for (j = refp->xf_position; 2208 j < pcsp->pcs_nfds; j++) { 2209 if (pcsp->pcs_pollfd[j].fd == fd) { 2210 pcsp->pcs_pollfd[j].events = 2211 (short)POLLCLOSED; 2212 } 2213 } 2214 } 2215 } 2216 } 2217 if (pdp->pd_php) { 2218 pollwakeup(pdp->pd_php, POLLHUP); 2219 pollhead_delete(pdp->pd_php, pdp); 2220 pdp->pd_php = NULL; 2221 } 2222 } 2223 2224 /* 2225 * This is the first time this thread has ever polled, 2226 * so we have to create its pollstate structure. 2227 * This will persist for the life of the thread, 2228 * until it calls pollcleanup(). 2229 */ 2230 pollstate_t * 2231 pollstate_create(void) 2232 { 2233 pollstate_t *ps; 2234 2235 ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP); 2236 ps->ps_nsets = POLLFDSETS; 2237 ps->ps_pcacheset = pcacheset_create(ps->ps_nsets); 2238 return (ps); 2239 } 2240 2241 void 2242 pollstate_destroy(pollstate_t *ps) 2243 { 2244 if (ps->ps_pollfd != NULL) { 2245 kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t)); 2246 ps->ps_pollfd = NULL; 2247 } 2248 if (ps->ps_pcache != NULL) { 2249 pcache_destroy(ps->ps_pcache); 2250 ps->ps_pcache = NULL; 2251 } 2252 pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets); 2253 ps->ps_pcacheset = NULL; 2254 if (ps->ps_dpbuf != NULL) { 2255 kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); 2256 ps->ps_dpbuf = NULL; 2257 } 2258 mutex_destroy(&ps->ps_lock); 2259 kmem_free(ps, sizeof (pollstate_t)); 2260 } 2261 2262 /* 2263 * We are holding the appropriate uf_lock entering this routine. 2264 * Bump up the ps_busy count to prevent the thread from exiting. 2265 */ 2266 void 2267 pollblockexit(fpollinfo_t *fpip) 2268 { 2269 for (; fpip; fpip = fpip->fp_next) { 2270 pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache; 2271 2272 mutex_enter(&pcp->pc_no_exit); 2273 pcp->pc_busy++; /* prevents exit()'s */ 2274 mutex_exit(&pcp->pc_no_exit); 2275 } 2276 } 2277 2278 /* 2279 * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to mark 2280 * the pcacheset events field POLLCLOSED to force the next poll() to remove 2281 * this cache entry. We can't clean the polldat entry clean up here because 2282 * lwp block in poll() needs the info to return. Wakeup anyone blocked in 2283 * poll and let exiting lwp go. No lock is help upon entry. So it's OK for 2284 * pcache_clean_entry to call pollwakeup(). 2285 */ 2286 void 2287 pollcacheclean(fpollinfo_t *fip, int fd) 2288 { 2289 struct fpollinfo *fpip, *fpip2; 2290 2291 fpip = fip; 2292 while (fpip) { 2293 pollstate_t *ps = fpip->fp_thread->t_pollstate; 2294 pollcache_t *pcp = ps->ps_pcache; 2295 2296 mutex_enter(&ps->ps_lock); 2297 pcache_clean_entry(ps, fd); 2298 mutex_exit(&ps->ps_lock); 2299 mutex_enter(&pcp->pc_no_exit); 2300 pcp->pc_busy--; 2301 if (pcp->pc_busy == 0) { 2302 /* 2303 * Wakeup the thread waiting in 2304 * thread_exit(). 2305 */ 2306 cv_signal(&pcp->pc_busy_cv); 2307 } 2308 mutex_exit(&pcp->pc_no_exit); 2309 2310 fpip2 = fpip; 2311 fpip = fpip->fp_next; 2312 kmem_free(fpip2, sizeof (fpollinfo_t)); 2313 } 2314 } 2315 2316 /* 2317 * one of the cache line's counter is wrapping around. Reset all cache line 2318 * counters to zero except one. This is simplistic, but probably works 2319 * effectively. 2320 */ 2321 void 2322 pcacheset_reset_count(pollstate_t *ps, int index) 2323 { 2324 int i; 2325 2326 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2327 for (i = 0; i < ps->ps_nsets; i++) { 2328 if (ps->ps_pcacheset[i].pcs_pollfd != NULL) { 2329 ps->ps_pcacheset[i].pcs_count = 0; 2330 } 2331 } 2332 ps->ps_pcacheset[index].pcs_count = 1; 2333 } 2334 2335 /* 2336 * this routine implements poll cache list replacement policy. 2337 * It is currently choose the "least used". 2338 */ 2339 int 2340 pcacheset_replace(pollstate_t *ps) 2341 { 2342 int i; 2343 int index = 0; 2344 2345 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2346 for (i = 1; i < ps->ps_nsets; i++) { 2347 if (ps->ps_pcacheset[index].pcs_count > 2348 ps->ps_pcacheset[i].pcs_count) { 2349 index = i; 2350 } 2351 } 2352 ps->ps_pcacheset[index].pcs_count = 0; 2353 return (index); 2354 } 2355 2356 /* 2357 * this routine is called by strclose to remove remaining polldat struct on 2358 * the pollhead list of the device being closed. There are two reasons as why 2359 * the polldat structures still remain on the pollhead list: 2360 * 2361 * (1) The layered device(e.g.the console driver). 2362 * In this case, the existence of a polldat implies that the thread putting 2363 * the polldat on this list has not exited yet. Before the thread exits, it 2364 * will have to hold this pollhead lock to remove the polldat. So holding the 2365 * pollhead lock here effectively prevents the thread which put the polldat 2366 * on this list from exiting. 2367 * 2368 * (2) /dev/poll. 2369 * When a polled fd is cached in /dev/poll, its polldat will remain on the 2370 * pollhead list if the process has not done a POLLREMOVE before closing the 2371 * polled fd. We just unlink it here. 2372 */ 2373 void 2374 pollhead_clean(pollhead_t *php) 2375 { 2376 polldat_t *pdp; 2377 2378 /* 2379 * In case(1), while we must prevent the thread in question from 2380 * exiting, we must also obey the proper locking order, i.e. 2381 * (ps_lock -> phlock). 2382 */ 2383 PH_ENTER(php); 2384 while (php->ph_list != NULL) { 2385 pollstate_t *ps; 2386 pollcache_t *pcp; 2387 2388 pdp = php->ph_list; 2389 ASSERT(pdp->pd_php == php); 2390 if (pdp->pd_thread == NULL) { 2391 /* 2392 * This is case(2). Since the ph_lock is sufficient 2393 * to synchronize this lwp with any other /dev/poll 2394 * lwp, just unlink the polldat. 2395 */ 2396 php->ph_list = pdp->pd_next; 2397 pdp->pd_php = NULL; 2398 pdp->pd_next = NULL; 2399 continue; 2400 } 2401 ps = pdp->pd_thread->t_pollstate; 2402 ASSERT(ps != NULL); 2403 pcp = pdp->pd_pcache; 2404 ASSERT(pcp != NULL); 2405 mutex_enter(&pcp->pc_no_exit); 2406 pcp->pc_busy++; /* prevents exit()'s */ 2407 mutex_exit(&pcp->pc_no_exit); 2408 /* 2409 * Now get the locks in proper order to avoid deadlock. 2410 */ 2411 PH_EXIT(php); 2412 mutex_enter(&ps->ps_lock); 2413 /* 2414 * while we dropped the pollhead lock, the element could be 2415 * taken off the list already. 2416 */ 2417 PH_ENTER(php); 2418 if (pdp->pd_php == php) { 2419 ASSERT(pdp == php->ph_list); 2420 php->ph_list = pdp->pd_next; 2421 pdp->pd_php = NULL; 2422 pdp->pd_next = NULL; 2423 } 2424 PH_EXIT(php); 2425 mutex_exit(&ps->ps_lock); 2426 mutex_enter(&pcp->pc_no_exit); 2427 pcp->pc_busy--; 2428 if (pcp->pc_busy == 0) { 2429 /* 2430 * Wakeup the thread waiting in 2431 * thread_exit(). 2432 */ 2433 cv_signal(&pcp->pc_busy_cv); 2434 } 2435 mutex_exit(&pcp->pc_no_exit); 2436 PH_ENTER(php); 2437 } 2438 PH_EXIT(php); 2439 } 2440 2441 /* 2442 * The remove_list is called to cleanup a partially cached 'current' list or 2443 * to remove a partial list which is no longer cached. The flag value of 1 2444 * indicates the second case. 2445 */ 2446 void 2447 pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end, 2448 int cacheindex, int flag) 2449 { 2450 int i; 2451 2452 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2453 for (i = start; i < end; i++) { 2454 if ((pollfdp[i].fd >= 0) && 2455 (flag || !(pollfdp[i].revents & POLLNVAL))) { 2456 if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex, 2457 (uint_t)pollfdp[i].events)) { 2458 int j; 2459 int fd = pollfdp[i].fd; 2460 2461 for (j = i + 1; j < end; j++) { 2462 if (pollfdp[j].fd == fd) { 2463 pcache_update_xref( 2464 ps->ps_pcache, fd, 2465 (ssize_t)j, cacheindex); 2466 break; 2467 } 2468 } 2469 ASSERT(j <= end); 2470 } 2471 } 2472 } 2473 } 2474 2475 #ifdef DEBUG 2476 2477 #include<sys/strsubr.h> 2478 /* 2479 * make sure curthread is not on anyone's pollhead list any more. 2480 */ 2481 static void 2482 pollcheckphlist() 2483 { 2484 int i; 2485 file_t *fp; 2486 uf_entry_t *ufp; 2487 uf_info_t *fip = P_FINFO(curproc); 2488 struct stdata *stp; 2489 polldat_t *pdp; 2490 2491 mutex_enter(&fip->fi_lock); 2492 for (i = 0; i < fip->fi_nfiles; i++) { 2493 UF_ENTER(ufp, fip, i); 2494 if ((fp = ufp->uf_file) != NULL) { 2495 if ((stp = fp->f_vnode->v_stream) != NULL) { 2496 PH_ENTER(&stp->sd_pollist); 2497 pdp = stp->sd_pollist.ph_list; 2498 while (pdp) { 2499 ASSERT(pdp->pd_thread != curthread); 2500 pdp = pdp->pd_next; 2501 } 2502 PH_EXIT(&stp->sd_pollist); 2503 } 2504 } 2505 UF_EXIT(ufp); 2506 } 2507 mutex_exit(&fip->fi_lock); 2508 } 2509 2510 /* 2511 * for resolved set poll list, the xref info in the pcache should be 2512 * consistent with this poll list. 2513 */ 2514 static int 2515 pollcheckxref(pollstate_t *ps, int cacheindex) 2516 { 2517 pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd; 2518 pollcache_t *pcp = ps->ps_pcache; 2519 polldat_t *pdp; 2520 int i; 2521 xref_t *refp; 2522 2523 for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) { 2524 if (pollfdp[i].fd < 0) { 2525 continue; 2526 } 2527 pdp = pcache_lookup_fd(pcp, pollfdp[i].fd); 2528 ASSERT(pdp != NULL); 2529 ASSERT(pdp->pd_ref != NULL); 2530 refp = &pdp->pd_ref[cacheindex]; 2531 if (refp->xf_position >= 0) { 2532 ASSERT(refp->xf_refcnt >= 1); 2533 ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd); 2534 if (refp->xf_refcnt > 1) { 2535 int j; 2536 int count = 0; 2537 2538 for (j = refp->xf_position; 2539 j < ps->ps_pcacheset[cacheindex].pcs_nfds; 2540 j++) { 2541 if (pollfdp[j].fd == pdp->pd_fd) { 2542 count++; 2543 } 2544 } 2545 ASSERT(count == refp->xf_refcnt); 2546 } 2547 } 2548 } 2549 return (1); 2550 } 2551 2552 /* 2553 * For every cached pollfd, its polldat struct should be consistent with 2554 * what is in the pcacheset lists. 2555 */ 2556 static void 2557 checkpolldat(pollstate_t *ps) 2558 { 2559 pollcache_t *pcp = ps->ps_pcache; 2560 polldat_t **hashtbl; 2561 int i; 2562 2563 hashtbl = pcp->pc_hash; 2564 for (i = 0; i < pcp->pc_hashsize; i++) { 2565 polldat_t *pdp; 2566 2567 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 2568 ASSERT(pdp->pd_ref != NULL); 2569 if (pdp->pd_count > 0) { 2570 xref_t *refp; 2571 int j; 2572 pollcacheset_t *pcsp; 2573 pollfd_t *pollfd; 2574 2575 for (j = 0; j < ps->ps_nsets; j++) { 2576 refp = &pdp->pd_ref[j]; 2577 if (refp->xf_refcnt > 0) { 2578 pcsp = &ps->ps_pcacheset[j]; 2579 ASSERT(refp->xf_position < pcsp->pcs_nfds); 2580 pollfd = pcsp->pcs_pollfd; 2581 ASSERT(pdp->pd_fd == pollfd[refp->xf_position].fd); 2582 } 2583 } 2584 } 2585 } 2586 } 2587 } 2588 2589 /* 2590 * every wfd element on ph_list must have a corresponding fpollinfo on the 2591 * uf_fpollinfo list. This is a variation of infpollinfo() w/o holding locks. 2592 */ 2593 void 2594 checkwfdlist(vnode_t *vp, fpollinfo_t *fpip) 2595 { 2596 stdata_t *stp; 2597 polldat_t *pdp; 2598 fpollinfo_t *fpip2; 2599 2600 if ((stp = vp->v_stream) == NULL) { 2601 return; 2602 } 2603 PH_ENTER(&stp->sd_pollist); 2604 for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) { 2605 if (pdp->pd_thread != NULL && 2606 pdp->pd_thread->t_procp == curthread->t_procp) { 2607 for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) { 2608 if (pdp->pd_thread == fpip2->fp_thread) { 2609 break; 2610 } 2611 } 2612 ASSERT(fpip2 != NULL); 2613 } 2614 } 2615 PH_EXIT(&stp->sd_pollist); 2616 } 2617 2618 /* 2619 * For each cached fd whose bit is not set in bitmap, its revents field in 2620 * current poll list should be 0. 2621 */ 2622 static int 2623 pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex) 2624 { 2625 pollcache_t *pcp = ps->ps_pcache; 2626 pollfd_t *pollfdp = ps->ps_pollfd; 2627 int i; 2628 2629 for (i = begin; i < end; i++) { 2630 polldat_t *pdp; 2631 2632 ASSERT(!BT_TEST(pcp->pc_bitmap, i)); 2633 pdp = pcache_lookup_fd(pcp, i); 2634 if (pdp && pdp->pd_fp != NULL) { 2635 xref_t *refp; 2636 int entry; 2637 2638 ASSERT(pdp->pd_ref != NULL); 2639 refp = &pdp->pd_ref[cacheindex]; 2640 if (refp->xf_refcnt == 0) { 2641 continue; 2642 } 2643 entry = refp->xf_position; 2644 ASSERT(entry >= 0); 2645 ASSERT(pollfdp[entry].revents == 0); 2646 if (refp->xf_refcnt > 1) { 2647 int j; 2648 2649 for (j = entry + 1; j < ps->ps_nfds; j++) { 2650 if (pollfdp[j].fd == i) { 2651 ASSERT(pollfdp[j].revents == 0); 2652 } 2653 } 2654 } 2655 } 2656 } 2657 return (1); 2658 } 2659 2660 #endif /* DEBUG */ 2661 2662 pollcache_t * 2663 pcache_alloc() 2664 { 2665 return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP)); 2666 } 2667 2668 void 2669 pcache_create(pollcache_t *pcp, nfds_t nfds) 2670 { 2671 size_t mapsize; 2672 2673 /* 2674 * allocate enough bits for the poll fd list 2675 */ 2676 if ((mapsize = POLLMAPCHUNK) <= nfds) { 2677 mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1); 2678 } 2679 pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t), 2680 KM_SLEEP); 2681 pcp->pc_mapsize = mapsize; 2682 /* 2683 * The hash size is at least POLLHASHCHUNKSZ. If user polls a large 2684 * number of fd to start with, allocate a bigger hash table (to the 2685 * nearest multiple of POLLHASHCHUNKSZ) because dynamically growing a 2686 * hash table is expensive. 2687 */ 2688 if (nfds < POLLHASHCHUNKSZ) { 2689 pcp->pc_hashsize = POLLHASHCHUNKSZ; 2690 } else { 2691 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 2692 ~(POLLHASHCHUNKSZ - 1); 2693 } 2694 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 2695 KM_SLEEP); 2696 } 2697 2698 void 2699 pcache_destroy(pollcache_t *pcp) 2700 { 2701 polldat_t **hashtbl; 2702 int i; 2703 2704 hashtbl = pcp->pc_hash; 2705 for (i = 0; i < pcp->pc_hashsize; i++) { 2706 if (hashtbl[i] != NULL) { 2707 polldat_t *pdp, *pdp2; 2708 2709 pdp = hashtbl[i]; 2710 while (pdp != NULL) { 2711 pdp2 = pdp->pd_hashnext; 2712 if (pdp->pd_ref != NULL) { 2713 kmem_free(pdp->pd_ref, sizeof (xref_t) * 2714 pdp->pd_nsets); 2715 } 2716 kmem_free(pdp, sizeof (polldat_t)); 2717 pdp = pdp2; 2718 pcp->pc_fdcount--; 2719 } 2720 } 2721 } 2722 ASSERT(pcp->pc_fdcount == 0); 2723 kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize); 2724 kmem_free(pcp->pc_bitmap, 2725 sizeof (ulong_t) * (pcp->pc_mapsize/BT_NBIPUL)); 2726 mutex_destroy(&pcp->pc_no_exit); 2727 mutex_destroy(&pcp->pc_lock); 2728 cv_destroy(&pcp->pc_cv); 2729 cv_destroy(&pcp->pc_busy_cv); 2730 kmem_free(pcp, sizeof (pollcache_t)); 2731 } 2732 2733 pollcacheset_t * 2734 pcacheset_create(int nsets) 2735 { 2736 return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP)); 2737 } 2738 2739 void 2740 pcacheset_destroy(pollcacheset_t *pcsp, int nsets) 2741 { 2742 int i; 2743 2744 for (i = 0; i < nsets; i++) { 2745 if (pcsp[i].pcs_pollfd != NULL) { 2746 kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds * 2747 sizeof (pollfd_t)); 2748 } 2749 } 2750 kmem_free(pcsp, sizeof (pollcacheset_t) * nsets); 2751 } 2752 2753 /* 2754 * Check each duplicated poll fd in the poll list. It may be necessary to 2755 * VOP_POLL the same fd again using different poll events. getf() has been 2756 * done by caller. This routine returns 0 if it can sucessfully process the 2757 * entire poll fd list. It returns -1 if underlying vnode has changed during 2758 * a VOP_POLL, in which case the caller has to repoll. It returns a positive 2759 * value if VOP_POLL failed. 2760 */ 2761 static int 2762 plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp, 2763 int entry, int *fdcntp) 2764 { 2765 int i; 2766 int fd; 2767 nfds_t nfds = psp->ps_nfds; 2768 2769 fd = pollfdp[entry].fd; 2770 for (i = entry + 1; i < nfds; i++) { 2771 if (pollfdp[i].fd == fd) { 2772 if (pollfdp[i].events == pollfdp[entry].events) { 2773 if ((pollfdp[i].revents = 2774 pollfdp[entry].revents) != 0) { 2775 (*fdcntp)++; 2776 } 2777 } else { 2778 2779 int error; 2780 pollhead_t *php; 2781 pollcache_t *pcp = psp->ps_pcache; 2782 2783 /* 2784 * the events are different. VOP_POLL on this 2785 * fd so that we don't miss any revents. 2786 */ 2787 php = NULL; 2788 ASSERT(curthread->t_pollcache == NULL); 2789 error = VOP_POLL(fp->f_vnode, 2790 pollfdp[i].events, 0, 2791 &pollfdp[i].revents, &php, NULL); 2792 if (error) { 2793 return (error); 2794 } 2795 /* 2796 * layered devices(e.g. console driver) 2797 * may change the vnode and thus the pollhead 2798 * pointer out from underneath us. 2799 */ 2800 if (php != NULL && pdp->pd_php != NULL && 2801 php != pdp->pd_php) { 2802 pollhead_delete(pdp->pd_php, pdp); 2803 pdp->pd_php = php; 2804 pollhead_insert(php, pdp); 2805 /* 2806 * We could have missed a wakeup on the 2807 * new target device. Make sure the new 2808 * target gets polled once. 2809 */ 2810 BT_SET(pcp->pc_bitmap, fd); 2811 return (-1); 2812 } 2813 if (pollfdp[i].revents) { 2814 (*fdcntp)++; 2815 } 2816 } 2817 } 2818 } 2819 return (0); 2820 } 2821