1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * Copyright (c) 2012 by Delphix. All rights reserved. 32 */ 33 34 /* 35 * Portions of this source code were derived from Berkeley 4.3 BSD 36 * under license from the Regents of the University of California. 37 */ 38 39 #include <sys/param.h> 40 #include <sys/isa_defs.h> 41 #include <sys/types.h> 42 #include <sys/sysmacros.h> 43 #include <sys/user.h> 44 #include <sys/systm.h> 45 #include <sys/errno.h> 46 #include <sys/time.h> 47 #include <sys/vnode.h> 48 #include <sys/file.h> 49 #include <sys/mode.h> 50 #include <sys/proc.h> 51 #include <sys/uio.h> 52 #include <sys/poll_impl.h> 53 #include <sys/kmem.h> 54 #include <sys/cmn_err.h> 55 #include <sys/debug.h> 56 #include <sys/bitmap.h> 57 #include <sys/kstat.h> 58 #include <sys/rctl.h> 59 #include <sys/port_impl.h> 60 #include <sys/schedctl.h> 61 #include <sys/cpu.h> 62 #include <sys/random.h> 63 64 #define NPHLOCKS 64 /* Number of locks; must be power of 2 */ 65 #define PHLOCKADDR(php) &plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)] 66 #define PHLOCK(php) PHLOCKADDR(php).pp_lock 67 #define PH_ENTER(php) mutex_enter(PHLOCK(php)) 68 #define PH_EXIT(php) mutex_exit(PHLOCK(php)) 69 #define VALID_POLL_EVENTS (POLLIN | POLLPRI | POLLOUT | POLLRDNORM \ 70 | POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL) 71 72 /* settable in /etc/system */ 73 uint32_t randomize_pollwakeup = 0; 74 75 /* 76 * global counters to collect some stats 77 */ 78 static struct { 79 kstat_named_t polllistmiss; /* failed to find a cached poll list */ 80 kstat_named_t pollcachehit; /* list matched 100% w/ cached one */ 81 kstat_named_t pollcachephit; /* list matched < 100% w/ cached one */ 82 kstat_named_t pollcachemiss; /* every list entry is dif from cache */ 83 } pollstats = { 84 { "polllistmiss", KSTAT_DATA_UINT64 }, 85 { "pollcachehit", KSTAT_DATA_UINT64 }, 86 { "pollcachephit", KSTAT_DATA_UINT64 }, 87 { "pollcachemiss", KSTAT_DATA_UINT64 } 88 }; 89 90 kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats; 91 uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t); 92 93 struct pplock { 94 kmutex_t pp_lock; 95 short pp_flag; 96 kcondvar_t pp_wait_cv; 97 int32_t pp_pad; /* to a nice round 16 bytes */ 98 }; 99 100 static struct pplock plocks[NPHLOCKS]; /* Hash array of pollhead locks */ 101 102 #ifdef DEBUG 103 static int pollchecksanity(pollstate_t *, nfds_t); 104 static int pollcheckxref(pollstate_t *, int); 105 static void pollcheckphlist(void); 106 static int pollcheckrevents(pollstate_t *, int, int, int); 107 static void checkpolldat(pollstate_t *); 108 #endif /* DEBUG */ 109 static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int, 110 int *); 111 112 /* 113 * Data structure overview: 114 * The per-thread poll state consists of 115 * one pollstate_t 116 * one pollcache_t 117 * one bitmap with one event bit per fd 118 * a (two-dimensional) hashed array of polldat_t structures - one entry 119 * per fd 120 * 121 * This conglomerate of data structures interact with 122 * the pollhead which is used by VOP_POLL and pollwakeup 123 * (protected by the PHLOCK, cached array of plocks), and 124 * the fpollinfo list hanging off the fi_list which is used to notify 125 * poll when a cached fd is closed. This is protected by uf_lock. 126 * 127 * Invariants: 128 * pd_php (pollhead pointer) is set iff (if and only if) the polldat 129 * is on that pollhead. This is modified atomically under pc_lock. 130 * 131 * pd_fp (file_t pointer) is set iff the thread is on the fpollinfo 132 * list for that open file. 133 * This is modified atomically under pc_lock. 134 * 135 * pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt. 136 * Iff pd_ref[i].xf_refcnt >= 1 then 137 * ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd 138 * Iff pd_ref[i].xf_refcnt > 1 then 139 * In ps_pcacheset[i].pcs_pollfd between index 140 * pd_ref[i].xf_position] and the end of the list 141 * there are xf_refcnt entries with .fd == pd_fd 142 * 143 * Locking design: 144 * Whenever possible the design relies on the fact that the poll cache state 145 * is per thread thus for both poll and exit it is self-synchronizing. 146 * Thus the key interactions where other threads access the state are: 147 * pollwakeup (and polltime), and 148 * close cleaning up the cached references to an open file 149 * 150 * The two key locks in poll proper is ps_lock and pc_lock. 151 * 152 * The ps_lock is used for synchronization between poll, (lwp_)exit and close 153 * to ensure that modifications to pollcacheset structure are serialized. 154 * This lock is held through most of poll() except where poll sleeps 155 * since there is little need to handle closes concurrently with the execution 156 * of poll. 157 * The pc_lock protects most of the fields in pollcache structure and polldat 158 * structures (which are accessed by poll, pollwakeup, and polltime) 159 * with the exception of fields that are only modified when only one thread 160 * can access this per-thread state. 161 * Those exceptions occur in poll when first allocating the per-thread state, 162 * when poll grows the number of polldat (never shrinks), and when 163 * exit/pollcleanup has ensured that there are no references from either 164 * pollheads or fpollinfo to the threads poll state. 165 * 166 * Poll(2) system call is the only path which ps_lock and pc_lock are both 167 * held, in that order. It needs ps_lock to synchronize with close and 168 * lwp_exit; and pc_lock with pollwakeup. 169 * 170 * The locking interaction between pc_lock and PHLOCK take into account 171 * that poll acquires these locks in the order of pc_lock and then PHLOCK 172 * while pollwakeup does it in the reverse order. Thus pollwakeup implements 173 * deadlock avoidance by dropping the locks and reacquiring them in the 174 * reverse order. For this to work pollwakeup needs to prevent the thread 175 * from exiting and freeing all of the poll related state. Thus is done 176 * using 177 * the pc_no_exit lock 178 * the pc_busy counter 179 * the pc_busy_cv condition variable 180 * 181 * The locking interaction between pc_lock and uf_lock has similar 182 * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef 183 * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock 184 * to prevent poll or exit from doing a delfpollinfo after which the thread 185 * might exit. But the cleanup needs to acquire pc_lock when modifying 186 * the poll cache state. The solution is to use pc_busy and do the close 187 * cleanup in two phases: 188 * First close calls pollblockexit which increments pc_busy. 189 * This prevents the per-thread poll related state from being freed. 190 * Then close drops uf_lock and calls pollcacheclean. 191 * This routine can then acquire pc_lock and remove any references 192 * to the closing fd (as well as recording that it has been closed 193 * so that a POLLNVAL can be generated even if the fd is reused before 194 * poll has been woken up and checked getf() again). 195 * 196 * When removing a polled fd from poll cache, the fd is always removed 197 * from pollhead list first and then from fpollinfo list, i.e., 198 * pollhead_delete() is called before delfpollinfo(). 199 * 200 * 201 * Locking hierarchy: 202 * pc_no_exit is a leaf level lock. 203 * ps_lock is held when acquiring pc_lock (except when pollwakeup 204 * acquires pc_lock). 205 * pc_lock might be held when acquiring PHLOCK (pollhead_insert/ 206 * pollhead_delete) 207 * pc_lock is always held (but this is not required) 208 * when acquiring PHLOCK (in polladd/pollhead_delete and pollwakeup called 209 * from pcache_clean_entry). 210 * pc_lock is held across addfpollinfo/delfpollinfo which acquire 211 * uf_lock. 212 * pc_lock is held across getf/releasef which acquire uf_lock. 213 * ps_lock might be held across getf/releasef which acquire uf_lock. 214 * pollwakeup tries to acquire pc_lock while holding PHLOCK 215 * but drops the locks and reacquire them in reverse order to avoid 216 * deadlock. 217 * 218 * Note also that there is deadlock avoidance support for VOP_POLL routines 219 * and pollwakeup involving a file system or driver lock. 220 * See below. 221 */ 222 223 /* 224 * Deadlock avoidance support for VOP_POLL() routines. This is 225 * sometimes necessary to prevent deadlock between polling threads 226 * (which hold poll locks on entry to xx_poll(), then acquire foo) 227 * and pollwakeup() threads (which hold foo, then acquire poll locks). 228 * 229 * pollunlock(void) releases whatever poll locks the current thread holds, 230 * returning a cookie for use by pollrelock(); 231 * 232 * pollrelock(cookie) reacquires previously dropped poll locks; 233 * 234 * polllock(php, mutex) does the common case: pollunlock(), 235 * acquire the problematic mutex, pollrelock(). 236 */ 237 int 238 pollunlock(void) 239 { 240 pollcache_t *pcp; 241 int lockstate = 0; 242 243 /* 244 * t_pollcache is set by /dev/poll and event ports (port_fd.c). 245 * If the pollrelock/pollunlock is called as a result of poll(2), 246 * the t_pollcache should be NULL. 247 */ 248 if (curthread->t_pollcache == NULL) 249 pcp = curthread->t_pollstate->ps_pcache; 250 else 251 pcp = curthread->t_pollcache; 252 253 if (mutex_owned(&pcp->pc_lock)) { 254 lockstate = 1; 255 mutex_exit(&pcp->pc_lock); 256 } 257 return (lockstate); 258 } 259 260 void 261 pollrelock(int lockstate) 262 { 263 pollcache_t *pcp; 264 265 /* 266 * t_pollcache is set by /dev/poll and event ports (port_fd.c). 267 * If the pollrelock/pollunlock is called as a result of poll(2), 268 * the t_pollcache should be NULL. 269 */ 270 if (curthread->t_pollcache == NULL) 271 pcp = curthread->t_pollstate->ps_pcache; 272 else 273 pcp = curthread->t_pollcache; 274 275 if (lockstate > 0) 276 mutex_enter(&pcp->pc_lock); 277 } 278 279 /* ARGSUSED */ 280 void 281 polllock(pollhead_t *php, kmutex_t *lp) 282 { 283 if (!mutex_tryenter(lp)) { 284 int lockstate = pollunlock(); 285 mutex_enter(lp); 286 pollrelock(lockstate); 287 } 288 } 289 290 static int 291 poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) 292 { 293 kthread_t *t = curthread; 294 klwp_t *lwp = ttolwp(t); 295 proc_t *p = ttoproc(t); 296 int fdcnt = 0; 297 int i; 298 hrtime_t deadline; /* hrtime value when we want to return */ 299 pollfd_t *pollfdp; 300 pollstate_t *ps; 301 pollcache_t *pcp; 302 int error = 0; 303 nfds_t old_nfds; 304 int cacheindex = 0; /* which cache set is used */ 305 306 /* 307 * Determine the precise future time of the requested timeout, if any. 308 */ 309 if (tsp == NULL) { 310 deadline = -1; 311 } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { 312 deadline = 0; 313 } else { 314 /* They must wait at least a tick. */ 315 deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec; 316 deadline = MAX(deadline, nsec_per_tick); 317 deadline += gethrtime(); 318 } 319 320 /* 321 * Reset our signal mask, if requested. 322 */ 323 if (ksetp != NULL) { 324 mutex_enter(&p->p_lock); 325 schedctl_finish_sigblock(t); 326 lwp->lwp_sigoldmask = t->t_hold; 327 t->t_hold = *ksetp; 328 t->t_flag |= T_TOMASK; 329 /* 330 * Call cv_reltimedwait_sig() just to check for signals. 331 * We will return immediately with either 0 or -1. 332 */ 333 if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, 334 TR_CLOCK_TICK)) { 335 mutex_exit(&p->p_lock); 336 error = EINTR; 337 goto pollout; 338 } 339 mutex_exit(&p->p_lock); 340 } 341 342 /* 343 * Check to see if this guy just wants to use poll() as a timeout. 344 * If yes then bypass all the other stuff and make him sleep. 345 */ 346 if (nfds == 0) { 347 /* 348 * Sleep until we have passed the requested future 349 * time or until interrupted by a signal. 350 * Do not check for signals if we do not want to wait. 351 */ 352 if (deadline != 0) { 353 mutex_enter(&t->t_delay_lock); 354 while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv, 355 &t->t_delay_lock, deadline)) > 0) 356 continue; 357 mutex_exit(&t->t_delay_lock); 358 error = (error == 0) ? EINTR : 0; 359 } 360 goto pollout; 361 } 362 363 if (nfds > p->p_fno_ctl) { 364 mutex_enter(&p->p_lock); 365 (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], 366 p->p_rctls, p, RCA_SAFE); 367 mutex_exit(&p->p_lock); 368 error = EINVAL; 369 goto pollout; 370 } 371 372 /* 373 * Need to allocate memory for pollstate before anything because 374 * the mutex and cv are created in this space 375 */ 376 if ((ps = t->t_pollstate) == NULL) { 377 t->t_pollstate = pollstate_create(); 378 ps = t->t_pollstate; 379 } 380 381 if (ps->ps_pcache == NULL) 382 ps->ps_pcache = pcache_alloc(); 383 pcp = ps->ps_pcache; 384 385 /* 386 * NOTE: for performance, buffers are saved across poll() calls. 387 * The theory is that if a process polls heavily, it tends to poll 388 * on the same set of descriptors. Therefore, we only reallocate 389 * buffers when nfds changes. There is no hysteresis control, 390 * because there is no data to suggest that this is necessary; 391 * the penalty of reallocating is not *that* great in any event. 392 */ 393 old_nfds = ps->ps_nfds; 394 if (nfds != old_nfds) { 395 396 kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); 397 pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); 398 ps->ps_pollfd = pollfdp; 399 ps->ps_nfds = nfds; 400 } 401 402 pollfdp = ps->ps_pollfd; 403 if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { 404 error = EFAULT; 405 goto pollout; 406 } 407 408 if (fds == NULL) { 409 /* 410 * If the process has page 0 mapped, then the copyin() above 411 * will succeed even if fds is NULL. However, our cached 412 * poll lists are keyed by the address of the passed-in fds 413 * structure, and we use the value NULL to indicate an unused 414 * poll cache list entry. As such, we elect not to support 415 * NULL as a valid (user) memory address and fail the poll() 416 * call. 417 */ 418 error = EINVAL; 419 goto pollout; 420 } 421 422 /* 423 * If this thread polls for the first time, allocate ALL poll 424 * cache data structures and cache the poll fd list. This 425 * allocation is delayed till now because lwp's polling 0 fd 426 * (i.e. using poll as timeout()) don't need this memory. 427 */ 428 mutex_enter(&ps->ps_lock); 429 pcp = ps->ps_pcache; 430 ASSERT(pcp != NULL); 431 if (pcp->pc_bitmap == NULL) { 432 pcache_create(pcp, nfds); 433 /* 434 * poll and cache this poll fd list in ps_pcacheset[0]. 435 */ 436 error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex); 437 if (fdcnt || error) { 438 mutex_exit(&ps->ps_lock); 439 goto pollout; 440 } 441 } else { 442 pollcacheset_t *pcset = ps->ps_pcacheset; 443 444 /* 445 * Not first time polling. Select a cached poll list by 446 * matching user pollfd list buffer address. 447 */ 448 for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) { 449 if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) { 450 if ((++pcset[cacheindex].pcs_count) == 0) { 451 /* 452 * counter is wrapping around. 453 */ 454 pcacheset_reset_count(ps, cacheindex); 455 } 456 /* 457 * examine and resolve possible 458 * difference of the current poll 459 * list and previously cached one. 460 * If there is an error during resolve(), 461 * the callee will guarantee the consistency 462 * of cached poll list and cache content. 463 */ 464 error = pcacheset_resolve(ps, nfds, &fdcnt, 465 cacheindex); 466 if (error) { 467 mutex_exit(&ps->ps_lock); 468 goto pollout; 469 } 470 break; 471 } 472 473 /* 474 * Note that pcs_usradr field of an used entry won't be 475 * NULL because it stores the address of passed-in fds, 476 * and NULL fds will not be cached (Then it is either 477 * the special timeout case when nfds is 0 or it returns 478 * failure directly). 479 */ 480 if (pcset[cacheindex].pcs_usradr == NULL) { 481 /* 482 * found an unused entry. Use it to cache 483 * this poll list. 484 */ 485 error = pcacheset_cache_list(ps, fds, &fdcnt, 486 cacheindex); 487 if (fdcnt || error) { 488 mutex_exit(&ps->ps_lock); 489 goto pollout; 490 } 491 break; 492 } 493 } 494 if (cacheindex == ps->ps_nsets) { 495 /* 496 * We failed to find a matching cached poll fd list. 497 * replace an old list. 498 */ 499 pollstats.polllistmiss.value.ui64++; 500 cacheindex = pcacheset_replace(ps); 501 ASSERT(cacheindex < ps->ps_nsets); 502 pcset[cacheindex].pcs_usradr = (uintptr_t)fds; 503 error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex); 504 if (error) { 505 mutex_exit(&ps->ps_lock); 506 goto pollout; 507 } 508 } 509 } 510 511 /* 512 * Always scan the bitmap with the lock on the pollcache held. 513 * This is to make sure that a wakeup does not come undetected. 514 * If the lock is not held, a pollwakeup could have come for an 515 * fd we already checked but before this thread sleeps, in which 516 * case the wakeup is missed. Now we hold the pcache lock and 517 * check the bitmap again. This will prevent wakeup from happening 518 * while we hold pcache lock since pollwakeup() will also lock 519 * the pcache before updating poll bitmap. 520 */ 521 mutex_enter(&pcp->pc_lock); 522 for (;;) { 523 pcp->pc_flag = 0; 524 error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex); 525 if (fdcnt || error) { 526 mutex_exit(&pcp->pc_lock); 527 mutex_exit(&ps->ps_lock); 528 break; 529 } 530 531 /* 532 * If T_POLLWAKE is set, a pollwakeup() was performed on 533 * one of the file descriptors. This can happen only if 534 * one of the VOP_POLL() functions dropped pcp->pc_lock. 535 * The only current cases of this is in procfs (prpoll()) 536 * and STREAMS (strpoll()). 537 */ 538 if (pcp->pc_flag & T_POLLWAKE) 539 continue; 540 541 /* 542 * If you get here, the poll of fds was unsuccessful. 543 * Wait until some fd becomes readable, writable, or gets 544 * an exception, or until a signal or a timeout occurs. 545 * Do not check for signals if we have a zero timeout. 546 */ 547 mutex_exit(&ps->ps_lock); 548 if (deadline == 0) { 549 error = -1; 550 } else { 551 error = cv_timedwait_sig_hrtime(&pcp->pc_cv, 552 &pcp->pc_lock, deadline); 553 } 554 mutex_exit(&pcp->pc_lock); 555 /* 556 * If we have received a signal or timed out 557 * then break out and return. 558 */ 559 if (error <= 0) { 560 error = (error == 0) ? EINTR : 0; 561 break; 562 } 563 /* 564 * We have not received a signal or timed out. 565 * Continue around and poll fds again. 566 */ 567 mutex_enter(&ps->ps_lock); 568 mutex_enter(&pcp->pc_lock); 569 } 570 571 pollout: 572 /* 573 * If we changed the signal mask but we received 574 * no signal then restore the signal mask. 575 * Otherwise psig() will deal with the signal mask. 576 */ 577 if (ksetp != NULL) { 578 mutex_enter(&p->p_lock); 579 if (lwp->lwp_cursig == 0) { 580 t->t_hold = lwp->lwp_sigoldmask; 581 t->t_flag &= ~T_TOMASK; 582 } 583 mutex_exit(&p->p_lock); 584 } 585 586 if (error) 587 return (set_errno(error)); 588 589 /* 590 * Copy out the events and return the fdcnt to the user. 591 */ 592 if (nfds != 0 && 593 copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) 594 return (set_errno(EFAULT)); 595 596 #ifdef DEBUG 597 /* 598 * Another sanity check: 599 */ 600 if (fdcnt) { 601 int reventcnt = 0; 602 603 for (i = 0; i < nfds; i++) { 604 if (pollfdp[i].fd < 0) { 605 ASSERT(pollfdp[i].revents == 0); 606 continue; 607 } 608 if (pollfdp[i].revents) { 609 reventcnt++; 610 } 611 } 612 ASSERT(fdcnt == reventcnt); 613 } else { 614 for (i = 0; i < nfds; i++) { 615 ASSERT(pollfdp[i].revents == 0); 616 } 617 } 618 #endif /* DEBUG */ 619 620 return (fdcnt); 621 } 622 623 /* 624 * This is the system call trap that poll(), 625 * select() and pselect() are built upon. 626 * It is a private interface between libc and the kernel. 627 */ 628 int 629 pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) 630 { 631 timespec_t ts; 632 timespec_t *tsp; 633 sigset_t set; 634 k_sigset_t kset; 635 k_sigset_t *ksetp; 636 model_t datamodel = get_udatamodel(); 637 638 if (timeoutp == NULL) 639 tsp = NULL; 640 else { 641 if (datamodel == DATAMODEL_NATIVE) { 642 if (copyin(timeoutp, &ts, sizeof (ts))) 643 return (set_errno(EFAULT)); 644 } else { 645 timespec32_t ts32; 646 647 if (copyin(timeoutp, &ts32, sizeof (ts32))) 648 return (set_errno(EFAULT)); 649 TIMESPEC32_TO_TIMESPEC(&ts, &ts32) 650 } 651 652 if (itimerspecfix(&ts)) 653 return (set_errno(EINVAL)); 654 tsp = &ts; 655 } 656 657 if (setp == NULL) 658 ksetp = NULL; 659 else { 660 if (copyin(setp, &set, sizeof (set))) 661 return (set_errno(EFAULT)); 662 sigutok(&set, &kset); 663 ksetp = &kset; 664 } 665 666 return (poll_common(fds, nfds, tsp, ksetp)); 667 } 668 669 /* 670 * Clean up any state left around by poll(2). Called when a thread exits. 671 */ 672 void 673 pollcleanup() 674 { 675 pollstate_t *ps = curthread->t_pollstate; 676 pollcache_t *pcp; 677 678 if (ps == NULL) 679 return; 680 pcp = ps->ps_pcache; 681 /* 682 * free up all cached poll fds 683 */ 684 if (pcp == NULL) { 685 /* this pollstate is used by /dev/poll */ 686 goto pollcleanout; 687 } 688 689 if (pcp->pc_bitmap != NULL) { 690 ASSERT(MUTEX_NOT_HELD(&ps->ps_lock)); 691 /* 692 * a close lwp can race with us when cleaning up a polldat 693 * entry. We hold the ps_lock when cleaning hash table. 694 * Since this pollcache is going away anyway, there is no 695 * need to hold the pc_lock. 696 */ 697 mutex_enter(&ps->ps_lock); 698 pcache_clean(pcp); 699 mutex_exit(&ps->ps_lock); 700 #ifdef DEBUG 701 /* 702 * At this point, all fds cached by this lwp should be 703 * cleaned up. There should be no fd in fi_list still 704 * reference this thread. 705 */ 706 checkfpollinfo(); /* sanity check */ 707 pollcheckphlist(); /* sanity check */ 708 #endif /* DEBUG */ 709 } 710 /* 711 * Be sure no one is referencing thread before exiting 712 */ 713 mutex_enter(&pcp->pc_no_exit); 714 ASSERT(pcp->pc_busy >= 0); 715 while (pcp->pc_busy > 0) 716 cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit); 717 mutex_exit(&pcp->pc_no_exit); 718 pollcleanout: 719 pollstate_destroy(ps); 720 curthread->t_pollstate = NULL; 721 } 722 723 /* 724 * pollwakeup() - poke threads waiting in poll() for some event 725 * on a particular object. 726 * 727 * The threads hanging off of the specified pollhead structure are scanned. 728 * If their event mask matches the specified event(s), then pollnotify() is 729 * called to poke the thread. 730 * 731 * Multiple events may be specified. When POLLHUP or POLLERR are specified, 732 * all waiting threads are poked. 733 * 734 * It is important that pollnotify() not drop the lock protecting the list 735 * of threads. 736 */ 737 void 738 pollwakeup(pollhead_t *php, short events_arg) 739 { 740 polldat_t *pdp; 741 polldat_t *first; 742 int events = (ushort_t)events_arg; 743 struct plist { 744 port_t *pp; 745 int pevents; 746 struct plist *next; 747 }; 748 struct plist *plhead = NULL, *pltail = NULL; 749 750 retry: 751 PH_ENTER(php); 752 753 if (php->ph_list == NULL) { 754 PH_EXIT(php); 755 return; 756 } 757 758 if (randomize_pollwakeup) { 759 size_t entries = 0; 760 size_t r = 0; 761 762 for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) 763 ++entries; 764 ASSERT(entries != 0); 765 random_get_pseudo_bytes((uint8_t *)&r, sizeof(r)); 766 r %= entries; 767 for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) 768 if (r-- == 0) 769 break; 770 ASSERT(pdp != NULL); 771 } else { 772 pdp = php->ph_list; 773 } 774 775 first = pdp; 776 for (;;) { 777 if ((pdp->pd_events & events) || 778 (events & (POLLHUP | POLLERR))) { 779 780 pollcache_t *pcp; 781 782 if (pdp->pd_portev != NULL) { 783 port_kevent_t *pkevp = pdp->pd_portev; 784 /* 785 * Object (fd) is associated with an event port, 786 * => send event notification to the port. 787 */ 788 ASSERT(pkevp->portkev_source == PORT_SOURCE_FD); 789 mutex_enter(&pkevp->portkev_lock); 790 if (pkevp->portkev_flags & PORT_KEV_VALID) { 791 int pevents; 792 793 pkevp->portkev_flags &= ~PORT_KEV_VALID; 794 pkevp->portkev_events |= events & 795 (pdp->pd_events | POLLHUP | 796 POLLERR); 797 /* 798 * portkev_lock mutex will be released 799 * by port_send_event(). 800 */ 801 port_send_event(pkevp); 802 803 /* 804 * If we have some thread polling the 805 * port's fd, add it to the list. They 806 * will be notified later. 807 * The port_pollwkup() will flag the 808 * port_t so that it will not disappear 809 * till port_pollwkdone() is called. 810 */ 811 pevents = 812 port_pollwkup(pkevp->portkev_port); 813 if (pevents) { 814 struct plist *t; 815 t = kmem_zalloc( 816 sizeof (struct plist), 817 KM_SLEEP); 818 t->pp = pkevp->portkev_port; 819 t->pevents = pevents; 820 if (plhead == NULL) { 821 plhead = t; 822 } else { 823 pltail->next = t; 824 } 825 pltail = t; 826 } 827 } else { 828 mutex_exit(&pkevp->portkev_lock); 829 } 830 goto next; 831 } 832 833 pcp = pdp->pd_pcache; 834 835 /* 836 * Try to grab the lock for this thread. If 837 * we don't get it then we may deadlock so 838 * back out and restart all over again. Note 839 * that the failure rate is very very low. 840 */ 841 if (mutex_tryenter(&pcp->pc_lock)) { 842 pollnotify(pcp, pdp->pd_fd); 843 mutex_exit(&pcp->pc_lock); 844 } else { 845 /* 846 * We are here because: 847 * 1) This thread has been woke up 848 * and is trying to get out of poll(). 849 * 2) Some other thread is also here 850 * but with a different pollhead lock. 851 * 852 * So, we need to drop the lock on pollhead 853 * because of (1) but we want to prevent 854 * that thread from doing lwp_exit() or 855 * devpoll close. We want to ensure that 856 * the pollcache pointer is still invalid. 857 * 858 * Solution: Grab the pcp->pc_no_exit lock, 859 * increment the pc_busy counter, drop every 860 * lock in sight. Get out of the way and wait 861 * for type (2) threads to finish. 862 */ 863 864 mutex_enter(&pcp->pc_no_exit); 865 pcp->pc_busy++; /* prevents exit()'s */ 866 mutex_exit(&pcp->pc_no_exit); 867 868 PH_EXIT(php); 869 mutex_enter(&pcp->pc_lock); 870 mutex_exit(&pcp->pc_lock); 871 mutex_enter(&pcp->pc_no_exit); 872 pcp->pc_busy--; 873 if (pcp->pc_busy == 0) { 874 /* 875 * Wakeup the thread waiting in 876 * thread_exit(). 877 */ 878 cv_signal(&pcp->pc_busy_cv); 879 } 880 mutex_exit(&pcp->pc_no_exit); 881 goto retry; 882 } 883 } 884 next: 885 pdp = pdp->pd_next; 886 if (pdp == NULL) 887 pdp = php->ph_list; 888 if (pdp == first) 889 break; 890 } 891 892 893 /* 894 * Event ports - If this php is of the port on the list, 895 * call port_pollwkdone() to release it. The port_pollwkdone() 896 * needs to be called before dropping the PH lock so that any new 897 * thread attempting to poll this port are blocked. There can be 898 * only one thread here in pollwakeup notifying this port's fd. 899 */ 900 if (plhead != NULL && &plhead->pp->port_pollhd == php) { 901 struct plist *t; 902 port_pollwkdone(plhead->pp); 903 t = plhead; 904 plhead = plhead->next; 905 kmem_free(t, sizeof (struct plist)); 906 } 907 PH_EXIT(php); 908 909 /* 910 * Event ports - Notify threads polling the event port's fd. 911 * This is normally done in port_send_event() where it calls 912 * pollwakeup() on the port. But, for PORT_SOURCE_FD source alone, 913 * we do it here in pollwakeup() to avoid a recursive call. 914 */ 915 if (plhead != NULL) { 916 php = &plhead->pp->port_pollhd; 917 events = plhead->pevents; 918 goto retry; 919 } 920 } 921 922 /* 923 * This function is called to inform a thread that 924 * an event being polled for has occurred. 925 * The pollstate lock on the thread should be held on entry. 926 */ 927 void 928 pollnotify(pollcache_t *pcp, int fd) 929 { 930 ASSERT(fd < pcp->pc_mapsize); 931 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 932 BT_SET(pcp->pc_bitmap, fd); 933 pcp->pc_flag |= T_POLLWAKE; 934 cv_signal(&pcp->pc_cv); 935 } 936 937 /* 938 * add a polldat entry to pollhead ph_list. The polldat struct is used 939 * by pollwakeup to wake sleeping pollers when polled events has happened. 940 */ 941 void 942 pollhead_insert(pollhead_t *php, polldat_t *pdp) 943 { 944 PH_ENTER(php); 945 ASSERT(pdp->pd_next == NULL); 946 #ifdef DEBUG 947 { 948 /* 949 * the polldat should not be already on the list 950 */ 951 polldat_t *wp; 952 for (wp = php->ph_list; wp; wp = wp->pd_next) { 953 ASSERT(wp != pdp); 954 } 955 } 956 #endif /* DEBUG */ 957 pdp->pd_next = php->ph_list; 958 php->ph_list = pdp; 959 PH_EXIT(php); 960 } 961 962 /* 963 * Delete the polldat entry from ph_list. 964 */ 965 void 966 pollhead_delete(pollhead_t *php, polldat_t *pdp) 967 { 968 polldat_t *wp; 969 polldat_t **wpp; 970 971 PH_ENTER(php); 972 for (wpp = &php->ph_list; (wp = *wpp) != NULL; wpp = &wp->pd_next) { 973 if (wp == pdp) { 974 *wpp = pdp->pd_next; 975 pdp->pd_next = NULL; 976 break; 977 } 978 } 979 #ifdef DEBUG 980 /* assert that pdp is no longer in the list */ 981 for (wp = *wpp; wp; wp = wp->pd_next) { 982 ASSERT(wp != pdp); 983 } 984 #endif /* DEBUG */ 985 PH_EXIT(php); 986 } 987 988 /* 989 * walk through the poll fd lists to see if they are identical. This is an 990 * expensive operation and should not be done more than once for each poll() 991 * call. 992 * 993 * As an optimization (i.e., not having to go through the lists more than 994 * once), this routine also clear the revents field of pollfd in 'current'. 995 * Zeroing out the revents field of each entry in current poll list is 996 * required by poll man page. 997 * 998 * Since the events field of cached list has illegal poll events filtered 999 * out, the current list applies the same filtering before comparison. 1000 * 1001 * The routine stops when it detects a meaningful difference, or when it 1002 * exhausts the lists. 1003 */ 1004 int 1005 pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n) 1006 { 1007 int ix; 1008 1009 for (ix = 0; ix < n; ix++) { 1010 /* Prefetch 64 bytes worth of 8-byte elements */ 1011 if ((ix & 0x7) == 0) { 1012 prefetch_write_many((caddr_t)¤t[ix + 8]); 1013 prefetch_write_many((caddr_t)&cached[ix + 8]); 1014 } 1015 if (current[ix].fd == cached[ix].fd) { 1016 /* 1017 * Filter out invalid poll events while we are in 1018 * inside the loop. 1019 */ 1020 if (current[ix].events & ~VALID_POLL_EVENTS) { 1021 current[ix].events &= VALID_POLL_EVENTS; 1022 if (newlist != NULL) 1023 newlist[ix].events = current[ix].events; 1024 } 1025 if (current[ix].events == cached[ix].events) { 1026 current[ix].revents = 0; 1027 continue; 1028 } 1029 } 1030 if ((current[ix].fd < 0) && (cached[ix].fd < 0)) { 1031 current[ix].revents = 0; 1032 continue; 1033 } 1034 return (ix); 1035 } 1036 return (ix); 1037 } 1038 1039 /* 1040 * This routine returns a pointer to a cached poll fd entry, or NULL if it 1041 * does not find it in the hash table. 1042 */ 1043 polldat_t * 1044 pcache_lookup_fd(pollcache_t *pcp, int fd) 1045 { 1046 int hashindex; 1047 polldat_t *pdp; 1048 1049 hashindex = POLLHASH(pcp->pc_hashsize, fd); 1050 pdp = pcp->pc_hash[hashindex]; 1051 while (pdp != NULL) { 1052 if (pdp->pd_fd == fd) 1053 break; 1054 pdp = pdp->pd_hashnext; 1055 } 1056 return (pdp); 1057 } 1058 1059 polldat_t * 1060 pcache_alloc_fd(int nsets) 1061 { 1062 polldat_t *pdp; 1063 1064 pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP); 1065 if (nsets > 0) { 1066 pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP); 1067 pdp->pd_nsets = nsets; 1068 } 1069 return (pdp); 1070 } 1071 1072 /* 1073 * This routine inserts a polldat into the pollcache's hash table. It 1074 * may be necessary to grow the size of the hash table. 1075 */ 1076 void 1077 pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds) 1078 { 1079 int hashindex; 1080 int fd; 1081 1082 if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) || 1083 (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) { 1084 pcache_grow_hashtbl(pcp, nfds); 1085 } 1086 fd = pdp->pd_fd; 1087 hashindex = POLLHASH(pcp->pc_hashsize, fd); 1088 pdp->pd_hashnext = pcp->pc_hash[hashindex]; 1089 pcp->pc_hash[hashindex] = pdp; 1090 pcp->pc_fdcount++; 1091 1092 #ifdef DEBUG 1093 { 1094 /* 1095 * same fd should not appear on a hash list twice 1096 */ 1097 polldat_t *pdp1; 1098 for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) { 1099 ASSERT(pdp->pd_fd != pdp1->pd_fd); 1100 } 1101 } 1102 #endif /* DEBUG */ 1103 } 1104 1105 /* 1106 * Grow the hash table -- either double the table size or round it to the 1107 * nearest multiples of POLLHASHCHUNKSZ, whichever is bigger. Rehash all the 1108 * elements on the hash table. 1109 */ 1110 void 1111 pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds) 1112 { 1113 int oldsize; 1114 polldat_t **oldtbl; 1115 polldat_t *pdp, *pdp1; 1116 int i; 1117 #ifdef DEBUG 1118 int count = 0; 1119 #endif 1120 1121 ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0); 1122 oldsize = pcp->pc_hashsize; 1123 oldtbl = pcp->pc_hash; 1124 if (nfds > pcp->pc_hashsize * POLLHASHINC) { 1125 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 1126 ~(POLLHASHCHUNKSZ - 1); 1127 } else { 1128 pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC; 1129 } 1130 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 1131 KM_SLEEP); 1132 /* 1133 * rehash existing elements 1134 */ 1135 pcp->pc_fdcount = 0; 1136 for (i = 0; i < oldsize; i++) { 1137 pdp = oldtbl[i]; 1138 while (pdp != NULL) { 1139 pdp1 = pdp->pd_hashnext; 1140 pcache_insert_fd(pcp, pdp, nfds); 1141 pdp = pdp1; 1142 #ifdef DEBUG 1143 count++; 1144 #endif 1145 } 1146 } 1147 kmem_free(oldtbl, oldsize * sizeof (polldat_t *)); 1148 ASSERT(pcp->pc_fdcount == count); 1149 } 1150 1151 void 1152 pcache_grow_map(pollcache_t *pcp, int fd) 1153 { 1154 int newsize; 1155 ulong_t *newmap; 1156 1157 /* 1158 * grow to nearest multiple of POLLMAPCHUNK, assuming POLLMAPCHUNK is 1159 * power of 2. 1160 */ 1161 newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1); 1162 newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t), 1163 KM_SLEEP); 1164 /* 1165 * don't want pollwakeup to set a bit while growing the bitmap. 1166 */ 1167 ASSERT(mutex_owned(&pcp->pc_lock) == 0); 1168 mutex_enter(&pcp->pc_lock); 1169 bcopy(pcp->pc_bitmap, newmap, 1170 (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t)); 1171 kmem_free(pcp->pc_bitmap, 1172 (pcp->pc_mapsize /BT_NBIPUL) * sizeof (ulong_t)); 1173 pcp->pc_bitmap = newmap; 1174 pcp->pc_mapsize = newsize; 1175 mutex_exit(&pcp->pc_lock); 1176 } 1177 1178 /* 1179 * remove all the reference from pollhead list and fpollinfo lists. 1180 */ 1181 void 1182 pcache_clean(pollcache_t *pcp) 1183 { 1184 int i; 1185 polldat_t **hashtbl; 1186 polldat_t *pdp; 1187 1188 ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock)); 1189 hashtbl = pcp->pc_hash; 1190 for (i = 0; i < pcp->pc_hashsize; i++) { 1191 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 1192 if (pdp->pd_php != NULL) { 1193 pollhead_delete(pdp->pd_php, pdp); 1194 pdp->pd_php = NULL; 1195 } 1196 if (pdp->pd_fp != NULL) { 1197 delfpollinfo(pdp->pd_fd); 1198 pdp->pd_fp = NULL; 1199 } 1200 } 1201 } 1202 } 1203 1204 void 1205 pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp) 1206 { 1207 int i; 1208 int fd = pdp->pd_fd; 1209 1210 /* 1211 * we come here because an earlier close() on this cached poll fd. 1212 */ 1213 ASSERT(pdp->pd_fp == NULL); 1214 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1215 pdp->pd_events = 0; 1216 for (i = 0; i < ps->ps_nsets; i++) { 1217 xref_t *refp; 1218 pollcacheset_t *pcsp; 1219 1220 ASSERT(pdp->pd_ref != NULL); 1221 refp = &pdp->pd_ref[i]; 1222 if (refp->xf_refcnt) { 1223 ASSERT(refp->xf_position >= 0); 1224 pcsp = &ps->ps_pcacheset[i]; 1225 if (refp->xf_refcnt == 1) { 1226 pcsp->pcs_pollfd[refp->xf_position].fd = -1; 1227 refp->xf_refcnt = 0; 1228 pdp->pd_count--; 1229 } else if (refp->xf_refcnt > 1) { 1230 int j; 1231 1232 /* 1233 * turn off every appearance in pcs_pollfd list 1234 */ 1235 for (j = refp->xf_position; 1236 j < pcsp->pcs_nfds; j++) { 1237 if (pcsp->pcs_pollfd[j].fd == fd) { 1238 pcsp->pcs_pollfd[j].fd = -1; 1239 refp->xf_refcnt--; 1240 pdp->pd_count--; 1241 } 1242 } 1243 } 1244 ASSERT(refp->xf_refcnt == 0); 1245 refp->xf_position = POLLPOSINVAL; 1246 } 1247 } 1248 ASSERT(pdp->pd_count == 0); 1249 } 1250 1251 /* 1252 * Insert poll fd into the pollcache, and add poll registration. 1253 * This routine is called after getf() and before releasef(). So the vnode 1254 * can not disappear even if we block here. 1255 * If there is an error, the polled fd is not cached. 1256 */ 1257 int 1258 pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, 1259 ssize_t pos, int which) 1260 { 1261 pollcache_t *pcp = ps->ps_pcache; 1262 polldat_t *pdp; 1263 int error; 1264 int fd; 1265 pollhead_t *memphp = NULL; 1266 xref_t *refp; 1267 int newpollfd = 0; 1268 1269 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1270 /* 1271 * The poll caching uses the existing VOP_POLL interface. If there 1272 * is no polled events, we want the polled device to set its "some 1273 * one is sleeping in poll" flag. When the polled events happen 1274 * later, the driver will call pollwakeup(). We achieve this by 1275 * always passing 0 in the third parameter ("anyyet") when calling 1276 * VOP_POLL. This parameter is not looked at by drivers when the 1277 * polled events exist. If a driver chooses to ignore this parameter 1278 * and call pollwakeup whenever the polled events happen, that will 1279 * be OK too. 1280 */ 1281 ASSERT(curthread->t_pollcache == NULL); 1282 error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents, 1283 &memphp, NULL); 1284 if (error) { 1285 return (error); 1286 } 1287 if (pollfdp->revents) { 1288 (*fdcntp)++; 1289 } 1290 /* 1291 * polling the underlying device succeeded. Now we can cache it. 1292 * A close can't come in here because we have not done a releasef() 1293 * yet. 1294 */ 1295 fd = pollfdp->fd; 1296 pdp = pcache_lookup_fd(pcp, fd); 1297 if (pdp == NULL) { 1298 ASSERT(ps->ps_nsets > 0); 1299 pdp = pcache_alloc_fd(ps->ps_nsets); 1300 newpollfd = 1; 1301 } 1302 /* 1303 * If this entry was used to cache a poll fd which was closed, and 1304 * this entry has not been cleaned, do it now. 1305 */ 1306 if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) { 1307 pcacheset_invalidate(ps, pdp); 1308 ASSERT(pdp->pd_next == NULL); 1309 } 1310 if (pdp->pd_count == 0) { 1311 pdp->pd_fd = fd; 1312 pdp->pd_fp = fp; 1313 addfpollinfo(fd); 1314 pdp->pd_thread = curthread; 1315 pdp->pd_pcache = pcp; 1316 /* 1317 * the entry is never used or cleared by removing a cached 1318 * pollfd (pcache_delete_fd). So all the fields should be clear. 1319 */ 1320 ASSERT(pdp->pd_next == NULL); 1321 } 1322 1323 /* 1324 * A polled fd is considered cached. So there should be a fpollinfo 1325 * entry on uf_fpollinfo list. 1326 */ 1327 ASSERT(infpollinfo(fd)); 1328 /* 1329 * If there is an inconsistency, we want to know it here. 1330 */ 1331 ASSERT(pdp->pd_fp == fp); 1332 1333 /* 1334 * XXX pd_events is a union of all polled events on this fd, possibly 1335 * by different threads. Unless this is a new first poll(), pd_events 1336 * never shrinks. If an event is no longer polled by a process, there 1337 * is no way to cancel that event. In that case, poll degrade to its 1338 * old form -- polling on this fd every time poll() is called. The 1339 * assumption is an app always polls the same type of events. 1340 */ 1341 pdp->pd_events |= pollfdp->events; 1342 1343 pdp->pd_count++; 1344 /* 1345 * There is not much special handling for multiple appearances of 1346 * same fd other than xf_position always recording the first 1347 * appearance in poll list. If this is called from pcacheset_cache_list, 1348 * a VOP_POLL is called on every pollfd entry; therefore each 1349 * revents and fdcnt should be set correctly. If this is called from 1350 * pcacheset_resolve, we don't care about fdcnt here. Pollreadmap will 1351 * pick up the right count and handle revents field of each pollfd 1352 * entry. 1353 */ 1354 ASSERT(pdp->pd_ref != NULL); 1355 refp = &pdp->pd_ref[which]; 1356 if (refp->xf_refcnt == 0) { 1357 refp->xf_position = pos; 1358 } else { 1359 /* 1360 * xf_position records the fd's first appearance in poll list 1361 */ 1362 if (pos < refp->xf_position) { 1363 refp->xf_position = pos; 1364 } 1365 } 1366 ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd); 1367 refp->xf_refcnt++; 1368 if (fd >= pcp->pc_mapsize) { 1369 pcache_grow_map(pcp, fd); 1370 } 1371 if (fd > pcp->pc_mapend) { 1372 pcp->pc_mapend = fd; 1373 } 1374 if (newpollfd != 0) { 1375 pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds); 1376 } 1377 if (memphp) { 1378 if (pdp->pd_php == NULL) { 1379 pollhead_insert(memphp, pdp); 1380 pdp->pd_php = memphp; 1381 } else { 1382 if (memphp != pdp->pd_php) { 1383 /* 1384 * layered devices (e.g. console driver) 1385 * may change the vnode and thus the pollhead 1386 * pointer out from underneath us. 1387 */ 1388 pollhead_delete(pdp->pd_php, pdp); 1389 pollhead_insert(memphp, pdp); 1390 pdp->pd_php = memphp; 1391 } 1392 } 1393 } 1394 /* 1395 * Since there is a considerable window between VOP_POLL and when 1396 * we actually put the polldat struct on the pollhead list, we could 1397 * miss a pollwakeup. In the case of polling additional events, we 1398 * don't update the events until after VOP_POLL. So we could miss 1399 * pollwakeup there too. So we always set the bit here just to be 1400 * safe. The real performance gain is in subsequent pcache_poll. 1401 */ 1402 mutex_enter(&pcp->pc_lock); 1403 BT_SET(pcp->pc_bitmap, fd); 1404 mutex_exit(&pcp->pc_lock); 1405 return (0); 1406 } 1407 1408 /* 1409 * The entry is not really deleted. The fields are cleared so that the 1410 * entry is no longer useful, but it will remain in the hash table for reuse 1411 * later. It will be freed when the polling lwp exits. 1412 */ 1413 int 1414 pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent) 1415 { 1416 pollcache_t *pcp = ps->ps_pcache; 1417 polldat_t *pdp; 1418 xref_t *refp; 1419 1420 ASSERT(fd < pcp->pc_mapsize); 1421 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1422 1423 pdp = pcache_lookup_fd(pcp, fd); 1424 ASSERT(pdp != NULL); 1425 ASSERT(pdp->pd_count > 0); 1426 ASSERT(pdp->pd_ref != NULL); 1427 refp = &pdp->pd_ref[which]; 1428 if (pdp->pd_count == 1) { 1429 pdp->pd_events = 0; 1430 refp->xf_position = POLLPOSINVAL; 1431 ASSERT(refp->xf_refcnt == 1); 1432 refp->xf_refcnt = 0; 1433 if (pdp->pd_php) { 1434 /* 1435 * It is possible for a wakeup thread to get ahead 1436 * of the following pollhead_delete and set the bit in 1437 * bitmap. It is OK because the bit will be cleared 1438 * here anyway. 1439 */ 1440 pollhead_delete(pdp->pd_php, pdp); 1441 pdp->pd_php = NULL; 1442 } 1443 pdp->pd_count = 0; 1444 if (pdp->pd_fp != NULL) { 1445 pdp->pd_fp = NULL; 1446 delfpollinfo(fd); 1447 } 1448 mutex_enter(&pcp->pc_lock); 1449 BT_CLEAR(pcp->pc_bitmap, fd); 1450 mutex_exit(&pcp->pc_lock); 1451 return (0); 1452 } 1453 if ((cevent & POLLCLOSED) == POLLCLOSED) { 1454 /* 1455 * fd cached here has been closed. This is the first 1456 * pcache_delete_fd called after the close. Clean up the 1457 * entire entry. 1458 */ 1459 pcacheset_invalidate(ps, pdp); 1460 ASSERT(pdp->pd_php == NULL); 1461 mutex_enter(&pcp->pc_lock); 1462 BT_CLEAR(pcp->pc_bitmap, fd); 1463 mutex_exit(&pcp->pc_lock); 1464 return (0); 1465 } 1466 #ifdef DEBUG 1467 if (getf(fd) != NULL) { 1468 ASSERT(infpollinfo(fd)); 1469 releasef(fd); 1470 } 1471 #endif /* DEBUG */ 1472 pdp->pd_count--; 1473 ASSERT(refp->xf_refcnt > 0); 1474 if (--refp->xf_refcnt == 0) { 1475 refp->xf_position = POLLPOSINVAL; 1476 } else { 1477 ASSERT(pos >= refp->xf_position); 1478 if (pos == refp->xf_position) { 1479 /* 1480 * The xref position is no longer valid. 1481 * Reset it to a special value and let 1482 * caller know it needs to updatexref() 1483 * with a new xf_position value. 1484 */ 1485 refp->xf_position = POLLPOSTRANS; 1486 return (1); 1487 } 1488 } 1489 return (0); 1490 } 1491 1492 void 1493 pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which) 1494 { 1495 polldat_t *pdp; 1496 1497 pdp = pcache_lookup_fd(pcp, fd); 1498 ASSERT(pdp != NULL); 1499 ASSERT(pdp->pd_ref != NULL); 1500 pdp->pd_ref[which].xf_position = pos; 1501 } 1502 1503 #ifdef DEBUG 1504 /* 1505 * For each polled fd, it's either in the bitmap or cached in 1506 * pcache hash table. If this routine returns 0, something is wrong. 1507 */ 1508 static int 1509 pollchecksanity(pollstate_t *ps, nfds_t nfds) 1510 { 1511 int i; 1512 int fd; 1513 pollcache_t *pcp = ps->ps_pcache; 1514 polldat_t *pdp; 1515 pollfd_t *pollfdp = ps->ps_pollfd; 1516 file_t *fp; 1517 1518 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1519 for (i = 0; i < nfds; i++) { 1520 fd = pollfdp[i].fd; 1521 if (fd < 0) { 1522 ASSERT(pollfdp[i].revents == 0); 1523 continue; 1524 } 1525 if (pollfdp[i].revents == POLLNVAL) 1526 continue; 1527 if ((fp = getf(fd)) == NULL) 1528 continue; 1529 pdp = pcache_lookup_fd(pcp, fd); 1530 ASSERT(pdp != NULL); 1531 ASSERT(infpollinfo(fd)); 1532 ASSERT(pdp->pd_fp == fp); 1533 releasef(fd); 1534 if (BT_TEST(pcp->pc_bitmap, fd)) 1535 continue; 1536 if (pdp->pd_php == NULL) 1537 return (0); 1538 } 1539 return (1); 1540 } 1541 #endif /* DEBUG */ 1542 1543 /* 1544 * resolve the difference between the current poll list and a cached one. 1545 */ 1546 int 1547 pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which) 1548 { 1549 int i; 1550 pollcache_t *pcp = ps->ps_pcache; 1551 pollfd_t *newlist = NULL; 1552 pollfd_t *current = ps->ps_pollfd; 1553 pollfd_t *cached; 1554 pollcacheset_t *pcsp; 1555 int common; 1556 int count = 0; 1557 int offset; 1558 int remain; 1559 int fd; 1560 file_t *fp; 1561 int fdcnt = 0; 1562 int cnt = 0; 1563 nfds_t old_nfds; 1564 int error = 0; 1565 int mismatch = 0; 1566 1567 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1568 #ifdef DEBUG 1569 checkpolldat(ps); 1570 #endif 1571 pcsp = &ps->ps_pcacheset[which]; 1572 old_nfds = pcsp->pcs_nfds; 1573 common = (nfds > old_nfds) ? old_nfds : nfds; 1574 if (nfds != old_nfds) { 1575 /* 1576 * the length of poll list has changed. allocate a new 1577 * pollfd list. 1578 */ 1579 newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); 1580 bcopy(current, newlist, sizeof (pollfd_t) * nfds); 1581 } 1582 /* 1583 * Compare the overlapping part of the current fd list with the 1584 * cached one. Whenever a difference is found, resolve it. 1585 * The comparison is done on the current poll list and the 1586 * cached list. But we may be setting up the newlist to be the 1587 * cached list for next poll. 1588 */ 1589 cached = pcsp->pcs_pollfd; 1590 remain = common; 1591 1592 while (count < common) { 1593 int tmpfd; 1594 pollfd_t *np; 1595 1596 np = (newlist != NULL) ? &newlist[count] : NULL; 1597 offset = pcacheset_cmp(¤t[count], &cached[count], np, 1598 remain); 1599 /* 1600 * Collect stats. If lists are completed the first time, 1601 * it's a hit. Otherwise, it's a partial hit or miss. 1602 */ 1603 if ((count == 0) && (offset == common)) { 1604 pollstats.pollcachehit.value.ui64++; 1605 } else { 1606 mismatch++; 1607 } 1608 count += offset; 1609 if (offset < remain) { 1610 ASSERT(count < common); 1611 ASSERT((current[count].fd != cached[count].fd) || 1612 (current[count].events != cached[count].events)); 1613 /* 1614 * Filter out invalid events. 1615 */ 1616 if (current[count].events & ~VALID_POLL_EVENTS) { 1617 if (newlist != NULL) { 1618 newlist[count].events = 1619 current[count].events &= 1620 VALID_POLL_EVENTS; 1621 } else { 1622 current[count].events &= 1623 VALID_POLL_EVENTS; 1624 } 1625 } 1626 /* 1627 * when resolving a difference, we always remove the 1628 * fd from cache before inserting one into cache. 1629 */ 1630 if (cached[count].fd >= 0) { 1631 tmpfd = cached[count].fd; 1632 if (pcache_delete_fd(ps, tmpfd, count, which, 1633 (uint_t)cached[count].events)) { 1634 /* 1635 * This should be rare but needed for 1636 * correctness. 1637 * 1638 * The first appearance in cached list 1639 * is being "turned off". The same fd 1640 * appear more than once in the cached 1641 * poll list. Find the next one on the 1642 * list and update the cached 1643 * xf_position field. 1644 */ 1645 for (i = count + 1; i < old_nfds; i++) { 1646 if (cached[i].fd == tmpfd) { 1647 pcache_update_xref(pcp, 1648 tmpfd, (ssize_t)i, 1649 which); 1650 break; 1651 } 1652 } 1653 ASSERT(i <= old_nfds); 1654 } 1655 /* 1656 * In case a new cache list is allocated, 1657 * need to keep both cache lists in sync 1658 * b/c the new one can be freed if we have 1659 * an error later. 1660 */ 1661 cached[count].fd = -1; 1662 if (newlist != NULL) { 1663 newlist[count].fd = -1; 1664 } 1665 } 1666 if ((tmpfd = current[count].fd) >= 0) { 1667 /* 1668 * add to the cached fd tbl and bitmap. 1669 */ 1670 if ((fp = getf(tmpfd)) == NULL) { 1671 current[count].revents = POLLNVAL; 1672 if (newlist != NULL) { 1673 newlist[count].fd = -1; 1674 } 1675 cached[count].fd = -1; 1676 fdcnt++; 1677 } else { 1678 /* 1679 * Here we don't care about the 1680 * fdcnt. We will examine the bitmap 1681 * later and pick up the correct 1682 * fdcnt there. So we never bother 1683 * to check value of 'cnt'. 1684 */ 1685 error = pcache_insert(ps, fp, 1686 ¤t[count], &cnt, 1687 (ssize_t)count, which); 1688 /* 1689 * if no error, we want to do releasef 1690 * after we updated cache poll list 1691 * entry so that close() won't race 1692 * us. 1693 */ 1694 if (error) { 1695 /* 1696 * If we encountered an error, 1697 * we have invalidated an 1698 * entry in cached poll list 1699 * (in pcache_delete_fd() above) 1700 * but failed to add one here. 1701 * This is OK b/c what's in the 1702 * cached list is consistent 1703 * with content of cache. 1704 * It will not have any ill 1705 * effect on next poll(). 1706 */ 1707 releasef(tmpfd); 1708 if (newlist != NULL) { 1709 kmem_free(newlist, 1710 nfds * 1711 sizeof (pollfd_t)); 1712 } 1713 return (error); 1714 } 1715 /* 1716 * If we have allocated a new(temp) 1717 * cache list, we need to keep both 1718 * in sync b/c the new one can be freed 1719 * if we have an error later. 1720 */ 1721 if (newlist != NULL) { 1722 newlist[count].fd = 1723 current[count].fd; 1724 newlist[count].events = 1725 current[count].events; 1726 } 1727 cached[count].fd = current[count].fd; 1728 cached[count].events = 1729 current[count].events; 1730 releasef(tmpfd); 1731 } 1732 } else { 1733 current[count].revents = 0; 1734 } 1735 count++; 1736 remain = common - count; 1737 } 1738 } 1739 if (mismatch != 0) { 1740 if (mismatch == common) { 1741 pollstats.pollcachemiss.value.ui64++; 1742 } else { 1743 pollstats.pollcachephit.value.ui64++; 1744 } 1745 } 1746 /* 1747 * take care of the non overlapping part of a list 1748 */ 1749 if (nfds > old_nfds) { 1750 ASSERT(newlist != NULL); 1751 for (i = old_nfds; i < nfds; i++) { 1752 /* filter out invalid events */ 1753 if (current[i].events & ~VALID_POLL_EVENTS) { 1754 newlist[i].events = current[i].events = 1755 current[i].events & VALID_POLL_EVENTS; 1756 } 1757 if ((fd = current[i].fd) < 0) { 1758 current[i].revents = 0; 1759 continue; 1760 } 1761 /* 1762 * add to the cached fd tbl and bitmap. 1763 */ 1764 if ((fp = getf(fd)) == NULL) { 1765 current[i].revents = POLLNVAL; 1766 newlist[i].fd = -1; 1767 fdcnt++; 1768 continue; 1769 } 1770 /* 1771 * Here we don't care about the 1772 * fdcnt. We will examine the bitmap 1773 * later and pick up the correct 1774 * fdcnt there. So we never bother to 1775 * check 'cnt'. 1776 */ 1777 error = pcache_insert(ps, fp, ¤t[i], &cnt, 1778 (ssize_t)i, which); 1779 releasef(fd); 1780 if (error) { 1781 /* 1782 * Here we are half way through adding newly 1783 * polled fd. Undo enough to keep the cache 1784 * list consistent with the cache content. 1785 */ 1786 pcacheset_remove_list(ps, current, old_nfds, 1787 i, which, 0); 1788 kmem_free(newlist, nfds * sizeof (pollfd_t)); 1789 return (error); 1790 } 1791 } 1792 } 1793 if (old_nfds > nfds) { 1794 /* 1795 * remove the fd's which are no longer polled. 1796 */ 1797 pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds, 1798 which, 1); 1799 } 1800 /* 1801 * set difference resolved. update nfds and cachedlist 1802 * in pollstate struct. 1803 */ 1804 if (newlist != NULL) { 1805 kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t)); 1806 /* 1807 * By now, the pollfd.revents field should 1808 * all be zeroed. 1809 */ 1810 pcsp->pcs_pollfd = newlist; 1811 pcsp->pcs_nfds = nfds; 1812 } 1813 ASSERT(*fdcntp == 0); 1814 *fdcntp = fdcnt; 1815 /* 1816 * By now for every fd in pollfdp, one of the following should be 1817 * true. Otherwise we will miss a polled event. 1818 * 1819 * 1. the bit corresponding to the fd in bitmap is set. So VOP_POLL 1820 * will be called on this fd in next poll. 1821 * 2. the fd is cached in the pcache (i.e. pd_php is set). So 1822 * pollnotify will happen. 1823 */ 1824 ASSERT(pollchecksanity(ps, nfds)); 1825 /* 1826 * make sure cross reference between cached poll lists and cached 1827 * poll fds are correct. 1828 */ 1829 ASSERT(pollcheckxref(ps, which)); 1830 /* 1831 * ensure each polldat in pollcache reference a polled fd in 1832 * pollcacheset. 1833 */ 1834 #ifdef DEBUG 1835 checkpolldat(ps); 1836 #endif 1837 return (0); 1838 } 1839 1840 #ifdef DEBUG 1841 static int 1842 pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds) 1843 { 1844 int i; 1845 int reventcnt = 0; 1846 1847 for (i = 0; i < nfds; i++) { 1848 if (pollfdp[i].fd < 0) { 1849 ASSERT(pollfdp[i].revents == 0); 1850 continue; 1851 } 1852 if (pollfdp[i].revents) { 1853 reventcnt++; 1854 } 1855 if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) { 1856 ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd)); 1857 } 1858 } 1859 return (reventcnt); 1860 } 1861 #endif /* DEBUG */ 1862 1863 /* 1864 * read the bitmap and poll on fds corresponding to the '1' bits. The ps_lock 1865 * is held upon entry. 1866 */ 1867 int 1868 pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp, 1869 int which) 1870 { 1871 int i; 1872 pollcache_t *pcp; 1873 int fd; 1874 int begin, end, done; 1875 pollhead_t *php; 1876 int fdcnt; 1877 int error = 0; 1878 file_t *fp; 1879 polldat_t *pdp; 1880 xref_t *refp; 1881 int entry; 1882 1883 pcp = ps->ps_pcache; 1884 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1885 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 1886 retry: 1887 done = 0; 1888 begin = 0; 1889 fdcnt = 0; 1890 end = pcp->pc_mapend; 1891 while ((fdcnt < nfds) && !done) { 1892 php = NULL; 1893 /* 1894 * only poll fds which may have events 1895 */ 1896 fd = bt_getlowbit(pcp->pc_bitmap, begin, end); 1897 ASSERT(fd <= end); 1898 if (fd >= 0) { 1899 ASSERT(pollcheckrevents(ps, begin, fd, which)); 1900 /* 1901 * adjust map pointers for next round 1902 */ 1903 if (fd == end) { 1904 done = 1; 1905 } else { 1906 begin = fd + 1; 1907 } 1908 /* 1909 * A bitmap caches poll state information of 1910 * multiple poll lists. Call VOP_POLL only if 1911 * the bit corresponds to an fd in this poll 1912 * list. 1913 */ 1914 pdp = pcache_lookup_fd(pcp, fd); 1915 ASSERT(pdp != NULL); 1916 ASSERT(pdp->pd_ref != NULL); 1917 refp = &pdp->pd_ref[which]; 1918 if (refp->xf_refcnt == 0) 1919 continue; 1920 entry = refp->xf_position; 1921 ASSERT((entry >= 0) && (entry < nfds)); 1922 ASSERT(pollfdp[entry].fd == fd); 1923 /* 1924 * we are in this routine implies that we have 1925 * successfully polled this fd in the past. 1926 * Check to see this fd is closed while we are 1927 * blocked in poll. This ensures that we don't 1928 * miss a close on the fd in the case this fd is 1929 * reused. 1930 */ 1931 if (pdp->pd_fp == NULL) { 1932 ASSERT(pdp->pd_count > 0); 1933 pollfdp[entry].revents = POLLNVAL; 1934 fdcnt++; 1935 if (refp->xf_refcnt > 1) { 1936 /* 1937 * this fd appeared multiple time 1938 * in the poll list. Find all of them. 1939 */ 1940 for (i = entry + 1; i < nfds; i++) { 1941 if (pollfdp[i].fd == fd) { 1942 pollfdp[i].revents = 1943 POLLNVAL; 1944 fdcnt++; 1945 } 1946 } 1947 } 1948 pcacheset_invalidate(ps, pdp); 1949 continue; 1950 } 1951 /* 1952 * We can be here polling a device that is being 1953 * closed (i.e. the file pointer is set to NULL, 1954 * but pollcacheclean has not happened yet). 1955 */ 1956 if ((fp = getf(fd)) == NULL) { 1957 pollfdp[entry].revents = POLLNVAL; 1958 fdcnt++; 1959 if (refp->xf_refcnt > 1) { 1960 /* 1961 * this fd appeared multiple time 1962 * in the poll list. Find all of them. 1963 */ 1964 for (i = entry + 1; i < nfds; i++) { 1965 if (pollfdp[i].fd == fd) { 1966 pollfdp[i].revents = 1967 POLLNVAL; 1968 fdcnt++; 1969 } 1970 } 1971 } 1972 continue; 1973 } 1974 ASSERT(pdp->pd_fp == fp); 1975 ASSERT(infpollinfo(fd)); 1976 /* 1977 * Since we no longer hold poll head lock across 1978 * VOP_POLL, pollunlock logic can be simplifed. 1979 */ 1980 ASSERT(pdp->pd_php == NULL || 1981 MUTEX_NOT_HELD(PHLOCK(pdp->pd_php))); 1982 /* 1983 * underlying file systems may set a "pollpending" 1984 * flag when it sees the poll may block. Pollwakeup() 1985 * is called by wakeup thread if pollpending is set. 1986 * Pass a 0 fdcnt so that the underlying file system 1987 * will set the "pollpending" flag set when there is 1988 * no polled events. 1989 * 1990 * Use pollfdp[].events for actual polling because 1991 * the pd_events is union of all cached poll events 1992 * on this fd. The events parameter also affects 1993 * how the polled device sets the "poll pending" 1994 * flag. 1995 */ 1996 ASSERT(curthread->t_pollcache == NULL); 1997 error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0, 1998 &pollfdp[entry].revents, &php, NULL); 1999 /* 2000 * releasef after completely done with this cached 2001 * poll entry. To prevent close() coming in to clear 2002 * this entry. 2003 */ 2004 if (error) { 2005 releasef(fd); 2006 break; 2007 } 2008 /* 2009 * layered devices (e.g. console driver) 2010 * may change the vnode and thus the pollhead 2011 * pointer out from underneath us. 2012 */ 2013 if (php != NULL && pdp->pd_php != NULL && 2014 php != pdp->pd_php) { 2015 releasef(fd); 2016 pollhead_delete(pdp->pd_php, pdp); 2017 pdp->pd_php = php; 2018 pollhead_insert(php, pdp); 2019 /* 2020 * We could have missed a wakeup on the new 2021 * target device. Make sure the new target 2022 * gets polled once. 2023 */ 2024 BT_SET(pcp->pc_bitmap, fd); 2025 goto retry; 2026 } 2027 2028 if (pollfdp[entry].revents) { 2029 ASSERT(refp->xf_refcnt >= 1); 2030 fdcnt++; 2031 if (refp->xf_refcnt > 1) { 2032 /* 2033 * this fd appeared multiple time 2034 * in the poll list. This is rare but 2035 * we have to look at all of them for 2036 * correctness. 2037 */ 2038 error = plist_chkdupfd(fp, pdp, ps, 2039 pollfdp, entry, &fdcnt); 2040 if (error > 0) { 2041 releasef(fd); 2042 break; 2043 } 2044 if (error < 0) { 2045 goto retry; 2046 } 2047 } 2048 releasef(fd); 2049 } else { 2050 /* 2051 * VOP_POLL didn't return any revents. We can 2052 * clear the bit in bitmap only if we have the 2053 * pollhead ptr cached and no other cached 2054 * entry is polling different events on this fd. 2055 * VOP_POLL may have dropped the ps_lock. Make 2056 * sure pollwakeup has not happened before clear 2057 * the bit. 2058 */ 2059 if ((pdp->pd_php != NULL) && 2060 (pollfdp[entry].events == pdp->pd_events) && 2061 ((pcp->pc_flag & T_POLLWAKE) == 0)) { 2062 BT_CLEAR(pcp->pc_bitmap, fd); 2063 } 2064 /* 2065 * if the fd can be cached now but not before, 2066 * do it now. 2067 */ 2068 if ((pdp->pd_php == NULL) && (php != NULL)) { 2069 pdp->pd_php = php; 2070 pollhead_insert(php, pdp); 2071 /* 2072 * We are inserting a polldat struct for 2073 * the first time. We may have missed a 2074 * wakeup on this device. Re-poll once. 2075 * This should be a rare event. 2076 */ 2077 releasef(fd); 2078 goto retry; 2079 } 2080 if (refp->xf_refcnt > 1) { 2081 /* 2082 * this fd appeared multiple time 2083 * in the poll list. This is rare but 2084 * we have to look at all of them for 2085 * correctness. 2086 */ 2087 error = plist_chkdupfd(fp, pdp, ps, 2088 pollfdp, entry, &fdcnt); 2089 if (error > 0) { 2090 releasef(fd); 2091 break; 2092 } 2093 if (error < 0) { 2094 goto retry; 2095 } 2096 } 2097 releasef(fd); 2098 } 2099 } else { 2100 done = 1; 2101 ASSERT(pollcheckrevents(ps, begin, end + 1, which)); 2102 } 2103 } 2104 if (!error) { 2105 ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds)); 2106 *fdcntp += fdcnt; 2107 } 2108 return (error); 2109 } 2110 2111 /* 2112 * Going through the poll list without much locking. Poll all fds and 2113 * cache all valid fds in the pollcache. 2114 */ 2115 int 2116 pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which) 2117 { 2118 pollfd_t *pollfdp = ps->ps_pollfd; 2119 pollcacheset_t *pcacheset = ps->ps_pcacheset; 2120 pollfd_t *newfdlist; 2121 int i; 2122 int fd; 2123 file_t *fp; 2124 int error = 0; 2125 2126 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2127 ASSERT(which < ps->ps_nsets); 2128 ASSERT(pcacheset != NULL); 2129 ASSERT(pcacheset[which].pcs_pollfd == NULL); 2130 newfdlist = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP); 2131 /* 2132 * cache the new poll list in pollcachset. 2133 */ 2134 bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds); 2135 2136 pcacheset[which].pcs_pollfd = newfdlist; 2137 pcacheset[which].pcs_nfds = ps->ps_nfds; 2138 pcacheset[which].pcs_usradr = (uintptr_t)fds; 2139 2140 /* 2141 * We have saved a copy of current poll fd list in one pollcacheset. 2142 * The 'revents' field of the new list is not yet set to 0. Loop 2143 * through the new list just to do that is expensive. We do that 2144 * while polling the list. 2145 */ 2146 for (i = 0; i < ps->ps_nfds; i++) { 2147 fd = pollfdp[i].fd; 2148 /* 2149 * We also filter out the illegal poll events in the event 2150 * field for the cached poll list/set. 2151 */ 2152 if (pollfdp[i].events & ~VALID_POLL_EVENTS) { 2153 newfdlist[i].events = pollfdp[i].events = 2154 pollfdp[i].events & VALID_POLL_EVENTS; 2155 } 2156 if (fd < 0) { 2157 pollfdp[i].revents = 0; 2158 continue; 2159 } 2160 if ((fp = getf(fd)) == NULL) { 2161 pollfdp[i].revents = POLLNVAL; 2162 /* 2163 * invalidate this cache entry in the cached poll list 2164 */ 2165 newfdlist[i].fd = -1; 2166 (*fdcntp)++; 2167 continue; 2168 } 2169 /* 2170 * cache this fd. 2171 */ 2172 error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i, 2173 which); 2174 releasef(fd); 2175 if (error) { 2176 /* 2177 * Here we are half way through caching a new 2178 * poll list. Undo every thing. 2179 */ 2180 pcacheset_remove_list(ps, pollfdp, 0, i, which, 0); 2181 kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t)); 2182 pcacheset[which].pcs_pollfd = NULL; 2183 pcacheset[which].pcs_usradr = NULL; 2184 break; 2185 } 2186 } 2187 return (error); 2188 } 2189 2190 /* 2191 * called by pollcacheclean() to set the fp NULL. It also sets polled events 2192 * in pcacheset entries to a special events 'POLLCLOSED'. Do a pollwakeup to 2193 * wake any sleeping poller, then remove the polldat from the driver. 2194 * The routine is called with ps_pcachelock held. 2195 */ 2196 void 2197 pcache_clean_entry(pollstate_t *ps, int fd) 2198 { 2199 pollcache_t *pcp; 2200 polldat_t *pdp; 2201 int i; 2202 2203 ASSERT(ps != NULL); 2204 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2205 pcp = ps->ps_pcache; 2206 ASSERT(pcp); 2207 pdp = pcache_lookup_fd(pcp, fd); 2208 ASSERT(pdp != NULL); 2209 /* 2210 * the corresponding fpollinfo in fi_list has been removed by 2211 * a close on this fd. Reset the cached fp ptr here. 2212 */ 2213 pdp->pd_fp = NULL; 2214 /* 2215 * XXX - This routine also touches data in pcacheset struct. 2216 * 2217 * set the event in cached poll lists to POLLCLOSED. This invalidate 2218 * the cached poll fd entry in that poll list, which will force a 2219 * removal of this cached entry in next poll(). The cleanup is done 2220 * at the removal time. 2221 */ 2222 ASSERT(pdp->pd_ref != NULL); 2223 for (i = 0; i < ps->ps_nsets; i++) { 2224 xref_t *refp; 2225 pollcacheset_t *pcsp; 2226 2227 refp = &pdp->pd_ref[i]; 2228 if (refp->xf_refcnt) { 2229 ASSERT(refp->xf_position >= 0); 2230 pcsp = &ps->ps_pcacheset[i]; 2231 if (refp->xf_refcnt == 1) { 2232 pcsp->pcs_pollfd[refp->xf_position].events = 2233 (short)POLLCLOSED; 2234 } 2235 if (refp->xf_refcnt > 1) { 2236 int j; 2237 /* 2238 * mark every matching entry in pcs_pollfd 2239 */ 2240 for (j = refp->xf_position; 2241 j < pcsp->pcs_nfds; j++) { 2242 if (pcsp->pcs_pollfd[j].fd == fd) { 2243 pcsp->pcs_pollfd[j].events = 2244 (short)POLLCLOSED; 2245 } 2246 } 2247 } 2248 } 2249 } 2250 if (pdp->pd_php) { 2251 pollwakeup(pdp->pd_php, POLLHUP); 2252 pollhead_delete(pdp->pd_php, pdp); 2253 pdp->pd_php = NULL; 2254 } 2255 } 2256 2257 /* 2258 * This is the first time this thread has ever polled, 2259 * so we have to create its pollstate structure. 2260 * This will persist for the life of the thread, 2261 * until it calls pollcleanup(). 2262 */ 2263 pollstate_t * 2264 pollstate_create(void) 2265 { 2266 pollstate_t *ps; 2267 2268 ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP); 2269 ps->ps_nsets = POLLFDSETS; 2270 ps->ps_pcacheset = pcacheset_create(ps->ps_nsets); 2271 return (ps); 2272 } 2273 2274 void 2275 pollstate_destroy(pollstate_t *ps) 2276 { 2277 if (ps->ps_pollfd != NULL) { 2278 kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t)); 2279 ps->ps_pollfd = NULL; 2280 } 2281 if (ps->ps_pcache != NULL) { 2282 pcache_destroy(ps->ps_pcache); 2283 ps->ps_pcache = NULL; 2284 } 2285 pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets); 2286 ps->ps_pcacheset = NULL; 2287 if (ps->ps_dpbuf != NULL) { 2288 kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize * sizeof (pollfd_t)); 2289 ps->ps_dpbuf = NULL; 2290 } 2291 mutex_destroy(&ps->ps_lock); 2292 kmem_free(ps, sizeof (pollstate_t)); 2293 } 2294 2295 /* 2296 * We are holding the appropriate uf_lock entering this routine. 2297 * Bump up the ps_busy count to prevent the thread from exiting. 2298 */ 2299 void 2300 pollblockexit(fpollinfo_t *fpip) 2301 { 2302 for (; fpip; fpip = fpip->fp_next) { 2303 pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache; 2304 2305 mutex_enter(&pcp->pc_no_exit); 2306 pcp->pc_busy++; /* prevents exit()'s */ 2307 mutex_exit(&pcp->pc_no_exit); 2308 } 2309 } 2310 2311 /* 2312 * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to mark 2313 * the pcacheset events field POLLCLOSED to force the next poll() to remove 2314 * this cache entry. We can't clean the polldat entry clean up here because 2315 * lwp block in poll() needs the info to return. Wakeup anyone blocked in 2316 * poll and let exiting lwp go. No lock is help upon entry. So it's OK for 2317 * pcache_clean_entry to call pollwakeup(). 2318 */ 2319 void 2320 pollcacheclean(fpollinfo_t *fip, int fd) 2321 { 2322 struct fpollinfo *fpip, *fpip2; 2323 2324 fpip = fip; 2325 while (fpip) { 2326 pollstate_t *ps = fpip->fp_thread->t_pollstate; 2327 pollcache_t *pcp = ps->ps_pcache; 2328 2329 mutex_enter(&ps->ps_lock); 2330 pcache_clean_entry(ps, fd); 2331 mutex_exit(&ps->ps_lock); 2332 mutex_enter(&pcp->pc_no_exit); 2333 pcp->pc_busy--; 2334 if (pcp->pc_busy == 0) { 2335 /* 2336 * Wakeup the thread waiting in 2337 * thread_exit(). 2338 */ 2339 cv_signal(&pcp->pc_busy_cv); 2340 } 2341 mutex_exit(&pcp->pc_no_exit); 2342 2343 fpip2 = fpip; 2344 fpip = fpip->fp_next; 2345 kmem_free(fpip2, sizeof (fpollinfo_t)); 2346 } 2347 } 2348 2349 /* 2350 * one of the cache line's counter is wrapping around. Reset all cache line 2351 * counters to zero except one. This is simplistic, but probably works 2352 * effectively. 2353 */ 2354 void 2355 pcacheset_reset_count(pollstate_t *ps, int index) 2356 { 2357 int i; 2358 2359 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2360 for (i = 0; i < ps->ps_nsets; i++) { 2361 if (ps->ps_pcacheset[i].pcs_pollfd != NULL) { 2362 ps->ps_pcacheset[i].pcs_count = 0; 2363 } 2364 } 2365 ps->ps_pcacheset[index].pcs_count = 1; 2366 } 2367 2368 /* 2369 * this routine implements poll cache list replacement policy. 2370 * It is currently choose the "least used". 2371 */ 2372 int 2373 pcacheset_replace(pollstate_t *ps) 2374 { 2375 int i; 2376 int index = 0; 2377 2378 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2379 for (i = 1; i < ps->ps_nsets; i++) { 2380 if (ps->ps_pcacheset[index].pcs_count > 2381 ps->ps_pcacheset[i].pcs_count) { 2382 index = i; 2383 } 2384 } 2385 ps->ps_pcacheset[index].pcs_count = 0; 2386 return (index); 2387 } 2388 2389 /* 2390 * this routine is called by strclose to remove remaining polldat struct on 2391 * the pollhead list of the device being closed. There are two reasons as why 2392 * the polldat structures still remain on the pollhead list: 2393 * 2394 * (1) The layered device(e.g.the console driver). 2395 * In this case, the existence of a polldat implies that the thread putting 2396 * the polldat on this list has not exited yet. Before the thread exits, it 2397 * will have to hold this pollhead lock to remove the polldat. So holding the 2398 * pollhead lock here effectively prevents the thread which put the polldat 2399 * on this list from exiting. 2400 * 2401 * (2) /dev/poll. 2402 * When a polled fd is cached in /dev/poll, its polldat will remain on the 2403 * pollhead list if the process has not done a POLLREMOVE before closing the 2404 * polled fd. We just unlink it here. 2405 */ 2406 void 2407 pollhead_clean(pollhead_t *php) 2408 { 2409 polldat_t *pdp; 2410 2411 /* 2412 * In case(1), while we must prevent the thread in question from 2413 * exiting, we must also obey the proper locking order, i.e. 2414 * (ps_lock -> phlock). 2415 */ 2416 PH_ENTER(php); 2417 while (php->ph_list != NULL) { 2418 pollstate_t *ps; 2419 pollcache_t *pcp; 2420 2421 pdp = php->ph_list; 2422 ASSERT(pdp->pd_php == php); 2423 if (pdp->pd_thread == NULL) { 2424 /* 2425 * This is case(2). Since the ph_lock is sufficient 2426 * to synchronize this lwp with any other /dev/poll 2427 * lwp, just unlink the polldat. 2428 */ 2429 php->ph_list = pdp->pd_next; 2430 pdp->pd_php = NULL; 2431 pdp->pd_next = NULL; 2432 continue; 2433 } 2434 ps = pdp->pd_thread->t_pollstate; 2435 ASSERT(ps != NULL); 2436 pcp = pdp->pd_pcache; 2437 ASSERT(pcp != NULL); 2438 mutex_enter(&pcp->pc_no_exit); 2439 pcp->pc_busy++; /* prevents exit()'s */ 2440 mutex_exit(&pcp->pc_no_exit); 2441 /* 2442 * Now get the locks in proper order to avoid deadlock. 2443 */ 2444 PH_EXIT(php); 2445 mutex_enter(&ps->ps_lock); 2446 /* 2447 * while we dropped the pollhead lock, the element could be 2448 * taken off the list already. 2449 */ 2450 PH_ENTER(php); 2451 if (pdp->pd_php == php) { 2452 ASSERT(pdp == php->ph_list); 2453 php->ph_list = pdp->pd_next; 2454 pdp->pd_php = NULL; 2455 pdp->pd_next = NULL; 2456 } 2457 PH_EXIT(php); 2458 mutex_exit(&ps->ps_lock); 2459 mutex_enter(&pcp->pc_no_exit); 2460 pcp->pc_busy--; 2461 if (pcp->pc_busy == 0) { 2462 /* 2463 * Wakeup the thread waiting in 2464 * thread_exit(). 2465 */ 2466 cv_signal(&pcp->pc_busy_cv); 2467 } 2468 mutex_exit(&pcp->pc_no_exit); 2469 PH_ENTER(php); 2470 } 2471 PH_EXIT(php); 2472 } 2473 2474 /* 2475 * The remove_list is called to cleanup a partially cached 'current' list or 2476 * to remove a partial list which is no longer cached. The flag value of 1 2477 * indicates the second case. 2478 */ 2479 void 2480 pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end, 2481 int cacheindex, int flag) 2482 { 2483 int i; 2484 2485 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2486 for (i = start; i < end; i++) { 2487 if ((pollfdp[i].fd >= 0) && 2488 (flag || !(pollfdp[i].revents & POLLNVAL))) { 2489 if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex, 2490 (uint_t)pollfdp[i].events)) { 2491 int j; 2492 int fd = pollfdp[i].fd; 2493 2494 for (j = i + 1; j < end; j++) { 2495 if (pollfdp[j].fd == fd) { 2496 pcache_update_xref( 2497 ps->ps_pcache, fd, 2498 (ssize_t)j, cacheindex); 2499 break; 2500 } 2501 } 2502 ASSERT(j <= end); 2503 } 2504 } 2505 } 2506 } 2507 2508 #ifdef DEBUG 2509 2510 #include<sys/strsubr.h> 2511 /* 2512 * make sure curthread is not on anyone's pollhead list any more. 2513 */ 2514 static void 2515 pollcheckphlist() 2516 { 2517 int i; 2518 file_t *fp; 2519 uf_entry_t *ufp; 2520 uf_info_t *fip = P_FINFO(curproc); 2521 struct stdata *stp; 2522 polldat_t *pdp; 2523 2524 mutex_enter(&fip->fi_lock); 2525 for (i = 0; i < fip->fi_nfiles; i++) { 2526 UF_ENTER(ufp, fip, i); 2527 if ((fp = ufp->uf_file) != NULL) { 2528 if ((stp = fp->f_vnode->v_stream) != NULL) { 2529 PH_ENTER(&stp->sd_pollist); 2530 pdp = stp->sd_pollist.ph_list; 2531 while (pdp) { 2532 ASSERT(pdp->pd_thread != curthread); 2533 pdp = pdp->pd_next; 2534 } 2535 PH_EXIT(&stp->sd_pollist); 2536 } 2537 } 2538 UF_EXIT(ufp); 2539 } 2540 mutex_exit(&fip->fi_lock); 2541 } 2542 2543 /* 2544 * for resolved set poll list, the xref info in the pcache should be 2545 * consistent with this poll list. 2546 */ 2547 static int 2548 pollcheckxref(pollstate_t *ps, int cacheindex) 2549 { 2550 pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd; 2551 pollcache_t *pcp = ps->ps_pcache; 2552 polldat_t *pdp; 2553 int i; 2554 xref_t *refp; 2555 2556 for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) { 2557 if (pollfdp[i].fd < 0) { 2558 continue; 2559 } 2560 pdp = pcache_lookup_fd(pcp, pollfdp[i].fd); 2561 ASSERT(pdp != NULL); 2562 ASSERT(pdp->pd_ref != NULL); 2563 refp = &pdp->pd_ref[cacheindex]; 2564 if (refp->xf_position >= 0) { 2565 ASSERT(refp->xf_refcnt >= 1); 2566 ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd); 2567 if (refp->xf_refcnt > 1) { 2568 int j; 2569 int count = 0; 2570 2571 for (j = refp->xf_position; 2572 j < ps->ps_pcacheset[cacheindex].pcs_nfds; 2573 j++) { 2574 if (pollfdp[j].fd == pdp->pd_fd) { 2575 count++; 2576 } 2577 } 2578 ASSERT(count == refp->xf_refcnt); 2579 } 2580 } 2581 } 2582 return (1); 2583 } 2584 2585 /* 2586 * For every cached pollfd, its polldat struct should be consistent with 2587 * what is in the pcacheset lists. 2588 */ 2589 static void 2590 checkpolldat(pollstate_t *ps) 2591 { 2592 pollcache_t *pcp = ps->ps_pcache; 2593 polldat_t **hashtbl; 2594 int i; 2595 2596 hashtbl = pcp->pc_hash; 2597 for (i = 0; i < pcp->pc_hashsize; i++) { 2598 polldat_t *pdp; 2599 2600 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 2601 ASSERT(pdp->pd_ref != NULL); 2602 if (pdp->pd_count > 0) { 2603 xref_t *refp; 2604 int j; 2605 pollcacheset_t *pcsp; 2606 pollfd_t *pollfd; 2607 2608 for (j = 0; j < ps->ps_nsets; j++) { 2609 refp = &pdp->pd_ref[j]; 2610 if (refp->xf_refcnt > 0) { 2611 pcsp = &ps->ps_pcacheset[j]; 2612 ASSERT(refp->xf_position < pcsp->pcs_nfds); 2613 pollfd = pcsp->pcs_pollfd; 2614 ASSERT(pdp->pd_fd == pollfd[refp->xf_position].fd); 2615 } 2616 } 2617 } 2618 } 2619 } 2620 } 2621 2622 /* 2623 * every wfd element on ph_list must have a corresponding fpollinfo on the 2624 * uf_fpollinfo list. This is a variation of infpollinfo() w/o holding locks. 2625 */ 2626 void 2627 checkwfdlist(vnode_t *vp, fpollinfo_t *fpip) 2628 { 2629 stdata_t *stp; 2630 polldat_t *pdp; 2631 fpollinfo_t *fpip2; 2632 2633 if ((stp = vp->v_stream) == NULL) { 2634 return; 2635 } 2636 PH_ENTER(&stp->sd_pollist); 2637 for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) { 2638 if (pdp->pd_thread != NULL && 2639 pdp->pd_thread->t_procp == curthread->t_procp) { 2640 for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) { 2641 if (pdp->pd_thread == fpip2->fp_thread) { 2642 break; 2643 } 2644 } 2645 ASSERT(fpip2 != NULL); 2646 } 2647 } 2648 PH_EXIT(&stp->sd_pollist); 2649 } 2650 2651 /* 2652 * For each cached fd whose bit is not set in bitmap, its revents field in 2653 * current poll list should be 0. 2654 */ 2655 static int 2656 pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex) 2657 { 2658 pollcache_t *pcp = ps->ps_pcache; 2659 pollfd_t *pollfdp = ps->ps_pollfd; 2660 int i; 2661 2662 for (i = begin; i < end; i++) { 2663 polldat_t *pdp; 2664 2665 ASSERT(!BT_TEST(pcp->pc_bitmap, i)); 2666 pdp = pcache_lookup_fd(pcp, i); 2667 if (pdp && pdp->pd_fp != NULL) { 2668 xref_t *refp; 2669 int entry; 2670 2671 ASSERT(pdp->pd_ref != NULL); 2672 refp = &pdp->pd_ref[cacheindex]; 2673 if (refp->xf_refcnt == 0) { 2674 continue; 2675 } 2676 entry = refp->xf_position; 2677 ASSERT(entry >= 0); 2678 ASSERT(pollfdp[entry].revents == 0); 2679 if (refp->xf_refcnt > 1) { 2680 int j; 2681 2682 for (j = entry + 1; j < ps->ps_nfds; j++) { 2683 if (pollfdp[j].fd == i) { 2684 ASSERT(pollfdp[j].revents == 0); 2685 } 2686 } 2687 } 2688 } 2689 } 2690 return (1); 2691 } 2692 2693 #endif /* DEBUG */ 2694 2695 pollcache_t * 2696 pcache_alloc() 2697 { 2698 return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP)); 2699 } 2700 2701 void 2702 pcache_create(pollcache_t *pcp, nfds_t nfds) 2703 { 2704 size_t mapsize; 2705 2706 /* 2707 * allocate enough bits for the poll fd list 2708 */ 2709 if ((mapsize = POLLMAPCHUNK) <= nfds) { 2710 mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1); 2711 } 2712 pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t), 2713 KM_SLEEP); 2714 pcp->pc_mapsize = mapsize; 2715 /* 2716 * The hash size is at least POLLHASHCHUNKSZ. If user polls a large 2717 * number of fd to start with, allocate a bigger hash table (to the 2718 * nearest multiple of POLLHASHCHUNKSZ) because dynamically growing a 2719 * hash table is expensive. 2720 */ 2721 if (nfds < POLLHASHCHUNKSZ) { 2722 pcp->pc_hashsize = POLLHASHCHUNKSZ; 2723 } else { 2724 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 2725 ~(POLLHASHCHUNKSZ - 1); 2726 } 2727 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 2728 KM_SLEEP); 2729 } 2730 2731 void 2732 pcache_destroy(pollcache_t *pcp) 2733 { 2734 polldat_t **hashtbl; 2735 int i; 2736 2737 hashtbl = pcp->pc_hash; 2738 for (i = 0; i < pcp->pc_hashsize; i++) { 2739 if (hashtbl[i] != NULL) { 2740 polldat_t *pdp, *pdp2; 2741 2742 pdp = hashtbl[i]; 2743 while (pdp != NULL) { 2744 pdp2 = pdp->pd_hashnext; 2745 if (pdp->pd_ref != NULL) { 2746 kmem_free(pdp->pd_ref, sizeof (xref_t) * 2747 pdp->pd_nsets); 2748 } 2749 kmem_free(pdp, sizeof (polldat_t)); 2750 pdp = pdp2; 2751 pcp->pc_fdcount--; 2752 } 2753 } 2754 } 2755 ASSERT(pcp->pc_fdcount == 0); 2756 kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize); 2757 kmem_free(pcp->pc_bitmap, 2758 sizeof (ulong_t) * (pcp->pc_mapsize/BT_NBIPUL)); 2759 mutex_destroy(&pcp->pc_no_exit); 2760 mutex_destroy(&pcp->pc_lock); 2761 cv_destroy(&pcp->pc_cv); 2762 cv_destroy(&pcp->pc_busy_cv); 2763 kmem_free(pcp, sizeof (pollcache_t)); 2764 } 2765 2766 pollcacheset_t * 2767 pcacheset_create(int nsets) 2768 { 2769 return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP)); 2770 } 2771 2772 void 2773 pcacheset_destroy(pollcacheset_t *pcsp, int nsets) 2774 { 2775 int i; 2776 2777 for (i = 0; i < nsets; i++) { 2778 if (pcsp[i].pcs_pollfd != NULL) { 2779 kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds * 2780 sizeof (pollfd_t)); 2781 } 2782 } 2783 kmem_free(pcsp, sizeof (pollcacheset_t) * nsets); 2784 } 2785 2786 /* 2787 * Check each duplicated poll fd in the poll list. It may be necessary to 2788 * VOP_POLL the same fd again using different poll events. getf() has been 2789 * done by caller. This routine returns 0 if it can sucessfully process the 2790 * entire poll fd list. It returns -1 if underlying vnode has changed during 2791 * a VOP_POLL, in which case the caller has to repoll. It returns a positive 2792 * value if VOP_POLL failed. 2793 */ 2794 static int 2795 plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp, 2796 int entry, int *fdcntp) 2797 { 2798 int i; 2799 int fd; 2800 nfds_t nfds = psp->ps_nfds; 2801 2802 fd = pollfdp[entry].fd; 2803 for (i = entry + 1; i < nfds; i++) { 2804 if (pollfdp[i].fd == fd) { 2805 if (pollfdp[i].events == pollfdp[entry].events) { 2806 if ((pollfdp[i].revents = 2807 pollfdp[entry].revents) != 0) { 2808 (*fdcntp)++; 2809 } 2810 } else { 2811 2812 int error; 2813 pollhead_t *php; 2814 pollcache_t *pcp = psp->ps_pcache; 2815 2816 /* 2817 * the events are different. VOP_POLL on this 2818 * fd so that we don't miss any revents. 2819 */ 2820 php = NULL; 2821 ASSERT(curthread->t_pollcache == NULL); 2822 error = VOP_POLL(fp->f_vnode, 2823 pollfdp[i].events, 0, 2824 &pollfdp[i].revents, &php, NULL); 2825 if (error) { 2826 return (error); 2827 } 2828 /* 2829 * layered devices(e.g. console driver) 2830 * may change the vnode and thus the pollhead 2831 * pointer out from underneath us. 2832 */ 2833 if (php != NULL && pdp->pd_php != NULL && 2834 php != pdp->pd_php) { 2835 pollhead_delete(pdp->pd_php, pdp); 2836 pdp->pd_php = php; 2837 pollhead_insert(php, pdp); 2838 /* 2839 * We could have missed a wakeup on the 2840 * new target device. Make sure the new 2841 * target gets polled once. 2842 */ 2843 BT_SET(pcp->pc_bitmap, fd); 2844 return (-1); 2845 } 2846 if (pollfdp[i].revents) { 2847 (*fdcntp)++; 2848 } 2849 } 2850 } 2851 } 2852 return (0); 2853 } 2854