1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 32 * Copyright 2015, Joyent, Inc. 33 * Copyright 2022 Oxide Computer Company 34 */ 35 36 /* 37 * Portions of this source code were derived from Berkeley 4.3 BSD 38 * under license from the Regents of the University of California. 
 */

#include <sys/param.h>
#include <sys/isa_defs.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/mode.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/poll_impl.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/bitmap.h>
#include <sys/kstat.h>
#include <sys/rctl.h>
#include <sys/port_impl.h>
#include <sys/schedctl.h>
#include <sys/cpu.h>

/*
 * Pollhead locks are not stored in the pollhead itself; instead a pollhead
 * pointer hashes into the global plocks[] array below.  The >> 8 discards
 * the pointer's low-order bits before masking (presumably because pollheads
 * are embedded in larger structures, so low bits carry little entropy --
 * TODO confirm against pollhead allocation sites).
 */
#define	NPHLOCKS	64	/* Number of locks; must be power of 2 */
#define	PHLOCKADDR(php)	&plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)]
#define	PHLOCK(php)	PHLOCKADDR(php).pp_lock
#define	PH_ENTER(php)	mutex_enter(PHLOCK(php))
#define	PH_EXIT(php)	mutex_exit(PHLOCK(php))
/* Event bits a caller may legitimately request; everything else is masked. */
#define	VALID_POLL_EVENTS	(POLLIN | POLLPRI | POLLOUT | POLLRDNORM \
	| POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL)

/*
 * Global counters to collect some stats.  Exported to the kstat framework
 * via pollstats_ptr/pollstats_ndata below, so the struct must remain an
 * array-compatible sequence of kstat_named_t entries.
 */
static struct {
	kstat_named_t	polllistmiss;	/* failed to find a cached poll list */
	kstat_named_t	pollcachehit;	/* list matched 100% w/ cached one */
	kstat_named_t	pollcachephit;	/* list matched < 100% w/ cached one */
	kstat_named_t	pollcachemiss;	/* every list entry is dif from cache */
	kstat_named_t	pollunlockfail;	/* failed to perform pollunlock */
} pollstats = {
	{ "polllistmiss",	KSTAT_DATA_UINT64 },
	{ "pollcachehit",	KSTAT_DATA_UINT64 },
	{ "pollcachephit",	KSTAT_DATA_UINT64 },
	{ "pollcachemiss",	KSTAT_DATA_UINT64 },
	{ "pollunlockfail",	KSTAT_DATA_UINT64 }
};

/* kstat export: treat pollstats as an array of named stats. */
kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats;
uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t);

/* One hashed pollhead lock; padded so each entry is a round 16 bytes. */
struct pplock	{
	kmutex_t	pp_lock;
	short		pp_flag;
	kcondvar_t	pp_wait_cv;
	int32_t		pp_pad;		/* to a nice round 16 bytes */
};

static struct pplock plocks[NPHLOCKS];	/* Hash array of pollhead locks */

/* Contention lock & list for preventing deadlocks in recursive /dev/poll. */
static	kmutex_t	pollstate_contenders_lock;
static	pollstate_t	*pollstate_contenders = NULL;

#ifdef DEBUG
static int pollchecksanity(pollstate_t *, nfds_t);
static int pollcheckxref(pollstate_t *, int);
static void pollcheckphlist(void);
static int pollcheckrevents(pollstate_t *, int, int, int);
static void checkpolldat(pollstate_t *);
#endif	/* DEBUG */
static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int,
    int *);

/*
 * Data structure overview:
 * The per-thread poll state consists of
 *	one pollstate_t
 *	one pollcache_t
 *	one bitmap with one event bit per fd
 *	a (two-dimensional) hashed array of polldat_t structures - one entry
 *	per fd
 *
 * This conglomerate of data structures interact with
 *	the pollhead which is used by VOP_POLL and pollwakeup
 *	(protected by the PHLOCK, cached array of plocks), and
 *	the fpollinfo list hanging off the fi_list which is used to notify
 *	poll when a cached fd is closed. This is protected by uf_lock.
 *
 * Invariants:
 *	pd_php (pollhead pointer) is set iff (if and only if) the polldat
 *	is on that pollhead. This is modified atomically under pc_lock.
 *
 *	pd_fp (file_t pointer) is set iff the thread is on the fpollinfo
 *	list for that open file.
 *	This is modified atomically under pc_lock.
 *
 *	pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt.
 *	Iff pd_ref[i].xf_refcnt >= 1 then
 *		ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd
 *	Iff pd_ref[i].xf_refcnt > 1 then
 *		In ps_pcacheset[i].pcs_pollfd between index
 *		pd_ref[i].xf_position and the end of the list
 *		there are xf_refcnt entries with .fd == pd_fd
 *
 * Locking design:
 * Whenever possible the design relies on the fact that the poll cache state
 * is per thread thus for both poll and exit it is self-synchronizing.
 * Thus the key interactions where other threads access the state are:
 *	pollwakeup (and polltime), and
 *	close cleaning up the cached references to an open file
 *
 * The two key locks in poll proper are ps_lock and pc_lock.
 *
 * The ps_lock is used for synchronization between poll, (lwp_)exit and close
 * to ensure that modifications to pollcacheset structure are serialized.
 * This lock is held through most of poll() except where poll sleeps
 * since there is little need to handle closes concurrently with the execution
 * of poll.
 * The pc_lock protects most of the fields in pollcache structure and polldat
 * structures (which are accessed by poll, pollwakeup, and polltime)
 * with the exception of fields that are only modified when only one thread
 * can access this per-thread state.
 * Those exceptions occur in poll when first allocating the per-thread state,
 * when poll grows the number of polldat (never shrinks), and when
 * exit/pollcleanup has ensured that there are no references from either
 * pollheads or fpollinfo to the thread's poll state.
 *
 * The poll(2) system call is the only path on which ps_lock and pc_lock are
 * both held, in that order. It needs ps_lock to synchronize with close and
 * lwp_exit; and pc_lock with pollwakeup.
 *
 * The locking interaction between pc_lock and PHLOCK takes into account
 * that poll acquires these locks in the order of pc_lock and then PHLOCK
 * while pollwakeup does it in the reverse order. Thus pollwakeup implements
 * deadlock avoidance by dropping the locks and reacquiring them in the
 * reverse order. For this to work pollwakeup needs to prevent the thread
 * from exiting and freeing all of the poll related state. This is done
 * using
 *	the pc_no_exit lock
 *	the pc_busy counter
 *	the pc_busy_cv condition variable
 *
 * The locking interaction between pc_lock and uf_lock has similar
 * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef
 * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock
 * to prevent poll or exit from doing a delfpollinfo after which the thread
 * might exit. But the cleanup needs to acquire pc_lock when modifying
 * the poll cache state. The solution is to use pc_busy and do the close
 * cleanup in two phases:
 *	First close calls pollblockexit which increments pc_busy.
 *	This prevents the per-thread poll related state from being freed.
 *	Then close drops uf_lock and calls pollcacheclean.
 *	This routine can then acquire pc_lock and remove any references
 *	to the closing fd (as well as recording that it has been closed
 *	so that a POLLNVAL can be generated even if the fd is reused before
 *	poll has been woken up and checked getf() again).
 *
 * When removing a polled fd from poll cache, the fd is always removed
 * from pollhead list first and then from fpollinfo list, i.e.,
 * polldat_disassociate() is called before delfpollinfo().
 *
 *
 * Locking hierarchy:
 *	pc_no_exit is a leaf level lock.
 *	ps_lock is held when acquiring pc_lock (except when pollwakeup
 *	acquires pc_lock).
 *	pc_lock might be held when acquiring PHLOCK (polldat_associate/
 *	polldat_disassociate)
 *	pc_lock is always held (but this is not required)
 *	when acquiring PHLOCK (in polladd/polldat_disassociate and pollwakeup
 *	called from pcache_clean_entry).
 *	pc_lock is held across addfpollinfo/delfpollinfo which acquire
 *	uf_lock.
 *	pc_lock is held across getf/releasef which acquire uf_lock.
 *	ps_lock might be held across getf/releasef which acquire uf_lock.
 *	pollwakeup tries to acquire pc_lock while holding PHLOCK
 *	but drops the locks and reacquires them in reverse order to avoid
 *	deadlock.
 *
 * Note also that there is deadlock avoidance support for VOP_POLL routines
 * and pollwakeup involving a file system or driver lock.
 * See below.
 */

/*
 * Deadlock avoidance support for VOP_POLL() routines.  This is
 * sometimes necessary to prevent deadlock between polling threads
 * (which hold poll locks on entry to xx_poll(), then acquire foo)
 * and pollwakeup() threads (which hold foo, then acquire poll locks).
 *
 * pollunlock(*cookie) releases whatever poll locks the current thread holds,
 *	setting a cookie for use by pollrelock();
 *
 * pollrelock(cookie) reacquires previously dropped poll locks;
 *
 * polllock(php, mutex) does the common case: pollunlock(),
 *	acquire the problematic mutex, pollrelock().
 *
 * If polllock() or pollunlock() return non-zero, it indicates that a recursive
 * /dev/poll is in progress and pollcache locks cannot be dropped.  Callers
 * must handle this by indicating a POLLNVAL in the revents of the VOP_POLL.
 */
int
pollunlock(int *lockstate)
{
	pollstate_t *ps = curthread->t_pollstate;
	pollcache_t *pcp;

	ASSERT(lockstate != NULL);

	/*
	 * There is no way to safely perform a pollunlock() while in the depths
	 * of a recursive /dev/poll operation.
	 */
	if (ps != NULL && ps->ps_depth > 1) {
		ps->ps_flags |= POLLSTATE_ULFAIL;
		pollstats.pollunlockfail.value.ui64++;
		return (-1);
	}

	/*
	 * t_pollcache is set by /dev/poll and event ports (port_fd.c).
	 * If the pollrelock/pollunlock is called as a result of poll(2),
	 * the t_pollcache should be NULL.
	 *
	 * NOTE(review): this relies on the invariant that a caller with
	 * t_pollcache == NULL has a non-NULL t_pollstate (i.e. is inside
	 * poll(2) proper); otherwise the ps dereference below would fault.
	 * TODO confirm against all polllock() callers.
	 */
	if (curthread->t_pollcache == NULL)
		pcp = ps->ps_pcache;
	else
		pcp = curthread->t_pollcache;

	/* Record in the cookie whether pc_lock was actually held. */
	if (!mutex_owned(&pcp->pc_lock)) {
		*lockstate = 0;
	} else {
		*lockstate = 1;
		mutex_exit(&pcp->pc_lock);
	}
	return (0);
}

/*
 * Reacquire the pollcache lock dropped by a prior pollunlock(), using the
 * cookie it produced.  A zero cookie means nothing was held, so nothing
 * needs to be retaken.
 */
void
pollrelock(int lockstate)
{
	pollstate_t *ps = curthread->t_pollstate;
	pollcache_t *pcp;

	/* Skip this whole ordeal if the pollcache was not locked to begin */
	if (lockstate == 0)
		return;

	/*
	 * t_pollcache is set by /dev/poll and event ports (port_fd.c).
	 * If the pollrelock/pollunlock is called as a result of poll(2),
	 * the t_pollcache should be NULL.
	 */
	if (curthread->t_pollcache == NULL)
		pcp = ps->ps_pcache;
	else
		pcp = curthread->t_pollcache;

	mutex_enter(&pcp->pc_lock);
}

/*
 * Acquire `lp' without deadlocking against pollwakeup(): first try cheaply;
 * on contention, drop the poll locks, block on `lp', then retake the poll
 * locks.  Returns non-zero only when pollunlock() refuses (recursive
 * /dev/poll), in which case `lp' is NOT acquired.
 */
/* ARGSUSED */
int
polllock(pollhead_t *php, kmutex_t *lp)
{
	if (mutex_tryenter(lp) == 0) {
		int state;

		if (pollunlock(&state) != 0) {
			return (-1);
		}
		mutex_enter(lp);
		pollrelock(state);
	}
	return (0);
}

/*
 * Common backend for pollsys(): poll the given fd list until at least one
 * event fires, the deadline passes, or a signal arrives.  `tsp' is the
 * relative timeout (NULL = wait forever, zero = don't block) and `ksetp',
 * if non-NULL, is a temporary signal mask installed for the duration of
 * the call (the pselect/ppoll semantic).  Returns the number of ready
 * descriptors, or sets errno and returns -1 via set_errno().
 */
static int
poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	int fdcnt = 0;
	int i;
	hrtime_t deadline;	/* hrtime value when we want to return */
	pollfd_t *pollfdp;
	pollstate_t *ps;
	pollcache_t *pcp;
	int error = 0;
	nfds_t old_nfds;
	int cacheindex = 0;	/* which cache set is used */

	/*
	 * Determine the precise future time of the requested timeout, if any.
	 * Convention: -1 = wait forever, 0 = no wait, else absolute hrtime.
	 */
	if (tsp == NULL) {
		deadline = -1;
	} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
		deadline = 0;
	} else {
		/* They must wait at least a tick. */
		deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec;
		deadline = MAX(deadline, nsec_per_tick);
		deadline += gethrtime();
	}

	/*
	 * Reset our signal mask, if requested.
	 */
	if (ksetp != NULL) {
		mutex_enter(&p->p_lock);
		schedctl_finish_sigblock(t);
		lwp->lwp_sigoldmask = t->t_hold;
		t->t_hold = *ksetp;
		t->t_flag |= T_TOMASK;
		/*
		 * Call cv_reltimedwait_sig() just to check for signals.
		 * We will return immediately with either 0 or -1.
		 */
		if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
		    TR_CLOCK_TICK)) {
			mutex_exit(&p->p_lock);
			error = EINTR;
			goto pollout;
		}
		mutex_exit(&p->p_lock);
	}

	/*
	 * Check to see if this one just wants to use poll() as a timeout.
	 * If yes then bypass all the other stuff and make it sleep.
	 */
	if (nfds == 0) {
		/*
		 * Sleep until we have passed the requested future
		 * time or until interrupted by a signal.
		 * Do not check for signals if we do not want to wait.
		 */
		if (deadline != 0) {
			mutex_enter(&t->t_delay_lock);
			while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv,
			    &t->t_delay_lock, deadline)) > 0)
				continue;
			mutex_exit(&t->t_delay_lock);
			/* 0 from the wait means a signal; -1 means timeout */
			error = (error == 0) ? EINTR : 0;
		}
		goto pollout;
	}

	/* Enforce the process fd resource control before allocating. */
	if (nfds > p->p_fno_ctl) {
		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
		    p->p_rctls, p, RCA_SAFE);
		mutex_exit(&p->p_lock);
		error = EINVAL;
		goto pollout;
	}

	/*
	 * Need to allocate memory for pollstate before anything because
	 * the mutex and cv are created in this space
	 */
	ps = pollstate_create();

	if (ps->ps_pcache == NULL)
		ps->ps_pcache = pcache_alloc();
	pcp = ps->ps_pcache;

	/*
	 * NOTE: for performance, buffers are saved across poll() calls.
	 * The theory is that if a process polls heavily, it tends to poll
	 * on the same set of descriptors.  Therefore, we only reallocate
	 * buffers when nfds changes.  There is no hysteresis control,
	 * because there is no data to suggest that this is necessary;
	 * the penalty of reallocating is not *that* great in any event.
	 */
	old_nfds = ps->ps_nfds;
	if (nfds != old_nfds) {
		kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t));
		pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
		ps->ps_pollfd = pollfdp;
		ps->ps_nfds = nfds;
	}

	pollfdp = ps->ps_pollfd;
	if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) {
		error = EFAULT;
		goto pollout;
	}

	if (fds == NULL) {
		/*
		 * If the process has page 0 mapped, then the copyin() above
		 * will succeed even if fds is NULL.  However, our cached
		 * poll lists are keyed by the address of the passed-in fds
		 * structure, and we use the value NULL to indicate an unused
		 * poll cache list entry.  As such, we elect not to support
		 * NULL as a valid (user) memory address and fail the poll()
		 * call.
		 */
		error = EINVAL;
		goto pollout;
	}

	/*
	 * If this thread polls for the first time, allocate ALL poll
	 * cache data structures and cache the poll fd list.  This
	 * allocation is delayed till now because lwp's polling 0 fd
	 * (i.e. using poll as timeout()) don't need this memory.
	 */
	mutex_enter(&ps->ps_lock);
	pcp = ps->ps_pcache;
	ASSERT(pcp != NULL);
	if (pcp->pc_bitmap == NULL) {
		pcache_create(pcp, nfds);
		/*
		 * poll and cache this poll fd list in ps_pcacheset[0].
		 */
		error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex);
		if (fdcnt || error) {
			mutex_exit(&ps->ps_lock);
			goto pollout;
		}
	} else {
		pollcacheset_t *pcset = ps->ps_pcacheset;

		/*
		 * Not first time polling. Select a cached poll list by
		 * matching user pollfd list buffer address.
		 */
		for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) {
			if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) {
				if ((++pcset[cacheindex].pcs_count) == 0) {
					/*
					 * counter is wrapping around.
					 */
					pcacheset_reset_count(ps, cacheindex);
				}
				/*
				 * examine and resolve possible
				 * difference of the current poll
				 * list and previously cached one.
				 * If there is an error during resolve(),
				 * the callee will guarantee the consistency
				 * of cached poll list and cache content.
				 */
				error = pcacheset_resolve(ps, nfds, &fdcnt,
				    cacheindex);
				if (error) {
					mutex_exit(&ps->ps_lock);
					goto pollout;
				}
				break;
			}

			/*
			 * Note that the pcs_usradr field of a used entry
			 * won't be 0 because it stores the address of the
			 * passed-in fds, and 0 fds will not be cached
			 * (then it is either the special timeout case when
			 * nfds is 0 or it returns failure directly).
			 */
			if (pcset[cacheindex].pcs_usradr == (uintptr_t)NULL) {
				/*
				 * found an unused entry. Use it to cache
				 * this poll list.
				 */
				error = pcacheset_cache_list(ps, fds, &fdcnt,
				    cacheindex);
				if (fdcnt || error) {
					mutex_exit(&ps->ps_lock);
					goto pollout;
				}
				break;
			}
		}
		if (cacheindex == ps->ps_nsets) {
			/*
			 * We failed to find a matching cached poll fd list.
			 * replace an old list.
			 */
			pollstats.polllistmiss.value.ui64++;
			cacheindex = pcacheset_replace(ps);
			ASSERT(cacheindex < ps->ps_nsets);
			pcset[cacheindex].pcs_usradr = (uintptr_t)fds;
			error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex);
			if (error) {
				mutex_exit(&ps->ps_lock);
				goto pollout;
			}
		}
	}

	/*
	 * Always scan the bitmap with the lock on the pollcache held.
	 * This is to make sure that a wakeup does not come undetected.
	 * If the lock is not held, a pollwakeup could have come for an
	 * fd we already checked but before this thread sleeps, in which
	 * case the wakeup is missed. Now we hold the pcache lock and
	 * check the bitmap again. This will prevent wakeup from happening
	 * while we hold pcache lock since pollwakeup() will also lock
	 * the pcache before updating poll bitmap.
	 */
	mutex_enter(&pcp->pc_lock);
	for (;;) {
		pcp->pc_flag = 0;
		error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex);
		if (fdcnt || error) {
			mutex_exit(&pcp->pc_lock);
			mutex_exit(&ps->ps_lock);
			break;
		}

		/*
		 * If PC_POLLWAKE is set, a pollwakeup() was performed on
		 * one of the file descriptors.  This can happen only if
		 * one of the VOP_POLL() functions dropped pcp->pc_lock.
		 * The only current cases of this is in procfs (prpoll())
		 * and STREAMS (strpoll()).
		 */
		if (pcp->pc_flag & PC_POLLWAKE)
			continue;

		/*
		 * If you get here, the poll of fds was unsuccessful.
		 * Wait until some fd becomes readable, writable, or gets
		 * an exception, or until a signal or a timeout occurs.
		 * Do not check for signals if we have a zero timeout.
		 */
		mutex_exit(&ps->ps_lock);
		if (deadline == 0) {
			error = -1;
		} else {
			error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
			    &pcp->pc_lock, deadline);
		}
		mutex_exit(&pcp->pc_lock);
		/*
		 * If we have received a signal or timed out
		 * then break out and return.
		 */
		if (error <= 0) {
			error = (error == 0) ? EINTR : 0;
			break;
		}
		/*
		 * We have not received a signal or timed out.
		 * Continue around and poll fds again.
		 */
		mutex_enter(&ps->ps_lock);
		mutex_enter(&pcp->pc_lock);
	}

pollout:
	/*
	 * If we changed the signal mask but we received
	 * no signal then restore the signal mask.
	 * Otherwise psig() will deal with the signal mask.
	 */
	if (ksetp != NULL) {
		mutex_enter(&p->p_lock);
		if (lwp->lwp_cursig == 0) {
			t->t_hold = lwp->lwp_sigoldmask;
			t->t_flag &= ~T_TOMASK;
		}
		mutex_exit(&p->p_lock);
	}

	if (error)
		return (set_errno(error));

	/*
	 * Copy out the events and return the fdcnt to the user.
	 */
	if (nfds != 0 &&
	    copyout(pollfdp, fds, nfds * sizeof (pollfd_t)))
		return (set_errno(EFAULT));

#ifdef DEBUG
	/*
	 * Another sanity check: fdcnt must equal the number of entries
	 * with non-zero revents (negative fds always report none).
	 */
	if (fdcnt) {
		int reventcnt = 0;

		for (i = 0; i < nfds; i++) {
			if (pollfdp[i].fd < 0) {
				ASSERT(pollfdp[i].revents == 0);
				continue;
			}
			if (pollfdp[i].revents) {
				reventcnt++;
			}
		}
		ASSERT(fdcnt == reventcnt);
	} else {
		for (i = 0; i < nfds; i++) {
			ASSERT(pollfdp[i].revents == 0);
		}
	}
#endif	/* DEBUG */

	return (fdcnt);
}

/*
 * This is the system call trap that poll(),
 * select() and pselect() are built upon.
 * It is a private interface between libc and the kernel.
 */
int
pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp)
{
	timespec_t ts;
	timespec_t *tsp;
	sigset_t set;
	k_sigset_t kset;
	k_sigset_t *ksetp;
	model_t datamodel = get_udatamodel();

	/* Copy in and validate the (optional) user timeout. */
	if (timeoutp == NULL)
		tsp = NULL;
	else {
		if (datamodel == DATAMODEL_NATIVE) {
			if (copyin(timeoutp, &ts, sizeof (ts)))
				return (set_errno(EFAULT));
		} else {
			/* 32-bit caller on a 64-bit kernel: widen the spec */
			timespec32_t ts32;

			if (copyin(timeoutp, &ts32, sizeof (ts32)))
				return (set_errno(EFAULT));
			TIMESPEC32_TO_TIMESPEC(&ts, &ts32)
		}

		if (itimerspecfix(&ts))
			return (set_errno(EINVAL));
		tsp = &ts;
	}

	/* Copy in and convert the (optional) user signal mask. */
	if (setp == NULL)
		ksetp = NULL;
	else {
		if (copyin(setp, &set, sizeof (set)))
			return (set_errno(EFAULT));
		sigutok(&set, &kset);
		ksetp = &kset;
	}

	return (poll_common(fds, nfds, tsp, ksetp));
}

/*
 * Clean up any state left around by poll(2). Called when a thread exits.
 */
void
pollcleanup()
{
	pollstate_t *ps = curthread->t_pollstate;
	pollcache_t *pcp;

	if (ps == NULL)
		return;
	pcp = ps->ps_pcache;
	/*
	 * free up all cached poll fds
	 */
	if (pcp == NULL) {
		/* this pollstate is used by /dev/poll */
		goto pollcleanout;
	}

	if (pcp->pc_bitmap != NULL) {
		ASSERT(MUTEX_NOT_HELD(&ps->ps_lock));
		/*
		 * a close lwp can race with us when cleaning up a polldat
		 * entry. We hold the ps_lock when cleaning hash table.
		 * Since this pollcache is going away anyway, there is no
		 * need to hold the pc_lock.
		 */
		mutex_enter(&ps->ps_lock);
		pcache_clean(pcp);
		mutex_exit(&ps->ps_lock);
#ifdef DEBUG
		/*
		 * At this point, all fds cached by this lwp should be
		 * cleaned up. There should be no fd in fi_list still
		 * referencing this thread.
		 */
		checkfpollinfo();	/* sanity check */
		pollcheckphlist();	/* sanity check */
#endif	/* DEBUG */
	}
	/*
	 * Be sure no one is referencing thread before exiting
	 * (pollwakeup bumps pc_busy while it juggles locks; wait it out).
	 */
	mutex_enter(&pcp->pc_no_exit);
	ASSERT(pcp->pc_busy >= 0);
	while (pcp->pc_busy > 0)
		cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
	mutex_exit(&pcp->pc_no_exit);
pollcleanout:
	pollstate_destroy(ps);
	curthread->t_pollstate = NULL;
}

/*
 * pollwakeup() - poke threads waiting in poll() for some event
 * on a particular object.
 *
 * The threads hanging off of the specified pollhead structure are scanned.
 * If their event mask matches the specified event(s), then pollnotify() is
 * called to poke the thread.
 *
 * Multiple events may be specified. When POLLHUP or POLLERR are specified,
 * all waiting threads are poked.
 *
 * It is important that pollnotify() not drop the lock protecting the list
 * of threads.
 */
void
pollwakeup(pollhead_t *php, short events_arg)
{
	polldat_t	*pdp;
	int		events = (ushort_t)events_arg;
	struct plist {
		port_t *pp;
		int	pevents;
		struct plist *next;
	};
	struct plist *plhead = NULL, *pltail = NULL;

retry:
	PH_ENTER(php);

	for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) {
		if ((pdp->pd_events & events) ||
		    (events & (POLLHUP | POLLERR))) {

			pollcache_t	*pcp;

			if (pdp->pd_portev != NULL) {
				port_kevent_t	*pkevp = pdp->pd_portev;
				/*
				 * Object (fd) is associated with an event port,
				 * => send event notification to the port.
				 */
				ASSERT(pkevp->portkev_source == PORT_SOURCE_FD);
				mutex_enter(&pkevp->portkev_lock);
				if (pkevp->portkev_flags & PORT_KEV_VALID) {
					int pevents;

					pkevp->portkev_flags &= ~PORT_KEV_VALID;
					pkevp->portkev_events |= events &
					    (pdp->pd_events | POLLHUP |
					    POLLERR);
					/*
					 * portkev_lock mutex will be released
					 * by port_send_event().
					 */
					port_send_event(pkevp);

					/*
					 * If we have some thread polling the
					 * port's fd, add it to the list. They
					 * will be notified later.
					 * The port_pollwkup() will flag the
					 * port_t so that it will not disappear
					 * till port_pollwkdone() is called.
					 */
					pevents =
					    port_pollwkup(pkevp->portkev_port);
					if (pevents) {
						struct plist	*t;
						t = kmem_zalloc(
						    sizeof (struct plist),
						    KM_SLEEP);
						t->pp = pkevp->portkev_port;
						t->pevents = pevents;
						if (plhead == NULL) {
							plhead = t;
						} else {
							pltail->next = t;
						}
						pltail = t;
					}
				} else {
					mutex_exit(&pkevp->portkev_lock);
				}
				continue;
			}

			pcp = pdp->pd_pcache;

			/*
			 * Try to grab the lock for this thread. If
			 * we don't get it then we may deadlock so
			 * back out and restart all over again. Note
			 * that the failure rate is very very low.
			 */
			if (mutex_tryenter(&pcp->pc_lock)) {
				pollnotify(pcp, pdp->pd_fd);
				mutex_exit(&pcp->pc_lock);
			} else {
				/*
				 * We are here because:
				 *	1) This thread has been woken up
				 *	   and is trying to get out of poll().
				 *	2) Some other thread is also here
				 *	   but with a different pollhead lock.
				 *
				 * So, we need to drop the lock on pollhead
				 * because of (1) but we want to prevent
				 * that thread from doing lwp_exit() or
				 * devpoll close. We want to ensure that
				 * the pollcache pointer is still valid.
				 *
				 * Solution: Grab the pcp->pc_no_exit lock,
				 * increment the pc_busy counter, drop every
				 * lock in sight. Get out of the way and wait
				 * for type (2) threads to finish.
				 */

				mutex_enter(&pcp->pc_no_exit);
				pcp->pc_busy++;	/* prevents exit()'s */
				mutex_exit(&pcp->pc_no_exit);

				PH_EXIT(php);
				/*
				 * Enter/exit pc_lock purely to wait for the
				 * current holder to release it.
				 */
				mutex_enter(&pcp->pc_lock);
				mutex_exit(&pcp->pc_lock);
				mutex_enter(&pcp->pc_no_exit);
				pcp->pc_busy--;
				if (pcp->pc_busy == 0) {
					/*
					 * Wakeup the thread waiting in
					 * thread_exit().
					 */
					cv_signal(&pcp->pc_busy_cv);
				}
				mutex_exit(&pcp->pc_no_exit);
				goto retry;
			}
		}
	}


	/*
	 * Event ports - If this php is of the port on the list,
	 * call port_pollwkdone() to release it. The port_pollwkdone()
	 * needs to be called before dropping the PH lock so that any new
	 * thread attempting to poll this port is blocked. There can be
	 * only one thread here in pollwakeup notifying this port's fd.
	 */
	if (plhead != NULL && &plhead->pp->port_pollhd == php) {
		struct plist *t;
		port_pollwkdone(plhead->pp);
		t = plhead;
		plhead = plhead->next;
		kmem_free(t, sizeof (struct plist));
	}
	PH_EXIT(php);

	/*
	 * Event ports - Notify threads polling the event port's fd.
	 * This is normally done in port_send_event() where it calls
	 * pollwakeup() on the port. But, for PORT_SOURCE_FD source alone,
	 * we do it here in pollwakeup() to avoid a recursive call.
	 */
	if (plhead != NULL) {
		php = &plhead->pp->port_pollhd;
		events = plhead->pevents;
		goto retry;
	}
}

/*
 * This function is called to inform a thread (or threads) that an event being
 * polled on has occurred. The pollstate lock on the thread should be held
 * on entry.
 */
void
pollnotify(pollcache_t *pcp, int fd)
{
	ASSERT(fd < pcp->pc_mapsize);
	ASSERT(MUTEX_HELD(&pcp->pc_lock));
	/* Mark the fd ready in the bitmap and wake every waiter. */
	BT_SET(pcp->pc_bitmap, fd);
	pcp->pc_flag |= PC_POLLWAKE;
	cv_broadcast(&pcp->pc_cv);
	pcache_wake_parents(pcp);
}

/*
 * Associate a polldat entry with a pollhead (add it to ph_list).
 *
 * The polldat struct is used by pollwakeup to wake sleeping pollers when polled
 * events has happened.
 */
void
polldat_associate(polldat_t *pdp, pollhead_t *php)
{
	/* Caller must hand us a polldat that is not already associated. */
	ASSERT3P(pdp->pd_php, ==, NULL);
	ASSERT3P(pdp->pd_next, ==, NULL);

	PH_ENTER(php);
#ifdef DEBUG
	/* The polldat should not be already on the list */
	for (polldat_t *wp = php->ph_list; wp != NULL; wp = wp->pd_next) {
		ASSERT3P(wp, !=, pdp);
	}
#endif	/* DEBUG */

	/* Push onto the head of the pollhead's singly-linked list. */
	pdp->pd_next = php->ph_list;
	php->ph_list = pdp;
	pdp->pd_php = php;
	PH_EXIT(php);
}

/*
 * Disassociate a polldat from its pollhead (if such an association exists).
 */
void
polldat_disassociate(polldat_t *pdp)
{
	pollhead_t *php;

	/*
	 * Acquire the lock for the pollhead which this polldat is associated
	 * with.  This must be done with care, re-checking pd_php after entering
	 * the pollhead lock, since a racing pollhead_clean() could have already
	 * performed the disassociation.
	 */
	for (;;) {
		php = pdp->pd_php;
		if (php == NULL) {
			/* polldat is not associated with a pollhead */
			return;
		}

		/*
		 * The lock for a given pollhead is not stored in the pollhead
		 * itself, but is rather a global entry in an array (plocks)
		 * which the pollhead pointer hashes into (see: PHLOCK()).
		 */
		PH_ENTER(php);
		if (pdp->pd_php == php) {
			break;
		}
		/* pd_php changed while we waited for the lock; retry. */
		PH_EXIT(php);
	}

	/* Walk ph_list with a trailing link pointer so we can unlink. */
	polldat_t **wpp = &php->ph_list, *wp = php->ph_list;
	while (wp != NULL) {
		if (wp == pdp) {
			/* Unlink the polldat from the list */
			*wpp = pdp->pd_next;
			pdp->pd_next = NULL;
			break;
		}
		wpp = &wp->pd_next;
		wp = wp->pd_next;
	}

#ifdef DEBUG
	/* It would be unexpected if pdp was not in the pollhead list */
	ASSERT(wp != NULL);

	/* Assert that pdp is not duplicated somewhere later in the list */
	for (wp = *wpp; wp; wp = wp->pd_next) {
		ASSERT(wp != pdp);
	}
#endif	/* DEBUG */

	pdp->pd_php = NULL;
	PH_EXIT(php);
}

/*
 * walk through the poll fd lists to see if they are identical. This is an
 * expensive operation and should not be done more than once for each poll()
 * call.
 *
 * As an optimization (i.e., not having to go through the lists more than
 * once), this routine also clear the revents field of pollfd in 'current'.
 * Zeroing out the revents field of each entry in current poll list is
 * required by poll man page.
 *
 * Since the events field of cached list has illegal poll events filtered
 * out, the current list applies the same filtering before comparison.
 *
 * The routine stops when it detects a meaningful difference, or when it
 * exhausts the lists.  Returns the index of the first differing entry
 * (== n when the lists match completely).
 */
int
pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n)
{
	int ix;

	for (ix = 0; ix < n; ix++) {
		/* Prefetch 64 bytes worth of 8-byte elements */
		if ((ix & 0x7) == 0) {
			prefetch_write_many((caddr_t)&current[ix + 8]);
			prefetch_write_many((caddr_t)&cached[ix + 8]);
		}
		if (current[ix].fd == cached[ix].fd) {
			/*
			 * Filter out invalid poll events while we are
			 * inside the loop.
			 */
			if (current[ix].events & ~VALID_POLL_EVENTS) {
				current[ix].events &= VALID_POLL_EVENTS;
				if (newlist != NULL)
					newlist[ix].events = current[ix].events;
			}
			if (current[ix].events == cached[ix].events) {
				current[ix].revents = 0;
				continue;
			}
		}
		/* Both entries negative (ignored fd) count as a match. */
		if ((current[ix].fd < 0) && (cached[ix].fd < 0)) {
			current[ix].revents = 0;
			continue;
		}
		return (ix);
	}
	return (ix);
}

/*
 * This routine returns a pointer to a cached poll fd entry, or NULL if it
 * does not find it in the hash table.
 */
polldat_t *
pcache_lookup_fd(pollcache_t *pcp, int fd)
{
	int hashindex;
	polldat_t *pdp;

	hashindex = POLLHASH(pcp->pc_hashsize, fd);
	pdp = pcp->pc_hash[hashindex];
	while (pdp != NULL) {
		if (pdp->pd_fd == fd)
			break;
		pdp = pdp->pd_hashnext;
	}
	return (pdp);
}

/*
 * Allocate a zeroed polldat, with room for `nsets' cross-reference entries
 * (pd_ref) when nsets > 0.  KM_SLEEP: never fails, may block.
 */
polldat_t *
pcache_alloc_fd(int nsets)
{
	polldat_t *pdp;

	pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP);
	if (nsets > 0) {
		pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP);
		pdp->pd_nsets = nsets;
	}
	return (pdp);
}

/*
 * This routine inserts a polldat into the pollcache's hash table. It
 * may be necessary to grow the size of the hash table.
1108 */ 1109 void 1110 pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds) 1111 { 1112 int hashindex; 1113 int fd; 1114 1115 if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) || 1116 (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) { 1117 pcache_grow_hashtbl(pcp, nfds); 1118 } 1119 fd = pdp->pd_fd; 1120 hashindex = POLLHASH(pcp->pc_hashsize, fd); 1121 pdp->pd_hashnext = pcp->pc_hash[hashindex]; 1122 pcp->pc_hash[hashindex] = pdp; 1123 pcp->pc_fdcount++; 1124 1125 #ifdef DEBUG 1126 { 1127 /* 1128 * same fd should not appear on a hash list twice 1129 */ 1130 polldat_t *pdp1; 1131 for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) { 1132 ASSERT(pdp->pd_fd != pdp1->pd_fd); 1133 } 1134 } 1135 #endif /* DEBUG */ 1136 } 1137 1138 /* 1139 * Grow the hash table -- either double the table size or round it to the 1140 * nearest multiples of POLLHASHCHUNKSZ, whichever is bigger. Rehash all the 1141 * elements on the hash table. 1142 */ 1143 void 1144 pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds) 1145 { 1146 int oldsize; 1147 polldat_t **oldtbl; 1148 polldat_t *pdp, *pdp1; 1149 int i; 1150 #ifdef DEBUG 1151 int count = 0; 1152 #endif 1153 1154 ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0); 1155 oldsize = pcp->pc_hashsize; 1156 oldtbl = pcp->pc_hash; 1157 if (nfds > pcp->pc_hashsize * POLLHASHINC) { 1158 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 1159 ~(POLLHASHCHUNKSZ - 1); 1160 } else { 1161 pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC; 1162 } 1163 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 1164 KM_SLEEP); 1165 /* 1166 * rehash existing elements 1167 */ 1168 pcp->pc_fdcount = 0; 1169 for (i = 0; i < oldsize; i++) { 1170 pdp = oldtbl[i]; 1171 while (pdp != NULL) { 1172 pdp1 = pdp->pd_hashnext; 1173 pcache_insert_fd(pcp, pdp, nfds); 1174 pdp = pdp1; 1175 #ifdef DEBUG 1176 count++; 1177 #endif 1178 } 1179 } 1180 kmem_free(oldtbl, oldsize * sizeof (polldat_t *)); 1181 ASSERT(pcp->pc_fdcount == count); 1182 
} 1183 1184 void 1185 pcache_grow_map(pollcache_t *pcp, int fd) 1186 { 1187 int newsize; 1188 ulong_t *newmap; 1189 1190 /* 1191 * grow to nearest multiple of POLLMAPCHUNK, assuming POLLMAPCHUNK is 1192 * power of 2. 1193 */ 1194 newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1); 1195 newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t), 1196 KM_SLEEP); 1197 /* 1198 * don't want pollwakeup to set a bit while growing the bitmap. 1199 */ 1200 ASSERT(mutex_owned(&pcp->pc_lock) == 0); 1201 mutex_enter(&pcp->pc_lock); 1202 bcopy(pcp->pc_bitmap, newmap, 1203 (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t)); 1204 kmem_free(pcp->pc_bitmap, 1205 (pcp->pc_mapsize /BT_NBIPUL) * sizeof (ulong_t)); 1206 pcp->pc_bitmap = newmap; 1207 pcp->pc_mapsize = newsize; 1208 mutex_exit(&pcp->pc_lock); 1209 } 1210 1211 /* 1212 * remove all the reference from pollhead list and fpollinfo lists. 1213 */ 1214 void 1215 pcache_clean(pollcache_t *pcp) 1216 { 1217 int i; 1218 polldat_t **hashtbl; 1219 polldat_t *pdp; 1220 1221 ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock)); 1222 hashtbl = pcp->pc_hash; 1223 for (i = 0; i < pcp->pc_hashsize; i++) { 1224 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 1225 polldat_disassociate(pdp); 1226 if (pdp->pd_fp != NULL) { 1227 delfpollinfo(pdp->pd_fd); 1228 pdp->pd_fp = NULL; 1229 } 1230 } 1231 } 1232 } 1233 1234 void 1235 pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp) 1236 { 1237 int i; 1238 int fd = pdp->pd_fd; 1239 1240 /* 1241 * we come here because an earlier close() on this cached poll fd. 
1242 */ 1243 ASSERT(pdp->pd_fp == NULL); 1244 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1245 pdp->pd_events = 0; 1246 for (i = 0; i < ps->ps_nsets; i++) { 1247 xref_t *refp; 1248 pollcacheset_t *pcsp; 1249 1250 ASSERT(pdp->pd_ref != NULL); 1251 refp = &pdp->pd_ref[i]; 1252 if (refp->xf_refcnt) { 1253 ASSERT(refp->xf_position >= 0); 1254 pcsp = &ps->ps_pcacheset[i]; 1255 if (refp->xf_refcnt == 1) { 1256 pcsp->pcs_pollfd[refp->xf_position].fd = -1; 1257 refp->xf_refcnt = 0; 1258 pdp->pd_count--; 1259 } else if (refp->xf_refcnt > 1) { 1260 int j; 1261 1262 /* 1263 * turn off every appearance in pcs_pollfd list 1264 */ 1265 for (j = refp->xf_position; 1266 j < pcsp->pcs_nfds; j++) { 1267 if (pcsp->pcs_pollfd[j].fd == fd) { 1268 pcsp->pcs_pollfd[j].fd = -1; 1269 refp->xf_refcnt--; 1270 pdp->pd_count--; 1271 } 1272 } 1273 } 1274 ASSERT(refp->xf_refcnt == 0); 1275 refp->xf_position = POLLPOSINVAL; 1276 } 1277 } 1278 ASSERT(pdp->pd_count == 0); 1279 } 1280 1281 /* 1282 * Insert poll fd into the pollcache, and add poll registration. 1283 * This routine is called after getf() and before releasef(). So the vnode 1284 * can not disappear even if we block here. 1285 * If there is an error, the polled fd is not cached. 1286 */ 1287 int 1288 pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, 1289 ssize_t pos, int which) 1290 { 1291 pollcache_t *pcp = ps->ps_pcache; 1292 polldat_t *pdp; 1293 int error; 1294 int fd; 1295 pollhead_t *memphp = NULL; 1296 xref_t *refp; 1297 int newpollfd = 0; 1298 1299 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1300 /* 1301 * The poll caching uses the existing VOP_POLL interface. If there 1302 * is no polled events, we want the polled device to set its "some 1303 * one is sleeping in poll" flag. When the polled events happen 1304 * later, the driver will call pollwakeup(). We achieve this by 1305 * always passing 0 in the third parameter ("anyyet") when calling 1306 * VOP_POLL. 
This parameter is not looked at by drivers when the 1307 * polled events exist. If a driver chooses to ignore this parameter 1308 * and call pollwakeup whenever the polled events happen, that will 1309 * be OK too. 1310 */ 1311 ASSERT(curthread->t_pollcache == NULL); 1312 error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents, 1313 &memphp, NULL); 1314 if (error) { 1315 return (error); 1316 } 1317 if (pollfdp->revents) { 1318 (*fdcntp)++; 1319 } 1320 /* 1321 * polling the underlying device succeeded. Now we can cache it. 1322 * A close can't come in here because we have not done a releasef() 1323 * yet. 1324 */ 1325 fd = pollfdp->fd; 1326 pdp = pcache_lookup_fd(pcp, fd); 1327 if (pdp == NULL) { 1328 ASSERT(ps->ps_nsets > 0); 1329 pdp = pcache_alloc_fd(ps->ps_nsets); 1330 newpollfd = 1; 1331 } 1332 /* 1333 * If this entry was used to cache a poll fd which was closed, and 1334 * this entry has not been cleaned, do it now. 1335 */ 1336 if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) { 1337 pcacheset_invalidate(ps, pdp); 1338 ASSERT(pdp->pd_next == NULL); 1339 } 1340 if (pdp->pd_count == 0) { 1341 pdp->pd_fd = fd; 1342 pdp->pd_fp = fp; 1343 addfpollinfo(fd); 1344 pdp->pd_thread = curthread; 1345 pdp->pd_pcache = pcp; 1346 /* 1347 * the entry is never used or cleared by removing a cached 1348 * pollfd (pcache_delete_fd). So all the fields should be clear. 1349 */ 1350 ASSERT(pdp->pd_next == NULL); 1351 } 1352 1353 /* 1354 * A polled fd is considered cached. So there should be a fpollinfo 1355 * entry on uf_fpollinfo list. 1356 */ 1357 ASSERT(infpollinfo(fd)); 1358 /* 1359 * If there is an inconsistency, we want to know it here. 1360 */ 1361 ASSERT(pdp->pd_fp == fp); 1362 1363 /* 1364 * XXX pd_events is a union of all polled events on this fd, possibly 1365 * by different threads. Unless this is a new first poll(), pd_events 1366 * never shrinks. If an event is no longer polled by a process, there 1367 * is no way to cancel that event. 
In that case, poll degrade to its 1368 * old form -- polling on this fd every time poll() is called. The 1369 * assumption is an app always polls the same type of events. 1370 */ 1371 pdp->pd_events |= pollfdp->events; 1372 1373 pdp->pd_count++; 1374 /* 1375 * There is not much special handling for multiple appearances of 1376 * same fd other than xf_position always recording the first 1377 * appearance in poll list. If this is called from pcacheset_cache_list, 1378 * a VOP_POLL is called on every pollfd entry; therefore each 1379 * revents and fdcnt should be set correctly. If this is called from 1380 * pcacheset_resolve, we don't care about fdcnt here. Pollreadmap will 1381 * pick up the right count and handle revents field of each pollfd 1382 * entry. 1383 */ 1384 ASSERT(pdp->pd_ref != NULL); 1385 refp = &pdp->pd_ref[which]; 1386 if (refp->xf_refcnt == 0) { 1387 refp->xf_position = pos; 1388 } else { 1389 /* 1390 * xf_position records the fd's first appearance in poll list 1391 */ 1392 if (pos < refp->xf_position) { 1393 refp->xf_position = pos; 1394 } 1395 } 1396 ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd); 1397 refp->xf_refcnt++; 1398 if (fd >= pcp->pc_mapsize) { 1399 pcache_grow_map(pcp, fd); 1400 } 1401 if (fd > pcp->pc_mapend) { 1402 pcp->pc_mapend = fd; 1403 } 1404 if (newpollfd != 0) { 1405 pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds); 1406 } 1407 if (memphp) { 1408 if (pdp->pd_php == NULL) { 1409 polldat_associate(pdp, memphp); 1410 } else { 1411 if (memphp != pdp->pd_php) { 1412 /* 1413 * layered devices (e.g. console driver) 1414 * may change the vnode and thus the pollhead 1415 * pointer out from underneath us. 1416 */ 1417 polldat_disassociate(pdp); 1418 polldat_associate(pdp, memphp); 1419 } 1420 } 1421 } 1422 /* 1423 * Since there is a considerable window between VOP_POLL and when 1424 * we actually put the polldat struct on the pollhead list, we could 1425 * miss a pollwakeup. 
In the case of polling additional events, we 1426 * don't update the events until after VOP_POLL. So we could miss 1427 * pollwakeup there too. So we always set the bit here just to be 1428 * safe. The real performance gain is in subsequent pcache_poll. 1429 */ 1430 mutex_enter(&pcp->pc_lock); 1431 BT_SET(pcp->pc_bitmap, fd); 1432 mutex_exit(&pcp->pc_lock); 1433 return (0); 1434 } 1435 1436 /* 1437 * The entry is not really deleted. The fields are cleared so that the 1438 * entry is no longer useful, but it will remain in the hash table for reuse 1439 * later. It will be freed when the polling lwp exits. 1440 */ 1441 int 1442 pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent) 1443 { 1444 pollcache_t *pcp = ps->ps_pcache; 1445 polldat_t *pdp; 1446 xref_t *refp; 1447 1448 ASSERT(fd < pcp->pc_mapsize); 1449 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1450 1451 pdp = pcache_lookup_fd(pcp, fd); 1452 ASSERT(pdp != NULL); 1453 ASSERT(pdp->pd_count > 0); 1454 ASSERT(pdp->pd_ref != NULL); 1455 refp = &pdp->pd_ref[which]; 1456 if (pdp->pd_count == 1) { 1457 pdp->pd_events = 0; 1458 refp->xf_position = POLLPOSINVAL; 1459 ASSERT(refp->xf_refcnt == 1); 1460 refp->xf_refcnt = 0; 1461 1462 /* 1463 * It is possible for a wakeup thread to get ahead of the 1464 * following polldat_disassociate and set the bit in bitmap. 1465 * That is OK because the bit will be cleared here anyway. 1466 */ 1467 polldat_disassociate(pdp); 1468 1469 pdp->pd_count = 0; 1470 if (pdp->pd_fp != NULL) { 1471 pdp->pd_fp = NULL; 1472 delfpollinfo(fd); 1473 } 1474 mutex_enter(&pcp->pc_lock); 1475 BT_CLEAR(pcp->pc_bitmap, fd); 1476 mutex_exit(&pcp->pc_lock); 1477 return (0); 1478 } 1479 if ((cevent & POLLCLOSED) == POLLCLOSED) { 1480 /* 1481 * fd cached here has been closed. This is the first 1482 * pcache_delete_fd called after the close. Clean up the 1483 * entire entry. 
1484 */ 1485 pcacheset_invalidate(ps, pdp); 1486 ASSERT(pdp->pd_php == NULL); 1487 mutex_enter(&pcp->pc_lock); 1488 BT_CLEAR(pcp->pc_bitmap, fd); 1489 mutex_exit(&pcp->pc_lock); 1490 return (0); 1491 } 1492 #ifdef DEBUG 1493 if (getf(fd) != NULL) { 1494 ASSERT(infpollinfo(fd)); 1495 releasef(fd); 1496 } 1497 #endif /* DEBUG */ 1498 pdp->pd_count--; 1499 ASSERT(refp->xf_refcnt > 0); 1500 if (--refp->xf_refcnt == 0) { 1501 refp->xf_position = POLLPOSINVAL; 1502 } else { 1503 ASSERT(pos >= refp->xf_position); 1504 if (pos == refp->xf_position) { 1505 /* 1506 * The xref position is no longer valid. 1507 * Reset it to a special value and let 1508 * caller know it needs to updatexref() 1509 * with a new xf_position value. 1510 */ 1511 refp->xf_position = POLLPOSTRANS; 1512 return (1); 1513 } 1514 } 1515 return (0); 1516 } 1517 1518 void 1519 pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which) 1520 { 1521 polldat_t *pdp; 1522 1523 pdp = pcache_lookup_fd(pcp, fd); 1524 ASSERT(pdp != NULL); 1525 ASSERT(pdp->pd_ref != NULL); 1526 pdp->pd_ref[which].xf_position = pos; 1527 } 1528 1529 #ifdef DEBUG 1530 /* 1531 * For each polled fd, it's either in the bitmap or cached in 1532 * pcache hash table. If this routine returns 0, something is wrong. 
1533 */ 1534 static int 1535 pollchecksanity(pollstate_t *ps, nfds_t nfds) 1536 { 1537 int i; 1538 int fd; 1539 pollcache_t *pcp = ps->ps_pcache; 1540 polldat_t *pdp; 1541 pollfd_t *pollfdp = ps->ps_pollfd; 1542 file_t *fp; 1543 1544 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1545 for (i = 0; i < nfds; i++) { 1546 fd = pollfdp[i].fd; 1547 if (fd < 0) { 1548 ASSERT(pollfdp[i].revents == 0); 1549 continue; 1550 } 1551 if (pollfdp[i].revents == POLLNVAL) 1552 continue; 1553 if ((fp = getf(fd)) == NULL) 1554 continue; 1555 pdp = pcache_lookup_fd(pcp, fd); 1556 ASSERT(pdp != NULL); 1557 ASSERT(infpollinfo(fd)); 1558 ASSERT(pdp->pd_fp == fp); 1559 releasef(fd); 1560 if (BT_TEST(pcp->pc_bitmap, fd)) 1561 continue; 1562 if (pdp->pd_php == NULL) 1563 return (0); 1564 } 1565 return (1); 1566 } 1567 #endif /* DEBUG */ 1568 1569 /* 1570 * resolve the difference between the current poll list and a cached one. 1571 */ 1572 int 1573 pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which) 1574 { 1575 int i; 1576 pollcache_t *pcp = ps->ps_pcache; 1577 pollfd_t *newlist = NULL; 1578 pollfd_t *current = ps->ps_pollfd; 1579 pollfd_t *cached; 1580 pollcacheset_t *pcsp; 1581 int common; 1582 int count = 0; 1583 int offset; 1584 int remain; 1585 int fd; 1586 file_t *fp; 1587 int fdcnt = 0; 1588 int cnt = 0; 1589 nfds_t old_nfds; 1590 int error = 0; 1591 int mismatch = 0; 1592 1593 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1594 #ifdef DEBUG 1595 checkpolldat(ps); 1596 #endif 1597 pcsp = &ps->ps_pcacheset[which]; 1598 old_nfds = pcsp->pcs_nfds; 1599 common = (nfds > old_nfds) ? old_nfds : nfds; 1600 if (nfds != old_nfds) { 1601 /* 1602 * the length of poll list has changed. allocate a new 1603 * pollfd list. 1604 */ 1605 newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); 1606 bcopy(current, newlist, sizeof (pollfd_t) * nfds); 1607 } 1608 /* 1609 * Compare the overlapping part of the current fd list with the 1610 * cached one. Whenever a difference is found, resolve it. 
1611 * The comparison is done on the current poll list and the 1612 * cached list. But we may be setting up the newlist to be the 1613 * cached list for next poll. 1614 */ 1615 cached = pcsp->pcs_pollfd; 1616 remain = common; 1617 1618 while (count < common) { 1619 int tmpfd; 1620 pollfd_t *np; 1621 1622 np = (newlist != NULL) ? &newlist[count] : NULL; 1623 offset = pcacheset_cmp(¤t[count], &cached[count], np, 1624 remain); 1625 /* 1626 * Collect stats. If lists are completed the first time, 1627 * it's a hit. Otherwise, it's a partial hit or miss. 1628 */ 1629 if ((count == 0) && (offset == common)) { 1630 pollstats.pollcachehit.value.ui64++; 1631 } else { 1632 mismatch++; 1633 } 1634 count += offset; 1635 if (offset < remain) { 1636 ASSERT(count < common); 1637 ASSERT((current[count].fd != cached[count].fd) || 1638 (current[count].events != cached[count].events)); 1639 /* 1640 * Filter out invalid events. 1641 */ 1642 if (current[count].events & ~VALID_POLL_EVENTS) { 1643 if (newlist != NULL) { 1644 newlist[count].events = 1645 current[count].events &= 1646 VALID_POLL_EVENTS; 1647 } else { 1648 current[count].events &= 1649 VALID_POLL_EVENTS; 1650 } 1651 } 1652 /* 1653 * when resolving a difference, we always remove the 1654 * fd from cache before inserting one into cache. 1655 */ 1656 if (cached[count].fd >= 0) { 1657 tmpfd = cached[count].fd; 1658 if (pcache_delete_fd(ps, tmpfd, count, which, 1659 (uint_t)cached[count].events)) { 1660 /* 1661 * This should be rare but needed for 1662 * correctness. 1663 * 1664 * The first appearance in cached list 1665 * is being "turned off". The same fd 1666 * appear more than once in the cached 1667 * poll list. Find the next one on the 1668 * list and update the cached 1669 * xf_position field. 
1670 */ 1671 for (i = count + 1; i < old_nfds; i++) { 1672 if (cached[i].fd == tmpfd) { 1673 pcache_update_xref(pcp, 1674 tmpfd, (ssize_t)i, 1675 which); 1676 break; 1677 } 1678 } 1679 ASSERT(i <= old_nfds); 1680 } 1681 /* 1682 * In case a new cache list is allocated, 1683 * need to keep both cache lists in sync 1684 * b/c the new one can be freed if we have 1685 * an error later. 1686 */ 1687 cached[count].fd = -1; 1688 if (newlist != NULL) { 1689 newlist[count].fd = -1; 1690 } 1691 } 1692 if ((tmpfd = current[count].fd) >= 0) { 1693 /* 1694 * add to the cached fd tbl and bitmap. 1695 */ 1696 if ((fp = getf(tmpfd)) == NULL) { 1697 current[count].revents = POLLNVAL; 1698 if (newlist != NULL) { 1699 newlist[count].fd = -1; 1700 } 1701 cached[count].fd = -1; 1702 fdcnt++; 1703 } else { 1704 /* 1705 * Here we don't care about the 1706 * fdcnt. We will examine the bitmap 1707 * later and pick up the correct 1708 * fdcnt there. So we never bother 1709 * to check value of 'cnt'. 1710 */ 1711 error = pcache_insert(ps, fp, 1712 ¤t[count], &cnt, 1713 (ssize_t)count, which); 1714 /* 1715 * if no error, we want to do releasef 1716 * after we updated cache poll list 1717 * entry so that close() won't race 1718 * us. 1719 */ 1720 if (error) { 1721 /* 1722 * If we encountered an error, 1723 * we have invalidated an 1724 * entry in cached poll list 1725 * (in pcache_delete_fd() above) 1726 * but failed to add one here. 1727 * This is OK b/c what's in the 1728 * cached list is consistent 1729 * with content of cache. 1730 * It will not have any ill 1731 * effect on next poll(). 1732 */ 1733 releasef(tmpfd); 1734 if (newlist != NULL) { 1735 kmem_free(newlist, 1736 nfds * 1737 sizeof (pollfd_t)); 1738 } 1739 return (error); 1740 } 1741 /* 1742 * If we have allocated a new(temp) 1743 * cache list, we need to keep both 1744 * in sync b/c the new one can be freed 1745 * if we have an error later. 
1746 */ 1747 if (newlist != NULL) { 1748 newlist[count].fd = 1749 current[count].fd; 1750 newlist[count].events = 1751 current[count].events; 1752 } 1753 cached[count].fd = current[count].fd; 1754 cached[count].events = 1755 current[count].events; 1756 releasef(tmpfd); 1757 } 1758 } else { 1759 current[count].revents = 0; 1760 } 1761 count++; 1762 remain = common - count; 1763 } 1764 } 1765 if (mismatch != 0) { 1766 if (mismatch == common) { 1767 pollstats.pollcachemiss.value.ui64++; 1768 } else { 1769 pollstats.pollcachephit.value.ui64++; 1770 } 1771 } 1772 /* 1773 * take care of the non overlapping part of a list 1774 */ 1775 if (nfds > old_nfds) { 1776 ASSERT(newlist != NULL); 1777 for (i = old_nfds; i < nfds; i++) { 1778 /* filter out invalid events */ 1779 if (current[i].events & ~VALID_POLL_EVENTS) { 1780 newlist[i].events = current[i].events = 1781 current[i].events & VALID_POLL_EVENTS; 1782 } 1783 if ((fd = current[i].fd) < 0) { 1784 current[i].revents = 0; 1785 continue; 1786 } 1787 /* 1788 * add to the cached fd tbl and bitmap. 1789 */ 1790 if ((fp = getf(fd)) == NULL) { 1791 current[i].revents = POLLNVAL; 1792 newlist[i].fd = -1; 1793 fdcnt++; 1794 continue; 1795 } 1796 /* 1797 * Here we don't care about the 1798 * fdcnt. We will examine the bitmap 1799 * later and pick up the correct 1800 * fdcnt there. So we never bother to 1801 * check 'cnt'. 1802 */ 1803 error = pcache_insert(ps, fp, ¤t[i], &cnt, 1804 (ssize_t)i, which); 1805 releasef(fd); 1806 if (error) { 1807 /* 1808 * Here we are half way through adding newly 1809 * polled fd. Undo enough to keep the cache 1810 * list consistent with the cache content. 1811 */ 1812 pcacheset_remove_list(ps, current, old_nfds, 1813 i, which, 0); 1814 kmem_free(newlist, nfds * sizeof (pollfd_t)); 1815 return (error); 1816 } 1817 } 1818 } 1819 if (old_nfds > nfds) { 1820 /* 1821 * remove the fd's which are no longer polled. 
1822 */ 1823 pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds, 1824 which, 1); 1825 } 1826 /* 1827 * set difference resolved. update nfds and cachedlist 1828 * in pollstate struct. 1829 */ 1830 if (newlist != NULL) { 1831 kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t)); 1832 /* 1833 * By now, the pollfd.revents field should 1834 * all be zeroed. 1835 */ 1836 pcsp->pcs_pollfd = newlist; 1837 pcsp->pcs_nfds = nfds; 1838 } 1839 ASSERT(*fdcntp == 0); 1840 *fdcntp = fdcnt; 1841 /* 1842 * By now for every fd in pollfdp, one of the following should be 1843 * true. Otherwise we will miss a polled event. 1844 * 1845 * 1. the bit corresponding to the fd in bitmap is set. So VOP_POLL 1846 * will be called on this fd in next poll. 1847 * 2. the fd is cached in the pcache (i.e. pd_php is set). So 1848 * pollnotify will happen. 1849 */ 1850 ASSERT(pollchecksanity(ps, nfds)); 1851 /* 1852 * make sure cross reference between cached poll lists and cached 1853 * poll fds are correct. 1854 */ 1855 ASSERT(pollcheckxref(ps, which)); 1856 /* 1857 * ensure each polldat in pollcache reference a polled fd in 1858 * pollcacheset. 1859 */ 1860 #ifdef DEBUG 1861 checkpolldat(ps); 1862 #endif 1863 return (0); 1864 } 1865 1866 #ifdef DEBUG 1867 static int 1868 pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds) 1869 { 1870 int i; 1871 int reventcnt = 0; 1872 1873 for (i = 0; i < nfds; i++) { 1874 if (pollfdp[i].fd < 0) { 1875 ASSERT(pollfdp[i].revents == 0); 1876 continue; 1877 } 1878 if (pollfdp[i].revents) { 1879 reventcnt++; 1880 } 1881 if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) { 1882 ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd)); 1883 } 1884 } 1885 return (reventcnt); 1886 } 1887 #endif /* DEBUG */ 1888 1889 /* 1890 * read the bitmap and poll on fds corresponding to the '1' bits. The ps_lock 1891 * is held upon entry. 
1892 */ 1893 int 1894 pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp, 1895 int which) 1896 { 1897 int i; 1898 pollcache_t *pcp; 1899 int fd; 1900 int begin, end, done; 1901 pollhead_t *php; 1902 int fdcnt; 1903 int error = 0; 1904 file_t *fp; 1905 polldat_t *pdp; 1906 xref_t *refp; 1907 int entry; 1908 1909 pcp = ps->ps_pcache; 1910 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1911 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 1912 retry: 1913 done = 0; 1914 begin = 0; 1915 fdcnt = 0; 1916 end = pcp->pc_mapend; 1917 while ((fdcnt < nfds) && !done) { 1918 php = NULL; 1919 /* 1920 * only poll fds which may have events 1921 */ 1922 fd = bt_getlowbit(pcp->pc_bitmap, begin, end); 1923 ASSERT(fd <= end); 1924 if (fd >= 0) { 1925 ASSERT(pollcheckrevents(ps, begin, fd, which)); 1926 /* 1927 * adjust map pointers for next round 1928 */ 1929 if (fd == end) { 1930 done = 1; 1931 } else { 1932 begin = fd + 1; 1933 } 1934 /* 1935 * A bitmap caches poll state information of 1936 * multiple poll lists. Call VOP_POLL only if 1937 * the bit corresponds to an fd in this poll 1938 * list. 1939 */ 1940 pdp = pcache_lookup_fd(pcp, fd); 1941 ASSERT(pdp != NULL); 1942 ASSERT(pdp->pd_ref != NULL); 1943 refp = &pdp->pd_ref[which]; 1944 if (refp->xf_refcnt == 0) 1945 continue; 1946 entry = refp->xf_position; 1947 ASSERT((entry >= 0) && (entry < nfds)); 1948 ASSERT(pollfdp[entry].fd == fd); 1949 /* 1950 * we are in this routine implies that we have 1951 * successfully polled this fd in the past. 1952 * Check to see this fd is closed while we are 1953 * blocked in poll. This ensures that we don't 1954 * miss a close on the fd in the case this fd is 1955 * reused. 1956 */ 1957 if (pdp->pd_fp == NULL) { 1958 ASSERT(pdp->pd_count > 0); 1959 pollfdp[entry].revents = POLLNVAL; 1960 fdcnt++; 1961 if (refp->xf_refcnt > 1) { 1962 /* 1963 * this fd appeared multiple time 1964 * in the poll list. Find all of them. 
1965 */ 1966 for (i = entry + 1; i < nfds; i++) { 1967 if (pollfdp[i].fd == fd) { 1968 pollfdp[i].revents = 1969 POLLNVAL; 1970 fdcnt++; 1971 } 1972 } 1973 } 1974 pcacheset_invalidate(ps, pdp); 1975 continue; 1976 } 1977 /* 1978 * We can be here polling a device that is being 1979 * closed (i.e. the file pointer is set to NULL, 1980 * but pollcacheclean has not happened yet). 1981 */ 1982 if ((fp = getf(fd)) == NULL) { 1983 pollfdp[entry].revents = POLLNVAL; 1984 fdcnt++; 1985 if (refp->xf_refcnt > 1) { 1986 /* 1987 * this fd appeared multiple time 1988 * in the poll list. Find all of them. 1989 */ 1990 for (i = entry + 1; i < nfds; i++) { 1991 if (pollfdp[i].fd == fd) { 1992 pollfdp[i].revents = 1993 POLLNVAL; 1994 fdcnt++; 1995 } 1996 } 1997 } 1998 continue; 1999 } 2000 ASSERT(pdp->pd_fp == fp); 2001 ASSERT(infpollinfo(fd)); 2002 /* 2003 * Since we no longer hold poll head lock across 2004 * VOP_POLL, pollunlock logic can be simplifed. 2005 */ 2006 ASSERT(pdp->pd_php == NULL || 2007 MUTEX_NOT_HELD(PHLOCK(pdp->pd_php))); 2008 /* 2009 * underlying file systems may set a "pollpending" 2010 * flag when it sees the poll may block. Pollwakeup() 2011 * is called by wakeup thread if pollpending is set. 2012 * Pass a 0 fdcnt so that the underlying file system 2013 * will set the "pollpending" flag set when there is 2014 * no polled events. 2015 * 2016 * Use pollfdp[].events for actual polling because 2017 * the pd_events is union of all cached poll events 2018 * on this fd. The events parameter also affects 2019 * how the polled device sets the "poll pending" 2020 * flag. 2021 */ 2022 ASSERT(curthread->t_pollcache == NULL); 2023 error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0, 2024 &pollfdp[entry].revents, &php, NULL); 2025 /* 2026 * releasef after completely done with this cached 2027 * poll entry. To prevent close() coming in to clear 2028 * this entry. 2029 */ 2030 if (error) { 2031 releasef(fd); 2032 break; 2033 } 2034 /* 2035 * layered devices (e.g. 
console driver) 2036 * may change the vnode and thus the pollhead 2037 * pointer out from underneath us. 2038 */ 2039 if (php != NULL && pdp->pd_php != NULL && 2040 php != pdp->pd_php) { 2041 releasef(fd); 2042 polldat_disassociate(pdp); 2043 polldat_associate(pdp, php); 2044 /* 2045 * We could have missed a wakeup on the new 2046 * target device. Make sure the new target 2047 * gets polled once. 2048 */ 2049 BT_SET(pcp->pc_bitmap, fd); 2050 goto retry; 2051 } 2052 2053 if (pollfdp[entry].revents) { 2054 ASSERT(refp->xf_refcnt >= 1); 2055 fdcnt++; 2056 if (refp->xf_refcnt > 1) { 2057 /* 2058 * this fd appeared multiple time 2059 * in the poll list. This is rare but 2060 * we have to look at all of them for 2061 * correctness. 2062 */ 2063 error = plist_chkdupfd(fp, pdp, ps, 2064 pollfdp, entry, &fdcnt); 2065 if (error > 0) { 2066 releasef(fd); 2067 break; 2068 } 2069 if (error < 0) { 2070 goto retry; 2071 } 2072 } 2073 releasef(fd); 2074 } else { 2075 /* 2076 * VOP_POLL didn't return any revents. We can 2077 * clear the bit in bitmap only if we have the 2078 * pollhead ptr cached and no other cached 2079 * entry is polling different events on this fd. 2080 * VOP_POLL may have dropped the ps_lock. Make 2081 * sure pollwakeup has not happened before clear 2082 * the bit. 2083 */ 2084 if ((pdp->pd_php != NULL) && 2085 (pollfdp[entry].events == pdp->pd_events) && 2086 ((pcp->pc_flag & PC_POLLWAKE) == 0)) { 2087 BT_CLEAR(pcp->pc_bitmap, fd); 2088 } 2089 /* 2090 * if the fd can be cached now but not before, 2091 * do it now. 2092 */ 2093 if ((pdp->pd_php == NULL) && (php != NULL)) { 2094 polldat_associate(pdp, php); 2095 /* 2096 * We are inserting a polldat struct for 2097 * the first time. We may have missed a 2098 * wakeup on this device. Re-poll once. 2099 * This should be a rare event. 2100 */ 2101 releasef(fd); 2102 goto retry; 2103 } 2104 if (refp->xf_refcnt > 1) { 2105 /* 2106 * this fd appeared multiple time 2107 * in the poll list. 
This is rare but 2108 * we have to look at all of them for 2109 * correctness. 2110 */ 2111 error = plist_chkdupfd(fp, pdp, ps, 2112 pollfdp, entry, &fdcnt); 2113 if (error > 0) { 2114 releasef(fd); 2115 break; 2116 } 2117 if (error < 0) { 2118 goto retry; 2119 } 2120 } 2121 releasef(fd); 2122 } 2123 } else { 2124 done = 1; 2125 ASSERT(pollcheckrevents(ps, begin, end + 1, which)); 2126 } 2127 } 2128 if (!error) { 2129 ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds)); 2130 *fdcntp += fdcnt; 2131 } 2132 return (error); 2133 } 2134 2135 /* 2136 * Going through the poll list without much locking. Poll all fds and 2137 * cache all valid fds in the pollcache. 2138 */ 2139 int 2140 pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which) 2141 { 2142 pollfd_t *pollfdp = ps->ps_pollfd; 2143 pollcacheset_t *pcacheset = ps->ps_pcacheset; 2144 pollfd_t *newfdlist; 2145 int i; 2146 int fd; 2147 file_t *fp; 2148 int error = 0; 2149 2150 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2151 ASSERT(which < ps->ps_nsets); 2152 ASSERT(pcacheset != NULL); 2153 ASSERT(pcacheset[which].pcs_pollfd == NULL); 2154 newfdlist = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP); 2155 /* 2156 * cache the new poll list in pollcachset. 2157 */ 2158 bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds); 2159 2160 pcacheset[which].pcs_pollfd = newfdlist; 2161 pcacheset[which].pcs_nfds = ps->ps_nfds; 2162 pcacheset[which].pcs_usradr = (uintptr_t)fds; 2163 2164 /* 2165 * We have saved a copy of current poll fd list in one pollcacheset. 2166 * The 'revents' field of the new list is not yet set to 0. Loop 2167 * through the new list just to do that is expensive. We do that 2168 * while polling the list. 2169 */ 2170 for (i = 0; i < ps->ps_nfds; i++) { 2171 fd = pollfdp[i].fd; 2172 /* 2173 * We also filter out the illegal poll events in the event 2174 * field for the cached poll list/set. 
2175 */ 2176 if (pollfdp[i].events & ~VALID_POLL_EVENTS) { 2177 newfdlist[i].events = pollfdp[i].events = 2178 pollfdp[i].events & VALID_POLL_EVENTS; 2179 } 2180 if (fd < 0) { 2181 pollfdp[i].revents = 0; 2182 continue; 2183 } 2184 if ((fp = getf(fd)) == NULL) { 2185 pollfdp[i].revents = POLLNVAL; 2186 /* 2187 * invalidate this cache entry in the cached poll list 2188 */ 2189 newfdlist[i].fd = -1; 2190 (*fdcntp)++; 2191 continue; 2192 } 2193 /* 2194 * cache this fd. 2195 */ 2196 error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i, 2197 which); 2198 releasef(fd); 2199 if (error) { 2200 /* 2201 * Here we are half way through caching a new 2202 * poll list. Undo every thing. 2203 */ 2204 pcacheset_remove_list(ps, pollfdp, 0, i, which, 0); 2205 kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t)); 2206 pcacheset[which].pcs_pollfd = NULL; 2207 pcacheset[which].pcs_usradr = (uintptr_t)NULL; 2208 break; 2209 } 2210 } 2211 return (error); 2212 } 2213 2214 /* 2215 * called by pollcacheclean() to set the fp NULL. It also sets polled events 2216 * in pcacheset entries to a special events 'POLLCLOSED'. Do a pollwakeup to 2217 * wake any sleeping poller, then remove the polldat from the driver. 2218 * The routine is called with ps_pcachelock held. 2219 */ 2220 void 2221 pcache_clean_entry(pollstate_t *ps, int fd) 2222 { 2223 pollcache_t *pcp; 2224 polldat_t *pdp; 2225 int i; 2226 2227 ASSERT(ps != NULL); 2228 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2229 pcp = ps->ps_pcache; 2230 ASSERT(pcp); 2231 pdp = pcache_lookup_fd(pcp, fd); 2232 ASSERT(pdp != NULL); 2233 /* 2234 * the corresponding fpollinfo in fi_list has been removed by 2235 * a close on this fd. Reset the cached fp ptr here. 2236 */ 2237 pdp->pd_fp = NULL; 2238 /* 2239 * XXX - This routine also touches data in pcacheset struct. 2240 * 2241 * set the event in cached poll lists to POLLCLOSED. 
	 * This invalidates
	 * the cached poll fd entry in that poll list, which will force a
	 * removal of this cached entry in next poll(). The cleanup is done
	 * at the removal time.
	 */
	ASSERT(pdp->pd_ref != NULL);
	for (i = 0; i < ps->ps_nsets; i++) {
		xref_t *refp;
		pollcacheset_t *pcsp;

		refp = &pdp->pd_ref[i];
		if (refp->xf_refcnt) {
			ASSERT(refp->xf_position >= 0);
			pcsp = &ps->ps_pcacheset[i];
			if (refp->xf_refcnt == 1) {
				pcsp->pcs_pollfd[refp->xf_position].events =
				    (short)POLLCLOSED;
			}
			if (refp->xf_refcnt > 1) {
				int j;
				/*
				 * mark every matching entry in pcs_pollfd
				 */
				for (j = refp->xf_position;
				    j < pcsp->pcs_nfds; j++) {
					if (pcsp->pcs_pollfd[j].fd == fd) {
						pcsp->pcs_pollfd[j].events =
						    (short)POLLCLOSED;
					}
				}
			}
		}
	}
	if (pdp->pd_php) {
		/*
		 * Using pdp->pd_php is a bit risky here, as we lack any
		 * protection from a racing close operation which could free
		 * that pollhead prior to pollwakeup() acquiring the locks
		 * necessary to make it safe.
		 */
		pollwakeup(pdp->pd_php, POLLHUP);
		polldat_disassociate(pdp);
	}
}

/*
 * cv_broadcast() the pc_cv of every valid parent pollcache linked to this
 * pollcache, waking any pollers blocked there. Caller must hold pc_lock.
 */
void
pcache_wake_parents(pollcache_t *pcp)
{
	pcachelink_t *pl, *pln;

	ASSERT(MUTEX_HELD(&pcp->pc_lock));

	for (pl = pcp->pc_parents; pl != NULL; pl = pln) {
		mutex_enter(&pl->pcl_lock);
		if (pl->pcl_state == PCL_VALID) {
			ASSERT(pl->pcl_parent_pc != NULL);
			cv_broadcast(&pl->pcl_parent_pc->pc_cv);
		}
		pln = pl->pcl_parent_next;
		mutex_exit(&pl->pcl_lock);
	}
}

/*
 * Initialize thread pollstate structure.
 * It will persist for the life of the thread, until it calls pollcleanup().
 */
pollstate_t *
pollstate_create()
{
	pollstate_t *ps = curthread->t_pollstate;

	if (ps == NULL) {
		/*
		 * This is the first time this thread has ever polled, so we
		 * have to create its pollstate structure.
		 */
		ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP);
		ps->ps_nsets = POLLFDSETS;
		ps->ps_pcacheset = pcacheset_create(ps->ps_nsets);
		curthread->t_pollstate = ps;
	} else {
		ASSERT(ps->ps_depth == 0);
		ASSERT(ps->ps_flags == 0);
		ASSERT(ps->ps_pc_stack[0] == 0);
	}
	return (ps);
}

/*
 * Tear down a pollstate: free the cached pollfd list, the pollcache, the
 * pcachesets and the /dev/poll buffer, then the pollstate itself.
 */
void
pollstate_destroy(pollstate_t *ps)
{
	if (ps->ps_pollfd != NULL) {
		kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t));
		ps->ps_pollfd = NULL;
	}
	if (ps->ps_pcache != NULL) {
		pcache_destroy(ps->ps_pcache);
		ps->ps_pcache = NULL;
	}
	pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets);
	ps->ps_pcacheset = NULL;
	if (ps->ps_dpbuf != NULL) {
		kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize);
		ps->ps_dpbuf = NULL;
	}
	mutex_destroy(&ps->ps_lock);
	kmem_free(ps, sizeof (pollstate_t));
}

/*
 * Called when pc_lock could not be acquired immediately.  Registers this
 * pollstate on the global contenders list and blocks on the lock, first
 * searching the list for a dependency cycle (recursive /dev/poll deadlock).
 * Returns 0 once pc_lock is held, -1 if taking it would deadlock.
 */
static int
pollstate_contend(pollstate_t *ps, pollcache_t *pcp)
{
	pollstate_t *rem, *next;
	pollcache_t *desired_pc;
	int result = 0, depth_total;

	mutex_enter(&pollstate_contenders_lock);
	/*
	 * There is a small chance that the pollcache of interest became
	 * available while we were waiting on the contenders lock.
	 */
	if (mutex_tryenter(&pcp->pc_lock) != 0) {
		goto out;
	}

	/*
	 * Walk the list of contended pollstates, searching for evidence of a
	 * deadlock condition.
	 */
	depth_total = ps->ps_depth;
	desired_pc = pcp;
	for (rem = pollstate_contenders; rem != NULL; rem = next) {
		int i, j;
		next = rem->ps_contend_nextp;

		/* Is this pollstate holding the pollcache of interest?
		 */
		for (i = 0; i < rem->ps_depth; i++) {
			if (rem->ps_pc_stack[i] != desired_pc) {
				continue;
			}

			/*
			 * The remote pollstate holds the pollcache lock we
			 * desire.  If it is waiting on a pollcache we hold,
			 * then we can report the obvious deadlock.
			 */
			ASSERT(rem->ps_contend_pc != NULL);
			for (j = 0; j < ps->ps_depth; j++) {
				if (rem->ps_contend_pc == ps->ps_pc_stack[j]) {
					rem->ps_flags |= POLLSTATE_STALEMATE;
					result = -1;
					goto out;
				}
			}

			/*
			 * The remote pollstate is not blocking on a pollcache
			 * which would deadlock against us.  That pollcache
			 * may, however, be held by a pollstate which would
			 * result in a deadlock.
			 *
			 * To detect such a condition, we continue walking
			 * through the list using the pollcache blocking the
			 * remote thread as our new search target.
			 *
			 * Return to the front of pollstate_contenders since it
			 * is not ordered to guarantee complete dependency
			 * traversal.  The below depth tracking places an upper
			 * bound on iterations.
			 */
			desired_pc = rem->ps_contend_pc;
			next = pollstate_contenders;

			/*
			 * The recursion depth of the remote pollstate is used
			 * to calculate a final depth for the local /dev/poll
			 * recursion, since those locks will be acquired
			 * eventually.  If that value exceeds the defined
			 * limit, we can report the failure now instead of
			 * recursing to that failure depth.
			 */
			depth_total += (rem->ps_depth - i);
			if (depth_total >= POLLMAXDEPTH) {
				result = -1;
				goto out;
			}
		}
	}

	/*
	 * No deadlock partner was found.  The only course of action is to
	 * record ourself as a contended pollstate and wait for the pollcache
	 * mutex to become available.
	 */
	ps->ps_contend_pc = pcp;
	ps->ps_contend_nextp = pollstate_contenders;
	ps->ps_contend_pnextp = &pollstate_contenders;
	if (pollstate_contenders != NULL) {
		pollstate_contenders->ps_contend_pnextp =
		    &ps->ps_contend_nextp;
	}
	pollstate_contenders = ps;

	mutex_exit(&pollstate_contenders_lock);
	mutex_enter(&pcp->pc_lock);
	mutex_enter(&pollstate_contenders_lock);

	/*
	 * Our acquisition of the pollcache mutex may be due to another thread
	 * giving up in the face of deadlock with us.  If that is the case,
	 * we too should report the failure.
	 */
	if ((ps->ps_flags & POLLSTATE_STALEMATE) != 0) {
		result = -1;
		ps->ps_flags &= ~POLLSTATE_STALEMATE;
		mutex_exit(&pcp->pc_lock);
	}

	/* Remove ourself from the contenders list. */
	if (ps->ps_contend_nextp != NULL) {
		ps->ps_contend_nextp->ps_contend_pnextp =
		    ps->ps_contend_pnextp;
	}
	*ps->ps_contend_pnextp = ps->ps_contend_nextp;
	ps->ps_contend_pc = NULL;
	ps->ps_contend_nextp = NULL;
	ps->ps_contend_pnextp = NULL;

out:
	mutex_exit(&pollstate_contenders_lock);
	return (result);
}

/*
 * Acquire the pc_lock of a pollcache on behalf of the current thread,
 * recording it on the thread's pollstate lock stack.  Refuses recursion
 * loops and deadlocks with other threads; returns one of the PSE_* codes
 * (PSE_SUCCESS when the lock is held).
 */
int
pollstate_enter(pollcache_t *pcp)
{
	pollstate_t *ps = curthread->t_pollstate;
	int i;

	if (ps == NULL) {
		/*
		 * The thread pollstate may not be initialized if VOP_POLL is
		 * called on a recursion-enabled /dev/poll handle from outside
		 * the poll() or /dev/poll codepaths.
		 */
		return (PSE_FAIL_POLLSTATE);
	}
	if (ps->ps_depth >= POLLMAXDEPTH) {
		return (PSE_FAIL_DEPTH);
	}
	/*
	 * Check the desired pollcache against pollcaches we already have
	 * locked.  Such a loop is the most simple deadlock scenario.
2495 */ 2496 for (i = 0; i < ps->ps_depth; i++) { 2497 if (ps->ps_pc_stack[i] == pcp) { 2498 return (PSE_FAIL_LOOP); 2499 } 2500 } 2501 ASSERT(ps->ps_pc_stack[i] == NULL); 2502 2503 if (ps->ps_depth == 0) { 2504 /* Locking initial the pollcache requires no caution */ 2505 mutex_enter(&pcp->pc_lock); 2506 } else if (mutex_tryenter(&pcp->pc_lock) == 0) { 2507 if (pollstate_contend(ps, pcp) != 0) { 2508 /* This pollcache cannot safely be locked. */ 2509 return (PSE_FAIL_DEADLOCK); 2510 } 2511 } 2512 2513 ps->ps_pc_stack[ps->ps_depth++] = pcp; 2514 return (PSE_SUCCESS); 2515 } 2516 2517 void 2518 pollstate_exit(pollcache_t *pcp) 2519 { 2520 pollstate_t *ps = curthread->t_pollstate; 2521 2522 VERIFY(ps != NULL); 2523 VERIFY(ps->ps_pc_stack[ps->ps_depth - 1] == pcp); 2524 2525 mutex_exit(&pcp->pc_lock); 2526 ps->ps_pc_stack[--ps->ps_depth] = NULL; 2527 VERIFY(ps->ps_depth >= 0); 2528 } 2529 2530 2531 /* 2532 * We are holding the appropriate uf_lock entering this routine. 2533 * Bump up the ps_busy count to prevent the thread from exiting. 2534 */ 2535 void 2536 pollblockexit(fpollinfo_t *fpip) 2537 { 2538 for (; fpip; fpip = fpip->fp_next) { 2539 pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache; 2540 2541 mutex_enter(&pcp->pc_no_exit); 2542 pcp->pc_busy++; /* prevents exit()'s */ 2543 mutex_exit(&pcp->pc_no_exit); 2544 } 2545 } 2546 2547 /* 2548 * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to mark 2549 * the pcacheset events field POLLCLOSED to force the next poll() to remove 2550 * this cache entry. We can't clean the polldat entry clean up here because 2551 * lwp block in poll() needs the info to return. Wakeup anyone blocked in 2552 * poll and let exiting lwp go. No lock is help upon entry. So it's OK for 2553 * pcache_clean_entry to call pollwakeup(). 
2554 */ 2555 void 2556 pollcacheclean(fpollinfo_t *fip, int fd) 2557 { 2558 struct fpollinfo *fpip, *fpip2; 2559 2560 fpip = fip; 2561 while (fpip) { 2562 pollstate_t *ps = fpip->fp_thread->t_pollstate; 2563 pollcache_t *pcp = ps->ps_pcache; 2564 2565 mutex_enter(&ps->ps_lock); 2566 pcache_clean_entry(ps, fd); 2567 mutex_exit(&ps->ps_lock); 2568 mutex_enter(&pcp->pc_no_exit); 2569 pcp->pc_busy--; 2570 if (pcp->pc_busy == 0) { 2571 /* 2572 * Wakeup the thread waiting in 2573 * thread_exit(). 2574 */ 2575 cv_signal(&pcp->pc_busy_cv); 2576 } 2577 mutex_exit(&pcp->pc_no_exit); 2578 2579 fpip2 = fpip; 2580 fpip = fpip->fp_next; 2581 kmem_free(fpip2, sizeof (fpollinfo_t)); 2582 } 2583 } 2584 2585 /* 2586 * one of the cache line's counter is wrapping around. Reset all cache line 2587 * counters to zero except one. This is simplistic, but probably works 2588 * effectively. 2589 */ 2590 void 2591 pcacheset_reset_count(pollstate_t *ps, int index) 2592 { 2593 int i; 2594 2595 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2596 for (i = 0; i < ps->ps_nsets; i++) { 2597 if (ps->ps_pcacheset[i].pcs_pollfd != NULL) { 2598 ps->ps_pcacheset[i].pcs_count = 0; 2599 } 2600 } 2601 ps->ps_pcacheset[index].pcs_count = 1; 2602 } 2603 2604 /* 2605 * this routine implements poll cache list replacement policy. 2606 * It is currently choose the "least used". 2607 */ 2608 int 2609 pcacheset_replace(pollstate_t *ps) 2610 { 2611 int i; 2612 int index = 0; 2613 2614 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2615 for (i = 1; i < ps->ps_nsets; i++) { 2616 if (ps->ps_pcacheset[index].pcs_count > 2617 ps->ps_pcacheset[i].pcs_count) { 2618 index = i; 2619 } 2620 } 2621 ps->ps_pcacheset[index].pcs_count = 0; 2622 return (index); 2623 } 2624 2625 /* 2626 * this routine is called by strclose to remove remaining polldat struct on 2627 * the pollhead list of the device being closed. 
 * There are two reasons why
 * the polldat structures still remain on the pollhead list:
 *
 * (1) The layered device (e.g. the console driver).
 * In this case, the existence of a polldat implies that the thread putting
 * the polldat on this list has not exited yet. Before the thread exits, it
 * will have to hold this pollhead lock to remove the polldat. So holding the
 * pollhead lock here effectively prevents the thread which put the polldat
 * on this list from exiting.
 *
 * (2) /dev/poll.
 * When a polled fd is cached in /dev/poll, its polldat will remain on the
 * pollhead list if the process has not done a POLLREMOVE before closing the
 * polled fd. We just unlink it here.
 */
void
pollhead_clean(pollhead_t *php)
{
	polldat_t *pdp;

	/*
	 * In case (1), while we must prevent the thread in question from
	 * exiting, we must also obey the proper locking order, i.e.
	 * (ps_lock -> phlock).
	 */
	PH_ENTER(php);
	while (php->ph_list != NULL) {
		pollstate_t *ps;
		pollcache_t *pcp;

		pdp = php->ph_list;
		ASSERT(pdp->pd_php == php);
		if (pdp->pd_thread == NULL) {
			/*
			 * This is case (2). Since the ph_lock is sufficient
			 * to synchronize this lwp with any other /dev/poll
			 * lwp, just unlink the polldat.
			 */
			php->ph_list = pdp->pd_next;
			pdp->pd_php = NULL;
			pdp->pd_next = NULL;
			continue;
		}
		ps = pdp->pd_thread->t_pollstate;
		ASSERT(ps != NULL);
		pcp = pdp->pd_pcache;
		ASSERT(pcp != NULL);
		mutex_enter(&pcp->pc_no_exit);
		pcp->pc_busy++;	/* prevents exit()'s */
		mutex_exit(&pcp->pc_no_exit);
		/*
		 * Now get the locks in proper order to avoid deadlock.
		 */
		PH_EXIT(php);
		mutex_enter(&ps->ps_lock);
		/*
		 * while we dropped the pollhead lock, the element could be
		 * taken off the list already.
		 */
		PH_ENTER(php);
		if (pdp->pd_php == php) {
			ASSERT(pdp == php->ph_list);
			php->ph_list = pdp->pd_next;
			pdp->pd_php = NULL;
			pdp->pd_next = NULL;
		}
		PH_EXIT(php);
		mutex_exit(&ps->ps_lock);
		mutex_enter(&pcp->pc_no_exit);
		pcp->pc_busy--;
		if (pcp->pc_busy == 0) {
			/*
			 * Wakeup the thread waiting in
			 * thread_exit().
			 */
			cv_signal(&pcp->pc_busy_cv);
		}
		mutex_exit(&pcp->pc_no_exit);
		PH_ENTER(php);
	}
	PH_EXIT(php);
}

/*
 * The remove_list is called to cleanup a partially cached 'current' list or
 * to remove a partial list which is no longer cached. The flag value of 1
 * indicates the second case.
 */
void
pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end,
    int cacheindex, int flag)
{
	int i;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
	for (i = start; i < end; i++) {
		if ((pollfdp[i].fd >= 0) &&
		    (flag || !(pollfdp[i].revents & POLLNVAL))) {
			if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex,
			    (uint_t)pollfdp[i].events)) {
				int j;
				int fd = pollfdp[i].fd;

				/*
				 * This fd appears again later in the list;
				 * repoint the xref at the next occurrence.
				 */
				for (j = i + 1; j < end; j++) {
					if (pollfdp[j].fd == fd) {
						pcache_update_xref(
						    ps->ps_pcache, fd,
						    (ssize_t)j, cacheindex);
						break;
					}
				}
				ASSERT(j <= end);
			}
		}
	}
}

#ifdef DEBUG

#include<sys/strsubr.h>
/*
 * make sure curthread is not on anyone's pollhead list any more.
2749 */ 2750 static void 2751 pollcheckphlist() 2752 { 2753 int i; 2754 file_t *fp; 2755 uf_entry_t *ufp; 2756 uf_info_t *fip = P_FINFO(curproc); 2757 struct stdata *stp; 2758 polldat_t *pdp; 2759 2760 mutex_enter(&fip->fi_lock); 2761 for (i = 0; i < fip->fi_nfiles; i++) { 2762 UF_ENTER(ufp, fip, i); 2763 if ((fp = ufp->uf_file) != NULL) { 2764 if ((stp = fp->f_vnode->v_stream) != NULL) { 2765 PH_ENTER(&stp->sd_pollist); 2766 pdp = stp->sd_pollist.ph_list; 2767 while (pdp) { 2768 ASSERT(pdp->pd_thread != curthread); 2769 pdp = pdp->pd_next; 2770 } 2771 PH_EXIT(&stp->sd_pollist); 2772 } 2773 } 2774 UF_EXIT(ufp); 2775 } 2776 mutex_exit(&fip->fi_lock); 2777 } 2778 2779 /* 2780 * for resolved set poll list, the xref info in the pcache should be 2781 * consistent with this poll list. 2782 */ 2783 static int 2784 pollcheckxref(pollstate_t *ps, int cacheindex) 2785 { 2786 pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd; 2787 pollcache_t *pcp = ps->ps_pcache; 2788 polldat_t *pdp; 2789 int i; 2790 xref_t *refp; 2791 2792 for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) { 2793 if (pollfdp[i].fd < 0) { 2794 continue; 2795 } 2796 pdp = pcache_lookup_fd(pcp, pollfdp[i].fd); 2797 ASSERT(pdp != NULL); 2798 ASSERT(pdp->pd_ref != NULL); 2799 refp = &pdp->pd_ref[cacheindex]; 2800 if (refp->xf_position >= 0) { 2801 ASSERT(refp->xf_refcnt >= 1); 2802 ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd); 2803 if (refp->xf_refcnt > 1) { 2804 int j; 2805 int count = 0; 2806 2807 for (j = refp->xf_position; 2808 j < ps->ps_pcacheset[cacheindex].pcs_nfds; 2809 j++) { 2810 if (pollfdp[j].fd == pdp->pd_fd) { 2811 count++; 2812 } 2813 } 2814 ASSERT(count == refp->xf_refcnt); 2815 } 2816 } 2817 } 2818 return (1); 2819 } 2820 2821 /* 2822 * For every cached pollfd, its polldat struct should be consistent with 2823 * what is in the pcacheset lists. 
2824 */ 2825 static void 2826 checkpolldat(pollstate_t *ps) 2827 { 2828 pollcache_t *pcp = ps->ps_pcache; 2829 polldat_t **hashtbl; 2830 int i; 2831 2832 hashtbl = pcp->pc_hash; 2833 for (i = 0; i < pcp->pc_hashsize; i++) { 2834 polldat_t *pdp; 2835 2836 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 2837 ASSERT(pdp->pd_ref != NULL); 2838 if (pdp->pd_count > 0) { 2839 xref_t *refp; 2840 int j; 2841 pollcacheset_t *pcsp; 2842 pollfd_t *pollfd; 2843 2844 for (j = 0; j < ps->ps_nsets; j++) { 2845 refp = &pdp->pd_ref[j]; 2846 if (refp->xf_refcnt > 0) { 2847 pcsp = &ps->ps_pcacheset[j]; 2848 ASSERT(refp->xf_position < 2849 pcsp->pcs_nfds); 2850 pollfd = pcsp->pcs_pollfd; 2851 ASSERT(pdp->pd_fd == 2852 pollfd[refp->xf_position]. 2853 fd); 2854 } 2855 } 2856 } 2857 } 2858 } 2859 } 2860 2861 /* 2862 * every wfd element on ph_list must have a corresponding fpollinfo on the 2863 * uf_fpollinfo list. This is a variation of infpollinfo() w/o holding locks. 2864 */ 2865 void 2866 checkwfdlist(vnode_t *vp, fpollinfo_t *fpip) 2867 { 2868 stdata_t *stp; 2869 polldat_t *pdp; 2870 fpollinfo_t *fpip2; 2871 2872 if ((stp = vp->v_stream) == NULL) { 2873 return; 2874 } 2875 PH_ENTER(&stp->sd_pollist); 2876 for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) { 2877 if (pdp->pd_thread != NULL && 2878 pdp->pd_thread->t_procp == curthread->t_procp) { 2879 for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) { 2880 if (pdp->pd_thread == fpip2->fp_thread) { 2881 break; 2882 } 2883 } 2884 ASSERT(fpip2 != NULL); 2885 } 2886 } 2887 PH_EXIT(&stp->sd_pollist); 2888 } 2889 2890 /* 2891 * For each cached fd whose bit is not set in bitmap, its revents field in 2892 * current poll list should be 0. 
2893 */ 2894 static int 2895 pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex) 2896 { 2897 pollcache_t *pcp = ps->ps_pcache; 2898 pollfd_t *pollfdp = ps->ps_pollfd; 2899 int i; 2900 2901 for (i = begin; i < end; i++) { 2902 polldat_t *pdp; 2903 2904 ASSERT(!BT_TEST(pcp->pc_bitmap, i)); 2905 pdp = pcache_lookup_fd(pcp, i); 2906 if (pdp && pdp->pd_fp != NULL) { 2907 xref_t *refp; 2908 int entry; 2909 2910 ASSERT(pdp->pd_ref != NULL); 2911 refp = &pdp->pd_ref[cacheindex]; 2912 if (refp->xf_refcnt == 0) { 2913 continue; 2914 } 2915 entry = refp->xf_position; 2916 ASSERT(entry >= 0); 2917 ASSERT(pollfdp[entry].revents == 0); 2918 if (refp->xf_refcnt > 1) { 2919 int j; 2920 2921 for (j = entry + 1; j < ps->ps_nfds; j++) { 2922 if (pollfdp[j].fd == i) { 2923 ASSERT(pollfdp[j].revents == 0); 2924 } 2925 } 2926 } 2927 } 2928 } 2929 return (1); 2930 } 2931 2932 #endif /* DEBUG */ 2933 2934 pollcache_t * 2935 pcache_alloc() 2936 { 2937 return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP)); 2938 } 2939 2940 void 2941 pcache_create(pollcache_t *pcp, nfds_t nfds) 2942 { 2943 size_t mapsize; 2944 2945 /* 2946 * allocate enough bits for the poll fd list 2947 */ 2948 if ((mapsize = POLLMAPCHUNK) <= nfds) { 2949 mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1); 2950 } 2951 pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t), 2952 KM_SLEEP); 2953 pcp->pc_mapsize = mapsize; 2954 /* 2955 * The hash size is at least POLLHASHCHUNKSZ. If user polls a large 2956 * number of fd to start with, allocate a bigger hash table (to the 2957 * nearest multiple of POLLHASHCHUNKSZ) because dynamically growing a 2958 * hash table is expensive. 
2959 */ 2960 if (nfds < POLLHASHCHUNKSZ) { 2961 pcp->pc_hashsize = POLLHASHCHUNKSZ; 2962 } else { 2963 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 2964 ~(POLLHASHCHUNKSZ - 1); 2965 } 2966 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 2967 KM_SLEEP); 2968 } 2969 2970 void 2971 pcache_destroy(pollcache_t *pcp) 2972 { 2973 polldat_t **hashtbl; 2974 int i; 2975 2976 hashtbl = pcp->pc_hash; 2977 for (i = 0; i < pcp->pc_hashsize; i++) { 2978 if (hashtbl[i] != NULL) { 2979 polldat_t *pdp, *pdp2; 2980 2981 pdp = hashtbl[i]; 2982 while (pdp != NULL) { 2983 pdp2 = pdp->pd_hashnext; 2984 if (pdp->pd_ref != NULL) { 2985 kmem_free(pdp->pd_ref, sizeof (xref_t) * 2986 pdp->pd_nsets); 2987 } 2988 kmem_free(pdp, sizeof (polldat_t)); 2989 pdp = pdp2; 2990 pcp->pc_fdcount--; 2991 } 2992 } 2993 } 2994 ASSERT(pcp->pc_fdcount == 0); 2995 kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize); 2996 kmem_free(pcp->pc_bitmap, 2997 sizeof (ulong_t) * (pcp->pc_mapsize/BT_NBIPUL)); 2998 mutex_destroy(&pcp->pc_no_exit); 2999 mutex_destroy(&pcp->pc_lock); 3000 cv_destroy(&pcp->pc_cv); 3001 cv_destroy(&pcp->pc_busy_cv); 3002 kmem_free(pcp, sizeof (pollcache_t)); 3003 } 3004 3005 pollcacheset_t * 3006 pcacheset_create(int nsets) 3007 { 3008 return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP)); 3009 } 3010 3011 void 3012 pcacheset_destroy(pollcacheset_t *pcsp, int nsets) 3013 { 3014 int i; 3015 3016 for (i = 0; i < nsets; i++) { 3017 if (pcsp[i].pcs_pollfd != NULL) { 3018 kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds * 3019 sizeof (pollfd_t)); 3020 } 3021 } 3022 kmem_free(pcsp, sizeof (pollcacheset_t) * nsets); 3023 } 3024 3025 /* 3026 * Check each duplicated poll fd in the poll list. It may be necessary to 3027 * VOP_POLL the same fd again using different poll events. getf() has been 3028 * done by caller. This routine returns 0 if it can sucessfully process the 3029 * entire poll fd list. 
It returns -1 if underlying vnode has changed during 3030 * a VOP_POLL, in which case the caller has to repoll. It returns a positive 3031 * value if VOP_POLL failed. 3032 */ 3033 static int 3034 plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp, 3035 int entry, int *fdcntp) 3036 { 3037 int i; 3038 int fd; 3039 nfds_t nfds = psp->ps_nfds; 3040 3041 fd = pollfdp[entry].fd; 3042 for (i = entry + 1; i < nfds; i++) { 3043 if (pollfdp[i].fd == fd) { 3044 if (pollfdp[i].events == pollfdp[entry].events) { 3045 if ((pollfdp[i].revents = 3046 pollfdp[entry].revents) != 0) { 3047 (*fdcntp)++; 3048 } 3049 } else { 3050 3051 int error; 3052 pollhead_t *php; 3053 pollcache_t *pcp = psp->ps_pcache; 3054 3055 /* 3056 * the events are different. VOP_POLL on this 3057 * fd so that we don't miss any revents. 3058 */ 3059 php = NULL; 3060 ASSERT(curthread->t_pollcache == NULL); 3061 error = VOP_POLL(fp->f_vnode, 3062 pollfdp[i].events, 0, 3063 &pollfdp[i].revents, &php, NULL); 3064 if (error) { 3065 return (error); 3066 } 3067 /* 3068 * layered devices(e.g. console driver) 3069 * may change the vnode and thus the pollhead 3070 * pointer out from underneath us. 3071 */ 3072 if (php != NULL && pdp->pd_php != NULL && 3073 php != pdp->pd_php) { 3074 polldat_disassociate(pdp); 3075 polldat_associate(pdp, php); 3076 /* 3077 * We could have missed a wakeup on the 3078 * new target device. Make sure the new 3079 * target gets polled once. 3080 */ 3081 BT_SET(pcp->pc_bitmap, fd); 3082 return (-1); 3083 } 3084 if (pollfdp[i].revents) { 3085 (*fdcntp)++; 3086 } 3087 } 3088 } 3089 } 3090 return (0); 3091 } 3092