1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * Copyright (c) 2012 by Delphix. All rights reserved. 32 * Copyright 2015, Joyent, Inc. 33 */ 34 35 /* 36 * Portions of this source code were derived from Berkeley 4.3 BSD 37 * under license from the Regents of the University of California. 38 */ 39 40 #include <sys/param.h> 41 #include <sys/isa_defs.h> 42 #include <sys/types.h> 43 #include <sys/sysmacros.h> 44 #include <sys/user.h> 45 #include <sys/systm.h> 46 #include <sys/errno.h> 47 #include <sys/time.h> 48 #include <sys/vnode.h> 49 #include <sys/file.h> 50 #include <sys/mode.h> 51 #include <sys/proc.h> 52 #include <sys/uio.h> 53 #include <sys/poll_impl.h> 54 #include <sys/kmem.h> 55 #include <sys/cmn_err.h> 56 #include <sys/debug.h> 57 #include <sys/bitmap.h> 58 #include <sys/kstat.h> 59 #include <sys/rctl.h> 60 #include <sys/port_impl.h> 61 #include <sys/schedctl.h> 62 #include <sys/cpu.h> 63 #include <sys/random.h> 64 65 #define NPHLOCKS 64 /* Number of locks; must be power of 2 */ 66 #define PHLOCKADDR(php) &plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)] 67 #define PHLOCK(php) PHLOCKADDR(php).pp_lock 68 #define PH_ENTER(php) mutex_enter(PHLOCK(php)) 69 #define PH_EXIT(php) mutex_exit(PHLOCK(php)) 70 #define VALID_POLL_EVENTS (POLLIN | POLLPRI | POLLOUT | POLLRDNORM \ 71 | POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL) 72 73 /* settable in /etc/system */ 74 uint32_t randomize_pollwakeup = 0; 75 76 /* 77 * global counters to collect some stats 78 */ 79 static struct { 80 kstat_named_t polllistmiss; /* failed to find a cached poll list */ 81 kstat_named_t pollcachehit; /* list matched 100% w/ cached one */ 82 kstat_named_t pollcachephit; /* list matched < 100% w/ cached one */ 83 kstat_named_t pollcachemiss; /* every list entry is dif from cache */ 84 kstat_named_t pollunlockfail; /* failed to perform pollunlock */ 85 } pollstats = { 86 { "polllistmiss", KSTAT_DATA_UINT64 }, 87 { "pollcachehit", KSTAT_DATA_UINT64 }, 88 { "pollcachephit", KSTAT_DATA_UINT64 }, 89 { "pollcachemiss", KSTAT_DATA_UINT64 }, 90 { "pollunlockfail", KSTAT_DATA_UINT64 } 91 }; 92 93 kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats; 94 uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t); 95 96 struct pplock { 97 kmutex_t pp_lock; 98 short pp_flag; 99 kcondvar_t pp_wait_cv; 100 int32_t pp_pad; /* to a nice round 16 bytes */ 101 }; 102 103 static struct pplock plocks[NPHLOCKS]; /* Hash array of pollhead locks */ 104 105 /* Contention lock & list for preventing 
deadlocks in recursive /dev/poll. */ 106 static kmutex_t pollstate_contenders_lock; 107 static pollstate_t *pollstate_contenders = NULL; 108 109 #ifdef DEBUG 110 static int pollchecksanity(pollstate_t *, nfds_t); 111 static int pollcheckxref(pollstate_t *, int); 112 static void pollcheckphlist(void); 113 static int pollcheckrevents(pollstate_t *, int, int, int); 114 static void checkpolldat(pollstate_t *); 115 #endif /* DEBUG */ 116 static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int, 117 int *); 118 119 /* 120 * Data structure overview: 121 * The per-thread poll state consists of 122 * one pollstate_t 123 * one pollcache_t 124 * one bitmap with one event bit per fd 125 * a (two-dimensional) hashed array of polldat_t structures - one entry 126 * per fd 127 * 128 * This conglomerate of data structures interact with 129 * the pollhead which is used by VOP_POLL and pollwakeup 130 * (protected by the PHLOCK, cached array of plocks), and 131 * the fpollinfo list hanging off the fi_list which is used to notify 132 * poll when a cached fd is closed. This is protected by uf_lock. 133 * 134 * Invariants: 135 * pd_php (pollhead pointer) is set iff (if and only if) the polldat 136 * is on that pollhead. This is modified atomically under pc_lock. 137 * 138 * pd_fp (file_t pointer) is set iff the thread is on the fpollinfo 139 * list for that open file. 140 * This is modified atomically under pc_lock. 141 * 142 * pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt. 143 * Iff pd_ref[i].xf_refcnt >= 1 then 144 * ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd 145 * Iff pd_ref[i].xf_refcnt > 1 then 146 * In ps_pcacheset[i].pcs_pollfd between index 147 * pd_ref[i].xf_position] and the end of the list 148 * there are xf_refcnt entries with .fd == pd_fd 149 * 150 * Locking design: 151 * Whenever possible the design relies on the fact that the poll cache state 152 * is per thread thus for both poll and exit it is self-synchronizing. 153 * Thus the key interactions where other threads access the state are: 154 * pollwakeup (and polltime), and 155 * close cleaning up the cached references to an open file 156 * 157 * The two key locks in poll proper is ps_lock and pc_lock. 158 * 159 * The ps_lock is used for synchronization between poll, (lwp_)exit and close 160 * to ensure that modifications to pollcacheset structure are serialized. 161 * This lock is held through most of poll() except where poll sleeps 162 * since there is little need to handle closes concurrently with the execution 163 * of poll. 164 * The pc_lock protects most of the fields in pollcache structure and polldat 165 * structures (which are accessed by poll, pollwakeup, and polltime) 166 * with the exception of fields that are only modified when only one thread 167 * can access this per-thread state. 168 * Those exceptions occur in poll when first allocating the per-thread state, 169 * when poll grows the number of polldat (never shrinks), and when 170 * exit/pollcleanup has ensured that there are no references from either 171 * pollheads or fpollinfo to the threads poll state. 172 * 173 * Poll(2) system call is the only path which ps_lock and pc_lock are both 174 * held, in that order. It needs ps_lock to synchronize with close and 175 * lwp_exit; and pc_lock with pollwakeup. 176 * 177 * The locking interaction between pc_lock and PHLOCK take into account 178 * that poll acquires these locks in the order of pc_lock and then PHLOCK 179 * while pollwakeup does it in the reverse order. 
Thus pollwakeup implements 180 * deadlock avoidance by dropping the locks and reacquiring them in the 181 * reverse order. For this to work pollwakeup needs to prevent the thread 182 * from exiting and freeing all of the poll related state. Thus is done 183 * using 184 * the pc_no_exit lock 185 * the pc_busy counter 186 * the pc_busy_cv condition variable 187 * 188 * The locking interaction between pc_lock and uf_lock has similar 189 * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef 190 * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock 191 * to prevent poll or exit from doing a delfpollinfo after which the thread 192 * might exit. But the cleanup needs to acquire pc_lock when modifying 193 * the poll cache state. The solution is to use pc_busy and do the close 194 * cleanup in two phases: 195 * First close calls pollblockexit which increments pc_busy. 196 * This prevents the per-thread poll related state from being freed. 197 * Then close drops uf_lock and calls pollcacheclean. 198 * This routine can then acquire pc_lock and remove any references 199 * to the closing fd (as well as recording that it has been closed 200 * so that a POLLNVAL can be generated even if the fd is reused before 201 * poll has been woken up and checked getf() again). 202 * 203 * When removing a polled fd from poll cache, the fd is always removed 204 * from pollhead list first and then from fpollinfo list, i.e., 205 * pollhead_delete() is called before delfpollinfo(). 206 * 207 * 208 * Locking hierarchy: 209 * pc_no_exit is a leaf level lock. 210 * ps_lock is held when acquiring pc_lock (except when pollwakeup 211 * acquires pc_lock). 212 * pc_lock might be held when acquiring PHLOCK (pollhead_insert/ 213 * pollhead_delete) 214 * pc_lock is always held (but this is not required) 215 * when acquiring PHLOCK (in polladd/pollhead_delete and pollwakeup called 216 * from pcache_clean_entry). 217 * pc_lock is held across addfpollinfo/delfpollinfo which acquire 218 * uf_lock. 219 * pc_lock is held across getf/releasef which acquire uf_lock. 220 * ps_lock might be held across getf/releasef which acquire uf_lock. 221 * pollwakeup tries to acquire pc_lock while holding PHLOCK 222 * but drops the locks and reacquire them in reverse order to avoid 223 * deadlock. 224 * 225 * Note also that there is deadlock avoidance support for VOP_POLL routines 226 * and pollwakeup involving a file system or driver lock. 227 * See below. 228 */ 229 230 /* 231 * Deadlock avoidance support for VOP_POLL() routines. This is 232 * sometimes necessary to prevent deadlock between polling threads 233 * (which hold poll locks on entry to xx_poll(), then acquire foo) 234 * and pollwakeup() threads (which hold foo, then acquire poll locks). 235 * 236 * pollunlock(*cookie) releases whatever poll locks the current thread holds, 237 * setting a cookie for use by pollrelock(); 238 * 239 * pollrelock(cookie) reacquires previously dropped poll locks; 240 * 241 * polllock(php, mutex) does the common case: pollunlock(), 242 * acquire the problematic mutex, pollrelock(). 243 * 244 * If polllock() or pollunlock() return non-zero, it indicates that a recursive 245 * /dev/poll is in progress and pollcache locks cannot be dropped. Callers 246 * must handle this by indicating a POLLNVAL in the revents of the VOP_POLL. 
247 */ 248 int 249 pollunlock(int *lockstate) 250 { 251 pollstate_t *ps = curthread->t_pollstate; 252 pollcache_t *pcp; 253 254 ASSERT(lockstate != NULL); 255 256 /* 257 * There is no way to safely perform a pollunlock() while in the depths 258 * of a recursive /dev/poll operation. 259 */ 260 if (ps != NULL && ps->ps_depth > 1) { 261 ps->ps_flags |= POLLSTATE_ULFAIL; 262 pollstats.pollunlockfail.value.ui64++; 263 return (-1); 264 } 265 266 /* 267 * t_pollcache is set by /dev/poll and event ports (port_fd.c). 268 * If the pollrelock/pollunlock is called as a result of poll(2), 269 * the t_pollcache should be NULL. 270 */ 271 if (curthread->t_pollcache == NULL) 272 pcp = ps->ps_pcache; 273 else 274 pcp = curthread->t_pollcache; 275 276 if (!mutex_owned(&pcp->pc_lock)) { 277 *lockstate = 0; 278 } else { 279 *lockstate = 1; 280 mutex_exit(&pcp->pc_lock); 281 } 282 return (0); 283 } 284 285 void 286 pollrelock(int lockstate) 287 { 288 pollstate_t *ps = curthread->t_pollstate; 289 pollcache_t *pcp; 290 291 /* Skip this whole ordeal if the pollcache was not locked to begin */ 292 if (lockstate == 0) 293 return; 294 295 /* 296 * t_pollcache is set by /dev/poll and event ports (port_fd.c). 297 * If the pollrelock/pollunlock is called as a result of poll(2), 298 * the t_pollcache should be NULL. 299 */ 300 if (curthread->t_pollcache == NULL) 301 pcp = ps->ps_pcache; 302 else 303 pcp = curthread->t_pollcache; 304 305 mutex_enter(&pcp->pc_lock); 306 } 307 308 /* ARGSUSED */ 309 int 310 polllock(pollhead_t *php, kmutex_t *lp) 311 { 312 if (mutex_tryenter(lp) == 0) { 313 int state; 314 315 if (pollunlock(&state) != 0) { 316 return (-1); 317 } 318 mutex_enter(lp); 319 pollrelock(state); 320 } 321 return (0); 322 } 323 324 static int 325 poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) 326 { 327 kthread_t *t = curthread; 328 klwp_t *lwp = ttolwp(t); 329 proc_t *p = ttoproc(t); 330 int fdcnt = 0; 331 int i; 332 hrtime_t deadline; /* hrtime value when we want to return */ 333 pollfd_t *pollfdp; 334 pollstate_t *ps; 335 pollcache_t *pcp; 336 int error = 0; 337 nfds_t old_nfds; 338 int cacheindex = 0; /* which cache set is used */ 339 340 /* 341 * Determine the precise future time of the requested timeout, if any. 342 */ 343 if (tsp == NULL) { 344 deadline = -1; 345 } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { 346 deadline = 0; 347 } else { 348 /* They must wait at least a tick. */ 349 deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec; 350 deadline = MAX(deadline, nsec_per_tick); 351 deadline += gethrtime(); 352 } 353 354 /* 355 * Reset our signal mask, if requested. 356 */ 357 if (ksetp != NULL) { 358 mutex_enter(&p->p_lock); 359 schedctl_finish_sigblock(t); 360 lwp->lwp_sigoldmask = t->t_hold; 361 t->t_hold = *ksetp; 362 t->t_flag |= T_TOMASK; 363 /* 364 * Call cv_reltimedwait_sig() just to check for signals. 365 * We will return immediately with either 0 or -1. 366 */ 367 if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, 368 TR_CLOCK_TICK)) { 369 mutex_exit(&p->p_lock); 370 error = EINTR; 371 goto pollout; 372 } 373 mutex_exit(&p->p_lock); 374 } 375 376 /* 377 * Check to see if this guy just wants to use poll() as a timeout. 378 * If yes then bypass all the other stuff and make him sleep. 379 */ 380 if (nfds == 0) { 381 /* 382 * Sleep until we have passed the requested future 383 * time or until interrupted by a signal. 384 * Do not check for signals if we do not want to wait. 
385 */ 386 if (deadline != 0) { 387 mutex_enter(&t->t_delay_lock); 388 while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv, 389 &t->t_delay_lock, deadline)) > 0) 390 continue; 391 mutex_exit(&t->t_delay_lock); 392 error = (error == 0) ? EINTR : 0; 393 } 394 goto pollout; 395 } 396 397 if (nfds > p->p_fno_ctl) { 398 mutex_enter(&p->p_lock); 399 (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], 400 p->p_rctls, p, RCA_SAFE); 401 mutex_exit(&p->p_lock); 402 error = EINVAL; 403 goto pollout; 404 } 405 406 /* 407 * Need to allocate memory for pollstate before anything because 408 * the mutex and cv are created in this space 409 */ 410 ps = pollstate_create(); 411 412 if (ps->ps_pcache == NULL) 413 ps->ps_pcache = pcache_alloc(); 414 pcp = ps->ps_pcache; 415 416 /* 417 * NOTE: for performance, buffers are saved across poll() calls. 418 * The theory is that if a process polls heavily, it tends to poll 419 * on the same set of descriptors. Therefore, we only reallocate 420 * buffers when nfds changes. There is no hysteresis control, 421 * because there is no data to suggest that this is necessary; 422 * the penalty of reallocating is not *that* great in any event. 423 */ 424 old_nfds = ps->ps_nfds; 425 if (nfds != old_nfds) { 426 427 kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); 428 pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); 429 ps->ps_pollfd = pollfdp; 430 ps->ps_nfds = nfds; 431 } 432 433 pollfdp = ps->ps_pollfd; 434 if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { 435 error = EFAULT; 436 goto pollout; 437 } 438 439 if (fds == NULL) { 440 /* 441 * If the process has page 0 mapped, then the copyin() above 442 * will succeed even if fds is NULL. However, our cached 443 * poll lists are keyed by the address of the passed-in fds 444 * structure, and we use the value NULL to indicate an unused 445 * poll cache list entry. As such, we elect not to support 446 * NULL as a valid (user) memory address and fail the poll() 447 * call. 448 */ 449 error = EINVAL; 450 goto pollout; 451 } 452 453 /* 454 * If this thread polls for the first time, allocate ALL poll 455 * cache data structures and cache the poll fd list. This 456 * allocation is delayed till now because lwp's polling 0 fd 457 * (i.e. using poll as timeout()) don't need this memory. 458 */ 459 mutex_enter(&ps->ps_lock); 460 pcp = ps->ps_pcache; 461 ASSERT(pcp != NULL); 462 if (pcp->pc_bitmap == NULL) { 463 pcache_create(pcp, nfds); 464 /* 465 * poll and cache this poll fd list in ps_pcacheset[0]. 466 */ 467 error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex); 468 if (fdcnt || error) { 469 mutex_exit(&ps->ps_lock); 470 goto pollout; 471 } 472 } else { 473 pollcacheset_t *pcset = ps->ps_pcacheset; 474 475 /* 476 * Not first time polling. Select a cached poll list by 477 * matching user pollfd list buffer address. 478 */ 479 for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) { 480 if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) { 481 if ((++pcset[cacheindex].pcs_count) == 0) { 482 /* 483 * counter is wrapping around. 484 */ 485 pcacheset_reset_count(ps, cacheindex); 486 } 487 /* 488 * examine and resolve possible 489 * difference of the current poll 490 * list and previously cached one. 491 * If there is an error during resolve(), 492 * the callee will guarantee the consistency 493 * of cached poll list and cache content. 
494 */ 495 error = pcacheset_resolve(ps, nfds, &fdcnt, 496 cacheindex); 497 if (error) { 498 mutex_exit(&ps->ps_lock); 499 goto pollout; 500 } 501 break; 502 } 503 504 /* 505 * Note that pcs_usradr field of an used entry won't be 506 * NULL because it stores the address of passed-in fds, 507 * and NULL fds will not be cached (Then it is either 508 * the special timeout case when nfds is 0 or it returns 509 * failure directly). 510 */ 511 if (pcset[cacheindex].pcs_usradr == NULL) { 512 /* 513 * found an unused entry. Use it to cache 514 * this poll list. 515 */ 516 error = pcacheset_cache_list(ps, fds, &fdcnt, 517 cacheindex); 518 if (fdcnt || error) { 519 mutex_exit(&ps->ps_lock); 520 goto pollout; 521 } 522 break; 523 } 524 } 525 if (cacheindex == ps->ps_nsets) { 526 /* 527 * We failed to find a matching cached poll fd list. 528 * replace an old list. 529 */ 530 pollstats.polllistmiss.value.ui64++; 531 cacheindex = pcacheset_replace(ps); 532 ASSERT(cacheindex < ps->ps_nsets); 533 pcset[cacheindex].pcs_usradr = (uintptr_t)fds; 534 error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex); 535 if (error) { 536 mutex_exit(&ps->ps_lock); 537 goto pollout; 538 } 539 } 540 } 541 542 /* 543 * Always scan the bitmap with the lock on the pollcache held. 544 * This is to make sure that a wakeup does not come undetected. 545 * If the lock is not held, a pollwakeup could have come for an 546 * fd we already checked but before this thread sleeps, in which 547 * case the wakeup is missed. Now we hold the pcache lock and 548 * check the bitmap again. This will prevent wakeup from happening 549 * while we hold pcache lock since pollwakeup() will also lock 550 * the pcache before updating poll bitmap. 551 */ 552 mutex_enter(&pcp->pc_lock); 553 for (;;) { 554 pcp->pc_flag = 0; 555 error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex); 556 if (fdcnt || error) { 557 mutex_exit(&pcp->pc_lock); 558 mutex_exit(&ps->ps_lock); 559 break; 560 } 561 562 /* 563 * If PC_POLLWAKE is set, a pollwakeup() was performed on 564 * one of the file descriptors. This can happen only if 565 * one of the VOP_POLL() functions dropped pcp->pc_lock. 566 * The only current cases of this is in procfs (prpoll()) 567 * and STREAMS (strpoll()). 568 */ 569 if (pcp->pc_flag & PC_POLLWAKE) 570 continue; 571 572 /* 573 * If you get here, the poll of fds was unsuccessful. 574 * Wait until some fd becomes readable, writable, or gets 575 * an exception, or until a signal or a timeout occurs. 576 * Do not check for signals if we have a zero timeout. 577 */ 578 mutex_exit(&ps->ps_lock); 579 if (deadline == 0) { 580 error = -1; 581 } else { 582 error = cv_timedwait_sig_hrtime(&pcp->pc_cv, 583 &pcp->pc_lock, deadline); 584 } 585 mutex_exit(&pcp->pc_lock); 586 /* 587 * If we have received a signal or timed out 588 * then break out and return. 589 */ 590 if (error <= 0) { 591 error = (error == 0) ? EINTR : 0; 592 break; 593 } 594 /* 595 * We have not received a signal or timed out. 596 * Continue around and poll fds again. 597 */ 598 mutex_enter(&ps->ps_lock); 599 mutex_enter(&pcp->pc_lock); 600 } 601 602 pollout: 603 /* 604 * If we changed the signal mask but we received 605 * no signal then restore the signal mask. 606 * Otherwise psig() will deal with the signal mask. 
607 */ 608 if (ksetp != NULL) { 609 mutex_enter(&p->p_lock); 610 if (lwp->lwp_cursig == 0) { 611 t->t_hold = lwp->lwp_sigoldmask; 612 t->t_flag &= ~T_TOMASK; 613 } 614 mutex_exit(&p->p_lock); 615 } 616 617 if (error) 618 return (set_errno(error)); 619 620 /* 621 * Copy out the events and return the fdcnt to the user. 622 */ 623 if (nfds != 0 && 624 copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) 625 return (set_errno(EFAULT)); 626 627 #ifdef DEBUG 628 /* 629 * Another sanity check: 630 */ 631 if (fdcnt) { 632 int reventcnt = 0; 633 634 for (i = 0; i < nfds; i++) { 635 if (pollfdp[i].fd < 0) { 636 ASSERT(pollfdp[i].revents == 0); 637 continue; 638 } 639 if (pollfdp[i].revents) { 640 reventcnt++; 641 } 642 } 643 ASSERT(fdcnt == reventcnt); 644 } else { 645 for (i = 0; i < nfds; i++) { 646 ASSERT(pollfdp[i].revents == 0); 647 } 648 } 649 #endif /* DEBUG */ 650 651 return (fdcnt); 652 } 653 654 /* 655 * This is the system call trap that poll(), 656 * select() and pselect() are built upon. 657 * It is a private interface between libc and the kernel. 658 */ 659 int 660 pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) 661 { 662 timespec_t ts; 663 timespec_t *tsp; 664 sigset_t set; 665 k_sigset_t kset; 666 k_sigset_t *ksetp; 667 model_t datamodel = get_udatamodel(); 668 669 if (timeoutp == NULL) 670 tsp = NULL; 671 else { 672 if (datamodel == DATAMODEL_NATIVE) { 673 if (copyin(timeoutp, &ts, sizeof (ts))) 674 return (set_errno(EFAULT)); 675 } else { 676 timespec32_t ts32; 677 678 if (copyin(timeoutp, &ts32, sizeof (ts32))) 679 return (set_errno(EFAULT)); 680 TIMESPEC32_TO_TIMESPEC(&ts, &ts32) 681 } 682 683 if (itimerspecfix(&ts)) 684 return (set_errno(EINVAL)); 685 tsp = &ts; 686 } 687 688 if (setp == NULL) 689 ksetp = NULL; 690 else { 691 if (copyin(setp, &set, sizeof (set))) 692 return (set_errno(EFAULT)); 693 sigutok(&set, &kset); 694 ksetp = &kset; 695 } 696 697 return (poll_common(fds, nfds, tsp, ksetp)); 698 } 699 700 /* 701 * Clean up any state left around by poll(2). Called when a thread exits. 702 */ 703 void 704 pollcleanup() 705 { 706 pollstate_t *ps = curthread->t_pollstate; 707 pollcache_t *pcp; 708 709 if (ps == NULL) 710 return; 711 pcp = ps->ps_pcache; 712 /* 713 * free up all cached poll fds 714 */ 715 if (pcp == NULL) { 716 /* this pollstate is used by /dev/poll */ 717 goto pollcleanout; 718 } 719 720 if (pcp->pc_bitmap != NULL) { 721 ASSERT(MUTEX_NOT_HELD(&ps->ps_lock)); 722 /* 723 * a close lwp can race with us when cleaning up a polldat 724 * entry. We hold the ps_lock when cleaning hash table. 725 * Since this pollcache is going away anyway, there is no 726 * need to hold the pc_lock. 727 */ 728 mutex_enter(&ps->ps_lock); 729 pcache_clean(pcp); 730 mutex_exit(&ps->ps_lock); 731 #ifdef DEBUG 732 /* 733 * At this point, all fds cached by this lwp should be 734 * cleaned up. There should be no fd in fi_list still 735 * reference this thread. 736 */ 737 checkfpollinfo(); /* sanity check */ 738 pollcheckphlist(); /* sanity check */ 739 #endif /* DEBUG */ 740 } 741 /* 742 * Be sure no one is referencing thread before exiting 743 */ 744 mutex_enter(&pcp->pc_no_exit); 745 ASSERT(pcp->pc_busy >= 0); 746 while (pcp->pc_busy > 0) 747 cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit); 748 mutex_exit(&pcp->pc_no_exit); 749 pollcleanout: 750 pollstate_destroy(ps); 751 curthread->t_pollstate = NULL; 752 } 753 754 /* 755 * pollwakeup() - poke threads waiting in poll() for some event 756 * on a particular object. 
757 * 758 * The threads hanging off of the specified pollhead structure are scanned. 759 * If their event mask matches the specified event(s), then pollnotify() is 760 * called to poke the thread. 761 * 762 * Multiple events may be specified. When POLLHUP or POLLERR are specified, 763 * all waiting threads are poked. 764 * 765 * It is important that pollnotify() not drop the lock protecting the list 766 * of threads. 767 */ 768 void 769 pollwakeup(pollhead_t *php, short events_arg) 770 { 771 polldat_t *pdp; 772 polldat_t *first; 773 int events = (ushort_t)events_arg; 774 struct plist { 775 port_t *pp; 776 int pevents; 777 struct plist *next; 778 }; 779 struct plist *plhead = NULL, *pltail = NULL; 780 781 retry: 782 PH_ENTER(php); 783 784 if (php->ph_list == NULL) { 785 PH_EXIT(php); 786 return; 787 } 788 789 if (randomize_pollwakeup) { 790 size_t entries = 0; 791 size_t r = 0; 792 793 for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) 794 ++entries; 795 ASSERT(entries != 0); 796 random_get_pseudo_bytes((uint8_t *)&r, sizeof(r)); 797 r %= entries; 798 for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) 799 if (r-- == 0) 800 break; 801 ASSERT(pdp != NULL); 802 } else { 803 pdp = php->ph_list; 804 } 805 806 first = pdp; 807 for (;;) { 808 if ((pdp->pd_events & events) || 809 (events & (POLLHUP | POLLERR))) { 810 811 pollcache_t *pcp; 812 813 if (pdp->pd_portev != NULL) { 814 port_kevent_t *pkevp = pdp->pd_portev; 815 /* 816 * Object (fd) is associated with an event port, 817 * => send event notification to the port. 818 */ 819 ASSERT(pkevp->portkev_source == PORT_SOURCE_FD); 820 mutex_enter(&pkevp->portkev_lock); 821 if (pkevp->portkev_flags & PORT_KEV_VALID) { 822 int pevents; 823 824 pkevp->portkev_flags &= ~PORT_KEV_VALID; 825 pkevp->portkev_events |= events & 826 (pdp->pd_events | POLLHUP | 827 POLLERR); 828 /* 829 * portkev_lock mutex will be released 830 * by port_send_event(). 831 */ 832 port_send_event(pkevp); 833 834 /* 835 * If we have some thread polling the 836 * port's fd, add it to the list. They 837 * will be notified later. 838 * The port_pollwkup() will flag the 839 * port_t so that it will not disappear 840 * till port_pollwkdone() is called. 841 */ 842 pevents = 843 port_pollwkup(pkevp->portkev_port); 844 if (pevents) { 845 struct plist *t; 846 t = kmem_zalloc( 847 sizeof (struct plist), 848 KM_SLEEP); 849 t->pp = pkevp->portkev_port; 850 t->pevents = pevents; 851 if (plhead == NULL) { 852 plhead = t; 853 } else { 854 pltail->next = t; 855 } 856 pltail = t; 857 } 858 } else { 859 mutex_exit(&pkevp->portkev_lock); 860 } 861 goto next; 862 } 863 864 pcp = pdp->pd_pcache; 865 866 /* 867 * Try to grab the lock for this thread. If 868 * we don't get it then we may deadlock so 869 * back out and restart all over again. Note 870 * that the failure rate is very very low. 871 */ 872 if (mutex_tryenter(&pcp->pc_lock)) { 873 pollnotify(pcp, pdp->pd_fd); 874 mutex_exit(&pcp->pc_lock); 875 } else { 876 /* 877 * We are here because: 878 * 1) This thread has been woke up 879 * and is trying to get out of poll(). 880 * 2) Some other thread is also here 881 * but with a different pollhead lock. 882 * 883 * So, we need to drop the lock on pollhead 884 * because of (1) but we want to prevent 885 * that thread from doing lwp_exit() or 886 * devpoll close. We want to ensure that 887 * the pollcache pointer is still invalid. 888 * 889 * Solution: Grab the pcp->pc_no_exit lock, 890 * increment the pc_busy counter, drop every 891 * lock in sight. 
Get out of the way and wait 892 * for type (2) threads to finish. 893 */ 894 895 mutex_enter(&pcp->pc_no_exit); 896 pcp->pc_busy++; /* prevents exit()'s */ 897 mutex_exit(&pcp->pc_no_exit); 898 899 PH_EXIT(php); 900 mutex_enter(&pcp->pc_lock); 901 mutex_exit(&pcp->pc_lock); 902 mutex_enter(&pcp->pc_no_exit); 903 pcp->pc_busy--; 904 if (pcp->pc_busy == 0) { 905 /* 906 * Wakeup the thread waiting in 907 * thread_exit(). 908 */ 909 cv_signal(&pcp->pc_busy_cv); 910 } 911 mutex_exit(&pcp->pc_no_exit); 912 goto retry; 913 } 914 } 915 next: 916 pdp = pdp->pd_next; 917 if (pdp == NULL) 918 pdp = php->ph_list; 919 if (pdp == first) 920 break; 921 } 922 923 924 /* 925 * Event ports - If this php is of the port on the list, 926 * call port_pollwkdone() to release it. The port_pollwkdone() 927 * needs to be called before dropping the PH lock so that any new 928 * thread attempting to poll this port are blocked. There can be 929 * only one thread here in pollwakeup notifying this port's fd. 930 */ 931 if (plhead != NULL && &plhead->pp->port_pollhd == php) { 932 struct plist *t; 933 port_pollwkdone(plhead->pp); 934 t = plhead; 935 plhead = plhead->next; 936 kmem_free(t, sizeof (struct plist)); 937 } 938 PH_EXIT(php); 939 940 /* 941 * Event ports - Notify threads polling the event port's fd. 942 * This is normally done in port_send_event() where it calls 943 * pollwakeup() on the port. But, for PORT_SOURCE_FD source alone, 944 * we do it here in pollwakeup() to avoid a recursive call. 945 */ 946 if (plhead != NULL) { 947 php = &plhead->pp->port_pollhd; 948 events = plhead->pevents; 949 goto retry; 950 } 951 } 952 953 /* 954 * This function is called to inform a thread (or threads) that an event being 955 * polled on has occurred. The pollstate lock on the thread should be held 956 * on entry. 957 */ 958 void 959 pollnotify(pollcache_t *pcp, int fd) 960 { 961 ASSERT(fd < pcp->pc_mapsize); 962 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 963 BT_SET(pcp->pc_bitmap, fd); 964 pcp->pc_flag |= PC_POLLWAKE; 965 cv_broadcast(&pcp->pc_cv); 966 pcache_wake_parents(pcp); 967 } 968 969 /* 970 * add a polldat entry to pollhead ph_list. The polldat struct is used 971 * by pollwakeup to wake sleeping pollers when polled events has happened. 972 */ 973 void 974 pollhead_insert(pollhead_t *php, polldat_t *pdp) 975 { 976 PH_ENTER(php); 977 ASSERT(pdp->pd_next == NULL); 978 #ifdef DEBUG 979 { 980 /* 981 * the polldat should not be already on the list 982 */ 983 polldat_t *wp; 984 for (wp = php->ph_list; wp; wp = wp->pd_next) { 985 ASSERT(wp != pdp); 986 } 987 } 988 #endif /* DEBUG */ 989 pdp->pd_next = php->ph_list; 990 php->ph_list = pdp; 991 PH_EXIT(php); 992 } 993 994 /* 995 * Delete the polldat entry from ph_list. 996 */ 997 void 998 pollhead_delete(pollhead_t *php, polldat_t *pdp) 999 { 1000 polldat_t *wp; 1001 polldat_t **wpp; 1002 1003 PH_ENTER(php); 1004 for (wpp = &php->ph_list; (wp = *wpp) != NULL; wpp = &wp->pd_next) { 1005 if (wp == pdp) { 1006 *wpp = pdp->pd_next; 1007 pdp->pd_next = NULL; 1008 break; 1009 } 1010 } 1011 #ifdef DEBUG 1012 /* assert that pdp is no longer in the list */ 1013 for (wp = *wpp; wp; wp = wp->pd_next) { 1014 ASSERT(wp != pdp); 1015 } 1016 #endif /* DEBUG */ 1017 PH_EXIT(php); 1018 } 1019 1020 /* 1021 * walk through the poll fd lists to see if they are identical. This is an 1022 * expensive operation and should not be done more than once for each poll() 1023 * call. 
1024 * 1025 * As an optimization (i.e., not having to go through the lists more than 1026 * once), this routine also clear the revents field of pollfd in 'current'. 1027 * Zeroing out the revents field of each entry in current poll list is 1028 * required by poll man page. 1029 * 1030 * Since the events field of cached list has illegal poll events filtered 1031 * out, the current list applies the same filtering before comparison. 1032 * 1033 * The routine stops when it detects a meaningful difference, or when it 1034 * exhausts the lists. 1035 */ 1036 int 1037 pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n) 1038 { 1039 int ix; 1040 1041 for (ix = 0; ix < n; ix++) { 1042 /* Prefetch 64 bytes worth of 8-byte elements */ 1043 if ((ix & 0x7) == 0) { 1044 prefetch_write_many((caddr_t)¤t[ix + 8]); 1045 prefetch_write_many((caddr_t)&cached[ix + 8]); 1046 } 1047 if (current[ix].fd == cached[ix].fd) { 1048 /* 1049 * Filter out invalid poll events while we are in 1050 * inside the loop. 1051 */ 1052 if (current[ix].events & ~VALID_POLL_EVENTS) { 1053 current[ix].events &= VALID_POLL_EVENTS; 1054 if (newlist != NULL) 1055 newlist[ix].events = current[ix].events; 1056 } 1057 if (current[ix].events == cached[ix].events) { 1058 current[ix].revents = 0; 1059 continue; 1060 } 1061 } 1062 if ((current[ix].fd < 0) && (cached[ix].fd < 0)) { 1063 current[ix].revents = 0; 1064 continue; 1065 } 1066 return (ix); 1067 } 1068 return (ix); 1069 } 1070 1071 /* 1072 * This routine returns a pointer to a cached poll fd entry, or NULL if it 1073 * does not find it in the hash table. 1074 */ 1075 polldat_t * 1076 pcache_lookup_fd(pollcache_t *pcp, int fd) 1077 { 1078 int hashindex; 1079 polldat_t *pdp; 1080 1081 hashindex = POLLHASH(pcp->pc_hashsize, fd); 1082 pdp = pcp->pc_hash[hashindex]; 1083 while (pdp != NULL) { 1084 if (pdp->pd_fd == fd) 1085 break; 1086 pdp = pdp->pd_hashnext; 1087 } 1088 return (pdp); 1089 } 1090 1091 polldat_t * 1092 pcache_alloc_fd(int nsets) 1093 { 1094 polldat_t *pdp; 1095 1096 pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP); 1097 if (nsets > 0) { 1098 pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP); 1099 pdp->pd_nsets = nsets; 1100 } 1101 return (pdp); 1102 } 1103 1104 /* 1105 * This routine inserts a polldat into the pollcache's hash table. It 1106 * may be necessary to grow the size of the hash table. 1107 */ 1108 void 1109 pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds) 1110 { 1111 int hashindex; 1112 int fd; 1113 1114 if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) || 1115 (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) { 1116 pcache_grow_hashtbl(pcp, nfds); 1117 } 1118 fd = pdp->pd_fd; 1119 hashindex = POLLHASH(pcp->pc_hashsize, fd); 1120 pdp->pd_hashnext = pcp->pc_hash[hashindex]; 1121 pcp->pc_hash[hashindex] = pdp; 1122 pcp->pc_fdcount++; 1123 1124 #ifdef DEBUG 1125 { 1126 /* 1127 * same fd should not appear on a hash list twice 1128 */ 1129 polldat_t *pdp1; 1130 for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) { 1131 ASSERT(pdp->pd_fd != pdp1->pd_fd); 1132 } 1133 } 1134 #endif /* DEBUG */ 1135 } 1136 1137 /* 1138 * Grow the hash table -- either double the table size or round it to the 1139 * nearest multiples of POLLHASHCHUNKSZ, whichever is bigger. Rehash all the 1140 * elements on the hash table. 
1141 */ 1142 void 1143 pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds) 1144 { 1145 int oldsize; 1146 polldat_t **oldtbl; 1147 polldat_t *pdp, *pdp1; 1148 int i; 1149 #ifdef DEBUG 1150 int count = 0; 1151 #endif 1152 1153 ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0); 1154 oldsize = pcp->pc_hashsize; 1155 oldtbl = pcp->pc_hash; 1156 if (nfds > pcp->pc_hashsize * POLLHASHINC) { 1157 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 1158 ~(POLLHASHCHUNKSZ - 1); 1159 } else { 1160 pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC; 1161 } 1162 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 1163 KM_SLEEP); 1164 /* 1165 * rehash existing elements 1166 */ 1167 pcp->pc_fdcount = 0; 1168 for (i = 0; i < oldsize; i++) { 1169 pdp = oldtbl[i]; 1170 while (pdp != NULL) { 1171 pdp1 = pdp->pd_hashnext; 1172 pcache_insert_fd(pcp, pdp, nfds); 1173 pdp = pdp1; 1174 #ifdef DEBUG 1175 count++; 1176 #endif 1177 } 1178 } 1179 kmem_free(oldtbl, oldsize * sizeof (polldat_t *)); 1180 ASSERT(pcp->pc_fdcount == count); 1181 } 1182 1183 void 1184 pcache_grow_map(pollcache_t *pcp, int fd) 1185 { 1186 int newsize; 1187 ulong_t *newmap; 1188 1189 /* 1190 * grow to nearest multiple of POLLMAPCHUNK, assuming POLLMAPCHUNK is 1191 * power of 2. 1192 */ 1193 newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1); 1194 newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t), 1195 KM_SLEEP); 1196 /* 1197 * don't want pollwakeup to set a bit while growing the bitmap. 1198 */ 1199 ASSERT(mutex_owned(&pcp->pc_lock) == 0); 1200 mutex_enter(&pcp->pc_lock); 1201 bcopy(pcp->pc_bitmap, newmap, 1202 (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t)); 1203 kmem_free(pcp->pc_bitmap, 1204 (pcp->pc_mapsize /BT_NBIPUL) * sizeof (ulong_t)); 1205 pcp->pc_bitmap = newmap; 1206 pcp->pc_mapsize = newsize; 1207 mutex_exit(&pcp->pc_lock); 1208 } 1209 1210 /* 1211 * remove all the reference from pollhead list and fpollinfo lists. 1212 */ 1213 void 1214 pcache_clean(pollcache_t *pcp) 1215 { 1216 int i; 1217 polldat_t **hashtbl; 1218 polldat_t *pdp; 1219 1220 ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock)); 1221 hashtbl = pcp->pc_hash; 1222 for (i = 0; i < pcp->pc_hashsize; i++) { 1223 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 1224 if (pdp->pd_php != NULL) { 1225 pollhead_delete(pdp->pd_php, pdp); 1226 pdp->pd_php = NULL; 1227 } 1228 if (pdp->pd_fp != NULL) { 1229 delfpollinfo(pdp->pd_fd); 1230 pdp->pd_fp = NULL; 1231 } 1232 } 1233 } 1234 } 1235 1236 void 1237 pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp) 1238 { 1239 int i; 1240 int fd = pdp->pd_fd; 1241 1242 /* 1243 * we come here because an earlier close() on this cached poll fd. 
1244 */ 1245 ASSERT(pdp->pd_fp == NULL); 1246 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1247 pdp->pd_events = 0; 1248 for (i = 0; i < ps->ps_nsets; i++) { 1249 xref_t *refp; 1250 pollcacheset_t *pcsp; 1251 1252 ASSERT(pdp->pd_ref != NULL); 1253 refp = &pdp->pd_ref[i]; 1254 if (refp->xf_refcnt) { 1255 ASSERT(refp->xf_position >= 0); 1256 pcsp = &ps->ps_pcacheset[i]; 1257 if (refp->xf_refcnt == 1) { 1258 pcsp->pcs_pollfd[refp->xf_position].fd = -1; 1259 refp->xf_refcnt = 0; 1260 pdp->pd_count--; 1261 } else if (refp->xf_refcnt > 1) { 1262 int j; 1263 1264 /* 1265 * turn off every appearance in pcs_pollfd list 1266 */ 1267 for (j = refp->xf_position; 1268 j < pcsp->pcs_nfds; j++) { 1269 if (pcsp->pcs_pollfd[j].fd == fd) { 1270 pcsp->pcs_pollfd[j].fd = -1; 1271 refp->xf_refcnt--; 1272 pdp->pd_count--; 1273 } 1274 } 1275 } 1276 ASSERT(refp->xf_refcnt == 0); 1277 refp->xf_position = POLLPOSINVAL; 1278 } 1279 } 1280 ASSERT(pdp->pd_count == 0); 1281 } 1282 1283 /* 1284 * Insert poll fd into the pollcache, and add poll registration. 1285 * This routine is called after getf() and before releasef(). So the vnode 1286 * can not disappear even if we block here. 1287 * If there is an error, the polled fd is not cached. 1288 */ 1289 int 1290 pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, 1291 ssize_t pos, int which) 1292 { 1293 pollcache_t *pcp = ps->ps_pcache; 1294 polldat_t *pdp; 1295 int error; 1296 int fd; 1297 pollhead_t *memphp = NULL; 1298 xref_t *refp; 1299 int newpollfd = 0; 1300 1301 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1302 /* 1303 * The poll caching uses the existing VOP_POLL interface. If there 1304 * is no polled events, we want the polled device to set its "some 1305 * one is sleeping in poll" flag. When the polled events happen 1306 * later, the driver will call pollwakeup(). We achieve this by 1307 * always passing 0 in the third parameter ("anyyet") when calling 1308 * VOP_POLL. This parameter is not looked at by drivers when the 1309 * polled events exist. If a driver chooses to ignore this parameter 1310 * and call pollwakeup whenever the polled events happen, that will 1311 * be OK too. 1312 */ 1313 ASSERT(curthread->t_pollcache == NULL); 1314 error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents, 1315 &memphp, NULL); 1316 if (error) { 1317 return (error); 1318 } 1319 if (pollfdp->revents) { 1320 (*fdcntp)++; 1321 } 1322 /* 1323 * polling the underlying device succeeded. Now we can cache it. 1324 * A close can't come in here because we have not done a releasef() 1325 * yet. 1326 */ 1327 fd = pollfdp->fd; 1328 pdp = pcache_lookup_fd(pcp, fd); 1329 if (pdp == NULL) { 1330 ASSERT(ps->ps_nsets > 0); 1331 pdp = pcache_alloc_fd(ps->ps_nsets); 1332 newpollfd = 1; 1333 } 1334 /* 1335 * If this entry was used to cache a poll fd which was closed, and 1336 * this entry has not been cleaned, do it now. 1337 */ 1338 if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) { 1339 pcacheset_invalidate(ps, pdp); 1340 ASSERT(pdp->pd_next == NULL); 1341 } 1342 if (pdp->pd_count == 0) { 1343 pdp->pd_fd = fd; 1344 pdp->pd_fp = fp; 1345 addfpollinfo(fd); 1346 pdp->pd_thread = curthread; 1347 pdp->pd_pcache = pcp; 1348 /* 1349 * the entry is never used or cleared by removing a cached 1350 * pollfd (pcache_delete_fd). So all the fields should be clear. 1351 */ 1352 ASSERT(pdp->pd_next == NULL); 1353 } 1354 1355 /* 1356 * A polled fd is considered cached. So there should be a fpollinfo 1357 * entry on uf_fpollinfo list. 
1358 */ 1359 ASSERT(infpollinfo(fd)); 1360 /* 1361 * If there is an inconsistency, we want to know it here. 1362 */ 1363 ASSERT(pdp->pd_fp == fp); 1364 1365 /* 1366 * XXX pd_events is a union of all polled events on this fd, possibly 1367 * by different threads. Unless this is a new first poll(), pd_events 1368 * never shrinks. If an event is no longer polled by a process, there 1369 * is no way to cancel that event. In that case, poll degrade to its 1370 * old form -- polling on this fd every time poll() is called. The 1371 * assumption is an app always polls the same type of events. 1372 */ 1373 pdp->pd_events |= pollfdp->events; 1374 1375 pdp->pd_count++; 1376 /* 1377 * There is not much special handling for multiple appearances of 1378 * same fd other than xf_position always recording the first 1379 * appearance in poll list. If this is called from pcacheset_cache_list, 1380 * a VOP_POLL is called on every pollfd entry; therefore each 1381 * revents and fdcnt should be set correctly. If this is called from 1382 * pcacheset_resolve, we don't care about fdcnt here. Pollreadmap will 1383 * pick up the right count and handle revents field of each pollfd 1384 * entry. 1385 */ 1386 ASSERT(pdp->pd_ref != NULL); 1387 refp = &pdp->pd_ref[which]; 1388 if (refp->xf_refcnt == 0) { 1389 refp->xf_position = pos; 1390 } else { 1391 /* 1392 * xf_position records the fd's first appearance in poll list 1393 */ 1394 if (pos < refp->xf_position) { 1395 refp->xf_position = pos; 1396 } 1397 } 1398 ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd); 1399 refp->xf_refcnt++; 1400 if (fd >= pcp->pc_mapsize) { 1401 pcache_grow_map(pcp, fd); 1402 } 1403 if (fd > pcp->pc_mapend) { 1404 pcp->pc_mapend = fd; 1405 } 1406 if (newpollfd != 0) { 1407 pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds); 1408 } 1409 if (memphp) { 1410 if (pdp->pd_php == NULL) { 1411 pollhead_insert(memphp, pdp); 1412 pdp->pd_php = memphp; 1413 } else { 1414 if (memphp != pdp->pd_php) { 1415 /* 1416 * layered devices (e.g. console driver) 1417 * may change the vnode and thus the pollhead 1418 * pointer out from underneath us. 1419 */ 1420 pollhead_delete(pdp->pd_php, pdp); 1421 pollhead_insert(memphp, pdp); 1422 pdp->pd_php = memphp; 1423 } 1424 } 1425 } 1426 /* 1427 * Since there is a considerable window between VOP_POLL and when 1428 * we actually put the polldat struct on the pollhead list, we could 1429 * miss a pollwakeup. In the case of polling additional events, we 1430 * don't update the events until after VOP_POLL. So we could miss 1431 * pollwakeup there too. So we always set the bit here just to be 1432 * safe. The real performance gain is in subsequent pcache_poll. 1433 */ 1434 mutex_enter(&pcp->pc_lock); 1435 BT_SET(pcp->pc_bitmap, fd); 1436 mutex_exit(&pcp->pc_lock); 1437 return (0); 1438 } 1439 1440 /* 1441 * The entry is not really deleted. The fields are cleared so that the 1442 * entry is no longer useful, but it will remain in the hash table for reuse 1443 * later. It will be freed when the polling lwp exits. 
1444 */ 1445 int 1446 pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent) 1447 { 1448 pollcache_t *pcp = ps->ps_pcache; 1449 polldat_t *pdp; 1450 xref_t *refp; 1451 1452 ASSERT(fd < pcp->pc_mapsize); 1453 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1454 1455 pdp = pcache_lookup_fd(pcp, fd); 1456 ASSERT(pdp != NULL); 1457 ASSERT(pdp->pd_count > 0); 1458 ASSERT(pdp->pd_ref != NULL); 1459 refp = &pdp->pd_ref[which]; 1460 if (pdp->pd_count == 1) { 1461 pdp->pd_events = 0; 1462 refp->xf_position = POLLPOSINVAL; 1463 ASSERT(refp->xf_refcnt == 1); 1464 refp->xf_refcnt = 0; 1465 if (pdp->pd_php) { 1466 /* 1467 * It is possible for a wakeup thread to get ahead 1468 * of the following pollhead_delete and set the bit in 1469 * bitmap. It is OK because the bit will be cleared 1470 * here anyway. 1471 */ 1472 pollhead_delete(pdp->pd_php, pdp); 1473 pdp->pd_php = NULL; 1474 } 1475 pdp->pd_count = 0; 1476 if (pdp->pd_fp != NULL) { 1477 pdp->pd_fp = NULL; 1478 delfpollinfo(fd); 1479 } 1480 mutex_enter(&pcp->pc_lock); 1481 BT_CLEAR(pcp->pc_bitmap, fd); 1482 mutex_exit(&pcp->pc_lock); 1483 return (0); 1484 } 1485 if ((cevent & POLLCLOSED) == POLLCLOSED) { 1486 /* 1487 * fd cached here has been closed. This is the first 1488 * pcache_delete_fd called after the close. Clean up the 1489 * entire entry. 1490 */ 1491 pcacheset_invalidate(ps, pdp); 1492 ASSERT(pdp->pd_php == NULL); 1493 mutex_enter(&pcp->pc_lock); 1494 BT_CLEAR(pcp->pc_bitmap, fd); 1495 mutex_exit(&pcp->pc_lock); 1496 return (0); 1497 } 1498 #ifdef DEBUG 1499 if (getf(fd) != NULL) { 1500 ASSERT(infpollinfo(fd)); 1501 releasef(fd); 1502 } 1503 #endif /* DEBUG */ 1504 pdp->pd_count--; 1505 ASSERT(refp->xf_refcnt > 0); 1506 if (--refp->xf_refcnt == 0) { 1507 refp->xf_position = POLLPOSINVAL; 1508 } else { 1509 ASSERT(pos >= refp->xf_position); 1510 if (pos == refp->xf_position) { 1511 /* 1512 * The xref position is no longer valid. 1513 * Reset it to a special value and let 1514 * caller know it needs to updatexref() 1515 * with a new xf_position value. 1516 */ 1517 refp->xf_position = POLLPOSTRANS; 1518 return (1); 1519 } 1520 } 1521 return (0); 1522 } 1523 1524 void 1525 pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which) 1526 { 1527 polldat_t *pdp; 1528 1529 pdp = pcache_lookup_fd(pcp, fd); 1530 ASSERT(pdp != NULL); 1531 ASSERT(pdp->pd_ref != NULL); 1532 pdp->pd_ref[which].xf_position = pos; 1533 } 1534 1535 #ifdef DEBUG 1536 /* 1537 * For each polled fd, it's either in the bitmap or cached in 1538 * pcache hash table. If this routine returns 0, something is wrong. 1539 */ 1540 static int 1541 pollchecksanity(pollstate_t *ps, nfds_t nfds) 1542 { 1543 int i; 1544 int fd; 1545 pollcache_t *pcp = ps->ps_pcache; 1546 polldat_t *pdp; 1547 pollfd_t *pollfdp = ps->ps_pollfd; 1548 file_t *fp; 1549 1550 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1551 for (i = 0; i < nfds; i++) { 1552 fd = pollfdp[i].fd; 1553 if (fd < 0) { 1554 ASSERT(pollfdp[i].revents == 0); 1555 continue; 1556 } 1557 if (pollfdp[i].revents == POLLNVAL) 1558 continue; 1559 if ((fp = getf(fd)) == NULL) 1560 continue; 1561 pdp = pcache_lookup_fd(pcp, fd); 1562 ASSERT(pdp != NULL); 1563 ASSERT(infpollinfo(fd)); 1564 ASSERT(pdp->pd_fp == fp); 1565 releasef(fd); 1566 if (BT_TEST(pcp->pc_bitmap, fd)) 1567 continue; 1568 if (pdp->pd_php == NULL) 1569 return (0); 1570 } 1571 return (1); 1572 } 1573 #endif /* DEBUG */ 1574 1575 /* 1576 * resolve the difference between the current poll list and a cached one. 
1577 */ 1578 int 1579 pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which) 1580 { 1581 int i; 1582 pollcache_t *pcp = ps->ps_pcache; 1583 pollfd_t *newlist = NULL; 1584 pollfd_t *current = ps->ps_pollfd; 1585 pollfd_t *cached; 1586 pollcacheset_t *pcsp; 1587 int common; 1588 int count = 0; 1589 int offset; 1590 int remain; 1591 int fd; 1592 file_t *fp; 1593 int fdcnt = 0; 1594 int cnt = 0; 1595 nfds_t old_nfds; 1596 int error = 0; 1597 int mismatch = 0; 1598 1599 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1600 #ifdef DEBUG 1601 checkpolldat(ps); 1602 #endif 1603 pcsp = &ps->ps_pcacheset[which]; 1604 old_nfds = pcsp->pcs_nfds; 1605 common = (nfds > old_nfds) ? old_nfds : nfds; 1606 if (nfds != old_nfds) { 1607 /* 1608 * the length of poll list has changed. allocate a new 1609 * pollfd list. 1610 */ 1611 newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); 1612 bcopy(current, newlist, sizeof (pollfd_t) * nfds); 1613 } 1614 /* 1615 * Compare the overlapping part of the current fd list with the 1616 * cached one. Whenever a difference is found, resolve it. 1617 * The comparison is done on the current poll list and the 1618 * cached list. But we may be setting up the newlist to be the 1619 * cached list for next poll. 1620 */ 1621 cached = pcsp->pcs_pollfd; 1622 remain = common; 1623 1624 while (count < common) { 1625 int tmpfd; 1626 pollfd_t *np; 1627 1628 np = (newlist != NULL) ? &newlist[count] : NULL; 1629 offset = pcacheset_cmp(¤t[count], &cached[count], np, 1630 remain); 1631 /* 1632 * Collect stats. If lists are completed the first time, 1633 * it's a hit. Otherwise, it's a partial hit or miss. 1634 */ 1635 if ((count == 0) && (offset == common)) { 1636 pollstats.pollcachehit.value.ui64++; 1637 } else { 1638 mismatch++; 1639 } 1640 count += offset; 1641 if (offset < remain) { 1642 ASSERT(count < common); 1643 ASSERT((current[count].fd != cached[count].fd) || 1644 (current[count].events != cached[count].events)); 1645 /* 1646 * Filter out invalid events. 1647 */ 1648 if (current[count].events & ~VALID_POLL_EVENTS) { 1649 if (newlist != NULL) { 1650 newlist[count].events = 1651 current[count].events &= 1652 VALID_POLL_EVENTS; 1653 } else { 1654 current[count].events &= 1655 VALID_POLL_EVENTS; 1656 } 1657 } 1658 /* 1659 * when resolving a difference, we always remove the 1660 * fd from cache before inserting one into cache. 1661 */ 1662 if (cached[count].fd >= 0) { 1663 tmpfd = cached[count].fd; 1664 if (pcache_delete_fd(ps, tmpfd, count, which, 1665 (uint_t)cached[count].events)) { 1666 /* 1667 * This should be rare but needed for 1668 * correctness. 1669 * 1670 * The first appearance in cached list 1671 * is being "turned off". The same fd 1672 * appear more than once in the cached 1673 * poll list. Find the next one on the 1674 * list and update the cached 1675 * xf_position field. 1676 */ 1677 for (i = count + 1; i < old_nfds; i++) { 1678 if (cached[i].fd == tmpfd) { 1679 pcache_update_xref(pcp, 1680 tmpfd, (ssize_t)i, 1681 which); 1682 break; 1683 } 1684 } 1685 ASSERT(i <= old_nfds); 1686 } 1687 /* 1688 * In case a new cache list is allocated, 1689 * need to keep both cache lists in sync 1690 * b/c the new one can be freed if we have 1691 * an error later. 1692 */ 1693 cached[count].fd = -1; 1694 if (newlist != NULL) { 1695 newlist[count].fd = -1; 1696 } 1697 } 1698 if ((tmpfd = current[count].fd) >= 0) { 1699 /* 1700 * add to the cached fd tbl and bitmap. 
1701 */ 1702 if ((fp = getf(tmpfd)) == NULL) { 1703 current[count].revents = POLLNVAL; 1704 if (newlist != NULL) { 1705 newlist[count].fd = -1; 1706 } 1707 cached[count].fd = -1; 1708 fdcnt++; 1709 } else { 1710 /* 1711 * Here we don't care about the 1712 * fdcnt. We will examine the bitmap 1713 * later and pick up the correct 1714 * fdcnt there. So we never bother 1715 * to check value of 'cnt'. 1716 */ 1717 error = pcache_insert(ps, fp, 1718 ¤t[count], &cnt, 1719 (ssize_t)count, which); 1720 /* 1721 * if no error, we want to do releasef 1722 * after we updated cache poll list 1723 * entry so that close() won't race 1724 * us. 1725 */ 1726 if (error) { 1727 /* 1728 * If we encountered an error, 1729 * we have invalidated an 1730 * entry in cached poll list 1731 * (in pcache_delete_fd() above) 1732 * but failed to add one here. 1733 * This is OK b/c what's in the 1734 * cached list is consistent 1735 * with content of cache. 1736 * It will not have any ill 1737 * effect on next poll(). 1738 */ 1739 releasef(tmpfd); 1740 if (newlist != NULL) { 1741 kmem_free(newlist, 1742 nfds * 1743 sizeof (pollfd_t)); 1744 } 1745 return (error); 1746 } 1747 /* 1748 * If we have allocated a new(temp) 1749 * cache list, we need to keep both 1750 * in sync b/c the new one can be freed 1751 * if we have an error later. 1752 */ 1753 if (newlist != NULL) { 1754 newlist[count].fd = 1755 current[count].fd; 1756 newlist[count].events = 1757 current[count].events; 1758 } 1759 cached[count].fd = current[count].fd; 1760 cached[count].events = 1761 current[count].events; 1762 releasef(tmpfd); 1763 } 1764 } else { 1765 current[count].revents = 0; 1766 } 1767 count++; 1768 remain = common - count; 1769 } 1770 } 1771 if (mismatch != 0) { 1772 if (mismatch == common) { 1773 pollstats.pollcachemiss.value.ui64++; 1774 } else { 1775 pollstats.pollcachephit.value.ui64++; 1776 } 1777 } 1778 /* 1779 * take care of the non overlapping part of a list 1780 */ 1781 if (nfds > old_nfds) { 1782 ASSERT(newlist != NULL); 1783 for (i = old_nfds; i < nfds; i++) { 1784 /* filter out invalid events */ 1785 if (current[i].events & ~VALID_POLL_EVENTS) { 1786 newlist[i].events = current[i].events = 1787 current[i].events & VALID_POLL_EVENTS; 1788 } 1789 if ((fd = current[i].fd) < 0) { 1790 current[i].revents = 0; 1791 continue; 1792 } 1793 /* 1794 * add to the cached fd tbl and bitmap. 1795 */ 1796 if ((fp = getf(fd)) == NULL) { 1797 current[i].revents = POLLNVAL; 1798 newlist[i].fd = -1; 1799 fdcnt++; 1800 continue; 1801 } 1802 /* 1803 * Here we don't care about the 1804 * fdcnt. We will examine the bitmap 1805 * later and pick up the correct 1806 * fdcnt there. So we never bother to 1807 * check 'cnt'. 1808 */ 1809 error = pcache_insert(ps, fp, ¤t[i], &cnt, 1810 (ssize_t)i, which); 1811 releasef(fd); 1812 if (error) { 1813 /* 1814 * Here we are half way through adding newly 1815 * polled fd. Undo enough to keep the cache 1816 * list consistent with the cache content. 1817 */ 1818 pcacheset_remove_list(ps, current, old_nfds, 1819 i, which, 0); 1820 kmem_free(newlist, nfds * sizeof (pollfd_t)); 1821 return (error); 1822 } 1823 } 1824 } 1825 if (old_nfds > nfds) { 1826 /* 1827 * remove the fd's which are no longer polled. 1828 */ 1829 pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds, 1830 which, 1); 1831 } 1832 /* 1833 * set difference resolved. update nfds and cachedlist 1834 * in pollstate struct. 
1835 */ 1836 if (newlist != NULL) { 1837 kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t)); 1838 /* 1839 * By now, the pollfd.revents field should 1840 * all be zeroed. 1841 */ 1842 pcsp->pcs_pollfd = newlist; 1843 pcsp->pcs_nfds = nfds; 1844 } 1845 ASSERT(*fdcntp == 0); 1846 *fdcntp = fdcnt; 1847 /* 1848 * By now for every fd in pollfdp, one of the following should be 1849 * true. Otherwise we will miss a polled event. 1850 * 1851 * 1. the bit corresponding to the fd in bitmap is set. So VOP_POLL 1852 * will be called on this fd in next poll. 1853 * 2. the fd is cached in the pcache (i.e. pd_php is set). So 1854 * pollnotify will happen. 1855 */ 1856 ASSERT(pollchecksanity(ps, nfds)); 1857 /* 1858 * make sure cross reference between cached poll lists and cached 1859 * poll fds are correct. 1860 */ 1861 ASSERT(pollcheckxref(ps, which)); 1862 /* 1863 * ensure each polldat in pollcache reference a polled fd in 1864 * pollcacheset. 1865 */ 1866 #ifdef DEBUG 1867 checkpolldat(ps); 1868 #endif 1869 return (0); 1870 } 1871 1872 #ifdef DEBUG 1873 static int 1874 pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds) 1875 { 1876 int i; 1877 int reventcnt = 0; 1878 1879 for (i = 0; i < nfds; i++) { 1880 if (pollfdp[i].fd < 0) { 1881 ASSERT(pollfdp[i].revents == 0); 1882 continue; 1883 } 1884 if (pollfdp[i].revents) { 1885 reventcnt++; 1886 } 1887 if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) { 1888 ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd)); 1889 } 1890 } 1891 return (reventcnt); 1892 } 1893 #endif /* DEBUG */ 1894 1895 /* 1896 * read the bitmap and poll on fds corresponding to the '1' bits. The ps_lock 1897 * is held upon entry. 1898 */ 1899 int 1900 pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp, 1901 int which) 1902 { 1903 int i; 1904 pollcache_t *pcp; 1905 int fd; 1906 int begin, end, done; 1907 pollhead_t *php; 1908 int fdcnt; 1909 int error = 0; 1910 file_t *fp; 1911 polldat_t *pdp; 1912 xref_t *refp; 1913 int entry; 1914 1915 pcp = ps->ps_pcache; 1916 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1917 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 1918 retry: 1919 done = 0; 1920 begin = 0; 1921 fdcnt = 0; 1922 end = pcp->pc_mapend; 1923 while ((fdcnt < nfds) && !done) { 1924 php = NULL; 1925 /* 1926 * only poll fds which may have events 1927 */ 1928 fd = bt_getlowbit(pcp->pc_bitmap, begin, end); 1929 ASSERT(fd <= end); 1930 if (fd >= 0) { 1931 ASSERT(pollcheckrevents(ps, begin, fd, which)); 1932 /* 1933 * adjust map pointers for next round 1934 */ 1935 if (fd == end) { 1936 done = 1; 1937 } else { 1938 begin = fd + 1; 1939 } 1940 /* 1941 * A bitmap caches poll state information of 1942 * multiple poll lists. Call VOP_POLL only if 1943 * the bit corresponds to an fd in this poll 1944 * list. 1945 */ 1946 pdp = pcache_lookup_fd(pcp, fd); 1947 ASSERT(pdp != NULL); 1948 ASSERT(pdp->pd_ref != NULL); 1949 refp = &pdp->pd_ref[which]; 1950 if (refp->xf_refcnt == 0) 1951 continue; 1952 entry = refp->xf_position; 1953 ASSERT((entry >= 0) && (entry < nfds)); 1954 ASSERT(pollfdp[entry].fd == fd); 1955 /* 1956 * we are in this routine implies that we have 1957 * successfully polled this fd in the past. 1958 * Check to see this fd is closed while we are 1959 * blocked in poll. This ensures that we don't 1960 * miss a close on the fd in the case this fd is 1961 * reused. 
1962 */ 1963 if (pdp->pd_fp == NULL) { 1964 ASSERT(pdp->pd_count > 0); 1965 pollfdp[entry].revents = POLLNVAL; 1966 fdcnt++; 1967 if (refp->xf_refcnt > 1) { 1968 /* 1969 * this fd appeared multiple time 1970 * in the poll list. Find all of them. 1971 */ 1972 for (i = entry + 1; i < nfds; i++) { 1973 if (pollfdp[i].fd == fd) { 1974 pollfdp[i].revents = 1975 POLLNVAL; 1976 fdcnt++; 1977 } 1978 } 1979 } 1980 pcacheset_invalidate(ps, pdp); 1981 continue; 1982 } 1983 /* 1984 * We can be here polling a device that is being 1985 * closed (i.e. the file pointer is set to NULL, 1986 * but pollcacheclean has not happened yet). 1987 */ 1988 if ((fp = getf(fd)) == NULL) { 1989 pollfdp[entry].revents = POLLNVAL; 1990 fdcnt++; 1991 if (refp->xf_refcnt > 1) { 1992 /* 1993 * this fd appeared multiple time 1994 * in the poll list. Find all of them. 1995 */ 1996 for (i = entry + 1; i < nfds; i++) { 1997 if (pollfdp[i].fd == fd) { 1998 pollfdp[i].revents = 1999 POLLNVAL; 2000 fdcnt++; 2001 } 2002 } 2003 } 2004 continue; 2005 } 2006 ASSERT(pdp->pd_fp == fp); 2007 ASSERT(infpollinfo(fd)); 2008 /* 2009 * Since we no longer hold poll head lock across 2010 * VOP_POLL, pollunlock logic can be simplifed. 2011 */ 2012 ASSERT(pdp->pd_php == NULL || 2013 MUTEX_NOT_HELD(PHLOCK(pdp->pd_php))); 2014 /* 2015 * underlying file systems may set a "pollpending" 2016 * flag when it sees the poll may block. Pollwakeup() 2017 * is called by wakeup thread if pollpending is set. 2018 * Pass a 0 fdcnt so that the underlying file system 2019 * will set the "pollpending" flag set when there is 2020 * no polled events. 2021 * 2022 * Use pollfdp[].events for actual polling because 2023 * the pd_events is union of all cached poll events 2024 * on this fd. The events parameter also affects 2025 * how the polled device sets the "poll pending" 2026 * flag. 2027 */ 2028 ASSERT(curthread->t_pollcache == NULL); 2029 error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0, 2030 &pollfdp[entry].revents, &php, NULL); 2031 /* 2032 * releasef after completely done with this cached 2033 * poll entry. To prevent close() coming in to clear 2034 * this entry. 2035 */ 2036 if (error) { 2037 releasef(fd); 2038 break; 2039 } 2040 /* 2041 * layered devices (e.g. console driver) 2042 * may change the vnode and thus the pollhead 2043 * pointer out from underneath us. 2044 */ 2045 if (php != NULL && pdp->pd_php != NULL && 2046 php != pdp->pd_php) { 2047 releasef(fd); 2048 pollhead_delete(pdp->pd_php, pdp); 2049 pdp->pd_php = php; 2050 pollhead_insert(php, pdp); 2051 /* 2052 * We could have missed a wakeup on the new 2053 * target device. Make sure the new target 2054 * gets polled once. 2055 */ 2056 BT_SET(pcp->pc_bitmap, fd); 2057 goto retry; 2058 } 2059 2060 if (pollfdp[entry].revents) { 2061 ASSERT(refp->xf_refcnt >= 1); 2062 fdcnt++; 2063 if (refp->xf_refcnt > 1) { 2064 /* 2065 * this fd appeared multiple time 2066 * in the poll list. This is rare but 2067 * we have to look at all of them for 2068 * correctness. 2069 */ 2070 error = plist_chkdupfd(fp, pdp, ps, 2071 pollfdp, entry, &fdcnt); 2072 if (error > 0) { 2073 releasef(fd); 2074 break; 2075 } 2076 if (error < 0) { 2077 goto retry; 2078 } 2079 } 2080 releasef(fd); 2081 } else { 2082 /* 2083 * VOP_POLL didn't return any revents. We can 2084 * clear the bit in bitmap only if we have the 2085 * pollhead ptr cached and no other cached 2086 * entry is polling different events on this fd. 2087 * VOP_POLL may have dropped the ps_lock. Make 2088 * sure pollwakeup has not happened before clear 2089 * the bit. 
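 * (The wakeup path sets PC_POLLWAKE in pc_flag under pc_lock, so finding
 * the flag still clear here means no wakeup raced the VOP_POLL above.)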
2090 */ 2091 if ((pdp->pd_php != NULL) && 2092 (pollfdp[entry].events == pdp->pd_events) && 2093 ((pcp->pc_flag & PC_POLLWAKE) == 0)) { 2094 BT_CLEAR(pcp->pc_bitmap, fd); 2095 } 2096 /* 2097 * if the fd can be cached now but not before, 2098 * do it now. 2099 */ 2100 if ((pdp->pd_php == NULL) && (php != NULL)) { 2101 pdp->pd_php = php; 2102 pollhead_insert(php, pdp); 2103 /* 2104 * We are inserting a polldat struct for 2105 * the first time. We may have missed a 2106 * wakeup on this device. Re-poll once. 2107 * This should be a rare event. 2108 */ 2109 releasef(fd); 2110 goto retry; 2111 } 2112 if (refp->xf_refcnt > 1) { 2113 /* 2114 * this fd appeared multiple time 2115 * in the poll list. This is rare but 2116 * we have to look at all of them for 2117 * correctness. 2118 */ 2119 error = plist_chkdupfd(fp, pdp, ps, 2120 pollfdp, entry, &fdcnt); 2121 if (error > 0) { 2122 releasef(fd); 2123 break; 2124 } 2125 if (error < 0) { 2126 goto retry; 2127 } 2128 } 2129 releasef(fd); 2130 } 2131 } else { 2132 done = 1; 2133 ASSERT(pollcheckrevents(ps, begin, end + 1, which)); 2134 } 2135 } 2136 if (!error) { 2137 ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds)); 2138 *fdcntp += fdcnt; 2139 } 2140 return (error); 2141 } 2142 2143 /* 2144 * Going through the poll list without much locking. Poll all fds and 2145 * cache all valid fds in the pollcache. 2146 */ 2147 int 2148 pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which) 2149 { 2150 pollfd_t *pollfdp = ps->ps_pollfd; 2151 pollcacheset_t *pcacheset = ps->ps_pcacheset; 2152 pollfd_t *newfdlist; 2153 int i; 2154 int fd; 2155 file_t *fp; 2156 int error = 0; 2157 2158 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2159 ASSERT(which < ps->ps_nsets); 2160 ASSERT(pcacheset != NULL); 2161 ASSERT(pcacheset[which].pcs_pollfd == NULL); 2162 newfdlist = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP); 2163 /* 2164 * cache the new poll list in pollcachset. 2165 */ 2166 bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds); 2167 2168 pcacheset[which].pcs_pollfd = newfdlist; 2169 pcacheset[which].pcs_nfds = ps->ps_nfds; 2170 pcacheset[which].pcs_usradr = (uintptr_t)fds; 2171 2172 /* 2173 * We have saved a copy of current poll fd list in one pollcacheset. 2174 * The 'revents' field of the new list is not yet set to 0. Loop 2175 * through the new list just to do that is expensive. We do that 2176 * while polling the list. 2177 */ 2178 for (i = 0; i < ps->ps_nfds; i++) { 2179 fd = pollfdp[i].fd; 2180 /* 2181 * We also filter out the illegal poll events in the event 2182 * field for the cached poll list/set. 2183 */ 2184 if (pollfdp[i].events & ~VALID_POLL_EVENTS) { 2185 newfdlist[i].events = pollfdp[i].events = 2186 pollfdp[i].events & VALID_POLL_EVENTS; 2187 } 2188 if (fd < 0) { 2189 pollfdp[i].revents = 0; 2190 continue; 2191 } 2192 if ((fp = getf(fd)) == NULL) { 2193 pollfdp[i].revents = POLLNVAL; 2194 /* 2195 * invalidate this cache entry in the cached poll list 2196 */ 2197 newfdlist[i].fd = -1; 2198 (*fdcntp)++; 2199 continue; 2200 } 2201 /* 2202 * cache this fd. 2203 */ 2204 error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i, 2205 which); 2206 releasef(fd); 2207 if (error) { 2208 /* 2209 * Here we are half way through caching a new 2210 * poll list. Undo every thing. 
2211 */ 2212 pcacheset_remove_list(ps, pollfdp, 0, i, which, 0); 2213 kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t)); 2214 pcacheset[which].pcs_pollfd = NULL; 2215 pcacheset[which].pcs_usradr = NULL; 2216 break; 2217 } 2218 } 2219 return (error); 2220 } 2221 2222 /* 2223 * called by pollcacheclean() to set the fp NULL. It also sets polled events 2224 * in pcacheset entries to a special events 'POLLCLOSED'. Do a pollwakeup to 2225 * wake any sleeping poller, then remove the polldat from the driver. 2226 * The routine is called with ps_pcachelock held. 2227 */ 2228 void 2229 pcache_clean_entry(pollstate_t *ps, int fd) 2230 { 2231 pollcache_t *pcp; 2232 polldat_t *pdp; 2233 int i; 2234 2235 ASSERT(ps != NULL); 2236 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2237 pcp = ps->ps_pcache; 2238 ASSERT(pcp); 2239 pdp = pcache_lookup_fd(pcp, fd); 2240 ASSERT(pdp != NULL); 2241 /* 2242 * the corresponding fpollinfo in fi_list has been removed by 2243 * a close on this fd. Reset the cached fp ptr here. 2244 */ 2245 pdp->pd_fp = NULL; 2246 /* 2247 * XXX - This routine also touches data in pcacheset struct. 2248 * 2249 * set the event in cached poll lists to POLLCLOSED. This invalidate 2250 * the cached poll fd entry in that poll list, which will force a 2251 * removal of this cached entry in next poll(). The cleanup is done 2252 * at the removal time. 2253 */ 2254 ASSERT(pdp->pd_ref != NULL); 2255 for (i = 0; i < ps->ps_nsets; i++) { 2256 xref_t *refp; 2257 pollcacheset_t *pcsp; 2258 2259 refp = &pdp->pd_ref[i]; 2260 if (refp->xf_refcnt) { 2261 ASSERT(refp->xf_position >= 0); 2262 pcsp = &ps->ps_pcacheset[i]; 2263 if (refp->xf_refcnt == 1) { 2264 pcsp->pcs_pollfd[refp->xf_position].events = 2265 (short)POLLCLOSED; 2266 } 2267 if (refp->xf_refcnt > 1) { 2268 int j; 2269 /* 2270 * mark every matching entry in pcs_pollfd 2271 */ 2272 for (j = refp->xf_position; 2273 j < pcsp->pcs_nfds; j++) { 2274 if (pcsp->pcs_pollfd[j].fd == fd) { 2275 pcsp->pcs_pollfd[j].events = 2276 (short)POLLCLOSED; 2277 } 2278 } 2279 } 2280 } 2281 } 2282 if (pdp->pd_php) { 2283 pollwakeup(pdp->pd_php, POLLHUP); 2284 pollhead_delete(pdp->pd_php, pdp); 2285 pdp->pd_php = NULL; 2286 } 2287 } 2288 2289 void 2290 pcache_wake_parents(pollcache_t *pcp) 2291 { 2292 pcachelink_t *pl, *pln; 2293 2294 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 2295 2296 for (pl = pcp->pc_parents; pl != NULL; pl = pln) { 2297 mutex_enter(&pl->pcl_lock); 2298 if (pl->pcl_state == PCL_VALID) { 2299 ASSERT(pl->pcl_parent_pc != NULL); 2300 cv_broadcast(&pl->pcl_parent_pc->pc_cv); 2301 } 2302 pln = pl->pcl_parent_next; 2303 mutex_exit(&pl->pcl_lock); 2304 } 2305 } 2306 2307 /* 2308 * Initialize thread pollstate structure. 2309 * It will persist for the life of the thread, until it calls pollcleanup(). 2310 */ 2311 pollstate_t * 2312 pollstate_create() 2313 { 2314 pollstate_t *ps = curthread->t_pollstate; 2315 2316 if (ps == NULL) { 2317 /* 2318 * This is the first time this thread has ever polled, so we 2319 * have to create its pollstate structure. 
2320 */ 2321 ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP); 2322 ps->ps_nsets = POLLFDSETS; 2323 ps->ps_pcacheset = pcacheset_create(ps->ps_nsets); 2324 curthread->t_pollstate = ps; 2325 } else { 2326 ASSERT(ps->ps_depth == 0); 2327 ASSERT(ps->ps_flags == 0); 2328 ASSERT(ps->ps_pc_stack[0] == 0); 2329 } 2330 return (ps); 2331 } 2332 2333 void 2334 pollstate_destroy(pollstate_t *ps) 2335 { 2336 if (ps->ps_pollfd != NULL) { 2337 kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t)); 2338 ps->ps_pollfd = NULL; 2339 } 2340 if (ps->ps_pcache != NULL) { 2341 pcache_destroy(ps->ps_pcache); 2342 ps->ps_pcache = NULL; 2343 } 2344 pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets); 2345 ps->ps_pcacheset = NULL; 2346 if (ps->ps_dpbuf != NULL) { 2347 kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); 2348 ps->ps_dpbuf = NULL; 2349 } 2350 mutex_destroy(&ps->ps_lock); 2351 kmem_free(ps, sizeof (pollstate_t)); 2352 } 2353 2354 static int 2355 pollstate_contend(pollstate_t *ps, pollcache_t *pcp) 2356 { 2357 pollstate_t *rem, *next; 2358 pollcache_t *desired_pc; 2359 int result = 0, depth_total; 2360 2361 mutex_enter(&pollstate_contenders_lock); 2362 /* 2363 * There is a small chance that the pollcache of interest became 2364 * available while we were waiting on the contenders lock. 2365 */ 2366 if (mutex_tryenter(&pcp->pc_lock) != 0) { 2367 goto out; 2368 } 2369 2370 /* 2371 * Walk the list of contended pollstates, searching for evidence of a 2372 * deadlock condition. 2373 */ 2374 depth_total = ps->ps_depth; 2375 desired_pc = pcp; 2376 for (rem = pollstate_contenders; rem != NULL; rem = next) { 2377 int i, j; 2378 next = rem->ps_contend_nextp; 2379 2380 /* Is this pollstate holding the pollcache of interest? */ 2381 for (i = 0; i < rem->ps_depth; i++) { 2382 if (rem->ps_pc_stack[i] != desired_pc) { 2383 continue; 2384 } 2385 2386 /* 2387 * The remote pollstate holds the pollcache lock we 2388 * desire. If it is waiting on a pollcache we hold, 2389 * then we can report the obvious deadlock. 2390 */ 2391 ASSERT(rem->ps_contend_pc != NULL); 2392 for (j = 0; j < ps->ps_depth; j++) { 2393 if (rem->ps_contend_pc == ps->ps_pc_stack[j]) { 2394 rem->ps_flags |= POLLSTATE_STALEMATE; 2395 result = -1; 2396 goto out; 2397 } 2398 } 2399 2400 /* 2401 * The remote pollstate is not blocking on a pollcache 2402 * which would deadlock against us. That pollcache 2403 * may, however, be held by a pollstate which would 2404 * result in a deadlock. 2405 * 2406 * To detect such a condition, we continue walking 2407 * through the list using the pollcache blocking the 2408 * remote thread as our new search target. 2409 * 2410 * Return to the front of pollstate_contenders since it 2411 * is not ordered to guarantee complete dependency 2412 * traversal. The below depth tracking places an upper 2413 * bound on iterations. 2414 */ 2415 desired_pc = rem->ps_contend_pc; 2416 next = pollstate_contenders; 2417 2418 /* 2419 * The recursion depth of the remote pollstate is used 2420 * to calculate a final depth for the local /dev/poll 2421 * recursion, since those locks will be acquired 2422 * eventually. If that value exceeds the defined 2423 * limit, we can report the failure now instead of 2424 * recursing to that failure depth. 2425 */ 2426 depth_total += (rem->ps_depth - i); 2427 if (depth_total >= POLLMAXDEPTH) { 2428 result = -1; 2429 goto out; 2430 } 2431 } 2432 } 2433 2434 /* 2435 * No deadlock partner was found. 
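 * (That is, no current contender both holds the pollcache we want and
 * waits, directly or transitively, on a pollcache that we already hold.)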
The only course of action is to 2436 * record ourself as a contended pollstate and wait for the pollcache 2437 * mutex to become available. 2438 */ 2439 ps->ps_contend_pc = pcp; 2440 ps->ps_contend_nextp = pollstate_contenders; 2441 ps->ps_contend_pnextp = &pollstate_contenders; 2442 if (pollstate_contenders != NULL) { 2443 pollstate_contenders->ps_contend_pnextp = 2444 &ps->ps_contend_nextp; 2445 } 2446 pollstate_contenders = ps; 2447 2448 mutex_exit(&pollstate_contenders_lock); 2449 mutex_enter(&pcp->pc_lock); 2450 mutex_enter(&pollstate_contenders_lock); 2451 2452 /* 2453 * Our acquisition of the pollcache mutex may be due to another thread 2454 * giving up in the face of deadlock with us. If that is the case, 2455 * we too should report the failure. 2456 */ 2457 if ((ps->ps_flags & POLLSTATE_STALEMATE) != 0) { 2458 result = -1; 2459 ps->ps_flags &= ~POLLSTATE_STALEMATE; 2460 mutex_exit(&pcp->pc_lock); 2461 } 2462 2463 /* Remove ourself from the contenders list. */ 2464 if (ps->ps_contend_nextp != NULL) { 2465 ps->ps_contend_nextp->ps_contend_pnextp = 2466 ps->ps_contend_pnextp; 2467 } 2468 *ps->ps_contend_pnextp = ps->ps_contend_nextp; 2469 ps->ps_contend_pc = NULL; 2470 ps->ps_contend_nextp = NULL; 2471 ps->ps_contend_pnextp = NULL; 2472 2473 out: 2474 mutex_exit(&pollstate_contenders_lock); 2475 return (result); 2476 } 2477 2478 int 2479 pollstate_enter(pollcache_t *pcp) 2480 { 2481 pollstate_t *ps = curthread->t_pollstate; 2482 int i; 2483 2484 if (ps == NULL) { 2485 /* 2486 * The thread pollstate may not be initialized if VOP_POLL is 2487 * called on a recursion-enabled /dev/poll handle from outside 2488 * the poll() or /dev/poll codepaths. 2489 */ 2490 return (PSE_FAIL_POLLSTATE); 2491 } 2492 if (ps->ps_depth >= POLLMAXDEPTH) { 2493 return (PSE_FAIL_DEPTH); 2494 } 2495 /* 2496 * Check the desired pollcache against pollcaches we already have 2497 * locked. Such a loop is the most simple deadlock scenario. 2498 */ 2499 for (i = 0; i < ps->ps_depth; i++) { 2500 if (ps->ps_pc_stack[i] == pcp) { 2501 return (PSE_FAIL_LOOP); 2502 } 2503 } 2504 ASSERT(ps->ps_pc_stack[i] == NULL); 2505 2506 if (ps->ps_depth == 0) { 2507 /* Locking initial the pollcache requires no caution */ 2508 mutex_enter(&pcp->pc_lock); 2509 } else if (mutex_tryenter(&pcp->pc_lock) == 0) { 2510 if (pollstate_contend(ps, pcp) != 0) { 2511 /* This pollcache cannot safely be locked. */ 2512 return (PSE_FAIL_DEADLOCK); 2513 } 2514 } 2515 2516 ps->ps_pc_stack[ps->ps_depth++] = pcp; 2517 return (PSE_SUCCESS); 2518 } 2519 2520 void 2521 pollstate_exit(pollcache_t *pcp) 2522 { 2523 pollstate_t *ps = curthread->t_pollstate; 2524 2525 VERIFY(ps != NULL); 2526 VERIFY(ps->ps_pc_stack[ps->ps_depth - 1] == pcp); 2527 2528 mutex_exit(&pcp->pc_lock); 2529 ps->ps_pc_stack[--ps->ps_depth] = NULL; 2530 VERIFY(ps->ps_depth >= 0); 2531 } 2532 2533 2534 /* 2535 * We are holding the appropriate uf_lock entering this routine. 2536 * Bump up the ps_busy count to prevent the thread from exiting. 2537 */ 2538 void 2539 pollblockexit(fpollinfo_t *fpip) 2540 { 2541 for (; fpip; fpip = fpip->fp_next) { 2542 pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache; 2543 2544 mutex_enter(&pcp->pc_no_exit); 2545 pcp->pc_busy++; /* prevents exit()'s */ 2546 mutex_exit(&pcp->pc_no_exit); 2547 } 2548 } 2549 2550 /* 2551 * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to mark 2552 * the pcacheset events field POLLCLOSED to force the next poll() to remove 2553 * this cache entry. 
We can't clean up the polldat entry here because the
2554 * lwp blocked in poll() still needs the info to return. Wake up anyone
2555 * blocked in poll and let the exiting lwp go. No lock is held upon entry,
2556 * so it's OK for pcache_clean_entry to call pollwakeup().
2557 */
2558 void
2559 pollcacheclean(fpollinfo_t *fip, int fd)
2560 {
2561 struct fpollinfo *fpip, *fpip2;
2562
2563 fpip = fip;
2564 while (fpip) {
2565 pollstate_t *ps = fpip->fp_thread->t_pollstate;
2566 pollcache_t *pcp = ps->ps_pcache;
2567
2568 mutex_enter(&ps->ps_lock);
2569 pcache_clean_entry(ps, fd);
2570 mutex_exit(&ps->ps_lock);
2571 mutex_enter(&pcp->pc_no_exit);
2572 pcp->pc_busy--;
2573 if (pcp->pc_busy == 0) {
2574 /*
2575 * Wakeup the thread waiting in
2576 * thread_exit().
2577 */
2578 cv_signal(&pcp->pc_busy_cv);
2579 }
2580 mutex_exit(&pcp->pc_no_exit);
2581
2582 fpip2 = fpip;
2583 fpip = fpip->fp_next;
2584 kmem_free(fpip2, sizeof (fpollinfo_t));
2585 }
2586 }
2587
2588 /*
2589 * One of the cache line counters is wrapping around. Reset all cache line
2590 * counters to zero except one. This is simplistic, but probably works
2591 * effectively.
2592 */
2593 void
2594 pcacheset_reset_count(pollstate_t *ps, int index)
2595 {
2596 int i;
2597
2598 ASSERT(MUTEX_HELD(&ps->ps_lock));
2599 for (i = 0; i < ps->ps_nsets; i++) {
2600 if (ps->ps_pcacheset[i].pcs_pollfd != NULL) {
2601 ps->ps_pcacheset[i].pcs_count = 0;
2602 }
2603 }
2604 ps->ps_pcacheset[index].pcs_count = 1;
2605 }
2606
2607 /*
2608 * This routine implements the poll cache list replacement policy.
2609 * It currently chooses the "least used" entry.
2610 */
2611 int
2612 pcacheset_replace(pollstate_t *ps)
2613 {
2614 int i;
2615 int index = 0;
2616
2617 ASSERT(MUTEX_HELD(&ps->ps_lock));
2618 for (i = 1; i < ps->ps_nsets; i++) {
2619 if (ps->ps_pcacheset[index].pcs_count >
2620 ps->ps_pcacheset[i].pcs_count) {
2621 index = i;
2622 }
2623 }
2624 ps->ps_pcacheset[index].pcs_count = 0;
2625 return (index);
2626 }
2627
2628 /*
2629 * This routine is called by strclose to remove any remaining polldat structs
2630 * on the pollhead list of the device being closed. There are two reasons why
2631 * polldat structures may still remain on the pollhead list:
2632 *
2633 * (1) The layered device (e.g. the console driver).
2634 * In this case, the existence of a polldat implies that the thread putting
2635 * the polldat on this list has not exited yet. Before the thread exits, it
2636 * will have to hold this pollhead lock to remove the polldat. So holding the
2637 * pollhead lock here effectively prevents the thread which put the polldat
2638 * on this list from exiting.
2639 *
2640 * (2) /dev/poll.
2641 * When a polled fd is cached in /dev/poll, its polldat will remain on the
2642 * pollhead list if the process has not done a POLLREMOVE before closing the
2643 * polled fd. We just unlink it here.
2644 */
2645 void
2646 pollhead_clean(pollhead_t *php)
2647 {
2648 polldat_t *pdp;
2649
2650 /*
2651 * In case (1), while we must prevent the thread in question from
2652 * exiting, we must also obey the proper locking order, i.e.
2653 * (ps_lock -> phlock).
2654 */
2655 PH_ENTER(php);
2656 while (php->ph_list != NULL) {
2657 pollstate_t *ps;
2658 pollcache_t *pcp;
2659
2660 pdp = php->ph_list;
2661 ASSERT(pdp->pd_php == php);
2662 if (pdp->pd_thread == NULL) {
2663 /*
2664 * This is case (2). Since the ph_lock is sufficient
2665 * to synchronize this lwp with any other /dev/poll
2666 * lwp, just unlink the polldat.
2667 */ 2668 php->ph_list = pdp->pd_next; 2669 pdp->pd_php = NULL; 2670 pdp->pd_next = NULL; 2671 continue; 2672 } 2673 ps = pdp->pd_thread->t_pollstate; 2674 ASSERT(ps != NULL); 2675 pcp = pdp->pd_pcache; 2676 ASSERT(pcp != NULL); 2677 mutex_enter(&pcp->pc_no_exit); 2678 pcp->pc_busy++; /* prevents exit()'s */ 2679 mutex_exit(&pcp->pc_no_exit); 2680 /* 2681 * Now get the locks in proper order to avoid deadlock. 2682 */ 2683 PH_EXIT(php); 2684 mutex_enter(&ps->ps_lock); 2685 /* 2686 * while we dropped the pollhead lock, the element could be 2687 * taken off the list already. 2688 */ 2689 PH_ENTER(php); 2690 if (pdp->pd_php == php) { 2691 ASSERT(pdp == php->ph_list); 2692 php->ph_list = pdp->pd_next; 2693 pdp->pd_php = NULL; 2694 pdp->pd_next = NULL; 2695 } 2696 PH_EXIT(php); 2697 mutex_exit(&ps->ps_lock); 2698 mutex_enter(&pcp->pc_no_exit); 2699 pcp->pc_busy--; 2700 if (pcp->pc_busy == 0) { 2701 /* 2702 * Wakeup the thread waiting in 2703 * thread_exit(). 2704 */ 2705 cv_signal(&pcp->pc_busy_cv); 2706 } 2707 mutex_exit(&pcp->pc_no_exit); 2708 PH_ENTER(php); 2709 } 2710 PH_EXIT(php); 2711 } 2712 2713 /* 2714 * The remove_list is called to cleanup a partially cached 'current' list or 2715 * to remove a partial list which is no longer cached. The flag value of 1 2716 * indicates the second case. 2717 */ 2718 void 2719 pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end, 2720 int cacheindex, int flag) 2721 { 2722 int i; 2723 2724 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2725 for (i = start; i < end; i++) { 2726 if ((pollfdp[i].fd >= 0) && 2727 (flag || !(pollfdp[i].revents & POLLNVAL))) { 2728 if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex, 2729 (uint_t)pollfdp[i].events)) { 2730 int j; 2731 int fd = pollfdp[i].fd; 2732 2733 for (j = i + 1; j < end; j++) { 2734 if (pollfdp[j].fd == fd) { 2735 pcache_update_xref( 2736 ps->ps_pcache, fd, 2737 (ssize_t)j, cacheindex); 2738 break; 2739 } 2740 } 2741 ASSERT(j <= end); 2742 } 2743 } 2744 } 2745 } 2746 2747 #ifdef DEBUG 2748 2749 #include<sys/strsubr.h> 2750 /* 2751 * make sure curthread is not on anyone's pollhead list any more. 2752 */ 2753 static void 2754 pollcheckphlist() 2755 { 2756 int i; 2757 file_t *fp; 2758 uf_entry_t *ufp; 2759 uf_info_t *fip = P_FINFO(curproc); 2760 struct stdata *stp; 2761 polldat_t *pdp; 2762 2763 mutex_enter(&fip->fi_lock); 2764 for (i = 0; i < fip->fi_nfiles; i++) { 2765 UF_ENTER(ufp, fip, i); 2766 if ((fp = ufp->uf_file) != NULL) { 2767 if ((stp = fp->f_vnode->v_stream) != NULL) { 2768 PH_ENTER(&stp->sd_pollist); 2769 pdp = stp->sd_pollist.ph_list; 2770 while (pdp) { 2771 ASSERT(pdp->pd_thread != curthread); 2772 pdp = pdp->pd_next; 2773 } 2774 PH_EXIT(&stp->sd_pollist); 2775 } 2776 } 2777 UF_EXIT(ufp); 2778 } 2779 mutex_exit(&fip->fi_lock); 2780 } 2781 2782 /* 2783 * for resolved set poll list, the xref info in the pcache should be 2784 * consistent with this poll list. 
2785 */ 2786 static int 2787 pollcheckxref(pollstate_t *ps, int cacheindex) 2788 { 2789 pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd; 2790 pollcache_t *pcp = ps->ps_pcache; 2791 polldat_t *pdp; 2792 int i; 2793 xref_t *refp; 2794 2795 for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) { 2796 if (pollfdp[i].fd < 0) { 2797 continue; 2798 } 2799 pdp = pcache_lookup_fd(pcp, pollfdp[i].fd); 2800 ASSERT(pdp != NULL); 2801 ASSERT(pdp->pd_ref != NULL); 2802 refp = &pdp->pd_ref[cacheindex]; 2803 if (refp->xf_position >= 0) { 2804 ASSERT(refp->xf_refcnt >= 1); 2805 ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd); 2806 if (refp->xf_refcnt > 1) { 2807 int j; 2808 int count = 0; 2809 2810 for (j = refp->xf_position; 2811 j < ps->ps_pcacheset[cacheindex].pcs_nfds; 2812 j++) { 2813 if (pollfdp[j].fd == pdp->pd_fd) { 2814 count++; 2815 } 2816 } 2817 ASSERT(count == refp->xf_refcnt); 2818 } 2819 } 2820 } 2821 return (1); 2822 } 2823 2824 /* 2825 * For every cached pollfd, its polldat struct should be consistent with 2826 * what is in the pcacheset lists. 2827 */ 2828 static void 2829 checkpolldat(pollstate_t *ps) 2830 { 2831 pollcache_t *pcp = ps->ps_pcache; 2832 polldat_t **hashtbl; 2833 int i; 2834 2835 hashtbl = pcp->pc_hash; 2836 for (i = 0; i < pcp->pc_hashsize; i++) { 2837 polldat_t *pdp; 2838 2839 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 2840 ASSERT(pdp->pd_ref != NULL); 2841 if (pdp->pd_count > 0) { 2842 xref_t *refp; 2843 int j; 2844 pollcacheset_t *pcsp; 2845 pollfd_t *pollfd; 2846 2847 for (j = 0; j < ps->ps_nsets; j++) { 2848 refp = &pdp->pd_ref[j]; 2849 if (refp->xf_refcnt > 0) { 2850 pcsp = &ps->ps_pcacheset[j]; 2851 ASSERT(refp->xf_position < pcsp->pcs_nfds); 2852 pollfd = pcsp->pcs_pollfd; 2853 ASSERT(pdp->pd_fd == pollfd[refp->xf_position].fd); 2854 } 2855 } 2856 } 2857 } 2858 } 2859 } 2860 2861 /* 2862 * every wfd element on ph_list must have a corresponding fpollinfo on the 2863 * uf_fpollinfo list. This is a variation of infpollinfo() w/o holding locks. 2864 */ 2865 void 2866 checkwfdlist(vnode_t *vp, fpollinfo_t *fpip) 2867 { 2868 stdata_t *stp; 2869 polldat_t *pdp; 2870 fpollinfo_t *fpip2; 2871 2872 if ((stp = vp->v_stream) == NULL) { 2873 return; 2874 } 2875 PH_ENTER(&stp->sd_pollist); 2876 for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) { 2877 if (pdp->pd_thread != NULL && 2878 pdp->pd_thread->t_procp == curthread->t_procp) { 2879 for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) { 2880 if (pdp->pd_thread == fpip2->fp_thread) { 2881 break; 2882 } 2883 } 2884 ASSERT(fpip2 != NULL); 2885 } 2886 } 2887 PH_EXIT(&stp->sd_pollist); 2888 } 2889 2890 /* 2891 * For each cached fd whose bit is not set in bitmap, its revents field in 2892 * current poll list should be 0. 
2893 */ 2894 static int 2895 pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex) 2896 { 2897 pollcache_t *pcp = ps->ps_pcache; 2898 pollfd_t *pollfdp = ps->ps_pollfd; 2899 int i; 2900 2901 for (i = begin; i < end; i++) { 2902 polldat_t *pdp; 2903 2904 ASSERT(!BT_TEST(pcp->pc_bitmap, i)); 2905 pdp = pcache_lookup_fd(pcp, i); 2906 if (pdp && pdp->pd_fp != NULL) { 2907 xref_t *refp; 2908 int entry; 2909 2910 ASSERT(pdp->pd_ref != NULL); 2911 refp = &pdp->pd_ref[cacheindex]; 2912 if (refp->xf_refcnt == 0) { 2913 continue; 2914 } 2915 entry = refp->xf_position; 2916 ASSERT(entry >= 0); 2917 ASSERT(pollfdp[entry].revents == 0); 2918 if (refp->xf_refcnt > 1) { 2919 int j; 2920 2921 for (j = entry + 1; j < ps->ps_nfds; j++) { 2922 if (pollfdp[j].fd == i) { 2923 ASSERT(pollfdp[j].revents == 0); 2924 } 2925 } 2926 } 2927 } 2928 } 2929 return (1); 2930 } 2931 2932 #endif /* DEBUG */ 2933 2934 pollcache_t * 2935 pcache_alloc() 2936 { 2937 return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP)); 2938 } 2939 2940 void 2941 pcache_create(pollcache_t *pcp, nfds_t nfds) 2942 { 2943 size_t mapsize; 2944 2945 /* 2946 * allocate enough bits for the poll fd list 2947 */ 2948 if ((mapsize = POLLMAPCHUNK) <= nfds) { 2949 mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1); 2950 } 2951 pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t), 2952 KM_SLEEP); 2953 pcp->pc_mapsize = mapsize; 2954 /* 2955 * The hash size is at least POLLHASHCHUNKSZ. If user polls a large 2956 * number of fd to start with, allocate a bigger hash table (to the 2957 * nearest multiple of POLLHASHCHUNKSZ) because dynamically growing a 2958 * hash table is expensive. 2959 */ 2960 if (nfds < POLLHASHCHUNKSZ) { 2961 pcp->pc_hashsize = POLLHASHCHUNKSZ; 2962 } else { 2963 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 2964 ~(POLLHASHCHUNKSZ - 1); 2965 } 2966 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 2967 KM_SLEEP); 2968 } 2969 2970 void 2971 pcache_destroy(pollcache_t *pcp) 2972 { 2973 polldat_t **hashtbl; 2974 int i; 2975 2976 hashtbl = pcp->pc_hash; 2977 for (i = 0; i < pcp->pc_hashsize; i++) { 2978 if (hashtbl[i] != NULL) { 2979 polldat_t *pdp, *pdp2; 2980 2981 pdp = hashtbl[i]; 2982 while (pdp != NULL) { 2983 pdp2 = pdp->pd_hashnext; 2984 if (pdp->pd_ref != NULL) { 2985 kmem_free(pdp->pd_ref, sizeof (xref_t) * 2986 pdp->pd_nsets); 2987 } 2988 kmem_free(pdp, sizeof (polldat_t)); 2989 pdp = pdp2; 2990 pcp->pc_fdcount--; 2991 } 2992 } 2993 } 2994 ASSERT(pcp->pc_fdcount == 0); 2995 kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize); 2996 kmem_free(pcp->pc_bitmap, 2997 sizeof (ulong_t) * (pcp->pc_mapsize/BT_NBIPUL)); 2998 mutex_destroy(&pcp->pc_no_exit); 2999 mutex_destroy(&pcp->pc_lock); 3000 cv_destroy(&pcp->pc_cv); 3001 cv_destroy(&pcp->pc_busy_cv); 3002 kmem_free(pcp, sizeof (pollcache_t)); 3003 } 3004 3005 pollcacheset_t * 3006 pcacheset_create(int nsets) 3007 { 3008 return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP)); 3009 } 3010 3011 void 3012 pcacheset_destroy(pollcacheset_t *pcsp, int nsets) 3013 { 3014 int i; 3015 3016 for (i = 0; i < nsets; i++) { 3017 if (pcsp[i].pcs_pollfd != NULL) { 3018 kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds * 3019 sizeof (pollfd_t)); 3020 } 3021 } 3022 kmem_free(pcsp, sizeof (pollcacheset_t) * nsets); 3023 } 3024 3025 /* 3026 * Check each duplicated poll fd in the poll list. It may be necessary to 3027 * VOP_POLL the same fd again using different poll events. getf() has been 3028 * done by caller. 
This routine returns 0 if it can successfully process the
3029 * entire poll fd list. It returns -1 if the underlying vnode has changed
3030 * during a VOP_POLL, in which case the caller has to repoll. It returns a
3031 * positive value if VOP_POLL failed.
3032 */
3033 static int
3034 plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp,
3035 int entry, int *fdcntp)
3036 {
3037 int i;
3038 int fd;
3039 nfds_t nfds = psp->ps_nfds;
3040
3041 fd = pollfdp[entry].fd;
3042 for (i = entry + 1; i < nfds; i++) {
3043 if (pollfdp[i].fd == fd) {
3044 if (pollfdp[i].events == pollfdp[entry].events) {
3045 if ((pollfdp[i].revents =
3046 pollfdp[entry].revents) != 0) {
3047 (*fdcntp)++;
3048 }
3049 } else {
3050
3051 int error;
3052 pollhead_t *php;
3053 pollcache_t *pcp = psp->ps_pcache;
3054
3055 /*
3056 * The events are different. Do a VOP_POLL on
3057 * this fd so that we don't miss any revents.
3058 */
3059 php = NULL;
3060 ASSERT(curthread->t_pollcache == NULL);
3061 error = VOP_POLL(fp->f_vnode,
3062 pollfdp[i].events, 0,
3063 &pollfdp[i].revents, &php, NULL);
3064 if (error) {
3065 return (error);
3066 }
3067 /*
3068 * Layered devices (e.g. the console driver)
3069 * may change the vnode and thus the pollhead
3070 * pointer out from underneath us.
3071 */
3072 if (php != NULL && pdp->pd_php != NULL &&
3073 php != pdp->pd_php) {
3074 pollhead_delete(pdp->pd_php, pdp);
3075 pdp->pd_php = php;
3076 pollhead_insert(php, pdp);
3077 /*
3078 * We could have missed a wakeup on the
3079 * new target device. Make sure the new
3080 * target gets polled once.
3081 */
3082 BT_SET(pcp->pc_bitmap, fd);
3083 return (-1);
3084 }
3085 if (pollfdp[i].revents) {
3086 (*fdcntp)++;
3087 }
3088 }
3089 }
3090 }
3091 return (0);
3092 }
3093
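/*
 * Illustrative sketch (comment only, not compiled code): how a
 * recursion-aware caller, e.g. the /dev/poll code, is expected to bracket
 * access to another context's pollcache with the pollstate_enter() and
 * pollstate_exit() routines defined above.  The wrapper and
 * example_do_scan() are hypothetical names; only the pollstate_enter()/
 * pollstate_exit() protocol and the PSE_* return codes come from this
 * file.  A failed pollstate_enter() means the pollcache cannot safely be
 * locked right now (no pollstate, depth limit, lock loop, or deadlock), so
 * the caller must back off rather than block.
 *
 *	int
 *	example_scan(pollcache_t *pcp)
 *	{
 *		int rc;
 *
 *		if (pollstate_enter(pcp) != PSE_SUCCESS)
 *			return (EBUSY);
 *		rc = example_do_scan(pcp);
 *		pollstate_exit(pcp);
 *		return (rc);
 *	}
 */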