/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
 * Copyright 2015, Joyent, Inc.
 */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/param.h>
#include <sys/isa_defs.h>
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/mode.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/poll_impl.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/bitmap.h>
#include <sys/kstat.h>
#include <sys/rctl.h>
#include <sys/port_impl.h>
#include <sys/schedctl.h>
#include <sys/cpu.h>

#define	NPHLOCKS	64	/* Number of locks; must be power of 2 */
#define	PHLOCKADDR(php)	&plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)]
#define	PHLOCK(php)	PHLOCKADDR(php).pp_lock
#define	PH_ENTER(php)	mutex_enter(PHLOCK(php))
#define	PH_EXIT(php)	mutex_exit(PHLOCK(php))
#define	VALID_POLL_EVENTS	(POLLIN | POLLPRI | POLLOUT | POLLRDNORM \
	| POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL)

/*
 * global counters to collect some stats
 */
static struct {
	kstat_named_t	polllistmiss;	/* failed to find a cached poll list */
	kstat_named_t	pollcachehit;	/* list matched 100% w/ cached one */
	kstat_named_t	pollcachephit;	/* list matched < 100% w/ cached one */
	kstat_named_t	pollcachemiss;	/* every list entry is dif from cache */
	kstat_named_t	pollunlockfail;	/* failed to perform pollunlock */
} pollstats = {
	{ "polllistmiss",	KSTAT_DATA_UINT64 },
	{ "pollcachehit",	KSTAT_DATA_UINT64 },
	{ "pollcachephit",	KSTAT_DATA_UINT64 },
	{ "pollcachemiss",	KSTAT_DATA_UINT64 },
	{ "pollunlockfail",	KSTAT_DATA_UINT64 }
};

kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats;
uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t);

struct pplock {
	kmutex_t	pp_lock;
	short		pp_flag;
	kcondvar_t	pp_wait_cv;
	int32_t		pp_pad;		/* to a nice round 16 bytes */
};

static struct pplock plocks[NPHLOCKS];	/* Hash array of pollhead locks */

/*
 * Contention lock & list for preventing deadlocks in recursive /dev/poll.
 */
static kmutex_t pollstate_contenders_lock;
static pollstate_t *pollstate_contenders = NULL;

#ifdef DEBUG
static int pollchecksanity(pollstate_t *, nfds_t);
static int pollcheckxref(pollstate_t *, int);
static void pollcheckphlist(void);
static int pollcheckrevents(pollstate_t *, int, int, int);
static void checkpolldat(pollstate_t *);
#endif	/* DEBUG */
static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int,
    int *);

/*
 * Data structure overview:
 * The per-thread poll state consists of
 *	one pollstate_t
 *	one pollcache_t
 *	one bitmap with one event bit per fd
 *	a (two-dimensional) hashed array of polldat_t structures - one entry
 *	per fd
 *
 * This conglomerate of data structures interacts with
 *	the pollhead which is used by VOP_POLL and pollwakeup
 *	(protected by the PHLOCK, cached array of plocks), and
 *	the fpollinfo list hanging off the fi_list which is used to notify
 *	poll when a cached fd is closed. This is protected by uf_lock.
 *
 * Invariants:
 *	pd_php (pollhead pointer) is set iff (if and only if) the polldat
 *	is on that pollhead. This is modified atomically under pc_lock.
 *
 *	pd_fp (file_t pointer) is set iff the thread is on the fpollinfo
 *	list for that open file.
 *	This is modified atomically under pc_lock.
 *
 *	pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt.
 *	Iff pd_ref[i].xf_refcnt >= 1 then
 *		ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd
 *	Iff pd_ref[i].xf_refcnt > 1 then
 *		In ps_pcacheset[i].pcs_pollfd between index
 *		pd_ref[i].xf_position and the end of the list
 *		there are xf_refcnt entries with .fd == pd_fd
 *
 * Locking design:
 * Whenever possible the design relies on the fact that the poll cache state
 * is per thread, thus for both poll and exit it is self-synchronizing.
 * Thus the key interactions where other threads access the state are:
 *	pollwakeup (and polltime), and
 *	close cleaning up the cached references to an open file
 *
 * The two key locks in poll proper are ps_lock and pc_lock.
 *
 * The ps_lock is used for synchronization between poll, (lwp_)exit and close
 * to ensure that modifications to the pollcacheset structure are serialized.
 * This lock is held through most of poll() except where poll sleeps,
 * since there is little need to handle closes concurrently with the execution
 * of poll.
 * The pc_lock protects most of the fields in the pollcache structure and
 * polldat structures (which are accessed by poll, pollwakeup, and polltime)
 * with the exception of fields that are only modified when only one thread
 * can access this per-thread state.
 * Those exceptions occur in poll when first allocating the per-thread state,
 * when poll grows the number of polldat (never shrinks), and when
 * exit/pollcleanup has ensured that there are no references from either
 * pollheads or fpollinfo to the thread's poll state.
 *
 * The poll(2) system call is the only path in which ps_lock and pc_lock are
 * both held, in that order. It needs ps_lock to synchronize with close and
 * lwp_exit; and pc_lock with pollwakeup.
 *
 * The locking interaction between pc_lock and PHLOCK takes into account
 * that poll acquires these locks in the order of pc_lock and then PHLOCK
 * while pollwakeup does it in the reverse order.
 * Thus pollwakeup implements deadlock avoidance by dropping the locks and
 * reacquiring them in the reverse order. For this to work pollwakeup needs
 * to prevent the thread from exiting and freeing all of the poll related
 * state. This is done using
 *	the pc_no_exit lock
 *	the pc_busy counter
 *	the pc_busy_cv condition variable
 *
 * The locking interaction between pc_lock and uf_lock has similar
 * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef
 * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock
 * to prevent poll or exit from doing a delfpollinfo after which the thread
 * might exit. But the cleanup needs to acquire pc_lock when modifying
 * the poll cache state. The solution is to use pc_busy and do the close
 * cleanup in two phases:
 *	First close calls pollblockexit which increments pc_busy.
 *	This prevents the per-thread poll related state from being freed.
 *	Then close drops uf_lock and calls pollcacheclean.
 *	This routine can then acquire pc_lock and remove any references
 *	to the closing fd (as well as recording that it has been closed
 *	so that a POLLNVAL can be generated even if the fd is reused before
 *	poll has been woken up and checked getf() again).
 *
 * When removing a polled fd from the poll cache, the fd is always removed
 * from the pollhead list first and then from the fpollinfo list, i.e.,
 * pollhead_delete() is called before delfpollinfo().
 *
 *
 * Locking hierarchy:
 *	pc_no_exit is a leaf level lock.
 *	ps_lock is held when acquiring pc_lock (except when pollwakeup
 *	acquires pc_lock).
 *	pc_lock might be held when acquiring PHLOCK (pollhead_insert/
 *	pollhead_delete)
 *	pc_lock is always held (but this is not required)
 *	when acquiring PHLOCK (in polladd/pollhead_delete and pollwakeup called
 *	from pcache_clean_entry).
 *	pc_lock is held across addfpollinfo/delfpollinfo which acquire
 *	uf_lock.
 *	pc_lock is held across getf/releasef which acquire uf_lock.
 *	ps_lock might be held across getf/releasef which acquire uf_lock.
 *	pollwakeup tries to acquire pc_lock while holding PHLOCK
 *	but drops the locks and reacquires them in the reverse order to avoid
 *	deadlock.
 *
 * Note also that there is deadlock avoidance support for VOP_POLL routines
 * and pollwakeup involving a file system or driver lock.
 * See below.
 */

/*
 * Deadlock avoidance support for VOP_POLL() routines.  This is
 * sometimes necessary to prevent deadlock between polling threads
 * (which hold poll locks on entry to xx_poll(), then acquire foo)
 * and pollwakeup() threads (which hold foo, then acquire poll locks).
 *
 * pollunlock(*cookie) releases whatever poll locks the current thread holds,
 *	setting a cookie for use by pollrelock();
 *
 * pollrelock(cookie) reacquires previously dropped poll locks;
 *
 * polllock(php, mutex) does the common case: pollunlock(),
 *	acquire the problematic mutex, pollrelock().
 *
 * If polllock() or pollunlock() return non-zero, it indicates that a recursive
 * /dev/poll is in progress and pollcache locks cannot be dropped.  Callers
 * must handle this by indicating a POLLNVAL in the revents of the VOP_POLL.
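 *
 * For illustration only (xx_chpoll, xx_pollhead and xx_mutex are hypothetical
 * driver names, not part of this file), a poll entry point that must take a
 * lock also held across pollwakeup() would follow roughly this pattern:
 *
 *	if (polllock(&xx_pollhead, &xx_mutex) != 0) {
 *		*reventsp |= POLLNVAL;		(recursive /dev/poll case)
 *		return (0);
 *	}
 *	... examine device state under xx_mutex, set *reventsp ...
 *	mutex_exit(&xx_mutex);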
 */
int
pollunlock(int *lockstate)
{
	pollstate_t *ps = curthread->t_pollstate;
	pollcache_t *pcp;

	ASSERT(lockstate != NULL);

	/*
	 * There is no way to safely perform a pollunlock() while in the depths
	 * of a recursive /dev/poll operation.
	 */
	if (ps != NULL && ps->ps_depth > 1) {
		ps->ps_flags |= POLLSTATE_ULFAIL;
		pollstats.pollunlockfail.value.ui64++;
		return (-1);
	}

	/*
	 * t_pollcache is set by /dev/poll and event ports (port_fd.c).
	 * If pollrelock/pollunlock is called as a result of poll(2),
	 * the t_pollcache should be NULL.
	 */
	if (curthread->t_pollcache == NULL)
		pcp = ps->ps_pcache;
	else
		pcp = curthread->t_pollcache;

	if (!mutex_owned(&pcp->pc_lock)) {
		*lockstate = 0;
	} else {
		*lockstate = 1;
		mutex_exit(&pcp->pc_lock);
	}
	return (0);
}

void
pollrelock(int lockstate)
{
	pollstate_t *ps = curthread->t_pollstate;
	pollcache_t *pcp;

	/*
	 * Skip this whole ordeal if the pollcache was not locked to begin
	 * with.
	 */
	if (lockstate == 0)
		return;

	/*
	 * t_pollcache is set by /dev/poll and event ports (port_fd.c).
	 * If pollrelock/pollunlock is called as a result of poll(2),
	 * the t_pollcache should be NULL.
	 */
	if (curthread->t_pollcache == NULL)
		pcp = ps->ps_pcache;
	else
		pcp = curthread->t_pollcache;

	mutex_enter(&pcp->pc_lock);
}

/* ARGSUSED */
int
polllock(pollhead_t *php, kmutex_t *lp)
{
	if (mutex_tryenter(lp) == 0) {
		int state;

		if (pollunlock(&state) != 0) {
			return (-1);
		}
		mutex_enter(lp);
		pollrelock(state);
	}
	return (0);
}

static int
poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp)
{
	kthread_t	*t = curthread;
	klwp_t		*lwp = ttolwp(t);
	proc_t		*p = ttoproc(t);
	int		fdcnt = 0;
	int		i;
	hrtime_t	deadline;	/* hrtime value when we want to return */
	pollfd_t	*pollfdp;
	pollstate_t	*ps;
	pollcache_t	*pcp;
	int		error = 0;
	nfds_t		old_nfds;
	int		cacheindex = 0;	/* which cache set is used */

	/*
	 * Determine the precise future time of the requested timeout, if any.
	 */
	if (tsp == NULL) {
		deadline = -1;
	} else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) {
		deadline = 0;
	} else {
		/* They must wait at least a tick. */
		deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec;
		deadline = MAX(deadline, nsec_per_tick);
		deadline += gethrtime();
	}

	/*
	 * Reset our signal mask, if requested.
	 */
	if (ksetp != NULL) {
		mutex_enter(&p->p_lock);
		schedctl_finish_sigblock(t);
		lwp->lwp_sigoldmask = t->t_hold;
		t->t_hold = *ksetp;
		t->t_flag |= T_TOMASK;
		/*
		 * Call cv_reltimedwait_sig() just to check for signals.
		 * We will return immediately with either 0 or -1.
		 */
		if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0,
		    TR_CLOCK_TICK)) {
			mutex_exit(&p->p_lock);
			error = EINTR;
			goto pollout;
		}
		mutex_exit(&p->p_lock);
	}

	/*
	 * Check to see if this one just wants to use poll() as a timeout.
	 * If yes then bypass all the other stuff and make it sleep.
	 */
	if (nfds == 0) {
		/*
		 * Sleep until we have passed the requested future
		 * time or until interrupted by a signal.
		 * Do not check for signals if we do not want to wait.
381 */ 382 if (deadline != 0) { 383 mutex_enter(&t->t_delay_lock); 384 while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv, 385 &t->t_delay_lock, deadline)) > 0) 386 continue; 387 mutex_exit(&t->t_delay_lock); 388 error = (error == 0) ? EINTR : 0; 389 } 390 goto pollout; 391 } 392 393 if (nfds > p->p_fno_ctl) { 394 mutex_enter(&p->p_lock); 395 (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], 396 p->p_rctls, p, RCA_SAFE); 397 mutex_exit(&p->p_lock); 398 error = EINVAL; 399 goto pollout; 400 } 401 402 /* 403 * Need to allocate memory for pollstate before anything because 404 * the mutex and cv are created in this space 405 */ 406 ps = pollstate_create(); 407 408 if (ps->ps_pcache == NULL) 409 ps->ps_pcache = pcache_alloc(); 410 pcp = ps->ps_pcache; 411 412 /* 413 * NOTE: for performance, buffers are saved across poll() calls. 414 * The theory is that if a process polls heavily, it tends to poll 415 * on the same set of descriptors. Therefore, we only reallocate 416 * buffers when nfds changes. There is no hysteresis control, 417 * because there is no data to suggest that this is necessary; 418 * the penalty of reallocating is not *that* great in any event. 419 */ 420 old_nfds = ps->ps_nfds; 421 if (nfds != old_nfds) { 422 423 kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); 424 pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); 425 ps->ps_pollfd = pollfdp; 426 ps->ps_nfds = nfds; 427 } 428 429 pollfdp = ps->ps_pollfd; 430 if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { 431 error = EFAULT; 432 goto pollout; 433 } 434 435 if (fds == NULL) { 436 /* 437 * If the process has page 0 mapped, then the copyin() above 438 * will succeed even if fds is NULL. However, our cached 439 * poll lists are keyed by the address of the passed-in fds 440 * structure, and we use the value NULL to indicate an unused 441 * poll cache list entry. As such, we elect not to support 442 * NULL as a valid (user) memory address and fail the poll() 443 * call. 444 */ 445 error = EINVAL; 446 goto pollout; 447 } 448 449 /* 450 * If this thread polls for the first time, allocate ALL poll 451 * cache data structures and cache the poll fd list. This 452 * allocation is delayed till now because lwp's polling 0 fd 453 * (i.e. using poll as timeout()) don't need this memory. 454 */ 455 mutex_enter(&ps->ps_lock); 456 pcp = ps->ps_pcache; 457 ASSERT(pcp != NULL); 458 if (pcp->pc_bitmap == NULL) { 459 pcache_create(pcp, nfds); 460 /* 461 * poll and cache this poll fd list in ps_pcacheset[0]. 462 */ 463 error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex); 464 if (fdcnt || error) { 465 mutex_exit(&ps->ps_lock); 466 goto pollout; 467 } 468 } else { 469 pollcacheset_t *pcset = ps->ps_pcacheset; 470 471 /* 472 * Not first time polling. Select a cached poll list by 473 * matching user pollfd list buffer address. 474 */ 475 for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) { 476 if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) { 477 if ((++pcset[cacheindex].pcs_count) == 0) { 478 /* 479 * counter is wrapping around. 480 */ 481 pcacheset_reset_count(ps, cacheindex); 482 } 483 /* 484 * examine and resolve possible 485 * difference of the current poll 486 * list and previously cached one. 487 * If there is an error during resolve(), 488 * the callee will guarantee the consistency 489 * of cached poll list and cache content. 
490 */ 491 error = pcacheset_resolve(ps, nfds, &fdcnt, 492 cacheindex); 493 if (error) { 494 mutex_exit(&ps->ps_lock); 495 goto pollout; 496 } 497 break; 498 } 499 500 /* 501 * Note that pcs_usradr field of an used entry won't be 502 * NULL because it stores the address of passed-in fds, 503 * and NULL fds will not be cached (Then it is either 504 * the special timeout case when nfds is 0 or it returns 505 * failure directly). 506 */ 507 if (pcset[cacheindex].pcs_usradr == NULL) { 508 /* 509 * found an unused entry. Use it to cache 510 * this poll list. 511 */ 512 error = pcacheset_cache_list(ps, fds, &fdcnt, 513 cacheindex); 514 if (fdcnt || error) { 515 mutex_exit(&ps->ps_lock); 516 goto pollout; 517 } 518 break; 519 } 520 } 521 if (cacheindex == ps->ps_nsets) { 522 /* 523 * We failed to find a matching cached poll fd list. 524 * replace an old list. 525 */ 526 pollstats.polllistmiss.value.ui64++; 527 cacheindex = pcacheset_replace(ps); 528 ASSERT(cacheindex < ps->ps_nsets); 529 pcset[cacheindex].pcs_usradr = (uintptr_t)fds; 530 error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex); 531 if (error) { 532 mutex_exit(&ps->ps_lock); 533 goto pollout; 534 } 535 } 536 } 537 538 /* 539 * Always scan the bitmap with the lock on the pollcache held. 540 * This is to make sure that a wakeup does not come undetected. 541 * If the lock is not held, a pollwakeup could have come for an 542 * fd we already checked but before this thread sleeps, in which 543 * case the wakeup is missed. Now we hold the pcache lock and 544 * check the bitmap again. This will prevent wakeup from happening 545 * while we hold pcache lock since pollwakeup() will also lock 546 * the pcache before updating poll bitmap. 547 */ 548 mutex_enter(&pcp->pc_lock); 549 for (;;) { 550 pcp->pc_flag = 0; 551 error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex); 552 if (fdcnt || error) { 553 mutex_exit(&pcp->pc_lock); 554 mutex_exit(&ps->ps_lock); 555 break; 556 } 557 558 /* 559 * If PC_POLLWAKE is set, a pollwakeup() was performed on 560 * one of the file descriptors. This can happen only if 561 * one of the VOP_POLL() functions dropped pcp->pc_lock. 562 * The only current cases of this is in procfs (prpoll()) 563 * and STREAMS (strpoll()). 564 */ 565 if (pcp->pc_flag & PC_POLLWAKE) 566 continue; 567 568 /* 569 * If you get here, the poll of fds was unsuccessful. 570 * Wait until some fd becomes readable, writable, or gets 571 * an exception, or until a signal or a timeout occurs. 572 * Do not check for signals if we have a zero timeout. 573 */ 574 mutex_exit(&ps->ps_lock); 575 if (deadline == 0) { 576 error = -1; 577 } else { 578 error = cv_timedwait_sig_hrtime(&pcp->pc_cv, 579 &pcp->pc_lock, deadline); 580 } 581 mutex_exit(&pcp->pc_lock); 582 /* 583 * If we have received a signal or timed out 584 * then break out and return. 585 */ 586 if (error <= 0) { 587 error = (error == 0) ? EINTR : 0; 588 break; 589 } 590 /* 591 * We have not received a signal or timed out. 592 * Continue around and poll fds again. 593 */ 594 mutex_enter(&ps->ps_lock); 595 mutex_enter(&pcp->pc_lock); 596 } 597 598 pollout: 599 /* 600 * If we changed the signal mask but we received 601 * no signal then restore the signal mask. 602 * Otherwise psig() will deal with the signal mask. 
603 */ 604 if (ksetp != NULL) { 605 mutex_enter(&p->p_lock); 606 if (lwp->lwp_cursig == 0) { 607 t->t_hold = lwp->lwp_sigoldmask; 608 t->t_flag &= ~T_TOMASK; 609 } 610 mutex_exit(&p->p_lock); 611 } 612 613 if (error) 614 return (set_errno(error)); 615 616 /* 617 * Copy out the events and return the fdcnt to the user. 618 */ 619 if (nfds != 0 && 620 copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) 621 return (set_errno(EFAULT)); 622 623 #ifdef DEBUG 624 /* 625 * Another sanity check: 626 */ 627 if (fdcnt) { 628 int reventcnt = 0; 629 630 for (i = 0; i < nfds; i++) { 631 if (pollfdp[i].fd < 0) { 632 ASSERT(pollfdp[i].revents == 0); 633 continue; 634 } 635 if (pollfdp[i].revents) { 636 reventcnt++; 637 } 638 } 639 ASSERT(fdcnt == reventcnt); 640 } else { 641 for (i = 0; i < nfds; i++) { 642 ASSERT(pollfdp[i].revents == 0); 643 } 644 } 645 #endif /* DEBUG */ 646 647 return (fdcnt); 648 } 649 650 /* 651 * This is the system call trap that poll(), 652 * select() and pselect() are built upon. 653 * It is a private interface between libc and the kernel. 654 */ 655 int 656 pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) 657 { 658 timespec_t ts; 659 timespec_t *tsp; 660 sigset_t set; 661 k_sigset_t kset; 662 k_sigset_t *ksetp; 663 model_t datamodel = get_udatamodel(); 664 665 if (timeoutp == NULL) 666 tsp = NULL; 667 else { 668 if (datamodel == DATAMODEL_NATIVE) { 669 if (copyin(timeoutp, &ts, sizeof (ts))) 670 return (set_errno(EFAULT)); 671 } else { 672 timespec32_t ts32; 673 674 if (copyin(timeoutp, &ts32, sizeof (ts32))) 675 return (set_errno(EFAULT)); 676 TIMESPEC32_TO_TIMESPEC(&ts, &ts32) 677 } 678 679 if (itimerspecfix(&ts)) 680 return (set_errno(EINVAL)); 681 tsp = &ts; 682 } 683 684 if (setp == NULL) 685 ksetp = NULL; 686 else { 687 if (copyin(setp, &set, sizeof (set))) 688 return (set_errno(EFAULT)); 689 sigutok(&set, &kset); 690 ksetp = &kset; 691 } 692 693 return (poll_common(fds, nfds, tsp, ksetp)); 694 } 695 696 /* 697 * Clean up any state left around by poll(2). Called when a thread exits. 698 */ 699 void 700 pollcleanup() 701 { 702 pollstate_t *ps = curthread->t_pollstate; 703 pollcache_t *pcp; 704 705 if (ps == NULL) 706 return; 707 pcp = ps->ps_pcache; 708 /* 709 * free up all cached poll fds 710 */ 711 if (pcp == NULL) { 712 /* this pollstate is used by /dev/poll */ 713 goto pollcleanout; 714 } 715 716 if (pcp->pc_bitmap != NULL) { 717 ASSERT(MUTEX_NOT_HELD(&ps->ps_lock)); 718 /* 719 * a close lwp can race with us when cleaning up a polldat 720 * entry. We hold the ps_lock when cleaning hash table. 721 * Since this pollcache is going away anyway, there is no 722 * need to hold the pc_lock. 723 */ 724 mutex_enter(&ps->ps_lock); 725 pcache_clean(pcp); 726 mutex_exit(&ps->ps_lock); 727 #ifdef DEBUG 728 /* 729 * At this point, all fds cached by this lwp should be 730 * cleaned up. There should be no fd in fi_list still 731 * reference this thread. 732 */ 733 checkfpollinfo(); /* sanity check */ 734 pollcheckphlist(); /* sanity check */ 735 #endif /* DEBUG */ 736 } 737 /* 738 * Be sure no one is referencing thread before exiting 739 */ 740 mutex_enter(&pcp->pc_no_exit); 741 ASSERT(pcp->pc_busy >= 0); 742 while (pcp->pc_busy > 0) 743 cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit); 744 mutex_exit(&pcp->pc_no_exit); 745 pollcleanout: 746 pollstate_destroy(ps); 747 curthread->t_pollstate = NULL; 748 } 749 750 /* 751 * pollwakeup() - poke threads waiting in poll() for some event 752 * on a particular object. 
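 *
 * For illustration only (xx_mutex and xx_pollhead are hypothetical driver
 * names, not part of this file), a driver typically calls this once new
 * data has been made available:
 *
 *	mutex_enter(&xx_mutex);
 *	(queue the newly arrived data)
 *	mutex_exit(&xx_mutex);
 *	pollwakeup(&xx_pollhead, POLLIN | POLLRDNORM);
 *
 * If xx_mutex were instead still held here, the corresponding chpoll routine
 * would need the polllock() protocol described earlier to avoid deadlock.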
753 * 754 * The threads hanging off of the specified pollhead structure are scanned. 755 * If their event mask matches the specified event(s), then pollnotify() is 756 * called to poke the thread. 757 * 758 * Multiple events may be specified. When POLLHUP or POLLERR are specified, 759 * all waiting threads are poked. 760 * 761 * It is important that pollnotify() not drop the lock protecting the list 762 * of threads. 763 */ 764 void 765 pollwakeup(pollhead_t *php, short events_arg) 766 { 767 polldat_t *pdp; 768 int events = (ushort_t)events_arg; 769 struct plist { 770 port_t *pp; 771 int pevents; 772 struct plist *next; 773 }; 774 struct plist *plhead = NULL, *pltail = NULL; 775 776 retry: 777 PH_ENTER(php); 778 779 for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) { 780 if ((pdp->pd_events & events) || 781 (events & (POLLHUP | POLLERR))) { 782 783 pollcache_t *pcp; 784 785 if (pdp->pd_portev != NULL) { 786 port_kevent_t *pkevp = pdp->pd_portev; 787 /* 788 * Object (fd) is associated with an event port, 789 * => send event notification to the port. 790 */ 791 ASSERT(pkevp->portkev_source == PORT_SOURCE_FD); 792 mutex_enter(&pkevp->portkev_lock); 793 if (pkevp->portkev_flags & PORT_KEV_VALID) { 794 int pevents; 795 796 pkevp->portkev_flags &= ~PORT_KEV_VALID; 797 pkevp->portkev_events |= events & 798 (pdp->pd_events | POLLHUP | 799 POLLERR); 800 /* 801 * portkev_lock mutex will be released 802 * by port_send_event(). 803 */ 804 port_send_event(pkevp); 805 806 /* 807 * If we have some thread polling the 808 * port's fd, add it to the list. They 809 * will be notified later. 810 * The port_pollwkup() will flag the 811 * port_t so that it will not disappear 812 * till port_pollwkdone() is called. 813 */ 814 pevents = 815 port_pollwkup(pkevp->portkev_port); 816 if (pevents) { 817 struct plist *t; 818 t = kmem_zalloc( 819 sizeof (struct plist), 820 KM_SLEEP); 821 t->pp = pkevp->portkev_port; 822 t->pevents = pevents; 823 if (plhead == NULL) { 824 plhead = t; 825 } else { 826 pltail->next = t; 827 } 828 pltail = t; 829 } 830 } else { 831 mutex_exit(&pkevp->portkev_lock); 832 } 833 continue; 834 } 835 836 pcp = pdp->pd_pcache; 837 838 /* 839 * Try to grab the lock for this thread. If 840 * we don't get it then we may deadlock so 841 * back out and restart all over again. Note 842 * that the failure rate is very very low. 843 */ 844 if (mutex_tryenter(&pcp->pc_lock)) { 845 pollnotify(pcp, pdp->pd_fd); 846 mutex_exit(&pcp->pc_lock); 847 } else { 848 /* 849 * We are here because: 850 * 1) This thread has been woke up 851 * and is trying to get out of poll(). 852 * 2) Some other thread is also here 853 * but with a different pollhead lock. 854 * 855 * So, we need to drop the lock on pollhead 856 * because of (1) but we want to prevent 857 * that thread from doing lwp_exit() or 858 * devpoll close. We want to ensure that 859 * the pollcache pointer is still invalid. 860 * 861 * Solution: Grab the pcp->pc_no_exit lock, 862 * increment the pc_busy counter, drop every 863 * lock in sight. Get out of the way and wait 864 * for type (2) threads to finish. 865 */ 866 867 mutex_enter(&pcp->pc_no_exit); 868 pcp->pc_busy++; /* prevents exit()'s */ 869 mutex_exit(&pcp->pc_no_exit); 870 871 PH_EXIT(php); 872 mutex_enter(&pcp->pc_lock); 873 mutex_exit(&pcp->pc_lock); 874 mutex_enter(&pcp->pc_no_exit); 875 pcp->pc_busy--; 876 if (pcp->pc_busy == 0) { 877 /* 878 * Wakeup the thread waiting in 879 * thread_exit(). 
880 */ 881 cv_signal(&pcp->pc_busy_cv); 882 } 883 mutex_exit(&pcp->pc_no_exit); 884 goto retry; 885 } 886 } 887 } 888 889 890 /* 891 * Event ports - If this php is of the port on the list, 892 * call port_pollwkdone() to release it. The port_pollwkdone() 893 * needs to be called before dropping the PH lock so that any new 894 * thread attempting to poll this port are blocked. There can be 895 * only one thread here in pollwakeup notifying this port's fd. 896 */ 897 if (plhead != NULL && &plhead->pp->port_pollhd == php) { 898 struct plist *t; 899 port_pollwkdone(plhead->pp); 900 t = plhead; 901 plhead = plhead->next; 902 kmem_free(t, sizeof (struct plist)); 903 } 904 PH_EXIT(php); 905 906 /* 907 * Event ports - Notify threads polling the event port's fd. 908 * This is normally done in port_send_event() where it calls 909 * pollwakeup() on the port. But, for PORT_SOURCE_FD source alone, 910 * we do it here in pollwakeup() to avoid a recursive call. 911 */ 912 if (plhead != NULL) { 913 php = &plhead->pp->port_pollhd; 914 events = plhead->pevents; 915 goto retry; 916 } 917 } 918 919 /* 920 * This function is called to inform a thread (or threads) that an event being 921 * polled on has occurred. The pollstate lock on the thread should be held 922 * on entry. 923 */ 924 void 925 pollnotify(pollcache_t *pcp, int fd) 926 { 927 ASSERT(fd < pcp->pc_mapsize); 928 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 929 BT_SET(pcp->pc_bitmap, fd); 930 pcp->pc_flag |= PC_POLLWAKE; 931 cv_broadcast(&pcp->pc_cv); 932 pcache_wake_parents(pcp); 933 } 934 935 /* 936 * add a polldat entry to pollhead ph_list. The polldat struct is used 937 * by pollwakeup to wake sleeping pollers when polled events has happened. 938 */ 939 void 940 pollhead_insert(pollhead_t *php, polldat_t *pdp) 941 { 942 PH_ENTER(php); 943 ASSERT(pdp->pd_next == NULL); 944 #ifdef DEBUG 945 { 946 /* 947 * the polldat should not be already on the list 948 */ 949 polldat_t *wp; 950 for (wp = php->ph_list; wp; wp = wp->pd_next) { 951 ASSERT(wp != pdp); 952 } 953 } 954 #endif /* DEBUG */ 955 pdp->pd_next = php->ph_list; 956 php->ph_list = pdp; 957 PH_EXIT(php); 958 } 959 960 /* 961 * Delete the polldat entry from ph_list. 962 */ 963 void 964 pollhead_delete(pollhead_t *php, polldat_t *pdp) 965 { 966 polldat_t *wp; 967 polldat_t **wpp; 968 969 PH_ENTER(php); 970 for (wpp = &php->ph_list; (wp = *wpp) != NULL; wpp = &wp->pd_next) { 971 if (wp == pdp) { 972 *wpp = pdp->pd_next; 973 pdp->pd_next = NULL; 974 break; 975 } 976 } 977 #ifdef DEBUG 978 /* assert that pdp is no longer in the list */ 979 for (wp = *wpp; wp; wp = wp->pd_next) { 980 ASSERT(wp != pdp); 981 } 982 #endif /* DEBUG */ 983 PH_EXIT(php); 984 } 985 986 /* 987 * walk through the poll fd lists to see if they are identical. This is an 988 * expensive operation and should not be done more than once for each poll() 989 * call. 990 * 991 * As an optimization (i.e., not having to go through the lists more than 992 * once), this routine also clear the revents field of pollfd in 'current'. 993 * Zeroing out the revents field of each entry in current poll list is 994 * required by poll man page. 995 * 996 * Since the events field of cached list has illegal poll events filtered 997 * out, the current list applies the same filtering before comparison. 998 * 999 * The routine stops when it detects a meaningful difference, or when it 1000 * exhausts the lists. 
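 *
 * For example (illustrative values only): with n == 4, if entries 0 and 1
 * match the cached list exactly and entry 2 polls a different fd, the
 * routine zeroes revents for entries 0 and 1 and returns 2; the caller
 * then resolves the two lists from index 2 onward.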
1001 */ 1002 int 1003 pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n) 1004 { 1005 int ix; 1006 1007 for (ix = 0; ix < n; ix++) { 1008 /* Prefetch 64 bytes worth of 8-byte elements */ 1009 if ((ix & 0x7) == 0) { 1010 prefetch_write_many((caddr_t)¤t[ix + 8]); 1011 prefetch_write_many((caddr_t)&cached[ix + 8]); 1012 } 1013 if (current[ix].fd == cached[ix].fd) { 1014 /* 1015 * Filter out invalid poll events while we are in 1016 * inside the loop. 1017 */ 1018 if (current[ix].events & ~VALID_POLL_EVENTS) { 1019 current[ix].events &= VALID_POLL_EVENTS; 1020 if (newlist != NULL) 1021 newlist[ix].events = current[ix].events; 1022 } 1023 if (current[ix].events == cached[ix].events) { 1024 current[ix].revents = 0; 1025 continue; 1026 } 1027 } 1028 if ((current[ix].fd < 0) && (cached[ix].fd < 0)) { 1029 current[ix].revents = 0; 1030 continue; 1031 } 1032 return (ix); 1033 } 1034 return (ix); 1035 } 1036 1037 /* 1038 * This routine returns a pointer to a cached poll fd entry, or NULL if it 1039 * does not find it in the hash table. 1040 */ 1041 polldat_t * 1042 pcache_lookup_fd(pollcache_t *pcp, int fd) 1043 { 1044 int hashindex; 1045 polldat_t *pdp; 1046 1047 hashindex = POLLHASH(pcp->pc_hashsize, fd); 1048 pdp = pcp->pc_hash[hashindex]; 1049 while (pdp != NULL) { 1050 if (pdp->pd_fd == fd) 1051 break; 1052 pdp = pdp->pd_hashnext; 1053 } 1054 return (pdp); 1055 } 1056 1057 polldat_t * 1058 pcache_alloc_fd(int nsets) 1059 { 1060 polldat_t *pdp; 1061 1062 pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP); 1063 if (nsets > 0) { 1064 pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP); 1065 pdp->pd_nsets = nsets; 1066 } 1067 return (pdp); 1068 } 1069 1070 /* 1071 * This routine inserts a polldat into the pollcache's hash table. It 1072 * may be necessary to grow the size of the hash table. 1073 */ 1074 void 1075 pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds) 1076 { 1077 int hashindex; 1078 int fd; 1079 1080 if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) || 1081 (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) { 1082 pcache_grow_hashtbl(pcp, nfds); 1083 } 1084 fd = pdp->pd_fd; 1085 hashindex = POLLHASH(pcp->pc_hashsize, fd); 1086 pdp->pd_hashnext = pcp->pc_hash[hashindex]; 1087 pcp->pc_hash[hashindex] = pdp; 1088 pcp->pc_fdcount++; 1089 1090 #ifdef DEBUG 1091 { 1092 /* 1093 * same fd should not appear on a hash list twice 1094 */ 1095 polldat_t *pdp1; 1096 for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) { 1097 ASSERT(pdp->pd_fd != pdp1->pd_fd); 1098 } 1099 } 1100 #endif /* DEBUG */ 1101 } 1102 1103 /* 1104 * Grow the hash table -- either double the table size or round it to the 1105 * nearest multiples of POLLHASHCHUNKSZ, whichever is bigger. Rehash all the 1106 * elements on the hash table. 
1107 */ 1108 void 1109 pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds) 1110 { 1111 int oldsize; 1112 polldat_t **oldtbl; 1113 polldat_t *pdp, *pdp1; 1114 int i; 1115 #ifdef DEBUG 1116 int count = 0; 1117 #endif 1118 1119 ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0); 1120 oldsize = pcp->pc_hashsize; 1121 oldtbl = pcp->pc_hash; 1122 if (nfds > pcp->pc_hashsize * POLLHASHINC) { 1123 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 1124 ~(POLLHASHCHUNKSZ - 1); 1125 } else { 1126 pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC; 1127 } 1128 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 1129 KM_SLEEP); 1130 /* 1131 * rehash existing elements 1132 */ 1133 pcp->pc_fdcount = 0; 1134 for (i = 0; i < oldsize; i++) { 1135 pdp = oldtbl[i]; 1136 while (pdp != NULL) { 1137 pdp1 = pdp->pd_hashnext; 1138 pcache_insert_fd(pcp, pdp, nfds); 1139 pdp = pdp1; 1140 #ifdef DEBUG 1141 count++; 1142 #endif 1143 } 1144 } 1145 kmem_free(oldtbl, oldsize * sizeof (polldat_t *)); 1146 ASSERT(pcp->pc_fdcount == count); 1147 } 1148 1149 void 1150 pcache_grow_map(pollcache_t *pcp, int fd) 1151 { 1152 int newsize; 1153 ulong_t *newmap; 1154 1155 /* 1156 * grow to nearest multiple of POLLMAPCHUNK, assuming POLLMAPCHUNK is 1157 * power of 2. 1158 */ 1159 newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1); 1160 newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t), 1161 KM_SLEEP); 1162 /* 1163 * don't want pollwakeup to set a bit while growing the bitmap. 1164 */ 1165 ASSERT(mutex_owned(&pcp->pc_lock) == 0); 1166 mutex_enter(&pcp->pc_lock); 1167 bcopy(pcp->pc_bitmap, newmap, 1168 (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t)); 1169 kmem_free(pcp->pc_bitmap, 1170 (pcp->pc_mapsize /BT_NBIPUL) * sizeof (ulong_t)); 1171 pcp->pc_bitmap = newmap; 1172 pcp->pc_mapsize = newsize; 1173 mutex_exit(&pcp->pc_lock); 1174 } 1175 1176 /* 1177 * remove all the reference from pollhead list and fpollinfo lists. 1178 */ 1179 void 1180 pcache_clean(pollcache_t *pcp) 1181 { 1182 int i; 1183 polldat_t **hashtbl; 1184 polldat_t *pdp; 1185 1186 ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock)); 1187 hashtbl = pcp->pc_hash; 1188 for (i = 0; i < pcp->pc_hashsize; i++) { 1189 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 1190 if (pdp->pd_php != NULL) { 1191 pollhead_delete(pdp->pd_php, pdp); 1192 pdp->pd_php = NULL; 1193 } 1194 if (pdp->pd_fp != NULL) { 1195 delfpollinfo(pdp->pd_fd); 1196 pdp->pd_fp = NULL; 1197 } 1198 } 1199 } 1200 } 1201 1202 void 1203 pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp) 1204 { 1205 int i; 1206 int fd = pdp->pd_fd; 1207 1208 /* 1209 * we come here because an earlier close() on this cached poll fd. 
1210 */ 1211 ASSERT(pdp->pd_fp == NULL); 1212 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1213 pdp->pd_events = 0; 1214 for (i = 0; i < ps->ps_nsets; i++) { 1215 xref_t *refp; 1216 pollcacheset_t *pcsp; 1217 1218 ASSERT(pdp->pd_ref != NULL); 1219 refp = &pdp->pd_ref[i]; 1220 if (refp->xf_refcnt) { 1221 ASSERT(refp->xf_position >= 0); 1222 pcsp = &ps->ps_pcacheset[i]; 1223 if (refp->xf_refcnt == 1) { 1224 pcsp->pcs_pollfd[refp->xf_position].fd = -1; 1225 refp->xf_refcnt = 0; 1226 pdp->pd_count--; 1227 } else if (refp->xf_refcnt > 1) { 1228 int j; 1229 1230 /* 1231 * turn off every appearance in pcs_pollfd list 1232 */ 1233 for (j = refp->xf_position; 1234 j < pcsp->pcs_nfds; j++) { 1235 if (pcsp->pcs_pollfd[j].fd == fd) { 1236 pcsp->pcs_pollfd[j].fd = -1; 1237 refp->xf_refcnt--; 1238 pdp->pd_count--; 1239 } 1240 } 1241 } 1242 ASSERT(refp->xf_refcnt == 0); 1243 refp->xf_position = POLLPOSINVAL; 1244 } 1245 } 1246 ASSERT(pdp->pd_count == 0); 1247 } 1248 1249 /* 1250 * Insert poll fd into the pollcache, and add poll registration. 1251 * This routine is called after getf() and before releasef(). So the vnode 1252 * can not disappear even if we block here. 1253 * If there is an error, the polled fd is not cached. 1254 */ 1255 int 1256 pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, 1257 ssize_t pos, int which) 1258 { 1259 pollcache_t *pcp = ps->ps_pcache; 1260 polldat_t *pdp; 1261 int error; 1262 int fd; 1263 pollhead_t *memphp = NULL; 1264 xref_t *refp; 1265 int newpollfd = 0; 1266 1267 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1268 /* 1269 * The poll caching uses the existing VOP_POLL interface. If there 1270 * is no polled events, we want the polled device to set its "some 1271 * one is sleeping in poll" flag. When the polled events happen 1272 * later, the driver will call pollwakeup(). We achieve this by 1273 * always passing 0 in the third parameter ("anyyet") when calling 1274 * VOP_POLL. This parameter is not looked at by drivers when the 1275 * polled events exist. If a driver chooses to ignore this parameter 1276 * and call pollwakeup whenever the polled events happen, that will 1277 * be OK too. 1278 */ 1279 ASSERT(curthread->t_pollcache == NULL); 1280 error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents, 1281 &memphp, NULL); 1282 if (error) { 1283 return (error); 1284 } 1285 if (pollfdp->revents) { 1286 (*fdcntp)++; 1287 } 1288 /* 1289 * polling the underlying device succeeded. Now we can cache it. 1290 * A close can't come in here because we have not done a releasef() 1291 * yet. 1292 */ 1293 fd = pollfdp->fd; 1294 pdp = pcache_lookup_fd(pcp, fd); 1295 if (pdp == NULL) { 1296 ASSERT(ps->ps_nsets > 0); 1297 pdp = pcache_alloc_fd(ps->ps_nsets); 1298 newpollfd = 1; 1299 } 1300 /* 1301 * If this entry was used to cache a poll fd which was closed, and 1302 * this entry has not been cleaned, do it now. 1303 */ 1304 if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) { 1305 pcacheset_invalidate(ps, pdp); 1306 ASSERT(pdp->pd_next == NULL); 1307 } 1308 if (pdp->pd_count == 0) { 1309 pdp->pd_fd = fd; 1310 pdp->pd_fp = fp; 1311 addfpollinfo(fd); 1312 pdp->pd_thread = curthread; 1313 pdp->pd_pcache = pcp; 1314 /* 1315 * the entry is never used or cleared by removing a cached 1316 * pollfd (pcache_delete_fd). So all the fields should be clear. 1317 */ 1318 ASSERT(pdp->pd_next == NULL); 1319 } 1320 1321 /* 1322 * A polled fd is considered cached. So there should be a fpollinfo 1323 * entry on uf_fpollinfo list. 
1324 */ 1325 ASSERT(infpollinfo(fd)); 1326 /* 1327 * If there is an inconsistency, we want to know it here. 1328 */ 1329 ASSERT(pdp->pd_fp == fp); 1330 1331 /* 1332 * XXX pd_events is a union of all polled events on this fd, possibly 1333 * by different threads. Unless this is a new first poll(), pd_events 1334 * never shrinks. If an event is no longer polled by a process, there 1335 * is no way to cancel that event. In that case, poll degrade to its 1336 * old form -- polling on this fd every time poll() is called. The 1337 * assumption is an app always polls the same type of events. 1338 */ 1339 pdp->pd_events |= pollfdp->events; 1340 1341 pdp->pd_count++; 1342 /* 1343 * There is not much special handling for multiple appearances of 1344 * same fd other than xf_position always recording the first 1345 * appearance in poll list. If this is called from pcacheset_cache_list, 1346 * a VOP_POLL is called on every pollfd entry; therefore each 1347 * revents and fdcnt should be set correctly. If this is called from 1348 * pcacheset_resolve, we don't care about fdcnt here. Pollreadmap will 1349 * pick up the right count and handle revents field of each pollfd 1350 * entry. 1351 */ 1352 ASSERT(pdp->pd_ref != NULL); 1353 refp = &pdp->pd_ref[which]; 1354 if (refp->xf_refcnt == 0) { 1355 refp->xf_position = pos; 1356 } else { 1357 /* 1358 * xf_position records the fd's first appearance in poll list 1359 */ 1360 if (pos < refp->xf_position) { 1361 refp->xf_position = pos; 1362 } 1363 } 1364 ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd); 1365 refp->xf_refcnt++; 1366 if (fd >= pcp->pc_mapsize) { 1367 pcache_grow_map(pcp, fd); 1368 } 1369 if (fd > pcp->pc_mapend) { 1370 pcp->pc_mapend = fd; 1371 } 1372 if (newpollfd != 0) { 1373 pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds); 1374 } 1375 if (memphp) { 1376 if (pdp->pd_php == NULL) { 1377 pollhead_insert(memphp, pdp); 1378 pdp->pd_php = memphp; 1379 } else { 1380 if (memphp != pdp->pd_php) { 1381 /* 1382 * layered devices (e.g. console driver) 1383 * may change the vnode and thus the pollhead 1384 * pointer out from underneath us. 1385 */ 1386 pollhead_delete(pdp->pd_php, pdp); 1387 pollhead_insert(memphp, pdp); 1388 pdp->pd_php = memphp; 1389 } 1390 } 1391 } 1392 /* 1393 * Since there is a considerable window between VOP_POLL and when 1394 * we actually put the polldat struct on the pollhead list, we could 1395 * miss a pollwakeup. In the case of polling additional events, we 1396 * don't update the events until after VOP_POLL. So we could miss 1397 * pollwakeup there too. So we always set the bit here just to be 1398 * safe. The real performance gain is in subsequent pcache_poll. 1399 */ 1400 mutex_enter(&pcp->pc_lock); 1401 BT_SET(pcp->pc_bitmap, fd); 1402 mutex_exit(&pcp->pc_lock); 1403 return (0); 1404 } 1405 1406 /* 1407 * The entry is not really deleted. The fields are cleared so that the 1408 * entry is no longer useful, but it will remain in the hash table for reuse 1409 * later. It will be freed when the polling lwp exits. 
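 *
 * A note on the return value (see pcacheset_resolve): returning 1 means the
 * entry removed was the fd's first appearance in this cached list and its
 * xf_position has been set to POLLPOSTRANS, so the caller must find the
 * fd's next appearance and refresh the cross reference via
 * pcache_update_xref().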
1410 */ 1411 int 1412 pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent) 1413 { 1414 pollcache_t *pcp = ps->ps_pcache; 1415 polldat_t *pdp; 1416 xref_t *refp; 1417 1418 ASSERT(fd < pcp->pc_mapsize); 1419 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1420 1421 pdp = pcache_lookup_fd(pcp, fd); 1422 ASSERT(pdp != NULL); 1423 ASSERT(pdp->pd_count > 0); 1424 ASSERT(pdp->pd_ref != NULL); 1425 refp = &pdp->pd_ref[which]; 1426 if (pdp->pd_count == 1) { 1427 pdp->pd_events = 0; 1428 refp->xf_position = POLLPOSINVAL; 1429 ASSERT(refp->xf_refcnt == 1); 1430 refp->xf_refcnt = 0; 1431 if (pdp->pd_php) { 1432 /* 1433 * It is possible for a wakeup thread to get ahead 1434 * of the following pollhead_delete and set the bit in 1435 * bitmap. It is OK because the bit will be cleared 1436 * here anyway. 1437 */ 1438 pollhead_delete(pdp->pd_php, pdp); 1439 pdp->pd_php = NULL; 1440 } 1441 pdp->pd_count = 0; 1442 if (pdp->pd_fp != NULL) { 1443 pdp->pd_fp = NULL; 1444 delfpollinfo(fd); 1445 } 1446 mutex_enter(&pcp->pc_lock); 1447 BT_CLEAR(pcp->pc_bitmap, fd); 1448 mutex_exit(&pcp->pc_lock); 1449 return (0); 1450 } 1451 if ((cevent & POLLCLOSED) == POLLCLOSED) { 1452 /* 1453 * fd cached here has been closed. This is the first 1454 * pcache_delete_fd called after the close. Clean up the 1455 * entire entry. 1456 */ 1457 pcacheset_invalidate(ps, pdp); 1458 ASSERT(pdp->pd_php == NULL); 1459 mutex_enter(&pcp->pc_lock); 1460 BT_CLEAR(pcp->pc_bitmap, fd); 1461 mutex_exit(&pcp->pc_lock); 1462 return (0); 1463 } 1464 #ifdef DEBUG 1465 if (getf(fd) != NULL) { 1466 ASSERT(infpollinfo(fd)); 1467 releasef(fd); 1468 } 1469 #endif /* DEBUG */ 1470 pdp->pd_count--; 1471 ASSERT(refp->xf_refcnt > 0); 1472 if (--refp->xf_refcnt == 0) { 1473 refp->xf_position = POLLPOSINVAL; 1474 } else { 1475 ASSERT(pos >= refp->xf_position); 1476 if (pos == refp->xf_position) { 1477 /* 1478 * The xref position is no longer valid. 1479 * Reset it to a special value and let 1480 * caller know it needs to updatexref() 1481 * with a new xf_position value. 1482 */ 1483 refp->xf_position = POLLPOSTRANS; 1484 return (1); 1485 } 1486 } 1487 return (0); 1488 } 1489 1490 void 1491 pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which) 1492 { 1493 polldat_t *pdp; 1494 1495 pdp = pcache_lookup_fd(pcp, fd); 1496 ASSERT(pdp != NULL); 1497 ASSERT(pdp->pd_ref != NULL); 1498 pdp->pd_ref[which].xf_position = pos; 1499 } 1500 1501 #ifdef DEBUG 1502 /* 1503 * For each polled fd, it's either in the bitmap or cached in 1504 * pcache hash table. If this routine returns 0, something is wrong. 1505 */ 1506 static int 1507 pollchecksanity(pollstate_t *ps, nfds_t nfds) 1508 { 1509 int i; 1510 int fd; 1511 pollcache_t *pcp = ps->ps_pcache; 1512 polldat_t *pdp; 1513 pollfd_t *pollfdp = ps->ps_pollfd; 1514 file_t *fp; 1515 1516 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1517 for (i = 0; i < nfds; i++) { 1518 fd = pollfdp[i].fd; 1519 if (fd < 0) { 1520 ASSERT(pollfdp[i].revents == 0); 1521 continue; 1522 } 1523 if (pollfdp[i].revents == POLLNVAL) 1524 continue; 1525 if ((fp = getf(fd)) == NULL) 1526 continue; 1527 pdp = pcache_lookup_fd(pcp, fd); 1528 ASSERT(pdp != NULL); 1529 ASSERT(infpollinfo(fd)); 1530 ASSERT(pdp->pd_fp == fp); 1531 releasef(fd); 1532 if (BT_TEST(pcp->pc_bitmap, fd)) 1533 continue; 1534 if (pdp->pd_php == NULL) 1535 return (0); 1536 } 1537 return (1); 1538 } 1539 #endif /* DEBUG */ 1540 1541 /* 1542 * resolve the difference between the current poll list and a cached one. 
1543 */ 1544 int 1545 pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which) 1546 { 1547 int i; 1548 pollcache_t *pcp = ps->ps_pcache; 1549 pollfd_t *newlist = NULL; 1550 pollfd_t *current = ps->ps_pollfd; 1551 pollfd_t *cached; 1552 pollcacheset_t *pcsp; 1553 int common; 1554 int count = 0; 1555 int offset; 1556 int remain; 1557 int fd; 1558 file_t *fp; 1559 int fdcnt = 0; 1560 int cnt = 0; 1561 nfds_t old_nfds; 1562 int error = 0; 1563 int mismatch = 0; 1564 1565 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1566 #ifdef DEBUG 1567 checkpolldat(ps); 1568 #endif 1569 pcsp = &ps->ps_pcacheset[which]; 1570 old_nfds = pcsp->pcs_nfds; 1571 common = (nfds > old_nfds) ? old_nfds : nfds; 1572 if (nfds != old_nfds) { 1573 /* 1574 * the length of poll list has changed. allocate a new 1575 * pollfd list. 1576 */ 1577 newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); 1578 bcopy(current, newlist, sizeof (pollfd_t) * nfds); 1579 } 1580 /* 1581 * Compare the overlapping part of the current fd list with the 1582 * cached one. Whenever a difference is found, resolve it. 1583 * The comparison is done on the current poll list and the 1584 * cached list. But we may be setting up the newlist to be the 1585 * cached list for next poll. 1586 */ 1587 cached = pcsp->pcs_pollfd; 1588 remain = common; 1589 1590 while (count < common) { 1591 int tmpfd; 1592 pollfd_t *np; 1593 1594 np = (newlist != NULL) ? &newlist[count] : NULL; 1595 offset = pcacheset_cmp(¤t[count], &cached[count], np, 1596 remain); 1597 /* 1598 * Collect stats. If lists are completed the first time, 1599 * it's a hit. Otherwise, it's a partial hit or miss. 1600 */ 1601 if ((count == 0) && (offset == common)) { 1602 pollstats.pollcachehit.value.ui64++; 1603 } else { 1604 mismatch++; 1605 } 1606 count += offset; 1607 if (offset < remain) { 1608 ASSERT(count < common); 1609 ASSERT((current[count].fd != cached[count].fd) || 1610 (current[count].events != cached[count].events)); 1611 /* 1612 * Filter out invalid events. 1613 */ 1614 if (current[count].events & ~VALID_POLL_EVENTS) { 1615 if (newlist != NULL) { 1616 newlist[count].events = 1617 current[count].events &= 1618 VALID_POLL_EVENTS; 1619 } else { 1620 current[count].events &= 1621 VALID_POLL_EVENTS; 1622 } 1623 } 1624 /* 1625 * when resolving a difference, we always remove the 1626 * fd from cache before inserting one into cache. 1627 */ 1628 if (cached[count].fd >= 0) { 1629 tmpfd = cached[count].fd; 1630 if (pcache_delete_fd(ps, tmpfd, count, which, 1631 (uint_t)cached[count].events)) { 1632 /* 1633 * This should be rare but needed for 1634 * correctness. 1635 * 1636 * The first appearance in cached list 1637 * is being "turned off". The same fd 1638 * appear more than once in the cached 1639 * poll list. Find the next one on the 1640 * list and update the cached 1641 * xf_position field. 1642 */ 1643 for (i = count + 1; i < old_nfds; i++) { 1644 if (cached[i].fd == tmpfd) { 1645 pcache_update_xref(pcp, 1646 tmpfd, (ssize_t)i, 1647 which); 1648 break; 1649 } 1650 } 1651 ASSERT(i <= old_nfds); 1652 } 1653 /* 1654 * In case a new cache list is allocated, 1655 * need to keep both cache lists in sync 1656 * b/c the new one can be freed if we have 1657 * an error later. 1658 */ 1659 cached[count].fd = -1; 1660 if (newlist != NULL) { 1661 newlist[count].fd = -1; 1662 } 1663 } 1664 if ((tmpfd = current[count].fd) >= 0) { 1665 /* 1666 * add to the cached fd tbl and bitmap. 
1667 */ 1668 if ((fp = getf(tmpfd)) == NULL) { 1669 current[count].revents = POLLNVAL; 1670 if (newlist != NULL) { 1671 newlist[count].fd = -1; 1672 } 1673 cached[count].fd = -1; 1674 fdcnt++; 1675 } else { 1676 /* 1677 * Here we don't care about the 1678 * fdcnt. We will examine the bitmap 1679 * later and pick up the correct 1680 * fdcnt there. So we never bother 1681 * to check value of 'cnt'. 1682 */ 1683 error = pcache_insert(ps, fp, 1684 ¤t[count], &cnt, 1685 (ssize_t)count, which); 1686 /* 1687 * if no error, we want to do releasef 1688 * after we updated cache poll list 1689 * entry so that close() won't race 1690 * us. 1691 */ 1692 if (error) { 1693 /* 1694 * If we encountered an error, 1695 * we have invalidated an 1696 * entry in cached poll list 1697 * (in pcache_delete_fd() above) 1698 * but failed to add one here. 1699 * This is OK b/c what's in the 1700 * cached list is consistent 1701 * with content of cache. 1702 * It will not have any ill 1703 * effect on next poll(). 1704 */ 1705 releasef(tmpfd); 1706 if (newlist != NULL) { 1707 kmem_free(newlist, 1708 nfds * 1709 sizeof (pollfd_t)); 1710 } 1711 return (error); 1712 } 1713 /* 1714 * If we have allocated a new(temp) 1715 * cache list, we need to keep both 1716 * in sync b/c the new one can be freed 1717 * if we have an error later. 1718 */ 1719 if (newlist != NULL) { 1720 newlist[count].fd = 1721 current[count].fd; 1722 newlist[count].events = 1723 current[count].events; 1724 } 1725 cached[count].fd = current[count].fd; 1726 cached[count].events = 1727 current[count].events; 1728 releasef(tmpfd); 1729 } 1730 } else { 1731 current[count].revents = 0; 1732 } 1733 count++; 1734 remain = common - count; 1735 } 1736 } 1737 if (mismatch != 0) { 1738 if (mismatch == common) { 1739 pollstats.pollcachemiss.value.ui64++; 1740 } else { 1741 pollstats.pollcachephit.value.ui64++; 1742 } 1743 } 1744 /* 1745 * take care of the non overlapping part of a list 1746 */ 1747 if (nfds > old_nfds) { 1748 ASSERT(newlist != NULL); 1749 for (i = old_nfds; i < nfds; i++) { 1750 /* filter out invalid events */ 1751 if (current[i].events & ~VALID_POLL_EVENTS) { 1752 newlist[i].events = current[i].events = 1753 current[i].events & VALID_POLL_EVENTS; 1754 } 1755 if ((fd = current[i].fd) < 0) { 1756 current[i].revents = 0; 1757 continue; 1758 } 1759 /* 1760 * add to the cached fd tbl and bitmap. 1761 */ 1762 if ((fp = getf(fd)) == NULL) { 1763 current[i].revents = POLLNVAL; 1764 newlist[i].fd = -1; 1765 fdcnt++; 1766 continue; 1767 } 1768 /* 1769 * Here we don't care about the 1770 * fdcnt. We will examine the bitmap 1771 * later and pick up the correct 1772 * fdcnt there. So we never bother to 1773 * check 'cnt'. 1774 */ 1775 error = pcache_insert(ps, fp, ¤t[i], &cnt, 1776 (ssize_t)i, which); 1777 releasef(fd); 1778 if (error) { 1779 /* 1780 * Here we are half way through adding newly 1781 * polled fd. Undo enough to keep the cache 1782 * list consistent with the cache content. 1783 */ 1784 pcacheset_remove_list(ps, current, old_nfds, 1785 i, which, 0); 1786 kmem_free(newlist, nfds * sizeof (pollfd_t)); 1787 return (error); 1788 } 1789 } 1790 } 1791 if (old_nfds > nfds) { 1792 /* 1793 * remove the fd's which are no longer polled. 1794 */ 1795 pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds, 1796 which, 1); 1797 } 1798 /* 1799 * set difference resolved. update nfds and cachedlist 1800 * in pollstate struct. 
1801 */ 1802 if (newlist != NULL) { 1803 kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t)); 1804 /* 1805 * By now, the pollfd.revents field should 1806 * all be zeroed. 1807 */ 1808 pcsp->pcs_pollfd = newlist; 1809 pcsp->pcs_nfds = nfds; 1810 } 1811 ASSERT(*fdcntp == 0); 1812 *fdcntp = fdcnt; 1813 /* 1814 * By now for every fd in pollfdp, one of the following should be 1815 * true. Otherwise we will miss a polled event. 1816 * 1817 * 1. the bit corresponding to the fd in bitmap is set. So VOP_POLL 1818 * will be called on this fd in next poll. 1819 * 2. the fd is cached in the pcache (i.e. pd_php is set). So 1820 * pollnotify will happen. 1821 */ 1822 ASSERT(pollchecksanity(ps, nfds)); 1823 /* 1824 * make sure cross reference between cached poll lists and cached 1825 * poll fds are correct. 1826 */ 1827 ASSERT(pollcheckxref(ps, which)); 1828 /* 1829 * ensure each polldat in pollcache reference a polled fd in 1830 * pollcacheset. 1831 */ 1832 #ifdef DEBUG 1833 checkpolldat(ps); 1834 #endif 1835 return (0); 1836 } 1837 1838 #ifdef DEBUG 1839 static int 1840 pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds) 1841 { 1842 int i; 1843 int reventcnt = 0; 1844 1845 for (i = 0; i < nfds; i++) { 1846 if (pollfdp[i].fd < 0) { 1847 ASSERT(pollfdp[i].revents == 0); 1848 continue; 1849 } 1850 if (pollfdp[i].revents) { 1851 reventcnt++; 1852 } 1853 if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) { 1854 ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd)); 1855 } 1856 } 1857 return (reventcnt); 1858 } 1859 #endif /* DEBUG */ 1860 1861 /* 1862 * read the bitmap and poll on fds corresponding to the '1' bits. The ps_lock 1863 * is held upon entry. 1864 */ 1865 int 1866 pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp, 1867 int which) 1868 { 1869 int i; 1870 pollcache_t *pcp; 1871 int fd; 1872 int begin, end, done; 1873 pollhead_t *php; 1874 int fdcnt; 1875 int error = 0; 1876 file_t *fp; 1877 polldat_t *pdp; 1878 xref_t *refp; 1879 int entry; 1880 1881 pcp = ps->ps_pcache; 1882 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1883 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 1884 retry: 1885 done = 0; 1886 begin = 0; 1887 fdcnt = 0; 1888 end = pcp->pc_mapend; 1889 while ((fdcnt < nfds) && !done) { 1890 php = NULL; 1891 /* 1892 * only poll fds which may have events 1893 */ 1894 fd = bt_getlowbit(pcp->pc_bitmap, begin, end); 1895 ASSERT(fd <= end); 1896 if (fd >= 0) { 1897 ASSERT(pollcheckrevents(ps, begin, fd, which)); 1898 /* 1899 * adjust map pointers for next round 1900 */ 1901 if (fd == end) { 1902 done = 1; 1903 } else { 1904 begin = fd + 1; 1905 } 1906 /* 1907 * A bitmap caches poll state information of 1908 * multiple poll lists. Call VOP_POLL only if 1909 * the bit corresponds to an fd in this poll 1910 * list. 1911 */ 1912 pdp = pcache_lookup_fd(pcp, fd); 1913 ASSERT(pdp != NULL); 1914 ASSERT(pdp->pd_ref != NULL); 1915 refp = &pdp->pd_ref[which]; 1916 if (refp->xf_refcnt == 0) 1917 continue; 1918 entry = refp->xf_position; 1919 ASSERT((entry >= 0) && (entry < nfds)); 1920 ASSERT(pollfdp[entry].fd == fd); 1921 /* 1922 * we are in this routine implies that we have 1923 * successfully polled this fd in the past. 1924 * Check to see this fd is closed while we are 1925 * blocked in poll. This ensures that we don't 1926 * miss a close on the fd in the case this fd is 1927 * reused. 
1928 */ 1929 if (pdp->pd_fp == NULL) { 1930 ASSERT(pdp->pd_count > 0); 1931 pollfdp[entry].revents = POLLNVAL; 1932 fdcnt++; 1933 if (refp->xf_refcnt > 1) { 1934 /* 1935 * this fd appeared multiple time 1936 * in the poll list. Find all of them. 1937 */ 1938 for (i = entry + 1; i < nfds; i++) { 1939 if (pollfdp[i].fd == fd) { 1940 pollfdp[i].revents = 1941 POLLNVAL; 1942 fdcnt++; 1943 } 1944 } 1945 } 1946 pcacheset_invalidate(ps, pdp); 1947 continue; 1948 } 1949 /* 1950 * We can be here polling a device that is being 1951 * closed (i.e. the file pointer is set to NULL, 1952 * but pollcacheclean has not happened yet). 1953 */ 1954 if ((fp = getf(fd)) == NULL) { 1955 pollfdp[entry].revents = POLLNVAL; 1956 fdcnt++; 1957 if (refp->xf_refcnt > 1) { 1958 /* 1959 * this fd appeared multiple time 1960 * in the poll list. Find all of them. 1961 */ 1962 for (i = entry + 1; i < nfds; i++) { 1963 if (pollfdp[i].fd == fd) { 1964 pollfdp[i].revents = 1965 POLLNVAL; 1966 fdcnt++; 1967 } 1968 } 1969 } 1970 continue; 1971 } 1972 ASSERT(pdp->pd_fp == fp); 1973 ASSERT(infpollinfo(fd)); 1974 /* 1975 * Since we no longer hold poll head lock across 1976 * VOP_POLL, pollunlock logic can be simplifed. 1977 */ 1978 ASSERT(pdp->pd_php == NULL || 1979 MUTEX_NOT_HELD(PHLOCK(pdp->pd_php))); 1980 /* 1981 * underlying file systems may set a "pollpending" 1982 * flag when it sees the poll may block. Pollwakeup() 1983 * is called by wakeup thread if pollpending is set. 1984 * Pass a 0 fdcnt so that the underlying file system 1985 * will set the "pollpending" flag set when there is 1986 * no polled events. 1987 * 1988 * Use pollfdp[].events for actual polling because 1989 * the pd_events is union of all cached poll events 1990 * on this fd. The events parameter also affects 1991 * how the polled device sets the "poll pending" 1992 * flag. 1993 */ 1994 ASSERT(curthread->t_pollcache == NULL); 1995 error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0, 1996 &pollfdp[entry].revents, &php, NULL); 1997 /* 1998 * releasef after completely done with this cached 1999 * poll entry. To prevent close() coming in to clear 2000 * this entry. 2001 */ 2002 if (error) { 2003 releasef(fd); 2004 break; 2005 } 2006 /* 2007 * layered devices (e.g. console driver) 2008 * may change the vnode and thus the pollhead 2009 * pointer out from underneath us. 2010 */ 2011 if (php != NULL && pdp->pd_php != NULL && 2012 php != pdp->pd_php) { 2013 releasef(fd); 2014 pollhead_delete(pdp->pd_php, pdp); 2015 pdp->pd_php = php; 2016 pollhead_insert(php, pdp); 2017 /* 2018 * We could have missed a wakeup on the new 2019 * target device. Make sure the new target 2020 * gets polled once. 2021 */ 2022 BT_SET(pcp->pc_bitmap, fd); 2023 goto retry; 2024 } 2025 2026 if (pollfdp[entry].revents) { 2027 ASSERT(refp->xf_refcnt >= 1); 2028 fdcnt++; 2029 if (refp->xf_refcnt > 1) { 2030 /* 2031 * this fd appeared multiple time 2032 * in the poll list. This is rare but 2033 * we have to look at all of them for 2034 * correctness. 2035 */ 2036 error = plist_chkdupfd(fp, pdp, ps, 2037 pollfdp, entry, &fdcnt); 2038 if (error > 0) { 2039 releasef(fd); 2040 break; 2041 } 2042 if (error < 0) { 2043 goto retry; 2044 } 2045 } 2046 releasef(fd); 2047 } else { 2048 /* 2049 * VOP_POLL didn't return any revents. We can 2050 * clear the bit in bitmap only if we have the 2051 * pollhead ptr cached and no other cached 2052 * entry is polling different events on this fd. 2053 * VOP_POLL may have dropped the ps_lock. Make 2054 * sure pollwakeup has not happened before clear 2055 * the bit. 
2056 */
2057 if ((pdp->pd_php != NULL) &&
2058 (pollfdp[entry].events == pdp->pd_events) &&
2059 ((pcp->pc_flag & PC_POLLWAKE) == 0)) {
2060 BT_CLEAR(pcp->pc_bitmap, fd);
2061 }
2062 /*
2063 * if the fd can be cached now but not before,
2064 * do it now.
2065 */
2066 if ((pdp->pd_php == NULL) && (php != NULL)) {
2067 pdp->pd_php = php;
2068 pollhead_insert(php, pdp);
2069 /*
2070 * We are inserting a polldat struct for
2071 * the first time. We may have missed a
2072 * wakeup on this device. Re-poll once.
2073 * This should be a rare event.
2074 */
2075 releasef(fd);
2076 goto retry;
2077 }
2078 if (refp->xf_refcnt > 1) {
2079 /*
2080 * this fd appeared multiple times
2081 * in the poll list. This is rare but
2082 * we have to look at all of them for
2083 * correctness.
2084 */
2085 error = plist_chkdupfd(fp, pdp, ps,
2086 pollfdp, entry, &fdcnt);
2087 if (error > 0) {
2088 releasef(fd);
2089 break;
2090 }
2091 if (error < 0) {
2092 goto retry;
2093 }
2094 }
2095 releasef(fd);
2096 }
2097 } else {
2098 done = 1;
2099 ASSERT(pollcheckrevents(ps, begin, end + 1, which));
2100 }
2101 }
2102 if (!error) {
2103 ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds));
2104 *fdcntp += fdcnt;
2105 }
2106 return (error);
2107 }
2108 
2109 /*
2110 * Going through the poll list without much locking. Poll all fds and
2111 * cache all valid fds in the pollcache.
2112 */
2113 int
2114 pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which)
2115 {
2116 pollfd_t *pollfdp = ps->ps_pollfd;
2117 pollcacheset_t *pcacheset = ps->ps_pcacheset;
2118 pollfd_t *newfdlist;
2119 int i;
2120 int fd;
2121 file_t *fp;
2122 int error = 0;
2123 
2124 ASSERT(MUTEX_HELD(&ps->ps_lock));
2125 ASSERT(which < ps->ps_nsets);
2126 ASSERT(pcacheset != NULL);
2127 ASSERT(pcacheset[which].pcs_pollfd == NULL);
2128 newfdlist = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP);
2129 /*
2130 * cache the new poll list in the pollcacheset.
2131 */
2132 bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds);
2133 
2134 pcacheset[which].pcs_pollfd = newfdlist;
2135 pcacheset[which].pcs_nfds = ps->ps_nfds;
2136 pcacheset[which].pcs_usradr = (uintptr_t)fds;
2137 
2138 /*
2139 * We have saved a copy of current poll fd list in one pollcacheset.
2140 * The 'revents' field of the new list is not yet set to 0. Looping
2141 * through the new list just to do that is expensive; we do it
2142 * while polling the list.
2143 */
2144 for (i = 0; i < ps->ps_nfds; i++) {
2145 fd = pollfdp[i].fd;
2146 /*
2147 * We also filter out the illegal poll events in the event
2148 * field for the cached poll list/set.
2149 */
2150 if (pollfdp[i].events & ~VALID_POLL_EVENTS) {
2151 newfdlist[i].events = pollfdp[i].events =
2152 pollfdp[i].events & VALID_POLL_EVENTS;
2153 }
2154 if (fd < 0) {
2155 pollfdp[i].revents = 0;
2156 continue;
2157 }
2158 if ((fp = getf(fd)) == NULL) {
2159 pollfdp[i].revents = POLLNVAL;
2160 /*
2161 * invalidate this cache entry in the cached poll list
2162 */
2163 newfdlist[i].fd = -1;
2164 (*fdcntp)++;
2165 continue;
2166 }
2167 /*
2168 * cache this fd.
2169 */
2170 error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i,
2171 which);
2172 releasef(fd);
2173 if (error) {
2174 /*
2175 * Here we are halfway through caching a new
2176 * poll list. Undo everything.
2177 */ 2178 pcacheset_remove_list(ps, pollfdp, 0, i, which, 0); 2179 kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t)); 2180 pcacheset[which].pcs_pollfd = NULL; 2181 pcacheset[which].pcs_usradr = NULL; 2182 break; 2183 } 2184 } 2185 return (error); 2186 } 2187 2188 /* 2189 * called by pollcacheclean() to set the fp NULL. It also sets polled events 2190 * in pcacheset entries to a special events 'POLLCLOSED'. Do a pollwakeup to 2191 * wake any sleeping poller, then remove the polldat from the driver. 2192 * The routine is called with ps_pcachelock held. 2193 */ 2194 void 2195 pcache_clean_entry(pollstate_t *ps, int fd) 2196 { 2197 pollcache_t *pcp; 2198 polldat_t *pdp; 2199 int i; 2200 2201 ASSERT(ps != NULL); 2202 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2203 pcp = ps->ps_pcache; 2204 ASSERT(pcp); 2205 pdp = pcache_lookup_fd(pcp, fd); 2206 ASSERT(pdp != NULL); 2207 /* 2208 * the corresponding fpollinfo in fi_list has been removed by 2209 * a close on this fd. Reset the cached fp ptr here. 2210 */ 2211 pdp->pd_fp = NULL; 2212 /* 2213 * XXX - This routine also touches data in pcacheset struct. 2214 * 2215 * set the event in cached poll lists to POLLCLOSED. This invalidate 2216 * the cached poll fd entry in that poll list, which will force a 2217 * removal of this cached entry in next poll(). The cleanup is done 2218 * at the removal time. 2219 */ 2220 ASSERT(pdp->pd_ref != NULL); 2221 for (i = 0; i < ps->ps_nsets; i++) { 2222 xref_t *refp; 2223 pollcacheset_t *pcsp; 2224 2225 refp = &pdp->pd_ref[i]; 2226 if (refp->xf_refcnt) { 2227 ASSERT(refp->xf_position >= 0); 2228 pcsp = &ps->ps_pcacheset[i]; 2229 if (refp->xf_refcnt == 1) { 2230 pcsp->pcs_pollfd[refp->xf_position].events = 2231 (short)POLLCLOSED; 2232 } 2233 if (refp->xf_refcnt > 1) { 2234 int j; 2235 /* 2236 * mark every matching entry in pcs_pollfd 2237 */ 2238 for (j = refp->xf_position; 2239 j < pcsp->pcs_nfds; j++) { 2240 if (pcsp->pcs_pollfd[j].fd == fd) { 2241 pcsp->pcs_pollfd[j].events = 2242 (short)POLLCLOSED; 2243 } 2244 } 2245 } 2246 } 2247 } 2248 if (pdp->pd_php) { 2249 pollwakeup(pdp->pd_php, POLLHUP); 2250 pollhead_delete(pdp->pd_php, pdp); 2251 pdp->pd_php = NULL; 2252 } 2253 } 2254 2255 void 2256 pcache_wake_parents(pollcache_t *pcp) 2257 { 2258 pcachelink_t *pl, *pln; 2259 2260 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 2261 2262 for (pl = pcp->pc_parents; pl != NULL; pl = pln) { 2263 mutex_enter(&pl->pcl_lock); 2264 if (pl->pcl_state == PCL_VALID) { 2265 ASSERT(pl->pcl_parent_pc != NULL); 2266 cv_broadcast(&pl->pcl_parent_pc->pc_cv); 2267 } 2268 pln = pl->pcl_parent_next; 2269 mutex_exit(&pl->pcl_lock); 2270 } 2271 } 2272 2273 /* 2274 * Initialize thread pollstate structure. 2275 * It will persist for the life of the thread, until it calls pollcleanup(). 2276 */ 2277 pollstate_t * 2278 pollstate_create() 2279 { 2280 pollstate_t *ps = curthread->t_pollstate; 2281 2282 if (ps == NULL) { 2283 /* 2284 * This is the first time this thread has ever polled, so we 2285 * have to create its pollstate structure. 
2286 */ 2287 ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP); 2288 ps->ps_nsets = POLLFDSETS; 2289 ps->ps_pcacheset = pcacheset_create(ps->ps_nsets); 2290 curthread->t_pollstate = ps; 2291 } else { 2292 ASSERT(ps->ps_depth == 0); 2293 ASSERT(ps->ps_flags == 0); 2294 ASSERT(ps->ps_pc_stack[0] == 0); 2295 } 2296 return (ps); 2297 } 2298 2299 void 2300 pollstate_destroy(pollstate_t *ps) 2301 { 2302 if (ps->ps_pollfd != NULL) { 2303 kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t)); 2304 ps->ps_pollfd = NULL; 2305 } 2306 if (ps->ps_pcache != NULL) { 2307 pcache_destroy(ps->ps_pcache); 2308 ps->ps_pcache = NULL; 2309 } 2310 pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets); 2311 ps->ps_pcacheset = NULL; 2312 if (ps->ps_dpbuf != NULL) { 2313 kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); 2314 ps->ps_dpbuf = NULL; 2315 } 2316 mutex_destroy(&ps->ps_lock); 2317 kmem_free(ps, sizeof (pollstate_t)); 2318 } 2319 2320 static int 2321 pollstate_contend(pollstate_t *ps, pollcache_t *pcp) 2322 { 2323 pollstate_t *rem, *next; 2324 pollcache_t *desired_pc; 2325 int result = 0, depth_total; 2326 2327 mutex_enter(&pollstate_contenders_lock); 2328 /* 2329 * There is a small chance that the pollcache of interest became 2330 * available while we were waiting on the contenders lock. 2331 */ 2332 if (mutex_tryenter(&pcp->pc_lock) != 0) { 2333 goto out; 2334 } 2335 2336 /* 2337 * Walk the list of contended pollstates, searching for evidence of a 2338 * deadlock condition. 2339 */ 2340 depth_total = ps->ps_depth; 2341 desired_pc = pcp; 2342 for (rem = pollstate_contenders; rem != NULL; rem = next) { 2343 int i, j; 2344 next = rem->ps_contend_nextp; 2345 2346 /* Is this pollstate holding the pollcache of interest? */ 2347 for (i = 0; i < rem->ps_depth; i++) { 2348 if (rem->ps_pc_stack[i] != desired_pc) { 2349 continue; 2350 } 2351 2352 /* 2353 * The remote pollstate holds the pollcache lock we 2354 * desire. If it is waiting on a pollcache we hold, 2355 * then we can report the obvious deadlock. 2356 */ 2357 ASSERT(rem->ps_contend_pc != NULL); 2358 for (j = 0; j < ps->ps_depth; j++) { 2359 if (rem->ps_contend_pc == ps->ps_pc_stack[j]) { 2360 rem->ps_flags |= POLLSTATE_STALEMATE; 2361 result = -1; 2362 goto out; 2363 } 2364 } 2365 2366 /* 2367 * The remote pollstate is not blocking on a pollcache 2368 * which would deadlock against us. That pollcache 2369 * may, however, be held by a pollstate which would 2370 * result in a deadlock. 2371 * 2372 * To detect such a condition, we continue walking 2373 * through the list using the pollcache blocking the 2374 * remote thread as our new search target. 2375 * 2376 * Return to the front of pollstate_contenders since it 2377 * is not ordered to guarantee complete dependency 2378 * traversal. The below depth tracking places an upper 2379 * bound on iterations. 2380 */ 2381 desired_pc = rem->ps_contend_pc; 2382 next = pollstate_contenders; 2383 2384 /* 2385 * The recursion depth of the remote pollstate is used 2386 * to calculate a final depth for the local /dev/poll 2387 * recursion, since those locks will be acquired 2388 * eventually. If that value exceeds the defined 2389 * limit, we can report the failure now instead of 2390 * recursing to that failure depth. 2391 */ 2392 depth_total += (rem->ps_depth - i); 2393 if (depth_total >= POLLMAXDEPTH) { 2394 result = -1; 2395 goto out; 2396 } 2397 } 2398 } 2399 2400 /* 2401 * No deadlock partner was found. 
The only course of action is to 2402 * record ourself as a contended pollstate and wait for the pollcache 2403 * mutex to become available. 2404 */ 2405 ps->ps_contend_pc = pcp; 2406 ps->ps_contend_nextp = pollstate_contenders; 2407 ps->ps_contend_pnextp = &pollstate_contenders; 2408 if (pollstate_contenders != NULL) { 2409 pollstate_contenders->ps_contend_pnextp = 2410 &ps->ps_contend_nextp; 2411 } 2412 pollstate_contenders = ps; 2413 2414 mutex_exit(&pollstate_contenders_lock); 2415 mutex_enter(&pcp->pc_lock); 2416 mutex_enter(&pollstate_contenders_lock); 2417 2418 /* 2419 * Our acquisition of the pollcache mutex may be due to another thread 2420 * giving up in the face of deadlock with us. If that is the case, 2421 * we too should report the failure. 2422 */ 2423 if ((ps->ps_flags & POLLSTATE_STALEMATE) != 0) { 2424 result = -1; 2425 ps->ps_flags &= ~POLLSTATE_STALEMATE; 2426 mutex_exit(&pcp->pc_lock); 2427 } 2428 2429 /* Remove ourself from the contenders list. */ 2430 if (ps->ps_contend_nextp != NULL) { 2431 ps->ps_contend_nextp->ps_contend_pnextp = 2432 ps->ps_contend_pnextp; 2433 } 2434 *ps->ps_contend_pnextp = ps->ps_contend_nextp; 2435 ps->ps_contend_pc = NULL; 2436 ps->ps_contend_nextp = NULL; 2437 ps->ps_contend_pnextp = NULL; 2438 2439 out: 2440 mutex_exit(&pollstate_contenders_lock); 2441 return (result); 2442 } 2443 2444 int 2445 pollstate_enter(pollcache_t *pcp) 2446 { 2447 pollstate_t *ps = curthread->t_pollstate; 2448 int i; 2449 2450 if (ps == NULL) { 2451 /* 2452 * The thread pollstate may not be initialized if VOP_POLL is 2453 * called on a recursion-enabled /dev/poll handle from outside 2454 * the poll() or /dev/poll codepaths. 2455 */ 2456 return (PSE_FAIL_POLLSTATE); 2457 } 2458 if (ps->ps_depth >= POLLMAXDEPTH) { 2459 return (PSE_FAIL_DEPTH); 2460 } 2461 /* 2462 * Check the desired pollcache against pollcaches we already have 2463 * locked. Such a loop is the most simple deadlock scenario. 2464 */ 2465 for (i = 0; i < ps->ps_depth; i++) { 2466 if (ps->ps_pc_stack[i] == pcp) { 2467 return (PSE_FAIL_LOOP); 2468 } 2469 } 2470 ASSERT(ps->ps_pc_stack[i] == NULL); 2471 2472 if (ps->ps_depth == 0) { 2473 /* Locking initial the pollcache requires no caution */ 2474 mutex_enter(&pcp->pc_lock); 2475 } else if (mutex_tryenter(&pcp->pc_lock) == 0) { 2476 if (pollstate_contend(ps, pcp) != 0) { 2477 /* This pollcache cannot safely be locked. */ 2478 return (PSE_FAIL_DEADLOCK); 2479 } 2480 } 2481 2482 ps->ps_pc_stack[ps->ps_depth++] = pcp; 2483 return (PSE_SUCCESS); 2484 } 2485 2486 void 2487 pollstate_exit(pollcache_t *pcp) 2488 { 2489 pollstate_t *ps = curthread->t_pollstate; 2490 2491 VERIFY(ps != NULL); 2492 VERIFY(ps->ps_pc_stack[ps->ps_depth - 1] == pcp); 2493 2494 mutex_exit(&pcp->pc_lock); 2495 ps->ps_pc_stack[--ps->ps_depth] = NULL; 2496 VERIFY(ps->ps_depth >= 0); 2497 } 2498 2499 2500 /* 2501 * We are holding the appropriate uf_lock entering this routine. 2502 * Bump up the ps_busy count to prevent the thread from exiting. 2503 */ 2504 void 2505 pollblockexit(fpollinfo_t *fpip) 2506 { 2507 for (; fpip; fpip = fpip->fp_next) { 2508 pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache; 2509 2510 mutex_enter(&pcp->pc_no_exit); 2511 pcp->pc_busy++; /* prevents exit()'s */ 2512 mutex_exit(&pcp->pc_no_exit); 2513 } 2514 } 2515 2516 /* 2517 * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to mark 2518 * the pcacheset events field POLLCLOSED to force the next poll() to remove 2519 * this cache entry. 
We can't clean up the polldat entry here because an
2520 * lwp blocked in poll() needs the info to return. Wake up anyone blocked
2521 * in poll and let the exiting lwp go. No lock is held upon entry, so it's
2522 * OK for pcache_clean_entry to call pollwakeup().
2523 */
2524 void
2525 pollcacheclean(fpollinfo_t *fip, int fd)
2526 {
2527 struct fpollinfo *fpip, *fpip2;
2528 
2529 fpip = fip;
2530 while (fpip) {
2531 pollstate_t *ps = fpip->fp_thread->t_pollstate;
2532 pollcache_t *pcp = ps->ps_pcache;
2533 
2534 mutex_enter(&ps->ps_lock);
2535 pcache_clean_entry(ps, fd);
2536 mutex_exit(&ps->ps_lock);
2537 mutex_enter(&pcp->pc_no_exit);
2538 pcp->pc_busy--;
2539 if (pcp->pc_busy == 0) {
2540 /*
2541 * Wakeup the thread waiting in
2542 * thread_exit().
2543 */
2544 cv_signal(&pcp->pc_busy_cv);
2545 }
2546 mutex_exit(&pcp->pc_no_exit);
2547 
2548 fpip2 = fpip;
2549 fpip = fpip->fp_next;
2550 kmem_free(fpip2, sizeof (fpollinfo_t));
2551 }
2552 }
2553 
2554 /*
2555 * One of the cache line counters is wrapping around. Reset all cache line
2556 * counters to zero except one. This is simplistic, but probably works
2557 * effectively.
2558 */
2559 void
2560 pcacheset_reset_count(pollstate_t *ps, int index)
2561 {
2562 int i;
2563 
2564 ASSERT(MUTEX_HELD(&ps->ps_lock));
2565 for (i = 0; i < ps->ps_nsets; i++) {
2566 if (ps->ps_pcacheset[i].pcs_pollfd != NULL) {
2567 ps->ps_pcacheset[i].pcs_count = 0;
2568 }
2569 }
2570 ps->ps_pcacheset[index].pcs_count = 1;
2571 }
2572 
2573 /*
2574 * This routine implements the poll cache list replacement policy.
2575 * It currently chooses the "least used" entry.
2576 */
2577 int
2578 pcacheset_replace(pollstate_t *ps)
2579 {
2580 int i;
2581 int index = 0;
2582 
2583 ASSERT(MUTEX_HELD(&ps->ps_lock));
2584 for (i = 1; i < ps->ps_nsets; i++) {
2585 if (ps->ps_pcacheset[index].pcs_count >
2586 ps->ps_pcacheset[i].pcs_count) {
2587 index = i;
2588 }
2589 }
2590 ps->ps_pcacheset[index].pcs_count = 0;
2591 return (index);
2592 }
2593 
2594 /*
2595 * This routine is called by strclose to remove the polldat structs remaining
2596 * on the pollhead list of the device being closed. There are two reasons why
2597 * polldat structures may still remain on the pollhead list:
2598 *
2599 * (1) The layered device (e.g. the console driver).
2600 * In this case, the existence of a polldat implies that the thread putting
2601 * the polldat on this list has not exited yet. Before the thread exits, it
2602 * will have to hold this pollhead lock to remove the polldat. So holding the
2603 * pollhead lock here effectively prevents the thread which put the polldat
2604 * on this list from exiting.
2605 *
2606 * (2) /dev/poll.
2607 * When a polled fd is cached in /dev/poll, its polldat will remain on the
2608 * pollhead list if the process has not done a POLLREMOVE before closing the
2609 * polled fd. We just unlink it here.
2610 */
2611 void
2612 pollhead_clean(pollhead_t *php)
2613 {
2614 polldat_t *pdp;
2615 
2616 /*
2617 * In case (1), while we must prevent the thread in question from
2618 * exiting, we must also obey the proper locking order, i.e.
2619 * (ps_lock -> phlock).
2620 */
2621 PH_ENTER(php);
2622 while (php->ph_list != NULL) {
2623 pollstate_t *ps;
2624 pollcache_t *pcp;
2625 
2626 pdp = php->ph_list;
2627 ASSERT(pdp->pd_php == php);
2628 if (pdp->pd_thread == NULL) {
2629 /*
2630 * This is case (2). Since the ph_lock is sufficient
2631 * to synchronize this lwp with any other /dev/poll
2632 * lwp, just unlink the polldat.
2633 */ 2634 php->ph_list = pdp->pd_next; 2635 pdp->pd_php = NULL; 2636 pdp->pd_next = NULL; 2637 continue; 2638 } 2639 ps = pdp->pd_thread->t_pollstate; 2640 ASSERT(ps != NULL); 2641 pcp = pdp->pd_pcache; 2642 ASSERT(pcp != NULL); 2643 mutex_enter(&pcp->pc_no_exit); 2644 pcp->pc_busy++; /* prevents exit()'s */ 2645 mutex_exit(&pcp->pc_no_exit); 2646 /* 2647 * Now get the locks in proper order to avoid deadlock. 2648 */ 2649 PH_EXIT(php); 2650 mutex_enter(&ps->ps_lock); 2651 /* 2652 * while we dropped the pollhead lock, the element could be 2653 * taken off the list already. 2654 */ 2655 PH_ENTER(php); 2656 if (pdp->pd_php == php) { 2657 ASSERT(pdp == php->ph_list); 2658 php->ph_list = pdp->pd_next; 2659 pdp->pd_php = NULL; 2660 pdp->pd_next = NULL; 2661 } 2662 PH_EXIT(php); 2663 mutex_exit(&ps->ps_lock); 2664 mutex_enter(&pcp->pc_no_exit); 2665 pcp->pc_busy--; 2666 if (pcp->pc_busy == 0) { 2667 /* 2668 * Wakeup the thread waiting in 2669 * thread_exit(). 2670 */ 2671 cv_signal(&pcp->pc_busy_cv); 2672 } 2673 mutex_exit(&pcp->pc_no_exit); 2674 PH_ENTER(php); 2675 } 2676 PH_EXIT(php); 2677 } 2678 2679 /* 2680 * The remove_list is called to cleanup a partially cached 'current' list or 2681 * to remove a partial list which is no longer cached. The flag value of 1 2682 * indicates the second case. 2683 */ 2684 void 2685 pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end, 2686 int cacheindex, int flag) 2687 { 2688 int i; 2689 2690 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2691 for (i = start; i < end; i++) { 2692 if ((pollfdp[i].fd >= 0) && 2693 (flag || !(pollfdp[i].revents & POLLNVAL))) { 2694 if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex, 2695 (uint_t)pollfdp[i].events)) { 2696 int j; 2697 int fd = pollfdp[i].fd; 2698 2699 for (j = i + 1; j < end; j++) { 2700 if (pollfdp[j].fd == fd) { 2701 pcache_update_xref( 2702 ps->ps_pcache, fd, 2703 (ssize_t)j, cacheindex); 2704 break; 2705 } 2706 } 2707 ASSERT(j <= end); 2708 } 2709 } 2710 } 2711 } 2712 2713 #ifdef DEBUG 2714 2715 #include<sys/strsubr.h> 2716 /* 2717 * make sure curthread is not on anyone's pollhead list any more. 2718 */ 2719 static void 2720 pollcheckphlist() 2721 { 2722 int i; 2723 file_t *fp; 2724 uf_entry_t *ufp; 2725 uf_info_t *fip = P_FINFO(curproc); 2726 struct stdata *stp; 2727 polldat_t *pdp; 2728 2729 mutex_enter(&fip->fi_lock); 2730 for (i = 0; i < fip->fi_nfiles; i++) { 2731 UF_ENTER(ufp, fip, i); 2732 if ((fp = ufp->uf_file) != NULL) { 2733 if ((stp = fp->f_vnode->v_stream) != NULL) { 2734 PH_ENTER(&stp->sd_pollist); 2735 pdp = stp->sd_pollist.ph_list; 2736 while (pdp) { 2737 ASSERT(pdp->pd_thread != curthread); 2738 pdp = pdp->pd_next; 2739 } 2740 PH_EXIT(&stp->sd_pollist); 2741 } 2742 } 2743 UF_EXIT(ufp); 2744 } 2745 mutex_exit(&fip->fi_lock); 2746 } 2747 2748 /* 2749 * for resolved set poll list, the xref info in the pcache should be 2750 * consistent with this poll list. 
2751 */ 2752 static int 2753 pollcheckxref(pollstate_t *ps, int cacheindex) 2754 { 2755 pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd; 2756 pollcache_t *pcp = ps->ps_pcache; 2757 polldat_t *pdp; 2758 int i; 2759 xref_t *refp; 2760 2761 for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) { 2762 if (pollfdp[i].fd < 0) { 2763 continue; 2764 } 2765 pdp = pcache_lookup_fd(pcp, pollfdp[i].fd); 2766 ASSERT(pdp != NULL); 2767 ASSERT(pdp->pd_ref != NULL); 2768 refp = &pdp->pd_ref[cacheindex]; 2769 if (refp->xf_position >= 0) { 2770 ASSERT(refp->xf_refcnt >= 1); 2771 ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd); 2772 if (refp->xf_refcnt > 1) { 2773 int j; 2774 int count = 0; 2775 2776 for (j = refp->xf_position; 2777 j < ps->ps_pcacheset[cacheindex].pcs_nfds; 2778 j++) { 2779 if (pollfdp[j].fd == pdp->pd_fd) { 2780 count++; 2781 } 2782 } 2783 ASSERT(count == refp->xf_refcnt); 2784 } 2785 } 2786 } 2787 return (1); 2788 } 2789 2790 /* 2791 * For every cached pollfd, its polldat struct should be consistent with 2792 * what is in the pcacheset lists. 2793 */ 2794 static void 2795 checkpolldat(pollstate_t *ps) 2796 { 2797 pollcache_t *pcp = ps->ps_pcache; 2798 polldat_t **hashtbl; 2799 int i; 2800 2801 hashtbl = pcp->pc_hash; 2802 for (i = 0; i < pcp->pc_hashsize; i++) { 2803 polldat_t *pdp; 2804 2805 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 2806 ASSERT(pdp->pd_ref != NULL); 2807 if (pdp->pd_count > 0) { 2808 xref_t *refp; 2809 int j; 2810 pollcacheset_t *pcsp; 2811 pollfd_t *pollfd; 2812 2813 for (j = 0; j < ps->ps_nsets; j++) { 2814 refp = &pdp->pd_ref[j]; 2815 if (refp->xf_refcnt > 0) { 2816 pcsp = &ps->ps_pcacheset[j]; 2817 ASSERT(refp->xf_position < pcsp->pcs_nfds); 2818 pollfd = pcsp->pcs_pollfd; 2819 ASSERT(pdp->pd_fd == pollfd[refp->xf_position].fd); 2820 } 2821 } 2822 } 2823 } 2824 } 2825 } 2826 2827 /* 2828 * every wfd element on ph_list must have a corresponding fpollinfo on the 2829 * uf_fpollinfo list. This is a variation of infpollinfo() w/o holding locks. 2830 */ 2831 void 2832 checkwfdlist(vnode_t *vp, fpollinfo_t *fpip) 2833 { 2834 stdata_t *stp; 2835 polldat_t *pdp; 2836 fpollinfo_t *fpip2; 2837 2838 if ((stp = vp->v_stream) == NULL) { 2839 return; 2840 } 2841 PH_ENTER(&stp->sd_pollist); 2842 for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) { 2843 if (pdp->pd_thread != NULL && 2844 pdp->pd_thread->t_procp == curthread->t_procp) { 2845 for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) { 2846 if (pdp->pd_thread == fpip2->fp_thread) { 2847 break; 2848 } 2849 } 2850 ASSERT(fpip2 != NULL); 2851 } 2852 } 2853 PH_EXIT(&stp->sd_pollist); 2854 } 2855 2856 /* 2857 * For each cached fd whose bit is not set in bitmap, its revents field in 2858 * current poll list should be 0. 
2859 */ 2860 static int 2861 pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex) 2862 { 2863 pollcache_t *pcp = ps->ps_pcache; 2864 pollfd_t *pollfdp = ps->ps_pollfd; 2865 int i; 2866 2867 for (i = begin; i < end; i++) { 2868 polldat_t *pdp; 2869 2870 ASSERT(!BT_TEST(pcp->pc_bitmap, i)); 2871 pdp = pcache_lookup_fd(pcp, i); 2872 if (pdp && pdp->pd_fp != NULL) { 2873 xref_t *refp; 2874 int entry; 2875 2876 ASSERT(pdp->pd_ref != NULL); 2877 refp = &pdp->pd_ref[cacheindex]; 2878 if (refp->xf_refcnt == 0) { 2879 continue; 2880 } 2881 entry = refp->xf_position; 2882 ASSERT(entry >= 0); 2883 ASSERT(pollfdp[entry].revents == 0); 2884 if (refp->xf_refcnt > 1) { 2885 int j; 2886 2887 for (j = entry + 1; j < ps->ps_nfds; j++) { 2888 if (pollfdp[j].fd == i) { 2889 ASSERT(pollfdp[j].revents == 0); 2890 } 2891 } 2892 } 2893 } 2894 } 2895 return (1); 2896 } 2897 2898 #endif /* DEBUG */ 2899 2900 pollcache_t * 2901 pcache_alloc() 2902 { 2903 return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP)); 2904 } 2905 2906 void 2907 pcache_create(pollcache_t *pcp, nfds_t nfds) 2908 { 2909 size_t mapsize; 2910 2911 /* 2912 * allocate enough bits for the poll fd list 2913 */ 2914 if ((mapsize = POLLMAPCHUNK) <= nfds) { 2915 mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1); 2916 } 2917 pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t), 2918 KM_SLEEP); 2919 pcp->pc_mapsize = mapsize; 2920 /* 2921 * The hash size is at least POLLHASHCHUNKSZ. If user polls a large 2922 * number of fd to start with, allocate a bigger hash table (to the 2923 * nearest multiple of POLLHASHCHUNKSZ) because dynamically growing a 2924 * hash table is expensive. 2925 */ 2926 if (nfds < POLLHASHCHUNKSZ) { 2927 pcp->pc_hashsize = POLLHASHCHUNKSZ; 2928 } else { 2929 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 2930 ~(POLLHASHCHUNKSZ - 1); 2931 } 2932 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 2933 KM_SLEEP); 2934 } 2935 2936 void 2937 pcache_destroy(pollcache_t *pcp) 2938 { 2939 polldat_t **hashtbl; 2940 int i; 2941 2942 hashtbl = pcp->pc_hash; 2943 for (i = 0; i < pcp->pc_hashsize; i++) { 2944 if (hashtbl[i] != NULL) { 2945 polldat_t *pdp, *pdp2; 2946 2947 pdp = hashtbl[i]; 2948 while (pdp != NULL) { 2949 pdp2 = pdp->pd_hashnext; 2950 if (pdp->pd_ref != NULL) { 2951 kmem_free(pdp->pd_ref, sizeof (xref_t) * 2952 pdp->pd_nsets); 2953 } 2954 kmem_free(pdp, sizeof (polldat_t)); 2955 pdp = pdp2; 2956 pcp->pc_fdcount--; 2957 } 2958 } 2959 } 2960 ASSERT(pcp->pc_fdcount == 0); 2961 kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize); 2962 kmem_free(pcp->pc_bitmap, 2963 sizeof (ulong_t) * (pcp->pc_mapsize/BT_NBIPUL)); 2964 mutex_destroy(&pcp->pc_no_exit); 2965 mutex_destroy(&pcp->pc_lock); 2966 cv_destroy(&pcp->pc_cv); 2967 cv_destroy(&pcp->pc_busy_cv); 2968 kmem_free(pcp, sizeof (pollcache_t)); 2969 } 2970 2971 pollcacheset_t * 2972 pcacheset_create(int nsets) 2973 { 2974 return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP)); 2975 } 2976 2977 void 2978 pcacheset_destroy(pollcacheset_t *pcsp, int nsets) 2979 { 2980 int i; 2981 2982 for (i = 0; i < nsets; i++) { 2983 if (pcsp[i].pcs_pollfd != NULL) { 2984 kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds * 2985 sizeof (pollfd_t)); 2986 } 2987 } 2988 kmem_free(pcsp, sizeof (pollcacheset_t) * nsets); 2989 } 2990 2991 /* 2992 * Check each duplicated poll fd in the poll list. It may be necessary to 2993 * VOP_POLL the same fd again using different poll events. getf() has been 2994 * done by caller. 
This routine returns 0 if it can successfully process the
2995 * entire poll fd list. It returns -1 if the underlying vnode has changed
2996 * during a VOP_POLL, in which case the caller has to repoll. It returns a
2997 * positive value if VOP_POLL failed.
2998 */
2999 static int
3000 plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp,
3001 int entry, int *fdcntp)
3002 {
3003 int i;
3004 int fd;
3005 nfds_t nfds = psp->ps_nfds;
3006 
3007 fd = pollfdp[entry].fd;
3008 for (i = entry + 1; i < nfds; i++) {
3009 if (pollfdp[i].fd == fd) {
3010 if (pollfdp[i].events == pollfdp[entry].events) {
3011 if ((pollfdp[i].revents =
3012 pollfdp[entry].revents) != 0) {
3013 (*fdcntp)++;
3014 }
3015 } else {
3016 
3017 int error;
3018 pollhead_t *php;
3019 pollcache_t *pcp = psp->ps_pcache;
3020 
3021 /*
3022 * the events are different. VOP_POLL on this
3023 * fd so that we don't miss any revents.
3024 */
3025 php = NULL;
3026 ASSERT(curthread->t_pollcache == NULL);
3027 error = VOP_POLL(fp->f_vnode,
3028 pollfdp[i].events, 0,
3029 &pollfdp[i].revents, &php, NULL);
3030 if (error) {
3031 return (error);
3032 }
3033 /*
3034 * layered devices (e.g. console driver)
3035 * may change the vnode and thus the pollhead
3036 * pointer out from underneath us.
3037 */
3038 if (php != NULL && pdp->pd_php != NULL &&
3039 php != pdp->pd_php) {
3040 pollhead_delete(pdp->pd_php, pdp);
3041 pdp->pd_php = php;
3042 pollhead_insert(php, pdp);
3043 /*
3044 * We could have missed a wakeup on the
3045 * new target device. Make sure the new
3046 * target gets polled once.
3047 */
3048 BT_SET(pcp->pc_bitmap, fd);
3049 return (-1);
3050 }
3051 if (pollfdp[i].revents) {
3052 (*fdcntp)++;
3053 }
3054 }
3055 }
3056 }
3057 return (0);
3058 }
3059