1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 32 * Copyright 2015, Joyent, Inc. 33 * Copyright 2022 Oxide Computer Company 34 */ 35 36 /* 37 * Portions of this source code were derived from Berkeley 4.3 BSD 38 * under license from the Regents of the University of California. 39 */ 40 41 #include <sys/param.h> 42 #include <sys/isa_defs.h> 43 #include <sys/types.h> 44 #include <sys/sysmacros.h> 45 #include <sys/user.h> 46 #include <sys/systm.h> 47 #include <sys/errno.h> 48 #include <sys/time.h> 49 #include <sys/vnode.h> 50 #include <sys/file.h> 51 #include <sys/mode.h> 52 #include <sys/proc.h> 53 #include <sys/uio.h> 54 #include <sys/poll_impl.h> 55 #include <sys/kmem.h> 56 #include <sys/cmn_err.h> 57 #include <sys/debug.h> 58 #include <sys/bitmap.h> 59 #include <sys/kstat.h> 60 #include <sys/rctl.h> 61 #include <sys/port_impl.h> 62 #include <sys/schedctl.h> 63 #include <sys/cpu.h> 64 65 #define NPHLOCKS 64 /* Number of locks; must be power of 2 */ 66 #define PHLOCKADDR(php) &plocks[(((uintptr_t)(php)) >> 8) & (NPHLOCKS - 1)] 67 #define PHLOCK(php) PHLOCKADDR(php).pp_lock 68 #define PH_ENTER(php) mutex_enter(PHLOCK(php)) 69 #define PH_EXIT(php) mutex_exit(PHLOCK(php)) 70 #define VALID_POLL_EVENTS (POLLIN | POLLPRI | POLLOUT | POLLRDNORM \ 71 | POLLRDBAND | POLLWRBAND | POLLHUP | POLLERR | POLLNVAL) 72 73 /* 74 * global counters to collect some stats 75 */ 76 static struct { 77 kstat_named_t polllistmiss; /* failed to find a cached poll list */ 78 kstat_named_t pollcachehit; /* list matched 100% w/ cached one */ 79 kstat_named_t pollcachephit; /* list matched < 100% w/ cached one */ 80 kstat_named_t pollcachemiss; /* every list entry is dif from cache */ 81 kstat_named_t pollunlockfail; /* failed to perform pollunlock */ 82 } pollstats = { 83 { "polllistmiss", KSTAT_DATA_UINT64 }, 84 { "pollcachehit", KSTAT_DATA_UINT64 }, 85 { "pollcachephit", KSTAT_DATA_UINT64 }, 86 { "pollcachemiss", KSTAT_DATA_UINT64 }, 87 { "pollunlockfail", KSTAT_DATA_UINT64 } 88 }; 89 90 kstat_named_t *pollstats_ptr = (kstat_named_t *)&pollstats; 91 uint_t pollstats_ndata = sizeof (pollstats) / sizeof (kstat_named_t); 92 93 struct pplock { 94 kmutex_t pp_lock; 95 short pp_flag; 96 kcondvar_t pp_wait_cv; 97 int32_t pp_pad; /* to a nice round 16 bytes */ 98 }; 99 100 static struct pplock plocks[NPHLOCKS]; /* Hash array of pollhead locks */ 101 102 /* Contention lock & list for preventing deadlocks in recursive /dev/poll. 
 */
static kmutex_t pollstate_contenders_lock;
static pollstate_t *pollstate_contenders = NULL;

#ifdef DEBUG
static int pollchecksanity(pollstate_t *, nfds_t);
static int pollcheckxref(pollstate_t *, int);
static void pollcheckphlist(void);
static int pollcheckrevents(pollstate_t *, int, int, int);
static void checkpolldat(pollstate_t *);
#endif	/* DEBUG */
static int plist_chkdupfd(file_t *, polldat_t *, pollstate_t *, pollfd_t *, int,
    int *);

/*
 * Data structure overview:
 * The per-thread poll state consists of
 *	one pollstate_t
 *	one pollcache_t
 *	one bitmap with one event bit per fd
 *	a (two-dimensional) hashed array of polldat_t structures - one entry
 *	per fd
 *
 * This conglomerate of data structures interacts with
 *	the pollhead which is used by VOP_POLL and pollwakeup
 *	(protected by the PHLOCK, cached array of plocks), and
 *	the fpollinfo list hanging off the fi_list which is used to notify
 *	poll when a cached fd is closed. This is protected by uf_lock.
 *
 * Invariants:
 *	pd_php (pollhead pointer) is set iff (if and only if) the polldat
 *	is on that pollhead. This is modified atomically under pc_lock.
 *
 *	pd_fp (file_t pointer) is set iff the thread is on the fpollinfo
 *	list for that open file.
 *	This is modified atomically under pc_lock.
 *
 *	pd_count is the sum (over all values of i) of pd_ref[i].xf_refcnt.
 *	Iff pd_ref[i].xf_refcnt >= 1 then
 *		ps_pcacheset[i].pcs_pollfd[pd_ref[i].xf_position].fd == pd_fd
 *	Iff pd_ref[i].xf_refcnt > 1 then
 *		In ps_pcacheset[i].pcs_pollfd between index
 *		pd_ref[i].xf_position and the end of the list
 *		there are xf_refcnt entries with .fd == pd_fd
 *
 * Locking design:
 * Whenever possible the design relies on the fact that the poll cache state
 * is per thread, and is thus self-synchronizing for both poll and exit.
 * The key interactions where other threads access the state are therefore:
 *	pollwakeup (and polltime), and
 *	close cleaning up the cached references to an open file
 *
 * The two key locks in poll proper are ps_lock and pc_lock.
 *
 * The ps_lock is used for synchronization between poll, (lwp_)exit and close
 * to ensure that modifications to the pollcacheset structure are serialized.
 * This lock is held through most of poll() except where poll sleeps,
 * since there is little need to handle closes concurrently with the execution
 * of poll.
 * The pc_lock protects most of the fields in the pollcache structure and
 * polldat structures (which are accessed by poll, pollwakeup, and polltime),
 * with the exception of fields that are only modified when only one thread
 * can access this per-thread state.
 * Those exceptions occur in poll when first allocating the per-thread state,
 * when poll grows the number of polldat (never shrinks), and when
 * exit/pollcleanup has ensured that there are no references from either
 * pollheads or fpollinfo to the thread's poll state.
 *
 * The poll(2) system call is the only path on which ps_lock and pc_lock are
 * both held, in that order. It needs ps_lock to synchronize with close and
 * lwp_exit; and pc_lock with pollwakeup.
 *
 * The locking interaction between pc_lock and PHLOCK takes into account
 * that poll acquires these locks in the order of pc_lock and then PHLOCK,
 * while pollwakeup does it in the reverse order. Thus pollwakeup implements
 * deadlock avoidance by dropping the locks and reacquiring them in the
 * reverse order. For this to work pollwakeup needs to prevent the thread
 * from exiting and freeing all of the poll related state. This is done
 * using
 *	the pc_no_exit lock
 *	the pc_busy counter
 *	the pc_busy_cv condition variable
 *
 * The locking interaction between pc_lock and uf_lock has similar
 * issues. Poll holds ps_lock and/or pc_lock across calls to getf/releasef
 * which acquire uf_lock. The poll cleanup in close needs to hold uf_lock
 * to prevent poll or exit from doing a delfpollinfo after which the thread
 * might exit. But the cleanup needs to acquire pc_lock when modifying
 * the poll cache state. The solution is to use pc_busy and do the close
 * cleanup in two phases:
 *	First close calls pollblockexit which increments pc_busy.
 *	This prevents the per-thread poll related state from being freed.
 *	Then close drops uf_lock and calls pollcacheclean.
 *	This routine can then acquire pc_lock and remove any references
 *	to the closing fd (as well as recording that it has been closed
 *	so that a POLLNVAL can be generated even if the fd is reused before
 *	poll has been woken up and checked getf() again).
 *
 * When removing a polled fd from the poll cache, the fd is always removed
 * from the pollhead list first and then from the fpollinfo list, i.e.,
 * polldat_disassociate() is called before delfpollinfo().
 *
 *
 * Locking hierarchy:
 *	pc_no_exit is a leaf level lock.
 *	ps_lock is held when acquiring pc_lock (except when pollwakeup
 *	acquires pc_lock).
 *	pc_lock might be held when acquiring PHLOCK (polldat_associate/
 *	polldat_disassociate)
 *	pc_lock is always held (but this is not required)
 *	when acquiring PHLOCK (in polladd/polldat_disassociate and pollwakeup
 *	called from pcache_clean_entry).
 *	pc_lock is held across addfpollinfo/delfpollinfo which acquire
 *	uf_lock.
 *	pc_lock is held across getf/releasef which acquire uf_lock.
 *	ps_lock might be held across getf/releasef which acquire uf_lock.
 *	pollwakeup tries to acquire pc_lock while holding PHLOCK
 *	but drops the locks and reacquires them in reverse order to avoid
 *	deadlock.
 *
 * Note also that there is deadlock avoidance support for VOP_POLL routines
 * and pollwakeup involving a file system or driver lock.
 * See below.
 */

/*
 * Deadlock avoidance support for VOP_POLL() routines. This is
 * sometimes necessary to prevent deadlock between polling threads
 * (which hold poll locks on entry to xx_poll(), then acquire foo)
 * and pollwakeup() threads (which hold foo, then acquire poll locks).
 *
 * pollunlock(*cookie) releases whatever poll locks the current thread holds,
 *	setting a cookie for use by pollrelock();
 *
 * pollrelock(cookie) reacquires previously dropped poll locks;
 *
 * polllock(php, mutex) does the common case: pollunlock(),
 *	acquire the problematic mutex, pollrelock().
 *
 * If polllock() or pollunlock() return non-zero, it indicates that a recursive
 * /dev/poll is in progress and pollcache locks cannot be dropped. Callers
 * must handle this by indicating a POLLNVAL in the revents of the VOP_POLL.
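 *
 * As an illustrative sketch only (xx_chpoll, xx_lock and xx_php are
 * hypothetical names, not defined in this file), a driver poll entry point
 * whose state is also touched by its pollwakeup() path might use polllock()
 * like this:
 *
 *	static int
 *	xx_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
 *	    struct pollhead **phpp)
 *	{
 *		if (polllock(&xx_php, &xx_lock) != 0) {
 *			*reventsp |= POLLNVAL;
 *			return (0);
 *		}
 *		(examine device state, set *reventsp and *phpp)
 *		mutex_exit(&xx_lock);
 *		return (0);
 *	}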
244 */ 245 int 246 pollunlock(int *lockstate) 247 { 248 pollstate_t *ps = curthread->t_pollstate; 249 pollcache_t *pcp; 250 251 ASSERT(lockstate != NULL); 252 253 /* 254 * There is no way to safely perform a pollunlock() while in the depths 255 * of a recursive /dev/poll operation. 256 */ 257 if (ps != NULL && ps->ps_depth > 1) { 258 ps->ps_flags |= POLLSTATE_ULFAIL; 259 pollstats.pollunlockfail.value.ui64++; 260 return (-1); 261 } 262 263 /* 264 * t_pollcache is set by /dev/poll and event ports (port_fd.c). 265 * If the pollrelock/pollunlock is called as a result of poll(2), 266 * the t_pollcache should be NULL. 267 */ 268 if (curthread->t_pollcache == NULL) 269 pcp = ps->ps_pcache; 270 else 271 pcp = curthread->t_pollcache; 272 273 if (!mutex_owned(&pcp->pc_lock)) { 274 *lockstate = 0; 275 } else { 276 *lockstate = 1; 277 mutex_exit(&pcp->pc_lock); 278 } 279 return (0); 280 } 281 282 /* 283 * The pc_lock and pc_flag fields of port_fdcache_t must exactly match those of 284 * pollcache_t as they are accessed through t_pollcache as if they were part of 285 * a "real" pollcache. 286 */ 287 CTASSERT(offsetof(pollcache_t, pc_lock) == offsetof(port_fdcache_t, pc_lock)); 288 CTASSERT(offsetof(pollcache_t, pc_flag) == offsetof(port_fdcache_t, pc_flag)); 289 290 void 291 pollrelock(int lockstate) 292 { 293 pollstate_t *ps = curthread->t_pollstate; 294 pollcache_t *pcp; 295 296 /* Skip this whole ordeal if the pollcache was not locked to begin */ 297 if (lockstate == 0) 298 return; 299 300 /* 301 * t_pollcache is set by /dev/poll and event ports (port_fd.c). 302 * If the pollrelock/pollunlock is called as a result of poll(2), 303 * the t_pollcache should be NULL. 304 */ 305 if (curthread->t_pollcache == NULL) 306 pcp = ps->ps_pcache; 307 else 308 pcp = curthread->t_pollcache; 309 310 mutex_enter(&pcp->pc_lock); 311 } 312 313 /* ARGSUSED */ 314 int 315 polllock(pollhead_t *php, kmutex_t *lp) 316 { 317 if (mutex_tryenter(lp) == 0) { 318 int state; 319 320 if (pollunlock(&state) != 0) { 321 return (-1); 322 } 323 mutex_enter(lp); 324 pollrelock(state); 325 } 326 return (0); 327 } 328 329 static int 330 poll_common(pollfd_t *fds, nfds_t nfds, timespec_t *tsp, k_sigset_t *ksetp) 331 { 332 kthread_t *t = curthread; 333 klwp_t *lwp = ttolwp(t); 334 proc_t *p = ttoproc(t); 335 int fdcnt = 0; 336 int i; 337 hrtime_t deadline; /* hrtime value when we want to return */ 338 pollfd_t *pollfdp; 339 pollstate_t *ps; 340 pollcache_t *pcp; 341 int error = 0; 342 nfds_t old_nfds; 343 int cacheindex = 0; /* which cache set is used */ 344 345 /* 346 * Determine the precise future time of the requested timeout, if any. 347 */ 348 if (tsp == NULL) { 349 deadline = -1; 350 } else if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) { 351 deadline = 0; 352 } else { 353 /* They must wait at least a tick. */ 354 deadline = ((hrtime_t)tsp->tv_sec * NANOSEC) + tsp->tv_nsec; 355 deadline = MAX(deadline, nsec_per_tick); 356 deadline += gethrtime(); 357 } 358 359 /* 360 * Reset our signal mask, if requested. 361 */ 362 if (ksetp != NULL) { 363 mutex_enter(&p->p_lock); 364 schedctl_finish_sigblock(t); 365 lwp->lwp_sigoldmask = t->t_hold; 366 t->t_hold = *ksetp; 367 t->t_flag |= T_TOMASK; 368 /* 369 * Call cv_reltimedwait_sig() just to check for signals. 370 * We will return immediately with either 0 or -1. 
371 */ 372 if (!cv_reltimedwait_sig(&t->t_delay_cv, &p->p_lock, 0, 373 TR_CLOCK_TICK)) { 374 mutex_exit(&p->p_lock); 375 error = EINTR; 376 goto pollout; 377 } 378 mutex_exit(&p->p_lock); 379 } 380 381 /* 382 * Check to see if this one just wants to use poll() as a timeout. 383 * If yes then bypass all the other stuff and make it sleep. 384 */ 385 if (nfds == 0) { 386 /* 387 * Sleep until we have passed the requested future 388 * time or until interrupted by a signal. 389 * Do not check for signals if we do not want to wait. 390 */ 391 if (deadline != 0) { 392 mutex_enter(&t->t_delay_lock); 393 while ((error = cv_timedwait_sig_hrtime(&t->t_delay_cv, 394 &t->t_delay_lock, deadline)) > 0) 395 continue; 396 mutex_exit(&t->t_delay_lock); 397 error = (error == 0) ? EINTR : 0; 398 } 399 goto pollout; 400 } 401 402 if (nfds > p->p_fno_ctl) { 403 mutex_enter(&p->p_lock); 404 (void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE], 405 p->p_rctls, p, RCA_SAFE); 406 mutex_exit(&p->p_lock); 407 error = EINVAL; 408 goto pollout; 409 } 410 411 /* 412 * Need to allocate memory for pollstate before anything because 413 * the mutex and cv are created in this space 414 */ 415 ps = pollstate_create(); 416 417 if (ps->ps_pcache == NULL) 418 ps->ps_pcache = pcache_alloc(); 419 pcp = ps->ps_pcache; 420 421 /* 422 * NOTE: for performance, buffers are saved across poll() calls. 423 * The theory is that if a process polls heavily, it tends to poll 424 * on the same set of descriptors. Therefore, we only reallocate 425 * buffers when nfds changes. There is no hysteresis control, 426 * because there is no data to suggest that this is necessary; 427 * the penalty of reallocating is not *that* great in any event. 428 */ 429 old_nfds = ps->ps_nfds; 430 if (nfds != old_nfds) { 431 432 kmem_free(ps->ps_pollfd, old_nfds * sizeof (pollfd_t)); 433 pollfdp = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP); 434 ps->ps_pollfd = pollfdp; 435 ps->ps_nfds = nfds; 436 } 437 438 pollfdp = ps->ps_pollfd; 439 if (copyin(fds, pollfdp, nfds * sizeof (pollfd_t))) { 440 error = EFAULT; 441 goto pollout; 442 } 443 444 if (fds == NULL) { 445 /* 446 * If the process has page 0 mapped, then the copyin() above 447 * will succeed even if fds is NULL. However, our cached 448 * poll lists are keyed by the address of the passed-in fds 449 * structure, and we use the value NULL to indicate an unused 450 * poll cache list entry. As such, we elect not to support 451 * NULL as a valid (user) memory address and fail the poll() 452 * call. 453 */ 454 error = EINVAL; 455 goto pollout; 456 } 457 458 /* 459 * If this thread polls for the first time, allocate ALL poll 460 * cache data structures and cache the poll fd list. This 461 * allocation is delayed till now because lwp's polling 0 fd 462 * (i.e. using poll as timeout()) don't need this memory. 463 */ 464 mutex_enter(&ps->ps_lock); 465 pcp = ps->ps_pcache; 466 ASSERT(pcp != NULL); 467 if (pcp->pc_bitmap == NULL) { 468 pcache_create(pcp, nfds); 469 /* 470 * poll and cache this poll fd list in ps_pcacheset[0]. 471 */ 472 error = pcacheset_cache_list(ps, fds, &fdcnt, cacheindex); 473 if (fdcnt || error) { 474 mutex_exit(&ps->ps_lock); 475 goto pollout; 476 } 477 } else { 478 pollcacheset_t *pcset = ps->ps_pcacheset; 479 480 /* 481 * Not first time polling. Select a cached poll list by 482 * matching user pollfd list buffer address. 
483 */ 484 for (cacheindex = 0; cacheindex < ps->ps_nsets; cacheindex++) { 485 if (pcset[cacheindex].pcs_usradr == (uintptr_t)fds) { 486 if ((++pcset[cacheindex].pcs_count) == 0) { 487 /* 488 * counter is wrapping around. 489 */ 490 pcacheset_reset_count(ps, cacheindex); 491 } 492 /* 493 * examine and resolve possible 494 * difference of the current poll 495 * list and previously cached one. 496 * If there is an error during resolve(), 497 * the callee will guarantee the consistency 498 * of cached poll list and cache content. 499 */ 500 error = pcacheset_resolve(ps, nfds, &fdcnt, 501 cacheindex); 502 if (error) { 503 mutex_exit(&ps->ps_lock); 504 goto pollout; 505 } 506 break; 507 } 508 509 /* 510 * Note that pcs_usradr field of an used entry won't be 511 * 0 because it stores the address of passed-in fds, 512 * and 0 fds will not be cached (Then it is either 513 * the special timeout case when nfds is 0 or it returns 514 * failure directly). 515 */ 516 if (pcset[cacheindex].pcs_usradr == (uintptr_t)NULL) { 517 /* 518 * found an unused entry. Use it to cache 519 * this poll list. 520 */ 521 error = pcacheset_cache_list(ps, fds, &fdcnt, 522 cacheindex); 523 if (fdcnt || error) { 524 mutex_exit(&ps->ps_lock); 525 goto pollout; 526 } 527 break; 528 } 529 } 530 if (cacheindex == ps->ps_nsets) { 531 /* 532 * We failed to find a matching cached poll fd list. 533 * replace an old list. 534 */ 535 pollstats.polllistmiss.value.ui64++; 536 cacheindex = pcacheset_replace(ps); 537 ASSERT(cacheindex < ps->ps_nsets); 538 pcset[cacheindex].pcs_usradr = (uintptr_t)fds; 539 error = pcacheset_resolve(ps, nfds, &fdcnt, cacheindex); 540 if (error) { 541 mutex_exit(&ps->ps_lock); 542 goto pollout; 543 } 544 } 545 } 546 547 /* 548 * Always scan the bitmap with the lock on the pollcache held. 549 * This is to make sure that a wakeup does not come undetected. 550 * If the lock is not held, a pollwakeup could have come for an 551 * fd we already checked but before this thread sleeps, in which 552 * case the wakeup is missed. Now we hold the pcache lock and 553 * check the bitmap again. This will prevent wakeup from happening 554 * while we hold pcache lock since pollwakeup() will also lock 555 * the pcache before updating poll bitmap. 556 */ 557 mutex_enter(&pcp->pc_lock); 558 for (;;) { 559 pcp->pc_flag = 0; 560 error = pcache_poll(pollfdp, ps, nfds, &fdcnt, cacheindex); 561 if (fdcnt || error) { 562 mutex_exit(&pcp->pc_lock); 563 mutex_exit(&ps->ps_lock); 564 break; 565 } 566 567 /* 568 * If PC_POLLWAKE is set, a pollwakeup() was performed on 569 * one of the file descriptors. This can happen only if 570 * one of the VOP_POLL() functions dropped pcp->pc_lock. 571 * The only current cases of this is in procfs (prpoll()) 572 * and STREAMS (strpoll()). 573 */ 574 if (pcp->pc_flag & PC_POLLWAKE) 575 continue; 576 577 /* 578 * If you get here, the poll of fds was unsuccessful. 579 * Wait until some fd becomes readable, writable, or gets 580 * an exception, or until a signal or a timeout occurs. 581 * Do not check for signals if we have a zero timeout. 582 */ 583 mutex_exit(&ps->ps_lock); 584 if (deadline == 0) { 585 error = -1; 586 } else { 587 error = cv_timedwait_sig_hrtime(&pcp->pc_cv, 588 &pcp->pc_lock, deadline); 589 } 590 mutex_exit(&pcp->pc_lock); 591 /* 592 * If we have received a signal or timed out 593 * then break out and return. 594 */ 595 if (error <= 0) { 596 error = (error == 0) ? EINTR : 0; 597 break; 598 } 599 /* 600 * We have not received a signal or timed out. 
601 * Continue around and poll fds again. 602 */ 603 mutex_enter(&ps->ps_lock); 604 mutex_enter(&pcp->pc_lock); 605 } 606 607 pollout: 608 /* 609 * If we changed the signal mask but we received 610 * no signal then restore the signal mask. 611 * Otherwise psig() will deal with the signal mask. 612 */ 613 if (ksetp != NULL) { 614 mutex_enter(&p->p_lock); 615 if (lwp->lwp_cursig == 0) { 616 t->t_hold = lwp->lwp_sigoldmask; 617 t->t_flag &= ~T_TOMASK; 618 } 619 mutex_exit(&p->p_lock); 620 } 621 622 if (error) 623 return (set_errno(error)); 624 625 /* 626 * Copy out the events and return the fdcnt to the user. 627 */ 628 if (nfds != 0 && 629 copyout(pollfdp, fds, nfds * sizeof (pollfd_t))) 630 return (set_errno(EFAULT)); 631 632 #ifdef DEBUG 633 /* 634 * Another sanity check: 635 */ 636 if (fdcnt) { 637 int reventcnt = 0; 638 639 for (i = 0; i < nfds; i++) { 640 if (pollfdp[i].fd < 0) { 641 ASSERT(pollfdp[i].revents == 0); 642 continue; 643 } 644 if (pollfdp[i].revents) { 645 reventcnt++; 646 } 647 } 648 ASSERT(fdcnt == reventcnt); 649 } else { 650 for (i = 0; i < nfds; i++) { 651 ASSERT(pollfdp[i].revents == 0); 652 } 653 } 654 #endif /* DEBUG */ 655 656 return (fdcnt); 657 } 658 659 /* 660 * This is the system call trap that poll(), 661 * select() and pselect() are built upon. 662 * It is a private interface between libc and the kernel. 663 */ 664 int 665 pollsys(pollfd_t *fds, nfds_t nfds, timespec_t *timeoutp, sigset_t *setp) 666 { 667 timespec_t ts; 668 timespec_t *tsp; 669 sigset_t set; 670 k_sigset_t kset; 671 k_sigset_t *ksetp; 672 model_t datamodel = get_udatamodel(); 673 674 if (timeoutp == NULL) 675 tsp = NULL; 676 else { 677 if (datamodel == DATAMODEL_NATIVE) { 678 if (copyin(timeoutp, &ts, sizeof (ts))) 679 return (set_errno(EFAULT)); 680 } else { 681 timespec32_t ts32; 682 683 if (copyin(timeoutp, &ts32, sizeof (ts32))) 684 return (set_errno(EFAULT)); 685 TIMESPEC32_TO_TIMESPEC(&ts, &ts32) 686 } 687 688 if (itimerspecfix(&ts)) 689 return (set_errno(EINVAL)); 690 tsp = &ts; 691 } 692 693 if (setp == NULL) 694 ksetp = NULL; 695 else { 696 if (copyin(setp, &set, sizeof (set))) 697 return (set_errno(EFAULT)); 698 sigutok(&set, &kset); 699 ksetp = &kset; 700 } 701 702 return (poll_common(fds, nfds, tsp, ksetp)); 703 } 704 705 /* 706 * Clean up any state left around by poll(2). Called when a thread exits. 707 */ 708 void 709 pollcleanup() 710 { 711 pollstate_t *ps = curthread->t_pollstate; 712 pollcache_t *pcp; 713 714 if (ps == NULL) 715 return; 716 pcp = ps->ps_pcache; 717 /* 718 * free up all cached poll fds 719 */ 720 if (pcp == NULL) { 721 /* this pollstate is used by /dev/poll */ 722 goto pollcleanout; 723 } 724 725 if (pcp->pc_bitmap != NULL) { 726 ASSERT(MUTEX_NOT_HELD(&ps->ps_lock)); 727 /* 728 * a close lwp can race with us when cleaning up a polldat 729 * entry. We hold the ps_lock when cleaning hash table. 730 * Since this pollcache is going away anyway, there is no 731 * need to hold the pc_lock. 732 */ 733 mutex_enter(&ps->ps_lock); 734 pcache_clean(pcp); 735 mutex_exit(&ps->ps_lock); 736 #ifdef DEBUG 737 /* 738 * At this point, all fds cached by this lwp should be 739 * cleaned up. There should be no fd in fi_list still 740 * reference this thread. 
741 */ 742 checkfpollinfo(); /* sanity check */ 743 pollcheckphlist(); /* sanity check */ 744 #endif /* DEBUG */ 745 } 746 /* 747 * Be sure no one is referencing thread before exiting 748 */ 749 mutex_enter(&pcp->pc_no_exit); 750 ASSERT(pcp->pc_busy >= 0); 751 while (pcp->pc_busy > 0) 752 cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit); 753 mutex_exit(&pcp->pc_no_exit); 754 pollcleanout: 755 pollstate_destroy(ps); 756 curthread->t_pollstate = NULL; 757 } 758 759 /* 760 * pollwakeup() - poke threads waiting in poll() for some event 761 * on a particular object. 762 * 763 * The threads hanging off of the specified pollhead structure are scanned. 764 * If their event mask matches the specified event(s), then pollnotify() is 765 * called to poke the thread. 766 * 767 * Multiple events may be specified. When POLLHUP or POLLERR are specified, 768 * all waiting threads are poked. 769 * 770 * It is important that pollnotify() not drop the lock protecting the list 771 * of threads. 772 */ 773 void 774 pollwakeup(pollhead_t *php, short events_arg) 775 { 776 polldat_t *pdp; 777 int events = (ushort_t)events_arg; 778 struct plist { 779 port_t *pp; 780 int pevents; 781 struct plist *next; 782 }; 783 struct plist *plhead = NULL, *pltail = NULL; 784 785 retry: 786 PH_ENTER(php); 787 788 for (pdp = php->ph_list; pdp; pdp = pdp->pd_next) { 789 if ((pdp->pd_events & events) || 790 (events & (POLLHUP | POLLERR))) { 791 792 pollcache_t *pcp; 793 794 if (pdp->pd_portev != NULL) { 795 port_kevent_t *pkevp = pdp->pd_portev; 796 /* 797 * Object (fd) is associated with an event port, 798 * => send event notification to the port. 799 */ 800 ASSERT(pkevp->portkev_source == PORT_SOURCE_FD); 801 mutex_enter(&pkevp->portkev_lock); 802 if (pkevp->portkev_flags & PORT_KEV_VALID) { 803 int pevents; 804 805 pkevp->portkev_flags &= ~PORT_KEV_VALID; 806 pkevp->portkev_events |= events & 807 (pdp->pd_events | POLLHUP | 808 POLLERR); 809 /* 810 * portkev_lock mutex will be released 811 * by port_send_event(). 812 */ 813 port_send_event(pkevp); 814 815 /* 816 * If we have some thread polling the 817 * port's fd, add it to the list. They 818 * will be notified later. 819 * The port_pollwkup() will flag the 820 * port_t so that it will not disappear 821 * till port_pollwkdone() is called. 822 */ 823 pevents = 824 port_pollwkup(pkevp->portkev_port); 825 if (pevents) { 826 struct plist *t; 827 t = kmem_zalloc( 828 sizeof (struct plist), 829 KM_SLEEP); 830 t->pp = pkevp->portkev_port; 831 t->pevents = pevents; 832 if (plhead == NULL) { 833 plhead = t; 834 } else { 835 pltail->next = t; 836 } 837 pltail = t; 838 } 839 } else { 840 mutex_exit(&pkevp->portkev_lock); 841 } 842 continue; 843 } 844 845 pcp = pdp->pd_pcache; 846 847 /* 848 * Try to grab the lock for this thread. If 849 * we don't get it then we may deadlock so 850 * back out and restart all over again. Note 851 * that the failure rate is very very low. 852 */ 853 if (mutex_tryenter(&pcp->pc_lock)) { 854 pollnotify(pcp, pdp->pd_fd); 855 mutex_exit(&pcp->pc_lock); 856 } else { 857 /* 858 * We are here because: 859 * 1) This thread has been woke up 860 * and is trying to get out of poll(). 861 * 2) Some other thread is also here 862 * but with a different pollhead lock. 863 * 864 * So, we need to drop the lock on pollhead 865 * because of (1) but we want to prevent 866 * that thread from doing lwp_exit() or 867 * devpoll close. We want to ensure that 868 * the pollcache pointer is still invalid. 
869 * 870 * Solution: Grab the pcp->pc_no_exit lock, 871 * increment the pc_busy counter, drop every 872 * lock in sight. Get out of the way and wait 873 * for type (2) threads to finish. 874 */ 875 876 mutex_enter(&pcp->pc_no_exit); 877 pcp->pc_busy++; /* prevents exit()'s */ 878 mutex_exit(&pcp->pc_no_exit); 879 880 PH_EXIT(php); 881 mutex_enter(&pcp->pc_lock); 882 mutex_exit(&pcp->pc_lock); 883 mutex_enter(&pcp->pc_no_exit); 884 pcp->pc_busy--; 885 if (pcp->pc_busy == 0) { 886 /* 887 * Wakeup the thread waiting in 888 * thread_exit(). 889 */ 890 cv_signal(&pcp->pc_busy_cv); 891 } 892 mutex_exit(&pcp->pc_no_exit); 893 goto retry; 894 } 895 } 896 } 897 898 899 /* 900 * Event ports - If this php is of the port on the list, 901 * call port_pollwkdone() to release it. The port_pollwkdone() 902 * needs to be called before dropping the PH lock so that any new 903 * thread attempting to poll this port are blocked. There can be 904 * only one thread here in pollwakeup notifying this port's fd. 905 */ 906 if (plhead != NULL && &plhead->pp->port_pollhd == php) { 907 struct plist *t; 908 port_pollwkdone(plhead->pp); 909 t = plhead; 910 plhead = plhead->next; 911 kmem_free(t, sizeof (struct plist)); 912 } 913 PH_EXIT(php); 914 915 /* 916 * Event ports - Notify threads polling the event port's fd. 917 * This is normally done in port_send_event() where it calls 918 * pollwakeup() on the port. But, for PORT_SOURCE_FD source alone, 919 * we do it here in pollwakeup() to avoid a recursive call. 920 */ 921 if (plhead != NULL) { 922 php = &plhead->pp->port_pollhd; 923 events = plhead->pevents; 924 goto retry; 925 } 926 } 927 928 /* 929 * This function is called to inform a thread (or threads) that an event being 930 * polled on has occurred. The pollstate lock on the thread should be held 931 * on entry. 932 */ 933 void 934 pollnotify(pollcache_t *pcp, int fd) 935 { 936 ASSERT(fd < pcp->pc_mapsize); 937 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 938 BT_SET(pcp->pc_bitmap, fd); 939 pcp->pc_flag |= PC_POLLWAKE; 940 cv_broadcast(&pcp->pc_cv); 941 pcache_wake_parents(pcp); 942 } 943 944 /* 945 * Associate a polldat entry with a pollhead (add it to ph_list). 946 * 947 * The polldat struct is used by pollwakeup to wake sleeping pollers when polled 948 * events has happened. 949 */ 950 void 951 polldat_associate(polldat_t *pdp, pollhead_t *php) 952 { 953 ASSERT3P(pdp->pd_php, ==, NULL); 954 ASSERT3P(pdp->pd_next, ==, NULL); 955 956 PH_ENTER(php); 957 #ifdef DEBUG 958 /* The polldat should not be already on the list */ 959 for (polldat_t *wp = php->ph_list; wp != NULL; wp = wp->pd_next) { 960 ASSERT3P(wp, !=, pdp); 961 } 962 #endif /* DEBUG */ 963 964 pdp->pd_next = php->ph_list; 965 php->ph_list = pdp; 966 pdp->pd_php = php; 967 PH_EXIT(php); 968 } 969 970 /* 971 * Disassociate a polldat from its pollhead (if such an association exists). 972 */ 973 void 974 polldat_disassociate(polldat_t *pdp) 975 { 976 pollhead_t *php; 977 978 /* 979 * Acquire the lock for the pollhead which this polldat is associated 980 * with. This must be done with care, re-checking pd_php after entering 981 * the pollhead lock, since a racing pollhead_clean() could have already 982 * performed the disassociation. 983 */ 984 for (;;) { 985 php = pdp->pd_php; 986 if (php == NULL) { 987 /* polldat is not associated with a pollhead */ 988 return; 989 } 990 991 /* 992 * The lock for a given pollhead is not stored in the pollhead 993 * itself, but is rather a global entry in an array (plocks) 994 * which the pollhead pointer hashes into (see: PHLOCK()). 
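		 * (PHLOCKADDR() maps a pollhead to
		 * &plocks[((uintptr_t)php >> 8) & (NPHLOCKS - 1)], so unrelated
		 * pollheads may hash to, and thus share, the same lock.)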
		 */
		PH_ENTER(php);
		if (pdp->pd_php == php) {
			break;
		}
		PH_EXIT(php);
	}

	polldat_t **wpp = &php->ph_list, *wp = php->ph_list;
	while (wp != NULL) {
		if (wp == pdp) {
			/* Unlink the polldat from the list */
			*wpp = pdp->pd_next;
			pdp->pd_next = NULL;
			break;
		}
		wpp = &wp->pd_next;
		wp = wp->pd_next;
	}

#ifdef DEBUG
	/* It would be unexpected if pdp was not in the pollhead list */
	ASSERT(wp != NULL);

	/* Assert that pdp is not duplicated somewhere later in the list */
	for (wp = *wpp; wp; wp = wp->pd_next) {
		ASSERT(wp != pdp);
	}
#endif	/* DEBUG */

	pdp->pd_php = NULL;
	PH_EXIT(php);
}

/*
 * Walk through the poll fd lists to see if they are identical. This is an
 * expensive operation and should not be done more than once for each poll()
 * call.
 *
 * As an optimization (i.e., not having to go through the lists more than
 * once), this routine also clears the revents field of pollfd in 'current'.
 * Zeroing out the revents field of each entry in the current poll list is
 * required by the poll man page.
 *
 * Since the events field of the cached list has illegal poll events filtered
 * out, the current list applies the same filtering before comparison.
 *
 * The routine stops when it detects a meaningful difference, or when it
 * exhausts the lists.
 */
int
pcacheset_cmp(pollfd_t *current, pollfd_t *cached, pollfd_t *newlist, int n)
{
	int ix;

	for (ix = 0; ix < n; ix++) {
		/* Prefetch 64 bytes worth of 8-byte elements */
		if ((ix & 0x7) == 0) {
			prefetch_write_many((caddr_t)&current[ix + 8]);
			prefetch_write_many((caddr_t)&cached[ix + 8]);
		}
		if (current[ix].fd == cached[ix].fd) {
			/*
			 * Filter out invalid poll events while we are
			 * inside the loop.
			 */
			if (current[ix].events & ~VALID_POLL_EVENTS) {
				current[ix].events &= VALID_POLL_EVENTS;
				if (newlist != NULL)
					newlist[ix].events = current[ix].events;
			}
			if (current[ix].events == cached[ix].events) {
				current[ix].revents = 0;
				continue;
			}
		}
		if ((current[ix].fd < 0) && (cached[ix].fd < 0)) {
			current[ix].revents = 0;
			continue;
		}
		return (ix);
	}
	return (ix);
}

/*
 * This routine returns a pointer to a cached poll fd entry, or NULL if it
 * does not find it in the hash table.
 */
polldat_t *
pcache_lookup_fd(pollcache_t *pcp, int fd)
{
	int hashindex;
	polldat_t *pdp;

	hashindex = POLLHASH(pcp->pc_hashsize, fd);
	pdp = pcp->pc_hash[hashindex];
	while (pdp != NULL) {
		if (pdp->pd_fd == fd)
			break;
		pdp = pdp->pd_hashnext;
	}
	return (pdp);
}

polldat_t *
pcache_alloc_fd(int nsets)
{
	polldat_t *pdp;

	pdp = kmem_zalloc(sizeof (polldat_t), KM_SLEEP);
	if (nsets > 0) {
		pdp->pd_ref = kmem_zalloc(sizeof (xref_t) * nsets, KM_SLEEP);
		pdp->pd_nsets = nsets;
	}
	return (pdp);
}

/*
 * This routine inserts a polldat into the pollcache's hash table. It
 * may be necessary to grow the size of the hash table.
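 * The table is grown (see pcache_grow_hashtbl() below) before the new entry
 * is linked in whenever the number of cached fds, or nfds itself, exceeds
 * POLLHASHTHRESHOLD times the current number of hash buckets.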
1116 */ 1117 void 1118 pcache_insert_fd(pollcache_t *pcp, polldat_t *pdp, nfds_t nfds) 1119 { 1120 int hashindex; 1121 int fd; 1122 1123 if ((pcp->pc_fdcount > pcp->pc_hashsize * POLLHASHTHRESHOLD) || 1124 (nfds > pcp->pc_hashsize * POLLHASHTHRESHOLD)) { 1125 pcache_grow_hashtbl(pcp, nfds); 1126 } 1127 fd = pdp->pd_fd; 1128 hashindex = POLLHASH(pcp->pc_hashsize, fd); 1129 pdp->pd_hashnext = pcp->pc_hash[hashindex]; 1130 pcp->pc_hash[hashindex] = pdp; 1131 pcp->pc_fdcount++; 1132 1133 #ifdef DEBUG 1134 { 1135 /* 1136 * same fd should not appear on a hash list twice 1137 */ 1138 polldat_t *pdp1; 1139 for (pdp1 = pdp->pd_hashnext; pdp1; pdp1 = pdp1->pd_hashnext) { 1140 ASSERT(pdp->pd_fd != pdp1->pd_fd); 1141 } 1142 } 1143 #endif /* DEBUG */ 1144 } 1145 1146 /* 1147 * Grow the hash table -- either double the table size or round it to the 1148 * nearest multiples of POLLHASHCHUNKSZ, whichever is bigger. Rehash all the 1149 * elements on the hash table. 1150 */ 1151 void 1152 pcache_grow_hashtbl(pollcache_t *pcp, nfds_t nfds) 1153 { 1154 int oldsize; 1155 polldat_t **oldtbl; 1156 polldat_t *pdp, *pdp1; 1157 int i; 1158 #ifdef DEBUG 1159 int count = 0; 1160 #endif 1161 1162 ASSERT(pcp->pc_hashsize % POLLHASHCHUNKSZ == 0); 1163 oldsize = pcp->pc_hashsize; 1164 oldtbl = pcp->pc_hash; 1165 if (nfds > pcp->pc_hashsize * POLLHASHINC) { 1166 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 1167 ~(POLLHASHCHUNKSZ - 1); 1168 } else { 1169 pcp->pc_hashsize = pcp->pc_hashsize * POLLHASHINC; 1170 } 1171 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 1172 KM_SLEEP); 1173 /* 1174 * rehash existing elements 1175 */ 1176 pcp->pc_fdcount = 0; 1177 for (i = 0; i < oldsize; i++) { 1178 pdp = oldtbl[i]; 1179 while (pdp != NULL) { 1180 pdp1 = pdp->pd_hashnext; 1181 pcache_insert_fd(pcp, pdp, nfds); 1182 pdp = pdp1; 1183 #ifdef DEBUG 1184 count++; 1185 #endif 1186 } 1187 } 1188 kmem_free(oldtbl, oldsize * sizeof (polldat_t *)); 1189 ASSERT(pcp->pc_fdcount == count); 1190 } 1191 1192 void 1193 pcache_grow_map(pollcache_t *pcp, int fd) 1194 { 1195 int newsize; 1196 ulong_t *newmap; 1197 1198 /* 1199 * grow to nearest multiple of POLLMAPCHUNK, assuming POLLMAPCHUNK is 1200 * power of 2. 1201 */ 1202 newsize = (fd + POLLMAPCHUNK) & ~(POLLMAPCHUNK - 1); 1203 newmap = kmem_zalloc((newsize / BT_NBIPUL) * sizeof (ulong_t), 1204 KM_SLEEP); 1205 /* 1206 * don't want pollwakeup to set a bit while growing the bitmap. 1207 */ 1208 ASSERT(mutex_owned(&pcp->pc_lock) == 0); 1209 mutex_enter(&pcp->pc_lock); 1210 bcopy(pcp->pc_bitmap, newmap, 1211 (pcp->pc_mapsize / BT_NBIPUL) * sizeof (ulong_t)); 1212 kmem_free(pcp->pc_bitmap, 1213 (pcp->pc_mapsize /BT_NBIPUL) * sizeof (ulong_t)); 1214 pcp->pc_bitmap = newmap; 1215 pcp->pc_mapsize = newsize; 1216 mutex_exit(&pcp->pc_lock); 1217 } 1218 1219 /* 1220 * remove all the reference from pollhead list and fpollinfo lists. 
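 * This is the teardown path used when an lwp exits (see pollcleanup() above):
 * the caller holds ps_lock and, since the pollcache itself is being
 * destroyed, pc_lock is not taken here.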
1221 */ 1222 void 1223 pcache_clean(pollcache_t *pcp) 1224 { 1225 int i; 1226 polldat_t **hashtbl; 1227 polldat_t *pdp; 1228 1229 ASSERT(MUTEX_HELD(&curthread->t_pollstate->ps_lock)); 1230 hashtbl = pcp->pc_hash; 1231 for (i = 0; i < pcp->pc_hashsize; i++) { 1232 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 1233 polldat_disassociate(pdp); 1234 if (pdp->pd_fp != NULL) { 1235 delfpollinfo(pdp->pd_fd); 1236 pdp->pd_fp = NULL; 1237 } 1238 } 1239 } 1240 } 1241 1242 void 1243 pcacheset_invalidate(pollstate_t *ps, polldat_t *pdp) 1244 { 1245 int i; 1246 int fd = pdp->pd_fd; 1247 1248 /* 1249 * we come here because an earlier close() on this cached poll fd. 1250 */ 1251 ASSERT(pdp->pd_fp == NULL); 1252 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1253 pdp->pd_events = 0; 1254 for (i = 0; i < ps->ps_nsets; i++) { 1255 xref_t *refp; 1256 pollcacheset_t *pcsp; 1257 1258 ASSERT(pdp->pd_ref != NULL); 1259 refp = &pdp->pd_ref[i]; 1260 if (refp->xf_refcnt) { 1261 ASSERT(refp->xf_position >= 0); 1262 pcsp = &ps->ps_pcacheset[i]; 1263 if (refp->xf_refcnt == 1) { 1264 pcsp->pcs_pollfd[refp->xf_position].fd = -1; 1265 refp->xf_refcnt = 0; 1266 pdp->pd_count--; 1267 } else if (refp->xf_refcnt > 1) { 1268 int j; 1269 1270 /* 1271 * turn off every appearance in pcs_pollfd list 1272 */ 1273 for (j = refp->xf_position; 1274 j < pcsp->pcs_nfds; j++) { 1275 if (pcsp->pcs_pollfd[j].fd == fd) { 1276 pcsp->pcs_pollfd[j].fd = -1; 1277 refp->xf_refcnt--; 1278 pdp->pd_count--; 1279 } 1280 } 1281 } 1282 ASSERT(refp->xf_refcnt == 0); 1283 refp->xf_position = POLLPOSINVAL; 1284 } 1285 } 1286 ASSERT(pdp->pd_count == 0); 1287 } 1288 1289 /* 1290 * Insert poll fd into the pollcache, and add poll registration. 1291 * This routine is called after getf() and before releasef(). So the vnode 1292 * can not disappear even if we block here. 1293 * If there is an error, the polled fd is not cached. 1294 */ 1295 int 1296 pcache_insert(pollstate_t *ps, file_t *fp, pollfd_t *pollfdp, int *fdcntp, 1297 ssize_t pos, int which) 1298 { 1299 pollcache_t *pcp = ps->ps_pcache; 1300 polldat_t *pdp; 1301 int error; 1302 int fd; 1303 pollhead_t *memphp = NULL; 1304 xref_t *refp; 1305 int newpollfd = 0; 1306 1307 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1308 /* 1309 * The poll caching uses the existing VOP_POLL interface. If there 1310 * is no polled events, we want the polled device to set its "some 1311 * one is sleeping in poll" flag. When the polled events happen 1312 * later, the driver will call pollwakeup(). We achieve this by 1313 * always passing 0 in the third parameter ("anyyet") when calling 1314 * VOP_POLL. This parameter is not looked at by drivers when the 1315 * polled events exist. If a driver chooses to ignore this parameter 1316 * and call pollwakeup whenever the polled events happen, that will 1317 * be OK too. 1318 */ 1319 ASSERT(curthread->t_pollcache == NULL); 1320 error = VOP_POLL(fp->f_vnode, pollfdp->events, 0, &pollfdp->revents, 1321 &memphp, NULL); 1322 if (error) { 1323 return (error); 1324 } 1325 if (pollfdp->revents) { 1326 (*fdcntp)++; 1327 } 1328 /* 1329 * polling the underlying device succeeded. Now we can cache it. 1330 * A close can't come in here because we have not done a releasef() 1331 * yet. 1332 */ 1333 fd = pollfdp->fd; 1334 pdp = pcache_lookup_fd(pcp, fd); 1335 if (pdp == NULL) { 1336 ASSERT(ps->ps_nsets > 0); 1337 pdp = pcache_alloc_fd(ps->ps_nsets); 1338 newpollfd = 1; 1339 } 1340 /* 1341 * If this entry was used to cache a poll fd which was closed, and 1342 * this entry has not been cleaned, do it now. 
1343 */ 1344 if ((pdp->pd_count > 0) && (pdp->pd_fp == NULL)) { 1345 pcacheset_invalidate(ps, pdp); 1346 ASSERT(pdp->pd_next == NULL); 1347 } 1348 if (pdp->pd_count == 0) { 1349 pdp->pd_fd = fd; 1350 pdp->pd_fp = fp; 1351 addfpollinfo(fd); 1352 pdp->pd_thread = curthread; 1353 pdp->pd_pcache = pcp; 1354 /* 1355 * the entry is never used or cleared by removing a cached 1356 * pollfd (pcache_delete_fd). So all the fields should be clear. 1357 */ 1358 ASSERT(pdp->pd_next == NULL); 1359 } 1360 1361 /* 1362 * A polled fd is considered cached. So there should be a fpollinfo 1363 * entry on uf_fpollinfo list. 1364 */ 1365 ASSERT(infpollinfo(fd)); 1366 /* 1367 * If there is an inconsistency, we want to know it here. 1368 */ 1369 ASSERT(pdp->pd_fp == fp); 1370 1371 /* 1372 * XXX pd_events is a union of all polled events on this fd, possibly 1373 * by different threads. Unless this is a new first poll(), pd_events 1374 * never shrinks. If an event is no longer polled by a process, there 1375 * is no way to cancel that event. In that case, poll degrade to its 1376 * old form -- polling on this fd every time poll() is called. The 1377 * assumption is an app always polls the same type of events. 1378 */ 1379 pdp->pd_events |= pollfdp->events; 1380 1381 pdp->pd_count++; 1382 /* 1383 * There is not much special handling for multiple appearances of 1384 * same fd other than xf_position always recording the first 1385 * appearance in poll list. If this is called from pcacheset_cache_list, 1386 * a VOP_POLL is called on every pollfd entry; therefore each 1387 * revents and fdcnt should be set correctly. If this is called from 1388 * pcacheset_resolve, we don't care about fdcnt here. Pollreadmap will 1389 * pick up the right count and handle revents field of each pollfd 1390 * entry. 1391 */ 1392 ASSERT(pdp->pd_ref != NULL); 1393 refp = &pdp->pd_ref[which]; 1394 if (refp->xf_refcnt == 0) { 1395 refp->xf_position = pos; 1396 } else { 1397 /* 1398 * xf_position records the fd's first appearance in poll list 1399 */ 1400 if (pos < refp->xf_position) { 1401 refp->xf_position = pos; 1402 } 1403 } 1404 ASSERT(pollfdp->fd == ps->ps_pollfd[refp->xf_position].fd); 1405 refp->xf_refcnt++; 1406 if (fd >= pcp->pc_mapsize) { 1407 pcache_grow_map(pcp, fd); 1408 } 1409 if (fd > pcp->pc_mapend) { 1410 pcp->pc_mapend = fd; 1411 } 1412 if (newpollfd != 0) { 1413 pcache_insert_fd(ps->ps_pcache, pdp, ps->ps_nfds); 1414 } 1415 if (memphp) { 1416 if (pdp->pd_php == NULL) { 1417 polldat_associate(pdp, memphp); 1418 } else { 1419 if (memphp != pdp->pd_php) { 1420 /* 1421 * layered devices (e.g. console driver) 1422 * may change the vnode and thus the pollhead 1423 * pointer out from underneath us. 1424 */ 1425 polldat_disassociate(pdp); 1426 polldat_associate(pdp, memphp); 1427 } 1428 } 1429 } 1430 /* 1431 * Since there is a considerable window between VOP_POLL and when 1432 * we actually put the polldat struct on the pollhead list, we could 1433 * miss a pollwakeup. In the case of polling additional events, we 1434 * don't update the events until after VOP_POLL. So we could miss 1435 * pollwakeup there too. So we always set the bit here just to be 1436 * safe. The real performance gain is in subsequent pcache_poll. 1437 */ 1438 mutex_enter(&pcp->pc_lock); 1439 BT_SET(pcp->pc_bitmap, fd); 1440 mutex_exit(&pcp->pc_lock); 1441 return (0); 1442 } 1443 1444 /* 1445 * The entry is not really deleted. The fields are cleared so that the 1446 * entry is no longer useful, but it will remain in the hash table for reuse 1447 * later. 
It will be freed when the polling lwp exits. 1448 */ 1449 int 1450 pcache_delete_fd(pollstate_t *ps, int fd, size_t pos, int which, uint_t cevent) 1451 { 1452 pollcache_t *pcp = ps->ps_pcache; 1453 polldat_t *pdp; 1454 xref_t *refp; 1455 1456 ASSERT(fd < pcp->pc_mapsize); 1457 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1458 1459 pdp = pcache_lookup_fd(pcp, fd); 1460 ASSERT(pdp != NULL); 1461 ASSERT(pdp->pd_count > 0); 1462 ASSERT(pdp->pd_ref != NULL); 1463 refp = &pdp->pd_ref[which]; 1464 if (pdp->pd_count == 1) { 1465 pdp->pd_events = 0; 1466 refp->xf_position = POLLPOSINVAL; 1467 ASSERT(refp->xf_refcnt == 1); 1468 refp->xf_refcnt = 0; 1469 1470 /* 1471 * It is possible for a wakeup thread to get ahead of the 1472 * following polldat_disassociate and set the bit in bitmap. 1473 * That is OK because the bit will be cleared here anyway. 1474 */ 1475 polldat_disassociate(pdp); 1476 1477 pdp->pd_count = 0; 1478 if (pdp->pd_fp != NULL) { 1479 pdp->pd_fp = NULL; 1480 delfpollinfo(fd); 1481 } 1482 mutex_enter(&pcp->pc_lock); 1483 BT_CLEAR(pcp->pc_bitmap, fd); 1484 mutex_exit(&pcp->pc_lock); 1485 return (0); 1486 } 1487 if ((cevent & POLLCLOSED) == POLLCLOSED) { 1488 /* 1489 * fd cached here has been closed. This is the first 1490 * pcache_delete_fd called after the close. Clean up the 1491 * entire entry. 1492 */ 1493 pcacheset_invalidate(ps, pdp); 1494 ASSERT(pdp->pd_php == NULL); 1495 mutex_enter(&pcp->pc_lock); 1496 BT_CLEAR(pcp->pc_bitmap, fd); 1497 mutex_exit(&pcp->pc_lock); 1498 return (0); 1499 } 1500 #ifdef DEBUG 1501 if (getf(fd) != NULL) { 1502 ASSERT(infpollinfo(fd)); 1503 releasef(fd); 1504 } 1505 #endif /* DEBUG */ 1506 pdp->pd_count--; 1507 ASSERT(refp->xf_refcnt > 0); 1508 if (--refp->xf_refcnt == 0) { 1509 refp->xf_position = POLLPOSINVAL; 1510 } else { 1511 ASSERT(pos >= refp->xf_position); 1512 if (pos == refp->xf_position) { 1513 /* 1514 * The xref position is no longer valid. 1515 * Reset it to a special value and let 1516 * caller know it needs to updatexref() 1517 * with a new xf_position value. 1518 */ 1519 refp->xf_position = POLLPOSTRANS; 1520 return (1); 1521 } 1522 } 1523 return (0); 1524 } 1525 1526 void 1527 pcache_update_xref(pollcache_t *pcp, int fd, ssize_t pos, int which) 1528 { 1529 polldat_t *pdp; 1530 1531 pdp = pcache_lookup_fd(pcp, fd); 1532 ASSERT(pdp != NULL); 1533 ASSERT(pdp->pd_ref != NULL); 1534 pdp->pd_ref[which].xf_position = pos; 1535 } 1536 1537 #ifdef DEBUG 1538 /* 1539 * For each polled fd, it's either in the bitmap or cached in 1540 * pcache hash table. If this routine returns 0, something is wrong. 1541 */ 1542 static int 1543 pollchecksanity(pollstate_t *ps, nfds_t nfds) 1544 { 1545 int i; 1546 int fd; 1547 pollcache_t *pcp = ps->ps_pcache; 1548 polldat_t *pdp; 1549 pollfd_t *pollfdp = ps->ps_pollfd; 1550 file_t *fp; 1551 1552 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1553 for (i = 0; i < nfds; i++) { 1554 fd = pollfdp[i].fd; 1555 if (fd < 0) { 1556 ASSERT(pollfdp[i].revents == 0); 1557 continue; 1558 } 1559 if (pollfdp[i].revents == POLLNVAL) 1560 continue; 1561 if ((fp = getf(fd)) == NULL) 1562 continue; 1563 pdp = pcache_lookup_fd(pcp, fd); 1564 ASSERT(pdp != NULL); 1565 ASSERT(infpollinfo(fd)); 1566 ASSERT(pdp->pd_fp == fp); 1567 releasef(fd); 1568 if (BT_TEST(pcp->pc_bitmap, fd)) 1569 continue; 1570 if (pdp->pd_php == NULL) 1571 return (0); 1572 } 1573 return (1); 1574 } 1575 #endif /* DEBUG */ 1576 1577 /* 1578 * resolve the difference between the current poll list and a cached one. 
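 * On success the number of fds found ready while resolving is returned
 * through *fdcntp; on failure any newly allocated cache list is freed so
 * that the cached state stays consistent.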
 */
int
pcacheset_resolve(pollstate_t *ps, nfds_t nfds, int *fdcntp, int which)
{
	int i;
	pollcache_t *pcp = ps->ps_pcache;
	pollfd_t *newlist = NULL;
	pollfd_t *current = ps->ps_pollfd;
	pollfd_t *cached;
	pollcacheset_t *pcsp;
	int common;
	int count = 0;
	int offset;
	int remain;
	int fd;
	file_t *fp;
	int fdcnt = 0;
	int cnt = 0;
	nfds_t old_nfds;
	int error = 0;
	int mismatch = 0;

	ASSERT(MUTEX_HELD(&ps->ps_lock));
#ifdef DEBUG
	checkpolldat(ps);
#endif
	pcsp = &ps->ps_pcacheset[which];
	old_nfds = pcsp->pcs_nfds;
	common = (nfds > old_nfds) ? old_nfds : nfds;
	if (nfds != old_nfds) {
		/*
		 * The length of the poll list has changed. Allocate a new
		 * pollfd list.
		 */
		newlist = kmem_alloc(nfds * sizeof (pollfd_t), KM_SLEEP);
		bcopy(current, newlist, sizeof (pollfd_t) * nfds);
	}
	/*
	 * Compare the overlapping part of the current fd list with the
	 * cached one. Whenever a difference is found, resolve it.
	 * The comparison is done on the current poll list and the
	 * cached list. But we may be setting up the newlist to be the
	 * cached list for the next poll.
	 */
	cached = pcsp->pcs_pollfd;
	remain = common;

	while (count < common) {
		int tmpfd;
		pollfd_t *np;

		np = (newlist != NULL) ? &newlist[count] : NULL;
		offset = pcacheset_cmp(&current[count], &cached[count], np,
		    remain);
		/*
		 * Collect stats. If the lists match completely on the first
		 * pass, it's a hit. Otherwise, it's a partial hit or miss.
		 */
		if ((count == 0) && (offset == common)) {
			pollstats.pollcachehit.value.ui64++;
		} else {
			mismatch++;
		}
		count += offset;
		if (offset < remain) {
			ASSERT(count < common);
			ASSERT((current[count].fd != cached[count].fd) ||
			    (current[count].events != cached[count].events));
			/*
			 * Filter out invalid events.
			 */
			if (current[count].events & ~VALID_POLL_EVENTS) {
				if (newlist != NULL) {
					newlist[count].events =
					    current[count].events &=
					    VALID_POLL_EVENTS;
				} else {
					current[count].events &=
					    VALID_POLL_EVENTS;
				}
			}
			/*
			 * When resolving a difference, we always remove the
			 * fd from the cache before inserting one into the
			 * cache.
			 */
			if (cached[count].fd >= 0) {
				tmpfd = cached[count].fd;
				if (pcache_delete_fd(ps, tmpfd, count, which,
				    (uint_t)cached[count].events)) {
					/*
					 * This should be rare but needed for
					 * correctness.
					 *
					 * The first appearance in the cached
					 * list is being "turned off". The
					 * same fd appears more than once in
					 * the cached poll list. Find the next
					 * one on the list and update the
					 * cached xf_position field.
					 */
					for (i = count + 1; i < old_nfds; i++) {
						if (cached[i].fd == tmpfd) {
							pcache_update_xref(pcp,
							    tmpfd, (ssize_t)i,
							    which);
							break;
						}
					}
					ASSERT(i <= old_nfds);
				}
				/*
				 * In case a new cache list is allocated,
				 * need to keep both cache lists in sync
				 * b/c the new one can be freed if we have
				 * an error later.
				 */
				cached[count].fd = -1;
				if (newlist != NULL) {
					newlist[count].fd = -1;
				}
			}
			if ((tmpfd = current[count].fd) >= 0) {
				/*
				 * Add to the cached fd tbl and bitmap.
				 */
				if ((fp = getf(tmpfd)) == NULL) {
					current[count].revents = POLLNVAL;
					if (newlist != NULL) {
						newlist[count].fd = -1;
					}
					cached[count].fd = -1;
					fdcnt++;
				} else {
					/*
					 * Here we don't care about the
					 * fdcnt. We will examine the bitmap
					 * later and pick up the correct
					 * fdcnt there. So we never bother
					 * to check the value of 'cnt'.
					 */
					error = pcache_insert(ps, fp,
					    &current[count], &cnt,
					    (ssize_t)count, which);
					/*
					 * If no error, we want to do releasef
					 * after we update the cached poll
					 * list entry so that close() won't
					 * race us.
					 */
					if (error) {
						/*
						 * If we encountered an error,
						 * we have invalidated an
						 * entry in the cached poll list
						 * (in pcache_delete_fd() above)
						 * but failed to add one here.
						 * This is OK b/c what's in the
						 * cached list is consistent
						 * with the content of the
						 * cache. It will not have any
						 * ill effect on the next
						 * poll().
						 */
						releasef(tmpfd);
						if (newlist != NULL) {
							kmem_free(newlist,
							    nfds *
							    sizeof (pollfd_t));
						}
						return (error);
					}
					/*
					 * If we have allocated a new (temp)
					 * cache list, we need to keep both
					 * in sync b/c the new one can be freed
					 * if we have an error later.
					 */
					if (newlist != NULL) {
						newlist[count].fd =
						    current[count].fd;
						newlist[count].events =
						    current[count].events;
					}
					cached[count].fd = current[count].fd;
					cached[count].events =
					    current[count].events;
					releasef(tmpfd);
				}
			} else {
				current[count].revents = 0;
			}
			count++;
			remain = common - count;
		}
	}
	if (mismatch != 0) {
		if (mismatch == common) {
			pollstats.pollcachemiss.value.ui64++;
		} else {
			pollstats.pollcachephit.value.ui64++;
		}
	}
	/*
	 * Take care of the non-overlapping part of the list.
	 */
	if (nfds > old_nfds) {
		ASSERT(newlist != NULL);
		for (i = old_nfds; i < nfds; i++) {
			/* filter out invalid events */
			if (current[i].events & ~VALID_POLL_EVENTS) {
				newlist[i].events = current[i].events =
				    current[i].events & VALID_POLL_EVENTS;
			}
			if ((fd = current[i].fd) < 0) {
				current[i].revents = 0;
				continue;
			}
			/*
			 * Add to the cached fd tbl and bitmap.
			 */
			if ((fp = getf(fd)) == NULL) {
				current[i].revents = POLLNVAL;
				newlist[i].fd = -1;
				fdcnt++;
				continue;
			}
			/*
			 * Here we don't care about the
			 * fdcnt. We will examine the bitmap
			 * later and pick up the correct
			 * fdcnt there. So we never bother to
			 * check 'cnt'.
			 */
			error = pcache_insert(ps, fp, &current[i], &cnt,
			    (ssize_t)i, which);
			releasef(fd);
			if (error) {
				/*
				 * Here we are halfway through adding newly
				 * polled fds. Undo enough to keep the cache
				 * list consistent with the cache content.
				 */
				pcacheset_remove_list(ps, current, old_nfds,
				    i, which, 0);
				kmem_free(newlist, nfds * sizeof (pollfd_t));
				return (error);
			}
		}
	}
	if (old_nfds > nfds) {
		/*
		 * Remove the fds which are no longer polled.
		 */
		pcacheset_remove_list(ps, pcsp->pcs_pollfd, nfds, old_nfds,
		    which, 1);
	}
	/*
	 * Set difference resolved. Update nfds and cachedlist
	 * in the pollstate struct.
1837 */ 1838 if (newlist != NULL) { 1839 kmem_free(pcsp->pcs_pollfd, old_nfds * sizeof (pollfd_t)); 1840 /* 1841 * By now, the pollfd.revents field should 1842 * all be zeroed. 1843 */ 1844 pcsp->pcs_pollfd = newlist; 1845 pcsp->pcs_nfds = nfds; 1846 } 1847 ASSERT(*fdcntp == 0); 1848 *fdcntp = fdcnt; 1849 /* 1850 * By now for every fd in pollfdp, one of the following should be 1851 * true. Otherwise we will miss a polled event. 1852 * 1853 * 1. the bit corresponding to the fd in bitmap is set. So VOP_POLL 1854 * will be called on this fd in next poll. 1855 * 2. the fd is cached in the pcache (i.e. pd_php is set). So 1856 * pollnotify will happen. 1857 */ 1858 ASSERT(pollchecksanity(ps, nfds)); 1859 /* 1860 * make sure cross reference between cached poll lists and cached 1861 * poll fds are correct. 1862 */ 1863 ASSERT(pollcheckxref(ps, which)); 1864 /* 1865 * ensure each polldat in pollcache reference a polled fd in 1866 * pollcacheset. 1867 */ 1868 #ifdef DEBUG 1869 checkpolldat(ps); 1870 #endif 1871 return (0); 1872 } 1873 1874 #ifdef DEBUG 1875 static int 1876 pollscanrevents(pollcache_t *pcp, pollfd_t *pollfdp, nfds_t nfds) 1877 { 1878 int i; 1879 int reventcnt = 0; 1880 1881 for (i = 0; i < nfds; i++) { 1882 if (pollfdp[i].fd < 0) { 1883 ASSERT(pollfdp[i].revents == 0); 1884 continue; 1885 } 1886 if (pollfdp[i].revents) { 1887 reventcnt++; 1888 } 1889 if (pollfdp[i].revents && (pollfdp[i].revents != POLLNVAL)) { 1890 ASSERT(BT_TEST(pcp->pc_bitmap, pollfdp[i].fd)); 1891 } 1892 } 1893 return (reventcnt); 1894 } 1895 #endif /* DEBUG */ 1896 1897 /* 1898 * read the bitmap and poll on fds corresponding to the '1' bits. The ps_lock 1899 * is held upon entry. 1900 */ 1901 int 1902 pcache_poll(pollfd_t *pollfdp, pollstate_t *ps, nfds_t nfds, int *fdcntp, 1903 int which) 1904 { 1905 int i; 1906 pollcache_t *pcp; 1907 int fd; 1908 int begin, end, done; 1909 pollhead_t *php; 1910 int fdcnt; 1911 int error = 0; 1912 file_t *fp; 1913 polldat_t *pdp; 1914 xref_t *refp; 1915 int entry; 1916 1917 pcp = ps->ps_pcache; 1918 ASSERT(MUTEX_HELD(&ps->ps_lock)); 1919 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 1920 retry: 1921 done = 0; 1922 begin = 0; 1923 fdcnt = 0; 1924 end = pcp->pc_mapend; 1925 while ((fdcnt < nfds) && !done) { 1926 php = NULL; 1927 /* 1928 * only poll fds which may have events 1929 */ 1930 fd = bt_getlowbit(pcp->pc_bitmap, begin, end); 1931 ASSERT(fd <= end); 1932 if (fd >= 0) { 1933 ASSERT(pollcheckrevents(ps, begin, fd, which)); 1934 /* 1935 * adjust map pointers for next round 1936 */ 1937 if (fd == end) { 1938 done = 1; 1939 } else { 1940 begin = fd + 1; 1941 } 1942 /* 1943 * A bitmap caches poll state information of 1944 * multiple poll lists. Call VOP_POLL only if 1945 * the bit corresponds to an fd in this poll 1946 * list. 1947 */ 1948 pdp = pcache_lookup_fd(pcp, fd); 1949 ASSERT(pdp != NULL); 1950 ASSERT(pdp->pd_ref != NULL); 1951 refp = &pdp->pd_ref[which]; 1952 if (refp->xf_refcnt == 0) 1953 continue; 1954 entry = refp->xf_position; 1955 ASSERT((entry >= 0) && (entry < nfds)); 1956 ASSERT(pollfdp[entry].fd == fd); 1957 /* 1958 * we are in this routine implies that we have 1959 * successfully polled this fd in the past. 1960 * Check to see this fd is closed while we are 1961 * blocked in poll. This ensures that we don't 1962 * miss a close on the fd in the case this fd is 1963 * reused. 
1964 */ 1965 if (pdp->pd_fp == NULL) { 1966 ASSERT(pdp->pd_count > 0); 1967 pollfdp[entry].revents = POLLNVAL; 1968 fdcnt++; 1969 if (refp->xf_refcnt > 1) { 1970 /* 1971 * this fd appeared multiple times 1972 * in the poll list. Find all of them. 1973 */ 1974 for (i = entry + 1; i < nfds; i++) { 1975 if (pollfdp[i].fd == fd) { 1976 pollfdp[i].revents = 1977 POLLNVAL; 1978 fdcnt++; 1979 } 1980 } 1981 } 1982 pcacheset_invalidate(ps, pdp); 1983 continue; 1984 } 1985 /* 1986 * We can be here polling a device that is being 1987 * closed (i.e. the file pointer is set to NULL, 1988 * but pollcacheclean has not happened yet). 1989 */ 1990 if ((fp = getf(fd)) == NULL) { 1991 pollfdp[entry].revents = POLLNVAL; 1992 fdcnt++; 1993 if (refp->xf_refcnt > 1) { 1994 /* 1995 * this fd appeared multiple times 1996 * in the poll list. Find all of them. 1997 */ 1998 for (i = entry + 1; i < nfds; i++) { 1999 if (pollfdp[i].fd == fd) { 2000 pollfdp[i].revents = 2001 POLLNVAL; 2002 fdcnt++; 2003 } 2004 } 2005 } 2006 continue; 2007 } 2008 ASSERT(pdp->pd_fp == fp); 2009 ASSERT(infpollinfo(fd)); 2010 /* 2011 * Since we no longer hold poll head lock across 2012 * VOP_POLL, pollunlock logic can be simplified. 2013 */ 2014 ASSERT(pdp->pd_php == NULL || 2015 MUTEX_NOT_HELD(PHLOCK(pdp->pd_php))); 2016 /* 2017 * The underlying file system may set a "pollpending" 2018 * flag when it sees that the poll may block. Pollwakeup() 2019 * is called by the wakeup thread if pollpending is set. 2020 * Pass a 0 fdcnt so that the underlying file system 2021 * will set the "pollpending" flag when there are 2022 * no polled events. 2023 * 2024 * Use pollfdp[].events for actual polling because 2025 * the pd_events is the union of all cached poll events 2026 * on this fd. The events parameter also affects 2027 * how the polled device sets the "poll pending" 2028 * flag. 2029 */ 2030 ASSERT(curthread->t_pollcache == NULL); 2031 error = VOP_POLL(fp->f_vnode, pollfdp[entry].events, 0, 2032 &pollfdp[entry].revents, &php, NULL); 2033 /* 2034 * releasef only after we are completely done with this 2035 * cached poll entry, to prevent close() from coming in 2036 * and clearing the entry. 2037 */ 2038 if (error) { 2039 releasef(fd); 2040 break; 2041 } 2042 /* 2043 * layered devices (e.g. console driver) 2044 * may change the vnode and thus the pollhead 2045 * pointer out from underneath us. 2046 */ 2047 if (php != NULL && pdp->pd_php != NULL && 2048 php != pdp->pd_php) { 2049 releasef(fd); 2050 polldat_disassociate(pdp); 2051 polldat_associate(pdp, php); 2052 /* 2053 * We could have missed a wakeup on the new 2054 * target device. Make sure the new target 2055 * gets polled once. 2056 */ 2057 BT_SET(pcp->pc_bitmap, fd); 2058 goto retry; 2059 } 2060 2061 if (pollfdp[entry].revents) { 2062 ASSERT(refp->xf_refcnt >= 1); 2063 fdcnt++; 2064 if (refp->xf_refcnt > 1) { 2065 /* 2066 * this fd appeared multiple times 2067 * in the poll list. This is rare but 2068 * we have to look at all of them for 2069 * correctness. 2070 */ 2071 error = plist_chkdupfd(fp, pdp, ps, 2072 pollfdp, entry, &fdcnt); 2073 if (error > 0) { 2074 releasef(fd); 2075 break; 2076 } 2077 if (error < 0) { 2078 goto retry; 2079 } 2080 } 2081 releasef(fd); 2082 } else { 2083 /* 2084 * VOP_POLL didn't return any revents. We can 2085 * clear the bit in the bitmap only if we have the 2086 * pollhead ptr cached and no other cached 2087 * entry is polling different events on this fd. 2088 * VOP_POLL may have dropped the ps_lock. Make 2089 * sure pollwakeup has not happened before clearing 2090 * the bit.
2091 */ 2092 if ((pdp->pd_php != NULL) && 2093 (pollfdp[entry].events == pdp->pd_events) && 2094 ((pcp->pc_flag & PC_POLLWAKE) == 0)) { 2095 BT_CLEAR(pcp->pc_bitmap, fd); 2096 } 2097 /* 2098 * if the fd can be cached now but not before, 2099 * do it now. 2100 */ 2101 if ((pdp->pd_php == NULL) && (php != NULL)) { 2102 polldat_associate(pdp, php); 2103 /* 2104 * We are inserting a polldat struct for 2105 * the first time. We may have missed a 2106 * wakeup on this device. Re-poll once. 2107 * This should be a rare event. 2108 */ 2109 releasef(fd); 2110 goto retry; 2111 } 2112 if (refp->xf_refcnt > 1) { 2113 /* 2114 * this fd appeared multiple times 2115 * in the poll list. This is rare but 2116 * we have to look at all of them for 2117 * correctness. 2118 */ 2119 error = plist_chkdupfd(fp, pdp, ps, 2120 pollfdp, entry, &fdcnt); 2121 if (error > 0) { 2122 releasef(fd); 2123 break; 2124 } 2125 if (error < 0) { 2126 goto retry; 2127 } 2128 } 2129 releasef(fd); 2130 } 2131 } else { 2132 done = 1; 2133 ASSERT(pollcheckrevents(ps, begin, end + 1, which)); 2134 } 2135 } 2136 if (!error) { 2137 ASSERT(*fdcntp + fdcnt == pollscanrevents(pcp, pollfdp, nfds)); 2138 *fdcntp += fdcnt; 2139 } 2140 return (error); 2141 } 2142 2143 /* 2144 * Going through the poll list without much locking. Poll all fds and 2145 * cache all valid fds in the pollcache. 2146 */ 2147 int 2148 pcacheset_cache_list(pollstate_t *ps, pollfd_t *fds, int *fdcntp, int which) 2149 { 2150 pollfd_t *pollfdp = ps->ps_pollfd; 2151 pollcacheset_t *pcacheset = ps->ps_pcacheset; 2152 pollfd_t *newfdlist; 2153 int i; 2154 int fd; 2155 file_t *fp; 2156 int error = 0; 2157 2158 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2159 ASSERT(which < ps->ps_nsets); 2160 ASSERT(pcacheset != NULL); 2161 ASSERT(pcacheset[which].pcs_pollfd == NULL); 2162 newfdlist = kmem_alloc(ps->ps_nfds * sizeof (pollfd_t), KM_SLEEP); 2163 /* 2164 * cache the new poll list in the pollcacheset. 2165 */ 2166 bcopy(pollfdp, newfdlist, sizeof (pollfd_t) * ps->ps_nfds); 2167 2168 pcacheset[which].pcs_pollfd = newfdlist; 2169 pcacheset[which].pcs_nfds = ps->ps_nfds; 2170 pcacheset[which].pcs_usradr = (uintptr_t)fds; 2171 2172 /* 2173 * We have saved a copy of the current poll fd list in one pollcacheset. 2174 * The 'revents' field of the new list is not yet set to 0. Looping 2175 * through the new list just to do that would be expensive, so we do it 2176 * while polling the list. 2177 */ 2178 for (i = 0; i < ps->ps_nfds; i++) { 2179 fd = pollfdp[i].fd; 2180 /* 2181 * We also filter out the illegal poll events in the event 2182 * field for the cached poll list/set. 2183 */ 2184 if (pollfdp[i].events & ~VALID_POLL_EVENTS) { 2185 newfdlist[i].events = pollfdp[i].events = 2186 pollfdp[i].events & VALID_POLL_EVENTS; 2187 } 2188 if (fd < 0) { 2189 pollfdp[i].revents = 0; 2190 continue; 2191 } 2192 if ((fp = getf(fd)) == NULL) { 2193 pollfdp[i].revents = POLLNVAL; 2194 /* 2195 * invalidate this cache entry in the cached poll list 2196 */ 2197 newfdlist[i].fd = -1; 2198 (*fdcntp)++; 2199 continue; 2200 } 2201 /* 2202 * cache this fd. 2203 */ 2204 error = pcache_insert(ps, fp, &pollfdp[i], fdcntp, (ssize_t)i, 2205 which); 2206 releasef(fd); 2207 if (error) { 2208 /* 2209 * Here we are halfway through caching a new 2210 * poll list. Undo everything.
2211 */ 2212 pcacheset_remove_list(ps, pollfdp, 0, i, which, 0); 2213 kmem_free(newfdlist, ps->ps_nfds * sizeof (pollfd_t)); 2214 pcacheset[which].pcs_pollfd = NULL; 2215 pcacheset[which].pcs_usradr = (uintptr_t)NULL; 2216 break; 2217 } 2218 } 2219 return (error); 2220 } 2221 2222 /* 2223 * called by pollcacheclean() to set the fp to NULL. It also sets the polled 2224 * events in pcacheset entries to the special event 'POLLCLOSED'. Do a 2225 * pollwakeup to wake any sleeping poller, then remove the polldat from the 2226 * driver. The routine is called with ps_lock held. 2227 */ 2228 void 2229 pcache_clean_entry(pollstate_t *ps, int fd) 2230 { 2231 pollcache_t *pcp; 2232 polldat_t *pdp; 2233 int i; 2234 2235 ASSERT(ps != NULL); 2236 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2237 pcp = ps->ps_pcache; 2238 ASSERT(pcp); 2239 pdp = pcache_lookup_fd(pcp, fd); 2240 ASSERT(pdp != NULL); 2241 /* 2242 * the corresponding fpollinfo in fi_list has been removed by 2243 * a close on this fd. Reset the cached fp ptr here. 2244 */ 2245 pdp->pd_fp = NULL; 2246 /* 2247 * XXX - This routine also touches data in the pcacheset struct. 2248 * 2249 * set the event in cached poll lists to POLLCLOSED. This invalidates 2250 * the cached poll fd entry in that poll list, which will force a 2251 * removal of this cached entry in the next poll(). The cleanup is done 2252 * at removal time. 2253 */ 2254 ASSERT(pdp->pd_ref != NULL); 2255 for (i = 0; i < ps->ps_nsets; i++) { 2256 xref_t *refp; 2257 pollcacheset_t *pcsp; 2258 2259 refp = &pdp->pd_ref[i]; 2260 if (refp->xf_refcnt) { 2261 ASSERT(refp->xf_position >= 0); 2262 pcsp = &ps->ps_pcacheset[i]; 2263 if (refp->xf_refcnt == 1) { 2264 pcsp->pcs_pollfd[refp->xf_position].events = 2265 (short)POLLCLOSED; 2266 } 2267 if (refp->xf_refcnt > 1) { 2268 int j; 2269 /* 2270 * mark every matching entry in pcs_pollfd 2271 */ 2272 for (j = refp->xf_position; 2273 j < pcsp->pcs_nfds; j++) { 2274 if (pcsp->pcs_pollfd[j].fd == fd) { 2275 pcsp->pcs_pollfd[j].events = 2276 (short)POLLCLOSED; 2277 } 2278 } 2279 } 2280 } 2281 } 2282 if (pdp->pd_php) { 2283 /* 2284 * Using pdp->pd_php is a bit risky here, as we lack any 2285 * protection from a racing close operation which could free 2286 * that pollhead prior to pollwakeup() acquiring the locks 2287 * necessary to make it safe. 2288 */ 2289 pollwakeup(pdp->pd_php, POLLHUP); 2290 polldat_disassociate(pdp); 2291 } 2292 } 2293 2294 void 2295 pcache_wake_parents(pollcache_t *pcp) 2296 { 2297 pcachelink_t *pl, *pln; 2298 2299 ASSERT(MUTEX_HELD(&pcp->pc_lock)); 2300 2301 for (pl = pcp->pc_parents; pl != NULL; pl = pln) { 2302 mutex_enter(&pl->pcl_lock); 2303 if (pl->pcl_state == PCL_VALID) { 2304 ASSERT(pl->pcl_parent_pc != NULL); 2305 cv_broadcast(&pl->pcl_parent_pc->pc_cv); 2306 } 2307 pln = pl->pcl_parent_next; 2308 mutex_exit(&pl->pcl_lock); 2309 } 2310 } 2311 2312 /* 2313 * Initialize the thread pollstate structure. 2314 * It will persist for the life of the thread, until it calls pollcleanup(). 2315 */ 2316 pollstate_t * 2317 pollstate_create() 2318 { 2319 pollstate_t *ps = curthread->t_pollstate; 2320 2321 if (ps == NULL) { 2322 /* 2323 * This is the first time this thread has ever polled, so we 2324 * have to create its pollstate structure.
2325 */ 2326 ps = kmem_zalloc(sizeof (pollstate_t), KM_SLEEP); 2327 ps->ps_nsets = POLLFDSETS; 2328 ps->ps_pcacheset = pcacheset_create(ps->ps_nsets); 2329 curthread->t_pollstate = ps; 2330 } else { 2331 ASSERT(ps->ps_depth == 0); 2332 ASSERT(ps->ps_flags == 0); 2333 ASSERT(ps->ps_pc_stack[0] == 0); 2334 } 2335 return (ps); 2336 } 2337 2338 void 2339 pollstate_destroy(pollstate_t *ps) 2340 { 2341 if (ps->ps_pollfd != NULL) { 2342 kmem_free(ps->ps_pollfd, ps->ps_nfds * sizeof (pollfd_t)); 2343 ps->ps_pollfd = NULL; 2344 } 2345 if (ps->ps_pcache != NULL) { 2346 pcache_destroy(ps->ps_pcache); 2347 ps->ps_pcache = NULL; 2348 } 2349 pcacheset_destroy(ps->ps_pcacheset, ps->ps_nsets); 2350 ps->ps_pcacheset = NULL; 2351 if (ps->ps_dpbuf != NULL) { 2352 kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); 2353 ps->ps_dpbuf = NULL; 2354 } 2355 mutex_destroy(&ps->ps_lock); 2356 kmem_free(ps, sizeof (pollstate_t)); 2357 } 2358 2359 static int 2360 pollstate_contend(pollstate_t *ps, pollcache_t *pcp) 2361 { 2362 pollstate_t *rem, *next; 2363 pollcache_t *desired_pc; 2364 int result = 0, depth_total; 2365 2366 mutex_enter(&pollstate_contenders_lock); 2367 /* 2368 * There is a small chance that the pollcache of interest became 2369 * available while we were waiting on the contenders lock. 2370 */ 2371 if (mutex_tryenter(&pcp->pc_lock) != 0) { 2372 goto out; 2373 } 2374 2375 /* 2376 * Walk the list of contended pollstates, searching for evidence of a 2377 * deadlock condition. 2378 */ 2379 depth_total = ps->ps_depth; 2380 desired_pc = pcp; 2381 for (rem = pollstate_contenders; rem != NULL; rem = next) { 2382 int i, j; 2383 next = rem->ps_contend_nextp; 2384 2385 /* Is this pollstate holding the pollcache of interest? */ 2386 for (i = 0; i < rem->ps_depth; i++) { 2387 if (rem->ps_pc_stack[i] != desired_pc) { 2388 continue; 2389 } 2390 2391 /* 2392 * The remote pollstate holds the pollcache lock we 2393 * desire. If it is waiting on a pollcache we hold, 2394 * then we can report the obvious deadlock. 2395 */ 2396 ASSERT(rem->ps_contend_pc != NULL); 2397 for (j = 0; j < ps->ps_depth; j++) { 2398 if (rem->ps_contend_pc == ps->ps_pc_stack[j]) { 2399 rem->ps_flags |= POLLSTATE_STALEMATE; 2400 result = -1; 2401 goto out; 2402 } 2403 } 2404 2405 /* 2406 * The remote pollstate is not blocking on a pollcache 2407 * which would deadlock against us. That pollcache 2408 * may, however, be held by a pollstate which would 2409 * result in a deadlock. 2410 * 2411 * To detect such a condition, we continue walking 2412 * through the list using the pollcache blocking the 2413 * remote thread as our new search target. 2414 * 2415 * Return to the front of pollstate_contenders since it 2416 * is not ordered to guarantee complete dependency 2417 * traversal. The below depth tracking places an upper 2418 * bound on iterations. 2419 */ 2420 desired_pc = rem->ps_contend_pc; 2421 next = pollstate_contenders; 2422 2423 /* 2424 * The recursion depth of the remote pollstate is used 2425 * to calculate a final depth for the local /dev/poll 2426 * recursion, since those locks will be acquired 2427 * eventually. If that value exceeds the defined 2428 * limit, we can report the failure now instead of 2429 * recursing to that failure depth. 2430 */ 2431 depth_total += (rem->ps_depth - i); 2432 if (depth_total >= POLLMAXDEPTH) { 2433 result = -1; 2434 goto out; 2435 } 2436 } 2437 } 2438 2439 /* 2440 * No deadlock partner was found. 
The only course of action is to 2441 * record ourselves as a contended pollstate and wait for the pollcache 2442 * mutex to become available. 2443 */ 2444 ps->ps_contend_pc = pcp; 2445 ps->ps_contend_nextp = pollstate_contenders; 2446 ps->ps_contend_pnextp = &pollstate_contenders; 2447 if (pollstate_contenders != NULL) { 2448 pollstate_contenders->ps_contend_pnextp = 2449 &ps->ps_contend_nextp; 2450 } 2451 pollstate_contenders = ps; 2452 2453 mutex_exit(&pollstate_contenders_lock); 2454 mutex_enter(&pcp->pc_lock); 2455 mutex_enter(&pollstate_contenders_lock); 2456 2457 /* 2458 * Our acquisition of the pollcache mutex may be due to another thread 2459 * giving up in the face of deadlock with us. If that is the case, 2460 * we too should report the failure. 2461 */ 2462 if ((ps->ps_flags & POLLSTATE_STALEMATE) != 0) { 2463 result = -1; 2464 ps->ps_flags &= ~POLLSTATE_STALEMATE; 2465 mutex_exit(&pcp->pc_lock); 2466 } 2467 2468 /* Remove ourselves from the contenders list. */ 2469 if (ps->ps_contend_nextp != NULL) { 2470 ps->ps_contend_nextp->ps_contend_pnextp = 2471 ps->ps_contend_pnextp; 2472 } 2473 *ps->ps_contend_pnextp = ps->ps_contend_nextp; 2474 ps->ps_contend_pc = NULL; 2475 ps->ps_contend_nextp = NULL; 2476 ps->ps_contend_pnextp = NULL; 2477 2478 out: 2479 mutex_exit(&pollstate_contenders_lock); 2480 return (result); 2481 } 2482 2483 int 2484 pollstate_enter(pollcache_t *pcp) 2485 { 2486 pollstate_t *ps = curthread->t_pollstate; 2487 int i; 2488 2489 if (ps == NULL) { 2490 /* 2491 * The thread pollstate may not be initialized if VOP_POLL is 2492 * called on a recursion-enabled /dev/poll handle from outside 2493 * the poll() or /dev/poll codepaths. 2494 */ 2495 return (PSE_FAIL_POLLSTATE); 2496 } 2497 if (ps->ps_depth >= POLLMAXDEPTH) { 2498 return (PSE_FAIL_DEPTH); 2499 } 2500 /* 2501 * Check the desired pollcache against pollcaches we already have 2502 * locked. Such a loop is the simplest deadlock scenario. 2503 */ 2504 for (i = 0; i < ps->ps_depth; i++) { 2505 if (ps->ps_pc_stack[i] == pcp) { 2506 return (PSE_FAIL_LOOP); 2507 } 2508 } 2509 ASSERT(ps->ps_pc_stack[i] == NULL); 2510 2511 if (ps->ps_depth == 0) { 2512 /* Locking the initial pollcache requires no caution */ 2513 mutex_enter(&pcp->pc_lock); 2514 } else if (mutex_tryenter(&pcp->pc_lock) == 0) { 2515 if (pollstate_contend(ps, pcp) != 0) { 2516 /* This pollcache cannot safely be locked. */ 2517 return (PSE_FAIL_DEADLOCK); 2518 } 2519 } 2520 2521 ps->ps_pc_stack[ps->ps_depth++] = pcp; 2522 return (PSE_SUCCESS); 2523 } 2524 2525 void 2526 pollstate_exit(pollcache_t *pcp) 2527 { 2528 pollstate_t *ps = curthread->t_pollstate; 2529 2530 VERIFY(ps != NULL); 2531 VERIFY(ps->ps_pc_stack[ps->ps_depth - 1] == pcp); 2532 2533 mutex_exit(&pcp->pc_lock); 2534 ps->ps_pc_stack[--ps->ps_depth] = NULL; 2535 VERIFY(ps->ps_depth >= 0); 2536 } 2537 2538 2539 /* 2540 * We are holding the appropriate uf_lock when entering this routine. 2541 * Bump up the ps_busy count to prevent the thread from exiting. 2542 */ 2543 void 2544 pollblockexit(fpollinfo_t *fpip) 2545 { 2546 for (; fpip; fpip = fpip->fp_next) { 2547 pollcache_t *pcp = fpip->fp_thread->t_pollstate->ps_pcache; 2548 2549 mutex_enter(&pcp->pc_no_exit); 2550 pcp->pc_busy++; /* prevents exit()'s */ 2551 mutex_exit(&pcp->pc_no_exit); 2552 } 2553 } 2554 2555 /* 2556 * Complete phase 2 of cached poll fd cleanup. Call pcache_clean_entry to mark 2557 * the pcacheset events field POLLCLOSED to force the next poll() to remove 2558 * this cache entry.
We can't clean up the polldat entry here because 2559 * an lwp blocked in poll() needs the info to return. Wake up anyone blocked in 2560 * poll and let the exiting lwp go. No lock is held upon entry, so it's OK for 2561 * pcache_clean_entry to call pollwakeup(). 2562 */ 2563 void 2564 pollcacheclean(fpollinfo_t *fip, int fd) 2565 { 2566 struct fpollinfo *fpip, *fpip2; 2567 2568 fpip = fip; 2569 while (fpip) { 2570 pollstate_t *ps = fpip->fp_thread->t_pollstate; 2571 pollcache_t *pcp = ps->ps_pcache; 2572 2573 mutex_enter(&ps->ps_lock); 2574 pcache_clean_entry(ps, fd); 2575 mutex_exit(&ps->ps_lock); 2576 mutex_enter(&pcp->pc_no_exit); 2577 pcp->pc_busy--; 2578 if (pcp->pc_busy == 0) { 2579 /* 2580 * Wakeup the thread waiting in 2581 * thread_exit(). 2582 */ 2583 cv_signal(&pcp->pc_busy_cv); 2584 } 2585 mutex_exit(&pcp->pc_no_exit); 2586 2587 fpip2 = fpip; 2588 fpip = fpip->fp_next; 2589 kmem_free(fpip2, sizeof (fpollinfo_t)); 2590 } 2591 } 2592 2593 /* 2594 * one of the cached list counters is wrapping around. Reset all cached list 2595 * counters to zero except one. This is simplistic, but probably works 2596 * effectively. 2597 */ 2598 void 2599 pcacheset_reset_count(pollstate_t *ps, int index) 2600 { 2601 int i; 2602 2603 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2604 for (i = 0; i < ps->ps_nsets; i++) { 2605 if (ps->ps_pcacheset[i].pcs_pollfd != NULL) { 2606 ps->ps_pcacheset[i].pcs_count = 0; 2607 } 2608 } 2609 ps->ps_pcacheset[index].pcs_count = 1; 2610 } 2611 2612 /* 2613 * this routine implements the poll cache list replacement policy. 2614 * It currently chooses the "least used" list. 2615 */ 2616 int 2617 pcacheset_replace(pollstate_t *ps) 2618 { 2619 int i; 2620 int index = 0; 2621 2622 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2623 for (i = 1; i < ps->ps_nsets; i++) { 2624 if (ps->ps_pcacheset[index].pcs_count > 2625 ps->ps_pcacheset[i].pcs_count) { 2626 index = i; 2627 } 2628 } 2629 ps->ps_pcacheset[index].pcs_count = 0; 2630 return (index); 2631 } 2632 2633 /* 2634 * this routine is called by strclose to remove any remaining polldat structs on 2635 * the pollhead list of the device being closed. There are two reasons why 2636 * the polldat structures may still remain on the pollhead list: 2637 * 2638 * (1) The layered device (e.g. the console driver). 2639 * In this case, the existence of a polldat implies that the thread putting 2640 * the polldat on this list has not exited yet. Before the thread exits, it 2641 * will have to hold this pollhead lock to remove the polldat. So holding the 2642 * pollhead lock here effectively prevents the thread which put the polldat 2643 * on this list from exiting. 2644 * 2645 * (2) /dev/poll. 2646 * When a polled fd is cached in /dev/poll, its polldat will remain on the 2647 * pollhead list if the process has not done a POLLREMOVE before closing the 2648 * polled fd. We just unlink it here. 2649 */ 2650 void 2651 pollhead_clean(pollhead_t *php) 2652 { 2653 polldat_t *pdp; 2654 2655 /* 2656 * In case (1), while we must prevent the thread in question from 2657 * exiting, we must also obey the proper locking order, i.e. 2658 * (ps_lock -> phlock). 2659 */ 2660 PH_ENTER(php); 2661 while (php->ph_list != NULL) { 2662 pollstate_t *ps; 2663 pollcache_t *pcp; 2664 2665 pdp = php->ph_list; 2666 ASSERT(pdp->pd_php == php); 2667 if (pdp->pd_thread == NULL) { 2668 /* 2669 * This is case (2). Since the ph_lock is sufficient 2670 * to synchronize this lwp with any other /dev/poll 2671 * lwp, just unlink the polldat.
2672 */ 2673 php->ph_list = pdp->pd_next; 2674 pdp->pd_php = NULL; 2675 pdp->pd_next = NULL; 2676 continue; 2677 } 2678 ps = pdp->pd_thread->t_pollstate; 2679 ASSERT(ps != NULL); 2680 pcp = pdp->pd_pcache; 2681 ASSERT(pcp != NULL); 2682 mutex_enter(&pcp->pc_no_exit); 2683 pcp->pc_busy++; /* prevents exit()'s */ 2684 mutex_exit(&pcp->pc_no_exit); 2685 /* 2686 * Now get the locks in proper order to avoid deadlock. 2687 */ 2688 PH_EXIT(php); 2689 mutex_enter(&ps->ps_lock); 2690 /* 2691 * While we dropped the pollhead lock, the element could have 2692 * been taken off the list already. 2693 */ 2694 PH_ENTER(php); 2695 if (pdp->pd_php == php) { 2696 ASSERT(pdp == php->ph_list); 2697 php->ph_list = pdp->pd_next; 2698 pdp->pd_php = NULL; 2699 pdp->pd_next = NULL; 2700 } 2701 PH_EXIT(php); 2702 mutex_exit(&ps->ps_lock); 2703 mutex_enter(&pcp->pc_no_exit); 2704 pcp->pc_busy--; 2705 if (pcp->pc_busy == 0) { 2706 /* 2707 * Wakeup the thread waiting in 2708 * thread_exit(). 2709 */ 2710 cv_signal(&pcp->pc_busy_cv); 2711 } 2712 mutex_exit(&pcp->pc_no_exit); 2713 PH_ENTER(php); 2714 } 2715 PH_EXIT(php); 2716 } 2717 2718 /* 2719 * The remove_list routine is called to clean up a partially cached 'current' 2720 * list or to remove a partial list which is no longer cached. A flag value of 2721 * 1 indicates the second case. 2722 */ 2723 void 2724 pcacheset_remove_list(pollstate_t *ps, pollfd_t *pollfdp, int start, int end, 2725 int cacheindex, int flag) 2726 { 2727 int i; 2728 2729 ASSERT(MUTEX_HELD(&ps->ps_lock)); 2730 for (i = start; i < end; i++) { 2731 if ((pollfdp[i].fd >= 0) && 2732 (flag || !(pollfdp[i].revents & POLLNVAL))) { 2733 if (pcache_delete_fd(ps, pollfdp[i].fd, i, cacheindex, 2734 (uint_t)pollfdp[i].events)) { 2735 int j; 2736 int fd = pollfdp[i].fd; 2737 2738 for (j = i + 1; j < end; j++) { 2739 if (pollfdp[j].fd == fd) { 2740 pcache_update_xref( 2741 ps->ps_pcache, fd, 2742 (ssize_t)j, cacheindex); 2743 break; 2744 } 2745 } 2746 ASSERT(j <= end); 2747 } 2748 } 2749 } 2750 } 2751 2752 #ifdef DEBUG 2753 2754 #include <sys/strsubr.h> 2755 /* 2756 * make sure curthread is not on anyone's pollhead list any more. 2757 */ 2758 static void 2759 pollcheckphlist() 2760 { 2761 int i; 2762 file_t *fp; 2763 uf_entry_t *ufp; 2764 uf_info_t *fip = P_FINFO(curproc); 2765 struct stdata *stp; 2766 polldat_t *pdp; 2767 2768 mutex_enter(&fip->fi_lock); 2769 for (i = 0; i < fip->fi_nfiles; i++) { 2770 UF_ENTER(ufp, fip, i); 2771 if ((fp = ufp->uf_file) != NULL) { 2772 if ((stp = fp->f_vnode->v_stream) != NULL) { 2773 PH_ENTER(&stp->sd_pollist); 2774 pdp = stp->sd_pollist.ph_list; 2775 while (pdp) { 2776 ASSERT(pdp->pd_thread != curthread); 2777 pdp = pdp->pd_next; 2778 } 2779 PH_EXIT(&stp->sd_pollist); 2780 } 2781 } 2782 UF_EXIT(ufp); 2783 } 2784 mutex_exit(&fip->fi_lock); 2785 } 2786 2787 /* 2788 * for a resolved set poll list, the xref info in the pcache should be 2789 * consistent with this poll list.
2790 */ 2791 static int 2792 pollcheckxref(pollstate_t *ps, int cacheindex) 2793 { 2794 pollfd_t *pollfdp = ps->ps_pcacheset[cacheindex].pcs_pollfd; 2795 pollcache_t *pcp = ps->ps_pcache; 2796 polldat_t *pdp; 2797 int i; 2798 xref_t *refp; 2799 2800 for (i = 0; i < ps->ps_pcacheset[cacheindex].pcs_nfds; i++) { 2801 if (pollfdp[i].fd < 0) { 2802 continue; 2803 } 2804 pdp = pcache_lookup_fd(pcp, pollfdp[i].fd); 2805 ASSERT(pdp != NULL); 2806 ASSERT(pdp->pd_ref != NULL); 2807 refp = &pdp->pd_ref[cacheindex]; 2808 if (refp->xf_position >= 0) { 2809 ASSERT(refp->xf_refcnt >= 1); 2810 ASSERT(pollfdp[refp->xf_position].fd == pdp->pd_fd); 2811 if (refp->xf_refcnt > 1) { 2812 int j; 2813 int count = 0; 2814 2815 for (j = refp->xf_position; 2816 j < ps->ps_pcacheset[cacheindex].pcs_nfds; 2817 j++) { 2818 if (pollfdp[j].fd == pdp->pd_fd) { 2819 count++; 2820 } 2821 } 2822 ASSERT(count == refp->xf_refcnt); 2823 } 2824 } 2825 } 2826 return (1); 2827 } 2828 2829 /* 2830 * For every cached pollfd, its polldat struct should be consistent with 2831 * what is in the pcacheset lists. 2832 */ 2833 static void 2834 checkpolldat(pollstate_t *ps) 2835 { 2836 pollcache_t *pcp = ps->ps_pcache; 2837 polldat_t **hashtbl; 2838 int i; 2839 2840 hashtbl = pcp->pc_hash; 2841 for (i = 0; i < pcp->pc_hashsize; i++) { 2842 polldat_t *pdp; 2843 2844 for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { 2845 ASSERT(pdp->pd_ref != NULL); 2846 if (pdp->pd_count > 0) { 2847 xref_t *refp; 2848 int j; 2849 pollcacheset_t *pcsp; 2850 pollfd_t *pollfd; 2851 2852 for (j = 0; j < ps->ps_nsets; j++) { 2853 refp = &pdp->pd_ref[j]; 2854 if (refp->xf_refcnt > 0) { 2855 pcsp = &ps->ps_pcacheset[j]; 2856 ASSERT(refp->xf_position < 2857 pcsp->pcs_nfds); 2858 pollfd = pcsp->pcs_pollfd; 2859 ASSERT(pdp->pd_fd == 2860 pollfd[refp->xf_position]. 2861 fd); 2862 } 2863 } 2864 } 2865 } 2866 } 2867 } 2868 2869 /* 2870 * every wfd element on ph_list must have a corresponding fpollinfo on the 2871 * uf_fpollinfo list. This is a variation of infpollinfo() w/o holding locks. 2872 */ 2873 void 2874 checkwfdlist(vnode_t *vp, fpollinfo_t *fpip) 2875 { 2876 stdata_t *stp; 2877 polldat_t *pdp; 2878 fpollinfo_t *fpip2; 2879 2880 if ((stp = vp->v_stream) == NULL) { 2881 return; 2882 } 2883 PH_ENTER(&stp->sd_pollist); 2884 for (pdp = stp->sd_pollist.ph_list; pdp; pdp = pdp->pd_next) { 2885 if (pdp->pd_thread != NULL && 2886 pdp->pd_thread->t_procp == curthread->t_procp) { 2887 for (fpip2 = fpip; fpip2; fpip2 = fpip2->fp_next) { 2888 if (pdp->pd_thread == fpip2->fp_thread) { 2889 break; 2890 } 2891 } 2892 ASSERT(fpip2 != NULL); 2893 } 2894 } 2895 PH_EXIT(&stp->sd_pollist); 2896 } 2897 2898 /* 2899 * For each cached fd whose bit is not set in bitmap, its revents field in 2900 * current poll list should be 0. 
2901 */ 2902 static int 2903 pollcheckrevents(pollstate_t *ps, int begin, int end, int cacheindex) 2904 { 2905 pollcache_t *pcp = ps->ps_pcache; 2906 pollfd_t *pollfdp = ps->ps_pollfd; 2907 int i; 2908 2909 for (i = begin; i < end; i++) { 2910 polldat_t *pdp; 2911 2912 ASSERT(!BT_TEST(pcp->pc_bitmap, i)); 2913 pdp = pcache_lookup_fd(pcp, i); 2914 if (pdp && pdp->pd_fp != NULL) { 2915 xref_t *refp; 2916 int entry; 2917 2918 ASSERT(pdp->pd_ref != NULL); 2919 refp = &pdp->pd_ref[cacheindex]; 2920 if (refp->xf_refcnt == 0) { 2921 continue; 2922 } 2923 entry = refp->xf_position; 2924 ASSERT(entry >= 0); 2925 ASSERT(pollfdp[entry].revents == 0); 2926 if (refp->xf_refcnt > 1) { 2927 int j; 2928 2929 for (j = entry + 1; j < ps->ps_nfds; j++) { 2930 if (pollfdp[j].fd == i) { 2931 ASSERT(pollfdp[j].revents == 0); 2932 } 2933 } 2934 } 2935 } 2936 } 2937 return (1); 2938 } 2939 2940 #endif /* DEBUG */ 2941 2942 pollcache_t * 2943 pcache_alloc() 2944 { 2945 return (kmem_zalloc(sizeof (pollcache_t), KM_SLEEP)); 2946 } 2947 2948 void 2949 pcache_create(pollcache_t *pcp, nfds_t nfds) 2950 { 2951 size_t mapsize; 2952 2953 /* 2954 * allocate enough bits for the poll fd list 2955 */ 2956 if ((mapsize = POLLMAPCHUNK) <= nfds) { 2957 mapsize = (nfds + POLLMAPCHUNK - 1) & ~(POLLMAPCHUNK - 1); 2958 } 2959 pcp->pc_bitmap = kmem_zalloc((mapsize / BT_NBIPUL) * sizeof (ulong_t), 2960 KM_SLEEP); 2961 pcp->pc_mapsize = mapsize; 2962 /* 2963 * The hash size is at least POLLHASHCHUNKSZ. If user polls a large 2964 * number of fd to start with, allocate a bigger hash table (to the 2965 * nearest multiple of POLLHASHCHUNKSZ) because dynamically growing a 2966 * hash table is expensive. 2967 */ 2968 if (nfds < POLLHASHCHUNKSZ) { 2969 pcp->pc_hashsize = POLLHASHCHUNKSZ; 2970 } else { 2971 pcp->pc_hashsize = (nfds + POLLHASHCHUNKSZ - 1) & 2972 ~(POLLHASHCHUNKSZ - 1); 2973 } 2974 pcp->pc_hash = kmem_zalloc(pcp->pc_hashsize * sizeof (polldat_t *), 2975 KM_SLEEP); 2976 } 2977 2978 void 2979 pcache_destroy(pollcache_t *pcp) 2980 { 2981 polldat_t **hashtbl; 2982 int i; 2983 2984 hashtbl = pcp->pc_hash; 2985 for (i = 0; i < pcp->pc_hashsize; i++) { 2986 if (hashtbl[i] != NULL) { 2987 polldat_t *pdp, *pdp2; 2988 2989 pdp = hashtbl[i]; 2990 while (pdp != NULL) { 2991 pdp2 = pdp->pd_hashnext; 2992 if (pdp->pd_ref != NULL) { 2993 kmem_free(pdp->pd_ref, sizeof (xref_t) * 2994 pdp->pd_nsets); 2995 } 2996 kmem_free(pdp, sizeof (polldat_t)); 2997 pdp = pdp2; 2998 pcp->pc_fdcount--; 2999 } 3000 } 3001 } 3002 ASSERT(pcp->pc_fdcount == 0); 3003 kmem_free(pcp->pc_hash, sizeof (polldat_t *) * pcp->pc_hashsize); 3004 kmem_free(pcp->pc_bitmap, 3005 sizeof (ulong_t) * (pcp->pc_mapsize/BT_NBIPUL)); 3006 mutex_destroy(&pcp->pc_no_exit); 3007 mutex_destroy(&pcp->pc_lock); 3008 cv_destroy(&pcp->pc_cv); 3009 cv_destroy(&pcp->pc_busy_cv); 3010 kmem_free(pcp, sizeof (pollcache_t)); 3011 } 3012 3013 pollcacheset_t * 3014 pcacheset_create(int nsets) 3015 { 3016 return (kmem_zalloc(sizeof (pollcacheset_t) * nsets, KM_SLEEP)); 3017 } 3018 3019 void 3020 pcacheset_destroy(pollcacheset_t *pcsp, int nsets) 3021 { 3022 int i; 3023 3024 for (i = 0; i < nsets; i++) { 3025 if (pcsp[i].pcs_pollfd != NULL) { 3026 kmem_free(pcsp[i].pcs_pollfd, pcsp[i].pcs_nfds * 3027 sizeof (pollfd_t)); 3028 } 3029 } 3030 kmem_free(pcsp, sizeof (pollcacheset_t) * nsets); 3031 } 3032 3033 /* 3034 * Check each duplicated poll fd in the poll list. It may be necessary to 3035 * VOP_POLL the same fd again using different poll events. getf() has been 3036 * done by caller. 
This routine returns 0 if it can successfully process the 3037 * entire poll fd list. It returns -1 if the underlying vnode has changed during 3038 * a VOP_POLL, in which case the caller has to repoll. It returns a positive 3039 * value if VOP_POLL failed. 3040 */ 3041 static int 3042 plist_chkdupfd(file_t *fp, polldat_t *pdp, pollstate_t *psp, pollfd_t *pollfdp, 3043 int entry, int *fdcntp) 3044 { 3045 int i; 3046 int fd; 3047 nfds_t nfds = psp->ps_nfds; 3048 3049 fd = pollfdp[entry].fd; 3050 for (i = entry + 1; i < nfds; i++) { 3051 if (pollfdp[i].fd == fd) { 3052 if (pollfdp[i].events == pollfdp[entry].events) { 3053 if ((pollfdp[i].revents = 3054 pollfdp[entry].revents) != 0) { 3055 (*fdcntp)++; 3056 } 3057 } else { 3058 3059 int error; 3060 pollhead_t *php; 3061 pollcache_t *pcp = psp->ps_pcache; 3062 3063 /* 3064 * the events are different. VOP_POLL this 3065 * fd again so that we don't miss any revents. 3066 */ 3067 php = NULL; 3068 ASSERT(curthread->t_pollcache == NULL); 3069 error = VOP_POLL(fp->f_vnode, 3070 pollfdp[i].events, 0, 3071 &pollfdp[i].revents, &php, NULL); 3072 if (error) { 3073 return (error); 3074 } 3075 /* 3076 * layered devices (e.g. the console driver) 3077 * may change the vnode and thus the pollhead 3078 * pointer out from underneath us. 3079 */ 3080 if (php != NULL && pdp->pd_php != NULL && 3081 php != pdp->pd_php) { 3082 polldat_disassociate(pdp); 3083 polldat_associate(pdp, php); 3084 /* 3085 * We could have missed a wakeup on the 3086 * new target device. Make sure the new 3087 * target gets polled once. 3088 */ 3089 BT_SET(pcp->pc_bitmap, fd); 3090 return (-1); 3091 } 3092 if (pollfdp[i].revents) { 3093 (*fdcntp)++; 3094 } 3095 } 3096 } 3097 } 3098 return (0); 3099 } 3100
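/*
 * Illustrative sketch (a hedged example, not part of the file above): the
 * driver-side half of the pollhead protocol that pcache_poll() and
 * pollwakeup() depend on.  A chpoll(9E) entry point reports the events that
 * are already ready and, when none are pending, hands back its pollhead so
 * that a later pollwakeup(9F) can notify any cached pollers.  The driver
 * name ("mydrv"), its soft-state layout, and the "data ready" flag are
 * hypothetical placeholders; chpoll(9E), pollwakeup(9F),
 * ddi_get_soft_state(9F) and getminor(9F) are the standard DDI interfaces
 * assumed here.
 */
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/ksynch.h>
#include <sys/poll.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>

static void *mydrv_statep;		/* soft-state handle, set up by _init() */

typedef struct mydrv_state {
	kmutex_t	md_lock;
	boolean_t	md_data_ready;	/* hypothetical readiness flag */
	struct pollhead	md_pollhead;	/* handed back from chpoll(9E) */
} mydrv_state_t;

static int
mydrv_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	mydrv_state_t *sp = ddi_get_soft_state(mydrv_statep, getminor(dev));
	short revents = 0;

	if (sp == NULL)
		return (ENXIO);

	/* Report only the requested read-side events that are ready now. */
	mutex_enter(&sp->md_lock);
	if ((events & (POLLIN | POLLRDNORM)) && sp->md_data_ready)
		revents = events & (POLLIN | POLLRDNORM);
	mutex_exit(&sp->md_lock);

	*reventsp = revents;
	if (revents == 0 && !anyyet) {
		/*
		 * Nothing ready yet: return our pollhead so the caller
		 * (e.g. pcache_poll() via VOP_POLL) can associate its
		 * polldat with it and later be woken by pollwakeup().
		 */
		*phpp = &sp->md_pollhead;
	}
	return (0);
}

/*
 * When data later arrives (e.g. in an interrupt or service routine), the
 * driver wakes any cached pollers, which is what sets PC_POLLWAKE and
 * re-arms the bitmap logic seen in pcache_poll() above:
 *
 *	mutex_enter(&sp->md_lock);
 *	sp->md_data_ready = B_TRUE;
 *	mutex_exit(&sp->md_lock);
 *	pollwakeup(&sp->md_pollhead, POLLIN | POLLRDNORM);
 */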