xref: /illumos-gate/usr/src/uts/common/io/devpoll.c (revision dd72704bd9e794056c558153663c739e2012d721)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2012 by Delphix. All rights reserved.
28  * Copyright 2019 Joyent, Inc.
29  * Copyright 2022 Oxide Computer Company
30  */
31 
32 #include <sys/types.h>
33 #include <sys/devops.h>
34 #include <sys/conf.h>
35 #include <sys/modctl.h>
36 #include <sys/sunddi.h>
37 #include <sys/stat.h>
38 #include <sys/poll_impl.h>
39 #include <sys/errno.h>
40 #include <sys/kmem.h>
41 #include <sys/mkdev.h>
42 #include <sys/debug.h>
43 #include <sys/file.h>
44 #include <sys/sysmacros.h>
45 #include <sys/systm.h>
46 #include <sys/bitmap.h>
47 #include <sys/devpoll.h>
48 #include <sys/rctl.h>
49 #include <sys/resource.h>
50 #include <sys/schedctl.h>
51 #include <sys/epoll.h>
52 
/*
 * Placeholder value stored (cast to dp_entry_t *) in a devpolltbl slot by
 * dpopen() to claim a minor number while the real dp_entry_t is allocated
 * with devpoll_lock dropped.
 */
53 #define	RESERVED	1
54 
55 /* local data struct */
56 static	dp_entry_t	**devpolltbl;	/* dev poll entries, indexed by minor */
57 static	size_t		dptblsize;	/* number of slots in devpolltbl */
58 
59 static	kmutex_t	devpoll_lock;	/* lock protecting dev tbl */
60 int			devpoll_init;	/* set to 1 by _init(), 0 if install fails */
61 
62 /* device local functions */
63 
/* cb_ops entry points implemented by this driver (see dp_cb_ops). */
64 static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp);
65 static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp);
66 static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
67     int *rvalp);
68 static int dppoll(dev_t dev, short events, int anyyet, short *reventsp,
69     struct pollhead **phpp);
70 static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp);
/* dev_info node recorded by dpattach() and handed back by dpinfo(). */
71 static dev_info_t *dpdevi;
72 
73 
/*
 * Character device entry points for /dev/poll.  Only open, close, write,
 * ioctl and poll are supplied by this driver (prop_op uses the generic
 * ddi_prop_op); every other entry point is nodev.
 */
74 static struct cb_ops    dp_cb_ops = {
75 	dpopen,			/* open */
76 	dpclose,		/* close */
77 	nodev,			/* strategy */
78 	nodev,			/* print */
79 	nodev,			/* dump */
80 	nodev,			/* read */
81 	dpwrite,		/* write */
82 	dpioctl,		/* ioctl */
83 	nodev,			/* devmap */
84 	nodev,			/* mmap */
85 	nodev,			/* segmap */
86 	dppoll,			/* poll */
87 	ddi_prop_op,		/* prop_op */
88 	(struct streamtab *)0,	/* streamtab */
89 	D_MP,			/* flags */
90 	CB_REV,			/* cb_ops revision */
91 	nodev,			/* aread */
92 	nodev			/* awrite */
93 };
94 
/* Autoconfiguration entry points referenced by dp_ops below. */
95 static int dpattach(dev_info_t *, ddi_attach_cmd_t);
96 static int dpdetach(dev_info_t *, ddi_detach_cmd_t);
97 static int dpinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
98 
/*
 * Device operations for the driver: ties the attach/detach/info routines
 * and the dp_cb_ops entry-point table together.  There are no bus
 * operations, and quiesce is declared as not needed.
 */
99 static struct dev_ops dp_ops = {
100 	DEVO_REV,		/* devo_rev */
101 	0,			/* refcnt */
102 	dpinfo,			/* info */
103 	nulldev,		/* identify */
104 	nulldev,		/* probe */
105 	dpattach,		/* attach */
106 	dpdetach,		/* detach */
107 	nodev,			/* reset */
108 	&dp_cb_ops,		/* driver operations */
109 	(struct bus_ops *)NULL, /* bus operations */
110 	nulldev,		/* power */
111 	ddi_quiesce_not_needed,		/* quiesce */
112 };
113 
114 
/* Loadable-module description of this driver; points at dp_ops. */
115 static struct modldrv modldrv = {
116 	&mod_driverops,		/* type of module - a driver */
117 	"/dev/poll driver",
118 	&dp_ops,
119 };
120 
/*
 * Linkage handed to mod_install(), mod_remove() and mod_info() by
 * _init(), _fini() and _info(); it lists the single modldrv above.
 */
121 static struct modlinkage modlinkage = {
122 	MODREV_1,
123 	(void *)&modldrv,
124 	NULL
125 };
126 
/*
 * Forward declarations for the pcachelink_* helpers, all of which take
 * pollcache_t handles.
 * NOTE(review): their definitions are not visible in this part of the file;
 * presumably they maintain links between pollcaches -- confirm there.
 */
127 static void pcachelink_assoc(pollcache_t *, pollcache_t *);
128 static void pcachelink_mark_stale(pollcache_t *);
129 static void pcachelink_purge_stale(pollcache_t *);
130 static void pcachelink_purge_all(pollcache_t *);
131 
132 
133 /*
134  * Locking Design
135  *
136  * The /dev/poll driver shares most of its code with poll sys call whose
137  * code is in common/syscall/poll.c. In poll(2) design, the pollcache
138  * structure is per lwp. An implicit assumption is made there that some
139  * portion of pollcache will never be touched by other lwps. E.g., in
140  * poll(2) design, no lwp will ever need to grow bitmap of other lwp.
141  * This assumption is not true for /dev/poll; hence the need for extra
142  * locking.
143  *
144  * To allow more parallelism, each /dev/poll file descriptor (indexed by
145  * minor number) has its own lock. Since read (dpioctl) is a much more
146  * frequent operation than write, we want to allow multiple reads on same
147  * /dev/poll fd. However, we prevent writes from being starved by giving
148  * priority to write operation. Theoretically writes can starve reads as
149  * well. But in a practical sense this is not important because (1) writes
150  * happen less often than reads, and (2) write operations define the
151  * content of the poll fd's cached set. If writes happen so often that they
152  * can starve reads, that means the cached set is very unstable. It may
153  * not make sense to read an unstable cache set anyway. Therefore, the
154  * writers starving readers case is not handled in this design.
155  */
156 
157 int
158 _init()
159 {
160 	int	error;
161 
162 	dptblsize = DEVPOLLSIZE;
163 	devpolltbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
164 	mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL);
165 	devpoll_init = 1;
166 	if ((error = mod_install(&modlinkage)) != 0) {
167 		kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
168 		devpoll_init = 0;
169 	}
170 	return (error);
171 }
172 
173 int
174 _fini()
175 {
176 	int error;
177 
178 	if ((error = mod_remove(&modlinkage)) != 0) {
179 		return (error);
180 	}
181 	mutex_destroy(&devpoll_lock);
182 	kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
183 	return (0);
184 }
185 
186 int
187 _info(struct modinfo *modinfop)
188 {
189 	return (mod_info(&modlinkage, modinfop));
190 }
191 
192 /*ARGSUSED*/
193 static int
194 dpattach(dev_info_t *devi, ddi_attach_cmd_t cmd)
195 {
196 	if (ddi_create_minor_node(devi, "poll", S_IFCHR, 0, DDI_PSEUDO, 0)
197 	    == DDI_FAILURE) {
198 		ddi_remove_minor_node(devi, NULL);
199 		return (DDI_FAILURE);
200 	}
201 	dpdevi = devi;
202 	return (DDI_SUCCESS);
203 }
204 
205 static int
206 dpdetach(dev_info_t *devi, ddi_detach_cmd_t cmd)
207 {
208 	if (cmd != DDI_DETACH)
209 		return (DDI_FAILURE);
210 
211 	ddi_remove_minor_node(devi, NULL);
212 	return (DDI_SUCCESS);
213 }
214 
215 /* ARGSUSED */
216 static int
217 dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
218 {
219 	int error;
220 
221 	switch (infocmd) {
222 	case DDI_INFO_DEVT2DEVINFO:
223 		*result = (void *)dpdevi;
224 		error = DDI_SUCCESS;
225 		break;
226 	case DDI_INFO_DEVT2INSTANCE:
227 		*result = (void *)0;
228 		error = DDI_SUCCESS;
229 		break;
230 	default:
231 		error = DDI_FAILURE;
232 	}
233 	return (error);
234 }
235 
236 /*
237  * dp_pcache_poll has similar logic to pcache_poll() in poll.c. The major
238  * differences are: (1) /dev/poll requires scanning the bitmap starting at
239  * where it was stopped last time, instead of always starting from 0,
240  * (2) since user may not have cleaned up the cached fds when they are
241  * closed, some polldats in cache may refer to closed or reused fds. We
242  * need to check for those cases.
243  *
244  * NOTE: Upon closing an fd, automatic poll cache cleanup is done for
245  *	 poll(2) caches but NOT for /dev/poll caches. So expect some
246  *	 stale entries!
 *
 * Arguments:
 *	dpep	- devpoll entry being polled; only dpe_flag is consulted here,
 *		  to select epoll-compatible output.
 *	dpbuf	- result buffer: an epoll_event_t array in epoll mode,
 *		  otherwise a pollfd_t array.  When it is NULL no results are
 *		  stored and the scan stops at the first ready fd.
 *	pcp	- pollcache to scan; the caller must hold pcp->pc_lock.
 *	nfds	- maximum number of results to produce.
 *	fdcntp	- out: number of results produced; must be 0 on entry.
 *
 * Returns 0, or the error from a failing VOP_POLL().  *fdcntp is written in
 * either case, except when there is no bitmap at all.
247  */
248 static int
249 dp_pcache_poll(dp_entry_t *dpep, void *dpbuf, pollcache_t *pcp, nfds_t nfds,
250     int *fdcntp)
251 {
252 	int		start, ostart, end, fdcnt, error = 0;
253 	boolean_t	done, no_wrap;
254 	pollfd_t	*pfdp;
255 	epoll_event_t	*epoll;
	/* Event bits whose poll and epoll encodings are translated below. */
256 	const short	mask = POLLRDHUP | POLLWRBAND;
257 	const boolean_t	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
258 
259 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
260 	if (pcp->pc_bitmap == NULL) {
261 		/* No Need to search because no poll fd has been cached. */
262 		return (0);
263 	}
264 
	/* Exactly one of pfdp/epoll aliases dpbuf; both are NULL if it is. */
265 	if (is_epoll) {
266 		pfdp = NULL;
267 		epoll = (epoll_event_t *)dpbuf;
268 	} else {
269 		pfdp = (pollfd_t *)dpbuf;
270 		epoll = NULL;
271 	}
272 retry:
	/*
	 * (Re)start the scan from the saved position.  fdcnt is reset below,
	 * so results gathered before a "goto retry" are overwritten.
	 */
273 	start = ostart = pcp->pc_mapstart;
274 	end = pcp->pc_mapend;
275 
276 	if (start == 0) {
277 		/*
278 		 * started from the very beginning, no need to wrap around.
279 		 */
280 		no_wrap = B_TRUE;
281 	} else {
282 		no_wrap = B_FALSE;
283 	}
284 	done = B_FALSE;
285 	fdcnt = 0;
286 	while ((fdcnt < nfds) && !done) {
287 		pollhead_t *php = NULL;
288 		short revent = 0;
289 		uf_entry_gen_t gen;
290 		int fd;
291 
292 		/*
293 		 * Examine the bit map in a circular fashion
294 		 * to avoid starvation. Always resume from
295 		 * last stop. Scan till end of the map. Then
296 		 * wrap around.
297 		 */
298 		fd = bt_getlowbit(pcp->pc_bitmap, start, end);
299 		ASSERT(fd <= end);
300 		if (fd >= 0) {
301 			file_t *fp;
302 			polldat_t *pdp;
303 
			/*
			 * Advance the scan window before examining the fd:
			 * at the last bit of the range either finish or set
			 * up the wrapped range [0, ostart - 1].
			 */
304 			if (fd == end) {
305 				if (no_wrap) {
306 					done = B_TRUE;
307 				} else {
308 					start = 0;
309 					end = ostart - 1;
310 					no_wrap = B_TRUE;
311 				}
312 			} else {
313 				start = fd + 1;
314 			}
315 			pdp = pcache_lookup_fd(pcp, fd);
316 repoll:
317 			ASSERT(pdp != NULL);
318 			ASSERT(pdp->pd_fd == fd);
319 			if (pdp->pd_fp == NULL) {
320 				/*
321 				 * The fd is POLLREMOVed. This fd is
322 				 * logically no longer cached. So move
323 				 * on to the next one.
324 				 */
325 				continue;
326 			}
327 			if ((fp = getf_gen(fd, &gen)) == NULL) {
328 				if (is_epoll) {
329 					/*
330 					 * In the epoll compatibility case, we
331 					 * actually perform the implicit
332 					 * removal to remain closer to the
333 					 * epoll semantics.
334 					 */
335 					pdp->pd_fp = NULL;
336 					pdp->pd_events = 0;
337 
338 					polldat_disassociate(pdp);
339 
340 					BT_CLEAR(pcp->pc_bitmap, fd);
341 				} else if (pfdp != NULL) {
342 					/*
343 					 * The fd has been closed, but user has
344 					 * not done a POLLREMOVE on this fd
345 					 * yet. Instead of cleaning it here
346 					 * implicitly, we return POLLNVAL. This
347 					 * is consistent with poll(2) polling a
348 					 * closed fd. Hope this will remind
349 					 * user to do a POLLREMOVE.
350 					 */
351 					pfdp[fdcnt].fd = fd;
352 					pfdp[fdcnt].revents = POLLNVAL;
353 					fdcnt++;
354 				}
355 				continue;
356 			}
357 
358 			/*
359 			 * Detect a change to the resource underlying a cached
360 			 * file descriptor.  While the fd generation comparison
361 			 * will catch nearly all cases, the file_t comparison
362 			 * is maintained as a failsafe as well.
363 			 */
364 			if (gen != pdp->pd_gen || fp != pdp->pd_fp) {
365 				/*
366 				 * The user is polling on a cached fd which was
367 				 * closed and then reused.  Unfortunately there
368 				 * is no good way to communicate this fact to
369 				 * the consumer.
370 				 *
371 				 * When this situation has been detected, it's
372 				 * likely that any existing pollhead is
373 				 * ill-suited to perform proper wake-ups.
374 				 *
375 				 * Clean up the old entry under the expectation
376 				 * that a valid one will be provided as part of
377 				 * the later VOP_POLL.
378 				 */
379 				polldat_disassociate(pdp);
380 
381 				/*
382 				 * Since epoll is expected to act on the
383 				 * underlying 'struct file' (in Linux terms,
384 				 * our vnode_t would be a closer analog) rather
385 				 * than the fd itself, an implicit remove
386 				 * is necessary under these circumstances to
387 				 * suppress any results (or errors) from the
388 				 * new resource occupying the fd.
389 				 */
390 				if (is_epoll) {
391 					pdp->pd_fp = NULL;
392 					pdp->pd_events = 0;
393 					BT_CLEAR(pcp->pc_bitmap, fd);
394 					releasef(fd);
395 					continue;
396 				} else {
397 					/*
398 					 * Regular /dev/poll is unbothered
399 					 * about the fd reassignment.
400 					 */
401 					pdp->pd_fp = fp;
402 					pdp->pd_gen = gen;
403 				}
404 			}
405 
406 			/*
407 			 * Skip entries marked with the sentinel value for
408 			 * having already fired under oneshot conditions.
409 			 */
410 			if (pdp->pd_events == POLLONESHOT) {
411 				releasef(fd);
412 				BT_CLEAR(pcp->pc_bitmap, fd);
413 				continue;
414 			}
415 
416 			/*
417 			 * XXX - pollrelock() logic needs to know
418 			 * which pollcache lock to grab. It'd be a
419 			 * cleaner solution if we could pass pcp as
420 			 * an argument in VOP_POLL interface instead
421 			 * of implicitly passing it using thread_t
422 			 * struct. On the other hand, changing VOP_POLL
423 			 * interface will require all driver/file system
424 			 * poll routine to change. May want to revisit
425 			 * the tradeoff later.
426 			 */
427 			curthread->t_pollcache = pcp;
428 			error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
429 			    &revent, &php, NULL);
430 
431 			/*
432 			 * Recheck edge-triggered descriptors which lack a
433 			 * pollhead.  While this check is performed when an fd
434 			 * is added to the pollcache in dpwrite(), subsequent
435 			 * descriptor manipulation could cause a different
436 			 * resource to be present now.
437 			 */
438 			if ((pdp->pd_events & POLLET) && error == 0 &&
439 			    pdp->pd_php == NULL && php == NULL && revent != 0) {
440 				short levent = 0;
441 
442 				/*
443 				 * The same POLLET-only VOP_POLL is used in an
444 				 * attempt to coax a pollhead from older
445 				 * driver logic.
446 				 */
447 				error = VOP_POLL(fp->f_vnode, POLLET,
448 				    0, &levent, &php, NULL);
449 			}
450 
451 			curthread->t_pollcache = NULL;
452 			releasef(fd);
			/* A VOP_POLL failure ends the scan and is returned. */
453 			if (error != 0) {
454 				break;
455 			}
456 
457 			/*
458 			 * layered devices (e.g. console driver)
459 			 * may change the vnode and thus the pollhead
460 			 * pointer out from underneath us.
461 			 */
462 			if (php != NULL && pdp->pd_php != NULL &&
463 			    php != pdp->pd_php) {
464 				polldat_disassociate(pdp);
465 				polldat_associate(pdp, php);
466 				/*
467 				 * The bit should still be set.
468 				 */
469 				ASSERT(BT_TEST(pcp->pc_bitmap, fd));
470 				goto retry;
471 			}
472 
473 			if (revent != 0) {
474 				if (pfdp != NULL) {
475 					pfdp[fdcnt].fd = fd;
476 					pfdp[fdcnt].events = pdp->pd_events;
477 					pfdp[fdcnt].revents = revent;
478 				} else if (epoll != NULL) {
479 					epoll_event_t *ep = &epoll[fdcnt];
480 
481 					ASSERT(epoll != NULL);
482 					ep->data.u64 = pdp->pd_epolldata;
483 
484 					/*
485 					 * Since POLLNVAL is a legal event for
486 					 * VOP_POLL handlers to emit, it must
487 					 * be translated epoll-legal.
488 					 */
489 					if (revent & POLLNVAL) {
490 						revent &= ~POLLNVAL;
491 						revent |= POLLERR;
492 					}
493 
494 					/*
495 					 * If any of the event bits are set for
496 					 * which poll and epoll representations
497 					 * differ, swizzle in the native epoll
498 					 * values.
499 					 */
500 					if (revent & mask) {
501 						ep->events = (revent & ~mask) |
502 						    ((revent & POLLRDHUP) ?
503 						    EPOLLRDHUP : 0) |
504 						    ((revent & POLLWRBAND) ?
505 						    EPOLLWRBAND : 0);
506 					} else {
507 						ep->events = revent;
508 					}
509 
510 					/*
511 					 * We define POLLWRNORM to be POLLOUT,
512 					 * but epoll has separate definitions
513 					 * for them; if POLLOUT is set and the
514 					 * user has asked for EPOLLWRNORM, set
515 					 * that as well.
516 					 */
517 					if ((revent & POLLOUT) &&
518 					    (pdp->pd_events & EPOLLWRNORM)) {
519 						ep->events |= EPOLLWRNORM;
520 					}
521 				} else {
522 					pollstate_t *ps =
523 					    curthread->t_pollstate;
524 					/*
525 					 * The devpoll handle itself is being
526 					 * polled.  Notify the caller of any
527 					 * readable event(s), leaving as much
528 					 * state as possible untouched.
529 					 */
530 					VERIFY(fdcnt == 0);
531 					VERIFY(ps != NULL);
532 
533 					/*
534 					 * If a call to pollunlock() fails
535 					 * during VOP_POLL, skip over the fd
536 					 * and continue polling.
537 					 *
538 					 * Otherwise, report that there is an
539 					 * event pending.
540 					 */
541 					if ((ps->ps_flags & POLLSTATE_ULFAIL)
542 					    != 0) {
543 						ps->ps_flags &=
544 						    ~POLLSTATE_ULFAIL;
545 						continue;
546 					} else {
547 						fdcnt++;
548 						break;
549 					}
550 				}
551 
552 				/* Handle special polling modes. */
553 				if (pdp->pd_events & POLLONESHOT) {
554 					/*
555 					 * Entries operating under POLLONESHOT
556 					 * will be marked with a sentinel value
557 					 * to indicate that they have "fired"
558 					 * when emitting an event.  This will
559 					 * disable them from polling until a
560 					 * later add/modify event rearms them.
561 					 */
562 					pdp->pd_events = POLLONESHOT;
563 					polldat_disassociate(pdp);
564 					BT_CLEAR(pcp->pc_bitmap, fd);
565 				} else if (pdp->pd_events & POLLET) {
566 					/*
567 					 * Wire up the pollhead which should
568 					 * have been provided.  Edge-triggered
569 					 * polling cannot function properly
570 					 * with drivers which do not emit one.
571 					 */
572 					if (php != NULL &&
573 					    pdp->pd_php == NULL) {
574 						polldat_associate(pdp, php);
575 					}
576 
577 					/*
578 					 * If the driver has emitted a pollhead,
579 					 * clear the bit in the bitmap which
580 					 * effectively latches the edge on a
581 					 * pollwakeup() from the driver.
582 					 */
583 					if (pdp->pd_php != NULL) {
584 						BT_CLEAR(pcp->pc_bitmap, fd);
585 					}
586 				}
587 
588 				fdcnt++;
589 			} else if (php != NULL) {
590 				/*
591 				 * We clear a bit or cache a poll fd if
592 				 * the driver returns a poll head ptr,
593 				 * which is expected in the case of 0
594 				 * revents. Some buggy driver may return
595 				 * NULL php pointer with 0 revents. In
596 				 * this case, we just treat the driver as
597 				 * "noncachable" and not clearing the bit
598 				 * in bitmap.
599 				 */
600 				if ((pdp->pd_php != NULL) &&
601 				    ((pcp->pc_flag & PC_POLLWAKE) == 0)) {
602 					BT_CLEAR(pcp->pc_bitmap, fd);
603 				}
604 				if (pdp->pd_php == NULL) {
605 					polldat_associate(pdp, php);
606 					/*
607 					 * An event of interest may have
608 					 * arrived between the VOP_POLL() and
609 					 * the polldat_associate(), so we
610 					 * must check again.
611 					 */
612 					goto repoll;
613 				}
614 			}
615 		} else {
616 			/*
617 			 * No bit set in the range. Check for wrap around.
618 			 */
619 			if (!no_wrap) {
620 				start = 0;
621 				end = ostart - 1;
622 				no_wrap = B_TRUE;
623 			} else {
624 				done = B_TRUE;
625 			}
626 		}
627 	}
628 
	/*
	 * If the scan stopped before covering the whole map, remember where
	 * to resume on the next call.
	 */
629 	if (!done) {
630 		pcp->pc_mapstart = start;
631 	}
632 	ASSERT(*fdcntp == 0);
633 	*fdcntp = fdcnt;
634 	return (error);
635 }
636 
637 /*ARGSUSED*/
638 static int
639 dpopen(dev_t *devp, int flag, int otyp, cred_t *credp)
640 {
641 	minor_t		minordev;
642 	dp_entry_t	*dpep;
643 	pollcache_t	*pcp;
644 
645 	ASSERT(devpoll_init);
646 	ASSERT(dptblsize <= MAXMIN);
647 	mutex_enter(&devpoll_lock);
648 	for (minordev = 0; minordev < dptblsize; minordev++) {
649 		if (devpolltbl[minordev] == NULL) {
650 			devpolltbl[minordev] = (dp_entry_t *)RESERVED;
651 			break;
652 		}
653 	}
654 	if (minordev == dptblsize) {
655 		dp_entry_t	**newtbl;
656 		size_t		oldsize;
657 
658 		/*
659 		 * Used up every entry in the existing devpoll table.
660 		 * Grow the table by DEVPOLLSIZE.
661 		 */
662 		if ((oldsize = dptblsize) >= MAXMIN) {
663 			mutex_exit(&devpoll_lock);
664 			return (ENXIO);
665 		}
666 		dptblsize += DEVPOLLSIZE;
667 		if (dptblsize > MAXMIN) {
668 			dptblsize = MAXMIN;
669 		}
670 		newtbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
671 		bcopy(devpolltbl, newtbl, sizeof (caddr_t) * oldsize);
672 		kmem_free(devpolltbl, sizeof (caddr_t) * oldsize);
673 		devpolltbl = newtbl;
674 		devpolltbl[minordev] = (dp_entry_t *)RESERVED;
675 	}
676 	mutex_exit(&devpoll_lock);
677 
678 	dpep = kmem_zalloc(sizeof (dp_entry_t), KM_SLEEP);
679 	/*
680 	 * allocate a pollcache skeleton here. Delay allocating bitmap
681 	 * structures until dpwrite() time, since we don't know the
682 	 * optimal size yet.  We also delay setting the pid until either
683 	 * dpwrite() or attempt to poll on the instance, allowing parents
684 	 * to create instances of /dev/poll for their children.  (In the
685 	 * epoll compatibility case, this check isn't performed to maintain
686 	 * semantic compatibility.)
687 	 */
688 	pcp = pcache_alloc();
689 	dpep->dpe_pcache = pcp;
690 	pcp->pc_pid = -1;
691 	*devp = makedevice(getmajor(*devp), minordev);  /* clone the driver */
692 	mutex_enter(&devpoll_lock);
693 	ASSERT(minordev < dptblsize);
694 	ASSERT(devpolltbl[minordev] == (dp_entry_t *)RESERVED);
695 	devpolltbl[minordev] = dpep;
696 	mutex_exit(&devpoll_lock);
697 	return (0);
698 }
699 
700 /*
701  * Write to dev/poll add/remove fd's to/from a cached poll fd set,
702  * or change poll events for a watched fd.
703  */
704 /*ARGSUSED*/
705 static int
706 dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
707 {
708 	minor_t		minor;
709 	dp_entry_t	*dpep;
710 	pollcache_t	*pcp;
711 	pollfd_t	*pollfdp, *pfdp;
712 	dvpoll_epollfd_t *epfdp;
713 	uintptr_t	limit;
714 	int		error;
715 	uint_t		size;
716 	size_t		copysize, uiosize;
717 	nfds_t		pollfdnum;
718 	boolean_t	is_epoll, fds_added = B_FALSE;
719 
720 	minor = getminor(dev);
721 
722 	mutex_enter(&devpoll_lock);
723 	ASSERT(minor < dptblsize);
724 	dpep = devpolltbl[minor];
725 	ASSERT(dpep != NULL);
726 	mutex_exit(&devpoll_lock);
727 
728 	mutex_enter(&dpep->dpe_lock);
729 	pcp = dpep->dpe_pcache;
730 	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
731 	size = (is_epoll) ? sizeof (dvpoll_epollfd_t) : sizeof (pollfd_t);
732 	mutex_exit(&dpep->dpe_lock);
733 
734 	if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
735 		if (pcp->pc_pid != -1) {
736 			return (EACCES);
737 		}
738 
739 		pcp->pc_pid = curproc->p_pid;
740 	}
741 
742 	if (uiop->uio_resid < 0) {
743 		/* No one else is this careful, but maybe they should be. */
744 		return (EINVAL);
745 	}
746 
747 	uiosize = (size_t)uiop->uio_resid;
748 	pollfdnum = uiosize / size;
749 
750 	/*
751 	 * For epoll-enabled handles, restrict the allowed write size to 2.
752 	 * This corresponds to an epoll_ctl(3C) performing an EPOLL_CTL_MOD
753 	 * operation which is expanded into two operations (DEL and ADD).
754 	 *
755 	 * All other operations performed through epoll_ctl(3C) will consist of
756 	 * a single entry.
757 	 */
758 	if (is_epoll && pollfdnum > 2) {
759 		return (EINVAL);
760 	}
761 
762 	/*
763 	 * We want to make sure that pollfdnum isn't large enough to DoS us,
764 	 * but we also don't want to grab p_lock unnecessarily -- so we
765 	 * perform the full check against our resource limits if and only if
766 	 * pollfdnum is larger than the known-to-be-sane value of UINT8_MAX.
767 	 */
768 	if (pollfdnum > UINT8_MAX) {
769 		mutex_enter(&curproc->p_lock);
770 		if (pollfdnum >
771 		    (uint_t)rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
772 		    curproc->p_rctls, curproc)) {
773 			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
774 			    curproc->p_rctls, curproc, RCA_SAFE);
775 			mutex_exit(&curproc->p_lock);
776 			return (EINVAL);
777 		}
778 		mutex_exit(&curproc->p_lock);
779 	}
780 
781 	/*
782 	 * Copy in the pollfd array.  Walk through the array and add
783 	 * each polled fd to the cached set.
784 	 */
785 	pollfdp = kmem_alloc(uiosize, KM_SLEEP);
786 	limit = (uintptr_t)pollfdp + (pollfdnum * size);
787 
788 	/*
789 	 * Although /dev/poll uses the write(2) interface to cache fds, it's
790 	 * not supposed to function as a seekable device. To prevent offset
791 	 * from growing and eventually exceed the maximum, reset the offset
792 	 * here for every call.
793 	 */
794 	uiop->uio_loffset = 0;
795 
796 	/*
797 	 * Use uiocopy instead of uiomove when populating pollfdp, keeping
798 	 * uio_resid untouched for now.  Write syscalls will translate EINTR
799 	 * into a success if they detect "successfully transfered" data via an
800 	 * into a success if they detect "successfully transferred" data via an
801 	 */
802 	if ((error = uiocopy((caddr_t)pollfdp, uiosize, UIO_WRITE, uiop,
803 	    &copysize)) != 0) {
804 		kmem_free(pollfdp, uiosize);
805 		return (error);
806 	}
807 
808 	/*
809 	 * We are about to enter the core portion of dpwrite(). Make sure this
810 	 * write has exclusive access in this portion of the code, i.e., no
811 	 * other writers in this code.
812 	 *
813 	 * Waiting for all readers to drop their references to the dpe is
814 	 * unnecessary since the pollcache itself is protected by pc_lock.
815 	 */
816 	mutex_enter(&dpep->dpe_lock);
817 	dpep->dpe_writerwait++;
818 	while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) {
819 		ASSERT(dpep->dpe_refcnt != 0);
820 
821 		/*
822 		 * The epoll API does not allow EINTR as a result when making
823 		 * modifications to the set of polled fds.  Given that write
824 		 * activity is relatively quick and the size of accepted writes
825 		 * is limited above to two entries, a signal-ignorant wait is
826 		 * used here to avoid the EINTR.
827 		 */
828 		if (is_epoll) {
829 			cv_wait(&dpep->dpe_cv, &dpep->dpe_lock);
830 			continue;
831 		}
832 
833 		/*
834 		 * Non-epoll writers to /dev/poll handles can tolerate EINTR.
835 		 */
836 		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
837 			dpep->dpe_writerwait--;
838 			mutex_exit(&dpep->dpe_lock);
839 			kmem_free(pollfdp, uiosize);
840 			return (EINTR);
841 		}
842 	}
843 	dpep->dpe_writerwait--;
844 	dpep->dpe_flag |= DP_WRITER_PRESENT;
845 	dpep->dpe_refcnt++;
846 
847 	if (!is_epoll && (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0) {
848 		/*
849 		 * The epoll compat mode was enabled while we were waiting to
850 		 * establish write access. It is not safe to continue since
851 		 * state was prepared for non-epoll operation.
852 		 */
853 		error = EBUSY;
854 		goto bypass;
855 	}
856 	mutex_exit(&dpep->dpe_lock);
857 
858 	/*
859 	 * Since the dpwrite() may recursively walk an added /dev/poll handle,
860 	 * pollstate_enter() deadlock and loop detection must be used.
861 	 */
862 	(void) pollstate_create();
863 	VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);
864 
865 	if (pcp->pc_bitmap == NULL) {
866 		pcache_create(pcp, pollfdnum);
867 	}
868 	for (pfdp = pollfdp; (uintptr_t)pfdp < limit;
869 	    pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) {
870 		int fd = pfdp->fd;
871 		polldat_t *pdp;
872 
873 		if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) {
874 			/*
875 			 * epoll semantics demand that we return EBADF if our
876 			 * specified fd is invalid.
877 			 */
878 			if (is_epoll) {
879 				error = EBADF;
880 				break;
881 			}
882 
883 			continue;
884 		}
885 
886 		pdp = pcache_lookup_fd(pcp, fd);
887 		if (pfdp->events != POLLREMOVE) {
888 			uf_entry_gen_t gen;
889 			file_t *fp = NULL;
890 			struct pollhead *php = NULL;
891 
892 			/*
893 			 * If we're in epoll compatibility mode, check that the
894 			 * fd is valid before allocating anything for it; epoll
895 			 * semantics demand that we return EBADF if our
896 			 * specified fd is invalid.
897 			 */
898 			if (is_epoll) {
899 				if ((fp = getf_gen(fd, &gen)) == NULL) {
900 					error = EBADF;
901 					break;
902 				}
903 			}
904 			if (pdp == NULL) {
905 				pdp = pcache_alloc_fd(0);
906 				pdp->pd_fd = fd;
907 				pdp->pd_pcache = pcp;
908 				pcache_insert_fd(pcp, pdp, pollfdnum);
909 			}
910 
911 			if (is_epoll) {
912 				/*
913 				 * If the fd is already a member of the epoll
914 				 * set, error emission is needed only when the
915 				 * fd assignment generation matches the one
916 				 * recorded in the polldat_t.  Absence of such
917 				 * a generation match indicates that a new
918 				 * resource has been assigned at that fd.
919 				 *
920 				 * Caveat: It is possible to force a generation
921 				 * update while keeping the same backing
922 				 * resource.  This is possible via dup2, but
923 				 * does not represent real-world use cases,
924 				 * making the lack of error acceptable.
925 				 */
926 				if (pdp->pd_fp != NULL && pdp->pd_gen == gen) {
927 					error = EEXIST;
928 					releasef(fd);
929 					break;
930 				}
931 
932 				/*
933 				 * We have decided that the cached information
934 				 * was stale.  Reset pd_events to assure that
935 				 * we don't mistakenly operate on cached event
936 				 * disposition.  This configures the implicit
937 				 * subscription to HUP and ERR events which
938 				 * epoll features.
939 				 */
940 				pdp->pd_events = POLLERR|POLLHUP;
941 
942 				epfdp = (dvpoll_epollfd_t *)pfdp;
943 				pdp->pd_epolldata = epfdp->dpep_data;
944 			}
945 
946 			ASSERT(pdp->pd_fd == fd);
947 			ASSERT(pdp->pd_pcache == pcp);
948 			if (fd >= pcp->pc_mapsize) {
949 				mutex_exit(&pcp->pc_lock);
950 				pcache_grow_map(pcp, fd);
951 				mutex_enter(&pcp->pc_lock);
952 			}
953 			if (fd > pcp->pc_mapend) {
954 				pcp->pc_mapend = fd;
955 			}
956 
957 			if (!is_epoll) {
958 				ASSERT(fp == NULL);
959 
960 				if ((fp = getf_gen(fd, &gen)) == NULL) {
961 					/*
962 					 * The fd is not valid. Since we can't
963 					 * pass this error back in the write()
964 					 * call, set the bit in bitmap to force
965 					 * DP_POLL ioctl to examine it.
966 					 */
967 					BT_SET(pcp->pc_bitmap, fd);
968 					pdp->pd_events |= pfdp->events;
969 					continue;
970 				}
971 				/*
972 				 * Don't do VOP_POLL for an already cached fd
973 				 * with same poll events.
974 				 */
975 				if ((pdp->pd_events == pfdp->events) &&
976 				    (pdp->pd_fp == fp)) {
977 					/*
978 					 * the events are already cached
979 					 */
980 					releasef(fd);
981 					continue;
982 				}
983 			}
984 
985 
986 			/*
987 			 * do VOP_POLL and cache this poll fd.
988 			 */
989 			/*
990 			 * XXX - pollrelock() logic needs to know
991 			 * which pollcache lock to grab. It'd be a
992 			 * cleaner solution if we could pass pcp as
993 			 * an argument in VOP_POLL interface instead
994 			 * of implicitly passing it using thread_t
995 			 * struct. On the other hand, changing VOP_POLL
996 			 * interface will require all driver/file system
997 			 * poll routine to change. May want to revisit
998 			 * the tradeoff later.
999 			 */
1000 			curthread->t_pollcache = pcp;
1001 			error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
1002 			    &pfdp->revents, &php, NULL);
1003 
1004 			/*
1005 			 * Edge-triggered polling requires a pollhead in order
1006 			 * to initiate wake-ups properly.  Drivers which are
1007 			 * savvy to POLLET presence, which should include
1008 			 * everything in-gate, will always emit one, regardless
1009 			 * of revent status.  Older drivers which only emit a
1010 			 * pollhead if 'revents == 0' are given a second chance
1011 			 * here via a second VOP_POLL, with only POLLET set in
1012 			 * the events of interest.  These circumstances should
1013 			 * induce any cacheable drivers to emit a pollhead for
1014 			 * wake-ups.
1015 			 *
1016 			 * Drivers which never emit a pollhead will simply
1017 			 * disobey the expectation of edge-triggered behavior.
1018 			 * This includes recursive epoll which, even on Linux,
1019 			 * yields its events in a level-triggered fashion only.
1020 			 */
1021 			if ((pfdp->events & POLLET) != 0 && error == 0 &&
1022 			    php == NULL) {
1023 				short levent = 0;
1024 
1025 				error = VOP_POLL(fp->f_vnode, POLLET, 0,
1026 				    &levent, &php, NULL);
1027 			}
1028 
1029 			curthread->t_pollcache = NULL;
1030 			/*
1031 			 * We always set the bit when this fd is cached;
1032 			 * this forces the first DP_POLL to poll this fd.
1033 			 * Real performance gain comes from subsequent
1034 			 * DP_POLL.  We also attempt a polldat_associate();
1035 			 * if it's not possible, we'll do it in dpioctl().
1036 			 */
1037 			BT_SET(pcp->pc_bitmap, fd);
1038 			if (error != 0) {
1039 				releasef(fd);
1040 				break;
1041 			}
1042 			pdp->pd_fp = fp;
1043 			pdp->pd_gen = gen;
1044 			pdp->pd_events |= pfdp->events;
1045 			if (php != NULL) {
1046 				if (pdp->pd_php == NULL) {
1047 					polldat_associate(pdp, php);
1048 				} else {
1049 					if (pdp->pd_php != php) {
1050 						polldat_disassociate(pdp);
1051 						polldat_associate(pdp, php);
1052 					}
1053 				}
1054 			}
1055 			fds_added = B_TRUE;
1056 			releasef(fd);
1057 		} else {
1058 			if (pdp == NULL || pdp->pd_fp == NULL) {
1059 				if (is_epoll) {
1060 					/*
1061 					 * As with the add case (above), epoll
1062 					 * semantics demand that we error out
1063 					 * in this case.
1064 					 */
1065 					error = ENOENT;
1066 					break;
1067 				}
1068 
1069 				continue;
1070 			}
1071 			ASSERT(pdp->pd_fd == fd);
1072 			pdp->pd_fp = NULL;
1073 			pdp->pd_events = 0;
1074 			ASSERT(pdp->pd_thread == NULL);
1075 			polldat_disassociate(pdp);
1076 			BT_CLEAR(pcp->pc_bitmap, fd);
1077 		}
1078 	}
1079 	/*
1080 	 * Wake any pollcache waiters so they can check the new descriptors.
1081 	 *
1082 	 * Any fds added to a recursive-capable pollcache could themselves be
1083 	 * /dev/poll handles. To ensure that proper event propagation occurs,
1084 	 * parent pollcaches are woken too, so that they can create any needed
1085 	 * pollcache links.
1086 	 */
1087 	if (fds_added) {
1088 		cv_broadcast(&pcp->pc_cv);
1089 		pcache_wake_parents(pcp);
1090 	}
1091 	pollstate_exit(pcp);
1092 	mutex_enter(&dpep->dpe_lock);
1093 bypass:
1094 	dpep->dpe_flag &= ~DP_WRITER_PRESENT;
1095 	dpep->dpe_refcnt--;
1096 	cv_broadcast(&dpep->dpe_cv);
1097 	mutex_exit(&dpep->dpe_lock);
1098 	kmem_free(pollfdp, uiosize);
1099 	if (error == 0) {
1100 		/*
1101 		 * The state of uio_resid is updated only after the pollcache
1102 		 * is successfully modified.
1103 		 */
1104 		uioskip(uiop, copysize);
1105 	}
1106 	return (error);
1107 }
1108 
/*
 * Put back the signal mask that DP_PPOLL saved in lwp_sigoldmask.
 *
 * The macro expands in a scope that must already provide `p', `lwp' and `t'
 * (it reads p->p_lock, lwp->lwp_cursig, lwp->lwp_sigoldmask and writes
 * t->t_hold / t->t_flag).  A NULL ksetp means no temporary mask was
 * installed, in which case nothing is touched.  The saved mask is only
 * restored, and T_TOMASK cleared, when lwp_cursig is 0.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement, including when it is the body of an unbraced if/else.
 */
#define	DP_SIGMASK_RESTORE(ksetp) do {					\
	if ((ksetp) != NULL) {						\
		mutex_enter(&p->p_lock);				\
		if (lwp->lwp_cursig == 0) {				\
			t->t_hold = lwp->lwp_sigoldmask;		\
			t->t_flag &= ~T_TOMASK;				\
		}							\
		mutex_exit(&p->p_lock);					\
	}								\
} while (0)
1119 
1120 /*ARGSUSED*/
1121 static int
1122 dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
1123 {
1124 	minor_t		minor;
1125 	dp_entry_t	*dpep;
1126 	pollcache_t	*pcp;
1127 	hrtime_t	now;
1128 	int		error = 0;
1129 	boolean_t	is_epoll;
1130 	STRUCT_DECL(dvpoll, dvpoll);
1131 
1132 	if (cmd == DP_POLL || cmd == DP_PPOLL) {
1133 		/* do this now, before we sleep on DP_WRITER_PRESENT */
1134 		now = gethrtime();
1135 	}
1136 
1137 	minor = getminor(dev);
1138 	mutex_enter(&devpoll_lock);
1139 	ASSERT(minor < dptblsize);
1140 	dpep = devpolltbl[minor];
1141 	mutex_exit(&devpoll_lock);
1142 	ASSERT(dpep != NULL);
1143 	pcp = dpep->dpe_pcache;
1144 
1145 	mutex_enter(&dpep->dpe_lock);
1146 	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
1147 
1148 	if (cmd == DP_EPOLLCOMPAT) {
1149 		if (dpep->dpe_refcnt != 0) {
1150 			/*
1151 			 * We can't turn on epoll compatibility while there
1152 			 * are outstanding operations.
1153 			 */
1154 			mutex_exit(&dpep->dpe_lock);
1155 			return (EBUSY);
1156 		}
1157 
1158 		/*
1159 		 * epoll compatibility is a one-way street: there's no way
1160 		 * to turn it off for a particular open.
1161 		 */
1162 		dpep->dpe_flag |= DP_ISEPOLLCOMPAT;
1163 
1164 		/* Record the epoll-enabled nature in the pollcache too */
1165 		mutex_enter(&pcp->pc_lock);
1166 		pcp->pc_flag |= PC_EPOLL;
1167 		mutex_exit(&pcp->pc_lock);
1168 
1169 		mutex_exit(&dpep->dpe_lock);
1170 		return (0);
1171 	}
1172 
1173 	if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
1174 		if (pcp->pc_pid != -1) {
1175 			mutex_exit(&dpep->dpe_lock);
1176 			return (EACCES);
1177 		}
1178 
1179 		pcp->pc_pid = curproc->p_pid;
1180 	}
1181 
1182 	/* Wait until all writers have cleared the handle before continuing */
1183 	while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0 ||
1184 	    (dpep->dpe_writerwait != 0)) {
1185 		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
1186 			mutex_exit(&dpep->dpe_lock);
1187 			return (EINTR);
1188 		}
1189 	}
1190 	dpep->dpe_refcnt++;
1191 	mutex_exit(&dpep->dpe_lock);
1192 
1193 	switch (cmd) {
1194 	case	DP_POLL:
1195 	case	DP_PPOLL:
1196 	{
1197 		pollstate_t	*ps;
1198 		nfds_t		nfds;
1199 		int		fdcnt = 0;
1200 		size_t		size, fdsize, dpsize;
1201 		hrtime_t	deadline = 0;
1202 		k_sigset_t	*ksetp = NULL;
1203 		k_sigset_t	kset;
1204 		sigset_t	set;
1205 		kthread_t	*t = curthread;
1206 		klwp_t		*lwp = ttolwp(t);
1207 		struct proc	*p = ttoproc(curthread);
1208 
1209 		STRUCT_INIT(dvpoll, mode);
1210 
1211 		/*
1212 		 * The dp_setp member is only required/consumed for DP_PPOLL,
1213 		 * which otherwise uses the same structure as DP_POLL.
1214 		 */
1215 		if (cmd == DP_POLL) {
1216 			dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) -
1217 			    (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds);
1218 		} else {
1219 			ASSERT(cmd == DP_PPOLL);
1220 			dpsize = STRUCT_SIZE(dvpoll);
1221 		}
1222 
1223 		if ((mode & FKIOCTL) != 0) {
1224 			/* Kernel-internal ioctl call */
1225 			bcopy((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize);
1226 			error = 0;
1227 		} else {
1228 			error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll),
1229 			    dpsize);
1230 		}
1231 
1232 		if (error) {
1233 			DP_REFRELE(dpep);
1234 			return (EFAULT);
1235 		}
1236 
1237 		deadline = STRUCT_FGET(dvpoll, dp_timeout);
1238 		if (deadline > 0) {
1239 			/*
1240 			 * Convert the deadline from relative milliseconds
1241 			 * to absolute nanoseconds.  They must wait for at
1242 			 * least a tick.
1243 			 */
1244 			deadline = MSEC2NSEC(deadline);
1245 			deadline = MAX(deadline, nsec_per_tick);
1246 			deadline += now;
1247 		}
1248 
1249 		if (cmd == DP_PPOLL) {
1250 			void *setp = STRUCT_FGETP(dvpoll, dp_setp);
1251 
1252 			if (setp != NULL) {
1253 				if ((mode & FKIOCTL) != 0) {
1254 					/* Use the signal set directly */
1255 					ksetp = (k_sigset_t *)setp;
1256 				} else {
1257 					if (copyin(setp, &set, sizeof (set))) {
1258 						DP_REFRELE(dpep);
1259 						return (EFAULT);
1260 					}
1261 					sigutok(&set, &kset);
1262 					ksetp = &kset;
1263 				}
1264 
1265 				mutex_enter(&p->p_lock);
1266 				schedctl_finish_sigblock(t);
1267 				lwp->lwp_sigoldmask = t->t_hold;
1268 				t->t_hold = *ksetp;
1269 				t->t_flag |= T_TOMASK;
1270 
1271 				/*
1272 				 * Like ppoll() with a non-NULL sigset, we'll
1273 				 * call cv_reltimedwait_sig() just to check for
1274 				 * signals.  This call will return immediately
1275 				 * with either 0 (signalled) or -1 (no signal).
1276 				 * There are some conditions whereby we can
1277 				 * get 0 from cv_reltimedwait_sig() without
1278 				 * a true signal (e.g., a directed stop), so
1279 				 * we restore our signal mask in the unlikely
1280 				 * event that lwp_cursig is 0.
1281 				 */
1282 				if (!cv_reltimedwait_sig(&t->t_delay_cv,
1283 				    &p->p_lock, 0, TR_CLOCK_TICK)) {
1284 					if (lwp->lwp_cursig == 0) {
1285 						t->t_hold = lwp->lwp_sigoldmask;
1286 						t->t_flag &= ~T_TOMASK;
1287 					}
1288 
1289 					mutex_exit(&p->p_lock);
1290 
1291 					DP_REFRELE(dpep);
1292 					return (EINTR);
1293 				}
1294 
1295 				mutex_exit(&p->p_lock);
1296 			}
1297 		}
1298 
1299 		if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) {
1300 			/*
1301 			 * We are just using DP_POLL to sleep, so
1302 			 * we don't any of the devpoll apparatus.
1303 			 * Do not check for signals if we have a zero timeout.
1304 			 */
1305 			DP_REFRELE(dpep);
1306 			if (deadline == 0) {
1307 				DP_SIGMASK_RESTORE(ksetp);
1308 				return (0);
1309 			}
1310 
1311 			mutex_enter(&curthread->t_delay_lock);
1312 			while ((error =
1313 			    cv_timedwait_sig_hrtime(&curthread->t_delay_cv,
1314 			    &curthread->t_delay_lock, deadline)) > 0)
1315 				continue;
1316 			mutex_exit(&curthread->t_delay_lock);
1317 
1318 			DP_SIGMASK_RESTORE(ksetp);
1319 
1320 			return (error == 0 ? EINTR : 0);
1321 		}
1322 
1323 		if (is_epoll) {
1324 			size = nfds * (fdsize = sizeof (epoll_event_t));
1325 		} else {
1326 			size = nfds * (fdsize = sizeof (pollfd_t));
1327 		}
1328 
1329 		/*
1330 		 * XXX It would be nice not to have to alloc each time, but it
1331 		 * requires another per thread structure hook. This can be
1332 		 * implemented later if data suggests that it's necessary.
1333 		 */
1334 		ps = pollstate_create();
1335 
1336 		if (ps->ps_dpbufsize < size) {
1337 			/*
1338 			 * If nfds is larger than twice the current maximum
1339 			 * open file count, we'll silently clamp it.  This
1340 			 * only limits our exposure to allocating an
1341 			 * inordinate amount of kernel memory; it doesn't
1342 			 * otherwise affect the semantics.  (We have this
1343 			 * check at twice the maximum instead of merely the
1344 			 * maximum because some applications pass an nfds that
1345 			 * is only slightly larger than their limit.)
1346 			 */
1347 			mutex_enter(&p->p_lock);
1348 			if ((nfds >> 1) > p->p_fno_ctl) {
1349 				nfds = p->p_fno_ctl;
1350 				size = nfds * fdsize;
1351 			}
1352 			mutex_exit(&p->p_lock);
1353 
1354 			if (ps->ps_dpbufsize < size) {
1355 				kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize);
1356 				ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP);
1357 				ps->ps_dpbufsize = size;
1358 			}
1359 		}
1360 
1361 		VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);
1362 		for (;;) {
1363 			pcp->pc_flag &= ~PC_POLLWAKE;
1364 
1365 			/*
1366 			 * Mark all child pcachelinks as stale.
1367 			 * Those which are still part of the tree will be
1368 			 * marked as valid during the poll.
1369 			 */
1370 			pcachelink_mark_stale(pcp);
1371 
1372 			error = dp_pcache_poll(dpep, ps->ps_dpbuf,
1373 			    pcp, nfds, &fdcnt);
1374 			if (fdcnt > 0 || error != 0)
1375 				break;
1376 
1377 			/* Purge still-stale child pcachelinks */
1378 			pcachelink_purge_stale(pcp);
1379 
1380 			/*
1381 			 * A pollwake has happened since we polled cache.
1382 			 */
1383 			if (pcp->pc_flag & PC_POLLWAKE)
1384 				continue;
1385 
1386 			/*
1387 			 * Sleep until we are notified, signaled, or timed out.
1388 			 */
1389 			if (deadline == 0) {
1390 				/* immediate timeout; do not check signals */
1391 				break;
1392 			}
1393 
1394 			error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
1395 			    &pcp->pc_lock, deadline);
1396 
1397 			/*
1398 			 * If we were awakened by a signal or timeout then
1399 			 * break the loop, else poll again.
1400 			 */
1401 			if (error <= 0) {
1402 				error = (error == 0) ? EINTR : 0;
1403 				break;
1404 			} else {
1405 				error = 0;
1406 			}
1407 		}
1408 		pollstate_exit(pcp);
1409 
1410 		DP_SIGMASK_RESTORE(ksetp);
1411 
1412 		if (error == 0 && fdcnt > 0) {
1413 			/*
1414 			 * It should be noted that FKIOCTL does not influence
1415 			 * the copyout (vs bcopy) of dp_fds at this time.
1416 			 */
1417 			if (copyout(ps->ps_dpbuf,
1418 			    STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) {
1419 				DP_REFRELE(dpep);
1420 				return (EFAULT);
1421 			}
1422 			*rvalp = fdcnt;
1423 		}
1424 		break;
1425 	}
1426 
1427 	case	DP_ISPOLLED:
1428 	{
1429 		pollfd_t	pollfd;
1430 		polldat_t	*pdp;
1431 
1432 		STRUCT_INIT(dvpoll, mode);
1433 		error = copyin((caddr_t)arg, &pollfd, sizeof (pollfd_t));
1434 		if (error) {
1435 			DP_REFRELE(dpep);
1436 			return (EFAULT);
1437 		}
1438 		mutex_enter(&pcp->pc_lock);
1439 		if (pcp->pc_hash == NULL) {
1440 			/*
1441 			 * No Need to search because no poll fd
1442 			 * has been cached.
1443 			 */
1444 			mutex_exit(&pcp->pc_lock);
1445 			DP_REFRELE(dpep);
1446 			return (0);
1447 		}
1448 		if (pollfd.fd < 0) {
1449 			mutex_exit(&pcp->pc_lock);
1450 			break;
1451 		}
1452 		pdp = pcache_lookup_fd(pcp, pollfd.fd);
1453 		if ((pdp != NULL) && (pdp->pd_fd == pollfd.fd) &&
1454 		    (pdp->pd_fp != NULL)) {
1455 			pollfd.revents = pdp->pd_events;
1456 			if (copyout(&pollfd, (caddr_t)arg, sizeof (pollfd_t))) {
1457 				mutex_exit(&pcp->pc_lock);
1458 				DP_REFRELE(dpep);
1459 				return (EFAULT);
1460 			}
1461 			*rvalp = 1;
1462 		}
1463 		mutex_exit(&pcp->pc_lock);
1464 		break;
1465 	}
1466 
1467 	default:
1468 		DP_REFRELE(dpep);
1469 		return (EINVAL);
1470 	}
1471 	DP_REFRELE(dpep);
1472 	return (error);
1473 }
1474 
1475 /*
1476  * Overview of Recursive Polling
1477  *
1478  * It is possible for /dev/poll to poll for events on file descriptors which
1479  * themselves are /dev/poll handles.  Pending events in the child handle are
1480  * represented as readable data via the POLLIN flag.  To limit surface area,
1481  * this recursion is presently allowed on only /dev/poll handles which have
1482  * been placed in epoll mode via the DP_EPOLLCOMPAT ioctl.  Recursion depth is
1483  * limited to 5 in order to be consistent with Linux epoll.
1484  *
1485  * Extending dppoll() for VOP_POLL:
1486  *
1487  * The recursive /dev/poll implementation begins by extending dppoll() to
1488  * report when resources contained in the pollcache have relevant event state.
1489  * At the highest level, it means calling dp_pcache_poll() so it indicates if
1490  * fd events are present without consuming them or altering the pollcache
1491  * bitmap.  This ensures that a subsequent DP_POLL operation on the bitmap will
1492  * yield the initiating event.  Additionally, the VOP_POLL should return in
1493  * such a way that dp_pcache_poll() does not clear the parent bitmap entry
1494  * which corresponds to the child /dev/poll fd.  This means that child
1495  * pollcaches will be checked during every poll which facilitates wake-up
1496  * behavior detailed below.
1497  *
1498  * Pollcache Links and Wake Events:
1499  *
1500  * Recursive /dev/poll avoids complicated pollcache locking constraints during
1501  * pollwakeup events by eschewing the traditional pollhead mechanism in favor
1502  * of a different approach.  For each pollcache at the root of a recursive
1503  * /dev/poll "tree", pcachelink_t structures are established to all child
1504  * /dev/poll pollcaches.  During pollnotify() in a child pollcache, the
1505  * linked list of pcachelink_t entries is walked, where those marked as valid
1506  * incur a cv_broadcast to their parent pollcache.  Most notably, these
1507  * pcachelink_t cv wakeups are performed without acquiring pc_lock on the
1508  * parent pollcache (which would require careful deadlock avoidance).  This
1509  * still allows the woken poll on the parent to discover the pertinent events
1510  * due to the fact that bitmap entries for the child pollcache are always
1511  * maintained by the dppoll() logic above.
1512  *
1513  * Depth Limiting and Loop Prevention:
1514  *
1515  * As each pollcache is encountered (either via DP_POLL or dppoll()), depth and
1516  * loop constraints are enforced via pollstate_enter().  The pollcache_t
1517  * pointer is compared against any existing entries in ps_pc_stack and is added
1518  * to the end if no match (and therefore loop) is found.  Once poll operations
1519  * for a given pollcache_t are complete, pollstate_exit() clears the pointer
1520  * from the list.  The pollstate_enter() and pollstate_exit() functions are
1521  * responsible for acquiring and releasing pc_lock, respectively.
1522  *
1523  * Deadlock Safety:
1524  *
1525  * Descending through a tree of recursive /dev/poll handles involves the tricky
1526  * business of sequentially entering multiple pollcache locks.  This tree
1527  * topology cannot define a lock acquisition order in such a way that it is
1528  * immune to deadlocks between threads.  The pollstate_enter() and
1529  * pollstate_exit() functions provide an interface for recursive /dev/poll
1530  * operations to safely lock pollcaches while failing gracefully in the face of
1531  * deadlocking topologies. (See pollstate_contend() for more detail about how
1532  * deadlocks are detected and resolved.)
1533  */
1534 
1535 /*ARGSUSED*/
1536 static int
1537 dppoll(dev_t dev, short events, int anyyet, short *reventsp,
1538     struct pollhead **phpp)
1539 {
1540 	minor_t		minor;
1541 	dp_entry_t	*dpep;
1542 	pollcache_t	*pcp;
1543 	int		res, rc = 0;
1544 
1545 	minor = getminor(dev);
1546 	mutex_enter(&devpoll_lock);
1547 	ASSERT(minor < dptblsize);
1548 	dpep = devpolltbl[minor];
1549 	ASSERT(dpep != NULL);
1550 	mutex_exit(&devpoll_lock);
1551 
1552 	mutex_enter(&dpep->dpe_lock);
1553 	if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) == 0) {
1554 		/* Poll recursion is not yet supported for non-epoll handles */
1555 		*reventsp = POLLERR;
1556 		mutex_exit(&dpep->dpe_lock);
1557 		return (0);
1558 	} else {
1559 		dpep->dpe_refcnt++;
1560 		pcp = dpep->dpe_pcache;
1561 		mutex_exit(&dpep->dpe_lock);
1562 	}
1563 
1564 	res = pollstate_enter(pcp);
1565 	if (res == PSE_SUCCESS) {
1566 		nfds_t		nfds = 1;
1567 		int		fdcnt = 0;
1568 		pollstate_t	*ps = curthread->t_pollstate;
1569 
1570 		/*
1571 		 * Recursive polling will only emit certain events.  Skip a
1572 		 * scan of the pollcache if those events are not of interest.
1573 		 */
1574 		if (events & (POLLIN|POLLRDNORM)) {
1575 			rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt);
1576 		} else {
1577 			rc = 0;
1578 			fdcnt = 0;
1579 		}
1580 
1581 		if (rc == 0 && fdcnt > 0) {
1582 			*reventsp = POLLIN|POLLRDNORM;
1583 		} else {
1584 			*reventsp = 0;
1585 		}
1586 		pcachelink_assoc(pcp, ps->ps_pc_stack[0]);
1587 		pollstate_exit(pcp);
1588 	} else {
1589 		switch (res) {
1590 		case PSE_FAIL_DEPTH:
1591 			rc = EINVAL;
1592 			break;
1593 		case PSE_FAIL_LOOP:
1594 		case PSE_FAIL_DEADLOCK:
1595 			rc = ELOOP;
1596 			break;
1597 		default:
1598 			/*
1599 			 * If anything else has gone awry, such as being polled
1600 			 * from an unexpected context, fall back to the
1601 			 * recursion-intolerant response.
1602 			 */
1603 			*reventsp = POLLERR;
1604 			rc = 0;
1605 			break;
1606 		}
1607 	}
1608 
1609 	DP_REFRELE(dpep);
1610 	return (rc);
1611 }
1612 
1613 /*
1614  * devpoll close should do enough clean up before the pollcache is deleted,
1615  * i.e., it should ensure no one still references the pollcache later.
1616  * There is no "permission" check in here. Any process having the last
1617  * reference of this /dev/poll fd can close.
1618  */
1619 /*ARGSUSED*/
1620 static int
1621 dpclose(dev_t dev, int flag, int otyp, cred_t *credp)
1622 {
1623 	minor_t		minor;
1624 	dp_entry_t	*dpep;
1625 	pollcache_t	*pcp;
1626 	int		i;
1627 	polldat_t	**hashtbl;
1628 	polldat_t	*pdp;
1629 
1630 	minor = getminor(dev);
1631 
1632 	mutex_enter(&devpoll_lock);
1633 	dpep = devpolltbl[minor];
1634 	ASSERT(dpep != NULL);
1635 	devpolltbl[minor] = NULL;
1636 	mutex_exit(&devpoll_lock);
1637 	pcp = dpep->dpe_pcache;
1638 	ASSERT(pcp != NULL);
1639 	/*
1640 	 * At this point, no other lwp can access this pollcache via the
1641 	 * /dev/poll fd. This pollcache is going away, so do the clean
1642 	 * up without the pc_lock.
1643 	 */
1644 	hashtbl = pcp->pc_hash;
1645 	for (i = 0; i < pcp->pc_hashsize; i++) {
1646 		for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
1647 			polldat_disassociate(pdp);
1648 			pdp->pd_fp = NULL;
1649 		}
1650 	}
1651 	/*
1652 	 * pollwakeup() may still interact with this pollcache. Wait until
1653 	 * it is done.
1654 	 */
1655 	mutex_enter(&pcp->pc_no_exit);
1656 	ASSERT(pcp->pc_busy >= 0);
1657 	while (pcp->pc_busy > 0)
1658 		cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
1659 	mutex_exit(&pcp->pc_no_exit);
1660 
1661 	/* Clean up any pollcache links created via recursive /dev/poll */
1662 	if (pcp->pc_parents != NULL || pcp->pc_children != NULL) {
1663 		/*
1664 		 * Because of the locking rules for pcachelink manipulation,
1665 		 * acquring pc_lock is required for this step.
1666 		 */
1667 		mutex_enter(&pcp->pc_lock);
1668 		pcachelink_purge_all(pcp);
1669 		mutex_exit(&pcp->pc_lock);
1670 	}
1671 
1672 	pcache_destroy(pcp);
1673 	ASSERT(dpep->dpe_refcnt == 0);
1674 	kmem_free(dpep, sizeof (dp_entry_t));
1675 	return (0);
1676 }
1677 
1678 static void
1679 pcachelink_locked_rele(pcachelink_t *pl)
1680 {
1681 	ASSERT(MUTEX_HELD(&pl->pcl_lock));
1682 	VERIFY(pl->pcl_refcnt >= 1);
1683 
1684 	pl->pcl_refcnt--;
1685 	if (pl->pcl_refcnt == 0) {
1686 		VERIFY(pl->pcl_state == PCL_INVALID);
1687 		ASSERT(pl->pcl_parent_pc == NULL);
1688 		ASSERT(pl->pcl_child_pc == NULL);
1689 		ASSERT(pl->pcl_parent_next == NULL);
1690 		ASSERT(pl->pcl_child_next == NULL);
1691 
1692 		pl->pcl_state = PCL_FREE;
1693 		mutex_destroy(&pl->pcl_lock);
1694 		kmem_free(pl, sizeof (pcachelink_t));
1695 	} else {
1696 		mutex_exit(&pl->pcl_lock);
1697 	}
1698 }
1699 
1700 /*
1701  * Associate parent and child pollcaches via a pcachelink_t.  If an existing
1702  * link (stale or valid) between the two is found, it will be reused.  If a
1703  * suitable link is not found for reuse, a new one will be allocated.
1704  */
1705 static void
1706 pcachelink_assoc(pollcache_t *child, pollcache_t *parent)
1707 {
1708 	pcachelink_t	*pl, **plpn;
1709 
1710 	ASSERT(MUTEX_HELD(&child->pc_lock));
1711 	ASSERT(MUTEX_HELD(&parent->pc_lock));
1712 
1713 	/* Search for an existing link we can reuse. */
1714 	plpn = &child->pc_parents;
1715 	for (pl = child->pc_parents; pl != NULL; pl = *plpn) {
1716 		mutex_enter(&pl->pcl_lock);
1717 		if (pl->pcl_state == PCL_INVALID) {
1718 			/* Clean any invalid links while walking the list */
1719 			*plpn = pl->pcl_parent_next;
1720 			pl->pcl_child_pc = NULL;
1721 			pl->pcl_parent_next = NULL;
1722 			pcachelink_locked_rele(pl);
1723 		} else if (pl->pcl_parent_pc == parent) {
1724 			/* Successfully found parent link */
1725 			ASSERT(pl->pcl_state == PCL_VALID ||
1726 			    pl->pcl_state == PCL_STALE);
1727 			pl->pcl_state = PCL_VALID;
1728 			mutex_exit(&pl->pcl_lock);
1729 			return;
1730 		} else {
1731 			plpn = &pl->pcl_parent_next;
1732 			mutex_exit(&pl->pcl_lock);
1733 		}
1734 	}
1735 
1736 	/* No existing link to the parent was found.  Create a fresh one. */
1737 	pl = kmem_zalloc(sizeof (pcachelink_t), KM_SLEEP);
1738 	mutex_init(&pl->pcl_lock,  NULL, MUTEX_DEFAULT, NULL);
1739 
1740 	pl->pcl_parent_pc = parent;
1741 	pl->pcl_child_next = parent->pc_children;
1742 	parent->pc_children = pl;
1743 	pl->pcl_refcnt++;
1744 
1745 	pl->pcl_child_pc = child;
1746 	pl->pcl_parent_next = child->pc_parents;
1747 	child->pc_parents = pl;
1748 	pl->pcl_refcnt++;
1749 
1750 	pl->pcl_state = PCL_VALID;
1751 }
1752 
1753 /*
1754  * Mark all child links in a pollcache as stale.  Any invalid child links found
1755  * during iteration are purged.
1756  */
1757 static void
1758 pcachelink_mark_stale(pollcache_t *pcp)
1759 {
1760 	pcachelink_t	*pl, **plpn;
1761 
1762 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
1763 
1764 	plpn = &pcp->pc_children;
1765 	for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
1766 		mutex_enter(&pl->pcl_lock);
1767 		if (pl->pcl_state == PCL_INVALID) {
1768 			/*
1769 			 * Remove any invalid links while we are going to the
1770 			 * trouble of walking the list.
1771 			 */
1772 			*plpn = pl->pcl_child_next;
1773 			pl->pcl_parent_pc = NULL;
1774 			pl->pcl_child_next = NULL;
1775 			pcachelink_locked_rele(pl);
1776 		} else {
1777 			pl->pcl_state = PCL_STALE;
1778 			plpn = &pl->pcl_child_next;
1779 			mutex_exit(&pl->pcl_lock);
1780 		}
1781 	}
1782 }
1783 
1784 /*
1785  * Purge all stale (or invalid) child links from a pollcache.
1786  */
1787 static void
1788 pcachelink_purge_stale(pollcache_t *pcp)
1789 {
1790 	pcachelink_t	*pl, **plpn;
1791 
1792 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
1793 
1794 	plpn = &pcp->pc_children;
1795 	for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
1796 		mutex_enter(&pl->pcl_lock);
1797 		switch (pl->pcl_state) {
1798 		case PCL_STALE:
1799 			pl->pcl_state = PCL_INVALID;
1800 			/* FALLTHROUGH */
1801 		case PCL_INVALID:
1802 			*plpn = pl->pcl_child_next;
1803 			pl->pcl_parent_pc = NULL;
1804 			pl->pcl_child_next = NULL;
1805 			pcachelink_locked_rele(pl);
1806 			break;
1807 		default:
1808 			plpn = &pl->pcl_child_next;
1809 			mutex_exit(&pl->pcl_lock);
1810 		}
1811 	}
1812 }
1813 
1814 /*
1815  * Purge all child and parent links from a pollcache, regardless of status.
1816  */
1817 static void
1818 pcachelink_purge_all(pollcache_t *pcp)
1819 {
1820 	pcachelink_t	*pl, **plpn;
1821 
1822 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
1823 
1824 	plpn = &pcp->pc_parents;
1825 	for (pl = pcp->pc_parents; pl != NULL; pl = *plpn) {
1826 		mutex_enter(&pl->pcl_lock);
1827 		pl->pcl_state = PCL_INVALID;
1828 		*plpn = pl->pcl_parent_next;
1829 		pl->pcl_child_pc = NULL;
1830 		pl->pcl_parent_next = NULL;
1831 		pcachelink_locked_rele(pl);
1832 	}
1833 
1834 	plpn = &pcp->pc_children;
1835 	for (pl = pcp->pc_children; pl != NULL; pl = *plpn) {
1836 		mutex_enter(&pl->pcl_lock);
1837 		pl->pcl_state = PCL_INVALID;
1838 		*plpn = pl->pcl_child_next;
1839 		pl->pcl_parent_pc = NULL;
1840 		pl->pcl_child_next = NULL;
1841 		pcachelink_locked_rele(pl);
1842 	}
1843 
1844 	ASSERT(pcp->pc_parents == NULL);
1845 	ASSERT(pcp->pc_children == NULL);
1846 }
1847