/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/devops.h>
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/stat.h>
#include <sys/poll_impl.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/mkdev.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/devpoll.h>
#include <sys/rctl.h>
#include <sys/resource.h>
#include <sys/schedctl.h>
#include <sys/epoll.h>

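/*
 * Sentinel stored in devpolltbl[] by dpopen() to claim a minor number
 * while the corresponding dp_entry_t is allocated with devpoll_lock
 * dropped; it is replaced by the real entry before dpopen() returns.
 */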
#define	RESERVED	1

/* local data struct */
static	dp_entry_t	**devpolltbl;	/* dev poll entries */
static	size_t		dptblsize;

static	kmutex_t	devpoll_lock;	/* lock protecting dev tbl */
int			devpoll_init;	/* is /dev/poll initialized already */

/* device local functions */

static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp);
static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp);
static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp);
static int dppoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);
static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp);
static dev_info_t *dpdevi;


static struct cb_ops    dp_cb_ops = {
	dpopen,			/* open */
	dpclose,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	nodev,			/* read */
	dpwrite,		/* write */
	dpioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	dppoll,			/* poll */
	ddi_prop_op,		/* prop_op */
	(struct streamtab *)0,	/* streamtab */
	D_MP,			/* flags */
	CB_REV,			/* cb_ops revision */
	nodev,			/* aread */
	nodev			/* awrite */
};

static int dpattach(dev_info_t *, ddi_attach_cmd_t);
static int dpdetach(dev_info_t *, ddi_detach_cmd_t);
static int dpinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);

static struct dev_ops dp_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	dpinfo,			/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	dpattach,		/* attach */
	dpdetach,		/* detach */
	nodev,			/* reset */
	&dp_cb_ops,		/* driver operations */
	(struct bus_ops *)NULL, /* bus operations */
	nulldev,		/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};


static struct modldrv modldrv = {
	&mod_driverops,		/* type of module - a driver */
	"/dev/poll driver",
	&dp_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};

/*
 * Locking Design
 *
 * The /dev/poll driver shares most of its code with the poll system call,
 * whose code is in common/syscall/poll.c. In the poll(2) design, the
 * pollcache structure is per lwp. An implicit assumption is made there
 * that some portion of the pollcache will never be touched by other lwps.
 * E.g., in the poll(2) design, no lwp will ever need to grow the bitmap
 * of another lwp. This assumption is not true for /dev/poll; hence the
 * need for extra locking.
 *
 * To allow more parallelism, each /dev/poll file descriptor (indexed by
 * minor number) has its own lock. Since read (dpioctl) is a much more
 * frequent operation than write, we want to allow multiple reads on the
 * same /dev/poll fd. However, we prevent writes from being starved by
 * giving priority to write operations. Theoretically writes can starve
 * reads as well, but in a practical sense this is not important because
 * (1) writes happen less often than reads, and (2) a write operation
 * defines the contents of the cached fd set. If writes happen so often
 * that they can starve reads, the cached set is very unstable, and it may
 * not make sense to read an unstable cache set anyway. Therefore, the
 * writers-starving-readers case is not handled in this design.
 */

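/*
 * For orientation, an illustrative userland usage sketch (error handling
 * and the setup of sockfd are elided), following the interface described
 * in poll(7d): fds are cached by write(2)ing pollfd structures to a
 * /dev/poll fd and are polled via the DP_POLL ioctl, where dp_timeout is
 * in milliseconds and -1 requests an indefinite wait:
 *
 *	int dpfd = open("/dev/poll", O_RDWR);
 *	struct pollfd pfd;
 *	pfd.fd = sockfd;
 *	pfd.events = POLLIN;
 *	(void) write(dpfd, &pfd, sizeof (pfd));
 *
 *	struct pollfd results[1];
 *	struct dvpoll dvp;
 *	dvp.dp_fds = results;
 *	dvp.dp_nfds = 1;
 *	dvp.dp_timeout = 1000;
 *	int nready = ioctl(dpfd, DP_POLL, &dvp);
 */
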
int
_init()
{
	int	error;

	dptblsize = DEVPOLLSIZE;
	devpolltbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
	mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL);
	devpoll_init = 1;
	if ((error = mod_install(&modlinkage)) != 0) {
		mutex_destroy(&devpoll_lock);
		kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
		devpoll_init = 0;
	}
	return (error);
}

int
_fini()
{
	int error;

	if ((error = mod_remove(&modlinkage)) != 0) {
		return (error);
	}
	mutex_destroy(&devpoll_lock);
	kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/*ARGSUSED*/
static int
dpattach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	if (ddi_create_minor_node(devi, "poll", S_IFCHR, 0, DDI_PSEUDO, NULL)
	    == DDI_FAILURE) {
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}
	dpdevi = devi;
	return (DDI_SUCCESS);
}

static int
dpdetach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	ddi_remove_minor_node(devi, NULL);
	return (DDI_SUCCESS);
}

/* ARGSUSED */
static int
dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)dpdevi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*
 * dp_pcache_poll has similar logic to pcache_poll() in poll.c. The major
 * differences are: (1) /dev/poll requires scanning the bitmap starting
 * where it stopped last time, instead of always starting from 0, and
 * (2) since the user may not have cleaned up cached fds when they were
 * closed, some polldats in the cache may refer to closed or reused fds;
 * we need to check for those cases.
 *
 * NOTE: Upon closing an fd, automatic poll cache cleanup is done for
 *	 poll(2) caches but NOT for /dev/poll caches. So expect some
 *	 stale entries!
 */
static int
dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
    pollcache_t *pcp, nfds_t nfds, int *fdcntp)
{
	int		start, ostart, end;
	int		fdcnt, fd;
	boolean_t	done;
	file_t		*fp;
	short		revent;
	boolean_t	no_wrap;
	pollhead_t	*php;
	polldat_t	*pdp;
	pollfd_t	*pfdp;
	epoll_event_t	*epoll;
	int		error = 0;
	short		mask = POLLRDHUP | POLLWRBAND;

	ASSERT(MUTEX_HELD(&pcp->pc_lock));
	if (pcp->pc_bitmap == NULL) {
		/*
		 * No need to search because no poll fd
		 * has been cached.
		 */
		return (error);
	}

	if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
		pfdp = NULL;
		epoll = (epoll_event_t *)dpbuf;
	} else {
		pfdp = (pollfd_t *)dpbuf;
		epoll = NULL;
	}
retry:
	start = ostart = pcp->pc_mapstart;
	end = pcp->pc_mapend;
	php = NULL;

	if (start == 0) {
		/*
		 * Started from the very beginning; no need to wrap around.
		 */
		no_wrap = B_TRUE;
	} else {
		no_wrap = B_FALSE;
	}
	done = B_FALSE;
	fdcnt = 0;
	while ((fdcnt < nfds) && !done) {
		php = NULL;
		revent = 0;
		/*
		 * Examine the bit map in a circular fashion
		 * to avoid starvation. Always resume from
		 * the last stop. Scan till the end of the map,
		 * then wrap around.
		 */
		fd = bt_getlowbit(pcp->pc_bitmap, start, end);
		ASSERT(fd <= end);
		if (fd >= 0) {
			if (fd == end) {
				if (no_wrap) {
					done = B_TRUE;
				} else {
					start = 0;
					end = ostart - 1;
					no_wrap = B_TRUE;
				}
			} else {
				start = fd + 1;
			}
			pdp = pcache_lookup_fd(pcp, fd);
repoll:
			ASSERT(pdp != NULL);
			ASSERT(pdp->pd_fd == fd);
			if (pdp->pd_fp == NULL) {
				/*
				 * The fd is POLLREMOVed. This fd is
				 * logically no longer cached. So move
				 * on to the next one.
				 */
				continue;
			}
			if ((fp = getf(fd)) == NULL) {
				/*
				 * The fd has been closed, but the user has
				 * not done a POLLREMOVE on this fd yet.
				 * Instead of cleaning it up here implicitly,
				 * we return POLLNVAL. This is consistent
				 * with poll(2) polling a closed fd, and will
				 * hopefully remind the user to do a
				 * POLLREMOVE.
				 */
				if (pfdp != NULL) {
					pfdp[fdcnt].fd = fd;
					pfdp[fdcnt].revents = POLLNVAL;
					fdcnt++;
					continue;
				}

				/*
				 * In the epoll compatibility case, we actually
				 * perform the implicit removal to remain
				 * closer to the epoll semantics.
				 */
				ASSERT(epoll != NULL);

				pdp->pd_fp = NULL;
				pdp->pd_events = 0;

				if (php != NULL) {
					pollhead_delete(php, pdp);
					pdp->pd_php = NULL;
				}

				BT_CLEAR(pcp->pc_bitmap, fd);
				continue;
			}

			if (fp != pdp->pd_fp) {
				/*
				 * The user is polling on a cached fd which
				 * was closed and then reused. Unfortunately
				 * there is no good way to inform the user.
				 * If the file struct is also reused, we
				 * may not be able to detect the fd reuse
				 * at all.  As long as this does not
				 * cause system failure and/or memory leaks,
				 * we will play along. The man page states
				 * that if the user does not clean up closed
				 * fds, polling results will be indeterminate.
				 *
				 * XXX - perhaps log the detection of fd
				 *	 reuse?
				 */
				pdp->pd_fp = fp;
			}
			/*
			 * XXX - pollrelock() logic needs to know which
			 * pollcache lock to grab. It'd be a cleaner
			 * solution if we could pass pcp as an argument
			 * in the VOP_POLL interface instead of implicitly
			 * passing it via the thread_t struct. On the
			 * other hand, changing the VOP_POLL interface
			 * would require every driver/file system poll
			 * routine to change. May want to revisit the
			 * tradeoff later.
			 */
			curthread->t_pollcache = pcp;
			error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
			    &revent, &php, NULL);
			curthread->t_pollcache = NULL;
			releasef(fd);
			if (error != 0) {
				break;
			}
			/*
			 * Layered devices (e.g. the console driver)
			 * may change the vnode and thus the pollhead
			 * pointer out from underneath us.
			 */
			if (php != NULL && pdp->pd_php != NULL &&
			    php != pdp->pd_php) {
				pollhead_delete(pdp->pd_php, pdp);
				pdp->pd_php = php;
				pollhead_insert(php, pdp);
				/*
				 * The bit should still be set.
				 */
				ASSERT(BT_TEST(pcp->pc_bitmap, fd));
				goto retry;
			}

			if (revent != 0) {
				if (pfdp != NULL) {
					pfdp[fdcnt].fd = fd;
					pfdp[fdcnt].events = pdp->pd_events;
					pfdp[fdcnt].revents = revent;
				} else {
					epoll_event_t *ep = &epoll[fdcnt];

					ASSERT(epoll != NULL);
					ep->data.u64 = pdp->pd_epolldata;

					/*
					 * If any of the event bits are set for
					 * which poll and epoll representations
					 * differ, swizzle in the native epoll
					 * values.
					 */
					if (revent & mask) {
						ep->events = (revent & ~mask) |
						    ((revent & POLLRDHUP) ?
						    EPOLLRDHUP : 0) |
						    ((revent & POLLWRBAND) ?
						    EPOLLWRBAND : 0);
					} else {
						ep->events = revent;
					}

					/*
					 * We define POLLWRNORM to be POLLOUT,
					 * but epoll has separate definitions
					 * for them; if POLLOUT is set and the
					 * user has asked for EPOLLWRNORM, set
					 * that as well.
					 */
					if ((revent & POLLOUT) &&
					    (pdp->pd_events & EPOLLWRNORM)) {
						ep->events |= EPOLLWRNORM;
					}
				}

				/*
				 * If POLLET is set, clear the bit in the
				 * bitmap -- which effectively latches the
				 * edge on a pollwakeup() from the driver.
				 */
				if (pdp->pd_events & POLLET)
					BT_CLEAR(pcp->pc_bitmap, fd);

				/*
				 * If POLLONESHOT is set, perform the implicit
				 * POLLREMOVE.
				 */
				if (pdp->pd_events & POLLONESHOT) {
					pdp->pd_fp = NULL;
					pdp->pd_events = 0;

					if (php != NULL) {
						pollhead_delete(php, pdp);
						pdp->pd_php = NULL;
					}

					BT_CLEAR(pcp->pc_bitmap, fd);
				}

				fdcnt++;
			} else if (php != NULL) {
				/*
				 * We clear a bit or cache a poll fd if
				 * the driver returns a poll head ptr,
				 * which is expected in the case of 0
				 * revents. Some buggy drivers may return
				 * a NULL php pointer with 0 revents. In
				 * that case, we just treat the driver as
				 * "noncachable" and do not clear the bit
				 * in the bitmap.
				 */
				if ((pdp->pd_php != NULL) &&
				    ((pcp->pc_flag & PC_POLLWAKE) == 0)) {
					BT_CLEAR(pcp->pc_bitmap, fd);
				}
				if (pdp->pd_php == NULL) {
					pollhead_insert(php, pdp);
					pdp->pd_php = php;
					/*
					 * An event of interest may have
					 * arrived between the VOP_POLL() and
					 * the pollhead_insert(); check again.
					 */
					goto repoll;
				}
			}
		} else {
			/*
			 * No bit set in the range. Check for wrap around.
			 */
			if (!no_wrap) {
				start = 0;
				end = ostart - 1;
				no_wrap = B_TRUE;
			} else {
				done = B_TRUE;
			}
		}
	}

	if (!done) {
		pcp->pc_mapstart = start;
	}
	ASSERT(*fdcntp == 0);
	*fdcntp = fdcnt;
	return (error);
}

/*ARGSUSED*/
static int
dpopen(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	minor_t		minordev;
	dp_entry_t	*dpep;
	pollcache_t	*pcp;

	ASSERT(devpoll_init);
	ASSERT(dptblsize <= MAXMIN);
	mutex_enter(&devpoll_lock);
	for (minordev = 0; minordev < dptblsize; minordev++) {
		if (devpolltbl[minordev] == NULL) {
			devpolltbl[minordev] = (dp_entry_t *)RESERVED;
			break;
		}
	}
	if (minordev == dptblsize) {
		dp_entry_t	**newtbl;
		size_t		oldsize;

		/*
		 * Used up every entry in the existing devpoll table.
		 * Grow the table by DEVPOLLSIZE.
		 */
		if ((oldsize = dptblsize) >= MAXMIN) {
			mutex_exit(&devpoll_lock);
			return (ENXIO);
		}
		dptblsize += DEVPOLLSIZE;
		if (dptblsize > MAXMIN) {
			dptblsize = MAXMIN;
		}
		newtbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
		bcopy(devpolltbl, newtbl, sizeof (caddr_t) * oldsize);
		kmem_free(devpolltbl, sizeof (caddr_t) * oldsize);
		devpolltbl = newtbl;
		devpolltbl[minordev] = (dp_entry_t *)RESERVED;
	}
	mutex_exit(&devpoll_lock);

	dpep = kmem_zalloc(sizeof (dp_entry_t), KM_SLEEP);
	/*
	 * Allocate a pollcache skeleton here. Delay allocating bitmap
	 * structures until dpwrite() time, since we don't know the
	 * optimal size yet.  We also delay setting the pid until either
	 * dpwrite() or an attempt to poll on the instance, allowing parents
	 * to create instances of /dev/poll for their children.  (In the
	 * epoll compatibility case, this check isn't performed to maintain
	 * semantic compatibility.)
	 */
	pcp = pcache_alloc();
	dpep->dpe_pcache = pcp;
	pcp->pc_pid = -1;
	*devp = makedevice(getmajor(*devp), minordev);  /* clone the driver */
	mutex_enter(&devpoll_lock);
	ASSERT(minordev < dptblsize);
	ASSERT(devpolltbl[minordev] == (dp_entry_t *)RESERVED);
	devpolltbl[minordev] = dpep;
	mutex_exit(&devpoll_lock);
	return (0);
}

/*
 * Writes to /dev/poll add/remove fds to/from the cached poll fd set,
 * or change the poll events for a watched fd.
 */
/*ARGSUSED*/
static int
dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
{
	minor_t		minor;
	dp_entry_t	*dpep;
	pollcache_t	*pcp;
	pollfd_t	*pollfdp, *pfdp;
	dvpoll_epollfd_t *epfdp;
	uintptr_t	limit;
	int		error, size;
	ssize_t		uiosize;
	nfds_t		pollfdnum;
	struct pollhead	*php = NULL;
	polldat_t	*pdp;
	int		fd;
	file_t		*fp;

	minor = getminor(dev);

	mutex_enter(&devpoll_lock);
	ASSERT(minor < dptblsize);
	dpep = devpolltbl[minor];
	ASSERT(dpep != NULL);
	mutex_exit(&devpoll_lock);
	pcp = dpep->dpe_pcache;

	if (!(dpep->dpe_flag & DP_ISEPOLLCOMPAT) &&
	    curproc->p_pid != pcp->pc_pid) {
		if (pcp->pc_pid != -1)
			return (EACCES);

		pcp->pc_pid = curproc->p_pid;
	}

	if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
		size = sizeof (dvpoll_epollfd_t);
	} else {
		size = sizeof (pollfd_t);
	}

	uiosize = uiop->uio_resid;
	pollfdnum = uiosize / size;
	mutex_enter(&curproc->p_lock);
	if (pollfdnum > (uint_t)rctl_enforced_value(
	    rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc)) {
		(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
		    curproc->p_rctls, curproc, RCA_SAFE);
		mutex_exit(&curproc->p_lock);
		return (set_errno(EINVAL));
	}
	mutex_exit(&curproc->p_lock);
	/*
	 * Copy in the pollfd array.  Walk through the array and add
	 * each polled fd to the cached set.
	 */
	pollfdp = kmem_alloc(uiosize, KM_SLEEP);
	limit = (uintptr_t)pollfdp + (pollfdnum * size);

	/*
	 * Although /dev/poll uses the write(2) interface to cache fds, it's
	 * not supposed to function as a seekable device. To prevent the
	 * offset from growing and eventually exceeding the maximum, reset
	 * the offset here for every call.
	 */
	uiop->uio_loffset = 0;
	if ((error = uiomove((caddr_t)pollfdp, uiosize, UIO_WRITE, uiop))
	    != 0) {
		kmem_free(pollfdp, uiosize);
		return (error);
	}
	/*
	 * We are about to enter the core portion of dpwrite(). Make sure this
	 * write has exclusive access in this portion of the code, i.e., no
	 * other writers in this code and no other readers in dpioctl.
	 */
	mutex_enter(&dpep->dpe_lock);
	dpep->dpe_writerwait++;
	while (dpep->dpe_refcnt != 0) {
		/*
		 * We need to do a bit of a dance here:  we need to drop
		 * our dpe_lock and grab the pc_lock to broadcast the pc_cv to
		 * kick any DP_POLL/DP_PPOLL sleepers.
		 */
		mutex_exit(&dpep->dpe_lock);
		mutex_enter(&pcp->pc_lock);
		pcp->pc_flag |= PC_WRITEWANTED;
		cv_broadcast(&pcp->pc_cv);
		mutex_exit(&pcp->pc_lock);
		mutex_enter(&dpep->dpe_lock);

		if (dpep->dpe_refcnt == 0)
			break;

		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
			dpep->dpe_writerwait--;
			mutex_exit(&dpep->dpe_lock);
			mutex_enter(&pcp->pc_lock);
			pcp->pc_flag &= ~PC_WRITEWANTED;
			mutex_exit(&pcp->pc_lock);
			kmem_free(pollfdp, uiosize);
			return (set_errno(EINTR));
		}
	}
	dpep->dpe_writerwait--;
	dpep->dpe_flag |= DP_WRITER_PRESENT;
	dpep->dpe_refcnt++;

	mutex_exit(&dpep->dpe_lock);

	mutex_enter(&pcp->pc_lock);
	pcp->pc_flag &= ~PC_WRITEWANTED;

	if (pcp->pc_bitmap == NULL) {
		pcache_create(pcp, pollfdnum);
	}
	for (pfdp = pollfdp; (uintptr_t)pfdp < limit;
	    pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) {
		fd = pfdp->fd;
		if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) {
			/*
			 * epoll semantics demand that we return EBADF if our
			 * specified fd is invalid.
			 */
			if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
				error = EBADF;
				break;
			}

			continue;
		}

		pdp = pcache_lookup_fd(pcp, fd);
		if (pfdp->events != POLLREMOVE) {
			fp = NULL;

			if (pdp == NULL) {
				/*
				 * If we're in epoll compatibility mode, check
				 * that the fd is valid before allocating
				 * anything for it; epoll semantics demand that
				 * we return EBADF if our specified fd is
				 * invalid.
				 */
				if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
					if ((fp = getf(fd)) == NULL) {
						error = EBADF;
						break;
					}
				}

				pdp = pcache_alloc_fd(0);
				pdp->pd_fd = fd;
				pdp->pd_pcache = pcp;
				pcache_insert_fd(pcp, pdp, pollfdnum);
			} else {
				/*
				 * epoll semantics demand that we error out if
				 * a file descriptor is added twice, which we
				 * check (imperfectly) by checking if we both
				 * have the file descriptor cached and the
				 * file pointer that corresponds to the file
				 * descriptor matches our cached value.  If
				 * there is a pointer mismatch, the file
				 * descriptor was closed without being removed.
				 * The converse is clearly not true, however,
				 * so to narrow the window by which a spurious
				 * EEXIST may be returned, we also check if
				 * this fp has been added to an epoll control
				 * descriptor in the past; if it hasn't, we
				 * know that this is due to fp reuse -- it's
				 * not a true EEXIST case.  (By performing this
				 * additional check, we limit the window of
				 * spurious EEXIST to situations where a single
				 * file descriptor is being used across two or
				 * more epoll control descriptors -- and even
				 * then, the file descriptor must be closed and
				 * reused in a relatively tight time span.)
				 */
				if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
					if (pdp->pd_fp != NULL &&
					    (fp = getf(fd)) != NULL &&
					    fp == pdp->pd_fp &&
					    (fp->f_flag2 & FEPOLLED)) {
						error = EEXIST;
						releasef(fd);
						break;
					}

					/*
					 * We have decided that the cached
					 * information was stale: it either
					 * didn't match, or the fp had never
					 * actually been epoll()'d on before.
					 * We need to now clear our pd_events
					 * to ensure that we don't mistakenly
					 * operate on cached event disposition.
					 */
					pdp->pd_events = 0;
				}
			}

			if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
				epfdp = (dvpoll_epollfd_t *)pfdp;
				pdp->pd_epolldata = epfdp->dpep_data;
			}

			ASSERT(pdp->pd_fd == fd);
			ASSERT(pdp->pd_pcache == pcp);
			if (fd >= pcp->pc_mapsize) {
				mutex_exit(&pcp->pc_lock);
				pcache_grow_map(pcp, fd);
				mutex_enter(&pcp->pc_lock);
			}
			if (fd > pcp->pc_mapend) {
				pcp->pc_mapend = fd;
			}
			if (fp == NULL && (fp = getf(fd)) == NULL) {
				/*
				 * The fd is not valid. Since we can't pass
				 * this error back in the write() call, set
				 * the bit in the bitmap to force the DP_POLL
				 * ioctl to examine it.
				 */
				BT_SET(pcp->pc_bitmap, fd);
				pdp->pd_events |= pfdp->events;
				continue;
			}

			/*
			 * To (greatly) reduce EEXIST false positives, we
			 * denote that this fp has been epoll()'d.  We do this
			 * regardless of epoll compatibility mode, as the flag
			 * is harmless if not in epoll compatibility mode.
			 */
			fp->f_flag2 |= FEPOLLED;

			/*
			 * Don't do VOP_POLL for an already cached fd with
			 * the same poll events.
			 */
			if ((pdp->pd_events == pfdp->events) &&
			    (pdp->pd_fp == fp)) {
				/*
				 * The events are already cached.
				 */
				releasef(fd);
				continue;
			}

			/*
			 * Do VOP_POLL and cache this poll fd.
			 *
			 * XXX - pollrelock() logic needs to know which
			 * pollcache lock to grab. It'd be a cleaner
			 * solution if we could pass pcp as an argument
			 * in the VOP_POLL interface instead of implicitly
			 * passing it via the thread_t struct. On the
			 * other hand, changing the VOP_POLL interface
			 * would require every driver/file system poll
			 * routine to change. May want to revisit the
			 * tradeoff later.
			 */
			curthread->t_pollcache = pcp;
			error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
			    &pfdp->revents, &php, NULL);
			curthread->t_pollcache = NULL;
			/*
			 * We always set the bit when this fd is cached;
			 * this forces the first DP_POLL to poll this fd.
			 * The real performance gain comes from subsequent
			 * DP_POLL calls.  We also attempt a pollhead_insert();
			 * if it's not possible, we'll do it in dpioctl().
			 */
			BT_SET(pcp->pc_bitmap, fd);
			if (error != 0) {
				releasef(fd);
				break;
			}
			pdp->pd_fp = fp;
			pdp->pd_events |= pfdp->events;
			if (php != NULL) {
				if (pdp->pd_php == NULL) {
					pollhead_insert(php, pdp);
					pdp->pd_php = php;
				} else {
					if (pdp->pd_php != php) {
						pollhead_delete(pdp->pd_php,
						    pdp);
						pollhead_insert(php, pdp);
						pdp->pd_php = php;
					}
				}
			}
			releasef(fd);
		} else {
			if (pdp == NULL || pdp->pd_fp == NULL) {
				if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
					/*
					 * As with the add case (above), epoll
					 * semantics demand that we error out
					 * in this case.
					 */
					error = ENOENT;
					break;
				}

				continue;
			}
			ASSERT(pdp->pd_fd == fd);
			pdp->pd_fp = NULL;
			pdp->pd_events = 0;
			ASSERT(pdp->pd_thread == NULL);
			if (pdp->pd_php != NULL) {
				pollhead_delete(pdp->pd_php, pdp);
				pdp->pd_php = NULL;
			}
			BT_CLEAR(pcp->pc_bitmap, fd);
		}
	}
	mutex_exit(&pcp->pc_lock);
	mutex_enter(&dpep->dpe_lock);
	dpep->dpe_flag &= ~DP_WRITER_PRESENT;
	ASSERT(dpep->dpe_refcnt == 1);
	dpep->dpe_refcnt--;
	cv_broadcast(&dpep->dpe_cv);
	mutex_exit(&dpep->dpe_lock);
	kmem_free(pollfdp, uiosize);
	return (error);
}

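/*
 * Restore the signal mask saved by DP_PPOLL.  If a signal is pending
 * (lwp_cursig != 0), the temporary mask is deliberately left in place;
 * T_TOMASK arranges for signal delivery to restore lwp_sigoldmask
 * afterwards.  Note that the macro expects p, t, and lwp to be in scope
 * at the use site.
 */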
#define	DP_SIGMASK_RESTORE(ksetp) {					\
	if (ksetp != NULL) {						\
		mutex_enter(&p->p_lock);				\
		if (lwp->lwp_cursig == 0) {				\
			t->t_hold = lwp->lwp_sigoldmask;		\
			t->t_flag &= ~T_TOMASK;				\
		}							\
		mutex_exit(&p->p_lock);					\
	}								\
}

/*ARGSUSED*/
static int
dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	minor_t		minor;
	dp_entry_t	*dpep;
	pollcache_t	*pcp;
	hrtime_t	now;
	int		error = 0;
	STRUCT_DECL(dvpoll, dvpoll);

	if (cmd == DP_POLL || cmd == DP_PPOLL) {
		/* do this now, before we sleep on DP_WRITER_PRESENT */
		now = gethrtime();
	}

	minor = getminor(dev);
	mutex_enter(&devpoll_lock);
	ASSERT(minor < dptblsize);
	dpep = devpolltbl[minor];
	mutex_exit(&devpoll_lock);
	ASSERT(dpep != NULL);
	pcp = dpep->dpe_pcache;

	mutex_enter(&dpep->dpe_lock);

	if (cmd == DP_EPOLLCOMPAT) {
		if (dpep->dpe_refcnt != 0) {
			/*
			 * We can't turn on epoll compatibility while there
			 * are outstanding operations.
			 */
			mutex_exit(&dpep->dpe_lock);
			return (EBUSY);
		}

		/*
		 * epoll compatibility is a one-way street: there's no way
		 * to turn it off for a particular open.
		 */
		dpep->dpe_flag |= DP_ISEPOLLCOMPAT;
		mutex_exit(&dpep->dpe_lock);

		return (0);
	}

	if (!(dpep->dpe_flag & DP_ISEPOLLCOMPAT) &&
	    curproc->p_pid != pcp->pc_pid) {
		if (pcp->pc_pid != -1) {
			mutex_exit(&dpep->dpe_lock);
			return (EACCES);
		}

		pcp->pc_pid = curproc->p_pid;
	}

	while ((dpep->dpe_flag & DP_WRITER_PRESENT) ||
	    (dpep->dpe_writerwait != 0)) {
		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
			mutex_exit(&dpep->dpe_lock);
			return (EINTR);
		}
	}
	dpep->dpe_refcnt++;
	mutex_exit(&dpep->dpe_lock);

	switch (cmd) {
	case	DP_POLL:
	case	DP_PPOLL:
	{
		pollstate_t	*ps;
		nfds_t		nfds;
		int		fdcnt = 0;
		size_t		size, fdsize, dpsize;
		hrtime_t	deadline = 0;
		k_sigset_t	*ksetp = NULL;
		k_sigset_t	kset;
		sigset_t	set;
		kthread_t	*t = curthread;
		klwp_t		*lwp = ttolwp(t);
		struct proc	*p = ttoproc(curthread);

		STRUCT_INIT(dvpoll, mode);

		/*
		 * The dp_setp member is only required/consumed for DP_PPOLL,
		 * which otherwise uses the same structure as DP_POLL.
		 */
		if (cmd == DP_POLL) {
			dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) -
			    (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds);
		} else {
			ASSERT(cmd == DP_PPOLL);
			dpsize = STRUCT_SIZE(dvpoll);
		}

		if ((mode & FKIOCTL) != 0) {
			/* Kernel-internal ioctl call */
			bcopy((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize);
			error = 0;
		} else {
			error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll),
			    dpsize);
		}

		if (error) {
			DP_REFRELE(dpep);
			return (EFAULT);
		}

		deadline = STRUCT_FGET(dvpoll, dp_timeout);
		if (deadline > 0) {
			/*
			 * Convert the deadline from relative milliseconds
			 * to absolute nanoseconds.  The caller must wait
			 * for at least a tick.
			 */
			deadline = MSEC2NSEC(deadline);
			deadline = MAX(deadline, nsec_per_tick);
			deadline += now;
		}

		if (cmd == DP_PPOLL) {
			void *setp = STRUCT_FGETP(dvpoll, dp_setp);

			if (setp != NULL) {
				if (copyin(setp, &set, sizeof (set))) {
					DP_REFRELE(dpep);
					return (EFAULT);
				}

				sigutok(&set, &kset);
				ksetp = &kset;

				mutex_enter(&p->p_lock);
				schedctl_finish_sigblock(t);
				lwp->lwp_sigoldmask = t->t_hold;
				t->t_hold = *ksetp;
				t->t_flag |= T_TOMASK;

				/*
				 * Like ppoll() with a non-NULL sigset, we'll
				 * call cv_reltimedwait_sig() just to check for
				 * signals.  This call will return immediately
				 * with either 0 (signalled) or -1 (no signal).
				 * There are some conditions whereby we can
				 * get 0 from cv_reltimedwait_sig() without
				 * a true signal (e.g., a directed stop), so
				 * we restore our signal mask in the unlikely
				 * event that lwp_cursig is 0.
				 */
				if (!cv_reltimedwait_sig(&t->t_delay_cv,
				    &p->p_lock, 0, TR_CLOCK_TICK)) {
					if (lwp->lwp_cursig == 0) {
						t->t_hold = lwp->lwp_sigoldmask;
						t->t_flag &= ~T_TOMASK;
					}

					mutex_exit(&p->p_lock);

					DP_REFRELE(dpep);
					return (EINTR);
				}

				mutex_exit(&p->p_lock);
			}
		}

		if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) {
			/*
			 * We are just using DP_POLL to sleep, so we don't
			 * need any of the devpoll apparatus.  Do not check
			 * for signals if we have a zero timeout.
			 */
			DP_REFRELE(dpep);
			if (deadline == 0) {
				DP_SIGMASK_RESTORE(ksetp);
				return (0);
			}

			mutex_enter(&curthread->t_delay_lock);
			while ((error =
			    cv_timedwait_sig_hrtime(&curthread->t_delay_cv,
			    &curthread->t_delay_lock, deadline)) > 0)
				continue;
			mutex_exit(&curthread->t_delay_lock);

			DP_SIGMASK_RESTORE(ksetp);

			return (error == 0 ? EINTR : 0);
		}

		if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
			size = nfds * (fdsize = sizeof (epoll_event_t));
		} else {
			size = nfds * (fdsize = sizeof (pollfd_t));
		}

		/*
		 * XXX It would be nice not to have to alloc each time, but it
		 * requires another per thread structure hook. This can be
		 * implemented later if data suggests that it's necessary.
		 */
		if ((ps = curthread->t_pollstate) == NULL) {
			curthread->t_pollstate = pollstate_create();
			ps = curthread->t_pollstate;
		}

		if (ps->ps_dpbufsize < size) {
			/*
			 * If nfds is larger than twice the current maximum
			 * open file count, we'll silently clamp it.  This
			 * only limits our exposure to allocating an
			 * inordinate amount of kernel memory; it doesn't
			 * otherwise affect the semantics.  (We have this
			 * check at twice the maximum instead of merely the
			 * maximum because some applications pass an nfds that
			 * is only slightly larger than their limit.)
			 */
			mutex_enter(&p->p_lock);
			if ((nfds >> 1) > p->p_fno_ctl) {
				nfds = p->p_fno_ctl;
				size = nfds * fdsize;
			}
			mutex_exit(&p->p_lock);

			if (ps->ps_dpbufsize < size) {
				kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize);
				ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP);
				ps->ps_dpbufsize = size;
			}
		}

		mutex_enter(&pcp->pc_lock);
		for (;;) {
			pcp->pc_flag &= ~PC_POLLWAKE;

			error = dp_pcache_poll(dpep, ps->ps_dpbuf,
			    pcp, nfds, &fdcnt);
			if (fdcnt > 0 || error != 0)
				break;

			/*
			 * If a pollwake has happened since we last polled
			 * the cache, poll again.
			 */
			if (pcp->pc_flag & PC_POLLWAKE)
				continue;

			/*
			 * Sleep until we are notified, signaled, or timed out.
			 */
			if (deadline == 0) {
				/* immediate timeout; do not check signals */
				break;
			}

			if (!(pcp->pc_flag & PC_WRITEWANTED)) {
				error = cv_timedwait_sig_hrtime(&pcp->pc_cv,
				    &pcp->pc_lock, deadline);
			} else {
				error = 1;
			}

			if (error > 0 && (pcp->pc_flag & PC_WRITEWANTED)) {
				/*
				 * We've been kicked off of our cv because a
				 * writer wants in.  We're going to drop our
				 * reference count and then wait until the
				 * writer is gone -- at which point we'll
				 * reacquire the pc_lock and call into
				 * dp_pcache_poll() to get the updated state.
				 */
				mutex_exit(&pcp->pc_lock);

				mutex_enter(&dpep->dpe_lock);
				dpep->dpe_refcnt--;
				cv_broadcast(&dpep->dpe_cv);

				while ((dpep->dpe_flag & DP_WRITER_PRESENT) ||
				    (dpep->dpe_writerwait != 0)) {
					error = cv_wait_sig_swap(&dpep->dpe_cv,
					    &dpep->dpe_lock);
				}

				dpep->dpe_refcnt++;
				mutex_exit(&dpep->dpe_lock);
				mutex_enter(&pcp->pc_lock);
			}

			/*
			 * If we were awakened by a signal or timeout,
			 * break the loop; else poll again.
			 */
			if (error <= 0) {
				error = (error == 0) ? EINTR : 0;
				break;
			} else {
				error = 0;
			}
		}
		mutex_exit(&pcp->pc_lock);

		DP_SIGMASK_RESTORE(ksetp);

		if (error == 0 && fdcnt > 0) {
			if (copyout(ps->ps_dpbuf,
			    STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) {
				DP_REFRELE(dpep);
				return (EFAULT);
			}
			*rvalp = fdcnt;
		}
		break;
	}

	case	DP_ISPOLLED:
	{
		pollfd_t	pollfd;
		polldat_t	*pdp;

		STRUCT_INIT(dvpoll, mode);
		error = copyin((caddr_t)arg, &pollfd, sizeof (pollfd_t));
		if (error) {
			DP_REFRELE(dpep);
			return (EFAULT);
		}
		mutex_enter(&pcp->pc_lock);
		if (pcp->pc_hash == NULL) {
			/*
			 * No need to search because no poll fd
			 * has been cached.
			 */
			mutex_exit(&pcp->pc_lock);
			DP_REFRELE(dpep);
			return (0);
		}
		if (pollfd.fd < 0) {
			mutex_exit(&pcp->pc_lock);
			break;
		}
		pdp = pcache_lookup_fd(pcp, pollfd.fd);
		if ((pdp != NULL) && (pdp->pd_fd == pollfd.fd) &&
		    (pdp->pd_fp != NULL)) {
			pollfd.revents = pdp->pd_events;
			if (copyout(&pollfd, (caddr_t)arg, sizeof (pollfd_t))) {
				mutex_exit(&pcp->pc_lock);
				DP_REFRELE(dpep);
				return (EFAULT);
			}
			*rvalp = 1;
		}
		mutex_exit(&pcp->pc_lock);
		break;
	}

	default:
		DP_REFRELE(dpep);
		return (EINVAL);
	}
	DP_REFRELE(dpep);
	return (error);
}

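/*
 * The chpoll(9E) entry point, invoked when a /dev/poll fd is itself the
 * subject of a poll(2) or is cached in another /dev/poll instance.
 */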
/*ARGSUSED*/
static int
dppoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	minor_t		minor;
	dp_entry_t	*dpep;

	minor = getminor(dev);

	mutex_enter(&devpoll_lock);
	dpep = devpolltbl[minor];
	ASSERT(dpep != NULL);
	mutex_exit(&devpoll_lock);

	/*
	 * Polling on a /dev/poll fd is not fully supported yet.
	 */
	if (dpep->dpe_flag & DP_ISEPOLLCOMPAT) {
		/* no error in epoll compat. mode */
		*reventsp = 0;
	} else {
		*reventsp = POLLERR;
	}
	return (0);
}

/*
 * The devpoll close routine should do enough cleanup before the pollcache
 * is deleted, i.e., it should ensure that no one still references the
 * pollcache afterwards. There is no "permission" check in here; any
 * process holding the last reference to this /dev/poll fd can close it.
 */
/*ARGSUSED*/
static int
dpclose(dev_t dev, int flag, int otyp, cred_t *credp)
{
	minor_t		minor;
	dp_entry_t	*dpep;
	pollcache_t	*pcp;
	int		i;
	polldat_t	**hashtbl;
	polldat_t	*pdp;

	minor = getminor(dev);

	mutex_enter(&devpoll_lock);
	dpep = devpolltbl[minor];
	ASSERT(dpep != NULL);
	devpolltbl[minor] = NULL;
	mutex_exit(&devpoll_lock);
	pcp = dpep->dpe_pcache;
	ASSERT(pcp != NULL);
	/*
	 * At this point, no other lwp can access this pollcache via the
	 * /dev/poll fd. This pollcache is going away, so do the cleanup
	 * without holding the pc_lock.
	 */
	hashtbl = pcp->pc_hash;
	for (i = 0; i < pcp->pc_hashsize; i++) {
		for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
			if (pdp->pd_php != NULL) {
				pollhead_delete(pdp->pd_php, pdp);
				pdp->pd_php = NULL;
				pdp->pd_fp = NULL;
			}
		}
	}
	/*
	 * pollwakeup() may still interact with this pollcache. Wait until
	 * it is done.
	 */
	mutex_enter(&pcp->pc_no_exit);
	ASSERT(pcp->pc_busy >= 0);
	while (pcp->pc_busy > 0)
		cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
	mutex_exit(&pcp->pc_no_exit);
	pcache_destroy(pcp);
	ASSERT(dpep->dpe_refcnt == 0);
	kmem_free(dpep, sizeof (dp_entry_t));
	return (0);
}