xref: /illumos-gate/usr/src/uts/common/io/devpoll.c (revision d6555420322a42c16b93414c29a62f8e841abc7b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/devops.h>
31 #include <sys/conf.h>
32 #include <sys/modctl.h>
33 #include <sys/sunddi.h>
34 #include <sys/stat.h>
35 #include <sys/poll_impl.h>
36 #include <sys/errno.h>
37 #include <sys/kmem.h>
38 #include <sys/mkdev.h>
39 #include <sys/debug.h>
40 #include <sys/file.h>
41 #include <sys/sysmacros.h>
42 #include <sys/systm.h>
43 #include <sys/bitmap.h>
44 #include <sys/devpoll.h>
45 #include <sys/rctl.h>
46 #include <sys/resource.h>
47 
48 #define	RESERVED	1
49 
50 /* local data struct */
51 static	dp_entry_t	**devpolltbl; 	/* dev poll entries */
52 static	size_t		dptblsize;
53 
54 static	kmutex_t	devpoll_lock;	/* lock protecting dev tbl */
55 int			devpoll_init;	/* is /dev/poll initialized already */
56 
57 /* device local functions */
58 
59 static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp);
60 static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp);
61 static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
62     int *rvalp);
63 static int dppoll(dev_t dev, short events, int anyyet, short *reventsp,
64     struct pollhead **phpp);
65 static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp);
66 static dev_info_t *dpdevi;
67 
68 
69 static struct cb_ops    dp_cb_ops = {
70 	dpopen,			/* open */
71 	dpclose,		/* close */
72 	nodev,			/* strategy */
73 	nodev,			/* print */
74 	nodev,			/* dump */
75 	nodev,			/* read */
76 	dpwrite,		/* write */
77 	dpioctl,		/* ioctl */
78 	nodev,			/* devmap */
79 	nodev,			/* mmap */
80 	nodev,			/* segmap */
81 	dppoll,			/* poll */
82 	ddi_prop_op,		/* prop_op */
83 	(struct streamtab *)0,	/* streamtab */
84 	D_MP,			/* flags */
85 	CB_REV,			/* cb_ops revision */
86 	nodev,			/* aread */
87 	nodev			/* awrite */
88 };
89 
90 static int dpattach(dev_info_t *, ddi_attach_cmd_t);
91 static int dpdetach(dev_info_t *, ddi_detach_cmd_t);
92 static int dpinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
93 
94 static struct dev_ops dp_ops = {
95 	DEVO_REV,		/* devo_rev */
96 	0,			/* refcnt */
97 	dpinfo,			/* info */
98 	nulldev,		/* identify */
99 	nulldev,		/* probe */
100 	dpattach,		/* attach */
101 	dpdetach,		/* detach */
102 	nodev,			/* reset */
103 	&dp_cb_ops,		/* driver operations */
104 	(struct bus_ops *)NULL, /* bus operations */
105 	nulldev			/* power */
106 };
107 
108 
109 static struct modldrv modldrv = {
110 	&mod_driverops,		/* type of module - a driver */
111 	"Dev Poll driver %I%",
112 	&dp_ops,
113 };
114 
115 static struct modlinkage modlinkage = {
116 	MODREV_1,
117 	(void *)&modldrv,
118 	NULL
119 };
120 
121 /*
122  * Locking Design
123  *
124  * The /dev/poll driver shares most of its code with poll sys call whose
125  * code is in common/syscall/poll.c. In poll(2) design, the pollcache
126  * structure is per lwp. An implicit assumption is made there that some
127  * portion of pollcache will never be touched by other lwps. E.g., in
128  * poll(2) design, no lwp will ever need to grow bitmap of other lwp.
129  * This assumption is not true for /dev/poll; hence the need for extra
130  * locking.
131  *
 * To allow more parallelism, each /dev/poll file descriptor (indexed by
 * minor number) has its own lock. Since read (dpioctl) is a much more
 * frequent operation than write, we want to allow multiple reads on the
 * same /dev/poll fd. However, we prevent writes from being starved by
 * giving priority to the write operation. Theoretically writes can starve
 * reads as well. But in a practical sense this is not important because
 * (1) writes happen less often than reads, and (2) the write operation
 * defines the content of the poll fd's cached set. If writes happen so
 * often that they can starve reads, that means the cached set is very
 * unstable. It may not make sense to read an unstable cache set anyway.
 * Therefore, the writers starving readers case is not handled in this
 * design.
143  */
144 
145 int
146 _init()
147 {
148 	int	error;
149 
150 	dptblsize = DEVPOLLSIZE;
151 	devpolltbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
152 	mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL);
153 	devpoll_init = 1;
154 	if ((error = mod_install(&modlinkage)) != 0) {
155 		mutex_destroy(&devpoll_lock);
156 		kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
157 		devpoll_init = 0;
158 	}
159 	return (error);
160 }
161 
162 int
163 _fini()
164 {
165 	int error;
166 
167 	if ((error = mod_remove(&modlinkage)) != 0) {
168 		return (error);
169 	}
170 	mutex_destroy(&devpoll_lock);
171 	kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
172 	return (0);
173 }
174 
175 int
176 _info(struct modinfo *modinfop)
177 {
178 	return (mod_info(&modlinkage, modinfop));
179 }
180 
181 /*ARGSUSED*/
182 static int
183 dpattach(dev_info_t *devi, ddi_attach_cmd_t cmd)
184 {
185 	if (ddi_create_minor_node(devi, "poll", S_IFCHR, 0, DDI_PSEUDO, NULL)
186 	    == DDI_FAILURE) {
187 		ddi_remove_minor_node(devi, NULL);
188 		return (DDI_FAILURE);
189 	}
190 	dpdevi = devi;
191 	return (DDI_SUCCESS);
192 }
193 
194 static int
195 dpdetach(dev_info_t *devi, ddi_detach_cmd_t cmd)
196 {
197 	if (cmd != DDI_DETACH)
198 		return (DDI_FAILURE);
199 
200 	ddi_remove_minor_node(devi, NULL);
201 	return (DDI_SUCCESS);
202 }
203 
204 /* ARGSUSED */
205 static int
206 dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
207 {
208 	int error;
209 
210 	switch (infocmd) {
211 	case DDI_INFO_DEVT2DEVINFO:
212 		*result = (void *)dpdevi;
213 		error = DDI_SUCCESS;
214 		break;
215 	case DDI_INFO_DEVT2INSTANCE:
216 		*result = (void *)0;
217 		error = DDI_SUCCESS;
218 		break;
219 	default:
220 		error = DDI_FAILURE;
221 	}
222 	return (error);
223 }
224 
225 /*
226  * dp_pcache_poll has similar logic to pcache_poll() in poll.c. The major
227  * differences are: (1) /dev/poll requires scanning the bitmap starting at
228  * where it was stopped last time, instead of always starting from 0,
229  * (2) since user may not have cleaned up the cached fds when they are
230  * closed, some polldats in cache may refer to closed or reused fds. We
231  * need to check for those cases.
232  *
233  * NOTE: Upon closing an fd, automatic poll cache cleanup is done for
234  *	 poll(2) caches but NOT for /dev/poll caches. So expect some
235  *	 stale entries!
236  */
237 static int
238 dp_pcache_poll(pollfd_t *pfdp, pollcache_t *pcp, nfds_t nfds, int *fdcntp)
239 {
240 	int		start, ostart, end;
241 	int		fdcnt, fd;
242 	boolean_t 	done;
243 	file_t		*fp;
244 	short		revent;
245 	boolean_t	no_wrap;
246 	pollhead_t	*php;
247 	polldat_t	*pdp;
248 	int		error = 0;
249 
250 	ASSERT(MUTEX_HELD(&pcp->pc_lock));
251 	if (pcp->pc_bitmap == NULL) {
252 		/*
253 		 * No Need to search because no poll fd
254 		 * has been cached.
255 		 */
256 		return (error);
257 	}
258 retry:
259 	start = ostart = pcp->pc_mapstart;
260 	end = pcp->pc_mapend;
261 	php = NULL;
262 
263 	if (start == 0) {
264 		/*
265 		 * started from every begining, no need to wrap around.
266 		 */
267 		no_wrap = B_TRUE;
268 	} else {
269 		no_wrap = B_FALSE;
270 	}
271 	done = B_FALSE;
272 	fdcnt = 0;
273 	while ((fdcnt < nfds) && !done) {
274 		php = NULL;
275 		revent = 0;
276 		/*
277 		 * Examine the bit map in a circular fashion
278 		 * to avoid starvation. Always resume from
279 		 * last stop. Scan till end of the map. Then
280 		 * wrap around.
281 		 */
282 		fd = bt_getlowbit(pcp->pc_bitmap, start, end);
283 		ASSERT(fd <= end);
284 		if (fd >= 0) {
285 			if (fd == end) {
286 				if (no_wrap) {
287 					done = B_TRUE;
288 				} else {
289 					start = 0;
290 					end = ostart - 1;
291 					no_wrap = B_TRUE;
292 				}
293 			} else {
294 				start = fd + 1;
295 			}
296 			pdp = pcache_lookup_fd(pcp, fd);
297 			ASSERT(pdp != NULL);
298 			ASSERT(pdp->pd_fd == fd);
299 			if (pdp->pd_fp == NULL) {
300 				/*
301 				 * The fd is POLLREMOVed. This fd is
302 				 * logically no longer cached. So move
303 				 * on to the next one.
304 				 */
305 				continue;
306 			}
307 			if ((fp = getf(fd)) == NULL) {
308 				/*
309 				 * The fd has been closed, but user has not
310 				 * done a POLLREMOVE on this fd yet. Instead
311 				 * of cleaning it here implicitly, we return
312 				 * POLLNVAL. This is consistent with poll(2)
313 				 * polling a closed fd. Hope this will remind
314 				 * user to do a POLLREMOVE.
315 				 */
316 				pfdp[fdcnt].fd = fd;
317 				pfdp[fdcnt].revents = POLLNVAL;
318 				fdcnt++;
319 				continue;
320 			}
321 			if (fp != pdp->pd_fp) {
322 				/*
323 				 * user is polling on a cached fd which was
324 				 * closed and then reused. Unfortunately
325 				 * there is no good way to inform user.
326 				 * If the file struct is also reused, we
327 				 * may not be able to detect the fd reuse
328 				 * at all.  As long as this does not
329 				 * cause system failure and/or memory leak,
330 				 * we will play along. Man page states if
331 				 * user does not clean up closed fds, polling
332 				 * results will be indeterministic.
333 				 *
334 				 * XXX - perhaps log the detection of fd
335 				 *	 reuse?
336 				 */
337 				pdp->pd_fp = fp;
338 			}
339 			/*
340 			 * XXX - pollrelock() logic needs to know which
341 			 * which pollcache lock to grab. It'd be a
342 			 * cleaner solution if we could pass pcp as
343 			 * an arguement in VOP_POLL interface instead
344 			 * of implicitly passing it using thread_t
345 			 * struct. On the other hand, changing VOP_POLL
346 			 * interface will require all driver/file system
347 			 * poll routine to change. May want to revisit
348 			 * the tradeoff later.
349 			 */
350 			curthread->t_pollcache = pcp;
351 			error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
352 			    &revent, &php);
353 			curthread->t_pollcache = NULL;
354 			releasef(fd);
355 			if (error != 0) {
356 				break;
357 			}
358 			/*
359 			 * layered devices (e.g. console driver)
360 			 * may change the vnode and thus the pollhead
361 			 * pointer out from underneath us.
362 			 */
363 			if (php != NULL && pdp->pd_php != NULL &&
364 			    php != pdp->pd_php) {
365 				pollhead_delete(pdp->pd_php, pdp);
366 				pdp->pd_php = php;
367 				pollhead_insert(php, pdp);
368 				/*
369 				 * The bit should still be set.
370 				 */
371 				ASSERT(BT_TEST(pcp->pc_bitmap, fd));
372 				goto retry;
373 			}
374 
375 			if (revent != 0) {
376 				pfdp[fdcnt].fd = fd;
377 				pfdp[fdcnt].events = pdp->pd_events;
378 				pfdp[fdcnt].revents = revent;
379 				fdcnt++;
380 			} else if (php != NULL) {
381 				/*
382 				 * We clear a bit or cache a poll fd if
383 				 * the driver returns a poll head ptr,
384 				 * which is expected in the case of 0
385 				 * revents. Some buggy driver may return
386 				 * NULL php pointer with 0 revents. In
387 				 * this case, we just treat the driver as
388 				 * "noncachable" and not clearing the bit
389 				 * in bitmap.
390 				 */
391 				if ((pdp->pd_php != NULL) &&
392 				    ((pcp->pc_flag & T_POLLWAKE) == 0)) {
393 					BT_CLEAR(pcp->pc_bitmap, fd);
394 				}
395 				if (pdp->pd_php == NULL) {
396 					pollhead_insert(php, pdp);
397 					pdp->pd_php = php;
398 				}
399 			}
400 		} else {
401 			/*
402 			 * No bit set in the range. Check for wrap around.
403 			 */
404 			if (!no_wrap) {
405 				start = 0;
406 				end = ostart - 1;
407 				no_wrap = B_TRUE;
408 			} else {
409 				done = B_TRUE;
410 			}
411 		}
412 	}
413 
414 	if (!done) {
415 		pcp->pc_mapstart = start;
416 	}
417 	ASSERT(*fdcntp == 0);
418 	*fdcntp = fdcnt;
419 	return (error);
420 }
421 
422 /*ARGSUSED*/
423 static int
424 dpopen(dev_t *devp, int flag, int otyp, cred_t *credp)
425 {
426 	minor_t		minordev;
427 	dp_entry_t	*dpep;
428 	pollcache_t	*pcp;
429 
430 	ASSERT(devpoll_init);
431 	ASSERT(dptblsize <= MAXMIN);
432 	mutex_enter(&devpoll_lock);
433 	for (minordev = 0; minordev < dptblsize; minordev++) {
434 		if (devpolltbl[minordev] == NULL) {
435 			devpolltbl[minordev] = (dp_entry_t *)RESERVED;
436 			break;
437 		}
438 	}
439 	if (minordev == dptblsize) {
440 		dp_entry_t	**newtbl;
441 		size_t		oldsize;
442 
443 		/*
444 		 * Used up every entry in the existing devpoll table.
445 		 * Grow the table by DEVPOLLSIZE.
446 		 */
447 		if ((oldsize = dptblsize) >= MAXMIN) {
448 			mutex_exit(&devpoll_lock);
449 			return (ENXIO);
450 		}
451 		dptblsize += DEVPOLLSIZE;
452 		if (dptblsize > MAXMIN) {
453 			dptblsize = MAXMIN;
454 		}
455 		newtbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
456 		bcopy(devpolltbl, newtbl, sizeof (caddr_t) * oldsize);
457 		kmem_free(devpolltbl, sizeof (caddr_t) * oldsize);
458 		devpolltbl = newtbl;
459 		devpolltbl[minordev] = (dp_entry_t *)RESERVED;
460 	}
461 	mutex_exit(&devpoll_lock);
462 
463 	dpep = kmem_zalloc(sizeof (dp_entry_t), KM_SLEEP);
464 	/*
465 	 * allocate a pollcache skeleton here. Delay allocating bitmap
466 	 * structures until dpwrite() time, since we don't know the
467 	 * optimal size yet.
468 	 */
469 	pcp = pcache_alloc();
470 	dpep->dpe_pcache = pcp;
471 	pcp->pc_pid = curproc->p_pid;
472 	*devp = makedevice(getmajor(*devp), minordev);  /* clone the driver */
473 	mutex_enter(&devpoll_lock);
474 	ASSERT(minordev < dptblsize);
475 	ASSERT(devpolltbl[minordev] == (dp_entry_t *)RESERVED);
476 	devpolltbl[minordev] = dpep;
477 	mutex_exit(&devpoll_lock);
478 	return (0);
479 }
480 
481 /*
482  * Write to dev/poll add/remove fd's to/from a cached poll fd set,
483  * or change poll events for a watched fd.
484  */
485 /*ARGSUSED*/
486 static int
487 dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
488 {
489 	minor_t 	minor;
490 	dp_entry_t	*dpep;
491 	pollcache_t	*pcp;
492 	pollfd_t	*pollfdp, *pfdp;
493 	int		error;
494 	ssize_t		uiosize;
495 	nfds_t		pollfdnum;
496 	struct pollhead	*php = NULL;
497 	polldat_t	*pdp;
498 	int		fd;
499 	file_t		*fp;
500 
501 	minor = getminor(dev);
502 
503 	mutex_enter(&devpoll_lock);
504 	ASSERT(minor < dptblsize);
505 	dpep = devpolltbl[minor];
506 	ASSERT(dpep != NULL);
507 	mutex_exit(&devpoll_lock);
508 	pcp = dpep->dpe_pcache;
509 	if (curproc->p_pid != pcp->pc_pid) {
510 		return (EACCES);
511 	}
512 	uiosize = uiop->uio_resid;
513 	pollfdnum = uiosize / sizeof (pollfd_t);
514 	mutex_enter(&curproc->p_lock);
515 	if (pollfdnum > (uint_t)rctl_enforced_value(
516 	    rctlproc_legacy[RLIMIT_NOFILE], curproc->p_rctls, curproc)) {
517 		(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
518 		    curproc->p_rctls, curproc, RCA_SAFE);
519 		mutex_exit(&curproc->p_lock);
520 		return (set_errno(EINVAL));
521 	}
522 	mutex_exit(&curproc->p_lock);
523 	/*
524 	 * Copy in the pollfd array.  Walk through the array and add
525 	 * each polled fd to the cached set.
526 	 */
527 	pollfdp = kmem_alloc(uiosize, KM_SLEEP);
528 
529 	/*
530 	 * Although /dev/poll uses the write(2) interface to cache fds, it's
531 	 * not supposed to function as a seekable device. To prevent offset
532 	 * from growing and eventually exceed the maximum, reset the offset
533 	 * here for every call.
534 	 */
535 	uiop->uio_loffset = 0;
536 	if ((error = uiomove((caddr_t)pollfdp, uiosize, UIO_WRITE, uiop))
537 	    != 0) {
538 		kmem_free(pollfdp, uiosize);
539 		return (error);
540 	}
541 	/*
542 	 * We are about to enter the core portion of dpwrite(). Make sure this
543 	 * write has exclusive access in this portion of the code, i.e., no
544 	 * other writers in this code and no other readers in dpioctl.
545 	 */
546 	mutex_enter(&dpep->dpe_lock);
547 	dpep->dpe_writerwait++;
548 	while (dpep->dpe_refcnt != 0) {
549 		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
550 			dpep->dpe_writerwait--;
551 			mutex_exit(&dpep->dpe_lock);
552 			kmem_free(pollfdp, uiosize);
553 			return (set_errno(EINTR));
554 		}
555 	}
556 	dpep->dpe_writerwait--;
557 	dpep->dpe_flag |= DP_WRITER_PRESENT;
558 	dpep->dpe_refcnt++;
559 	mutex_exit(&dpep->dpe_lock);
560 
561 	mutex_enter(&pcp->pc_lock);
562 	if (pcp->pc_bitmap == NULL) {
563 		pcache_create(pcp, pollfdnum);
564 	}
565 	for (pfdp = pollfdp; pfdp < pollfdp + pollfdnum; pfdp++) {
566 		fd = pfdp->fd;
567 		if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles)
568 			continue;
569 		pdp = pcache_lookup_fd(pcp, fd);
570 		if (pfdp->events != POLLREMOVE) {
571 			if (pdp == NULL) {
572 				pdp = pcache_alloc_fd(0);
573 				pdp->pd_fd = fd;
574 				pdp->pd_pcache = pcp;
575 				pcache_insert_fd(pcp, pdp, pollfdnum);
576 			}
577 			ASSERT(pdp->pd_fd == fd);
578 			ASSERT(pdp->pd_pcache == pcp);
579 			if (fd >= pcp->pc_mapsize) {
580 				mutex_exit(&pcp->pc_lock);
581 				pcache_grow_map(pcp, fd);
582 				mutex_enter(&pcp->pc_lock);
583 			}
584 			if (fd > pcp->pc_mapend) {
585 				pcp->pc_mapend = fd;
586 			}
587 			if ((fp = getf(fd)) == NULL) {
588 				/*
589 				 * The fd is not valid. Since we can't pass
590 				 * this error back in the write() call, set
591 				 * the bit in bitmap to force DP_POLL ioctl
592 				 * to examine it.
593 				 */
594 				BT_SET(pcp->pc_bitmap, fd);
595 				pdp->pd_events |= pfdp->events;
596 				continue;
597 			}
598 			/*
599 			 * Don't do VOP_POLL for an already cached fd with
600 			 * same poll events.
601 			 */
602 			if ((pdp->pd_events == pfdp->events) &&
603 			    (pdp->pd_fp != NULL)) {
604 				/*
605 				 * the events are already cached
606 				 */
607 				releasef(fd);
608 				continue;
609 			}
610 
611 			/*
612 			 * do VOP_POLL and cache this poll fd.
613 			 */
614 			/*
615 			 * XXX - pollrelock() logic needs to know which
616 			 * which pollcache lock to grab. It'd be a
617 			 * cleaner solution if we could pass pcp as
618 			 * an arguement in VOP_POLL interface instead
619 			 * of implicitly passing it using thread_t
620 			 * struct. On the other hand, changing VOP_POLL
621 			 * interface will require all driver/file system
622 			 * poll routine to change. May want to revisit
623 			 * the tradeoff later.
624 			 */
625 			curthread->t_pollcache = pcp;
626 			error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
627 			    &pfdp->revents, &php);
628 			curthread->t_pollcache = NULL;
629 			/*
630 			 * We always set the bit when this fd is cached.
631 			 * So we don't have to worry about missing a
632 			 * pollwakeup between VOP_POLL and pollhead_insert.
633 			 * This forces the first DP_POLL to poll this fd.
634 			 * Real performance gain comes from subsequent
635 			 * DP_POLL.
636 			 */
637 			BT_SET(pcp->pc_bitmap, fd);
638 			if (error != 0) {
639 				releasef(fd);
640 				break;
641 			}
642 			pdp->pd_fp = fp;
643 			pdp->pd_events |= pfdp->events;
644 			if (php != NULL) {
645 				if (pdp->pd_php == NULL) {
646 					pollhead_insert(php, pdp);
647 					pdp->pd_php = php;
648 				} else {
649 					if (pdp->pd_php != php) {
650 						pollhead_delete(pdp->pd_php,
651 						    pdp);
652 						pollhead_insert(php, pdp);
653 						pdp->pd_php = php;
654 					}
655 				}
656 
657 			}
658 			releasef(fd);
659 		} else {
660 			if (pdp == NULL) {
661 				continue;
662 			}
663 			ASSERT(pdp->pd_fd == fd);
664 			pdp->pd_fp = NULL;
665 			pdp->pd_events = 0;
666 			ASSERT(pdp->pd_thread == NULL);
667 			if (pdp->pd_php != NULL) {
668 				pollhead_delete(pdp->pd_php, pdp);
669 				pdp->pd_php = NULL;
670 			}
671 			BT_CLEAR(pcp->pc_bitmap, fd);
672 		}
673 	}
674 	mutex_exit(&pcp->pc_lock);
675 	mutex_enter(&dpep->dpe_lock);
676 	dpep->dpe_flag &= ~DP_WRITER_PRESENT;
677 	ASSERT(dpep->dpe_refcnt == 1);
678 	dpep->dpe_refcnt--;
679 	cv_broadcast(&dpep->dpe_cv);
680 	mutex_exit(&dpep->dpe_lock);
681 	kmem_free(pollfdp, uiosize);
682 	return (error);
683 }
684 
685 /*ARGSUSED*/
686 static int
687 dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
688 {
689 	timestruc_t	now;
690 	timestruc_t	rqtime;
691 	timestruc_t	*rqtp = NULL;
692 	int		timecheck = 0;
693 	minor_t 	minor;
694 	dp_entry_t	*dpep;
695 	pollcache_t	*pcp;
696 	int 		error = 0;
697 	STRUCT_DECL(dvpoll, dvpoll);
698 
699 	if (cmd == DP_POLL) {
700 		/* do this now, before we sleep on DP_WRITER_PRESENT below */
701 		timecheck = timechanged;
702 		gethrestime(&now);
703 	}
704 	minor = getminor(dev);
705 	mutex_enter(&devpoll_lock);
706 	ASSERT(minor < dptblsize);
707 	dpep = devpolltbl[minor];
708 	mutex_exit(&devpoll_lock);
709 	ASSERT(dpep != NULL);
710 	pcp = dpep->dpe_pcache;
711 	if (curproc->p_pid != pcp->pc_pid)
712 		return (EACCES);
713 
714 	mutex_enter(&dpep->dpe_lock);
715 	while ((dpep->dpe_flag & DP_WRITER_PRESENT) ||
716 	    (dpep->dpe_writerwait != 0)) {
717 		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
718 			mutex_exit(&dpep->dpe_lock);
719 			return (EINTR);
720 		}
721 	}
722 	dpep->dpe_refcnt++;
723 	mutex_exit(&dpep->dpe_lock);
724 
725 	switch (cmd) {
726 	case	DP_POLL:
727 	{
728 		pollstate_t *ps;
729 		nfds_t	nfds;
730 		int	fdcnt = 0;
731 		int	time_out;
732 		int	rval;
733 
734 		STRUCT_INIT(dvpoll, mode);
735 		error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll),
736 		    STRUCT_SIZE(dvpoll));
737 		if (error) {
738 			DP_REFRELE(dpep);
739 			return (EFAULT);
740 		}
741 
742 		time_out = STRUCT_FGET(dvpoll, dp_timeout);
743 		if (time_out > 0) {
744 			/*
745 			 * Determine the future time of the requested timeout.
746 			 */
747 			rqtp = &rqtime;
748 			rqtp->tv_sec = time_out / MILLISEC;
749 			rqtp->tv_nsec = (time_out % MILLISEC) * MICROSEC;
750 			timespecadd(rqtp, &now);
751 		}
752 
753 		if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) {
754 			/*
755 			 * We are just using DP_POLL to sleep, so
756 			 * we don't any of the devpoll apparatus.
757 			 * Do not check for signals if we have a zero timeout.
758 			 */
759 			DP_REFRELE(dpep);
760 			if (time_out == 0)
761 				return (0);
762 			mutex_enter(&curthread->t_delay_lock);
763 			while ((rval = cv_waituntil_sig(&curthread->t_delay_cv,
764 			    &curthread->t_delay_lock, rqtp, timecheck)) > 0)
765 				continue;
766 			mutex_exit(&curthread->t_delay_lock);
767 			return ((rval == 0)? EINTR : 0);
768 		}
769 
770 		/*
771 		 * XXX It'd be nice not to have to alloc each time.
772 		 * But it requires another per thread structure hook.
773 		 * Do it later if there is data suggest that.
774 		 */
775 		if ((ps = curthread->t_pollstate) == NULL) {
776 			curthread->t_pollstate = pollstate_create();
777 			ps = curthread->t_pollstate;
778 		}
779 		if (ps->ps_dpbufsize < nfds) {
780 			struct proc *p = ttoproc(curthread);
781 			/*
782 			 * The maximum size should be no large than
783 			 * current maximum open file count.
784 			 */
785 			mutex_enter(&p->p_lock);
786 			if (nfds >= p->p_fno_ctl) {
787 				mutex_exit(&p->p_lock);
788 				DP_REFRELE(dpep);
789 				return (EINVAL);
790 			}
791 			mutex_exit(&p->p_lock);
792 			kmem_free(ps->ps_dpbuf, sizeof (pollfd_t) *
793 			    ps->ps_dpbufsize);
794 			ps->ps_dpbuf = kmem_zalloc(sizeof (pollfd_t) *
795 			    nfds, KM_SLEEP);
796 			ps->ps_dpbufsize = nfds;
797 		}
798 
799 		mutex_enter(&pcp->pc_lock);
800 		for (;;) {
801 			pcp->pc_flag = 0;
802 			error = dp_pcache_poll(ps->ps_dpbuf, pcp, nfds, &fdcnt);
803 			if (fdcnt > 0 || error != 0)
804 				break;
805 
806 			/*
807 			 * A pollwake has happened since we polled cache.
808 			 */
809 			if (pcp->pc_flag & T_POLLWAKE)
810 				continue;
811 
812 			/*
813 			 * Sleep until we are notified, signalled, or timed out.
814 			 * Do not check for signals if we have a zero timeout.
815 			 */
816 			if (time_out == 0)	/* immediate timeout */
817 				break;
818 			rval = cv_waituntil_sig(&pcp->pc_cv, &pcp->pc_lock,
819 				rqtp, timecheck);
820 			/*
821 			 * If we were awakened by a signal or timeout
822 			 * then break the loop, else poll again.
823 			 */
824 			if (rval <= 0) {
825 				if (rval == 0)	/* signal */
826 					error = EINTR;
827 				break;
828 			}
829 		}
830 		mutex_exit(&pcp->pc_lock);
831 
832 		if (error == 0 && fdcnt > 0) {
833 			if (copyout(ps->ps_dpbuf, STRUCT_FGETP(dvpoll,
834 			    dp_fds), sizeof (pollfd_t) * fdcnt)) {
835 				DP_REFRELE(dpep);
836 				return (EFAULT);
837 			}
838 			*rvalp = fdcnt;
839 		}
840 		break;
841 	}
842 
843 	case	DP_ISPOLLED:
844 	{
845 		pollfd_t	pollfd;
846 		polldat_t	*pdp;
847 
848 		STRUCT_INIT(dvpoll, mode);
849 		error = copyin((caddr_t)arg, &pollfd, sizeof (pollfd_t));
850 		if (error) {
851 			DP_REFRELE(dpep);
852 			return (EFAULT);
853 		}
854 		mutex_enter(&pcp->pc_lock);
855 		if (pcp->pc_hash == NULL) {
856 			/*
857 			 * No Need to search because no poll fd
858 			 * has been cached.
859 			 */
860 			mutex_exit(&pcp->pc_lock);
861 			DP_REFRELE(dpep);
862 			return (0);
863 		}
864 		if (pollfd.fd < 0) {
865 			mutex_exit(&pcp->pc_lock);
866 			break;
867 		}
868 		pdp = pcache_lookup_fd(pcp, pollfd.fd);
869 		if ((pdp != NULL) && (pdp->pd_fd == pollfd.fd) &&
870 		    (pdp->pd_fp != NULL)) {
871 			pollfd.revents = pdp->pd_events;
872 			if (copyout(&pollfd, (caddr_t)arg, sizeof (pollfd_t))) {
873 				mutex_exit(&pcp->pc_lock);
874 				DP_REFRELE(dpep);
875 				return (EFAULT);
876 			}
877 			*rvalp = 1;
878 		}
879 		mutex_exit(&pcp->pc_lock);
880 		break;
881 	}
882 
883 	default:
884 		DP_REFRELE(dpep);
885 		return (EINVAL);
886 	}
887 	DP_REFRELE(dpep);
888 	return (error);
889 }
890 
891 /*ARGSUSED*/
892 static int
893 dppoll(dev_t dev, short events, int anyyet, short *reventsp,
894     struct pollhead **phpp)
895 {
896 	/*
897 	 * Polling on a /dev/poll fd is not fully supported yet.
898 	 */
899 	*reventsp = POLLERR;
900 	return (0);
901 }
902 
903 /*
904  * devpoll close should do enough clean up before the pollcache is deleted,
905  * i.e., it should ensure no one still references the pollcache later.
906  * There is no "permission" check in here. Any process having the last
907  * reference of this /dev/poll fd can close.
908  */
909 /*ARGSUSED*/
910 static int
911 dpclose(dev_t dev, int flag, int otyp, cred_t *credp)
912 {
913 	minor_t 	minor;
914 	dp_entry_t	*dpep;
915 	pollcache_t	*pcp;
916 	int		i;
917 	polldat_t	**hashtbl;
918 	polldat_t	*pdp;
919 
920 	minor = getminor(dev);
921 
922 	mutex_enter(&devpoll_lock);
923 	dpep = devpolltbl[minor];
924 	ASSERT(dpep != NULL);
925 	devpolltbl[minor] = NULL;
926 	mutex_exit(&devpoll_lock);
927 	pcp = dpep->dpe_pcache;
928 	ASSERT(pcp != NULL);
929 	/*
930 	 * At this point, no other lwp can access this pollcache via the
931 	 * /dev/poll fd. This pollcache is going away, so do the clean
932 	 * up without the pc_lock.
933 	 */
934 	hashtbl = pcp->pc_hash;
935 	for (i = 0; i < pcp->pc_hashsize; i++) {
936 		for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) {
937 			if (pdp->pd_php != NULL) {
938 				pollhead_delete(pdp->pd_php, pdp);
939 				pdp->pd_php = NULL;
940 				pdp->pd_fp = NULL;
941 			}
942 		}
943 	}
944 	/*
945 	 * pollwakeup() may still interact with this pollcache. Wait until
946 	 * it is done.
947 	 */
948 	mutex_enter(&pcp->pc_no_exit);
949 	ASSERT(pcp->pc_busy >= 0);
950 	while (pcp->pc_busy > 0)
951 		cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit);
952 	mutex_exit(&pcp->pc_no_exit);
953 	pcache_destroy(pcp);
954 	ASSERT(dpep->dpe_refcnt == 0);
955 	kmem_free(dpep, sizeof (dp_entry_t));
956 	return (0);
957 }
958