xref: /illumos-gate/usr/src/uts/common/io/signalfd.c (revision b3619796d92b4472acfed6b7c813f83cef335013)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2017 Joyent, Inc.
14  */
15 
16 /*
17  * Support for the signalfd facility, a Linux-borne facility for
18  * file descriptor-based synchronous signal consumption.
19  *
20  * As described on the signalfd(3C) man page, the general idea behind these
21  * file descriptors is that they can be used to synchronously consume signals
22  * via the read(2) syscall.  While that capability already exists with the
23  * sigwaitinfo(3C) function, signalfd holds an advantage since it is file
 * descriptor based: It is able to use the event facilities (poll(2),
 * /dev/poll, event ports) to notify interested parties when consumable signals
 * arrive.
26  *
 * The signalfd lifecycle begins when a process opens /dev/signalfd.  A minor
28  * will be allocated for them along with an associated signalfd_state_t struct.
29  * It is there where the mask of desired signals resides.
30  *
31  * Reading from the signalfd is straightforward and mimics the kernel behavior
32  * for sigtimedwait().  Signals continue to live on either the proc's p_sig, or
33  * thread's t_sig, member.  During a read operation, those which match the mask
34  * are consumed so they are no longer pending.
35  *
36  * The poll side is more complex.  Every time a signal is delivered, all of the
37  * signalfds on the process need to be examined in order to pollwake threads
38  * waiting for signal arrival.
39  *
40  * When a thread polling on a signalfd requires a pollhead, several steps must
41  * be taken to safely ensure the proper result.  A sigfd_proc_state_t is
42  * created for the calling process if it does not yet exist.  It is there where
43  * a list of sigfd_poll_waiter_t structures reside which associate pollheads to
44  * signalfd_state_t entries.  The sigfd_proc_state_t list is walked to find a
45  * sigfd_poll_waiter_t matching the signalfd_state_t which corresponds to the
46  * polled resource.  If one is found, it is reused.  Otherwise a new one is
47  * created, incrementing the refcount on the signalfd_state_t, and it is added
48  * to the sigfd_poll_waiter_t list.
49  *
50  * The complications imposed by fork(2) are why the pollhead is stored in the
51  * associated sigfd_poll_waiter_t instead of directly in the signalfd_state_t.
52  * More than one process can hold a reference to the signalfd at a time but
53  * arriving signals should wake only process-local pollers.  Additionally,
54  * signalfd_close is called only when the last referencing fd is closed, hiding
 * occurrences of preceding threads which released their references.  This
56  * necessitates reference counting on the signalfd_state_t so it is able to
57  * persist after close until all poll references have been cleansed.  Doing so
58  * ensures that blocked pollers which hold references to the signalfd_state_t
59  * will be able to do clean-up after the descriptor itself has been closed.
60  *
61  * When a signal arrives in a process polling on signalfd, signalfd_pollwake_cb
62  * is called via the pointer in sigfd_proc_state_t.  It will walk over the
63  * sigfd_poll_waiter_t entries present in the list, searching for any
64  * associated with a signalfd_state_t with a matching signal mask.  The
65  * approach of keeping the poller list in p_sigfd was chosen because a process
66  * is likely to use few signalfds relative to its total file descriptors.  It
67  * reduces the work required for each received signal.
68  *
69  * When matching sigfd_poll_waiter_t entries are encountered in the poller list
70  * during signalfd_pollwake_cb, they are dispatched into signalfd_wakeq to
71  * perform the pollwake.  This is due to a lock ordering conflict between
72  * signalfd_poll and signalfd_pollwake_cb.  The former acquires
73  * pollcache_t`pc_lock before proc_t`p_lock.  The latter (via sigtoproc)
 * reverses the order.  Deferring the pollwake into a taskq means it can be
75  * performed without proc_t`p_lock held, avoiding the deadlock.
76  *
77  * The sigfd_list is self-cleaning; as signalfd_pollwake_cb is called, the list
 * will clear out on its own.  Any per-process state which remains will be
 * cleaned up by the exit helper (signalfd_exit_helper).
80  *
81  * The structures associated with signalfd state are designed to operate
82  * correctly across fork, but there is one caveat that applies.  Using
 * fork-shared signalfd descriptors in conjunction with fork-shared caching poll
84  * descriptors (such as /dev/poll or event ports) will result in missed poll
85  * wake-ups.  This is caused by the pollhead identity of signalfd descriptors
86  * being dependent on the process they are polled from.  Because it has a
87  * thread-local cache, poll(2) is unaffected by this limitation.
88  *
89  * Lock ordering:
90  *
91  * 1. signalfd_lock
92  * 2. signalfd_state_t`sfd_lock
93  *
94  * 1. proc_t`p_lock (to walk p_sigfd)
95  * 2. signalfd_state_t`sfd_lock
96  * 2a. signalfd_lock (after sfd_lock is dropped, when sfd_count falls to 0)
97  */
98 
99 #include <sys/ddi.h>
100 #include <sys/sunddi.h>
101 #include <sys/signalfd.h>
102 #include <sys/conf.h>
103 #include <sys/sysmacros.h>
104 #include <sys/filio.h>
105 #include <sys/stat.h>
106 #include <sys/file.h>
107 #include <sys/schedctl.h>
108 #include <sys/id_space.h>
109 #include <sys/sdt.h>
110 #include <sys/disp.h>
111 #include <sys/taskq_impl.h>
112 
113 typedef struct signalfd_state signalfd_state_t;
114 
115 struct signalfd_state {
116 	list_node_t	sfd_list;		/* node in global list */
117 	kmutex_t	sfd_lock;		/* protects fields below */
118 	uint_t		sfd_count;		/* ref count */
119 	boolean_t	sfd_valid;		/* valid while open */
120 	k_sigset_t	sfd_set;		/* signals for this fd */
121 };
122 
123 typedef struct sigfd_poll_waiter {
124 	list_node_t		spw_list;
125 	signalfd_state_t	*spw_state;
126 	pollhead_t		spw_pollhd;
127 	taskq_ent_t		spw_taskent;
128 	short			spw_pollev;
129 } sigfd_poll_waiter_t;
130 
131 /*
132  * Protects global state in signalfd_devi, signalfd_minor, signalfd_softstate,
133  * and signalfd_state (including sfd_list field of members)
134  */
135 static kmutex_t		signalfd_lock;
136 static dev_info_t	*signalfd_devi;		/* device info */
137 static id_space_t	*signalfd_minor;	/* minor number arena */
138 static void		*signalfd_softstate;	/* softstate pointer */
139 static list_t		signalfd_state;		/* global list of state */
140 static taskq_t		*signalfd_wakeq;	/* pollwake event taskq */
141 
142 
143 static void
144 signalfd_state_enter_locked(signalfd_state_t *state)
145 {
146 	ASSERT(MUTEX_HELD(&state->sfd_lock));
147 	ASSERT(state->sfd_count > 0);
148 	VERIFY(state->sfd_valid == B_TRUE);
149 
150 	state->sfd_count++;
151 }
152 
153 static void
154 signalfd_state_release(signalfd_state_t *state, boolean_t force_invalidate)
155 {
156 	mutex_enter(&state->sfd_lock);
157 
158 	if (force_invalidate) {
159 		state->sfd_valid = B_FALSE;
160 	}
161 
162 	ASSERT(state->sfd_count > 0);
163 	if (state->sfd_count == 1) {
164 		VERIFY(state->sfd_valid == B_FALSE);
165 		mutex_exit(&state->sfd_lock);
166 		if (force_invalidate) {
167 			/*
168 			 * The invalidation performed in signalfd_close is done
169 			 * while signalfd_lock is held.
170 			 */
171 			ASSERT(MUTEX_HELD(&signalfd_lock));
172 			list_remove(&signalfd_state, state);
173 		} else {
174 			ASSERT(MUTEX_NOT_HELD(&signalfd_lock));
175 			mutex_enter(&signalfd_lock);
176 			list_remove(&signalfd_state, state);
177 			mutex_exit(&signalfd_lock);
178 		}
179 		kmem_free(state, sizeof (*state));
180 		return;
181 	}
182 	state->sfd_count--;
183 	mutex_exit(&state->sfd_lock);
184 }
185 
186 static sigfd_poll_waiter_t *
187 signalfd_wake_list_add(sigfd_proc_state_t *pstate, signalfd_state_t *state)
188 {
189 	list_t *lst = &pstate->sigfd_list;
190 	sigfd_poll_waiter_t *pw;
191 
192 	for (pw = list_head(lst); pw != NULL; pw = list_next(lst, pw)) {
193 		if (pw->spw_state == state)
194 			break;
195 	}
196 
197 	if (pw == NULL) {
198 		pw = kmem_zalloc(sizeof (*pw), KM_SLEEP);
199 
200 		mutex_enter(&state->sfd_lock);
201 		signalfd_state_enter_locked(state);
202 		pw->spw_state = state;
203 		mutex_exit(&state->sfd_lock);
204 		list_insert_head(lst, pw);
205 	}
206 	return (pw);
207 }
208 
209 static sigfd_poll_waiter_t *
210 signalfd_wake_list_rm(sigfd_proc_state_t *pstate, signalfd_state_t *state)
211 {
212 	list_t *lst = &pstate->sigfd_list;
213 	sigfd_poll_waiter_t *pw;
214 
215 	for (pw = list_head(lst); pw != NULL; pw = list_next(lst, pw)) {
216 		if (pw->spw_state == state) {
217 			break;
218 		}
219 	}
220 
221 	if (pw != NULL) {
222 		list_remove(lst, pw);
223 		pw->spw_state = NULL;
224 		signalfd_state_release(state, B_FALSE);
225 	}
226 
227 	return (pw);
228 }
229 
230 static void
231 signalfd_wake_list_cleanup(proc_t *p)
232 {
233 	sigfd_proc_state_t *pstate = p->p_sigfd;
234 	sigfd_poll_waiter_t *pw;
235 	list_t *lst;
236 
237 	ASSERT(MUTEX_HELD(&p->p_lock));
238 	ASSERT(pstate != NULL);
239 
240 	lst = &pstate->sigfd_list;
241 	while ((pw = list_remove_head(lst)) != NULL) {
242 		signalfd_state_t *state = pw->spw_state;
243 
244 		pw->spw_state = NULL;
245 		signalfd_state_release(state, B_FALSE);
246 
247 		pollwakeup(&pw->spw_pollhd, POLLERR);
248 		pollhead_clean(&pw->spw_pollhd);
249 		kmem_free(pw, sizeof (*pw));
250 	}
251 	list_destroy(lst);
252 
253 	p->p_sigfd = NULL;
254 	kmem_free(pstate, sizeof (*pstate));
255 }
256 
257 static void
258 signalfd_exit_helper(void)
259 {
260 	proc_t *p = curproc;
261 
262 	mutex_enter(&p->p_lock);
263 	signalfd_wake_list_cleanup(p);
264 	mutex_exit(&p->p_lock);
265 }
266 
267 /*
268  * Perform pollwake for a sigfd_poll_waiter_t entry.
269  * Thanks to the strict and conflicting lock orders required for signalfd_poll
270  * (pc_lock before p_lock) and signalfd_pollwake_cb (p_lock before pc_lock),
271  * this is relegated to a taskq to avoid deadlock.
272  */
273 static void
274 signalfd_wake_task(void *arg)
275 {
276 	sigfd_poll_waiter_t *pw = arg;
277 	signalfd_state_t *state = pw->spw_state;
278 
279 	pw->spw_state = NULL;
280 	signalfd_state_release(state, B_FALSE);
281 	pollwakeup(&pw->spw_pollhd, pw->spw_pollev);
282 	pollhead_clean(&pw->spw_pollhd);
283 	kmem_free(pw, sizeof (*pw));
284 }
285 
286 /*
287  * Called every time a signal is delivered to the process so that we can
288  * see if any signal stream needs a pollwakeup. We maintain a list of
289  * signal state elements so that we don't have to look at every file descriptor
290  * on the process. If necessary, a further optimization would be to maintain a
291  * signal set mask that is a union of all of the sets in the list so that
292  * we don't even traverse the list if the signal is not in one of the elements.
293  * However, since the list is likely to be very short, this is not currently
294  * being done. A more complex data structure might also be used, but it is
295  * unclear what that would be since each signal set needs to be checked for a
296  * match.
297  */
298 static void
299 signalfd_pollwake_cb(void *arg0, int sig)
300 {
301 	proc_t *p = (proc_t *)arg0;
302 	sigfd_proc_state_t *pstate = (sigfd_proc_state_t *)p->p_sigfd;
303 	list_t *lst;
304 	sigfd_poll_waiter_t *pw;
305 
306 	ASSERT(MUTEX_HELD(&p->p_lock));
307 	ASSERT(pstate != NULL);
308 
309 	lst = &pstate->sigfd_list;
310 	pw = list_head(lst);
311 	while (pw != NULL) {
312 		signalfd_state_t *state = pw->spw_state;
313 		sigfd_poll_waiter_t *next;
314 
315 		mutex_enter(&state->sfd_lock);
316 		if (!state->sfd_valid) {
317 			pw->spw_pollev = POLLERR;
318 		} else if (sigismember(&state->sfd_set, sig)) {
319 			pw->spw_pollev = POLLRDNORM | POLLIN;
320 		} else {
321 			mutex_exit(&state->sfd_lock);
322 			pw = list_next(lst, pw);
323 			continue;
324 		}
325 		mutex_exit(&state->sfd_lock);
326 
327 		/*
328 		 * Pull the sigfd_poll_waiter_t out of the list and dispatch it
329 		 * to perform a pollwake.  This cannot be done synchronously
330 		 * since signalfd_poll and signalfd_pollwake_cb have
331 		 * conflicting lock orders which can deadlock.
332 		 */
333 		next = list_next(lst, pw);
334 		list_remove(lst, pw);
335 		taskq_dispatch_ent(signalfd_wakeq, signalfd_wake_task, pw, 0,
336 		    &pw->spw_taskent);
337 		pw = next;
338 	}
339 }
340 
341 _NOTE(ARGSUSED(1))
342 static int
343 signalfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
344 {
345 	signalfd_state_t *state, **sstate;
346 	major_t major = getemajor(*devp);
347 	minor_t minor = getminor(*devp);
348 
349 	if (minor != SIGNALFDMNRN_SIGNALFD)
350 		return (ENXIO);
351 
352 	mutex_enter(&signalfd_lock);
353 
354 	minor = (minor_t)id_allocff(signalfd_minor);
355 	if (ddi_soft_state_zalloc(signalfd_softstate, minor) != DDI_SUCCESS) {
356 		id_free(signalfd_minor, minor);
357 		mutex_exit(&signalfd_lock);
358 		return (ENODEV);
359 	}
360 
361 	state = kmem_zalloc(sizeof (*state), KM_SLEEP);
362 	state->sfd_valid = B_TRUE;
363 	state->sfd_count = 1;
364 	list_insert_head(&signalfd_state, (void *)state);
365 
366 	sstate = ddi_get_soft_state(signalfd_softstate, minor);
367 	*sstate = state;
368 	*devp = makedevice(major, minor);
369 
370 	mutex_exit(&signalfd_lock);
371 
372 	return (0);
373 }
374 
375 /*
376  * Consume one signal from our set in a manner similar to sigtimedwait().
377  * The block parameter is used to control whether we wait for a signal or
378  * return immediately if no signal is pending. We use the thread's t_sigwait
379  * member in the same way that it is used by sigtimedwait.
380  *
381  * Return 0 if we successfully consumed a signal or an errno if not.
382  */
383 static int
384 consume_signal(k_sigset_t set, uio_t *uio, boolean_t block)
385 {
386 	k_sigset_t oldmask;
387 	kthread_t *t = curthread;
388 	klwp_t *lwp = ttolwp(t);
389 	proc_t *p = ttoproc(t);
390 	timespec_t now;
391 	timespec_t *rqtp = NULL;	/* null means blocking */
392 	int timecheck = 0;
393 	int ret = 0;
394 	k_siginfo_t info, *infop;
395 	signalfd_siginfo_t ssi, *ssp = &ssi;
396 
397 	if (block == B_FALSE) {
398 		timecheck = timechanged;
399 		gethrestime(&now);
400 		rqtp = &now;	/* non-blocking check for pending signals */
401 	}
402 
403 	t->t_sigwait = set;
404 
405 	mutex_enter(&p->p_lock);
406 	/*
407 	 * set the thread's signal mask to unmask those signals in the
408 	 * specified set.
409 	 */
410 	schedctl_finish_sigblock(t);
411 	oldmask = t->t_hold;
412 	sigdiffset(&t->t_hold, &t->t_sigwait);
413 
414 	/*
415 	 * Based on rqtp, wait indefinitely until we take a signal in our set
416 	 * or return immediately if there are no signals pending from our set.
417 	 */
418 	while ((ret = cv_waituntil_sig(&t->t_delay_cv, &p->p_lock, rqtp,
419 	    timecheck)) > 0)
420 		continue;
421 
422 	/* Restore thread's signal mask to its previous value. */
423 	t->t_hold = oldmask;
424 	t->t_sig_check = 1;	/* so post_syscall sees new t_hold mask */
425 
426 	if (ret == -1) {
427 		/* no signals pending */
428 		mutex_exit(&p->p_lock);
429 		sigemptyset(&t->t_sigwait);
430 		return (EAGAIN);	/* no signals pending */
431 	}
432 
433 	/* Don't bother with signal if it is not in request set. */
434 	if (lwp->lwp_cursig == 0 ||
435 	    !sigismember(&t->t_sigwait, lwp->lwp_cursig)) {
436 		mutex_exit(&p->p_lock);
437 		/*
438 		 * lwp_cursig is zero if pokelwps() awakened cv_wait_sig().
439 		 * This happens if some other thread in this process called
440 		 * forkall() or exit().
441 		 */
442 		sigemptyset(&t->t_sigwait);
443 		return (EINTR);
444 	}
445 
446 	if (lwp->lwp_curinfo) {
447 		infop = &lwp->lwp_curinfo->sq_info;
448 	} else {
449 		infop = &info;
450 		bzero(infop, sizeof (info));
451 		infop->si_signo = lwp->lwp_cursig;
452 		infop->si_code = SI_NOINFO;
453 	}
454 
455 	lwp->lwp_ru.nsignals++;
456 
457 	DTRACE_PROC2(signal__clear, int, ret, ksiginfo_t *, infop);
458 	lwp->lwp_cursig = 0;
459 	lwp->lwp_extsig = 0;
460 	mutex_exit(&p->p_lock);
461 
462 	/* Convert k_siginfo into external, datamodel independent, struct. */
463 	bzero(ssp, sizeof (*ssp));
464 	ssp->ssi_signo = infop->si_signo;
465 	ssp->ssi_errno = infop->si_errno;
466 	ssp->ssi_code = infop->si_code;
467 	ssp->ssi_pid = infop->si_pid;
468 	ssp->ssi_uid = infop->si_uid;
469 	ssp->ssi_fd = infop->si_fd;
470 	ssp->ssi_band = infop->si_band;
471 	ssp->ssi_trapno = infop->si_trapno;
472 	ssp->ssi_status = infop->si_status;
473 	ssp->ssi_utime = infop->si_utime;
474 	ssp->ssi_stime = infop->si_stime;
475 	ssp->ssi_addr = (uint64_t)(intptr_t)infop->si_addr;
476 
477 	ret = uiomove(ssp, sizeof (*ssp), UIO_READ, uio);
478 
479 	if (lwp->lwp_curinfo) {
480 		siginfofree(lwp->lwp_curinfo);
481 		lwp->lwp_curinfo = NULL;
482 	}
483 	sigemptyset(&t->t_sigwait);
484 	return (ret);
485 }
486 
487 /*
488  * This is similar to sigtimedwait. Based on the fd mode we may wait until a
489  * signal within our specified set is posted. We consume as many available
490  * signals within our set as we can.
491  */
492 _NOTE(ARGSUSED(2))
493 static int
494 signalfd_read(dev_t dev, uio_t *uio, cred_t *cr)
495 {
496 	signalfd_state_t *state, **sstate;
497 	minor_t minor = getminor(dev);
498 	boolean_t block = B_TRUE;
499 	k_sigset_t set;
500 	boolean_t got_one = B_FALSE;
501 	int res;
502 
503 	if (uio->uio_resid < sizeof (signalfd_siginfo_t))
504 		return (EINVAL);
505 
506 	sstate = ddi_get_soft_state(signalfd_softstate, minor);
507 	state = *sstate;
508 
509 	if (uio->uio_fmode & (FNDELAY|FNONBLOCK))
510 		block = B_FALSE;
511 
512 	mutex_enter(&state->sfd_lock);
513 	set = state->sfd_set;
514 	mutex_exit(&state->sfd_lock);
515 
516 	if (sigisempty(&set))
517 		return (set_errno(EINVAL));
518 
519 	do  {
520 		res = consume_signal(set, uio, block);
521 
522 		if (res == 0) {
523 			/*
524 			 * After consuming one signal, do not block while
525 			 * trying to consume more.
526 			 */
527 			got_one = B_TRUE;
528 			block = B_FALSE;
529 
530 			/*
531 			 * Refresh the matching signal set in case it was
532 			 * updated during the wait.
533 			 */
534 			mutex_enter(&state->sfd_lock);
535 			set = state->sfd_set;
536 			mutex_exit(&state->sfd_lock);
537 			if (sigisempty(&set))
538 				break;
539 		}
540 	} while (res == 0 && uio->uio_resid >= sizeof (signalfd_siginfo_t));
541 
542 	if (got_one)
543 		res = 0;
544 
545 	return (res);
546 }
547 
548 /*
549  * If ksigset_t's were a single word, we would do:
550  *      return (((p->p_sig | t->t_sig) & set) & fillset);
551  */
552 static int
553 signalfd_sig_pending(proc_t *p, kthread_t *t, k_sigset_t set)
554 {
555 	return (((p->p_sig.__sigbits[0] | t->t_sig.__sigbits[0]) &
556 	    set.__sigbits[0]) |
557 	    ((p->p_sig.__sigbits[1] | t->t_sig.__sigbits[1]) &
558 	    set.__sigbits[1]) |
559 	    (((p->p_sig.__sigbits[2] | t->t_sig.__sigbits[2]) &
560 	    set.__sigbits[2]) & FILLSET2));
561 }
562 
563 static int
564 signalfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
565     struct pollhead **phpp)
566 {
567 	signalfd_state_t *state, **sstate;
568 	minor_t minor = getminor(dev);
569 	kthread_t *t = curthread;
570 	proc_t *p = ttoproc(t);
571 	short revents = 0;
572 
573 	sstate = ddi_get_soft_state(signalfd_softstate, minor);
574 	state = *sstate;
575 
576 	mutex_enter(&state->sfd_lock);
577 
578 	if (signalfd_sig_pending(p, t, state->sfd_set) != 0)
579 		revents |= POLLRDNORM | POLLIN;
580 
581 	mutex_exit(&state->sfd_lock);
582 
583 	*reventsp = revents & events;
584 	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
585 		sigfd_proc_state_t *pstate;
586 		sigfd_poll_waiter_t *pw;
587 
588 		/*
589 		 * Enable pollwakeup handling.
590 		 */
591 		mutex_enter(&p->p_lock);
592 		if ((pstate = (sigfd_proc_state_t *)p->p_sigfd) == NULL) {
593 
594 			mutex_exit(&p->p_lock);
595 			pstate = kmem_zalloc(sizeof (*pstate), KM_SLEEP);
596 			list_create(&pstate->sigfd_list,
597 			    sizeof (sigfd_poll_waiter_t),
598 			    offsetof(sigfd_poll_waiter_t, spw_list));
599 			pstate->sigfd_pollwake_cb = signalfd_pollwake_cb;
600 
601 			/* Check again, after blocking for the alloc. */
602 			mutex_enter(&p->p_lock);
603 			if (p->p_sigfd == NULL) {
604 				p->p_sigfd = pstate;
605 			} else {
606 				/* someone beat us to it */
607 				list_destroy(&pstate->sigfd_list);
608 				kmem_free(pstate, sizeof (*pstate));
609 				pstate = p->p_sigfd;
610 			}
611 		}
612 
613 		pw = signalfd_wake_list_add(pstate, state);
614 		*phpp = &pw->spw_pollhd;
615 		mutex_exit(&p->p_lock);
616 	}
617 
618 	return (0);
619 }
620 
621 _NOTE(ARGSUSED(4))
622 static int
623 signalfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
624 {
625 	signalfd_state_t *state, **sstate;
626 	minor_t minor = getminor(dev);
627 	sigset_t mask;
628 
629 	sstate = ddi_get_soft_state(signalfd_softstate, minor);
630 	state = *sstate;
631 
632 	switch (cmd) {
633 	case SIGNALFDIOC_MASK:
634 		if (ddi_copyin((caddr_t)arg, (caddr_t)&mask, sizeof (sigset_t),
635 		    md) != 0)
636 			return (set_errno(EFAULT));
637 
638 		mutex_enter(&state->sfd_lock);
639 		sigutok(&mask, &state->sfd_set);
640 		mutex_exit(&state->sfd_lock);
641 
642 		return (0);
643 
644 	default:
645 		break;
646 	}
647 
648 	return (ENOTTY);
649 }
650 
651 _NOTE(ARGSUSED(1))
652 static int
653 signalfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
654 {
655 	signalfd_state_t *state, **sstate;
656 	sigfd_poll_waiter_t *pw = NULL;
657 	minor_t minor = getminor(dev);
658 	proc_t *p = curproc;
659 
660 	sstate = ddi_get_soft_state(signalfd_softstate, minor);
661 	state = *sstate;
662 
663 	/* Make sure state is removed from this proc's pollwake list. */
664 	mutex_enter(&p->p_lock);
665 	if (p->p_sigfd != NULL) {
666 		sigfd_proc_state_t *pstate = p->p_sigfd;
667 
668 		pw = signalfd_wake_list_rm(pstate, state);
669 		if (list_is_empty(&pstate->sigfd_list)) {
670 			signalfd_wake_list_cleanup(p);
671 		}
672 	}
673 	mutex_exit(&p->p_lock);
674 
675 	if (pw != NULL) {
676 		pollwakeup(&pw->spw_pollhd, POLLERR);
677 		pollhead_clean(&pw->spw_pollhd);
678 		kmem_free(pw, sizeof (*pw));
679 	}
680 
681 	mutex_enter(&signalfd_lock);
682 
683 	*sstate = NULL;
684 	ddi_soft_state_free(signalfd_softstate, minor);
685 	id_free(signalfd_minor, minor);
686 
687 	signalfd_state_release(state, B_TRUE);
688 
689 	mutex_exit(&signalfd_lock);
690 
691 	return (0);
692 }
693 
694 static int
695 signalfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
696 {
697 	if (cmd != DDI_ATTACH || signalfd_devi != NULL)
698 		return (DDI_FAILURE);
699 
700 	mutex_enter(&signalfd_lock);
701 
702 	signalfd_minor = id_space_create("signalfd_minor", 1, L_MAXMIN32 + 1);
703 	if (signalfd_minor == NULL) {
704 		cmn_err(CE_WARN, "signalfd couldn't create id space");
705 		mutex_exit(&signalfd_lock);
706 		return (DDI_FAILURE);
707 	}
708 
709 	if (ddi_soft_state_init(&signalfd_softstate,
710 	    sizeof (signalfd_state_t *), 0) != 0) {
711 		cmn_err(CE_WARN, "signalfd failed to create soft state");
712 		id_space_destroy(signalfd_minor);
713 		mutex_exit(&signalfd_lock);
714 		return (DDI_FAILURE);
715 	}
716 
717 	if (ddi_create_minor_node(devi, "signalfd", S_IFCHR,
718 	    SIGNALFDMNRN_SIGNALFD, DDI_PSEUDO, 0) == DDI_FAILURE) {
719 		cmn_err(CE_NOTE, "/dev/signalfd couldn't create minor node");
720 		ddi_soft_state_fini(&signalfd_softstate);
721 		id_space_destroy(signalfd_minor);
722 		mutex_exit(&signalfd_lock);
723 		return (DDI_FAILURE);
724 	}
725 
726 	ddi_report_dev(devi);
727 	signalfd_devi = devi;
728 
729 	sigfd_exit_helper = signalfd_exit_helper;
730 
731 	list_create(&signalfd_state, sizeof (signalfd_state_t),
732 	    offsetof(signalfd_state_t, sfd_list));
733 
734 	signalfd_wakeq = taskq_create("signalfd_wake", 1, minclsyspri,
735 	    0, INT_MAX, TASKQ_PREPOPULATE);
736 
737 	mutex_exit(&signalfd_lock);
738 
739 	return (DDI_SUCCESS);
740 }
741 
742 _NOTE(ARGSUSED(0))
743 static int
744 signalfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
745 {
746 	switch (cmd) {
747 	case DDI_DETACH:
748 		break;
749 
750 	default:
751 		return (DDI_FAILURE);
752 	}
753 
754 	mutex_enter(&signalfd_lock);
755 
756 	if (!list_is_empty(&signalfd_state)) {
757 		/*
758 		 * There are dangling poll waiters holding signalfd_state_t
759 		 * entries on the global list.  Detach is not possible until
760 		 * they purge themselves.
761 		 */
762 		mutex_exit(&signalfd_lock);
763 		return (DDI_FAILURE);
764 	}
765 	list_destroy(&signalfd_state);
766 
767 	/*
768 	 * With no remaining entries in the signalfd_state list, the wake taskq
769 	 * should be empty with no possibility for new entries.
770 	 */
771 	taskq_destroy(signalfd_wakeq);
772 
773 	id_space_destroy(signalfd_minor);
774 
775 	ddi_remove_minor_node(signalfd_devi, NULL);
776 	signalfd_devi = NULL;
777 	sigfd_exit_helper = NULL;
778 
779 	ddi_soft_state_fini(&signalfd_softstate);
780 	mutex_exit(&signalfd_lock);
781 
782 	return (DDI_SUCCESS);
783 }
784 
785 _NOTE(ARGSUSED(0))
786 static int
787 signalfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
788 {
789 	int error;
790 
791 	switch (infocmd) {
792 	case DDI_INFO_DEVT2DEVINFO:
793 		*result = (void *)signalfd_devi;
794 		error = DDI_SUCCESS;
795 		break;
796 	case DDI_INFO_DEVT2INSTANCE:
797 		*result = (void *)0;
798 		error = DDI_SUCCESS;
799 		break;
800 	default:
801 		error = DDI_FAILURE;
802 	}
803 	return (error);
804 }
805 
806 static struct cb_ops signalfd_cb_ops = {
807 	signalfd_open,		/* open */
808 	signalfd_close,		/* close */
809 	nulldev,		/* strategy */
810 	nulldev,		/* print */
811 	nodev,			/* dump */
812 	signalfd_read,		/* read */
813 	nodev,			/* write */
814 	signalfd_ioctl,		/* ioctl */
815 	nodev,			/* devmap */
816 	nodev,			/* mmap */
817 	nodev,			/* segmap */
818 	signalfd_poll,		/* poll */
819 	ddi_prop_op,		/* cb_prop_op */
820 	0,			/* streamtab  */
821 	D_NEW | D_MP		/* Driver compatibility flag */
822 };
823 
824 static struct dev_ops signalfd_ops = {
825 	DEVO_REV,		/* devo_rev */
826 	0,			/* refcnt */
827 	signalfd_info,		/* get_dev_info */
828 	nulldev,		/* identify */
829 	nulldev,		/* probe */
830 	signalfd_attach,	/* attach */
831 	signalfd_detach,	/* detach */
832 	nodev,			/* reset */
833 	&signalfd_cb_ops,	/* driver operations */
834 	NULL,			/* bus operations */
835 	nodev,			/* dev power */
836 	ddi_quiesce_not_needed,	/* quiesce */
837 };
838 
839 static struct modldrv modldrv = {
840 	&mod_driverops,		/* module type (this is a pseudo driver) */
841 	"signalfd support",	/* name of module */
842 	&signalfd_ops,		/* driver ops */
843 };
844 
845 static struct modlinkage modlinkage = {
846 	MODREV_1,
847 	(void *)&modldrv,
848 	NULL
849 };
850 
851 int
852 _init(void)
853 {
854 	return (mod_install(&modlinkage));
855 }
856 
857 int
858 _info(struct modinfo *modinfop)
859 {
860 	return (mod_info(&modlinkage, modinfop));
861 }
862 
863 int
864 _fini(void)
865 {
866 	return (mod_remove(&modlinkage));
867 }
868