xref: /illumos-gate/usr/src/uts/common/io/signalfd.c (revision fc910014e8a32a65612105835a10995f2c13d942)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms of version
5  * 1.0 of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2017 Joyent, Inc.
14  * Copyright 2023 Oxide Computer Company
15  */
16 
17 /*
18  * Support for the signalfd facility, a Linux-borne facility for
19  * file descriptor-based synchronous signal consumption.
20  *
21  * As described on the signalfd(3C) man page, the general idea behind these
22  * file descriptors is that they can be used to synchronously consume signals
23  * via the read(2) syscall.  While that capability already exists with the
24  * sigwaitinfo(3C) function, signalfd holds an advantage since it is file
25  * descriptor based: It is able to use the event facilities (poll(2), /dev/poll,
26  * event ports) to notify interested parties when consumable signals arrive.
27  *
28  * The signalfd lifecycle begins when a process opens /dev/signalfd.  A minor
29  * will be allocated for them along with an associated signalfd_state_t struct.
30  * It is there where the mask of desired signals resides.
31  *
32  * Reading from the signalfd is straightforward and mimics the kernel behavior
33  * for sigtimedwait().  Signals continue to live on either the proc's p_sig, or
34  * thread's t_sig, member.  During a read operation, those which match the mask
35  * are consumed so they are no longer pending.
36  *
37  * The poll side is more complex.  Every time a signal is delivered, all of the
38  * signalfds on the process need to be examined in order to pollwake threads
39  * waiting for signal arrival.
40  *
41  * When a thread polling on a signalfd requires a pollhead, several steps must
42  * be taken to safely ensure the proper result.  A sigfd_proc_state_t is
43  * created for the calling process if it does not yet exist.  It is there where
44  * a list of signalfd_poller_t structures reside which associate pollheads to
45  * signalfd_state_t entries.  The sigfd_proc_state_t list is walked to find any
46  * signalfd_poller_t which is both associated with the polling process and
47  * corresponds to the signalfd resource being polled.  If none matching those
48  * conditions is found, then a new one with the appropriate associations is
49  * created.
50  *
51  * The complications imposed by fork(2) are why the pollhead is stored in the
52  * associated signalfd_poller_t instead of directly in the signalfd_state_t.
53  * More than one process can hold a reference to the signalfd at a time but
54  * arriving signals should wake only process-local pollers.  Additionally,
55  * signalfd_close is called only when the last referencing fd is closed, hiding
56  * occurrences of preceding threads which released their references.  This
57  * necessitates a pollhead for each signalfd/process pair when being polled.
58  * Doing so ensures that those pollheads will live long enough for the greater
59  * poll machinery to act upon them without risk of use-after-free.  When a
60  * signalfd is closed, existing signalfd_poller_t instances are dissociated from
61  * their respective processes, causing pollwake() calls for any blocked pollers.
62  *
63  * When a signal arrives in a process polling on signalfd, signalfd_pollwake_cb
64  * is called via the pointer in sigfd_proc_state_t.  It will walk over the
65  * signalfd_poller_t entries present in the list, searching for any possessing a
66  * signal mask which matches the incoming signal.  (Changes to the signal mask
67  * held in signalfd_state_t are propagated to the signalfd_poller_t instance to
68  * avoid the need for additional locks during the callback.) The approach of
69  * keeping the poller list in p_sigfd was chosen because a process is likely to
70  * use few signalfds relative to its total file descriptors.  It reduces the
71  * work required for each received signal.
72  *
73  * When matching signalfd_poller_t entries are encountered in the poller list
74  * during signalfd_pollwake_cb, they are dispatched into signalfd_wakeq to
75  * perform the pollwake.  This is due to a lock ordering conflict between
76  * signalfd_poll and signalfd_pollwake_cb.  The former acquires
77  * pollcache_t`pc_lock before proc_t`p_lock.  The latter (via sigtoproc)
78  * reverses the order.  Deferring the pollwake into a taskq means it can be
79  * performed without proc_t`p_lock held, avoiding the deadlock.
80  *
81  * Poller entries in sigfd_proc_state_t`sigfd_list are cleaned up under two
82  * different circumstances.  When a signalfd instance is being closed, it will
83  * dissociate all of its remaining signalfd_poller_t instances from their
84  * polling processes.  When a process (which polled on signalfd instance(s)
85  * which have not yet been closed) exits, the exit helper (signalfd_exit_helper)
86  * is called, and it dissociates all signalfd_poller_t instances tied to the
87  * exiting process.
88  *
89  * The structures associated with signalfd state are designed to operate
90  * correctly across fork, but there is one caveat that applies.  Using
91  * fork-shared signalfd descriptors in conjunction with fork-shared caching poll
92  * descriptors (such as /dev/poll or event ports) will result in missed poll
93  * wake-ups.  This is caused by the pollhead identity of signalfd descriptors
94  * being dependent on the process they are polled from.  Because it has a
95  * thread-local cache, poll(2) is unaffected by this limitation.
96  *
97  * Lock ordering:
98  *
99  * Calling signalfd_poll:
100  * 1. pollcache_t`pc_lock
101  * 2. signalfd_state_t`sfd_lock
102  * 3. proc_t`p_lock
103  *
104  * Signal delivery, waking a pollhead:
105  * 1. proc_t`p_lock
106  * 2. signalfd_poller_t`sp_lock
107  *
108  * Process exit, cleaning up signalfd pollers:
109  * 1. proc_t`p_lock
110  * 2. signalfd_poller_t`sp_lock
111  *
112  * Waking a pollhead, from taskq:
113  * 1. signalfd_poller_t`sp_lock
114  * ... Disjoint from signalfd_poller_t`sp_lock hold ...
115  * 1. pollcache_t`pc_lock
116  *
117  * Closing signalfd, dissociating pollers:
118  * 1. signalfd_state_t`sfd_lock
119  * 2. pidlock
120  * 3. proc_t`p_lock
121  *
122  */
123 
124 #include <sys/ddi.h>
125 #include <sys/sunddi.h>
126 #include <sys/signalfd.h>
127 #include <sys/conf.h>
128 #include <sys/sysmacros.h>
129 #include <sys/filio.h>
130 #include <sys/stat.h>
131 #include <sys/file.h>
132 #include <sys/schedctl.h>
133 #include <sys/id_space.h>
134 #include <sys/sdt.h>
135 #include <sys/disp.h>
136 #include <sys/taskq_impl.h>
137 #include <sys/condvar.h>
138 #include <sys/stdbool.h>
139 
140 /* Per-instance signalfd device state: */
141 typedef struct signalfd_state {
142 	kmutex_t	sfd_lock;	/* protects fields below */
143 	list_t		sfd_pollers;
144 	k_sigset_t	sfd_mask;	/* signal mask for this instance */
145 	minor_t		sfd_minor;	/* dev minor, fixed at creation */
146 } signalfd_state_t;
147 
148 typedef struct signalfd_poller {
149 	/*
150 	 * List node referenced by containing signalfd_state_t
151 	 * Protected by signalfd_state`sfd_lock
152 	 */
153 	list_node_t	sp_state_node;
154 
155 	/*
156 	 * List node referenced by containing sigfd_proc_state_t
157 	 * Protected by proc_t`plock
158 	 */
159 	list_node_t	sp_proc_node;
160 
161 	pollhead_t	sp_pollhead;
162 
163 	/*
164 	 * The signalfd_state_t to which this poller is associated.
165 	 * It remains fixed after its initialization at creation time.
166 	 */
167 	signalfd_state_t	*sp_state;
168 
169 	/*
170 	 * The proc_t to which this poller is associated.
171 	 * It is initialized under the protection of proc_t`p_lock when this
172 	 * poller is created.  It is NULLed out, again under the protection of
173 	 * proc_t`p_lock, when the poller is dissociated from the process.
174 	 */
175 	proc_t		*sp_proc;
176 
177 	kmutex_t	sp_lock;	/* protects fields below */
178 	kcondvar_t	sp_cv;		/* CV for cleaning up */
179 	short		sp_pollev;	/* Event(s) pending delivery */
180 	bool		sp_pending;	/* pollwakeup() via taskq in progress */
181 	taskq_ent_t	sp_taskent;	/* pollwakeup() dispatch taskq */
182 	k_sigset_t	sp_mask;	/* signal match mask */
183 } signalfd_poller_t;
184 
185 static dev_info_t	*signalfd_devi;		/* device info */
186 static id_space_t	*signalfd_minors;	/* minor number arena */
187 static void		*signalfd_softstate;	/* softstate pointer */
188 static taskq_t		*signalfd_wakeq;	/* pollwake event taskq */
189 
190 static void
191 signalfd_proc_clean(proc_t *p)
192 {
193 	sigfd_proc_state_t *pstate = p->p_sigfd;
194 
195 	ASSERT(MUTEX_HELD(&p->p_lock));
196 	ASSERT(pstate != NULL);
197 	VERIFY(list_is_empty(&pstate->sigfd_list));
198 
199 	p->p_sigfd = NULL;
200 	list_destroy(&pstate->sigfd_list);
201 	kmem_free(pstate, sizeof (*pstate));
202 }
203 
204 static void
205 signalfd_wake_task(void *arg)
206 {
207 	signalfd_poller_t *sp = arg;
208 
209 	mutex_enter(&sp->sp_lock);
210 	VERIFY(sp->sp_pollev != 0);
211 	VERIFY(sp->sp_pending);
212 	do {
213 		const short pollev = sp->sp_pollev;
214 		const bool is_err = (pollev & POLLERR) != 0;
215 		sp->sp_pollev = 0;
216 		mutex_exit(&sp->sp_lock);
217 
218 		/*
219 		 * Actions against the pollhead and associated pollcache(s) are
220 		 * taken without signalfd_poller_t`sp_lock held, since the chain
221 		 * of dependencies through pollcache_t`pc_lock and
222 		 * signalfd_state_t`sfd_lock form a potential for deadlock.
223 		 */
224 		pollwakeup(&sp->sp_pollhead, pollev);
225 		if (is_err) {
226 			pollhead_clean(&sp->sp_pollhead);
227 		}
228 
229 		mutex_enter(&sp->sp_lock);
230 		/*
231 		 * Once pollhead/pollcache actions are complete, check for newly
232 		 * queued events which could have appeared in the mean time.  We
233 		 * can bail immediately if POLLER was being delivered, since the
234 		 * underlying resource is undergoing clean-up.
235 		 */
236 		if (is_err) {
237 			break;
238 		}
239 	} while (sp->sp_pollev != 0);
240 
241 	/*
242 	 * Indicate that wake task processing is complete.
243 	 *
244 	 * Wake any thread waiting for event delivery to complete if this poller
245 	 * is being torn down.
246 	 */
247 	sp->sp_pending = false;
248 	cv_signal(&sp->sp_cv);
249 	mutex_exit(&sp->sp_lock);
250 }
251 
252 static void
253 signalfd_poller_wake(signalfd_poller_t *sp, short ev)
254 {
255 	ASSERT(MUTEX_HELD(&sp->sp_lock));
256 
257 	sp->sp_pollev |= ev;
258 	if (!sp->sp_pending) {
259 		sp->sp_pending = true;
260 		taskq_dispatch_ent(signalfd_wakeq, signalfd_wake_task, sp, 0,
261 		    &sp->sp_taskent);
262 	}
263 }
264 
265 /*
266  * Notification callback associated to processes which are being polled for
267  * signalfd events.  Called by sigtoproc().
268  */
269 static void
270 signalfd_pollwake_cb(void *arg0, int sig)
271 {
272 	proc_t *p = (proc_t *)arg0;
273 	sigfd_proc_state_t *pstate = (sigfd_proc_state_t *)p->p_sigfd;
274 
275 	ASSERT(MUTEX_HELD(&p->p_lock));
276 	ASSERT(pstate != NULL);
277 
278 	list_t *pollers = &pstate->sigfd_list;
279 	for (signalfd_poller_t *sp = list_head(pollers); sp != NULL;
280 	    sp = list_next(pollers, sp)) {
281 		mutex_enter(&sp->sp_lock);
282 		if (sigismember(&sp->sp_mask, sig)) {
283 			signalfd_poller_wake(sp, POLLRDNORM | POLLIN);
284 		}
285 		mutex_exit(&sp->sp_lock);
286 	}
287 }
288 
289 /*
290  * Get the sigfd_proc_state_t for a given process, allocating one if necessary.
291  *
292  * Must be called with p_lock held, which may be dropped and reacquired during
293  * the allocation.
294  */
295 static sigfd_proc_state_t *
296 signalfd_proc_pstate(proc_t *p)
297 {
298 	ASSERT(MUTEX_HELD(&p->p_lock));
299 
300 	sigfd_proc_state_t *pstate = p->p_sigfd;
301 	if (pstate == NULL) {
302 		mutex_exit(&p->p_lock);
303 		pstate = kmem_zalloc(sizeof (*pstate), KM_SLEEP);
304 		list_create(&pstate->sigfd_list,
305 		    sizeof (signalfd_poller_t),
306 		    offsetof(signalfd_poller_t, sp_proc_node));
307 		pstate->sigfd_pollwake_cb = signalfd_pollwake_cb;
308 
309 		/* Check again, after blocking for the alloc. */
310 		mutex_enter(&p->p_lock);
311 		if (p->p_sigfd == NULL) {
312 			p->p_sigfd = pstate;
313 		} else {
314 			/* Someone beat us to it */
315 			list_destroy(&pstate->sigfd_list);
316 			kmem_free(pstate, sizeof (*pstate));
317 			pstate = p->p_sigfd;
318 		}
319 	}
320 
321 	return (pstate);
322 }
323 
324 static signalfd_poller_t *
325 signalfd_poller_associate(signalfd_state_t *state, proc_t *p)
326 {
327 	sigfd_proc_state_t *pstate;
328 	list_t *pollers;
329 	signalfd_poller_t *sp;
330 
331 	ASSERT(MUTEX_HELD(&state->sfd_lock));
332 
333 	mutex_enter(&p->p_lock);
334 
335 	pstate = signalfd_proc_pstate(p);
336 	pollers = &pstate->sigfd_list;
337 
338 	/*
339 	 * Check if there is already a signalfd_poller_t allocated for this
340 	 * signalfd_state_t/proc_t pair.
341 	 */
342 	for (sp = list_head(pollers); sp != NULL; sp = list_next(pollers, sp)) {
343 		if (sp->sp_state == state) {
344 			mutex_exit(&p->p_lock);
345 			return (sp);
346 		}
347 	}
348 
349 	/*
350 	 * No existing poller found, so allocate one. Since sfd_lock remains
351 	 * held, there is no risk of some other operation racing with us to
352 	 * create such a poller.
353 	 */
354 	mutex_exit(&p->p_lock);
355 
356 	sp = kmem_zalloc(sizeof (*sp), KM_SLEEP);
357 	mutex_init(&sp->sp_lock, NULL, MUTEX_DEFAULT, NULL);
358 	cv_init(&sp->sp_cv, NULL, CV_DEFAULT, NULL);
359 	sigorset(&sp->sp_mask, &state->sfd_mask);
360 	sp->sp_state = state;
361 	sp->sp_proc = p;
362 
363 	mutex_enter(&p->p_lock);
364 	/*
365 	 * Fetch the pstate again, since it could have been freed or reallocated
366 	 * in the time p_lock was dropped.
367 	 */
368 	pstate = signalfd_proc_pstate(p);
369 
370 	list_insert_tail(&pstate->sigfd_list, sp);
371 	list_insert_tail(&state->sfd_pollers, sp);
372 	mutex_exit(&p->p_lock);
373 
374 	return (sp);
375 }
376 
377 static void
378 signalfd_pollers_dissociate(signalfd_state_t *state)
379 {
380 	ASSERT(MUTEX_HELD(&state->sfd_lock));
381 
382 	mutex_enter(&pidlock);
383 
384 	signalfd_poller_t *sp;
385 	list_t *pollers = &state->sfd_pollers;
386 	for (sp = list_head(pollers); sp != NULL; sp = list_next(pollers, sp)) {
387 		proc_t *p = sp->sp_proc;
388 
389 		if (p == NULL) {
390 			continue;
391 		}
392 
393 		/*
394 		 * Even if the process in question is racing us to clean-up in
395 		 * proc_exit(), it will be unable to exit (and free itself)
396 		 * since we hold pidlock.  This prevents us from otherwise
397 		 * attempting to lock a p_lock which was freed.
398 		 */
399 		mutex_enter(&p->p_lock);
400 		if (sp->sp_proc == NULL) {
401 			mutex_exit(&p->p_lock);
402 			continue;
403 		}
404 		VERIFY3P(sp->sp_proc, ==, p);
405 		VERIFY3P(sp->sp_state, ==, state);
406 		VERIFY3P(p->p_sigfd, !=, NULL);
407 
408 		sigfd_proc_state_t *pstate = p->p_sigfd;
409 		list_remove(&pstate->sigfd_list, sp);
410 		sp->sp_proc = NULL;
411 
412 		/* Wake any lingering pollers referencing the pollhead */
413 		mutex_enter(&sp->sp_lock);
414 		signalfd_poller_wake(sp, POLLERR);
415 		mutex_exit(&sp->sp_lock);
416 
417 		if (list_is_empty(&pstate->sigfd_list)) {
418 			/*
419 			 * If this poller was the last associated against the
420 			 * process, then clean up its state as well.
421 			 */
422 			signalfd_proc_clean(p);
423 		}
424 		mutex_exit(&p->p_lock);
425 	}
426 	mutex_exit(&pidlock);
427 }
428 
429 static void
430 signalfd_pollers_free(signalfd_state_t *state)
431 {
432 	ASSERT(MUTEX_HELD(&state->sfd_lock));
433 
434 	signalfd_poller_t *sp;
435 	while ((sp = list_remove_head(&state->sfd_pollers)) != NULL) {
436 		ASSERT3P(sp->sp_proc, ==, NULL);
437 
438 		mutex_enter(&sp->sp_lock);
439 		while (sp->sp_pending) {
440 			cv_wait(&sp->sp_cv, &sp->sp_lock);
441 		}
442 		/*
443 		 * With the poller dissociated from its polling process, and any
444 		 * lingering events delivered, the pollhead should be empty.
445 		 */
446 		ASSERT3P(sp->sp_pollhead.ph_list, ==, NULL);
447 
448 		cv_destroy(&sp->sp_cv);
449 		mutex_destroy(&sp->sp_lock);
450 		kmem_free(sp, sizeof (*sp));
451 	}
452 }
453 
454 /*
455  * Callback for cleaning up signalfd state from a process during proc_exit().
456  */
457 static void
458 signalfd_exit_helper(void)
459 {
460 	proc_t *p = curproc;
461 
462 	mutex_enter(&p->p_lock);
463 
464 	sigfd_proc_state_t *pstate = p->p_sigfd;
465 	if (pstate == NULL) {
466 		mutex_exit(&p->p_lock);
467 		return;
468 	}
469 
470 	signalfd_poller_t *sp;
471 	while ((sp = list_remove_head(&pstate->sigfd_list)) != NULL) {
472 		/*
473 		 * Having been removed from the sigfd_list, make it clear that
474 		 * this signalfd_poller_t is disssociated from the process.
475 		 */
476 		sp->sp_proc = NULL;
477 
478 		/* Wake any lingering pollers referencing the pollhead */
479 		mutex_enter(&sp->sp_lock);
480 		signalfd_poller_wake(sp, POLLERR);
481 		mutex_exit(&sp->sp_lock);
482 	}
483 	signalfd_proc_clean(p);
484 	mutex_exit(&p->p_lock);
485 }
486 
487 _NOTE(ARGSUSED(1))
488 static int
489 signalfd_open(dev_t *devp, int flag, int otyp, cred_t *cr)
490 {
491 	if (getminor(*devp) != SIGNALFDMNRN_SIGNALFD) {
492 		return (ENXIO);
493 	}
494 
495 	const minor_t minor = (minor_t)id_allocff_nosleep(signalfd_minors);
496 	if (minor == -1) {
497 		return (ENOMEM);
498 	}
499 
500 	if (ddi_soft_state_zalloc(signalfd_softstate, minor) != DDI_SUCCESS) {
501 		id_free(signalfd_minors, minor);
502 		return (ENODEV);
503 	}
504 
505 	signalfd_state_t *state = ddi_get_soft_state(signalfd_softstate, minor);
506 	mutex_init(&state->sfd_lock, NULL, MUTEX_DEFAULT, NULL);
507 	list_create(&state->sfd_pollers, sizeof (signalfd_poller_t),
508 	    offsetof(signalfd_poller_t, sp_state_node));
509 	state->sfd_minor = minor;
510 
511 	const major_t major = getemajor(*devp);
512 	*devp = makedevice(major, minor);
513 
514 	return (0);
515 }
516 
517 /*
518  * Consume one signal from our set in a manner similar to sigtimedwait().
519  * The block parameter is used to control whether we wait for a signal or
520  * return immediately if no signal is pending. We use the thread's t_sigwait
521  * member in the same way that it is used by sigtimedwait.
522  *
523  * Return 0 if we successfully consumed a signal or an errno if not.
524  */
525 static int
526 signalfd_consume_signal(k_sigset_t set, uio_t *uio, bool should_block)
527 {
528 	kthread_t *t = curthread;
529 	klwp_t *lwp = ttolwp(t);
530 	proc_t *p = ttoproc(t);
531 	int ret = 0;
532 
533 	/*
534 	 * Identify signals of interest so they can be processed, even if other
535 	 * parts of the machinery would be poised to ignore them.
536 	 */
537 	t->t_sigwait = set;
538 
539 	mutex_enter(&p->p_lock);
540 
541 	/* Set thread signal mask to unmask those in the specified set. */
542 	schedctl_finish_sigblock(t);
543 	const k_sigset_t oldmask = t->t_hold;
544 	sigdiffset(&t->t_hold, &t->t_sigwait);
545 
546 	if (should_block) {
547 		do {
548 			ret = cv_waituntil_sig(&t->t_delay_cv, &p->p_lock,
549 			    NULL, 0);
550 		} while (ret > 0);
551 	} else {
552 		mutex_exit(&p->p_lock);
553 		if (issig(FORREAL) == 0) {
554 			ret = -1;
555 		}
556 		mutex_enter(&p->p_lock);
557 	}
558 
559 	/*
560 	 * Restore thread's signal mask to its previous value.
561 	 * Set t_sig_check so post_syscall sees new t_hold mask.
562 	 */
563 	t->t_hold = oldmask;
564 	t->t_sig_check = 1;
565 
566 	if (ret == -1) {
567 		/* no signals pending */
568 		mutex_exit(&p->p_lock);
569 		sigemptyset(&t->t_sigwait);
570 		return (EAGAIN);
571 	}
572 
573 	/* Do not bother with signal if it is not in request set. */
574 	if (lwp->lwp_cursig == 0 ||
575 	    !sigismember(&t->t_sigwait, lwp->lwp_cursig)) {
576 		/*
577 		 * lwp_cursig is zero if pokelwps() awakened cv_wait_sig().
578 		 * This happens if some other thread in this process called
579 		 * forkall() or exit().
580 		 */
581 		mutex_exit(&p->p_lock);
582 		sigemptyset(&t->t_sigwait);
583 		return (EINTR);
584 	}
585 
586 	/* Convert signal info into external, datamodel independent, struct. */
587 	signalfd_siginfo_t ssi;
588 	bzero(&ssi, sizeof (ssi));
589 	if (lwp->lwp_curinfo != NULL) {
590 		k_siginfo_t *infop = &lwp->lwp_curinfo->sq_info;
591 
592 		ssi.ssi_signo	= infop->si_signo;
593 		ssi.ssi_errno	= infop->si_errno;
594 		ssi.ssi_code	= infop->si_code;
595 		ssi.ssi_pid	= infop->si_pid;
596 		ssi.ssi_uid	= infop->si_uid;
597 		ssi.ssi_fd	= infop->si_fd;
598 		ssi.ssi_band	= infop->si_band;
599 		ssi.ssi_trapno	= infop->si_trapno;
600 		ssi.ssi_status	= infop->si_status;
601 		ssi.ssi_utime	= infop->si_utime;
602 		ssi.ssi_stime	= infop->si_stime;
603 		ssi.ssi_addr	= (uint64_t)(intptr_t)infop->si_addr;
604 
605 		DTRACE_PROC2(signal__clear, int, 0, ksiginfo_t *, infop);
606 	} else {
607 		/* Convert to the format expected by the probe. */
608 		k_siginfo_t info = {
609 			.si_signo = lwp->lwp_cursig,
610 			.si_code = SI_NOINFO,
611 		};
612 
613 		ssi.ssi_signo = info.si_signo;
614 		ssi.ssi_code = info.si_code;
615 
616 		DTRACE_PROC2(signal__clear, int, 0, ksiginfo_t *, &info);
617 	}
618 
619 	lwp->lwp_ru.nsignals++;
620 	lwp->lwp_cursig = 0;
621 	lwp->lwp_extsig = 0;
622 	if (lwp->lwp_curinfo != NULL) {
623 		siginfofree(lwp->lwp_curinfo);
624 		lwp->lwp_curinfo = NULL;
625 	}
626 	mutex_exit(&p->p_lock);
627 
628 	ret = uiomove(&ssi, sizeof (ssi), UIO_READ, uio);
629 	sigemptyset(&t->t_sigwait);
630 	return (ret);
631 }
632 
633 /*
634  * This is similar to sigtimedwait. Based on the fd mode, we may wait until a
635  * signal within our specified set is posted. We consume as many available
636  * signals within our set as we can.
637  */
638 _NOTE(ARGSUSED(2))
639 static int
640 signalfd_read(dev_t dev, uio_t *uio, cred_t *cr)
641 {
642 	signalfd_state_t *state;
643 	k_sigset_t set;
644 	bool should_block = true, got_one = false;
645 	int res;
646 
647 	state = ddi_get_soft_state(signalfd_softstate, getminor(dev));
648 	if (state == NULL) {
649 		return (ENXIO);
650 	}
651 
652 	if (uio->uio_resid < sizeof (signalfd_siginfo_t)) {
653 		return (EINVAL);
654 	}
655 
656 	if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
657 		should_block = false;
658 	}
659 
660 	mutex_enter(&state->sfd_lock);
661 	set = state->sfd_mask;
662 	mutex_exit(&state->sfd_lock);
663 
664 	if (sigisempty(&set))
665 		return (set_errno(EINVAL));
666 
667 	do  {
668 		res = signalfd_consume_signal(set, uio, should_block);
669 
670 		if (res == 0) {
671 			/*
672 			 * After consuming one signal, do not block while
673 			 * trying to consume more.
674 			 */
675 			got_one = true;
676 			should_block = false;
677 
678 			/*
679 			 * Refresh the matching signal set in case it was
680 			 * updated during the wait.
681 			 */
682 			mutex_enter(&state->sfd_lock);
683 			set = state->sfd_mask;
684 			mutex_exit(&state->sfd_lock);
685 			if (sigisempty(&set))
686 				break;
687 		}
688 	} while (res == 0 && uio->uio_resid >= sizeof (signalfd_siginfo_t));
689 
690 	if (got_one)
691 		res = 0;
692 
693 	return (res);
694 }
695 
696 /*
697  * If ksigset_t's were a single word, we would do:
698  *      return (((p->p_sig | t->t_sig) & set) & fillset);
699  */
700 static int
701 signalfd_sig_pending(proc_t *p, kthread_t *t, k_sigset_t set)
702 {
703 	return (((p->p_sig.__sigbits[0] | t->t_sig.__sigbits[0]) &
704 	    set.__sigbits[0]) |
705 	    ((p->p_sig.__sigbits[1] | t->t_sig.__sigbits[1]) &
706 	    set.__sigbits[1]) |
707 	    (((p->p_sig.__sigbits[2] | t->t_sig.__sigbits[2]) &
708 	    set.__sigbits[2]) & FILLSET2));
709 }
710 
711 static int
712 signalfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
713     struct pollhead **phpp)
714 {
715 	signalfd_state_t *state;
716 	short revents = 0;
717 	kthread_t *t = curthread;
718 	proc_t *p = ttoproc(t);
719 
720 	state = ddi_get_soft_state(signalfd_softstate, getminor(dev));
721 	if (state == NULL) {
722 		return (ENXIO);
723 	}
724 
725 	mutex_enter(&state->sfd_lock);
726 	if (signalfd_sig_pending(p, t, state->sfd_mask) != 0) {
727 		revents |= POLLRDNORM | POLLIN;
728 	}
729 
730 	*reventsp = revents & events;
731 	if ((*reventsp == 0 && !anyyet) || (events & POLLET) != 0) {
732 		signalfd_poller_t *sp;
733 
734 		sp = signalfd_poller_associate(state, p);
735 		*phpp = &sp->sp_pollhead;
736 	}
737 	mutex_exit(&state->sfd_lock);
738 
739 	return (0);
740 }
741 
742 static void
743 signalfd_set_mask(signalfd_state_t *state, const sigset_t *umask)
744 {
745 	k_sigset_t kmask;
746 
747 	sigutok(umask, &kmask);
748 
749 	mutex_enter(&state->sfd_lock);
750 	state->sfd_mask = kmask;
751 	list_t *pollers = &state->sfd_pollers;
752 	for (signalfd_poller_t *sp = list_head(pollers); sp != NULL;
753 	    sp = list_next(pollers, sp)) {
754 		mutex_enter(&sp->sp_lock);
755 		sp->sp_mask = kmask;
756 		mutex_exit(&sp->sp_lock);
757 	}
758 	mutex_exit(&state->sfd_lock);
759 }
760 
761 _NOTE(ARGSUSED(4))
762 static int
763 signalfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
764 {
765 	signalfd_state_t *state;
766 	sigset_t mask;
767 
768 	state = ddi_get_soft_state(signalfd_softstate, getminor(dev));
769 	if (state == NULL) {
770 		return (ENXIO);
771 	}
772 
773 	switch (cmd) {
774 	case SIGNALFDIOC_MASK:
775 		if (ddi_copyin((caddr_t)arg, &mask, sizeof (mask), md) != 0) {
776 			return (EFAULT);
777 		}
778 		signalfd_set_mask(state, &mask);
779 		return (0);
780 
781 	default:
782 		break;
783 	}
784 
785 	return (ENOTTY);
786 }
787 
788 _NOTE(ARGSUSED(1))
789 static int
790 signalfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
791 {
792 	signalfd_state_t *state;
793 	const minor_t minor = getminor(dev);
794 
795 	state = ddi_get_soft_state(signalfd_softstate, minor);
796 	if (state == NULL) {
797 		return (ENXIO);
798 	}
799 
800 	/*
801 	 * With this signalfd instance being closed, sfd_lock is a formality, as
802 	 * nothing else should be reaching for it to add pollers at this point.
803 	 */
804 	mutex_enter(&state->sfd_lock);
805 
806 	/* Dissociate any pollers from their respective processes */
807 	signalfd_pollers_dissociate(state);
808 
809 	/* ... and free all those (now-dissociated) pollers */
810 	signalfd_pollers_free(state);
811 	ASSERT(list_is_empty(&state->sfd_pollers));
812 
813 	mutex_destroy(&state->sfd_lock);
814 	ddi_soft_state_free(signalfd_softstate, minor);
815 	id_free(signalfd_minors, minor);
816 
817 	return (0);
818 }
819 
820 static int
821 signalfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
822 {
823 	if (cmd != DDI_ATTACH || signalfd_devi != NULL) {
824 		return (DDI_FAILURE);
825 	}
826 
827 	signalfd_minors = id_space_create("signalfd_minors", 1, L_MAXMIN32 + 1);
828 	if (signalfd_minors == NULL) {
829 		cmn_err(CE_WARN, "signalfd couldn't create id space");
830 		return (DDI_FAILURE);
831 	}
832 
833 	if (ddi_soft_state_init(&signalfd_softstate,
834 	    sizeof (signalfd_state_t), 0) != 0) {
835 		cmn_err(CE_WARN, "signalfd failed to create soft state");
836 		id_space_destroy(signalfd_minors);
837 		return (DDI_FAILURE);
838 	}
839 
840 	if (ddi_create_minor_node(devi, "signalfd", S_IFCHR,
841 	    SIGNALFDMNRN_SIGNALFD, DDI_PSEUDO, 0) == DDI_FAILURE) {
842 		cmn_err(CE_NOTE, "signalfd couldn't create minor node");
843 		ddi_soft_state_fini(&signalfd_softstate);
844 		id_space_destroy(signalfd_minors);
845 		return (DDI_FAILURE);
846 	}
847 
848 
849 	sigfd_exit_helper = signalfd_exit_helper;
850 
851 	signalfd_wakeq = taskq_create("signalfd_wake", 1, minclsyspri,
852 	    0, INT_MAX, TASKQ_PREPOPULATE);
853 
854 	ddi_report_dev(devi);
855 	signalfd_devi = devi;
856 
857 	return (DDI_SUCCESS);
858 }
859 
860 _NOTE(ARGSUSED(0))
861 static int
862 signalfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
863 {
864 	if (cmd != DDI_DETACH) {
865 		return (DDI_FAILURE);
866 	}
867 
868 	/*
869 	 * With all of the instances gone, it is safe to both destroy the waker
870 	 * taskq (which must be empty) and tear down the exit helper (which must
871 	 * be unreachable with no proc_t`p_sigfd associations).
872 	 */
873 	taskq_destroy(signalfd_wakeq);
874 	sigfd_exit_helper = NULL;
875 
876 	id_space_destroy(signalfd_minors);
877 	ddi_soft_state_fini(&signalfd_softstate);
878 	ddi_remove_minor_node(signalfd_devi, NULL);
879 	signalfd_devi = NULL;
880 
881 	return (DDI_SUCCESS);
882 }
883 
884 _NOTE(ARGSUSED(0))
885 static int
886 signalfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
887 {
888 	int error;
889 
890 	switch (infocmd) {
891 	case DDI_INFO_DEVT2DEVINFO:
892 		*result = (void *)signalfd_devi;
893 		error = DDI_SUCCESS;
894 		break;
895 	case DDI_INFO_DEVT2INSTANCE:
896 		*result = (void *)0;
897 		error = DDI_SUCCESS;
898 		break;
899 	default:
900 		error = DDI_FAILURE;
901 	}
902 	return (error);
903 }
904 
905 static struct cb_ops signalfd_cb_ops = {
906 	signalfd_open,		/* open */
907 	signalfd_close,		/* close */
908 	nulldev,		/* strategy */
909 	nulldev,		/* print */
910 	nodev,			/* dump */
911 	signalfd_read,		/* read */
912 	nodev,			/* write */
913 	signalfd_ioctl,		/* ioctl */
914 	nodev,			/* devmap */
915 	nodev,			/* mmap */
916 	nodev,			/* segmap */
917 	signalfd_poll,		/* poll */
918 	ddi_prop_op,		/* cb_prop_op */
919 	0,			/* streamtab  */
920 	D_NEW | D_MP		/* Driver compatibility flag */
921 };
922 
923 static struct dev_ops signalfd_ops = {
924 	DEVO_REV,		/* devo_rev */
925 	0,			/* refcnt */
926 	signalfd_info,		/* get_dev_info */
927 	nulldev,		/* identify */
928 	nulldev,		/* probe */
929 	signalfd_attach,	/* attach */
930 	signalfd_detach,	/* detach */
931 	nodev,			/* reset */
932 	&signalfd_cb_ops,	/* driver operations */
933 	NULL,			/* bus operations */
934 	nodev,			/* dev power */
935 	ddi_quiesce_not_needed,	/* quiesce */
936 };
937 
938 static struct modldrv modldrv = {
939 	&mod_driverops,		/* module type (this is a pseudo driver) */
940 	"signalfd support",	/* name of module */
941 	&signalfd_ops,		/* driver ops */
942 };
943 
944 static struct modlinkage modlinkage = {
945 	MODREV_1,
946 	(void *)&modldrv,
947 	NULL
948 };
949 
950 int
951 _init(void)
952 {
953 	return (mod_install(&modlinkage));
954 }
955 
956 int
957 _info(struct modinfo *modinfop)
958 {
959 	return (mod_info(&modlinkage, modinfop));
960 }
961 
962 int
963 _fini(void)
964 {
965 	return (mod_remove(&modlinkage));
966 }
967