xref: /illumos-gate/usr/src/uts/common/os/exit.c (revision 66582b606a8194f7f3ba5b3a3a6dca5b0d346361)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/cred.h>
34 #include <sys/user.h>
35 #include <sys/errno.h>
36 #include <sys/proc.h>
37 #include <sys/ucontext.h>
38 #include <sys/procfs.h>
39 #include <sys/vnode.h>
40 #include <sys/acct.h>
41 #include <sys/var.h>
42 #include <sys/cmn_err.h>
43 #include <sys/debug.h>
44 #include <sys/wait.h>
45 #include <sys/siginfo.h>
46 #include <sys/procset.h>
47 #include <sys/class.h>
48 #include <sys/file.h>
49 #include <sys/session.h>
50 #include <sys/kmem.h>
51 #include <sys/vtrace.h>
52 #include <sys/prsystm.h>
53 #include <sys/ipc.h>
54 #include <sys/sem_impl.h>
55 #include <c2/audit.h>
56 #include <sys/aio_impl.h>
57 #include <vm/as.h>
58 #include <sys/poll.h>
59 #include <sys/door.h>
60 #include <sys/lwpchan_impl.h>
61 #include <sys/utrap.h>
62 #include <sys/task.h>
63 #include <sys/exacct.h>
64 #include <sys/cyclic.h>
65 #include <sys/schedctl.h>
66 #include <sys/rctl.h>
67 #include <sys/contract_impl.h>
68 #include <sys/contract/process_impl.h>
69 #include <sys/list.h>
70 #include <sys/dtrace.h>
71 #include <sys/pool.h>
72 #include <sys/sdt.h>
73 #include <sys/corectl.h>
74 #include <sys/brand.h>
75 #include <sys/libc_kernel.h>
76 
77 /*
78  * convert code/data pair into old style wait status
79  */
80 int
81 wstat(int code, int data)
82 {
83 	int stat = (data & 0377);
84 
85 	switch (code) {
86 	case CLD_EXITED:
87 		stat <<= 8;
88 		break;
89 	case CLD_DUMPED:
90 		stat |= WCOREFLG;
91 		break;
92 	case CLD_KILLED:
93 		break;
94 	case CLD_TRAPPED:
95 	case CLD_STOPPED:
96 		stat <<= 8;
97 		stat |= WSTOPFLG;
98 		break;
99 	case CLD_CONTINUED:
100 		stat = WCONTFLG;
101 		break;
102 	default:
103 		cmn_err(CE_PANIC, "wstat: bad code");
104 		/* NOTREACHED */
105 	}
106 	return (stat);
107 }
108 
109 static char *
110 exit_reason(char *buf, size_t bufsz, int what, int why)
111 {
112 	switch (why) {
113 	case CLD_EXITED:
114 		(void) snprintf(buf, bufsz, "exited with status %d", what);
115 		break;
116 	case CLD_KILLED:
117 		(void) snprintf(buf, bufsz, "exited on fatal signal %d", what);
118 		break;
119 	case CLD_DUMPED:
120 		(void) snprintf(buf, bufsz, "core dumped on signal %d", what);
121 		break;
122 	default:
123 		(void) snprintf(buf, bufsz, "encountered unknown error "
124 		    "(%d, %d)", why, what);
125 		break;
126 	}
127 
128 	return (buf);
129 }
130 
/*
 * exit system call: pass back caller's arg.
 *
 * rval is handed to exit() as the "what" value, paired with CLD_EXITED as
 * the "why" code.
 */
void
rexit(int rval)
{
	exit(CLD_EXITED, rval);
}
139 
140 /*
141  * Called by proc_exit() when a zone's init exits, presumably because
142  * it failed.  As long as the given zone is still in the "running"
143  * state, we will re-exec() init, but first we need to reset things
144  * which are usually inherited across exec() but will break init's
145  * assumption that it is being exec()'d from a virgin process.  Most
146  * importantly this includes closing all file descriptors (exec only
147  * closes those marked close-on-exec) and resetting signals (exec only
148  * resets handled signals, and we need to clear any signals which
149  * killed init).  Anything else that exec(2) says would be inherited,
150  * but would affect the execution of init, needs to be reset.
151  */
152 static int
153 restart_init(int what, int why)
154 {
155 	kthread_t *t = curthread;
156 	klwp_t *lwp = ttolwp(t);
157 	proc_t *p = ttoproc(t);
158 	user_t *up = PTOU(p);
159 
160 	vnode_t *oldcd, *oldrd;
161 	int i, err;
162 	char reason_buf[64];
163 
164 	/*
165 	 * Let zone admin (and global zone admin if this is for a non-global
166 	 * zone) know that init has failed and will be restarted.
167 	 */
168 	zcmn_err(p->p_zone->zone_id, CE_WARN,
169 	    "init(1M) %s: restarting automatically",
170 	    exit_reason(reason_buf, sizeof (reason_buf), what, why));
171 
172 	if (!INGLOBALZONE(p)) {
173 		cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: "
174 		    "restarting automatically",
175 		    p->p_zone->zone_name, p->p_pid, reason_buf);
176 	}
177 
178 	/*
179 	 * Remove any fpollinfo_t's for this (last) thread from our file
180 	 * descriptors so closeall() can ASSERT() that they're all gone.
181 	 * Then close all open file descriptors in the process.
182 	 */
183 	pollcleanup();
184 	closeall(P_FINFO(p));
185 
186 	/*
187 	 * Grab p_lock and begin clearing miscellaneous global process
188 	 * state that needs to be reset before we exec the new init(1M).
189 	 */
190 
191 	mutex_enter(&p->p_lock);
192 	prbarrier(p);
193 
194 	p->p_flag &= ~(SKILLED | SEXTKILLED | SEXITING | SDOCORE);
195 	up->u_cmask = CMASK;
196 
197 	sigemptyset(&t->t_hold);
198 	sigemptyset(&t->t_sig);
199 	sigemptyset(&t->t_extsig);
200 
201 	sigemptyset(&p->p_sig);
202 	sigemptyset(&p->p_extsig);
203 
204 	sigdelq(p, t, 0);
205 	sigdelq(p, NULL, 0);
206 
207 	if (p->p_killsqp) {
208 		siginfofree(p->p_killsqp);
209 		p->p_killsqp = NULL;
210 	}
211 
212 	/*
213 	 * Reset any signals that are ignored back to the default disposition.
214 	 * Other u_signal members will be cleared when exec calls sigdefault().
215 	 */
216 	for (i = 1; i < NSIG; i++) {
217 		if (up->u_signal[i - 1] == SIG_IGN) {
218 			up->u_signal[i - 1] = SIG_DFL;
219 			sigemptyset(&up->u_sigmask[i - 1]);
220 		}
221 	}
222 
223 	/*
224 	 * Clear the current signal, any signal info associated with it, and
225 	 * any signal information from contracts and/or contract templates.
226 	 */
227 	lwp->lwp_cursig = 0;
228 	lwp->lwp_extsig = 0;
229 	if (lwp->lwp_curinfo != NULL) {
230 		siginfofree(lwp->lwp_curinfo);
231 		lwp->lwp_curinfo = NULL;
232 	}
233 	lwp_ctmpl_clear(lwp);
234 
235 	/*
236 	 * Reset both the process root directory and the current working
237 	 * directory to the root of the zone just as we do during boot.
238 	 */
239 	VN_HOLD(p->p_zone->zone_rootvp);
240 	oldrd = up->u_rdir;
241 	up->u_rdir = p->p_zone->zone_rootvp;
242 
243 	VN_HOLD(p->p_zone->zone_rootvp);
244 	oldcd = up->u_cdir;
245 	up->u_cdir = p->p_zone->zone_rootvp;
246 
247 	if (up->u_cwd != NULL) {
248 		refstr_rele(up->u_cwd);
249 		up->u_cwd = NULL;
250 	}
251 
252 	mutex_exit(&p->p_lock);
253 
254 	if (oldrd != NULL)
255 		VN_RELE(oldrd);
256 	if (oldcd != NULL)
257 		VN_RELE(oldcd);
258 
259 	/* Free the controlling tty.  (freectty() always assumes curproc.) */
260 	ASSERT(p == curproc);
261 	(void) freectty(B_TRUE);
262 
263 	/*
264 	 * Now exec() the new init(1M) on top of the current process.  If we
265 	 * succeed, the caller will treat this like a successful system call.
266 	 * If we fail, we issue messages and the caller will proceed with exit.
267 	 */
268 	err = exec_init(p->p_zone->zone_initname, NULL);
269 
270 	if (err == 0)
271 		return (0);
272 
273 	zcmn_err(p->p_zone->zone_id, CE_WARN,
274 	    "failed to restart init(1M) (err=%d): system reboot required", err);
275 
276 	if (!INGLOBALZONE(p)) {
277 		cmn_err(CE_WARN, "failed to restart init(1M) for zone %s "
278 		    "(pid %d, err=%d): zoneadm(1M) boot required",
279 		    p->p_zone->zone_name, p->p_pid, err);
280 	}
281 
282 	return (-1);
283 }
284 
285 /*
286  * Release resources.
287  * Enter zombie state.
288  * Wake up parent and init processes,
289  * and dispose of children.
290  */
291 void
292 exit(int why, int what)
293 {
294 	/*
295 	 * If proc_exit() fails, then some other lwp in the process
296 	 * got there first.  We just have to call lwp_exit() to allow
297 	 * the other lwp to finish exiting the process.  Otherwise we're
298 	 * restarting init, and should return.
299 	 */
300 	if (proc_exit(why, what) != 0) {
301 		mutex_enter(&curproc->p_lock);
302 		ASSERT(curproc->p_flag & SEXITLWPS);
303 		lwp_exit();
304 		/* NOTREACHED */
305 	}
306 }
307 
/*
 * Set the SEXITING flag on the process, after making sure /proc does
 * not have it locked.  This is done in more places than proc_exit(),
 * so it is a separate function.
 *
 * p_lock is acquired and released internally, so the caller must not
 * already hold it.
 */
void
proc_is_exiting(proc_t *p)
{
	mutex_enter(&p->p_lock);
	prbarrier(p);		/* wait out any /proc lock on the process */
	p->p_flag |= SEXITING;
	mutex_exit(&p->p_lock);
}
321 
/*
 * Tear down the calling process on behalf of its last lwp.
 *
 * why is the CLD_* termination code and what is the associated exit
 * status or signal number; both are recorded in the proc (p_wcode and
 * p_wdata) and folded into the wait status via wstat().
 *
 * Return value:
 *   1 - exitlwps() failed, call (or continue) lwp_exit()
 *   0 - restarting init.  Return through system call path
 *
 * On the normal exit path this function does not return at all: it
 * finishes by calling thread_exit().
 */
int
proc_exit(int why, int what)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	zone_t *z = p->p_zone;
	timeout_id_t tmp_id;
	int rv;
	proc_t *q;
	task_t *tk;
	vnode_t *exec_vp, *execdir_vp, *cdir, *rdir;
	sigqueue_t *sqp;	/* only set (and only used) if !evaporate */
	lwpdir_t *lwpdir;
	uint_t lwpdir_sz;
	tidhash_t *tidhash;
	uint_t tidhash_sz;
	ret_tidhash_t *ret_tidhash;
	refstr_t *cwd;
	hrtime_t hrutime, hrstime;
	int evaporate;

	/*
	 * Stop and discard the process's lwps except for the current one,
	 * unless some other lwp beat us to it.  If exitlwps() fails then
	 * return and the calling lwp will call (or continue in) lwp_exit().
	 */
	proc_is_exiting(p);
	if (exitlwps(0) != 0)
		return (1);

	mutex_enter(&p->p_lock);
	if (p->p_ttime > 0) {
		/*
		 * Account any remaining ticks charged to this process
		 * on its way out.
		 */
		(void) task_cpu_time_incr(p->p_task, p->p_ttime);
		p->p_ttime = 0;
	}
	mutex_exit(&p->p_lock);

	DTRACE_PROC(lwp__exit);
	DTRACE_PROC1(exit, int, why);

	/*
	 * Will perform any brand specific proc exit processing, since this
	 * is always the last lwp, will also perform lwp_exit and free brand
	 * data
	 */
	if (PROC_IS_BRANDED(p)) {
		lwp_detach_brand_hdlrs(lwp);
		brand_clearbrand(p, B_FALSE);
	}

	/*
	 * Don't let init exit unless zone_start_init() failed its exec, or
	 * we are shutting down the zone or the machine.
	 *
	 * Since we are single threaded, we don't need to lock the
	 * following accesses to zone_proc_initpid.
	 */
	if (p->p_pid == z->zone_proc_initpid) {
		if (z->zone_boot_err == 0 &&
		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) {
			if (z->zone_restart_init == B_TRUE) {
				if (restart_init(what, why) == 0)
					return (0);
			} else {
				(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
				    CRED());
			}
		}

		/*
		 * Since we didn't or couldn't restart init, we clear
		 * the zone's init state and proceed with exit
		 * processing.
		 */
		z->zone_proc_initpid = -1;
	}

	lwp_pcb_exit();

	/*
	 * Allocate a sigqueue now, before we grab locks.
	 * It will be given to sigcld(), below.
	 * Special case:  If we will be making the process disappear
	 * without a trace because it is either:
	 *	* an exiting SSYS process, or
	 *	* a posix_spawn() vfork child who requests it,
	 * we don't bother to allocate a useless sigqueue.
	 */
	evaporate = (p->p_flag & SSYS) || ((p->p_flag & SVFORK) &&
	    why == CLD_EXITED && what == _EVAPORATE);
	if (!evaporate)
		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);

	/*
	 * revoke any doors created by the process.
	 */
	if (p->p_door_list)
		door_exit();

	/*
	 * Release schedctl data structures.
	 */
	if (p->p_pagep)
		schedctl_proc_cleanup();

	/*
	 * make sure all pending kaio has completed.
	 */
	if (p->p_aio)
		aio_cleanup_exit();

	/*
	 * discard the lwpchan cache.
	 */
	if (p->p_lcp != NULL)
		lwpchan_destroy_cache(0);

	/*
	 * Clean up any DTrace helper actions or probes for the process.
	 */
	if (p->p_dtrace_helpers != NULL) {
		ASSERT(dtrace_helpers_cleanup != NULL);
		(*dtrace_helpers_cleanup)(p);
	}

	/*
	 * Clean up any signalfd state for the process.
	 */
	if (p->p_sigfd != NULL) {
		VERIFY(sigfd_exit_helper != NULL);
		(*sigfd_exit_helper)();
	}

	/* untimeout the realtime timers */
	if (p->p_itimer != NULL)
		timer_exit();

	/* Cancel any outstanding alarm timeout. */
	if ((tmp_id = p->p_alarmid) != 0) {
		p->p_alarmid = 0;
		(void) untimeout(tmp_id);
	}

	/*
	 * Remove any fpollinfo_t's for this (last) thread from our file
	 * descriptors so closeall() can ASSERT() that they're all gone.
	 */
	pollcleanup();

	/* Remove the p_rprof_cyclic cyclic, if one is installed. */
	if (p->p_rprof_cyclic != CYCLIC_NONE) {
		mutex_enter(&cpu_lock);
		cyclic_remove(p->p_rprof_cyclic);
		mutex_exit(&cpu_lock);
	}

	mutex_enter(&p->p_lock);

	/*
	 * Clean up any DTrace probes associated with this process.
	 */
	if (p->p_dtrace_probes) {
		ASSERT(dtrace_fasttrap_exit_ptr != NULL);
		dtrace_fasttrap_exit_ptr(p);
	}

	/*
	 * p_lock is dropped around untimeout(), so p_itimerid is
	 * re-checked each time the lock is reacquired.
	 */
	while ((tmp_id = p->p_itimerid) != 0) {
		p->p_itimerid = 0;
		mutex_exit(&p->p_lock);
		(void) untimeout(tmp_id);
		mutex_enter(&p->p_lock);
	}

	lwp_cleanup();

	/*
	 * We are about to exit; prevent our resource associations from
	 * being changed.
	 */
	pool_barrier_enter();

	/*
	 * Block the process against /proc now that we have really
	 * acquired p->p_lock (to manipulate p_tlist at least).
	 */
	prbarrier(p);

	/*
	 * Ignore every signal from here on and discard whatever is
	 * pending or current for the process and this thread.
	 */
	sigfillset(&p->p_ignore);
	sigemptyset(&p->p_siginfo);
	sigemptyset(&p->p_sig);
	sigemptyset(&p->p_extsig);
	sigemptyset(&t->t_sig);
	sigemptyset(&t->t_extsig);
	sigemptyset(&p->p_sigmask);
	sigdelq(p, t, 0);
	lwp->lwp_cursig = 0;
	lwp->lwp_extsig = 0;
	p->p_flag &= ~(SKILLED | SEXTKILLED);
	if (lwp->lwp_curinfo) {
		siginfofree(lwp->lwp_curinfo);
		lwp->lwp_curinfo = NULL;
	}

	t->t_proc_flag |= TP_LWPEXIT;
	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
	prlwpexit(t);		/* notify /proc */
	lwp_hash_out(p, t->t_tid);
	prexit(p);

	p->p_lwpcnt = 0;
	p->p_tlist = NULL;
	sigqfree(p);
	term_mstate(t);
	p->p_mterm = gethrtime();

	/*
	 * Detach the executable vnodes from the proc while p_lock is
	 * held; they are closed and released further down.
	 */
	exec_vp = p->p_exec;
	execdir_vp = p->p_execdir;
	p->p_exec = NULLVP;
	p->p_execdir = NULLVP;
	mutex_exit(&p->p_lock);

	pr_free_watched_pages(p);

	closeall(P_FINFO(p));

	/* Free the controlling tty.  (freectty() always assumes curproc.) */
	ASSERT(p == curproc);
	(void) freectty(B_TRUE);

#if defined(__sparc)
	if (p->p_utraps != NULL)
		utrap_free(p);
#endif
	if (p->p_semacct)			/* IPC semaphore exit */
		semexit(p);
	rv = wstat(why, what);

	/* acct() is given only the low byte of the wait status. */
	acct(rv & 0xff);
	exacct_commit_proc(p, rv);

	/*
	 * Release any resources associated with C2 auditing
	 */
	if (AU_AUDITING()) {
		/*
		 * audit exit system call
		 */
		audit_exit(why, what);
	}

	/*
	 * Free address space.
	 */
	relvm();

	if (exec_vp) {
		/*
		 * Close this executable which has been opened when the process
		 * was created by getproc().
		 */
		(void) VOP_CLOSE(exec_vp, FREAD, 1, (offset_t)0, CRED(), NULL);
		VN_RELE(exec_vp);
	}
	if (execdir_vp)
		VN_RELE(execdir_vp);

	/*
	 * Release held contracts.
	 */
	contract_exit(p);

	/*
	 * Depart our encapsulating process contract.
	 */
	if ((p->p_flag & SSYS) == 0) {
		ASSERT(p->p_ct_process);
		contract_process_exit(p->p_ct_process, p, rv);
	}

	/*
	 * Remove pool association, and block if requested by pool_do_bind.
	 */
	mutex_enter(&p->p_lock);
	ASSERT(p->p_pool->pool_ref > 0);
	atomic_dec_32(&p->p_pool->pool_ref);
	p->p_pool = pool_default;
	/*
	 * Now that our address space has been freed and all other threads
	 * in this process have exited, set the PEXITED pool flag.  This
	 * tells the pools subsystems to ignore this process if it was
	 * requested to rebind this process to a new pool.
	 */
	p->p_poolflag |= PEXITED;
	pool_barrier_exit();
	mutex_exit(&p->p_lock);

	/*
	 * pidlock is held from here until after sigcld()/freeproc() below.
	 */
	mutex_enter(&pidlock);

	/*
	 * Delete this process from the newstate list of its parent. We
	 * will put it in the right place in the sigcld in the end.
	 */
	delete_ns(p->p_parent, p);

	/*
	 * Reassign the orphans to the next of kin.
	 * Don't rearrange init's orphanage.
	 */
	if ((q = p->p_orphan) != NULL && p != proc_init) {

		proc_t *nokp = p->p_nextofkin;

		/*
		 * Re-point every orphan at the next of kin; the loop
		 * leaves q at the last orphan on our list.
		 */
		for (;;) {
			q->p_nextofkin = nokp;
			if (q->p_nextorph == NULL)
				break;
			q = q->p_nextorph;
		}
		/* Splice our orphan list onto the front of nokp's list. */
		q->p_nextorph = nokp->p_orphan;
		nokp->p_orphan = p->p_orphan;
		p->p_orphan = NULL;
	}

	/*
	 * Reassign the children to init.
	 * Don't try to assign init's children to init.
	 */
	if ((q = p->p_child) != NULL && p != proc_init) {
		struct proc	*np;
		struct proc	*initp = proc_init;
		boolean_t	setzonetop = B_FALSE;

		if (!INGLOBALZONE(curproc))
			setzonetop = B_TRUE;

		pgdetach(p);

		do {
			/* Save the next sibling; q is relinked below. */
			np = q->p_sibling;
			/*
			 * Delete it from its current parent new state
			 * list and add it to init new state list
			 */
			delete_ns(q->p_parent, q);

			q->p_ppid = 1;
			q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
			if (setzonetop) {
				mutex_enter(&q->p_lock);
				q->p_flag |= SZONETOP;
				mutex_exit(&q->p_lock);
			}
			q->p_parent = initp;

			/*
			 * Since q will be the first child,
			 * it will not have a previous sibling.
			 */
			q->p_psibling = NULL;
			if (initp->p_child) {
				initp->p_child->p_psibling = q;
			}
			q->p_sibling = initp->p_child;
			initp->p_child = q;
			/* A child flagged P_PR_PTRACE is sent SIGKILL. */
			if (q->p_proc_flag & P_PR_PTRACE) {
				mutex_enter(&q->p_lock);
				sigtoproc(q, NULL, SIGKILL);
				mutex_exit(&q->p_lock);
			}
			/*
			 * sigcld() will add the child to parents
			 * newstate list.
			 */
			if (q->p_stat == SZOMB)
				sigcld(q, NULL);
		} while ((q = np) != NULL);

		p->p_child = NULL;
		ASSERT(p->p_child_ns == NULL);
	}

	TRACE_1(TR_FAC_PROC, TR_PROC_EXIT, "proc_exit: %p", p);

	mutex_enter(&p->p_lock);
	CL_EXIT(curthread); /* tell the scheduler that curthread is exiting */

	/*
	 * Have our task accumulate our resource usage data before they
	 * become contaminated by p_cacct etc., and before we renounce
	 * membership of the task.
	 *
	 * We do this regardless of whether or not task accounting is active.
	 * This is to avoid having nonsense data reported for this task if
	 * task accounting is subsequently enabled. The overhead is minimal;
	 * by this point, this process has accounted for the usage of all its
	 * LWPs. We nonetheless do the work here, and under the protection of
	 * pidlock, so that the movement of the process's usage to the task
	 * happens at the same time as the removal of the process from the
	 * task, from the point of view of exacct_snapshot_task_usage().
	 */
	exacct_update_task_mstate(p);

	/*
	 * Final user/system times: our own aggregated microstate times
	 * converted to ticks, plus the accumulated child times.
	 */
	hrutime = mstate_aggr_state(p, LMS_USER);
	hrstime = mstate_aggr_state(p, LMS_SYSTEM);
	p->p_utime = (clock_t)NSEC_TO_TICK(hrutime) + p->p_cutime;
	p->p_stime = (clock_t)NSEC_TO_TICK(hrstime) + p->p_cstime;

	/* Fold the p_cacct microstate totals into our own. */
	p->p_acct[LMS_USER]	+= p->p_cacct[LMS_USER];
	p->p_acct[LMS_SYSTEM]	+= p->p_cacct[LMS_SYSTEM];
	p->p_acct[LMS_TRAP]	+= p->p_cacct[LMS_TRAP];
	p->p_acct[LMS_TFAULT]	+= p->p_cacct[LMS_TFAULT];
	p->p_acct[LMS_DFAULT]	+= p->p_cacct[LMS_DFAULT];
	p->p_acct[LMS_KFAULT]	+= p->p_cacct[LMS_KFAULT];
	p->p_acct[LMS_USER_LOCK] += p->p_cacct[LMS_USER_LOCK];
	p->p_acct[LMS_SLEEP]	+= p->p_cacct[LMS_SLEEP];
	p->p_acct[LMS_WAIT_CPU]	+= p->p_cacct[LMS_WAIT_CPU];
	p->p_acct[LMS_STOPPED]	+= p->p_cacct[LMS_STOPPED];

	/* Likewise for the p_cru resource usage counters. */
	p->p_ru.minflt	+= p->p_cru.minflt;
	p->p_ru.majflt	+= p->p_cru.majflt;
	p->p_ru.nswap	+= p->p_cru.nswap;
	p->p_ru.inblock	+= p->p_cru.inblock;
	p->p_ru.oublock	+= p->p_cru.oublock;
	p->p_ru.msgsnd	+= p->p_cru.msgsnd;
	p->p_ru.msgrcv	+= p->p_cru.msgrcv;
	p->p_ru.nsignals += p->p_cru.nsignals;
	p->p_ru.nvcsw	+= p->p_cru.nvcsw;
	p->p_ru.nivcsw	+= p->p_cru.nivcsw;
	p->p_ru.sysc	+= p->p_cru.sysc;
	p->p_ru.ioch	+= p->p_cru.ioch;

	/* The process is now a zombie; record how and why it ended. */
	p->p_stat = SZOMB;
	p->p_proc_flag &= ~P_PR_PTRACE;
	p->p_wdata = what;
	p->p_wcode = (char)why;

	/*
	 * Remember the directory references now; they are released
	 * after pidlock is dropped (see below).
	 */
	cdir = PTOU(p)->u_cdir;
	rdir = PTOU(p)->u_rdir;
	cwd = PTOU(p)->u_cwd;

	ASSERT(cdir != NULL || p->p_parent == &p0);

	/*
	 * Release resource controls, as they are no longer enforceable.
	 */
	rctl_set_free(p->p_rctls);

	/*
	 * Decrement tk_nlwps counter for our task.max-lwps resource control.
	 * An extended accounting record, if that facility is active, is
	 * scheduled to be written.  We cannot give up task and project
	 * membership at this point because that would allow zombies to escape
	 * from the max-processes resource controls.  Zombies stay in their
	 * current task and project until the process table slot is released
	 * in freeproc().
	 */
	tk = p->p_task;

	mutex_enter(&p->p_zone->zone_nlwps_lock);
	tk->tk_nlwps--;
	tk->tk_proj->kpj_nlwps--;
	p->p_zone->zone_nlwps--;
	mutex_exit(&p->p_zone->zone_nlwps_lock);

	/*
	 * Clear the lwp directory and the lwpid hash table
	 * now that /proc can't bother us any more.
	 * We free the memory below, after dropping p->p_lock.
	 */
	lwpdir = p->p_lwpdir;
	lwpdir_sz = p->p_lwpdir_sz;
	tidhash = p->p_tidhash;
	tidhash_sz = p->p_tidhash_sz;
	ret_tidhash = p->p_ret_tidhash;
	p->p_lwpdir = NULL;
	p->p_lwpfree = NULL;
	p->p_lwpdir_sz = 0;
	p->p_tidhash = NULL;
	p->p_tidhash_sz = 0;
	p->p_ret_tidhash = NULL;

	/*
	 * If the process has context ops installed, call the exit routine
	 * on behalf of this last remaining thread. Normally exitpctx() is
	 * called during thread_exit() or lwp_exit(), but because this is the
	 * last thread in the process, we must call it here. By the time
	 * thread_exit() is called (below), the association with the relevant
	 * process has been lost.
	 *
	 * We also free the context here.
	 */
	if (p->p_pctx) {
		kpreempt_disable();
		exitpctx(p);
		kpreempt_enable();

		freepctx(p, 0);
	}

	/*
	 * curthread's proc pointer is changed to point to the 'sched'
	 * process for the corresponding zone, except in the case when
	 * the exiting process is in fact a zsched instance, in which
	 * case the proc pointer is set to p0.  We do so, so that the
	 * process still points at the right zone when we call the VN_RELE()
	 * below.
	 *
	 * This is because curthread's original proc pointer can be freed as
	 * soon as the child sends a SIGCLD to its parent.  We use zsched so
	 * that for user processes, even in the final moments of death, the
	 * process is still associated with its zone.
	 */
	if (p != t->t_procp->p_zone->zone_zsched)
		t->t_procp = t->t_procp->p_zone->zone_zsched;
	else
		t->t_procp = &p0;

	mutex_exit(&p->p_lock);
	if (!evaporate) {
		p->p_pidflag &= ~CLDPEND;
		sigcld(p, sqp);
	} else {
		/*
		 * Do what sigcld() would do if the disposition
		 * of the SIGCHLD signal were set to be ignored.
		 */
		cv_broadcast(&p->p_srwchan_cv);
		freeproc(p);
	}
	mutex_exit(&pidlock);

	/*
	 * We don't release u_cdir and u_rdir until SZOMB is set.
	 * This protects us against dofusers().
	 */
	if (cdir)
		VN_RELE(cdir);
	if (rdir)
		VN_RELE(rdir);
	if (cwd)
		refstr_rele(cwd);

	/*
	 * task_rele() may ultimately cause the zone to go away (or
	 * may cause the last user process in a zone to go away, which
	 * signals zsched to go away).  So prior to this call, we must
	 * no longer point at zsched.
	 */
	t->t_procp = &p0;

	/*
	 * Free the lwp directory and tid hash tables that were detached
	 * from the proc above, including every retired tid hash table on
	 * the ret_tidhash chain.
	 */
	kmem_free(lwpdir, lwpdir_sz * sizeof (lwpdir_t));
	kmem_free(tidhash, tidhash_sz * sizeof (tidhash_t));
	while (ret_tidhash != NULL) {
		ret_tidhash_t *next = ret_tidhash->rth_next;
		kmem_free(ret_tidhash->rth_tidhash,
		    ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
		kmem_free(ret_tidhash, sizeof (*ret_tidhash));
		ret_tidhash = next;
	}

	thread_exit();
	/* NOTREACHED */
}
894 
895 /*
896  * Format siginfo structure for wait system calls.
897  */
898 void
899 winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
900 {
901 	ASSERT(MUTEX_HELD(&pidlock));
902 
903 	bzero(ip, sizeof (k_siginfo_t));
904 	ip->si_signo = SIGCLD;
905 	ip->si_code = pp->p_wcode;
906 	ip->si_pid = pp->p_pid;
907 	ip->si_ctid = PRCTID(pp);
908 	ip->si_zoneid = pp->p_zone->zone_id;
909 	ip->si_status = pp->p_wdata;
910 	ip->si_stime = pp->p_stime;
911 	ip->si_utime = pp->p_utime;
912 
913 	if (waitflag) {
914 		pp->p_wcode = 0;
915 		pp->p_wdata = 0;
916 		pp->p_pidflag &= ~CLDPEND;
917 	}
918 }
919 
/*
 * Wait system call.
 * Search for a terminated (zombie) child,
 * finally lay it to rest, and collect its status.
 * Look also for stopped children,
 * and pass back status from them.
 *
 * idtype/id select which children are of interest (P_PID, P_PGID or
 * P_ALL), options is a mask of W* flags, and *ip receives the siginfo
 * for the child that was found.
 *
 * Returns 0 on success (with *ip zeroed if nothing was ready in the
 * WNOHANG case), EINVAL for bad arguments, ECHILD if there is no child
 * that could satisfy the request, or EINTR if the wait was interrupted.
 */
int
waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
{
	int found;		/* p_child entries counted in the 2nd scan */
	proc_t *cp, *pp;	/* child being examined; calling process */
	int proc_gone;		/* exited children skipped: no WEXITED */
	int waitflag = !(options & WNOWAIT);

	/*
	 * Obsolete flag, defined here only for binary compatibility
	 * with old statically linked executables.  Delete this when
	 * we no longer care about these old and broken applications.
	 */
#define	_WNOCHLD	0400
	options &= ~_WNOCHLD;

	if (options == 0 || (options & ~WOPTMASK))
		return (EINVAL);

	switch (idtype) {
	case P_PID:
	case P_PGID:
		if (id < 0 || id >= maxpid)
			return (EINVAL);
		/* FALLTHROUGH */
	case P_ALL:
		break;
	default:
		return (EINVAL);
	}

	pp = ttoproc(curthread);

	/*
	 * Take pidlock so that the child lists can be searched.
	 */
	mutex_enter(&pidlock);

	/*
	 * if we are only looking for exited processes and child_ns list
	 * is empty no reason to look at all children.
	 */
	if (idtype == P_ALL &&
	    (options & ~WNOWAIT) == (WNOHANG | WEXITED) &&
	    pp->p_child_ns == NULL) {
		if (pp->p_child) {
			mutex_exit(&pidlock);
			bzero(ip, sizeof (k_siginfo_t));
			return (0);
		}
		mutex_exit(&pidlock);
		return (ECHILD);
	}

	while (pp->p_child != NULL) {

		proc_gone = 0;

		/*
		 * First pass: the new-state list, which is only expected
		 * to hold children in one of the exited states.
		 */
		for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {
			if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))
				continue;
			if (idtype == P_PID && id != cp->p_pid)
				continue;
			if (idtype == P_PGID && id != cp->p_pgrp)
				continue;

			switch (cp->p_wcode) {

			case CLD_TRAPPED:
			case CLD_STOPPED:
			case CLD_CONTINUED:
				cmn_err(CE_PANIC,
				    "waitid: wrong state %d on the p_newstate"
				    " list", cp->p_wcode);
				break;

			case CLD_EXITED:
			case CLD_DUMPED:
			case CLD_KILLED:
				if (!(options & WEXITED)) {
					/*
					 * Count how many are already gone
					 * for good.
					 */
					proc_gone++;
					break;
				}
				/*
				 * Report the child; unless WNOWAIT was
				 * given, also consume its state and free
				 * the proc.
				 */
				if (!waitflag) {
					winfo(cp, ip, 0);
				} else {
					winfo(cp, ip, 1);
					freeproc(cp);
				}
				mutex_exit(&pidlock);
				if (waitflag) {		/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);
			}

			/* A specific pid can match at most one child. */
			if (idtype == P_PID)
				break;
		}

		/*
		 * None of the children on the p_child_ns list satisfied
		 * the request.  Second pass: check all the children.
		 */
		found = 0;
		for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {
			if (idtype == P_PID && id != cp->p_pid)
				continue;
			if (idtype == P_PGID && id != cp->p_pgrp)
				continue;

			switch (cp->p_wcode) {
			case CLD_TRAPPED:
				if (!(options & WTRAPPED))
					break;
				winfo(cp, ip, waitflag);
				mutex_exit(&pidlock);
				if (waitflag) {		/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);

			case CLD_STOPPED:
				if (!(options & WSTOPPED))
					break;
				/* Is it still stopped? */
				mutex_enter(&cp->p_lock);
				if (!jobstopped(cp)) {
					mutex_exit(&cp->p_lock);
					break;
				}
				mutex_exit(&cp->p_lock);
				winfo(cp, ip, waitflag);
				mutex_exit(&pidlock);
				if (waitflag) {		/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);

			case CLD_CONTINUED:
				if (!(options & WCONTINUED))
					break;
				winfo(cp, ip, waitflag);
				mutex_exit(&pidlock);
				if (waitflag) {		/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);

			case CLD_EXITED:
			case CLD_DUMPED:
			case CLD_KILLED:
				if (idtype != P_PID &&
				    (cp->p_pidflag & CLDWAITPID))
					continue;
				/*
				 * Don't complain if a process was found in
				 * the first loop but we broke out of the loop
				 * because of the arguments passed to us.
				 */
				if (proc_gone == 0) {
					cmn_err(CE_PANIC,
					    "waitid: wrong state on the"
					    " p_child list");
				} else {
					break;
				}
			}

			/*
			 * Reached for every matching child that did not
			 * return or "continue" above, including exited
			 * children already counted in proc_gone.
			 */
			found++;

			if (idtype == P_PID)
				break;
		}

		/*
		 * If we found no interesting processes at all,
		 * break out and return ECHILD.
		 */
		if (found + proc_gone == 0)
			break;

		if (options & WNOHANG) {
			mutex_exit(&pidlock);
			bzero(ip, sizeof (k_siginfo_t));
			/*
			 * We should set ip->si_signo = SIGCLD,
			 * but there is an SVVS test that expects
			 * ip->si_signo to be zero in this case.
			 */
			return (0);
		}

		/*
		 * If we found no processes of interest that could
		 * change state while we wait, we don't wait at all.
		 * Get out with ECHILD according to SVID.
		 *
		 * found also counts the exited children tallied in
		 * proc_gone, so equality means every child found has
		 * already exited.
		 */
		if (found == proc_gone)
			break;

		/* Sleep on p_cv; a zero return is reported as EINTR. */
		if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
			mutex_exit(&pidlock);
			return (EINTR);
		}
	}
	mutex_exit(&pidlock);
	return (ECHILD);
}
1144 
1145 int
1146 waitsys(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1147 {
1148 	int error;
1149 	k_siginfo_t info;
1150 
1151 	if (error = waitid(idtype, id, &info, options))
1152 		return (set_errno(error));
1153 	if (copyout(&info, infop, sizeof (k_siginfo_t)))
1154 		return (set_errno(EFAULT));
1155 	return (0);
1156 }
1157 
1158 #ifdef _SYSCALL32_IMPL
1159 
1160 int
1161 waitsys32(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1162 {
1163 	int error;
1164 	k_siginfo_t info;
1165 	siginfo32_t info32;
1166 
1167 	if (error = waitid(idtype, id, &info, options))
1168 		return (set_errno(error));
1169 	siginfo_kto32(&info, &info32);
1170 	if (copyout(&info32, infop, sizeof (info32)))
1171 		return (set_errno(EFAULT));
1172 	return (0);
1173 }
1174 
1175 #endif	/* _SYSCALL32_IMPL */
1176 
1177 void
1178 proc_detach(proc_t *p)
1179 {
1180 	proc_t *q;
1181 
1182 	ASSERT(MUTEX_HELD(&pidlock));
1183 
1184 	q = p->p_parent;
1185 	ASSERT(q != NULL);
1186 
1187 	/*
1188 	 * Take it off the newstate list of its parent
1189 	 */
1190 	delete_ns(q, p);
1191 
1192 	if (q->p_child == p) {
1193 		q->p_child = p->p_sibling;
1194 		/*
1195 		 * If the parent has no children, it better not
1196 		 * have any with new states either!
1197 		 */
1198 		ASSERT(q->p_child ? 1 : q->p_child_ns == NULL);
1199 	}
1200 
1201 	if (p->p_sibling) {
1202 		p->p_sibling->p_psibling = p->p_psibling;
1203 	}
1204 
1205 	if (p->p_psibling) {
1206 		p->p_psibling->p_sibling = p->p_sibling;
1207 	}
1208 }
1209 
1210 /*
1211  * Remove zombie children from the process table.
1212  */
1213 void
1214 freeproc(proc_t *p)
1215 {
1216 	proc_t *q;
1217 	task_t *tk;
1218 
1219 	ASSERT(p->p_stat == SZOMB);
1220 	ASSERT(p->p_tlist == NULL);
1221 	ASSERT(MUTEX_HELD(&pidlock));
1222 
1223 	sigdelq(p, NULL, 0);
1224 	if (p->p_killsqp) {
1225 		siginfofree(p->p_killsqp);
1226 		p->p_killsqp = NULL;
1227 	}
1228 
1229 	prfree(p);	/* inform /proc */
1230 
1231 	/*
1232 	 * Don't free the init processes.
1233 	 * Other dying processes will access it.
1234 	 */
1235 	if (p == proc_init)
1236 		return;
1237 
1238 
1239 	/*
1240 	 * We wait until now to free the cred structure because a
1241 	 * zombie process's credentials may be examined by /proc.
1242 	 * No cred locking needed because there are no threads at this point.
1243 	 */
1244 	upcount_dec(crgetruid(p->p_cred), crgetzoneid(p->p_cred));
1245 	crfree(p->p_cred);
1246 	if (p->p_corefile != NULL) {
1247 		corectl_path_rele(p->p_corefile);
1248 		p->p_corefile = NULL;
1249 	}
1250 	if (p->p_content != NULL) {
1251 		corectl_content_rele(p->p_content);
1252 		p->p_content = NULL;
1253 	}
1254 
1255 	if (p->p_nextofkin && !((p->p_nextofkin->p_flag & SNOWAIT) ||
1256 	    (PTOU(p->p_nextofkin)->u_signal[SIGCLD - 1] == SIG_IGN))) {
1257 		/*
1258 		 * This should still do the right thing since p_utime/stime
1259 		 * get set to the correct value on process exit, so it
1260 		 * should get properly updated
1261 		 */
1262 		p->p_nextofkin->p_cutime += p->p_utime;
1263 		p->p_nextofkin->p_cstime += p->p_stime;
1264 
1265 		p->p_nextofkin->p_cacct[LMS_USER] += p->p_acct[LMS_USER];
1266 		p->p_nextofkin->p_cacct[LMS_SYSTEM] += p->p_acct[LMS_SYSTEM];
1267 		p->p_nextofkin->p_cacct[LMS_TRAP] += p->p_acct[LMS_TRAP];
1268 		p->p_nextofkin->p_cacct[LMS_TFAULT] += p->p_acct[LMS_TFAULT];
1269 		p->p_nextofkin->p_cacct[LMS_DFAULT] += p->p_acct[LMS_DFAULT];
1270 		p->p_nextofkin->p_cacct[LMS_KFAULT] += p->p_acct[LMS_KFAULT];
1271 		p->p_nextofkin->p_cacct[LMS_USER_LOCK]
1272 		    += p->p_acct[LMS_USER_LOCK];
1273 		p->p_nextofkin->p_cacct[LMS_SLEEP] += p->p_acct[LMS_SLEEP];
1274 		p->p_nextofkin->p_cacct[LMS_WAIT_CPU]
1275 		    += p->p_acct[LMS_WAIT_CPU];
1276 		p->p_nextofkin->p_cacct[LMS_STOPPED] += p->p_acct[LMS_STOPPED];
1277 
1278 		p->p_nextofkin->p_cru.minflt	+= p->p_ru.minflt;
1279 		p->p_nextofkin->p_cru.majflt	+= p->p_ru.majflt;
1280 		p->p_nextofkin->p_cru.nswap	+= p->p_ru.nswap;
1281 		p->p_nextofkin->p_cru.inblock	+= p->p_ru.inblock;
1282 		p->p_nextofkin->p_cru.oublock	+= p->p_ru.oublock;
1283 		p->p_nextofkin->p_cru.msgsnd	+= p->p_ru.msgsnd;
1284 		p->p_nextofkin->p_cru.msgrcv	+= p->p_ru.msgrcv;
1285 		p->p_nextofkin->p_cru.nsignals	+= p->p_ru.nsignals;
1286 		p->p_nextofkin->p_cru.nvcsw	+= p->p_ru.nvcsw;
1287 		p->p_nextofkin->p_cru.nivcsw	+= p->p_ru.nivcsw;
1288 		p->p_nextofkin->p_cru.sysc	+= p->p_ru.sysc;
1289 		p->p_nextofkin->p_cru.ioch	+= p->p_ru.ioch;
1290 
1291 	}
1292 
1293 	q = p->p_nextofkin;
1294 	if (q && q->p_orphan == p)
1295 		q->p_orphan = p->p_nextorph;
1296 	else if (q) {
1297 		for (q = q->p_orphan; q; q = q->p_nextorph)
1298 			if (q->p_nextorph == p)
1299 				break;
1300 		ASSERT(q && q->p_nextorph == p);
1301 		q->p_nextorph = p->p_nextorph;
1302 	}
1303 
1304 	/*
1305 	 * The process table slot is being freed, so it is now safe to give up
1306 	 * task and project membership.
1307 	 */
1308 	mutex_enter(&p->p_lock);
1309 	tk = p->p_task;
1310 	task_detach(p);
1311 	mutex_exit(&p->p_lock);
1312 
1313 	proc_detach(p);
1314 	pid_exit(p, tk);	/* frees pid and proc structure */
1315 
1316 	task_rele(tk);
1317 }
1318 
1319 /*
1320  * Delete process "child" from the newstate list of process "parent"
1321  */
1322 void
1323 delete_ns(proc_t *parent, proc_t *child)
1324 {
1325 	proc_t **ns;
1326 
1327 	ASSERT(MUTEX_HELD(&pidlock));
1328 	ASSERT(child->p_parent == parent);
1329 	for (ns = &parent->p_child_ns; *ns != NULL; ns = &(*ns)->p_sibling_ns) {
1330 		if (*ns == child) {
1331 
1332 			ASSERT((*ns)->p_parent == parent);
1333 
1334 			*ns = child->p_sibling_ns;
1335 			child->p_sibling_ns = NULL;
1336 			return;
1337 		}
1338 	}
1339 }
1340 
1341 /*
1342  * Add process "child" to the new state list of process "parent"
1343  */
1344 void
1345 add_ns(proc_t *parent, proc_t *child)
1346 {
1347 	ASSERT(child->p_sibling_ns == NULL);
1348 	child->p_sibling_ns = parent->p_child_ns;
1349 	parent->p_child_ns = child;
1350 }
1351