xref: /titanic_41/usr/src/uts/common/os/exit.c (revision 986c3e858c71d3da2429d82e3ca17e44988f94b3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.74 */
30 
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/user.h>
37 #include <sys/errno.h>
38 #include <sys/proc.h>
39 #include <sys/ucontext.h>
40 #include <sys/procfs.h>
41 #include <sys/vnode.h>
42 #include <sys/acct.h>
43 #include <sys/var.h>
44 #include <sys/cmn_err.h>
45 #include <sys/debug.h>
46 #include <sys/wait.h>
47 #include <sys/siginfo.h>
48 #include <sys/procset.h>
49 #include <sys/class.h>
50 #include <sys/file.h>
51 #include <sys/session.h>
52 #include <sys/kmem.h>
53 #include <sys/vtrace.h>
54 #include <sys/prsystm.h>
55 #include <sys/ipc.h>
56 #include <sys/sem_impl.h>
57 #include <c2/audit.h>
58 #include <sys/aio_impl.h>
59 #include <vm/as.h>
60 #include <sys/poll.h>
61 #include <sys/door.h>
62 #include <sys/lwpchan_impl.h>
63 #include <sys/utrap.h>
64 #include <sys/task.h>
65 #include <sys/exacct.h>
66 #include <sys/cyclic.h>
67 #include <sys/schedctl.h>
68 #include <sys/rctl.h>
69 #include <sys/contract_impl.h>
70 #include <sys/contract/process_impl.h>
71 #include <sys/list.h>
72 #include <sys/dtrace.h>
73 #include <sys/pool.h>
74 #include <sys/sdt.h>
75 #include <sys/corectl.h>
76 
77 /*
78  * convert code/data pair into old style wait status
79  */
80 int
81 wstat(int code, int data)
82 {
83 	int stat = (data & 0377);
84 
85 	switch (code) {
86 	case CLD_EXITED:
87 		stat <<= 8;
88 		break;
89 	case CLD_DUMPED:
90 		stat |= WCOREFLG;
91 		break;
92 	case CLD_KILLED:
93 		break;
94 	case CLD_TRAPPED:
95 	case CLD_STOPPED:
96 		stat <<= 8;
97 		stat |= WSTOPFLG;
98 		break;
99 	case CLD_CONTINUED:
100 		stat = WCONTFLG;
101 		break;
102 	default:
103 		cmn_err(CE_PANIC, "wstat: bad code");
104 		/* NOTREACHED */
105 	}
106 	return (stat);
107 }
108 
109 static char *
110 exit_reason(char *buf, size_t bufsz, int what, int why)
111 {
112 	switch (why) {
113 	case CLD_EXITED:
114 		(void) snprintf(buf, bufsz, "exited with status %d", what);
115 		break;
116 	case CLD_KILLED:
117 		(void) snprintf(buf, bufsz, "exited on fatal signal %d", what);
118 		break;
119 	case CLD_DUMPED:
120 		(void) snprintf(buf, bufsz, "core dumped on signal %d", what);
121 		break;
122 	default:
123 		(void) snprintf(buf, bufsz, "encountered unknown error "
124 		    "(%d, %d)", why, what);
125 		break;
126 	}
127 
128 	return (buf);
129 }
130 
131 /*
132  * exit system call: pass back caller's arg.
133  */
134 void
135 rexit(int rval)
136 {
137 	exit(CLD_EXITED, rval);
138 }
139 
140 /*
141  * Called by proc_exit() when a zone's init exits, presumably because
142  * it failed.  As long as the given zone is still in the "running"
143  * state, we will re-exec() init, but first we need to reset things
144  * which are usually inherited across exec() but will break init's
145  * assumption that it is being exec()'d from a virgin process.  Most
146  * importantly this includes closing all file descriptors (exec only
147  * closes those marked close-on-exec) and resetting signals (exec only
148  * resets handled signals, and we need to clear any signals which
149  * killed init).  Anything else that exec(2) says would be inherited,
150  * but would affect the execution of init, needs to be reset.
151  */
152 static int
153 restart_init(int what, int why)
154 {
155 	kthread_t *t = curthread;
156 	klwp_t *lwp = ttolwp(t);
157 	proc_t *p = ttoproc(t);
158 	user_t *up = PTOU(p);
159 
160 	vnode_t *oldcd, *oldrd;
161 	sess_t *sp;
162 	int i, err;
163 	char reason_buf[64];
164 
165 	/*
166 	 * Let zone admin (and global zone admin if this is for a non-global
167 	 * zone) know that init has failed and will be restarted.
168 	 */
169 	zcmn_err(p->p_zone->zone_id, CE_WARN,
170 	    "init(1M) %s: restarting automatically",
171 	    exit_reason(reason_buf, sizeof (reason_buf), what, why));
172 
173 	if (!INGLOBALZONE(p)) {
174 		cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: "
175 		    "restarting automatically",
176 		    p->p_zone->zone_name, p->p_pid, reason_buf);
177 	}
178 
179 	/*
180 	 * Remove any fpollinfo_t's for this (last) thread from our file
181 	 * descriptors so closeall() can ASSERT() that they're all gone.
182 	 * Then close all open file descriptors in the process.
183 	 */
184 	pollcleanup();
185 	closeall(P_FINFO(p));
186 
187 	/*
188 	 * Grab p_lock and begin clearing miscellaneous global process
189 	 * state that needs to be reset before we exec the new init(1M).
190 	 */
191 
192 	mutex_enter(&p->p_lock);
193 	prbarrier(p);
194 
195 	p->p_flag &= ~(SKILLED | SEXTKILLED | SEXITING | SDOCORE);
196 	up->u_cmask = CMASK;
197 
198 	sigemptyset(&t->t_hold);
199 	sigemptyset(&t->t_sig);
200 	sigemptyset(&t->t_extsig);
201 
202 	sigemptyset(&p->p_sig);
203 	sigemptyset(&p->p_extsig);
204 
205 	sigdelq(p, t, 0);
206 	sigdelq(p, NULL, 0);
207 
208 	if (p->p_killsqp) {
209 		siginfofree(p->p_killsqp);
210 		p->p_killsqp = NULL;
211 	}
212 
213 	/*
214 	 * Reset any signals that are ignored back to the default disposition.
215 	 * Other u_signal members will be cleared when exec calls sigdefault().
216 	 */
217 	for (i = 1; i < NSIG; i++) {
218 		if (up->u_signal[i - 1] == SIG_IGN) {
219 			up->u_signal[i - 1] = SIG_DFL;
220 			sigemptyset(&up->u_sigmask[i - 1]);
221 		}
222 	}
223 
224 	/*
225 	 * Clear the current signal, any signal info associated with it, and
226 	 * any signal information from contracts and/or contract templates.
227 	 */
228 	lwp->lwp_cursig = 0;
229 	lwp->lwp_extsig = 0;
230 	if (lwp->lwp_curinfo != NULL) {
231 		siginfofree(lwp->lwp_curinfo);
232 		lwp->lwp_curinfo = NULL;
233 	}
234 	lwp_ctmpl_clear(lwp);
235 
236 	/*
237 	 * Reset both the process root directory and the current working
238 	 * directory to the root of the zone just as we do during boot.
239 	 */
240 	VN_HOLD(p->p_zone->zone_rootvp);
241 	oldrd = up->u_rdir;
242 	up->u_rdir = p->p_zone->zone_rootvp;
243 
244 	VN_HOLD(p->p_zone->zone_rootvp);
245 	oldcd = up->u_cdir;
246 	up->u_cdir = p->p_zone->zone_rootvp;
247 
248 	if (up->u_cwd != NULL) {
249 		refstr_rele(up->u_cwd);
250 		up->u_cwd = NULL;
251 	}
252 
253 	mutex_exit(&p->p_lock);
254 
255 	if (oldrd != NULL)
256 		VN_RELE(oldrd);
257 	if (oldcd != NULL)
258 		VN_RELE(oldcd);
259 
260 	/*
261 	 * Free the controlling tty.
262 	 */
263 	mutex_enter(&pidlock);
264 	sp = p->p_sessp;
265 	if (sp->s_sidp == p->p_pidp && sp->s_vp != NULL) {
266 		mutex_exit(&pidlock);
267 		freectty(sp);
268 	} else {
269 		mutex_exit(&pidlock);
270 	}
271 
272 	/*
273 	 * Now exec() the new init(1M) on top of the current process.  If we
274 	 * succeed, the caller will treat this like a successful system call.
275 	 * If we fail, we issue messages and the caller will proceed with exit.
276 	 */
277 	err = exec_init(p->p_zone->zone_initname, NULL);
278 
279 	if (err == 0)
280 		return (0);
281 
282 	zcmn_err(p->p_zone->zone_id, CE_WARN,
283 	    "failed to restart init(1M) (err=%d): system reboot required", err);
284 
285 	if (!INGLOBALZONE(p)) {
286 		cmn_err(CE_WARN, "failed to restart init(1M) for zone %s "
287 		    "(pid %d, err=%d): zoneadm(1M) boot required",
288 		    p->p_zone->zone_name, p->p_pid, err);
289 	}
290 
291 	return (-1);
292 }
293 
294 /*
295  * Release resources.
296  * Enter zombie state.
297  * Wake up parent and init processes,
298  * and dispose of children.
299  */
300 void
301 exit(int why, int what)
302 {
303 	/*
304 	 * If proc_exit() fails, then some other lwp in the process
305 	 * got there first.  We just have to call lwp_exit() to allow
306 	 * the other lwp to finish exiting the process.  Otherwise we're
307 	 * restarting init, and should return.
308 	 */
309 	if (proc_exit(why, what) != 0) {
310 		mutex_enter(&curproc->p_lock);
311 		ASSERT(curproc->p_flag & SEXITLWPS);
312 		lwp_exit();
313 		/* NOTREACHED */
314 	}
315 }
316 
317 /*
318  * Set the SEXITING flag on the process, after making sure /proc does
319  * not have it locked.  This is done in more places than proc_exit(),
320  * so it is a separate function.
321  */
322 void
323 proc_is_exiting(proc_t *p)
324 {
325 	mutex_enter(&p->p_lock);
326 	prbarrier(p);
327 	p->p_flag |= SEXITING;
328 	mutex_exit(&p->p_lock);
329 }
330 
331 /*
332  * Return value:
333  *   1 - exitlwps() failed, call (or continue) lwp_exit()
334  *   0 - restarting init.  Return through system call path
335  */
336 int
337 proc_exit(int why, int what)
338 {
339 	kthread_t *t = curthread;
340 	klwp_t *lwp = ttolwp(t);
341 	proc_t *p = ttoproc(t);
342 	zone_t *z = p->p_zone;
343 	timeout_id_t tmp_id;
344 	int rv;
345 	proc_t *q;
346 	sess_t *sp;
347 	task_t *tk;
348 	vnode_t *exec_vp, *execdir_vp, *cdir, *rdir;
349 	sigqueue_t *sqp;
350 	lwpdir_t *lwpdir;
351 	uint_t lwpdir_sz;
352 	lwpdir_t **tidhash;
353 	uint_t tidhash_sz;
354 	refstr_t *cwd;
355 	hrtime_t hrutime, hrstime;
356 
357 	/*
358 	 * Stop and discard the process's lwps except for the current one,
359 	 * unless some other lwp beat us to it.  If exitlwps() fails then
360 	 * return and the calling lwp will call (or continue in) lwp_exit().
361 	 */
362 	proc_is_exiting(p);
363 	if (exitlwps(0) != 0)
364 		return (1);
365 
366 	DTRACE_PROC(lwp__exit);
367 	DTRACE_PROC1(exit, int, why);
368 
369 	/*
370 	 * Don't let init exit unless zone_start_init() failed its exec, or
371 	 * we are shutting down the zone or the machine.
372 	 *
373 	 * Since we are single threaded, we don't need to lock the
374 	 * following accesses to zone_proc_initpid.
375 	 */
376 	if (p->p_pid == z->zone_proc_initpid) {
377 		if (z->zone_boot_err == 0 &&
378 		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
379 		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN &&
380 		    restart_init(what, why) == 0)
381 			return (0);
382 		/*
383 		 * Since we didn't or couldn't restart init, we clear
384 		 * the zone's init state and proceed with exit
385 		 * processing.
386 		 */
387 		z->zone_proc_initpid = -1;
388 	}
389 
390 	/*
391 	 * Allocate a sigqueue now, before we grab locks.
392 	 * It will be given to sigcld(), below.
393 	 */
394 	sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
395 
396 	/*
397 	 * revoke any doors created by the process.
398 	 */
399 	if (p->p_door_list)
400 		door_exit();
401 
402 	/*
403 	 * Release schedctl data structures.
404 	 */
405 	if (p->p_pagep)
406 		schedctl_proc_cleanup();
407 
408 	/*
409 	 * make sure all pending kaio has completed.
410 	 */
411 	if (p->p_aio)
412 		aio_cleanup_exit();
413 
414 	/*
415 	 * discard the lwpchan cache.
416 	 */
417 	if (p->p_lcp != NULL)
418 		lwpchan_destroy_cache(0);
419 
420 	/*
421 	 * Clean up any DTrace helper actions or probes for the process.
422 	 */
423 	if (p->p_dtrace_helpers != NULL) {
424 		ASSERT(dtrace_helpers_cleanup != NULL);
425 		(*dtrace_helpers_cleanup)();
426 	}
427 
428 	/* untimeout the realtime timers */
429 	if (p->p_itimer != NULL)
430 		timer_exit();
431 
432 	if ((tmp_id = p->p_alarmid) != 0) {
433 		p->p_alarmid = 0;
434 		(void) untimeout(tmp_id);
435 	}
436 
437 	/*
438 	 * Remove any fpollinfo_t's for this (last) thread from our file
439 	 * descriptors so closeall() can ASSERT() that they're all gone.
440 	 */
441 	pollcleanup();
442 
443 	if (p->p_rprof_cyclic != CYCLIC_NONE) {
444 		mutex_enter(&cpu_lock);
445 		cyclic_remove(p->p_rprof_cyclic);
446 		mutex_exit(&cpu_lock);
447 	}
448 
449 	mutex_enter(&p->p_lock);
450 
451 	/*
452 	 * Clean up any DTrace probes associated with this process.
453 	 */
454 	if (p->p_dtrace_probes) {
455 		ASSERT(dtrace_fasttrap_exit_ptr != NULL);
456 		dtrace_fasttrap_exit_ptr(p);
457 	}
458 
459 	while ((tmp_id = p->p_itimerid) != 0) {
460 		p->p_itimerid = 0;
461 		mutex_exit(&p->p_lock);
462 		(void) untimeout(tmp_id);
463 		mutex_enter(&p->p_lock);
464 	}
465 
466 	lwp_cleanup();
467 
468 	/*
469 	 * We are about to exit; prevent our resource associations from
470 	 * being changed.
471 	 */
472 	pool_barrier_enter();
473 
474 	/*
475 	 * Block the process against /proc now that we have really
476 	 * acquired p->p_lock (to manipulate p_tlist at least).
477 	 */
478 	prbarrier(p);
479 
480 #ifdef	SUN_SRC_COMPAT
481 	if (code == CLD_KILLED)
482 		u.u_acflag |= AXSIG;
483 #endif
484 	sigfillset(&p->p_ignore);
485 	sigemptyset(&p->p_siginfo);
486 	sigemptyset(&p->p_sig);
487 	sigemptyset(&p->p_extsig);
488 	sigemptyset(&t->t_sig);
489 	sigemptyset(&t->t_extsig);
490 	sigemptyset(&p->p_sigmask);
491 	sigdelq(p, t, 0);
492 	lwp->lwp_cursig = 0;
493 	lwp->lwp_extsig = 0;
494 	p->p_flag &= ~(SKILLED | SEXTKILLED);
495 	if (lwp->lwp_curinfo) {
496 		siginfofree(lwp->lwp_curinfo);
497 		lwp->lwp_curinfo = NULL;
498 	}
499 
500 	t->t_proc_flag |= TP_LWPEXIT;
501 	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
502 	prlwpexit(t);		/* notify /proc */
503 	lwp_hash_out(p, t->t_tid);
504 	prexit(p);
505 
506 	p->p_lwpcnt = 0;
507 	p->p_tlist = NULL;
508 	sigqfree(p);
509 	term_mstate(t);
510 	p->p_mterm = gethrtime();
511 
512 	exec_vp = p->p_exec;
513 	execdir_vp = p->p_execdir;
514 	p->p_exec = NULLVP;
515 	p->p_execdir = NULLVP;
516 	mutex_exit(&p->p_lock);
517 	if (exec_vp)
518 		VN_RELE(exec_vp);
519 	if (execdir_vp)
520 		VN_RELE(execdir_vp);
521 
522 	pr_free_watched_pages(p);
523 
524 	closeall(P_FINFO(p));
525 
526 	mutex_enter(&pidlock);
527 	sp = p->p_sessp;
528 	if (sp->s_sidp == p->p_pidp && sp->s_vp != NULL) {
529 		mutex_exit(&pidlock);
530 		freectty(sp);
531 	} else
532 		mutex_exit(&pidlock);
533 
534 #if defined(__sparc)
535 	if (p->p_utraps != NULL)
536 		utrap_free(p);
537 #endif
538 	if (p->p_semacct)			/* IPC semaphore exit */
539 		semexit(p);
540 	rv = wstat(why, what);
541 
542 	acct(rv & 0xff);
543 	exacct_commit_proc(p, rv);
544 
545 	/*
546 	 * Release any resources associated with C2 auditing
547 	 */
548 #ifdef C2_AUDIT
549 	if (audit_active) {
550 		/*
551 		 * audit exit system call
552 		 */
553 		audit_exit(why, what);
554 	}
555 #endif
556 
557 	/*
558 	 * Free address space.
559 	 */
560 	relvm();
561 
562 	/*
563 	 * Release held contracts.
564 	 */
565 	contract_exit(p);
566 
567 	/*
568 	 * Depart our encapsulating process contract.
569 	 */
570 	if ((p->p_flag & SSYS) == 0) {
571 		ASSERT(p->p_ct_process);
572 		contract_process_exit(p->p_ct_process, p, rv);
573 	}
574 
575 	/*
576 	 * Remove pool association, and block if requested by pool_do_bind.
577 	 */
578 	mutex_enter(&p->p_lock);
579 	ASSERT(p->p_pool->pool_ref > 0);
580 	atomic_add_32(&p->p_pool->pool_ref, -1);
581 	p->p_pool = pool_default;
582 	/*
583 	 * Now that our address space has been freed and all other threads
584 	 * in this process have exited, set the PEXITED pool flag.  This
585 	 * tells the pools subsystems to ignore this process if it was
586 	 * requested to rebind this process to a new pool.
587 	 */
588 	p->p_poolflag |= PEXITED;
589 	pool_barrier_exit();
590 	mutex_exit(&p->p_lock);
591 
592 	mutex_enter(&pidlock);
593 
594 	/*
595 	 * Delete this process from the newstate list of its parent. We
596 	 * will put it in the right place in the sigcld in the end.
597 	 */
598 	delete_ns(p->p_parent, p);
599 
600 	/*
601 	 * Reassign the orphans to the next of kin.
602 	 * Don't rearrange init's orphanage.
603 	 */
604 	if ((q = p->p_orphan) != NULL && p != proc_init) {
605 
606 		proc_t *nokp = p->p_nextofkin;
607 
608 		for (;;) {
609 			q->p_nextofkin = nokp;
610 			if (q->p_nextorph == NULL)
611 				break;
612 			q = q->p_nextorph;
613 		}
614 		q->p_nextorph = nokp->p_orphan;
615 		nokp->p_orphan = p->p_orphan;
616 		p->p_orphan = NULL;
617 	}
618 
619 	/*
620 	 * Reassign the children to init.
621 	 * Don't try to assign init's children to init.
622 	 */
623 	if ((q = p->p_child) != NULL && p != proc_init) {
624 		struct proc	*np;
625 		struct proc	*initp = proc_init;
626 		boolean_t	setzonetop = B_FALSE;
627 
628 		if (!INGLOBALZONE(curproc))
629 			setzonetop = B_TRUE;
630 
631 		pgdetach(p);
632 
633 		do {
634 			np = q->p_sibling;
635 			/*
636 			 * Delete it from its current parent new state
637 			 * list and add it to init new state list
638 			 */
639 			delete_ns(q->p_parent, q);
640 
641 			q->p_ppid = 1;
642 			if (setzonetop) {
643 				mutex_enter(&q->p_lock);
644 				q->p_flag |= SZONETOP;
645 				mutex_exit(&q->p_lock);
646 			}
647 			q->p_parent = initp;
648 
649 			/*
650 			 * Since q will be the first child,
651 			 * it will not have a previous sibling.
652 			 */
653 			q->p_psibling = NULL;
654 			if (initp->p_child) {
655 				initp->p_child->p_psibling = q;
656 			}
657 			q->p_sibling = initp->p_child;
658 			initp->p_child = q;
659 			if (q->p_proc_flag & P_PR_PTRACE) {
660 				mutex_enter(&q->p_lock);
661 				sigtoproc(q, NULL, SIGKILL);
662 				mutex_exit(&q->p_lock);
663 			}
664 			/*
665 			 * sigcld() will add the child to parents
666 			 * newstate list.
667 			 */
668 			if (q->p_stat == SZOMB)
669 				sigcld(q, NULL);
670 		} while ((q = np) != NULL);
671 
672 		p->p_child = NULL;
673 		ASSERT(p->p_child_ns == NULL);
674 	}
675 
676 	TRACE_1(TR_FAC_PROC, TR_PROC_EXIT, "proc_exit: %p", p);
677 
678 	mutex_enter(&p->p_lock);
679 	CL_EXIT(curthread); /* tell the scheduler that curthread is exiting */
680 
681 	hrutime = mstate_aggr_state(p, LMS_USER);
682 	hrstime = mstate_aggr_state(p, LMS_SYSTEM);
683 	p->p_utime = (clock_t)NSEC_TO_TICK(hrutime) + p->p_cutime;
684 	p->p_stime = (clock_t)NSEC_TO_TICK(hrstime) + p->p_cstime;
685 
686 	p->p_acct[LMS_USER]	+= p->p_cacct[LMS_USER];
687 	p->p_acct[LMS_SYSTEM]	+= p->p_cacct[LMS_SYSTEM];
688 	p->p_acct[LMS_TRAP]	+= p->p_cacct[LMS_TRAP];
689 	p->p_acct[LMS_TFAULT]	+= p->p_cacct[LMS_TFAULT];
690 	p->p_acct[LMS_DFAULT]	+= p->p_cacct[LMS_DFAULT];
691 	p->p_acct[LMS_KFAULT]	+= p->p_cacct[LMS_KFAULT];
692 	p->p_acct[LMS_USER_LOCK] += p->p_cacct[LMS_USER_LOCK];
693 	p->p_acct[LMS_SLEEP]	+= p->p_cacct[LMS_SLEEP];
694 	p->p_acct[LMS_WAIT_CPU]	+= p->p_cacct[LMS_WAIT_CPU];
695 	p->p_acct[LMS_STOPPED]	+= p->p_cacct[LMS_STOPPED];
696 
697 	p->p_ru.minflt	+= p->p_cru.minflt;
698 	p->p_ru.majflt	+= p->p_cru.majflt;
699 	p->p_ru.nswap	+= p->p_cru.nswap;
700 	p->p_ru.inblock	+= p->p_cru.inblock;
701 	p->p_ru.oublock	+= p->p_cru.oublock;
702 	p->p_ru.msgsnd	+= p->p_cru.msgsnd;
703 	p->p_ru.msgrcv	+= p->p_cru.msgrcv;
704 	p->p_ru.nsignals += p->p_cru.nsignals;
705 	p->p_ru.nvcsw	+= p->p_cru.nvcsw;
706 	p->p_ru.nivcsw	+= p->p_cru.nivcsw;
707 	p->p_ru.sysc	+= p->p_cru.sysc;
708 	p->p_ru.ioch	+= p->p_cru.ioch;
709 
710 	p->p_stat = SZOMB;
711 	p->p_proc_flag &= ~P_PR_PTRACE;
712 	p->p_wdata = what;
713 	p->p_wcode = (char)why;
714 
715 	cdir = PTOU(p)->u_cdir;
716 	rdir = PTOU(p)->u_rdir;
717 	cwd = PTOU(p)->u_cwd;
718 
719 	/*
720 	 * Release resource controls, as they are no longer enforceable.
721 	 */
722 	rctl_set_free(p->p_rctls);
723 
724 	/*
725 	 * Give up task and project memberships.  Decrement tk_nlwps counter
726 	 * for our task.max-lwps resource control.  An extended accounting
727 	 * record, if that facility is active, is scheduled to be written.
728 	 * Zombie processes are false members of task0 for the remainder of
729 	 * their lifetime; no accounting information is recorded for them.
730 	 */
731 	tk = p->p_task;
732 
733 	mutex_enter(&p->p_zone->zone_nlwps_lock);
734 	tk->tk_nlwps--;
735 	tk->tk_proj->kpj_nlwps--;
736 	p->p_zone->zone_nlwps--;
737 	mutex_exit(&p->p_zone->zone_nlwps_lock);
738 	task_detach(p);
739 	p->p_task = task0p;
740 
741 	/*
742 	 * Clear the lwp directory and the lwpid hash table
743 	 * now that /proc can't bother us any more.
744 	 * We free the memory below, after dropping p->p_lock.
745 	 */
746 	lwpdir = p->p_lwpdir;
747 	lwpdir_sz = p->p_lwpdir_sz;
748 	tidhash = p->p_tidhash;
749 	tidhash_sz = p->p_tidhash_sz;
750 	p->p_lwpdir = NULL;
751 	p->p_lwpfree = NULL;
752 	p->p_lwpdir_sz = 0;
753 	p->p_tidhash = NULL;
754 	p->p_tidhash_sz = 0;
755 
756 	/*
757 	 * If the process has context ops installed, call the exit routine
758 	 * on behalf of this last remaining thread. Normally exitpctx() is
759 	 * called during thread_exit() or lwp_exit(), but because this is the
760 	 * last thread in the process, we must call it here. By the time
761 	 * thread_exit() is called (below), the association with the relevant
762 	 * process has been lost.
763 	 *
764 	 * We also free the context here.
765 	 */
766 	if (p->p_pctx) {
767 		kpreempt_disable();
768 		exitpctx(p);
769 		kpreempt_enable();
770 
771 		freepctx(p, 0);
772 	}
773 
774 	/*
775 	 * curthread's proc pointer is changed to point at p0 because
776 	 * curthread's original proc pointer can be freed as soon as
777 	 * the child sends a SIGCLD to its parent.
778 	 */
779 	t->t_procp = &p0;
780 
781 	mutex_exit(&p->p_lock);
782 	sigcld(p, sqp);
783 	mutex_exit(&pidlock);
784 
785 	task_rele(tk);
786 
787 	kmem_free(lwpdir, lwpdir_sz * sizeof (lwpdir_t));
788 	kmem_free(tidhash, tidhash_sz * sizeof (lwpdir_t *));
789 
790 	/*
791 	 * We don't release u_cdir and u_rdir until SZOMB is set.
792 	 * This protects us against dofusers().
793 	 */
794 	VN_RELE(cdir);
795 	if (rdir)
796 		VN_RELE(rdir);
797 	if (cwd)
798 		refstr_rele(cwd);
799 
800 	lwp_pcb_exit();
801 
802 	thread_exit();
803 	/* NOTREACHED */
804 }
805 
806 /*
807  * Format siginfo structure for wait system calls.
808  */
809 void
810 winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
811 {
812 	ASSERT(MUTEX_HELD(&pidlock));
813 
814 	bzero(ip, sizeof (k_siginfo_t));
815 	ip->si_signo = SIGCLD;
816 	ip->si_code = pp->p_wcode;
817 	ip->si_pid = pp->p_pid;
818 	ip->si_ctid = PRCTID(pp);
819 	ip->si_zoneid = pp->p_zone->zone_id;
820 	ip->si_status = pp->p_wdata;
821 	ip->si_stime = pp->p_stime;
822 	ip->si_utime = pp->p_utime;
823 
824 	if (waitflag) {
825 		pp->p_wcode = 0;
826 		pp->p_wdata = 0;
827 		pp->p_pidflag &= ~CLDPEND;
828 	}
829 }
830 
831 /*
832  * Wait system call.
833  * Search for a terminated (zombie) child,
834  * finally lay it to rest, and collect its status.
835  * Look also for stopped children,
836  * and pass back status from them.
837  */
838 int
839 waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
840 {
841 	int found;
842 	proc_t *cp, *pp;
843 	proc_t **nsp;
844 	int proc_gone;
845 	int waitflag = !(options & WNOWAIT);
846 
847 	/*
848 	 * Obsolete flag, defined here only for binary compatibility
849 	 * with old statically linked executables.  Delete this when
850 	 * we no longer care about these old and broken applications.
851 	 */
852 #define	_WNOCHLD	0400
853 	options &= ~_WNOCHLD;
854 
855 	if (options == 0 || (options & ~WOPTMASK))
856 		return (EINVAL);
857 
858 	switch (idtype) {
859 	case P_PID:
860 	case P_PGID:
861 		if (id < 0 || id >= maxpid)
862 			return (EINVAL);
863 		/* FALLTHROUGH */
864 	case P_ALL:
865 		break;
866 	default:
867 		return (EINVAL);
868 	}
869 
870 	pp = ttoproc(curthread);
871 
872 	/*
873 	 * lock parent mutex so that sibling chain can be searched.
874 	 */
875 	mutex_enter(&pidlock);
876 
877 	/*
878 	 * if we are only looking for exited processes and child_ns list
879 	 * is empty no reason to look at all children.
880 	 */
881 	if (idtype == P_ALL &&
882 	    (options & (WOPTMASK & ~WNOWAIT)) == (WNOHANG | WEXITED) &&
883 	    pp->p_child_ns == NULL) {
884 
885 		if (pp->p_child) {
886 			mutex_exit(&pidlock);
887 			bzero(ip, sizeof (k_siginfo_t));
888 			return (0);
889 		}
890 		mutex_exit(&pidlock);
891 		return (ECHILD);
892 	}
893 
894 	while ((cp = pp->p_child) != NULL) {
895 
896 		proc_gone = 0;
897 
898 		for (nsp = &pp->p_child_ns; *nsp; nsp = &(*nsp)->p_sibling_ns) {
899 			if (idtype == P_PID && id != (*nsp)->p_pid) {
900 				continue;
901 			}
902 			if (idtype == P_PGID && id != (*nsp)->p_pgrp) {
903 				continue;
904 			}
905 
906 			switch ((*nsp)->p_wcode) {
907 
908 			case CLD_TRAPPED:
909 			case CLD_STOPPED:
910 			case CLD_CONTINUED:
911 				cmn_err(CE_PANIC,
912 				    "waitid: wrong state %d on the p_newstate"
913 				    " list", (*nsp)->p_wcode);
914 				break;
915 
916 			case CLD_EXITED:
917 			case CLD_DUMPED:
918 			case CLD_KILLED:
919 				if (!(options & WEXITED)) {
920 					/*
921 					 * Count how many are already gone
922 					 * for good.
923 					 */
924 					proc_gone++;
925 					break;
926 				}
927 				if (!waitflag) {
928 					winfo((*nsp), ip, 0);
929 				} else {
930 					proc_t *xp = *nsp;
931 					winfo(xp, ip, 1);
932 					freeproc(xp);
933 				}
934 				mutex_exit(&pidlock);
935 				if (waitflag) {		/* accept SIGCLD */
936 					sigcld_delete(ip);
937 					sigcld_repost();
938 				}
939 				return (0);
940 			}
941 
942 			if (idtype == P_PID)
943 				break;
944 		}
945 
946 		/*
947 		 * Wow! None of the threads on the p_sibling_ns list were
948 		 * interesting threads. Check all the kids!
949 		 */
950 		found = 0;
951 		cp = pp->p_child;
952 		do {
953 			if (idtype == P_PID && id != cp->p_pid) {
954 				continue;
955 			}
956 			if (idtype == P_PGID && id != cp->p_pgrp) {
957 				continue;
958 			}
959 
960 			found++;
961 
962 			switch (cp->p_wcode) {
963 			case CLD_TRAPPED:
964 				if (!(options & WTRAPPED))
965 					break;
966 				winfo(cp, ip, waitflag);
967 				mutex_exit(&pidlock);
968 				if (waitflag) {		/* accept SIGCLD */
969 					sigcld_delete(ip);
970 					sigcld_repost();
971 				}
972 				return (0);
973 
974 			case CLD_STOPPED:
975 				if (!(options & WSTOPPED))
976 					break;
977 				/* Is it still stopped? */
978 				mutex_enter(&cp->p_lock);
979 				if (!jobstopped(cp)) {
980 					mutex_exit(&cp->p_lock);
981 					break;
982 				}
983 				mutex_exit(&cp->p_lock);
984 				winfo(cp, ip, waitflag);
985 				mutex_exit(&pidlock);
986 				if (waitflag) {		/* accept SIGCLD */
987 					sigcld_delete(ip);
988 					sigcld_repost();
989 				}
990 				return (0);
991 
992 			case CLD_CONTINUED:
993 				if (!(options & WCONTINUED))
994 					break;
995 				winfo(cp, ip, waitflag);
996 				mutex_exit(&pidlock);
997 				if (waitflag) {		/* accept SIGCLD */
998 					sigcld_delete(ip);
999 					sigcld_repost();
1000 				}
1001 				return (0);
1002 
1003 			case CLD_EXITED:
1004 			case CLD_DUMPED:
1005 			case CLD_KILLED:
1006 				/*
1007 				 * Don't complain if a process was found in
1008 				 * the first loop but we broke out of the loop
1009 				 * because of the arguments passed to us.
1010 				 */
1011 				if (proc_gone == 0) {
1012 					cmn_err(CE_PANIC,
1013 					    "waitid: wrong state on the"
1014 					    " p_child list");
1015 				} else {
1016 					break;
1017 				}
1018 			}
1019 
1020 			if (idtype == P_PID)
1021 				break;
1022 		} while ((cp = cp->p_sibling) != NULL);
1023 
1024 		/*
1025 		 * If we found no interesting processes at all,
1026 		 * break out and return ECHILD.
1027 		 */
1028 		if (found + proc_gone == 0)
1029 			break;
1030 
1031 		if (options & WNOHANG) {
1032 			bzero(ip, sizeof (k_siginfo_t));
1033 			/*
1034 			 * We should set ip->si_signo = SIGCLD,
1035 			 * but there is an SVVS test that expects
1036 			 * ip->si_signo to be zero in this case.
1037 			 */
1038 			mutex_exit(&pidlock);
1039 			return (0);
1040 		}
1041 
1042 		/*
1043 		 * If we found no processes of interest that could
1044 		 * change state while we wait, we don't wait at all.
1045 		 * Get out with ECHILD according to SVID.
1046 		 */
1047 		if (found == proc_gone)
1048 			break;
1049 
1050 		if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
1051 			mutex_exit(&pidlock);
1052 			return (EINTR);
1053 		}
1054 	}
1055 	mutex_exit(&pidlock);
1056 	return (ECHILD);
1057 }
1058 
1059 /*
1060  * For implementations that don't require binary compatibility,
1061  * the wait system call may be made into a library call to the
1062  * waitid system call.
1063  */
1064 int64_t
1065 wait(void)
1066 {
1067 	int error;
1068 	k_siginfo_t info;
1069 	rval_t	r;
1070 
1071 	if (error =  waitid(P_ALL, (id_t)0, &info, WEXITED|WTRAPPED))
1072 		return (set_errno(error));
1073 	r.r_val1 = info.si_pid;
1074 	r.r_val2 = wstat(info.si_code, info.si_status);
1075 	return (r.r_vals);
1076 }
1077 
1078 int
1079 waitsys(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1080 {
1081 	int error;
1082 	k_siginfo_t info;
1083 
1084 	if (error = waitid(idtype, id, &info, options))
1085 		return (set_errno(error));
1086 	if (copyout(&info, infop, sizeof (k_siginfo_t)))
1087 		return (set_errno(EFAULT));
1088 	return (0);
1089 }
1090 
1091 #ifdef _SYSCALL32_IMPL
1092 
1093 int
1094 waitsys32(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1095 {
1096 	int error;
1097 	k_siginfo_t info;
1098 	siginfo32_t info32;
1099 
1100 	if (error = waitid(idtype, id, &info, options))
1101 		return (set_errno(error));
1102 	siginfo_kto32(&info, &info32);
1103 	if (copyout(&info32, infop, sizeof (info32)))
1104 		return (set_errno(EFAULT));
1105 	return (0);
1106 }
1107 
1108 #endif	/* _SYSCALL32_IMPL */
1109 
1110 void
1111 proc_detach(proc_t *p)
1112 {
1113 	proc_t *q;
1114 
1115 	ASSERT(MUTEX_HELD(&pidlock));
1116 
1117 	q = p->p_parent;
1118 	ASSERT(q != NULL);
1119 
1120 	/*
1121 	 * Take it off the newstate list of its parent
1122 	 */
1123 	delete_ns(q, p);
1124 
1125 	if (q->p_child == p) {
1126 		q->p_child = p->p_sibling;
1127 		/*
1128 		 * If the parent has no children, it better not
1129 		 * have any with new states either!
1130 		 */
1131 		ASSERT(q->p_child ? 1 : q->p_child_ns == NULL);
1132 	}
1133 
1134 	if (p->p_sibling) {
1135 		p->p_sibling->p_psibling = p->p_psibling;
1136 	}
1137 
1138 	if (p->p_psibling) {
1139 		p->p_psibling->p_sibling = p->p_sibling;
1140 	}
1141 }
1142 
1143 /*
1144  * Remove zombie children from the process table.
1145  */
1146 void
1147 freeproc(proc_t *p)
1148 {
1149 	proc_t *q;
1150 
1151 	ASSERT(p->p_stat == SZOMB);
1152 	ASSERT(p->p_tlist == NULL);
1153 	ASSERT(MUTEX_HELD(&pidlock));
1154 
1155 	sigdelq(p, NULL, 0);
1156 	if (p->p_killsqp) {
1157 		siginfofree(p->p_killsqp);
1158 		p->p_killsqp = NULL;
1159 	}
1160 
1161 	prfree(p);	/* inform /proc */
1162 
1163 	/*
1164 	 * Don't free the init processes.
1165 	 * Other dying processes will access it.
1166 	 */
1167 	if (p == proc_init)
1168 		return;
1169 
1170 
1171 	/*
1172 	 * We wait until now to free the cred structure because a
1173 	 * zombie process's credentials may be examined by /proc.
1174 	 * No cred locking needed because there are no threads at this point.
1175 	 */
1176 	upcount_dec(crgetruid(p->p_cred), crgetzoneid(p->p_cred));
1177 	crfree(p->p_cred);
1178 	if (p->p_corefile != NULL) {
1179 		corectl_path_rele(p->p_corefile);
1180 		p->p_corefile = NULL;
1181 	}
1182 	if (p->p_content != NULL) {
1183 		corectl_content_rele(p->p_content);
1184 		p->p_content = NULL;
1185 	}
1186 
1187 	if (p->p_nextofkin && !((p->p_nextofkin->p_flag & SNOWAIT) ||
1188 	    (PTOU(p->p_nextofkin)->u_signal[SIGCLD - 1] == SIG_IGN))) {
1189 		/*
1190 		 * This should still do the right thing since p_utime/stime
1191 		 * get set to the correct value on process exit, so it
1192 		 * should get properly updated
1193 		 */
1194 		p->p_nextofkin->p_cutime += p->p_utime;
1195 		p->p_nextofkin->p_cstime += p->p_stime;
1196 
1197 		p->p_nextofkin->p_cacct[LMS_USER] += p->p_acct[LMS_USER];
1198 		p->p_nextofkin->p_cacct[LMS_SYSTEM] += p->p_acct[LMS_SYSTEM];
1199 		p->p_nextofkin->p_cacct[LMS_TRAP] += p->p_acct[LMS_TRAP];
1200 		p->p_nextofkin->p_cacct[LMS_TFAULT] += p->p_acct[LMS_TFAULT];
1201 		p->p_nextofkin->p_cacct[LMS_DFAULT] += p->p_acct[LMS_DFAULT];
1202 		p->p_nextofkin->p_cacct[LMS_KFAULT] += p->p_acct[LMS_KFAULT];
1203 		p->p_nextofkin->p_cacct[LMS_USER_LOCK]
1204 		    += p->p_acct[LMS_USER_LOCK];
1205 		p->p_nextofkin->p_cacct[LMS_SLEEP] += p->p_acct[LMS_SLEEP];
1206 		p->p_nextofkin->p_cacct[LMS_WAIT_CPU]
1207 		    += p->p_acct[LMS_WAIT_CPU];
1208 		p->p_nextofkin->p_cacct[LMS_STOPPED] += p->p_acct[LMS_STOPPED];
1209 
1210 		p->p_nextofkin->p_cru.minflt	+= p->p_ru.minflt;
1211 		p->p_nextofkin->p_cru.majflt	+= p->p_ru.majflt;
1212 		p->p_nextofkin->p_cru.nswap	+= p->p_ru.nswap;
1213 		p->p_nextofkin->p_cru.inblock	+= p->p_ru.inblock;
1214 		p->p_nextofkin->p_cru.oublock	+= p->p_ru.oublock;
1215 		p->p_nextofkin->p_cru.msgsnd	+= p->p_ru.msgsnd;
1216 		p->p_nextofkin->p_cru.msgrcv	+= p->p_ru.msgrcv;
1217 		p->p_nextofkin->p_cru.nsignals	+= p->p_ru.nsignals;
1218 		p->p_nextofkin->p_cru.nvcsw	+= p->p_ru.nvcsw;
1219 		p->p_nextofkin->p_cru.nivcsw	+= p->p_ru.nivcsw;
1220 		p->p_nextofkin->p_cru.sysc	+= p->p_ru.sysc;
1221 		p->p_nextofkin->p_cru.ioch	+= p->p_ru.ioch;
1222 
1223 	}
1224 
1225 	q = p->p_nextofkin;
1226 	if (q && q->p_orphan == p)
1227 		q->p_orphan = p->p_nextorph;
1228 	else if (q) {
1229 		for (q = q->p_orphan; q; q = q->p_nextorph)
1230 			if (q->p_nextorph == p)
1231 				break;
1232 		ASSERT(q && q->p_nextorph == p);
1233 		q->p_nextorph = p->p_nextorph;
1234 	}
1235 
1236 	proc_detach(p);
1237 	pid_exit(p);	/* frees pid and proc structure */
1238 }
1239 
1240 /*
1241  * Delete process "child" from the newstate list of process "parent"
1242  */
1243 void
1244 delete_ns(proc_t *parent, proc_t *child)
1245 {
1246 	proc_t **ns;
1247 
1248 	ASSERT(MUTEX_HELD(&pidlock));
1249 	ASSERT(child->p_parent == parent);
1250 	for (ns = &parent->p_child_ns; *ns != NULL; ns = &(*ns)->p_sibling_ns) {
1251 		if (*ns == child) {
1252 
1253 			ASSERT((*ns)->p_parent == parent);
1254 
1255 			*ns = child->p_sibling_ns;
1256 			child->p_sibling_ns = NULL;
1257 			return;
1258 		}
1259 	}
1260 }
1261 
1262 /*
1263  * Add process "child" to the new state list of process "parent"
1264  */
1265 void
1266 add_ns(proc_t *parent, proc_t *child)
1267 {
1268 	ASSERT(child->p_sibling_ns == NULL);
1269 	child->p_sibling_ns = parent->p_child_ns;
1270 	parent->p_child_ns = child;
1271 }
1272