xref: /titanic_51/usr/src/uts/common/os/exit.c (revision c8343062f6e25afd9c2a31b65df357030e69fa55)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.74 */
31 
32 #include <sys/types.h>
33 #include <sys/param.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/user.h>
38 #include <sys/errno.h>
39 #include <sys/proc.h>
40 #include <sys/ucontext.h>
41 #include <sys/procfs.h>
42 #include <sys/vnode.h>
43 #include <sys/acct.h>
44 #include <sys/var.h>
45 #include <sys/cmn_err.h>
46 #include <sys/debug.h>
47 #include <sys/wait.h>
48 #include <sys/siginfo.h>
49 #include <sys/procset.h>
50 #include <sys/class.h>
51 #include <sys/file.h>
52 #include <sys/session.h>
53 #include <sys/kmem.h>
54 #include <sys/vtrace.h>
55 #include <sys/prsystm.h>
56 #include <sys/ipc.h>
57 #include <sys/sem_impl.h>
58 #include <c2/audit.h>
59 #include <sys/aio_impl.h>
60 #include <vm/as.h>
61 #include <sys/poll.h>
62 #include <sys/door.h>
63 #include <sys/lwpchan_impl.h>
64 #include <sys/utrap.h>
65 #include <sys/task.h>
66 #include <sys/exacct.h>
67 #include <sys/cyclic.h>
68 #include <sys/schedctl.h>
69 #include <sys/rctl.h>
70 #include <sys/contract_impl.h>
71 #include <sys/contract/process_impl.h>
72 #include <sys/list.h>
73 #include <sys/dtrace.h>
74 #include <sys/pool.h>
75 #include <sys/sdt.h>
76 #include <sys/corectl.h>
77 
78 #if defined(__x86)
79 extern void ldt_free(proc_t *pp);
80 #endif
81 
82 /*
83  * convert code/data pair into old style wait status
84  */
85 int
86 wstat(int code, int data)
87 {
88 	int stat = (data & 0377);
89 
90 	switch (code) {
91 	case CLD_EXITED:
92 		stat <<= 8;
93 		break;
94 	case CLD_DUMPED:
95 		stat |= WCOREFLG;
96 		break;
97 	case CLD_KILLED:
98 		break;
99 	case CLD_TRAPPED:
100 	case CLD_STOPPED:
101 		stat <<= 8;
102 		stat |= WSTOPFLG;
103 		break;
104 	case CLD_CONTINUED:
105 		stat = WCONTFLG;
106 		break;
107 	default:
108 		cmn_err(CE_PANIC, "wstat: bad code");
109 		/* NOTREACHED */
110 	}
111 	return (stat);
112 }
113 
114 static char *
115 exit_reason(char *buf, size_t bufsz, int what, int why)
116 {
117 	switch (why) {
118 	case CLD_EXITED:
119 		(void) snprintf(buf, bufsz, "exited with status %d", what);
120 		break;
121 	case CLD_KILLED:
122 		(void) snprintf(buf, bufsz, "exited on fatal signal %d", what);
123 		break;
124 	case CLD_DUMPED:
125 		(void) snprintf(buf, bufsz, "core dumped on signal %d", what);
126 		break;
127 	default:
128 		(void) snprintf(buf, bufsz, "encountered unknown error "
129 		    "(%d, %d)", why, what);
130 		break;
131 	}
132 
133 	return (buf);
134 }
135 
/*
 * exit system call: pass back caller's arg.
 *
 * rval is handed to exit() as the "what" value, with the "why" code
 * fixed at CLD_EXITED (i.e. a voluntary exit).
 */
void
rexit(int rval)
{
	exit(CLD_EXITED, rval);
}
144 
/*
 * Called by proc_exit() when a zone's init exits, presumably because
 * it failed.  As long as the given zone is still in the "running"
 * state, we will re-exec() init, but first we need to reset things
 * which are usually inherited across exec() but will break init's
 * assumption that it is being exec()'d from a virgin process.  Most
 * importantly this includes closing all file descriptors (exec only
 * closes those marked close-on-exec) and resetting signals (exec only
 * resets handled signals, and we need to clear any signals which
 * killed init).  Anything else that exec(2) says would be inherited,
 * but would affect the execution of init, needs to be reset.
 *
 * Arguments:
 *   what - exit status or signal number
 *   why  - CLD_* code describing how init terminated
 * Note that the order here is (what, why), the reverse of the
 * (why, what) order taken by exit() and proc_exit().
 *
 * Return value:
 *    0 - exec_init() succeeded
 *   -1 - exec_init() failed; warnings have been logged and the caller
 *        is expected to carry on with normal exit processing
 */
static int
restart_init(int what, int why)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	user_t *up = PTOU(p);

	vnode_t *oldcd, *oldrd;
	sess_t *sp;
	int i, err;
	char reason_buf[64];
	const char *ipath;

	/*
	 * Let zone admin (and global zone admin if this is for a non-global
	 * zone) know that init has failed and will be restarted.
	 * exit_reason() fills reason_buf, which is reused by the second
	 * message below.
	 */
	zcmn_err(p->p_zone->zone_id, CE_WARN,
	    "init(1M) %s: restarting automatically",
	    exit_reason(reason_buf, sizeof (reason_buf), what, why));

	if (!INGLOBALZONE(p)) {
		cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: "
		    "restarting automatically",
		    p->p_zone->zone_name, p->p_pid, reason_buf);
	}

	/*
	 * Remove any fpollinfo_t's for this (last) thread from our file
	 * descriptors so closeall() can ASSERT() that they're all gone.
	 * Then close all open file descriptors in the process.
	 */
	pollcleanup();
	closeall(P_FINFO(p));

	/*
	 * Grab p_lock and begin clearing miscellaneous global process
	 * state that needs to be reset before we exec the new init(1M).
	 */

	mutex_enter(&p->p_lock);
	prbarrier(p);

	/*
	 * Undo the exit-in-progress state: SEXITING was set by
	 * proc_is_exiting() on the way in, and the kill/core flags may
	 * be left over from whatever terminated init.
	 */
	p->p_flag &= ~(SKILLED | SEXTKILLED | SEXITING | SDOCORE);
	up->u_cmask = CMASK;

	/* Clear held and pending signals for this thread and the process. */
	sigemptyset(&t->t_hold);
	sigemptyset(&t->t_sig);
	sigemptyset(&t->t_extsig);

	sigemptyset(&p->p_sig);
	sigemptyset(&p->p_extsig);

	sigdelq(p, t, 0);
	sigdelq(p, NULL, 0);

	if (p->p_killsqp) {
		siginfofree(p->p_killsqp);
		p->p_killsqp = NULL;
	}

	/*
	 * Reset any signals that are ignored back to the default disposition.
	 * Other u_signal members will be cleared when exec calls sigdefault().
	 * u_signal[] and u_sigmask[] are indexed by signal number minus one.
	 */
	for (i = 1; i < NSIG; i++) {
		if (up->u_signal[i - 1] == SIG_IGN) {
			up->u_signal[i - 1] = SIG_DFL;
			sigemptyset(&up->u_sigmask[i - 1]);
		}
	}

	/*
	 * Clear the current signal, any signal info associated with it, and
	 * any signal information from contracts and/or contract templates.
	 */
	lwp->lwp_cursig = 0;
	lwp->lwp_extsig = 0;
	if (lwp->lwp_curinfo != NULL) {
		siginfofree(lwp->lwp_curinfo);
		lwp->lwp_curinfo = NULL;
	}
	lwp_ctmpl_clear(lwp);

	/*
	 * Reset both the process root directory and the current working
	 * directory to the root of the zone just as we do during boot.
	 * A separate hold is taken on zone_rootvp for each of the two
	 * references; the old vnodes are released only after p_lock has
	 * been dropped.
	 */
	VN_HOLD(p->p_zone->zone_rootvp);
	oldrd = up->u_rdir;
	up->u_rdir = p->p_zone->zone_rootvp;

	VN_HOLD(p->p_zone->zone_rootvp);
	oldcd = up->u_cdir;
	up->u_cdir = p->p_zone->zone_rootvp;

	if (up->u_cwd != NULL) {
		refstr_rele(up->u_cwd);
		up->u_cwd = NULL;
	}

	mutex_exit(&p->p_lock);

	if (oldrd != NULL)
		VN_RELE(oldrd);
	if (oldcd != NULL)
		VN_RELE(oldcd);

	/*
	 * Free the controlling tty.  This is only done when this process
	 * is the session leader (s_sidp matches our pid structure) and
	 * the session has a tty vnode.  pidlock is dropped before
	 * freectty() is called.
	 */
	mutex_enter(&pidlock);
	sp = p->p_sessp;
	if (sp->s_sidp == p->p_pidp && sp->s_vp != NULL) {
		mutex_exit(&pidlock);
		freectty(sp);
	} else {
		mutex_exit(&pidlock);
	}

	/*
	 * Now exec() the new init(1M) on top of the current process.  If we
	 * succeed, the caller will treat this like a successful system call.
	 * If we fail, we issue messages and the caller will proceed with exit.
	 * The global zone uses initname; every other zone uses zone_initname.
	 */
	ipath = INGLOBALZONE(p) ? initname : zone_initname;
	err = exec_init(ipath, 0, NULL);

	if (err == 0)
		return (0);

	zcmn_err(p->p_zone->zone_id, CE_WARN,
	    "failed to restart init(1M) (err=%d): system reboot required", err);

	if (!INGLOBALZONE(p)) {
		cmn_err(CE_WARN, "failed to restart init(1M) for zone %s "
		    "(pid %d, err=%d): zoneadm(1M) boot required",
		    p->p_zone->zone_name, p->p_pid, err);
	}

	return (-1);
}
300 
301 /*
302  * Release resources.
303  * Enter zombie state.
304  * Wake up parent and init processes,
305  * and dispose of children.
306  */
307 void
308 exit(int why, int what)
309 {
310 	/*
311 	 * If proc_exit() fails, then some other lwp in the process
312 	 * got there first.  We just have to call lwp_exit() to allow
313 	 * the other lwp to finish exiting the process.  Otherwise we're
314 	 * restarting init, and should return.
315 	 */
316 	if (proc_exit(why, what) != 0) {
317 		mutex_enter(&curproc->p_lock);
318 		ASSERT(curproc->p_flag & SEXITLWPS);
319 		lwp_exit();
320 		/* NOTREACHED */
321 	}
322 }
323 
/*
 * Set the SEXITING flag on the process, after making sure /proc does
 * not have it locked.  This is done in more places than proc_exit(),
 * so it is a separate function.
 *
 * p->p_lock is acquired and released here, so the caller must not
 * already hold it.
 */
void
proc_is_exiting(proc_t *p)
{
	mutex_enter(&p->p_lock);
	prbarrier(p);
	p->p_flag |= SEXITING;
	mutex_exit(&p->p_lock);
}
337 
/*
 * Do the work of exiting the current process on behalf of the calling
 * lwp: tear down per-process state, become a zombie, hand children and
 * orphans to their new guardians and notify the parent via sigcld().
 *
 * Arguments:
 *   why  - CLD_* code; recorded in p_wcode
 *   what - exit status or signal number; recorded in p_wdata
 *
 * Return value:
 *   1 - exitlwps() failed, call (or continue) lwp_exit()
 *   0 - restarting init.  Return through system call path
 *
 * On the ordinary exit path this function does not return at all; it
 * finishes in thread_exit().
 */
int
proc_exit(int why, int what)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	zone_t *z = p->p_zone;
	timeout_id_t tmp_id;
	int rv;
	proc_t *q;
	sess_t *sp;
	task_t *tk;
	vnode_t *exec_vp, *execdir_vp, *cdir, *rdir;
	sigqueue_t *sqp;
	lwpdir_t *lwpdir;
	uint_t lwpdir_sz;
	lwpdir_t **tidhash;
	uint_t tidhash_sz;
	refstr_t *cwd;
	hrtime_t hrutime, hrstime;

	/*
	 * Stop and discard the process's lwps except for the current one,
	 * unless some other lwp beat us to it.  If exitlwps() fails then
	 * return and the calling lwp will call (or continue in) lwp_exit().
	 */
	proc_is_exiting(p);
	if (exitlwps(0) != 0)
		return (1);

	DTRACE_PROC(lwp__exit);
	DTRACE_PROC1(exit, int, why);

	/*
	 * Don't let init exit unless zone_icode() failed its exec, or
	 * we are shutting down the zone or the machine.
	 *
	 * Since we are single threaded, we don't need to lock the
	 * following accesses to zone_proc_initpid.
	 */
	if (p->p_pid == z->zone_proc_initpid) {
		if (z->zone_boot_err == 0 &&
		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN &&
		    restart_init(what, why) == 0)
			return (0);
		/*
		 * Since we didn't or couldn't restart init, we clear
		 * the zone's init state and proceed with exit
		 * processing.
		 */
		z->zone_proc_initpid = -1;
	}

	/*
	 * Allocate a sigqueue now, before we grab locks.
	 * It will be given to sigcld(), below.
	 */
	sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);

	/*
	 * revoke any doors created by the process.
	 */
	if (p->p_door_list)
		door_exit();

	/*
	 * Release schedctl data structures.
	 */
	if (p->p_pagep)
		schedctl_proc_cleanup();

	/*
	 * make sure all pending kaio has completed.
	 */
	if (p->p_aio)
		aio_cleanup_exit();

	/*
	 * discard the lwpchan cache.
	 */
	if (p->p_lcp != NULL)
		lwpchan_destroy_cache(0);

	/*
	 * Clean up any DTrace helper actions or probes for the process.
	 */
	if (p->p_dtrace_helpers != NULL) {
		ASSERT(dtrace_helpers_cleanup != NULL);
		(*dtrace_helpers_cleanup)();
	}

	/* untimeout the realtime timers */
	if (p->p_itimer != NULL)
		timer_exit();

	/* Cancel any pending alarm timeout. */
	if ((tmp_id = p->p_alarmid) != 0) {
		p->p_alarmid = 0;
		(void) untimeout(tmp_id);
	}

	/*
	 * Remove any fpollinfo_t's for this (last) thread from our file
	 * descriptors so closeall() can ASSERT() that they're all gone.
	 */
	pollcleanup();

	/* Remove the profiling cyclic, if one was set up. */
	if (p->p_rprof_cyclic != CYCLIC_NONE) {
		mutex_enter(&cpu_lock);
		cyclic_remove(p->p_rprof_cyclic);
		mutex_exit(&cpu_lock);
	}

	mutex_enter(&p->p_lock);

	/*
	 * Clean up any DTrace probes associated with this process.
	 */
	if (p->p_dtrace_probes) {
		ASSERT(dtrace_fasttrap_exit_ptr != NULL);
		dtrace_fasttrap_exit_ptr(p);
	}

	/*
	 * Cancel the interval timer timeout.  p_lock is dropped around
	 * untimeout(), so loop in case p_itimerid was set again while
	 * the lock was not held.
	 */
	while ((tmp_id = p->p_itimerid) != 0) {
		p->p_itimerid = 0;
		mutex_exit(&p->p_lock);
		(void) untimeout(tmp_id);
		mutex_enter(&p->p_lock);
	}

	lwp_cleanup();

	/*
	 * We are about to exit; prevent our resource associations from
	 * being changed.
	 */
	pool_barrier_enter();

	/*
	 * Block the process against /proc now that we have really
	 * acquired p->p_lock (to manipulate p_tlist at least).
	 */
	prbarrier(p);

	/*
	 * NOTE(review): no variable named "code" is declared in this
	 * function (the parameters are "why" and "what"), so the block
	 * below looks stale.  It is only compiled when SUN_SRC_COMPAT is
	 * defined -- confirm before enabling it.
	 */
#ifdef	SUN_SRC_COMPAT
	if (code == CLD_KILLED)
		u.u_acflag |= AXSIG;
#endif
	/*
	 * Ignore every signal from here on and discard anything that is
	 * pending or current for the process and this lwp.
	 */
	sigfillset(&p->p_ignore);
	sigemptyset(&p->p_siginfo);
	sigemptyset(&p->p_sig);
	sigemptyset(&p->p_extsig);
	sigemptyset(&t->t_sig);
	sigemptyset(&t->t_extsig);
	sigemptyset(&p->p_sigmask);
	sigdelq(p, t, 0);
	lwp->lwp_cursig = 0;
	lwp->lwp_extsig = 0;
	p->p_flag &= ~(SKILLED | SEXTKILLED);
	if (lwp->lwp_curinfo) {
		siginfofree(lwp->lwp_curinfo);
		lwp->lwp_curinfo = NULL;
	}

	/*
	 * Mark this, the only remaining lwp, as exiting and take it out
	 * of the process's lwp bookkeeping.
	 */
	t->t_proc_flag |= TP_LWPEXIT;
	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
	prlwpexit(t);		/* notify /proc */
	lwp_hash_out(p, t->t_tid);
	prexit(p);

	p->p_lwpcnt = 0;
	p->p_tlist = NULL;
	sigqfree(p);
	term_mstate(t);
	p->p_mterm = gethrtime();

	/*
	 * Detach the exec vnodes while holding p_lock, but release
	 * them only after the lock has been dropped.
	 */
	exec_vp = p->p_exec;
	execdir_vp = p->p_execdir;
	p->p_exec = NULLVP;
	p->p_execdir = NULLVP;
	mutex_exit(&p->p_lock);
	if (exec_vp)
		VN_RELE(exec_vp);
	if (execdir_vp)
		VN_RELE(execdir_vp);

	pr_free_watched_pages(p);

	closeall(P_FINFO(p));

	/*
	 * Free the controlling tty if we are the session leader and the
	 * session has one.  pidlock is dropped before freectty() is called.
	 */
	mutex_enter(&pidlock);
	sp = p->p_sessp;
	if (sp->s_sidp == p->p_pidp && sp->s_vp != NULL) {
		mutex_exit(&pidlock);
		freectty(sp);
	} else
		mutex_exit(&pidlock);

#if defined(__x86)
	/*
	 * If the process was using a private LDT then free it.
	 */
	if (p->p_ldt)
		ldt_free(p);
#endif

#if defined(__sparc)
	if (p->p_utraps != NULL)
		utrap_free(p);
#endif
	if (p->p_semacct)			/* IPC semaphore exit */
		semexit(p);

	/*
	 * Build the old-style wait status.  acct() is given only its low
	 * byte; extended accounting and the process contract get all of it.
	 */
	rv = wstat(why, what);

	acct(rv & 0xff);
	exacct_commit_proc(p, rv);

	/*
	 * Release any resources associated with C2 auditing
	 */
#ifdef C2_AUDIT
	if (audit_active) {
		/*
		 * audit exit system call
		 */
		audit_exit(why, what);
	}
#endif

	/*
	 * Free address space.
	 */
	relvm();

	/*
	 * Release held contracts.
	 */
	contract_exit(p);

	/*
	 * Depart our encapsulating process contract.
	 */
	if ((p->p_flag & SSYS) == 0) {
		ASSERT(p->p_ct_process);
		contract_process_exit(p->p_ct_process, p, rv);
	}

	/*
	 * Remove pool association, and block if requested by pool_do_bind.
	 */
	mutex_enter(&p->p_lock);
	ASSERT(p->p_pool->pool_ref > 0);
	atomic_add_32(&p->p_pool->pool_ref, -1);
	p->p_pool = pool_default;
	/*
	 * Now that our address space has been freed and all other threads
	 * in this process have exited, set the PEXITED pool flag.  This
	 * tells the pools subsystems to ignore this process if it was
	 * requested to rebind this process to a new pool.
	 */
	p->p_poolflag |= PEXITED;
	pool_barrier_exit();
	mutex_exit(&p->p_lock);

	/*
	 * pidlock is held from here until just after sigcld() below,
	 * covering all of the family-tree manipulation.
	 */
	mutex_enter(&pidlock);

	/*
	 * Delete this process from the newstate list of its parent. We
	 * will put it in the right place in the sigcld in the end.
	 */
	delete_ns(p->p_parent, p);

	/*
	 * Reassign the orphans to the next of kin.
	 * Don't rearrange init's orphanage.
	 *
	 * Each orphan gets our next of kin as its own, and then our whole
	 * orphan chain is spliced onto the front of the next of kin's
	 * orphan list.
	 */
	if ((q = p->p_orphan) != NULL && p != proc_init) {

		proc_t *nokp = p->p_nextofkin;

		for (;;) {
			q->p_nextofkin = nokp;
			if (q->p_nextorph == NULL)
				break;
			q = q->p_nextorph;
		}
		q->p_nextorph = nokp->p_orphan;
		nokp->p_orphan = p->p_orphan;
		p->p_orphan = NULL;
	}

	/*
	 * Reassign the children to init.
	 * Don't try to assign init's children to init.
	 */
	if ((q = p->p_child) != NULL && p != proc_init) {
		struct proc	*np;
		struct proc	*initp = proc_init;
		boolean_t	setzonetop = B_FALSE;

		/* Children reparented from a non-global zone get SZONETOP. */
		if (!INGLOBALZONE(curproc))
			setzonetop = B_TRUE;

		pgdetach(p);

		do {
			/* Save the next sibling; q's links are rewritten below. */
			np = q->p_sibling;
			/*
			 * Delete it from its current parent new state
			 * list and add it to init new state list
			 */
			delete_ns(q->p_parent, q);

			q->p_ppid = 1;
			if (setzonetop) {
				mutex_enter(&q->p_lock);
				q->p_flag |= SZONETOP;
				mutex_exit(&q->p_lock);
			}
			q->p_parent = initp;

			/*
			 * Since q will be the first child,
			 * it will not have a previous sibling.
			 */
			q->p_psibling = NULL;
			if (initp->p_child) {
				initp->p_child->p_psibling = q;
			}
			q->p_sibling = initp->p_child;
			initp->p_child = q;
			/* A child flagged P_PR_PTRACE is sent SIGKILL. */
			if (q->p_proc_flag & P_PR_PTRACE) {
				mutex_enter(&q->p_lock);
				sigtoproc(q, NULL, SIGKILL);
				mutex_exit(&q->p_lock);
			}
			/*
			 * sigcld() will add the child to parents
			 * newstate list.
			 */
			if (q->p_stat == SZOMB)
				sigcld(q, NULL);
		} while ((q = np) != NULL);

		p->p_child = NULL;
		ASSERT(p->p_child_ns == NULL);
	}

	TRACE_1(TR_FAC_PROC, TR_PROC_EXIT, "proc_exit: %p", p);

	mutex_enter(&p->p_lock);
	CL_EXIT(curthread); /* tell the scheduler that curthread is exiting */

	/*
	 * Compute our final user and system times, and fold the totals
	 * accumulated from our children (p_cutime/p_cstime, p_cacct[],
	 * p_cru) into our own so that freeproc() can pass a single set
	 * of numbers up to the next of kin.
	 */
	hrutime = mstate_aggr_state(p, LMS_USER);
	hrstime = mstate_aggr_state(p, LMS_SYSTEM);
	p->p_utime = (clock_t)NSEC_TO_TICK(hrutime) + p->p_cutime;
	p->p_stime = (clock_t)NSEC_TO_TICK(hrstime) + p->p_cstime;

	p->p_acct[LMS_USER]	+= p->p_cacct[LMS_USER];
	p->p_acct[LMS_SYSTEM]	+= p->p_cacct[LMS_SYSTEM];
	p->p_acct[LMS_TRAP]	+= p->p_cacct[LMS_TRAP];
	p->p_acct[LMS_TFAULT]	+= p->p_cacct[LMS_TFAULT];
	p->p_acct[LMS_DFAULT]	+= p->p_cacct[LMS_DFAULT];
	p->p_acct[LMS_KFAULT]	+= p->p_cacct[LMS_KFAULT];
	p->p_acct[LMS_USER_LOCK] += p->p_cacct[LMS_USER_LOCK];
	p->p_acct[LMS_SLEEP]	+= p->p_cacct[LMS_SLEEP];
	p->p_acct[LMS_WAIT_CPU]	+= p->p_cacct[LMS_WAIT_CPU];
	p->p_acct[LMS_STOPPED]	+= p->p_cacct[LMS_STOPPED];

	p->p_ru.minflt	+= p->p_cru.minflt;
	p->p_ru.majflt	+= p->p_cru.majflt;
	p->p_ru.nswap	+= p->p_cru.nswap;
	p->p_ru.inblock	+= p->p_cru.inblock;
	p->p_ru.oublock	+= p->p_cru.oublock;
	p->p_ru.msgsnd	+= p->p_cru.msgsnd;
	p->p_ru.msgrcv	+= p->p_cru.msgrcv;
	p->p_ru.nsignals += p->p_cru.nsignals;
	p->p_ru.nvcsw	+= p->p_cru.nvcsw;
	p->p_ru.nivcsw	+= p->p_cru.nivcsw;
	p->p_ru.sysc	+= p->p_cru.sysc;
	p->p_ru.ioch	+= p->p_cru.ioch;

	/*
	 * Become a zombie and record the status that winfo() will
	 * report to whoever waits for us.
	 */
	p->p_stat = SZOMB;
	p->p_proc_flag &= ~P_PR_PTRACE;
	p->p_wdata = what;
	p->p_wcode = (char)why;

	/* Remember these so they can be released after the locks are dropped. */
	cdir = PTOU(p)->u_cdir;
	rdir = PTOU(p)->u_rdir;
	cwd = PTOU(p)->u_cwd;

	/*
	 * Release resource controls, as they are no longer enforceable.
	 */
	rctl_set_free(p->p_rctls);

	/*
	 * Give up task and project memberships.  Decrement tk_nlwps counter
	 * for our task.max-lwps resource control.  An extended accounting
	 * record, if that facility is active, is scheduled to be written.
	 * Zombie processes are false members of task0 for the remainder of
	 * their lifetime; no accounting information is recorded for them.
	 */
	tk = p->p_task;

	mutex_enter(&p->p_zone->zone_nlwps_lock);
	tk->tk_nlwps--;
	tk->tk_proj->kpj_nlwps--;
	p->p_zone->zone_nlwps--;
	mutex_exit(&p->p_zone->zone_nlwps_lock);
	task_detach(p);
	p->p_task = task0p;

	/*
	 * Clear the lwp directory and the lwpid hash table
	 * now that /proc can't bother us any more.
	 * We free the memory below, after dropping p->p_lock.
	 */
	lwpdir = p->p_lwpdir;
	lwpdir_sz = p->p_lwpdir_sz;
	tidhash = p->p_tidhash;
	tidhash_sz = p->p_tidhash_sz;
	p->p_lwpdir = NULL;
	p->p_lwpfree = NULL;
	p->p_lwpdir_sz = 0;
	p->p_tidhash = NULL;
	p->p_tidhash_sz = 0;

	/*
	 * curthread's proc pointer is changed to point at p0 because
	 * curthread's original proc pointer can be freed as soon as
	 * the child sends a SIGCLD to its parent.
	 */
	t->t_procp = &p0;

	mutex_exit(&p->p_lock);
	/* Hand the sigqueue allocated at the top of the function to sigcld(). */
	sigcld(p, sqp);
	mutex_exit(&pidlock);

	/* p must not be touched from here on; use only the saved locals. */
	task_rele(tk);

	kmem_free(lwpdir, lwpdir_sz * sizeof (lwpdir_t));
	kmem_free(tidhash, tidhash_sz * sizeof (lwpdir_t *));

	/*
	 * We don't release u_cdir and u_rdir until SZOMB is set.
	 * This protects us against dofusers().
	 */
	VN_RELE(cdir);
	if (rdir)
		VN_RELE(rdir);
	if (cwd)
		refstr_rele(cwd);

	lwp_pcb_exit();

	thread_exit();
	/* NOTREACHED */
}
802 
803 /*
804  * Format siginfo structure for wait system calls.
805  */
806 void
807 winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
808 {
809 	ASSERT(MUTEX_HELD(&pidlock));
810 
811 	bzero(ip, sizeof (k_siginfo_t));
812 	ip->si_signo = SIGCLD;
813 	ip->si_code = pp->p_wcode;
814 	ip->si_pid = pp->p_pid;
815 	ip->si_ctid = PRCTID(pp);
816 	ip->si_zoneid = pp->p_zone->zone_id;
817 	ip->si_status = pp->p_wdata;
818 	ip->si_stime = pp->p_stime;
819 	ip->si_utime = pp->p_utime;
820 
821 	if (waitflag) {
822 		pp->p_wcode = 0;
823 		pp->p_wdata = 0;
824 		pp->p_pidflag &= ~CLDPEND;
825 	}
826 }
827 
/*
 * Wait system call.
 * Search for a terminated (zombie) child,
 * finally lay it to rest, and collect its status.
 * Look also for stopped children,
 * and pass back status from them.
 *
 * Arguments:
 *   idtype  - P_PID, P_PGID or P_ALL; anything else is EINVAL
 *   id      - pid or process group id to match; for P_PID and P_PGID
 *             it must satisfy 0 <= id < maxpid
 *   ip      - kernel siginfo to fill in; it is zeroed when WNOHANG is
 *             set and there is nothing to report
 *   options - W* flags; must be non-zero and within WOPTMASK once the
 *             obsolete _WNOCHLD bit has been stripped
 *
 * Return value (an errno value, returned directly rather than through
 * set_errno()):
 *   0      - *ip describes a child, or was zeroed for WNOHANG
 *   EINVAL - bad idtype, id or options
 *   ECHILD - no matching child that could still change state
 *   EINTR  - cv_wait_sig_swap() reported an interruption
 */
int
waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
{
	int found;
	proc_t *cp, *pp;
	proc_t **nsp;
	int proc_gone;
	/* Non-zero unless WNOWAIT was given, i.e. the status is to be consumed. */
	int waitflag = !(options & WNOWAIT);

	/*
	 * Obsolete flag, defined here only for binary compatibility
	 * with old statically linked executables.  Delete this when
	 * we no longer care about these old and broken applications.
	 */
#define	_WNOCHLD	0400
	options &= ~_WNOCHLD;

	if (options == 0 || (options & ~WOPTMASK))
		return (EINVAL);

	switch (idtype) {
	case P_PID:
	case P_PGID:
		if (id < 0 || id >= maxpid)
			return (EINVAL);
		/* FALLTHROUGH */
	case P_ALL:
		break;
	default:
		return (EINVAL);
	}

	pp = ttoproc(curthread);

	/*
	 * lock parent mutex so that sibling chain can be searched.
	 */
	mutex_enter(&pidlock);

	/*
	 * if we are only looking for exited processes and child_ns list
	 * is empty no reason to look at all children.
	 * With children present we return 0 with *ip zeroed; with none
	 * we return ECHILD.
	 */
	if (idtype == P_ALL &&
	    (options & (WOPTMASK & ~WNOWAIT)) == (WNOHANG | WEXITED) &&
		pp->p_child_ns == NULL) {

		if (pp->p_child) {
			mutex_exit(&pidlock);
			bzero(ip, sizeof (k_siginfo_t));
			return (0);
		}
		mutex_exit(&pidlock);
		return (ECHILD);
	}

	/*
	 * Each pass first scans the newstate list (p_child_ns) and then
	 * every child; if nothing can be reported yet we sleep on p_cv
	 * and start over.
	 */
	while ((cp = pp->p_child) != NULL) {

		proc_gone = 0;

		for (nsp = &pp->p_child_ns; *nsp; nsp = &(*nsp)->p_sibling_ns) {
			if (idtype == P_PID && id != (*nsp)->p_pid) {
				continue;
			}
			if (idtype == P_PGID && id != (*nsp)->p_pgrp) {
				continue;
			}

			switch ((*nsp)->p_wcode) {

			case CLD_TRAPPED:
			case CLD_STOPPED:
			case CLD_CONTINUED:
				cmn_err(CE_PANIC,
				    "waitid: wrong state %d on the p_newstate"
				    " list", (*nsp)->p_wcode);
				break;

			case CLD_EXITED:
			case CLD_DUMPED:
			case CLD_KILLED:
				if (!(options & WEXITED)) {
					/*
					 * Count how many are already gone
					 * for good.
					 */
					proc_gone++;
					break;
				}
				if (!waitflag) {
					winfo((*nsp), ip, 0);
				} else {
					/*
					 * Consume the status and free the
					 * zombie; freeproc() takes it off
					 * the list we are walking, so we
					 * return right afterwards.
					 */
					proc_t *xp = *nsp;
					winfo(xp, ip, 1);
					freeproc(xp);
				}
				mutex_exit(&pidlock);
				if (waitflag) {		/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);
			}

			/* A pid matches at most one process; stop looking. */
			if (idtype == P_PID)
				break;
		}

		/*
		 * None of the processes on the p_child_ns (newstate) list
		 * were of interest.  Check all the children.
		 */
		found = 0;
		cp = pp->p_child;
		do {
			if (idtype == P_PID && id != cp->p_pid) {
				continue;
			}
			if (idtype == P_PGID && id != cp->p_pgrp) {
				continue;
			}

			found++;

			switch (cp->p_wcode) {
			case CLD_TRAPPED:
				if (!(options & WTRAPPED))
					break;
				winfo(cp, ip, waitflag);
				mutex_exit(&pidlock);
				if (waitflag) {		/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);

			case CLD_STOPPED:
				if (!(options & WSTOPPED))
					break;
				/* Is it still stopped? */
				mutex_enter(&cp->p_lock);
				if (!jobstopped(cp)) {
					mutex_exit(&cp->p_lock);
					break;
				}
				mutex_exit(&cp->p_lock);
				winfo(cp, ip, waitflag);
				mutex_exit(&pidlock);
				if (waitflag) {		/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);

			case CLD_CONTINUED:
				if (!(options & WCONTINUED))
					break;
				winfo(cp, ip, waitflag);
				mutex_exit(&pidlock);
				if (waitflag) {		/* accept SIGCLD */
					sigcld_delete(ip);
					sigcld_repost();
				}
				return (0);

			case CLD_EXITED:
			case CLD_DUMPED:
			case CLD_KILLED:
				/*
				 * Don't complain if a process was found in
				 * the first loop but we broke out of the loop
				 * because of the arguments passed to us.
				 */
				if (proc_gone == 0) {
					cmn_err(CE_PANIC,
					    "waitid: wrong state on the"
					    " p_child list");
				} else {
					break;
				}
			}

			if (idtype == P_PID)
				break;
		} while ((cp = cp->p_sibling) != NULL);

		/*
		 * If we found no interesting processes at all,
		 * break out and return ECHILD.
		 */
		if (found + proc_gone == 0)
			break;

		if (options & WNOHANG) {
			bzero(ip, sizeof (k_siginfo_t));
			/*
			 * We should set ip->si_signo = SIGCLD,
			 * but there is an SVVS test that expects
			 * ip->si_signo to be zero in this case.
			 */
			mutex_exit(&pidlock);
			return (0);
		}

		/*
		 * If we found no processes of interest that could
		 * change state while we wait, we don't wait at all.
		 * Get out with ECHILD according to SVID.
		 */
		if (found == proc_gone)
			break;

		/*
		 * Sleep until a child changes state; a zero return from
		 * cv_wait_sig_swap() is reported to the caller as EINTR.
		 */
		if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
			mutex_exit(&pidlock);
			return (EINTR);
		}
	}
	mutex_exit(&pidlock);
	return (ECHILD);
}
1055 
1056 /*
1057  * For implementations that don't require binary compatibility,
1058  * the wait system call may be made into a library call to the
1059  * waitid system call.
1060  */
1061 int64_t
1062 wait(void)
1063 {
1064 	int error;
1065 	k_siginfo_t info;
1066 	rval_t	r;
1067 
1068 	if (error =  waitid(P_ALL, (id_t)0, &info, WEXITED|WTRAPPED))
1069 		return (set_errno(error));
1070 	r.r_val1 = info.si_pid;
1071 	r.r_val2 = wstat(info.si_code, info.si_status);
1072 	return (r.r_vals);
1073 }
1074 
1075 int
1076 waitsys(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1077 {
1078 	int error;
1079 	k_siginfo_t info;
1080 
1081 	if (error = waitid(idtype, id, &info, options))
1082 		return (set_errno(error));
1083 	if (copyout(&info, infop, sizeof (k_siginfo_t)))
1084 		return (set_errno(EFAULT));
1085 	return (0);
1086 }
1087 
1088 #ifdef _SYSCALL32_IMPL
1089 
1090 int
1091 waitsys32(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1092 {
1093 	int error;
1094 	k_siginfo_t info;
1095 	siginfo32_t info32;
1096 
1097 	if (error = waitid(idtype, id, &info, options))
1098 		return (set_errno(error));
1099 	siginfo_kto32(&info, &info32);
1100 	if (copyout(&info32, infop, sizeof (info32)))
1101 		return (set_errno(EFAULT));
1102 	return (0);
1103 }
1104 
1105 #endif	/* _SYSCALL32_IMPL */
1106 
1107 void
1108 proc_detach(proc_t *p)
1109 {
1110 	proc_t *q;
1111 
1112 	ASSERT(MUTEX_HELD(&pidlock));
1113 
1114 	q = p->p_parent;
1115 	ASSERT(q != NULL);
1116 
1117 	/*
1118 	 * Take it off the newstate list of its parent
1119 	 */
1120 	delete_ns(q, p);
1121 
1122 	if (q->p_child == p) {
1123 		q->p_child = p->p_sibling;
1124 		/*
1125 		 * If the parent has no children, it better not
1126 		 * have any with new states either!
1127 		 */
1128 		ASSERT(q->p_child ? 1 : q->p_child_ns == NULL);
1129 	}
1130 
1131 	if (p->p_sibling) {
1132 		p->p_sibling->p_psibling = p->p_psibling;
1133 	}
1134 
1135 	if (p->p_psibling) {
1136 		p->p_psibling->p_sibling = p->p_sibling;
1137 	}
1138 }
1139 
/*
 * Remove a zombie child from the process table.
 *
 * The caller must hold pidlock.  p must already be a zombie
 * (p_stat == SZOMB) with no threads left (p_tlist == NULL).
 *
 * For the init process only the queued signals are discarded and
 * /proc is informed; init itself is never freed.  For any other
 * process the credentials and core settings are released, resource
 * usage is charged to the next of kin (unless that process is not
 * collecting child status), p is unlinked from the orphan and child
 * lists, and finally pid_exit() frees it.
 */
void
freeproc(proc_t *p)
{
	proc_t *q;

	ASSERT(p->p_stat == SZOMB);
	ASSERT(p->p_tlist == NULL);
	ASSERT(MUTEX_HELD(&pidlock));

	sigdelq(p, NULL, 0);
	if (p->p_killsqp) {
		siginfofree(p->p_killsqp);
		p->p_killsqp = NULL;
	}

	prfree(p);	/* inform /proc */

	/*
	 * Don't free the init processes.
	 * Other dying processes will access it.
	 */
	if (p == proc_init)
		return;


	/*
	 * We wait until now to free the cred structure because a
	 * zombie process's credentials may be examined by /proc.
	 * No cred locking needed because there are no threads at this point.
	 */
	upcount_dec(crgetruid(p->p_cred), crgetzoneid(p->p_cred));
	crfree(p->p_cred);
	if (p->p_corefile != NULL) {
		corectl_path_rele(p->p_corefile);
		p->p_corefile = NULL;
	}
	if (p->p_content != NULL) {
		corectl_content_rele(p->p_content);
		p->p_content = NULL;
	}

	/*
	 * Charge our resource usage to the next of kin's child totals,
	 * unless it has SNOWAIT set or is ignoring SIGCLD.
	 */
	if (p->p_nextofkin && !((p->p_nextofkin->p_flag & SNOWAIT) ||
	    (PTOU(p->p_nextofkin)->u_signal[SIGCLD - 1] == SIG_IGN))) {
		/*
		 * This should still do the right thing since p_utime/stime
		 * get set to the correct value on process exit, so it
		 * should get properly updated
		 */
		p->p_nextofkin->p_cutime += p->p_utime;
		p->p_nextofkin->p_cstime += p->p_stime;

		p->p_nextofkin->p_cacct[LMS_USER] += p->p_acct[LMS_USER];
		p->p_nextofkin->p_cacct[LMS_SYSTEM] += p->p_acct[LMS_SYSTEM];
		p->p_nextofkin->p_cacct[LMS_TRAP] += p->p_acct[LMS_TRAP];
		p->p_nextofkin->p_cacct[LMS_TFAULT] += p->p_acct[LMS_TFAULT];
		p->p_nextofkin->p_cacct[LMS_DFAULT] += p->p_acct[LMS_DFAULT];
		p->p_nextofkin->p_cacct[LMS_KFAULT] += p->p_acct[LMS_KFAULT];
		p->p_nextofkin->p_cacct[LMS_USER_LOCK]
		    += p->p_acct[LMS_USER_LOCK];
		p->p_nextofkin->p_cacct[LMS_SLEEP] += p->p_acct[LMS_SLEEP];
		p->p_nextofkin->p_cacct[LMS_WAIT_CPU]
		    += p->p_acct[LMS_WAIT_CPU];
		p->p_nextofkin->p_cacct[LMS_STOPPED] += p->p_acct[LMS_STOPPED];

		p->p_nextofkin->p_cru.minflt	+= p->p_ru.minflt;
		p->p_nextofkin->p_cru.majflt	+= p->p_ru.majflt;
		p->p_nextofkin->p_cru.nswap	+= p->p_ru.nswap;
		p->p_nextofkin->p_cru.inblock	+= p->p_ru.inblock;
		p->p_nextofkin->p_cru.oublock	+= p->p_ru.oublock;
		p->p_nextofkin->p_cru.msgsnd	+= p->p_ru.msgsnd;
		p->p_nextofkin->p_cru.msgrcv	+= p->p_ru.msgrcv;
		p->p_nextofkin->p_cru.nsignals	+= p->p_ru.nsignals;
		p->p_nextofkin->p_cru.nvcsw	+= p->p_ru.nvcsw;
		p->p_nextofkin->p_cru.nivcsw	+= p->p_ru.nivcsw;
		p->p_nextofkin->p_cru.sysc	+= p->p_ru.sysc;
		p->p_nextofkin->p_cru.ioch	+= p->p_ru.ioch;

	}

	/*
	 * Unlink p from its next of kin's orphan list.  Either p is the
	 * head of that list, or we walk the list to find p's predecessor,
	 * which the ASSERT expects to exist.
	 */
	q = p->p_nextofkin;
	if (q && q->p_orphan == p)
		q->p_orphan = p->p_nextorph;
	else if (q) {
		for (q = q->p_orphan; q; q = q->p_nextorph)
			if (q->p_nextorph == p)
				break;
		ASSERT(q && q->p_nextorph == p);
		q->p_nextorph = p->p_nextorph;
	}

	proc_detach(p);
	pid_exit(p);	/* frees pid and proc structure */
}
1236 
1237 /*
1238  * Delete process "child" from the newstate list of process "parent"
1239  */
1240 void
1241 delete_ns(proc_t *parent, proc_t *child)
1242 {
1243 	proc_t **ns;
1244 
1245 	ASSERT(MUTEX_HELD(&pidlock));
1246 	ASSERT(child->p_parent == parent);
1247 	for (ns = &parent->p_child_ns; *ns != NULL; ns = &(*ns)->p_sibling_ns) {
1248 		if (*ns == child) {
1249 
1250 			ASSERT((*ns)->p_parent == parent);
1251 
1252 			*ns = child->p_sibling_ns;
1253 			child->p_sibling_ns = NULL;
1254 			return;
1255 		}
1256 	}
1257 }
1258 
1259 /*
1260  * Add process "child" to the new state list of process "parent"
1261  */
1262 void
1263 add_ns(proc_t *parent, proc_t *child)
1264 {
1265 	ASSERT(child->p_sibling_ns == NULL);
1266 	child->p_sibling_ns = parent->p_child_ns;
1267 	parent->p_child_ns = child;
1268 }
1269