xref: /illumos-gate/usr/src/uts/common/os/exit.c (revision a1c36c8ba5112b6713dabac615bf8d56a45f0764)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/cred.h>
33 #include <sys/user.h>
34 #include <sys/errno.h>
35 #include <sys/proc.h>
36 #include <sys/ucontext.h>
37 #include <sys/procfs.h>
38 #include <sys/vnode.h>
39 #include <sys/acct.h>
40 #include <sys/var.h>
41 #include <sys/cmn_err.h>
42 #include <sys/debug.h>
43 #include <sys/wait.h>
44 #include <sys/siginfo.h>
45 #include <sys/procset.h>
46 #include <sys/class.h>
47 #include <sys/file.h>
48 #include <sys/session.h>
49 #include <sys/kmem.h>
50 #include <sys/vtrace.h>
51 #include <sys/prsystm.h>
52 #include <sys/ipc.h>
53 #include <sys/sem_impl.h>
54 #include <c2/audit.h>
55 #include <sys/aio_impl.h>
56 #include <vm/as.h>
57 #include <sys/poll.h>
58 #include <sys/door.h>
59 #include <sys/lwpchan_impl.h>
60 #include <sys/utrap.h>
61 #include <sys/task.h>
62 #include <sys/exacct.h>
63 #include <sys/cyclic.h>
64 #include <sys/schedctl.h>
65 #include <sys/rctl.h>
66 #include <sys/contract_impl.h>
67 #include <sys/contract/process_impl.h>
68 #include <sys/list.h>
69 #include <sys/dtrace.h>
70 #include <sys/pool.h>
71 #include <sys/sdt.h>
72 #include <sys/corectl.h>
73 #include <sys/brand.h>
74 #include <sys/libc_kernel.h>
75 
76 /*
77  * convert code/data pair into old style wait status
78  */
79 int
80 wstat(int code, int data)
81 {
82 	int stat = (data & 0377);
83 
84 	switch (code) {
85 	case CLD_EXITED:
86 		stat <<= 8;
87 		break;
88 	case CLD_DUMPED:
89 		stat |= WCOREFLG;
90 		break;
91 	case CLD_KILLED:
92 		break;
93 	case CLD_TRAPPED:
94 	case CLD_STOPPED:
95 		stat <<= 8;
96 		stat |= WSTOPFLG;
97 		break;
98 	case CLD_CONTINUED:
99 		stat = WCONTFLG;
100 		break;
101 	default:
102 		cmn_err(CE_PANIC, "wstat: bad code");
103 		/* NOTREACHED */
104 	}
105 	return (stat);
106 }
107 
108 static char *
109 exit_reason(char *buf, size_t bufsz, int what, int why)
110 {
111 	switch (why) {
112 	case CLD_EXITED:
113 		(void) snprintf(buf, bufsz, "exited with status %d", what);
114 		break;
115 	case CLD_KILLED:
116 		(void) snprintf(buf, bufsz, "exited on fatal signal %d", what);
117 		break;
118 	case CLD_DUMPED:
119 		(void) snprintf(buf, bufsz, "core dumped on signal %d", what);
120 		break;
121 	default:
122 		(void) snprintf(buf, bufsz, "encountered unknown error "
123 		    "(%d, %d)", why, what);
124 		break;
125 	}
126 
127 	return (buf);
128 }
129 
/*
 * exit system call: pass back caller's arg.
 *
 * 'rval' is handed to exit() as the 'what' argument, with CLD_EXITED as
 * the 'why' code.
 */
void
rexit(int rval)
{
	exit(CLD_EXITED, rval);
}
138 
139 /*
140  * Called by proc_exit() when a zone's init exits, presumably because
141  * it failed.  As long as the given zone is still in the "running"
142  * state, we will re-exec() init, but first we need to reset things
143  * which are usually inherited across exec() but will break init's
144  * assumption that it is being exec()'d from a virgin process.  Most
145  * importantly this includes closing all file descriptors (exec only
146  * closes those marked close-on-exec) and resetting signals (exec only
147  * resets handled signals, and we need to clear any signals which
148  * killed init).  Anything else that exec(2) says would be inherited,
149  * but would affect the execution of init, needs to be reset.
150  */
151 static int
152 restart_init(int what, int why)
153 {
154 	kthread_t *t = curthread;
155 	klwp_t *lwp = ttolwp(t);
156 	proc_t *p = ttoproc(t);
157 	user_t *up = PTOU(p);
158 
159 	vnode_t *oldcd, *oldrd;
160 	int i, err;
161 	char reason_buf[64];
162 
163 	/*
164 	 * Let zone admin (and global zone admin if this is for a non-global
165 	 * zone) know that init has failed and will be restarted.
166 	 */
167 	zcmn_err(p->p_zone->zone_id, CE_WARN,
168 	    "init(1M) %s: restarting automatically",
169 	    exit_reason(reason_buf, sizeof (reason_buf), what, why));
170 
171 	if (!INGLOBALZONE(p)) {
172 		cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: "
173 		    "restarting automatically",
174 		    p->p_zone->zone_name, p->p_pid, reason_buf);
175 	}
176 
177 	/*
178 	 * Remove any fpollinfo_t's for this (last) thread from our file
179 	 * descriptors so closeall() can ASSERT() that they're all gone.
180 	 * Then close all open file descriptors in the process.
181 	 */
182 	pollcleanup();
183 	closeall(P_FINFO(p));
184 
185 	/*
186 	 * Grab p_lock and begin clearing miscellaneous global process
187 	 * state that needs to be reset before we exec the new init(1M).
188 	 */
189 
190 	mutex_enter(&p->p_lock);
191 	prbarrier(p);
192 
193 	p->p_flag &= ~(SKILLED | SEXTKILLED | SEXITING | SDOCORE);
194 	up->u_cmask = CMASK;
195 
196 	sigemptyset(&t->t_hold);
197 	sigemptyset(&t->t_sig);
198 	sigemptyset(&t->t_extsig);
199 
200 	sigemptyset(&p->p_sig);
201 	sigemptyset(&p->p_extsig);
202 
203 	sigdelq(p, t, 0);
204 	sigdelq(p, NULL, 0);
205 
206 	if (p->p_killsqp) {
207 		siginfofree(p->p_killsqp);
208 		p->p_killsqp = NULL;
209 	}
210 
211 	/*
212 	 * Reset any signals that are ignored back to the default disposition.
213 	 * Other u_signal members will be cleared when exec calls sigdefault().
214 	 */
215 	for (i = 1; i < NSIG; i++) {
216 		if (up->u_signal[i - 1] == SIG_IGN) {
217 			up->u_signal[i - 1] = SIG_DFL;
218 			sigemptyset(&up->u_sigmask[i - 1]);
219 		}
220 	}
221 
222 	/*
223 	 * Clear the current signal, any signal info associated with it, and
224 	 * any signal information from contracts and/or contract templates.
225 	 */
226 	lwp->lwp_cursig = 0;
227 	lwp->lwp_extsig = 0;
228 	if (lwp->lwp_curinfo != NULL) {
229 		siginfofree(lwp->lwp_curinfo);
230 		lwp->lwp_curinfo = NULL;
231 	}
232 	lwp_ctmpl_clear(lwp);
233 
234 	/*
235 	 * Reset both the process root directory and the current working
236 	 * directory to the root of the zone just as we do during boot.
237 	 */
238 	VN_HOLD(p->p_zone->zone_rootvp);
239 	oldrd = up->u_rdir;
240 	up->u_rdir = p->p_zone->zone_rootvp;
241 
242 	VN_HOLD(p->p_zone->zone_rootvp);
243 	oldcd = up->u_cdir;
244 	up->u_cdir = p->p_zone->zone_rootvp;
245 
246 	if (up->u_cwd != NULL) {
247 		refstr_rele(up->u_cwd);
248 		up->u_cwd = NULL;
249 	}
250 
251 	mutex_exit(&p->p_lock);
252 
253 	if (oldrd != NULL)
254 		VN_RELE(oldrd);
255 	if (oldcd != NULL)
256 		VN_RELE(oldcd);
257 
258 	/* Free the controlling tty.  (freectty() always assumes curproc.) */
259 	ASSERT(p == curproc);
260 	(void) freectty(B_TRUE);
261 
262 	/*
263 	 * Now exec() the new init(1M) on top of the current process.  If we
264 	 * succeed, the caller will treat this like a successful system call.
265 	 * If we fail, we issue messages and the caller will proceed with exit.
266 	 */
267 	err = exec_init(p->p_zone->zone_initname, NULL);
268 
269 	if (err == 0)
270 		return (0);
271 
272 	zcmn_err(p->p_zone->zone_id, CE_WARN,
273 	    "failed to restart init(1M) (err=%d): system reboot required", err);
274 
275 	if (!INGLOBALZONE(p)) {
276 		cmn_err(CE_WARN, "failed to restart init(1M) for zone %s "
277 		    "(pid %d, err=%d): zoneadm(1M) boot required",
278 		    p->p_zone->zone_name, p->p_pid, err);
279 	}
280 
281 	return (-1);
282 }
283 
284 /*
285  * Release resources.
286  * Enter zombie state.
287  * Wake up parent and init processes,
288  * and dispose of children.
289  */
290 void
291 exit(int why, int what)
292 {
293 	/*
294 	 * If proc_exit() fails, then some other lwp in the process
295 	 * got there first.  We just have to call lwp_exit() to allow
296 	 * the other lwp to finish exiting the process.  Otherwise we're
297 	 * restarting init, and should return.
298 	 */
299 	if (proc_exit(why, what) != 0) {
300 		mutex_enter(&curproc->p_lock);
301 		ASSERT(curproc->p_flag & SEXITLWPS);
302 		lwp_exit();
303 		/* NOTREACHED */
304 	}
305 }
306 
/*
 * Set the SEXITING flag on the process, after making sure /proc does
 * not have it locked.  This is done in more places than proc_exit(),
 * so it is a separate function.
 *
 * Takes and drops p->p_lock itself, so the caller must not already
 * hold it.
 */
void
proc_is_exiting(proc_t *p)
{
	mutex_enter(&p->p_lock);
	prbarrier(p);		/* the /proc check described above */
	p->p_flag |= SEXITING;
	mutex_exit(&p->p_lock);
}
320 
/*
 * Tear down the calling process on behalf of its last lwp.
 *
 * 'why' is a CLD_* code and 'what' is the exit status or signal number
 * that goes with it; together they are converted with wstat() for
 * accounting and are stored in p_wcode/p_wdata for the parent to collect.
 *
 * Return value:
 *   1 - exitlwps() failed, call (or continue) lwp_exit()
 *   0 - restarting init.  Return through system call path
 *
 * On the normal exit path this function does not return at all: it ends
 * in thread_exit().
 */
int
proc_exit(int why, int what)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	zone_t *z = p->p_zone;
	timeout_id_t tmp_id;
	int rv;
	proc_t *q;
	task_t *tk;
	vnode_t *exec_vp, *execdir_vp, *cdir, *rdir;
	sigqueue_t *sqp;
	lwpdir_t *lwpdir;
	uint_t lwpdir_sz;
	tidhash_t *tidhash;
	uint_t tidhash_sz;
	ret_tidhash_t *ret_tidhash;
	refstr_t *cwd;
	hrtime_t hrutime, hrstime;
	int evaporate;

	/*
	 * Stop and discard the process's lwps except for the current one,
	 * unless some other lwp beat us to it.  If exitlwps() fails then
	 * return and the calling lwp will call (or continue in) lwp_exit().
	 */
	proc_is_exiting(p);
	if (exitlwps(0) != 0)
		return (1);

	mutex_enter(&p->p_lock);
	if (p->p_ttime > 0) {
		/*
		 * Account any remaining ticks charged to this process
		 * on its way out.
		 */
		(void) task_cpu_time_incr(p->p_task, p->p_ttime);
		p->p_ttime = 0;
	}
	mutex_exit(&p->p_lock);

	/*
	 * NOTE(review): these probes fire before the init-restart check
	 * below, so they also fire when restart_init() succeeds and this
	 * function returns 0 without the process actually exiting.
	 * Confirm that this is intended.
	 */
	DTRACE_PROC(lwp__exit);
	DTRACE_PROC1(exit, int, why);

	/*
	 * Will perform any brand specific proc exit processing, since this
	 * is always the last lwp, will also perform lwp_exit and free brand
	 * data
	 */
	if (PROC_IS_BRANDED(p)) {
		lwp_detach_brand_hdlrs(lwp);
		brand_clearbrand(p, B_FALSE);
	}

	/*
	 * Don't let init exit unless zone_start_init() failed its exec, or
	 * we are shutting down the zone or the machine.
	 *
	 * Since we are single threaded, we don't need to lock the
	 * following accesses to zone_proc_initpid.
	 */
	if (p->p_pid == z->zone_proc_initpid) {
		if (z->zone_boot_err == 0 &&
		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN &&
		    z->zone_restart_init == B_TRUE &&
		    restart_init(what, why) == 0)
			return (0);
		/*
		 * Since we didn't or couldn't restart init, we clear
		 * the zone's init state and proceed with exit
		 * processing.
		 */
		z->zone_proc_initpid = -1;
	}

	lwp_pcb_exit();

	/*
	 * Allocate a sigqueue now, before we grab locks.
	 * It will be given to sigcld(), below.
	 * Special case:  If we will be making the process disappear
	 * without a trace because it is either:
	 *	* an exiting SSYS process, or
	 *	* a posix_spawn() vfork child who requests it,
	 * we don't bother to allocate a useless sigqueue.
	 *
	 * Note that sqp is left unassigned in the evaporate case; it is
	 * only read on the !evaporate path near the end of this function.
	 */
	evaporate = (p->p_flag & SSYS) || ((p->p_flag & SVFORK) &&
	    why == CLD_EXITED && what == _EVAPORATE);
	if (!evaporate)
		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);

	/*
	 * revoke any doors created by the process.
	 */
	if (p->p_door_list)
		door_exit();

	/*
	 * Release schedctl data structures.
	 */
	if (p->p_pagep)
		schedctl_proc_cleanup();

	/*
	 * make sure all pending kaio has completed.
	 */
	if (p->p_aio)
		aio_cleanup_exit();

	/*
	 * discard the lwpchan cache.
	 */
	if (p->p_lcp != NULL)
		lwpchan_destroy_cache(0);

	/*
	 * Clean up any DTrace helper actions or probes for the process.
	 */
	if (p->p_dtrace_helpers != NULL) {
		ASSERT(dtrace_helpers_cleanup != NULL);
		(*dtrace_helpers_cleanup)();
	}

	/* untimeout the realtime timers */
	if (p->p_itimer != NULL)
		timer_exit();

	if ((tmp_id = p->p_alarmid) != 0) {
		p->p_alarmid = 0;
		(void) untimeout(tmp_id);
	}

	/*
	 * Remove any fpollinfo_t's for this (last) thread from our file
	 * descriptors so closeall() can ASSERT() that they're all gone.
	 */
	pollcleanup();

	if (p->p_rprof_cyclic != CYCLIC_NONE) {
		mutex_enter(&cpu_lock);
		cyclic_remove(p->p_rprof_cyclic);
		mutex_exit(&cpu_lock);
	}

	mutex_enter(&p->p_lock);

	/*
	 * Clean up any DTrace probes associated with this process.
	 */
	if (p->p_dtrace_probes) {
		ASSERT(dtrace_fasttrap_exit_ptr != NULL);
		dtrace_fasttrap_exit_ptr(p);
	}

	/*
	 * p_lock is dropped around each untimeout() call, so re-read
	 * p_itimerid after reacquiring it in case it was set again.
	 */
	while ((tmp_id = p->p_itimerid) != 0) {
		p->p_itimerid = 0;
		mutex_exit(&p->p_lock);
		(void) untimeout(tmp_id);
		mutex_enter(&p->p_lock);
	}

	lwp_cleanup();

	/*
	 * We are about to exit; prevent our resource associations from
	 * being changed.
	 */
	pool_barrier_enter();

	/*
	 * Block the process against /proc now that we have really
	 * acquired p->p_lock (to manipulate p_tlist at least).
	 */
	prbarrier(p);

	/*
	 * Ignore every signal from here on and throw away anything that
	 * is pending or current, for both the process and this thread.
	 */
	sigfillset(&p->p_ignore);
	sigemptyset(&p->p_siginfo);
	sigemptyset(&p->p_sig);
	sigemptyset(&p->p_extsig);
	sigemptyset(&t->t_sig);
	sigemptyset(&t->t_extsig);
	sigemptyset(&p->p_sigmask);
	sigdelq(p, t, 0);
	lwp->lwp_cursig = 0;
	lwp->lwp_extsig = 0;
	p->p_flag &= ~(SKILLED | SEXTKILLED);
	if (lwp->lwp_curinfo) {
		siginfofree(lwp->lwp_curinfo);
		lwp->lwp_curinfo = NULL;
	}

	t->t_proc_flag |= TP_LWPEXIT;
	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
	prlwpexit(t);		/* notify /proc */
	lwp_hash_out(p, t->t_tid);
	prexit(p);

	p->p_lwpcnt = 0;
	p->p_tlist = NULL;
	sigqfree(p);
	term_mstate(t);
	p->p_mterm = gethrtime();

	/*
	 * Detach the executable vnodes while p_lock is held; they are
	 * closed and released further down, after relvm().
	 */
	exec_vp = p->p_exec;
	execdir_vp = p->p_execdir;
	p->p_exec = NULLVP;
	p->p_execdir = NULLVP;
	mutex_exit(&p->p_lock);

	pr_free_watched_pages(p);

	closeall(P_FINFO(p));

	/* Free the controlling tty.  (freectty() always assumes curproc.) */
	ASSERT(p == curproc);
	(void) freectty(B_TRUE);

#if defined(__sparc)
	if (p->p_utraps != NULL)
		utrap_free(p);
#endif
	if (p->p_semacct)			/* IPC semaphore exit */
		semexit(p);
	rv = wstat(why, what);

	acct(rv & 0xff);
	exacct_commit_proc(p, rv);

	/*
	 * Release any resources associated with C2 auditing
	 */
	if (AU_AUDITING()) {
		/*
		 * audit exit system call
		 */
		audit_exit(why, what);
	}

	/*
	 * Free address space.
	 */
	relvm();

	if (exec_vp) {
		/*
		 * Close this executable which has been opened when the process
		 * was created by getproc().
		 */
		(void) VOP_CLOSE(exec_vp, FREAD, 1, (offset_t)0, CRED(), NULL);
		VN_RELE(exec_vp);
	}
	if (execdir_vp)
		VN_RELE(execdir_vp);

	/*
	 * Release held contracts.
	 */
	contract_exit(p);

	/*
	 * Depart our encapsulating process contract.
	 */
	if ((p->p_flag & SSYS) == 0) {
		ASSERT(p->p_ct_process);
		contract_process_exit(p->p_ct_process, p, rv);
	}

	/*
	 * Remove pool association, and block if requested by pool_do_bind.
	 */
	mutex_enter(&p->p_lock);
	ASSERT(p->p_pool->pool_ref > 0);
	atomic_add_32(&p->p_pool->pool_ref, -1);
	p->p_pool = pool_default;
	/*
	 * Now that our address space has been freed and all other threads
	 * in this process have exited, set the PEXITED pool flag.  This
	 * tells the pools subsystems to ignore this process if it was
	 * requested to rebind this process to a new pool.
	 */
	p->p_poolflag |= PEXITED;
	pool_barrier_exit();
	mutex_exit(&p->p_lock);

	/*
	 * pidlock is held from here until after sigcld()/freeproc() below.
	 */
	mutex_enter(&pidlock);

	/*
	 * Delete this process from the newstate list of its parent. We
	 * will put it in the right place in the sigcld in the end.
	 */
	delete_ns(p->p_parent, p);

	/*
	 * Reassign the orphans to the next of kin.
	 * Don't rearrange init's orphanage.
	 */
	if ((q = p->p_orphan) != NULL && p != proc_init) {

		proc_t *nokp = p->p_nextofkin;

		/*
		 * Repoint every orphan at the next of kin; the loop leaves
		 * q at the last orphan so the whole list can be spliced in
		 * front of the next of kin's existing orphans.
		 */
		for (;;) {
			q->p_nextofkin = nokp;
			if (q->p_nextorph == NULL)
				break;
			q = q->p_nextorph;
		}
		q->p_nextorph = nokp->p_orphan;
		nokp->p_orphan = p->p_orphan;
		p->p_orphan = NULL;
	}

	/*
	 * Reassign the children to init.
	 * Don't try to assign init's children to init.
	 */
	if ((q = p->p_child) != NULL && p != proc_init) {
		struct proc	*np;
		struct proc	*initp = proc_init;
		boolean_t	setzonetop = B_FALSE;

		if (!INGLOBALZONE(curproc))
			setzonetop = B_TRUE;

		pgdetach(p);

		do {
			/* Save the next sibling; q is relinked below. */
			np = q->p_sibling;
			/*
			 * Delete it from its current parent new state
			 * list and add it to init new state list
			 */
			delete_ns(q->p_parent, q);

			q->p_ppid = 1;
			q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
			if (setzonetop) {
				mutex_enter(&q->p_lock);
				q->p_flag |= SZONETOP;
				mutex_exit(&q->p_lock);
			}
			q->p_parent = initp;

			/*
			 * Since q will be the first child,
			 * it will not have a previous sibling.
			 */
			q->p_psibling = NULL;
			if (initp->p_child) {
				initp->p_child->p_psibling = q;
			}
			q->p_sibling = initp->p_child;
			initp->p_child = q;
			if (q->p_proc_flag & P_PR_PTRACE) {
				mutex_enter(&q->p_lock);
				sigtoproc(q, NULL, SIGKILL);
				mutex_exit(&q->p_lock);
			}
			/*
			 * sigcld() will add the child to parents
			 * newstate list.
			 */
			if (q->p_stat == SZOMB)
				sigcld(q, NULL);
		} while ((q = np) != NULL);

		p->p_child = NULL;
		ASSERT(p->p_child_ns == NULL);
	}

	TRACE_1(TR_FAC_PROC, TR_PROC_EXIT, "proc_exit: %p", p);

	mutex_enter(&p->p_lock);
	CL_EXIT(curthread); /* tell the scheduler that curthread is exiting */

	/*
	 * Have our task accumulate our resource usage data before they
	 * become contaminated by p_cacct etc., and before we renounce
	 * membership of the task.
	 *
	 * We do this regardless of whether or not task accounting is active.
	 * This is to avoid having nonsense data reported for this task if
	 * task accounting is subsequently enabled. The overhead is minimal;
	 * by this point, this process has accounted for the usage of all its
	 * LWPs. We nonetheless do the work here, and under the protection of
	 * pidlock, so that the movement of the process's usage to the task
	 * happens at the same time as the removal of the process from the
	 * task, from the point of view of exacct_snapshot_task_usage().
	 */
	exacct_update_task_mstate(p);

	/*
	 * Fold the accumulated child totals (p_cutime, p_cstime, p_cacct,
	 * p_cru) into this process's own figures.
	 */
	hrutime = mstate_aggr_state(p, LMS_USER);
	hrstime = mstate_aggr_state(p, LMS_SYSTEM);
	p->p_utime = (clock_t)NSEC_TO_TICK(hrutime) + p->p_cutime;
	p->p_stime = (clock_t)NSEC_TO_TICK(hrstime) + p->p_cstime;

	p->p_acct[LMS_USER]	+= p->p_cacct[LMS_USER];
	p->p_acct[LMS_SYSTEM]	+= p->p_cacct[LMS_SYSTEM];
	p->p_acct[LMS_TRAP]	+= p->p_cacct[LMS_TRAP];
	p->p_acct[LMS_TFAULT]	+= p->p_cacct[LMS_TFAULT];
	p->p_acct[LMS_DFAULT]	+= p->p_cacct[LMS_DFAULT];
	p->p_acct[LMS_KFAULT]	+= p->p_cacct[LMS_KFAULT];
	p->p_acct[LMS_USER_LOCK] += p->p_cacct[LMS_USER_LOCK];
	p->p_acct[LMS_SLEEP]	+= p->p_cacct[LMS_SLEEP];
	p->p_acct[LMS_WAIT_CPU]	+= p->p_cacct[LMS_WAIT_CPU];
	p->p_acct[LMS_STOPPED]	+= p->p_cacct[LMS_STOPPED];

	p->p_ru.minflt	+= p->p_cru.minflt;
	p->p_ru.majflt	+= p->p_cru.majflt;
	p->p_ru.nswap	+= p->p_cru.nswap;
	p->p_ru.inblock	+= p->p_cru.inblock;
	p->p_ru.oublock	+= p->p_cru.oublock;
	p->p_ru.msgsnd	+= p->p_cru.msgsnd;
	p->p_ru.msgrcv	+= p->p_cru.msgrcv;
	p->p_ru.nsignals += p->p_cru.nsignals;
	p->p_ru.nvcsw	+= p->p_cru.nvcsw;
	p->p_ru.nivcsw	+= p->p_cru.nivcsw;
	p->p_ru.sysc	+= p->p_cru.sysc;
	p->p_ru.ioch	+= p->p_cru.ioch;

	/*
	 * The process is now a zombie; record the exit code and data for
	 * whoever waits on it.
	 */
	p->p_stat = SZOMB;
	p->p_proc_flag &= ~P_PR_PTRACE;
	p->p_wdata = what;
	p->p_wcode = (char)why;

	/*
	 * Remember the directory references now; they are released only
	 * after pidlock has been dropped, near the end of this function.
	 */
	cdir = PTOU(p)->u_cdir;
	rdir = PTOU(p)->u_rdir;
	cwd = PTOU(p)->u_cwd;

	ASSERT(cdir != NULL || p->p_parent == &p0);

	/*
	 * Release resource controls, as they are no longer enforceable.
	 */
	rctl_set_free(p->p_rctls);

	/*
	 * Decrement tk_nlwps counter for our task.max-lwps resource control.
	 * An extended accounting record, if that facility is active, is
	 * scheduled to be written.  We cannot give up task and project
	 * membership at this point because that would allow zombies to escape
	 * from the max-processes resource controls.  Zombies stay in their
	 * current task and project until the process table slot is released
	 * in freeproc().
	 */
	tk = p->p_task;

	mutex_enter(&p->p_zone->zone_nlwps_lock);
	tk->tk_nlwps--;
	tk->tk_proj->kpj_nlwps--;
	p->p_zone->zone_nlwps--;
	mutex_exit(&p->p_zone->zone_nlwps_lock);

	/*
	 * Clear the lwp directory and the lwpid hash table
	 * now that /proc can't bother us any more.
	 * We free the memory below, after dropping p->p_lock.
	 */
	lwpdir = p->p_lwpdir;
	lwpdir_sz = p->p_lwpdir_sz;
	tidhash = p->p_tidhash;
	tidhash_sz = p->p_tidhash_sz;
	ret_tidhash = p->p_ret_tidhash;
	p->p_lwpdir = NULL;
	p->p_lwpfree = NULL;
	p->p_lwpdir_sz = 0;
	p->p_tidhash = NULL;
	p->p_tidhash_sz = 0;
	p->p_ret_tidhash = NULL;

	/*
	 * If the process has context ops installed, call the exit routine
	 * on behalf of this last remaining thread. Normally exitpctx() is
	 * called during thread_exit() or lwp_exit(), but because this is the
	 * last thread in the process, we must call it here. By the time
	 * thread_exit() is called (below), the association with the relevant
	 * process has been lost.
	 *
	 * We also free the context here.
	 */
	if (p->p_pctx) {
		kpreempt_disable();
		exitpctx(p);
		kpreempt_enable();

		freepctx(p, 0);
	}

	/*
	 * curthread's proc pointer is changed to point to the 'sched'
	 * process for the corresponding zone, except in the case when
	 * the exiting process is in fact a zsched instance, in which
	 * case the proc pointer is set to p0.  We do so, so that the
	 * process still points at the right zone when we call the VN_RELE()
	 * below.
	 *
	 * This is because curthread's original proc pointer can be freed as
	 * soon as the child sends a SIGCLD to its parent.  We use zsched so
	 * that for user processes, even in the final moments of death, the
	 * process is still associated with its zone.
	 */
	if (p != t->t_procp->p_zone->zone_zsched)
		t->t_procp = t->t_procp->p_zone->zone_zsched;
	else
		t->t_procp = &p0;

	mutex_exit(&p->p_lock);
	if (!evaporate) {
		p->p_pidflag &= ~CLDPEND;
		sigcld(p, sqp);
	} else {
		/*
		 * Do what sigcld() would do if the disposition
		 * of the SIGCHLD signal were set to be ignored.
		 */
		cv_broadcast(&p->p_srwchan_cv);
		freeproc(p);
	}
	mutex_exit(&pidlock);

	/*
	 * We don't release u_cdir and u_rdir until SZOMB is set.
	 * This protects us against dofusers().
	 */
	if (cdir)
		VN_RELE(cdir);
	if (rdir)
		VN_RELE(rdir);
	if (cwd)
		refstr_rele(cwd);

	/*
	 * task_rele() may ultimately cause the zone to go away (or
	 * may cause the last user process in a zone to go away, which
	 * signals zsched to go away).  So prior to this call, we must
	 * no longer point at zsched.
	 */
	t->t_procp = &p0;

	/*
	 * Free the lwp directory and tid hash tables that were detached
	 * from the process above.
	 */
	kmem_free(lwpdir, lwpdir_sz * sizeof (lwpdir_t));
	kmem_free(tidhash, tidhash_sz * sizeof (tidhash_t));
	while (ret_tidhash != NULL) {
		ret_tidhash_t *next = ret_tidhash->rth_next;
		kmem_free(ret_tidhash->rth_tidhash,
		    ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
		kmem_free(ret_tidhash, sizeof (*ret_tidhash));
		ret_tidhash = next;
	}

	thread_exit();
	/* NOTREACHED */
}
879 
880 /*
881  * Format siginfo structure for wait system calls.
882  */
883 void
884 winfo(proc_t *pp, k_siginfo_t *ip, int waitflag)
885 {
886 	ASSERT(MUTEX_HELD(&pidlock));
887 
888 	bzero(ip, sizeof (k_siginfo_t));
889 	ip->si_signo = SIGCLD;
890 	ip->si_code = pp->p_wcode;
891 	ip->si_pid = pp->p_pid;
892 	ip->si_ctid = PRCTID(pp);
893 	ip->si_zoneid = pp->p_zone->zone_id;
894 	ip->si_status = pp->p_wdata;
895 	ip->si_stime = pp->p_stime;
896 	ip->si_utime = pp->p_utime;
897 
898 	if (waitflag) {
899 		pp->p_wcode = 0;
900 		pp->p_wdata = 0;
901 		pp->p_pidflag &= ~CLDPEND;
902 	}
903 }
904 
905 /*
906  * Wait system call.
907  * Search for a terminated (zombie) child,
908  * finally lay it to rest, and collect its status.
909  * Look also for stopped children,
910  * and pass back status from them.
911  */
912 int
913 waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options)
914 {
915 	int found;
916 	proc_t *cp, *pp;
917 	int proc_gone;
918 	int waitflag = !(options & WNOWAIT);
919 
920 	/*
921 	 * Obsolete flag, defined here only for binary compatibility
922 	 * with old statically linked executables.  Delete this when
923 	 * we no longer care about these old and broken applications.
924 	 */
925 #define	_WNOCHLD	0400
926 	options &= ~_WNOCHLD;
927 
928 	if (options == 0 || (options & ~WOPTMASK))
929 		return (EINVAL);
930 
931 	switch (idtype) {
932 	case P_PID:
933 	case P_PGID:
934 		if (id < 0 || id >= maxpid)
935 			return (EINVAL);
936 		/* FALLTHROUGH */
937 	case P_ALL:
938 		break;
939 	default:
940 		return (EINVAL);
941 	}
942 
943 	pp = ttoproc(curthread);
944 
945 	/*
946 	 * lock parent mutex so that sibling chain can be searched.
947 	 */
948 	mutex_enter(&pidlock);
949 
950 	/*
951 	 * if we are only looking for exited processes and child_ns list
952 	 * is empty no reason to look at all children.
953 	 */
954 	if (idtype == P_ALL &&
955 	    (options & ~WNOWAIT) == (WNOHANG | WEXITED) &&
956 	    pp->p_child_ns == NULL) {
957 		if (pp->p_child) {
958 			mutex_exit(&pidlock);
959 			bzero(ip, sizeof (k_siginfo_t));
960 			return (0);
961 		}
962 		mutex_exit(&pidlock);
963 		return (ECHILD);
964 	}
965 
966 	while (pp->p_child != NULL) {
967 
968 		proc_gone = 0;
969 
970 		for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) {
971 			if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID))
972 				continue;
973 			if (idtype == P_PID && id != cp->p_pid)
974 				continue;
975 			if (idtype == P_PGID && id != cp->p_pgrp)
976 				continue;
977 
978 			switch (cp->p_wcode) {
979 
980 			case CLD_TRAPPED:
981 			case CLD_STOPPED:
982 			case CLD_CONTINUED:
983 				cmn_err(CE_PANIC,
984 				    "waitid: wrong state %d on the p_newstate"
985 				    " list", cp->p_wcode);
986 				break;
987 
988 			case CLD_EXITED:
989 			case CLD_DUMPED:
990 			case CLD_KILLED:
991 				if (!(options & WEXITED)) {
992 					/*
993 					 * Count how many are already gone
994 					 * for good.
995 					 */
996 					proc_gone++;
997 					break;
998 				}
999 				if (!waitflag) {
1000 					winfo(cp, ip, 0);
1001 				} else {
1002 					winfo(cp, ip, 1);
1003 					freeproc(cp);
1004 				}
1005 				mutex_exit(&pidlock);
1006 				if (waitflag) {		/* accept SIGCLD */
1007 					sigcld_delete(ip);
1008 					sigcld_repost();
1009 				}
1010 				return (0);
1011 			}
1012 
1013 			if (idtype == P_PID)
1014 				break;
1015 		}
1016 
1017 		/*
1018 		 * Wow! None of the threads on the p_sibling_ns list were
1019 		 * interesting threads. Check all the kids!
1020 		 */
1021 		found = 0;
1022 		for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) {
1023 			if (idtype == P_PID && id != cp->p_pid)
1024 				continue;
1025 			if (idtype == P_PGID && id != cp->p_pgrp)
1026 				continue;
1027 
1028 			switch (cp->p_wcode) {
1029 			case CLD_TRAPPED:
1030 				if (!(options & WTRAPPED))
1031 					break;
1032 				winfo(cp, ip, waitflag);
1033 				mutex_exit(&pidlock);
1034 				if (waitflag) {		/* accept SIGCLD */
1035 					sigcld_delete(ip);
1036 					sigcld_repost();
1037 				}
1038 				return (0);
1039 
1040 			case CLD_STOPPED:
1041 				if (!(options & WSTOPPED))
1042 					break;
1043 				/* Is it still stopped? */
1044 				mutex_enter(&cp->p_lock);
1045 				if (!jobstopped(cp)) {
1046 					mutex_exit(&cp->p_lock);
1047 					break;
1048 				}
1049 				mutex_exit(&cp->p_lock);
1050 				winfo(cp, ip, waitflag);
1051 				mutex_exit(&pidlock);
1052 				if (waitflag) {		/* accept SIGCLD */
1053 					sigcld_delete(ip);
1054 					sigcld_repost();
1055 				}
1056 				return (0);
1057 
1058 			case CLD_CONTINUED:
1059 				if (!(options & WCONTINUED))
1060 					break;
1061 				winfo(cp, ip, waitflag);
1062 				mutex_exit(&pidlock);
1063 				if (waitflag) {		/* accept SIGCLD */
1064 					sigcld_delete(ip);
1065 					sigcld_repost();
1066 				}
1067 				return (0);
1068 
1069 			case CLD_EXITED:
1070 			case CLD_DUMPED:
1071 			case CLD_KILLED:
1072 				if (idtype != P_PID &&
1073 				    (cp->p_pidflag & CLDWAITPID))
1074 					continue;
1075 				/*
1076 				 * Don't complain if a process was found in
1077 				 * the first loop but we broke out of the loop
1078 				 * because of the arguments passed to us.
1079 				 */
1080 				if (proc_gone == 0) {
1081 					cmn_err(CE_PANIC,
1082 					    "waitid: wrong state on the"
1083 					    " p_child list");
1084 				} else {
1085 					break;
1086 				}
1087 			}
1088 
1089 			found++;
1090 
1091 			if (idtype == P_PID)
1092 				break;
1093 		}
1094 
1095 		/*
1096 		 * If we found no interesting processes at all,
1097 		 * break out and return ECHILD.
1098 		 */
1099 		if (found + proc_gone == 0)
1100 			break;
1101 
1102 		if (options & WNOHANG) {
1103 			mutex_exit(&pidlock);
1104 			bzero(ip, sizeof (k_siginfo_t));
1105 			/*
1106 			 * We should set ip->si_signo = SIGCLD,
1107 			 * but there is an SVVS test that expects
1108 			 * ip->si_signo to be zero in this case.
1109 			 */
1110 			return (0);
1111 		}
1112 
1113 		/*
1114 		 * If we found no processes of interest that could
1115 		 * change state while we wait, we don't wait at all.
1116 		 * Get out with ECHILD according to SVID.
1117 		 */
1118 		if (found == proc_gone)
1119 			break;
1120 
1121 		if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) {
1122 			mutex_exit(&pidlock);
1123 			return (EINTR);
1124 		}
1125 	}
1126 	mutex_exit(&pidlock);
1127 	return (ECHILD);
1128 }
1129 
1130 int
1131 waitsys(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1132 {
1133 	int error;
1134 	k_siginfo_t info;
1135 
1136 	if (error = waitid(idtype, id, &info, options))
1137 		return (set_errno(error));
1138 	if (copyout(&info, infop, sizeof (k_siginfo_t)))
1139 		return (set_errno(EFAULT));
1140 	return (0);
1141 }
1142 
1143 #ifdef _SYSCALL32_IMPL
1144 
1145 int
1146 waitsys32(idtype_t idtype, id_t id, siginfo_t *infop, int options)
1147 {
1148 	int error;
1149 	k_siginfo_t info;
1150 	siginfo32_t info32;
1151 
1152 	if (error = waitid(idtype, id, &info, options))
1153 		return (set_errno(error));
1154 	siginfo_kto32(&info, &info32);
1155 	if (copyout(&info32, infop, sizeof (info32)))
1156 		return (set_errno(EFAULT));
1157 	return (0);
1158 }
1159 
1160 #endif	/* _SYSCALL32_IMPL */
1161 
1162 void
1163 proc_detach(proc_t *p)
1164 {
1165 	proc_t *q;
1166 
1167 	ASSERT(MUTEX_HELD(&pidlock));
1168 
1169 	q = p->p_parent;
1170 	ASSERT(q != NULL);
1171 
1172 	/*
1173 	 * Take it off the newstate list of its parent
1174 	 */
1175 	delete_ns(q, p);
1176 
1177 	if (q->p_child == p) {
1178 		q->p_child = p->p_sibling;
1179 		/*
1180 		 * If the parent has no children, it better not
1181 		 * have any with new states either!
1182 		 */
1183 		ASSERT(q->p_child ? 1 : q->p_child_ns == NULL);
1184 	}
1185 
1186 	if (p->p_sibling) {
1187 		p->p_sibling->p_psibling = p->p_psibling;
1188 	}
1189 
1190 	if (p->p_psibling) {
1191 		p->p_psibling->p_sibling = p->p_sibling;
1192 	}
1193 }
1194 
1195 /*
1196  * Remove zombie children from the process table.
1197  */
1198 void
1199 freeproc(proc_t *p)
1200 {
1201 	proc_t *q;
1202 	task_t *tk;
1203 
1204 	ASSERT(p->p_stat == SZOMB);
1205 	ASSERT(p->p_tlist == NULL);
1206 	ASSERT(MUTEX_HELD(&pidlock));
1207 
1208 	sigdelq(p, NULL, 0);
1209 	if (p->p_killsqp) {
1210 		siginfofree(p->p_killsqp);
1211 		p->p_killsqp = NULL;
1212 	}
1213 
1214 	prfree(p);	/* inform /proc */
1215 
1216 	/*
1217 	 * Don't free the init processes.
1218 	 * Other dying processes will access it.
1219 	 */
1220 	if (p == proc_init)
1221 		return;
1222 
1223 
1224 	/*
1225 	 * We wait until now to free the cred structure because a
1226 	 * zombie process's credentials may be examined by /proc.
1227 	 * No cred locking needed because there are no threads at this point.
1228 	 */
1229 	upcount_dec(crgetruid(p->p_cred), crgetzoneid(p->p_cred));
1230 	crfree(p->p_cred);
1231 	if (p->p_corefile != NULL) {
1232 		corectl_path_rele(p->p_corefile);
1233 		p->p_corefile = NULL;
1234 	}
1235 	if (p->p_content != NULL) {
1236 		corectl_content_rele(p->p_content);
1237 		p->p_content = NULL;
1238 	}
1239 
1240 	if (p->p_nextofkin && !((p->p_nextofkin->p_flag & SNOWAIT) ||
1241 	    (PTOU(p->p_nextofkin)->u_signal[SIGCLD - 1] == SIG_IGN))) {
1242 		/*
1243 		 * This should still do the right thing since p_utime/stime
1244 		 * get set to the correct value on process exit, so it
1245 		 * should get properly updated
1246 		 */
1247 		p->p_nextofkin->p_cutime += p->p_utime;
1248 		p->p_nextofkin->p_cstime += p->p_stime;
1249 
1250 		p->p_nextofkin->p_cacct[LMS_USER] += p->p_acct[LMS_USER];
1251 		p->p_nextofkin->p_cacct[LMS_SYSTEM] += p->p_acct[LMS_SYSTEM];
1252 		p->p_nextofkin->p_cacct[LMS_TRAP] += p->p_acct[LMS_TRAP];
1253 		p->p_nextofkin->p_cacct[LMS_TFAULT] += p->p_acct[LMS_TFAULT];
1254 		p->p_nextofkin->p_cacct[LMS_DFAULT] += p->p_acct[LMS_DFAULT];
1255 		p->p_nextofkin->p_cacct[LMS_KFAULT] += p->p_acct[LMS_KFAULT];
1256 		p->p_nextofkin->p_cacct[LMS_USER_LOCK]
1257 		    += p->p_acct[LMS_USER_LOCK];
1258 		p->p_nextofkin->p_cacct[LMS_SLEEP] += p->p_acct[LMS_SLEEP];
1259 		p->p_nextofkin->p_cacct[LMS_WAIT_CPU]
1260 		    += p->p_acct[LMS_WAIT_CPU];
1261 		p->p_nextofkin->p_cacct[LMS_STOPPED] += p->p_acct[LMS_STOPPED];
1262 
1263 		p->p_nextofkin->p_cru.minflt	+= p->p_ru.minflt;
1264 		p->p_nextofkin->p_cru.majflt	+= p->p_ru.majflt;
1265 		p->p_nextofkin->p_cru.nswap	+= p->p_ru.nswap;
1266 		p->p_nextofkin->p_cru.inblock	+= p->p_ru.inblock;
1267 		p->p_nextofkin->p_cru.oublock	+= p->p_ru.oublock;
1268 		p->p_nextofkin->p_cru.msgsnd	+= p->p_ru.msgsnd;
1269 		p->p_nextofkin->p_cru.msgrcv	+= p->p_ru.msgrcv;
1270 		p->p_nextofkin->p_cru.nsignals	+= p->p_ru.nsignals;
1271 		p->p_nextofkin->p_cru.nvcsw	+= p->p_ru.nvcsw;
1272 		p->p_nextofkin->p_cru.nivcsw	+= p->p_ru.nivcsw;
1273 		p->p_nextofkin->p_cru.sysc	+= p->p_ru.sysc;
1274 		p->p_nextofkin->p_cru.ioch	+= p->p_ru.ioch;
1275 
1276 	}
1277 
1278 	q = p->p_nextofkin;
1279 	if (q && q->p_orphan == p)
1280 		q->p_orphan = p->p_nextorph;
1281 	else if (q) {
1282 		for (q = q->p_orphan; q; q = q->p_nextorph)
1283 			if (q->p_nextorph == p)
1284 				break;
1285 		ASSERT(q && q->p_nextorph == p);
1286 		q->p_nextorph = p->p_nextorph;
1287 	}
1288 
1289 	/*
1290 	 * The process table slot is being freed, so it is now safe to give up
1291 	 * task and project membership.
1292 	 */
1293 	mutex_enter(&p->p_lock);
1294 	tk = p->p_task;
1295 	task_detach(p);
1296 	mutex_exit(&p->p_lock);
1297 
1298 	proc_detach(p);
1299 	pid_exit(p, tk);	/* frees pid and proc structure */
1300 
1301 	task_rele(tk);
1302 }
1303 
1304 /*
1305  * Delete process "child" from the newstate list of process "parent"
1306  */
1307 void
1308 delete_ns(proc_t *parent, proc_t *child)
1309 {
1310 	proc_t **ns;
1311 
1312 	ASSERT(MUTEX_HELD(&pidlock));
1313 	ASSERT(child->p_parent == parent);
1314 	for (ns = &parent->p_child_ns; *ns != NULL; ns = &(*ns)->p_sibling_ns) {
1315 		if (*ns == child) {
1316 
1317 			ASSERT((*ns)->p_parent == parent);
1318 
1319 			*ns = child->p_sibling_ns;
1320 			child->p_sibling_ns = NULL;
1321 			return;
1322 		}
1323 	}
1324 }
1325 
1326 /*
1327  * Add process "child" to the new state list of process "parent"
1328  */
1329 void
1330 add_ns(proc_t *parent, proc_t *child)
1331 {
1332 	ASSERT(child->p_sibling_ns == NULL);
1333 	child->p_sibling_ns = parent->p_child_ns;
1334 	parent->p_child_ns = child;
1335 }
1336