/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, Joyent, Inc. All rights reserved.
 * Copyright 2020 Oxide Computer Company
 * Copyright 2021 OmniOS Community Edition (OmniOSce) Association.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/proc.h>
#include <sys/ucontext.h>
#include <sys/procfs.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/var.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/wait.h>
#include <sys/siginfo.h>
#include <sys/procset.h>
#include <sys/class.h>
#include <sys/file.h>
#include <sys/session.h>
#include <sys/kmem.h>
#include <sys/vtrace.h>
#include <sys/prsystm.h>
#include <sys/ipc.h>
#include <sys/sem_impl.h>
#include <c2/audit.h>
#include <sys/aio_impl.h>
#include <vm/as.h>
#include <sys/poll.h>
#include <sys/door.h>
#include <sys/lwpchan_impl.h>
#include <sys/utrap.h>
#include <sys/task.h>
#include <sys/exacct.h>
#include <sys/cyclic.h>
#include <sys/schedctl.h>
#include <sys/rctl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/list.h>
#include <sys/dtrace.h>
#include <sys/pool.h>
#include <sys/sdt.h>
#include <sys/corectl.h>
#include <sys/core.h>
#include <sys/brand.h>
#include <sys/libc_kernel.h>

/*
 * convert code/data pair into old style wait status
 *
 * Only the low eight bits of data are used (data & 0377).  How they are
 * placed in the result depends on code:
 *	CLD_EXITED		shifted left by 8
 *	CLD_DUMPED		left in place, with WCOREFLG or'ed in
 *	CLD_KILLED		left in place
 *	CLD_TRAPPED/STOPPED	shifted left by 8, with WSTOPFLG or'ed in
 *	CLD_CONTINUED		discarded; the result is exactly WCONTFLG
 * Any other code is reported through cmn_err(CE_PANIC, ...).
 */
int
wstat(int code, int data)
{
	int stat = (data & 0377);

	switch (code) {
	case CLD_EXITED:
		stat <<= 8;
		break;
	case CLD_DUMPED:
		stat |= WCOREFLG;
		break;
	case CLD_KILLED:
		break;
	case CLD_TRAPPED:
	case CLD_STOPPED:
		stat <<= 8;
		stat |= WSTOPFLG;
		break;
	case CLD_CONTINUED:
		stat = WCONTFLG;
		break;
	default:
		cmn_err(CE_PANIC, "wstat: bad code");
		/* NOTREACHED */
	}
	return (stat);
}

/*
 * Format a short description of how a process ended into buf (at most
 * bufsz bytes, via snprintf()) and return buf so that the call can be used
 * directly as a printf-style argument.
 *
 * Note the argument order: what (the status or signal number that is
 * printed) comes before why (the CLD_* code that selects the message).
 * A why other than CLD_EXITED, CLD_KILLED or CLD_DUMPED produces the
 * "unknown error" text containing both values.
 */
static char *
exit_reason(char *buf, size_t bufsz, int what, int why)
{
	switch (why) {
	case CLD_EXITED:
		(void) snprintf(buf, bufsz, "exited with status %d", what);
		break;
	case CLD_KILLED:
		(void) snprintf(buf, bufsz, "exited on fatal signal %d", what);
		break;
	case CLD_DUMPED:
		(void) snprintf(buf, bufsz, "core dumped on signal %d", what);
		break;
	default:
		(void) snprintf(buf, bufsz, "encountered unknown error "
		    "(%d, %d)", why, what);
		break;
	}

	return (buf);
}

/*
 * exit system call: pass back caller's arg.
 */
void
rexit(int rval)
{
	exit(CLD_EXITED, rval);
}

/*
 * Called by proc_exit() when a zone's init exits, presumably because
 * it failed. As long as the given zone is still in the "running"
 * state, we will re-exec() init, but first we need to reset things
 * which are usually inherited across exec() but will break init's
 * assumption that it is being exec()'d from a virgin process. Most
 * importantly this includes closing all file descriptors (exec only
 * closes those marked close-on-exec) and resetting signals (exec only
 * resets handled signals, and we need to clear any signals which
 * killed init). Anything else that exec(2) says would be inherited,
 * but would affect the execution of init, needs to be reset.
 *
 * The arguments are in (what, why) order, the reverse of exit() and
 * proc_exit(); they are used only to build the warning messages.
 *
 * Returns 0 if exec_init() succeeded and -1 if it failed.
 */
static int
restart_init(int what, int why)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	proc_t *pp = p->p_zone->zone_zsched;
	user_t *up = PTOU(p);

	vnode_t *oldcd, *oldrd;
	int i, err;
	char reason_buf[64];

	/*
	 * Let zone admin (and global zone admin if this is for a non-global
	 * zone) know that init has failed and will be restarted.
	 */
	zcmn_err(p->p_zone->zone_id, CE_WARN,
	    "init(1M) %s: restarting automatically",
	    exit_reason(reason_buf, sizeof (reason_buf), what, why));

	/* reason_buf was filled in by the exit_reason() call above. */
	if (!INGLOBALZONE(p)) {
		cmn_err(CE_WARN, "init(1M) for zone %s (pid %d) %s: "
		    "restarting automatically",
		    p->p_zone->zone_name, p->p_pid, reason_buf);
	}

	/*
	 * Remove any fpollinfo_t's for this (last) thread from our file
	 * descriptors so closeall() can ASSERT() that they're all gone.
	 * Then close all open file descriptors in the process.
	 */
	pollcleanup();
	closeall(P_FINFO(p));

	/*
	 * Grab p_lock and begin clearing miscellaneous global process
	 * state that needs to be reset before we exec the new init(1M).
	 */

	mutex_enter(&p->p_lock);
	prbarrier(p);

	p->p_flag &= ~(SKILLED | SEXTKILLED | SEXITING | SDOCORE);
	up->u_cmask = CMASK;

	sigemptyset(&t->t_hold);
	sigemptyset(&t->t_sig);
	sigemptyset(&t->t_extsig);

	sigemptyset(&p->p_sig);
	sigemptyset(&p->p_extsig);

	sigdelq(p, t, 0);
	sigdelq(p, NULL, 0);

	if (p->p_killsqp) {
		siginfofree(p->p_killsqp);
		p->p_killsqp = NULL;
	}

	/*
	 * Reset any signals that are ignored back to the default disposition.
	 * Other u_signal members will be cleared when exec calls sigdefault().
	 */
	for (i = 1; i < NSIG; i++) {
		if (up->u_signal[i - 1] == SIG_IGN) {
			up->u_signal[i - 1] = SIG_DFL;
			sigemptyset(&up->u_sigmask[i - 1]);
		}
	}

	/*
	 * Clear the current signal, any signal info associated with it, and
	 * any signal information from contracts and/or contract templates.
	 */
	lwp->lwp_cursig = 0;
	lwp->lwp_extsig = 0;
	if (lwp->lwp_curinfo != NULL) {
		siginfofree(lwp->lwp_curinfo);
		lwp->lwp_curinfo = NULL;
	}
	lwp_ctmpl_clear(lwp);

	/*
	 * Reset both the process root directory and the current working
	 * directory to the root of the zone just as we do during boot.
	 * One hold is taken on zone_rootvp for each of the two references;
	 * the previous vnodes are only released after p_lock is dropped.
	 */
	VN_HOLD(p->p_zone->zone_rootvp);
	oldrd = up->u_rdir;
	up->u_rdir = p->p_zone->zone_rootvp;

	VN_HOLD(p->p_zone->zone_rootvp);
	oldcd = up->u_cdir;
	up->u_cdir = p->p_zone->zone_rootvp;

	if (up->u_cwd != NULL) {
		refstr_rele(up->u_cwd);
		up->u_cwd = NULL;
	}

	/* Reset security flags */
	mutex_enter(&pp->p_lock);
	p->p_secflags = pp->p_secflags;
	mutex_exit(&pp->p_lock);

	mutex_exit(&p->p_lock);

	if (oldrd != NULL)
		VN_RELE(oldrd);
	if (oldcd != NULL)
		VN_RELE(oldcd);

	/*
	 * It's possible that a zone's init will have become privilege aware
	 * and modified privilege sets; reset them.
	 */
	cred_t *oldcr, *newcr;

	mutex_enter(&p->p_crlock);
	oldcr = p->p_cred;
	mutex_enter(&pp->p_crlock);
	crhold(newcr = p->p_cred = pp->p_cred);
	mutex_exit(&pp->p_crlock);
	mutex_exit(&p->p_crlock);
	crfree(oldcr);
	/* Additional hold for the current thread - expected by crset() */
	crhold(newcr);
	crset(p, newcr);

	/* Free the controlling tty. (freectty() always assumes curproc.) */
	ASSERT(p == curproc);
	(void) freectty(B_TRUE);

	/*
	 * Now exec() the new init(1M) on top of the current process. If we
	 * succeed, the caller will treat this like a successful system call.
	 * If we fail, we issue messages and the caller will proceed with exit.
	 */
	err = exec_init(p->p_zone->zone_initname, NULL);

	if (err == 0)
		return (0);

	zcmn_err(p->p_zone->zone_id, CE_WARN,
	    "failed to restart init(1M) (err=%d): system reboot required", err);

	if (!INGLOBALZONE(p)) {
		cmn_err(CE_WARN, "failed to restart init(1M) for zone %s "
		    "(pid %d, err=%d): zoneadm(1M) boot required",
		    p->p_zone->zone_name, p->p_pid, err);
	}

	return (-1);
}

/*
 * Release resources.
 * Enter zombie state.
 * Wake up parent and init processes,
 * and dispose of children.
 *
 * why is the CLD_* reason code and what is the accompanying exit status
 * or signal number; both are handed unchanged to proc_exit().
 */
void
exit(int why, int what)
{
	/*
	 * If proc_exit() fails, then some other lwp in the process
	 * got there first. We just have to call lwp_exit() to allow
	 * the other lwp to finish exiting the process. Otherwise we're
	 * restarting init, and should return.
	 */
	if (proc_exit(why, what) != 0) {
		mutex_enter(&curproc->p_lock);
		ASSERT(curproc->p_flag & SEXITLWPS);
		lwp_exit();
		/* NOTREACHED */
	}
}

/*
 * Set the SEXITING flag on the process, after making sure /proc does
 * not have it locked. This is done in more places than proc_exit(),
 * so it is a separate function.
 */
void
proc_is_exiting(proc_t *p)
{
	mutex_enter(&p->p_lock);
	prbarrier(p);
	p->p_flag |= SEXITING;
	mutex_exit(&p->p_lock);
}

/*
 * Tear down the current process on behalf of the calling lwp.  why and what
 * are recorded in p_wcode and p_wdata and are also folded into an old style
 * wait status by wstat() for accounting and contract purposes.
 *
 * Return value:
 *   1 - exitlwps() failed, call (or continue) lwp_exit()
 *   0 - restarting init. Return through system call path
 *
 * On every other path this function does not return: it finishes by
 * calling thread_exit().
 */
int
proc_exit(int why, int what)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	zone_t *z = p->p_zone;
	timeout_id_t tmp_id;
	int rv;
	proc_t *q;
	task_t *tk;
	vnode_t *exec_vp, *execdir_vp, *cdir, *rdir;
	sigqueue_t *sqp;
	lwpdir_t *lwpdir;
	uint_t lwpdir_sz;
	tidhash_t *tidhash;
	uint_t tidhash_sz;
	ret_tidhash_t *ret_tidhash;
	refstr_t *cwd;
	hrtime_t hrutime, hrstime;
	int evaporate;

	/*
	 * Stop and discard the process's lwps except for the current one,
	 * unless some other lwp beat us to it. If exitlwps() fails then
	 * return and the calling lwp will call (or continue in) lwp_exit().
	 */
	proc_is_exiting(p);
	if (exitlwps(0) != 0)
		return (1);

	mutex_enter(&p->p_lock);
	if (p->p_ttime > 0) {
		/*
		 * Account any remaining ticks charged to this process
		 * on its way out.
		 */
		(void) task_cpu_time_incr(p->p_task, p->p_ttime);
		p->p_ttime = 0;
	}
	mutex_exit(&p->p_lock);

	DTRACE_PROC(lwp__exit);
	DTRACE_PROC1(exit, int, why);

	/*
	 * Will perform any brand specific proc exit processing, since this
	 * is always the last lwp, will also perform lwp_exit and free brand
	 * data
	 */
	if (PROC_IS_BRANDED(p)) {
		lwp_detach_brand_hdlrs(lwp);
		brand_clearbrand(p, B_FALSE);
	}

	/*
	 * Don't let init exit unless zone_start_init() failed its exec, or
	 * we are shutting down the zone or the machine.
	 *
	 * Since we are single threaded, we don't need to lock the
	 * following accesses to zone_proc_initpid.
	 */
	if (p->p_pid == z->zone_proc_initpid) {
		if (z->zone_boot_err == 0 &&
		    zone_status_get(z) < ZONE_IS_SHUTTING_DOWN &&
		    zone_status_get(global_zone) < ZONE_IS_SHUTTING_DOWN) {
			if (z->zone_restart_init == B_TRUE) {
				/*
				 * restart_init() takes its arguments in
				 * (what, why) order.
				 */
				if (restart_init(what, why) == 0)
					return (0);
			} else {
				(void) zone_kadmin(A_SHUTDOWN, AD_HALT, NULL,
				    CRED());
			}
		}

		/*
		 * Since we didn't or couldn't restart init, we clear
		 * the zone's init state and proceed with exit
		 * processing.
		 */
		z->zone_proc_initpid = -1;
	}

	lwp_pcb_exit();

	/*
	 * Allocate a sigqueue now, before we grab locks.
	 * It will be given to sigcld(), below.
	 * Special case: If we will be making the process disappear
	 * without a trace because it is either:
	 *	* an exiting SSYS process, or
	 *	* a posix_spawn() vfork child who requests it,
	 * we don't bother to allocate a useless sigqueue.
	 *
	 * sqp is therefore left unset when evaporate is true; its only use,
	 * the sigcld() call near the end of this function, is guarded by the
	 * same !evaporate test.
	 */
	evaporate = (p->p_flag & SSYS) || ((p->p_flag & SVFORK) &&
	    why == CLD_EXITED && what == _EVAPORATE);
	if (!evaporate)
		sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);

	/*
	 * revoke any doors created by the process.
	 */
	if (p->p_door_list)
		door_exit();

	/*
	 * Release schedctl data structures.
	 */
	if (p->p_pagep)
		schedctl_proc_cleanup();

	/*
	 * make sure all pending kaio has completed.
	 */
	if (p->p_aio)
		aio_cleanup_exit();

	/*
	 * discard the lwpchan cache.
	 */
	if (p->p_lcp != NULL)
		lwpchan_destroy_cache(0);

	/*
	 * Clean up any DTrace helper actions or probes for the process.
	 */
	if (p->p_dtrace_helpers != NULL) {
		ASSERT(dtrace_helpers_cleanup != NULL);
		(*dtrace_helpers_cleanup)(p);
	}

	/*
	 * Clean up any signalfd state for the process.
	 */
	if (p->p_sigfd != NULL) {
		VERIFY(sigfd_exit_helper != NULL);
		(*sigfd_exit_helper)();
	}

	/* untimeout the realtime timers */
	if (p->p_itimer != NULL)
		timer_exit();

	if ((tmp_id = p->p_alarmid) != 0) {
		p->p_alarmid = 0;
		(void) untimeout(tmp_id);
	}

	/*
	 * If we had generated any upanic(2) state, free that now.
	 */
	if (p->p_upanic != NULL) {
		kmem_free(p->p_upanic, PRUPANIC_BUFLEN);
		p->p_upanic = NULL;
	}

	/*
	 * Remove any fpollinfo_t's for this (last) thread from our file
	 * descriptors so closeall() can ASSERT() that they're all gone.
	 */
	pollcleanup();

	if (p->p_rprof_cyclic != CYCLIC_NONE) {
		mutex_enter(&cpu_lock);
		cyclic_remove(p->p_rprof_cyclic);
		mutex_exit(&cpu_lock);
	}

	mutex_enter(&p->p_lock);

	/*
	 * Clean up any DTrace probes associated with this process.
	 */
	if (p->p_dtrace_probes) {
		ASSERT(dtrace_fasttrap_exit_ptr != NULL);
		dtrace_fasttrap_exit_ptr(p);
	}

	/*
	 * p_lock is dropped around each untimeout() call, so p_itimerid is
	 * re-read after the lock is retaken in case it was set again.
	 */
	while ((tmp_id = p->p_itimerid) != 0) {
		p->p_itimerid = 0;
		mutex_exit(&p->p_lock);
		(void) untimeout(tmp_id);
		mutex_enter(&p->p_lock);
	}

	lwp_cleanup();

	/*
	 * We are about to exit; prevent our resource associations from
	 * being changed.
	 */
	pool_barrier_enter();

	/*
	 * Block the process against /proc now that we have really
	 * acquired p->p_lock (to manipulate p_tlist at least).
	 */
	prbarrier(p);

	sigfillset(&p->p_ignore);
	sigemptyset(&p->p_siginfo);
	sigemptyset(&p->p_sig);
	sigemptyset(&p->p_extsig);
	sigemptyset(&t->t_sig);
	sigemptyset(&t->t_extsig);
	sigemptyset(&p->p_sigmask);
	sigdelq(p, t, 0);
	lwp->lwp_cursig = 0;
	lwp->lwp_extsig = 0;
	p->p_flag &= ~(SKILLED | SEXTKILLED);
	if (lwp->lwp_curinfo) {
		siginfofree(lwp->lwp_curinfo);
		lwp->lwp_curinfo = NULL;
	}

	t->t_proc_flag |= TP_LWPEXIT;
	ASSERT(p->p_lwpcnt == 1 && p->p_zombcnt == 0);
	prlwpexit(t);		/* notify /proc */
	lwp_hash_out(p, t->t_tid);
	prexit(p);

	p->p_lwpcnt = 0;
	p->p_tlist = NULL;
	sigqfree(p);
	term_mstate(t);
	p->p_mterm = gethrtime();

	/*
	 * Detach the executable vnodes from the process while p_lock is
	 * held; they are closed and released further down.
	 */
	exec_vp = p->p_exec;
	execdir_vp = p->p_execdir;
	p->p_exec = NULLVP;
	p->p_execdir = NULLVP;
	mutex_exit(&p->p_lock);

	pr_free_watched_pages(p);

	closeall(P_FINFO(p));

	/* Free the controlling tty. (freectty() always assumes curproc.) */
	ASSERT(p == curproc);
	(void) freectty(B_TRUE);

#if defined(__sparc)
	if (p->p_utraps != NULL)
		utrap_free(p);
#endif
	if (p->p_semacct)			/* IPC semaphore exit */
		semexit(p);
	rv = wstat(why, what);

	acct(rv & 0xff);
	exacct_commit_proc(p, rv);

	/*
	 * Release any resources associated with C2 auditing
	 */
	if (AU_AUDITING()) {
		/*
		 * audit exit system call
		 */
		audit_exit(why, what);
	}

	/*
	 * Free address space.
	 */
	relvm();

	if (exec_vp) {
		/*
		 * Close this executable which has been opened when the process
		 * was created by getproc().
		 */
		(void) VOP_CLOSE(exec_vp, FREAD, 1, (offset_t)0, CRED(), NULL);
		VN_RELE(exec_vp);
	}
	if (execdir_vp)
		VN_RELE(execdir_vp);

	/*
	 * Release held contracts.
	 */
	contract_exit(p);

	/*
	 * Depart our encapsulating process contract.
	 */
	if ((p->p_flag & SSYS) == 0) {
		ASSERT(p->p_ct_process);
		contract_process_exit(p->p_ct_process, p, rv);
	}

	/*
	 * Remove pool association, and block if requested by pool_do_bind.
	 */
	mutex_enter(&p->p_lock);
	ASSERT(p->p_pool->pool_ref > 0);
	atomic_dec_32(&p->p_pool->pool_ref);
	p->p_pool = pool_default;
	/*
	 * Now that our address space has been freed and all other threads
	 * in this process have exited, set the PEXITED pool flag. This
	 * tells the pools subsystems to ignore this process if it was
	 * requested to rebind this process to a new pool.
	 */
	p->p_poolflag |= PEXITED;
	pool_barrier_exit();
	mutex_exit(&p->p_lock);

	mutex_enter(&pidlock);

	/*
	 * Delete this process from the newstate list of its parent. We
	 * will put it in the right place in the sigcld in the end.
	 */
	delete_ns(p->p_parent, p);

	/*
	 * Reassign the orphans to the next of kin.
	 * Don't rearrange init's orphanage.
	 */
	if ((q = p->p_orphan) != NULL && p != proc_init) {

		proc_t *nokp = p->p_nextofkin;

		/*
		 * Re-point every orphan at nokp; the loop leaves q on the
		 * last entry so that our list can be spliced in front of
		 * nokp's existing orphan list.
		 */
		for (;;) {
			q->p_nextofkin = nokp;
			if (q->p_nextorph == NULL)
				break;
			q = q->p_nextorph;
		}
		q->p_nextorph = nokp->p_orphan;
		nokp->p_orphan = p->p_orphan;
		p->p_orphan = NULL;
	}

	/*
	 * Reassign the children to init.
	 * Don't try to assign init's children to init.
	 */
	if ((q = p->p_child) != NULL && p != proc_init) {
		struct proc *np;
		struct proc *initp = proc_init;
		boolean_t setzonetop = B_FALSE;

		if (!INGLOBALZONE(curproc))
			setzonetop = B_TRUE;

		pgdetach(p);

		do {
			/* Save the next sibling before q is relinked. */
			np = q->p_sibling;
			/*
			 * Delete it from its current parent new state
			 * list and add it to init new state list
			 */
			delete_ns(q->p_parent, q);

			q->p_ppid = 1;
			q->p_pidflag &= ~(CLDNOSIGCHLD | CLDWAITPID);
			if (setzonetop) {
				mutex_enter(&q->p_lock);
				q->p_flag |= SZONETOP;
				mutex_exit(&q->p_lock);
			}
			q->p_parent = initp;

			/*
			 * Since q will be the first child,
			 * it will not have a previous sibling.
			 */
			q->p_psibling = NULL;
			if (initp->p_child) {
				initp->p_child->p_psibling = q;
			}
			q->p_sibling = initp->p_child;
			initp->p_child = q;
			if (q->p_proc_flag & P_PR_PTRACE) {
				mutex_enter(&q->p_lock);
				sigtoproc(q, NULL, SIGKILL);
				mutex_exit(&q->p_lock);
			}
			/*
			 * sigcld() will add the child to parents
			 * newstate list.
			 */
			if (q->p_stat == SZOMB)
				sigcld(q, NULL);
		} while ((q = np) != NULL);

		p->p_child = NULL;
		ASSERT(p->p_child_ns == NULL);
	}

	TRACE_1(TR_FAC_PROC, TR_PROC_EXIT, "proc_exit: %p", p);

	mutex_enter(&p->p_lock);
	CL_EXIT(curthread); /* tell the scheduler that curthread is exiting */

	/*
	 * Have our task accumulate our resource usage data before they
	 * become contaminated by p_cacct etc., and before we renounce
	 * membership of the task.
	 *
	 * We do this regardless of whether or not task accounting is active.
	 * This is to avoid having nonsense data reported for this task if
	 * task accounting is subsequently enabled. The overhead is minimal;
	 * by this point, this process has accounted for the usage of all its
	 * LWPs. We nonetheless do the work here, and under the protection of
	 * pidlock, so that the movement of the process's usage to the task
	 * happens at the same time as the removal of the process from the
	 * task, from the point of view of exacct_snapshot_task_usage().
	 */
	exacct_update_task_mstate(p);

	hrutime = mstate_aggr_state(p, LMS_USER);
	hrstime = mstate_aggr_state(p, LMS_SYSTEM);
	p->p_utime = (clock_t)NSEC_TO_TICK(hrutime) + p->p_cutime;
	p->p_stime = (clock_t)NSEC_TO_TICK(hrstime) + p->p_cstime;

	/* Fold the children's accumulated usage (p_cacct) into our own. */
	p->p_acct[LMS_USER] += p->p_cacct[LMS_USER];
	p->p_acct[LMS_SYSTEM] += p->p_cacct[LMS_SYSTEM];
	p->p_acct[LMS_TRAP] += p->p_cacct[LMS_TRAP];
	p->p_acct[LMS_TFAULT] += p->p_cacct[LMS_TFAULT];
	p->p_acct[LMS_DFAULT] += p->p_cacct[LMS_DFAULT];
	p->p_acct[LMS_KFAULT] += p->p_cacct[LMS_KFAULT];
	p->p_acct[LMS_USER_LOCK] += p->p_cacct[LMS_USER_LOCK];
	p->p_acct[LMS_SLEEP] += p->p_cacct[LMS_SLEEP];
	p->p_acct[LMS_WAIT_CPU] += p->p_cacct[LMS_WAIT_CPU];
	p->p_acct[LMS_STOPPED] += p->p_cacct[LMS_STOPPED];

	/* Likewise for the resource usage counters (p_cru into p_ru). */
	p->p_ru.minflt += p->p_cru.minflt;
	p->p_ru.majflt += p->p_cru.majflt;
	p->p_ru.nswap += p->p_cru.nswap;
	p->p_ru.inblock += p->p_cru.inblock;
	p->p_ru.oublock += p->p_cru.oublock;
	p->p_ru.msgsnd += p->p_cru.msgsnd;
	p->p_ru.msgrcv += p->p_cru.msgrcv;
	p->p_ru.nsignals += p->p_cru.nsignals;
	p->p_ru.nvcsw += p->p_cru.nvcsw;
	p->p_ru.nivcsw += p->p_cru.nivcsw;
	p->p_ru.sysc += p->p_cru.sysc;
	p->p_ru.ioch += p->p_cru.ioch;

	p->p_stat = SZOMB;
	p->p_proc_flag &= ~P_PR_PTRACE;
	p->p_wdata = what;
	p->p_wcode = (char)why;

	/* Captured here; released only after pidlock is dropped below. */
	cdir = PTOU(p)->u_cdir;
	rdir = PTOU(p)->u_rdir;
	cwd = PTOU(p)->u_cwd;

	ASSERT(cdir != NULL || p->p_parent == &p0);

	/*
	 * Release resource controls, as they are no longer enforceable.
	 */
	rctl_set_free(p->p_rctls);

	/*
	 * Decrement tk_nlwps counter for our task.max-lwps resource control.
	 * An extended accounting record, if that facility is active, is
	 * scheduled to be written. We cannot give up task and project
	 * membership at this point because that would allow zombies to escape
	 * from the max-processes resource controls. Zombies stay in their
	 * current task and project until the process table slot is released
	 * in freeproc().
	 */
	tk = p->p_task;

	mutex_enter(&p->p_zone->zone_nlwps_lock);
	tk->tk_nlwps--;
	tk->tk_proj->kpj_nlwps--;
	p->p_zone->zone_nlwps--;
	mutex_exit(&p->p_zone->zone_nlwps_lock);

	/*
	 * Clear the lwp directory and the lwpid hash table
	 * now that /proc can't bother us any more.
	 * We free the memory below, after dropping p->p_lock.
	 */
	lwpdir = p->p_lwpdir;
	lwpdir_sz = p->p_lwpdir_sz;
	tidhash = p->p_tidhash;
	tidhash_sz = p->p_tidhash_sz;
	ret_tidhash = p->p_ret_tidhash;
	p->p_lwpdir = NULL;
	p->p_lwpfree = NULL;
	p->p_lwpdir_sz = 0;
	p->p_tidhash = NULL;
	p->p_tidhash_sz = 0;
	p->p_ret_tidhash = NULL;

	/*
	 * If the process has context ops installed, call the exit routine
	 * on behalf of this last remaining thread. Normally exitpctx() is
	 * called during thread_exit() or lwp_exit(), but because this is the
	 * last thread in the process, we must call it here. By the time
	 * thread_exit() is called (below), the association with the relevant
	 * process has been lost.
	 *
	 * We also free the context here.
	 */
	if (p->p_pctx) {
		kpreempt_disable();
		exitpctx(p);
		kpreempt_enable();

		freepctx(p, 0);
	}

	/*
	 * curthread's proc pointer is changed to point to the 'sched'
	 * process for the corresponding zone, except in the case when
	 * the exiting process is in fact a zsched instance, in which
	 * case the proc pointer is set to p0. We do so, so that the
	 * process still points at the right zone when we call the VN_RELE()
	 * below.
	 *
	 * This is because curthread's original proc pointer can be freed as
	 * soon as the child sends a SIGCLD to its parent. We use zsched so
	 * that for user processes, even in the final moments of death, the
	 * process is still associated with its zone.
	 */
	if (p != t->t_procp->p_zone->zone_zsched)
		t->t_procp = t->t_procp->p_zone->zone_zsched;
	else
		t->t_procp = &p0;

	mutex_exit(&p->p_lock);
	if (!evaporate) {
		/* sqp was allocated above under this same condition. */
		p->p_pidflag &= ~CLDPEND;
		sigcld(p, sqp);
	} else {
		/*
		 * Do what sigcld() would do if the disposition
		 * of the SIGCHLD signal were set to be ignored.
		 */
		cv_broadcast(&p->p_srwchan_cv);
		freeproc(p);
	}
	mutex_exit(&pidlock);

	/*
	 * We don't release u_cdir and u_rdir until SZOMB is set.
	 * This protects us against dofusers().
	 */
	if (cdir)
		VN_RELE(cdir);
	if (rdir)
		VN_RELE(rdir);
	if (cwd)
		refstr_rele(cwd);

	/*
	 * task_rele() may ultimately cause the zone to go away (or
	 * may cause the last user process in a zone to go away, which
	 * signals zsched to go away). So prior to this call, we must
	 * no longer point at zsched.
	 */
	t->t_procp = &p0;

	/* Free the lwp directory and tid hash tables saved earlier. */
	kmem_free(lwpdir, lwpdir_sz * sizeof (lwpdir_t));
	kmem_free(tidhash, tidhash_sz * sizeof (tidhash_t));
	while (ret_tidhash != NULL) {
		ret_tidhash_t *next = ret_tidhash->rth_next;
		kmem_free(ret_tidhash->rth_tidhash,
		    ret_tidhash->rth_tidhash_sz * sizeof (tidhash_t));
		kmem_free(ret_tidhash, sizeof (*ret_tidhash));
		ret_tidhash = next;
	}

	thread_exit();
	/* NOTREACHED */
}

/*
 * Format siginfo structure for wait system calls.
931 */ 932 void 933 winfo(proc_t *pp, k_siginfo_t *ip, int waitflag) 934 { 935 ASSERT(MUTEX_HELD(&pidlock)); 936 937 bzero(ip, sizeof (k_siginfo_t)); 938 ip->si_signo = SIGCLD; 939 ip->si_code = pp->p_wcode; 940 ip->si_pid = pp->p_pid; 941 ip->si_ctid = PRCTID(pp); 942 ip->si_zoneid = pp->p_zone->zone_id; 943 ip->si_status = pp->p_wdata; 944 ip->si_stime = pp->p_stime; 945 ip->si_utime = pp->p_utime; 946 947 if (waitflag) { 948 pp->p_wcode = 0; 949 pp->p_wdata = 0; 950 pp->p_pidflag &= ~CLDPEND; 951 } 952 } 953 954 /* 955 * Wait system call. 956 * Search for a terminated (zombie) child, 957 * finally lay it to rest, and collect its status. 958 * Look also for stopped children, 959 * and pass back status from them. 960 */ 961 int 962 waitid(idtype_t idtype, id_t id, k_siginfo_t *ip, int options) 963 { 964 int found; 965 proc_t *cp, *pp; 966 int proc_gone; 967 int waitflag = !(options & WNOWAIT); 968 969 /* 970 * Obsolete flag, defined here only for binary compatibility 971 * with old statically linked executables. Delete this when 972 * we no longer care about these old and broken applications. 973 */ 974 #define _WNOCHLD 0400 975 options &= ~_WNOCHLD; 976 977 if (options == 0 || (options & ~WOPTMASK)) 978 return (EINVAL); 979 980 switch (idtype) { 981 case P_PID: 982 case P_PGID: 983 if (id < 0 || id >= maxpid) 984 return (EINVAL); 985 /* FALLTHROUGH */ 986 case P_ALL: 987 break; 988 default: 989 return (EINVAL); 990 } 991 992 pp = ttoproc(curthread); 993 994 /* 995 * lock parent mutex so that sibling chain can be searched. 996 */ 997 mutex_enter(&pidlock); 998 999 /* 1000 * if we are only looking for exited processes and child_ns list 1001 * is empty no reason to look at all children. 
1002 */ 1003 if (idtype == P_ALL && 1004 (options & ~WNOWAIT) == (WNOHANG | WEXITED) && 1005 pp->p_child_ns == NULL) { 1006 if (pp->p_child) { 1007 mutex_exit(&pidlock); 1008 bzero(ip, sizeof (k_siginfo_t)); 1009 return (0); 1010 } 1011 mutex_exit(&pidlock); 1012 return (ECHILD); 1013 } 1014 1015 while (pp->p_child != NULL) { 1016 1017 proc_gone = 0; 1018 1019 for (cp = pp->p_child_ns; cp != NULL; cp = cp->p_sibling_ns) { 1020 if (idtype != P_PID && (cp->p_pidflag & CLDWAITPID)) 1021 continue; 1022 if (idtype == P_PID && id != cp->p_pid) 1023 continue; 1024 if (idtype == P_PGID && id != cp->p_pgrp) 1025 continue; 1026 1027 switch (cp->p_wcode) { 1028 1029 case CLD_TRAPPED: 1030 case CLD_STOPPED: 1031 case CLD_CONTINUED: 1032 cmn_err(CE_PANIC, 1033 "waitid: wrong state %d on the p_newstate" 1034 " list", cp->p_wcode); 1035 break; 1036 1037 case CLD_EXITED: 1038 case CLD_DUMPED: 1039 case CLD_KILLED: 1040 if (!(options & WEXITED)) { 1041 /* 1042 * Count how many are already gone 1043 * for good. 1044 */ 1045 proc_gone++; 1046 break; 1047 } 1048 if (!waitflag) { 1049 winfo(cp, ip, 0); 1050 } else { 1051 winfo(cp, ip, 1); 1052 freeproc(cp); 1053 } 1054 mutex_exit(&pidlock); 1055 if (waitflag) { /* accept SIGCLD */ 1056 sigcld_delete(ip); 1057 sigcld_repost(); 1058 } 1059 return (0); 1060 } 1061 1062 if (idtype == P_PID) 1063 break; 1064 } 1065 1066 /* 1067 * Wow! None of the threads on the p_sibling_ns list were 1068 * interesting threads. Check all the kids! 
1069 */ 1070 found = 0; 1071 for (cp = pp->p_child; cp != NULL; cp = cp->p_sibling) { 1072 if (idtype == P_PID && id != cp->p_pid) 1073 continue; 1074 if (idtype == P_PGID && id != cp->p_pgrp) 1075 continue; 1076 1077 switch (cp->p_wcode) { 1078 case CLD_TRAPPED: 1079 if (!(options & WTRAPPED)) 1080 break; 1081 winfo(cp, ip, waitflag); 1082 mutex_exit(&pidlock); 1083 if (waitflag) { /* accept SIGCLD */ 1084 sigcld_delete(ip); 1085 sigcld_repost(); 1086 } 1087 return (0); 1088 1089 case CLD_STOPPED: 1090 if (!(options & WSTOPPED)) 1091 break; 1092 /* Is it still stopped? */ 1093 mutex_enter(&cp->p_lock); 1094 if (!jobstopped(cp)) { 1095 mutex_exit(&cp->p_lock); 1096 break; 1097 } 1098 mutex_exit(&cp->p_lock); 1099 winfo(cp, ip, waitflag); 1100 mutex_exit(&pidlock); 1101 if (waitflag) { /* accept SIGCLD */ 1102 sigcld_delete(ip); 1103 sigcld_repost(); 1104 } 1105 return (0); 1106 1107 case CLD_CONTINUED: 1108 if (!(options & WCONTINUED)) 1109 break; 1110 winfo(cp, ip, waitflag); 1111 mutex_exit(&pidlock); 1112 if (waitflag) { /* accept SIGCLD */ 1113 sigcld_delete(ip); 1114 sigcld_repost(); 1115 } 1116 return (0); 1117 1118 case CLD_EXITED: 1119 case CLD_DUMPED: 1120 case CLD_KILLED: 1121 if (idtype != P_PID && 1122 (cp->p_pidflag & CLDWAITPID)) 1123 continue; 1124 /* 1125 * Don't complain if a process was found in 1126 * the first loop but we broke out of the loop 1127 * because of the arguments passed to us. 1128 */ 1129 if (proc_gone == 0) { 1130 cmn_err(CE_PANIC, 1131 "waitid: wrong state on the" 1132 " p_child list"); 1133 } else { 1134 break; 1135 } 1136 } 1137 1138 found++; 1139 1140 if (idtype == P_PID) 1141 break; 1142 } 1143 1144 /* 1145 * If we found no interesting processes at all, 1146 * break out and return ECHILD. 
1147 */ 1148 if (found + proc_gone == 0) 1149 break; 1150 1151 if (options & WNOHANG) { 1152 mutex_exit(&pidlock); 1153 bzero(ip, sizeof (k_siginfo_t)); 1154 /* 1155 * We should set ip->si_signo = SIGCLD, 1156 * but there is an SVVS test that expects 1157 * ip->si_signo to be zero in this case. 1158 */ 1159 return (0); 1160 } 1161 1162 /* 1163 * If we found no processes of interest that could 1164 * change state while we wait, we don't wait at all. 1165 * Get out with ECHILD according to SVID. 1166 */ 1167 if (found == proc_gone) 1168 break; 1169 1170 if (!cv_wait_sig_swap(&pp->p_cv, &pidlock)) { 1171 mutex_exit(&pidlock); 1172 return (EINTR); 1173 } 1174 } 1175 mutex_exit(&pidlock); 1176 return (ECHILD); 1177 } 1178 1179 int 1180 waitsys(idtype_t idtype, id_t id, siginfo_t *infop, int options) 1181 { 1182 int error; 1183 k_siginfo_t info; 1184 1185 if (error = waitid(idtype, id, &info, options)) 1186 return (set_errno(error)); 1187 if (copyout(&info, infop, sizeof (k_siginfo_t))) 1188 return (set_errno(EFAULT)); 1189 return (0); 1190 } 1191 1192 #ifdef _SYSCALL32_IMPL 1193 1194 int 1195 waitsys32(idtype_t idtype, id_t id, siginfo_t *infop, int options) 1196 { 1197 int error; 1198 k_siginfo_t info; 1199 siginfo32_t info32; 1200 1201 if (error = waitid(idtype, id, &info, options)) 1202 return (set_errno(error)); 1203 siginfo_kto32(&info, &info32); 1204 if (copyout(&info32, infop, sizeof (info32))) 1205 return (set_errno(EFAULT)); 1206 return (0); 1207 } 1208 1209 #endif /* _SYSCALL32_IMPL */ 1210 1211 void 1212 proc_detach(proc_t *p) 1213 { 1214 proc_t *q; 1215 1216 ASSERT(MUTEX_HELD(&pidlock)); 1217 1218 q = p->p_parent; 1219 ASSERT(q != NULL); 1220 1221 /* 1222 * Take it off the newstate list of its parent 1223 */ 1224 delete_ns(q, p); 1225 1226 if (q->p_child == p) { 1227 q->p_child = p->p_sibling; 1228 /* 1229 * If the parent has no children, it better not 1230 * have any with new states either! 1231 */ 1232 ASSERT(q->p_child ? 
1 : q->p_child_ns == NULL); 1233 } 1234 1235 if (p->p_sibling) { 1236 p->p_sibling->p_psibling = p->p_psibling; 1237 } 1238 1239 if (p->p_psibling) { 1240 p->p_psibling->p_sibling = p->p_sibling; 1241 } 1242 } 1243 1244 /* 1245 * Remove zombie children from the process table. 1246 */ 1247 void 1248 freeproc(proc_t *p) 1249 { 1250 proc_t *q; 1251 task_t *tk; 1252 1253 ASSERT(p->p_stat == SZOMB); 1254 ASSERT(p->p_tlist == NULL); 1255 ASSERT(MUTEX_HELD(&pidlock)); 1256 1257 sigdelq(p, NULL, 0); 1258 if (p->p_killsqp) { 1259 siginfofree(p->p_killsqp); 1260 p->p_killsqp = NULL; 1261 } 1262 1263 prfree(p); /* inform /proc */ 1264 1265 /* 1266 * Don't free the init processes. 1267 * Other dying processes will access it. 1268 */ 1269 if (p == proc_init) 1270 return; 1271 1272 1273 /* 1274 * We wait until now to free the cred structure because a 1275 * zombie process's credentials may be examined by /proc. 1276 * No cred locking needed because there are no threads at this point. 1277 */ 1278 upcount_dec(crgetruid(p->p_cred), crgetzoneid(p->p_cred)); 1279 crfree(p->p_cred); 1280 if (p->p_corefile != NULL) { 1281 corectl_path_rele(p->p_corefile); 1282 p->p_corefile = NULL; 1283 } 1284 if (p->p_content != NULL) { 1285 corectl_content_rele(p->p_content); 1286 p->p_content = NULL; 1287 } 1288 1289 if (p->p_nextofkin && !((p->p_nextofkin->p_flag & SNOWAIT) || 1290 (PTOU(p->p_nextofkin)->u_signal[SIGCLD - 1] == SIG_IGN))) { 1291 /* 1292 * This should still do the right thing since p_utime/stime 1293 * get set to the correct value on process exit, so it 1294 * should get properly updated 1295 */ 1296 p->p_nextofkin->p_cutime += p->p_utime; 1297 p->p_nextofkin->p_cstime += p->p_stime; 1298 1299 p->p_nextofkin->p_cacct[LMS_USER] += p->p_acct[LMS_USER]; 1300 p->p_nextofkin->p_cacct[LMS_SYSTEM] += p->p_acct[LMS_SYSTEM]; 1301 p->p_nextofkin->p_cacct[LMS_TRAP] += p->p_acct[LMS_TRAP]; 1302 p->p_nextofkin->p_cacct[LMS_TFAULT] += p->p_acct[LMS_TFAULT]; 1303 
p->p_nextofkin->p_cacct[LMS_DFAULT] += p->p_acct[LMS_DFAULT]; 1304 p->p_nextofkin->p_cacct[LMS_KFAULT] += p->p_acct[LMS_KFAULT]; 1305 p->p_nextofkin->p_cacct[LMS_USER_LOCK] 1306 += p->p_acct[LMS_USER_LOCK]; 1307 p->p_nextofkin->p_cacct[LMS_SLEEP] += p->p_acct[LMS_SLEEP]; 1308 p->p_nextofkin->p_cacct[LMS_WAIT_CPU] 1309 += p->p_acct[LMS_WAIT_CPU]; 1310 p->p_nextofkin->p_cacct[LMS_STOPPED] += p->p_acct[LMS_STOPPED]; 1311 1312 p->p_nextofkin->p_cru.minflt += p->p_ru.minflt; 1313 p->p_nextofkin->p_cru.majflt += p->p_ru.majflt; 1314 p->p_nextofkin->p_cru.nswap += p->p_ru.nswap; 1315 p->p_nextofkin->p_cru.inblock += p->p_ru.inblock; 1316 p->p_nextofkin->p_cru.oublock += p->p_ru.oublock; 1317 p->p_nextofkin->p_cru.msgsnd += p->p_ru.msgsnd; 1318 p->p_nextofkin->p_cru.msgrcv += p->p_ru.msgrcv; 1319 p->p_nextofkin->p_cru.nsignals += p->p_ru.nsignals; 1320 p->p_nextofkin->p_cru.nvcsw += p->p_ru.nvcsw; 1321 p->p_nextofkin->p_cru.nivcsw += p->p_ru.nivcsw; 1322 p->p_nextofkin->p_cru.sysc += p->p_ru.sysc; 1323 p->p_nextofkin->p_cru.ioch += p->p_ru.ioch; 1324 1325 } 1326 1327 q = p->p_nextofkin; 1328 if (q && q->p_orphan == p) 1329 q->p_orphan = p->p_nextorph; 1330 else if (q) { 1331 for (q = q->p_orphan; q; q = q->p_nextorph) 1332 if (q->p_nextorph == p) 1333 break; 1334 ASSERT(q && q->p_nextorph == p); 1335 q->p_nextorph = p->p_nextorph; 1336 } 1337 1338 /* 1339 * The process table slot is being freed, so it is now safe to give up 1340 * task and project membership. 
1341 */ 1342 mutex_enter(&p->p_lock); 1343 tk = p->p_task; 1344 task_detach(p); 1345 mutex_exit(&p->p_lock); 1346 1347 proc_detach(p); 1348 pid_exit(p, tk); /* frees pid and proc structure */ 1349 1350 task_rele(tk); 1351 } 1352 1353 /* 1354 * Delete process "child" from the newstate list of process "parent" 1355 */ 1356 void 1357 delete_ns(proc_t *parent, proc_t *child) 1358 { 1359 proc_t **ns; 1360 1361 ASSERT(MUTEX_HELD(&pidlock)); 1362 ASSERT(child->p_parent == parent); 1363 for (ns = &parent->p_child_ns; *ns != NULL; ns = &(*ns)->p_sibling_ns) { 1364 if (*ns == child) { 1365 1366 ASSERT((*ns)->p_parent == parent); 1367 1368 *ns = child->p_sibling_ns; 1369 child->p_sibling_ns = NULL; 1370 return; 1371 } 1372 } 1373 } 1374 1375 /* 1376 * Add process "child" to the new state list of process "parent" 1377 */ 1378 void 1379 add_ns(proc_t *parent, proc_t *child) 1380 { 1381 ASSERT(child->p_sibling_ns == NULL); 1382 child->p_sibling_ns = parent->p_child_ns; 1383 parent->p_child_ns = child; 1384 } 1385