/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/user.h>
#include <sys/systm.h>
#include <sys/cpuvar.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/acct.h>
#include <sys/tuneable.h>
#include <sys/class.h>
#include <sys/kmem.h>
#include <sys/session.h>
#include <sys/ucontext.h>
#include <sys/stack.h>
#include <sys/procfs.h>
#include <sys/prsystm.h>
#include <sys/vmsystm.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/shm_impl.h>
#include <sys/door_data.h>
#include <vm/as.h>
#include <vm/rm.h>
#include <c2/audit.h>
#include <sys/var.h>
#include <sys/schedctl.h>
#include <sys/utrap.h>
#include <sys/task.h>
#include <sys/resource.h>
#include <sys/cyclic.h>
#include <sys/lgrp.h>
#include <sys/rctl.h>
#include <sys/contract_impl.h>
#include <sys/contract/process_impl.h>
#include <sys/list.h>
#include <sys/dtrace.h>
#include <sys/pool.h>
#include <sys/zone.h>
#include <sys/sdt.h>
#include <sys/class.h>
#include <sys/corectl.h>

static int64_t cfork(int, int);
static int getproc(proc_t **, int);
static void fork_fail(proc_t *);
static void forklwp_fail(proc_t *);

int fork_fail_pending;

extern struct kmem_cache *process_cache;

/*
 * forkall system call.
 */
int64_t
forkall(void)
{
	return (cfork(0, 0));
}
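
/*
 * All three fork flavors funnel into cfork(isvfork, isfork1):
 *
 *	forkall()	cfork(0, 0)	duplicate every lwp in the process
 *	vfork()		cfork(1, 1)	borrow the parent's address space,
 *					fork1-style lwp semantics
 *	fork1()		cfork(0, 1)	duplicate only the calling lwp
 */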

/*
 * The parent is stopped until the child invokes relvm().
 */
int64_t
vfork(void)
{
	curthread->t_post_sys = 1;	/* so vfwait() will be called */
	return (cfork(1, 1));
}

/*
 * fork1 system call
 */
int64_t
fork1(void)
{
	return (cfork(0, 1));
}

/* ARGSUSED */
static int64_t
cfork(int isvfork, int isfork1)
{
	proc_t *p = ttoproc(curthread);
	struct as *as;
	proc_t *cp, **orphpp;
	klwp_t *clone;
	kthread_t *t;
	task_t *tk;
	rval_t	r;
	int error;
	int i;
	rctl_set_t *dup_set;
	rctl_alloc_gp_t *dup_gp;
	rctl_entity_p_t e;
	lwpdir_t *ldp;
	lwpent_t *lep;
	lwpent_t *clep;

	/*
	 * fork is not supported for the /proc agent lwp.
	 */
	if (curthread == p->p_agenttp) {
		error = ENOTSUP;
		goto forkerr;
	}

	if ((error = secpolicy_basic_fork(CRED())) != 0)
		goto forkerr;

	/*
	 * If the calling lwp is doing a fork1() then the
	 * other lwps in this process are not duplicated and
	 * don't need to be held where their kernel stacks can be
	 * cloned.  If doing forkall(), the process is held with
	 * SHOLDFORK, so that the lwps are at a point where their
	 * stacks can be copied which is on entry or exit from
	 * the kernel.
	 */
	if (!holdlwps(isfork1 ? SHOLDFORK1 : SHOLDFORK)) {
		aston(curthread);
		error = EINTR;
		goto forkerr;
	}

#if defined(__sparc)
	/*
	 * Ensure that the user stack is fully constructed
	 * before creating the child process structure.
	 */
	(void) flush_user_windows_to_stack(NULL);
#endif

	/*
	 * Prevent our resource set associations from being changed during fork.
	 */
	mutex_enter(&p->p_lock);
	pool_barrier_enter();
	mutex_exit(&p->p_lock);

	/*
	 * Create a child proc struct.  Place a VN_HOLD on appropriate vnodes.
	 */
	if (getproc(&cp, 0) < 0) {
		mutex_enter(&p->p_lock);
		pool_barrier_exit();
		continuelwps(p);
		mutex_exit(&p->p_lock);
		error = EAGAIN;
		goto forkerr;
	}

	TRACE_2(TR_FAC_PROC, TR_PROC_FORK, "proc_fork:cp %p p %p", cp, p);

	/*
	 * Assign an address space to child
	 */
	if (isvfork) {
		/*
		 * Clear any watched areas and remember the
		 * watched pages for restoring in vfwait().
		 */
		as = p->p_as;
		if (avl_numnodes(&as->a_wpage) != 0) {
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
			as_clearwatch(as);
			p->p_wpage = as->a_wpage;
			avl_create(&as->a_wpage, wp_compare,
			    sizeof (struct watched_page),
			    offsetof(struct watched_page, wp_link));
			AS_LOCK_EXIT(as, &as->a_lock);
		}
		cp->p_as = as;
		cp->p_flag |= SVFORK;
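
		/*
		 * The child now runs on the parent's address space.
		 * The p_wpage stash is moved back into the address
		 * space by relvm() and the watchpoints are re-armed
		 * via as_setwatch() in vfwait().
		 */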
	} else {
		/*
		 * We need to hold P_PR_LOCK until the address space has
		 * been duplicated and we've had a chance to remove from the
		 * child any DTrace probes that were in the parent.  Holding
		 * P_PR_LOCK prevents any new probes from being added and
		 * any extant probes from being removed.
		 */
		mutex_enter(&p->p_lock);
		sprlock_proc(p);
		mutex_exit(&p->p_lock);

		error = as_dup(p->p_as, &cp->p_as);
		if (error != 0) {
			fork_fail(cp);
			mutex_enter(&pidlock);
			orphpp = &p->p_orphan;
			while (*orphpp != cp)
				orphpp = &(*orphpp)->p_nextorph;
			*orphpp = cp->p_nextorph;
			ASSERT(p->p_child == cp);
			p->p_child = cp->p_sibling;
			if (p->p_child) {
				p->p_child->p_psibling = NULL;
			}
			mutex_enter(&cp->p_lock);
			tk = cp->p_task;
			task_detach(cp);
			ASSERT(cp->p_pool->pool_ref > 0);
			atomic_add_32(&cp->p_pool->pool_ref, -1);
			mutex_exit(&cp->p_lock);
			pid_exit(cp);
			mutex_exit(&pidlock);
			task_rele(tk);

			mutex_enter(&p->p_lock);
			pool_barrier_exit();
			continuelwps(p);
			sprunlock(p);
			/*
			 * Preserve ENOMEM error condition but
			 * map all others to EAGAIN.
			 */
			error = (error == ENOMEM) ? ENOMEM : EAGAIN;
			goto forkerr;
		}
		/* Duplicate parent's shared memory */
		if (p->p_segacct)
			shmfork(p, cp);

		if (p->p_dtrace_helpers != NULL) {
			ASSERT(dtrace_helpers_fork != NULL);
			(*dtrace_helpers_fork)(p, cp);
		}

		/*
		 * Remove all DTrace tracepoints from the child process.
		 */
		mutex_enter(&p->p_lock);
		if (p->p_dtrace_count > 0)
			dtrace_fasttrap_fork(p, cp);
		sprunlock(p);
	}

	/*
	 * Duplicate parent's resource controls.
	 */
	dup_set = rctl_set_create();
	for (;;) {
		dup_gp = rctl_set_dup_prealloc(p->p_rctls);
		mutex_enter(&p->p_rctls->rcs_lock);
		if (rctl_set_dup_ready(p->p_rctls, dup_gp))
			break;
		mutex_exit(&p->p_rctls->rcs_lock);
		rctl_prealloc_destroy(dup_gp);
	}
	e.rcep_p.proc = cp;
	e.rcep_t = RCENTITY_PROCESS;
	cp->p_rctls = rctl_set_dup(p->p_rctls, p, cp, &e, dup_set, dup_gp,
	    RCD_DUP | RCD_CALLBACK);
	mutex_exit(&p->p_rctls->rcs_lock);

	rctl_prealloc_destroy(dup_gp);
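
	/*
	 * The loop above is a prealloc-then-verify pattern: the
	 * duplication group is preallocated outside rcs_lock (the
	 * allocation may sleep), then rechecked under the lock; if it
	 * no longer suffices for the parent's set, it is destroyed
	 * and re-sized.  rcs_lock is held across rctl_set_dup()
	 * itself.
	 */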

	/*
	 * Allocate the child's lwp directory and lwpid hash table.
	 */
	if (isfork1)
		cp->p_lwpdir_sz = 2;
	else
		cp->p_lwpdir_sz = p->p_lwpdir_sz;
	cp->p_lwpdir = cp->p_lwpfree = ldp =
	    kmem_zalloc(cp->p_lwpdir_sz * sizeof (lwpdir_t), KM_SLEEP);
	for (i = 1; i < cp->p_lwpdir_sz; i++, ldp++)
		ldp->ld_next = ldp + 1;
	cp->p_tidhash_sz = (cp->p_lwpdir_sz + 2) / 2;
	cp->p_tidhash =
	    kmem_zalloc(cp->p_tidhash_sz * sizeof (lwpdir_t *), KM_SLEEP);

	/*
	 * Duplicate parent's lwps.
	 * Mutual exclusion is not needed because the process is
	 * in the hold state and only the current lwp is running.
	 */
	klgrpset_clear(cp->p_lgrpset);
	if (isfork1) {
		clone = forklwp(ttolwp(curthread), cp, curthread->t_tid);
		if (clone == NULL)
			goto forklwperr;
		/*
		 * Inherit only the lwp_wait()able flag.
		 * Daemon threads should not call fork1(), but oh well...
		 */
		lwptot(clone)->t_proc_flag |=
		    (curthread->t_proc_flag & TP_TWAIT);
	} else {
		/* this is forkall(), no one can be in lwp_wait() */
		ASSERT(p->p_lwpwait == 0 && p->p_lwpdwait == 0);
		/* for each entry in the parent's lwp directory... */
		for (i = 0, ldp = p->p_lwpdir; i < p->p_lwpdir_sz; i++, ldp++) {
			klwp_t *clwp;
			kthread_t *ct;

			if ((lep = ldp->ld_entry) == NULL)
				continue;

			if ((t = lep->le_thread) != NULL) {
				clwp = forklwp(ttolwp(t), cp, t->t_tid);
				if (clwp == NULL)
					goto forklwperr;
				ct = lwptot(clwp);
				/*
				 * Inherit lwp_wait()able and daemon flags.
				 */
				ct->t_proc_flag |=
				    (t->t_proc_flag & (TP_TWAIT|TP_DAEMON));
				/*
				 * Keep track of the clone of curthread to
				 * post return values through lwp_setrval().
				 * Mark other threads for special treatment
				 * by lwp_rtt() / post_syscall().
				 */
				if (t == curthread)
					clone = clwp;
				else
					ct->t_flag |= T_FORKALL;
			} else {
				/*
				 * Replicate zombie lwps in the child.
				 */
				clep = kmem_zalloc(sizeof (*clep), KM_SLEEP);
				clep->le_lwpid = lep->le_lwpid;
				clep->le_start = lep->le_start;
				lwp_hash_in(cp, clep);
			}
		}
	}

	/*
	 * Put new process in the parent's process contract, or put it
	 * in a new one if there is an active process template.  Send a
	 * fork event (if requested) to whatever contract the child is
	 * a member of.  Fails if the parent has been SIGKILLed.
	 */
	if (contract_process_fork(NULL, cp, p, B_TRUE) == NULL)
		goto forklwperr;

	/*
	 * No fork failures occur beyond this point.
	 */

	cp->p_lwpid = p->p_lwpid;
	if (!isfork1) {
		cp->p_lwpdaemon = p->p_lwpdaemon;
		cp->p_zombcnt = p->p_zombcnt;
		/*
		 * If the parent's lwp ids have wrapped around, so have the
		 * child's.
		 */
		cp->p_flag |= p->p_flag & SLWPWRAP;
	}

	corectl_path_hold(cp->p_corefile = p->p_corefile);
	corectl_content_hold(cp->p_content = p->p_content);

#if defined(__x86)
	/*
	 * Get the right ldt descr for the child.
	 */
	(void) ldt_dup(p, cp);
#endif

#ifdef __sparc
	utrap_dup(p, cp);
#endif
	/*
	 * If the child process has been marked to stop on exit
	 * from this fork, arrange for all other lwps to stop in
	 * sympathy with the active lwp.
	 */
	if (PTOU(cp)->u_systrap &&
	    prismember(&PTOU(cp)->u_exitmask, curthread->t_sysnum)) {
		mutex_enter(&cp->p_lock);
		t = cp->p_tlist;
		do {
			t->t_proc_flag |= TP_PRSTOP;
			aston(t);	/* so TP_PRSTOP will be seen */
		} while ((t = t->t_forw) != cp->p_tlist);
		mutex_exit(&cp->p_lock);
	}
	/*
	 * If the parent process has been marked to stop on exit
	 * from this fork, and its asynchronous-stop flag has not
	 * been set, arrange for all other lwps to stop before
	 * they return back to user level.
	 */
	if (!(p->p_proc_flag & P_PR_ASYNC) && PTOU(p)->u_systrap &&
	    prismember(&PTOU(p)->u_exitmask, curthread->t_sysnum)) {
		mutex_enter(&p->p_lock);
		t = p->p_tlist;
		do {
			t->t_proc_flag |= TP_PRSTOP;
			aston(t);	/* so TP_PRSTOP will be seen */
		} while ((t = t->t_forw) != p->p_tlist);
		mutex_exit(&p->p_lock);
	}

	/* set return values for child */
	lwp_setrval(clone, p->p_pid, 1);

	/* set return values for parent */
	r.r_val1 = (int)cp->p_pid;
	r.r_val2 = 0;
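
	/*
	 * Both sides return through r_vals, which packs (r_val1,
	 * r_val2) into a single int64_t: the parent gets
	 * (child pid, 0) here, while lwp_setrval() above arranged
	 * for the child to get (parent pid, 1).  The userland fork
	 * wrappers use r_val2 to tell the two apart and return 0
	 * in the child.
	 */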

	/*
	 * pool_barrier_exit() can now be called because the child process has:
	 * - all identifying features cloned or set (p_pid, p_task, p_pool)
	 * - all resource sets associated (p_tlist->*->t_cpupart, p_as->a_mset)
	 * - any other fields set which are used in resource set binding.
	 */
	mutex_enter(&p->p_lock);
	pool_barrier_exit();
	mutex_exit(&p->p_lock);

	mutex_enter(&pidlock);
	mutex_enter(&cp->p_lock);

	/*
	 * Now that there are lwps and threads attached, add the new
	 * process to the process group.
	 */
	pgjoin(cp, p->p_pgidp);
	cp->p_stat = SRUN;
	/*
	 * We are now done with all the lwps in the child process.
	 */
	t = cp->p_tlist;
	do {
		/*
		 * Set the lwp_suspend()ed lwps running.
		 * They will suspend properly at syscall exit.
		 */
		if (t->t_proc_flag & TP_HOLDLWP)
			lwp_create_done(t);
		else {
			/* set TS_CREATE to allow continuelwps() to work */
			thread_lock(t);
			ASSERT(t->t_state == TS_STOPPED &&
			    !(t->t_schedflag & (TS_CREATE|TS_CSTART)));
			t->t_schedflag |= TS_CREATE;
			thread_unlock(t);
		}
	} while ((t = t->t_forw) != cp->p_tlist);
	mutex_exit(&cp->p_lock);

	if (isvfork) {
		CPU_STATS_ADDQ(CPU, sys, sysvfork, 1);
		mutex_enter(&p->p_lock);
		p->p_flag |= SVFWAIT;
		DTRACE_PROC1(create, proc_t *, cp);
		cv_broadcast(&pr_pid_cv[p->p_slot]);	/* inform /proc */
		mutex_exit(&p->p_lock);
		/*
		 * Grab child's p_lock before dropping pidlock to ensure
		 * the process will not disappear before we set it running.
		 */
		mutex_enter(&cp->p_lock);
		mutex_exit(&pidlock);
		sigdefault(cp);
		continuelwps(cp);
		mutex_exit(&cp->p_lock);
	} else {
		CPU_STATS_ADDQ(CPU, sys, sysfork, 1);
		DTRACE_PROC1(create, proc_t *, cp);
		/*
		 * It is CL_FORKRET's job to drop pidlock.
		 * If we do it here, the process could be set running
		 * and disappear before CL_FORKRET() is called.
		 */
		CL_FORKRET(curthread, cp->p_tlist);
		ASSERT(MUTEX_NOT_HELD(&pidlock));
	}

	return (r.r_vals);
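
/*
 * Error unwind: the code below tears down, in roughly the reverse
 * order of construction, whatever cfork() had built before the
 * failure: the child's address space (or the parent's watchpoint
 * stash for vfork), the lwp directory and tid hash, any lwps already
 * cloned, the resource controls, the task and pool references, and
 * finally the pid itself.
 */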
forklwperr:
	if (isvfork) {
		if (avl_numnodes(&p->p_wpage) != 0) {
			/* restore watchpoints to parent */
			as = p->p_as;
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
			as->a_wpage = p->p_wpage;
			avl_create(&p->p_wpage, wp_compare,
			    sizeof (struct watched_page),
			    offsetof(struct watched_page, wp_link));
			as_setwatch(as);
			AS_LOCK_EXIT(as, &as->a_lock);
		}
	} else {
		if (cp->p_segacct)
			shmexit(cp);
		as = cp->p_as;
		cp->p_as = &kas;
		as_free(as);
	}

	if (cp->p_lwpdir) {
		for (i = 0, ldp = cp->p_lwpdir; i < cp->p_lwpdir_sz; i++, ldp++)
			if ((lep = ldp->ld_entry) != NULL)
				kmem_free(lep, sizeof (*lep));
		kmem_free(cp->p_lwpdir,
		    cp->p_lwpdir_sz * sizeof (*cp->p_lwpdir));
	}
	cp->p_lwpdir = NULL;
	cp->p_lwpfree = NULL;
	cp->p_lwpdir_sz = 0;

	if (cp->p_tidhash)
		kmem_free(cp->p_tidhash,
		    cp->p_tidhash_sz * sizeof (*cp->p_tidhash));
	cp->p_tidhash = NULL;
	cp->p_tidhash_sz = 0;

	forklwp_fail(cp);
	fork_fail(cp);
	rctl_set_free(cp->p_rctls);
	mutex_enter(&pidlock);

	/*
	 * Detach failed child from task.
	 */
	mutex_enter(&cp->p_lock);
	tk = cp->p_task;
	task_detach(cp);
	ASSERT(cp->p_pool->pool_ref > 0);
	atomic_add_32(&cp->p_pool->pool_ref, -1);
	mutex_exit(&cp->p_lock);

	orphpp = &p->p_orphan;
	while (*orphpp != cp)
		orphpp = &(*orphpp)->p_nextorph;
	*orphpp = cp->p_nextorph;
	ASSERT(p->p_child == cp);
	p->p_child = cp->p_sibling;
	if (p->p_child) {
		p->p_child->p_psibling = NULL;
	}
	pid_exit(cp);
	mutex_exit(&pidlock);

	task_rele(tk);

	mutex_enter(&p->p_lock);
	pool_barrier_exit();
	continuelwps(p);
	mutex_exit(&p->p_lock);
	error = EAGAIN;
forkerr:
	return ((int64_t)set_errno(error));
}

/*
 * Free allocated resources from getproc() if a fork failed.
 */
static void
fork_fail(proc_t *cp)
{
	uf_info_t *fip = P_FINFO(cp);

	fcnt_add(fip, -1);
	sigdelq(cp, NULL, 0);

	mutex_enter(&pidlock);
	upcount_dec(crgetruid(cp->p_cred), crgetzoneid(cp->p_cred));
	mutex_exit(&pidlock);

	/*
	 * single threaded, so no locking needed here
	 */
	crfree(cp->p_cred);

	kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));

	VN_RELE(u.u_cdir);
	if (u.u_rdir)
		VN_RELE(u.u_rdir);
	if (cp->p_exec)
		VN_RELE(cp->p_exec);
	if (cp->p_execdir)
		VN_RELE(cp->p_execdir);
	if (u.u_cwd)
		refstr_rele(u.u_cwd);
}
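
/*
 * Everything released in fork_fail() balances a hold taken in
 * getproc(): the file_t counts (fcnt_add), the per-user process
 * count (upcount_dec vs. upcount_inc), the credential (crfree vs.
 * crhold), and the vnode and refstr holds on u_cdir, u_rdir,
 * p_exec, p_execdir and u_cwd.
 */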

/*
 * Clean up the lwps already created for this child process.
 * The fork failed while duplicating all the lwps of the parent
 * and those lwps already created must be freed.
 * This process is invisible to the rest of the system,
 * so we don't need to hold p->p_lock to protect the list.
 */
static void
forklwp_fail(proc_t *p)
{
	kthread_t *t;
	task_t *tk;

	while ((t = p->p_tlist) != NULL) {
		/*
		 * First remove the lwp from the process's p_tlist.
		 */
		if (t != t->t_forw)
			p->p_tlist = t->t_forw;
		else
			p->p_tlist = NULL;
		p->p_lwpcnt--;
		t->t_forw->t_back = t->t_back;
		t->t_back->t_forw = t->t_forw;

		tk = p->p_task;
		mutex_enter(&p->p_zone->zone_nlwps_lock);
		tk->tk_nlwps--;
		tk->tk_proj->kpj_nlwps--;
		p->p_zone->zone_nlwps--;
		mutex_exit(&p->p_zone->zone_nlwps_lock);

		ASSERT(t->t_schedctl == NULL);

		if (t->t_door != NULL) {
			kmem_free(t->t_door, sizeof (door_data_t));
			t->t_door = NULL;
		}
		lwp_ctmpl_clear(ttolwp(t));

		/*
		 * Remove the thread from the all threads list.
		 * We need to hold pidlock for this.
		 */
		mutex_enter(&pidlock);
		t->t_next->t_prev = t->t_prev;
		t->t_prev->t_next = t->t_next;
		CL_EXIT(t);	/* tell the scheduler that we're exiting */
		cv_broadcast(&t->t_joincv);	/* tell anyone in thread_join */
		mutex_exit(&pidlock);

		/*
		 * Let the lgroup load averages know that this thread isn't
		 * going to show up (i.e. un-do what was done on behalf of
		 * this thread by the earlier lgrp_move_thread()).
		 */
		kpreempt_disable();
		lgrp_move_thread(t, NULL, 1);
		kpreempt_enable();

		/*
		 * The thread was created TS_STOPPED.
		 * We change it to TS_FREE to avoid an
		 * ASSERT() panic in thread_free().
		 */
		t->t_state = TS_FREE;
		thread_rele(t);
		thread_free(t);
	}
}

extern struct as kas;

/*
 * fork a kernel process.
 */
int
newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct)
{
	proc_t *p;
	struct user *up;
	klwp_t *lwp;
	cont_process_t *ctp = NULL;
	rctl_entity_p_t e;

	ASSERT(!(cid == syscid && ct != NULL));
	if (cid == syscid) {
		rctl_alloc_gp_t *init_gp;
		rctl_set_t *init_set;

		if (getproc(&p, 1) < 0)
			return (EAGAIN);

		p->p_flag |= SNOWAIT;
		p->p_exec = NULL;
		p->p_execdir = NULL;

		init_set = rctl_set_create();
		init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);

		/*
		 * kernel processes do not inherit /proc tracing flags.
		 */
		sigemptyset(&p->p_sigmask);
		premptyset(&p->p_fltmask);
		up = PTOU(p);
		up->u_systrap = 0;
		premptyset(&(up->u_entrymask));
		premptyset(&(up->u_exitmask));
		mutex_enter(&p->p_lock);
		e.rcep_p.proc = p;
		e.rcep_t = RCENTITY_PROCESS;
		p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
		    init_gp);
		mutex_exit(&p->p_lock);

		rctl_prealloc_destroy(init_gp);
	} else {
		rctl_alloc_gp_t *init_gp, *default_gp;
		rctl_set_t *init_set;
		task_t *tk, *tk_old;

		if (getproc(&p, 0) < 0)
			return (EAGAIN);
		/*
		 * init creates a new task, distinct from the task
		 * containing kernel "processes".
		 */
		tk = task_create(0, p->p_zone);
		mutex_enter(&tk->tk_zone->zone_nlwps_lock);
		tk->tk_proj->kpj_ntasks++;
		mutex_exit(&tk->tk_zone->zone_nlwps_lock);

		default_gp = rctl_rlimit_set_prealloc(RLIM_NLIMITS);
		init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
		init_set = rctl_set_create();

		mutex_enter(&pidlock);
		mutex_enter(&p->p_lock);
		tk_old = p->p_task;	/* switch to new task */

		task_detach(p);
		task_begin(tk, p);
		mutex_exit(&pidlock);

		e.rcep_p.proc = p;
		e.rcep_t = RCENTITY_PROCESS;
		p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
		    init_gp);
		rctlproc_default_init(p, default_gp);
		mutex_exit(&p->p_lock);

		task_rele(tk_old);
		rctl_prealloc_destroy(default_gp);
		rctl_prealloc_destroy(init_gp);
	}

	p->p_as = &kas;

#if defined(__x86)
	(void) ldt_dup(&p0, p);	/* Get the default ldt descr */
#endif

	if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
	    &curthread->t_hold, cid, 1)) == NULL) {
		task_t *tk;
		fork_fail(p);
		mutex_enter(&pidlock);
		mutex_enter(&p->p_lock);
		tk = p->p_task;
		task_detach(p);
		ASSERT(p->p_pool->pool_ref > 0);
		atomic_add_32(&p->p_pool->pool_ref, -1);
		mutex_exit(&p->p_lock);
		pid_exit(p);
		mutex_exit(&pidlock);
		task_rele(tk);

		return (EAGAIN);
	}

	if (cid != syscid) {
		ctp = contract_process_fork(sys_process_tmpl, p, curproc,
		    B_FALSE);
		ASSERT(ctp != NULL);
		if (ct != NULL)
			*ct = &ctp->conp_contract;
	}

	p->p_lwpid = 1;
	mutex_enter(&pidlock);
	pgjoin(p, curproc->p_pgidp);
	p->p_stat = SRUN;
	mutex_enter(&p->p_lock);
	lwptot(lwp)->t_proc_flag &= ~TP_HOLDLWP;
	lwp_create_done(lwptot(lwp));
	mutex_exit(&p->p_lock);
	mutex_exit(&pidlock);
	return (0);
}
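
/*
 * Sketch of a typical newproc() invocation (the pageout and fsflush
 * daemons are started from main() in roughly this way):
 *
 *	if (newproc(pageout, NULL, syscid, maxclsyspri - 1, NULL))
 *		panic("unable to fork pageout()");
 */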

/*
 * create a child proc struct.
 */
static int
getproc(proc_t **cpp, int kernel)
{
	proc_t *pp, *cp;
	pid_t newpid;
	struct user *uarea;
	extern uint_t nproc;
	struct cred *cr;
	uid_t ruid;
	zoneid_t zoneid;

	if (!page_mem_avail(tune.t_minarmem))
		return (-1);
	if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
		return (-1);	/* no point in starting new processes */

	pp = curproc;
	cp = kmem_cache_alloc(process_cache, KM_SLEEP);
	bzero(cp, sizeof (proc_t));

	/*
	 * Make proc entry for child process
	 */
	mutex_init(&cp->p_crlock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&cp->p_pflock, NULL, MUTEX_DEFAULT, NULL);
#if defined(__x86)
	mutex_init(&cp->p_ldtlock, NULL, MUTEX_DEFAULT, NULL);
#endif
	mutex_init(&cp->p_maplock, NULL, MUTEX_DEFAULT, NULL);
	cp->p_stat = SIDL;
	cp->p_mstart = gethrtime();

	if ((newpid = pid_assign(cp)) == -1) {
		if (nproc == v.v_proc) {
			CPU_STATS_ADDQ(CPU, sys, procovf, 1);
			cmn_err(CE_WARN, "out of processes");
		}
		goto bad;
	}

	/*
	 * If not privileged make sure that this user hasn't exceeded
	 * v.v_maxup processes, and that users collectively haven't
	 * exceeded v.v_maxupttl processes.
	 */
	mutex_enter(&pidlock);
	ASSERT(nproc < v.v_proc);	/* otherwise how'd we get our pid? */
	cr = CRED();
	ruid = crgetruid(cr);
	zoneid = crgetzoneid(cr);
	if (nproc >= v.v_maxup && 	/* short-circuit; usually false */
	    (nproc >= v.v_maxupttl ||
	    upcount_get(ruid, zoneid) >= v.v_maxup) &&
	    secpolicy_newproc(cr) != 0) {
		mutex_exit(&pidlock);
		zcmn_err(zoneid, CE_NOTE,
		    "out of per-user processes for uid %d", ruid);
		goto bad;
	}
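
	/*
	 * Privileged callers are exempt from the limits above:
	 * secpolicy_newproc() returns 0 for a sufficiently privileged
	 * credential, which falsifies the compound condition no
	 * matter how many processes already exist.
	 */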

	/*
	 * Everything is cool, put the new proc on the active process list.
	 * It is already on the pid list and in /proc.
	 * Increment the per uid process count (upcount).
	 */
	nproc++;
	upcount_inc(ruid, zoneid);

	cp->p_next = practive;
	practive->p_prev = cp;
	practive = cp;

	cp->p_ignore = pp->p_ignore;
	cp->p_siginfo = pp->p_siginfo;
	cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
	cp->p_sessp = pp->p_sessp;
	SESS_HOLD(pp->p_sessp);
	cp->p_exec = pp->p_exec;
	cp->p_execdir = pp->p_execdir;
	cp->p_zone = pp->p_zone;

	cp->p_bssbase = pp->p_bssbase;
	cp->p_brkbase = pp->p_brkbase;
	cp->p_brksize = pp->p_brksize;
	cp->p_brkpageszc = pp->p_brkpageszc;
	cp->p_stksize = pp->p_stksize;
	cp->p_stkpageszc = pp->p_stkpageszc;
	cp->p_stkprot = pp->p_stkprot;
	cp->p_datprot = pp->p_datprot;
	cp->p_usrstack = pp->p_usrstack;
	cp->p_model = pp->p_model;
	cp->p_ppid = pp->p_pid;
	cp->p_ancpid = pp->p_pid;
	cp->p_portcnt = pp->p_portcnt;

	/*
	 * Initialize watchpoint structures
	 */
	avl_create(&cp->p_warea, wa_compare, sizeof (struct watched_area),
	    offsetof(struct watched_area, wa_link));

	/*
	 * Initialize immediate resource control values.
	 */
	cp->p_stk_ctl = pp->p_stk_ctl;
	cp->p_fsz_ctl = pp->p_fsz_ctl;
	cp->p_vmem_ctl = pp->p_vmem_ctl;
	cp->p_fno_ctl = pp->p_fno_ctl;

	/*
	 * Link up to parent-child-sibling chain.  No need to lock
	 * in general since only a call to freeproc() (done by the
	 * same parent as newproc()) diddles with the child chain.
	 */
	cp->p_sibling = pp->p_child;
	if (pp->p_child)
		pp->p_child->p_psibling = cp;

	cp->p_parent = pp;
	pp->p_child = cp;

	cp->p_child_ns = NULL;
	cp->p_sibling_ns = NULL;

	cp->p_nextorph = pp->p_orphan;
	cp->p_nextofkin = pp;
	pp->p_orphan = cp;

	/*
	 * Inherit profiling state; do not inherit REALPROF profiling state.
	 */
	cp->p_prof = pp->p_prof;
	cp->p_rprof_cyclic = CYCLIC_NONE;

	/*
	 * Inherit pool pointer from the parent.  Kernel processes are
	 * always bound to the default pool.
	 */
	mutex_enter(&pp->p_lock);
	if (kernel) {
		cp->p_pool = pool_default;
		cp->p_flag |= SSYS;
	} else {
		cp->p_pool = pp->p_pool;
	}
	atomic_add_32(&cp->p_pool->pool_ref, 1);
	mutex_exit(&pp->p_lock);

	/*
	 * Add the child process to the current task.  Kernel processes
	 * are always attached to task0.
	 */
	mutex_enter(&cp->p_lock);
	if (kernel)
		task_attach(task0p, cp);
	else
		task_attach(pp->p_task, cp);
	mutex_exit(&cp->p_lock);
	mutex_exit(&pidlock);

	avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
	    offsetof(contract_t, ct_ctlist));

	/*
	 * Duplicate any audit information kept in the process table
	 */
#ifdef C2_AUDIT
	if (audit_active)	/* copy audit data to cp */
		audit_newproc(cp);
#endif

	crhold(cp->p_cred = cr);

	/*
	 * Bump up the counts on the file structures pointed at by the
	 * parent's file table since the child will point at them too.
	 */
	fcnt_add(P_FINFO(pp), 1);

	VN_HOLD(u.u_cdir);
	if (u.u_rdir)
		VN_HOLD(u.u_rdir);
	if (u.u_cwd)
		refstr_hold(u.u_cwd);

	/*
	 * copy the parent's uarea.
	 */
	uarea = PTOU(cp);
	bcopy(PTOU(pp), uarea, sizeof (user_t));
	flist_fork(P_FINFO(pp), P_FINFO(cp));

	gethrestime(&uarea->u_start);
	uarea->u_ticks = lbolt;
	uarea->u_mem = rm_asrss(pp->p_as);
	uarea->u_acflag = AFORK;

	/*
	 * If inherit-on-fork, copy /proc tracing flags to child.
	 */
	if ((pp->p_proc_flag & P_PR_FORK) != 0) {
		cp->p_proc_flag |= pp->p_proc_flag & (P_PR_TRACE|P_PR_FORK);
		cp->p_sigmask = pp->p_sigmask;
		cp->p_fltmask = pp->p_fltmask;
	} else {
		sigemptyset(&cp->p_sigmask);
		premptyset(&cp->p_fltmask);
		uarea->u_systrap = 0;
		premptyset(&uarea->u_entrymask);
		premptyset(&uarea->u_exitmask);
	}
	/*
	 * If microstate accounting is being inherited, mark child
	 */
	if ((pp->p_flag & SMSFORK) != 0)
		cp->p_flag |= pp->p_flag & (SMSFORK|SMSACCT);

	/*
	 * Inherit fixalignment flag from the parent
	 */
	cp->p_fixalignment = pp->p_fixalignment;

	if (cp->p_exec)
		VN_HOLD(cp->p_exec);
	if (cp->p_execdir)
		VN_HOLD(cp->p_execdir);
	*cpp = cp;
	return (0);

bad:
	ASSERT(MUTEX_NOT_HELD(&pidlock));

	mutex_destroy(&cp->p_crlock);
	mutex_destroy(&cp->p_pflock);
#if defined(__x86)
	mutex_destroy(&cp->p_ldtlock);
#endif
	if (newpid != -1) {
		proc_entry_free(cp->p_pidp);
		(void) pid_rele(cp->p_pidp);
	}
	kmem_cache_free(process_cache, cp);

	/*
	 * We most likely got into this situation because some process is
	 * forking out of control.  As punishment, put it to sleep for a
	 * bit so it can't eat the machine alive.  Sleep interval is chosen
	 * to allow no more than one fork failure per cpu per clock tick
	 * on average (yes, I just made this up).  This has two desirable
	 * properties: (1) it sets a constant limit on the fork failure
	 * rate, and (2) the busier the system is, the harsher the penalty
	 * for abusing it becomes.
	 */
	INCR_COUNT(&fork_fail_pending, &pidlock);
	delay(fork_fail_pending / ncpus + 1);
	DECR_COUNT(&fork_fail_pending, &pidlock);
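
	/*
	 * Worked example: with 4 cpus and 40 failures pending, each
	 * failing caller sleeps 40/4 + 1 = 11 ticks, so on average at
	 * most ncpus fork failures complete per clock tick regardless
	 * of how many callers are spinning on fork.
	 */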

	return (-1);	/* out of memory or proc slots */
}

/*
 * Release virtual memory.
 * In the case of vfork(), the child was given exclusive access to its
 * parent's address space.  The parent is waiting in vfwait() for the
 * child to release its exclusive claim via relvm().
 */
void
relvm()
{
	proc_t *p = curproc;

	ASSERT((unsigned)p->p_lwpcnt <= 1);

	prrelvm();	/* inform /proc */

	if (p->p_flag & SVFORK) {
		proc_t *pp = p->p_parent;
		/*
		 * The child process is either exec'ing or exit'ing.
		 * The child is now separated from the parent's address
		 * space.  The parent process is made dispatchable.
		 *
		 * This is a delicate locking maneuver, involving
		 * both the parent's p_lock and the child's p_lock.
		 * As soon as the SVFORK flag is turned off, the
		 * parent is free to run, but it must not run until
		 * we wake it up using its p_cv because it might
		 * exit and we would be referencing invalid memory.
		 * Therefore, we hold the parent with its p_lock
		 * while protecting our p_flags with our own p_lock.
		 */
try_again:
		mutex_enter(&p->p_lock);	/* grab child's lock first */
		prbarrier(p);	/* make sure /proc is blocked out */
		mutex_enter(&pp->p_lock);

		/*
		 * Check if parent is locked by /proc.
		 */
		if (pp->p_proc_flag & P_PR_LOCK) {
			/*
			 * Delay until /proc is done with the parent.
			 * We must drop our (the child's) p->p_lock, wait
			 * via prbarrier() on the parent, then start over.
			 */
			mutex_exit(&p->p_lock);
			prbarrier(pp);
			mutex_exit(&pp->p_lock);
			goto try_again;
		}
		p->p_flag &= ~SVFORK;
		kpreempt_disable();
		p->p_as = &kas;

		/*
		 * notify hat of change in thread's address space
		 */
		hat_thread_exit(curthread);
		kpreempt_enable();

		/*
		 * child sizes are copied back to parent because
		 * child may have grown.
		 */
		pp->p_brkbase = p->p_brkbase;
		pp->p_brksize = p->p_brksize;
		pp->p_stksize = p->p_stksize;
		/*
		 * The parent is no longer waiting for the vfork()d child.
		 * Restore the parent's watched pages, if any.  This is
		 * safe because we know the parent is not locked by /proc.
		 */
		pp->p_flag &= ~SVFWAIT;
		if (avl_numnodes(&pp->p_wpage) != 0) {
			pp->p_as->a_wpage = pp->p_wpage;
			avl_create(&pp->p_wpage, wp_compare,
			    sizeof (struct watched_page),
			    offsetof(struct watched_page, wp_link));
		}
		cv_signal(&pp->p_cv);
		mutex_exit(&pp->p_lock);
		mutex_exit(&p->p_lock);
	} else {
		if (p->p_as != &kas) {
			struct as *as;

			if (p->p_segacct)
				shmexit(p);
			/*
			 * We grab p_lock for the benefit of /proc
			 */
			kpreempt_disable();
			mutex_enter(&p->p_lock);
			prbarrier(p);	/* make sure /proc is blocked out */
			as = p->p_as;
			p->p_as = &kas;
			mutex_exit(&p->p_lock);

			/*
			 * notify hat of change in thread's address space
			 */
			hat_thread_exit(curthread);
			kpreempt_enable();

			as_free(as);
		}
	}
}
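
/*
 * relvm() and vfwait() form a pair: exec() and exit() call relvm()
 * in the vfork child, which clears SVFORK and signals p_cv; the
 * parent, parked in vfwait() below (reached via post_syscall(),
 * because vfork() set t_post_sys), wakes up, reclaims its address
 * space, and resumes its held lwps.
 */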

/*
 * Wait for child to exec or exit.
 * Called by parent of vfork'ed process.
 * See important comments in relvm(), above.
 */
void
vfwait(pid_t pid)
{
	int signalled = 0;
	proc_t *pp = ttoproc(curthread);
	proc_t *cp;

	/*
	 * Wait for child to exec or exit.
	 */
	for (;;) {
		mutex_enter(&pidlock);
		cp = prfind(pid);
		if (cp == NULL || cp->p_parent != pp) {
			/*
			 * Child has exit()ed.
			 */
			mutex_exit(&pidlock);
			break;
		}
		/*
		 * Grab the child's p_lock before releasing pidlock.
		 * Otherwise, the child could exit and we would be
		 * referencing invalid memory.
		 */
		mutex_enter(&cp->p_lock);
		mutex_exit(&pidlock);
		if (!(cp->p_flag & SVFORK)) {
			/*
			 * Child has exec()ed or is exit()ing.
			 */
			mutex_exit(&cp->p_lock);
			break;
		}
		mutex_enter(&pp->p_lock);
		mutex_exit(&cp->p_lock);
		/*
		 * We might be woken up spuriously from the cv_wait().
		 * We have to do the whole operation over again to be
		 * sure the child's SVFORK flag really is turned off.
		 * We cannot make reference to the child because it can
		 * exit before we return and we would be referencing
		 * invalid memory.
		 *
		 * Because this is potentially a very long-term wait,
		 * we call cv_wait_sig() (for its jobcontrol and /proc
		 * side-effects) unless there is a current signal, in
		 * which case we use cv_wait() because we cannot return
		 * from this function until the child has released the
		 * address space.  Calling cv_wait_sig() with a current
		 * signal would lead to an indefinite loop here because
		 * cv_wait_sig() returns immediately in this case.
		 */
		if (signalled)
			cv_wait(&pp->p_cv, &pp->p_lock);
		else
			signalled = !cv_wait_sig(&pp->p_cv, &pp->p_lock);
		mutex_exit(&pp->p_lock);
	}

	/* restore watchpoints to parent */
	if (pr_watch_active(pp)) {
		struct as *as = pp->p_as;
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		as_setwatch(as);
		AS_LOCK_EXIT(as, &as->a_lock);
	}

	mutex_enter(&pp->p_lock);
	prbarrier(pp);	/* barrier against /proc locking */
	continuelwps(pp);
	mutex_exit(&pp->p_lock);
}