1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 #include <sys/types.h> 30 #include <sys/param.h> 31 #include <sys/sysmacros.h> 32 #include <sys/proc.h> 33 #include <sys/kmem.h> 34 #include <sys/tuneable.h> 35 #include <sys/var.h> 36 #include <sys/cred.h> 37 #include <sys/systm.h> 38 #include <sys/prsystm.h> 39 #include <sys/vnode.h> 40 #include <sys/session.h> 41 #include <sys/cpuvar.h> 42 #include <sys/cmn_err.h> 43 #include <sys/bitmap.h> 44 #include <sys/debug.h> 45 #include <c2/audit.h> 46 #include <sys/project.h> 47 #include <sys/task.h> 48 #include <sys/zone.h> 49 50 /* directory entries for /proc */ 51 union procent { 52 proc_t *pe_proc; 53 union procent *pe_next; 54 }; 55 56 struct pid pid0 = { 57 0, /* pid_prinactive */ 58 1, /* pid_pgorphaned */ 59 0, /* pid_padding */ 60 0, /* pid_prslot */ 61 0, /* pid_id */ 62 NULL, /* pid_pglink */ 63 NULL, /* pid_pgtail */ 64 NULL, /* pid_link */ 65 3 /* pid_ref */ 66 }; 67 68 static int pid_hashlen = 4; /* desired average hash chain length */ 69 static int pid_hashsz; /* number of buckets in the hash table */ 70 71 #define HASHPID(pid) (pidhash[((pid)&(pid_hashsz-1))]) 72 73 extern uint_t nproc; 74 extern struct kmem_cache *process_cache; 75 static void upcount_init(void); 76 77 kmutex_t pidlock; /* global process lock */ 78 kmutex_t pr_pidlock; /* /proc global process lock */ 79 kcondvar_t *pr_pid_cv; /* for /proc, one per process slot */ 80 struct plock *proc_lock; /* persistent array of p_lock's */ 81 82 /* 83 * See the comment above pid_getlockslot() for a detailed explanation of this 84 * constant. Note that a PLOCK_SHIFT of 3 implies 64-byte coherence 85 * granularity; if the coherence granularity is ever changed, this constant 86 * should be modified to reflect the change to minimize proc_lock false 87 * sharing (correctness, however, is guaranteed regardless of the coherence 88 * granularity). 89 */ 90 #define PLOCK_SHIFT 3 91 92 static kmutex_t pidlinklock; 93 static struct pid **pidhash; 94 static pid_t minpid; 95 static pid_t mpid = FAMOUS_PIDS; /* one more than the last famous pid */ 96 static union procent *procdir; 97 static union procent *procentfree; 98 99 static struct pid * 100 pid_lookup(pid_t pid) 101 { 102 struct pid *pidp; 103 104 ASSERT(MUTEX_HELD(&pidlinklock)); 105 106 for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) { 107 if (pidp->pid_id == pid) { 108 ASSERT(pidp->pid_ref > 0); 109 break; 110 } 111 } 112 return (pidp); 113 } 114 115 void 116 pid_setmin(void) 117 { 118 if (jump_pid && jump_pid > mpid) 119 minpid = mpid = jump_pid; 120 else 121 minpid = mpid; 122 } 123 124 /* 125 * When prslots are simply used as an index to determine a process' p_lock, 126 * adjacent prslots share adjacent p_locks. On machines where the size 127 * of a mutex is smaller than that of a cache line (which, as of this writing, 128 * is true for all machines on which Solaris runs), this can potentially 129 * induce false sharing. The standard solution for false sharing is to pad 130 * out one's data structures (in this case, struct plock). However, 131 * given the size and (generally) sparse use of the proc_lock array, this 132 * is suboptimal. We therefore stride through the proc_lock array with 133 * a stride of PLOCK_SHIFT. PLOCK_SHIFT should be defined as: 134 * 135 * log_2 (coherence_granularity / sizeof (kmutex_t)) 136 * 137 * Under this scheme, false sharing is still possible -- but only when 138 * the number of active processes is very large. Note that the one-to-one 139 * mapping between prslots and lockslots is maintained. 140 */ 141 static int 142 pid_getlockslot(int prslot) 143 { 144 int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT; 145 int perlap = even >> PLOCK_SHIFT; 146 147 if (prslot >= even) 148 return (prslot); 149 150 return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap)); 151 } 152 153 /* 154 * This function allocates a pid structure, a free pid, and optionally a 155 * slot in the proc table for it. 156 * 157 * pid_allocate() returns the new pid on success, -1 on failure. 158 */ 159 pid_t 160 pid_allocate(proc_t *prp, pid_t pid, int flags) 161 { 162 struct pid *pidp; 163 union procent *pep; 164 pid_t newpid, startpid; 165 166 pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP); 167 168 mutex_enter(&pidlinklock); 169 pep = procentfree; 170 if ((flags & PID_ALLOC_PROC) && pep == NULL) { 171 /* 172 * ran out of /proc directory entries 173 */ 174 goto failed; 175 } 176 177 if (pid != 0) { 178 VERIFY(minpid == 0); 179 VERIFY3P(pid, <, mpid); 180 VERIFY3P(pid_lookup(pid), ==, NULL); 181 newpid = pid; 182 } else { 183 /* 184 * Allocate a pid 185 */ 186 ASSERT(minpid <= mpid && mpid < maxpid); 187 188 startpid = mpid; 189 for (;;) { 190 newpid = mpid; 191 if (++mpid == maxpid) 192 mpid = minpid; 193 194 if (pid_lookup(newpid) == NULL) 195 break; 196 197 if (mpid == startpid) 198 goto failed; 199 } 200 } 201 202 /* 203 * Put pid into the pid hash table. 204 */ 205 pidp->pid_link = HASHPID(newpid); 206 HASHPID(newpid) = pidp; 207 pidp->pid_ref = 1; 208 pidp->pid_id = newpid; 209 210 if (flags & PID_ALLOC_PROC) { 211 procentfree = pep->pe_next; 212 pidp->pid_prslot = pep - procdir; 213 pep->pe_proc = prp; 214 prp->p_pidp = pidp; 215 prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)]; 216 } else { 217 pidp->pid_prslot = 0; 218 } 219 220 mutex_exit(&pidlinklock); 221 222 return (newpid); 223 224 failed: 225 mutex_exit(&pidlinklock); 226 kmem_free(pidp, sizeof (struct pid)); 227 return (-1); 228 } 229 230 /* 231 * decrement the reference count for pid 232 */ 233 int 234 pid_rele(struct pid *pidp) 235 { 236 struct pid **pidpp; 237 238 mutex_enter(&pidlinklock); 239 ASSERT(pidp != &pid0); 240 241 pidpp = &HASHPID(pidp->pid_id); 242 for (;;) { 243 ASSERT(*pidpp != NULL); 244 if (*pidpp == pidp) 245 break; 246 pidpp = &(*pidpp)->pid_link; 247 } 248 249 *pidpp = pidp->pid_link; 250 mutex_exit(&pidlinklock); 251 252 kmem_free(pidp, sizeof (*pidp)); 253 return (0); 254 } 255 256 void 257 proc_entry_free(struct pid *pidp) 258 { 259 mutex_enter(&pidlinklock); 260 pidp->pid_prinactive = 1; 261 procdir[pidp->pid_prslot].pe_next = procentfree; 262 procentfree = &procdir[pidp->pid_prslot]; 263 mutex_exit(&pidlinklock); 264 } 265 266 /* 267 * The original task needs to be passed in since the process has already been 268 * detached from the task at this point in time. 269 */ 270 void 271 pid_exit(proc_t *prp, struct task *tk) 272 { 273 struct pid *pidp; 274 zone_t *zone = prp->p_zone; 275 276 ASSERT(MUTEX_HELD(&pidlock)); 277 278 /* 279 * Exit process group. If it is NULL, it's because fork failed 280 * before calling pgjoin(). 281 */ 282 ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL); 283 if (prp->p_pgidp != NULL) 284 pgexit(prp); 285 286 sess_rele(prp->p_sessp, B_TRUE); 287 288 pidp = prp->p_pidp; 289 290 proc_entry_free(pidp); 291 292 if (audit_active) 293 audit_pfree(prp); 294 295 if (practive == prp) { 296 practive = prp->p_next; 297 } 298 299 if (prp->p_next) { 300 prp->p_next->p_prev = prp->p_prev; 301 } 302 if (prp->p_prev) { 303 prp->p_prev->p_next = prp->p_next; 304 } 305 306 PID_RELE(pidp); 307 308 mutex_destroy(&prp->p_crlock); 309 kmem_cache_free(process_cache, prp); 310 nproc--; 311 312 /* 313 * Decrement the process counts of the original task, project and zone. 314 */ 315 mutex_enter(&zone->zone_nlwps_lock); 316 tk->tk_nprocs--; 317 tk->tk_proj->kpj_nprocs--; 318 zone->zone_nprocs--; 319 mutex_exit(&zone->zone_nlwps_lock); 320 } 321 322 /* 323 * Find a process visible from the specified zone given its process ID. 324 */ 325 proc_t * 326 prfind_zone(pid_t pid, zoneid_t zoneid) 327 { 328 struct pid *pidp; 329 proc_t *p; 330 331 ASSERT(MUTEX_HELD(&pidlock)); 332 333 mutex_enter(&pidlinklock); 334 pidp = pid_lookup(pid); 335 mutex_exit(&pidlinklock); 336 if (pidp != NULL && pidp->pid_prinactive == 0) { 337 p = procdir[pidp->pid_prslot].pe_proc; 338 if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) 339 return (p); 340 } 341 return (NULL); 342 } 343 344 /* 345 * Find a process given its process ID. This obeys zone restrictions, 346 * so if the caller is in a non-global zone it won't find processes 347 * associated with other zones. Use prfind_zone(pid, ALL_ZONES) to 348 * bypass this restriction. 349 */ 350 proc_t * 351 prfind(pid_t pid) 352 { 353 zoneid_t zoneid; 354 355 if (INGLOBALZONE(curproc)) 356 zoneid = ALL_ZONES; 357 else 358 zoneid = getzoneid(); 359 return (prfind_zone(pid, zoneid)); 360 } 361 362 proc_t * 363 pgfind_zone(pid_t pgid, zoneid_t zoneid) 364 { 365 struct pid *pidp; 366 367 ASSERT(MUTEX_HELD(&pidlock)); 368 369 mutex_enter(&pidlinklock); 370 pidp = pid_lookup(pgid); 371 mutex_exit(&pidlinklock); 372 if (pidp != NULL) { 373 proc_t *p = pidp->pid_pglink; 374 375 if (zoneid == ALL_ZONES || pgid == 0 || p == NULL || 376 p->p_zone->zone_id == zoneid) 377 return (p); 378 } 379 return (NULL); 380 } 381 382 /* 383 * return the head of the list of processes whose process group ID is 'pgid', 384 * or NULL, if no such process group 385 */ 386 proc_t * 387 pgfind(pid_t pgid) 388 { 389 zoneid_t zoneid; 390 391 if (INGLOBALZONE(curproc)) 392 zoneid = ALL_ZONES; 393 else 394 zoneid = getzoneid(); 395 return (pgfind_zone(pgid, zoneid)); 396 } 397 398 /* 399 * Sets P_PR_LOCK on a non-system process. Process must be fully created 400 * and not exiting to succeed. 401 * 402 * Returns 0 on success. 403 * Returns 1 if P_PR_LOCK is set. 404 * Returns -1 if proc is in invalid state. 405 */ 406 int 407 sprtrylock_proc(proc_t *p) 408 { 409 ASSERT(MUTEX_HELD(&p->p_lock)); 410 411 /* skip system and incomplete processes */ 412 if (p->p_stat == SIDL || p->p_stat == SZOMB || 413 (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) { 414 return (-1); 415 } 416 417 if (p->p_proc_flag & P_PR_LOCK) 418 return (1); 419 420 p->p_proc_flag |= P_PR_LOCK; 421 THREAD_KPRI_REQUEST(); 422 423 return (0); 424 } 425 426 /* 427 * Wait for P_PR_LOCK to become clear. Returns with p_lock dropped, 428 * and the proc pointer no longer valid, as the proc may have exited. 429 */ 430 void 431 sprwaitlock_proc(proc_t *p) 432 { 433 kmutex_t *mp; 434 435 ASSERT(MUTEX_HELD(&p->p_lock)); 436 ASSERT(p->p_proc_flag & P_PR_LOCK); 437 438 /* 439 * p_lock is persistent, but p itself is not -- it could 440 * vanish during cv_wait(). Load p->p_lock now so we can 441 * drop it after cv_wait() without referencing p. 442 */ 443 mp = &p->p_lock; 444 cv_wait(&pr_pid_cv[p->p_slot], mp); 445 mutex_exit(mp); 446 } 447 448 /* 449 * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK. 450 * Returns the proc pointer on success, NULL on failure. sprlock() is 451 * really just a stripped-down version of pr_p_lock() to allow practive 452 * walkers like dofusers() and dumpsys() to synchronize with /proc. 453 */ 454 proc_t * 455 sprlock_zone(pid_t pid, zoneid_t zoneid) 456 { 457 proc_t *p; 458 int ret; 459 460 for (;;) { 461 mutex_enter(&pidlock); 462 if ((p = prfind_zone(pid, zoneid)) == NULL) { 463 mutex_exit(&pidlock); 464 return (NULL); 465 } 466 mutex_enter(&p->p_lock); 467 mutex_exit(&pidlock); 468 469 if (panicstr) 470 return (p); 471 472 ret = sprtrylock_proc(p); 473 if (ret == -1) { 474 mutex_exit(&p->p_lock); 475 return (NULL); 476 } else if (ret == 0) { 477 break; 478 } 479 sprwaitlock_proc(p); 480 } 481 return (p); 482 } 483 484 proc_t * 485 sprlock(pid_t pid) 486 { 487 zoneid_t zoneid; 488 489 if (INGLOBALZONE(curproc)) 490 zoneid = ALL_ZONES; 491 else 492 zoneid = getzoneid(); 493 return (sprlock_zone(pid, zoneid)); 494 } 495 496 void 497 sprlock_proc(proc_t *p) 498 { 499 ASSERT(MUTEX_HELD(&p->p_lock)); 500 501 while (p->p_proc_flag & P_PR_LOCK) { 502 cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock); 503 } 504 505 p->p_proc_flag |= P_PR_LOCK; 506 THREAD_KPRI_REQUEST(); 507 } 508 509 void 510 sprunlock(proc_t *p) 511 { 512 if (panicstr) { 513 mutex_exit(&p->p_lock); 514 return; 515 } 516 517 ASSERT(p->p_proc_flag & P_PR_LOCK); 518 ASSERT(MUTEX_HELD(&p->p_lock)); 519 520 cv_signal(&pr_pid_cv[p->p_slot]); 521 p->p_proc_flag &= ~P_PR_LOCK; 522 mutex_exit(&p->p_lock); 523 THREAD_KPRI_RELEASE(); 524 } 525 526 void 527 pid_init(void) 528 { 529 int i; 530 531 pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen); 532 533 pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP); 534 procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP); 535 pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP); 536 proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP); 537 538 nproc = 1; 539 practive = proc_sched; 540 proc_sched->p_next = NULL; 541 procdir[0].pe_proc = proc_sched; 542 543 procentfree = &procdir[1]; 544 for (i = 1; i < v.v_proc - 1; i++) 545 procdir[i].pe_next = &procdir[i+1]; 546 procdir[i].pe_next = NULL; 547 548 HASHPID(0) = &pid0; 549 550 upcount_init(); 551 } 552 553 proc_t * 554 pid_entry(int slot) 555 { 556 union procent *pep; 557 proc_t *prp; 558 559 ASSERT(MUTEX_HELD(&pidlock)); 560 ASSERT(slot >= 0 && slot < v.v_proc); 561 562 pep = procdir[slot].pe_next; 563 if (pep >= procdir && pep < &procdir[v.v_proc]) 564 return (NULL); 565 prp = procdir[slot].pe_proc; 566 if (prp != 0 && prp->p_stat == SIDL) 567 return (NULL); 568 return (prp); 569 } 570 571 /* 572 * Send the specified signal to all processes whose process group ID is 573 * equal to 'pgid' 574 */ 575 576 void 577 signal(pid_t pgid, int sig) 578 { 579 struct pid *pidp; 580 proc_t *prp; 581 582 mutex_enter(&pidlock); 583 mutex_enter(&pidlinklock); 584 if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) { 585 mutex_exit(&pidlinklock); 586 mutex_exit(&pidlock); 587 return; 588 } 589 mutex_exit(&pidlinklock); 590 for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) { 591 mutex_enter(&prp->p_lock); 592 sigtoproc(prp, NULL, sig); 593 mutex_exit(&prp->p_lock); 594 } 595 mutex_exit(&pidlock); 596 } 597 598 /* 599 * Send the specified signal to the specified process 600 */ 601 602 void 603 prsignal(struct pid *pidp, int sig) 604 { 605 if (!(pidp->pid_prinactive)) 606 psignal(procdir[pidp->pid_prslot].pe_proc, sig); 607 } 608 609 #include <sys/sunddi.h> 610 611 /* 612 * DDI/DKI interfaces for drivers to send signals to processes 613 */ 614 615 /* 616 * obtain an opaque reference to a process for signaling 617 */ 618 void * 619 proc_ref(void) 620 { 621 struct pid *pidp; 622 623 mutex_enter(&pidlock); 624 pidp = curproc->p_pidp; 625 PID_HOLD(pidp); 626 mutex_exit(&pidlock); 627 628 return (pidp); 629 } 630 631 /* 632 * release a reference to a process 633 * - a process can exit even if a driver has a reference to it 634 * - one proc_unref for every proc_ref 635 */ 636 void 637 proc_unref(void *pref) 638 { 639 mutex_enter(&pidlock); 640 PID_RELE((struct pid *)pref); 641 mutex_exit(&pidlock); 642 } 643 644 /* 645 * send a signal to a process 646 * 647 * - send the process the signal 648 * - if the process went away, return a -1 649 * - if the process is still there return 0 650 */ 651 int 652 proc_signal(void *pref, int sig) 653 { 654 struct pid *pidp = pref; 655 656 prsignal(pidp, sig); 657 return (pidp->pid_prinactive ? -1 : 0); 658 } 659 660 661 static struct upcount **upc_hash; /* a boot time allocated array */ 662 static ulong_t upc_hashmask; 663 #define UPC_HASH(x, y) ((ulong_t)(x ^ y) & upc_hashmask) 664 665 /* 666 * Get us off the ground. Called once at boot. 667 */ 668 void 669 upcount_init(void) 670 { 671 ulong_t upc_hashsize; 672 673 /* 674 * An entry per MB of memory is our current guess 675 */ 676 /* 677 * 2^20 is a meg, so shifting right by 20 - PAGESHIFT 678 * converts pages to megs (without overflowing a u_int 679 * if you have more than 4G of memory, like ptob(physmem)/1M 680 * would). 681 */ 682 upc_hashsize = (1 << highbit(physmem >> (20 - PAGESHIFT))); 683 upc_hashmask = upc_hashsize - 1; 684 upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *), 685 KM_SLEEP); 686 } 687 688 /* 689 * Increment the number of processes associated with a given uid and zoneid. 690 */ 691 void 692 upcount_inc(uid_t uid, zoneid_t zoneid) 693 { 694 struct upcount **upc, **hupc; 695 struct upcount *new; 696 697 ASSERT(MUTEX_HELD(&pidlock)); 698 new = NULL; 699 hupc = &upc_hash[UPC_HASH(uid, zoneid)]; 700 top: 701 upc = hupc; 702 while ((*upc) != NULL) { 703 if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) { 704 (*upc)->up_count++; 705 if (new) { 706 /* 707 * did not need `new' afterall. 708 */ 709 kmem_free(new, sizeof (*new)); 710 } 711 return; 712 } 713 upc = &(*upc)->up_next; 714 } 715 716 /* 717 * There is no entry for this <uid,zoneid> pair. 718 * Allocate one. If we have to drop pidlock, check 719 * again. 720 */ 721 if (new == NULL) { 722 new = (struct upcount *)kmem_alloc(sizeof (*new), KM_NOSLEEP); 723 if (new == NULL) { 724 mutex_exit(&pidlock); 725 new = (struct upcount *)kmem_alloc(sizeof (*new), 726 KM_SLEEP); 727 mutex_enter(&pidlock); 728 goto top; 729 } 730 } 731 732 733 /* 734 * On the assumption that a new user is going to do some 735 * more forks, put the new upcount structure on the front. 736 */ 737 upc = hupc; 738 739 new->up_uid = uid; 740 new->up_zoneid = zoneid; 741 new->up_count = 1; 742 new->up_next = *upc; 743 744 *upc = new; 745 } 746 747 /* 748 * Decrement the number of processes a given uid and zoneid has. 749 */ 750 void 751 upcount_dec(uid_t uid, zoneid_t zoneid) 752 { 753 struct upcount **upc; 754 struct upcount *done; 755 756 ASSERT(MUTEX_HELD(&pidlock)); 757 758 upc = &upc_hash[UPC_HASH(uid, zoneid)]; 759 while ((*upc) != NULL) { 760 if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) { 761 (*upc)->up_count--; 762 if ((*upc)->up_count == 0) { 763 done = *upc; 764 *upc = (*upc)->up_next; 765 kmem_free(done, sizeof (*done)); 766 } 767 return; 768 } 769 upc = &(*upc)->up_next; 770 } 771 cmn_err(CE_PANIC, "decr_upcount-off the end"); 772 } 773 774 /* 775 * Returns the number of processes a uid has. 776 * Non-existent uid's are assumed to have no processes. 777 */ 778 int 779 upcount_get(uid_t uid, zoneid_t zoneid) 780 { 781 struct upcount *upc; 782 783 ASSERT(MUTEX_HELD(&pidlock)); 784 785 upc = upc_hash[UPC_HASH(uid, zoneid)]; 786 while (upc != NULL) { 787 if (upc->up_uid == uid && upc->up_zoneid == zoneid) { 788 return (upc->up_count); 789 } 790 upc = upc->up_next; 791 } 792 return (0); 793 } 794