/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * Process ID (pid) management: allocation and hashing of struct pid,
 * the /proc directory-slot free list, process lookup by pid and by
 * process group, the /proc P_PR_LOCK protocol, and per-<uid, zone>
 * process counts (upcounts).
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/tuneable.h>
#include <sys/var.h>
#include <sys/cred.h>
#include <sys/systm.h>
#include <sys/prsystm.h>
#include <sys/vnode.h>
#include <sys/session.h>
#include <sys/cpuvar.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/debug.h>
#include <c2/audit.h>
#include <sys/zone.h>

/*
 * Directory entries for /proc.  Each procdir[] slot is either in use
 * (pe_proc points at the process occupying the slot) or on the
 * procentfree list (pe_next links to the next free slot).
 */
union procent {
	proc_t *pe_proc;
	union procent *pe_next;
};

/* The statically-allocated pid structure for pid 0 (sched). */
struct pid pid0 = {
	0,		/* pid_prinactive */
	1,		/* pid_pgorphaned */
	0,		/* pid_padding */
	0,		/* pid_prslot */
	0,		/* pid_id */
	NULL,		/* pid_pglink */
	NULL,		/* pid_pgtail */
	NULL,		/* pid_link */
	3		/* pid_ref */
};

static int pid_hashlen = 4;	/* desired average hash chain length */
static int pid_hashsz;		/* number of buckets in the hash table */

#define	HASHPID(pid)	(pidhash[((pid)&(pid_hashsz-1))])

extern uint_t nproc;
extern struct kmem_cache *process_cache;
static void	upcount_init(void);

kmutex_t	pidlock;	/* global process lock */
kmutex_t	pr_pidlock;	/* /proc global process lock */
kcondvar_t	*pr_pid_cv;	/* for /proc, one per process slot */
struct plock	*proc_lock;	/* persistent array of p_lock's */

/*
 * See the comment above pid_getlockslot() for a detailed explanation of this
 * constant.  Note that a PLOCK_SHIFT of 3 implies 64-byte coherence
 * granularity; if the coherence granularity is ever changed, this constant
 * should be modified to reflect the change to minimize proc_lock false
 * sharing (correctness, however, is guaranteed regardless of the coherence
 * granularity).
 */
#define	PLOCK_SHIFT	3

static kmutex_t	pidlinklock;	/* protects pidhash and procentfree */
static struct pid **pidhash;
static pid_t minpid;
static pid_t mpid = FAMOUS_PIDS;	/* one more than the last famous pid */
static union procent *procdir;
static union procent *procentfree;

/*
 * Look up the pid structure for 'pid' in the pid hash table.
 * Returns NULL if no such pid exists.  Caller must hold pidlinklock.
 */
static struct pid *
pid_lookup(pid_t pid)
{
	struct pid *pidp;

	ASSERT(MUTEX_HELD(&pidlinklock));

	for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) {
		if (pidp->pid_id == pid) {
			ASSERT(pidp->pid_ref > 0);
			break;
		}
	}
	return (pidp);
}

/*
 * Locked wrapper around pid_lookup() for callers that do not already
 * hold pidlinklock.
 */
struct pid *
pid_find(pid_t pid)
{
	struct pid *pidp;

	mutex_enter(&pidlinklock);
	pidp = pid_lookup(pid);
	mutex_exit(&pidlinklock);

	return (pidp);
}

/*
 * Establish the lowest allocatable pid.  If the jump_pid tunable is set
 * (nonzero) and exceeds the current next-pid, pid allocation starts at
 * jump_pid; otherwise it starts at whatever mpid already is.
 */
void
pid_setmin(void)
{
	if (jump_pid && jump_pid > mpid)
		minpid = mpid = jump_pid;
	else
		minpid = mpid;
}

/*
 * When prslots are simply used as an index to determine a process' p_lock,
 * adjacent prslots share adjacent p_locks.  On machines where the size
 * of a mutex is smaller than that of a cache line (which, as of this writing,
 * is true for all machines on which Solaris runs), this can potentially
 * induce false sharing.  The standard solution for false sharing is to pad
 * out one's data structures (in this case, struct plock).  However,
 * given the size and (generally) sparse use of the proc_lock array, this
 * is suboptimal.  We therefore stride through the proc_lock array with
 * a stride of PLOCK_SHIFT.  PLOCK_SHIFT should be defined as:
 *
 *   log_2 (coherence_granularity / sizeof (kmutex_t))
 *
 * Under this scheme, false sharing is still possible -- but only when
 * the number of active processes is very large.  Note that the one-to-one
 * mapping between prslots and lockslots is maintained.
 */
static int
pid_getlockslot(int prslot)
{
	int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT;
	int perlap = even >> PLOCK_SHIFT;

	if (prslot >= even)
		return (prslot);

	return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap));
}

/*
 * This function allocates a pid structure, a free pid, and optionally a
 * slot in the proc table for it.
 *
 * If 'pid' is nonzero, that specific id is requested (it must be below
 * mpid, unused, and minpid must be 0); otherwise the next free pid in
 * [minpid, maxpid] is chosen.  PID_ALLOC_PROC in 'flags' additionally
 * claims a /proc directory slot and wires it to 'prp'.
 *
 * pid_allocate() returns the new pid on success, -1 on failure.
 */
pid_t
pid_allocate(proc_t *prp, pid_t pid, int flags)
{
	struct pid *pidp;
	union procent *pep;
	pid_t newpid, startpid;

	pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP);

	mutex_enter(&pidlinklock);
	if ((flags & PID_ALLOC_PROC) && (pep = procentfree) == NULL) {
		/*
		 * ran out of /proc directory entries
		 */
		goto failed;
	}

	if (pid != 0) {
		VERIFY(minpid == 0);
		VERIFY3P(pid, <, mpid);
		VERIFY3P(pid_lookup(pid), ==, NULL);
		newpid = pid;
	} else {
		/*
		 * Allocate a pid
		 */
		ASSERT(minpid <= mpid && mpid <= maxpid);

		startpid = mpid;
		for (;;) {
			newpid = mpid;
			if (mpid >= maxpid)
				mpid = minpid;
			else
				mpid++;

			if (pid_lookup(newpid) == NULL)
				break;

			/* wrapped all the way around: no free pid */
			if (mpid == startpid)
				goto failed;
		}
	}

	/*
	 * Put pid into the pid hash table.
	 */
	pidp->pid_link = HASHPID(newpid);
	HASHPID(newpid) = pidp;
	pidp->pid_ref = 1;
	pidp->pid_id = newpid;

	if (flags & PID_ALLOC_PROC) {
		procentfree = pep->pe_next;
		pidp->pid_prslot = pep - procdir;
		pep->pe_proc = prp;
		prp->p_pidp = pidp;
		prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)];
	} else {
		pidp->pid_prslot = 0;
	}

	mutex_exit(&pidlinklock);

	return (newpid);

failed:
	mutex_exit(&pidlinklock);
	kmem_free(pidp, sizeof (struct pid));
	return (-1);
}

/*
 * decrement the reference count for pid
 *
 * NOTE(review): this routine itself unhashes and frees the pid structure
 * unconditionally -- the reference count appears to be decremented by the
 * PID_RELE/PID_HOLD macros (not visible in this file), which presumably
 * call pid_rele() only when the count reaches zero.  Confirm against
 * sys/proc.h before relying on this.
 */
int
pid_rele(struct pid *pidp)
{
	struct pid **pidpp;

	mutex_enter(&pidlinklock);
	ASSERT(pidp != &pid0);

	/* Walk the hash chain to find the link pointing at pidp. */
	pidpp = &HASHPID(pidp->pid_id);
	for (;;) {
		ASSERT(*pidpp != NULL);
		if (*pidpp == pidp)
			break;
		pidpp = &(*pidpp)->pid_link;
	}

	*pidpp = pidp->pid_link;
	mutex_exit(&pidlinklock);

	kmem_free(pidp, sizeof (*pidp));
	return (0);
}

/*
 * Return a process' /proc directory slot to the free list and mark the
 * pid as /proc-inactive.
 */
void
proc_entry_free(struct pid *pidp)
{
	mutex_enter(&pidlinklock);
	pidp->pid_prinactive = 1;
	procdir[pidp->pid_prslot].pe_next = procentfree;
	procentfree = &procdir[pidp->pid_prslot];
	mutex_exit(&pidlinklock);
}

/*
 * Tear down the pid-related state of an exiting process: leave its
 * process group and session, free its /proc slot, unlink it from
 * practive, release its pid, and free the proc structure itself.
 * Caller must hold pidlock.
 */
void
pid_exit(proc_t *prp)
{
	struct pid *pidp;

	ASSERT(MUTEX_HELD(&pidlock));

	/*
	 * Exit process group.  If it is NULL, it's because fork failed
	 * before calling pgjoin().
	 */
	ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL);
	if (prp->p_pgidp != NULL)
		pgexit(prp);

	sess_rele(prp->p_sessp, B_TRUE);

	pidp = prp->p_pidp;

	proc_entry_free(pidp);

	if (audit_active)
		audit_pfree(prp);

	/* Unlink prp from the doubly-linked practive list. */
	if (practive == prp) {
		practive = prp->p_next;
	}

	if (prp->p_next) {
		prp->p_next->p_prev = prp->p_prev;
	}
	if (prp->p_prev) {
		prp->p_prev->p_next = prp->p_next;
	}

	PID_RELE(pidp);

	mutex_destroy(&prp->p_crlock);
	kmem_cache_free(process_cache, prp);
	nproc--;
}

/*
 * Find a process visible from the specified zone given its process ID.
 */
proc_t *
prfind_zone(pid_t pid, zoneid_t zoneid)
{
	struct pid *pidp;
	proc_t *p;

	ASSERT(MUTEX_HELD(&pidlock));

	mutex_enter(&pidlinklock);
	pidp = pid_lookup(pid);
	mutex_exit(&pidlinklock);
	if (pidp != NULL && pidp->pid_prinactive == 0) {
		p = procdir[pidp->pid_prslot].pe_proc;
		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid)
			return (p);
	}
	return (NULL);
}

/*
 * Find a process given its process ID.  This obeys zone restrictions,
 * so if the caller is in a non-global zone it won't find processes
 * associated with other zones.  Use prfind_zone(pid, ALL_ZONES) to
 * bypass this restriction.
 */
proc_t *
prfind(pid_t pid)
{
	zoneid_t zoneid;

	if (INGLOBALZONE(curproc))
		zoneid = ALL_ZONES;
	else
		zoneid = getzoneid();
	return (prfind_zone(pid, zoneid));
}

/*
 * Find the head of the process-group list for 'pgid', restricted to
 * processes visible from the given zone (ALL_ZONES bypasses the check).
 * Caller must hold pidlock.
 */
proc_t *
pgfind_zone(pid_t pgid, zoneid_t zoneid)
{
	struct pid *pidp;

	ASSERT(MUTEX_HELD(&pidlock));

	mutex_enter(&pidlinklock);
	pidp = pid_lookup(pgid);
	mutex_exit(&pidlinklock);
	if (pidp != NULL) {
		proc_t *p = pidp->pid_pglink;

		if (zoneid == ALL_ZONES || pgid == 0 || p == NULL ||
		    p->p_zone->zone_id == zoneid)
			return (p);
	}
	return (NULL);
}

/*
 * return the head of the list of processes whose process group ID is 'pgid',
 * or NULL, if no such process group
 */
proc_t *
pgfind(pid_t pgid)
{
	zoneid_t zoneid;

	if (INGLOBALZONE(curproc))
		zoneid = ALL_ZONES;
	else
		zoneid = getzoneid();
	return (pgfind_zone(pgid, zoneid));
}

/*
 * Sets P_PR_LOCK on a non-system process.  Process must be fully created
 * and not exiting to succeed.
 *
 * Returns 0 on success.
 * Returns 1 if P_PR_LOCK is set.
 * Returns -1 if proc is in invalid state.
 */
int
sprtrylock_proc(proc_t *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));

	/* skip system and incomplete processes */
	if (p->p_stat == SIDL || p->p_stat == SZOMB ||
	    (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) {
		return (-1);
	}

	if (p->p_proc_flag & P_PR_LOCK)
		return (1);

	p->p_proc_flag |= P_PR_LOCK;
	THREAD_KPRI_REQUEST();

	return (0);
}

/*
 * Wait for P_PR_LOCK to become clear.  Returns with p_lock dropped,
 * and the proc pointer no longer valid, as the proc may have exited.
 */
void
sprwaitlock_proc(proc_t *p)
{
	kmutex_t *mp;

	ASSERT(MUTEX_HELD(&p->p_lock));
	ASSERT(p->p_proc_flag & P_PR_LOCK);

	/*
	 * p_lock is persistent, but p itself is not -- it could
	 * vanish during cv_wait().  Load p->p_lock now so we can
	 * drop it after cv_wait() without referencing p.
	 */
	mp = &p->p_lock;
	cv_wait(&pr_pid_cv[p->p_slot], mp);
	mutex_exit(mp);
}

/*
 * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK.
 * Returns the proc pointer on success, NULL on failure.  sprlock() is
 * really just a stripped-down version of pr_p_lock() to allow practive
 * walkers like dofusers() and dumpsys() to synchronize with /proc.
 */
proc_t *
sprlock_zone(pid_t pid, zoneid_t zoneid)
{
	proc_t *p;
	int ret;

	for (;;) {
		mutex_enter(&pidlock);
		if ((p = prfind_zone(pid, zoneid)) == NULL) {
			mutex_exit(&pidlock);
			return (NULL);
		}
		mutex_enter(&p->p_lock);
		mutex_exit(&pidlock);

		/* during panic, skip the locking protocol entirely */
		if (panicstr)
			return (p);

		ret = sprtrylock_proc(p);
		if (ret == -1) {
			mutex_exit(&p->p_lock);
			return (NULL);
		} else if (ret == 0) {
			break;
		}
		/* P_PR_LOCK was held; wait and retry the whole lookup */
		sprwaitlock_proc(p);
	}
	return (p);
}

/*
 * Zone-aware wrapper for sprlock_zone(); restricts the lookup to the
 * caller's zone unless the caller is in the global zone.
 */
proc_t *
sprlock(pid_t pid)
{
	zoneid_t zoneid;

	if (INGLOBALZONE(curproc))
		zoneid = ALL_ZONES;
	else
		zoneid = getzoneid();
	return (sprlock_zone(pid, zoneid));
}

/*
 * Acquire P_PR_LOCK on a process whose p_lock is already held,
 * blocking until any current holder releases it.
 */
void
sprlock_proc(proc_t *p)
{
	ASSERT(MUTEX_HELD(&p->p_lock));

	while (p->p_proc_flag & P_PR_LOCK) {
		cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock);
	}

	p->p_proc_flag |= P_PR_LOCK;
	THREAD_KPRI_REQUEST();
}

/*
 * Release P_PR_LOCK, wake one waiter, and drop p_lock.
 */
void
sprunlock(proc_t *p)
{
	if (panicstr) {
		mutex_exit(&p->p_lock);
		return;
	}

	ASSERT(p->p_proc_flag & P_PR_LOCK);
	ASSERT(MUTEX_HELD(&p->p_lock));

	cv_signal(&pr_pid_cv[p->p_slot]);
	p->p_proc_flag &= ~P_PR_LOCK;
	mutex_exit(&p->p_lock);
	THREAD_KPRI_RELEASE();
}

/*
 * Boot-time initialization: size and allocate the pid hash table, the
 * /proc directory, the per-slot /proc condition variables, and the
 * persistent p_lock array; install proc_sched (pid 0) in slot 0 and
 * thread the remaining procdir slots onto the free list.
 */
void
pid_init(void)
{
	int i;

	pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen);

	pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP);
	procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP);
	pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP);
	proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP);

	nproc = 1;
	practive = proc_sched;
	proc_sched->p_next = NULL;
	procdir[0].pe_proc = proc_sched;

	procentfree = &procdir[1];
	for (i = 1; i < v.v_proc - 1; i++)
		procdir[i].pe_next = &procdir[i+1];
	procdir[i].pe_next = NULL;

	HASHPID(0) = &pid0;

	upcount_init();
}

/*
 * Return the process occupying /proc directory slot 'slot', or NULL if
 * the slot is on the free list (its pe_next points into procdir) or the
 * process is still being created (SIDL).  Caller must hold pidlock.
 */
proc_t *
pid_entry(int slot)
{
	union procent *pep;
	proc_t *prp;

	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(slot >= 0 && slot < v.v_proc);

	pep = procdir[slot].pe_next;
	if (pep >= procdir && pep < &procdir[v.v_proc])
		return (NULL);
	prp = procdir[slot].pe_proc;
	if (prp != 0 && prp->p_stat == SIDL)
		return (NULL);
	return (prp);
}

/*
 * Send the specified signal to all processes whose process group ID is
 * equal to 'pgid'
 */

void
signal(pid_t pgid, int sig)
{
	struct pid *pidp;
	proc_t *prp;

	mutex_enter(&pidlock);
	mutex_enter(&pidlinklock);
	if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) {
		mutex_exit(&pidlinklock);
		mutex_exit(&pidlock);
		return;
	}
	mutex_exit(&pidlinklock);
	for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) {
		mutex_enter(&prp->p_lock);
		sigtoproc(prp, NULL, sig);
		mutex_exit(&prp->p_lock);
	}
	mutex_exit(&pidlock);
}

/*
 * Send the specified signal to the specified process
 */

void
prsignal(struct pid *pidp, int sig)
{
	if (!(pidp->pid_prinactive))
		psignal(procdir[pidp->pid_prslot].pe_proc, sig);
}

#include <sys/sunddi.h>

/*
 * DDI/DKI interfaces for drivers to send signals to processes
 */

/*
 * obtain an opaque reference to a process for signaling
 */
void *
proc_ref(void)
{
	struct pid *pidp;

	mutex_enter(&pidlock);
	pidp = curproc->p_pidp;
	PID_HOLD(pidp);
	mutex_exit(&pidlock);

	return (pidp);
}

/*
 * release a reference to a process
 * - a process can exit even if a driver has a reference to it
 * - one proc_unref for every proc_ref
 */
void
proc_unref(void *pref)
{
	mutex_enter(&pidlock);
	PID_RELE((struct pid *)pref);
	mutex_exit(&pidlock);
}

/*
 * send a signal to a process
 *
 * - send the process the signal
 * - if the process went away, return a -1
 * - if the process is still there return 0
 */
int
proc_signal(void *pref, int sig)
{
	struct pid *pidp = pref;

	prsignal(pidp, sig);
	return (pidp->pid_prinactive ? -1 : 0);
}


static struct upcount	**upc_hash;	/* a boot time allocated array */
static ulong_t		upc_hashmask;
#define	UPC_HASH(x, y)	((ulong_t)(x ^ y) & upc_hashmask)

/*
 * Get us off the ground.  Called once at boot.
 */
void
upcount_init(void)
{
	ulong_t	upc_hashsize;

	/*
	 * An entry per MB of memory is our current guess
	 */
	/*
	 * 2^20 is a meg, so shifting right by 20 - PAGESHIFT
	 * converts pages to megs (without overflowing a u_int
	 * if you have more than 4G of memory, like ptob(physmem)/1M
	 * would).
	 */
	upc_hashsize = (1 << highbit(physmem >> (20 - PAGESHIFT)));
	upc_hashmask = upc_hashsize - 1;
	upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *),
	    KM_SLEEP);
}

/*
 * Increment the number of processes associated with a given uid and zoneid.
 */
void
upcount_inc(uid_t uid, zoneid_t zoneid)
{
	struct upcount	**upc, **hupc;
	struct upcount	*new;

	ASSERT(MUTEX_HELD(&pidlock));
	new = NULL;
	hupc = &upc_hash[UPC_HASH(uid, zoneid)];
top:
	upc = hupc;
	while ((*upc) != NULL) {
		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
			(*upc)->up_count++;
			if (new) {
				/*
				 * did not need `new' afterall.
				 */
				kmem_free(new, sizeof (*new));
			}
			return;
		}
		upc = &(*upc)->up_next;
	}

	/*
	 * There is no entry for this <uid,zoneid> pair.
	 * Allocate one.  If we have to drop pidlock, check
	 * again.
	 */
	if (new == NULL) {
		new = (struct upcount *)kmem_alloc(sizeof (*new), KM_NOSLEEP);
		if (new == NULL) {
			mutex_exit(&pidlock);
			new = (struct upcount *)kmem_alloc(sizeof (*new),
			    KM_SLEEP);
			mutex_enter(&pidlock);
			goto top;
		}
	}


	/*
	 * On the assumption that a new user is going to do some
	 * more forks, put the new upcount structure on the front.
	 */
	upc = hupc;

	new->up_uid = uid;
	new->up_zoneid = zoneid;
	new->up_count = 1;
	new->up_next = *upc;

	*upc = new;
}

/*
 * Decrement the number of processes a given uid and zoneid has.
 */
void
upcount_dec(uid_t uid, zoneid_t zoneid)
{
	struct	upcount **upc;
	struct	upcount *done;

	ASSERT(MUTEX_HELD(&pidlock));

	upc = &upc_hash[UPC_HASH(uid, zoneid)];
	while ((*upc) != NULL) {
		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
			(*upc)->up_count--;
			if ((*upc)->up_count == 0) {
				/* last process for this pair: unlink & free */
				done = *upc;
				*upc = (*upc)->up_next;
				kmem_free(done, sizeof (*done));
			}
			return;
		}
		upc = &(*upc)->up_next;
	}
	cmn_err(CE_PANIC, "decr_upcount-off the end");
}

/*
 * Returns the number of processes a uid has.
 * Non-existent uid's are assumed to have no processes.
 */
int
upcount_get(uid_t uid, zoneid_t zoneid)
{
	struct	upcount *upc;

	ASSERT(MUTEX_HELD(&pidlock));

	upc = upc_hash[UPC_HASH(uid, zoneid)];
	while (upc != NULL) {
		if (upc->up_uid == uid && upc->up_zoneid == zoneid) {
			return (upc->up_count);
		}
		upc = upc->up_next;
	}
	return (0);
}