/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*        All Rights Reserved   */


#pragma ident   "%Z%%M% %I%     %E% SMI"

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/tuneable.h>
#include <sys/var.h>
#include <sys/cred.h>
#include <sys/systm.h>
#include <sys/prsystm.h>
#include <sys/vnode.h>
#include <sys/session.h>
#include <sys/cpuvar.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/debug.h>
#include <c2/audit.h>
#include <sys/zone.h>

/* directory entries for /proc */
union procent {
        proc_t *pe_proc;
        union procent *pe_next;
};

struct pid pid0 = {
        0,              /* pid_prinactive */
        1,              /* pid_pgorphaned */
        0,              /* pid_padding */
        0,              /* pid_prslot */
        0,              /* pid_id */
        NULL,           /* pid_pglink */
        NULL,           /* pid_pgtail */
        NULL,           /* pid_link */
        3               /* pid_ref */
};

static int pid_hashlen = 4;     /* desired average hash chain length */
static int pid_hashsz;          /* number of buckets in the hash table */

#define HASHPID(pid)    (pidhash[((pid)&(pid_hashsz-1))])

extern uint_t nproc;
extern struct kmem_cache *process_cache;
static void upcount_init(void);

kmutex_t        pidlock;        /* global process lock */
kmutex_t        pr_pidlock;     /* /proc global process lock */
kcondvar_t      *pr_pid_cv;     /* for /proc, one per process slot */
struct plock    *proc_lock;     /* persistent array of p_lock's */

/*
 * See the comment above pid_getlockslot() for a detailed explanation of this
 * constant.  Note that a PLOCK_SHIFT of 3 implies 64-byte coherence
 * granularity; if the coherence granularity is ever changed, this constant
 * should be modified to reflect the change to minimize proc_lock false
 * sharing (correctness, however, is guaranteed regardless of the coherence
 * granularity).
 */
#define PLOCK_SHIFT     3
static kmutex_t pidlinklock;
static struct pid **pidhash;
static pid_t minpid;
static pid_t mpid;
static union procent *procdir;
static union procent *procentfree;

static struct pid *
pid_lookup(pid_t pid)
{
        struct pid *pidp;

        ASSERT(MUTEX_HELD(&pidlinklock));

        for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) {
                if (pidp->pid_id == pid) {
                        ASSERT(pidp->pid_ref > 0);
                        break;
                }
        }
        return (pidp);
}

struct pid *
pid_find(pid_t pid)
{
        struct pid *pidp;

        mutex_enter(&pidlinklock);
        pidp = pid_lookup(pid);
        mutex_exit(&pidlinklock);

        return (pidp);
}

void
pid_setmin(void)
{
        if (jump_pid && jump_pid > mpid)
                minpid = mpid = jump_pid;
        else
                minpid = mpid + 1;
}

/*
 * When prslots are simply used as an index to determine a process' p_lock,
 * adjacent prslots share adjacent p_locks.  On machines where the size
 * of a mutex is smaller than that of a cache line (which, as of this writing,
 * is true for all machines on which Solaris runs), this can potentially
 * induce false sharing.  The standard solution for false sharing is to pad
 * out one's data structures (in this case, struct plock).  However,
 * given the size and (generally) sparse use of the proc_lock array, this
 * is suboptimal.  We therefore stride through the proc_lock array with
 * a stride of PLOCK_SHIFT.  PLOCK_SHIFT should be defined as:
 *
 *   log_2 (coherence_granularity / sizeof (kmutex_t))
 *
 * Under this scheme, false sharing is still possible -- but only when
 * the number of active processes is very large.  Note that the one-to-one
 * mapping between prslots and lockslots is maintained.
 */
static int
pid_getlockslot(int prslot)
{
        int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT;
        int perlap = even >> PLOCK_SHIFT;

        if (prslot >= even)
                return (prslot);

        return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap));
}
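/*
 * Worked example (illustrative; the sizes below are assumptions, not taken
 * from this file): with PLOCK_SHIFT == 3 -- 64-byte lines holding eight
 * mutex-sized plock entries -- and v.v_proc == 30000, even == 30000 and
 * perlap == 3750.  prslots 0, 1, 2, ... then map to lockslots 0, 8, 16, ...,
 * so each of the first 3750 processes gets a p_lock on its own cache line;
 * prslot 3750 wraps to lockslot 1, 3751 to lockslot 9, and so on.
 */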
/*
 * This function allocates a pid structure, a free pid, and optionally a
 * slot in the proc table for it.
 *
 * pid_allocate() returns the new pid on success, -1 on failure.
 */
pid_t
pid_allocate(proc_t *prp, int flags)
{
        struct pid *pidp;
        union procent *pep;
        pid_t newpid, startpid;

        pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP);

        mutex_enter(&pidlinklock);
        if ((flags & PID_ALLOC_PROC) && (pep = procentfree) == NULL) {
                /*
                 * ran out of /proc directory entries
                 */
                goto failed;
        }

        /*
         * Allocate a pid
         */
        startpid = mpid;
        do {
                newpid = (++mpid == maxpid ? mpid = minpid : mpid);
        } while (pid_lookup(newpid) && newpid != startpid);

        if (newpid == startpid && pid_lookup(newpid)) {
                /* couldn't find a free pid */
                goto failed;
        }

        /*
         * Put pid into the pid hash table.
         */
        pidp->pid_link = HASHPID(newpid);
        HASHPID(newpid) = pidp;
        pidp->pid_ref = 1;
        pidp->pid_id = newpid;

        if (flags & PID_ALLOC_PROC) {
                procentfree = pep->pe_next;
                pidp->pid_prslot = pep - procdir;
                pep->pe_proc = prp;
                prp->p_pidp = pidp;
                prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)];
        } else {
                pidp->pid_prslot = 0;
        }

        mutex_exit(&pidlinklock);

        return (newpid);

failed:
        mutex_exit(&pidlinklock);
        kmem_free(pidp, sizeof (struct pid));
        return (-1);
}

/*
 * decrement the reference count for pid
 */
int
pid_rele(struct pid *pidp)
{
        struct pid **pidpp;

        mutex_enter(&pidlinklock);
        ASSERT(pidp != &pid0);

        pidpp = &HASHPID(pidp->pid_id);
        for (;;) {
                ASSERT(*pidpp != NULL);
                if (*pidpp == pidp)
                        break;
                pidpp = &(*pidpp)->pid_link;
        }

        *pidpp = pidp->pid_link;
        mutex_exit(&pidlinklock);

        kmem_free(pidp, sizeof (*pidp));
        return (0);
}

void
proc_entry_free(struct pid *pidp)
{
        mutex_enter(&pidlinklock);
        pidp->pid_prinactive = 1;
        procdir[pidp->pid_prslot].pe_next = procentfree;
        procentfree = &procdir[pidp->pid_prslot];
        mutex_exit(&pidlinklock);
}

void
pid_exit(proc_t *prp)
{
        struct pid *pidp;

        ASSERT(MUTEX_HELD(&pidlock));

        /*
         * Exit process group.  If it is NULL, it's because fork failed
         * before calling pgjoin().
         */
        ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL);
        if (prp->p_pgidp != NULL)
                pgexit(prp);

        sess_rele(prp->p_sessp, B_TRUE);

        pidp = prp->p_pidp;

        proc_entry_free(pidp);

        if (audit_active)
                audit_pfree(prp);

        if (practive == prp) {
                practive = prp->p_next;
        }

        if (prp->p_next) {
                prp->p_next->p_prev = prp->p_prev;
        }
        if (prp->p_prev) {
                prp->p_prev->p_next = prp->p_next;
        }

        PID_RELE(pidp);

        mutex_destroy(&prp->p_crlock);
        kmem_cache_free(process_cache, prp);
        nproc--;
}

/*
 * Find a process visible from the specified zone given its process ID.
 */
proc_t *
prfind_zone(pid_t pid, zoneid_t zoneid)
{
        struct pid *pidp;
        proc_t *p;

        ASSERT(MUTEX_HELD(&pidlock));

        mutex_enter(&pidlinklock);
        pidp = pid_lookup(pid);
        mutex_exit(&pidlinklock);
        if (pidp != NULL && pidp->pid_prinactive == 0) {
                p = procdir[pidp->pid_prslot].pe_proc;
                if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid)
                        return (p);
        }
        return (NULL);
}

/*
 * Find a process given its process ID.  This obeys zone restrictions,
 * so if the caller is in a non-global zone it won't find processes
 * associated with other zones.  Use prfind_zone(pid, ALL_ZONES) to
 * bypass this restriction.
 */
proc_t *
prfind(pid_t pid)
{
        zoneid_t zoneid;

        if (INGLOBALZONE(curproc))
                zoneid = ALL_ZONES;
        else
                zoneid = getzoneid();
        return (prfind_zone(pid, zoneid));
}

proc_t *
pgfind_zone(pid_t pgid, zoneid_t zoneid)
{
        struct pid *pidp;

        ASSERT(MUTEX_HELD(&pidlock));

        mutex_enter(&pidlinklock);
        pidp = pid_lookup(pgid);
        mutex_exit(&pidlinklock);
        if (pidp != NULL) {
                proc_t *p = pidp->pid_pglink;

                if (zoneid == ALL_ZONES || pgid == 0 || p == NULL ||
                    p->p_zone->zone_id == zoneid)
                        return (p);
        }
        return (NULL);
}

/*
 * return the head of the list of processes whose process group ID is 'pgid',
 * or NULL, if no such process group
 */
proc_t *
pgfind(pid_t pgid)
{
        zoneid_t zoneid;

        if (INGLOBALZONE(curproc))
                zoneid = ALL_ZONES;
        else
                zoneid = getzoneid();
        return (pgfind_zone(pgid, zoneid));
}

/*
 * Sets P_PR_LOCK on a non-system process.  Process must be fully created
 * and not exiting to succeed.
 *
 * Returns 0 on success.
 * Returns 1 if P_PR_LOCK is set.
 * Returns -1 if proc is in invalid state.
 */
int
sprtrylock_proc(proc_t *p)
{
        ASSERT(MUTEX_HELD(&p->p_lock));

        /* skip system and incomplete processes */
        if (p->p_stat == SIDL || p->p_stat == SZOMB ||
            (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) {
                return (-1);
        }

        if (p->p_proc_flag & P_PR_LOCK)
                return (1);

        p->p_proc_flag |= P_PR_LOCK;
        THREAD_KPRI_REQUEST();

        return (0);
}

/*
 * Wait for P_PR_LOCK to become clear.  Returns with p_lock dropped,
 * and the proc pointer no longer valid, as the proc may have exited.
 */
void
sprwaitlock_proc(proc_t *p)
{
        kmutex_t *mp;

        ASSERT(MUTEX_HELD(&p->p_lock));
        ASSERT(p->p_proc_flag & P_PR_LOCK);

        /*
         * p_lock is persistent, but p itself is not -- it could
         * vanish during cv_wait().  Load p->p_lock now so we can
         * drop it after cv_wait() without referencing p.
         */
        mp = &p->p_lock;
        cv_wait(&pr_pid_cv[p->p_slot], mp);
        mutex_exit(mp);
}

/*
 * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK.
 * Returns the proc pointer on success, NULL on failure.  sprlock() is
 * really just a stripped-down version of pr_p_lock() to allow practive
 * walkers like dofusers() and dumpsys() to synchronize with /proc.
 */
proc_t *
sprlock_zone(pid_t pid, zoneid_t zoneid)
{
        proc_t *p;
        int ret;

        for (;;) {
                mutex_enter(&pidlock);
                if ((p = prfind_zone(pid, zoneid)) == NULL) {
                        mutex_exit(&pidlock);
                        return (NULL);
                }
                mutex_enter(&p->p_lock);
                mutex_exit(&pidlock);

                if (panicstr)
                        return (p);

                ret = sprtrylock_proc(p);
                if (ret == -1) {
                        mutex_exit(&p->p_lock);
                        return (NULL);
                } else if (ret == 0) {
                        break;
                }
                sprwaitlock_proc(p);
        }
        return (p);
}

proc_t *
sprlock(pid_t pid)
{
        zoneid_t zoneid;

        if (INGLOBALZONE(curproc))
                zoneid = ALL_ZONES;
        else
                zoneid = getzoneid();
        return (sprlock_zone(pid, zoneid));
}

void
sprlock_proc(proc_t *p)
{
        ASSERT(MUTEX_HELD(&p->p_lock));

        while (p->p_proc_flag & P_PR_LOCK) {
                cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock);
        }

        p->p_proc_flag |= P_PR_LOCK;
        THREAD_KPRI_REQUEST();
}

void
sprunlock(proc_t *p)
{
        if (panicstr) {
                mutex_exit(&p->p_lock);
                return;
        }

        ASSERT(p->p_proc_flag & P_PR_LOCK);
        ASSERT(MUTEX_HELD(&p->p_lock));

        cv_signal(&pr_pid_cv[p->p_slot]);
        p->p_proc_flag &= ~P_PR_LOCK;
        mutex_exit(&p->p_lock);
        THREAD_KPRI_RELEASE();
}
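/*
 * Illustrative usage sketch (not part of the original file): a practive
 * walker or other /proc-style consumer brackets its work with sprlock()
 * and sprunlock().  On success the target is returned with p_lock held
 * and P_PR_LOCK set:
 *
 *      proc_t *p;
 *
 *      if ((p = sprlock(pid)) == NULL)
 *              return (ESRCH);         -- no such (visible) process
 *      -- examine or manipulate the process; p->p_lock is held --
 *      sprunlock(p);                   -- clears P_PR_LOCK, drops p_lock
 */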
void
pid_init(void)
{
        int i;

        pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen);

        pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP);
        procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP);
        pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP);
        proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP);

        nproc = 1;
        practive = proc_sched;
        proc_sched->p_next = NULL;
        procdir[0].pe_proc = proc_sched;

        procentfree = &procdir[1];
        for (i = 1; i < v.v_proc - 1; i++)
                procdir[i].pe_next = &procdir[i+1];
        procdir[i].pe_next = NULL;

        HASHPID(0) = &pid0;

        upcount_init();
}

proc_t *
pid_entry(int slot)
{
        union procent *pep;
        proc_t *prp;

        ASSERT(MUTEX_HELD(&pidlock));
        ASSERT(slot >= 0 && slot < v.v_proc);

        pep = procdir[slot].pe_next;
        if (pep >= procdir && pep < &procdir[v.v_proc])
                return (NULL);
        prp = procdir[slot].pe_proc;
        if (prp != 0 && prp->p_stat == SIDL)
                return (NULL);
        return (prp);
}

/*
 * Send the specified signal to all processes whose process group ID is
 * equal to 'pgid'
 */

void
signal(pid_t pgid, int sig)
{
        struct pid *pidp;
        proc_t *prp;

        mutex_enter(&pidlock);
        mutex_enter(&pidlinklock);
        if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) {
                mutex_exit(&pidlinklock);
                mutex_exit(&pidlock);
                return;
        }
        mutex_exit(&pidlinklock);
        for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) {
                mutex_enter(&prp->p_lock);
                sigtoproc(prp, NULL, sig);
                mutex_exit(&prp->p_lock);
        }
        mutex_exit(&pidlock);
}

/*
 * Send the specified signal to the specified process
 */

void
prsignal(struct pid *pidp, int sig)
{
        if (!(pidp->pid_prinactive))
                psignal(procdir[pidp->pid_prslot].pe_proc, sig);
}

#include <sys/sunddi.h>

/*
 * DDI/DKI interfaces for drivers to send signals to processes
 */

/*
 * obtain an opaque reference to a process for signaling
 */
void *
proc_ref(void)
{
        struct pid *pidp;

        mutex_enter(&pidlock);
        pidp = curproc->p_pidp;
        PID_HOLD(pidp);
        mutex_exit(&pidlock);

        return (pidp);
}
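/*
 * Illustrative usage sketch (assumed driver code, not part of this file):
 * a driver calls proc_ref() while running in the context of the process
 * of interest (e.g. in its open(9E) routine) and may signal that process
 * later from another context, dropping the reference when done:
 *
 *      void *pref = proc_ref();
 *      ...
 *      if (proc_signal(pref, SIGPOLL) == -1)
 *              -- the process has already exited --
 *      proc_unref(pref);               -- exactly one per proc_ref()
 */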
/*
 * release a reference to a process
 * - a process can exit even if a driver has a reference to it
 * - one proc_unref for every proc_ref
 */
void
proc_unref(void *pref)
{
        mutex_enter(&pidlock);
        PID_RELE((struct pid *)pref);
        mutex_exit(&pidlock);
}

/*
 * send a signal to a process
 *
 * - send the process the signal
 * - if the process went away, return a -1
 * - if the process is still there return 0
 */
int
proc_signal(void *pref, int sig)
{
        struct pid *pidp = pref;

        prsignal(pidp, sig);
        return (pidp->pid_prinactive ? -1 : 0);
}


static struct upcount **upc_hash;       /* a boot time allocated array */
static ulong_t upc_hashmask;
#define UPC_HASH(x, y)  ((ulong_t)(x ^ y) & upc_hashmask)

/*
 * Get us off the ground.  Called once at boot.
 */
void
upcount_init(void)
{
        ulong_t upc_hashsize;

        /*
         * An entry per MB of memory is our current guess
         */
        /*
         * 2^20 is a meg, so shifting right by 20 - PAGESHIFT
         * converts pages to megs (without overflowing a u_int
         * if you have more than 4G of memory, like ptob(physmem)/1M
         * would).
         */
        upc_hashsize = (1 << highbit(physmem >> (20 - PAGESHIFT)));
        upc_hashmask = upc_hashsize - 1;
        upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *),
            KM_SLEEP);
}
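/*
 * Worked example (illustrative; PAGESHIFT == 12, i.e. 4K pages, is an
 * assumption): with 8 Gbytes of physical memory, physmem is 2^21 pages,
 * so physmem >> (20 - PAGESHIFT) == physmem >> 8 == 8192 -- the number
 * of megabytes of memory -- and upc_hash is sized to
 * 1 << highbit(8192) == 16384 buckets.
 */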
/*
 * Increment the number of processes associated with a given uid and zoneid.
 */
void
upcount_inc(uid_t uid, zoneid_t zoneid)
{
        struct upcount **upc, **hupc;
        struct upcount *new;

        ASSERT(MUTEX_HELD(&pidlock));
        new = NULL;
        hupc = &upc_hash[UPC_HASH(uid, zoneid)];
top:
        upc = hupc;
        while ((*upc) != NULL) {
                if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
                        (*upc)->up_count++;
                        if (new) {
                                /*
                                 * did not need `new' after all.
                                 */
                                kmem_free(new, sizeof (*new));
                        }
                        return;
                }
                upc = &(*upc)->up_next;
        }

        /*
         * There is no entry for this <uid,zoneid> pair.
         * Allocate one.  If we have to drop pidlock, check
         * again.
         */
        if (new == NULL) {
                new = (struct upcount *)kmem_alloc(sizeof (*new), KM_NOSLEEP);
                if (new == NULL) {
                        mutex_exit(&pidlock);
                        new = (struct upcount *)kmem_alloc(sizeof (*new),
                            KM_SLEEP);
                        mutex_enter(&pidlock);
                        goto top;
                }
        }


        /*
         * On the assumption that a new user is going to do some
         * more forks, put the new upcount structure on the front.
         */
        upc = hupc;

        new->up_uid = uid;
        new->up_zoneid = zoneid;
        new->up_count = 1;
        new->up_next = *upc;

        *upc = new;
}

/*
 * Decrement the number of processes a given uid and zoneid has.
 */
void
upcount_dec(uid_t uid, zoneid_t zoneid)
{
        struct upcount **upc;
        struct upcount *done;

        ASSERT(MUTEX_HELD(&pidlock));

        upc = &upc_hash[UPC_HASH(uid, zoneid)];
        while ((*upc) != NULL) {
                if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
                        (*upc)->up_count--;
                        if ((*upc)->up_count == 0) {
                                done = *upc;
                                *upc = (*upc)->up_next;
                                kmem_free(done, sizeof (*done));
                        }
                        return;
                }
                upc = &(*upc)->up_next;
        }
        cmn_err(CE_PANIC, "decr_upcount-off the end");
}

/*
 * Returns the number of processes a uid has.
 * Non-existent uids are assumed to have no processes.
 */
int
upcount_get(uid_t uid, zoneid_t zoneid)
{
        struct upcount *upc;

        ASSERT(MUTEX_HELD(&pidlock));

        upc = upc_hash[UPC_HASH(uid, zoneid)];
        while (upc != NULL) {
                if (upc->up_uid == uid && upc->up_zoneid == zoneid) {
                        return (upc->up_count);
                }
                upc = upc->up_next;
        }
        return (0);
}