1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2019 Joyent, Inc. 25 */ 26 27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 #include <sys/types.h> 31 #include <sys/param.h> 32 #include <sys/sysmacros.h> 33 #include <sys/proc.h> 34 #include <sys/kmem.h> 35 #include <sys/tuneable.h> 36 #include <sys/var.h> 37 #include <sys/cred.h> 38 #include <sys/systm.h> 39 #include <sys/prsystm.h> 40 #include <sys/vnode.h> 41 #include <sys/session.h> 42 #include <sys/cpuvar.h> 43 #include <sys/cmn_err.h> 44 #include <sys/bitmap.h> 45 #include <sys/debug.h> 46 #include <c2/audit.h> 47 #include <sys/project.h> 48 #include <sys/task.h> 49 #include <sys/zone.h> 50 51 /* directory entries for /proc */ 52 union procent { 53 proc_t *pe_proc; 54 union procent *pe_next; 55 }; 56 57 struct pid pid0 = { 58 0, /* pid_prinactive */ 59 1, /* pid_pgorphaned */ 60 0, /* pid_padding */ 61 0, /* pid_prslot */ 62 0, /* pid_id */ 63 NULL, /* pid_pglink */ 64 NULL, /* pid_pgtail */ 65 NULL, /* pid_link */ 66 3 /* pid_ref */ 67 }; 68 69 static int pid_hashlen = 4; /* 
desired average hash chain length */ 70 static int pid_hashsz; /* number of buckets in the hash table */ 71 72 #define HASHPID(pid) (pidhash[((pid)&(pid_hashsz-1))]) 73 74 extern uint_t nproc; 75 extern struct kmem_cache *process_cache; 76 static void upcount_init(void); 77 78 kmutex_t pidlock; /* global process lock */ 79 kmutex_t pr_pidlock; /* /proc global process lock */ 80 kcondvar_t *pr_pid_cv; /* for /proc, one per process slot */ 81 struct plock *proc_lock; /* persistent array of p_lock's */ 82 83 /* 84 * See the comment above pid_getlockslot() for a detailed explanation of this 85 * constant. Note that a PLOCK_SHIFT of 3 implies 64-byte coherence 86 * granularity; if the coherence granularity is ever changed, this constant 87 * should be modified to reflect the change to minimize proc_lock false 88 * sharing (correctness, however, is guaranteed regardless of the coherence 89 * granularity). 90 */ 91 #define PLOCK_SHIFT 3 92 93 static kmutex_t pidlinklock; 94 static struct pid **pidhash; 95 static pid_t minpid; 96 static pid_t mpid = FAMOUS_PIDS; /* one more than the last famous pid */ 97 static union procent *procdir; 98 static union procent *procentfree; 99 100 static struct pid * 101 pid_lookup(pid_t pid) 102 { 103 struct pid *pidp; 104 105 ASSERT(MUTEX_HELD(&pidlinklock)); 106 107 for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) { 108 if (pidp->pid_id == pid) { 109 ASSERT(pidp->pid_ref > 0); 110 break; 111 } 112 } 113 return (pidp); 114 } 115 116 void 117 pid_setmin(void) 118 { 119 if (jump_pid && jump_pid > mpid) 120 minpid = mpid = jump_pid; 121 else 122 minpid = mpid; 123 } 124 125 /* 126 * When prslots are simply used as an index to determine a process' p_lock, 127 * adjacent prslots share adjacent p_locks. On machines where the size 128 * of a mutex is smaller than that of a cache line (which, as of this writing, 129 * is true for all machines on which Solaris runs), this can potentially 130 * induce false sharing. 
The standard solution for false sharing is to pad 131 * out one's data structures (in this case, struct plock). However, 132 * given the size and (generally) sparse use of the proc_lock array, this 133 * is suboptimal. We therefore stride through the proc_lock array with 134 * a stride of PLOCK_SHIFT. PLOCK_SHIFT should be defined as: 135 * 136 * log_2 (coherence_granularity / sizeof (kmutex_t)) 137 * 138 * Under this scheme, false sharing is still possible -- but only when 139 * the number of active processes is very large. Note that the one-to-one 140 * mapping between prslots and lockslots is maintained. 141 */ 142 static int 143 pid_getlockslot(int prslot) 144 { 145 int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT; 146 int perlap = even >> PLOCK_SHIFT; 147 148 if (prslot >= even) 149 return (prslot); 150 151 return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap)); 152 } 153 154 /* 155 * This function allocates a pid structure, a free pid, and optionally a 156 * slot in the proc table for it. 157 * 158 * pid_allocate() returns the new pid on success, -1 on failure. 159 */ 160 pid_t 161 pid_allocate(proc_t *prp, pid_t pid, int flags) 162 { 163 struct pid *pidp; 164 union procent *pep; 165 pid_t newpid, startpid; 166 167 pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP); 168 169 mutex_enter(&pidlinklock); 170 pep = procentfree; 171 if ((flags & PID_ALLOC_PROC) && pep == NULL) { 172 /* 173 * ran out of /proc directory entries 174 */ 175 goto failed; 176 } 177 178 if (pid != 0) { 179 VERIFY(minpid == 0); 180 VERIFY3P(pid, <, mpid); 181 VERIFY3P(pid_lookup(pid), ==, NULL); 182 newpid = pid; 183 } else { 184 /* 185 * Allocate a pid 186 */ 187 ASSERT(minpid <= mpid && mpid < maxpid); 188 189 startpid = mpid; 190 for (;;) { 191 newpid = mpid; 192 if (++mpid == maxpid) 193 mpid = minpid; 194 195 if (pid_lookup(newpid) == NULL) 196 break; 197 198 if (mpid == startpid) 199 goto failed; 200 } 201 } 202 203 /* 204 * Put pid into the pid hash table. 
205 */ 206 pidp->pid_link = HASHPID(newpid); 207 HASHPID(newpid) = pidp; 208 pidp->pid_ref = 1; 209 pidp->pid_id = newpid; 210 211 if (flags & PID_ALLOC_PROC) { 212 procentfree = pep->pe_next; 213 pidp->pid_prslot = pep - procdir; 214 pep->pe_proc = prp; 215 prp->p_pidp = pidp; 216 prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)]; 217 } else { 218 pidp->pid_prslot = 0; 219 } 220 221 mutex_exit(&pidlinklock); 222 223 return (newpid); 224 225 failed: 226 mutex_exit(&pidlinklock); 227 kmem_free(pidp, sizeof (struct pid)); 228 return (-1); 229 } 230 231 /* 232 * decrement the reference count for pid 233 */ 234 int 235 pid_rele(struct pid *pidp) 236 { 237 struct pid **pidpp; 238 239 mutex_enter(&pidlinklock); 240 ASSERT(pidp != &pid0); 241 242 pidpp = &HASHPID(pidp->pid_id); 243 for (;;) { 244 ASSERT(*pidpp != NULL); 245 if (*pidpp == pidp) 246 break; 247 pidpp = &(*pidpp)->pid_link; 248 } 249 250 *pidpp = pidp->pid_link; 251 mutex_exit(&pidlinklock); 252 253 kmem_free(pidp, sizeof (*pidp)); 254 return (0); 255 } 256 257 void 258 proc_entry_free(struct pid *pidp) 259 { 260 mutex_enter(&pidlinklock); 261 pidp->pid_prinactive = 1; 262 procdir[pidp->pid_prslot].pe_next = procentfree; 263 procentfree = &procdir[pidp->pid_prslot]; 264 mutex_exit(&pidlinklock); 265 } 266 267 /* 268 * The original task needs to be passed in since the process has already been 269 * detached from the task at this point in time. 270 */ 271 void 272 pid_exit(proc_t *prp, struct task *tk) 273 { 274 struct pid *pidp; 275 zone_t *zone = prp->p_zone; 276 277 ASSERT(MUTEX_HELD(&pidlock)); 278 279 /* 280 * Exit process group. If it is NULL, it's because fork failed 281 * before calling pgjoin(). 
282 */ 283 ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL); 284 if (prp->p_pgidp != NULL) 285 pgexit(prp); 286 287 sess_rele(prp->p_sessp, B_TRUE); 288 289 pidp = prp->p_pidp; 290 291 proc_entry_free(pidp); 292 293 if (audit_active) 294 audit_pfree(prp); 295 296 if (practive == prp) { 297 practive = prp->p_next; 298 } 299 300 if (prp->p_next) { 301 prp->p_next->p_prev = prp->p_prev; 302 } 303 if (prp->p_prev) { 304 prp->p_prev->p_next = prp->p_next; 305 } 306 307 PID_RELE(pidp); 308 309 mutex_destroy(&prp->p_crlock); 310 kmem_cache_free(process_cache, prp); 311 nproc--; 312 313 /* 314 * Decrement the process counts of the original task, project and zone. 315 */ 316 mutex_enter(&zone->zone_nlwps_lock); 317 tk->tk_nprocs--; 318 tk->tk_proj->kpj_nprocs--; 319 zone->zone_nprocs--; 320 mutex_exit(&zone->zone_nlwps_lock); 321 } 322 323 /* 324 * Find a process visible from the specified zone given its process ID. 325 */ 326 proc_t * 327 prfind_zone(pid_t pid, zoneid_t zoneid) 328 { 329 struct pid *pidp; 330 proc_t *p; 331 332 ASSERT(MUTEX_HELD(&pidlock)); 333 334 mutex_enter(&pidlinklock); 335 pidp = pid_lookup(pid); 336 mutex_exit(&pidlinklock); 337 if (pidp != NULL && pidp->pid_prinactive == 0) { 338 p = procdir[pidp->pid_prslot].pe_proc; 339 if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid) 340 return (p); 341 } 342 return (NULL); 343 } 344 345 /* 346 * Find a process given its process ID. This obeys zone restrictions, 347 * so if the caller is in a non-global zone it won't find processes 348 * associated with other zones. Use prfind_zone(pid, ALL_ZONES) to 349 * bypass this restriction. 
350 */ 351 proc_t * 352 prfind(pid_t pid) 353 { 354 zoneid_t zoneid; 355 356 if (INGLOBALZONE(curproc)) 357 zoneid = ALL_ZONES; 358 else 359 zoneid = getzoneid(); 360 return (prfind_zone(pid, zoneid)); 361 } 362 363 proc_t * 364 pgfind_zone(pid_t pgid, zoneid_t zoneid) 365 { 366 struct pid *pidp; 367 368 ASSERT(MUTEX_HELD(&pidlock)); 369 370 mutex_enter(&pidlinklock); 371 pidp = pid_lookup(pgid); 372 mutex_exit(&pidlinklock); 373 if (pidp != NULL) { 374 proc_t *p = pidp->pid_pglink; 375 376 if (zoneid == ALL_ZONES || pgid == 0 || p == NULL || 377 p->p_zone->zone_id == zoneid) 378 return (p); 379 } 380 return (NULL); 381 } 382 383 /* 384 * return the head of the list of processes whose process group ID is 'pgid', 385 * or NULL, if no such process group 386 */ 387 proc_t * 388 pgfind(pid_t pgid) 389 { 390 zoneid_t zoneid; 391 392 if (INGLOBALZONE(curproc)) 393 zoneid = ALL_ZONES; 394 else 395 zoneid = getzoneid(); 396 return (pgfind_zone(pgid, zoneid)); 397 } 398 399 /* 400 * Sets P_PR_LOCK on a non-system process. Process must be fully created 401 * and not exiting to succeed. 402 * 403 * Returns 0 on success. 404 * Returns 1 if P_PR_LOCK is set. 405 * Returns -1 if proc is in invalid state. 406 */ 407 int 408 sprtrylock_proc(proc_t *p) 409 { 410 ASSERT(MUTEX_HELD(&p->p_lock)); 411 412 /* skip system and incomplete processes */ 413 if (p->p_stat == SIDL || p->p_stat == SZOMB || 414 (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) { 415 return (-1); 416 } 417 418 if (p->p_proc_flag & P_PR_LOCK) 419 return (1); 420 421 p->p_proc_flag |= P_PR_LOCK; 422 423 return (0); 424 } 425 426 /* 427 * Wait for P_PR_LOCK to become clear. Returns with p_lock dropped, 428 * and the proc pointer no longer valid, as the proc may have exited. 
429 */ 430 void 431 sprwaitlock_proc(proc_t *p) 432 { 433 kmutex_t *mp; 434 435 ASSERT(MUTEX_HELD(&p->p_lock)); 436 ASSERT(p->p_proc_flag & P_PR_LOCK); 437 438 /* 439 * p_lock is persistent, but p itself is not -- it could 440 * vanish during cv_wait(). Load p->p_lock now so we can 441 * drop it after cv_wait() without referencing p. 442 */ 443 mp = &p->p_lock; 444 cv_wait(&pr_pid_cv[p->p_slot], mp); 445 mutex_exit(mp); 446 } 447 448 /* 449 * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK. 450 * Returns the proc pointer on success, NULL on failure. sprlock() is 451 * really just a stripped-down version of pr_p_lock() to allow practive 452 * walkers like dofusers() and dumpsys() to synchronize with /proc. 453 */ 454 proc_t * 455 sprlock_zone(pid_t pid, zoneid_t zoneid) 456 { 457 proc_t *p; 458 int ret; 459 460 for (;;) { 461 mutex_enter(&pidlock); 462 if ((p = prfind_zone(pid, zoneid)) == NULL) { 463 mutex_exit(&pidlock); 464 return (NULL); 465 } 466 mutex_enter(&p->p_lock); 467 mutex_exit(&pidlock); 468 469 if (panicstr) 470 return (p); 471 472 ret = sprtrylock_proc(p); 473 if (ret == -1) { 474 mutex_exit(&p->p_lock); 475 return (NULL); 476 } else if (ret == 0) { 477 break; 478 } 479 sprwaitlock_proc(p); 480 } 481 return (p); 482 } 483 484 proc_t * 485 sprlock(pid_t pid) 486 { 487 zoneid_t zoneid; 488 489 if (INGLOBALZONE(curproc)) 490 zoneid = ALL_ZONES; 491 else 492 zoneid = getzoneid(); 493 return (sprlock_zone(pid, zoneid)); 494 } 495 496 void 497 sprlock_proc(proc_t *p) 498 { 499 ASSERT(MUTEX_HELD(&p->p_lock)); 500 501 while (p->p_proc_flag & P_PR_LOCK) { 502 cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock); 503 } 504 505 p->p_proc_flag |= P_PR_LOCK; 506 } 507 508 void 509 sprunlock(proc_t *p) 510 { 511 if (panicstr) { 512 mutex_exit(&p->p_lock); 513 return; 514 } 515 516 ASSERT(p->p_proc_flag & P_PR_LOCK); 517 ASSERT(MUTEX_HELD(&p->p_lock)); 518 519 cv_signal(&pr_pid_cv[p->p_slot]); 520 p->p_proc_flag &= ~P_PR_LOCK; 521 
mutex_exit(&p->p_lock); 522 } 523 524 void 525 pid_init(void) 526 { 527 int i; 528 529 pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen); 530 531 pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP); 532 procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP); 533 pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP); 534 proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP); 535 536 nproc = 1; 537 practive = proc_sched; 538 proc_sched->p_next = NULL; 539 procdir[0].pe_proc = proc_sched; 540 541 procentfree = &procdir[1]; 542 for (i = 1; i < v.v_proc - 1; i++) 543 procdir[i].pe_next = &procdir[i+1]; 544 procdir[i].pe_next = NULL; 545 546 HASHPID(0) = &pid0; 547 548 upcount_init(); 549 } 550 551 proc_t * 552 pid_entry(int slot) 553 { 554 union procent *pep; 555 proc_t *prp; 556 557 ASSERT(MUTEX_HELD(&pidlock)); 558 ASSERT(slot >= 0 && slot < v.v_proc); 559 560 pep = procdir[slot].pe_next; 561 if (pep >= procdir && pep < &procdir[v.v_proc]) 562 return (NULL); 563 prp = procdir[slot].pe_proc; 564 if (prp != 0 && prp->p_stat == SIDL) 565 return (NULL); 566 return (prp); 567 } 568 569 /* 570 * Send the specified signal to all processes whose process group ID is 571 * equal to 'pgid' 572 */ 573 574 void 575 signal(pid_t pgid, int sig) 576 { 577 struct pid *pidp; 578 proc_t *prp; 579 580 mutex_enter(&pidlock); 581 mutex_enter(&pidlinklock); 582 if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) { 583 mutex_exit(&pidlinklock); 584 mutex_exit(&pidlock); 585 return; 586 } 587 mutex_exit(&pidlinklock); 588 for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) { 589 mutex_enter(&prp->p_lock); 590 sigtoproc(prp, NULL, sig); 591 mutex_exit(&prp->p_lock); 592 } 593 mutex_exit(&pidlock); 594 } 595 596 /* 597 * Send the specified signal to the specified process 598 */ 599 600 void 601 prsignal(struct pid *pidp, int sig) 602 { 603 if (!(pidp->pid_prinactive)) 604 psignal(procdir[pidp->pid_prslot].pe_proc, sig); 605 } 606 607 
#include <sys/sunddi.h>

/*
 * DDI/DKI interfaces for drivers to send signals to processes
 */

/*
 * obtain an opaque reference to a process for signaling
 */
void *
proc_ref(void)
{
	struct pid *pidp;

	mutex_enter(&pidlock);
	pidp = curproc->p_pidp;
	PID_HOLD(pidp);
	mutex_exit(&pidlock);

	return (pidp);
}

/*
 * release a reference to a process
 * - a process can exit even if a driver has a reference to it
 * - one proc_unref for every proc_ref
 */
void
proc_unref(void *pref)
{
	mutex_enter(&pidlock);
	PID_RELE((struct pid *)pref);
	mutex_exit(&pidlock);
}

/*
 * send a signal to a process
 *
 * - send the process the signal
 * - if the process went away, return a -1
 * - if the process is still there return 0
 */
int
proc_signal(void *pref, int sig)
{
	struct pid *pidp = pref;

	prsignal(pidp, sig);
	return (pidp->pid_prinactive ? -1 : 0);
}


static struct upcount	**upc_hash;	/* a boot time allocated array */
static ulong_t		upc_hashmask;
/* arguments are parenthesized so that expression arguments hash correctly */
#define	UPC_HASH(x, y)	((ulong_t)((x) ^ (y)) & upc_hashmask)

/*
 * Get us off the ground.  Called once at boot.
 */
static void
upcount_init(void)
{
	ulong_t	upc_hashsize;

	/*
	 * An entry per MB of memory is our current guess
	 */
	/*
	 * 2^20 is a meg, so shifting right by 20 - PAGESHIFT
	 * converts pages to megs (without overflowing a u_int
	 * if you have more than 4G of memory, like ptob(physmem)/1M
	 * would).
	 */
	/* shift an unsigned long so the result matches upc_hashsize's width */
	upc_hashsize = (1UL << highbit(physmem >> (20 - PAGESHIFT)));
	upc_hashmask = upc_hashsize - 1;
	upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *),
	    KM_SLEEP);
}

/*
 * Increment the number of processes associated with a given uid and zoneid.
 * The caller must hold pidlock; it may be dropped and re-acquired here if
 * a new entry has to be allocated with KM_SLEEP.
 */
void
upcount_inc(uid_t uid, zoneid_t zoneid)
{
	struct upcount	**upc, **hupc;
	struct upcount	*new;

	ASSERT(MUTEX_HELD(&pidlock));
	new = NULL;
	hupc = &upc_hash[UPC_HASH(uid, zoneid)];
top:
	upc = hupc;
	while ((*upc) != NULL) {
		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
			(*upc)->up_count++;
			if (new) {
				/*
				 * did not need `new' after all.
				 */
				kmem_free(new, sizeof (*new));
			}
			return;
		}
		upc = &(*upc)->up_next;
	}

	/*
	 * There is no entry for this <uid,zoneid> pair.
	 * Allocate one.  If we have to drop pidlock, check
	 * again.
	 */
	if (new == NULL) {
		new = kmem_alloc(sizeof (*new), KM_NOSLEEP);
		if (new == NULL) {
			mutex_exit(&pidlock);
			new = kmem_alloc(sizeof (*new), KM_SLEEP);
			mutex_enter(&pidlock);
			/*
			 * The chain may have changed while pidlock was
			 * dropped, so search it again before inserting.
			 */
			goto top;
		}
	}


	/*
	 * On the assumption that a new user is going to do some
	 * more forks, put the new upcount structure on the front.
	 */
	upc = hupc;

	new->up_uid = uid;
	new->up_zoneid = zoneid;
	new->up_count = 1;
	new->up_next = *upc;

	*upc = new;
}

/*
 * Decrement the number of processes a given uid and zoneid has.  The
 * entry is unlinked and freed when its count drops to zero.  Panics if
 * no entry exists for the pair.  The caller must hold pidlock.
 */
void
upcount_dec(uid_t uid, zoneid_t zoneid)
{
	struct	upcount **upc;
	struct	upcount *done;

	ASSERT(MUTEX_HELD(&pidlock));

	upc = &upc_hash[UPC_HASH(uid, zoneid)];
	while ((*upc) != NULL) {
		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
			(*upc)->up_count--;
			if ((*upc)->up_count == 0) {
				done = *upc;
				*upc = (*upc)->up_next;
				kmem_free(done, sizeof (*done));
			}
			return;
		}
		upc = &(*upc)->up_next;
	}
	cmn_err(CE_PANIC, "decr_upcount-off the end");
}

/*
 * Returns the number of processes a uid has.
 * Non-existent uid's are assumed to have no processes.
 * The caller must hold pidlock.
 */
int
upcount_get(uid_t uid, zoneid_t zoneid)
{
	struct	upcount *upc;

	ASSERT(MUTEX_HELD(&pidlock));

	upc = upc_hash[UPC_HASH(uid, zoneid)];
	while (upc != NULL) {
		if (upc->up_uid == uid && upc->up_zoneid == zoneid) {
			return (upc->up_count);
		}
		upc = upc->up_next;
	}
	return (0);
}