xref: /illumos-gate/usr/src/uts/common/os/pid.c (revision d5dbd18d69de8954ab5ceb588e99d43fc9b21d46)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
29 /*	  All Rights Reserved  	*/
30 
31 
32 #pragma ident	"%Z%%M%	%I%	%E% SMI"
33 
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/sysmacros.h>
37 #include <sys/proc.h>
38 #include <sys/kmem.h>
39 #include <sys/tuneable.h>
40 #include <sys/var.h>
41 #include <sys/cred.h>
42 #include <sys/systm.h>
43 #include <sys/prsystm.h>
44 #include <sys/vnode.h>
45 #include <sys/session.h>
46 #include <sys/cpuvar.h>
47 #include <sys/cmn_err.h>
48 #include <sys/bitmap.h>
49 #include <sys/debug.h>
50 #include <c2/audit.h>
51 #include <sys/zone.h>
52 
53 /* directory entries for /proc */
54 union procent {
55 	proc_t *pe_proc;
56 	union procent *pe_next;
57 };
58 
59 struct pid pid0 = {
60 	0,		/* pid_prinactive */
61 	1,		/* pid_pgorphaned */
62 	0,		/* pid_padding	*/
63 	0,		/* pid_prslot	*/
64 	0,		/* pid_id	*/
65 	NULL,		/* pid_pglink	*/
66 	NULL,		/* pid_link	*/
67 	3		/* pid_ref	*/
68 };
69 
70 static int pid_hashlen = 4;	/* desired average hash chain length */
71 static int pid_hashsz;		/* number of buckets in the hash table */
72 
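/*
 * pid_hashsz is always a power of two (see pid_init()), so masking with
 * pid_hashsz - 1 selects a hash bucket without a division.
 */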
73 #define	HASHPID(pid)	(pidhash[((pid)&(pid_hashsz-1))])
74 
75 extern uint_t nproc;
76 extern struct kmem_cache *process_cache;
77 static void	upcount_init(void);
78 
79 kmutex_t	pidlock;	/* global process lock */
80 kmutex_t	pr_pidlock;	/* /proc global process lock */
81 kcondvar_t	*pr_pid_cv;	/* for /proc, one per process slot */
82 struct plock	*proc_lock;	/* persistent array of p_lock's */
83 
84 /*
85  * See the comment above pid_getlockslot() for a detailed explanation of this
86  * constant.  Note that a PLOCK_SHIFT of 3 implies 64-byte coherence
87  * granularity; if the coherence granularity is ever changed, this constant
88  * should be modified to reflect the change to minimize proc_lock false
89  * sharing (correctness, however, is guaranteed regardless of the coherence
90  * granularity).
91  */
92 #define	PLOCK_SHIFT	3
93 
94 static kmutex_t	pidlinklock;	/* protects pidhash and procdir free list */
95 static struct pid **pidhash;	/* pid hash table */
96 static pid_t minpid;		/* lowest pid to use after a wraparound */
97 static pid_t mpid;		/* most recently allocated pid */
98 static union procent *procdir;	/* /proc directory entries, one per slot */
99 static union procent *procentfree;	/* head of the procdir free list */
100 
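/*
 * Look up a pid structure by process id.  The caller must hold
 * pidlinklock; NULL is returned if the pid is not in the hash table.
 */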
101 static struct pid *
102 pid_lookup(pid_t pid)
103 {
104 	struct pid *pidp;
105 
106 	ASSERT(MUTEX_HELD(&pidlinklock));
107 
108 	for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) {
109 		if (pidp->pid_id == pid) {
110 			ASSERT(pidp->pid_ref > 0);
111 			break;
112 		}
113 	}
114 	return (pidp);
115 }
116 
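/*
 * Establish the lowest pid that pid_assign() may use.  If jump_pid is
 * set (non-zero) and exceeds any pid handed out so far, allocation jumps
 * to jump_pid; otherwise new pids simply continue from mpid + 1.
 */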
117 void
118 pid_setmin(void)
119 {
120 	if (jump_pid && jump_pid > mpid)
121 		minpid = mpid = jump_pid;
122 	else
123 		minpid = mpid + 1;
124 }
125 
126 /*
127  * When prslots are simply used as an index to determine a process' p_lock,
128  * adjacent prslots share adjacent p_locks.  On machines where the size
129  * of a mutex is smaller than that of a cache line (which, as of this writing,
130  * is true for all machines on which Solaris runs), this can potentially
131  * induce false sharing.  The standard solution for false sharing is to pad
132  * out one's data structures (in this case, struct plock).  However,
133  * given the size and (generally) sparse use of the proc_lock array, this
134  * is suboptimal.  We therefore stride through the proc_lock array with
135  * a stride of 2^PLOCK_SHIFT locks.  PLOCK_SHIFT should be defined as:
136  *
137  *   log_2 (coherence_granularity / sizeof (kmutex_t))
138  *
139  * Under this scheme, false sharing is still possible -- but only when
140  * the number of active processes is very large.  Note that the one-to-one
141  * mapping between prslots and lockslots is maintained.
142  */
143 static int
144 pid_getlockslot(int prslot)
145 {
146 	int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT;
147 	int perlap = even >> PLOCK_SHIFT;
148 
149 	if (prslot >= even)
150 		return (prslot);
151 
152 	return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap));
153 }
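
/*
 * For illustration (with assumed numbers, not taken from the code above):
 * if v.v_proc were 100 and PLOCK_SHIFT is 3, then even = 96 and perlap = 12,
 * so prslots 0, 1, 2, ... map to lockslots 0, 8, 16, ... and prslot 12 wraps
 * back to lockslot 1.  Adjacent prslots thus land a full coherence granule
 * apart, while prslots 96 through 99 map to themselves.
 */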
154 
155 /*
156  * This function assigns a pid for use in a fork request.  It allocates
157  * a pid structure, grabs a free /proc directory entry (which determines
158  * the process slot), and selects an unused process id.
159  *
160  * pid_assign() returns the new pid on success, -1 on failure.
161  */
162 pid_t
163 pid_assign(proc_t *prp)
164 {
165 	struct pid *pidp;
166 	union procent *pep;
167 	pid_t newpid, startpid;
168 
169 	pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP);
170 
171 	mutex_enter(&pidlinklock);
172 	if ((pep = procentfree) == NULL) {
173 		/*
174 		 * ran out of /proc directory entries
175 		 */
176 		goto failed;
177 	}
178 
179 	/*
180 	 * Allocate a pid
181 	 */
182 	startpid = mpid;
183 	do  {
184 		newpid = (++mpid == maxpid ? mpid = minpid : mpid);
185 	} while (pid_lookup(newpid) && newpid != startpid);
186 
187 	if (newpid == startpid && pid_lookup(newpid)) {
188 		/* couldn't find a free pid */
189 		goto failed;
190 	}
191 
192 	procentfree = pep->pe_next;
193 	pep->pe_proc = prp;
194 	prp->p_pidp = pidp;
195 
196 	/*
197 	 * Put pid into the pid hash table.
198 	 */
199 	pidp->pid_link = HASHPID(newpid);
200 	HASHPID(newpid) = pidp;
201 	pidp->pid_ref = 1;
202 	pidp->pid_id = newpid;
203 	pidp->pid_prslot = pep - procdir;
204 	prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)];
205 	mutex_exit(&pidlinklock);
206 
207 	return (newpid);
208 
209 failed:
210 	mutex_exit(&pidlinklock);
211 	kmem_free(pidp, sizeof (struct pid));
212 	return (-1);
213 }
214 
215 /*
216  * Called when the last reference to a pid is released: unhash and free it.
217  */
218 int
219 pid_rele(struct pid *pidp)
220 {
221 	struct pid **pidpp;
222 
223 	mutex_enter(&pidlinklock);
224 	ASSERT(pidp != &pid0);
225 
226 	pidpp = &HASHPID(pidp->pid_id);
227 	for (;;) {
228 		ASSERT(*pidpp != NULL);
229 		if (*pidpp == pidp)
230 			break;
231 		pidpp = &(*pidpp)->pid_link;
232 	}
233 
234 	*pidpp = pidp->pid_link;
235 	mutex_exit(&pidlinklock);
236 
237 	kmem_free(pidp, sizeof (*pidp));
238 	return (0);
239 }
240 
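/*
 * Return a process's /proc directory entry to the free list and mark the
 * pid inactive for /proc.  The pid structure itself persists until its
 * last reference is released.
 */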
241 void
242 proc_entry_free(struct pid *pidp)
243 {
244 	mutex_enter(&pidlinklock);
245 	pidp->pid_prinactive = 1;
246 	procdir[pidp->pid_prslot].pe_next = procentfree;
247 	procentfree = &procdir[pidp->pid_prslot];
248 	mutex_exit(&pidlinklock);
249 }
250 
251 void
252 pid_exit(proc_t *prp)
253 {
254 	struct pid *pidp;
255 
256 	ASSERT(MUTEX_HELD(&pidlock));
257 
258 	/*
259 	 * Remove the process from its process group.  p_pgidp is NULL
260 	 * only if fork failed before pgjoin() was called.
261 	 */
262 	ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL);
263 	if (prp->p_pgidp != NULL)
264 		pgexit(prp);
265 
266 	SESS_RELE(prp->p_sessp);
267 
268 	pidp = prp->p_pidp;
269 
270 	proc_entry_free(pidp);
271 
272 #ifdef C2_AUDIT
273 	if (audit_active)
274 		audit_pfree(prp);
275 #endif
276 
277 	if (practive == prp) {
278 		practive = prp->p_next;
279 	}
280 
281 	if (prp->p_next) {
282 		prp->p_next->p_prev = prp->p_prev;
283 	}
284 	if (prp->p_prev) {
285 		prp->p_prev->p_next = prp->p_next;
286 	}
287 
288 	PID_RELE(pidp);
289 
290 	mutex_destroy(&prp->p_crlock);
291 	kmem_cache_free(process_cache, prp);
292 	nproc--;
293 }
294 
295 /*
296  * Find a process visible from the specified zone given its process ID.
297  */
298 proc_t *
299 prfind_zone(pid_t pid, zoneid_t zoneid)
300 {
301 	struct pid *pidp;
302 	proc_t *p;
303 
304 	ASSERT(MUTEX_HELD(&pidlock));
305 
306 	mutex_enter(&pidlinklock);
307 	pidp = pid_lookup(pid);
308 	mutex_exit(&pidlinklock);
309 	if (pidp != NULL && pidp->pid_prinactive == 0) {
310 		p = procdir[pidp->pid_prslot].pe_proc;
311 		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid)
312 			return (p);
313 	}
314 	return (NULL);
315 }
316 
317 /*
318  * Find a process given its process ID.  This obeys zone restrictions,
319  * so if the caller is in a non-global zone it won't find processes
320  * associated with other zones.  Use prfind_zone(pid, ALL_ZONES) to
321  * bypass this restriction.
322  */
323 proc_t *
324 prfind(pid_t pid)
325 {
326 	zoneid_t zoneid;
327 
328 	if (INGLOBALZONE(curproc))
329 		zoneid = ALL_ZONES;
330 	else
331 		zoneid = getzoneid();
332 	return (prfind_zone(pid, zoneid));
333 }
334 
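/*
 * Zone-aware form of pgfind(): return the head of the list of processes
 * whose process group ID is 'pgid', or NULL if there is no such group or
 * it is not visible from the given zone.
 */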
335 proc_t *
336 pgfind_zone(pid_t pgid, zoneid_t zoneid)
337 {
338 	struct pid *pidp;
339 
340 	ASSERT(MUTEX_HELD(&pidlock));
341 
342 	mutex_enter(&pidlinklock);
343 	pidp = pid_lookup(pgid);
344 	mutex_exit(&pidlinklock);
345 	if (pidp != NULL) {
346 		proc_t *p = pidp->pid_pglink;
347 
348 		if (zoneid == ALL_ZONES || pgid == 0 || p == NULL ||
349 		    p->p_zone->zone_id == zoneid)
350 			return (p);
351 	}
352 	return (NULL);
353 }
354 
355 /*
356  * return the head of the list of processes whose process group ID is 'pgid',
357  * or NULL if no such process group exists
358  */
359 proc_t *
360 pgfind(pid_t pgid)
361 {
362 	zoneid_t zoneid;
363 
364 	if (INGLOBALZONE(curproc))
365 		zoneid = ALL_ZONES;
366 	else
367 		zoneid = getzoneid();
368 	return (pgfind_zone(pgid, zoneid));
369 }
370 
371 /*
372  * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK.
373  * Returns the proc pointer on success, NULL on failure.  sprlock() is
374  * really just a stripped-down version of pr_p_lock() to allow practive
375  * walkers like dofusers() and dumpsys() to synchronize with /proc.
376  */
377 proc_t *
378 sprlock_zone(pid_t pid, zoneid_t zoneid)
379 {
380 	proc_t *p;
381 	kmutex_t *mp;
382 
383 	for (;;) {
384 		mutex_enter(&pidlock);
385 		if ((p = prfind_zone(pid, zoneid)) == NULL) {
386 			mutex_exit(&pidlock);
387 			return (NULL);
388 		}
389 		/*
390 		 * p_lock is persistent, but p itself is not -- it could
391 		 * vanish during cv_wait().  Load p->p_lock now so we can
392 		 * drop it after cv_wait() without referencing p.
393 		 */
394 		mp = &p->p_lock;
395 		mutex_enter(mp);
396 		mutex_exit(&pidlock);
397 		/*
398 		 * If the process is in some half-baked state, fail.
399 		 */
400 		if (p->p_stat == SZOMB || p->p_stat == SIDL ||
401 		    (p->p_flag & (SEXITING | SEXITLWPS))) {
402 			mutex_exit(mp);
403 			return (NULL);
404 		}
405 		if (panicstr)
406 			return (p);
407 		if (!(p->p_proc_flag & P_PR_LOCK))
408 			break;
409 		cv_wait(&pr_pid_cv[p->p_slot], mp);
410 		mutex_exit(mp);
411 	}
412 	p->p_proc_flag |= P_PR_LOCK;
413 	THREAD_KPRI_REQUEST();
414 	return (p);
415 }
416 
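/*
 * Common entry point: like sprlock_zone(), but the search is restricted
 * to the caller's zone unless the caller is in the global zone.
 */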
417 proc_t *
418 sprlock(pid_t pid)
419 {
420 	zoneid_t zoneid;
421 
422 	if (INGLOBALZONE(curproc))
423 		zoneid = ALL_ZONES;
424 	else
425 		zoneid = getzoneid();
426 	return (sprlock_zone(pid, zoneid));
427 }
428 
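/*
 * Like sprlock(), but for a process the caller has already located and
 * whose p_lock is already held: wait until no other /proc control
 * operation holds P_PR_LOCK, then take it.
 */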
429 void
430 sprlock_proc(proc_t *p)
431 {
432 	ASSERT(MUTEX_HELD(&p->p_lock));
433 
434 	while (p->p_proc_flag & P_PR_LOCK) {
435 		cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock);
436 	}
437 
438 	p->p_proc_flag |= P_PR_LOCK;
439 	THREAD_KPRI_REQUEST();
440 }
441 
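/*
 * Drop P_PR_LOCK: wake up a thread waiting in sprlock()/sprlock_proc()
 * and release p->p_lock.  (On panic, just drop p_lock.)
 */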
442 void
443 sprunlock(proc_t *p)
444 {
445 	if (panicstr) {
446 		mutex_exit(&p->p_lock);
447 		return;
448 	}
449 
450 	ASSERT(p->p_proc_flag & P_PR_LOCK);
451 	ASSERT(MUTEX_HELD(&p->p_lock));
452 
453 	cv_signal(&pr_pid_cv[p->p_slot]);
454 	p->p_proc_flag &= ~P_PR_LOCK;
455 	mutex_exit(&p->p_lock);
456 	THREAD_KPRI_RELEASE();
457 }
458 
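/*
 * Called once at boot: size and allocate the pid hash table, the /proc
 * directory, the per-slot /proc condition variables and the persistent
 * p_lock array; install proc_sched (slot 0, pid 0) by hand; and set up
 * the per-uid/zone process counts.
 */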
459 void
460 pid_init(void)
461 {
462 	int i;
463 
464 	pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen);
465 
466 	pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP);
467 	procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP);
468 	pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP);
469 	proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP);
470 
471 	nproc = 1;
472 	practive = proc_sched;
473 	proc_sched->p_next = NULL;
474 	procdir[0].pe_proc = proc_sched;
475 
476 	procentfree = &procdir[1];
477 	for (i = 1; i < v.v_proc - 1; i++)
478 		procdir[i].pe_next = &procdir[i+1];
479 	procdir[i].pe_next = NULL;
480 
481 	HASHPID(0) = &pid0;
482 
483 	upcount_init();
484 }
485 
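/*
 * Return the proc_t occupying /proc slot 'slot', or NULL if the slot is
 * free or the process in it is still being created (SIDL).  A free slot
 * is recognized by its pe_next pointing back into procdir, since the
 * free list is threaded through the entries themselves.
 */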
486 proc_t *
487 pid_entry(int slot)
488 {
489 	union procent *pep;
490 	proc_t *prp;
491 
492 	ASSERT(MUTEX_HELD(&pidlock));
493 	ASSERT(slot >= 0 && slot < v.v_proc);
494 
495 	pep = procdir[slot].pe_next;
496 	if (pep >= procdir && pep < &procdir[v.v_proc])
497 		return (NULL);
498 	prp = procdir[slot].pe_proc;
499 	if (prp != 0 && prp->p_stat == SIDL)
500 		return (NULL);
501 	return (prp);
502 }
503 
504 /*
505  * Send the specified signal to all processes whose process group ID is
506  * equal to 'pgid'
507  */
508 
509 void
510 signal(pid_t pgid, int sig)
511 {
512 	struct pid *pidp;
513 	proc_t *prp;
514 
515 	mutex_enter(&pidlock);
516 	mutex_enter(&pidlinklock);
517 	if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) {
518 		mutex_exit(&pidlinklock);
519 		mutex_exit(&pidlock);
520 		return;
521 	}
522 	mutex_exit(&pidlinklock);
523 	for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) {
524 		mutex_enter(&prp->p_lock);
525 		sigtoproc(prp, NULL, sig);
526 		mutex_exit(&prp->p_lock);
527 	}
528 	mutex_exit(&pidlock);
529 }
530 
531 /*
532  * Send the specified signal to the specified process
533  */
534 
535 void
536 prsignal(struct pid *pidp, int sig)
537 {
538 	if (!(pidp->pid_prinactive))
539 		psignal(procdir[pidp->pid_prslot].pe_proc, sig);
540 }
541 
542 #include <sys/sunddi.h>
543 
544 /*
545  * DDI/DKI interfaces for drivers to send signals to processes
546  */
547 
548 /*
549  * obtain an opaque reference to a process for signaling
550  */
551 void *
552 proc_ref(void)
553 {
554 	struct pid *pidp;
555 
556 	mutex_enter(&pidlock);
557 	pidp = curproc->p_pidp;
558 	PID_HOLD(pidp);
559 	mutex_exit(&pidlock);
560 
561 	return (pidp);
562 }
563 
564 /*
565  * release a reference to a process
566  * - a process can exit even if a driver has a reference to it
567  * - one proc_unref for every proc_ref
568  */
569 void
570 proc_unref(void *pref)
571 {
572 	mutex_enter(&pidlock);
573 	PID_RELE((struct pid *)pref);
574 	mutex_exit(&pidlock);
575 }
576 
577 /*
578  * send a signal to a process
579  *
580  * - deliver the signal to the process
581  * - if the process has already gone away, return -1
582  * - if the process is still there, return 0
583  */
584 int
585 proc_signal(void *pref, int sig)
586 {
587 	struct pid *pidp = pref;
588 
589 	prsignal(pidp, sig);
590 	return (pidp->pid_prinactive ? -1 : 0);
591 }
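
/*
 * A minimal usage sketch of the three routines above (illustrative only;
 * the driver context and the choice of SIGPOLL are assumptions, not part
 * of this file):
 *
 *	void *pref = proc_ref();		remember the calling process
 *	...
 *	if (proc_signal(pref, SIGPOLL) == -1)
 *		the process has exited; nothing was sent
 *	...
 *	proc_unref(pref);			one unref for every ref
 */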
592 
593 
594 static struct upcount	**upc_hash;	/* a boot-time-allocated array */
595 static ulong_t		upc_hashmask;
596 #define	UPC_HASH(x, y)	((ulong_t)(x ^ y) & upc_hashmask)
597 
598 /*
599  * Get us off the ground.  Called once at boot.
600  */
601 void
602 upcount_init(void)
603 {
604 	ulong_t	upc_hashsize;
605 
606 	/*
607 	 * An entry per MB of memory is our current guess.
608 	 *
609 	 * 2^20 bytes is a megabyte, so shifting physmem right by
610 	 * (20 - PAGESHIFT) converts pages to megabytes without
611 	 * overflowing a u_int on machines with more than 4G of
612 	 * memory, the way computing ptob(physmem) / 1M directly
613 	 * would.
614 	 */
615 	upc_hashsize = (1 << highbit(physmem >> (20 - PAGESHIFT)));
616 	upc_hashmask = upc_hashsize - 1;
617 	upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *),
618 	    KM_SLEEP);
619 }
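
/*
 * For a sense of scale (illustrative, assuming 4K pages): a machine with
 * 8GB of physical memory has physmem == 2^21, so physmem >> (20 - PAGESHIFT)
 * is 8192 and upc_hashsize ends up in the 8K-16K bucket range, depending on
 * how highbit() rounds up to a power of two.
 */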
620 
621 /*
622  * Increment the number of processes associated with a given uid and zoneid.
623  */
624 void
625 upcount_inc(uid_t uid, zoneid_t zoneid)
626 {
627 	struct upcount	**upc, **hupc;
628 	struct upcount	*new;
629 
630 	ASSERT(MUTEX_HELD(&pidlock));
631 	new = NULL;
632 	hupc = &upc_hash[UPC_HASH(uid, zoneid)];
633 top:
634 	upc = hupc;
635 	while ((*upc) != NULL) {
636 		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
637 			(*upc)->up_count++;
638 			if (new) {
639 				/*
640 				 * did not need `new' after all.
641 				 */
642 				kmem_free(new, sizeof (*new));
643 			}
644 			return;
645 		}
646 		upc = &(*upc)->up_next;
647 	}
648 
649 	/*
650 	 * There is no entry for this <uid,zoneid> pair.
651 	 * Allocate one.  If we have to drop pidlock, check
652 	 * again.
653 	 */
654 	if (new == NULL) {
655 		new = (struct upcount *)kmem_alloc(sizeof (*new), KM_NOSLEEP);
656 		if (new == NULL) {
657 			mutex_exit(&pidlock);
658 			new = (struct upcount *)kmem_alloc(sizeof (*new),
659 			    KM_SLEEP);
660 			mutex_enter(&pidlock);
661 			goto top;
662 		}
663 	}
664 
665 
666 	/*
667 	 * On the assumption that a new user is going to do some
668 	 * more forks, put the new upcount structure on the front.
669 	 */
670 	upc = hupc;
671 
672 	new->up_uid = uid;
673 	new->up_zoneid = zoneid;
674 	new->up_count = 1;
675 	new->up_next = *upc;
676 
677 	*upc = new;
678 }
679 
680 /*
681  * Decrement the number of processes associated with a given uid and zoneid.
682  */
683 void
684 upcount_dec(uid_t uid, zoneid_t zoneid)
685 {
686 	struct	upcount **upc;
687 	struct	upcount *done;
688 
689 	ASSERT(MUTEX_HELD(&pidlock));
690 
691 	upc = &upc_hash[UPC_HASH(uid, zoneid)];
692 	while ((*upc) != NULL) {
693 		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
694 			(*upc)->up_count--;
695 			if ((*upc)->up_count == 0) {
696 				done = *upc;
697 				*upc = (*upc)->up_next;
698 				kmem_free(done, sizeof (*done));
699 			}
700 			return;
701 		}
702 		upc = &(*upc)->up_next;
703 	}
704 	cmn_err(CE_PANIC, "upcount_dec: uid/zoneid entry not found");
705 }
706 
707 /*
708  * Return the number of processes associated with a given uid and zoneid.
709  * A <uid, zoneid> pair with no entry is assumed to have no processes.
710  */
711 int
712 upcount_get(uid_t uid, zoneid_t zoneid)
713 {
714 	struct	upcount *upc;
715 
716 	ASSERT(MUTEX_HELD(&pidlock));
717 
718 	upc = upc_hash[UPC_HASH(uid, zoneid)];
719 	while (upc != NULL) {
720 		if (upc->up_uid == uid && upc->up_zoneid == zoneid) {
721 			return (upc->up_count);
722 		}
723 		upc = upc->up_next;
724 	}
725 	return (0);
726 }
727