xref: /titanic_50/usr/src/uts/common/os/pid.c (revision e26dc4c873897a5ed83dd661da8537476957f68b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
29 /*	  All Rights Reserved  	*/
30 
31 
32 #pragma ident	"%Z%%M%	%I%	%E% SMI"
33 
34 #include <sys/types.h>
35 #include <sys/param.h>
36 #include <sys/sysmacros.h>
37 #include <sys/proc.h>
38 #include <sys/kmem.h>
39 #include <sys/tuneable.h>
40 #include <sys/var.h>
41 #include <sys/cred.h>
42 #include <sys/systm.h>
43 #include <sys/prsystm.h>
44 #include <sys/vnode.h>
45 #include <sys/session.h>
46 #include <sys/cpuvar.h>
47 #include <sys/cmn_err.h>
48 #include <sys/bitmap.h>
49 #include <sys/debug.h>
50 #include <c2/audit.h>
51 #include <sys/zone.h>
52 
53 /* directory entries for /proc */
54 union procent {
55 	proc_t *pe_proc;
56 	union procent *pe_next;
57 };
58 
59 struct pid pid0 = {
60 	0,		/* pid_prinactive */
61 	1,		/* pid_pgorphaned */
62 	0,		/* pid_padding	*/
63 	0,		/* pid_prslot	*/
64 	0,		/* pid_id	*/
65 	NULL,		/* pid_pglink	*/
66 	NULL,		/* pid_pgtail	*/
67 	NULL,		/* pid_link	*/
68 	3		/* pid_ref	*/
69 };
70 
71 static int pid_hashlen = 4;	/* desired average hash chain length */
72 static int pid_hashsz;		/* number of buckets in the hash table */
73 
74 #define	HASHPID(pid)	(pidhash[((pid)&(pid_hashsz-1))])
75 
76 extern uint_t nproc;
77 extern struct kmem_cache *process_cache;
78 static void	upcount_init(void);
79 
80 kmutex_t	pidlock;	/* global process lock */
81 kmutex_t	pr_pidlock;	/* /proc global process lock */
82 kcondvar_t	*pr_pid_cv;	/* for /proc, one per process slot */
83 struct plock	*proc_lock;	/* persistent array of p_lock's */
84 
85 /*
86  * See the comment above pid_getlockslot() for a detailed explanation of this
87  * constant.  Note that a PLOCK_SHIFT of 3 implies 64-byte coherence
88  * granularity; if the coherence granularity is ever changed, this constant
89  * should be modified to reflect the change to minimize proc_lock false
90  * sharing (correctness, however, is guaranteed regardless of the coherence
91  * granularity).
92  */
93 #define	PLOCK_SHIFT	3
94 
95 static kmutex_t	pidlinklock;
96 static struct pid **pidhash;
97 static pid_t minpid;
98 static pid_t mpid;
99 static union procent *procdir;
100 static union procent *procentfree;
101 
102 static struct pid *
103 pid_lookup(pid_t pid)
104 {
105 	struct pid *pidp;
106 
107 	ASSERT(MUTEX_HELD(&pidlinklock));
108 
109 	for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) {
110 		if (pidp->pid_id == pid) {
111 			ASSERT(pidp->pid_ref > 0);
112 			break;
113 		}
114 	}
115 	return (pidp);
116 }
117 
118 void
119 pid_setmin(void)
120 {
121 	if (jump_pid && jump_pid > mpid)
122 		minpid = mpid = jump_pid;
123 	else
124 		minpid = mpid + 1;
125 }
126 
127 /*
128  * When prslots are simply used as an index to determine a process' p_lock,
129  * adjacent prslots share adjacent p_locks.  On machines where the size
130  * of a mutex is smaller than that of a cache line (which, as of this writing,
131  * is true for all machines on which Solaris runs), this can potentially
132  * induce false sharing.  The standard solution for false sharing is to pad
133  * out one's data structures (in this case, struct plock).  However,
134  * given the size and (generally) sparse use of the proc_lock array, this
135  * is suboptimal.  We therefore stride through the proc_lock array with
136  * a stride of PLOCK_SHIFT.  PLOCK_SHIFT should be defined as:
137  *
138  *   log_2 (coherence_granularity / sizeof (kmutex_t))
139  *
140  * Under this scheme, false sharing is still possible -- but only when
141  * the number of active processes is very large.  Note that the one-to-one
142  * mapping between prslots and lockslots is maintained.
143  */
144 static int
145 pid_getlockslot(int prslot)
146 {
147 	int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT;
148 	int perlap = even >> PLOCK_SHIFT;
149 
150 	if (prslot >= even)
151 		return (prslot);
152 
153 	return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap));
154 }
155 
156 /*
157  * This function assigns a pid for use in a fork request.  It allocates
158  * a pid structure, tries to find an empty slot in the proc table,
159  * and selects the process id.
160  *
161  * pid_assign() returns the new pid on success, -1 on failure.
162  */
163 pid_t
164 pid_assign(proc_t *prp)
165 {
166 	struct pid *pidp;
167 	union procent *pep;
168 	pid_t newpid, startpid;
169 
170 	pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP);
171 
172 	mutex_enter(&pidlinklock);
173 	if ((pep = procentfree) == NULL) {
174 		/*
175 		 * ran out of /proc directory entries
176 		 */
177 		goto failed;
178 	}
179 
180 	/*
181 	 * Allocate a pid
182 	 */
183 	startpid = mpid;
184 	do  {
185 		newpid = (++mpid == maxpid ? mpid = minpid : mpid);
186 	} while (pid_lookup(newpid) && newpid != startpid);
187 
188 	if (newpid == startpid && pid_lookup(newpid)) {
189 		/* couldn't find a free pid */
190 		goto failed;
191 	}
192 
193 	procentfree = pep->pe_next;
194 	pep->pe_proc = prp;
195 	prp->p_pidp = pidp;
196 
197 	/*
198 	 * Put pid into the pid hash table.
199 	 */
200 	pidp->pid_link = HASHPID(newpid);
201 	HASHPID(newpid) = pidp;
202 	pidp->pid_ref = 1;
203 	pidp->pid_id = newpid;
204 	pidp->pid_prslot = pep - procdir;
205 	prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)];
206 	mutex_exit(&pidlinklock);
207 
208 	return (newpid);
209 
210 failed:
211 	mutex_exit(&pidlinklock);
212 	kmem_free(pidp, sizeof (struct pid));
213 	return (-1);
214 }
215 
216 /*
217  * decrement the reference count for pid
218  */
219 int
220 pid_rele(struct pid *pidp)
221 {
222 	struct pid **pidpp;
223 
224 	mutex_enter(&pidlinklock);
225 	ASSERT(pidp != &pid0);
226 
227 	pidpp = &HASHPID(pidp->pid_id);
228 	for (;;) {
229 		ASSERT(*pidpp != NULL);
230 		if (*pidpp == pidp)
231 			break;
232 		pidpp = &(*pidpp)->pid_link;
233 	}
234 
235 	*pidpp = pidp->pid_link;
236 	mutex_exit(&pidlinklock);
237 
238 	kmem_free(pidp, sizeof (*pidp));
239 	return (0);
240 }
241 
242 void
243 proc_entry_free(struct pid *pidp)
244 {
245 	mutex_enter(&pidlinklock);
246 	pidp->pid_prinactive = 1;
247 	procdir[pidp->pid_prslot].pe_next = procentfree;
248 	procentfree = &procdir[pidp->pid_prslot];
249 	mutex_exit(&pidlinklock);
250 }
251 
252 void
253 pid_exit(proc_t *prp)
254 {
255 	struct pid *pidp;
256 
257 	ASSERT(MUTEX_HELD(&pidlock));
258 
259 	/*
260 	 * Exit process group.  If it is NULL, it's because fork failed
261 	 * before calling pgjoin().
262 	 */
263 	ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL);
264 	if (prp->p_pgidp != NULL)
265 		pgexit(prp);
266 
267 	SESS_RELE(prp->p_sessp);
268 
269 	pidp = prp->p_pidp;
270 
271 	proc_entry_free(pidp);
272 
273 #ifdef C2_AUDIT
274 	if (audit_active)
275 		audit_pfree(prp);
276 #endif
277 
278 	if (practive == prp) {
279 		practive = prp->p_next;
280 	}
281 
282 	if (prp->p_next) {
283 		prp->p_next->p_prev = prp->p_prev;
284 	}
285 	if (prp->p_prev) {
286 		prp->p_prev->p_next = prp->p_next;
287 	}
288 
289 	PID_RELE(pidp);
290 
291 	mutex_destroy(&prp->p_crlock);
292 	kmem_cache_free(process_cache, prp);
293 	nproc--;
294 }
295 
296 /*
297  * Find a process visible from the specified zone given its process ID.
298  */
299 proc_t *
300 prfind_zone(pid_t pid, zoneid_t zoneid)
301 {
302 	struct pid *pidp;
303 	proc_t *p;
304 
305 	ASSERT(MUTEX_HELD(&pidlock));
306 
307 	mutex_enter(&pidlinklock);
308 	pidp = pid_lookup(pid);
309 	mutex_exit(&pidlinklock);
310 	if (pidp != NULL && pidp->pid_prinactive == 0) {
311 		p = procdir[pidp->pid_prslot].pe_proc;
312 		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid)
313 			return (p);
314 	}
315 	return (NULL);
316 }
317 
318 /*
319  * Find a process given its process ID.  This obeys zone restrictions,
320  * so if the caller is in a non-global zone it won't find processes
321  * associated with other zones.  Use prfind_zone(pid, ALL_ZONES) to
322  * bypass this restriction.
323  */
324 proc_t *
325 prfind(pid_t pid)
326 {
327 	zoneid_t zoneid;
328 
329 	if (INGLOBALZONE(curproc))
330 		zoneid = ALL_ZONES;
331 	else
332 		zoneid = getzoneid();
333 	return (prfind_zone(pid, zoneid));
334 }
335 
336 proc_t *
337 pgfind_zone(pid_t pgid, zoneid_t zoneid)
338 {
339 	struct pid *pidp;
340 
341 	ASSERT(MUTEX_HELD(&pidlock));
342 
343 	mutex_enter(&pidlinklock);
344 	pidp = pid_lookup(pgid);
345 	mutex_exit(&pidlinklock);
346 	if (pidp != NULL) {
347 		proc_t *p = pidp->pid_pglink;
348 
349 		if (zoneid == ALL_ZONES || pgid == 0 || p == NULL ||
350 		    p->p_zone->zone_id == zoneid)
351 			return (p);
352 	}
353 	return (NULL);
354 }
355 
356 /*
357  * return the head of the list of processes whose process group ID is 'pgid',
358  * or NULL, if no such process group
359  */
360 proc_t *
361 pgfind(pid_t pgid)
362 {
363 	zoneid_t zoneid;
364 
365 	if (INGLOBALZONE(curproc))
366 		zoneid = ALL_ZONES;
367 	else
368 		zoneid = getzoneid();
369 	return (pgfind_zone(pgid, zoneid));
370 }
371 
372 /*
373  * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK.
374  * Returns the proc pointer on success, NULL on failure.  sprlock() is
375  * really just a stripped-down version of pr_p_lock() to allow practive
376  * walkers like dofusers() and dumpsys() to synchronize with /proc.
377  */
378 proc_t *
379 sprlock_zone(pid_t pid, zoneid_t zoneid)
380 {
381 	proc_t *p;
382 	kmutex_t *mp;
383 
384 	for (;;) {
385 		mutex_enter(&pidlock);
386 		if ((p = prfind_zone(pid, zoneid)) == NULL) {
387 			mutex_exit(&pidlock);
388 			return (NULL);
389 		}
390 		/*
391 		 * p_lock is persistent, but p itself is not -- it could
392 		 * vanish during cv_wait().  Load p->p_lock now so we can
393 		 * drop it after cv_wait() without referencing p.
394 		 */
395 		mp = &p->p_lock;
396 		mutex_enter(mp);
397 		mutex_exit(&pidlock);
398 		/*
399 		 * If the process is in some half-baked state, fail.
400 		 */
401 		if (p->p_stat == SZOMB || p->p_stat == SIDL ||
402 		    (p->p_flag & (SEXITING | SEXITLWPS))) {
403 			mutex_exit(mp);
404 			return (NULL);
405 		}
406 		if (panicstr)
407 			return (p);
408 		if (!(p->p_proc_flag & P_PR_LOCK))
409 			break;
410 		cv_wait(&pr_pid_cv[p->p_slot], mp);
411 		mutex_exit(mp);
412 	}
413 	p->p_proc_flag |= P_PR_LOCK;
414 	THREAD_KPRI_REQUEST();
415 	return (p);
416 }
417 
418 proc_t *
419 sprlock(pid_t pid)
420 {
421 	zoneid_t zoneid;
422 
423 	if (INGLOBALZONE(curproc))
424 		zoneid = ALL_ZONES;
425 	else
426 		zoneid = getzoneid();
427 	return (sprlock_zone(pid, zoneid));
428 }
429 
430 void
431 sprlock_proc(proc_t *p)
432 {
433 	ASSERT(MUTEX_HELD(&p->p_lock));
434 
435 	while (p->p_proc_flag & P_PR_LOCK) {
436 		cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock);
437 	}
438 
439 	p->p_proc_flag |= P_PR_LOCK;
440 	THREAD_KPRI_REQUEST();
441 }
442 
443 void
444 sprunlock(proc_t *p)
445 {
446 	if (panicstr) {
447 		mutex_exit(&p->p_lock);
448 		return;
449 	}
450 
451 	ASSERT(p->p_proc_flag & P_PR_LOCK);
452 	ASSERT(MUTEX_HELD(&p->p_lock));
453 
454 	cv_signal(&pr_pid_cv[p->p_slot]);
455 	p->p_proc_flag &= ~P_PR_LOCK;
456 	mutex_exit(&p->p_lock);
457 	THREAD_KPRI_RELEASE();
458 }
459 
460 void
461 pid_init(void)
462 {
463 	int i;
464 
465 	pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen);
466 
467 	pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP);
468 	procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP);
469 	pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP);
470 	proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP);
471 
472 	nproc = 1;
473 	practive = proc_sched;
474 	proc_sched->p_next = NULL;
475 	procdir[0].pe_proc = proc_sched;
476 
477 	procentfree = &procdir[1];
478 	for (i = 1; i < v.v_proc - 1; i++)
479 		procdir[i].pe_next = &procdir[i+1];
480 	procdir[i].pe_next = NULL;
481 
482 	HASHPID(0) = &pid0;
483 
484 	upcount_init();
485 }
486 
487 proc_t *
488 pid_entry(int slot)
489 {
490 	union procent *pep;
491 	proc_t *prp;
492 
493 	ASSERT(MUTEX_HELD(&pidlock));
494 	ASSERT(slot >= 0 && slot < v.v_proc);
495 
496 	pep = procdir[slot].pe_next;
497 	if (pep >= procdir && pep < &procdir[v.v_proc])
498 		return (NULL);
499 	prp = procdir[slot].pe_proc;
500 	if (prp != 0 && prp->p_stat == SIDL)
501 		return (NULL);
502 	return (prp);
503 }
504 
505 /*
506  * Send the specified signal to all processes whose process group ID is
507  * equal to 'pgid'
508  */
509 
510 void
511 signal(pid_t pgid, int sig)
512 {
513 	struct pid *pidp;
514 	proc_t *prp;
515 
516 	mutex_enter(&pidlock);
517 	mutex_enter(&pidlinklock);
518 	if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) {
519 		mutex_exit(&pidlinklock);
520 		mutex_exit(&pidlock);
521 		return;
522 	}
523 	mutex_exit(&pidlinklock);
524 	for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) {
525 		mutex_enter(&prp->p_lock);
526 		sigtoproc(prp, NULL, sig);
527 		mutex_exit(&prp->p_lock);
528 	}
529 	mutex_exit(&pidlock);
530 }
531 
532 /*
533  * Send the specified signal to the specified process
534  */
535 
536 void
537 prsignal(struct pid *pidp, int sig)
538 {
539 	if (!(pidp->pid_prinactive))
540 		psignal(procdir[pidp->pid_prslot].pe_proc, sig);
541 }
542 
543 #include <sys/sunddi.h>
544 
545 /*
546  * DDI/DKI interfaces for drivers to send signals to processes
547  */
548 
549 /*
550  * obtain an opaque reference to a process for signaling
551  */
552 void *
553 proc_ref(void)
554 {
555 	struct pid *pidp;
556 
557 	mutex_enter(&pidlock);
558 	pidp = curproc->p_pidp;
559 	PID_HOLD(pidp);
560 	mutex_exit(&pidlock);
561 
562 	return (pidp);
563 }
564 
565 /*
566  * release a reference to a process
567  * - a process can exit even if a driver has a reference to it
568  * - one proc_unref for every proc_ref
569  */
570 void
571 proc_unref(void *pref)
572 {
573 	mutex_enter(&pidlock);
574 	PID_RELE((struct pid *)pref);
575 	mutex_exit(&pidlock);
576 }
577 
578 /*
579  * send a signal to a process
580  *
581  * - send the process the signal
582  * - if the process went away, return a -1
583  * - if the process is still there return 0
584  */
585 int
586 proc_signal(void *pref, int sig)
587 {
588 	struct pid *pidp = pref;
589 
590 	prsignal(pidp, sig);
591 	return (pidp->pid_prinactive ? -1 : 0);
592 }
593 
594 
595 static struct upcount	**upc_hash;	/* a boot time allocated array */
596 static ulong_t		upc_hashmask;
597 #define	UPC_HASH(x, y)	((ulong_t)(x ^ y) & upc_hashmask)
598 
599 /*
600  * Get us off the ground.  Called once at boot.
601  */
602 void
603 upcount_init(void)
604 {
605 	ulong_t	upc_hashsize;
606 
607 	/*
608 	 * An entry per MB of memory is our current guess
609 	 */
610 	/*
611 	 * 2^20 is a meg, so shifting right by 20 - PAGESHIFT
612 	 * converts pages to megs (without overflowing a u_int
613 	 * if you have more than 4G of memory, like ptob(physmem)/1M
614 	 * would).
615 	 */
616 	upc_hashsize = (1 << highbit(physmem >> (20 - PAGESHIFT)));
617 	upc_hashmask = upc_hashsize - 1;
618 	upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *),
619 	    KM_SLEEP);
620 }
621 
622 /*
623  * Increment the number of processes associated with a given uid and zoneid.
624  */
625 void
626 upcount_inc(uid_t uid, zoneid_t zoneid)
627 {
628 	struct upcount	**upc, **hupc;
629 	struct upcount	*new;
630 
631 	ASSERT(MUTEX_HELD(&pidlock));
632 	new = NULL;
633 	hupc = &upc_hash[UPC_HASH(uid, zoneid)];
634 top:
635 	upc = hupc;
636 	while ((*upc) != NULL) {
637 		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
638 			(*upc)->up_count++;
639 			if (new) {
640 				/*
641 				 * did not need `new' afterall.
642 				 */
643 				kmem_free(new, sizeof (*new));
644 			}
645 			return;
646 		}
647 		upc = &(*upc)->up_next;
648 	}
649 
650 	/*
651 	 * There is no entry for this <uid,zoneid> pair.
652 	 * Allocate one.  If we have to drop pidlock, check
653 	 * again.
654 	 */
655 	if (new == NULL) {
656 		new = (struct upcount *)kmem_alloc(sizeof (*new), KM_NOSLEEP);
657 		if (new == NULL) {
658 			mutex_exit(&pidlock);
659 			new = (struct upcount *)kmem_alloc(sizeof (*new),
660 			    KM_SLEEP);
661 			mutex_enter(&pidlock);
662 			goto top;
663 		}
664 	}
665 
666 
667 	/*
668 	 * On the assumption that a new user is going to do some
669 	 * more forks, put the new upcount structure on the front.
670 	 */
671 	upc = hupc;
672 
673 	new->up_uid = uid;
674 	new->up_zoneid = zoneid;
675 	new->up_count = 1;
676 	new->up_next = *upc;
677 
678 	*upc = new;
679 }
680 
681 /*
682  * Decrement the number of processes a given uid and zoneid has.
683  */
684 void
685 upcount_dec(uid_t uid, zoneid_t zoneid)
686 {
687 	struct	upcount **upc;
688 	struct	upcount *done;
689 
690 	ASSERT(MUTEX_HELD(&pidlock));
691 
692 	upc = &upc_hash[UPC_HASH(uid, zoneid)];
693 	while ((*upc) != NULL) {
694 		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
695 			(*upc)->up_count--;
696 			if ((*upc)->up_count == 0) {
697 				done = *upc;
698 				*upc = (*upc)->up_next;
699 				kmem_free(done, sizeof (*done));
700 			}
701 			return;
702 		}
703 		upc = &(*upc)->up_next;
704 	}
705 	cmn_err(CE_PANIC, "decr_upcount-off the end");
706 }
707 
708 /*
709  * Returns the number of processes a uid has.
710  * Non-existent uid's are assumed to have no processes.
711  */
712 int
713 upcount_get(uid_t uid, zoneid_t zoneid)
714 {
715 	struct	upcount *upc;
716 
717 	ASSERT(MUTEX_HELD(&pidlock));
718 
719 	upc = upc_hash[UPC_HASH(uid, zoneid)];
720 	while (upc != NULL) {
721 		if (upc->up_uid == uid && upc->up_zoneid == zoneid) {
722 			return (upc->up_count);
723 		}
724 		upc = upc->up_next;
725 	}
726 	return (0);
727 }
728