xref: /titanic_52/usr/src/uts/common/os/pid.c (revision d3cf9c7d3cb6a89c5ee679d866610bc6baaf2c9a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/sysmacros.h>
36 #include <sys/proc.h>
37 #include <sys/kmem.h>
38 #include <sys/tuneable.h>
39 #include <sys/var.h>
40 #include <sys/cred.h>
41 #include <sys/systm.h>
42 #include <sys/prsystm.h>
43 #include <sys/vnode.h>
44 #include <sys/session.h>
45 #include <sys/cpuvar.h>
46 #include <sys/cmn_err.h>
47 #include <sys/bitmap.h>
48 #include <sys/debug.h>
49 #include <c2/audit.h>
50 #include <sys/zone.h>
51 
52 /* directory entries for /proc */
53 union procent {
54 	proc_t *pe_proc;
55 	union procent *pe_next;
56 };
57 
58 struct pid pid0 = {
59 	0,		/* pid_prinactive */
60 	1,		/* pid_pgorphaned */
61 	0,		/* pid_padding	*/
62 	0,		/* pid_prslot	*/
63 	0,		/* pid_id	*/
64 	NULL,		/* pid_pglink	*/
65 	NULL,		/* pid_link	*/
66 	3		/* pid_ref	*/
67 };
68 
/* Desired average length of a pid hash chain. */
static int pid_hashlen = 4;

/*
 * Number of buckets in the pid hash table.  pid_init() sets this to a
 * power of two, which is what allows HASHPID() to select a bucket by
 * masking.
 */
static int pid_hashsz;

/* Head of the hash chain that 'pid' belongs to (usable as an lvalue). */
#define	HASHPID(pid)	(pidhash[(pid) & (pid_hashsz - 1)])
73 
74 extern uint_t nproc;
75 extern struct kmem_cache *process_cache;
76 static void	upcount_init(void);
77 
78 kmutex_t	pidlock;	/* global process lock */
79 kmutex_t	pr_pidlock;	/* /proc global process lock */
80 kcondvar_t	*pr_pid_cv;	/* for /proc, one per process slot */
81 struct plock	*proc_lock;	/* persistent array of p_lock's */
82 
/*
 * Shift count used by pid_getlockslot() to spread process slots over the
 * proc_lock array; the comment above pid_getlockslot() explains the scheme
 * in detail.  A PLOCK_SHIFT of 3 implies 64-byte coherence granularity.
 * Should the coherence granularity ever change, this constant ought to be
 * adjusted to match so that proc_lock false sharing stays minimal.
 * Correctness, however, is guaranteed regardless of the coherence
 * granularity.
 */
#define	PLOCK_SHIFT	3
92 
93 static kmutex_t	pidlinklock;
94 static struct pid **pidhash;
95 static pid_t minpid;
96 static pid_t mpid;
97 static union procent *procdir;
98 static union procent *procentfree;
99 
100 static struct pid *
101 pid_lookup(pid_t pid)
102 {
103 	struct pid *pidp;
104 
105 	ASSERT(MUTEX_HELD(&pidlinklock));
106 
107 	for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) {
108 		if (pidp->pid_id == pid) {
109 			ASSERT(pidp->pid_ref > 0);
110 			break;
111 		}
112 	}
113 	return (pidp);
114 }
115 
116 void
117 pid_setmin(void)
118 {
119 	if (jump_pid && jump_pid > mpid)
120 		minpid = mpid = jump_pid;
121 	else
122 		minpid = mpid + 1;
123 }
124 
125 /*
126  * When prslots are simply used as an index to determine a process' p_lock,
127  * adjacent prslots share adjacent p_locks.  On machines where the size
128  * of a mutex is smaller than that of a cache line (which, as of this writing,
129  * is true for all machines on which Solaris runs), this can potentially
130  * induce false sharing.  The standard solution for false sharing is to pad
131  * out one's data structures (in this case, struct plock).  However,
132  * given the size and (generally) sparse use of the proc_lock array, this
133  * is suboptimal.  We therefore stride through the proc_lock array with
134  * a stride of PLOCK_SHIFT.  PLOCK_SHIFT should be defined as:
135  *
136  *   log_2 (coherence_granularity / sizeof (kmutex_t))
137  *
138  * Under this scheme, false sharing is still possible -- but only when
139  * the number of active processes is very large.  Note that the one-to-one
140  * mapping between prslots and lockslots is maintained.
141  */
142 static int
143 pid_getlockslot(int prslot)
144 {
145 	int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT;
146 	int perlap = even >> PLOCK_SHIFT;
147 
148 	if (prslot >= even)
149 		return (prslot);
150 
151 	return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap));
152 }
153 
154 /*
155  * This function assigns a pid for use in a fork request.  It allocates
156  * a pid structure, tries to find an empty slot in the proc table,
157  * and selects the process id.
158  *
159  * pid_assign() returns the new pid on success, -1 on failure.
160  */
161 pid_t
162 pid_assign(proc_t *prp)
163 {
164 	struct pid *pidp;
165 	union procent *pep;
166 	pid_t newpid, startpid;
167 
168 	pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP);
169 
170 	mutex_enter(&pidlinklock);
171 	if ((pep = procentfree) == NULL) {
172 		/*
173 		 * ran out of /proc directory entries
174 		 */
175 		goto failed;
176 	}
177 
178 	/*
179 	 * Allocate a pid
180 	 */
181 	startpid = mpid;
182 	do  {
183 		newpid = (++mpid == maxpid ? mpid = minpid : mpid);
184 	} while (pid_lookup(newpid) && newpid != startpid);
185 
186 	if (newpid == startpid && pid_lookup(newpid)) {
187 		/* couldn't find a free pid */
188 		goto failed;
189 	}
190 
191 	procentfree = pep->pe_next;
192 	pep->pe_proc = prp;
193 	prp->p_pidp = pidp;
194 
195 	/*
196 	 * Put pid into the pid hash table.
197 	 */
198 	pidp->pid_link = HASHPID(newpid);
199 	HASHPID(newpid) = pidp;
200 	pidp->pid_ref = 1;
201 	pidp->pid_id = newpid;
202 	pidp->pid_prslot = pep - procdir;
203 	prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)];
204 	mutex_exit(&pidlinklock);
205 
206 	return (newpid);
207 
208 failed:
209 	mutex_exit(&pidlinklock);
210 	kmem_free(pidp, sizeof (struct pid));
211 	return (-1);
212 }
213 
214 /*
215  * decrement the reference count for pid
216  */
217 int
218 pid_rele(struct pid *pidp)
219 {
220 	struct pid **pidpp;
221 
222 	mutex_enter(&pidlinklock);
223 	ASSERT(pidp != &pid0);
224 
225 	pidpp = &HASHPID(pidp->pid_id);
226 	for (;;) {
227 		ASSERT(*pidpp != NULL);
228 		if (*pidpp == pidp)
229 			break;
230 		pidpp = &(*pidpp)->pid_link;
231 	}
232 
233 	*pidpp = pidp->pid_link;
234 	mutex_exit(&pidlinklock);
235 
236 	kmem_free(pidp, sizeof (*pidp));
237 	return (0);
238 }
239 
240 void
241 proc_entry_free(struct pid *pidp)
242 {
243 	mutex_enter(&pidlinklock);
244 	pidp->pid_prinactive = 1;
245 	procdir[pidp->pid_prslot].pe_next = procentfree;
246 	procentfree = &procdir[pidp->pid_prslot];
247 	mutex_exit(&pidlinklock);
248 }
249 
250 void
251 pid_exit(proc_t *prp)
252 {
253 	struct pid *pidp;
254 
255 	ASSERT(MUTEX_HELD(&pidlock));
256 
257 	/*
258 	 * Exit process group.  If it is NULL, it's because fork failed
259 	 * before calling pgjoin().
260 	 */
261 	ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL);
262 	if (prp->p_pgidp != NULL)
263 		pgexit(prp);
264 
265 	SESS_RELE(prp->p_sessp);
266 
267 	pidp = prp->p_pidp;
268 
269 	proc_entry_free(pidp);
270 
271 #ifdef C2_AUDIT
272 	if (audit_active)
273 		audit_pfree(prp);
274 #endif
275 
276 	if (practive == prp) {
277 		practive = prp->p_next;
278 	}
279 
280 	if (prp->p_next) {
281 		prp->p_next->p_prev = prp->p_prev;
282 	}
283 	if (prp->p_prev) {
284 		prp->p_prev->p_next = prp->p_next;
285 	}
286 
287 	PID_RELE(pidp);
288 
289 	mutex_destroy(&prp->p_crlock);
290 	kmem_cache_free(process_cache, prp);
291 	nproc--;
292 }
293 
294 /*
295  * Find a process visible from the specified zone given its process ID.
296  */
297 proc_t *
298 prfind_zone(pid_t pid, zoneid_t zoneid)
299 {
300 	struct pid *pidp;
301 	proc_t *p;
302 
303 	ASSERT(MUTEX_HELD(&pidlock));
304 
305 	mutex_enter(&pidlinklock);
306 	pidp = pid_lookup(pid);
307 	mutex_exit(&pidlinklock);
308 	if (pidp != NULL && pidp->pid_prinactive == 0) {
309 		p = procdir[pidp->pid_prslot].pe_proc;
310 		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid)
311 			return (p);
312 	}
313 	return (NULL);
314 }
315 
316 /*
317  * Find a process given its process ID.  This obeys zone restrictions,
318  * so if the caller is in a non-global zone it won't find processes
319  * associated with other zones.  Use prfind_zone(pid, ALL_ZONES) to
320  * bypass this restriction.
321  */
322 proc_t *
323 prfind(pid_t pid)
324 {
325 	zoneid_t zoneid;
326 
327 	if (INGLOBALZONE(curproc))
328 		zoneid = ALL_ZONES;
329 	else
330 		zoneid = getzoneid();
331 	return (prfind_zone(pid, zoneid));
332 }
333 
334 proc_t *
335 pgfind_zone(pid_t pgid, zoneid_t zoneid)
336 {
337 	struct pid *pidp;
338 
339 	ASSERT(MUTEX_HELD(&pidlock));
340 
341 	mutex_enter(&pidlinklock);
342 	pidp = pid_lookup(pgid);
343 	mutex_exit(&pidlinklock);
344 	if (pidp != NULL) {
345 		proc_t *p = pidp->pid_pglink;
346 
347 		if (zoneid == ALL_ZONES || pgid == 0 || p == NULL ||
348 		    p->p_zone->zone_id == zoneid)
349 			return (p);
350 	}
351 	return (NULL);
352 }
353 
354 /*
355  * return the head of the list of processes whose process group ID is 'pgid',
356  * or NULL, if no such process group
357  */
358 proc_t *
359 pgfind(pid_t pgid)
360 {
361 	zoneid_t zoneid;
362 
363 	if (INGLOBALZONE(curproc))
364 		zoneid = ALL_ZONES;
365 	else
366 		zoneid = getzoneid();
367 	return (pgfind_zone(pgid, zoneid));
368 }
369 
370 /*
371  * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK.
372  * Returns the proc pointer on success, NULL on failure.  sprlock() is
373  * really just a stripped-down version of pr_p_lock() to allow practive
374  * walkers like dofusers() and dumpsys() to synchronize with /proc.
375  */
376 proc_t *
377 sprlock_zone(pid_t pid, zoneid_t zoneid)
378 {
379 	proc_t *p;
380 	kmutex_t *mp;
381 
382 	for (;;) {
383 		mutex_enter(&pidlock);
384 		if ((p = prfind_zone(pid, zoneid)) == NULL) {
385 			mutex_exit(&pidlock);
386 			return (NULL);
387 		}
388 		/*
389 		 * p_lock is persistent, but p itself is not -- it could
390 		 * vanish during cv_wait().  Load p->p_lock now so we can
391 		 * drop it after cv_wait() without referencing p.
392 		 */
393 		mp = &p->p_lock;
394 		mutex_enter(mp);
395 		mutex_exit(&pidlock);
396 		/*
397 		 * If the process is in some half-baked state, fail.
398 		 */
399 		if (p->p_stat == SZOMB || p->p_stat == SIDL ||
400 		    p->p_tlist == NULL || (p->p_flag & SEXITLWPS)) {
401 			mutex_exit(mp);
402 			return (NULL);
403 		}
404 		if (panicstr)
405 			return (p);
406 		if (!(p->p_proc_flag & P_PR_LOCK))
407 			break;
408 		cv_wait(&pr_pid_cv[p->p_slot], mp);
409 		mutex_exit(mp);
410 	}
411 	p->p_proc_flag |= P_PR_LOCK;
412 	THREAD_KPRI_REQUEST();
413 	return (p);
414 }
415 
416 proc_t *
417 sprlock(pid_t pid)
418 {
419 	zoneid_t zoneid;
420 
421 	if (INGLOBALZONE(curproc))
422 		zoneid = ALL_ZONES;
423 	else
424 		zoneid = getzoneid();
425 	return (sprlock_zone(pid, zoneid));
426 }
427 
428 void
429 sprlock_proc(proc_t *p)
430 {
431 	ASSERT(MUTEX_HELD(&p->p_lock));
432 
433 	while (p->p_proc_flag & P_PR_LOCK) {
434 		cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock);
435 	}
436 
437 	p->p_proc_flag |= P_PR_LOCK;
438 	THREAD_KPRI_REQUEST();
439 }
440 
441 void
442 sprunlock(proc_t *p)
443 {
444 	if (panicstr) {
445 		mutex_exit(&p->p_lock);
446 		return;
447 	}
448 
449 	ASSERT(p->p_proc_flag & P_PR_LOCK);
450 	ASSERT(MUTEX_HELD(&p->p_lock));
451 
452 	cv_signal(&pr_pid_cv[p->p_slot]);
453 	p->p_proc_flag &= ~P_PR_LOCK;
454 	mutex_exit(&p->p_lock);
455 	THREAD_KPRI_RELEASE();
456 }
457 
458 void
459 pid_init(void)
460 {
461 	int i;
462 
463 	pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen);
464 
465 	pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP);
466 	procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP);
467 	pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP);
468 	proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP);
469 
470 	nproc = 1;
471 	practive = proc_sched;
472 	proc_sched->p_next = NULL;
473 	procdir[0].pe_proc = proc_sched;
474 
475 	procentfree = &procdir[1];
476 	for (i = 1; i < v.v_proc - 1; i++)
477 		procdir[i].pe_next = &procdir[i+1];
478 	procdir[i].pe_next = NULL;
479 
480 	HASHPID(0) = &pid0;
481 
482 	upcount_init();
483 }
484 
485 proc_t *
486 pid_entry(int slot)
487 {
488 	union procent *pep;
489 	proc_t *prp;
490 
491 	ASSERT(MUTEX_HELD(&pidlock));
492 	ASSERT(slot >= 0 && slot < v.v_proc);
493 
494 	pep = procdir[slot].pe_next;
495 	if (pep >= procdir && pep < &procdir[v.v_proc])
496 		return (NULL);
497 	prp = procdir[slot].pe_proc;
498 	if (prp != 0 && prp->p_stat == SIDL)
499 		return (NULL);
500 	return (prp);
501 }
502 
503 /*
504  * Send the specified signal to all processes whose process group ID is
505  * equal to 'pgid'
506  */
507 
508 void
509 signal(pid_t pgid, int sig)
510 {
511 	struct pid *pidp;
512 	proc_t *prp;
513 
514 	mutex_enter(&pidlock);
515 	mutex_enter(&pidlinklock);
516 	if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) {
517 		mutex_exit(&pidlinklock);
518 		mutex_exit(&pidlock);
519 		return;
520 	}
521 	mutex_exit(&pidlinklock);
522 	for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) {
523 		mutex_enter(&prp->p_lock);
524 		sigtoproc(prp, NULL, sig);
525 		mutex_exit(&prp->p_lock);
526 	}
527 	mutex_exit(&pidlock);
528 }
529 
530 /*
531  * Send the specified signal to the specified process
532  */
533 
534 void
535 prsignal(struct pid *pidp, int sig)
536 {
537 	if (!(pidp->pid_prinactive))
538 		psignal(procdir[pidp->pid_prslot].pe_proc, sig);
539 }
540 
541 #include <sys/sunddi.h>
542 
543 /*
544  * DDI/DKI interfaces for drivers to send signals to processes
545  */
546 
547 /*
548  * obtain an opaque reference to a process for signaling
549  */
550 void *
551 proc_ref(void)
552 {
553 	struct pid *pidp;
554 
555 	mutex_enter(&pidlock);
556 	pidp = curproc->p_pidp;
557 	PID_HOLD(pidp);
558 	mutex_exit(&pidlock);
559 
560 	return (pidp);
561 }
562 
563 /*
564  * release a reference to a process
565  * - a process can exit even if a driver has a reference to it
566  * - one proc_unref for every proc_ref
567  */
568 void
569 proc_unref(void *pref)
570 {
571 	mutex_enter(&pidlock);
572 	PID_RELE((struct pid *)pref);
573 	mutex_exit(&pidlock);
574 }
575 
576 /*
577  * send a signal to a process
578  *
579  * - send the process the signal
580  * - if the process went away, return a -1
581  * - if the process is still there return 0
582  */
583 int
584 proc_signal(void *pref, int sig)
585 {
586 	struct pid *pidp = pref;
587 
588 	prsignal(pidp, sig);
589 	return (pidp->pid_prinactive ? -1 : 0);
590 }
591 
592 
593 static struct upcount	**upc_hash;	/* a boot time allocated array */
594 static ulong_t		upc_hashmask;
595 #define	UPC_HASH(x, y)	((ulong_t)(x ^ y) & upc_hashmask)
596 
597 /*
598  * Get us off the ground.  Called once at boot.
599  */
600 void
601 upcount_init(void)
602 {
603 	ulong_t	upc_hashsize;
604 
605 	/*
606 	 * An entry per MB of memory is our current guess
607 	 */
608 	/*
609 	 * 2^20 is a meg, so shifting right by 20 - PAGESHIFT
610 	 * converts pages to megs (without overflowing a u_int
611 	 * if you have more than 4G of memory, like ptob(physmem)/1M
612 	 * would).
613 	 */
614 	upc_hashsize = (1 << highbit(physmem >> (20 - PAGESHIFT)));
615 	upc_hashmask = upc_hashsize - 1;
616 	upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *),
617 	    KM_SLEEP);
618 }
619 
620 /*
621  * Increment the number of processes associated with a given uid and zoneid.
622  */
623 void
624 upcount_inc(uid_t uid, zoneid_t zoneid)
625 {
626 	struct upcount	**upc, **hupc;
627 	struct upcount	*new;
628 
629 	ASSERT(MUTEX_HELD(&pidlock));
630 	new = NULL;
631 	hupc = &upc_hash[UPC_HASH(uid, zoneid)];
632 top:
633 	upc = hupc;
634 	while ((*upc) != NULL) {
635 		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
636 			(*upc)->up_count++;
637 			if (new) {
638 				/*
639 				 * did not need `new' afterall.
640 				 */
641 				kmem_free(new, sizeof (*new));
642 			}
643 			return;
644 		}
645 		upc = &(*upc)->up_next;
646 	}
647 
648 	/*
649 	 * There is no entry for this <uid,zoneid> pair.
650 	 * Allocate one.  If we have to drop pidlock, check
651 	 * again.
652 	 */
653 	if (new == NULL) {
654 		new = (struct upcount *)kmem_alloc(sizeof (*new), KM_NOSLEEP);
655 		if (new == NULL) {
656 			mutex_exit(&pidlock);
657 			new = (struct upcount *)kmem_alloc(sizeof (*new),
658 			    KM_SLEEP);
659 			mutex_enter(&pidlock);
660 			goto top;
661 		}
662 	}
663 
664 
665 	/*
666 	 * On the assumption that a new user is going to do some
667 	 * more forks, put the new upcount structure on the front.
668 	 */
669 	upc = hupc;
670 
671 	new->up_uid = uid;
672 	new->up_zoneid = zoneid;
673 	new->up_count = 1;
674 	new->up_next = *upc;
675 
676 	*upc = new;
677 }
678 
679 /*
680  * Decrement the number of processes a given uid and zoneid has.
681  */
682 void
683 upcount_dec(uid_t uid, zoneid_t zoneid)
684 {
685 	struct	upcount **upc;
686 	struct	upcount *done;
687 
688 	ASSERT(MUTEX_HELD(&pidlock));
689 
690 	upc = &upc_hash[UPC_HASH(uid, zoneid)];
691 	while ((*upc) != NULL) {
692 		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
693 			(*upc)->up_count--;
694 			if ((*upc)->up_count == 0) {
695 				done = *upc;
696 				*upc = (*upc)->up_next;
697 				kmem_free(done, sizeof (*done));
698 			}
699 			return;
700 		}
701 		upc = &(*upc)->up_next;
702 	}
703 	cmn_err(CE_PANIC, "decr_upcount-off the end");
704 }
705 
706 /*
707  * Returns the number of processes a uid has.
708  * Non-existent uid's are assumed to have no processes.
709  */
710 int
711 upcount_get(uid_t uid, zoneid_t zoneid)
712 {
713 	struct	upcount *upc;
714 
715 	ASSERT(MUTEX_HELD(&pidlock));
716 
717 	upc = upc_hash[UPC_HASH(uid, zoneid)];
718 	while (upc != NULL) {
719 		if (upc->up_uid == uid && upc->up_zoneid == zoneid) {
720 			return (upc->up_count);
721 		}
722 		upc = upc->up_next;
723 	}
724 	return (0);
725 }
726