xref: /titanic_44/usr/src/uts/common/os/pid.c (revision 18c2aff776a775d34a4c9893a4c72e0434d68e36)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/types.h>
34 #include <sys/param.h>
35 #include <sys/sysmacros.h>
36 #include <sys/proc.h>
37 #include <sys/kmem.h>
38 #include <sys/tuneable.h>
39 #include <sys/var.h>
40 #include <sys/cred.h>
41 #include <sys/systm.h>
42 #include <sys/prsystm.h>
43 #include <sys/vnode.h>
44 #include <sys/session.h>
45 #include <sys/cpuvar.h>
46 #include <sys/cmn_err.h>
47 #include <sys/bitmap.h>
48 #include <sys/debug.h>
49 #include <c2/audit.h>
50 #include <sys/zone.h>
51 
52 /* directory entries for /proc */
53 union procent {
54 	proc_t *pe_proc;
55 	union procent *pe_next;
56 };
57 
58 struct pid pid0 = {
59 	0,		/* pid_prinactive */
60 	1,		/* pid_pgorphaned */
61 	0,		/* pid_padding	*/
62 	0,		/* pid_prslot	*/
63 	0,		/* pid_id	*/
64 	NULL,		/* pid_pglink	*/
65 	NULL,		/* pid_pgtail	*/
66 	NULL,		/* pid_link	*/
67 	3		/* pid_ref	*/
68 };
69 
/*
 * Sizing of the pid hash table.  pid_init() derives pid_hashsz from
 * pid_hashlen as a power of two, which is what allows HASHPID() to pick
 * a bucket with a simple mask.
 */
static int pid_hashsz;		/* number of buckets in the hash table */
static int pid_hashlen = 4;	/* desired average hash chain length */

/* Head of the hash chain (usable as an lvalue) that 'pid' belongs on. */
#define	HASHPID(pid)	(pidhash[(pid) & (pid_hashsz - 1)])
74 
75 extern uint_t nproc;
76 extern struct kmem_cache *process_cache;
77 static void	upcount_init(void);
78 
79 kmutex_t	pidlock;	/* global process lock */
80 kmutex_t	pr_pidlock;	/* /proc global process lock */
81 kcondvar_t	*pr_pid_cv;	/* for /proc, one per process slot */
82 struct plock	*proc_lock;	/* persistent array of p_lock's */
83 
/*
 * log2 of the stride that pid_getlockslot() uses when it spreads prslots
 * across the proc_lock array; see the comment above pid_getlockslot() for
 * the detailed explanation.  A PLOCK_SHIFT of 3 implies 64-byte coherence
 * granularity.  If the coherence granularity is ever changed, this
 * constant should be updated to match so that proc_lock false sharing
 * stays minimal (correctness, however, is guaranteed regardless of the
 * coherence granularity).
 */
#define	PLOCK_SHIFT	3
93 
94 static kmutex_t	pidlinklock;
95 static struct pid **pidhash;
96 static pid_t minpid;
97 static pid_t mpid;
98 static union procent *procdir;
99 static union procent *procentfree;
100 
101 static struct pid *
102 pid_lookup(pid_t pid)
103 {
104 	struct pid *pidp;
105 
106 	ASSERT(MUTEX_HELD(&pidlinklock));
107 
108 	for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) {
109 		if (pidp->pid_id == pid) {
110 			ASSERT(pidp->pid_ref > 0);
111 			break;
112 		}
113 	}
114 	return (pidp);
115 }
116 
117 struct pid *
118 pid_find(pid_t pid)
119 {
120 	struct pid *pidp;
121 
122 	mutex_enter(&pidlinklock);
123 	pidp = pid_lookup(pid);
124 	mutex_exit(&pidlinklock);
125 
126 	return (pidp);
127 }
128 
129 void
130 pid_setmin(void)
131 {
132 	if (jump_pid && jump_pid > mpid)
133 		minpid = mpid = jump_pid;
134 	else
135 		minpid = mpid + 1;
136 }
137 
138 /*
139  * When prslots are simply used as an index to determine a process' p_lock,
140  * adjacent prslots share adjacent p_locks.  On machines where the size
141  * of a mutex is smaller than that of a cache line (which, as of this writing,
142  * is true for all machines on which Solaris runs), this can potentially
143  * induce false sharing.  The standard solution for false sharing is to pad
144  * out one's data structures (in this case, struct plock).  However,
145  * given the size and (generally) sparse use of the proc_lock array, this
146  * is suboptimal.  We therefore stride through the proc_lock array with
147  * a stride of PLOCK_SHIFT.  PLOCK_SHIFT should be defined as:
148  *
149  *   log_2 (coherence_granularity / sizeof (kmutex_t))
150  *
151  * Under this scheme, false sharing is still possible -- but only when
152  * the number of active processes is very large.  Note that the one-to-one
153  * mapping between prslots and lockslots is maintained.
154  */
155 static int
156 pid_getlockslot(int prslot)
157 {
158 	int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT;
159 	int perlap = even >> PLOCK_SHIFT;
160 
161 	if (prslot >= even)
162 		return (prslot);
163 
164 	return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap));
165 }
166 
167 /*
168  * This function allocates a pid structure, a free pid, and optionally a
169  * slot in the proc table for it.
170  *
171  * pid_allocate() returns the new pid on success, -1 on failure.
172  */
173 pid_t
174 pid_allocate(proc_t *prp, int flags)
175 {
176 	struct pid *pidp;
177 	union procent *pep;
178 	pid_t newpid, startpid;
179 
180 	pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP);
181 
182 	mutex_enter(&pidlinklock);
183 	if ((flags & PID_ALLOC_PROC) && (pep = procentfree) == NULL) {
184 		/*
185 		 * ran out of /proc directory entries
186 		 */
187 		goto failed;
188 	}
189 
190 	/*
191 	 * Allocate a pid
192 	 */
193 	startpid = mpid;
194 	do  {
195 		newpid = (++mpid == maxpid ? mpid = minpid : mpid);
196 	} while (pid_lookup(newpid) && newpid != startpid);
197 
198 	if (newpid == startpid && pid_lookup(newpid)) {
199 		/* couldn't find a free pid */
200 		goto failed;
201 	}
202 
203 	/*
204 	 * Put pid into the pid hash table.
205 	 */
206 	pidp->pid_link = HASHPID(newpid);
207 	HASHPID(newpid) = pidp;
208 	pidp->pid_ref = 1;
209 	pidp->pid_id = newpid;
210 
211 	if (flags & PID_ALLOC_PROC) {
212 		procentfree = pep->pe_next;
213 		pidp->pid_prslot = pep - procdir;
214 		pep->pe_proc = prp;
215 		prp->p_pidp = pidp;
216 		prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)];
217 	} else {
218 		pidp->pid_prslot = 0;
219 	}
220 
221 	mutex_exit(&pidlinklock);
222 
223 	return (newpid);
224 
225 failed:
226 	mutex_exit(&pidlinklock);
227 	kmem_free(pidp, sizeof (struct pid));
228 	return (-1);
229 }
230 
231 /*
232  * decrement the reference count for pid
233  */
234 int
235 pid_rele(struct pid *pidp)
236 {
237 	struct pid **pidpp;
238 
239 	mutex_enter(&pidlinklock);
240 	ASSERT(pidp != &pid0);
241 
242 	pidpp = &HASHPID(pidp->pid_id);
243 	for (;;) {
244 		ASSERT(*pidpp != NULL);
245 		if (*pidpp == pidp)
246 			break;
247 		pidpp = &(*pidpp)->pid_link;
248 	}
249 
250 	*pidpp = pidp->pid_link;
251 	mutex_exit(&pidlinklock);
252 
253 	kmem_free(pidp, sizeof (*pidp));
254 	return (0);
255 }
256 
257 void
258 proc_entry_free(struct pid *pidp)
259 {
260 	mutex_enter(&pidlinklock);
261 	pidp->pid_prinactive = 1;
262 	procdir[pidp->pid_prslot].pe_next = procentfree;
263 	procentfree = &procdir[pidp->pid_prslot];
264 	mutex_exit(&pidlinklock);
265 }
266 
267 void
268 pid_exit(proc_t *prp)
269 {
270 	struct pid *pidp;
271 
272 	ASSERT(MUTEX_HELD(&pidlock));
273 
274 	/*
275 	 * Exit process group.  If it is NULL, it's because fork failed
276 	 * before calling pgjoin().
277 	 */
278 	ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL);
279 	if (prp->p_pgidp != NULL)
280 		pgexit(prp);
281 
282 	sess_rele(prp->p_sessp, B_TRUE);
283 
284 	pidp = prp->p_pidp;
285 
286 	proc_entry_free(pidp);
287 
288 #ifdef C2_AUDIT
289 	if (audit_active)
290 		audit_pfree(prp);
291 #endif
292 
293 	if (practive == prp) {
294 		practive = prp->p_next;
295 	}
296 
297 	if (prp->p_next) {
298 		prp->p_next->p_prev = prp->p_prev;
299 	}
300 	if (prp->p_prev) {
301 		prp->p_prev->p_next = prp->p_next;
302 	}
303 
304 	PID_RELE(pidp);
305 
306 	mutex_destroy(&prp->p_crlock);
307 	kmem_cache_free(process_cache, prp);
308 	nproc--;
309 }
310 
311 /*
312  * Find a process visible from the specified zone given its process ID.
313  */
314 proc_t *
315 prfind_zone(pid_t pid, zoneid_t zoneid)
316 {
317 	struct pid *pidp;
318 	proc_t *p;
319 
320 	ASSERT(MUTEX_HELD(&pidlock));
321 
322 	mutex_enter(&pidlinklock);
323 	pidp = pid_lookup(pid);
324 	mutex_exit(&pidlinklock);
325 	if (pidp != NULL && pidp->pid_prinactive == 0) {
326 		p = procdir[pidp->pid_prslot].pe_proc;
327 		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid)
328 			return (p);
329 	}
330 	return (NULL);
331 }
332 
333 /*
334  * Find a process given its process ID.  This obeys zone restrictions,
335  * so if the caller is in a non-global zone it won't find processes
336  * associated with other zones.  Use prfind_zone(pid, ALL_ZONES) to
337  * bypass this restriction.
338  */
339 proc_t *
340 prfind(pid_t pid)
341 {
342 	zoneid_t zoneid;
343 
344 	if (INGLOBALZONE(curproc))
345 		zoneid = ALL_ZONES;
346 	else
347 		zoneid = getzoneid();
348 	return (prfind_zone(pid, zoneid));
349 }
350 
351 proc_t *
352 pgfind_zone(pid_t pgid, zoneid_t zoneid)
353 {
354 	struct pid *pidp;
355 
356 	ASSERT(MUTEX_HELD(&pidlock));
357 
358 	mutex_enter(&pidlinklock);
359 	pidp = pid_lookup(pgid);
360 	mutex_exit(&pidlinklock);
361 	if (pidp != NULL) {
362 		proc_t *p = pidp->pid_pglink;
363 
364 		if (zoneid == ALL_ZONES || pgid == 0 || p == NULL ||
365 		    p->p_zone->zone_id == zoneid)
366 			return (p);
367 	}
368 	return (NULL);
369 }
370 
371 /*
372  * return the head of the list of processes whose process group ID is 'pgid',
373  * or NULL, if no such process group
374  */
375 proc_t *
376 pgfind(pid_t pgid)
377 {
378 	zoneid_t zoneid;
379 
380 	if (INGLOBALZONE(curproc))
381 		zoneid = ALL_ZONES;
382 	else
383 		zoneid = getzoneid();
384 	return (pgfind_zone(pgid, zoneid));
385 }
386 
387 /*
388  * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK.
389  * Returns the proc pointer on success, NULL on failure.  sprlock() is
390  * really just a stripped-down version of pr_p_lock() to allow practive
391  * walkers like dofusers() and dumpsys() to synchronize with /proc.
392  */
393 proc_t *
394 sprlock_zone(pid_t pid, zoneid_t zoneid)
395 {
396 	proc_t *p;
397 	kmutex_t *mp;
398 
399 	for (;;) {
400 		mutex_enter(&pidlock);
401 		if ((p = prfind_zone(pid, zoneid)) == NULL) {
402 			mutex_exit(&pidlock);
403 			return (NULL);
404 		}
405 		/*
406 		 * p_lock is persistent, but p itself is not -- it could
407 		 * vanish during cv_wait().  Load p->p_lock now so we can
408 		 * drop it after cv_wait() without referencing p.
409 		 */
410 		mp = &p->p_lock;
411 		mutex_enter(mp);
412 		mutex_exit(&pidlock);
413 		/*
414 		 * If the process is in some half-baked state, fail.
415 		 */
416 		if (p->p_stat == SZOMB || p->p_stat == SIDL ||
417 		    (p->p_flag & (SEXITING | SEXITLWPS))) {
418 			mutex_exit(mp);
419 			return (NULL);
420 		}
421 		if (panicstr)
422 			return (p);
423 		if (!(p->p_proc_flag & P_PR_LOCK))
424 			break;
425 		cv_wait(&pr_pid_cv[p->p_slot], mp);
426 		mutex_exit(mp);
427 	}
428 	p->p_proc_flag |= P_PR_LOCK;
429 	THREAD_KPRI_REQUEST();
430 	return (p);
431 }
432 
433 proc_t *
434 sprlock(pid_t pid)
435 {
436 	zoneid_t zoneid;
437 
438 	if (INGLOBALZONE(curproc))
439 		zoneid = ALL_ZONES;
440 	else
441 		zoneid = getzoneid();
442 	return (sprlock_zone(pid, zoneid));
443 }
444 
445 void
446 sprlock_proc(proc_t *p)
447 {
448 	ASSERT(MUTEX_HELD(&p->p_lock));
449 
450 	while (p->p_proc_flag & P_PR_LOCK) {
451 		cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock);
452 	}
453 
454 	p->p_proc_flag |= P_PR_LOCK;
455 	THREAD_KPRI_REQUEST();
456 }
457 
458 void
459 sprunlock(proc_t *p)
460 {
461 	if (panicstr) {
462 		mutex_exit(&p->p_lock);
463 		return;
464 	}
465 
466 	ASSERT(p->p_proc_flag & P_PR_LOCK);
467 	ASSERT(MUTEX_HELD(&p->p_lock));
468 
469 	cv_signal(&pr_pid_cv[p->p_slot]);
470 	p->p_proc_flag &= ~P_PR_LOCK;
471 	mutex_exit(&p->p_lock);
472 	THREAD_KPRI_RELEASE();
473 }
474 
475 void
476 pid_init(void)
477 {
478 	int i;
479 
480 	pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen);
481 
482 	pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP);
483 	procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP);
484 	pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP);
485 	proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP);
486 
487 	nproc = 1;
488 	practive = proc_sched;
489 	proc_sched->p_next = NULL;
490 	procdir[0].pe_proc = proc_sched;
491 
492 	procentfree = &procdir[1];
493 	for (i = 1; i < v.v_proc - 1; i++)
494 		procdir[i].pe_next = &procdir[i+1];
495 	procdir[i].pe_next = NULL;
496 
497 	HASHPID(0) = &pid0;
498 
499 	upcount_init();
500 }
501 
502 proc_t *
503 pid_entry(int slot)
504 {
505 	union procent *pep;
506 	proc_t *prp;
507 
508 	ASSERT(MUTEX_HELD(&pidlock));
509 	ASSERT(slot >= 0 && slot < v.v_proc);
510 
511 	pep = procdir[slot].pe_next;
512 	if (pep >= procdir && pep < &procdir[v.v_proc])
513 		return (NULL);
514 	prp = procdir[slot].pe_proc;
515 	if (prp != 0 && prp->p_stat == SIDL)
516 		return (NULL);
517 	return (prp);
518 }
519 
520 /*
521  * Send the specified signal to all processes whose process group ID is
522  * equal to 'pgid'
523  */
524 
525 void
526 signal(pid_t pgid, int sig)
527 {
528 	struct pid *pidp;
529 	proc_t *prp;
530 
531 	mutex_enter(&pidlock);
532 	mutex_enter(&pidlinklock);
533 	if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) {
534 		mutex_exit(&pidlinklock);
535 		mutex_exit(&pidlock);
536 		return;
537 	}
538 	mutex_exit(&pidlinklock);
539 	for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) {
540 		mutex_enter(&prp->p_lock);
541 		sigtoproc(prp, NULL, sig);
542 		mutex_exit(&prp->p_lock);
543 	}
544 	mutex_exit(&pidlock);
545 }
546 
547 /*
548  * Send the specified signal to the specified process
549  */
550 
551 void
552 prsignal(struct pid *pidp, int sig)
553 {
554 	if (!(pidp->pid_prinactive))
555 		psignal(procdir[pidp->pid_prslot].pe_proc, sig);
556 }
557 
558 #include <sys/sunddi.h>
559 
560 /*
561  * DDI/DKI interfaces for drivers to send signals to processes
562  */
563 
564 /*
565  * obtain an opaque reference to a process for signaling
566  */
567 void *
568 proc_ref(void)
569 {
570 	struct pid *pidp;
571 
572 	mutex_enter(&pidlock);
573 	pidp = curproc->p_pidp;
574 	PID_HOLD(pidp);
575 	mutex_exit(&pidlock);
576 
577 	return (pidp);
578 }
579 
580 /*
581  * release a reference to a process
582  * - a process can exit even if a driver has a reference to it
583  * - one proc_unref for every proc_ref
584  */
585 void
586 proc_unref(void *pref)
587 {
588 	mutex_enter(&pidlock);
589 	PID_RELE((struct pid *)pref);
590 	mutex_exit(&pidlock);
591 }
592 
593 /*
594  * send a signal to a process
595  *
596  * - send the process the signal
597  * - if the process went away, return a -1
598  * - if the process is still there return 0
599  */
600 int
601 proc_signal(void *pref, int sig)
602 {
603 	struct pid *pidp = pref;
604 
605 	prsignal(pidp, sig);
606 	return (pidp->pid_prinactive ? -1 : 0);
607 }
608 
609 
610 static struct upcount	**upc_hash;	/* a boot time allocated array */
611 static ulong_t		upc_hashmask;
612 #define	UPC_HASH(x, y)	((ulong_t)(x ^ y) & upc_hashmask)
613 
614 /*
615  * Get us off the ground.  Called once at boot.
616  */
617 void
618 upcount_init(void)
619 {
620 	ulong_t	upc_hashsize;
621 
622 	/*
623 	 * An entry per MB of memory is our current guess
624 	 */
625 	/*
626 	 * 2^20 is a meg, so shifting right by 20 - PAGESHIFT
627 	 * converts pages to megs (without overflowing a u_int
628 	 * if you have more than 4G of memory, like ptob(physmem)/1M
629 	 * would).
630 	 */
631 	upc_hashsize = (1 << highbit(physmem >> (20 - PAGESHIFT)));
632 	upc_hashmask = upc_hashsize - 1;
633 	upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *),
634 	    KM_SLEEP);
635 }
636 
637 /*
638  * Increment the number of processes associated with a given uid and zoneid.
639  */
640 void
641 upcount_inc(uid_t uid, zoneid_t zoneid)
642 {
643 	struct upcount	**upc, **hupc;
644 	struct upcount	*new;
645 
646 	ASSERT(MUTEX_HELD(&pidlock));
647 	new = NULL;
648 	hupc = &upc_hash[UPC_HASH(uid, zoneid)];
649 top:
650 	upc = hupc;
651 	while ((*upc) != NULL) {
652 		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
653 			(*upc)->up_count++;
654 			if (new) {
655 				/*
656 				 * did not need `new' afterall.
657 				 */
658 				kmem_free(new, sizeof (*new));
659 			}
660 			return;
661 		}
662 		upc = &(*upc)->up_next;
663 	}
664 
665 	/*
666 	 * There is no entry for this <uid,zoneid> pair.
667 	 * Allocate one.  If we have to drop pidlock, check
668 	 * again.
669 	 */
670 	if (new == NULL) {
671 		new = (struct upcount *)kmem_alloc(sizeof (*new), KM_NOSLEEP);
672 		if (new == NULL) {
673 			mutex_exit(&pidlock);
674 			new = (struct upcount *)kmem_alloc(sizeof (*new),
675 			    KM_SLEEP);
676 			mutex_enter(&pidlock);
677 			goto top;
678 		}
679 	}
680 
681 
682 	/*
683 	 * On the assumption that a new user is going to do some
684 	 * more forks, put the new upcount structure on the front.
685 	 */
686 	upc = hupc;
687 
688 	new->up_uid = uid;
689 	new->up_zoneid = zoneid;
690 	new->up_count = 1;
691 	new->up_next = *upc;
692 
693 	*upc = new;
694 }
695 
696 /*
697  * Decrement the number of processes a given uid and zoneid has.
698  */
699 void
700 upcount_dec(uid_t uid, zoneid_t zoneid)
701 {
702 	struct	upcount **upc;
703 	struct	upcount *done;
704 
705 	ASSERT(MUTEX_HELD(&pidlock));
706 
707 	upc = &upc_hash[UPC_HASH(uid, zoneid)];
708 	while ((*upc) != NULL) {
709 		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
710 			(*upc)->up_count--;
711 			if ((*upc)->up_count == 0) {
712 				done = *upc;
713 				*upc = (*upc)->up_next;
714 				kmem_free(done, sizeof (*done));
715 			}
716 			return;
717 		}
718 		upc = &(*upc)->up_next;
719 	}
720 	cmn_err(CE_PANIC, "decr_upcount-off the end");
721 }
722 
723 /*
724  * Returns the number of processes a uid has.
725  * Non-existent uid's are assumed to have no processes.
726  */
727 int
728 upcount_get(uid_t uid, zoneid_t zoneid)
729 {
730 	struct	upcount *upc;
731 
732 	ASSERT(MUTEX_HELD(&pidlock));
733 
734 	upc = upc_hash[UPC_HASH(uid, zoneid)];
735 	while (upc != NULL) {
736 		if (upc->up_uid == uid && upc->up_zoneid == zoneid) {
737 			return (upc->up_count);
738 		}
739 		upc = upc->up_next;
740 	}
741 	return (0);
742 }
743