xref: /illumos-gate/usr/src/uts/common/os/pid.c (revision 82b7b979be13234985096762083f39b829dbd03f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2019 Joyent, Inc.
25  * Copyright 2025 Oxide Computer Company
26  */
27 
28 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
29 /*	  All Rights Reserved	*/
30 
31 #include <sys/types.h>
32 #include <sys/param.h>
33 #include <sys/sysmacros.h>
34 #include <sys/proc.h>
35 #include <sys/kmem.h>
36 #include <sys/tuneable.h>
37 #include <sys/var.h>
38 #include <sys/cred.h>
39 #include <sys/systm.h>
40 #include <sys/prsystm.h>
41 #include <sys/vnode.h>
42 #include <sys/session.h>
43 #include <sys/cpuvar.h>
44 #include <sys/cmn_err.h>
45 #include <sys/bitmap.h>
46 #include <sys/debug.h>
47 #include <c2/audit.h>
48 #include <sys/project.h>
49 #include <sys/task.h>
50 #include <sys/zone.h>
51 
/*
 * Directory entries for /proc.  Each procdir slot is either in use, with
 * pe_proc pointing at the slot's process, or on the procentfree list,
 * threaded through pe_next.  The two states share storage; pid_entry()
 * relies on this aliasing to tell them apart.
 */
union procent {
	proc_t *pe_proc;
	union procent *pe_next;
};
57 
58 struct pid pid0 = {
59 	0,		/* pid_prinactive */
60 	1,		/* pid_pgorphaned */
61 	0,		/* pid_padding	*/
62 	0,		/* pid_prslot	*/
63 	0,		/* pid_id	*/
64 	NULL,		/* pid_pglink	*/
65 	NULL,		/* pid_pgtail	*/
66 	NULL,		/* pid_link	*/
67 	3		/* pid_ref	*/
68 };
69 
70 static int pid_hashlen = 4;	/* desired average hash chain length */
71 static int pid_hashsz;		/* number of buckets in the hash table */
72 
73 #define	HASHPID(pid)	(pidhash[((pid)&(pid_hashsz-1))])
74 
75 extern uint_t nproc;
76 extern struct kmem_cache *process_cache;
77 static void	upcount_init(void);
78 
79 kmutex_t	pidlock;	/* global process lock */
80 kmutex_t	pr_pidlock;	/* /proc global process lock */
81 kcondvar_t	*pr_pid_cv;	/* for /proc, one per process slot */
82 struct plock	*proc_lock;	/* persistent array of p_lock's */
83 
84 /*
85  * See the comment above pid_getlockslot() for a detailed explanation of this
86  * constant.  Note that a PLOCK_SHIFT of 3 implies 64-byte coherence
87  * granularity; if the coherence granularity is ever changed, this constant
88  * should be modified to reflect the change to minimize proc_lock false
89  * sharing (correctness, however, is guaranteed regardless of the coherence
90  * granularity).
91  */
92 #define	PLOCK_SHIFT	3
93 
94 static kmutex_t	pidlinklock;
95 static struct pid **pidhash;
96 static pid_t minpid;
97 static pid_t mpid = FAMOUS_PIDS;	/* one more than the last famous pid */
98 static union procent *procdir;
99 static union procent *procentfree;
100 
101 static struct pid *
pid_lookup(pid_t pid)102 pid_lookup(pid_t pid)
103 {
104 	struct pid *pidp;
105 
106 	ASSERT(MUTEX_HELD(&pidlinklock));
107 
108 	for (pidp = HASHPID(pid); pidp; pidp = pidp->pid_link) {
109 		if (pidp->pid_id == pid) {
110 			ASSERT(pidp->pid_ref > 0);
111 			break;
112 		}
113 	}
114 	return (pidp);
115 }
116 
117 void
pid_setmin(void)118 pid_setmin(void)
119 {
120 	if (jump_pid && jump_pid > mpid)
121 		minpid = mpid = jump_pid;
122 	else
123 		minpid = mpid;
124 }
125 
126 /*
127  * When prslots are simply used as an index to determine a process' p_lock,
128  * adjacent prslots share adjacent p_locks.  On machines where the size
129  * of a mutex is smaller than that of a cache line (which, as of this writing,
130  * is true for all machines on which Solaris runs), this can potentially
131  * induce false sharing.  The standard solution for false sharing is to pad
132  * out one's data structures (in this case, struct plock).  However,
133  * given the size and (generally) sparse use of the proc_lock array, this
134  * is suboptimal.  We therefore stride through the proc_lock array with
135  * a stride of PLOCK_SHIFT.  PLOCK_SHIFT should be defined as:
136  *
137  *   log_2 (coherence_granularity / sizeof (kmutex_t))
138  *
139  * Under this scheme, false sharing is still possible -- but only when
140  * the number of active processes is very large.  Note that the one-to-one
141  * mapping between prslots and lockslots is maintained.
142  */
143 static int
pid_getlockslot(int prslot)144 pid_getlockslot(int prslot)
145 {
146 	int even = (v.v_proc >> PLOCK_SHIFT) << PLOCK_SHIFT;
147 	int perlap = even >> PLOCK_SHIFT;
148 
149 	if (prslot >= even)
150 		return (prslot);
151 
152 	return (((prslot % perlap) << PLOCK_SHIFT) + (prslot / perlap));
153 }
154 
155 /*
156  * This function allocates a pid structure, a free pid, and optionally a
157  * slot in the proc table for it.
158  *
159  * pid_allocate() returns the new pid on success, -1 on failure.
160  */
161 pid_t
pid_allocate(proc_t * prp,pid_t pid,int flags)162 pid_allocate(proc_t *prp, pid_t pid, int flags)
163 {
164 	struct pid *pidp;
165 	union procent *pep;
166 	pid_t newpid, startpid;
167 
168 	pidp = kmem_zalloc(sizeof (struct pid), KM_SLEEP);
169 
170 	mutex_enter(&pidlinklock);
171 	pep = procentfree;
172 	if ((flags & PID_ALLOC_PROC) && pep == NULL) {
173 		/*
174 		 * ran out of /proc directory entries
175 		 */
176 		goto failed;
177 	}
178 
179 	if (pid != 0) {
180 		VERIFY(minpid == 0);
181 		VERIFY3P(pid, <, mpid);
182 		VERIFY3P(pid_lookup(pid), ==, NULL);
183 		newpid = pid;
184 	} else {
185 		/*
186 		 * Allocate a pid
187 		 */
188 		ASSERT(minpid <= mpid && mpid < maxpid);
189 
190 		startpid = mpid;
191 		for (;;) {
192 			newpid = mpid;
193 			if (++mpid == maxpid)
194 				mpid = minpid;
195 
196 			if (pid_lookup(newpid) == NULL)
197 				break;
198 
199 			if (mpid == startpid)
200 				goto failed;
201 		}
202 	}
203 
204 	/*
205 	 * Put pid into the pid hash table.
206 	 */
207 	pidp->pid_link = HASHPID(newpid);
208 	HASHPID(newpid) = pidp;
209 	pidp->pid_ref = 1;
210 	pidp->pid_id = newpid;
211 
212 	if (flags & PID_ALLOC_PROC) {
213 		procentfree = pep->pe_next;
214 		pidp->pid_prslot = pep - procdir;
215 		pep->pe_proc = prp;
216 		prp->p_pidp = pidp;
217 		prp->p_lockp = &proc_lock[pid_getlockslot(pidp->pid_prslot)];
218 	} else {
219 		pidp->pid_prslot = 0;
220 	}
221 
222 	mutex_exit(&pidlinklock);
223 
224 	return (newpid);
225 
226 failed:
227 	mutex_exit(&pidlinklock);
228 	kmem_free(pidp, sizeof (struct pid));
229 	return (-1);
230 }
231 
232 /*
233  * decrement the reference count for pid
234  */
235 int
pid_rele(struct pid * pidp)236 pid_rele(struct pid *pidp)
237 {
238 	struct pid **pidpp;
239 
240 	mutex_enter(&pidlinklock);
241 	ASSERT(pidp != &pid0);
242 
243 	pidpp = &HASHPID(pidp->pid_id);
244 	for (;;) {
245 		ASSERT(*pidpp != NULL);
246 		if (*pidpp == pidp)
247 			break;
248 		pidpp = &(*pidpp)->pid_link;
249 	}
250 
251 	*pidpp = pidp->pid_link;
252 	mutex_exit(&pidlinklock);
253 
254 	kmem_free(pidp, sizeof (*pidp));
255 	return (0);
256 }
257 
258 void
proc_entry_free(struct pid * pidp)259 proc_entry_free(struct pid *pidp)
260 {
261 	mutex_enter(&pidlinklock);
262 	pidp->pid_prinactive = 1;
263 	procdir[pidp->pid_prslot].pe_next = procentfree;
264 	procentfree = &procdir[pidp->pid_prslot];
265 	mutex_exit(&pidlinklock);
266 }
267 
/*
 * Final teardown of an exiting process: leave its process group and
 * session, release its /proc slot and pid, unlink it from practive, free
 * the proc structure, and decrement the per-task/project/zone process
 * counts.  Called with pidlock held.
 *
 * The original task needs to be passed in since the process has already been
 * detached from the task at this point in time.
 */
void
pid_exit(proc_t *prp, struct task *tk)
{
	struct pid *pidp;
	zone_t	*zone = prp->p_zone;

	ASSERT(MUTEX_HELD(&pidlock));

	/*
	 * Exit process group.  If it is NULL, it's because fork failed
	 * before calling pgjoin().
	 */
	ASSERT(prp->p_pgidp != NULL || prp->p_stat == SIDL);
	if (prp->p_pgidp != NULL)
		pgexit(prp);

	sess_rele(prp->p_sessp, B_TRUE);

	pidp = prp->p_pidp;

	/* Give the /proc directory slot back to the free list. */
	proc_entry_free(pidp);

	if (audit_active)
		audit_pfree(prp);

	/* Unlink prp from the doubly-linked practive list. */
	if (practive == prp) {
		practive = prp->p_next;
	}

	if (prp->p_next) {
		prp->p_next->p_prev = prp->p_prev;
	}
	if (prp->p_prev) {
		prp->p_prev->p_next = prp->p_next;
	}

	/* Drop the process' own hold on its pid structure. */
	PID_RELE(pidp);

	mutex_destroy(&prp->p_crlock);
	kmem_cache_free(process_cache, prp);
	nproc--;

	/*
	 * Decrement the process counts of the original task, project and zone.
	 */
	mutex_enter(&zone->zone_nlwps_lock);
	tk->tk_nprocs--;
	tk->tk_proj->kpj_nprocs--;
	zone->zone_nprocs--;
	mutex_exit(&zone->zone_nlwps_lock);
}
323 
324 /*
325  * Find a process visible from the specified zone given its process ID.
326  */
327 proc_t *
prfind_zone(pid_t pid,zoneid_t zoneid)328 prfind_zone(pid_t pid, zoneid_t zoneid)
329 {
330 	struct pid *pidp;
331 	proc_t *p;
332 
333 	ASSERT(MUTEX_HELD(&pidlock));
334 
335 	mutex_enter(&pidlinklock);
336 	pidp = pid_lookup(pid);
337 	mutex_exit(&pidlinklock);
338 	if (pidp != NULL && pidp->pid_prinactive == 0) {
339 		p = procdir[pidp->pid_prslot].pe_proc;
340 		if (zoneid == ALL_ZONES || p->p_zone->zone_id == zoneid)
341 			return (p);
342 	}
343 	return (NULL);
344 }
345 
346 /*
347  * Find a process given its process ID.  This obeys zone restrictions,
348  * so if the caller is in a non-global zone it won't find processes
349  * associated with other zones.  Use prfind_zone(pid, ALL_ZONES) to
350  * bypass this restriction.
351  */
352 proc_t *
prfind(pid_t pid)353 prfind(pid_t pid)
354 {
355 	zoneid_t zoneid;
356 
357 	if (INGLOBALZONE(curproc))
358 		zoneid = ALL_ZONES;
359 	else
360 		zoneid = getzoneid();
361 	return (prfind_zone(pid, zoneid));
362 }
363 
364 proc_t *
pgfind_zone(pid_t pgid,zoneid_t zoneid)365 pgfind_zone(pid_t pgid, zoneid_t zoneid)
366 {
367 	struct pid *pidp;
368 
369 	ASSERT(MUTEX_HELD(&pidlock));
370 
371 	mutex_enter(&pidlinklock);
372 	pidp = pid_lookup(pgid);
373 	mutex_exit(&pidlinklock);
374 	if (pidp != NULL) {
375 		proc_t *p = pidp->pid_pglink;
376 
377 		if (zoneid == ALL_ZONES || pgid == 0 || p == NULL ||
378 		    p->p_zone->zone_id == zoneid)
379 			return (p);
380 	}
381 	return (NULL);
382 }
383 
384 /*
385  * return the head of the list of processes whose process group ID is 'pgid',
386  * or NULL, if no such process group
387  */
388 proc_t *
pgfind(pid_t pgid)389 pgfind(pid_t pgid)
390 {
391 	zoneid_t zoneid;
392 
393 	if (INGLOBALZONE(curproc))
394 		zoneid = ALL_ZONES;
395 	else
396 		zoneid = getzoneid();
397 	return (pgfind_zone(pgid, zoneid));
398 }
399 
400 /*
401  * Sets P_PR_LOCK on a non-system process.  Process must be fully created
402  * and not exiting to succeed.
403  *
404  * Returns 0 on success.
405  * Returns 1 if P_PR_LOCK is set.
406  * Returns -1 if proc is in invalid state.
407  */
408 int
sprtrylock_proc(proc_t * p)409 sprtrylock_proc(proc_t *p)
410 {
411 	ASSERT(MUTEX_HELD(&p->p_lock));
412 
413 	/* skip system and incomplete processes */
414 	if (p->p_stat == SIDL || p->p_stat == SZOMB ||
415 	    (p->p_flag & (SSYS | SEXITING | SEXITLWPS))) {
416 		return (-1);
417 	}
418 
419 	if (p->p_proc_flag & P_PR_LOCK)
420 		return (1);
421 
422 	p->p_proc_flag |= P_PR_LOCK;
423 
424 	return (0);
425 }
426 
427 /*
428  * Wait for P_PR_LOCK to become clear.  Returns with p_lock dropped,
429  * and the proc pointer no longer valid, as the proc may have exited.
430  */
431 void
sprwaitlock_proc(proc_t * p)432 sprwaitlock_proc(proc_t *p)
433 {
434 	kmutex_t *mp;
435 
436 	ASSERT(MUTEX_HELD(&p->p_lock));
437 	ASSERT(p->p_proc_flag & P_PR_LOCK);
438 
439 	/*
440 	 * p_lock is persistent, but p itself is not -- it could
441 	 * vanish during cv_wait().  Load p->p_lock now so we can
442 	 * drop it after cv_wait() without referencing p.
443 	 */
444 	mp = &p->p_lock;
445 	cv_wait(&pr_pid_cv[p->p_slot], mp);
446 	mutex_exit(mp);
447 }
448 
449 /*
450  * If pid exists, find its proc, acquire its p_lock and mark it P_PR_LOCK.
451  * Returns the proc pointer on success, NULL on failure.  sprlock() is
452  * really just a stripped-down version of pr_p_lock() to allow practive
453  * walkers like dofusers() and dumpsys() to synchronize with /proc.
454  */
455 proc_t *
sprlock_zone(pid_t pid,zoneid_t zoneid)456 sprlock_zone(pid_t pid, zoneid_t zoneid)
457 {
458 	proc_t *p;
459 	int ret;
460 
461 	for (;;) {
462 		mutex_enter(&pidlock);
463 		if ((p = prfind_zone(pid, zoneid)) == NULL) {
464 			mutex_exit(&pidlock);
465 			return (NULL);
466 		}
467 		mutex_enter(&p->p_lock);
468 		mutex_exit(&pidlock);
469 
470 		if (panicstr)
471 			return (p);
472 
473 		ret = sprtrylock_proc(p);
474 		if (ret == -1) {
475 			mutex_exit(&p->p_lock);
476 			return (NULL);
477 		} else if (ret == 0) {
478 			break;
479 		}
480 		sprwaitlock_proc(p);
481 	}
482 	return (p);
483 }
484 
485 proc_t *
sprlock(pid_t pid)486 sprlock(pid_t pid)
487 {
488 	zoneid_t zoneid;
489 
490 	if (INGLOBALZONE(curproc))
491 		zoneid = ALL_ZONES;
492 	else
493 		zoneid = getzoneid();
494 	return (sprlock_zone(pid, zoneid));
495 }
496 
497 void
sprlock_proc(proc_t * p)498 sprlock_proc(proc_t *p)
499 {
500 	ASSERT(MUTEX_HELD(&p->p_lock));
501 
502 	while (p->p_proc_flag & P_PR_LOCK) {
503 		cv_wait(&pr_pid_cv[p->p_slot], &p->p_lock);
504 	}
505 
506 	p->p_proc_flag |= P_PR_LOCK;
507 }
508 
/*
 * Release the /proc hold on 'p': wake any waiter blocked in
 * sprlock_proc()/sprwaitlock_proc(), clear P_PR_LOCK, and drop p_lock.
 * If a SIGKILL was posted while the process was held, restart its
 * stopped lwps first so the kill is actually witnessed.
 */
void
sprunlock(proc_t *p)
{
	/* During panic, skip all bookkeeping and just drop the lock. */
	if (panicstr) {
		mutex_exit(&p->p_lock);
		return;
	}

	ASSERT(p->p_proc_flag & P_PR_LOCK);
	ASSERT(MUTEX_HELD(&p->p_lock));

	if ((p->p_flag & SKILLED) && p->p_tlist != NULL) {
		/*
		 * While P_PR_LOCK was set, this process received a SIGKILL.
		 * The signal was posted in p->p_sig and p->p_extsig, but we
		 * skipped resuming stopped threads because P_PR_LOCK prevented
		 * the process' shape from changing.  If all threads were
		 * stopped by SIGSTOP or /proc PCSTOP, none will run to witness
		 * the SIGKILL and this process will end up stuck.
		 *
		 * While only one thread needs to be runnable to witness the
		 * SIGKILL, set as many running as we can in case there are
		 * mixed scheduler priorities.  It would otherwise be
		 * unfortunate if we set a single low-priority thread runnable
		 * in an otherwise-stopped process and did not promptly notice
		 * the SIGKILL.
		 *
		 * * TS_XSTART undoes the stopping effect of SIGSTOP.
		 * * TS_PSTART undoes the stopping effect of /proc PCSTOP.
		 *
		 * Notably, other TS_* bits are inappropriate here:
		 * * Do not set TS_CSTART or TS_UNPAUSE; lwps may be stopped by
		 *   PR_SUSPEND for many reasons. Some cases, like holdlwps(),
		 *   will resume the process before the corresponding syscall
		 *   returns. Other cases, like dumping core, the suspender
		 *   will tear down the lwps as it completes.
		 * * Do not set TS_RESUME out of caution; not sure about the
		 *   consequences of a process going away during CPR resume and
		 *   CPR should set the process running eventually.
		 * * Do not set TS_CREATE because lwp creation expects threads
		 *   to remain paused until lwp completes.
		 */
		runlwps(p, TS_XSTART | TS_PSTART);
	}

	cv_signal(&pr_pid_cv[p->p_slot]);
	p->p_proc_flag &= ~P_PR_LOCK;
	mutex_exit(&p->p_lock);
}
558 
559 void
pid_init(void)560 pid_init(void)
561 {
562 	int i;
563 
564 	pid_hashsz = 1 << highbit(v.v_proc / pid_hashlen);
565 
566 	pidhash = kmem_zalloc(sizeof (struct pid *) * pid_hashsz, KM_SLEEP);
567 	procdir = kmem_alloc(sizeof (union procent) * v.v_proc, KM_SLEEP);
568 	pr_pid_cv = kmem_zalloc(sizeof (kcondvar_t) * v.v_proc, KM_SLEEP);
569 	proc_lock = kmem_zalloc(sizeof (struct plock) * v.v_proc, KM_SLEEP);
570 
571 	nproc = 1;
572 	practive = proc_sched;
573 	proc_sched->p_next = NULL;
574 	procdir[0].pe_proc = proc_sched;
575 
576 	procentfree = &procdir[1];
577 	for (i = 1; i < v.v_proc - 1; i++)
578 		procdir[i].pe_next = &procdir[i+1];
579 	procdir[i].pe_next = NULL;
580 
581 	HASHPID(0) = &pid0;
582 
583 	upcount_init();
584 }
585 
/*
 * Return the proc_t occupying /proc directory slot 'slot', or NULL if the
 * slot is free or the process in it is still being created (SIDL).
 */
proc_t *
pid_entry(int slot)
{
	union procent *pep;
	proc_t *prp;

	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(slot >= 0 && slot < v.v_proc);

	/*
	 * Free slots are chained through procdir itself, so a pe_next that
	 * points inside the array means the slot is on the free list.  The
	 * last free entry has pe_next == NULL, which fails this range test,
	 * but then the union's pe_proc alias is also NULL and the NULL is
	 * returned below.
	 */
	pep = procdir[slot].pe_next;
	if (pep >= procdir && pep < &procdir[v.v_proc])
		return (NULL);
	prp = procdir[slot].pe_proc;
	if (prp != 0 && prp->p_stat == SIDL)
		return (NULL);
	return (prp);
}
603 
604 /*
605  * Send the specified signal to all processes whose process group ID is
606  * equal to 'pgid'
607  */
608 
609 void
signal(pid_t pgid,int sig)610 signal(pid_t pgid, int sig)
611 {
612 	struct pid *pidp;
613 	proc_t *prp;
614 
615 	mutex_enter(&pidlock);
616 	mutex_enter(&pidlinklock);
617 	if (pgid == 0 || (pidp = pid_lookup(pgid)) == NULL) {
618 		mutex_exit(&pidlinklock);
619 		mutex_exit(&pidlock);
620 		return;
621 	}
622 	mutex_exit(&pidlinklock);
623 	for (prp = pidp->pid_pglink; prp; prp = prp->p_pglink) {
624 		mutex_enter(&prp->p_lock);
625 		sigtoproc(prp, NULL, sig);
626 		mutex_exit(&prp->p_lock);
627 	}
628 	mutex_exit(&pidlock);
629 }
630 
631 /*
632  * Send the specified signal to the specified process
633  */
634 
635 void
prsignal(struct pid * pidp,int sig)636 prsignal(struct pid *pidp, int sig)
637 {
638 	if (!(pidp->pid_prinactive))
639 		psignal(procdir[pidp->pid_prslot].pe_proc, sig);
640 }
641 
642 #include <sys/sunddi.h>
643 
644 /*
645  * DDI/DKI interfaces for drivers to send signals to processes
646  */
647 
648 /*
649  * obtain an opaque reference to a process for signaling
650  */
651 void *
proc_ref(void)652 proc_ref(void)
653 {
654 	struct pid *pidp;
655 
656 	mutex_enter(&pidlock);
657 	pidp = curproc->p_pidp;
658 	PID_HOLD(pidp);
659 	mutex_exit(&pidlock);
660 
661 	return (pidp);
662 }
663 
664 /*
665  * release a reference to a process
666  * - a process can exit even if a driver has a reference to it
667  * - one proc_unref for every proc_ref
668  */
669 void
proc_unref(void * pref)670 proc_unref(void *pref)
671 {
672 	mutex_enter(&pidlock);
673 	PID_RELE((struct pid *)pref);
674 	mutex_exit(&pidlock);
675 }
676 
677 /*
678  * send a signal to a process
679  *
680  * - send the process the signal
681  * - if the process went away, return a -1
682  * - if the process is still there return 0
683  */
684 int
proc_signal(void * pref,int sig)685 proc_signal(void *pref, int sig)
686 {
687 	struct pid *pidp = pref;
688 
689 	prsignal(pidp, sig);
690 	return (pidp->pid_prinactive ? -1 : 0);
691 }
692 
693 
694 static struct upcount	**upc_hash;	/* a boot time allocated array */
695 static ulong_t		upc_hashmask;
696 #define	UPC_HASH(x, y)	((ulong_t)(x ^ y) & upc_hashmask)
697 
698 /*
699  * Get us off the ground.  Called once at boot.
700  */
701 void
upcount_init(void)702 upcount_init(void)
703 {
704 	ulong_t	upc_hashsize;
705 
706 	/*
707 	 * An entry per MB of memory is our current guess
708 	 */
709 	/*
710 	 * 2^20 is a meg, so shifting right by 20 - PAGESHIFT
711 	 * converts pages to megs (without overflowing a u_int
712 	 * if you have more than 4G of memory, like ptob(physmem)/1M
713 	 * would).
714 	 */
715 	upc_hashsize = (1 << highbit(physmem >> (20 - PAGESHIFT)));
716 	upc_hashmask = upc_hashsize - 1;
717 	upc_hash = kmem_zalloc(upc_hashsize * sizeof (struct upcount *),
718 	    KM_SLEEP);
719 }
720 
/*
 * Increment the number of processes associated with a given uid and zoneid.
 * Called with pidlock held; may temporarily drop it to sleep for memory,
 * in which case the hash chain is re-scanned before anything is changed.
 */
void
upcount_inc(uid_t uid, zoneid_t zoneid)
{
	struct upcount	**upc, **hupc;
	struct upcount	*new;

	ASSERT(MUTEX_HELD(&pidlock));
	new = NULL;
	hupc = &upc_hash[UPC_HASH(uid, zoneid)];
top:
	upc = hupc;
	while ((*upc) != NULL) {
		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
			(*upc)->up_count++;
			if (new) {
				/*
				 * did not need `new' after all.
				 */
				kmem_free(new, sizeof (*new));
			}
			return;
		}
		upc = &(*upc)->up_next;
	}

	/*
	 * There is no entry for this <uid,zoneid> pair.
	 * Allocate one.  If we have to drop pidlock, check
	 * again.
	 */
	if (new == NULL) {
		/* Try without sleeping first so pidlock can stay held. */
		new = (struct upcount *)kmem_alloc(sizeof (*new), KM_NOSLEEP);
		if (new == NULL) {
			/*
			 * Sleep for the memory with pidlock dropped.  The
			 * chain may have changed while we slept, so restart
			 * the scan from the top.
			 */
			mutex_exit(&pidlock);
			new = (struct upcount *)kmem_alloc(sizeof (*new),
			    KM_SLEEP);
			mutex_enter(&pidlock);
			goto top;
		}
	}


	/*
	 * On the assumption that a new user is going to do some
	 * more forks, put the new upcount structure on the front.
	 */
	upc = hupc;

	new->up_uid = uid;
	new->up_zoneid = zoneid;
	new->up_count = 1;
	new->up_next = *upc;

	*upc = new;
}
779 
780 /*
781  * Decrement the number of processes a given uid and zoneid has.
782  */
783 void
upcount_dec(uid_t uid,zoneid_t zoneid)784 upcount_dec(uid_t uid, zoneid_t zoneid)
785 {
786 	struct	upcount **upc;
787 	struct	upcount *done;
788 
789 	ASSERT(MUTEX_HELD(&pidlock));
790 
791 	upc = &upc_hash[UPC_HASH(uid, zoneid)];
792 	while ((*upc) != NULL) {
793 		if ((*upc)->up_uid == uid && (*upc)->up_zoneid == zoneid) {
794 			(*upc)->up_count--;
795 			if ((*upc)->up_count == 0) {
796 				done = *upc;
797 				*upc = (*upc)->up_next;
798 				kmem_free(done, sizeof (*done));
799 			}
800 			return;
801 		}
802 		upc = &(*upc)->up_next;
803 	}
804 	cmn_err(CE_PANIC, "decr_upcount-off the end");
805 }
806 
807 /*
808  * Returns the number of processes a uid has.
809  * Non-existent uid's are assumed to have no processes.
810  */
811 int
upcount_get(uid_t uid,zoneid_t zoneid)812 upcount_get(uid_t uid, zoneid_t zoneid)
813 {
814 	struct	upcount *upc;
815 
816 	ASSERT(MUTEX_HELD(&pidlock));
817 
818 	upc = upc_hash[UPC_HASH(uid, zoneid)];
819 	while (upc != NULL) {
820 		if (upc->up_uid == uid && upc->up_zoneid == zoneid) {
821 			return (upc->up_count);
822 		}
823 		upc = upc->up_next;
824 	}
825 	return (0);
826 }
827