/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/


#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Inter-Process Communication Semaphore Facility.
 *
 * See os/ipc.c for a description of common IPC functionality.
 *
 * Resource controls
 * -----------------
 *
 * Control:      project.max-sem-ids (rc_project_semmni)
 * Description:  Maximum number of semaphore ids allowed a project.
 *
 *   When semget() is used to allocate a semaphore set, one id is
 *   allocated.  If the id allocation doesn't succeed, semget() fails
 *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
 *   the id is deallocated.
 *
 * Control:      process.max-sem-nsems (rc_process_semmsl)
 * Description:  Maximum number of semaphores allowed per semaphore set.
 *
 *   When semget() is used to allocate a semaphore set, the size of the
 *   set is compared with this limit.  If the number of semaphores
 *   exceeds the limit, semget() fails and errno is set to EINVAL.
 *
 * Control:      process.max-sem-ops (rc_process_semopm)
 * Description:  Maximum number of semaphore operations allowed per
 *               semop call.
 *
 *   When semget() successfully allocates a semaphore set, the minimum
 *   enforced value of this limit is used to initialize the
 *   "system-imposed maximum" number of operations a semop() call for
 *   this set can perform.
 *
 * Undo structures
 * ---------------
 *
 * Removing the undo structure tunables involved a serious redesign of
 * how they were implemented.  There is now one undo structure for
 * every process/semaphore array combination (lazily allocated, of
 * course), and each is equal in size to the semaphore it corresponds
 * to.  To avoid scalability and performance problems, the undo
 * structures are stored in two places: a per-process AVL tree sorted
 * by ksemid pointer (p_semacct, protected by p_lock) and an unsorted
 * per-semaphore linked list (sem_undos, protected by the semaphore's
 * ID lock).  The former is used by semop, where a lookup is performed
 * once and cached if SEM_UNDO is specified for any of the operations,
 * and at process exit where the undoable operations are rolled back.
 * The latter is used when removing the semaphore, so the undo
 * structures can be removed from the appropriate processes' trees.
 *
 * The undo structure itself contains pointers to the ksemid and proc
 * to which it corresponds, a list node, an AVL node, and an array of
 * adjust-on-exit (AOE) values.  When an undo structure is allocated it
 * is immediately added to both the process's tree and the semaphore's
 * list.  Lastly, the reference count on the semaphore is increased.
 *
 * Avoiding a lock ordering violation between p_lock and the ID lock,
 * wont to occur when there is a race between a process exiting and the
 * removal of a semaphore, mandates the delicate dance that exists
 * between semexit and sem_rmid.
 *
 * sem_rmid, holding the ID lock, iterates through all undo structures
 * and for each takes the appropriate process's p_lock and checks to
 * see if p_semacct is NULL.  If it is, it skips that undo structure
 * and continues to the next.  Otherwise, it removes the undo structure
 * from both the AVL tree and the semaphore's list, and releases the
 * hold that the undo structure had on the semaphore.
 *
 * The important other half of this is semexit, which will immediately
 * take p_lock, obtain the AVL pointer, clear p_semacct, and drop
 * p_lock.  From this point on it is semexit's responsibility to clean
 * up all undo structures found in the tree -- a coexecuting sem_rmid
 * will see the NULL p_semacct and skip that undo structure.  It walks
 * the AVL tree (using avl_destroy_nodes) and for each undo structure
 * takes the appropriate semaphore's ID lock (always legal since the
 * undo structure has a hold on the semaphore), updates all semaphores
 * with non-zero AOE values, and removes the structure from the
 * semaphore's list.  It then drops the structure's reference on the
 * semaphore, drops the ID lock, and frees the undo structure.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/cred.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/ipc.h>
#include <sys/ipc_impl.h>
#include <sys/sem.h>
#include <sys/sem_impl.h>
#include <sys/user.h>
#include <sys/proc.h>
#include <sys/cpuvar.h>
#include <sys/debug.h>
#include <sys/var.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/syscall.h>
#include <sys/avl.h>
#include <sys/list.h>
#include <sys/zone.h>

#include <c2/audit.h>

/*
 * Resource control handles, defined outside this file.  Their use is
 * described under "Resource controls" at the top of this file.
 */
extern rctl_hndl_t rc_project_semmni;
extern rctl_hndl_t rc_process_semmsl;
extern rctl_hndl_t rc_process_semopm;
/* IPC service handle for semaphore sets; created in _init(). */
static ipc_service_t *sem_svc;
/* Zone key created in _init(), which registers sem_remove_zone() with it. */
static zone_key_t sem_zone_key;

/*
 * The following tunables are obsolete.  Though for compatibility we
 * still read and interpret seminfo_semmsl, seminfo_semopm and
 * seminfo_semmni (see os/project.c and os/rctl_proc.c), the preferred
 * mechanism for administrating the IPC Semaphore facility is through
 * the resource controls described at the top of this file.
 *
 * None of these variables is referenced by the code in this file; they
 * are only defined here.
 */
int seminfo_semaem = 16384;	/* (obsolete) */
int seminfo_semmap = 10;	/* (obsolete) */
int seminfo_semmni = 10;	/* (obsolete) */
int seminfo_semmns = 60;	/* (obsolete) */
int seminfo_semmnu = 30;	/* (obsolete) */
int seminfo_semmsl = 25;	/* (obsolete) */
int seminfo_semopm = 10;	/* (obsolete) */
int seminfo_semume = 10;	/* (obsolete) */
int seminfo_semusz = 96;	/* (obsolete) */
int seminfo_semvmx = 32767;	/* (obsolete) */

/*
 * SEM_MAXUCOPS: a semop() call with at most this many operations has its
 * operation buffer allocated before the set's sem_maxops limit is
 * consulted; larger requests are checked against that limit first.
 *
 * SEM_UNDOSZ(n): size in bytes of an undo structure able to hold n
 * adjust-on-exit values (presumably struct sem_undo already ends in a
 * one-element un_aoe array, hence the "- 1").  The argument is
 * parenthesized so that any expression may be passed safely.
 */
#define	SEM_MAXUCOPS	4096	/* max # of unchecked ops per semop call */
#define	SEM_UNDOSZ(n)	(sizeof (struct sem_undo) + ((n) - 1) * sizeof (int))

/*
 * Forward declarations for functions that are referenced (by the sysent
 * entry and by _init()) before they are defined.  The semsys() argument
 * names match its definition at the end of this file.
 */
static int semsys(int opcode, uintptr_t a1, uintptr_t a2,
    uintptr_t a3, uintptr_t a4);
static void sem_dtor(kipc_perm_t *);
static void sem_rmid(kipc_perm_t *);
static void sem_remove_zone(zoneid_t, void *);

/*
 * System call table entry for this module: semsys() is the handler.
 * The leading 5 is presumably the argument count (semsys() takes five
 * arguments) -- confirm against struct sysent, which also defines the
 * meaning of the SE_* flags.
 */
static struct sysent ipcsem_sysent = {
	5,
	SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
	semsys
};

/*
 * Module linkage information for the kernel.
 *
 * modlsys describes the system call provided by this module: it ties the
 * descriptive name string to ipcsem_sysent.
 */
static struct modlsys modlsys = {
	&mod_syscallops, "System V semaphore facility", &ipcsem_sysent
};

#ifdef _SYSCALL32_IMPL
/*
 * Second linkage element, built only when _SYSCALL32_IMPL is defined.
 * It reuses the same sysent entry (and therefore the same semsys()
 * handler) under mod_syscallops32.
 */
static struct modlsys modlsys32 = {
	&mod_syscallops32, "32-bit System V semaphore facility", &ipcsem_sysent
};
#endif

/*
 * NULL-terminated list of the linkage elements above; this is what
 * _init() hands to mod_install() and _info() hands to mod_info().
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};


/*
 * _init - module load entry point.
 *
 * Creates the semaphore IPC service and the zone key, then installs the
 * module.  If installation fails, both are torn down again and the
 * mod_install() error is returned; on success 0 is returned.
 */
int
_init(void)
{
	int err;

	sem_svc = ipcs_create("semids", rc_project_semmni, sizeof (ksemid_t),
	    sem_dtor, sem_rmid, AT_IPC_SEM,
	    offsetof(kproject_data_t, kpd_semmni));
	zone_key_create(&sem_zone_key, NULL, sem_remove_zone, NULL);

	err = mod_install(&modlinkage);
	if (err != 0) {
		/* Installation failed: release what was set up above. */
		(void) zone_key_delete(sem_zone_key);
		ipcs_destroy(sem_svc);
	}

	return (err);
}

/*
 * _fini - module unload entry point.  Unconditionally returns EBUSY, so
 * the module is never unloaded through this path.
 */
int
_fini(void)
{
	return (EBUSY);
}

/*
 * _info - module information entry point.  Passes the caller's modinfo
 * buffer and this module's linkage to mod_info() and returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	int rv;

	rv = mod_info(&modlinkage, modinfop);

	return (rv);
}

/*
 * sem_dtor - destructor passed to ipcs_create() in _init().
 *
 * Frees the semaphore array, using the same 64-byte rounded size that
 * semget() used when allocating it, and destroys the undo list.
 */
static void
sem_dtor(kipc_perm_t *perm)
{
	ksemid_t *semset = (ksemid_t *)perm;
	size_t nbytes;

	nbytes = P2ROUNDUP(semset->sem_nsems * sizeof (struct sem), 64);
	kmem_free(semset->sem_base, nbytes);
	list_destroy(&semset->sem_undos);
}

/*
 * sem_undo_add - Create or update adjust on exit entry.
 *
 * Subtracts val from the adjust-on-exit value recorded for semaphore
 * number num in undo.  If the result would leave the range
 * [-USHRT_MAX, USHRT_MAX] nothing is changed and ERANGE is returned;
 * otherwise the new value is stored and 0 is returned.
 */
static int
sem_undo_add(short val, ushort_t num, struct sem_undo *undo)
{
	int adjusted = undo->un_aoe[num] - val;

	if (adjusted <= USHRT_MAX && adjusted >= -USHRT_MAX) {
		undo->un_aoe[num] = adjusted;
		return (0);
	}

	return (ERANGE);
}

/*
 * sem_undo_clear - clears all undo entries for specified semaphores
 *
 * Used when semaphores are reset by SETVAL or SETALL.  For every undo
 * structure on the set's list, the adjust-on-exit values for semaphore
 * numbers low through high (inclusive) are zeroed.
 */
static void
sem_undo_clear(ksemid_t *sp, ushort_t low, ushort_t high)
{
	struct sem_undo *un;
	int slot;

	ASSERT(low <= high);
	ASSERT(high < sp->sem_nsems);

	un = list_head(&sp->sem_undos);
	while (un != NULL) {
		for (slot = low; slot <= high; slot++)
			un->un_aoe[slot] = 0;
		un = list_next(&sp->sem_undos, un);
	}
}

/*
 * sem_rollback - roll back work done so far if unable to complete operation
 *
 * Walks the first n entries of the operation vector op from last to
 * first.  Each non-zero operation is subtracted back out of its
 * semaphore's value, and if it carried SEM_UNDO the matching
 * adjust-on-exit change is reversed as well.
 */
static void
sem_rollback(ksemid_t *sp, struct sembuf *op, int n, struct sem_undo *undo)
{
	struct sembuf *cur = op + n;	/* one past the last applied op */

	while (n-- != 0) {
		cur--;
		if (cur->sem_op != 0) {
			struct sem *target = &sp->sem_base[cur->sem_num];

			target->semval -= cur->sem_op;
			if (cur->sem_flg & SEM_UNDO) {
				ASSERT(undo != NULL);
				(void) sem_undo_add(-cur->sem_op,
				    cur->sem_num, undo);
			}
		}
	}
}

/*
 * sem_rmid - removal callback passed to ipcs_create() in _init().
 *
 * Detaches every undo structure from the semaphore set, then zeroes each
 * semaphore and wakes all threads sleeping on it.  Per the description at
 * the top of this file, this runs with the set's ID lock held; see the
 * "Undo structures" section there for the protocol shared with semexit().
 */
static void
sem_rmid(kipc_perm_t *perm)
{
	ksemid_t *sp = (ksemid_t *)perm;
	struct sem *semp;
	struct sem_undo *undo;
	size_t size = SEM_UNDOSZ(sp->sem_nsems);
	int i;

	/*LINTED*/
	while (undo = list_head(&sp->sem_undos)) {
		/* Unlink from the set's list in every case. */
		list_remove(&sp->sem_undos, undo);
		mutex_enter(&undo->un_proc->p_lock);
		if (undo->un_proc->p_semacct == NULL) {
			/*
			 * semexit() has already claimed this process's
			 * tree; it is responsible for freeing the undo
			 * structure and dropping its hold on the set.
			 */
			mutex_exit(&undo->un_proc->p_lock);
			continue;
		}
		avl_remove(undo->un_proc->p_semacct, undo);
		mutex_exit(&undo->un_proc->p_lock);
		kmem_free(undo, size);
		/* Drop the hold taken when the undo structure was created. */
		ipc_rele_locked(sem_svc, (kipc_perm_t *)sp);
	}

	/* Reset every semaphore and wake any waiters. */
	for (i = 0; i < sp->sem_nsems; i++) {
		semp = &sp->sem_base[i];
		semp->semval = semp->sempid = 0;
		if (semp->semncnt) {
			cv_broadcast(&semp->semncnt_cv);
			semp->semncnt = 0;
		}
		if (semp->semzcnt) {
			cv_broadcast(&semp->semzcnt_cv);
			semp->semzcnt = 0;
		}
	}
}

/*
 * semctl - Semctl system call.
 *
 * semid  - identifier of the semaphore set to operate on
 * semnum - semaphore number within the set (GETNCNT, GETPID, GETVAL,
 *          GETZCNT and SETVAL only)
 * cmd    - operation to perform
 * arg    - user address (IPC_SET, IPC_STAT, IPC_SET64, IPC_STAT64,
 *          GETALL, SETALL) or new value (SETVAL)
 *
 * The work is split in two phases.  The first switch does whatever must
 * happen without the ID lock held (copyins, the SETALL buffer
 * allocation, IPC_RMID); the set is then looked up, which returns with
 * its lock held, and the second switch performs the command.  Every
 * path through the second switch drops the lock before returning.
 *
 * Returns 0, or the requested value for the GET* commands, on success.
 * On failure the result of set_errno() is returned with one of:
 *   EFAULT - a copyin or copyout failed
 *   EINVAL - semid does not name a set, semnum is out of range, cmd is
 *            unknown, or (SETALL) the set changed size between phases
 *   ERANGE - the SETVAL value exceeds USHRT_MAX
 * or whatever error ipc_rmid(), ipcperm_set(), ipcperm_set64() or
 * ipcperm_access() reported.
 */
static int
semctl(int semid, uint_t semnum, int cmd, uintptr_t arg)
{
	ksemid_t		*sp;	/* ptr to semaphore header */
	struct sem		*p;	/* ptr to semaphore */
	unsigned int		i;	/* loop control */
	ushort_t		*vals = NULL, *vp;
	size_t			vsize = 0;
	int			error = 0;
	int			retval = 0;
	struct cred		*cr;
	kmutex_t		*lock;
	model_t			mdl = get_udatamodel();
	STRUCT_DECL(semid_ds, sid);
	struct semid_ds64	ds64;

	STRUCT_INIT(sid, mdl);
	cr = CRED();

	/*
	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
	 */
	switch (cmd) {
	case IPC_SET:
		if (copyin((void *)arg, STRUCT_BUF(sid), STRUCT_SIZE(sid)))
			return (set_errno(EFAULT));
		break;

	case IPC_SET64:
		if (copyin((void *)arg, &ds64, sizeof (struct semid_ds64)))
			return (set_errno(EFAULT));
		break;

	case SETALL:
		/*
		 * Look the set up just long enough to learn how many
		 * values to copy in; the size is re-validated once the
		 * lock is re-taken below.
		 */
		if ((lock = ipc_lookup(sem_svc, semid,
		    (kipc_perm_t **)&sp)) == NULL)
			return (set_errno(EINVAL));
		vsize = sp->sem_nsems * sizeof (*vals);
		mutex_exit(lock);

		/* allocate space to hold all semaphore values */
		vals = kmem_alloc(vsize, KM_SLEEP);

		if (copyin((void *)arg, vals, vsize)) {
			kmem_free(vals, vsize);
			return (set_errno(EFAULT));
		}
		break;

	case IPC_RMID:
		if (error = ipc_rmid(sem_svc, semid, cr))
			return (set_errno(error));
		return (0);
	}

	if ((lock = ipc_lookup(sem_svc, semid, (kipc_perm_t **)&sp)) == NULL) {
		/* vsize is non-zero only if SETALL allocated vals above */
		if (vsize != 0)
			kmem_free(vals, vsize);
		return (set_errno(EINVAL));
	}
	switch (cmd) {
	/* Set ownership and permissions. */
	case IPC_SET:

		if (error = ipcperm_set(sem_svc, cr, &sp->sem_perm,
		    &STRUCT_BUF(sid)->sem_perm, mdl)) {
			mutex_exit(lock);
			return (set_errno(error));
		}
		sp->sem_ctime = gethrestime_sec();
		mutex_exit(lock);
		return (0);

	/* Get semaphore data structure. */
	case IPC_STAT:

		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
			mutex_exit(lock);
			return (set_errno(error));
		}

		ipcperm_stat(&STRUCT_BUF(sid)->sem_perm, &sp->sem_perm, mdl);
		STRUCT_FSETP(sid, sem_base, NULL);	/* kernel addr */
		STRUCT_FSET(sid, sem_nsems, sp->sem_nsems);
		STRUCT_FSET(sid, sem_otime, sp->sem_otime);
		STRUCT_FSET(sid, sem_ctime, sp->sem_ctime);
		STRUCT_FSET(sid, sem_binary, sp->sem_binary);
		mutex_exit(lock);

		if (copyout(STRUCT_BUF(sid), (void *)arg, STRUCT_SIZE(sid)))
			return (set_errno(EFAULT));
		return (0);

	case IPC_SET64:

		if (error = ipcperm_set64(sem_svc, cr, &sp->sem_perm,
		    &ds64.semx_perm)) {
			mutex_exit(lock);
			return (set_errno(error));
		}
		sp->sem_ctime = gethrestime_sec();
		mutex_exit(lock);
		return (0);

	case IPC_STAT64:

		/*
		 * ds64 lives on the kernel stack and only some of its
		 * members are assigned below.  Zero it first so that
		 * padding and any unassigned members cannot carry stale
		 * stack contents out to userland.
		 *
		 * NOTE(review): unlike IPC_STAT, no ipcperm_access()
		 * check is made here -- confirm that is intentional.
		 */
		bzero(&ds64, sizeof (ds64));
		ipcperm_stat64(&ds64.semx_perm, &sp->sem_perm);
		ds64.semx_nsems = sp->sem_nsems;
		ds64.semx_otime = sp->sem_otime;
		ds64.semx_ctime = sp->sem_ctime;

		mutex_exit(lock);
		if (copyout(&ds64, (void *)arg, sizeof (struct semid_ds64)))
			return (set_errno(EFAULT));

		return (0);

	/* Get # of processes sleeping for greater semval. */
	case GETNCNT:
		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
			mutex_exit(lock);
			return (set_errno(error));
		}
		if (semnum >= sp->sem_nsems) {
			mutex_exit(lock);
			return (set_errno(EINVAL));
		}
		retval = sp->sem_base[semnum].semncnt;
		mutex_exit(lock);
		return (retval);

	/* Get pid of last process to operate on semaphore. */
	case GETPID:
		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
			mutex_exit(lock);
			return (set_errno(error));
		}
		if (semnum >= sp->sem_nsems) {
			mutex_exit(lock);
			return (set_errno(EINVAL));
		}
		retval = sp->sem_base[semnum].sempid;
		mutex_exit(lock);
		return (retval);

	/* Get semval of one semaphore. */
	case GETVAL:
		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
			mutex_exit(lock);
			return (set_errno(error));
		}
		if (semnum >= sp->sem_nsems) {
			mutex_exit(lock);
			return (set_errno(EINVAL));
		}
		retval = sp->sem_base[semnum].semval;
		mutex_exit(lock);
		return (retval);

	/* Get all semvals in set. */
	case GETALL:
		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
			mutex_exit(lock);
			return (set_errno(error));
		}

		/* allocate space to hold all semaphore values */
		vsize = sp->sem_nsems * sizeof (*vals);
		vals = vp = kmem_alloc(vsize, KM_SLEEP);

		/* snapshot the values under the lock, copy out after */
		for (i = sp->sem_nsems, p = sp->sem_base; i--; p++, vp++)
			bcopy(&p->semval, vp, sizeof (p->semval));

		mutex_exit(lock);

		if (copyout((void *)vals, (void *)arg, vsize)) {
			kmem_free(vals, vsize);
			return (set_errno(EFAULT));
		}

		kmem_free(vals, vsize);
		return (0);

	/* Get # of processes sleeping for semval to become zero. */
	case GETZCNT:
		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
			mutex_exit(lock);
			return (set_errno(error));
		}
		if (semnum >= sp->sem_nsems) {
			mutex_exit(lock);
			return (set_errno(EINVAL));
		}
		retval = sp->sem_base[semnum].semzcnt;
		mutex_exit(lock);
		return (retval);

	/* Set semval of one semaphore. */
	case SETVAL:
		if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
			mutex_exit(lock);
			return (set_errno(error));
		}
		if (semnum >= sp->sem_nsems) {
			mutex_exit(lock);
			return (set_errno(EINVAL));
		}
		if ((uint_t)arg > USHRT_MAX) {
			mutex_exit(lock);
			return (set_errno(ERANGE));
		}
		p = &sp->sem_base[semnum];
		/*
		 * A non-zero value may satisfy threads waiting for the
		 * value to rise; zero may satisfy those waiting for zero.
		 */
		if ((p->semval = (ushort_t)arg) != 0) {
			if (p->semncnt) {
				cv_broadcast(&p->semncnt_cv);
			}
		} else if (p->semzcnt) {
			cv_broadcast(&p->semzcnt_cv);
		}
		p->sempid = curproc->p_pid;
		sem_undo_clear(sp, (ushort_t)semnum, (ushort_t)semnum);
		mutex_exit(lock);
		return (0);

	/* Set semvals of all semaphores in set. */
	case SETALL:
		/* Check if semaphore set has been deleted and reallocated. */
		if (sp->sem_nsems * sizeof (*vals) != vsize) {
			error = set_errno(EINVAL);
			goto seterr;
		}
		if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
			error = set_errno(error);
			goto seterr;
		}
		sem_undo_clear(sp, 0, sp->sem_nsems - 1);
		/* the loop increment records the caller's pid and advances p */
		for (i = 0, p = sp->sem_base; i < sp->sem_nsems;
		    (p++)->sempid = curproc->p_pid) {
			if ((p->semval = vals[i++]) != 0) {
				if (p->semncnt) {
					cv_broadcast(&p->semncnt_cv);
				}
			} else if (p->semzcnt) {
				cv_broadcast(&p->semzcnt_cv);
			}
		}
seterr:
		/* error is 0 on success, else already a set_errno() result */
		mutex_exit(lock);
		kmem_free(vals, vsize);
		return (error);

	default:
		mutex_exit(lock);
		return (set_errno(EINVAL));
	}

	/* NOTREACHED */
}

/*
 * semexit - Called by exit() to clean up on process exit.
 *
 * Claims the process's undo tree by clearing p_semacct under p_lock;
 * from then on a concurrent sem_rmid() leaves these undo structures to
 * us.  Each undo structure is then taken out of the tree and, if its
 * semaphore set still exists, its non-zero adjust-on-exit values are
 * applied and it is unlinked from the set's list.  The hold the undo
 * structure had on the set is released and the structure freed.
 * Finally the tree itself is destroyed and freed.
 *
 * A process that never allocated an undo structure has no tree; in that
 * case there is nothing to do.
 */
void
semexit(proc_t *pp)
{
	avl_tree_t	*tree;
	struct sem_undo	*undo;
	void		*cookie = NULL;

	mutex_enter(&pp->p_lock);
	tree = pp->p_semacct;
	pp->p_semacct = NULL;
	mutex_exit(&pp->p_lock);

	/* No undo structures were ever created for this process. */
	if (tree == NULL)
		return;

	while ((undo = avl_destroy_nodes(tree, &cookie)) != NULL) {
		ksemid_t *sp = undo->un_sp;
		size_t size = SEM_UNDOSZ(sp->sem_nsems);
		int i;

		/* legal because the undo structure holds the set */
		(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
		if (!IPC_FREE(&sp->sem_perm)) {
			for (i = 0; i < sp->sem_nsems; i++) {
				int adj = undo->un_aoe[i];
				if (adj) {
					struct sem *semp = &sp->sem_base[i];
					int v = (int)semp->semval + adj;

					/* skip adjustments that don't fit */
					if (v < 0 || v > USHRT_MAX)
						continue;
					semp->semval = (ushort_t)v;
					if (v == 0 && semp->semzcnt)
						cv_broadcast(&semp->semzcnt_cv);
					if (adj > 0 && semp->semncnt)
						cv_broadcast(&semp->semncnt_cv);
				}
			}
			list_remove(&sp->sem_undos, undo);
		}
		/* drops the undo structure's hold and the ID lock */
		ipc_rele(sem_svc, (kipc_perm_t *)sp);
		kmem_free(undo, size);
	}

	avl_destroy(tree);
	kmem_free(tree, sizeof (avl_tree_t));
}

/*
 * Remove all semaphores associated with a given zone.  Called by
 * zone_shutdown when the zone is halted.
 *
 * Registered through zone_key_create() in _init().  The second argument
 * is unused; the work is delegated to ipc_remove_zone().
 */
/*ARGSUSED1*/
static void
sem_remove_zone(zoneid_t zoneid, void *arg)
{
	ipc_remove_zone(sem_svc, zoneid);
}

/*
 * semget - Semget system call.
 *
 * Looks up, or creates, the semaphore set for key and returns its id.
 * For an existing set nsems must lie between 0 and the set's size.  For
 * a new set nsems must be positive and pass the process.max-sem-nsems
 * resource control; the set is then initialized and committed.
 *
 * On failure the result of set_errno() is returned, with EINVAL for a
 * bad nsems or the error reported by ipc_get() / ipc_commit_begin().
 */
static int
semget(key_t key, int nsems, int semflg)
{
	ksemid_t	*sp;
	kmutex_t	*lock;
	int		id, error;
	proc_t		*pp = curproc;

top:
	if (error = ipc_get(sem_svc, key, semflg, (kipc_perm_t **)&sp, &lock))
		return (set_errno(error));

	if (!IPC_FREE(&sp->sem_perm)) {
		/*
		 * A semaphore with the requested key exists.
		 */
		if (!((nsems >= 0) && (nsems <= sp->sem_nsems))) {
			mutex_exit(lock);
			return (set_errno(EINVAL));
		}
	} else {
		/*
		 * This is a new semaphore set.  Finish initialization.
		 *
		 * NOTE(review): pp->p_lock is released below without being
		 * taken in this function, so ipc_get() evidently returns
		 * with it held for a new object -- see os/ipc.c.
		 */
		if (nsems <= 0 || (rctl_test(rc_process_semmsl, pp->p_rctls, pp,
		    nsems, RCA_SAFE) & RCT_DENY)) {
			mutex_exit(lock);
			mutex_exit(&pp->p_lock);
			ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
			return (set_errno(EINVAL));
		}
		/* Both locks are dropped before the sleeping allocation. */
		mutex_exit(lock);
		mutex_exit(&pp->p_lock);

		/*
		 * We round the allocation up to coherency granularity
		 * so that multiple semaphore allocations won't result
		 * in the false sharing of their sem structures.
		 */
		sp->sem_base =
		    kmem_zalloc(P2ROUNDUP(nsems * sizeof (struct sem), 64),
		    KM_SLEEP);
		sp->sem_binary = (nsems == 1);
		sp->sem_nsems = (ushort_t)nsems;
		sp->sem_ctime = gethrestime_sec();
		sp->sem_otime = 0;
		list_create(&sp->sem_undos, sizeof (struct sem_undo),
		    offsetof(struct sem_undo, un_list));

		/* EAGAIN from ipc_commit_begin() means start over. */
		if (error = ipc_commit_begin(sem_svc, key, semflg,
		    (kipc_perm_t *)sp)) {
			if (error == EAGAIN)
				goto top;
			return (set_errno(error));
		}
		sp->sem_maxops =
		    rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
		/*
		 * The size limit is tested a second time here, presumably
		 * because it may have changed while the locks were dropped.
		 */
		if (rctl_test(rc_process_semmsl, pp->p_rctls, pp, nsems,
		    RCA_SAFE) & RCT_DENY) {
			ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
			return (set_errno(EINVAL));
		}
		lock = ipc_commit_end(sem_svc, &sp->sem_perm);
	}
#ifdef C2_AUDIT
	if (audit_active)
		audit_ipcget(AT_IPC_SEM, (void *)sp);
#endif
	/* In both cases the set's lock is held at this point. */
	id = sp->sem_perm.ipc_id;
	mutex_exit(lock);
	return (id);
}

/*
 * semids system call.
 *
 * Hands the caller's buffer, its capacity and the count pointer to
 * ipc_ids() for the semaphore service.  Returns 0 on success, or the
 * result of set_errno() with the error ipc_ids() reported.
 */
static int
semids(int *buf, uint_t nids, uint_t *pnids)
{
	int err = ipc_ids(sem_svc, buf, nids, pnids);

	return (err != 0 ? set_errno(err) : 0);
}


/*
 * Helper function for semop - copies in the provided timespec and
 * computes the absolute future time after which we must return.
 *
 * timeout is the user address of the relative timeout, in the layout of
 * the caller's data model.  On success ts holds now + timeout, *tsp is
 * set to ts, and 0 is returned.  EFAULT is returned if the copyin
 * fails and EINVAL if itimerspecfix() rejects the value.
 */
static int
compute_timeout(timespec_t **tsp, timespec_t *ts, timespec_t *now,
	timespec_t *timeout)
{
	if (get_udatamodel() != DATAMODEL_NATIVE) {
		timespec32_t ts32;

		/* Non-native caller: copy in the 32-bit form and widen it. */
		if (copyin(timeout, &ts32, sizeof (ts32)))
			return (EFAULT);
		TIMESPEC32_TO_TIMESPEC(ts, &ts32)
	} else if (copyin(timeout, ts, sizeof (*ts))) {
		return (EFAULT);
	}

	if (itimerspecfix(ts))
		return (EINVAL);

	/*
	 * Turn the relative value into an absolute one and publish it.
	 */
	timespecadd(ts, now);
	*tsp = ts;

	return (0);
}

/*
 * Undo structure comparator.  We sort based on ksemid_t pointer.
 */
static int
sem_undo_compar(const void *x, const void *y)
{
	struct sem_undo *undo1 = (struct sem_undo *)x;
	struct sem_undo *undo2 = (struct sem_undo *)y;

	if (undo1->un_sp < undo2->un_sp)
		return (-1);
	if (undo1->un_sp > undo2->un_sp)
		return (1);
	return (0);
}

/*
 * Helper function for semop - creates an undo structure and adds it to
 * the process's avl tree and the semaphore's list.
 *
 * pp       - process the undo structure belongs to
 * sp       - semaphore set it applies to
 * lock     - in: the held ID lock for sp; out: the lock returned by
 *            ipc_lock() when it is re-taken
 * template - key (un_sp filled in) used to search the process's tree
 * un       - out: the undo structure to use
 *
 * The ID lock is dropped while memory is allocated and is held again on
 * return, whether or not the call succeeds.  Returns 0 on success, or
 * EIDRM if the set was removed while the lock was dropped.
 */
static int
sem_undo_alloc(proc_t *pp, ksemid_t *sp, kmutex_t **lock,
    struct sem_undo *template, struct sem_undo **un)
{
	size_t size;
	struct sem_undo *undo;
	avl_tree_t *tree = NULL;
	avl_index_t where;

	/* The KM_SLEEP allocations below are done without the ID lock. */
	mutex_exit(*lock);

	size = SEM_UNDOSZ(sp->sem_nsems);
	undo = kmem_zalloc(size, KM_SLEEP);
	undo->un_proc = pp;
	undo->un_sp = sp;

	/*
	 * Unlocked peek: allocate a tree in case one is needed.  The
	 * decision is re-made under p_lock below.
	 */
	if (pp->p_semacct == NULL)
		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

	*lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
	if (IPC_FREE(&sp->sem_perm)) {
		/* The set was removed while we were allocating. */
		kmem_free(undo, size);
		if (tree)
			kmem_free(tree, sizeof (avl_tree_t));
		return (EIDRM);
	}

	mutex_enter(&pp->p_lock);
	if (tree) {
		if (pp->p_semacct == NULL) {
			avl_create(tree, sem_undo_compar,
			    sizeof (struct sem_undo),
			    offsetof(struct sem_undo, un_avl));
			pp->p_semacct = tree;
		} else {
			/* A tree appeared in the meantime; ours is spare. */
			kmem_free(tree, sizeof (avl_tree_t));
		}
	}

	if (*un = avl_find(pp->p_semacct, template, &where)) {
		/* An undo structure for this set already exists; use it. */
		mutex_exit(&pp->p_lock);
		kmem_free(undo, size);
	} else {
		/*
		 * Publish the new structure in both the process's tree
		 * and the set's list, and take a hold on the set for it.
		 */
		*un = undo;
		avl_insert(pp->p_semacct, undo, where);
		mutex_exit(&pp->p_lock);
		list_insert_head(&sp->sem_undos, undo);
		ipc_hold(sem_svc, (kipc_perm_t *)sp);
	}


	return (0);
}

/*
 * semop - Semop system call.
 *
 * semid   - identifier of the semaphore set
 * sops    - user address of an array of nsops operations
 * nsops   - number of operations; 0 succeeds immediately
 * timeout - user address of a relative timeout, or NULL to wait
 *           indefinitely
 *
 * All operations are applied atomically: they are performed in order
 * and, if one cannot proceed, the ones already done are rolled back
 * before the call either fails or sleeps and starts over.
 *
 * Returns 0 on success.  On failure the result of set_errno() is
 * returned with one of:
 *   EINVAL - semid does not name a semaphore set
 *   E2BIG  - nsops exceeds the set's sem_maxops
 *   EFAULT - the operations could not be copied in
 *   EFBIG  - an operation names a semaphore number outside the set
 *   ERANGE - a semaphore value or undo adjustment would overflow
 *   EAGAIN - IPC_NOWAIT was given and the operation would block, or
 *            the wait returned a negative result (the timeout case)
 *   EINTR  - the wait was interrupted
 *   EIDRM  - the set was removed
 * or the error from compute_timeout() or ipcperm_access().
 */
static int
semop(int semid, struct sembuf *sops, size_t nsops, timespec_t *timeout)
{
	ksemid_t	*sp = NULL;
	kmutex_t	*lock;
	struct sembuf	*op;	/* ptr to operation */
	int		i;	/* loop control */
	struct sem	*semp;	/* ptr to semaphore */
	int 		error = 0;
	struct sembuf	*uops;	/* ptr to copy of user ops */
	struct sembuf 	x_sem;	/* avoid kmem_alloc's */
	timespec_t	now, ts, *tsp = NULL;
	int		timecheck = 0;
	int		cvres, needundo, mode;
	struct sem_undo	*undo = NULL;	/* set only if SEM_UNDO is used */
	proc_t		*pp = curproc;
	int		held = 0;	/* non-zero once we hold a ref on sp */

	CPU_STATS_ADDQ(CPU, sys, sema, 1); /* bump semaphore op count */

	/*
	 * To avoid the cost of copying in 'timeout' in the common
	 * case, we could only grab the time here and defer the copyin
	 * and associated computations until we are about to block.
	 *
	 * The down side to this is that we would then have to spin
	 * some goto top nonsense to avoid the copyin behind the semid
	 * lock.  As a common use of timed semaphores is as an explicit
	 * blocking mechanism, this could incur a greater penalty.
	 *
	 * If we eventually decide that this would be a wise route to
	 * take, the deferrable functionality is completely contained
	 * in 'compute_timeout', and the interface is defined such that
	 * we can legally not validate 'timeout' if it is unused.
	 */
	if (timeout != NULL) {
		timecheck = timechanged;
		gethrestime(&now);
		if (error = compute_timeout(&tsp, &ts, &now, timeout))
			return (set_errno(error));
	}

	/*
	 * Allocate space to hold the vector of semaphore ops.  If
	 * there is only 1 operation we use a preallocated buffer on
	 * the stack for speed.
	 *
	 * Since we don't want to allow the user to allocate an
	 * arbitrary amount of kernel memory, we need to check against
	 * the number of operations allowed by the semaphore.  We only
	 * bother doing this if the number of operations is larger than
	 * SEM_MAXUCOPS.
	 */
	if (nsops == 1)
		uops = &x_sem;
	else if (nsops == 0)
		return (0);
	else if (nsops <= SEM_MAXUCOPS)
		uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);

	if (nsops > SEM_MAXUCOPS) {
		/*
		 * An unknown semid is EINVAL here, exactly as in the
		 * small-request path below; nothing has been allocated
		 * yet, so a plain return is sufficient.
		 */
		if ((lock = ipc_lookup(sem_svc, semid,
		    (kipc_perm_t **)&sp)) == NULL)
			return (set_errno(EINVAL));

		if (nsops > sp->sem_maxops) {
			mutex_exit(lock);
			return (set_errno(E2BIG));
		}
		/* Hold the set so it survives while the lock is dropped. */
		held = 1;
		ipc_hold(sem_svc, (kipc_perm_t *)sp);
		mutex_exit(lock);

		uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
		if (copyin(sops, uops, nsops * sizeof (*op))) {
			error = EFAULT;
			/* semoperr expects the ID lock to be held */
			(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
			goto semoperr;
		}

		lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
		if (IPC_FREE(&sp->sem_perm)) {
			error = EIDRM;
			goto semoperr;
		}
	} else {
		/*
		 * This could be interleaved with the above code, but
		 * keeping them separate improves readability.
		 */
		if (copyin(sops, uops, nsops * sizeof (*op))) {
			error = EFAULT;
			goto semoperr_unlocked;
		}

		if ((lock = ipc_lookup(sem_svc, semid,
		    (kipc_perm_t **)&sp)) == NULL) {
			error = EINVAL;
			goto semoperr_unlocked;
		}

		if (nsops > sp->sem_maxops) {
			error = E2BIG;
			goto semoperr;
		}
	}

	/*
	 * Scan all operations.  Verify that sem #s are in range and
	 * this process is allowed the requested operations.  If any
	 * operations are marked SEM_UNDO, find (or allocate) the undo
	 * structure for this process and semaphore.
	 */
	needundo = 0;
	mode = 0;
	for (i = 0, op = uops; i++ < nsops; op++) {
		/* altering ops need SEM_A, wait-for-zero needs SEM_R */
		mode |= op->sem_op ? SEM_A : SEM_R;
		if (op->sem_num >= sp->sem_nsems) {
			error = EFBIG;
			goto semoperr;
		}
		if ((op->sem_flg & SEM_UNDO) && op->sem_op)
			needundo = 1;
	}
	if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
		goto semoperr;

	if (needundo) {
		struct sem_undo template;

		/* only un_sp is needed as the search key */
		template.un_sp = sp;
		mutex_enter(&pp->p_lock);
		if (pp->p_semacct)
			undo = avl_find(pp->p_semacct, &template, NULL);
		else
			undo = NULL;
		mutex_exit(&pp->p_lock);
		if (undo == NULL) {
			if (error = sem_undo_alloc(pp, sp, &lock, &template,
			    &undo))
				goto semoperr;

			/* sem_undo_alloc unlocks the semaphore */
			if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
				goto semoperr;
		}
	}

check:
	/*
	 * Loop waiting for the operations to be satisfied atomically.
	 * Actually, do the operations and undo them if a wait is needed
	 * or an error is detected.
	 */
	for (i = 0; i < nsops; i++) {
		op = &uops[i];
		semp = &sp->sem_base[op->sem_num];

		/*
		 * Raise the semaphore (i.e. sema_v)
		 */
		if (op->sem_op > 0) {
			if (op->sem_op + (int)semp->semval > USHRT_MAX ||
			    ((op->sem_flg & SEM_UNDO) &&
			    (error = sem_undo_add(op->sem_op, op->sem_num,
			    undo)))) {
				if (i)
					sem_rollback(sp, uops, i, undo);
				/* value overflow leaves error unset */
				if (error == 0)
					error = ERANGE;
				goto semoperr;
			}
			semp->semval += op->sem_op;
			/*
			 * If we are only incrementing the semaphore value
			 * by one on a binary semaphore, we can cv_signal.
			 */
			if (semp->semncnt) {
				if (op->sem_op == 1 && sp->sem_binary)
					cv_signal(&semp->semncnt_cv);
				else
					cv_broadcast(&semp->semncnt_cv);
			}
			if (semp->semzcnt && !semp->semval)
				cv_broadcast(&semp->semzcnt_cv);
			continue;
		}

		/*
		 * Lower the semaphore (i.e. sema_p)
		 */
		if (op->sem_op < 0) {
			if (semp->semval >= (unsigned)(-op->sem_op)) {
				if ((op->sem_flg & SEM_UNDO) &&
				    (error = sem_undo_add(op->sem_op,
				    op->sem_num, undo))) {
					if (i)
						sem_rollback(sp, uops, i, undo);
					goto semoperr;
				}
				semp->semval += op->sem_op;
				if (semp->semzcnt && !semp->semval)
					cv_broadcast(&semp->semzcnt_cv);
				continue;
			}
			/* Can't proceed: undo the earlier ops first. */
			if (i)
				sem_rollback(sp, uops, i, undo);
			if (op->sem_flg & IPC_NOWAIT) {
				error = EAGAIN;
				goto semoperr;
			}

			/*
			 * Mark the semaphore set as not a binary type
			 * if we are decrementing the value by more than 1.
			 *
			 * V operations will resort to cv_broadcast
			 * for this set because there are too many weird
			 * cases that have to be caught.
			 */
			if (op->sem_op < -1)
				sp->sem_binary = 0;
			/* keep the set alive across the sleep */
			if (!held) {
				held = 1;
				ipc_hold(sem_svc, (kipc_perm_t *)sp);
			}
			semp->semncnt++;
			cvres = cv_waituntil_sig(&semp->semncnt_cv, lock,
				tsp, timecheck);
			lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);

			/*
			 * sem_rmid() zeroes semncnt, so the count is only
			 * adjusted if the set still exists.
			 */
			if (!IPC_FREE(&sp->sem_perm)) {
				ASSERT(semp->semncnt != 0);
				semp->semncnt--;
				if (cvres > 0)	/* normal wakeup */
					goto check;
			}

			/* EINTR or EAGAIN overrides EIDRM */
			if (cvres == 0)
				error = EINTR;
			else if (cvres < 0)
				error = EAGAIN;
			else
				error = EIDRM;
			goto semoperr;
		}

		/*
		 * Wait for zero value
		 */
		if (semp->semval) {
			if (i)
				sem_rollback(sp, uops, i, undo);
			if (op->sem_flg & IPC_NOWAIT) {
				error = EAGAIN;
				goto semoperr;
			}

			/* keep the set alive across the sleep */
			if (!held) {
				held = 1;
				ipc_hold(sem_svc, (kipc_perm_t *)sp);
			}
			semp->semzcnt++;
			cvres = cv_waituntil_sig(&semp->semzcnt_cv, lock,
				tsp, timecheck);
			lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);

			/*
			 * Don't touch semp if the semaphores have been removed.
			 */
			if (!IPC_FREE(&sp->sem_perm)) {
				ASSERT(semp->semzcnt != 0);
				semp->semzcnt--;
				if (cvres > 0)	/* normal wakeup */
					goto check;
			}

			/* EINTR or EAGAIN overrides EIDRM */
			if (cvres == 0)
				error = EINTR;
			else if (cvres < 0)
				error = EAGAIN;
			else
				error = EIDRM;
			goto semoperr;
		}
	}

	/* All operations succeeded.  Update sempid for accessed semaphores. */
	for (i = 0, op = uops; i++ < nsops;
	    sp->sem_base[(op++)->sem_num].sempid = pp->p_pid)
		;
	sp->sem_otime = gethrestime_sec();
	/* with a hold, ipc_rele() is used in place of a plain unlock */
	if (held)
		ipc_rele(sem_svc, (kipc_perm_t *)sp);
	else
		mutex_exit(lock);

	/* Before leaving, deallocate the buffer that held the user semops */
	if (nsops != 1)
		kmem_free(uops, sizeof (*uops) * nsops);
	return (0);

	/*
	 * Error return labels
	 *
	 * semoperr is entered with the ID lock held; semoperr_unlocked
	 * is entered without it.
	 */
semoperr:
	if (held)
		ipc_rele(sem_svc, (kipc_perm_t *)sp);
	else
		mutex_exit(lock);

semoperr_unlocked:

	/* Before leaving, deallocate the buffer that held the user semops */
	if (nsops != 1)
		kmem_free(uops, sizeof (*uops) * nsops);
	return (set_errno(error));
}

/*
 * semsys - System entry point for the semctl, semget, semop, semids and
 * semtimedop system calls.
 *
 * opcode selects the call and a1..a4 carry its arguments, cast to the
 * types the handler expects.  SEMOP and SEMTIMEDOP share semop(); the
 * former passes no timeout.  The handler's return value is passed back
 * unchanged, and an unknown opcode yields set_errno(EINVAL).
 */
static int
semsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4)
{
	switch (opcode) {
	case SEMCTL:
		return (semctl((int)a1, (uint_t)a2, (int)a3, a4));
	case SEMGET:
		return (semget((key_t)a1, (int)a2, (int)a3));
	case SEMOP:
		return (semop((int)a1, (struct sembuf *)a2, (size_t)a3, 0));
	case SEMIDS:
		return (semids((int *)a1, (uint_t)a2, (uint_t *)a3));
	case SEMTIMEDOP:
		return (semop((int)a1, (struct sembuf *)a2, (size_t)a3,
		    (timespec_t *)a4));
	default:
		return (set_errno(EINVAL));
	}
}