xref: /titanic_41/usr/src/uts/common/syscall/sem.c (revision 46736d35df047bb400483364f76bfcb08cdcbb25)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 /*
33  * Inter-Process Communication Semaphore Facility.
34  *
35  * See os/ipc.c for a description of common IPC functionality.
36  *
37  * Resource controls
38  * -----------------
39  *
40  * Control:      zone.max-sem-ids (rc_zone_semmni)
41  * Description:  Maximum number of semaphore ids allowed a zone.
42  *
43  *   When semget() is used to allocate a semaphore set, one id is
44  *   allocated.  If the id allocation doesn't succeed, semget() fails
45  *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
46  *   the id is deallocated.
47  *
48  * Control:      project.max-sem-ids (rc_project_semmni)
49  * Description:  Maximum number of semaphore ids allowed a project.
50  *
51  *   When semget() is used to allocate a semaphore set, one id is
52  *   allocated.  If the id allocation doesn't succeed, semget() fails
53  *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
54  *   the id is deallocated.
55  *
56  * Control:      process.max-sem-nsems (rc_process_semmsl)
57  * Description:  Maximum number of semaphores allowed per semaphore set.
58  *
59  *   When semget() is used to allocate a semaphore set, the size of the
60  *   set is compared with this limit.  If the number of semaphores
61  *   exceeds the limit, semget() fails and errno is set to EINVAL.
62  *
63  * Control:      process.max-sem-ops (rc_process_semopm)
64  * Description:  Maximum number of semaphore operations allowed per
65  *               semop call.
66  *
67  *   When semget() successfully allocates a semaphore set, the minimum
68  *   enforced value of this limit is used to initialize the
69  *   "system-imposed maximum" number of operations a semop() call for
70  *   this set can perform.
71  *
72  * Undo structures
73  * ---------------
74  *
75  * Removing the undo structure tunables involved a serious redesign of
76  * how they were implemented.  There is now one undo structure for
77  * every process/semaphore array combination (lazily allocated, of
78  * course), and each is equal in size to the semaphore it corresponds
79  * to.  To avoid scalability and performance problems, the undo
80  * structures are stored in two places: a per-process AVL tree sorted
81  * by ksemid pointer (p_semacct, protected by p_lock) and an unsorted
82  * per-semaphore linked list (sem_undos, protected by the semaphore's
83  * ID lock).  The former is used by semop, where a lookup is performed
84  * once and cached if SEM_UNDO is specified for any of the operations,
85  * and at process exit where the undoable operations are rolled back.
86  * The latter is used when removing the semaphore, so the undo
87  * structures can be removed from the appropriate processes' trees.
88  *
89  * The undo structure itself contains pointers to the ksemid and proc
90  * to which it corresponds, a list node, an AVL node, and an array of
91  * adjust-on-exit (AOE) values.  When an undo structure is allocated it
92  * is immediately added to both the process's tree and the semaphore's
93  * list.  Lastly, the reference count on the semaphore is increased.
94  *
95  * Avoiding a lock ordering violation between p_lock and the ID lock,
96  * wont to occur when there is a race between a process exiting and the
97  * removal of a semaphore, mandates the delicate dance that exists
98  * between semexit and sem_rmid.
99  *
100  * sem_rmid, holding the ID lock, iterates through all undo structures
101  * and for each takes the appropriate process's p_lock and checks to
102  * see if p_semacct is NULL.  If it is, it skips that undo structure
103  * and continues to the next.  Otherwise, it removes the undo structure
104  * from both the AVL tree and the semaphore's list, and releases the
105  * hold that the undo structure had on the semaphore.
106  *
107  * The important other half of this is semexit, which will immediately
108  * take p_lock, obtain the AVL pointer, clear p_semacct, and drop
109  * p_lock.  From this point on it is semexit's responsibility to clean
110  * up all undo structures found in the tree -- a coexecuting sem_rmid
111  * will see the NULL p_semacct and skip that undo structure.  It walks
112  * the AVL tree (using avl_destroy_nodes) and for each undo structure
113  * takes the appropriate semaphore's ID lock (always legal since the
114  * undo structure has a hold on the semaphore), updates all semaphores
115  * with non-zero AOE values, and removes the structure from the
116  * semaphore's list.  It then drops the structure's reference on the
117  * semaphore, drops the ID lock, and frees the undo structure.
118  */
119 
120 #include <sys/types.h>
121 #include <sys/t_lock.h>
122 #include <sys/param.h>
123 #include <sys/systm.h>
124 #include <sys/sysmacros.h>
125 #include <sys/cred.h>
126 #include <sys/vmem.h>
127 #include <sys/kmem.h>
128 #include <sys/errno.h>
129 #include <sys/time.h>
130 #include <sys/ipc.h>
131 #include <sys/ipc_impl.h>
132 #include <sys/sem.h>
133 #include <sys/sem_impl.h>
134 #include <sys/user.h>
135 #include <sys/proc.h>
136 #include <sys/cpuvar.h>
137 #include <sys/debug.h>
138 #include <sys/var.h>
139 #include <sys/cmn_err.h>
140 #include <sys/modctl.h>
141 #include <sys/syscall.h>
142 #include <sys/avl.h>
143 #include <sys/list.h>
144 #include <sys/zone.h>
145 
146 #include <c2/audit.h>
147 
/*
 * Resource control handles used by this file (defined elsewhere).  The
 * control names are the ones listed in the block comment at the top of
 * this file.
 */
extern rctl_hndl_t rc_zone_semmni;	/* zone.max-sem-ids */
extern rctl_hndl_t rc_project_semmni;	/* project.max-sem-ids */
extern rctl_hndl_t rc_process_semmsl;	/* process.max-sem-nsems */
extern rctl_hndl_t rc_process_semopm;	/* process.max-sem-ops */
/* IPC service handle for semaphores; set from ipcs_create() in _init(). */
static ipc_service_t *sem_svc;
/* Zone key created in _init() with sem_remove_zone() as its callback. */
static zone_key_t sem_zone_key;
154 
/*
 * The following tunables are obsolete.  Though for compatibility we
 * still read and interpret seminfo_semmsl, seminfo_semopm and
 * seminfo_semmni (see os/project.c and os/rctl_proc.c), the preferred
 * mechanism for administering the IPC Semaphore facility is through
 * the resource controls described at the top of this file.
 *
 * None of these variables is read by the code in this file; the values
 * below are only their initial settings.
 */
int seminfo_semaem = 16384;	/* (obsolete) */
int seminfo_semmap = 10;	/* (obsolete) */
int seminfo_semmni = 10;	/* (obsolete) */
int seminfo_semmns = 60;	/* (obsolete) */
int seminfo_semmnu = 30;	/* (obsolete) */
int seminfo_semmsl = 25;	/* (obsolete) */
int seminfo_semopm = 10;	/* (obsolete) */
int seminfo_semume = 10;	/* (obsolete) */
int seminfo_semusz = 96;	/* (obsolete) */
int seminfo_semvmx = 32767;	/* (obsolete) */
172 
#define	SEM_MAXUCOPS	4096	/* max # of unchecked ops per semop call */

/*
 * Size in bytes of the undo structure for a set of n semaphores: one
 * struct sem_undo plus (n - 1) further int-sized adjust-on-exit slots.
 * (Presumably struct sem_undo already ends in a one-element un_aoe
 * array; confirm against sys/sem_impl.h.)  The argument is parenthesized
 * so that any expression may safely be passed as n.
 */
#define	SEM_UNDOSZ(n)	(sizeof (struct sem_undo) + ((n) - 1) * sizeof (int))
175 
/*
 * Forward declarations.  semsys() is the handler stored in ipcsem_sysent;
 * sem_dtor() and sem_rmid() are handed to ipcs_create(), and
 * sem_remove_zone() to zone_key_create(), in _init().
 */
static int semsys(int opcode, uintptr_t a0, uintptr_t a1,
    uintptr_t a2, uintptr_t a3);
static void sem_dtor(kipc_perm_t *);
static void sem_rmid(kipc_perm_t *);
static void sem_remove_zone(zoneid_t, void *);
181 
/*
 * System call table entry for the semaphore facility, with semsys() as
 * the handler.  It is shared by both modlsys structures below.
 */
static struct sysent ipcsem_sysent = {
	5,	/* presumably the argument count; semsys() takes 5 arguments */
	SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
	semsys
};
187 
/*
 * Module linkage information for the kernel.
 *
 * modlsys registers ipcsem_sysent through mod_syscallops; when
 * _SYSCALL32_IMPL is defined, modlsys32 registers the same entry through
 * mod_syscallops32 as well.  modlinkage is the NULL-terminated list of
 * both that is passed to mod_install() and mod_info().
 */
static struct modlsys modlsys = {
	&mod_syscallops, "System V semaphore facility", &ipcsem_sysent
};

#ifdef _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32, "32-bit System V semaphore facility", &ipcsem_sysent
};
#endif

static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
209 
210 
211 int
212 _init(void)
213 {
214 	int result;
215 
216 	sem_svc = ipcs_create("semids", rc_project_semmni, rc_zone_semmni,
217 	    sizeof (ksemid_t), sem_dtor, sem_rmid, AT_IPC_SEM,
218 	    offsetof(ipc_rqty_t, ipcq_semmni));
219 	zone_key_create(&sem_zone_key, NULL, sem_remove_zone, NULL);
220 
221 	if ((result = mod_install(&modlinkage)) == 0)
222 		return (0);
223 
224 	(void) zone_key_delete(sem_zone_key);
225 	ipcs_destroy(sem_svc);
226 
227 	return (result);
228 }
229 
/*
 * _fini - module unload entry point.
 *
 * Always returns EBUSY and performs no teardown, so the module presumably
 * can never be unloaded once installed (ipcsem_sysent also carries
 * SE_NOUNLOAD).
 */
int
_fini(void)
{
	return (EBUSY);
}
235 
/*
 * _info - module information entry point.
 *
 * Forwards to mod_info() with this module's linkage structure and returns
 * its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
241 
242 static void
243 sem_dtor(kipc_perm_t *perm)
244 {
245 	ksemid_t *sp = (ksemid_t *)perm;
246 
247 	kmem_free(sp->sem_base,
248 	    P2ROUNDUP(sp->sem_nsems * sizeof (struct sem), 64));
249 	list_destroy(&sp->sem_undos);
250 }
251 
252 /*
253  * sem_undo_add - Create or update adjust on exit entry.
254  */
255 static int
256 sem_undo_add(short val, ushort_t num, struct sem_undo *undo)
257 {
258 	int newval = undo->un_aoe[num] - val;
259 
260 	if (newval > USHRT_MAX || newval < -USHRT_MAX)
261 		return (ERANGE);
262 	undo->un_aoe[num] = newval;
263 
264 	return (0);
265 }
266 
267 /*
268  * sem_undo_clear - clears all undo entries for specified semaphores
269  *
270  * Used when semaphores are reset by SETVAL or SETALL.
271  */
272 static void
273 sem_undo_clear(ksemid_t *sp, ushort_t low, ushort_t high)
274 {
275 	struct sem_undo *undo;
276 	int i;
277 
278 	ASSERT(low <= high);
279 	ASSERT(high < sp->sem_nsems);
280 
281 	for (undo = list_head(&sp->sem_undos); undo;
282 	    undo = list_next(&sp->sem_undos, undo))
283 		for (i = low; i <= high; i++)
284 			undo->un_aoe[i] = 0;
285 }
286 
287 /*
288  * sem_rollback - roll back work done so far if unable to complete operation
289  */
290 static void
291 sem_rollback(ksemid_t *sp, struct sembuf *op, int n, struct sem_undo *undo)
292 {
293 	struct sem *semp;	/* semaphore ptr */
294 
295 	for (op += n - 1; n--; op--) {
296 		if (op->sem_op == 0)
297 			continue;
298 		semp = &sp->sem_base[op->sem_num];
299 		semp->semval -= op->sem_op;
300 		if (op->sem_flg & SEM_UNDO) {
301 			ASSERT(undo != NULL);
302 			(void) sem_undo_add(-op->sem_op, op->sem_num, undo);
303 		}
304 	}
305 }
306 
307 static void
308 sem_rmid(kipc_perm_t *perm)
309 {
310 	ksemid_t *sp = (ksemid_t *)perm;
311 	struct sem *semp;
312 	struct sem_undo *undo;
313 	size_t size = SEM_UNDOSZ(sp->sem_nsems);
314 	int i;
315 
316 	/*LINTED*/
317 	while (undo = list_head(&sp->sem_undos)) {
318 		list_remove(&sp->sem_undos, undo);
319 		mutex_enter(&undo->un_proc->p_lock);
320 		if (undo->un_proc->p_semacct == NULL) {
321 			mutex_exit(&undo->un_proc->p_lock);
322 			continue;
323 		}
324 		avl_remove(undo->un_proc->p_semacct, undo);
325 		mutex_exit(&undo->un_proc->p_lock);
326 		kmem_free(undo, size);
327 		ipc_rele_locked(sem_svc, (kipc_perm_t *)sp);
328 	}
329 
330 	for (i = 0; i < sp->sem_nsems; i++) {
331 		semp = &sp->sem_base[i];
332 		semp->semval = semp->sempid = 0;
333 		if (semp->semncnt) {
334 			cv_broadcast(&semp->semncnt_cv);
335 			semp->semncnt = 0;
336 		}
337 		if (semp->semzcnt) {
338 			cv_broadcast(&semp->semzcnt_cv);
339 			semp->semzcnt = 0;
340 		}
341 	}
342 }
343 
344 /*
345  * semctl - Semctl system call.
346  */
347 static int
348 semctl(int semid, uint_t semnum, int cmd, uintptr_t arg)
349 {
350 	ksemid_t		*sp;	/* ptr to semaphore header */
351 	struct sem		*p;	/* ptr to semaphore */
352 	unsigned int		i;	/* loop control */
353 	ushort_t		*vals, *vp;
354 	size_t			vsize = 0;
355 	int			error = 0;
356 	int			retval = 0;
357 	struct cred		*cr;
358 	kmutex_t		*lock;
359 	model_t			mdl = get_udatamodel();
360 	STRUCT_DECL(semid_ds, sid);
361 	struct semid_ds64	ds64;
362 
363 	STRUCT_INIT(sid, mdl);
364 	cr = CRED();
365 
366 	/*
367 	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
368 	 */
369 	switch (cmd) {
370 	case IPC_SET:
371 		if (copyin((void *)arg, STRUCT_BUF(sid), STRUCT_SIZE(sid)))
372 			return (set_errno(EFAULT));
373 		break;
374 
375 	case IPC_SET64:
376 		if (copyin((void *)arg, &ds64, sizeof (struct semid_ds64)))
377 			return (set_errno(EFAULT));
378 		break;
379 
380 	case SETALL:
381 		if ((lock = ipc_lookup(sem_svc, semid,
382 		    (kipc_perm_t **)&sp)) == NULL)
383 			return (set_errno(EINVAL));
384 		vsize = sp->sem_nsems * sizeof (*vals);
385 		mutex_exit(lock);
386 
387 		/* allocate space to hold all semaphore values */
388 		vals = kmem_alloc(vsize, KM_SLEEP);
389 
390 		if (copyin((void *)arg, vals, vsize)) {
391 			kmem_free(vals, vsize);
392 			return (set_errno(EFAULT));
393 		}
394 		break;
395 
396 	case IPC_RMID:
397 		if (error = ipc_rmid(sem_svc, semid, cr))
398 			return (set_errno(error));
399 		return (0);
400 	}
401 
402 	if ((lock = ipc_lookup(sem_svc, semid, (kipc_perm_t **)&sp)) == NULL) {
403 		if (vsize != 0)
404 			kmem_free(vals, vsize);
405 		return (set_errno(EINVAL));
406 	}
407 	switch (cmd) {
408 	/* Set ownership and permissions. */
409 	case IPC_SET:
410 
411 		if (error = ipcperm_set(sem_svc, cr, &sp->sem_perm,
412 		    &STRUCT_BUF(sid)->sem_perm, mdl)) {
413 			mutex_exit(lock);
414 			return (set_errno(error));
415 		}
416 		sp->sem_ctime = gethrestime_sec();
417 		mutex_exit(lock);
418 		return (0);
419 
420 	/* Get semaphore data structure. */
421 	case IPC_STAT:
422 
423 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
424 			mutex_exit(lock);
425 			return (set_errno(error));
426 		}
427 
428 		ipcperm_stat(&STRUCT_BUF(sid)->sem_perm, &sp->sem_perm, mdl);
429 		STRUCT_FSETP(sid, sem_base, NULL);	/* kernel addr */
430 		STRUCT_FSET(sid, sem_nsems, sp->sem_nsems);
431 		STRUCT_FSET(sid, sem_otime, sp->sem_otime);
432 		STRUCT_FSET(sid, sem_ctime, sp->sem_ctime);
433 		STRUCT_FSET(sid, sem_binary, sp->sem_binary);
434 		mutex_exit(lock);
435 
436 		if (copyout(STRUCT_BUF(sid), (void *)arg, STRUCT_SIZE(sid)))
437 			return (set_errno(EFAULT));
438 		return (0);
439 
440 	case IPC_SET64:
441 
442 		if (error = ipcperm_set64(sem_svc, cr, &sp->sem_perm,
443 		    &ds64.semx_perm)) {
444 			mutex_exit(lock);
445 			return (set_errno(error));
446 		}
447 		sp->sem_ctime = gethrestime_sec();
448 		mutex_exit(lock);
449 		return (0);
450 
451 	case IPC_STAT64:
452 
453 		ipcperm_stat64(&ds64.semx_perm, &sp->sem_perm);
454 		ds64.semx_nsems = sp->sem_nsems;
455 		ds64.semx_otime = sp->sem_otime;
456 		ds64.semx_ctime = sp->sem_ctime;
457 
458 		mutex_exit(lock);
459 		if (copyout(&ds64, (void *)arg, sizeof (struct semid_ds64)))
460 			return (set_errno(EFAULT));
461 
462 		return (0);
463 
464 	/* Get # of processes sleeping for greater semval. */
465 	case GETNCNT:
466 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
467 			mutex_exit(lock);
468 			return (set_errno(error));
469 		}
470 		if (semnum >= sp->sem_nsems) {
471 			mutex_exit(lock);
472 			return (set_errno(EINVAL));
473 		}
474 		retval = sp->sem_base[semnum].semncnt;
475 		mutex_exit(lock);
476 		return (retval);
477 
478 	/* Get pid of last process to operate on semaphore. */
479 	case GETPID:
480 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
481 			mutex_exit(lock);
482 			return (set_errno(error));
483 		}
484 		if (semnum >= sp->sem_nsems) {
485 			mutex_exit(lock);
486 			return (set_errno(EINVAL));
487 		}
488 		retval = sp->sem_base[semnum].sempid;
489 		mutex_exit(lock);
490 		return (retval);
491 
492 	/* Get semval of one semaphore. */
493 	case GETVAL:
494 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
495 			mutex_exit(lock);
496 			return (set_errno(error));
497 		}
498 		if (semnum >= sp->sem_nsems) {
499 			mutex_exit(lock);
500 			return (set_errno(EINVAL));
501 		}
502 		retval = sp->sem_base[semnum].semval;
503 		mutex_exit(lock);
504 		return (retval);
505 
506 	/* Get all semvals in set. */
507 	case GETALL:
508 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
509 			mutex_exit(lock);
510 			return (set_errno(error));
511 		}
512 
513 		/* allocate space to hold all semaphore values */
514 		vsize = sp->sem_nsems * sizeof (*vals);
515 		vals = vp = kmem_alloc(vsize, KM_SLEEP);
516 
517 		for (i = sp->sem_nsems, p = sp->sem_base; i--; p++, vp++)
518 			bcopy(&p->semval, vp, sizeof (p->semval));
519 
520 		mutex_exit(lock);
521 
522 		if (copyout((void *)vals, (void *)arg, vsize)) {
523 			kmem_free(vals, vsize);
524 			return (set_errno(EFAULT));
525 		}
526 
527 		kmem_free(vals, vsize);
528 		return (0);
529 
530 	/* Get # of processes sleeping for semval to become zero. */
531 	case GETZCNT:
532 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
533 			mutex_exit(lock);
534 			return (set_errno(error));
535 		}
536 		if (semnum >= sp->sem_nsems) {
537 			mutex_exit(lock);
538 			return (set_errno(EINVAL));
539 		}
540 		retval = sp->sem_base[semnum].semzcnt;
541 		mutex_exit(lock);
542 		return (retval);
543 
544 	/* Set semval of one semaphore. */
545 	case SETVAL:
546 		if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
547 			mutex_exit(lock);
548 			return (set_errno(error));
549 		}
550 		if (semnum >= sp->sem_nsems) {
551 			mutex_exit(lock);
552 			return (set_errno(EINVAL));
553 		}
554 		if ((uint_t)arg > USHRT_MAX) {
555 			mutex_exit(lock);
556 			return (set_errno(ERANGE));
557 		}
558 		p = &sp->sem_base[semnum];
559 		if ((p->semval = (ushort_t)arg) != 0) {
560 			if (p->semncnt) {
561 				cv_broadcast(&p->semncnt_cv);
562 			}
563 		} else if (p->semzcnt) {
564 			cv_broadcast(&p->semzcnt_cv);
565 		}
566 		p->sempid = curproc->p_pid;
567 		sem_undo_clear(sp, (ushort_t)semnum, (ushort_t)semnum);
568 		mutex_exit(lock);
569 		return (0);
570 
571 	/* Set semvals of all semaphores in set. */
572 	case SETALL:
573 		/* Check if semaphore set has been deleted and reallocated. */
574 		if (sp->sem_nsems * sizeof (*vals) != vsize) {
575 			error = set_errno(EINVAL);
576 			goto seterr;
577 		}
578 		if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
579 			error = set_errno(error);
580 			goto seterr;
581 		}
582 		sem_undo_clear(sp, 0, sp->sem_nsems - 1);
583 		for (i = 0, p = sp->sem_base; i < sp->sem_nsems;
584 		    (p++)->sempid = curproc->p_pid) {
585 			if ((p->semval = vals[i++]) != 0) {
586 				if (p->semncnt) {
587 					cv_broadcast(&p->semncnt_cv);
588 				}
589 			} else if (p->semzcnt) {
590 				cv_broadcast(&p->semzcnt_cv);
591 			}
592 		}
593 seterr:
594 		mutex_exit(lock);
595 		kmem_free(vals, vsize);
596 		return (error);
597 
598 	default:
599 		mutex_exit(lock);
600 		return (set_errno(EINVAL));
601 	}
602 
603 	/* NOTREACHED */
604 }
605 
/*
 * semexit - Called by exit() to clean up on process exit.
 *
 * Detaches the undo tree of pp (p_semacct) while holding p_lock, then
 * destroys it node by node.  For each undo structure whose semaphore set
 * has not been freed, the adjust-on-exit values are applied and the
 * structure is unlinked from the set's undo list.  In every case the hold
 * the structure had on the set is released and the structure is freed.
 * Finally the tree itself is destroyed and freed.
 *
 * NOTE(review): tree is handed to avl_destroy_nodes() without a NULL
 * check, so callers presumably only call this when pp->p_semacct is
 * non-NULL -- confirm against the caller in exit().
 */
void
semexit(proc_t *pp)
{
	avl_tree_t	*tree;
	struct sem_undo	*undo;
	void		*cookie = NULL;

	/*
	 * Claim the tree.  Once p_semacct is NULL, a concurrent sem_rmid()
	 * leaves the undo structures in it alone.
	 */
	mutex_enter(&pp->p_lock);
	tree = pp->p_semacct;
	pp->p_semacct = NULL;
	mutex_exit(&pp->p_lock);

	while (undo = avl_destroy_nodes(tree, &cookie)) {
		ksemid_t *sp = undo->un_sp;
		size_t size = SEM_UNDOSZ(sp->sem_nsems);
		int i;

		(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
		if (!IPC_FREE(&sp->sem_perm)) {
			for (i = 0; i < sp->sem_nsems; i++) {
				int adj = undo->un_aoe[i];
				if (adj) {
					struct sem *semp = &sp->sem_base[i];
					int v = (int)semp->semval + adj;

					/*
					 * Skip an adjustment that would take
					 * the value out of range.
					 */
					if (v < 0 || v > USHRT_MAX)
						continue;
					semp->semval = (ushort_t)v;
					/* Wake waiters the change may satisfy. */
					if (v == 0 && semp->semzcnt)
						cv_broadcast(&semp->semzcnt_cv);
					if (adj > 0 && semp->semncnt)
						cv_broadcast(&semp->semncnt_cv);
				}
			}
			list_remove(&sp->sem_undos, undo);
		}
		/*
		 * Drop the undo structure's hold on the set; presumably this
		 * also drops the ID lock taken above, as no mutex_exit follows.
		 */
		ipc_rele(sem_svc, (kipc_perm_t *)sp);
		kmem_free(undo, size);
	}

	avl_destroy(tree);
	kmem_free(tree, sizeof (avl_tree_t));
}
652 
/*
 * Remove all semaphores associated with a given zone.  Called by
 * zone_shutdown when the zone is halted.
 *
 * Registered through zone_key_create() in _init().  The work is done by
 * ipc_remove_zone(); arg is unused.
 */
/*ARGSUSED1*/
static void
sem_remove_zone(zoneid_t zoneid, void *arg)
{
	ipc_remove_zone(sem_svc, zoneid);
}
663 
/*
 * semget - Semget system call.
 *
 * Looks up, or creates, the semaphore set identified by key and returns
 * its id.  nsems is the number of semaphores required: for an existing
 * set it must lie in [0, sem_nsems]; for a new set it must be positive
 * and pass the process.max-sem-nsems resource control.
 *
 * On failure returns the result of set_errno() with EINVAL (bad nsems or
 * resource control denial) or the error reported by ipc_get() or
 * ipc_commit_begin().
 */
static int
semget(key_t key, int nsems, int semflg)
{
	ksemid_t	*sp;
	kmutex_t	*lock;
	int		id, error;
	proc_t		*pp = curproc;

top:
	if (error = ipc_get(sem_svc, key, semflg, (kipc_perm_t **)&sp, &lock))
		return (set_errno(error));

	if (!IPC_FREE(&sp->sem_perm)) {
		/*
		 * A semaphore with the requested key exists.
		 */
		if (!((nsems >= 0) && (nsems <= sp->sem_nsems))) {
			mutex_exit(lock);
			return (set_errno(EINVAL));
		}
	} else {
		/*
		 * This is a new semaphore set.  Finish initialization.
		 *
		 * pp->p_lock is released below although this function never
		 * acquires it; presumably ipc_get() returns with it held in
		 * the new-set case (see os/ipc.c).
		 */
		if (nsems <= 0 || (rctl_test(rc_process_semmsl, pp->p_rctls, pp,
		    nsems, RCA_SAFE) & RCT_DENY)) {
			mutex_exit(lock);
			mutex_exit(&pp->p_lock);
			ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
			return (set_errno(EINVAL));
		}
		mutex_exit(lock);
		mutex_exit(&pp->p_lock);

		/*
		 * We round the allocation up to coherency granularity
		 * so that multiple semaphore allocations won't result
		 * in the false sharing of their sem structures.
		 */
		sp->sem_base =
		    kmem_zalloc(P2ROUNDUP(nsems * sizeof (struct sem), 64),
		    KM_SLEEP);
		sp->sem_binary = (nsems == 1);
		sp->sem_nsems = (ushort_t)nsems;
		sp->sem_ctime = gethrestime_sec();
		sp->sem_otime = 0;
		list_create(&sp->sem_undos, sizeof (struct sem_undo),
		    offsetof(struct sem_undo, un_list));

		/* On EAGAIN the whole lookup/creation is retried from top. */
		if (error = ipc_commit_begin(sem_svc, key, semflg,
		    (kipc_perm_t *)sp)) {
			if (error == EAGAIN)
				goto top;
			return (set_errno(error));
		}
		sp->sem_maxops =
		    rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
		/*
		 * The resource control is tested a second time because the
		 * locks were dropped after the first test above.
		 */
		if (rctl_test(rc_process_semmsl, pp->p_rctls, pp, nsems,
		    RCA_SAFE) & RCT_DENY) {
			ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
			return (set_errno(EINVAL));
		}
		lock = ipc_commit_end(sem_svc, &sp->sem_perm);
	}
#ifdef C2_AUDIT
	if (audit_active)
		audit_ipcget(AT_IPC_SEM, (void *)sp);
#endif
	id = sp->sem_perm.ipc_id;
	mutex_exit(lock);
	return (id);
}
739 
740 /*
741  * semids system call.
742  */
743 static int
744 semids(int *buf, uint_t nids, uint_t *pnids)
745 {
746 	int error;
747 
748 	if (error = ipc_ids(sem_svc, buf, nids, pnids))
749 		return (set_errno(error));
750 
751 	return (0);
752 }
753 
754 
755 /*
756  * Helper function for semop - copies in the provided timespec and
757  * computes the absolute future time after which we must return.
758  */
759 static int
760 compute_timeout(timespec_t **tsp, timespec_t *ts, timespec_t *now,
761 	timespec_t *timeout)
762 {
763 	model_t datamodel = get_udatamodel();
764 
765 	if (datamodel == DATAMODEL_NATIVE) {
766 		if (copyin(timeout, ts, sizeof (timespec_t)))
767 			return (EFAULT);
768 	} else {
769 		timespec32_t ts32;
770 
771 		if (copyin(timeout, &ts32, sizeof (timespec32_t)))
772 			return (EFAULT);
773 		TIMESPEC32_TO_TIMESPEC(ts, &ts32)
774 	}
775 
776 	if (itimerspecfix(ts))
777 		return (EINVAL);
778 
779 	/*
780 	 * Convert the timespec value into absolute time.
781 	 */
782 	timespecadd(ts, now);
783 	*tsp = ts;
784 
785 	return (0);
786 }
787 
788 /*
789  * Undo structure comparator.  We sort based on ksemid_t pointer.
790  */
791 static int
792 sem_undo_compar(const void *x, const void *y)
793 {
794 	struct sem_undo *undo1 = (struct sem_undo *)x;
795 	struct sem_undo *undo2 = (struct sem_undo *)y;
796 
797 	if (undo1->un_sp < undo2->un_sp)
798 		return (-1);
799 	if (undo1->un_sp > undo2->un_sp)
800 		return (1);
801 	return (0);
802 }
803 
/*
 * Helper function for semop - creates an undo structure and adds it to
 * the process's avl tree and the semaphore's list.
 *
 * pp		process that needs the undo structure
 * sp		semaphore set the undo structure is for
 * lock		in/out: the set's ID lock.  It is dropped on entry and
 *		replaced with the value returned by ipc_lock()
 * template	search key for the AVL lookup (the caller sets un_sp = sp)
 * un		out: the undo structure to use
 *
 * Returns 0 with *un set, or EIDRM if the set was found to be freed once
 * the ID lock had been re-acquired.
 */
static int
sem_undo_alloc(proc_t *pp, ksemid_t *sp, kmutex_t **lock,
    struct sem_undo *template, struct sem_undo **un)
{
	size_t size;
	struct sem_undo *undo;
	avl_tree_t *tree = NULL;
	avl_index_t where;

	/* Drop the ID lock across the KM_SLEEP allocations below. */
	mutex_exit(*lock);

	size = SEM_UNDOSZ(sp->sem_nsems);
	undo = kmem_zalloc(size, KM_SLEEP);
	undo->un_proc = pp;
	undo->un_sp = sp;

	/*
	 * Unlocked peek: allocate a tree in case the process has none.
	 * The decision is re-checked under p_lock below.
	 */
	if (pp->p_semacct == NULL)
		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

	*lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
	if (IPC_FREE(&sp->sem_perm)) {
		/* The set went away while the lock was dropped. */
		kmem_free(undo, size);
		if (tree)
			kmem_free(tree, sizeof (avl_tree_t));
		return (EIDRM);
	}

	mutex_enter(&pp->p_lock);
	if (tree) {
		if (pp->p_semacct == NULL) {
			avl_create(tree, sem_undo_compar,
			    sizeof (struct sem_undo),
			    offsetof(struct sem_undo, un_avl));
			pp->p_semacct = tree;
		} else {
			/* A tree was installed in the meantime. */
			kmem_free(tree, sizeof (avl_tree_t));
		}
	}

	if (*un = avl_find(pp->p_semacct, template, &where)) {
		/* An undo structure already exists; discard the new one. */
		mutex_exit(&pp->p_lock);
		kmem_free(undo, size);
	} else {
		*un = undo;
		avl_insert(pp->p_semacct, undo, where);
		mutex_exit(&pp->p_lock);
		list_insert_head(&sp->sem_undos, undo);
		/* The undo structure holds a reference on the set. */
		ipc_hold(sem_svc, (kipc_perm_t *)sp);
	}


	return (0);
}
861 
862 /*
863  * semop - Semop system call.
864  */
865 static int
866 semop(int semid, struct sembuf *sops, size_t nsops, timespec_t *timeout)
867 {
868 	ksemid_t	*sp = NULL;
869 	kmutex_t	*lock;
870 	struct sembuf	*op;	/* ptr to operation */
871 	int		i;	/* loop control */
872 	struct sem	*semp;	/* ptr to semaphore */
873 	int 		error = 0;
874 	struct sembuf	*uops;	/* ptr to copy of user ops */
875 	struct sembuf 	x_sem;	/* avoid kmem_alloc's */
876 	timespec_t	now, ts, *tsp = NULL;
877 	int		cvres, needundo, mode;
878 	struct sem_undo	*undo;
879 	proc_t		*pp = curproc;
880 	int		held = 0;
881 
882 	CPU_STATS_ADDQ(CPU, sys, sema, 1); /* bump semaphore op count */
883 
884 	/*
885 	 * To avoid the cost of copying in 'timeout' in the common
886 	 * case, we could only grab the time here and defer the copyin
887 	 * and associated computations until we are about to block.
888 	 *
889 	 * The down side to this is that we would then have to spin
890 	 * some goto top nonsense to avoid the copyin behind the semid
891 	 * lock.  As a common use of timed semaphores is as an explicit
892 	 * blocking mechanism, this could incur a greater penalty.
893 	 *
894 	 * If we eventually decide that this would be a wise route to
895 	 * take, the deferrable functionality is completely contained
896 	 * in 'compute_timeout', and the interface is defined such that
897 	 * we can legally not validate 'timeout' if it is unused.
898 	 */
899 	if (timeout != NULL) {
900 		gethrestime(&now);
901 		if (error = compute_timeout(&tsp, &ts, &now, timeout))
902 			return (set_errno(error));
903 	}
904 
905 	/*
906 	 * Allocate space to hold the vector of semaphore ops.  If
907 	 * there is only 1 operation we use a preallocated buffer on
908 	 * the stack for speed.
909 	 *
910 	 * Since we don't want to allow the user to allocate an
911 	 * arbitrary amount of kernel memory, we need to check against
912 	 * the number of operations allowed by the semaphore.  We only
913 	 * bother doing this if the number of operations is larger than
914 	 * SEM_MAXUCOPS.
915 	 */
916 	if (nsops == 1)
917 		uops = &x_sem;
918 	else if (nsops == 0)
919 		return (0);
920 	else if (nsops <= SEM_MAXUCOPS)
921 		uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
922 
923 	if (nsops > SEM_MAXUCOPS) {
924 		if ((lock = ipc_lookup(sem_svc, semid,
925 		    (kipc_perm_t **)&sp)) == NULL)
926 			return (set_errno(EFAULT));
927 
928 		if (nsops > sp->sem_maxops) {
929 			mutex_exit(lock);
930 			return (set_errno(E2BIG));
931 		}
932 		held = 1;
933 		ipc_hold(sem_svc, (kipc_perm_t *)sp);
934 		mutex_exit(lock);
935 
936 		uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
937 		if (copyin(sops, uops, nsops * sizeof (*op))) {
938 			error = EFAULT;
939 			(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
940 			goto semoperr;
941 		}
942 
943 		lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
944 		if (IPC_FREE(&sp->sem_perm)) {
945 			error = EIDRM;
946 			goto semoperr;
947 		}
948 	} else {
949 		/*
950 		 * This could be interleaved with the above code, but
951 		 * keeping them separate improves readability.
952 		 */
953 		if (copyin(sops, uops, nsops * sizeof (*op))) {
954 			error = EFAULT;
955 			goto semoperr_unlocked;
956 		}
957 
958 		if ((lock = ipc_lookup(sem_svc, semid,
959 		    (kipc_perm_t **)&sp)) == NULL) {
960 			error = EINVAL;
961 			goto semoperr_unlocked;
962 		}
963 
964 		if (nsops > sp->sem_maxops) {
965 			error = E2BIG;
966 			goto semoperr;
967 		}
968 	}
969 
970 	/*
971 	 * Scan all operations.  Verify that sem #s are in range and
972 	 * this process is allowed the requested operations.  If any
973 	 * operations are marked SEM_UNDO, find (or allocate) the undo
974 	 * structure for this process and semaphore.
975 	 */
976 	needundo = 0;
977 	mode = 0;
978 	for (i = 0, op = uops; i++ < nsops; op++) {
979 		mode |= op->sem_op ? SEM_A : SEM_R;
980 		if (op->sem_num >= sp->sem_nsems) {
981 			error = EFBIG;
982 			goto semoperr;
983 		}
984 		if ((op->sem_flg & SEM_UNDO) && op->sem_op)
985 			needundo = 1;
986 	}
987 	if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
988 		goto semoperr;
989 
990 	if (needundo) {
991 		struct sem_undo template;
992 
993 		template.un_sp = sp;
994 		mutex_enter(&pp->p_lock);
995 		if (pp->p_semacct)
996 			undo = avl_find(pp->p_semacct, &template, NULL);
997 		else
998 			undo = NULL;
999 		mutex_exit(&pp->p_lock);
1000 		if (undo == NULL) {
1001 			if (error = sem_undo_alloc(pp, sp, &lock, &template,
1002 			    &undo))
1003 				goto semoperr;
1004 
1005 			/* sem_undo_alloc unlocks the semaphore */
1006 			if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
1007 				goto semoperr;
1008 		}
1009 	}
1010 
1011 check:
1012 	/*
1013 	 * Loop waiting for the operations to be satisfied atomically.
1014 	 * Actually, do the operations and undo them if a wait is needed
1015 	 * or an error is detected.
1016 	 */
1017 	for (i = 0; i < nsops; i++) {
1018 		op = &uops[i];
1019 		semp = &sp->sem_base[op->sem_num];
1020 
1021 		/*
1022 		 * Raise the semaphore (i.e. sema_v)
1023 		 */
1024 		if (op->sem_op > 0) {
1025 			if (op->sem_op + (int)semp->semval > USHRT_MAX ||
1026 			    ((op->sem_flg & SEM_UNDO) &&
1027 			    (error = sem_undo_add(op->sem_op, op->sem_num,
1028 			    undo)))) {
1029 				if (i)
1030 					sem_rollback(sp, uops, i, undo);
1031 				if (error == 0)
1032 					error = ERANGE;
1033 				goto semoperr;
1034 			}
1035 			semp->semval += op->sem_op;
1036 			/*
1037 			 * If we are only incrementing the semaphore value
1038 			 * by one on a binary semaphore, we can cv_signal.
1039 			 */
1040 			if (semp->semncnt) {
1041 				if (op->sem_op == 1 && sp->sem_binary)
1042 					cv_signal(&semp->semncnt_cv);
1043 				else
1044 					cv_broadcast(&semp->semncnt_cv);
1045 			}
1046 			if (semp->semzcnt && !semp->semval)
1047 				cv_broadcast(&semp->semzcnt_cv);
1048 			continue;
1049 		}
1050 
1051 		/*
1052 		 * Lower the semaphore (i.e. sema_p)
1053 		 */
1054 		if (op->sem_op < 0) {
1055 			if (semp->semval >= (unsigned)(-op->sem_op)) {
1056 				if ((op->sem_flg & SEM_UNDO) &&
1057 				    (error = sem_undo_add(op->sem_op,
1058 				    op->sem_num, undo))) {
1059 					if (i)
1060 						sem_rollback(sp, uops, i, undo);
1061 					goto semoperr;
1062 				}
1063 				semp->semval += op->sem_op;
1064 				if (semp->semzcnt && !semp->semval)
1065 					cv_broadcast(&semp->semzcnt_cv);
1066 				continue;
1067 			}
1068 			if (i)
1069 				sem_rollback(sp, uops, i, undo);
1070 			if (op->sem_flg & IPC_NOWAIT) {
1071 				error = EAGAIN;
1072 				goto semoperr;
1073 			}
1074 
1075 			/*
1076 			 * Mark the semaphore set as not a binary type
1077 			 * if we are decrementing the value by more than 1.
1078 			 *
1079 			 * V operations will resort to cv_broadcast
1080 			 * for this set because there are too many weird
1081 			 * cases that have to be caught.
1082 			 */
1083 			if (op->sem_op < -1)
1084 				sp->sem_binary = 0;
1085 			if (!held) {
1086 				held = 1;
1087 				ipc_hold(sem_svc, (kipc_perm_t *)sp);
1088 			}
1089 			semp->semncnt++;
1090 			cvres = cv_waituntil_sig(&semp->semncnt_cv, lock, tsp);
1091 			lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
1092 
1093 			if (!IPC_FREE(&sp->sem_perm)) {
1094 				ASSERT(semp->semncnt != 0);
1095 				semp->semncnt--;
1096 				if (cvres > 0)	/* normal wakeup */
1097 					goto check;
1098 			}
1099 
1100 			/* EINTR or EAGAIN overrides EIDRM */
1101 			if (cvres == 0)
1102 				error = EINTR;
1103 			else if (cvres < 0)
1104 				error = EAGAIN;
1105 			else
1106 				error = EIDRM;
1107 			goto semoperr;
1108 		}
1109 
1110 		/*
1111 		 * Wait for zero value
1112 		 */
1113 		if (semp->semval) {
1114 			if (i)
1115 				sem_rollback(sp, uops, i, undo);
1116 			if (op->sem_flg & IPC_NOWAIT) {
1117 				error = EAGAIN;
1118 				goto semoperr;
1119 			}
1120 
1121 			if (!held) {
1122 				held = 1;
1123 				ipc_hold(sem_svc, (kipc_perm_t *)sp);
1124 			}
1125 			semp->semzcnt++;
1126 			cvres = cv_waituntil_sig(&semp->semzcnt_cv, lock, tsp);
1127 			lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
1128 
1129 			/*
1130 			 * Don't touch semp if the semaphores have been removed.
1131 			 */
1132 			if (!IPC_FREE(&sp->sem_perm)) {
1133 				ASSERT(semp->semzcnt != 0);
1134 				semp->semzcnt--;
1135 				if (cvres > 0)	/* normal wakeup */
1136 					goto check;
1137 			}
1138 
1139 			/* EINTR or EAGAIN overrides EIDRM */
1140 			if (cvres == 0)
1141 				error = EINTR;
1142 			else if (cvres < 0)
1143 				error = EAGAIN;
1144 			else
1145 				error = EIDRM;
1146 			goto semoperr;
1147 		}
1148 	}
1149 
1150 	/* All operations succeeded.  Update sempid for accessed semaphores. */
1151 	for (i = 0, op = uops; i++ < nsops;
1152 	    sp->sem_base[(op++)->sem_num].sempid = pp->p_pid)
1153 		;
1154 	sp->sem_otime = gethrestime_sec();
1155 	if (held)
1156 		ipc_rele(sem_svc, (kipc_perm_t *)sp);
1157 	else
1158 		mutex_exit(lock);
1159 
1160 	/* Before leaving, deallocate the buffer that held the user semops */
1161 	if (nsops != 1)
1162 		kmem_free(uops, sizeof (*uops) * nsops);
1163 	return (0);
1164 
1165 	/*
1166 	 * Error return labels
1167 	 */
1168 semoperr:
1169 	if (held)
1170 		ipc_rele(sem_svc, (kipc_perm_t *)sp);
1171 	else
1172 		mutex_exit(lock);
1173 
1174 semoperr_unlocked:
1175 
1176 	/* Before leaving, deallocate the buffer that held the user semops */
1177 	if (nsops != 1)
1178 		kmem_free(uops, sizeof (*uops) * nsops);
1179 	return (set_errno(error));
1180 }
1181 
1182 /*
1183  * semsys - System entry point for semctl, semget, and semop system calls.
1184  */
1185 static int
1186 semsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4)
1187 {
1188 	int error;
1189 
1190 	switch (opcode) {
1191 	case SEMCTL:
1192 		error = semctl((int)a1, (uint_t)a2, (int)a3, a4);
1193 		break;
1194 	case SEMGET:
1195 		error = semget((key_t)a1, (int)a2, (int)a3);
1196 		break;
1197 	case SEMOP:
1198 		error = semop((int)a1, (struct sembuf *)a2, (size_t)a3, 0);
1199 		break;
1200 	case SEMIDS:
1201 		error = semids((int *)a1, (uint_t)a2, (uint_t *)a3);
1202 		break;
1203 	case SEMTIMEDOP:
1204 		error = semop((int)a1, (struct sembuf *)a2, (size_t)a3,
1205 		    (timespec_t *)a4);
1206 		break;
1207 	default:
1208 		error = set_errno(EINVAL);
1209 		break;
1210 	}
1211 	return (error);
1212 }
1213