xref: /illumos-gate/usr/src/uts/common/syscall/sem.c (revision 69a119caa6570c7077699161b7c28b6ee9f8b0f4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * Inter-Process Communication Semaphore Facility.
31  *
32  * See os/ipc.c for a description of common IPC functionality.
33  *
34  * Resource controls
35  * -----------------
36  *
37  * Control:      zone.max-sem-ids (rc_zone_semmni)
38  * Description:  Maximum number of semaphore ids allowed a zone.
39  *
40  *   When semget() is used to allocate a semaphore set, one id is
41  *   allocated.  If the id allocation doesn't succeed, semget() fails
42  *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
43  *   the id is deallocated.
44  *
45  * Control:      project.max-sem-ids (rc_project_semmni)
46  * Description:  Maximum number of semaphore ids allowed a project.
47  *
48  *   When semget() is used to allocate a semaphore set, one id is
49  *   allocated.  If the id allocation doesn't succeed, semget() fails
50  *   and errno is set to ENOSPC.  Upon successful semctl(, IPC_RMID)
51  *   the id is deallocated.
52  *
53  * Control:      process.max-sem-nsems (rc_process_semmsl)
54  * Description:  Maximum number of semaphores allowed per semaphore set.
55  *
56  *   When semget() is used to allocate a semaphore set, the size of the
57  *   set is compared with this limit.  If the number of semaphores
58  *   exceeds the limit, semget() fails and errno is set to EINVAL.
59  *
60  * Control:      process.max-sem-ops (rc_process_semopm)
61  * Description:  Maximum number of semaphore operations allowed per
62  *               semop call.
63  *
64  *   When semget() successfully allocates a semaphore set, the minimum
65  *   enforced value of this limit is used to initialize the
66  *   "system-imposed maximum" number of operations a semop() call for
67  *   this set can perform.
68  *
69  * Undo structures
70  * ---------------
71  *
72  * Removing the undo structure tunables involved a serious redesign of
73  * how they were implemented.  There is now one undo structure for
74  * every process/semaphore array combination (lazily allocated, of
75  * course), and each is equal in size to the semaphore it corresponds
76  * to.  To avoid scalability and performance problems, the undo
77  * structures are stored in two places: a per-process AVL tree sorted
78  * by ksemid pointer (p_semacct, protected by p_lock) and an unsorted
79  * per-semaphore linked list (sem_undos, protected by the semaphore's
80  * ID lock).  The former is used by semop, where a lookup is performed
81  * once and cached if SEM_UNDO is specified for any of the operations,
82  * and at process exit where the undoable operations are rolled back.
83  * The latter is used when removing the semaphore, so the undo
84  * structures can be removed from the appropriate processes' trees.
85  *
86  * The undo structure itself contains pointers to the ksemid and proc
87  * to which it corresponds, a list node, an AVL node, and an array of
88  * adjust-on-exit (AOE) values.  When an undo structure is allocated it
89  * is immediately added to both the process's tree and the semaphore's
90  * list.  Lastly, the reference count on the semaphore is increased.
91  *
92  * Avoiding a lock ordering violation between p_lock and the ID lock,
93  * wont to occur when there is a race between a process exiting and the
94  * removal of a semaphore, mandates the delicate dance that exists
95  * between semexit and sem_rmid.
96  *
97  * sem_rmid, holding the ID lock, iterates through all undo structures
98  * and for each takes the appropriate process's p_lock and checks to
99  * see if p_semacct is NULL.  If it is, it skips that undo structure
100  * and continues to the next.  Otherwise, it removes the undo structure
101  * from both the AVL tree and the semaphore's list, and releases the
102  * hold that the undo structure had on the semaphore.
103  *
104  * The important other half of this is semexit, which will immediately
105  * take p_lock, obtain the AVL pointer, clear p_semacct, and drop
106  * p_lock.  From this point on it is semexit's responsibility to clean
107  * up all undo structures found in the tree -- a coexecuting sem_rmid
108  * will see the NULL p_semacct and skip that undo structure.  It walks
109  * the AVL tree (using avl_destroy_nodes) and for each undo structure
110  * takes the appropriate semaphore's ID lock (always legal since the
111  * undo structure has a hold on the semaphore), updates all semaphores
112  * with non-zero AOE values, and removes the structure from the
113  * semaphore's list.  It then drops the structure's reference on the
114  * semaphore, drops the ID lock, and frees the undo structure.
115  */
116 
117 #include <sys/types.h>
118 #include <sys/t_lock.h>
119 #include <sys/param.h>
120 #include <sys/systm.h>
121 #include <sys/sysmacros.h>
122 #include <sys/cred.h>
123 #include <sys/vmem.h>
124 #include <sys/kmem.h>
125 #include <sys/errno.h>
126 #include <sys/time.h>
127 #include <sys/ipc.h>
128 #include <sys/ipc_impl.h>
129 #include <sys/sem.h>
130 #include <sys/sem_impl.h>
131 #include <sys/user.h>
132 #include <sys/proc.h>
133 #include <sys/cpuvar.h>
134 #include <sys/debug.h>
135 #include <sys/var.h>
136 #include <sys/cmn_err.h>
137 #include <sys/modctl.h>
138 #include <sys/syscall.h>
139 #include <sys/avl.h>
140 #include <sys/list.h>
141 #include <sys/zone.h>
142 
143 #include <c2/audit.h>
144 
/*
 * Resource control handles (see "Resource controls" at the top of
 * this file) and the module's global state.
 */
extern rctl_hndl_t rc_zone_semmni;	/* zone.max-sem-ids */
extern rctl_hndl_t rc_project_semmni;	/* project.max-sem-ids */
extern rctl_hndl_t rc_process_semmsl;	/* process.max-sem-nsems */
extern rctl_hndl_t rc_process_semopm;	/* process.max-sem-ops */
static ipc_service_t *sem_svc;		/* IPC service, created in _init() */
static zone_key_t sem_zone_key;		/* zone key, created in _init() */
151 
/*
 * The following tunables are obsolete.  Though for compatibility we
 * still read and interpret seminfo_semmsl, seminfo_semopm and
 * seminfo_semmni (see os/project.c and os/rctl_proc.c), the preferred
 * mechanism for administrating the IPC Semaphore facility is through
 * the resource controls described at the top of this file.
 */
int seminfo_semaem = 16384;	/* (obsolete) */
int seminfo_semmap = 10;	/* (obsolete) */
int seminfo_semmni = 10;	/* (obsolete, still read for compatibility) */
int seminfo_semmns = 60;	/* (obsolete) */
int seminfo_semmnu = 30;	/* (obsolete) */
int seminfo_semmsl = 25;	/* (obsolete, still read for compatibility) */
int seminfo_semopm = 10;	/* (obsolete, still read for compatibility) */
int seminfo_semume = 10;	/* (obsolete) */
int seminfo_semusz = 96;	/* (obsolete) */
int seminfo_semvmx = 32767;	/* (obsolete) */
169 
#define	SEM_MAXUCOPS	4096	/* max # of unchecked ops per semop call */

/*
 * SEM_UNDOSZ(n) - size in bytes of an undo structure for a semaphore
 * set holding n semaphores: the base structure plus (n - 1) extra
 * int-sized adjust-on-exit slots (presumably struct sem_undo already
 * embeds the first slot -- confirm against sys/sem_impl.h).
 *
 * The argument is fully parenthesized so that any expression, e.g. a
 * conditional expression, may be passed safely.
 */
#define	SEM_UNDOSZ(n)	(sizeof (struct sem_undo) + ((n) - 1) * sizeof (int))
172 
/*
 * Forward declarations: the system call entry point and the callbacks
 * that _init() registers with the IPC service and the zone framework.
 */
static int semsys(int opcode, uintptr_t a0, uintptr_t a1,
    uintptr_t a2, uintptr_t a3);
static void sem_dtor(kipc_perm_t *);
static void sem_rmid(kipc_perm_t *);
static void sem_remove_zone(zoneid_t, void *);
178 
/*
 * System call table entry for the semaphore facility; semsys() is the
 * handler.
 */
static struct sysent ipcsem_sysent = {
	5,	/* presumably # of args (opcode, a0-a3); confirm */
	SE_NOUNLOAD | SE_ARGC | SE_32RVAL1,
	semsys
};
184 
/*
 * Module linkage information for the kernel.
 *
 * The same sysent is registered for the native syscall table and,
 * when _SYSCALL32_IMPL is defined, for the 32-bit syscall table.
 */
static struct modlsys modlsys = {
	&mod_syscallops, "System V semaphore facility", &ipcsem_sysent
};

#ifdef _SYSCALL32_IMPL
static struct modlsys modlsys32 = {
	&mod_syscallops32, "32-bit System V semaphore facility", &ipcsem_sysent
};
#endif

/* NULL-terminated list of the linkage structures above. */
static struct modlinkage modlinkage = {
	MODREV_1,
	&modlsys,
#ifdef _SYSCALL32_IMPL
	&modlsys32,
#endif
	NULL
};
206 
207 
208 int
209 _init(void)
210 {
211 	int result;
212 
213 	sem_svc = ipcs_create("semids", rc_project_semmni, rc_zone_semmni,
214 	    sizeof (ksemid_t), sem_dtor, sem_rmid, AT_IPC_SEM,
215 	    offsetof(ipc_rqty_t, ipcq_semmni));
216 	zone_key_create(&sem_zone_key, NULL, sem_remove_zone, NULL);
217 
218 	if ((result = mod_install(&modlinkage)) == 0)
219 		return (0);
220 
221 	(void) zone_key_delete(sem_zone_key);
222 	ipcs_destroy(sem_svc);
223 
224 	return (result);
225 }
226 
/*
 * _fini - module unload entry point.
 *
 * Unconditionally returns EBUSY.
 * NOTE(review): presumably this refuses unloading, in line with the
 * SE_NOUNLOAD flag in ipcsem_sysent -- confirm.
 */
int
_fini(void)
{
	return (EBUSY);
}
232 
/*
 * _info - module information entry point.
 *
 * Passes modinfop and this module's linkage to mod_info() and returns
 * its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
238 
239 static void
240 sem_dtor(kipc_perm_t *perm)
241 {
242 	ksemid_t *sp = (ksemid_t *)perm;
243 
244 	kmem_free(sp->sem_base,
245 	    P2ROUNDUP(sp->sem_nsems * sizeof (struct sem), 64));
246 	list_destroy(&sp->sem_undos);
247 }
248 
249 /*
250  * sem_undo_add - Create or update adjust on exit entry.
251  */
252 static int
253 sem_undo_add(short val, ushort_t num, struct sem_undo *undo)
254 {
255 	int newval = undo->un_aoe[num] - val;
256 
257 	if (newval > USHRT_MAX || newval < -USHRT_MAX)
258 		return (ERANGE);
259 	undo->un_aoe[num] = newval;
260 
261 	return (0);
262 }
263 
264 /*
265  * sem_undo_clear - clears all undo entries for specified semaphores
266  *
267  * Used when semaphores are reset by SETVAL or SETALL.
268  */
269 static void
270 sem_undo_clear(ksemid_t *sp, ushort_t low, ushort_t high)
271 {
272 	struct sem_undo *undo;
273 	int i;
274 
275 	ASSERT(low <= high);
276 	ASSERT(high < sp->sem_nsems);
277 
278 	for (undo = list_head(&sp->sem_undos); undo;
279 	    undo = list_next(&sp->sem_undos, undo))
280 		for (i = low; i <= high; i++)
281 			undo->un_aoe[i] = 0;
282 }
283 
284 /*
285  * sem_rollback - roll back work done so far if unable to complete operation
286  */
287 static void
288 sem_rollback(ksemid_t *sp, struct sembuf *op, int n, struct sem_undo *undo)
289 {
290 	struct sem *semp;	/* semaphore ptr */
291 
292 	for (op += n - 1; n--; op--) {
293 		if (op->sem_op == 0)
294 			continue;
295 		semp = &sp->sem_base[op->sem_num];
296 		semp->semval -= op->sem_op;
297 		if (op->sem_flg & SEM_UNDO) {
298 			ASSERT(undo != NULL);
299 			(void) sem_undo_add(-op->sem_op, op->sem_num, undo);
300 		}
301 	}
302 }
303 
/*
 * sem_rmid - removal callback handed to ipcs_create() in _init().
 *
 * Detaches every undo structure from the set, then zeroes each
 * semaphore and wakes any threads sleeping on it.  See the "Undo
 * structures" discussion at the top of this file for how this
 * cooperates with semexit.
 */
static void
sem_rmid(kipc_perm_t *perm)
{
	ksemid_t *sp = (ksemid_t *)perm;
	struct sem *semp;
	struct sem_undo *undo;
	size_t size = SEM_UNDOSZ(sp->sem_nsems);
	int i;

	/*LINTED*/
	while (undo = list_head(&sp->sem_undos)) {
		/* Always unlink from the set's list, even if skipped below. */
		list_remove(&sp->sem_undos, undo);
		mutex_enter(&undo->un_proc->p_lock);
		if (undo->un_proc->p_semacct == NULL) {
			/*
			 * A NULL p_semacct means semexit has taken over
			 * the process's tree; it is then responsible for
			 * freeing this undo structure and dropping its
			 * hold, so leave both alone here.
			 */
			mutex_exit(&undo->un_proc->p_lock);
			continue;
		}
		avl_remove(undo->un_proc->p_semacct, undo);
		mutex_exit(&undo->un_proc->p_lock);
		kmem_free(undo, size);
		/* Release the hold the undo structure had on the set. */
		ipc_rele_locked(sem_svc, (kipc_perm_t *)sp);
	}

	/* Reset every semaphore and wake all of its waiters. */
	for (i = 0; i < sp->sem_nsems; i++) {
		semp = &sp->sem_base[i];
		semp->semval = semp->sempid = 0;
		if (semp->semncnt) {
			cv_broadcast(&semp->semncnt_cv);
			semp->semncnt = 0;
		}
		if (semp->semzcnt) {
			cv_broadcast(&semp->semzcnt_cv);
			semp->semzcnt = 0;
		}
	}
}
340 
341 /*
342  * semctl - Semctl system call.
343  */
344 static int
345 semctl(int semid, uint_t semnum, int cmd, uintptr_t arg)
346 {
347 	ksemid_t		*sp;	/* ptr to semaphore header */
348 	struct sem		*p;	/* ptr to semaphore */
349 	unsigned int		i;	/* loop control */
350 	ushort_t		*vals, *vp;
351 	size_t			vsize = 0;
352 	int			error = 0;
353 	int			retval = 0;
354 	struct cred		*cr;
355 	kmutex_t		*lock;
356 	model_t			mdl = get_udatamodel();
357 	STRUCT_DECL(semid_ds, sid);
358 	struct semid_ds64	ds64;
359 
360 	STRUCT_INIT(sid, mdl);
361 	cr = CRED();
362 
363 	/*
364 	 * Perform pre- or non-lookup actions (e.g. copyins, RMID).
365 	 */
366 	switch (cmd) {
367 	case IPC_SET:
368 		if (copyin((void *)arg, STRUCT_BUF(sid), STRUCT_SIZE(sid)))
369 			return (set_errno(EFAULT));
370 		break;
371 
372 	case IPC_SET64:
373 		if (copyin((void *)arg, &ds64, sizeof (struct semid_ds64)))
374 			return (set_errno(EFAULT));
375 		break;
376 
377 	case SETALL:
378 		if ((lock = ipc_lookup(sem_svc, semid,
379 		    (kipc_perm_t **)&sp)) == NULL)
380 			return (set_errno(EINVAL));
381 		vsize = sp->sem_nsems * sizeof (*vals);
382 		mutex_exit(lock);
383 
384 		/* allocate space to hold all semaphore values */
385 		vals = kmem_alloc(vsize, KM_SLEEP);
386 
387 		if (copyin((void *)arg, vals, vsize)) {
388 			kmem_free(vals, vsize);
389 			return (set_errno(EFAULT));
390 		}
391 		break;
392 
393 	case IPC_RMID:
394 		if (error = ipc_rmid(sem_svc, semid, cr))
395 			return (set_errno(error));
396 		return (0);
397 	}
398 
399 	if ((lock = ipc_lookup(sem_svc, semid, (kipc_perm_t **)&sp)) == NULL) {
400 		if (vsize != 0)
401 			kmem_free(vals, vsize);
402 		return (set_errno(EINVAL));
403 	}
404 	switch (cmd) {
405 	/* Set ownership and permissions. */
406 	case IPC_SET:
407 
408 		if (error = ipcperm_set(sem_svc, cr, &sp->sem_perm,
409 		    &STRUCT_BUF(sid)->sem_perm, mdl)) {
410 			mutex_exit(lock);
411 			return (set_errno(error));
412 		}
413 		sp->sem_ctime = gethrestime_sec();
414 		mutex_exit(lock);
415 		return (0);
416 
417 	/* Get semaphore data structure. */
418 	case IPC_STAT:
419 
420 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
421 			mutex_exit(lock);
422 			return (set_errno(error));
423 		}
424 
425 		ipcperm_stat(&STRUCT_BUF(sid)->sem_perm, &sp->sem_perm, mdl);
426 		STRUCT_FSETP(sid, sem_base, NULL);	/* kernel addr */
427 		STRUCT_FSET(sid, sem_nsems, sp->sem_nsems);
428 		STRUCT_FSET(sid, sem_otime, sp->sem_otime);
429 		STRUCT_FSET(sid, sem_ctime, sp->sem_ctime);
430 		STRUCT_FSET(sid, sem_binary, sp->sem_binary);
431 		mutex_exit(lock);
432 
433 		if (copyout(STRUCT_BUF(sid), (void *)arg, STRUCT_SIZE(sid)))
434 			return (set_errno(EFAULT));
435 		return (0);
436 
437 	case IPC_SET64:
438 
439 		if (error = ipcperm_set64(sem_svc, cr, &sp->sem_perm,
440 		    &ds64.semx_perm)) {
441 			mutex_exit(lock);
442 			return (set_errno(error));
443 		}
444 		sp->sem_ctime = gethrestime_sec();
445 		mutex_exit(lock);
446 		return (0);
447 
448 	case IPC_STAT64:
449 
450 		ipcperm_stat64(&ds64.semx_perm, &sp->sem_perm);
451 		ds64.semx_nsems = sp->sem_nsems;
452 		ds64.semx_otime = sp->sem_otime;
453 		ds64.semx_ctime = sp->sem_ctime;
454 
455 		mutex_exit(lock);
456 		if (copyout(&ds64, (void *)arg, sizeof (struct semid_ds64)))
457 			return (set_errno(EFAULT));
458 
459 		return (0);
460 
461 	/* Get # of processes sleeping for greater semval. */
462 	case GETNCNT:
463 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
464 			mutex_exit(lock);
465 			return (set_errno(error));
466 		}
467 		if (semnum >= sp->sem_nsems) {
468 			mutex_exit(lock);
469 			return (set_errno(EINVAL));
470 		}
471 		retval = sp->sem_base[semnum].semncnt;
472 		mutex_exit(lock);
473 		return (retval);
474 
475 	/* Get pid of last process to operate on semaphore. */
476 	case GETPID:
477 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
478 			mutex_exit(lock);
479 			return (set_errno(error));
480 		}
481 		if (semnum >= sp->sem_nsems) {
482 			mutex_exit(lock);
483 			return (set_errno(EINVAL));
484 		}
485 		retval = sp->sem_base[semnum].sempid;
486 		mutex_exit(lock);
487 		return (retval);
488 
489 	/* Get semval of one semaphore. */
490 	case GETVAL:
491 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
492 			mutex_exit(lock);
493 			return (set_errno(error));
494 		}
495 		if (semnum >= sp->sem_nsems) {
496 			mutex_exit(lock);
497 			return (set_errno(EINVAL));
498 		}
499 		retval = sp->sem_base[semnum].semval;
500 		mutex_exit(lock);
501 		return (retval);
502 
503 	/* Get all semvals in set. */
504 	case GETALL:
505 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
506 			mutex_exit(lock);
507 			return (set_errno(error));
508 		}
509 
510 		/* allocate space to hold all semaphore values */
511 		vsize = sp->sem_nsems * sizeof (*vals);
512 		vals = vp = kmem_alloc(vsize, KM_SLEEP);
513 
514 		for (i = sp->sem_nsems, p = sp->sem_base; i--; p++, vp++)
515 			bcopy(&p->semval, vp, sizeof (p->semval));
516 
517 		mutex_exit(lock);
518 
519 		if (copyout((void *)vals, (void *)arg, vsize)) {
520 			kmem_free(vals, vsize);
521 			return (set_errno(EFAULT));
522 		}
523 
524 		kmem_free(vals, vsize);
525 		return (0);
526 
527 	/* Get # of processes sleeping for semval to become zero. */
528 	case GETZCNT:
529 		if (error = ipcperm_access(&sp->sem_perm, SEM_R, cr)) {
530 			mutex_exit(lock);
531 			return (set_errno(error));
532 		}
533 		if (semnum >= sp->sem_nsems) {
534 			mutex_exit(lock);
535 			return (set_errno(EINVAL));
536 		}
537 		retval = sp->sem_base[semnum].semzcnt;
538 		mutex_exit(lock);
539 		return (retval);
540 
541 	/* Set semval of one semaphore. */
542 	case SETVAL:
543 		if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
544 			mutex_exit(lock);
545 			return (set_errno(error));
546 		}
547 		if (semnum >= sp->sem_nsems) {
548 			mutex_exit(lock);
549 			return (set_errno(EINVAL));
550 		}
551 		if ((uint_t)arg > USHRT_MAX) {
552 			mutex_exit(lock);
553 			return (set_errno(ERANGE));
554 		}
555 		p = &sp->sem_base[semnum];
556 		if ((p->semval = (ushort_t)arg) != 0) {
557 			if (p->semncnt) {
558 				cv_broadcast(&p->semncnt_cv);
559 			}
560 		} else if (p->semzcnt) {
561 			cv_broadcast(&p->semzcnt_cv);
562 		}
563 		p->sempid = curproc->p_pid;
564 		sem_undo_clear(sp, (ushort_t)semnum, (ushort_t)semnum);
565 		mutex_exit(lock);
566 		return (0);
567 
568 	/* Set semvals of all semaphores in set. */
569 	case SETALL:
570 		/* Check if semaphore set has been deleted and reallocated. */
571 		if (sp->sem_nsems * sizeof (*vals) != vsize) {
572 			error = set_errno(EINVAL);
573 			goto seterr;
574 		}
575 		if (error = ipcperm_access(&sp->sem_perm, SEM_A, cr)) {
576 			error = set_errno(error);
577 			goto seterr;
578 		}
579 		sem_undo_clear(sp, 0, sp->sem_nsems - 1);
580 		for (i = 0, p = sp->sem_base; i < sp->sem_nsems;
581 		    (p++)->sempid = curproc->p_pid) {
582 			if ((p->semval = vals[i++]) != 0) {
583 				if (p->semncnt) {
584 					cv_broadcast(&p->semncnt_cv);
585 				}
586 			} else if (p->semzcnt) {
587 				cv_broadcast(&p->semzcnt_cv);
588 			}
589 		}
590 seterr:
591 		mutex_exit(lock);
592 		kmem_free(vals, vsize);
593 		return (error);
594 
595 	default:
596 		mutex_exit(lock);
597 		return (set_errno(EINVAL));
598 	}
599 
600 	/* NOTREACHED */
601 }
602 
/*
 * semexit - Called by exit() to clean up on process exit.
 *
 * Takes ownership of the process's undo tree by clearing p_semacct
 * under p_lock, then applies and frees every undo structure in it.
 * See the "Undo structures" discussion at the top of this file for
 * how this cooperates with sem_rmid.
 *
 * NOTE(review): tree is used without a NULL check, so callers are
 * presumably expected to call this only when p_semacct is set --
 * confirm against the caller.
 */
void
semexit(proc_t *pp)
{
	avl_tree_t	*tree;
	struct sem_undo	*undo;
	void		*cookie = NULL;

	/* From here on sem_rmid will see a NULL p_semacct and skip us. */
	mutex_enter(&pp->p_lock);
	tree = pp->p_semacct;
	pp->p_semacct = NULL;
	mutex_exit(&pp->p_lock);

	while (undo = avl_destroy_nodes(tree, &cookie)) {
		ksemid_t *sp = undo->un_sp;
		size_t size = SEM_UNDOSZ(sp->sem_nsems);
		int i;

		(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
		/* A set that was already removed gets no adjustments. */
		if (!IPC_FREE(&sp->sem_perm)) {
			for (i = 0; i < sp->sem_nsems; i++) {
				int adj = undo->un_aoe[i];
				if (adj) {
					struct sem *semp = &sp->sem_base[i];
					int v = (int)semp->semval + adj;

					/*
					 * Skip adjustments that would take
					 * the value out of range.
					 */
					if (v < 0 || v > USHRT_MAX)
						continue;
					semp->semval = (ushort_t)v;
					if (v == 0 && semp->semzcnt)
						cv_broadcast(&semp->semzcnt_cv);
					if (adj > 0 && semp->semncnt)
						cv_broadcast(&semp->semncnt_cv);
				}
			}
			list_remove(&sp->sem_undos, undo);
		}
		/* Drop the undo structure's hold on the set. */
		ipc_rele(sem_svc, (kipc_perm_t *)sp);
		kmem_free(undo, size);
	}

	avl_destroy(tree);
	kmem_free(tree, sizeof (avl_tree_t));
}
649 
/*
 * Remove all semaphores associated with a given zone.  Called by
 * zone_shutdown when the zone is halted.
 *
 * Registered through zone_key_create() in _init(); the arg parameter
 * is unused.
 */
/*ARGSUSED1*/
static void
sem_remove_zone(zoneid_t zoneid, void *arg)
{
	ipc_remove_zone(sem_svc, zoneid);
}
660 
/*
 * semget - Semget system call.
 *
 * Looks up the semaphore set for key or, if ipc_get() hands back a
 * fresh (IPC_FREE) object, initializes and commits a new set of nsems
 * semaphores.  Returns the set's id on success and the result of
 * set_errno() on failure.
 */
static int
semget(key_t key, int nsems, int semflg)
{
	ksemid_t	*sp;
	kmutex_t	*lock;
	int		id, error;
	proc_t		*pp = curproc;

top:
	if (error = ipc_get(sem_svc, key, semflg, (kipc_perm_t **)&sp, &lock))
		return (set_errno(error));

	if (!IPC_FREE(&sp->sem_perm)) {
		/*
		 * A semaphore with the requested key exists.
		 */
		if (!((nsems >= 0) && (nsems <= sp->sem_nsems))) {
			mutex_exit(lock);
			return (set_errno(EINVAL));
		}
	} else {
		/*
		 * This is a new semaphore set.  Finish initialization.
		 *
		 * NOTE(review): pp->p_lock is released below without a
		 * matching mutex_enter in this function, so ipc_get()
		 * presumably returns with it held on this path -- confirm
		 * against os/ipc.c.
		 */
		if (nsems <= 0 || (rctl_test(rc_process_semmsl, pp->p_rctls, pp,
		    nsems, RCA_SAFE) & RCT_DENY)) {
			mutex_exit(lock);
			mutex_exit(&pp->p_lock);
			ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
			return (set_errno(EINVAL));
		}
		mutex_exit(lock);
		mutex_exit(&pp->p_lock);

		/*
		 * We round the allocation up to coherency granularity
		 * so that multiple semaphore allocations won't result
		 * in the false sharing of their sem structures.
		 */
		sp->sem_base =
		    kmem_zalloc(P2ROUNDUP(nsems * sizeof (struct sem), 64),
		    KM_SLEEP);
		sp->sem_binary = (nsems == 1);
		sp->sem_nsems = (ushort_t)nsems;
		sp->sem_ctime = gethrestime_sec();
		sp->sem_otime = 0;
		list_create(&sp->sem_undos, sizeof (struct sem_undo),
		    offsetof(struct sem_undo, un_list));

		/* EAGAIN from ipc_commit_begin() restarts the whole call. */
		if (error = ipc_commit_begin(sem_svc, key, semflg,
		    (kipc_perm_t *)sp)) {
			if (error == EAGAIN)
				goto top;
			return (set_errno(error));
		}
		sp->sem_maxops =
		    rctl_enforced_value(rc_process_semopm, pp->p_rctls, pp);
		/*
		 * The semmsl limit is tested a second time here.
		 * NOTE(review): presumably because the limit may have
		 * changed while the locks were dropped above -- confirm.
		 */
		if (rctl_test(rc_process_semmsl, pp->p_rctls, pp, nsems,
		    RCA_SAFE) & RCT_DENY) {
			ipc_cleanup(sem_svc, (kipc_perm_t *)sp);
			return (set_errno(EINVAL));
		}
		lock = ipc_commit_end(sem_svc, &sp->sem_perm);
	}

	if (AU_AUDITING())
		audit_ipcget(AT_IPC_SEM, (void *)sp);

	/* Read the id while the lock is still held. */
	id = sp->sem_perm.ipc_id;
	mutex_exit(lock);
	return (id);
}
736 
737 /*
738  * semids system call.
739  */
740 static int
741 semids(int *buf, uint_t nids, uint_t *pnids)
742 {
743 	int error;
744 
745 	if (error = ipc_ids(sem_svc, buf, nids, pnids))
746 		return (set_errno(error));
747 
748 	return (0);
749 }
750 
751 
752 /*
753  * Helper function for semop - copies in the provided timespec and
754  * computes the absolute future time after which we must return.
755  */
756 static int
757 compute_timeout(timespec_t **tsp, timespec_t *ts, timespec_t *now,
758 	timespec_t *timeout)
759 {
760 	model_t datamodel = get_udatamodel();
761 
762 	if (datamodel == DATAMODEL_NATIVE) {
763 		if (copyin(timeout, ts, sizeof (timespec_t)))
764 			return (EFAULT);
765 	} else {
766 		timespec32_t ts32;
767 
768 		if (copyin(timeout, &ts32, sizeof (timespec32_t)))
769 			return (EFAULT);
770 		TIMESPEC32_TO_TIMESPEC(ts, &ts32)
771 	}
772 
773 	if (itimerspecfix(ts))
774 		return (EINVAL);
775 
776 	/*
777 	 * Convert the timespec value into absolute time.
778 	 */
779 	timespecadd(ts, now);
780 	*tsp = ts;
781 
782 	return (0);
783 }
784 
785 /*
786  * Undo structure comparator.  We sort based on ksemid_t pointer.
787  */
788 static int
789 sem_undo_compar(const void *x, const void *y)
790 {
791 	struct sem_undo *undo1 = (struct sem_undo *)x;
792 	struct sem_undo *undo2 = (struct sem_undo *)y;
793 
794 	if (undo1->un_sp < undo2->un_sp)
795 		return (-1);
796 	if (undo1->un_sp > undo2->un_sp)
797 		return (1);
798 	return (0);
799 }
800 
/*
 * Helper function for semop - creates an undo structure and adds it to
 * the process's avl tree and the semaphore's list.
 *
 * Entered with *lock (the set's ID lock) held.  The lock is dropped
 * for the sleeping allocations and reacquired through ipc_lock(), so
 * *lock is updated and is held again on return, including on the
 * EIDRM return.
 *
 * template is the lookup key for the process's tree.  On success 0 is
 * returned and *un points to the undo structure for (pp, sp): either
 * the newly inserted one, or one that was already in the tree by the
 * time the locks were retaken.  EIDRM is returned if the set was
 * removed while the lock was dropped.
 */
static int
sem_undo_alloc(proc_t *pp, ksemid_t *sp, kmutex_t **lock,
    struct sem_undo *template, struct sem_undo **un)
{
	size_t size;
	struct sem_undo *undo;
	avl_tree_t *tree = NULL;
	avl_index_t where;

	/* Drop the ID lock before the KM_SLEEP allocations below. */
	mutex_exit(*lock);

	size = SEM_UNDOSZ(sp->sem_nsems);
	undo = kmem_zalloc(size, KM_SLEEP);
	undo->un_proc = pp;
	undo->un_sp = sp;

	/*
	 * Unlocked peek at p_semacct: preallocate a tree in case one is
	 * needed.  The decision is re-checked under p_lock below.
	 */
	if (pp->p_semacct == NULL)
		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

	*lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
	if (IPC_FREE(&sp->sem_perm)) {
		/* The set was removed while the lock was dropped. */
		kmem_free(undo, size);
		if (tree)
			kmem_free(tree, sizeof (avl_tree_t));
		return (EIDRM);
	}

	mutex_enter(&pp->p_lock);
	if (tree) {
		if (pp->p_semacct == NULL) {
			avl_create(tree, sem_undo_compar,
			    sizeof (struct sem_undo),
			    offsetof(struct sem_undo, un_avl));
			pp->p_semacct = tree;
		} else {
			/* A tree was installed meanwhile; discard ours. */
			kmem_free(tree, sizeof (avl_tree_t));
		}
	}

	if (*un = avl_find(pp->p_semacct, template, &where)) {
		/* An undo structure already exists; discard the new one. */
		mutex_exit(&pp->p_lock);
		kmem_free(undo, size);
	} else {
		*un = undo;
		avl_insert(pp->p_semacct, undo, where);
		mutex_exit(&pp->p_lock);
		list_insert_head(&sp->sem_undos, undo);
		/* The undo structure keeps its own hold on the set. */
		ipc_hold(sem_svc, (kipc_perm_t *)sp);
	}


	return (0);
}
858 
859 /*
860  * semop - Semop system call.
861  */
862 static int
863 semop(int semid, struct sembuf *sops, size_t nsops, timespec_t *timeout)
864 {
865 	ksemid_t	*sp = NULL;
866 	kmutex_t	*lock;
867 	struct sembuf	*op;	/* ptr to operation */
868 	int		i;	/* loop control */
869 	struct sem	*semp;	/* ptr to semaphore */
870 	int 		error = 0;
871 	struct sembuf	*uops;	/* ptr to copy of user ops */
872 	struct sembuf 	x_sem;	/* avoid kmem_alloc's */
873 	timespec_t	now, ts, *tsp = NULL;
874 	int		timecheck = 0;
875 	int		cvres, needundo, mode;
876 	struct sem_undo	*undo;
877 	proc_t		*pp = curproc;
878 	int		held = 0;
879 
880 	CPU_STATS_ADDQ(CPU, sys, sema, 1); /* bump semaphore op count */
881 
882 	/*
883 	 * To avoid the cost of copying in 'timeout' in the common
884 	 * case, we could only grab the time here and defer the copyin
885 	 * and associated computations until we are about to block.
886 	 *
887 	 * The down side to this is that we would then have to spin
888 	 * some goto top nonsense to avoid the copyin behind the semid
889 	 * lock.  As a common use of timed semaphores is as an explicit
890 	 * blocking mechanism, this could incur a greater penalty.
891 	 *
892 	 * If we eventually decide that this would be a wise route to
893 	 * take, the deferrable functionality is completely contained
894 	 * in 'compute_timeout', and the interface is defined such that
895 	 * we can legally not validate 'timeout' if it is unused.
896 	 */
897 	if (timeout != NULL) {
898 		timecheck = timechanged;
899 		gethrestime(&now);
900 		if (error = compute_timeout(&tsp, &ts, &now, timeout))
901 			return (set_errno(error));
902 	}
903 
904 	/*
905 	 * Allocate space to hold the vector of semaphore ops.  If
906 	 * there is only 1 operation we use a preallocated buffer on
907 	 * the stack for speed.
908 	 *
909 	 * Since we don't want to allow the user to allocate an
910 	 * arbitrary amount of kernel memory, we need to check against
911 	 * the number of operations allowed by the semaphore.  We only
912 	 * bother doing this if the number of operations is larger than
913 	 * SEM_MAXUCOPS.
914 	 */
915 	if (nsops == 1)
916 		uops = &x_sem;
917 	else if (nsops == 0)
918 		return (0);
919 	else if (nsops <= SEM_MAXUCOPS)
920 		uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
921 
922 	if (nsops > SEM_MAXUCOPS) {
923 		if ((lock = ipc_lookup(sem_svc, semid,
924 		    (kipc_perm_t **)&sp)) == NULL)
925 			return (set_errno(EFAULT));
926 
927 		if (nsops > sp->sem_maxops) {
928 			mutex_exit(lock);
929 			return (set_errno(E2BIG));
930 		}
931 		held = 1;
932 		ipc_hold(sem_svc, (kipc_perm_t *)sp);
933 		mutex_exit(lock);
934 
935 		uops = kmem_alloc(nsops * sizeof (*uops), KM_SLEEP);
936 		if (copyin(sops, uops, nsops * sizeof (*op))) {
937 			error = EFAULT;
938 			(void) ipc_lock(sem_svc, sp->sem_perm.ipc_id);
939 			goto semoperr;
940 		}
941 
942 		lock = ipc_lock(sem_svc, sp->sem_perm.ipc_id);
943 		if (IPC_FREE(&sp->sem_perm)) {
944 			error = EIDRM;
945 			goto semoperr;
946 		}
947 	} else {
948 		/*
949 		 * This could be interleaved with the above code, but
950 		 * keeping them separate improves readability.
951 		 */
952 		if (copyin(sops, uops, nsops * sizeof (*op))) {
953 			error = EFAULT;
954 			goto semoperr_unlocked;
955 		}
956 
957 		if ((lock = ipc_lookup(sem_svc, semid,
958 		    (kipc_perm_t **)&sp)) == NULL) {
959 			error = EINVAL;
960 			goto semoperr_unlocked;
961 		}
962 
963 		if (nsops > sp->sem_maxops) {
964 			error = E2BIG;
965 			goto semoperr;
966 		}
967 	}
968 
969 	/*
970 	 * Scan all operations.  Verify that sem #s are in range and
971 	 * this process is allowed the requested operations.  If any
972 	 * operations are marked SEM_UNDO, find (or allocate) the undo
973 	 * structure for this process and semaphore.
974 	 */
975 	needundo = 0;
976 	mode = 0;
977 	for (i = 0, op = uops; i++ < nsops; op++) {
978 		mode |= op->sem_op ? SEM_A : SEM_R;
979 		if (op->sem_num >= sp->sem_nsems) {
980 			error = EFBIG;
981 			goto semoperr;
982 		}
983 		if ((op->sem_flg & SEM_UNDO) && op->sem_op)
984 			needundo = 1;
985 	}
986 	if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
987 		goto semoperr;
988 
989 	if (needundo) {
990 		struct sem_undo template;
991 
992 		template.un_sp = sp;
993 		mutex_enter(&pp->p_lock);
994 		if (pp->p_semacct)
995 			undo = avl_find(pp->p_semacct, &template, NULL);
996 		else
997 			undo = NULL;
998 		mutex_exit(&pp->p_lock);
999 		if (undo == NULL) {
1000 			if (!held) {
1001 				held = 1;
1002 				ipc_hold(sem_svc, (kipc_perm_t *)sp);
1003 			}
1004 			if (error = sem_undo_alloc(pp, sp, &lock, &template,
1005 			    &undo))
1006 				goto semoperr;
1007 
1008 			/* sem_undo_alloc unlocks the semaphore */
1009 			if (error = ipcperm_access(&sp->sem_perm, mode, CRED()))
1010 				goto semoperr;
1011 		}
1012 	}
1013 
1014 check:
1015 	/*
1016 	 * Loop waiting for the operations to be satisfied atomically.
1017 	 * Actually, do the operations and undo them if a wait is needed
1018 	 * or an error is detected.
1019 	 */
1020 	for (i = 0; i < nsops; i++) {
1021 		op = &uops[i];
1022 		semp = &sp->sem_base[op->sem_num];
1023 
1024 		/*
1025 		 * Raise the semaphore (i.e. sema_v)
1026 		 */
1027 		if (op->sem_op > 0) {
1028 			if (op->sem_op + (int)semp->semval > USHRT_MAX ||
1029 			    ((op->sem_flg & SEM_UNDO) &&
1030 			    (error = sem_undo_add(op->sem_op, op->sem_num,
1031 			    undo)))) {
1032 				if (i)
1033 					sem_rollback(sp, uops, i, undo);
1034 				if (error == 0)
1035 					error = ERANGE;
1036 				goto semoperr;
1037 			}
1038 			semp->semval += op->sem_op;
1039 			/*
1040 			 * If we are only incrementing the semaphore value
1041 			 * by one on a binary semaphore, we can cv_signal.
1042 			 */
1043 			if (semp->semncnt) {
1044 				if (op->sem_op == 1 && sp->sem_binary)
1045 					cv_signal(&semp->semncnt_cv);
1046 				else
1047 					cv_broadcast(&semp->semncnt_cv);
1048 			}
1049 			if (semp->semzcnt && !semp->semval)
1050 				cv_broadcast(&semp->semzcnt_cv);
1051 			continue;
1052 		}
1053 
1054 		/*
1055 		 * Lower the semaphore (i.e. sema_p)
1056 		 */
1057 		if (op->sem_op < 0) {
1058 			if (semp->semval >= (unsigned)(-op->sem_op)) {
1059 				if ((op->sem_flg & SEM_UNDO) &&
1060 				    (error = sem_undo_add(op->sem_op,
1061 				    op->sem_num, undo))) {
1062 					if (i)
1063 						sem_rollback(sp, uops, i, undo);
1064 					goto semoperr;
1065 				}
1066 				semp->semval += op->sem_op;
1067 				if (semp->semzcnt && !semp->semval)
1068 					cv_broadcast(&semp->semzcnt_cv);
1069 				continue;
1070 			}
1071 			if (i)
1072 				sem_rollback(sp, uops, i, undo);
1073 			if (op->sem_flg & IPC_NOWAIT) {
1074 				error = EAGAIN;
1075 				goto semoperr;
1076 			}
1077 
1078 			/*
1079 			 * Mark the semaphore set as not a binary type
1080 			 * if we are decrementing the value by more than 1.
1081 			 *
1082 			 * V operations will resort to cv_broadcast
1083 			 * for this set because there are too many weird
1084 			 * cases that have to be caught.
1085 			 */
1086 			if (op->sem_op < -1)
1087 				sp->sem_binary = 0;
1088 			if (!held) {
1089 				held = 1;
1090 				ipc_hold(sem_svc, (kipc_perm_t *)sp);
1091 			}
1092 			semp->semncnt++;
1093 			cvres = cv_waituntil_sig(&semp->semncnt_cv, lock,
1094 			    tsp, timecheck);
1095 			lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
1096 
1097 			if (!IPC_FREE(&sp->sem_perm)) {
1098 				ASSERT(semp->semncnt != 0);
1099 				semp->semncnt--;
1100 				if (cvres > 0)	/* normal wakeup */
1101 					goto check;
1102 			}
1103 
1104 			/* EINTR or EAGAIN overrides EIDRM */
1105 			if (cvres == 0)
1106 				error = EINTR;
1107 			else if (cvres < 0)
1108 				error = EAGAIN;
1109 			else
1110 				error = EIDRM;
1111 			goto semoperr;
1112 		}
1113 
1114 		/*
1115 		 * Wait for zero value
1116 		 */
1117 		if (semp->semval) {
1118 			if (i)
1119 				sem_rollback(sp, uops, i, undo);
1120 			if (op->sem_flg & IPC_NOWAIT) {
1121 				error = EAGAIN;
1122 				goto semoperr;
1123 			}
1124 
1125 			if (!held) {
1126 				held = 1;
1127 				ipc_hold(sem_svc, (kipc_perm_t *)sp);
1128 			}
1129 			semp->semzcnt++;
1130 			cvres = cv_waituntil_sig(&semp->semzcnt_cv, lock,
1131 			    tsp, timecheck);
1132 			lock = ipc_relock(sem_svc, sp->sem_perm.ipc_id, lock);
1133 
1134 			/*
1135 			 * Don't touch semp if the semaphores have been removed.
1136 			 */
1137 			if (!IPC_FREE(&sp->sem_perm)) {
1138 				ASSERT(semp->semzcnt != 0);
1139 				semp->semzcnt--;
1140 				if (cvres > 0)	/* normal wakeup */
1141 					goto check;
1142 			}
1143 
1144 			/* EINTR or EAGAIN overrides EIDRM */
1145 			if (cvres == 0)
1146 				error = EINTR;
1147 			else if (cvres < 0)
1148 				error = EAGAIN;
1149 			else
1150 				error = EIDRM;
1151 			goto semoperr;
1152 		}
1153 	}
1154 
1155 	/* All operations succeeded.  Update sempid for accessed semaphores. */
1156 	for (i = 0, op = uops; i++ < nsops;
1157 	    sp->sem_base[(op++)->sem_num].sempid = pp->p_pid)
1158 		;
1159 	sp->sem_otime = gethrestime_sec();
1160 	if (held)
1161 		ipc_rele(sem_svc, (kipc_perm_t *)sp);
1162 	else
1163 		mutex_exit(lock);
1164 
1165 	/* Before leaving, deallocate the buffer that held the user semops */
1166 	if (nsops != 1)
1167 		kmem_free(uops, sizeof (*uops) * nsops);
1168 	return (0);
1169 
1170 	/*
1171 	 * Error return labels
1172 	 */
1173 semoperr:
1174 	if (held)
1175 		ipc_rele(sem_svc, (kipc_perm_t *)sp);
1176 	else
1177 		mutex_exit(lock);
1178 
1179 semoperr_unlocked:
1180 
1181 	/* Before leaving, deallocate the buffer that held the user semops */
1182 	if (nsops != 1)
1183 		kmem_free(uops, sizeof (*uops) * nsops);
1184 	return (set_errno(error));
1185 }
1186 
1187 /*
1188  * semsys - System entry point for semctl, semget, and semop system calls.
1189  */
1190 static int
1191 semsys(int opcode, uintptr_t a1, uintptr_t a2, uintptr_t a3, uintptr_t a4)
1192 {
1193 	int error;
1194 
1195 	switch (opcode) {
1196 	case SEMCTL:
1197 		error = semctl((int)a1, (uint_t)a2, (int)a3, a4);
1198 		break;
1199 	case SEMGET:
1200 		error = semget((key_t)a1, (int)a2, (int)a3);
1201 		break;
1202 	case SEMOP:
1203 		error = semop((int)a1, (struct sembuf *)a2, (size_t)a3, 0);
1204 		break;
1205 	case SEMIDS:
1206 		error = semids((int *)a1, (uint_t)a2, (uint_t *)a3);
1207 		break;
1208 	case SEMTIMEDOP:
1209 		error = semop((int)a1, (struct sembuf *)a2, (size_t)a3,
1210 		    (timespec_t *)a4);
1211 		break;
1212 	default:
1213 		error = set_errno(EINVAL);
1214 		break;
1215 	}
1216 	return (error);
1217 }
1218