xref: /titanic_52/usr/src/uts/common/os/ipc.c (revision f35eb4e637f5e925af3bb7993f3bb87c5a69a696)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T		*/
26 /*	All Rights Reserved					*/
27 
28 
29 /*
30  * Common Inter-Process Communication routines.
31  *
32  * Overview
33  * --------
34  *
35  * The System V inter-process communication (IPC) facilities provide
36  * three services, message queues, semaphore arrays, and shared memory
37  * segments, which are managed using filesystem-like namespaces.
38  * Unlike a filesystem, these namespaces aren't mounted and accessible
39  * via a path -- a special API is used to interact with the different
40  * facilities (nothing precludes a VFS-based interface, but the
41  * standards require the special APIs).  Furthermore, these special
42  * APIs don't use file descriptors, nor do they have an equivalent.
43  * This means that every operation which acts on an object needs to
44  * perform the equivalent of a lookup, which in turn means that every
45  * operation can fail if the specified object doesn't exist in the
46  * facility's namespace.
47  *
48  * Objects
49  * -------
50  *
51  * Each object in a namespace has a unique ID, which is assigned by the
52  * system and is used to identify the object when performing operations
53  * on it.  An object can also have a key, which is selected by the user
54  * at allocation time and is used as a primitive rendezvous mechanism.
55  * An object without a key is said to have a "private" key.
56  *
57  * To perform an operation on an object given its key, one must first
58  * perform a lookup and obtain its ID.  The ID is then used to identify
59  * the object when performing the operation.  If the object has a
60  * private key, the ID must be known or obtained by other means.
61  *
62  * Each object in the namespace has a creator uid and gid, as well as
63  * an owner uid and gid.  Both are initialized with the ruid and rgid
64  * of the process which created the object.  The creator or current
65  * owner has the ability to change the owner of the object.
66  *
67  * Each object in the namespace has a set of file-like permissions,
68  * which, in conjunction with the creator and owner uid and gid,
69  * control read and write access to the object (execute is ignored).
70  *
71  * Each object also has a creator project and zone, which are used to
72  * account for its resource usage.
73  *
74  * Operations
75  * ----------
76  *
77  * There are five operations which all three facilities have in
78  * common: GET, SET, STAT, RMID, and IDS.
79  *
80  * GET, like open, is used to allocate a new object or obtain an
81  * existing one (using its key).  It takes a key, a set of flags and
82  * mode bits, and optionally facility-specific arguments.  If the key
83  * is IPC_PRIVATE, a new object with the requested mode bits and
84  * facility-specific attributes is created.  If the key isn't
85  * IPC_PRIVATE, the GET will attempt to look up the specified key and
86  * either return that or create a new key depending on the state of the
87  * IPC_CREAT and IPC_EXCL flags, much like open.  If GET needs to
88  * allocate an object, it can fail if there is insufficient space in
89  * the namespace (the maximum number of ids for the facility has been
90  * exceeded) or if the facility-specific initialization fails.  If GET
91  * finds an object it can return, it can still fail if that object's
92  * permissions or facility-specific attributes are less than those
93  * requested.
94  *
95  * SET is used to adjust facility-specific parameters of an object, in
96  * addition to the owner uid and gid, and mode bits.  It can fail if
97  * the caller isn't the creator or owner.
98  *
99  * STAT is used to obtain information about an object including the
100  * general object attributes described above as well as facility-specific
101  * information.  It can fail if the caller doesn't have read
102  * permission.
103  *
104  * RMID removes an object from the namespace.  Subsequent operations
105  * using the object's ID or key will fail (until another object is
106  * created with the same key or ID).  Since an RMID may be performed
107  * asynchronously with other operations, it is possible that other
108  * threads and/or processes will have references to the object.  While
109  * a facility may have actions which need to be performed at RMID time,
110  * only when all references are dropped can the object be destroyed.
111  * RMID will fail if the caller isn't the creator or owner.
112  *
113  * IDS obtains a list of all IDs in a facility's namespace.  There are
114  * no facility-specific behaviors of IDS.
115  *
116  * Design
117  * ------
118  *
119  * Because some IPC facilities provide services whose operations must
120  * scale, a mechanism which allows fast, concurrent access to
121  * individual objects is needed.  Of primary importance is object
122  * lookup based on ID (SET, STAT, others).  Allocation (GET),
123  * deallocation (RMID), ID enumeration (IDS), and key lookups (GET) are
124  * lesser concerns, but should be implemented in such a way that ID
125  * lookup isn't affected (at least not in the common case).
126  *
127  * Starting from the bottom up, each object is represented by a
128  * structure, the first member of which must be a kipc_perm_t.  The
129  * kipc_perm_t contains the information described above in "Objects", a
130  * reference count (since the object may continue to exist after it has
131  * been removed from the namespace), as well as some additional
132  * metadata used to manage data structure membership.  These objects
133  * are dynamically allocated.
134  *
135  * Above the objects is a power-of-two sized table of ID slots.  Each
136  * slot contains a pointer to an object, a sequence number, and a
137  * lock.  An object's ID is a function of its slot's index in the table
138  * and its slot's sequence number.  Every time a slot is released (via
139  * RMID) its sequence number is increased.  Strictly speaking, the
140  * sequence number is unnecessary.  However, checking the sequence
141  * number after a lookup provides a certain degree of robustness
142  * against the use of stale IDs (useful since nothing else does).  When
143  * the table fills up, it is resized (see Locking, below).
144  *
145  * Of an ID's 31 bits (an ID is, as defined by the standards, a signed
146  * int) the top IPC_SEQ_BITS are used for the sequence number with the
147  * remainder holding the index into the table.  The size of the table
148  * is therefore bounded at 2 ^ (31 - IPC_SEQ_BITS) slots.
149  *
150  * Managing this table is the ipc_service structure.  It contains a
151  * pointer to the dynamically allocated ID table, a namespace-global
152  * lock, an id_space for managing the free space in the table, and
153  * sundry other metadata necessary for the maintenance of the
154  * namespace.  An AVL tree of all keyed objects in the table (sorted by
155  * key) is used for key lookups.  An unordered doubly linked list of
156  * all objects in the namespace (keyed or not) is maintained to
157  * facilitate ID enumeration.
158  *
159  * To help visualize these relationships, here's a picture of a
160  * namespace with a table of size 8 containing three objects
161  * (IPC_SEQ_BITS = 28):
162  *
163  *
164  * +-ipc_service_t--+
165  * | table          *---\
166  * | keys           *---+----------------------\
167  * | all ids        *--\|                      |
168  * |                |  ||                      |
169  * +----------------+  ||                      |
170  *                     ||                      |
171  * /-------------------/|                      |
172  * |    /---------------/                      |
173  * |    |                                      |
174  * |    v                                      |
175  * |  +-0------+-1------+-2------+-3------+-4--+---+-5------+-6------+-7------+
176  * |  | Seq=3  |        |        | Seq=1  |    :   |        |        | Seq=6  |
177  * |  |        |        |        |        |    :   |        |        |        |
178  * |  +-*------+--------+--------+-*------+----+---+--------+--------+-*------+
179  * |    |                          |           |                       |
180  * |    |                      /---/           |      /----------------/
181  * |    |                      |               |      |
182  * |    v                      v               |      v
183  * |  +-kipc_perm_t-+        +-kipc_perm_t-+   |    +-kipc_perm_t-+
184  * |  | id=0x30     |        | id=0x13     |   |    | id=0x67     |
185  * |  | key=0xfeed  |        | key=0xbeef  |   |    | key=0xcafe  |
186  * \->| [list]      |<------>| [list]      |<------>| [list]      |
187  * /->| [avl left]  x   /--->| [avl left]  x   \--->| [avl left]  *---\
188  * |  | [avl right] x   |    | [avl right] x        | [avl right] *---+-\
189  * |  |             |   |    |             |        |             |   | |
190  * |  +-------------+   |    +-------------+        +-------------+   | |
191  * |                    \---------------------------------------------/ |
192  * \--------------------------------------------------------------------/
193  *
194  * Locking
195  * -------
196  *
197  * There are three locks (or sets of locks) which are used to ensure
198  * correctness: the slot locks, the namespace lock, and p_lock (needed
199  * when checking resource controls).  Their ordering is
200  *
201  *   namespace lock -> slot lock 0 -> ... -> slot lock t -> p_lock
202  *
203  * Generally speaking, the namespace lock is used to protect allocation
204  * and removal from the namespace, ID enumeration, and resizing the ID
205  * table.  Specifically:
206  *
207  * - write access to all fields of the ipc_service structure
208  * - read access to all variable fields of ipc_service except
209  *   ipcs_tabsz (table size) and ipcs_table (the table pointer)
210  * - read/write access to ipc_avl, ipc_list in visible objects'
211  *   kipc_perm structures (i.e. objects which have been removed from
212  *   the namespace don't have this restriction)
213  * - write access to ipct_seq and ipct_data in the table entries
214  *
215  * A slot lock by itself is meaningless (except when resizing).  Of
216  * greater interest conceptually is the notion of an ID lock -- a
217  * "virtual lock" which refers to whichever slot lock an object's ID
218  * currently hashes to.
219  *
220  * An ID lock protects all objects with that ID.  Normally there will
221  * only be one such object: the one pointed to by the locked slot.
222  * However, if an object is removed from the namespace but retains
223  * references (e.g. an attached shared memory segment which has been
224  * RMIDed), it continues to use the lock associated with its original
225  * ID.  While this can result in increased contention, operations which
226  * require taking the ID lock of removed objects are infrequent.
227  *
228  * Specifically, an ID lock protects the contents of an object's
229  * structure, including the contents of the embedded kipc_perm
230  * structure (but excluding those fields protected by the namespace
231  * lock).  It also protects the ipct_seq and ipct_data fields in its
232  * slot (it is really a slot lock, after all).
233  *
234  * Recall that the table is resizable.  To avoid requiring every ID
235  * lookup to take a global lock, a scheme much like that employed for
236  * file descriptors (see the comment above UF_ENTER in user.h) is
237  * used.  Note that the sequence number and data pointer are protected
238  * by both the namespace lock and their slot lock.  When the table is
239  * resized, the following operations take place:
240  *
241  *   1) A new table is allocated.
242  *   2) The global lock is taken.
243  *   3) All old slots are locked, in order.
244  *   4) The first half of the new slots are locked.
245  *   5) All table entries are copied to the new table, and cleared from
246  *	the old table.
247  *   6) The ipc_service structure is updated to point to the new table.
248  *   7) The ipc_service structure is updated with the new table size.
249  *   8) All slot locks (old and new) are dropped.
250  *
251  * Because the slot locks are embedded in the table, ID lookups and
252  * other operations which require taking a slot lock need to verify
253  * that the lock taken wasn't part of a stale table.  This is
254  * accomplished by checking the table size before and after
255  * dereferencing the table pointer and taking the lock: if the size
256  * changes, the lock must be dropped and reacquired.  It is this
257  * additional work which distinguishes an ID lock from a slot lock.
258  *
259  * Because we can't guarantee that threads aren't accessing the old
260  * tables' locks, they are never deallocated.  To prevent spurious
261  * reports of memory leaks, a pointer to the discarded table is stored
262  * in the new one in step 5.  (Theoretically ipcs_destroy will delete
263  * the discarded tables, but it is only ever called from a failed _init
264  * invocation; i.e. when there aren't any.)
265  *
266  * Interfaces
267  * ----------
268  *
269  * The following interfaces are provided by the ipc module for use by
270  * the individual IPC facilities:
271  *
272  * ipcperm_access
273  *
274  *   Given an object and a cred structure, determines if the requested
275  *   access type is allowed.
276  *
277  * ipcperm_set, ipcperm_stat,
278  * ipcperm_set64, ipcperm_stat64
279  *
280  *   Performs the common portion of an STAT or SET operation.  All
281  *   (except stat and stat64) can fail, so they should be called before
282  *   any facility-specific non-reversible changes are made to an
283  *   object.  Similarly, the set operations have side effects, so they
284  *   should only be called once the possibility of a facility-specific
285  *   failure is eliminated.
286  *
287  * ipcs_create
288  *
289  *   Creates an IPC namespace for use by an IPC facility.
290  *
291  * ipcs_destroy
292  *
293  *   Destroys an IPC namespace.
294  *
295  * ipcs_lock, ipcs_unlock
296  *
297  *   Takes the namespace lock.  Ideally such access wouldn't be
298  *   necessary, but there may be facility-specific data protected by
299  *   this lock (e.g. project-wide resource consumption).
300  *
301  * ipc_lock
302  *
303  *   Takes the lock associated with an ID.  Can't fail.
304  *
305  * ipc_relock
306  *
307  *   Like ipc_lock, but takes a pointer to a held lock.  Drops the lock
308  *   unless it is the one that would have been returned by ipc_lock.
309  *   Used after calls to cv_wait.
310  *
311  * ipc_lookup
312  *
313  *   Performs an ID lookup, returns with the ID lock held.  Fails if
314  *   the ID doesn't exist in the namespace.
315  *
316  * ipc_hold
317  *
318  *   Takes a reference on an object.
319  *
320  * ipc_rele
321  *
322  *   Releases a reference on an object, and drops the object's lock.
323  *   Calls the object's destructor if last reference is being
324  *   released.
325  *
326  * ipc_rele_locked
327  *
328  *   Releases a reference on an object.  Doesn't drop lock, and may
329  *   only be called when there is more than one reference to the
330  *   object.
331  *
332  * ipc_get, ipc_commit_begin, ipc_commit_end, ipc_cleanup
333  *
334  *   Components of a GET operation.  ipc_get performs a key lookup,
335  *   allocating an object if the key isn't found (returning with the
336  *   namespace lock and p_lock held), and returning the existing object
337  *   if it is (with the object lock held).  ipc_get doesn't modify the
338  *   namespace.
339  *
340  *   ipc_commit_begin begins the process of inserting an object
341  *   allocated by ipc_get into the namespace, and can fail.  If
342  *   successful, it returns with the namespace lock and p_lock held.
343  *   ipc_commit_end completes the process of inserting an object into
344  *   the namespace and can't fail.  The facility can call ipc_cleanup
345  *   at any time following a successful ipc_get and before
346  *   ipc_commit_end or a failed ipc_commit_begin to fail the
347  *   allocation.  Pseudocode for the suggested GET implementation:
348  *
349  *   top:
350  *
351  *     ipc_get
352  *
353  *     if failure
354  *       return
355  *
356  *     if found {
357  *
358  *	 if object meets criteria
359  *	   unlock object and return success
360  *       else
361  *	   unlock object and return failure
362  *
363  *     } else {
364  *
365  *	 perform resource control tests
366  *	 drop namespace lock, p_lock
367  *	 if failure
368  *	   ipc_cleanup
369  *
370  *       perform facility-specific initialization
371  *	 if failure {
372  *	   facility-specific cleanup
373  *	   ipc_cleanup
374  *       }
375  *
376  *	 ( At this point the object should be destructible using the
377  *	   destructor given to ipcs_create )
378  *
379  *       ipc_commit_begin
380  *	 if retry
381  *	   goto top
382  *       else if failure
383  *         return
384  *
385  *       perform facility-specific resource control tests/allocations
386  *	 if failure
387  *	   ipc_cleanup
388  *
389  *	 ipc_commit_end
390  *	 perform any infallible post-creation actions, unlock, and return
391  *
392  *     }
393  *
394  * ipc_rmid
395  *
396  *   Performs the common portion of an RMID operation -- looks up an ID,
397  *   removes it, and calls a facility-specific function to do
398  *   RMID-time cleanup on the private portions of the object.
399  *
400  * ipc_ids
401  *
402  *   Performs the common portion of an IDS operation.
403  *
404  */
405 
406 #include <sys/types.h>
407 #include <sys/param.h>
408 #include <sys/cred.h>
409 #include <sys/policy.h>
410 #include <sys/proc.h>
411 #include <sys/user.h>
412 #include <sys/ipc.h>
413 #include <sys/ipc_impl.h>
414 #include <sys/errno.h>
415 #include <sys/systm.h>
416 #include <sys/list.h>
417 #include <sys/atomic.h>
418 #include <sys/zone.h>
419 #include <sys/task.h>
420 #include <sys/modctl.h>
421 
422 #include <c2/audit.h>
423 
424 static struct modlmisc modlmisc = {
425 	&mod_miscops,
426 	"common ipc code",
427 };
428 
429 static struct modlinkage modlinkage = {
430 	MODREV_1, (void *)&modlmisc, NULL
431 };
432 
433 
434 int
435 _init(void)
436 {
437 	return (mod_install(&modlinkage));
438 }
439 
440 int
441 _fini(void)
442 {
443 	return (mod_remove(&modlinkage));
444 }
445 
446 int
447 _info(struct modinfo *modinfop)
448 {
449 	return (mod_info(&modlinkage, modinfop));
450 }
451 
452 
453 /*
454  * Check message, semaphore, or shared memory access permissions.
455  *
456  * This routine verifies the requested access permission for the current
457  * process.  The zone ids are compared, and the appropriate bits are
458  * checked corresponding to owner, group (including the list of
459  * supplementary groups), or everyone.  Zero is returned on success.
460  * On failure, the security policy is asked to check to override the
461  * permissions check; the policy will either return 0 for access granted
462  * or EACCES.
463  *
464  * Access to objects in other zones requires that the caller be in the
465  * global zone and have the appropriate IPC_DAC_* privilege, regardless
466  * of whether the uid or gid match those of the object.  Note that
467  * cross-zone accesses will normally never get here since they'll
468  * fail in ipc_lookup or ipc_get.
469  *
470  * The arguments must be set up as follows:
471  * 	p - Pointer to permission structure to verify
472  * 	mode - Desired access permissions
473  */
474 int
475 ipcperm_access(kipc_perm_t *p, int mode, cred_t *cr)
476 {
477 	int shifts = 0;
478 	uid_t uid = crgetuid(cr);
479 	zoneid_t zoneid = getzoneid();
480 
481 	if (p->ipc_zoneid == zoneid) {
482 		if (uid != p->ipc_uid && uid != p->ipc_cuid) {
483 			shifts += 3;
484 			if (!groupmember(p->ipc_gid, cr) &&
485 			    !groupmember(p->ipc_cgid, cr))
486 				shifts += 3;
487 		}
488 
489 		mode &= ~(p->ipc_mode << shifts);
490 
491 		if (mode == 0)
492 			return (0);
493 	} else if (zoneid != GLOBAL_ZONEID)
494 		return (EACCES);
495 
496 	return (secpolicy_ipc_access(cr, p, mode));
497 }
498 
499 /*
500  * There are two versions of the ipcperm_set/stat functions:
501  *   ipcperm_???        - for use with IPC_SET/STAT
502  *   ipcperm_???_64     - for use with IPC_SET64/STAT64
503  *
504  * These functions encapsulate the common portions (copying, permission
505  * checks, and auditing) of the set/stat operations.  All, except for
506  * stat and stat_64 which are void, return 0 on success or a non-zero
507  * errno value on error.
508  */
509 
510 int
511 ipcperm_set(ipc_service_t *service, struct cred *cr,
512     kipc_perm_t *kperm, struct ipc_perm *perm, model_t model)
513 {
514 	STRUCT_HANDLE(ipc_perm, lperm);
515 	uid_t uid;
516 	gid_t gid;
517 	mode_t mode;
518 	zone_t *zone;
519 
520 	ASSERT(IPC_LOCKED(service, kperm));
521 
522 	STRUCT_SET_HANDLE(lperm, model, perm);
523 	uid = STRUCT_FGET(lperm, uid);
524 	gid = STRUCT_FGET(lperm, gid);
525 	mode = STRUCT_FGET(lperm, mode);
526 
527 	if (secpolicy_ipc_owner(cr, kperm) != 0)
528 		return (EPERM);
529 
530 	zone = crgetzone(cr);
531 	if (!VALID_UID(uid, zone) || !VALID_GID(gid, zone))
532 		return (EINVAL);
533 
534 	kperm->ipc_uid = uid;
535 	kperm->ipc_gid = gid;
536 	kperm->ipc_mode = (mode & 0777) | (kperm->ipc_mode & ~0777);
537 
538 	if (AU_AUDITING())
539 		audit_ipcget(service->ipcs_atype, kperm);
540 
541 	return (0);
542 }
543 
544 void
545 ipcperm_stat(struct ipc_perm *perm, kipc_perm_t *kperm, model_t model)
546 {
547 	STRUCT_HANDLE(ipc_perm, lperm);
548 
549 	STRUCT_SET_HANDLE(lperm, model, perm);
550 	STRUCT_FSET(lperm, uid, kperm->ipc_uid);
551 	STRUCT_FSET(lperm, gid, kperm->ipc_gid);
552 	STRUCT_FSET(lperm, cuid, kperm->ipc_cuid);
553 	STRUCT_FSET(lperm, cgid, kperm->ipc_cgid);
554 	STRUCT_FSET(lperm, mode, kperm->ipc_mode);
555 	STRUCT_FSET(lperm, seq, 0);
556 	STRUCT_FSET(lperm, key, kperm->ipc_key);
557 }
558 
559 int
560 ipcperm_set64(ipc_service_t *service, struct cred *cr,
561     kipc_perm_t *kperm, ipc_perm64_t *perm64)
562 {
563 	zone_t *zone;
564 
565 	ASSERT(IPC_LOCKED(service, kperm));
566 
567 	if (secpolicy_ipc_owner(cr, kperm) != 0)
568 		return (EPERM);
569 
570 	zone = crgetzone(cr);
571 	if (!VALID_UID(perm64->ipcx_uid, zone) ||
572 	    !VALID_GID(perm64->ipcx_gid, zone))
573 		return (EINVAL);
574 
575 	kperm->ipc_uid = perm64->ipcx_uid;
576 	kperm->ipc_gid = perm64->ipcx_gid;
577 	kperm->ipc_mode = (perm64->ipcx_mode & 0777) |
578 	    (kperm->ipc_mode & ~0777);
579 
580 	if (AU_AUDITING())
581 		audit_ipcget(service->ipcs_atype, kperm);
582 
583 	return (0);
584 }
585 
586 void
587 ipcperm_stat64(ipc_perm64_t *perm64, kipc_perm_t *kperm)
588 {
589 	perm64->ipcx_uid = kperm->ipc_uid;
590 	perm64->ipcx_gid = kperm->ipc_gid;
591 	perm64->ipcx_cuid = kperm->ipc_cuid;
592 	perm64->ipcx_cgid = kperm->ipc_cgid;
593 	perm64->ipcx_mode = kperm->ipc_mode;
594 	perm64->ipcx_key = kperm->ipc_key;
595 	perm64->ipcx_projid = kperm->ipc_proj->kpj_id;
596 	perm64->ipcx_zoneid = kperm->ipc_zoneid;
597 }
598 
599 
600 /*
601  * ipc key comparator.
602  */
603 static int
604 ipc_key_compar(const void *a, const void *b)
605 {
606 	kipc_perm_t *aperm = (kipc_perm_t *)a;
607 	kipc_perm_t *bperm = (kipc_perm_t *)b;
608 	int ak = aperm->ipc_key;
609 	int bk = bperm->ipc_key;
610 	zoneid_t az;
611 	zoneid_t bz;
612 
613 	ASSERT(ak != IPC_PRIVATE);
614 	ASSERT(bk != IPC_PRIVATE);
615 
616 	/*
617 	 * Compare key first, then zoneid.  This optimizes performance for
618 	 * systems with only one zone, since the zone checks will only be
619 	 * made when the keys match.
620 	 */
621 	if (ak < bk)
622 		return (-1);
623 	if (ak > bk)
624 		return (1);
625 
626 	/* keys match */
627 	az = aperm->ipc_zoneid;
628 	bz = bperm->ipc_zoneid;
629 	if (az < bz)
630 		return (-1);
631 	if (az > bz)
632 		return (1);
633 	return (0);
634 }
635 
636 /*
637  * Create an ipc service.
638  */
639 ipc_service_t *
640 ipcs_create(const char *name, rctl_hndl_t proj_rctl, rctl_hndl_t zone_rctl,
641     size_t size, ipc_func_t *dtor, ipc_func_t *rmid, int audit_type,
642     size_t rctl_offset)
643 {
644 	ipc_service_t *result;
645 
646 	result = kmem_alloc(sizeof (ipc_service_t), KM_SLEEP);
647 
648 	mutex_init(&result->ipcs_lock, NULL, MUTEX_ADAPTIVE, NULL);
649 	result->ipcs_count = 0;
650 	avl_create(&result->ipcs_keys, ipc_key_compar, size, 0);
651 	result->ipcs_tabsz = IPC_IDS_MIN;
652 	result->ipcs_table =
653 	    kmem_zalloc(IPC_IDS_MIN * sizeof (ipc_slot_t), KM_SLEEP);
654 	result->ipcs_ssize = size;
655 	result->ipcs_ids = id_space_create(name, 0, IPC_IDS_MIN);
656 	result->ipcs_dtor = dtor;
657 	result->ipcs_rmid = rmid;
658 	result->ipcs_proj_rctl = proj_rctl;
659 	result->ipcs_zone_rctl = zone_rctl;
660 	result->ipcs_atype = audit_type;
661 	ASSERT(rctl_offset < sizeof (ipc_rqty_t));
662 	result->ipcs_rctlofs = rctl_offset;
663 	list_create(&result->ipcs_usedids, sizeof (kipc_perm_t),
664 	    offsetof(kipc_perm_t, ipc_list));
665 
666 	return (result);
667 }
668 
669 /*
670  * Destroy an ipc service.
671  */
672 void
673 ipcs_destroy(ipc_service_t *service)
674 {
675 	ipc_slot_t *slot, *next;
676 
677 	mutex_enter(&service->ipcs_lock);
678 
679 	ASSERT(service->ipcs_count == 0);
680 	avl_destroy(&service->ipcs_keys);
681 	list_destroy(&service->ipcs_usedids);
682 	id_space_destroy(service->ipcs_ids);
683 
684 	for (slot = service->ipcs_table; slot; slot = next) {
685 		next = slot[0].ipct_chain;
686 		kmem_free(slot, service->ipcs_tabsz * sizeof (ipc_slot_t));
687 		service->ipcs_tabsz >>= 1;
688 	}
689 
690 	mutex_destroy(&service->ipcs_lock);
691 	kmem_free(service, sizeof (ipc_service_t));
692 }
693 
694 /*
695  * Takes the service lock.
696  */
697 void
698 ipcs_lock(ipc_service_t *service)
699 {
700 	mutex_enter(&service->ipcs_lock);
701 }
702 
703 /*
704  * Releases the service lock.
705  */
706 void
707 ipcs_unlock(ipc_service_t *service)
708 {
709 	mutex_exit(&service->ipcs_lock);
710 }
711 
712 
713 /*
714  * Locks the specified ID.  Returns the ID's ID table index.
715  */
716 static int
717 ipc_lock_internal(ipc_service_t *service, uint_t id)
718 {
719 	uint_t	tabsz;
720 	uint_t	index;
721 	kmutex_t *mutex;
722 
723 	for (;;) {
724 		tabsz = service->ipcs_tabsz;
725 		membar_consumer();
726 		index = id & (tabsz - 1);
727 		mutex = &service->ipcs_table[index].ipct_lock;
728 		mutex_enter(mutex);
729 		if (tabsz == service->ipcs_tabsz)
730 			break;
731 		mutex_exit(mutex);
732 	}
733 
734 	return (index);
735 }
736 
737 /*
738  * Locks the specified ID.  Returns a pointer to the ID's lock.
739  */
740 kmutex_t *
741 ipc_lock(ipc_service_t *service, int id)
742 {
743 	uint_t index;
744 
745 	/*
746 	 * These assertions don't reflect requirements of the code
747 	 * which follows, but they should never fail nonetheless.
748 	 */
749 	ASSERT(id >= 0);
750 	ASSERT(IPC_INDEX(id) < service->ipcs_tabsz);
751 	index = ipc_lock_internal(service, id);
752 
753 	return (&service->ipcs_table[index].ipct_lock);
754 }
755 
756 /*
757  * Checks to see if the held lock provided is the current lock for the
758  * specified id.  If so, we return it instead of dropping it and
759  * returning the result of ipc_lock.  This is intended to speed up cv
760  * wakeups where we are left holding a lock which could be stale, but
761  * probably isn't.
762  */
763 kmutex_t *
764 ipc_relock(ipc_service_t *service, int id, kmutex_t *lock)
765 {
766 	ASSERT(id >= 0);
767 	ASSERT(IPC_INDEX(id) < service->ipcs_tabsz);
768 	ASSERT(MUTEX_HELD(lock));
769 
770 	if (&service->ipcs_table[IPC_INDEX(id)].ipct_lock == lock)
771 		return (lock);
772 
773 	mutex_exit(lock);
774 	return (ipc_lock(service, id));
775 }
776 
777 /*
778  * Performs an ID lookup.  If the ID doesn't exist or has been removed,
779  * or isn't visible to the caller (because of zones), NULL is returned.
780  * Otherwise, a pointer to the ID's perm structure and held ID lock are
781  * returned.
782  */
783 kmutex_t *
784 ipc_lookup(ipc_service_t *service, int id, kipc_perm_t **perm)
785 {
786 	kipc_perm_t *result;
787 	uint_t index;
788 
789 	/*
790 	 * There is no need to check to see if id is in-range (i.e.
791 	 * positive and fits into the table).  If it is out-of-range,
792 	 * the id simply won't match the object's.
793 	 */
794 
795 	index = ipc_lock_internal(service, id);
796 	result = service->ipcs_table[index].ipct_data;
797 	if (result == NULL || result->ipc_id != (uint_t)id ||
798 	    !HASZONEACCESS(curproc, result->ipc_zoneid)) {
799 		mutex_exit(&service->ipcs_table[index].ipct_lock);
800 		return (NULL);
801 	}
802 
803 	ASSERT(IPC_SEQ(id) == service->ipcs_table[index].ipct_seq);
804 
805 	*perm = result;
806 	if (AU_AUDITING())
807 		audit_ipc(service->ipcs_atype, id, result);
808 
809 	return (&service->ipcs_table[index].ipct_lock);
810 }
811 
812 /*
813  * Increase the reference count on an ID.
814  */
815 /*ARGSUSED*/
816 void
817 ipc_hold(ipc_service_t *s, kipc_perm_t *perm)
818 {
819 	ASSERT(IPC_INDEX(perm->ipc_id) < s->ipcs_tabsz);
820 	ASSERT(IPC_LOCKED(s, perm));
821 	perm->ipc_ref++;
822 }
823 
824 /*
825  * Decrease the reference count on an ID and drops the ID's lock.
826  * Destroys the ID if the new reference count is zero.
827  */
828 void
829 ipc_rele(ipc_service_t *s, kipc_perm_t *perm)
830 {
831 	int nref;
832 
833 	ASSERT(IPC_INDEX(perm->ipc_id) < s->ipcs_tabsz);
834 	ASSERT(IPC_LOCKED(s, perm));
835 	ASSERT(perm->ipc_ref > 0);
836 
837 	nref = --perm->ipc_ref;
838 	mutex_exit(&s->ipcs_table[IPC_INDEX(perm->ipc_id)].ipct_lock);
839 
840 	if (nref == 0) {
841 		ASSERT(IPC_FREE(perm));		/* ipc_rmid clears IPC_ALLOC */
842 		s->ipcs_dtor(perm);
843 		project_rele(perm->ipc_proj);
844 		zone_rele_ref(&perm->ipc_zone_ref, ZONE_REF_IPC);
845 		kmem_free(perm, s->ipcs_ssize);
846 	}
847 }
848 
849 /*
850  * Decrease the reference count on an ID, but don't drop the ID lock.
851  * Used in cases where one thread needs to remove many references (on
852  * behalf of other parties).
853  */
854 void
855 ipc_rele_locked(ipc_service_t *s, kipc_perm_t *perm)
856 {
857 	ASSERT(perm->ipc_ref > 1);
858 	ASSERT(IPC_INDEX(perm->ipc_id) < s->ipcs_tabsz);
859 	ASSERT(IPC_LOCKED(s, perm));
860 
861 	perm->ipc_ref--;
862 }
863 
864 
865 /*
866  * Internal function to grow the service ID table.
867  */
868 static int
869 ipc_grow(ipc_service_t *service)
870 {
871 	ipc_slot_t *new, *old;
872 	int i, oldsize, newsize;
873 
874 	ASSERT(MUTEX_HELD(&service->ipcs_lock));
875 	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
876 
877 	if (service->ipcs_tabsz == IPC_IDS_MAX)
878 		return (ENOSPC);
879 
880 	oldsize = service->ipcs_tabsz;
881 	newsize = oldsize << 1;
882 	new = kmem_zalloc(newsize * sizeof (ipc_slot_t), KM_NOSLEEP);
883 	if (new == NULL)
884 		return (ENOSPC);
885 
886 	old = service->ipcs_table;
887 	for (i = 0; i < oldsize; i++) {
888 		mutex_enter(&old[i].ipct_lock);
889 		mutex_enter(&new[i].ipct_lock);
890 
891 		new[i].ipct_seq = old[i].ipct_seq;
892 		new[i].ipct_data = old[i].ipct_data;
893 		old[i].ipct_data = NULL;
894 	}
895 
896 	new[0].ipct_chain = old;
897 	service->ipcs_table = new;
898 	membar_producer();
899 	service->ipcs_tabsz = newsize;
900 
901 	for (i = 0; i < oldsize; i++) {
902 		mutex_exit(&old[i].ipct_lock);
903 		mutex_exit(&new[i].ipct_lock);
904 	}
905 
906 	id_space_extend(service->ipcs_ids, oldsize, service->ipcs_tabsz);
907 
908 	return (0);
909 }
910 
911 
912 static int
913 ipc_keylookup(ipc_service_t *service, key_t key, int flag, kipc_perm_t **permp)
914 {
915 	kipc_perm_t *perm = NULL;
916 	avl_index_t where;
917 	kipc_perm_t template;
918 
919 	ASSERT(MUTEX_HELD(&service->ipcs_lock));
920 
921 	template.ipc_key = key;
922 	template.ipc_zoneid = getzoneid();
923 	if (perm = avl_find(&service->ipcs_keys, &template, &where)) {
924 		ASSERT(!IPC_FREE(perm));
925 		if ((flag & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
926 			return (EEXIST);
927 		if ((flag & 0777) & ~perm->ipc_mode) {
928 			if (AU_AUDITING())
929 				audit_ipcget(NULL, (void *)perm);
930 			return (EACCES);
931 		}
932 		*permp = perm;
933 		return (0);
934 	} else if (flag & IPC_CREAT) {
935 		*permp = NULL;
936 		return (0);
937 	}
938 	return (ENOENT);
939 }
940 
941 static int
942 ipc_alloc_test(ipc_service_t *service, proc_t *pp)
943 {
944 	ASSERT(MUTEX_HELD(&service->ipcs_lock));
945 
946 	/*
947 	 * Resizing the table first would result in a cleaner code
948 	 * path, but would also allow a user to (permanently) double
949 	 * the id table size in cases where the allocation would be
950 	 * denied.  Hence we test the rctl first.
951 	 */
952 retry:
953 	mutex_enter(&pp->p_lock);
954 	if ((rctl_test(service->ipcs_proj_rctl, pp->p_task->tk_proj->kpj_rctls,
955 	    pp, 1, RCA_SAFE) & RCT_DENY) ||
956 	    (rctl_test(service->ipcs_zone_rctl, pp->p_zone->zone_rctls,
957 	    pp, 1, RCA_SAFE) & RCT_DENY)) {
958 		mutex_exit(&pp->p_lock);
959 		return (ENOSPC);
960 	}
961 
962 	if (service->ipcs_count == service->ipcs_tabsz) {
963 		int error;
964 
965 		mutex_exit(&pp->p_lock);
966 		if (error = ipc_grow(service))
967 			return (error);
968 		goto retry;
969 	}
970 
971 	return (0);
972 }
973 
974 /*
975  * Given a key, search for or create the associated identifier.
976  *
977  * If IPC_CREAT is specified and the key isn't found, or if the key is
978  * equal to IPC_PRIVATE, we return 0 and place a pointer to a newly
979  * allocated object structure in permp.  A pointer to the held service
980  * lock is placed in lockp.  ipc_mode's IPC_ALLOC bit is clear.
981  *
982  * If the key is found and no error conditions arise, we return 0 and
983  * place a pointer to the existing object structure in permp.  A
984  * pointer to the held ID lock is placed in lockp.  ipc_mode's
985  * IPC_ALLOC bit is set.
986  *
987  * Otherwise, a non-zero errno value is returned.
988  */
989 int
990 ipc_get(ipc_service_t *service, key_t key, int flag, kipc_perm_t **permp,
991     kmutex_t **lockp)
992 {
993 	kipc_perm_t	*perm = NULL;
994 	proc_t		*pp = curproc;
995 	int		error, index;
996 	cred_t		*cr = CRED();
997 
998 	if (key != IPC_PRIVATE) {
999 
1000 		mutex_enter(&service->ipcs_lock);
1001 		error = ipc_keylookup(service, key, flag, &perm);
1002 		if (perm != NULL)
1003 			index = ipc_lock_internal(service, perm->ipc_id);
1004 		mutex_exit(&service->ipcs_lock);
1005 
1006 		if (error) {
1007 			ASSERT(perm == NULL);
1008 			return (error);
1009 		}
1010 
1011 		if (perm) {
1012 			ASSERT(!IPC_FREE(perm));
1013 			*permp = perm;
1014 			*lockp = &service->ipcs_table[index].ipct_lock;
1015 			return (0);
1016 		}
1017 
1018 		/* Key not found; fall through */
1019 	}
1020 
1021 	perm = kmem_zalloc(service->ipcs_ssize, KM_SLEEP);
1022 
1023 	mutex_enter(&service->ipcs_lock);
1024 	if (error = ipc_alloc_test(service, pp)) {
1025 		mutex_exit(&service->ipcs_lock);
1026 		kmem_free(perm, service->ipcs_ssize);
1027 		return (error);
1028 	}
1029 
1030 	perm->ipc_cuid = perm->ipc_uid = crgetuid(cr);
1031 	perm->ipc_cgid = perm->ipc_gid = crgetgid(cr);
1032 	perm->ipc_zoneid = getzoneid();
1033 	perm->ipc_mode = flag & 0777;
1034 	perm->ipc_key = key;
1035 	perm->ipc_ref = 1;
1036 	perm->ipc_id = IPC_ID_INVAL;
1037 	*permp = perm;
1038 	*lockp = &service->ipcs_lock;
1039 
1040 	return (0);
1041 }
1042 
1043 /*
1044  * Attempts to add the a newly created ID to the global namespace.  If
1045  * creating it would cause an error, we return the error.  If there is
1046  * the possibility that we could obtain the existing ID and return it
1047  * to the user, we return EAGAIN.  Otherwise, we return 0 with p_lock
1048  * and the service lock held.
1049  *
1050  * Since this should be only called after all initialization has been
1051  * completed, on failure we automatically invoke the destructor for the
1052  * object and deallocate the memory associated with it.
1053  */
1054 int
1055 ipc_commit_begin(ipc_service_t *service, key_t key, int flag,
1056     kipc_perm_t *newperm)
1057 {
1058 	kipc_perm_t *perm;
1059 	int error;
1060 	proc_t *pp = curproc;
1061 
1062 	ASSERT(newperm->ipc_ref == 1);
1063 	ASSERT(IPC_FREE(newperm));
1064 
1065 	/*
1066 	 * Set ipc_proj and ipc_zone_ref so that future calls to ipc_cleanup()
1067 	 * clean up the necessary state.  This must be done before the
1068 	 * potential call to ipcs_dtor() below.
1069 	 */
1070 	newperm->ipc_proj = pp->p_task->tk_proj;
1071 	zone_init_ref(&newperm->ipc_zone_ref);
1072 	zone_hold_ref(pp->p_zone, &newperm->ipc_zone_ref, ZONE_REF_IPC);
1073 
1074 	mutex_enter(&service->ipcs_lock);
1075 	/*
1076 	 * Ensure that no-one has raced with us and created the key.
1077 	 */
1078 	if ((key != IPC_PRIVATE) &&
1079 	    (((error = ipc_keylookup(service, key, flag, &perm)) != 0) ||
1080 	    (perm != NULL))) {
1081 		error = error ? error : EAGAIN;
1082 		goto errout;
1083 	}
1084 
1085 	/*
1086 	 * Ensure that no-one has raced with us and used the last of
1087 	 * the permissible ids, or the last of the free spaces in the
1088 	 * id table.
1089 	 */
1090 	if (error = ipc_alloc_test(service, pp))
1091 		goto errout;
1092 
1093 	ASSERT(MUTEX_HELD(&service->ipcs_lock));
1094 	ASSERT(MUTEX_HELD(&pp->p_lock));
1095 
1096 	return (0);
1097 errout:
1098 	mutex_exit(&service->ipcs_lock);
1099 	service->ipcs_dtor(newperm);
1100 	zone_rele_ref(&newperm->ipc_zone_ref, ZONE_REF_IPC);
1101 	kmem_free(newperm, service->ipcs_ssize);
1102 	return (error);
1103 }
1104 
1105 /*
1106  * Commit the ID allocation transaction.  Called with p_lock and the
1107  * service lock held, both of which are dropped.  Returns the held ID
1108  * lock so the caller can extract the ID and perform ipcget auditing.
1109  */
1110 kmutex_t *
1111 ipc_commit_end(ipc_service_t *service, kipc_perm_t *perm)
1112 {
1113 	ipc_slot_t *slot;
1114 	avl_index_t where;
1115 	int index;
1116 	void *loc;
1117 
1118 	ASSERT(MUTEX_HELD(&service->ipcs_lock));
1119 	ASSERT(MUTEX_HELD(&curproc->p_lock));
1120 
1121 	(void) project_hold(perm->ipc_proj);
1122 	mutex_exit(&curproc->p_lock);
1123 
1124 	/*
1125 	 * Pick out our slot.
1126 	 */
1127 	service->ipcs_count++;
1128 	index = id_alloc(service->ipcs_ids);
1129 	ASSERT(index < service->ipcs_tabsz);
1130 	slot = &service->ipcs_table[index];
1131 	mutex_enter(&slot->ipct_lock);
1132 	ASSERT(slot->ipct_data == NULL);
1133 
1134 	/*
1135 	 * Update the perm structure.
1136 	 */
1137 	perm->ipc_mode |= IPC_ALLOC;
1138 	perm->ipc_id = (slot->ipct_seq << IPC_SEQ_SHIFT) | index;
1139 
1140 	/*
1141 	 * Push into global visibility.
1142 	 */
1143 	slot->ipct_data = perm;
1144 	if (perm->ipc_key != IPC_PRIVATE) {
1145 		loc = avl_find(&service->ipcs_keys, perm, &where);
1146 		ASSERT(loc == NULL);
1147 		avl_insert(&service->ipcs_keys, perm, where);
1148 	}
1149 	list_insert_head(&service->ipcs_usedids, perm);
1150 
1151 	/*
1152 	 * Update resource consumption.
1153 	 */
1154 	IPC_PROJ_USAGE(perm, service) += 1;
1155 	IPC_ZONE_USAGE(perm, service) += 1;
1156 
1157 	mutex_exit(&service->ipcs_lock);
1158 	return (&slot->ipct_lock);
1159 }
1160 
1161 /*
1162  * Clean up function, in case the allocation fails.  If called between
1163  * ipc_lookup and ipc_commit_begin, perm->ipc_proj will be 0 and we
1164  * merely free the perm structure.  If called after ipc_commit_begin,
1165  * we also drop locks and call the ID's destructor.
1166  */
1167 void
1168 ipc_cleanup(ipc_service_t *service, kipc_perm_t *perm)
1169 {
1170 	ASSERT(IPC_FREE(perm));
1171 	if (perm->ipc_proj) {
1172 		mutex_exit(&curproc->p_lock);
1173 		mutex_exit(&service->ipcs_lock);
1174 		service->ipcs_dtor(perm);
1175 	}
1176 	if (perm->ipc_zone_ref.zref_zone != NULL)
1177 		zone_rele_ref(&perm->ipc_zone_ref, ZONE_REF_IPC);
1178 	kmem_free(perm, service->ipcs_ssize);
1179 }
1180 
1181 
1182 /*
1183  * Common code to remove an IPC object.  This should be called after
1184  * all permissions checks have been performed, and with the service
1185  * and ID locked.  Note that this does not remove the object from
1186  * the ipcs_usedids list (this needs to be done by the caller before
1187  * dropping the service lock).
1188  */
1189 static void
1190 ipc_remove(ipc_service_t *service, kipc_perm_t *perm)
1191 {
1192 	int id = perm->ipc_id;
1193 	int index;
1194 
1195 	ASSERT(MUTEX_HELD(&service->ipcs_lock));
1196 	ASSERT(IPC_LOCKED(service, perm));
1197 
1198 	index = IPC_INDEX(id);
1199 
1200 	service->ipcs_table[index].ipct_data = NULL;
1201 
1202 	if (perm->ipc_key != IPC_PRIVATE)
1203 		avl_remove(&service->ipcs_keys, perm);
1204 	list_remove(&service->ipcs_usedids, perm);
1205 	perm->ipc_mode &= ~IPC_ALLOC;
1206 
1207 	id_free(service->ipcs_ids, index);
1208 
1209 	if (service->ipcs_table[index].ipct_seq++ == IPC_SEQ_MASK)
1210 		service->ipcs_table[index].ipct_seq = 0;
1211 	service->ipcs_count--;
1212 	ASSERT(IPC_PROJ_USAGE(perm, service) > 0);
1213 	ASSERT(IPC_ZONE_USAGE(perm, service) > 0);
1214 	IPC_PROJ_USAGE(perm, service) -= 1;
1215 	IPC_ZONE_USAGE(perm, service) -= 1;
1216 	ASSERT(service->ipcs_count || ((IPC_PROJ_USAGE(perm, service) == 0) &&
1217 	    (IPC_ZONE_USAGE(perm, service) == 0)));
1218 }
1219 
1220 
1221 /*
1222  * Common code to perform an IPC_RMID.  Returns an errno value on
1223  * failure, 0 on success.
1224  */
1225 int
1226 ipc_rmid(ipc_service_t *service, int id, cred_t *cr)
1227 {
1228 	kipc_perm_t *perm;
1229 	kmutex_t *lock;
1230 
1231 	mutex_enter(&service->ipcs_lock);
1232 
1233 	lock = ipc_lookup(service, id, &perm);
1234 	if (lock == NULL) {
1235 		mutex_exit(&service->ipcs_lock);
1236 		return (EINVAL);
1237 	}
1238 
1239 	ASSERT(service->ipcs_count > 0);
1240 
1241 	if (secpolicy_ipc_owner(cr, perm) != 0) {
1242 		mutex_exit(lock);
1243 		mutex_exit(&service->ipcs_lock);
1244 		return (EPERM);
1245 	}
1246 
1247 	/*
1248 	 * Nothing can fail from this point on.
1249 	 */
1250 	ipc_remove(service, perm);
1251 	mutex_exit(&service->ipcs_lock);
1252 
1253 	/* perform any per-service removal actions */
1254 	service->ipcs_rmid(perm);
1255 
1256 	ipc_rele(service, perm);
1257 
1258 	return (0);
1259 }
1260 
1261 /*
1262  * Implementation for shmids, semids, and msgids.  buf is the address
1263  * of the user buffer, nids is the size, and pnids is a pointer to
1264  * where we write the actual number of ids that [would] have been
1265  * copied out.
1266  */
1267 int
1268 ipc_ids(ipc_service_t *service, int *buf, uint_t nids, uint_t *pnids)
1269 {
1270 	kipc_perm_t *perm;
1271 	size_t	idsize = 0;
1272 	int	error = 0;
1273 	int	idcount;
1274 	int	*ids;
1275 	int	numids = 0;
1276 	zoneid_t zoneid = getzoneid();
1277 	int	global = INGLOBALZONE(curproc);
1278 
1279 	if (buf == NULL)
1280 		nids = 0;
1281 
1282 	/*
1283 	 * Get an accurate count of the total number of ids, and allocate a
1284 	 * staging buffer.  Since ipcs_count is always sane, we don't have
1285 	 * to take ipcs_lock for our first guess.  If there are no ids, or
1286 	 * we're in the global zone and the number of ids is greater than
1287 	 * the size of the specified buffer, we shunt to the end.  Otherwise,
1288 	 * we go through the id list looking for (and counting) what is
1289 	 * visible in the specified zone.
1290 	 */
1291 	idcount = service->ipcs_count;
1292 	for (;;) {
1293 		if ((global && idcount > nids) || idcount == 0) {
1294 			numids = idcount;
1295 			nids = 0;
1296 			goto out;
1297 		}
1298 
1299 		idsize = idcount * sizeof (int);
1300 		ids = kmem_alloc(idsize, KM_SLEEP);
1301 
1302 		mutex_enter(&service->ipcs_lock);
1303 		if (idcount >= service->ipcs_count)
1304 			break;
1305 		idcount = service->ipcs_count;
1306 		mutex_exit(&service->ipcs_lock);
1307 
1308 		if (idsize != 0) {
1309 			kmem_free(ids, idsize);
1310 			idsize = 0;
1311 		}
1312 	}
1313 
1314 	for (perm = list_head(&service->ipcs_usedids); perm != NULL;
1315 	    perm = list_next(&service->ipcs_usedids, perm)) {
1316 		ASSERT(!IPC_FREE(perm));
1317 		if (global || perm->ipc_zoneid == zoneid)
1318 			ids[numids++] = perm->ipc_id;
1319 	}
1320 	mutex_exit(&service->ipcs_lock);
1321 
1322 	/*
1323 	 * If there isn't enough space to hold all of the ids, just
1324 	 * return the number of ids without copying out any of them.
1325 	 */
1326 	if (nids < numids)
1327 		nids = 0;
1328 
1329 out:
1330 	if (suword32(pnids, (uint32_t)numids) ||
1331 	    (nids != 0 && copyout(ids, buf, numids * sizeof (int))))
1332 		error = EFAULT;
1333 	if (idsize != 0)
1334 		kmem_free(ids, idsize);
1335 	return (error);
1336 }
1337 
1338 /*
1339  * Destroy IPC objects from the given service that are associated with
1340  * the given zone.
1341  *
1342  * We can't hold on to the service lock when freeing objects, so we
1343  * first search the service and move all the objects to a private
1344  * list, then walk through and free them after dropping the lock.
1345  */
1346 void
1347 ipc_remove_zone(ipc_service_t *service, zoneid_t zoneid)
1348 {
1349 	kipc_perm_t *perm, *next;
1350 	list_t rmlist;
1351 	kmutex_t *lock;
1352 
1353 	list_create(&rmlist, sizeof (kipc_perm_t),
1354 	    offsetof(kipc_perm_t, ipc_list));
1355 
1356 	mutex_enter(&service->ipcs_lock);
1357 	for (perm = list_head(&service->ipcs_usedids); perm != NULL;
1358 	    perm = next) {
1359 		next = list_next(&service->ipcs_usedids, perm);
1360 		if (perm->ipc_zoneid != zoneid)
1361 			continue;
1362 
1363 		/*
1364 		 * Remove the object from the service, then put it on
1365 		 * the removal list so we can defer the call to
1366 		 * ipc_rele (which will actually free the structure).
1367 		 * We need to do this since the destructor may grab
1368 		 * the service lock.
1369 		 */
1370 		ASSERT(!IPC_FREE(perm));
1371 		lock = ipc_lock(service, perm->ipc_id);
1372 		ipc_remove(service, perm);
1373 		mutex_exit(lock);
1374 		list_insert_tail(&rmlist, perm);
1375 	}
1376 	mutex_exit(&service->ipcs_lock);
1377 
1378 	/*
1379 	 * Now that we've dropped the service lock, loop through the
1380 	 * private list freeing removed objects.
1381 	 */
1382 	for (perm = list_head(&rmlist); perm != NULL; perm = next) {
1383 		next = list_next(&rmlist, perm);
1384 		list_remove(&rmlist, perm);
1385 
1386 		(void) ipc_lock(service, perm->ipc_id);
1387 
1388 		/* perform any per-service removal actions */
1389 		service->ipcs_rmid(perm);
1390 
1391 		/* release reference */
1392 		ipc_rele(service, perm);
1393 	}
1394 
1395 	list_destroy(&rmlist);
1396 }
1397