/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * Common Inter-Process Communication routines. * * Overview * -------- * * The System V inter-process communication (IPC) facilities provide * three services, message queues, semaphore arrays, and shared memory * segments, which are managed using filesystem-like namespaces. * Unlike a filesystem, these namespaces aren't mounted and accessible * via a path -- a special API is used to interact with the different * facilities (nothing precludes a VFS-based interface, but the * standards require the special APIs). Furthermore, these special * APIs don't use file descriptors, nor do they have an equivalent. * This means that every operation which acts on an object needs to * perform the equivalent of a lookup, which in turn means that every * operation can fail if the specified object doesn't exist in the * facility's namespace. * * Objects * ------- * * Each object in a namespace has a unique ID, which is assigned by the * system and is used to identify the object when performing operations * on it. An object can also have a key, which is selected by the user * at allocation time and is used as a primitive rendezvous mechanism. * An object without a key is said to have a "private" key. * * To perform an operation on an object given its key, one must first * perform a lookup and obtain its ID. The ID is then used to identify * the object when performing the operation. If the object has a * private key, the ID must be known or obtained by other means. * * Each object in the namespace has a creator uid and gid, as well as * an owner uid and gid. Both are initialized with the ruid and rgid * of the process which created the object. The creator or current * owner has the ability to change the owner of the object. * * Each object in the namespace has a set of file-like permissions, * which, in conjunction with the creator and owner uid and gid, * control read and write access to the object (execute is ignored). * * Each object also has a creator project and zone, which are used to * account for its resource usage. * * Operations * ---------- * * There are five operations which all three facilities have in * common: GET, SET, STAT, RMID, and IDS. * * GET, like open, is used to allocate a new object or obtain an * existing one (using its key). It takes a key, a set of flags and * mode bits, and optionally facility-specific arguments. If the key * is IPC_PRIVATE, a new object with the requested mode bits and * facility-specific attributes is created.
If the key isn't * IPC_PRIVATE, the GET will attempt to look up the specified key and * either return that or create a new object depending on the state of the * IPC_CREAT and IPC_EXCL flags, much like open. If GET needs to * allocate an object, it can fail if there is insufficient space in * the namespace (the maximum number of ids for the facility has been * exceeded) or if the facility-specific initialization fails. If GET * finds an object it can return, it can still fail if that object's * permissions or facility-specific attributes are less than those * requested. * * SET is used to adjust facility-specific parameters of an object, in * addition to the owner uid and gid, and mode bits. It can fail if * the caller isn't the creator or owner. * * STAT is used to obtain information about an object including the * general object attributes described above as well as facility-specific * information. It can fail if the caller doesn't have read * permission. * * RMID removes an object from the namespace. Subsequent operations * using the object's ID or key will fail (until another object is * created with the same key or ID). Since an RMID may be performed * asynchronously with other operations, it is possible that other * threads and/or processes will have references to the object. While * a facility may have actions which need to be performed at RMID time, * only when all references are dropped can the object be destroyed. * RMID will fail if the caller isn't the creator or owner. * * IDS obtains a list of all IDs in a facility's namespace. There are * no facility-specific behaviors of IDS. * * Design * ------ * * Because some IPC facilities provide services whose operations must * scale, a mechanism which allows fast, concurrent access to * individual objects is needed. Of primary importance is object * lookup based on ID (SET, STAT, others). Allocation (GET), * deallocation (RMID), ID enumeration (IDS), and key lookups (GET) are * lesser concerns, but should be implemented in such a way that ID * lookup isn't affected (at least not in the common case). * * Starting from the bottom up, each object is represented by a * structure, the first member of which must be a kipc_perm_t. The * kipc_perm_t contains the information described above in "Objects", a * reference count (since the object may continue to exist after it has * been removed from the namespace), as well as some additional * metadata used to manage data structure membership. These objects * are dynamically allocated. * * Above the objects is a power-of-two sized table of ID slots. Each * slot contains a pointer to an object, a sequence number, and a * lock. An object's ID is a function of its slot's index in the table * and its slot's sequence number. Every time a slot is released (via * RMID) its sequence number is increased. Strictly speaking, the * sequence number is unnecessary. However, checking the sequence * number after a lookup provides a certain degree of robustness * against the use of stale IDs (useful since nothing else does). When * the table fills up, it is resized (see Locking, below). * * Of an ID's 31 bits (an ID is, as defined by the standards, a signed * int) the top IPC_SEQ_BITS are used for the sequence number with the * remainder holding the index into the table. The size of the table * is therefore bounded at 2 ^ (31 - IPC_SEQ_BITS) slots. * * Managing this table is the ipc_service structure.
It contains a * pointer to the dynamically allocated ID table, a namespace-global * lock, an id_space for managing the free space in the table, and * sundry other metadata necessary for the maintenance of the * namespace. An AVL tree of all keyed objects in the table (sorted by * key) is used for key lookups. An unordered doubly linked list of * all objects in the namespace (keyed or not) is maintained to * facilitate ID enumeration. * * To help visualize these relationships, here's a picture of a * namespace with a table of size 8 containing three objects * (IPC_SEQ_BITS = 27): * * * +-ipc_service_t--+ * | table *---\ * | keys *---+----------------------\ * | all ids *--\| | * | | || | * +----------------+ || | * || | * /-------------------/| | * | /---------------/ | * | | | * | v | * | +-0------+-1------+-2------+-3------+-4--+---+-5------+-6------+-7------+ * | | Seq=3 | | | Seq=1 | : | | | Seq=6 | * | | | | | | : | | | | * | +-*------+--------+--------+-*------+----+---+--------+--------+-*------+ * | | | | | * | | /---/ | /----------------/ * | | | | | * | v v | v * | +-kipc_perm_t-+ +-kipc_perm_t-+ | +-kipc_perm_t-+ * | | id=0x30 | | id=0x13 | | | id=0x67 | * | | key=0xfeed | | key=0xbeef | | | key=0xcafe | * \->| [list] |<------>| [list] |<------>| [list] | * /->| [avl left] x /--->| [avl left] x \--->| [avl left] *---\ * | | [avl right] x | | [avl right] x | [avl right] *---+-\ * | | | | | | | | | | * | +-------------+ | +-------------+ +-------------+ | | * | \---------------------------------------------/ | * \--------------------------------------------------------------------/ * * Locking * ------- * * There are three locks (or sets of locks) which are used to ensure * correctness: the slot locks, the namespace lock, and p_lock (needed * when checking resource controls). Their ordering is * * namespace lock -> slot lock 0 -> ... -> slot lock t -> p_lock * * Generally speaking, the namespace lock is used to protect allocation * and removal from the namespace, ID enumeration, and resizing the ID * table. Specifically: * * - write access to all fields of the ipc_service structure * - read access to all variable fields of ipc_service except * ipcs_tabsz (table size) and ipcs_table (the table pointer) * - read/write access to ipc_avl, ipc_list in visible objects' * kipc_perm structures (i.e. objects which have been removed from * the namespace don't have this restriction) * - write access to ipct_seq and ipct_data in the table entries * * A slot lock by itself is meaningless (except when resizing). Of * greater interest conceptually is the notion of an ID lock -- a * "virtual lock" which refers to whichever slot lock an object's ID * currently hashes to. * * An ID lock protects all objects with that ID. Normally there will * only be one such object: the one pointed to by the locked slot. * However, if an object is removed from the namespace but retains * references (e.g. an attached shared memory segment which has been * RMIDed), it continues to use the lock associated with its original * ID. While this can result in increased contention, operations which * require taking the ID lock of removed objects are infrequent. * * Specifically, an ID lock protects the contents of an object's * structure, including the contents of the embedded kipc_perm * structure (but excluding those fields protected by the namespace * lock). It also protects the ipct_seq and ipct_data fields in its * slot (it is really a slot lock, after all). * * Recall that the table is resizable.
To avoid requiring every ID * lookup to take a global lock, a scheme much like that employed for * file descriptors (see the comment above UF_ENTER in user.h) is * used. Note that the sequence number and data pointer are protected * by both the namespace lock and their slot lock. When the table is * resized, the following operations take place: * * 1) A new table is allocated. * 2) The global lock is taken. * 3) All old slots are locked, in order. * 4) The first half of the new slots are locked. * 5) All table entries are copied to the new table, and cleared from * the old table. * 6) The ipc_service structure is updated to point to the new table. * 7) The ipc_service structure is updated with the new table size. * 8) All slot locks (old and new) are dropped. * * Because the slot locks are embedded in the table, ID lookups and * other operations which require taking a slot lock need to verify * that the lock taken wasn't part of a stale table. This is * accomplished by checking the table size before and after * dereferencing the table pointer and taking the lock: if the size * changes, the lock must be dropped and reacquired. It is this * additional work which distinguishes an ID lock from a slot lock. * * Because we can't guarantee that threads aren't accessing the old * tables' locks, they are never deallocated. To prevent spurious * reports of memory leaks, a pointer to the discarded table is stored * in the new one in step 5. (Theoretically ipcs_destroy will delete * the discarded tables, but it is only ever called from a failed _init * invocation; i.e. when there aren't any.) * * Interfaces * ---------- * * The following interfaces are provided by the ipc module for use by * the individual IPC facilities: * * ipcperm_access * * Given an object and a cred structure, determines if the requested * access type is allowed. * * ipcperm_set, ipcperm_stat, * ipcperm_set64, ipcperm_stat64 * * Perform the common portion of a STAT or SET operation. All * (except stat and stat64) can fail, so they should be called before * any facility-specific non-reversible changes are made to an * object. Similarly, the set operations have side effects, so they * should only be called once the possibility of a facility-specific * failure is eliminated. * * ipcs_create * * Creates an IPC namespace for use by an IPC facility. * * ipcs_destroy * * Destroys an IPC namespace. * * ipcs_lock, ipcs_unlock * * Take and release the namespace lock. Ideally such access wouldn't be * necessary, but there may be facility-specific data protected by * this lock (e.g. project-wide resource consumption). * * ipc_lock * * Takes the lock associated with an ID. Can't fail. * * ipc_relock * * Like ipc_lock, but takes a pointer to a held lock. Drops the lock * unless it is the one that would have been returned by ipc_lock. * Used after calls to cv_wait. * * ipc_lookup * * Performs an ID lookup, returns with the ID lock held. Fails if * the ID doesn't exist in the namespace. * * ipc_hold * * Takes a reference on an object. * * ipc_rele * * Releases a reference on an object, and drops the object's lock. * Calls the object's destructor if the last reference is being * released. * * ipc_rele_locked * * Releases a reference on an object. Doesn't drop the lock, and may * only be called when there is more than one reference to the * object. * * ipc_get, ipc_commit_begin, ipc_commit_end, ipc_cleanup * * Components of a GET operation.
ipc_get performs a key lookup, * allocating an object if the key isn't found (returning with the * namespace lock and p_lock held), and returning the existing object * if it is (with the object lock held). ipc_get doesn't modify the * namespace. * * ipc_commit_begin begins the process of inserting an object * allocated by ipc_get into the namespace, and can fail. If * successful, it returns with the namespace lock and p_lock held. * ipc_commit_end completes the process of inserting an object into * the namespace and can't fail. The facility can call ipc_cleanup * at any time following a successful ipc_get and before * ipc_commit_end or a failed ipc_commit_begin to fail the * allocation. Pseudocode for the suggested GET implementation: * * top: * * ipc_get * * if failure * return * * if found { * * if object meets criteria * unlock object and return success * else * unlock object and return failure * * } else { * * perform resource control tests * drop namespace lock, p_lock * if failure * ipc_cleanup * * perform facility-specific initialization * if failure { * facility-specific cleanup * ipc_cleanup * } * * ( At this point the object should be destructible using the * destructor given to ipcs_create ) * * ipc_commit_begin * if retry * goto top * else if failure * return * * perform facility-specific resource control tests/allocations * if failure * ipc_cleanup * * ipc_commit_end * perform any infallible post-creation actions, unlock, and return * * } * * ipc_rmid * * Performs the common portion of an RMID operation -- looks up an ID, * removes it, and calls a facility-specific function to do * RMID-time cleanup on the private portions of the object. * * ipc_ids * * Performs the common portion of an IDS operation. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct modlmisc modlmisc = { &mod_miscops, "common ipc code", }; static struct modlinkage modlinkage = { MODREV_1, (void *)&modlmisc, NULL }; int _init(void) { return (mod_install(&modlinkage)); } int _fini(void) { return (mod_remove(&modlinkage)); } int _info(struct modinfo *modinfop) { return (mod_info(&modlinkage, modinfop)); } /* * Check message, semaphore, or shared memory access permissions. * * This routine verifies the requested access permission for the current * process. The zone ids are compared, and the appropriate bits are * checked corresponding to owner, group (including the list of * supplementary groups), or everyone. Zero is returned on success. * On failure, the security policy is consulted to see whether it can * override the permissions check; the policy will either return 0 for * access granted or EACCES. * * Access to objects in other zones requires that the caller be in the * global zone and have the appropriate IPC_DAC_* privilege, regardless * of whether the uid or gid match those of the object. Note that * cross-zone accesses will normally never get here since they'll * fail in ipc_lookup or ipc_get.
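 *
 * To make the bit check below concrete: the desired access is passed
 * in the owner position (for example, 0400 for read and 0200 for
 * write).  If the caller matches the object's owner or creator uid,
 * the object's mode bits are used as-is; if it is only a member of the
 * owner's or creator's group, they are shifted left by 3; otherwise by
 * 6, so that the relevant permission bits line up with the requested
 * bits before the comparison is made.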
* * The arguments must be set up as follows: * p - Pointer to permission structure to verify * mode - Desired access permissions */ int ipcperm_access(kipc_perm_t *p, int mode, cred_t *cr) { int shifts = 0; uid_t uid = crgetuid(cr); zoneid_t zoneid = getzoneid(); if (p->ipc_zoneid == zoneid) { if (uid != p->ipc_uid && uid != p->ipc_cuid) { shifts += 3; if (!groupmember(p->ipc_gid, cr) && !groupmember(p->ipc_cgid, cr)) shifts += 3; } mode &= ~(p->ipc_mode << shifts); if (mode == 0) return (0); } else if (zoneid != GLOBAL_ZONEID) return (EACCES); return (secpolicy_ipc_access(cr, p, mode)); } /* * There are two versions of the ipcperm_set/stat functions: * ipcperm_??? - for use with IPC_SET/STAT * ipcperm_???_64 - for use with IPC_SET64/STAT64 * * These functions encapsulate the common portions (copying, permission * checks, and auditing) of the set/stat operations. All, except for * stat and stat_64 which are void, return 0 on success or a non-zero * errno value on error. */ int ipcperm_set(ipc_service_t *service, struct cred *cr, kipc_perm_t *kperm, struct ipc_perm *perm, model_t model) { STRUCT_HANDLE(ipc_perm, lperm); uid_t uid; gid_t gid; mode_t mode; zone_t *zone; ASSERT(IPC_LOCKED(service, kperm)); STRUCT_SET_HANDLE(lperm, model, perm); uid = STRUCT_FGET(lperm, uid); gid = STRUCT_FGET(lperm, gid); mode = STRUCT_FGET(lperm, mode); if (secpolicy_ipc_owner(cr, kperm) != 0) return (EPERM); zone = crgetzone(cr); if (!VALID_UID(uid, zone) || !VALID_GID(gid, zone)) return (EINVAL); kperm->ipc_uid = uid; kperm->ipc_gid = gid; kperm->ipc_mode = (mode & 0777) | (kperm->ipc_mode & ~0777); if (AU_AUDITING()) audit_ipcget(service->ipcs_atype, kperm); return (0); } void ipcperm_stat(struct ipc_perm *perm, kipc_perm_t *kperm, model_t model) { STRUCT_HANDLE(ipc_perm, lperm); STRUCT_SET_HANDLE(lperm, model, perm); STRUCT_FSET(lperm, uid, kperm->ipc_uid); STRUCT_FSET(lperm, gid, kperm->ipc_gid); STRUCT_FSET(lperm, cuid, kperm->ipc_cuid); STRUCT_FSET(lperm, cgid, kperm->ipc_cgid); STRUCT_FSET(lperm, mode, kperm->ipc_mode); STRUCT_FSET(lperm, seq, 0); STRUCT_FSET(lperm, key, kperm->ipc_key); } int ipcperm_set64(ipc_service_t *service, struct cred *cr, kipc_perm_t *kperm, ipc_perm64_t *perm64) { zone_t *zone; ASSERT(IPC_LOCKED(service, kperm)); if (secpolicy_ipc_owner(cr, kperm) != 0) return (EPERM); zone = crgetzone(cr); if (!VALID_UID(perm64->ipcx_uid, zone) || !VALID_GID(perm64->ipcx_gid, zone)) return (EINVAL); kperm->ipc_uid = perm64->ipcx_uid; kperm->ipc_gid = perm64->ipcx_gid; kperm->ipc_mode = (perm64->ipcx_mode & 0777) | (kperm->ipc_mode & ~0777); if (AU_AUDITING()) audit_ipcget(service->ipcs_atype, kperm); return (0); } void ipcperm_stat64(ipc_perm64_t *perm64, kipc_perm_t *kperm) { perm64->ipcx_uid = kperm->ipc_uid; perm64->ipcx_gid = kperm->ipc_gid; perm64->ipcx_cuid = kperm->ipc_cuid; perm64->ipcx_cgid = kperm->ipc_cgid; perm64->ipcx_mode = kperm->ipc_mode; perm64->ipcx_key = kperm->ipc_key; perm64->ipcx_projid = kperm->ipc_proj->kpj_id; perm64->ipcx_zoneid = kperm->ipc_zoneid; } /* * ipc key comparator. */ static int ipc_key_compar(const void *a, const void *b) { kipc_perm_t *aperm = (kipc_perm_t *)a; kipc_perm_t *bperm = (kipc_perm_t *)b; int ak = aperm->ipc_key; int bk = bperm->ipc_key; zoneid_t az; zoneid_t bz; ASSERT(ak != IPC_PRIVATE); ASSERT(bk != IPC_PRIVATE); /* * Compare key first, then zoneid. This optimizes performance for * systems with only one zone, since the zone checks will only be * made when the keys match. 
*/ if (ak < bk) return (-1); if (ak > bk) return (1); /* keys match */ az = aperm->ipc_zoneid; bz = bperm->ipc_zoneid; if (az < bz) return (-1); if (az > bz) return (1); return (0); } /* * Create an ipc service. */ ipc_service_t * ipcs_create(const char *name, rctl_hndl_t proj_rctl, rctl_hndl_t zone_rctl, size_t size, ipc_func_t *dtor, ipc_func_t *rmid, int audit_type, size_t rctl_offset) { ipc_service_t *result; result = kmem_alloc(sizeof (ipc_service_t), KM_SLEEP); mutex_init(&result->ipcs_lock, NULL, MUTEX_ADAPTIVE, NULL); result->ipcs_count = 0; avl_create(&result->ipcs_keys, ipc_key_compar, size, 0); result->ipcs_tabsz = IPC_IDS_MIN; result->ipcs_table = kmem_zalloc(IPC_IDS_MIN * sizeof (ipc_slot_t), KM_SLEEP); result->ipcs_ssize = size; result->ipcs_ids = id_space_create(name, 0, IPC_IDS_MIN); result->ipcs_dtor = dtor; result->ipcs_rmid = rmid; result->ipcs_proj_rctl = proj_rctl; result->ipcs_zone_rctl = zone_rctl; result->ipcs_atype = audit_type; ASSERT(rctl_offset < sizeof (ipc_rqty_t)); result->ipcs_rctlofs = rctl_offset; list_create(&result->ipcs_usedids, sizeof (kipc_perm_t), offsetof(kipc_perm_t, ipc_list)); return (result); } /* * Destroy an ipc service. */ void ipcs_destroy(ipc_service_t *service) { ipc_slot_t *slot, *next; mutex_enter(&service->ipcs_lock); ASSERT(service->ipcs_count == 0); avl_destroy(&service->ipcs_keys); list_destroy(&service->ipcs_usedids); id_space_destroy(service->ipcs_ids); for (slot = service->ipcs_table; slot; slot = next) { next = slot[0].ipct_chain; kmem_free(slot, service->ipcs_tabsz * sizeof (ipc_slot_t)); service->ipcs_tabsz >>= 1; } mutex_destroy(&service->ipcs_lock); kmem_free(service, sizeof (ipc_service_t)); } /* * Takes the service lock. */ void ipcs_lock(ipc_service_t *service) { mutex_enter(&service->ipcs_lock); } /* * Releases the service lock. */ void ipcs_unlock(ipc_service_t *service) { mutex_exit(&service->ipcs_lock); } /* * Locks the specified ID. Returns the ID's ID table index. */ static int ipc_lock_internal(ipc_service_t *service, uint_t id) { uint_t tabsz; uint_t index; kmutex_t *mutex; for (;;) { tabsz = service->ipcs_tabsz; membar_consumer(); index = id & (tabsz - 1); mutex = &service->ipcs_table[index].ipct_lock; mutex_enter(mutex); if (tabsz == service->ipcs_tabsz) break; mutex_exit(mutex); } return (index); } /* * Locks the specified ID. Returns a pointer to the ID's lock. */ kmutex_t * ipc_lock(ipc_service_t *service, int id) { uint_t index; /* * These assertions don't reflect requirements of the code * which follows, but they should never fail nonetheless. */ ASSERT(id >= 0); ASSERT(IPC_INDEX(id) < service->ipcs_tabsz); index = ipc_lock_internal(service, id); return (&service->ipcs_table[index].ipct_lock); } /* * Checks to see if the held lock provided is the current lock for the * specified id. If so, we return it instead of dropping it and * returning the result of ipc_lock. This is intended to speed up cv * wakeups where we are left holding a lock which could be stale, but * probably isn't. */ kmutex_t * ipc_relock(ipc_service_t *service, int id, kmutex_t *lock) { ASSERT(id >= 0); ASSERT(IPC_INDEX(id) < service->ipcs_tabsz); ASSERT(MUTEX_HELD(lock)); if (&service->ipcs_table[IPC_INDEX(id)].ipct_lock == lock) return (lock); mutex_exit(lock); return (ipc_lock(service, id)); } /* * Performs an ID lookup. If the ID doesn't exist or has been removed, * or isn't visible to the caller (because of zones), NULL is returned. * Otherwise, a pointer to the ID's perm structure and held ID lock are * returned. 
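 *
 * Callers are expected to check for a NULL return and, when an object
 * is found, to drop the returned lock (directly with mutex_exit(), or
 * indirectly via ipc_rele()) once they are done with it; see
 * ipc_rmid() below for a typical caller.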
*/ kmutex_t * ipc_lookup(ipc_service_t *service, int id, kipc_perm_t **perm) { kipc_perm_t *result; uint_t index; /* * There is no need to check to see if id is in-range (i.e. * positive and fits into the table). If it is out-of-range, * the id simply won't match the object's. */ index = ipc_lock_internal(service, id); result = service->ipcs_table[index].ipct_data; if (result == NULL || result->ipc_id != (uint_t)id || !HASZONEACCESS(curproc, result->ipc_zoneid)) { mutex_exit(&service->ipcs_table[index].ipct_lock); return (NULL); } ASSERT(IPC_SEQ(id) == service->ipcs_table[index].ipct_seq); *perm = result; if (AU_AUDITING()) audit_ipc(service->ipcs_atype, id, result); return (&service->ipcs_table[index].ipct_lock); } /* * Increases the reference count on an ID. */ /*ARGSUSED*/ void ipc_hold(ipc_service_t *s, kipc_perm_t *perm) { ASSERT(IPC_INDEX(perm->ipc_id) < s->ipcs_tabsz); ASSERT(IPC_LOCKED(s, perm)); perm->ipc_ref++; } /* * Decreases the reference count on an ID and drops the ID's lock. * Destroys the ID if the new reference count is zero. */ void ipc_rele(ipc_service_t *s, kipc_perm_t *perm) { int nref; ASSERT(IPC_INDEX(perm->ipc_id) < s->ipcs_tabsz); ASSERT(IPC_LOCKED(s, perm)); ASSERT(perm->ipc_ref > 0); nref = --perm->ipc_ref; mutex_exit(&s->ipcs_table[IPC_INDEX(perm->ipc_id)].ipct_lock); if (nref == 0) { ASSERT(IPC_FREE(perm)); /* ipc_rmid clears IPC_ALLOC */ s->ipcs_dtor(perm); project_rele(perm->ipc_proj); zone_rele(perm->ipc_zone); kmem_free(perm, s->ipcs_ssize); } } /* * Decreases the reference count on an ID, but doesn't drop the ID lock. * Used in cases where one thread needs to remove many references (on * behalf of other parties). */ void ipc_rele_locked(ipc_service_t *s, kipc_perm_t *perm) { ASSERT(perm->ipc_ref > 1); ASSERT(IPC_INDEX(perm->ipc_id) < s->ipcs_tabsz); ASSERT(IPC_LOCKED(s, perm)); perm->ipc_ref--; } /* * Internal function to grow the service ID table.
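 *
 * This implements the table-resize protocol described in the "Locking"
 * section of the big comment at the top of this file: allocate the new
 * table, lock all of the old slots and the first half of the new ones,
 * copy the entries across, publish the new table pointer and then the
 * new table size, and finally drop all of the slot locks.  The old
 * table is chained off the new one (ipct_chain) rather than freed,
 * since other threads may still be referencing its locks.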
*/ static int ipc_grow(ipc_service_t *service) { ipc_slot_t *new, *old; int i, oldsize, newsize; ASSERT(MUTEX_HELD(&service->ipcs_lock)); ASSERT(MUTEX_NOT_HELD(&curproc->p_lock)); if (service->ipcs_tabsz == IPC_IDS_MAX) return (ENOSPC); oldsize = service->ipcs_tabsz; newsize = oldsize << 1; new = kmem_zalloc(newsize * sizeof (ipc_slot_t), KM_NOSLEEP); if (new == NULL) return (ENOSPC); old = service->ipcs_table; for (i = 0; i < oldsize; i++) { mutex_enter(&old[i].ipct_lock); mutex_enter(&new[i].ipct_lock); new[i].ipct_seq = old[i].ipct_seq; new[i].ipct_data = old[i].ipct_data; old[i].ipct_data = NULL; } new[0].ipct_chain = old; service->ipcs_table = new; membar_producer(); service->ipcs_tabsz = newsize; for (i = 0; i < oldsize; i++) { mutex_exit(&old[i].ipct_lock); mutex_exit(&new[i].ipct_lock); } id_space_extend(service->ipcs_ids, oldsize, service->ipcs_tabsz); return (0); } static int ipc_keylookup(ipc_service_t *service, key_t key, int flag, kipc_perm_t **permp) { kipc_perm_t *perm = NULL; avl_index_t where; kipc_perm_t template; ASSERT(MUTEX_HELD(&service->ipcs_lock)); template.ipc_key = key; template.ipc_zoneid = getzoneid(); if (perm = avl_find(&service->ipcs_keys, &template, &where)) { ASSERT(!IPC_FREE(perm)); if ((flag & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) return (EEXIST); if ((flag & 0777) & ~perm->ipc_mode) { if (AU_AUDITING()) audit_ipcget(NULL, (void *)perm); return (EACCES); } *permp = perm; return (0); } else if (flag & IPC_CREAT) { *permp = NULL; return (0); } return (ENOENT); } static int ipc_alloc_test(ipc_service_t *service, proc_t *pp) { ASSERT(MUTEX_HELD(&service->ipcs_lock)); /* * Resizing the table first would result in a cleaner code * path, but would also allow a user to (permanently) double * the id table size in cases where the allocation would be * denied. Hence we test the rctl first. */ retry: mutex_enter(&pp->p_lock); if ((rctl_test(service->ipcs_proj_rctl, pp->p_task->tk_proj->kpj_rctls, pp, 1, RCA_SAFE) & RCT_DENY) || (rctl_test(service->ipcs_zone_rctl, pp->p_zone->zone_rctls, pp, 1, RCA_SAFE) & RCT_DENY)) { mutex_exit(&pp->p_lock); return (ENOSPC); } if (service->ipcs_count == service->ipcs_tabsz) { int error; mutex_exit(&pp->p_lock); if (error = ipc_grow(service)) return (error); goto retry; } return (0); } /* * Given a key, search for or create the associated identifier. * * If IPC_CREAT is specified and the key isn't found, or if the key is * equal to IPC_PRIVATE, we return 0 and place a pointer to a newly * allocated object structure in permp. A pointer to the held service * lock is placed in lockp. ipc_mode's IPC_ALLOC bit is clear. * * If the key is found and no error conditions arise, we return 0 and * place a pointer to the existing object structure in permp. A * pointer to the held ID lock is placed in lockp. ipc_mode's * IPC_ALLOC bit is set. * * Otherwise, a non-zero errno value is returned. 
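 *
 * In the allocation case the caller is expected to either complete the
 * allocation with ipc_commit_begin()/ipc_commit_end() or abandon it
 * with ipc_cleanup(); see the GET pseudocode in the big comment at the
 * top of this file and the illustrative sketch at the end of this file.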
*/ int ipc_get(ipc_service_t *service, key_t key, int flag, kipc_perm_t **permp, kmutex_t **lockp) { kipc_perm_t *perm = NULL; proc_t *pp = curproc; int error, index; cred_t *cr = CRED(); if (key != IPC_PRIVATE) { mutex_enter(&service->ipcs_lock); error = ipc_keylookup(service, key, flag, &perm); if (perm != NULL) index = ipc_lock_internal(service, perm->ipc_id); mutex_exit(&service->ipcs_lock); if (error) { ASSERT(perm == NULL); return (error); } if (perm) { ASSERT(!IPC_FREE(perm)); *permp = perm; *lockp = &service->ipcs_table[index].ipct_lock; return (0); } /* Key not found; fall through */ } perm = kmem_zalloc(service->ipcs_ssize, KM_SLEEP); mutex_enter(&service->ipcs_lock); if (error = ipc_alloc_test(service, pp)) { mutex_exit(&service->ipcs_lock); kmem_free(perm, service->ipcs_ssize); return (error); } perm->ipc_cuid = perm->ipc_uid = crgetuid(cr); perm->ipc_cgid = perm->ipc_gid = crgetgid(cr); perm->ipc_zoneid = getzoneid(); perm->ipc_mode = flag & 0777; perm->ipc_key = key; perm->ipc_ref = 1; perm->ipc_id = IPC_ID_INVAL; *permp = perm; *lockp = &service->ipcs_lock; return (0); } /* * Attempts to add a newly created ID to the global namespace. If * creating it would cause an error, we return the error. If there is * the possibility that we could obtain the existing ID and return it * to the user, we return EAGAIN. Otherwise, we return 0 with p_lock * and the service lock held. * * Since this should only be called after all initialization has been * completed, on failure we automatically invoke the destructor for the * object and deallocate the memory associated with it. */ int ipc_commit_begin(ipc_service_t *service, key_t key, int flag, kipc_perm_t *newperm) { kipc_perm_t *perm; int error; proc_t *pp = curproc; ASSERT(newperm->ipc_ref == 1); ASSERT(IPC_FREE(newperm)); /* * Set ipc_proj and ipc_zone so that future calls to ipc_cleanup() * clean up the necessary state. This must be done before the * potential call to ipcs_dtor() below. */ newperm->ipc_proj = pp->p_task->tk_proj; newperm->ipc_zone = pp->p_zone; mutex_enter(&service->ipcs_lock); /* * Ensure that no-one has raced with us and created the key. */ if ((key != IPC_PRIVATE) && (((error = ipc_keylookup(service, key, flag, &perm)) != 0) || (perm != NULL))) { error = error ? error : EAGAIN; goto errout; } /* * Ensure that no-one has raced with us and used the last of * the permissible ids, or the last of the free spaces in the * id table. */ if (error = ipc_alloc_test(service, pp)) goto errout; ASSERT(MUTEX_HELD(&service->ipcs_lock)); ASSERT(MUTEX_HELD(&pp->p_lock)); return (0); errout: mutex_exit(&service->ipcs_lock); service->ipcs_dtor(newperm); kmem_free(newperm, service->ipcs_ssize); return (error); } /* * Commit the ID allocation transaction. Called with p_lock and the * service lock held, both of which are dropped. Returns the held ID * lock so the caller can extract the ID and perform ipcget auditing. */ kmutex_t * ipc_commit_end(ipc_service_t *service, kipc_perm_t *perm) { ipc_slot_t *slot; avl_index_t where; int index; void *loc; ASSERT(MUTEX_HELD(&service->ipcs_lock)); ASSERT(MUTEX_HELD(&curproc->p_lock)); (void) project_hold(perm->ipc_proj); (void) zone_hold(perm->ipc_zone); mutex_exit(&curproc->p_lock); /* * Pick out our slot. */ service->ipcs_count++; index = id_alloc(service->ipcs_ids); ASSERT(index < service->ipcs_tabsz); slot = &service->ipcs_table[index]; mutex_enter(&slot->ipct_lock); ASSERT(slot->ipct_data == NULL); /* * Update the perm structure.
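 * The ID is formed from the slot's current sequence number (in the
 * high bits) and the slot's index (in the low bits), as described in
 * the "Design" section of the big comment at the top of this file.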
*/ perm->ipc_mode |= IPC_ALLOC; perm->ipc_id = (slot->ipct_seq << IPC_SEQ_SHIFT) | index; /* * Push into global visibility. */ slot->ipct_data = perm; if (perm->ipc_key != IPC_PRIVATE) { loc = avl_find(&service->ipcs_keys, perm, &where); ASSERT(loc == NULL); avl_insert(&service->ipcs_keys, perm, where); } list_insert_head(&service->ipcs_usedids, perm); /* * Update resource consumption. */ IPC_PROJ_USAGE(perm, service) += 1; IPC_ZONE_USAGE(perm, service) += 1; mutex_exit(&service->ipcs_lock); return (&slot->ipct_lock); } /* * Clean up function, in case the allocation fails. If called between * ipc_get and ipc_commit_begin, perm->ipc_proj will be 0 and we * merely free the perm structure. If called after ipc_commit_begin, * we also drop locks and call the ID's destructor. */ void ipc_cleanup(ipc_service_t *service, kipc_perm_t *perm) { ASSERT(IPC_FREE(perm)); if (perm->ipc_proj) { mutex_exit(&curproc->p_lock); mutex_exit(&service->ipcs_lock); service->ipcs_dtor(perm); } kmem_free(perm, service->ipcs_ssize); } /* * Common code to remove an IPC object. This should be called after * all permissions checks have been performed, and with the service * and ID locked. Note that this removes the object from the * ipcs_usedids list as well, but does not drop any locks or release * any references. */ static void ipc_remove(ipc_service_t *service, kipc_perm_t *perm) { int id = perm->ipc_id; int index; ASSERT(MUTEX_HELD(&service->ipcs_lock)); ASSERT(IPC_LOCKED(service, perm)); index = IPC_INDEX(id); service->ipcs_table[index].ipct_data = NULL; if (perm->ipc_key != IPC_PRIVATE) avl_remove(&service->ipcs_keys, perm); list_remove(&service->ipcs_usedids, perm); perm->ipc_mode &= ~IPC_ALLOC; id_free(service->ipcs_ids, index); if (service->ipcs_table[index].ipct_seq++ == IPC_SEQ_MASK) service->ipcs_table[index].ipct_seq = 0; service->ipcs_count--; ASSERT(IPC_PROJ_USAGE(perm, service) > 0); ASSERT(IPC_ZONE_USAGE(perm, service) > 0); IPC_PROJ_USAGE(perm, service) -= 1; IPC_ZONE_USAGE(perm, service) -= 1; ASSERT(service->ipcs_count || ((IPC_PROJ_USAGE(perm, service) == 0) && (IPC_ZONE_USAGE(perm, service) == 0))); } /* * Common code to perform an IPC_RMID. Returns an errno value on * failure, 0 on success. */ int ipc_rmid(ipc_service_t *service, int id, cred_t *cr) { kipc_perm_t *perm; kmutex_t *lock; mutex_enter(&service->ipcs_lock); lock = ipc_lookup(service, id, &perm); if (lock == NULL) { mutex_exit(&service->ipcs_lock); return (EINVAL); } ASSERT(service->ipcs_count > 0); if (secpolicy_ipc_owner(cr, perm) != 0) { mutex_exit(lock); mutex_exit(&service->ipcs_lock); return (EPERM); } /* * Nothing can fail from this point on. */ ipc_remove(service, perm); mutex_exit(&service->ipcs_lock); /* perform any per-service removal actions */ service->ipcs_rmid(perm); ipc_rele(service, perm); return (0); } /* * Implementation for shmids, semids, and msgids. buf is the address * of the user buffer, nids is the size, and pnids is a pointer to * where we write the actual number of ids that [would] have been * copied out. */ int ipc_ids(ipc_service_t *service, int *buf, uint_t nids, uint_t *pnids) { kipc_perm_t *perm; size_t idsize = 0; int error = 0; int idcount; int *ids; int numids = 0; zoneid_t zoneid = getzoneid(); int global = INGLOBALZONE(curproc); if (buf == NULL) nids = 0; /* * Get an accurate count of the total number of ids, and allocate a * staging buffer. Since ipcs_count is always sane, we don't have * to take ipcs_lock for our first guess.
If there are no ids, or * we're in the global zone and the number of ids is greater than * the size of the specified buffer, we shunt to the end. Otherwise, * we go through the id list looking for (and counting) what is * visible in the specified zone. */ idcount = service->ipcs_count; for (;;) { if ((global && idcount > nids) || idcount == 0) { numids = idcount; nids = 0; goto out; } idsize = idcount * sizeof (int); ids = kmem_alloc(idsize, KM_SLEEP); mutex_enter(&service->ipcs_lock); if (idcount >= service->ipcs_count) break; idcount = service->ipcs_count; mutex_exit(&service->ipcs_lock); if (idsize != 0) { kmem_free(ids, idsize); idsize = 0; } } for (perm = list_head(&service->ipcs_usedids); perm != NULL; perm = list_next(&service->ipcs_usedids, perm)) { ASSERT(!IPC_FREE(perm)); if (global || perm->ipc_zoneid == zoneid) ids[numids++] = perm->ipc_id; } mutex_exit(&service->ipcs_lock); /* * If there isn't enough space to hold all of the ids, just * return the number of ids without copying out any of them. */ if (nids < numids) nids = 0; out: if (suword32(pnids, (uint32_t)numids) || (nids != 0 && copyout(ids, buf, numids * sizeof (int)))) error = EFAULT; if (idsize != 0) kmem_free(ids, idsize); return (error); } /* * Destroy IPC objects from the given service that are associated with * the given zone. * * We can't hold on to the service lock when freeing objects, so we * first search the service and move all the objects to a private * list, then walk through and free them after dropping the lock. */ void ipc_remove_zone(ipc_service_t *service, zoneid_t zoneid) { kipc_perm_t *perm, *next; list_t rmlist; kmutex_t *lock; list_create(&rmlist, sizeof (kipc_perm_t), offsetof(kipc_perm_t, ipc_list)); mutex_enter(&service->ipcs_lock); for (perm = list_head(&service->ipcs_usedids); perm != NULL; perm = next) { next = list_next(&service->ipcs_usedids, perm); if (perm->ipc_zoneid != zoneid) continue; /* * Remove the object from the service, then put it on * the removal list so we can defer the call to * ipc_rele (which will actually free the structure). * We need to do this since the destructor may grab * the service lock. */ ASSERT(!IPC_FREE(perm)); lock = ipc_lock(service, perm->ipc_id); ipc_remove(service, perm); mutex_exit(lock); list_insert_tail(&rmlist, perm); } mutex_exit(&service->ipcs_lock); /* * Now that we've dropped the service lock, loop through the * private list freeing removed objects. */ for (perm = list_head(&rmlist); perm != NULL; perm = next) { next = list_next(&rmlist, perm); list_remove(&rmlist, perm); (void) ipc_lock(service, perm->ipc_id); /* perform any per-service removal actions */ service->ipcs_rmid(perm); /* release reference */ ipc_rele(service, perm); } list_destroy(&rmlist); }
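
/*
 * Illustrative sketch (not part of the original implementation, and
 * excluded from compilation): one way a hypothetical facility, here
 * called "foo", might drive ipc_get(), ipc_commit_begin(),
 * ipc_commit_end(), and ipc_cleanup() to implement the GET pseudocode
 * from the big comment at the top of this file.  All foo_* names, the
 * kfoo_t layout, and the size check are assumptions made purely for
 * illustration; the real facilities (msg, sem, shm) differ in their
 * details.
 */
#if 0
typedef struct kfoo {
	kipc_perm_t	foo_perm;	/* must be the first member */
	size_t		foo_size;	/* hypothetical facility state */
} kfoo_t;

static ipc_service_t *foo_svc;		/* set up elsewhere via ipcs_create() */

static int
foo_get(key_t key, size_t size, int flag, int *idp)
{
	kfoo_t *fp;
	kmutex_t *lock;
	int error;

top:
	if ((error = ipc_get(foo_svc, key, flag, (kipc_perm_t **)&fp,
	    &lock)) != 0)
		return (error);

	if (!IPC_FREE(&fp->foo_perm)) {
		/*
		 * Found an existing object; the ID (slot) lock is held.
		 * Check it against the caller's criteria and return.
		 */
		if (size != 0 && fp->foo_size < size) {
			mutex_exit(lock);
			return (EINVAL);
		}
		*idp = fp->foo_perm.ipc_id;
		mutex_exit(lock);
		return (0);
	}

	/*
	 * A new object was allocated; the namespace lock and p_lock are
	 * held.  Facility-specific resource control tests would go here
	 * (calling ipc_cleanup() after dropping the locks on failure).
	 */
	mutex_exit(&curproc->p_lock);
	mutex_exit(lock);			/* the namespace lock */

	fp->foo_size = size;			/* facility-specific setup */

	/*
	 * At this point the object must be destructible by the
	 * destructor that was passed to ipcs_create().
	 */
	if ((error = ipc_commit_begin(foo_svc, key, flag,
	    &fp->foo_perm)) != 0) {
		if (error == EAGAIN)
			goto top;	/* lost a race; start over */
		return (error);		/* fp was already destroyed */
	}

	/*
	 * The namespace lock and p_lock are held again; fallible
	 * facility-specific allocations charged to resource controls
	 * would go here, calling ipc_cleanup(foo_svc, &fp->foo_perm)
	 * on failure.  ipc_commit_end() makes the object visible and
	 * returns its held ID lock.
	 */
	lock = ipc_commit_end(foo_svc, &fp->foo_perm);
	*idp = fp->foo_perm.ipc_id;
	mutex_exit(lock);
	return (0);
}
#endif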