/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static kthread_t *lwpsobj_owner(caddr_t); static void lwp_unsleep(kthread_t *t); static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip); static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg); extern int lwp_cond_signal(lwp_cond_t *cv); /* * Maximum number of user prio inheritance locks that can be held by a thread. * Used to limit kmem for each thread. This is a per-thread limit that * can be administered on a system wide basis (using /etc/system). 
* * Also, when a limit, say maxlwps is added for numbers of lwps within a
 * process, the per-thread limit automatically becomes a process-wide limit
 * of maximum number of held upi locks within a process:
 * maxheldupimx = maxnestupimx * maxlwps;
 */
static uint32_t maxnestupimx = 2000;

/*
 * The sobj_ops vector exports a set of functions needed when a thread
 * is asleep on a synchronization object of this type.
 */
static sobj_ops_t lwp_sobj_ops = {
	SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
};

/* Forward declaration; the ops vector below takes this function's address. */
static kthread_t *lwpsobj_pi_owner(upimutex_t *up);

/*
 * The sobj_ops vector for priority-inheriting user mutexes (SOBJ_USER_PI).
 * It is the vector handed to turnstile_block() in lwp_upimutex_lock().
 */
static sobj_ops_t lwp_sobj_pi_ops = {
	SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
	turnstile_change_pri
};

/* Sleep queue heads for blocked lwps; indexed by lwpsqhash(). */
static sleepq_head_t lwpsleepq[NSLEEPQ];

/*
 * Table of upimutex buckets.  Not static, so it is visible outside this
 * file.  NOTE(review): presumably the table that UPI_CHAIN() indexes;
 * confirm against the macro's definition.
 */
upib_t upimutextab[UPIMUTEX_TABSIZE];

#define	LWPCHAN_LOCK_SHIFT	10	/* 1024 locks for each pool */
#define	LWPCHAN_LOCK_SIZE	(1 << LWPCHAN_LOCK_SHIFT)

/*
 * We know that both lc_wchan and lc_wchan0 are addresses that most
 * likely are 8-byte aligned, so we shift off the low-order 3 bits.
 * 'pool' is either 0 or 1.
 */
#define	LWPCHAN_LOCK_HASH(X, pool)	\
	(((((X) >> 3) ^ ((X) >> (LWPCHAN_LOCK_SHIFT + 3))) & \
	(LWPCHAN_LOCK_SIZE - 1)) + ((pool)? LWPCHAN_LOCK_SIZE : 0))

/*
 * Two pools of LWPCHAN_LOCK_SIZE kernel mutexes.  LWPCHAN_LOCK_HASH()
 * selects the second pool by adding LWPCHAN_LOCK_SIZE when 'pool' is
 * non-zero.
 */
static kmutex_t lwpchanlock[2 * LWPCHAN_LOCK_SIZE];

/*
 * Is this a POSIX threads user-level lock requiring priority inheritance?
 */
#define	UPIMUTEX(type)	((type) & LOCK_PRIO_INHERIT)

/*
 * Return the sleep queue head for an lwpchan.  The two lwpchan words are
 * XORed, truncated to uint_t, and run through SQHASHINDEX() to pick a
 * slot in lwpsleepq[].
 */
static sleepq_head_t *
lwpsqhash(lwpchan_t *lwpchan)
{
	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
	return (&lwpsleepq[SQHASHINDEX(x)]);
}

/*
 * Lock an lwpchan.
 * Keep this in sync with lwpchan_unlock(), below.
 */
static void
lwpchan_lock(lwpchan_t *lwpchan, int pool)
{
	/* same XOR of the two lwpchan words that lwpsqhash() uses */
	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
	mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
}

/*
 * Unlock an lwpchan.
 * Keep this in sync with lwpchan_lock(), above.
*/ static void lwpchan_unlock(lwpchan_t *lwpchan, int pool) { uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0; mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]); } /* * Delete mappings from the lwpchan cache for pages that are being * unmapped by as_unmap(). Given a range of addresses, "start" to "end", * all mappings within the range are deleted from the lwpchan cache. */ void lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end) { lwpchan_data_t *lcp; lwpchan_hashbucket_t *hashbucket; lwpchan_hashbucket_t *endbucket; lwpchan_entry_t *ent; lwpchan_entry_t **prev; caddr_t addr; mutex_enter(&p->p_lcp_lock); lcp = p->p_lcp; hashbucket = lcp->lwpchan_cache; endbucket = hashbucket + lcp->lwpchan_size; for (; hashbucket < endbucket; hashbucket++) { if (hashbucket->lwpchan_chain == NULL) continue; mutex_enter(&hashbucket->lwpchan_lock); prev = &hashbucket->lwpchan_chain; /* check entire chain */ while ((ent = *prev) != NULL) { addr = ent->lwpchan_addr; if (start <= addr && addr < end) { *prev = ent->lwpchan_next; if (ent->lwpchan_pool == LWPCHAN_MPPOOL && (ent->lwpchan_type & LOCK_ROBUST)) lwp_mutex_cleanup(ent, LOCK_UNMAPPED); kmem_free(ent, sizeof (*ent)); atomic_add_32(&lcp->lwpchan_entries, -1); } else { prev = &ent->lwpchan_next; } } mutex_exit(&hashbucket->lwpchan_lock); } mutex_exit(&p->p_lcp_lock); } /* * Given an lwpchan cache pointer and a process virtual address, * return a pointer to the corresponding lwpchan hash bucket. */ static lwpchan_hashbucket_t * lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr) { uint_t i; /* * All user-level sync object addresses are 8-byte aligned. * Ignore the lowest 3 bits of the address and use the * higher-order 2*lwpchan_bits bits for the hash index. */ addr >>= 3; i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask; return (lcp->lwpchan_cache + i); } /* * (Re)allocate the per-process lwpchan cache. 
*/
/*
 *	p	process whose cache is being created or enlarged
 *	bits	log2 of the number of hash buckets in the new table;
 *		must lie in [LWPCHAN_INITIAL_BITS, LWPCHAN_MAX_BITS]
 *
 * The new table is allocated before p_lcp_lock is taken.  If, once the
 * lock is held, the installed table already has at least 'bits' bits,
 * the new table is freed and nothing changes.
 */
static void
lwpchan_alloc_cache(proc_t *p, uint_t bits)
{
	lwpchan_data_t *lcp;
	lwpchan_data_t *old_lcp;
	lwpchan_hashbucket_t *hashbucket;
	lwpchan_hashbucket_t *endbucket;
	lwpchan_hashbucket_t *newbucket;
	lwpchan_entry_t *ent;
	lwpchan_entry_t *next;
	uint_t count;

	ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);

	/* build the new, empty table without holding any locks */
	lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
	lcp->lwpchan_bits = bits;
	lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
	lcp->lwpchan_mask = lcp->lwpchan_size - 1;
	lcp->lwpchan_entries = 0;
	lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
	    sizeof (lwpchan_hashbucket_t), KM_SLEEP);
	lcp->lwpchan_next_data = NULL;

	mutex_enter(&p->p_lcp_lock);
	if ((old_lcp = p->p_lcp) != NULL) {
		if (old_lcp->lwpchan_bits >= bits) {
			/* someone beat us to it */
			mutex_exit(&p->p_lcp_lock);
			kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
			    sizeof (lwpchan_hashbucket_t));
			kmem_free(lcp, sizeof (lwpchan_data_t));
			return;
		}
		/*
		 * Acquire all of the old hash table locks.
		 */
		hashbucket = old_lcp->lwpchan_cache;
		endbucket = hashbucket + old_lcp->lwpchan_size;
		for (; hashbucket < endbucket; hashbucket++)
			mutex_enter(&hashbucket->lwpchan_lock);
		/*
		 * Move all of the old hash table entries to the
		 * new hash table.  The new hash table has not yet
		 * been installed so we don't need any of its locks.
		 */
		count = 0;
		hashbucket = old_lcp->lwpchan_cache;
		for (; hashbucket < endbucket; hashbucket++) {
			ent = hashbucket->lwpchan_chain;
			while (ent != NULL) {
				next = ent->lwpchan_next;
				/* rehash with the new table's bits/mask */
				newbucket = lwpchan_bucket(lcp,
				    (uintptr_t)ent->lwpchan_addr);
				ent->lwpchan_next = newbucket->lwpchan_chain;
				newbucket->lwpchan_chain = ent;
				ent = next;
				count++;
			}
			hashbucket->lwpchan_chain = NULL;
		}
		lcp->lwpchan_entries = count;
	}

	/*
	 * Retire the old hash table.  We can't actually kmem_free() it
	 * now because someone may still have a pointer to it.  Instead,
	 * we link it onto the new hash table's list of retired hash tables.
	 * The new hash table is double the size of the previous one, so
	 * the total size of all retired hash tables is less than the size
	 * of the new one.  exit() and exec() free the retired hash tables
	 * (see lwpchan_destroy_cache(), below).
	 */
	lcp->lwpchan_next_data = old_lcp;

	/*
	 * As soon as we store the new lcp, future locking operations will
	 * use it.  Therefore, we must ensure that all the state we've just
	 * established reaches global visibility before the new lcp does.
	 */
	membar_producer();
	p->p_lcp = lcp;

	if (old_lcp != NULL) {
		/*
		 * Release all of the old hash table locks.
		 */
		/* endbucket still points past the end of the old table */
		hashbucket = old_lcp->lwpchan_cache;
		for (; hashbucket < endbucket; hashbucket++)
			mutex_exit(&hashbucket->lwpchan_lock);
	}
	mutex_exit(&p->p_lcp_lock);
}

/*
 * Deallocate the lwpchan cache, and any dynamically allocated mappings.
 * Called when the process exits or execs.  All lwps except one have
 * exited so we need no locks here.
 *
 *	exec	non-zero: robust mutexes found in the cache are cleaned up
 *		with LOCK_UNMAPPED; zero: with LOCK_OWNERDEAD
 *
 * NOTE(review): curproc->p_lcp is dereferenced without a NULL check;
 * presumably callers only call this when a cache exists -- confirm.
 */
void
lwpchan_destroy_cache(int exec)
{
	proc_t *p = curproc;
	lwpchan_hashbucket_t *hashbucket;
	lwpchan_hashbucket_t *endbucket;
	lwpchan_data_t *lcp;
	lwpchan_entry_t *ent;
	lwpchan_entry_t *next;
	uint16_t lockflg;

	/* detach the cache from the process before tearing it down */
	lcp = p->p_lcp;
	p->p_lcp = NULL;

	lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
	/* only the current table holds entries; free each one */
	hashbucket = lcp->lwpchan_cache;
	endbucket = hashbucket + lcp->lwpchan_size;
	for (; hashbucket < endbucket; hashbucket++) {
		ent = hashbucket->lwpchan_chain;
		hashbucket->lwpchan_chain = NULL;
		while (ent != NULL) {
			next = ent->lwpchan_next;
			if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
			    (ent->lwpchan_type & LOCK_ROBUST))
				lwp_mutex_cleanup(ent, lockflg);
			kmem_free(ent, sizeof (*ent));
			ent = next;
		}
	}

	/* free the current table and every retired table chained behind it */
	while (lcp != NULL) {
		lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
		kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
		    sizeof (lwpchan_hashbucket_t));
		kmem_free(lcp, sizeof (lwpchan_data_t));
		lcp = next_lcp;
	}
}

/*
 * Return zero when there is an entry in the lwpchan cache for the
 * given process virtual address and non-zero when there is not.
* The returned non-zero value is the current length of the * hash chain plus one. The caller holds the hash bucket lock. */ static uint_t lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan, lwpchan_hashbucket_t *hashbucket) { lwpchan_entry_t *ent; uint_t count = 1; for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) { if (ent->lwpchan_addr == addr) { if (ent->lwpchan_type != type || ent->lwpchan_pool != pool) { /* * This shouldn't happen, but might if the * process reuses its memory for different * types of sync objects. We test first * to avoid grabbing the memory cache line. */ ent->lwpchan_type = (uint16_t)type; ent->lwpchan_pool = (uint16_t)pool; } *lwpchan = ent->lwpchan_lwpchan; return (0); } count++; } return (count); } /* * Return the cached lwpchan mapping if cached, otherwise insert * a virtual address to lwpchan mapping into the cache. */ static int lwpchan_get_mapping(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool) { proc_t *p = curproc; lwpchan_data_t *lcp; lwpchan_hashbucket_t *hashbucket; lwpchan_entry_t *ent; memid_t memid; uint_t count; uint_t bits; top: /* initialize the lwpchan cache, if necesary */ if ((lcp = p->p_lcp) == NULL) { lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS); goto top; } hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr); mutex_enter(&hashbucket->lwpchan_lock); if (lcp != p->p_lcp) { /* someone resized the lwpchan cache; start over */ mutex_exit(&hashbucket->lwpchan_lock); goto top; } if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) { /* it's in the cache */ mutex_exit(&hashbucket->lwpchan_lock); return (1); } mutex_exit(&hashbucket->lwpchan_lock); if (as_getmemid(as, addr, &memid) != 0) return (0); lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0]; lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1]; ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP); mutex_enter(&hashbucket->lwpchan_lock); if (lcp != p->p_lcp) { /* someone resized the 
lwpchan cache; start over */ mutex_exit(&hashbucket->lwpchan_lock); kmem_free(ent, sizeof (*ent)); goto top; } count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket); if (count == 0) { /* someone else added this entry to the cache */ mutex_exit(&hashbucket->lwpchan_lock); kmem_free(ent, sizeof (*ent)); return (1); } if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */ (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) { /* hash chain too long; reallocate the hash table */ mutex_exit(&hashbucket->lwpchan_lock); kmem_free(ent, sizeof (*ent)); lwpchan_alloc_cache(p, bits + 1); goto top; } ent->lwpchan_addr = addr; ent->lwpchan_type = (uint16_t)type; ent->lwpchan_pool = (uint16_t)pool; ent->lwpchan_lwpchan = *lwpchan; ent->lwpchan_next = hashbucket->lwpchan_chain; hashbucket->lwpchan_chain = ent; atomic_add_32(&lcp->lwpchan_entries, 1); mutex_exit(&hashbucket->lwpchan_lock); return (1); } /* * Return a unique pair of identifiers that corresponds to a * synchronization object's virtual address. Process-shared * sync objects usually get vnode/offset from as_getmemid(). */ static int get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool) { /* * If the lwp synch object is defined to be process-private, * we just make the first field of the lwpchan be 'as' and * the second field be the synch object's virtual address. * (segvn_getmemid() does the same for MAP_PRIVATE mappings.) * The lwpchan cache is used only for process-shared objects. 
*/ if (!(type & USYNC_PROCESS)) { lwpchan->lc_wchan0 = (caddr_t)as; lwpchan->lc_wchan = addr; return (1); } return (lwpchan_get_mapping(as, addr, type, lwpchan, pool)); } static void lwp_block(lwpchan_t *lwpchan) { kthread_t *t = curthread; klwp_t *lwp = ttolwp(t); sleepq_head_t *sqh; thread_lock(t); t->t_flag |= T_WAKEABLE; t->t_lwpchan = *lwpchan; t->t_sobj_ops = &lwp_sobj_ops; t->t_release = 0; sqh = lwpsqhash(lwpchan); disp_lock_enter_high(&sqh->sq_lock); CL_SLEEP(t); DTRACE_SCHED(sleep); THREAD_SLEEP(t, &sqh->sq_lock); sleepq_insert(&sqh->sq_queue, t); thread_unlock(t); lwp->lwp_asleep = 1; lwp->lwp_sysabort = 0; lwp->lwp_ru.nvcsw++; (void) new_mstate(curthread, LMS_SLEEP); } static kthread_t * lwpsobj_pi_owner(upimutex_t *up) { return (up->upi_owner); } static struct upimutex * upi_get(upib_t *upibp, lwpchan_t *lcp) { struct upimutex *upip; for (upip = upibp->upib_first; upip != NULL; upip = upip->upi_nextchain) { if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 && upip->upi_lwpchan.lc_wchan == lcp->lc_wchan) break; } return (upip); } static void upi_chain_add(upib_t *upibp, struct upimutex *upimutex) { ASSERT(MUTEX_HELD(&upibp->upib_lock)); /* * Insert upimutex at front of list. Maybe a bit unfair * but assume that not many lwpchans hash to the same * upimutextab bucket, i.e. the list of upimutexes from * upib_first is not too long. */ upimutex->upi_nextchain = upibp->upib_first; upibp->upib_first = upimutex; } static void upi_chain_del(upib_t *upibp, struct upimutex *upimutex) { struct upimutex **prev; ASSERT(MUTEX_HELD(&upibp->upib_lock)); prev = &upibp->upib_first; while (*prev != upimutex) { prev = &(*prev)->upi_nextchain; } *prev = upimutex->upi_nextchain; upimutex->upi_nextchain = NULL; } /* * Add upimutex to chain of upimutexes held by curthread. * Returns number of upimutexes held by curthread. */ static uint32_t upi_mylist_add(struct upimutex *upimutex) { kthread_t *t = curthread; /* * Insert upimutex at front of list of upimutexes owned by t. 
This * would match typical LIFO order in which nested locks are acquired * and released. */ upimutex->upi_nextowned = t->t_upimutex; t->t_upimutex = upimutex; t->t_nupinest++; ASSERT(t->t_nupinest > 0); return (t->t_nupinest); } /* * Delete upimutex from list of upimutexes owned by curthread. */ static void upi_mylist_del(struct upimutex *upimutex) { kthread_t *t = curthread; struct upimutex **prev; /* * Since the order in which nested locks are acquired and released, * is typically LIFO, and typical nesting levels are not too deep, the * following should not be expensive in the general case. */ prev = &t->t_upimutex; while (*prev != upimutex) { prev = &(*prev)->upi_nextowned; } *prev = upimutex->upi_nextowned; upimutex->upi_nextowned = NULL; ASSERT(t->t_nupinest > 0); t->t_nupinest--; } /* * Returns true if upimutex is owned. Should be called only when upim points * to kmem which cannot disappear from underneath. */ static int upi_owned(upimutex_t *upim) { return (upim->upi_owner == curthread); } /* * Returns pointer to kernel object (upimutex_t *) if lp is owned. */ static struct upimutex * lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type) { lwpchan_t lwpchan; upib_t *upibp; struct upimutex *upimutex; if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type, &lwpchan, LWPCHAN_MPPOOL)) return (NULL); upibp = &UPI_CHAIN(lwpchan); mutex_enter(&upibp->upib_lock); upimutex = upi_get(upibp, &lwpchan); if (upimutex == NULL || upimutex->upi_owner != curthread) { mutex_exit(&upibp->upib_lock); return (NULL); } mutex_exit(&upibp->upib_lock); return (upimutex); } /* * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if * no lock hand-off occurrs. 
*/
/*
 *	upimutex	kernel upimutex to release; it is first removed from
 *			curthread's owned list
 *	flag		when LOCK_NOTRECOVERABLE is set, no hand-off is done:
 *			all waiters are woken and the upimutex is freed
 */
static void
upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
{
	turnstile_t *ts;
	upib_t *upibp;
	kthread_t *newowner;

	upi_mylist_del(upimutex);
	upibp = upimutex->upi_upibp;
	mutex_enter(&upibp->upib_lock);
	if (upimutex->upi_waiter != 0) { /* if waiters */
		ts = turnstile_lookup(upimutex);
		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
			/* hand-off lock to highest prio waiter */
			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
			upimutex->upi_owner = newowner;
			/* waking the only waiter leaves none behind */
			if (ts->ts_waiters == 1)
				upimutex->upi_waiter = 0;
			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
			mutex_exit(&upibp->upib_lock);
			return;
		} else if (ts != NULL) {
			/* LOCK_NOTRECOVERABLE: wakeup all */
			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
		} else {
			/*
			 * Misleading w bit. Waiters might have been
			 * interrupted. No need to clear the w bit (upimutex
			 * will soon be freed). Re-calculate PI from existing
			 * waiters.
			 */
			turnstile_exit(upimutex);
			turnstile_pi_recalc();
		}
	}
	/*
	 * no waiters, or LOCK_NOTRECOVERABLE.
	 * remove from the bucket chain of upi mutexes.
	 * de-allocate kernel memory (upimutex).
	 */
	upi_chain_del(upimutex->upi_upibp, upimutex);
	mutex_exit(&upibp->upib_lock);
	kmem_free(upimutex, sizeof (upimutex_t));
}

/*
 * Acquire the priority-inheriting user mutex at user address lp.
 *
 *	lp	user address of the lwp_mutex_t
 *	type	mutex type bits, as read from lp->mutex_type by the caller
 *	try	UPIMUTEX_TRY: return EBUSY instead of blocking
 *	lwptp	timer state; lwpt_time_error is reported only when the
 *		caller would otherwise block
 *
 * Returns 0 or an errno value.  The values produced here are EFAULT,
 * ENOMEM, ENOTRECOVERABLE, EOWNERDEAD, ELOCKUNMAPPED, EDEADLK, EBUSY
 * and lwptp->lwpt_time_error; a non-zero return from turnstile_block()
 * is passed through.  With EOWNERDEAD and ELOCKUNMAPPED the lock is held
 * on return.
 *
 * upimutex and upilocked are volatile because they are read in the
 * on_fault() handler below.
 */
static int
lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
{
	label_t ljb;
	int error = 0;
	lwpchan_t lwpchan;
	uint16_t flag;
	upib_t *upibp;
	volatile struct upimutex *upimutex = NULL;
	turnstile_t *ts;
	uint32_t nupinest;
	volatile int upilocked = 0;

	if (on_fault(&ljb)) {
		/* faulted on user memory: drop the lock if we hold it */
		if (upilocked)
			upimutex_unlock((upimutex_t *)upimutex, 0);
		error = EFAULT;
		goto out;
	}
	/*
	 * The apparent assumption made in implementing other _lwp_* synch
	 * primitives, is that get_lwpchan() does not return a unique cookie
	 * for the case where 2 processes (one forked from the other) point
	 * at the same underlying object, which is typed USYNC_PROCESS, but
	 * mapped MAP_PRIVATE, since the object has not yet been written to,
	 * in the child process.
	 *
	 * Since get_lwpchan() has been fixed, it is not necessary to do the
	 * dummy writes to force a COW fault as in other places (which should
	 * be fixed).
	 */
	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
	    &lwpchan, LWPCHAN_MPPOOL)) {
		error = EFAULT;
		goto out;
	}
	upibp = &UPI_CHAIN(lwpchan);
retry:
	mutex_enter(&upibp->upib_lock);
	upimutex = upi_get(upibp, &lwpchan);
	if (upimutex == NULL)  {
		/* lock available since lwpchan has no upimutex */
		upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
		upi_chain_add(upibp, (upimutex_t *)upimutex);
		upimutex->upi_owner = curthread; /* grab lock */
		upimutex->upi_upibp = upibp;
		upimutex->upi_vaddr = lp;
		upimutex->upi_lwpchan = lwpchan;
		mutex_exit(&upibp->upib_lock);
		nupinest = upi_mylist_add((upimutex_t *)upimutex);
		upilocked = 1;
		fuword16_noerr(&lp->mutex_flag, &flag);
		/* over the nesting limit and lacking the resource privilege */
		if (nupinest > maxnestupimx &&
		    secpolicy_resource(CRED()) != 0) {
			upimutex_unlock((upimutex_t *)upimutex, flag);
			error = ENOMEM;
			goto out;
		}
		if (flag & LOCK_NOTRECOVERABLE) {
			/*
			 * Since the setting of LOCK_NOTRECOVERABLE
			 * was done under the high-level upi mutex,
			 * in lwp_upimutex_unlock(), this flag needs to
			 * be checked while holding the upi mutex.
			 * If set, this thread should return without
			 * the lock held, and with the right error code.
			 */
			upimutex_unlock((upimutex_t *)upimutex, flag);
			upilocked = 0;
			error = ENOTRECOVERABLE;
		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
			if (flag & LOCK_OWNERDEAD)
				error = EOWNERDEAD;
			else if (type & USYNC_PROCESS_ROBUST)
				error = ELOCKUNMAPPED;
			else
				error = EOWNERDEAD;
		}
		goto out;
	}
	/*
	 * If a upimutex object exists, it must have an owner.
	 * This is due to lock hand-off, and release of upimutex when no
	 * waiters are present at unlock time,
	 */
	ASSERT(upimutex->upi_owner != NULL);
	if (upimutex->upi_owner == curthread) {
		/*
		 * The user wrapper can check if the mutex type is
		 * ERRORCHECK: if not, it should stall at user-level.
		 * If so, it should return the error code.
		 */
		mutex_exit(&upibp->upib_lock);
		error = EDEADLK;
		goto out;
	}
	if (try == UPIMUTEX_TRY) {
		mutex_exit(&upibp->upib_lock);
		error = EBUSY;
		goto out;
	}
	/*
	 * Block for the lock.
	 * Put the lwp in an orderly state for debugging.
	 * Calling prstop() has to be done here, and not in
	 * turnstile_block(), since the preceding call to
	 * turnstile_lookup() raises the PIL to a level
	 * at which calls to prstop() should not be made.
	 */
	if ((error = lwptp->lwpt_time_error) != 0) {
		/*
		 * The SUSV3 Posix spec is very clear that we
		 * should get no error from validating the
		 * timer until we would actually sleep.
		 */
		mutex_exit(&upibp->upib_lock);
		goto out;
	}
	prstop(PR_REQUESTED, 0);
	if (lwptp->lwpt_tsp != NULL) {
		/*
		 * If we successfully queue the timeout
		 * (lwp_timer_enqueue() returns zero),
		 * then don't drop t_delay_lock until we are
		 * on the sleep queue (in turnstile_block()).
		 * Otherwise we will get an immediate timeout
		 * when we attempt to sleep in turnstile_block().
		 */
		mutex_enter(&curthread->t_delay_lock);
		if (lwp_timer_enqueue(lwptp) != 0)
			mutex_exit(&curthread->t_delay_lock);
	}
	/*
	 * Now, set the waiter bit and block for the lock in turnstile_block().
	 * No need to preserve the previous wbit since a lock try is not
	 * attempted after setting the wait bit. Wait bit is set under
	 * the upib_lock, which is not released until the turnstile lock
	 * is acquired. Say, the upimutex is L:
	 *
	 * 1. upib_lock is held so the waiter does not have to retry L after
	 *    setting the wait bit: since the owner has to grab the upib_lock
	 *    to unlock L, it will certainly see the wait bit set.
	 * 2. upib_lock is not released until the turnstile lock is acquired.
	 *    This is the key to preventing a missed wake-up. Otherwise, the
	 *    owner could acquire the upib_lock, and the tc_lock, to call
	 *    turnstile_wakeup(). All this, before the waiter gets tc_lock
	 *    to sleep in turnstile_block(). turnstile_wakeup() will then not
	 *    find this waiter, resulting in the missed wakeup.
	 * 3. The upib_lock, being a kernel mutex, cannot be released while
	 *    holding the tc_lock (since mutex_exit() could need to acquire
	 *    the same tc_lock)...and so is held when calling turnstile_block().
	 *    The address of upib_lock is passed to turnstile_block() which
	 *    releases it after releasing all turnstile locks, and before going
	 *    to sleep in swtch().
	 * 4. The waiter value cannot be a count of waiters, because a waiter
	 *    can be interrupted. The interrupt occurs under the tc_lock, at
	 *    which point, the upib_lock cannot be locked, to decrement waiter
	 *    count. So, just treat the waiter state as a bit, not a count.
	 */
	ts = turnstile_lookup((upimutex_t *)upimutex);
	upimutex->upi_waiter = 1;
	error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
	    &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
	/*
	 * Hand-off implies that we wakeup holding the lock, except when:
	 *	- deadlock is detected
	 *	- lock is not recoverable
	 *	- we got an interrupt or timeout
	 * If we wake up due to an interrupt or timeout, we may
	 * or may not be holding the lock due to mutex hand-off.
	 * Use lwp_upimutex_owned() to check if we do hold the lock.
	 */
	if (error != 0) {
		if ((error == EINTR || error == ETIME) &&
		    (upimutex = lwp_upimutex_owned(lp, type))) {
			/*
			 * Unlock and return - the re-startable syscall will
			 * try the lock again if we got EINTR.
			 */
			/* upimutex_unlock() expects it on our owned list */
			(void) upi_mylist_add((upimutex_t *)upimutex);
			upimutex_unlock((upimutex_t *)upimutex, 0);
		}
		/*
		 * The only other possible error is EDEADLK.  If so, upimutex
		 * is valid, since its owner is deadlocked with curthread.
		 */
		ASSERT(error == EINTR || error == ETIME ||
		    (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
		ASSERT(!lwp_upimutex_owned(lp, type));
		goto out;
	}
	if (lwp_upimutex_owned(lp, type)) {
		ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
		nupinest = upi_mylist_add((upimutex_t *)upimutex);
		upilocked = 1;
	}
	/*
	 * Now, need to read the user-level lp->mutex_flag to do the following:
	 *
	 * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED
	 *   should be returned.
	 * - if lock isn't held, check if ENOTRECOVERABLE should
	 *   be returned.
	 *
	 * Now, either lp->mutex_flag is readable or it's not. If not
	 * readable, the on_fault path will cause a return with EFAULT
	 * as it should.  If it is readable, the state of the flag
	 * encodes the robustness state of the lock:
	 *
	 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD
	 * or LOCK_UNMAPPED setting will influence the return code
	 * appropriately.  If the upimutex is not locked here, this
	 * could be due to a spurious wake-up or a NOTRECOVERABLE
	 * event.  The flag's setting can be used to distinguish
	 * between these two events.
	 */
	fuword16_noerr(&lp->mutex_flag, &flag);
	if (upilocked) {
		/*
		 * If the thread wakes up from turnstile_block with the lock
		 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
		 * since it would not have been handed-off the lock.
		 * So, no need to check for this case.
		 */
		if (nupinest > maxnestupimx &&
		    secpolicy_resource(CRED()) != 0) {
			upimutex_unlock((upimutex_t *)upimutex, flag);
			upilocked = 0;
			error = ENOMEM;
		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
			if (flag & LOCK_OWNERDEAD)
				error = EOWNERDEAD;
			else if (type & USYNC_PROCESS_ROBUST)
				error = ELOCKUNMAPPED;
			else
				error = EOWNERDEAD;
		}
	} else {
		/*
		 * Wake-up without the upimutex held. Either this is a
		 * spurious wake-up (due to signals, forkall(), whatever), or
		 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
		 * of the mutex flag can be used to distinguish between the
		 * two events.
		 */
		if (flag & LOCK_NOTRECOVERABLE) {
			error = ENOTRECOVERABLE;
		} else {
			/*
			 * Here, the flag could be set to LOCK_OWNERDEAD or
			 * not. In both cases, this is a spurious wakeup,
			 * since the upi lock is not held, but the thread
			 * has returned from turnstile_block().
			 *
			 * The user flag could be LOCK_OWNERDEAD if, at the
			 * same time as curthread having been woken up
			 * spuriously, the owner (say Tdead) has died, marked
			 * the mutex flag accordingly, and handed off the lock
			 * to some other waiter (say Tnew). curthread just
			 * happened to read the flag while Tnew has yet to deal
			 * with the owner-dead event.
			 *
			 * In this event, curthread should retry the lock.
			 * If Tnew is able to cleanup the lock, curthread
			 * will eventually get the lock with a zero error code,
			 * If Tnew is unable to cleanup, its eventual call to
			 * unlock the lock will result in the mutex flag being
			 * set to LOCK_NOTRECOVERABLE, and the wake-up of
			 * all waiters, including curthread, which will then
			 * eventually return ENOTRECOVERABLE due to the above
			 * check.
			 *
			 * Of course, if the user-flag is not set with
			 * LOCK_OWNERDEAD, retrying is the thing to do, since
			 * this is definitely a spurious wakeup.
			 */
			goto retry;
		}
	}

out:
	no_fault();
	return (error);
}

/*
 * Release the priority-inheriting user mutex at user address lp.
 *
 * Returns 0 on success, EPERM when no upimutex exists for lp or curthread
 * is not its owner, and EFAULT when the lwpchan cannot be obtained or a
 * user-memory access faults.
 *
 * If the user-level flag shows LOCK_OWNERDEAD or LOCK_UNMAPPED at unlock
 * time, the mutex is moved to LOCK_NOTRECOVERABLE before being released.
 */
static int
lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
{
	label_t ljb;
	int error = 0;
	lwpchan_t lwpchan;
	uint16_t flag;
	upib_t *upibp;
	volatile struct upimutex *upimutex = NULL;
	volatile int upilocked = 0;

	if (on_fault(&ljb)) {
		/* faulted on user memory: still release the kernel lock */
		if (upilocked)
			upimutex_unlock((upimutex_t *)upimutex, 0);
		error = EFAULT;
		goto out;
	}
	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
	    &lwpchan, LWPCHAN_MPPOOL)) {
		error = EFAULT;
		goto out;
	}
	upibp = &UPI_CHAIN(lwpchan);
	mutex_enter(&upibp->upib_lock);
	upimutex = upi_get(upibp, &lwpchan);
	/*
	 * If the lock is not held, or the owner is not curthread, return
	 * error. The user-level wrapper can return this error or stall,
	 * depending on whether mutex is of ERRORCHECK type or not.
	 */
	if (upimutex == NULL || upimutex->upi_owner != curthread) {
		mutex_exit(&upibp->upib_lock);
		error = EPERM;
		goto out;
	}
	mutex_exit(&upibp->upib_lock); /* release for user memory access */
	upilocked = 1;
	fuword16_noerr(&lp->mutex_flag, &flag);
	if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
		/*
		 * transition mutex to the LOCK_NOTRECOVERABLE state.
		 */
		flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
		flag |= LOCK_NOTRECOVERABLE;
		suword16_noerr(&lp->mutex_flag, flag);
	}
	if (type & USYNC_PROCESS)
		suword32_noerr(&lp->mutex_ownerpid, 0);
	upimutex_unlock((upimutex_t *)upimutex, flag);
	upilocked = 0;
out:
	no_fault();
	return (error);
}

/*
 * Clear the contents of a user-level mutex; return the flags.
 * Used only by upi_dead() and lwp_mutex_cleanup(), below.
 *
 * lockflg is ORed into the user-level flag only when neither
 * LOCK_OWNERDEAD nor LOCK_UNMAPPED is already set there.  The *_noerr
 * accessors are used, so the caller must have an on_fault() handler
 * in place.
 */
static uint16_t
lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg)
{
	uint16_t flag;

	fuword16_noerr(&lp->mutex_flag, &flag);
	if ((flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) == 0) {
		flag |= lockflg;
		suword16_noerr(&lp->mutex_flag, flag);
	}
	/* mutex_owner is cleared as two consecutive 32-bit words */
	suword32_noerr((uint32_t *)&lp->mutex_owner, 0);
	suword32_noerr((uint32_t *)&lp->mutex_owner + 1, 0);
	suword32_noerr(&lp->mutex_ownerpid, 0);
	suword8_noerr(&lp->mutex_rcount, 0);

	return (flag);
}

/*
 * Mark user mutex state, corresponding to kernel upimutex,
 * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate
 *
 * Returns 0 on success, or EFAULT when the user object cannot be accessed.
 */
static int
upi_dead(upimutex_t *upip, uint16_t lockflg)
{
	label_t ljb;
	int error = 0;
	lwp_mutex_t *lp;

	if (on_fault(&ljb)) {
		error = EFAULT;
		goto out;
	}

	lp = upip->upi_vaddr;
	(void) lwp_clear_mutex(lp, lockflg);
	suword8_noerr(&lp->mutex_lockw, 0);
out:
	no_fault();
	return (error);
}

/*
 * Unlock all upimutexes held by curthread, since curthread is dying.
 * For each upimutex, attempt to mark its corresponding user mutex object as
 * dead.
 */
void
upimutex_cleanup()
{
	kthread_t *t = curthread;
	/* exec in progress: locks become unmapped rather than owner-dead */
	uint16_t lockflg = (ttoproc(t)->p_proc_flag & P_PR_EXEC)?
	    LOCK_UNMAPPED : LOCK_OWNERDEAD;
	struct upimutex *upip;

	while ((upip = t->t_upimutex) != NULL) {
		if (upi_dead(upip, lockflg) != 0) {
			/*
			 * If the user object associated with this upimutex is
			 * unmapped, unlock upimutex with the
			 * LOCK_NOTRECOVERABLE flag, so that all waiters are
			 * woken up. Since user object is unmapped, it could
			 * not be marked as dead or notrecoverable.
			 * The waiters will now all wake up and return
			 * ENOTRECOVERABLE, since they would find that the lock
			 * has not been handed-off to them.
			 * See lwp_upimutex_lock().
			 */
			upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
		} else {
			/*
			 * The user object has been updated as dead.
			 * Unlock the upimutex: if no waiters, upip kmem will
			 * be freed. If there is a waiter, the lock will be
			 * handed off. If exit() is in progress, each existing
			 * waiter will successively get the lock, as owners
			 * die, and each new owner will call this routine as
			 * it dies. The last owner will free kmem, since
			 * it will find the upimutex has no waiters. So,
			 * eventually, the kmem is guaranteed to be freed.
			 */
			upimutex_unlock(upip, 0);
		}
		/*
		 * Note that the call to upimutex_unlock() above will delete
		 * upimutex from the t_upimutexes chain. And so the
		 * while loop will eventually terminate.
		 */
	}
}

/*
 * Acquire the user-level mutex at lp, optionally with a timeout.
 *
 *	lp	user address of the lwp_mutex_t
 *	tsp	user address of the timeout, or NULL for no timeout; when
 *		tsp is non-NULL and was read successfully, the residual
 *		time is copied back out before returning
 *
 * Returns 0 on success; otherwise returns set_errno(error).
 *
 * Priority-inheriting mutexes (UPIMUTEX(type)) are handed to
 * lwp_upimutex_lock().  All other mutexes are acquired with ulock_try()
 * on lp->mutex_lockw, sleeping on the lwpchan sleep queue between
 * attempts.
 *
 * locked, watched and type are volatile because they are used after
 * on_fault() may have returned a second time.
 */
int
lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	lwp_timer_t lwpt;
	caddr_t timedwait;
	int error = 0;
	int time_error;
	clock_t tim = -1;
	uchar_t waiters;
	volatile int locked = 0;
	volatile int watched = 0;
	label_t ljb;
	volatile uint8_t type = 0;
	lwpchan_t lwpchan;
	sleepq_head_t *sqh;
	static int iswanted();
	uint16_t flag;
	int imm_timeout = 0;

	if ((caddr_t)lp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	timedwait = (caddr_t)tsp;
	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
	    lwpt.lwpt_imm_timeout) {
		imm_timeout = 1;
		timedwait = NULL;
	}

	/*
	 * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
	 * this micro state is really a run state. If the thread indeed blocks,
	 * this state becomes valid. If not, the state is converted back to
	 * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
	 * when blocking.
	 */
	(void) new_mstate(t, LMS_USER_LOCK);
	if (on_fault(&ljb)) {
		/* faulted on user memory: drop the lwpchan lock if held */
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
		error = EFAULT;
		goto out;
	}
	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
	if (UPIMUTEX(type)) {
		/* lwp_upimutex_lock() installs its own fault handler */
		no_fault();
		error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
		/* these three results all mean the lock is now held */
		if ((type & USYNC_PROCESS) &&
		    (error == 0 ||
		    error == EOWNERDEAD || error == ELOCKUNMAPPED))
			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
		if (tsp && !time_error)	/* copyout the residual time left */
			error = lwp_timer_copyout(&lwpt, error);
		if (error)
			return (set_errno(error));
		return (0);
	}
	/*
	 * Force Copy-on-write fault if lwp_mutex_t object is
	 * defined to be MAP_PRIVATE and it was initialized to
	 * USYNC_PROCESS.
	 */
	suword8_noerr(&lp->mutex_type, type);
	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
	    &lwpchan, LWPCHAN_MPPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
	locked = 1;
	if (type & LOCK_ROBUST) {
		fuword16_noerr(&lp->mutex_flag, &flag);
		if (flag & LOCK_NOTRECOVERABLE) {
			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
			error = ENOTRECOVERABLE;
			goto out;
		}
	}
	/* remember the old waiters value, then advertise ourselves */
	fuword8_noerr(&lp->mutex_waiters, &waiters);
	suword8_noerr(&lp->mutex_waiters, 1);

	/*
	 * If watchpoints are set, they need to be restored, since
	 * atomic accesses of memory such as the call to ulock_try()
	 * below cannot be watched.
	 */
	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);

	while (!ulock_try(&lp->mutex_lockw)) {
		if (time_error) {
			/*
			 * The SUSV3 Posix spec is very clear that we
			 * should get no error from validating the
			 * timer until we would actually sleep.
			 */
			error = time_error;
			break;
		}

		if (watched) {
			watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
			watched = 0;
		}

		/*
		 * Put the lwp in an orderly state for debugging.
		 */
		prstop(PR_REQUESTED, 0);
		if (timedwait) {
			/*
			 * If we successfully queue the timeout,
			 * then don't drop t_delay_lock until
			 * we are on the sleep queue (below).
			 */
			mutex_enter(&t->t_delay_lock);
			if (lwp_timer_enqueue(&lwpt) != 0) {
				mutex_exit(&t->t_delay_lock);
				imm_timeout = 1;
				timedwait = NULL;
			}
		}
		lwp_block(&lwpchan);
		/*
		 * Nothing should happen to cause the lwp to go to
		 * sleep again until after it returns from swtch().
		 */
		if (timedwait)
			mutex_exit(&t->t_delay_lock);
		locked = 0;
		lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
		/* make ourselves runnable again if we must not sleep */
		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
			setrun(t);
		swtch();
		t->t_flag &= ~T_WAKEABLE;
		if (timedwait)
			tim = lwp_timer_dequeue(&lwpt);
		setallwatch();
		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
			error = EINTR;
		else if (imm_timeout || (timedwait && tim == -1))
			error = ETIME;
		if (error) {
			lwp->lwp_asleep = 0;
			lwp->lwp_sysabort = 0;
			watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
			    S_WRITE);

			/*
			 * Need to re-compute waiters bit. The waiters field in
			 * the lock is not reliable. Either of two things could
			 * have occurred: no lwp may have called lwp_release()
			 * for me but I have woken up due to a signal or
			 * timeout.  In this case, the waiter bit is incorrect
			 * since it is still set to 1, set above.
			 * OR an lwp_release() did occur for some other lwp on
			 * the same lwpchan. In this case, the waiter bit is
			 * correct.  But which event occurred, one can't tell.
			 * So, recompute.
			 */
			lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
			locked = 1;
			sqh = lwpsqhash(&lwpchan);
			disp_lock_enter(&sqh->sq_lock);
			waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
			disp_lock_exit(&sqh->sq_lock);
			break;
		}
		lwp->lwp_asleep = 0;
		watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
		    S_WRITE);
		lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
		locked = 1;
		fuword8_noerr(&lp->mutex_waiters, &waiters);
		suword8_noerr(&lp->mutex_waiters, 1);
		if (type & LOCK_ROBUST) {
			fuword16_noerr(&lp->mutex_flag, &flag);
			if (flag & LOCK_NOTRECOVERABLE) {
				error = ENOTRECOVERABLE;
				break;
			}
		}
	}

	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);

	if (error == 0) {
		/* we hold the lock: record ownership, report robust state */
		if (type & USYNC_PROCESS)
			suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
		if (type & LOCK_ROBUST) {
			fuword16_noerr(&lp->mutex_flag, &flag);
			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
				if (flag & LOCK_OWNERDEAD)
					error = EOWNERDEAD;
				else if (type & USYNC_PROCESS_ROBUST)
					error = ELOCKUNMAPPED;
				else
					error = EOWNERDEAD;
			}
		}
	}
	/* store back the saved (or recomputed) waiters value */
	suword8_noerr(&lp->mutex_waiters, waiters);
	locked = 0;
	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
out:
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	if (tsp && !time_error)		/* copyout the residual time left */
		error = lwp_timer_copyout(&lwpt, error);
	if (error)
		return (set_errno(error));
	return (0);
}

/*
 * Obsolete lwp_mutex_lock() interface, no longer called from libc.
 * libc now calls lwp_mutex_timedlock(lp, NULL).
 * This system call trap continues to exist solely for the benefit
 * of old statically-linked binaries from Solaris 9 and before.
 * It should be removed from the system when we no longer care
 * about such applications.
 */
int
lwp_mutex_lock(lwp_mutex_t *lp)
{
	return (lwp_mutex_timedlock(lp, NULL));
}

static int
iswanted(kthread_t *t, lwpchan_t *lwpchan)
{
	/*
	 * The caller holds the dispatcher lock on the sleep queue.
*/ while (t != NULL) { if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 && t->t_lwpchan.lc_wchan == lwpchan->lc_wchan) return (1); t = t->t_link; } return (0); } /* * Return the highest priority thread sleeping on this lwpchan. */ static kthread_t * lwp_queue_waiter(lwpchan_t *lwpchan) { sleepq_head_t *sqh; kthread_t *tp; sqh = lwpsqhash(lwpchan); disp_lock_enter(&sqh->sq_lock); /* lock the sleep queue */ for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) { if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 && tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) break; } disp_lock_exit(&sqh->sq_lock); return (tp); } static int lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type) { sleepq_head_t *sqh; kthread_t *tp; kthread_t **tpp; sqh = lwpsqhash(lwpchan); disp_lock_enter(&sqh->sq_lock); /* lock the sleep queue */ tpp = &sqh->sq_queue.sq_first; while ((tp = *tpp) != NULL) { if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 && tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) { /* * The following is typically false. It could be true * only if lwp_release() is called from * lwp_mutex_wakeup() after reading the waiters field * from memory in which the lwp lock used to be, but has * since been re-used to hold a lwp cv or lwp semaphore. * The thread "tp" found to match the lwp lock's wchan * is actually sleeping for the cv or semaphore which * now has the same wchan. In this case, lwp_release() * should return failure. */ if (sync_type != (tp->t_flag & T_WAITCVSEM)) { ASSERT(sync_type == 0); /* * assert that this can happen only for mutexes * i.e. sync_type == 0, for correctly written * user programs. 
*/ disp_lock_exit(&sqh->sq_lock); return (0); } *waiters = iswanted(tp->t_link, lwpchan); sleepq_unlink(tpp, tp); DTRACE_SCHED1(wakeup, kthread_t *, tp); tp->t_wchan0 = NULL; tp->t_wchan = NULL; tp->t_sobj_ops = NULL; tp->t_release = 1; THREAD_TRANSITION(tp); /* drops sleepq lock */ CL_WAKEUP(tp); thread_unlock(tp); /* drop run queue lock */ return (1); } tpp = &tp->t_link; } *waiters = 0; disp_lock_exit(&sqh->sq_lock); return (0); } static void lwp_release_all(lwpchan_t *lwpchan) { sleepq_head_t *sqh; kthread_t *tp; kthread_t **tpp; sqh = lwpsqhash(lwpchan); disp_lock_enter(&sqh->sq_lock); /* lock sleep q queue */ tpp = &sqh->sq_queue.sq_first; while ((tp = *tpp) != NULL) { if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 && tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) { sleepq_unlink(tpp, tp); DTRACE_SCHED1(wakeup, kthread_t *, tp); tp->t_wchan0 = NULL; tp->t_wchan = NULL; tp->t_sobj_ops = NULL; CL_WAKEUP(tp); thread_unlock_high(tp); /* release run queue lock */ } else { tpp = &tp->t_link; } } disp_lock_exit(&sqh->sq_lock); /* drop sleep q lock */ } /* * unblock a lwp that is trying to acquire this mutex. the blocked * lwp resumes and retries to acquire the lock. 
*/
int
lwp_mutex_wakeup(lwp_mutex_t *lp, int release_all)
{
	proc_t *p = ttoproc(curthread);
	lwpchan_t lwpchan;
	uchar_t waiters;
	volatile int locked = 0;	/* lwpchan lock held (fault handler) */
	volatile int watched = 0;	/* watchpoints on *lp are disabled */
	volatile uint8_t type = 0;
	label_t ljb;
	int error = 0;

	if ((caddr_t)lp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);

	/*
	 * Fault recovery for the *_noerr() user accesses below: drop the
	 * lwpchan lock if it is held and fail with EFAULT.
	 */
	if (on_fault(&ljb)) {
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write fault if lwp_mutex_t object is
	 * defined to be MAP_PRIVATE, and type is USYNC_PROCESS
	 */
	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
	suword8_noerr(&lp->mutex_type, type);
	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
	    &lwpchan, LWPCHAN_MPPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
	locked = 1;
	/*
	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
	 * may fail.  If it fails, do not write into the waiter bit.
	 * The call to lwp_release() might fail due to one of three reasons:
	 *
	 *	1. due to the thread which set the waiter bit not actually
	 *	   sleeping since it got the lock on the re-try. The waiter
	 *	   bit will then be correctly updated by that thread. This
	 *	   window may be closed by reading the wait bit again here
	 *	   and not calling lwp_release() at all if it is zero.
	 *	2. the thread which set the waiter bit and went to sleep
	 *	   was woken up by a signal. This time, the waiter recomputes
	 *	   the wait bit in the return with EINTR code.
	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
	 *	   memory that has been re-used after the lock was dropped.
	 *	   In this case, writing into the waiter bit would cause data
	 *	   corruption.
	 */
	/* A non-zero release_all wakes every waiter instead of just one. */
	if (release_all)
		lwp_release_all(&lwpchan);
	else if (lwp_release(&lwpchan, &waiters, 0) == 1)
		suword8_noerr(&lp->mutex_waiters, waiters);
	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
out:
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}

/*
 * lwp_cond_wait() has four arguments, a pointer to a condition variable,
 * a pointer to a mutex, a pointer to a timespec for a timed wait and
 * a flag telling the kernel whether or not to honor the kernel/user
 * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
 * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
 * lwpchan, returned by get_lwpchan().  If the timespec pointer is non-NULL,
 * it is used as an in/out parameter.  On entry, it contains the relative
 * time until timeout.  On exit, we copyout the residual time left to it.
 *
 * The mutex is released here before sleeping; it is not re-acquired by
 * this function (the caller does that on return to user level).
 * Returns 0 on a normal wakeup, otherwise the value of set_errno(error).
 */
int
lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	lwp_timer_t lwpt;
	lwpchan_t cv_lwpchan;
	lwpchan_t m_lwpchan;
	caddr_t timedwait;	/* non-NULL while a timer must be queued */
	volatile uint16_t type = 0;
	volatile uint8_t mtype = 0;
	uchar_t waiters;
	volatile int error;
	clock_t tim = -1;
	volatile int locked = 0;	/* cv lwpchan lock held */
	volatile int m_locked = 0;	/* mutex lwpchan lock held */
	volatile int cvwatched = 0;	/* watchpoints on *cv are disabled */
	volatile int mpwatched = 0;	/* watchpoints on *mp are disabled */
	label_t ljb;
	volatile int no_lwpchan = 1;	/* cleared once both lwpchans exist */
	int imm_timeout = 0;	/* wake immediately and report ETIME */
	int imm_unpark = 0;	/* wake immediately and report EINTR */

	if ((caddr_t)cv >= p->p_as->a_userlimit ||
	    (caddr_t)mp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	timedwait = (caddr_t)tsp;
	if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
		return (set_errno(error));
	if (lwpt.lwpt_imm_timeout) {
		imm_timeout = 1;
		timedwait = NULL;
	}

	(void) new_mstate(t, LMS_USER_LOCK);

	/*
	 * Fault recovery for the user accesses below.  Before the lwpchans
	 * exist we simply fail; afterwards we drop whatever lwpchan locks
	 * are held and still try to release the user-level mutex at
	 * "efault".
	 */
	if (on_fault(&ljb)) {
		if (no_lwpchan) {
			error = EFAULT;
			goto out;
		}
		if (m_locked) {
			m_locked = 0;
			lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
		}
		if (locked) {
			locked = 0;
			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
		}
		/*
		 * set up another on_fault() for a possible fault
		 * on the user lock accessed at "efault"
		 */
		if (on_fault(&ljb)) {
			if (m_locked) {
				m_locked = 0;
				lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
			}
			goto out;
		}
		error = EFAULT;
		goto efault;
	}

	/*
	 * Force Copy-on-write fault if lwp_cond_t and lwp_mutex_t
	 * objects are defined to be MAP_PRIVATE, and are USYNC_PROCESS
	 */
	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
	if (UPIMUTEX(mtype) == 0) {
		suword8_noerr(&mp->mutex_type, mtype);
		/* convert user level mutex, "mp", to a unique lwpchan */
		/* check if mtype is ok to use below, instead of type from cv */
		if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
		    &m_lwpchan, LWPCHAN_MPPOOL)) {
			error = EFAULT;
			goto out;
		}
	}
	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
	suword16_noerr(&cv->cond_type, type);
	/* convert user level condition variable, "cv", to a unique lwpchan */
	if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
	    &cv_lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	no_lwpchan = 0;
	cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (UPIMUTEX(mtype) == 0)
		mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
		    S_WRITE);

	/*
	 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
	 * with respect to a possible wakeup which is a result of either
	 * an lwp_cond_signal() or an lwp_cond_broadcast().
	 *
	 * What's misleading, is that the lwp is put to sleep after the
	 * condition variable's mutex is released.  This is OK as long as
	 * the release operation is also done while holding lwpchan_lock.
	 * The lwp is then put to sleep when the possibility of pagefaulting
	 * or sleeping is completely eliminated.
	 */
	lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	if (UPIMUTEX(mtype) == 0) {
		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
		m_locked = 1;
		suword8_noerr(&cv->cond_waiters_kernel, 1);
		/*
		 * unlock the condition variable's mutex. (pagefaults are
		 * possible here.)
		 */
		ulock_clear(&mp->mutex_lockw);
		fuword8_noerr(&mp->mutex_waiters, &waiters);
		if (waiters != 0) {
			/*
			 * Given the locking of lwpchan_lock around the release
			 * of the mutex and checking for waiters, the following
			 * call to lwp_release() can fail ONLY if the lock
			 * acquirer is interrupted after setting the waiter bit,
			 * calling lwp_block() and releasing lwpchan_lock.
			 * In this case, it could get pulled off the lwp sleep
			 * q (via setrun()) before the following call to
			 * lwp_release() occurs. In this case, the lock
			 * requestor will update the waiter bit correctly by
			 * re-evaluating it.
			 */
			if (lwp_release(&m_lwpchan, &waiters, 0) > 0)
				suword8_noerr(&mp->mutex_waiters, waiters);
		}
		m_locked = 0;
		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
	} else {
		/* Priority-inheritance mutex: use the upimutex unlock path. */
		suword8_noerr(&cv->cond_waiters_kernel, 1);
		error = lwp_upimutex_unlock(mp, mtype);
		if (error) {	/* if the upimutex unlock failed */
			locked = 0;
			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
			goto out;
		}
	}
	/* No more user memory accesses until after we have slept. */
	no_fault();

	if (mpwatched) {
		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
		mpwatched = 0;
	}
	if (cvwatched) {
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
		cvwatched = 0;
	}

	/*
	 * Put the lwp in an orderly state for debugging.
	 */
	prstop(PR_REQUESTED, 0);
	if (check_park && (!schedctl_is_park() || t->t_unpark)) {
		/*
		 * We received a signal at user-level before calling here
		 * or another thread wants us to return immediately
		 * with EINTR.  See lwp_unpark().
		 */
		imm_unpark = 1;
		t->t_unpark = 0;
		timedwait = NULL;
	} else if (timedwait) {
		/*
		 * If we successfully queue the timeout,
		 * then don't drop t_delay_lock until
		 * we are on the sleep queue (below).
		 */
		mutex_enter(&t->t_delay_lock);
		if (lwp_timer_enqueue(&lwpt) != 0) {
			mutex_exit(&t->t_delay_lock);
			imm_timeout = 1;
			timedwait = NULL;
		}
	}
	/* Mark the sleep as a cv/semaphore wait; lwp_release() checks this. */
	t->t_flag |= T_WAITCVSEM;
	lwp_block(&cv_lwpchan);
	/*
	 * Nothing should happen to cause the lwp to go to sleep
	 * until after it returns from swtch().
	 */
	if (timedwait)
		mutex_exit(&t->t_delay_lock);
	locked = 0;
	lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
	/*
	 * A pending signal, a required return, an immediate timeout or an
	 * immediate unpark means we make ourselves runnable before swtch().
	 */
	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
	    (imm_timeout | imm_unpark))
		setrun(t);
	swtch();
	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
	if (timedwait)
		tim = lwp_timer_dequeue(&lwpt);
	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
	    MUSTRETURN(p, t) || imm_unpark)
		error = EINTR;
	else if (imm_timeout || (timedwait && tim == -1))
		error = ETIME;
	lwp->lwp_asleep = 0;
	lwp->lwp_sysabort = 0;
	setallwatch();

	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);

	if (tsp && check_park)		/* copyout the residual time left */
		error = lwp_timer_copyout(&lwpt, error);

	/* the mutex is reacquired by the caller on return to user level */
	if (error) {
		/*
		 * If we were concurrently lwp_cond_signal()d and we
		 * received a UNIX signal or got a timeout, then perform
		 * another lwp_cond_signal() to avoid consuming the wakeup.
		 */
		if (t->t_release)
			(void) lwp_cond_signal(cv);
		return (set_errno(error));
	}
	return (0);

efault:
	/*
	 * make sure that the user level lock is dropped before
	 * returning to caller, since the caller always re-acquires it.
	 */
	if (UPIMUTEX(mtype) == 0) {
		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
		m_locked = 1;
		ulock_clear(&mp->mutex_lockw);
		fuword8_noerr(&mp->mutex_waiters, &waiters);
		if (waiters != 0) {
			/*
			 * See comment above on lock clearing and lwp_release()
			 * success/failure.
			 */
			if (lwp_release(&m_lwpchan, &waiters, 0) > 0)
				suword8_noerr(&mp->mutex_waiters, waiters);
		}
		m_locked = 0;
		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
	} else {
		(void) lwp_upimutex_unlock(mp, mtype);
	}
out:
	/* Common failure exit: error is non-zero on every path to here. */
	no_fault();
	if (mpwatched)
		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
	if (cvwatched)
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);
	return (set_errno(error));
}

/*
 * wakeup one lwp that's blocked on this condition variable.
*/
int
lwp_cond_signal(lwp_cond_t *cv)
{
	proc_t *proc = ttoproc(curthread);
	label_t fault_jb;
	lwpchan_t chan;
	uchar_t sleepers;
	volatile uint16_t cvtype = 0;
	volatile int chan_held = 0;	/* lwpchan lock held (fault handler) */
	volatile int unwatched = 0;	/* watchpoints on *cv are disabled */
	int err = 0;

	/* Reject a cv address at or beyond the user address limit. */
	if ((caddr_t)cv >= proc->p_as->a_userlimit)
		return (set_errno(EFAULT));

	unwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);

	if (on_fault(&fault_jb)) {
		/* One of the user accesses below faulted. */
		err = EFAULT;
		if (chan_held)
			lwpchan_unlock(&chan, LWPCHAN_CVPOOL);
		goto out;
	}

	/*
	 * Reading and re-writing cond_type forces a copy-on-write fault
	 * if the lwp_cond_t object is defined to be MAP_PRIVATE and is
	 * USYNC_PROCESS.
	 */
	fuword16_noerr(&cv->cond_type, (uint16_t *)&cvtype);
	suword16_noerr(&cv->cond_type, cvtype);

	if (get_lwpchan(curproc->p_as, (caddr_t)cv, cvtype,
	    &chan, LWPCHAN_CVPOOL)) {
		lwpchan_lock(&chan, LWPCHAN_CVPOOL);
		chan_held = 1;
		fuword8_noerr(&cv->cond_waiters_kernel, &sleepers);
		if (sleepers) {
			/*
			 * lwp_release() may fail here, yet it is still OK to
			 * store the waiters byte afterwards: unlike the
			 * lwp_mutex_wakeup() case, a correctly written
			 * program cannot have re-used or unmapped this
			 * memory.  An incorrect program can corrupt its own
			 * data in many other ways, so this one is of no
			 * special concern.  If the memory really is
			 * unmapped, normal fault recovery takes over.
			 */
			(void) lwp_release(&chan, &sleepers, T_WAITCVSEM);
			suword8_noerr(&cv->cond_waiters_kernel, sleepers);
		}
		lwpchan_unlock(&chan, LWPCHAN_CVPOOL);
	} else {
		err = EFAULT;
	}
out:
	no_fault();
	if (unwatched)
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	return (err ? set_errno(err) : 0);
}

/*
 * wakeup every lwp that's blocked on this condition variable.
*/
int
lwp_cond_broadcast(lwp_cond_t *cv)
{
	proc_t *p = ttoproc(curthread);
	lwpchan_t lwpchan;
	volatile uint16_t type = 0;
	volatile int locked = 0;	/* lwpchan lock held (fault handler) */
	volatile int watched = 0;	/* watchpoints on *cv are disabled */
	label_t ljb;
	uchar_t waiters;
	int error = 0;

	if ((caddr_t)cv >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);

	/*
	 * Fault recovery for the *_noerr() user accesses below: drop the
	 * lwpchan lock if it is held and fail with EFAULT.
	 */
	if (on_fault(&ljb)) {
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write fault if lwp_cond_t object is
	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
	 */
	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
	suword16_noerr(&cv->cond_type, type);
	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
	    &lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
	if (waiters != 0) {
		/* Everyone is woken, so the waiters byte becomes zero. */
		lwp_release_all(&lwpchan);
		suword8_noerr(&cv->cond_waiters_kernel, 0);
	}
	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
out:
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}

/*
 * Try to decrement the user-level semaphore "sp" without blocking.
 * Fails with EBUSY when the count is zero and with EFAULT on a bad
 * address.  If the count is still non-zero afterwards and the semaphore
 * has waiters, one of them is woken.
 */
int
lwp_sema_trywait(lwp_sema_t *sp)
{
	kthread_t *t = curthread;
	proc_t *p = ttoproc(t);
	label_t ljb;
	volatile int locked = 0;	/* lwpchan lock held (fault handler) */
	volatile int watched = 0;	/* watchpoints on *sp are disabled */
	volatile uint16_t type = 0;
	int count;
	lwpchan_t lwpchan;
	uchar_t waiters;
	int error = 0;

	if ((caddr_t)sp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);

	if (on_fault(&ljb)) {
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write fault if lwp_sema_t object is
	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
	 */
	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
	suword16_noerr((void *)&sp->sema_type, type);
	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
	    &lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
	if (count == 0)
		error = EBUSY;
	else
		suword32_noerr((void *)&sp->sema_count, --count);
	if (count != 0) {
		/* Count still available: pass a wakeup on to one waiter. */
		fuword8_noerr(&sp->sema_waiters, &waiters);
		if (waiters != 0) {
			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
			suword8_noerr(&sp->sema_waiters, waiters);
		}
	}
	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
out:
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}

/*
 * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
 *
 * Decrement the user-level semaphore "sp", sleeping while its count is
 * zero.  "tsp" is an optional timeout; its residual is copied back out
 * only when both tsp and check_park are set and the timer was copied in
 * without error.  Returns 0 on success, otherwise the value of
 * set_errno(error); errors assigned here are EFAULT, EINTR, ETIME and
 * the deferred lwp_timer_copyin() error.
 */
int
lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	lwp_timer_t lwpt;
	caddr_t timedwait;	/* non-NULL while a timer must be queued */
	clock_t tim = -1;
	label_t ljb;
	volatile int locked = 0;	/* lwpchan lock held (fault handler) */
	volatile int watched = 0;	/* watchpoints on *sp are disabled */
	volatile uint16_t type = 0;
	int count;
	lwpchan_t lwpchan;
	uchar_t waiters;
	int error = 0;
	int time_error;		/* deferred lwp_timer_copyin() error */
	int imm_timeout = 0;	/* wake immediately and report ETIME */
	int imm_unpark = 0;	/* wake immediately and report EINTR */

	if ((caddr_t)sp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	timedwait = (caddr_t)tsp;
	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
	    lwpt.lwpt_imm_timeout) {
		imm_timeout = 1;
		timedwait = NULL;
	}

	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);

	if (on_fault(&ljb)) {
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write fault if lwp_sema_t object is
	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
	 */
	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
	suword16_noerr((void *)&sp->sema_type, type);
	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
	    &lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
	/* Sleep for as long as the count is zero and nothing went wrong. */
	while (error == 0 && count == 0) {
		if (time_error) {
			/*
			 * The SUSV3 Posix spec is very clear that we
			 * should get no error from validating the
			 * timer until we would actually sleep.
			 */
			error = time_error;
			break;
		}
		suword8_noerr(&sp->sema_waiters, 1);
		if (watched)
			watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
		/*
		 * Put the lwp in an orderly state for debugging.
		 */
		prstop(PR_REQUESTED, 0);
		if (check_park && (!schedctl_is_park() || t->t_unpark)) {
			/*
			 * We received a signal at user-level before calling
			 * here or another thread wants us to return
			 * immediately with EINTR.  See lwp_unpark().
			 */
			imm_unpark = 1;
			t->t_unpark = 0;
			timedwait = NULL;
		} else if (timedwait) {
			/*
			 * If we successfully queue the timeout,
			 * then don't drop t_delay_lock until
			 * we are on the sleep queue (below).
			 */
			mutex_enter(&t->t_delay_lock);
			if (lwp_timer_enqueue(&lwpt) != 0) {
				mutex_exit(&t->t_delay_lock);
				imm_timeout = 1;
				timedwait = NULL;
			}
		}
		/* Mark the sleep as a cv/semaphore wait for lwp_release(). */
		t->t_flag |= T_WAITCVSEM;
		lwp_block(&lwpchan);
		/*
		 * Nothing should happen to cause the lwp to sleep
		 * again until after it returns from swtch().
		 */
		if (timedwait)
			mutex_exit(&t->t_delay_lock);
		locked = 0;
		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
		    (imm_timeout | imm_unpark))
			setrun(t);
		swtch();
		t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
		if (timedwait)
			tim = lwp_timer_dequeue(&lwpt);
		setallwatch();
		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
		    MUSTRETURN(p, t) || imm_unpark)
			error = EINTR;
		else if (imm_timeout || (timedwait && tim == -1))
			error = ETIME;
		lwp->lwp_asleep = 0;
		lwp->lwp_sysabort = 0;
		watched = watch_disable_addr((caddr_t)sp,
		    sizeof (*sp), S_WRITE);
		/* Retake the lwpchan lock and look at the count again. */
		lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
		locked = 1;
		fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
	}
	if (error == 0)
		suword32_noerr((void *)&sp->sema_count, --count);
	if (count != 0) {
		/* Count still available: pass a wakeup on to one waiter. */
		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
		suword8_noerr(&sp->sema_waiters, waiters);
	}
	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
out:
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
	if (tsp && check_park && !time_error)
		error = lwp_timer_copyout(&lwpt, error);
	if (error)
		return (set_errno(error));
	return (0);
}

/*
 * Obsolete lwp_sema_wait() interface, no longer called from libc.
 * libc now calls lwp_sema_timedwait().
 * This system call trap exists solely for the benefit of old
 * statically linked applications from Solaris 9 and before.
 * It should be removed when we no longer care about such applications.
*/ int lwp_sema_wait(lwp_sema_t *sp) { return (lwp_sema_timedwait(sp, NULL, 0)); } int lwp_sema_post(lwp_sema_t *sp) { proc_t *p = ttoproc(curthread); label_t ljb; volatile int locked = 0; volatile int watched = 0; volatile uint16_t type = 0; int count; lwpchan_t lwpchan; uchar_t waiters; int error = 0; if ((caddr_t)sp >= p->p_as->a_userlimit) return (set_errno(EFAULT)); watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE); if (on_fault(&ljb)) { if (locked) lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); error = EFAULT; goto out; } /* * Force Copy-on-write fault if lwp_sema_t object is * defined to be MAP_PRIVATE, and is USYNC_PROCESS. */ fuword16_noerr(&sp->sema_type, (uint16_t *)&type); suword16_noerr(&sp->sema_type, type); if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type, &lwpchan, LWPCHAN_CVPOOL)) { error = EFAULT; goto out; } lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL); locked = 1; fuword32_noerr(&sp->sema_count, (uint32_t *)&count); if (count == _SEM_VALUE_MAX) error = EOVERFLOW; else suword32_noerr(&sp->sema_count, ++count); if (count == 1) { fuword8_noerr(&sp->sema_waiters, &waiters); if (waiters) { (void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM); suword8_noerr(&sp->sema_waiters, waiters); } } lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); out: no_fault(); if (watched) watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE); if (error) return (set_errno(error)); return (0); } #define TRW_WANT_WRITE 0x1 #define TRW_LOCK_GRANTED 0x2 #define READ_LOCK 0 #define WRITE_LOCK 1 #define TRY_FLAG 0x10 #define READ_LOCK_TRY (READ_LOCK | TRY_FLAG) #define WRITE_LOCK_TRY (WRITE_LOCK | TRY_FLAG) /* * Release one writer or one or more readers. Compute the rwstate word to * reflect the new state of the queue. For a safe hand-off we copy the new * rwstate value back to userland before we wake any of the new lock holders. * * Note that sleepq_insert() implements a prioritized FIFO (with writers * being given precedence over readers of the same priority). 
*
 * If the first thread is a reader we scan the queue releasing all readers
 * until we hit a writer or the end of the queue. If the first thread is a
 * writer we still need to check for another writer.
 */
void
lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
{
	sleepq_head_t *head = lwpsqhash(lwpchan);
	kthread_t **linkp;
	kthread_t *thr;
	kthread_t *next;
	kthread_t *granted = NULL;	/* threads being handed the lock */
	uint32_t newstate = 0;		/* rwstate word to publish */
	int writers_seen = 0;
	int readers_seen = 0;

	disp_lock_enter(&head->sq_lock);

	/*
	 * Walk the sleep queue.  The loop has no increment expression:
	 * after an unlink *linkp already names the next thread, otherwise
	 * linkp is advanced explicitly.
	 */
	for (linkp = &head->sq_queue.sq_first; (thr = *linkp) != NULL; ) {
		if (thr->t_lwpchan.lc_wchan0 != lwpchan->lc_wchan0 ||
		    thr->t_lwpchan.lc_wchan != lwpchan->lc_wchan) {
			/* Sleeping on some other lwpchan. */
			linkp = &thr->t_link;
			continue;
		}

		if (thr->t_writer & TRW_WANT_WRITE) {
			/*
			 * A writer is granted the lock only when it is the
			 * very first waiter seen.  Any other writer just
			 * marks the lock as having waiters and ends the
			 * scan.
			 */
			if (writers_seen++ != 0 || readers_seen != 0) {
				newstate |= URW_HAS_WAITERS;
				break;
			}
			newstate |= URW_WRITE_LOCKED;
			sleepq_unlink(linkp, thr);
			granted = thr;
			continue;
		}

		/*
		 * A reader.  It is granted the lock unless a writer has
		 * already been seen, in which case it stays queued and the
		 * scan ends.
		 */
		readers_seen++;
		if (writers_seen != 0) {
			newstate |= URW_HAS_WAITERS;
			break;
		}
		newstate++;
		sleepq_unlink(linkp, thr);
		thr->t_link = granted;
		granted = thr;
	}

	/* Copy the new rwstate back to userland. */
	suword32_noerr(&rw->rwlock_readers, newstate);

	/* Wake the new lock holder(s) up. */
	for (thr = granted; thr != NULL; thr = next) {
		DTRACE_SCHED1(wakeup, kthread_t *, thr);
		thr->t_wchan0 = NULL;
		thr->t_wchan = NULL;
		thr->t_sobj_ops = NULL;
		thr->t_writer |= TRW_LOCK_GRANTED;
		next = thr->t_link;
		thr->t_link = NULL;
		CL_WAKEUP(thr);
		thread_unlock_high(thr);
	}

	disp_lock_exit(&head->sq_lock);
}

/*
 * We enter here holding the user-level mutex, which we must release before
 * returning or blocking. Based on lwp_cond_wait().
*/ static int lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr) { lwp_mutex_t *mp = NULL; kthread_t *t = curthread; kthread_t *tp; klwp_t *lwp = ttolwp(t); proc_t *p = ttoproc(t); lwp_timer_t lwpt; lwpchan_t lwpchan; lwpchan_t mlwpchan; caddr_t timedwait; volatile uint16_t type = 0; volatile uint8_t mtype = 0; uchar_t mwaiters; volatile int error = 0; int time_error; clock_t tim = -1; volatile int locked = 0; volatile int mlocked = 0; volatile int watched = 0; volatile int mwatched = 0; label_t ljb; volatile int no_lwpchan = 1; int imm_timeout = 0; int try_flag; uint32_t rwstate; int acquired = 0; /* We only check rw because the mutex is included in it. */ if ((caddr_t)rw >= p->p_as->a_userlimit) return (set_errno(EFAULT)); /* We must only report this error if we are about to sleep (later). */ timedwait = (caddr_t)tsp; if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 && lwpt.lwpt_imm_timeout) { imm_timeout = 1; timedwait = NULL; } (void) new_mstate(t, LMS_USER_LOCK); if (on_fault(&ljb)) { if (no_lwpchan) { error = EFAULT; goto out_nodrop; } if (mlocked) { mlocked = 0; lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL); } if (locked) { locked = 0; lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); } /* * Set up another on_fault() for a possible fault * on the user lock accessed at "out_drop". */ if (on_fault(&ljb)) { if (mlocked) { mlocked = 0; lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL); } error = EFAULT; goto out_nodrop; } error = EFAULT; goto out_nodrop; } /* Process rd_wr (including sanity check). */ try_flag = (rd_wr & TRY_FLAG); rd_wr &= ~TRY_FLAG; if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) { error = EINVAL; goto out_nodrop; } /* We can only continue for simple USYNC_PROCESS locks. */ mp = &rw->mutex; fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype); fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type); if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) { error = EINVAL; goto out_nodrop; } /* Force Copy-on-write fault incase objects are MAP_PRIVATE. 
*/ suword8_noerr(&mp->mutex_type, mtype); suword16_noerr(&rw->rwlock_type, type); /* Convert user level mutex, "mp", to a unique lwpchan. */ if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype, &mlwpchan, LWPCHAN_MPPOOL)) { error = EFAULT; goto out_nodrop; } /* Convert user level rwlock, "rw", to a unique lwpchan. */ if (!get_lwpchan(p->p_as, (caddr_t)rw, type, &lwpchan, LWPCHAN_CVPOOL)) { error = EFAULT; goto out_nodrop; } no_lwpchan = 0; watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE); mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE); /* * lwpchan_lock() ensures that the calling LWP is put to sleep * atomically with respect to a possible wakeup which is a result * of lwp_rwlock_unlock(). * * What's misleading is that the LWP is put to sleep after the * rwlock's mutex is released. This is OK as long as the release * operation is also done while holding mlwpchan. The LWP is then * put to sleep when the possibility of pagefaulting or sleeping * has been completely eliminated. */ lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL); locked = 1; lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL); mlocked = 1; /* * Fetch the current rwlock state. * * The possibility of spurious wake-ups or killed waiters means * rwstate's URW_HAS_WAITERS bit may indicate false positives. * We only fix these if they are important to us. * * Although various error states can be observed here (e.g. the lock * is not held, but there are waiters) we assume these are applicaton * errors and so we take no corrective action. */ fuword32_noerr(&rw->rwlock_readers, &rwstate); /* * We cannot legitimately get here from user-level * without URW_HAS_WAITERS being set. * Set it now to guard against user-level error. */ rwstate |= URW_HAS_WAITERS; /* * We can try only if the lock isn't held by a writer. */ if (!(rwstate & URW_WRITE_LOCKED)) { tp = lwp_queue_waiter(&lwpchan); if (tp == NULL) { /* * Hmmm, rwstate indicates waiters but there are * none queued. 
This could just be the result of a * spurious wakeup, so let's ignore it. * * We now have a chance to acquire the lock * uncontended, but this is the last chance for * a writer to acquire the lock without blocking. */ if (rd_wr == READ_LOCK) { rwstate++; acquired = 1; } else if ((rwstate & URW_READERS_MASK) == 0) { rwstate |= URW_WRITE_LOCKED; acquired = 1; } } else if (rd_wr == READ_LOCK) { /* * This is the last chance for a reader to acquire * the lock now, but it can only do so if there is * no writer of equal or greater priority at the * head of the queue . * * It is also just possible that there is a reader * at the head of the queue. This may be the result * of a spurious wakeup or an application failure. * In this case we only acquire the lock if we have * equal or greater priority. It is not our job to * release spurious waiters. */ pri_t our_pri = DISP_PRIO(t); pri_t his_pri = DISP_PRIO(tp); if ((our_pri > his_pri) || ((our_pri == his_pri) && !(tp->t_writer & TRW_WANT_WRITE))) { rwstate++; acquired = 1; } } } if (acquired || try_flag || time_error) { /* * We're not going to block this time. */ suword32_noerr(&rw->rwlock_readers, rwstate); lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); locked = 0; if (acquired) { /* * Got the lock! */ error = 0; } else if (try_flag) { /* * We didn't get the lock and we're about to block. * If we're doing a trylock, return EBUSY instead. */ error = EBUSY; } else if (time_error) { /* * The SUSV3 POSIX spec is very clear that we should * get no error from validating the timer (above) * until we would actually sleep. */ error = time_error; } goto out_drop; } /* * We're about to block, so indicate what kind of waiter we are. */ t->t_writer = 0; if (rd_wr == WRITE_LOCK) t->t_writer = TRW_WANT_WRITE; suword32_noerr(&rw->rwlock_readers, rwstate); /* * Unlock the rwlock's mutex (pagefaults are possible here). 
*/ suword32_noerr((uint32_t *)&mp->mutex_owner, 0); suword32_noerr((uint32_t *)&mp->mutex_owner + 1, 0); suword32_noerr(&mp->mutex_ownerpid, 0); ulock_clear(&mp->mutex_lockw); fuword8_noerr(&mp->mutex_waiters, &mwaiters); if (mwaiters != 0) { /* * Given the locking of mlwpchan around the release of * the mutex and checking for waiters, the following * call to lwp_release() can fail ONLY if the lock * acquirer is interrupted after setting the waiter bit, * calling lwp_block() and releasing mlwpchan. * In this case, it could get pulled off the LWP sleep * queue (via setrun()) before the following call to * lwp_release() occurs, and the lock requestor will * update the waiter bit correctly by re-evaluating it. */ if (lwp_release(&mlwpchan, &mwaiters, 0) > 0) suword8_noerr(&mp->mutex_waiters, mwaiters); } lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL); mlocked = 0; no_fault(); if (mwatched) { watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE); mwatched = 0; } if (watched) { watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE); watched = 0; } /* * Put the LWP in an orderly state for debugging. */ prstop(PR_REQUESTED, 0); if (timedwait) { /* * If we successfully queue the timeout, * then don't drop t_delay_lock until * we are on the sleep queue (below). */ mutex_enter(&t->t_delay_lock); if (lwp_timer_enqueue(&lwpt) != 0) { mutex_exit(&t->t_delay_lock); imm_timeout = 1; timedwait = NULL; } } t->t_flag |= T_WAITCVSEM; lwp_block(&lwpchan); /* * Nothing should happen to cause the LWp to go to sleep until after * it returns from swtch(). */ if (timedwait) mutex_exit(&t->t_delay_lock); locked = 0; lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL); if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t)) setrun(t); swtch(); /* * We're back, but we need to work out why. Were we interrupted? Did * we timeout? Were we granted the lock? 
*/
	error = EAGAIN;
	acquired = (t->t_writer & TRW_LOCK_GRANTED);
	t->t_writer = 0;
	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
	if (timedwait)
		tim = lwp_timer_dequeue(&lwpt);
	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
		error = EINTR;
	else if (imm_timeout || (timedwait && tim == -1))
		error = ETIME;
	lwp->lwp_asleep = 0;
	lwp->lwp_sysabort = 0;
	setallwatch();

	/*
	 * If we were granted the lock we don't care about EINTR or ETIME.
	 */
	if (acquired)
		error = 0;

	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);

	if (error)
		return (set_errno(error));
	return (0);

out_drop:
	/*
	 * Make sure that the user level lock is dropped before returning
	 * to the caller.
	 */
	if (!mlocked) {
		lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
		mlocked = 1;
	}
	suword32_noerr((uint32_t *)&mp->mutex_owner, 0);
	suword32_noerr((uint32_t *)&mp->mutex_owner + 1, 0);
	suword32_noerr(&mp->mutex_ownerpid, 0);
	ulock_clear(&mp->mutex_lockw);
	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
	if (mwaiters != 0) {
		/*
		 * See comment above on lock clearing and lwp_release()
		 * success/failure.
		 */
		if (lwp_release(&mlwpchan, &mwaiters, 0) > 0)
			suword8_noerr(&mp->mutex_waiters, mwaiters);
	}
	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
	mlocked = 0;

out_nodrop:
	no_fault();
	if (mwatched)
		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
	if (watched)
		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);
	if (error)
		return (set_errno(error));
	return (0);
}

/*
 * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
 * we never drop the lock.
 *
 * rw is the user-level address of the rwlock.  Returns 0 on success;
 * otherwise returns set_errno() of one of:
 *	EFAULT	rw is at or above the address space's a_userlimit,
 *		get_lwpchan() failed, or the on_fault() recovery path ran.
 *	EINVAL	rwlock_type is anything other than USYNC_PROCESS.
 */
static int
lwp_rwlock_unlock(lwp_rwlock_t *rw)
{
	kthread_t *t = curthread;
	proc_t *p = ttoproc(t);
	lwpchan_t lwpchan;
	/*
	 * The volatile locals are the ones consulted again in the on_fault()
	 * recovery path or after it at out_nodrop.
	 * NOTE(review): presumably on_fault() has setjmp()-like semantics,
	 * which is why they must be volatile - confirm.
	 */
	volatile uint16_t type = 0;
	volatile int error = 0;
	volatile int locked = 0;	/* lwpchan lock currently held */
	volatile int watched = 0;	/* watch_disable_addr() result */
	label_t ljb;
	volatile int no_lwpchan = 1;	/* cleared once get_lwpchan() worked */
	uint32_t rwstate;

	/* We only check rw because the mutex is included in it. */
	if ((caddr_t)rw >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	/*
	 * Recovery path.  Both branches end in EFAULT at out_nodrop; the
	 * only difference is that the lwpchan lock is released if it was
	 * taken.  While no_lwpchan is still set the lock cannot have been
	 * taken, so that case leaves immediately.
	 */
	if (on_fault(&ljb)) {
		if (no_lwpchan) {
			error = EFAULT;
			goto out_nodrop;
		}
		if (locked) {
			locked = 0;
			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		}
		error = EFAULT;
		goto out_nodrop;
	}

	/* We can only continue for simple USYNC_PROCESS locks. */
	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
	if (type != USYNC_PROCESS) {
		error = EINVAL;
		goto out_nodrop;
	}

	/* Force Copy-on-write fault in case objects are MAP_PRIVATE. */
	suword16_noerr(&rw->rwlock_type, type);

	/* Convert user level rwlock, "rw", to a unique lwpchan. */
	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
	    &lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out_nodrop;
	}

	no_lwpchan = 0;
	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);

	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
	locked = 1;

	/*
	 * We can resolve multiple readers (except the last reader) here.
	 * For the last reader or a writer we need lwp_rwlock_release(),
	 * to which we also delegate the task of copying the new rwstate
	 * back to userland (see the comment there).
	 */
	fuword32_noerr(&rw->rwlock_readers, &rwstate);
	if (rwstate & URW_WRITE_LOCKED)
		lwp_rwlock_release(&lwpchan, rw);
	else if ((rwstate & URW_READERS_MASK) > 0) {
		rwstate--;
		if ((rwstate & URW_READERS_MASK) == 0)
			lwp_rwlock_release(&lwpchan, rw);
		else
			suword32_noerr(&rw->rwlock_readers, rwstate);
	}
	/*
	 * If the lock was neither write-locked nor read-locked, nothing is
	 * written back and the call still succeeds.
	 */

	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
	locked = 0;
	error = 0;

out_nodrop:
	/* Common exit: cancel fault recovery and restore any watchpoints. */
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}

/*
 * Dispatch the rwlock operation selected by subcode on rwlp:
 *	0	read lock; tsp is passed through to lwp_rwlock_lock()
 *	1	write lock; tsp is passed through to lwp_rwlock_lock()
 *	2	read trylock; NULL is passed in place of tsp
 *	3	write trylock; NULL is passed in place of tsp
 *	4	unlock; tsp is not used
 * The result of the selected operation is returned unchanged.  Any other
 * subcode returns set_errno(EINVAL).
 */
int
lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
{
	switch (subcode) {
	case 0:
		return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
	case 1:
		return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
	case 2:
		return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
	case 3:
		return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
	case 4:
		return (lwp_rwlock_unlock(rwlp));
	}
	return (set_errno(EINVAL));
}

/*
 * Return the owner of the user-level s-object.
 * Since we can't really do this, return NULL.
 */
/* ARGSUSED */
static kthread_t *
lwpsobj_owner(caddr_t sobj)
{
	return ((kthread_t *)NULL);
}

/*
 * Wake up a thread asleep on a user-level synchronization
 * object.
 *
 * The caller must hold the thread lock (asserted).  Panics if t has no
 * t_wchan0 or is not on a sleep queue.
 */
static void
lwp_unsleep(kthread_t *t)
{
	ASSERT(THREAD_LOCK_HELD(t));
	if (t->t_wchan0 != NULL) {
		sleepq_head_t *sqh;
		sleepq_t *sqp = t->t_sleepq;

		if (sqp != NULL) {
			/*
			 * The queue recorded in the thread must be the one
			 * its lwpchan hashes to.
			 */
			sqh = lwpsqhash(&t->t_lwpchan);
			ASSERT(&sqh->sq_queue == sqp);
			sleepq_unsleep(t);
			/*
			 * NOTE(review): sq_lock is released here although
			 * this function never acquires it; presumably it is
			 * the lock behind THREAD_LOCK_HELD(t) - confirm.
			 */
			disp_lock_exit_high(&sqh->sq_lock);
			CL_SETRUN(t);
			return;
		}
	}
	panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
}

/*
 * Change the priority of a thread asleep on a user-level
 * synchronization object. To maintain proper priority order,
 * we:
 * o dequeue the thread.
 * o change its priority.
 * o re-enqueue the thread.
 * Assumption: the thread is locked on entry.
*/
/*
 * The new priority, pri, is stored through t_prip while t is off the
 * sleep queue, and t is then re-inserted on the same queue it came from.
 * Panics if t->t_wchan0 is NULL.
 */
static void
lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
{
	ASSERT(THREAD_LOCK_HELD(t));
	if (t->t_wchan0 != NULL) {
		/* Remember the queue before the thread is taken off it. */
		sleepq_t *sqp = t->t_sleepq;

		sleepq_dequeue(t);
		*t_prip = pri;
		sleepq_insert(sqp, t);
	} else
		panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
}

/*
 * Clean up a locked robust mutex
 *
 * ent is the lwpchan entry for the mutex; its lwpchan_type must include
 * LOCK_ROBUST (asserted) and its lwpchan_addr is used as the user-level
 * lwp_mutex_t address.  lockflg is handed to lwp_clear_mutex().
 *
 * The mutex is left alone when:
 *   - the type includes USYNC_PROCESS and mutex_ownerpid is not the
 *     current process's pid, or
 *   - it is a priority-inheritance mutex and no upimutex is found for it
 *     or the upimutex is not owned by curthread.
 * Otherwise the lock word is cleared and the lock released.  Nothing is
 * returned; the on_fault() recovery path only undoes locking and exits.
 */
static void
lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
{
	uint16_t flag;
	uchar_t waiters;
	label_t ljb;
	pid_t owner_pid;
	lwp_mutex_t *lp;
	volatile int locked = 0;	/* lwpchan lock currently held */
	volatile int watched = 0;	/* watch_disable_addr() result */
	/*
	 * NOTE(review): as declared, volatile qualifies the pointed-to
	 * struct, not the pointer variable itself, although the pointer is
	 * what the on_fault() recovery path reads - confirm this is intended.
	 */
	volatile struct upimutex *upimutex = NULL;
	volatile int upilocked = 0;	/* upimutex still to be unlocked */

	ASSERT(ent->lwpchan_type & LOCK_ROBUST);

	lp = (lwp_mutex_t *)ent->lwpchan_addr;
	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	/*
	 * Recovery path: release whichever lock the progress flags say is
	 * held, then leave through the common exit.
	 */
	if (on_fault(&ljb)) {
		if (locked)
			lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
		if (upilocked)
			upimutex_unlock((upimutex_t *)upimutex, 0);
		goto out;
	}
	/* A process-shared mutex is only cleaned up by its owning process. */
	if (ent->lwpchan_type & USYNC_PROCESS) {
		fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
		if (owner_pid != curproc->p_pid)
			goto out;
	}
	if (UPIMUTEX(ent->lwpchan_type)) {
		lwpchan_t lwpchan = ent->lwpchan_lwpchan;
		upib_t *upibp = &UPI_CHAIN(lwpchan);

		/* Look up the upimutex under its hash chain lock. */
		mutex_enter(&upibp->upib_lock);
		upimutex = upi_get(upibp, &lwpchan);
		if (upimutex == NULL || upimutex->upi_owner != curthread) {
			mutex_exit(&upibp->upib_lock);
			goto out;
		}
		mutex_exit(&upibp->upib_lock);
		/*
		 * From here on a fault must still unlock the upimutex; the
		 * recovery path does so with a flag of 0.
		 */
		upilocked = 1;
		flag = lwp_clear_mutex(lp, lockflg);
		suword8_noerr(&lp->mutex_lockw, 0);
		upimutex_unlock((upimutex_t *)upimutex, flag);
	} else {
		lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
		locked = 1;
		(void) lwp_clear_mutex(lp, lockflg);
		ulock_clear(&lp->mutex_lockw);
		fuword8_noerr(&lp->mutex_waiters, &waiters);
		/*
		 * The waiters byte is rewritten only when lwp_release()
		 * returns nonzero.
		 */
		if (waiters && lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
			suword8_noerr(&lp->mutex_waiters, waiters);
		lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
	}
out:
	/* Common exit: cancel fault recovery and restore any watchpoints. */
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
}

/*
 * Register a process-shared robust mutex in the lwpchan
cache. */
/*
 * lp is the user-level address of the mutex.  Returns 0 on success;
 * otherwise returns set_errno() of one of:
 *	EFAULT	lp is at or above USERLIMIT, the on_fault() recovery path
 *		ran, or get_lwpchan() failed.
 *	EINVAL	mutex_type does not have both USYNC_PROCESS and
 *		LOCK_ROBUST set.
 */
int
lwp_mutex_register(lwp_mutex_t *lp)
{
	int error = 0;
	volatile int watched;
	label_t ljb;
	uint8_t type;
	lwpchan_t lwpchan;

	if ((caddr_t)lp >= (caddr_t)USERLIMIT)
		return (set_errno(EFAULT));

	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);

	if (on_fault(&ljb)) {
		error = EFAULT;
	} else {
		fuword8_noerr(&lp->mutex_type, &type);
		if ((type & (USYNC_PROCESS|LOCK_ROBUST))
		    != (USYNC_PROCESS|LOCK_ROBUST)) {
			error = EINVAL;
		} else {
			/*
			 * Force Copy-on-write fault if lwp_mutex_t object is
			 * defined to be MAP_PRIVATE and it was initialized to
			 * USYNC_PROCESS.
			 */
			suword8_noerr(&lp->mutex_type, type);
			/*
			 * The lwpchan filled in here is not used afterwards.
			 * NOTE(review): presumably get_lwpchan() is called
			 * for its side effect of entering the mutex in the
			 * lwpchan cache - confirm.
			 */
			if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
			    &lwpchan, LWPCHAN_MPPOOL))
				error = EFAULT;
		}
	}
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}

/*
 * Try to acquire the user-level mutex at lp without sleeping on it here.
 *
 * Priority-inheritance mutexes are handed to lwp_upimutex_lock() with
 * UPIMUTEX_TRY and its result is returned.  For all other mutexes the
 * lock word is tried once with ulock_try().
 *
 * Returns 0 on success; otherwise returns set_errno() of one of:
 *	EFAULT		lp is at or above a_userlimit, the on_fault()
 *			recovery path ran, or get_lwpchan() failed.
 *	EBUSY		ulock_try() failed.
 *	ENOTRECOVERABLE	robust mutex with LOCK_NOTRECOVERABLE set.
 *	EOWNERDEAD,
 *	ELOCKUNMAPPED	robust mutex acquired while LOCK_OWNERDEAD or
 *			LOCK_UNMAPPED was set; nothing here clears the lock
 *			word again, so it is still set on return.
 */
int
lwp_mutex_trylock(lwp_mutex_t *lp)
{
	kthread_t *t = curthread;
	proc_t *p = ttoproc(t);
	int error = 0;
	volatile int locked = 0;	/* lwpchan lock currently held */
	volatile int watched = 0;	/* watch_disable_addr() result */
	label_t ljb;
	volatile uint8_t type = 0;
	uint16_t flag;
	lwpchan_t lwpchan;

	if ((caddr_t)lp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	(void) new_mstate(t, LMS_USER_LOCK);

	/* Recovery path: drop the lwpchan lock if held and fail. */
	if (on_fault(&ljb)) {
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
		error = EFAULT;
		goto out;
	}
	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
	if (UPIMUTEX(type)) {
		/*
		 * Fault recovery is cancelled first, so the owner pid is
		 * stored with suword32() and its result is ignored.
		 * NOTE(review): this path returns without the switch back
		 * from LMS_USER_LOCK that is done at "out" - confirm that is
		 * intended.
		 */
		no_fault();
		error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
		if ((type & USYNC_PROCESS) &&
		    (error == 0 ||
		    error == EOWNERDEAD || error == ELOCKUNMAPPED))
			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
		if (error)
			return (set_errno(error));
		return (0);
	}
	/*
	 * Force Copy-on-write fault if lwp_mutex_t object is
	 * defined to be MAP_PRIVATE and it was initialized to
	 * USYNC_PROCESS.
	 */
	suword8_noerr(&lp->mutex_type, type);
	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
	    &lwpchan, LWPCHAN_MPPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
	locked = 1;
	/* A robust mutex marked not recoverable can never be acquired. */
	if (type & LOCK_ROBUST) {
		fuword16_noerr(&lp->mutex_flag, &flag);
		if (flag & LOCK_NOTRECOVERABLE) {
			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
			error = ENOTRECOVERABLE;
			goto out;
		}
	}

	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);

	if (!ulock_try(&lp->mutex_lockw))
		error = EBUSY;
	else {
		/* Lock word acquired: record the owner for shared mutexes. */
		if (type & USYNC_PROCESS)
			suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
		if (type & LOCK_ROBUST) {
			/*
			 * LOCK_OWNERDEAD always yields EOWNERDEAD.  With only
			 * LOCK_UNMAPPED set, the result is ELOCKUNMAPPED when
			 * the type has USYNC_PROCESS_ROBUST and EOWNERDEAD
			 * otherwise.
			 */
			fuword16_noerr(&lp->mutex_flag, &flag);
			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
				if (flag & LOCK_OWNERDEAD)
					error = EOWNERDEAD;
				else if (type & USYNC_PROCESS_ROBUST)
					error = ELOCKUNMAPPED;
				else
					error = EOWNERDEAD;
			}
		}
	}
	locked = 0;
	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
out:

	/* Leave the user-lock microstate entered at the top. */
	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);

	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}

/*
 * unlock the mutex and unblock lwps that is trying to acquire this mutex.
 * the blocked lwp resumes and retries to acquire the lock.
*/
/*
 * lp is the user-level address of the mutex.  Priority-inheritance
 * mutexes are handed to lwp_upimutex_unlock() and its result is returned.
 *
 * For a robust mutex whose flags show LOCK_OWNERDEAD or LOCK_UNMAPPED at
 * unlock time, the flags are replaced by LOCK_NOTRECOVERABLE; whenever
 * LOCK_NOTRECOVERABLE is set and the waiters byte is nonzero, every waiter
 * is released and the waiters byte is zeroed.
 *
 * Returns 0 on success; otherwise returns set_errno() of EFAULT (lp is at
 * or above a_userlimit, the on_fault() recovery path ran, or get_lwpchan()
 * failed) or of the error from lwp_upimutex_unlock().
 */
int
lwp_mutex_unlock(lwp_mutex_t *lp)
{
	proc_t *p = ttoproc(curthread);
	lwpchan_t lwpchan;
	uchar_t waiters;
	volatile int locked = 0;	/* lwpchan lock currently held */
	volatile int watched = 0;	/* watch_disable_addr() result */
	volatile uint8_t type = 0;
	label_t ljb;
	uint16_t flag;		/* only read back when type has LOCK_ROBUST */
	int error = 0;

	if ((caddr_t)lp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	/* Recovery path: drop the lwpchan lock if held and fail. */
	if (on_fault(&ljb)) {
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
		error = EFAULT;
		goto out;
	}
	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
	if (UPIMUTEX(type)) {
		/* Fault recovery is cancelled before delegating. */
		no_fault();
		error = lwp_upimutex_unlock(lp, type);
		if (error)
			return (set_errno(error));
		return (0);
	}

	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);

	/*
	 * Force Copy-on-write fault if lwp_mutex_t object is
	 * defined to be MAP_PRIVATE, and type is USYNC_PROCESS
	 */
	suword8_noerr(&lp->mutex_type, type);
	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
	    &lwpchan, LWPCHAN_MPPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
	locked = 1;
	/*
	 * Unlocking a robust mutex that still carries LOCK_OWNERDEAD or
	 * LOCK_UNMAPPED marks it LOCK_NOTRECOVERABLE.
	 */
	if (type & LOCK_ROBUST) {
		fuword16_noerr(&lp->mutex_flag, &flag);
		if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
			flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
			flag |= LOCK_NOTRECOVERABLE;
			suword16_noerr(&lp->mutex_flag, flag);
		}
	}
	/* Clear the recorded owner pid before releasing the lock word. */
	if (type & USYNC_PROCESS)
		suword32_noerr(&lp->mutex_ownerpid, 0);
	ulock_clear(&lp->mutex_lockw);
	/*
	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
	 * may fail.  If it fails, do not write into the waiter bit.
	 * The call to lwp_release() might fail due to one of three reasons:
	 *
	 *	1. due to the thread which set the waiter bit not actually
	 *	   sleeping since it got the lock on the re-try. The waiter
	 *	   bit will then be correctly updated by that thread. This
	 *	   window may be closed by reading the wait bit again here
	 *	   and not calling lwp_release() at all if it is zero.
	 *	2. the thread which set the waiter bit and went to sleep
	 *	   was woken up by a signal. This time, the waiter recomputes
	 *	   the wait bit in the return with EINTR code.
	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
	 *	   memory that has been re-used after the lock was dropped.
	 *	   In this case, writing into the waiter bit would cause data
	 *	   corruption.
	 */
	fuword8_noerr(&lp->mutex_waiters, &waiters);
	if (waiters) {
		if ((type & LOCK_ROBUST) && (flag & LOCK_NOTRECOVERABLE)) {
			/* Not recoverable: release every waiter at once. */
			lwp_release_all(&lwpchan);
			suword8_noerr(&lp->mutex_waiters, 0);
		} else if (lwp_release(&lwpchan, &waiters, 0) == 1) {
			/* Write back the waiters value lwp_release() left. */
			suword8_noerr(&lp->mutex_waiters, waiters);
		}
	}

	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
out:
	/* Common exit: cancel fault recovery and restore any watchpoints. */
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}