xref: /titanic_52/usr/src/uts/common/os/turnstile.c (revision 8793b36b40d14ad0a0fecc97738dc118a928f46c)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
59d68b18eSck142721  * Common Development and Distribution License (the "License").
69d68b18eSck142721  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
21fd6545c7Sraf 
227c478bd9Sstevel@tonic-gate /*
239d68b18eSck142721  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
247c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
257c478bd9Sstevel@tonic-gate  */
267c478bd9Sstevel@tonic-gate 
277c478bd9Sstevel@tonic-gate /*
287c478bd9Sstevel@tonic-gate  * Big Theory Statement for turnstiles.
297c478bd9Sstevel@tonic-gate  *
307c478bd9Sstevel@tonic-gate  * Turnstiles provide blocking and wakeup support, including priority
317c478bd9Sstevel@tonic-gate  * inheritance, for synchronization primitives (e.g. mutexes and rwlocks).
327c478bd9Sstevel@tonic-gate  * Typical usage is as follows:
337c478bd9Sstevel@tonic-gate  *
347c478bd9Sstevel@tonic-gate  * To block on lock 'lp' for read access in foo_enter():
357c478bd9Sstevel@tonic-gate  *
367c478bd9Sstevel@tonic-gate  *	ts = turnstile_lookup(lp);
377c478bd9Sstevel@tonic-gate  *	[ If the lock is still held, set the waiters bit
387c478bd9Sstevel@tonic-gate  *	turnstile_block(ts, TS_READER_Q, lp, &foo_sobj_ops);
397c478bd9Sstevel@tonic-gate  *
407c478bd9Sstevel@tonic-gate  * To wake threads waiting for write access to lock 'lp' in foo_exit():
417c478bd9Sstevel@tonic-gate  *
427c478bd9Sstevel@tonic-gate  *	ts = turnstile_lookup(lp);
437c478bd9Sstevel@tonic-gate  *	[ Either drop the lock (change owner to NULL) or perform a direct
447c478bd9Sstevel@tonic-gate  *	[ handoff (change owner to one of the threads we're about to wake).
457c478bd9Sstevel@tonic-gate  *	[ If we're going to wake the last waiter, clear the waiters bit.
467c478bd9Sstevel@tonic-gate  *	turnstile_wakeup(ts, TS_WRITER_Q, nwaiters, new_owner or NULL);
477c478bd9Sstevel@tonic-gate  *
487c478bd9Sstevel@tonic-gate  * turnstile_lookup() returns holding the turnstile hash chain lock for lp.
497c478bd9Sstevel@tonic-gate  * Both turnstile_block() and turnstile_wakeup() drop the turnstile lock.
507c478bd9Sstevel@tonic-gate  * To abort a turnstile operation, the client must call turnstile_exit().
517c478bd9Sstevel@tonic-gate  *
527c478bd9Sstevel@tonic-gate  * Requirements of the client:
537c478bd9Sstevel@tonic-gate  *
547c478bd9Sstevel@tonic-gate  * (1)  The lock's waiters indicator may be manipulated *only* while
557c478bd9Sstevel@tonic-gate  *	holding the turnstile hash chain lock (i.e. under turnstile_lookup()).
567c478bd9Sstevel@tonic-gate  *
577c478bd9Sstevel@tonic-gate  * (2)	Once the lock is marked as having waiters, the owner may be
587c478bd9Sstevel@tonic-gate  *	changed *only* while holding the turnstile hash chain lock.
597c478bd9Sstevel@tonic-gate  *
607c478bd9Sstevel@tonic-gate  * (3)	The caller must never block on an unheld lock.
617c478bd9Sstevel@tonic-gate  *
627c478bd9Sstevel@tonic-gate  * Consequences of these assumptions include the following:
637c478bd9Sstevel@tonic-gate  *
647c478bd9Sstevel@tonic-gate  * (a) It is impossible for a lock to be unheld but have waiters.
657c478bd9Sstevel@tonic-gate  *
667c478bd9Sstevel@tonic-gate  * (b)	The priority inheritance code can safely assume that an active
677c478bd9Sstevel@tonic-gate  *	turnstile's ts_inheritor never changes until the inheritor calls
687c478bd9Sstevel@tonic-gate  *	turnstile_pi_waive().
697c478bd9Sstevel@tonic-gate  *
707c478bd9Sstevel@tonic-gate  * These assumptions simplify the implementation of both turnstiles and
717c478bd9Sstevel@tonic-gate  * their clients.
727c478bd9Sstevel@tonic-gate  *
737c478bd9Sstevel@tonic-gate  * Background on priority inheritance:
747c478bd9Sstevel@tonic-gate  *
757c478bd9Sstevel@tonic-gate  * Priority inheritance allows a thread to "will" its dispatch priority
767c478bd9Sstevel@tonic-gate  * to all the threads blocking it, directly or indirectly.  This prevents
777c478bd9Sstevel@tonic-gate  * situations called priority inversions in which a high-priority thread
787c478bd9Sstevel@tonic-gate  * needs a lock held by a low-priority thread, which cannot run because
797c478bd9Sstevel@tonic-gate  * of medium-priority threads.  Without PI, the medium-priority threads
807c478bd9Sstevel@tonic-gate  * can starve out the high-priority thread indefinitely.  With PI, the
817c478bd9Sstevel@tonic-gate  * low-priority thread becomes high-priority until it releases whatever
827c478bd9Sstevel@tonic-gate  * synchronization object the real high-priority thread is waiting for.
837c478bd9Sstevel@tonic-gate  *
847c478bd9Sstevel@tonic-gate  * How turnstiles work:
857c478bd9Sstevel@tonic-gate  *
867c478bd9Sstevel@tonic-gate  * All active turnstiles reside in a global hash table, turnstile_table[].
877c478bd9Sstevel@tonic-gate  * The address of a synchronization object determines its hash index.
887c478bd9Sstevel@tonic-gate  * Each hash chain is protected by its own dispatcher lock, acquired
897c478bd9Sstevel@tonic-gate  * by turnstile_lookup().  This lock protects the hash chain linkage, the
907c478bd9Sstevel@tonic-gate  * contents of all turnstiles on the hash chain, and the waiters bits of
917c478bd9Sstevel@tonic-gate  * every synchronization object in the system that hashes to the same chain.
927c478bd9Sstevel@tonic-gate  * Giving the lock such broad scope simplifies the interactions between
937c478bd9Sstevel@tonic-gate  * the turnstile code and its clients considerably.  The blocking path
947c478bd9Sstevel@tonic-gate  * is rare enough that this has no impact on scalability.  (If it ever
957c478bd9Sstevel@tonic-gate  * does, it's almost surely a second-order effect -- the real problem
967c478bd9Sstevel@tonic-gate  * is that some synchronization object is *very* heavily contended.)
977c478bd9Sstevel@tonic-gate  *
987c478bd9Sstevel@tonic-gate  * Each thread has an attached turnstile in case it needs to block.
997c478bd9Sstevel@tonic-gate  * A thread cannot block on more than one lock at a time, so one
1007c478bd9Sstevel@tonic-gate  * turnstile per thread is the most we ever need.  The first thread
1017c478bd9Sstevel@tonic-gate  * to block on a lock donates its attached turnstile and adds it to
1027c478bd9Sstevel@tonic-gate  * the appropriate hash chain in turnstile_table[].  This becomes the
1037c478bd9Sstevel@tonic-gate  * "active turnstile" for the lock.  Each subsequent thread that blocks
1047c478bd9Sstevel@tonic-gate  * on the same lock discovers that the lock already has an active
1057c478bd9Sstevel@tonic-gate  * turnstile, so it stashes its own turnstile on the active turnstile's
1067c478bd9Sstevel@tonic-gate  * freelist.  As threads wake up, the process is reversed.
1077c478bd9Sstevel@tonic-gate  *
1087c478bd9Sstevel@tonic-gate  * turnstile_block() puts the current thread to sleep on the active
1097c478bd9Sstevel@tonic-gate  * turnstile for the desired lock, walks the blocking chain to apply
1107c478bd9Sstevel@tonic-gate  * priority inheritance to everyone in its way, and yields the CPU.
1117c478bd9Sstevel@tonic-gate  *
1127c478bd9Sstevel@tonic-gate  * turnstile_wakeup() waives any priority the owner may have inherited
1137c478bd9Sstevel@tonic-gate  * and wakes the specified number of waiting threads.  If the caller is
1147c478bd9Sstevel@tonic-gate  * doing direct handoff of ownership (rather than just dropping the lock),
1157c478bd9Sstevel@tonic-gate  * the new owner automatically inherits priority from any existing waiters.
1167c478bd9Sstevel@tonic-gate  */
1177c478bd9Sstevel@tonic-gate 
1187c478bd9Sstevel@tonic-gate #include <sys/param.h>
1197c478bd9Sstevel@tonic-gate #include <sys/systm.h>
1207c478bd9Sstevel@tonic-gate #include <sys/thread.h>
1217c478bd9Sstevel@tonic-gate #include <sys/proc.h>
1227c478bd9Sstevel@tonic-gate #include <sys/debug.h>
1237c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
1247c478bd9Sstevel@tonic-gate #include <sys/turnstile.h>
1257c478bd9Sstevel@tonic-gate #include <sys/t_lock.h>
1267c478bd9Sstevel@tonic-gate #include <sys/disp.h>
1277c478bd9Sstevel@tonic-gate #include <sys/sobject.h>
1287c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
1297c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
1307c478bd9Sstevel@tonic-gate #include <sys/lockstat.h>
1317c478bd9Sstevel@tonic-gate #include <sys/lwp_upimutex_impl.h>
1327c478bd9Sstevel@tonic-gate #include <sys/schedctl.h>
1337c478bd9Sstevel@tonic-gate #include <sys/cpu.h>
1347c478bd9Sstevel@tonic-gate #include <sys/sdt.h>
1357c478bd9Sstevel@tonic-gate #include <sys/cpupart.h>
1367c478bd9Sstevel@tonic-gate 
1377c478bd9Sstevel@tonic-gate extern upib_t upimutextab[UPIMUTEX_TABSIZE];
1387c478bd9Sstevel@tonic-gate 
/*
 * True iff 'sobj' points into upimutextab[].  The offset of 'sobj' from
 * the start of the table is computed in unsigned arithmetic, so an address
 * below the table wraps around to a huge value and the single comparison
 * against the table size rejects addresses on either side of it.
 */
#define	IS_UPI(sobj)	\
	(sizeof (upimutextab) > ((uintptr_t)(sobj) - (uintptr_t)upimutextab))
1417c478bd9Sstevel@tonic-gate 
1427c478bd9Sstevel@tonic-gate /*
1437c478bd9Sstevel@tonic-gate  * The turnstile hash table is partitioned into two halves: the lower half
1447c478bd9Sstevel@tonic-gate  * is used for upimutextab[] locks, the upper half for everything else.
1457c478bd9Sstevel@tonic-gate  * The reason for the distinction is that SOBJ_USER_PI locks present a
1467c478bd9Sstevel@tonic-gate  * unique problem: the upimutextab[] lock passed to turnstile_block()
1477c478bd9Sstevel@tonic-gate  * cannot be dropped until the calling thread has blocked on its
1487c478bd9Sstevel@tonic-gate  * SOBJ_USER_PI lock and willed its priority down the blocking chain.
1497c478bd9Sstevel@tonic-gate  * At that point, the caller's t_lockp will be one of the turnstile locks.
1507c478bd9Sstevel@tonic-gate  * If mutex_exit() discovers that the upimutextab[] lock has waiters, it
1517c478bd9Sstevel@tonic-gate  * must wake them, which forces a lock ordering on us: the turnstile lock
1527c478bd9Sstevel@tonic-gate  * for the upimutextab[] lock will be acquired in mutex_vector_exit(),
1537c478bd9Sstevel@tonic-gate  * which will eventually call into turnstile_pi_waive(), which will then
1547c478bd9Sstevel@tonic-gate  * acquire the caller's thread lock, which in this case is the turnstile
1557c478bd9Sstevel@tonic-gate  * lock for the SOBJ_USER_PI lock.  In general, when two turnstile locks
1567c478bd9Sstevel@tonic-gate  * must be held at the same time, the lock order must be the address order.
1577c478bd9Sstevel@tonic-gate  * Therefore, to prevent deadlock in turnstile_pi_waive(), we must ensure
1587c478bd9Sstevel@tonic-gate  * that upimutextab[] locks *always* hash to lower addresses than any
1597c478bd9Sstevel@tonic-gate  * other locks.  You think this is cheesy?  Let's see you do better.
1607c478bd9Sstevel@tonic-gate  */
/*
 * TURNSTILE_SOBJ_HASH() folds two right-shifted copies of the object's
 * address into an index in [0, TURNSTILE_HASH_SIZE).
 * TURNSTILE_SOBJ_BUCKET() then selects the half of turnstile_table[]:
 * offset 0 for objects inside upimutextab[], offset TURNSTILE_HASH_SIZE
 * for everything else.  TURNSTILE_CHAIN() yields the bucket itself as an
 * lvalue.
 *
 * The macro argument is parenthesized before it is cast so that an
 * argument written as an expression (e.g. "a | b" or "c ? p : q") is
 * converted as a whole rather than having the cast bind to its first
 * operand only.
 */
#define	TURNSTILE_HASH_SIZE	128		/* must be power of 2 */
#define	TURNSTILE_HASH_MASK	(TURNSTILE_HASH_SIZE - 1)
#define	TURNSTILE_SOBJ_HASH(sobj)	\
	((((ulong_t)(sobj) >> 2) + ((ulong_t)(sobj) >> 9)) &	\
	TURNSTILE_HASH_MASK)
#define	TURNSTILE_SOBJ_BUCKET(sobj)		\
	((IS_UPI(sobj) ? 0 : TURNSTILE_HASH_SIZE) + TURNSTILE_SOBJ_HASH(sobj))
#define	TURNSTILE_CHAIN(sobj)	turnstile_table[TURNSTILE_SOBJ_BUCKET(sobj)]
1687c478bd9Sstevel@tonic-gate 
1697c478bd9Sstevel@tonic-gate typedef struct turnstile_chain {
1707c478bd9Sstevel@tonic-gate 	turnstile_t	*tc_first;	/* first turnstile on hash chain */
1717c478bd9Sstevel@tonic-gate 	disp_lock_t	tc_lock;	/* lock for this hash chain */
1727c478bd9Sstevel@tonic-gate } turnstile_chain_t;
1737c478bd9Sstevel@tonic-gate 
1747c478bd9Sstevel@tonic-gate turnstile_chain_t	turnstile_table[2 * TURNSTILE_HASH_SIZE];
1757c478bd9Sstevel@tonic-gate 
1767c478bd9Sstevel@tonic-gate static	lock_t	turnstile_loser_lock;
1777c478bd9Sstevel@tonic-gate 
1787c478bd9Sstevel@tonic-gate /*
1797c478bd9Sstevel@tonic-gate  * Make 'inheritor' inherit priority from this turnstile.
1807c478bd9Sstevel@tonic-gate  */
1817c478bd9Sstevel@tonic-gate static void
1827c478bd9Sstevel@tonic-gate turnstile_pi_inherit(turnstile_t *ts, kthread_t *inheritor, pri_t epri)
1837c478bd9Sstevel@tonic-gate {
1847c478bd9Sstevel@tonic-gate 	ASSERT(THREAD_LOCK_HELD(inheritor));
1857c478bd9Sstevel@tonic-gate 	ASSERT(DISP_LOCK_HELD(&TURNSTILE_CHAIN(ts->ts_sobj).tc_lock));
1867c478bd9Sstevel@tonic-gate 
1877c478bd9Sstevel@tonic-gate 	if (epri <= inheritor->t_pri)
1887c478bd9Sstevel@tonic-gate 		return;
1897c478bd9Sstevel@tonic-gate 
1907c478bd9Sstevel@tonic-gate 	if (ts->ts_inheritor == NULL) {
1917c478bd9Sstevel@tonic-gate 		ts->ts_inheritor = inheritor;
1927c478bd9Sstevel@tonic-gate 		ts->ts_epri = epri;
1937c478bd9Sstevel@tonic-gate 		disp_lock_enter_high(&inheritor->t_pi_lock);
1947c478bd9Sstevel@tonic-gate 		ts->ts_prioinv = inheritor->t_prioinv;
1957c478bd9Sstevel@tonic-gate 		inheritor->t_prioinv = ts;
1967c478bd9Sstevel@tonic-gate 		disp_lock_exit_high(&inheritor->t_pi_lock);
1977c478bd9Sstevel@tonic-gate 	} else {
1987c478bd9Sstevel@tonic-gate 		/*
1997c478bd9Sstevel@tonic-gate 		 * 'inheritor' is already inheriting from this turnstile,
2007c478bd9Sstevel@tonic-gate 		 * so just adjust its priority.
2017c478bd9Sstevel@tonic-gate 		 */
2027c478bd9Sstevel@tonic-gate 		ASSERT(ts->ts_inheritor == inheritor);
2037c478bd9Sstevel@tonic-gate 		if (ts->ts_epri < epri)
2047c478bd9Sstevel@tonic-gate 			ts->ts_epri = epri;
2057c478bd9Sstevel@tonic-gate 	}
2067c478bd9Sstevel@tonic-gate 
2077c478bd9Sstevel@tonic-gate 	if (epri > DISP_PRIO(inheritor))
2087c478bd9Sstevel@tonic-gate 		thread_change_epri(inheritor, epri);
2097c478bd9Sstevel@tonic-gate }
2107c478bd9Sstevel@tonic-gate 
2117c478bd9Sstevel@tonic-gate /*
2127c478bd9Sstevel@tonic-gate  * If turnstile is non-NULL, remove it from inheritor's t_prioinv list.
2137c478bd9Sstevel@tonic-gate  * Compute new inherited priority, and return it.
2147c478bd9Sstevel@tonic-gate  */
2157c478bd9Sstevel@tonic-gate static pri_t
2167c478bd9Sstevel@tonic-gate turnstile_pi_tsdelete(turnstile_t *ts, kthread_t *inheritor)
2177c478bd9Sstevel@tonic-gate {
2187c478bd9Sstevel@tonic-gate 	turnstile_t **tspp, *tsp;
2197c478bd9Sstevel@tonic-gate 	pri_t new_epri = 0;
2207c478bd9Sstevel@tonic-gate 
2217c478bd9Sstevel@tonic-gate 	disp_lock_enter_high(&inheritor->t_pi_lock);
2227c478bd9Sstevel@tonic-gate 	tspp = &inheritor->t_prioinv;
2237c478bd9Sstevel@tonic-gate 	while ((tsp = *tspp) != NULL) {
2247c478bd9Sstevel@tonic-gate 		if (tsp == ts)
2257c478bd9Sstevel@tonic-gate 			*tspp = tsp->ts_prioinv;
2267c478bd9Sstevel@tonic-gate 		else
2277c478bd9Sstevel@tonic-gate 			new_epri = MAX(new_epri, tsp->ts_epri);
2287c478bd9Sstevel@tonic-gate 		tspp = &tsp->ts_prioinv;
2297c478bd9Sstevel@tonic-gate 	}
2307c478bd9Sstevel@tonic-gate 	disp_lock_exit_high(&inheritor->t_pi_lock);
2317c478bd9Sstevel@tonic-gate 	return (new_epri);
2327c478bd9Sstevel@tonic-gate }
2337c478bd9Sstevel@tonic-gate 
2347c478bd9Sstevel@tonic-gate /*
2357c478bd9Sstevel@tonic-gate  * Remove turnstile from inheritor's t_prioinv list, compute
2367c478bd9Sstevel@tonic-gate  * new priority, and change the inheritor's effective priority if
2377c478bd9Sstevel@tonic-gate  * necessary. Keep in synch with turnstile_pi_recalc().
2387c478bd9Sstevel@tonic-gate  */
2397c478bd9Sstevel@tonic-gate static void
2407c478bd9Sstevel@tonic-gate turnstile_pi_waive(turnstile_t *ts)
2417c478bd9Sstevel@tonic-gate {
2427c478bd9Sstevel@tonic-gate 	kthread_t *inheritor = ts->ts_inheritor;
2437c478bd9Sstevel@tonic-gate 	pri_t new_epri;
2447c478bd9Sstevel@tonic-gate 
2457c478bd9Sstevel@tonic-gate 	ASSERT(inheritor == curthread);
2467c478bd9Sstevel@tonic-gate 
2477c478bd9Sstevel@tonic-gate 	thread_lock_high(inheritor);
2487c478bd9Sstevel@tonic-gate 	new_epri = turnstile_pi_tsdelete(ts, inheritor);
2497c478bd9Sstevel@tonic-gate 	if (new_epri != DISP_PRIO(inheritor))
2507c478bd9Sstevel@tonic-gate 		thread_change_epri(inheritor, new_epri);
2517c478bd9Sstevel@tonic-gate 	ts->ts_inheritor = NULL;
2527c478bd9Sstevel@tonic-gate 	if (DISP_MUST_SURRENDER(inheritor))
2537c478bd9Sstevel@tonic-gate 		cpu_surrender(inheritor);
2547c478bd9Sstevel@tonic-gate 	thread_unlock_high(inheritor);
2557c478bd9Sstevel@tonic-gate }
2567c478bd9Sstevel@tonic-gate 
2577c478bd9Sstevel@tonic-gate /*
2587c478bd9Sstevel@tonic-gate  * Compute caller's new inherited priority, and change its effective
2597c478bd9Sstevel@tonic-gate  * priority if necessary. Necessary only for SOBJ_USER_PI, because of
2607c478bd9Sstevel@tonic-gate  * its interruptibility characteristic.
2617c478bd9Sstevel@tonic-gate  */
2627c478bd9Sstevel@tonic-gate void
2637c478bd9Sstevel@tonic-gate turnstile_pi_recalc(void)
2647c478bd9Sstevel@tonic-gate {
2657c478bd9Sstevel@tonic-gate 	kthread_t *inheritor = curthread;
2667c478bd9Sstevel@tonic-gate 	pri_t new_epri;
2677c478bd9Sstevel@tonic-gate 
2687c478bd9Sstevel@tonic-gate 	thread_lock(inheritor);
2697c478bd9Sstevel@tonic-gate 	new_epri = turnstile_pi_tsdelete(NULL, inheritor);
2707c478bd9Sstevel@tonic-gate 	if (new_epri != DISP_PRIO(inheritor))
2717c478bd9Sstevel@tonic-gate 		thread_change_epri(inheritor, new_epri);
2727c478bd9Sstevel@tonic-gate 	if (DISP_MUST_SURRENDER(inheritor))
2737c478bd9Sstevel@tonic-gate 		cpu_surrender(inheritor);
2747c478bd9Sstevel@tonic-gate 	thread_unlock(inheritor);
2757c478bd9Sstevel@tonic-gate }
2767c478bd9Sstevel@tonic-gate 
2777c478bd9Sstevel@tonic-gate /*
2787c478bd9Sstevel@tonic-gate  * Grab the lock protecting the hash chain for sobj
2797c478bd9Sstevel@tonic-gate  * and return the active turnstile for sobj, if any.
2807c478bd9Sstevel@tonic-gate  */
2817c478bd9Sstevel@tonic-gate turnstile_t *
2827c478bd9Sstevel@tonic-gate turnstile_lookup(void *sobj)
2837c478bd9Sstevel@tonic-gate {
2847c478bd9Sstevel@tonic-gate 	turnstile_t *ts;
2857c478bd9Sstevel@tonic-gate 	turnstile_chain_t *tc = &TURNSTILE_CHAIN(sobj);
2867c478bd9Sstevel@tonic-gate 
2877c478bd9Sstevel@tonic-gate 	disp_lock_enter(&tc->tc_lock);
2887c478bd9Sstevel@tonic-gate 
2897c478bd9Sstevel@tonic-gate 	for (ts = tc->tc_first; ts != NULL; ts = ts->ts_next)
2907c478bd9Sstevel@tonic-gate 		if (ts->ts_sobj == sobj)
2917c478bd9Sstevel@tonic-gate 			break;
2927c478bd9Sstevel@tonic-gate 
2937c478bd9Sstevel@tonic-gate 	return (ts);
2947c478bd9Sstevel@tonic-gate }
2957c478bd9Sstevel@tonic-gate 
2967c478bd9Sstevel@tonic-gate /*
2977c478bd9Sstevel@tonic-gate  * Drop the lock protecting the hash chain for sobj.
2987c478bd9Sstevel@tonic-gate  */
2997c478bd9Sstevel@tonic-gate void
3007c478bd9Sstevel@tonic-gate turnstile_exit(void *sobj)
3017c478bd9Sstevel@tonic-gate {
3027c478bd9Sstevel@tonic-gate 	disp_lock_exit(&TURNSTILE_CHAIN(sobj).tc_lock);
3037c478bd9Sstevel@tonic-gate }
3047c478bd9Sstevel@tonic-gate 
3057c478bd9Sstevel@tonic-gate /*
3067c478bd9Sstevel@tonic-gate  * When we apply priority inheritance, we must grab the owner's thread lock
3077c478bd9Sstevel@tonic-gate  * while already holding the waiter's thread lock.  If both thread locks are
3087c478bd9Sstevel@tonic-gate  * turnstile locks, this can lead to deadlock: while we hold L1 and try to
3097c478bd9Sstevel@tonic-gate  * grab L2, some unrelated thread may be applying priority inheritance to
3107c478bd9Sstevel@tonic-gate  * some other blocking chain, holding L2 and trying to grab L1.  The most
3117c478bd9Sstevel@tonic-gate  * obvious solution -- do a lock_try() for the owner lock -- isn't quite
3127c478bd9Sstevel@tonic-gate  * sufficient because it can cause livelock: each thread may hold one lock,
3137c478bd9Sstevel@tonic-gate  * try to grab the other, fail, bail out, and try again, looping forever.
3147c478bd9Sstevel@tonic-gate  * To prevent livelock we must define a winner, i.e. define an arbitrary
3157c478bd9Sstevel@tonic-gate  * lock ordering on the turnstile locks.  For simplicity we declare that
3167c478bd9Sstevel@tonic-gate  * virtual address order defines lock order, i.e. if L1 < L2, then the
3177c478bd9Sstevel@tonic-gate  * correct lock ordering is L1, L2.  Thus the thread that holds L1 and
3187c478bd9Sstevel@tonic-gate  * wants L2 should spin until L2 is available, but the thread that holds
3197c478bd9Sstevel@tonic-gate  * L2 and can't get L1 on the first try must drop L2 and return failure.
3207c478bd9Sstevel@tonic-gate  * Moreover, the losing thread must not reacquire L2 until the winning
3217c478bd9Sstevel@tonic-gate  * thread has had a chance to grab it; to ensure this, the losing thread
3227c478bd9Sstevel@tonic-gate  * must grab L1 after dropping L2, thus spinning until the winner is done.
3237c478bd9Sstevel@tonic-gate  * Complicating matters further, note that the owner's thread lock pointer
3247c478bd9Sstevel@tonic-gate  * can change (i.e. be pointed at a different lock) while we're trying to
3257c478bd9Sstevel@tonic-gate  * grab it.  If that happens, we must unwind our state and try again.
3267c478bd9Sstevel@tonic-gate  *
3277c478bd9Sstevel@tonic-gate  * On success, returns 1 with both locks held.
3287c478bd9Sstevel@tonic-gate  * On failure, returns 0 with neither lock held.
3297c478bd9Sstevel@tonic-gate  */
3307c478bd9Sstevel@tonic-gate static int
3317c478bd9Sstevel@tonic-gate turnstile_interlock(lock_t *wlp, lock_t *volatile *olpp)
3327c478bd9Sstevel@tonic-gate {
3337c478bd9Sstevel@tonic-gate 	ASSERT(LOCK_HELD(wlp));
3347c478bd9Sstevel@tonic-gate 
3357c478bd9Sstevel@tonic-gate 	for (;;) {
3367c478bd9Sstevel@tonic-gate 		volatile lock_t *olp = *olpp;
3377c478bd9Sstevel@tonic-gate 
3387c478bd9Sstevel@tonic-gate 		/*
3397c478bd9Sstevel@tonic-gate 		 * If the locks are identical, there's nothing to do.
3407c478bd9Sstevel@tonic-gate 		 */
3417c478bd9Sstevel@tonic-gate 		if (olp == wlp)
3427c478bd9Sstevel@tonic-gate 			return (1);
3437c478bd9Sstevel@tonic-gate 		if (lock_try((lock_t *)olp)) {
3447c478bd9Sstevel@tonic-gate 			/*
3457c478bd9Sstevel@tonic-gate 			 * If 'olp' is still the right lock, return success.
3467c478bd9Sstevel@tonic-gate 			 * Otherwise, drop 'olp' and try the dance again.
3477c478bd9Sstevel@tonic-gate 			 */
3487c478bd9Sstevel@tonic-gate 			if (olp == *olpp)
3497c478bd9Sstevel@tonic-gate 				return (1);
3507c478bd9Sstevel@tonic-gate 			lock_clear((lock_t *)olp);
3517c478bd9Sstevel@tonic-gate 		} else {
3529d68b18eSck142721 			hrtime_t spin_time = 0;
3537c478bd9Sstevel@tonic-gate 			/*
3547c478bd9Sstevel@tonic-gate 			 * If we're grabbing the locks out of order, we lose.
3557c478bd9Sstevel@tonic-gate 			 * Drop the waiter's lock, and then grab and release
3567c478bd9Sstevel@tonic-gate 			 * the owner's lock to ensure that we won't retry
3577c478bd9Sstevel@tonic-gate 			 * until the winner is done (as described above).
3587c478bd9Sstevel@tonic-gate 			 */
3597c478bd9Sstevel@tonic-gate 			if (olp >= (lock_t *)turnstile_table && olp < wlp) {
3607c478bd9Sstevel@tonic-gate 				lock_clear(wlp);
3617c478bd9Sstevel@tonic-gate 				lock_set((lock_t *)olp);
3627c478bd9Sstevel@tonic-gate 				lock_clear((lock_t *)olp);
3637c478bd9Sstevel@tonic-gate 				return (0);
3647c478bd9Sstevel@tonic-gate 			}
3657c478bd9Sstevel@tonic-gate 			/*
3667c478bd9Sstevel@tonic-gate 			 * We're grabbing the locks in the right order,
3677c478bd9Sstevel@tonic-gate 			 * so spin until the owner's lock either becomes
3687c478bd9Sstevel@tonic-gate 			 * available or spontaneously changes.
3697c478bd9Sstevel@tonic-gate 			 */
3709d68b18eSck142721 			spin_time =
3719d68b18eSck142721 			    LOCKSTAT_START_TIME(LS_TURNSTILE_INTERLOCK_SPIN);
3727c478bd9Sstevel@tonic-gate 			while (olp == *olpp && LOCK_HELD(olp)) {
3737c478bd9Sstevel@tonic-gate 				if (panicstr)
3747c478bd9Sstevel@tonic-gate 					return (1);
3757c478bd9Sstevel@tonic-gate 				SMT_PAUSE();
3767c478bd9Sstevel@tonic-gate 			}
3779d68b18eSck142721 			LOCKSTAT_RECORD_TIME(LS_TURNSTILE_INTERLOCK_SPIN,
3789d68b18eSck142721 			    olp, spin_time);
3797c478bd9Sstevel@tonic-gate 		}
3807c478bd9Sstevel@tonic-gate 	}
3817c478bd9Sstevel@tonic-gate }
3827c478bd9Sstevel@tonic-gate 
3837c478bd9Sstevel@tonic-gate /*
3847c478bd9Sstevel@tonic-gate  * Block the current thread on a synchronization object.
3857c478bd9Sstevel@tonic-gate  *
3867c478bd9Sstevel@tonic-gate  * Turnstiles implement both kernel and user-level priority inheritance.
3877c478bd9Sstevel@tonic-gate  * To avoid missed wakeups in the user-level case, lwp_upimutex_lock() calls
3887c478bd9Sstevel@tonic-gate  * turnstile_block() holding the appropriate lock in the upimutextab (see
3897c478bd9Sstevel@tonic-gate  * the block comment in lwp_upimutex_lock() for details).  The held lock is
3907c478bd9Sstevel@tonic-gate  * passed to turnstile_block() as the "mp" parameter, and will be dropped
3917c478bd9Sstevel@tonic-gate  * after priority has been willed, but before the thread actually sleeps
3927c478bd9Sstevel@tonic-gate  * (this locking behavior leads to some subtle ordering issues; see the
3937c478bd9Sstevel@tonic-gate  * block comment on turnstile hashing for details).  This _must_ be the only
3947c478bd9Sstevel@tonic-gate  * lock held when calling turnstile_block() with a SOBJ_USER_PI sobj; holding
3957c478bd9Sstevel@tonic-gate  * other locks can result in panics due to cycles in the blocking chain.
3967c478bd9Sstevel@tonic-gate  *
3977c478bd9Sstevel@tonic-gate  * turnstile_block() always succeeds for kernel synchronization objects.
3987c478bd9Sstevel@tonic-gate  * For SOBJ_USER_PI locks the possible errors are EINTR for signals, and
3997c478bd9Sstevel@tonic-gate  * EDEADLK for cycles in the blocking chain. A return code of zero indicates
4007c478bd9Sstevel@tonic-gate  * *either* that the lock is now held, or that this is a spurious wake-up, or
4017c478bd9Sstevel@tonic-gate  * that the lock can never be held due to an ENOTRECOVERABLE error.
4027c478bd9Sstevel@tonic-gate  * It is up to lwp_upimutex_lock() to sort this all out.
4037c478bd9Sstevel@tonic-gate  */
4047c478bd9Sstevel@tonic-gate 
4057c478bd9Sstevel@tonic-gate int
4067c478bd9Sstevel@tonic-gate turnstile_block(turnstile_t *ts, int qnum, void *sobj, sobj_ops_t *sobj_ops,
4077c478bd9Sstevel@tonic-gate     kmutex_t *mp, lwp_timer_t *lwptp)
4087c478bd9Sstevel@tonic-gate {
4097c478bd9Sstevel@tonic-gate 	kthread_t *owner;
4107c478bd9Sstevel@tonic-gate 	kthread_t *t = curthread;
4117c478bd9Sstevel@tonic-gate 	proc_t *p = ttoproc(t);
4127c478bd9Sstevel@tonic-gate 	klwp_t *lwp = ttolwp(t);
4137c478bd9Sstevel@tonic-gate 	turnstile_chain_t *tc = &TURNSTILE_CHAIN(sobj);
4147c478bd9Sstevel@tonic-gate 	int error = 0;
4157c478bd9Sstevel@tonic-gate 	int loser = 0;
4167c478bd9Sstevel@tonic-gate 
4177c478bd9Sstevel@tonic-gate 	ASSERT(DISP_LOCK_HELD(&tc->tc_lock));
4187c478bd9Sstevel@tonic-gate 	ASSERT(mp == NULL || IS_UPI(mp));
4197c478bd9Sstevel@tonic-gate 	ASSERT((SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) ^ (mp == NULL));
4207c478bd9Sstevel@tonic-gate 
4217c478bd9Sstevel@tonic-gate 	thread_lock_high(t);
4227c478bd9Sstevel@tonic-gate 
4237c478bd9Sstevel@tonic-gate 	if (ts == NULL) {
4247c478bd9Sstevel@tonic-gate 		/*
4257c478bd9Sstevel@tonic-gate 		 * This is the first thread to block on this sobj.
4267c478bd9Sstevel@tonic-gate 		 * Take its attached turnstile and add it to the hash chain.
4277c478bd9Sstevel@tonic-gate 		 */
4287c478bd9Sstevel@tonic-gate 		ts = t->t_ts;
4297c478bd9Sstevel@tonic-gate 		ts->ts_sobj = sobj;
4307c478bd9Sstevel@tonic-gate 		ts->ts_next = tc->tc_first;
4317c478bd9Sstevel@tonic-gate 		tc->tc_first = ts;
4327c478bd9Sstevel@tonic-gate 		ASSERT(ts->ts_waiters == 0);
4337c478bd9Sstevel@tonic-gate 	} else {
4347c478bd9Sstevel@tonic-gate 		/*
4357c478bd9Sstevel@tonic-gate 		 * Another thread has already donated its turnstile
4367c478bd9Sstevel@tonic-gate 		 * to block on this sobj, so ours isn't needed.
4377c478bd9Sstevel@tonic-gate 		 * Stash it on the active turnstile's freelist.
4387c478bd9Sstevel@tonic-gate 		 */
4397c478bd9Sstevel@tonic-gate 		turnstile_t *myts = t->t_ts;
4407c478bd9Sstevel@tonic-gate 		myts->ts_free = ts->ts_free;
4417c478bd9Sstevel@tonic-gate 		ts->ts_free = myts;
4427c478bd9Sstevel@tonic-gate 		t->t_ts = ts;
4437c478bd9Sstevel@tonic-gate 		ASSERT(ts->ts_sobj == sobj);
4447c478bd9Sstevel@tonic-gate 		ASSERT(ts->ts_waiters > 0);
4457c478bd9Sstevel@tonic-gate 	}
4467c478bd9Sstevel@tonic-gate 
4477c478bd9Sstevel@tonic-gate 	/*
4487c478bd9Sstevel@tonic-gate 	 * Put the thread to sleep.
4497c478bd9Sstevel@tonic-gate 	 */
4507c478bd9Sstevel@tonic-gate 	ASSERT(t != CPU->cpu_idle_thread);
4517c478bd9Sstevel@tonic-gate 	ASSERT(CPU_ON_INTR(CPU) == 0);
4527c478bd9Sstevel@tonic-gate 	ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
4537c478bd9Sstevel@tonic-gate 	ASSERT(t->t_state == TS_ONPROC);
4547c478bd9Sstevel@tonic-gate 
4557c478bd9Sstevel@tonic-gate 	if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
4567c478bd9Sstevel@tonic-gate 		curthread->t_flag |= T_WAKEABLE;
4577c478bd9Sstevel@tonic-gate 	}
4587c478bd9Sstevel@tonic-gate 	CL_SLEEP(t);		/* assign kernel priority */
4597c478bd9Sstevel@tonic-gate 	THREAD_SLEEP(t, &tc->tc_lock);
4607c478bd9Sstevel@tonic-gate 	t->t_wchan = sobj;
4617c478bd9Sstevel@tonic-gate 	t->t_sobj_ops = sobj_ops;
4627c478bd9Sstevel@tonic-gate 	DTRACE_SCHED(sleep);
4637c478bd9Sstevel@tonic-gate 
4647c478bd9Sstevel@tonic-gate 	if (lwp != NULL) {
4657c478bd9Sstevel@tonic-gate 		lwp->lwp_ru.nvcsw++;
4667c478bd9Sstevel@tonic-gate 		(void) new_mstate(t, LMS_SLEEP);
4677c478bd9Sstevel@tonic-gate 		if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
4687c478bd9Sstevel@tonic-gate 			lwp->lwp_asleep = 1;
4697c478bd9Sstevel@tonic-gate 			lwp->lwp_sysabort = 0;
4707c478bd9Sstevel@tonic-gate 			/*
4717c478bd9Sstevel@tonic-gate 			 * make wchan0 non-zero to conform to the rule that
4727c478bd9Sstevel@tonic-gate 			 * threads blocking for user-level objects have a
4737c478bd9Sstevel@tonic-gate 			 * non-zero wchan0: this prevents spurious wake-ups
4747c478bd9Sstevel@tonic-gate 			 * by, for example, /proc.
4757c478bd9Sstevel@tonic-gate 			 */
4767c478bd9Sstevel@tonic-gate 			t->t_wchan0 = (caddr_t)1;
4777c478bd9Sstevel@tonic-gate 		}
4787c478bd9Sstevel@tonic-gate 	}
4797c478bd9Sstevel@tonic-gate 	ts->ts_waiters++;
4807c478bd9Sstevel@tonic-gate 	sleepq_insert(&ts->ts_sleepq[qnum], t);
4817c478bd9Sstevel@tonic-gate 
4827c478bd9Sstevel@tonic-gate 	if (SOBJ_TYPE(sobj_ops) == SOBJ_MUTEX &&
4837c478bd9Sstevel@tonic-gate 	    SOBJ_OWNER(sobj_ops, sobj) == NULL)
4847c478bd9Sstevel@tonic-gate 		panic("turnstile_block(%p): unowned mutex", (void *)ts);
4857c478bd9Sstevel@tonic-gate 
4867c478bd9Sstevel@tonic-gate 	/*
4877c478bd9Sstevel@tonic-gate 	 * Follow the blocking chain to its end, willing our priority to
4887c478bd9Sstevel@tonic-gate 	 * everyone who's in our way.
4897c478bd9Sstevel@tonic-gate 	 */
4907c478bd9Sstevel@tonic-gate 	while (t->t_sobj_ops != NULL &&
4917c478bd9Sstevel@tonic-gate 	    (owner = SOBJ_OWNER(t->t_sobj_ops, t->t_wchan)) != NULL) {
4927c478bd9Sstevel@tonic-gate 		if (owner == curthread) {
4937c478bd9Sstevel@tonic-gate 			if (SOBJ_TYPE(sobj_ops) != SOBJ_USER_PI) {
4947c478bd9Sstevel@tonic-gate 				panic("Deadlock: cycle in blocking chain");
4957c478bd9Sstevel@tonic-gate 			}
4967c478bd9Sstevel@tonic-gate 			/*
4977c478bd9Sstevel@tonic-gate 			 * If the cycle we've encountered ends in mp,
4987c478bd9Sstevel@tonic-gate 			 * then we know it isn't a 'real' cycle because
4997c478bd9Sstevel@tonic-gate 			 * we're going to drop mp before we go to sleep.
5007c478bd9Sstevel@tonic-gate 			 * Moreover, since we've come full circle we know
5017c478bd9Sstevel@tonic-gate 			 * that we must have willed priority to everyone
5027c478bd9Sstevel@tonic-gate 			 * in our way.  Therefore, we can break out now.
5037c478bd9Sstevel@tonic-gate 			 */
5047c478bd9Sstevel@tonic-gate 			if (t->t_wchan == (void *)mp)
5057c478bd9Sstevel@tonic-gate 				break;
5067c478bd9Sstevel@tonic-gate 
5077c478bd9Sstevel@tonic-gate 			if (loser)
5087c478bd9Sstevel@tonic-gate 				lock_clear(&turnstile_loser_lock);
5097c478bd9Sstevel@tonic-gate 			/*
5107c478bd9Sstevel@tonic-gate 			 * For SOBJ_USER_PI, a cycle is an application
5117c478bd9Sstevel@tonic-gate 			 * deadlock which needs to be communicated
5127c478bd9Sstevel@tonic-gate 			 * back to the application.
5137c478bd9Sstevel@tonic-gate 			 */
5147c478bd9Sstevel@tonic-gate 			thread_unlock_nopreempt(t);
5157c478bd9Sstevel@tonic-gate 			mutex_exit(mp);
5167c478bd9Sstevel@tonic-gate 			setrun(curthread);
5177c478bd9Sstevel@tonic-gate 			swtch(); /* necessary to transition state */
5187c478bd9Sstevel@tonic-gate 			curthread->t_flag &= ~T_WAKEABLE;
519fd6545c7Sraf 			if (lwptp->lwpt_id != 0)
520fd6545c7Sraf 				(void) lwp_timer_dequeue(lwptp);
5217c478bd9Sstevel@tonic-gate 			setallwatch();
5227c478bd9Sstevel@tonic-gate 			lwp->lwp_asleep = 0;
5237c478bd9Sstevel@tonic-gate 			lwp->lwp_sysabort = 0;
5247c478bd9Sstevel@tonic-gate 			return (EDEADLK);
5257c478bd9Sstevel@tonic-gate 		}
5267c478bd9Sstevel@tonic-gate 		if (!turnstile_interlock(t->t_lockp, &owner->t_lockp)) {
5277c478bd9Sstevel@tonic-gate 			/*
5287c478bd9Sstevel@tonic-gate 			 * If we failed to grab the owner's thread lock,
5297c478bd9Sstevel@tonic-gate 			 * turnstile_interlock() will have dropped t's
5307c478bd9Sstevel@tonic-gate 			 * thread lock, so at this point we don't even know
5317c478bd9Sstevel@tonic-gate 			 * that 't' exists anymore.  The simplest solution
5327c478bd9Sstevel@tonic-gate 			 * is to restart the entire priority inheritance dance
5337c478bd9Sstevel@tonic-gate 			 * from the beginning of the blocking chain, since
5347c478bd9Sstevel@tonic-gate 			 * we *do* know that 'curthread' still exists.
5357c478bd9Sstevel@tonic-gate 			 * Application of priority inheritance is idempotent,
5367c478bd9Sstevel@tonic-gate 			 * so it's OK that we're doing it more than once.
5377c478bd9Sstevel@tonic-gate 			 * Note also that since we've dropped our thread lock,
5387c478bd9Sstevel@tonic-gate 			 * we may already have been woken up; if so, our
5397c478bd9Sstevel@tonic-gate 			 * t_sobj_ops will be NULL, the loop will terminate,
5407c478bd9Sstevel@tonic-gate 			 * and the call to swtch() will be a no-op.  Phew.
5417c478bd9Sstevel@tonic-gate 			 *
5427c478bd9Sstevel@tonic-gate 			 * There is one further complication: if two (or more)
5437c478bd9Sstevel@tonic-gate 			 * threads keep trying to grab the turnstile locks out
5447c478bd9Sstevel@tonic-gate 			 * of order and keep losing the race to another thread,
5457c478bd9Sstevel@tonic-gate 			 * these "dueling losers" can livelock the system.
5467c478bd9Sstevel@tonic-gate 			 * Therefore, once we get into this rare situation,
5477c478bd9Sstevel@tonic-gate 			 * we serialize all the losers.
5487c478bd9Sstevel@tonic-gate 			 */
5497c478bd9Sstevel@tonic-gate 			if (loser == 0) {
5507c478bd9Sstevel@tonic-gate 				loser = 1;
5517c478bd9Sstevel@tonic-gate 				lock_set(&turnstile_loser_lock);
5527c478bd9Sstevel@tonic-gate 			}
5537c478bd9Sstevel@tonic-gate 			t = curthread;
5547c478bd9Sstevel@tonic-gate 			thread_lock_high(t);
5557c478bd9Sstevel@tonic-gate 			continue;
5567c478bd9Sstevel@tonic-gate 		}
5577c478bd9Sstevel@tonic-gate 
5587c478bd9Sstevel@tonic-gate 		/*
5597c478bd9Sstevel@tonic-gate 		 * We now have the owner's thread lock.  If we are traversing
5607c478bd9Sstevel@tonic-gate 		 * from non-SOBJ_USER_PI ops to SOBJ_USER_PI ops, then we know
5617c478bd9Sstevel@tonic-gate 		 * that we have caught the thread while in the TS_SLEEP state,
5627c478bd9Sstevel@tonic-gate 		 * but holding mp.  We know that this situation is transient
5637c478bd9Sstevel@tonic-gate 		 * (mp will be dropped before the holder actually sleeps on
5647c478bd9Sstevel@tonic-gate 		 * the SOBJ_USER_PI sobj), so we will spin waiting for mp to
5657c478bd9Sstevel@tonic-gate 		 * be dropped.  Then, as in the turnstile_interlock() failure
5667c478bd9Sstevel@tonic-gate 		 * case, we will restart the priority inheritance dance.
5677c478bd9Sstevel@tonic-gate 		 */
5687c478bd9Sstevel@tonic-gate 		if (SOBJ_TYPE(t->t_sobj_ops) != SOBJ_USER_PI &&
5697c478bd9Sstevel@tonic-gate 		    owner->t_sobj_ops != NULL &&
5707c478bd9Sstevel@tonic-gate 		    SOBJ_TYPE(owner->t_sobj_ops) == SOBJ_USER_PI) {
5717c478bd9Sstevel@tonic-gate 			kmutex_t *upi_lock = (kmutex_t *)t->t_wchan;
5727c478bd9Sstevel@tonic-gate 
5737c478bd9Sstevel@tonic-gate 			ASSERT(IS_UPI(upi_lock));
5747c478bd9Sstevel@tonic-gate 			ASSERT(SOBJ_TYPE(t->t_sobj_ops) == SOBJ_MUTEX);
5757c478bd9Sstevel@tonic-gate 
5767c478bd9Sstevel@tonic-gate 			if (t->t_lockp != owner->t_lockp)
5777c478bd9Sstevel@tonic-gate 				thread_unlock_high(owner);
5787c478bd9Sstevel@tonic-gate 			thread_unlock_high(t);
5797c478bd9Sstevel@tonic-gate 			if (loser)
5807c478bd9Sstevel@tonic-gate 				lock_clear(&turnstile_loser_lock);
5817c478bd9Sstevel@tonic-gate 
5827c478bd9Sstevel@tonic-gate 			while (mutex_owner(upi_lock) == owner) {
5837c478bd9Sstevel@tonic-gate 				SMT_PAUSE();
5847c478bd9Sstevel@tonic-gate 				continue;
5857c478bd9Sstevel@tonic-gate 			}
5867c478bd9Sstevel@tonic-gate 
5877c478bd9Sstevel@tonic-gate 			if (loser)
5887c478bd9Sstevel@tonic-gate 				lock_set(&turnstile_loser_lock);
5897c478bd9Sstevel@tonic-gate 			t = curthread;
5907c478bd9Sstevel@tonic-gate 			thread_lock_high(t);
5917c478bd9Sstevel@tonic-gate 			continue;
5927c478bd9Sstevel@tonic-gate 		}
5937c478bd9Sstevel@tonic-gate 
5947c478bd9Sstevel@tonic-gate 		turnstile_pi_inherit(t->t_ts, owner, DISP_PRIO(t));
5957c478bd9Sstevel@tonic-gate 		if (t->t_lockp != owner->t_lockp)
5967c478bd9Sstevel@tonic-gate 			thread_unlock_high(t);
5977c478bd9Sstevel@tonic-gate 		t = owner;
5987c478bd9Sstevel@tonic-gate 	}
5997c478bd9Sstevel@tonic-gate 
6007c478bd9Sstevel@tonic-gate 	if (loser)
6017c478bd9Sstevel@tonic-gate 		lock_clear(&turnstile_loser_lock);
6027c478bd9Sstevel@tonic-gate 
6037c478bd9Sstevel@tonic-gate 	/*
6047c478bd9Sstevel@tonic-gate 	 * Note: 't' and 'curthread' were synonymous before the loop above,
6057c478bd9Sstevel@tonic-gate 	 * but now they may be different.  ('t' is now the last thread in
6067c478bd9Sstevel@tonic-gate 	 * the blocking chain.)
6077c478bd9Sstevel@tonic-gate 	 */
6087c478bd9Sstevel@tonic-gate 	if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
6097c478bd9Sstevel@tonic-gate 		ushort_t s = curthread->t_oldspl;
6107c478bd9Sstevel@tonic-gate 		int timedwait = 0;
611fd6545c7Sraf 		uint_t imm_timeout = 0;
6127c478bd9Sstevel@tonic-gate 		clock_t tim = -1;
6137c478bd9Sstevel@tonic-gate 
6147c478bd9Sstevel@tonic-gate 		thread_unlock_high(t);
6157c478bd9Sstevel@tonic-gate 		if (lwptp->lwpt_id != 0) {
6167c478bd9Sstevel@tonic-gate 			/*
617fd6545c7Sraf 			 * We enqueued a timeout.  If it has already fired,
618fd6545c7Sraf 			 * lwptp->lwpt_imm_timeout has been set with cas,
619fd6545c7Sraf 			 * so fetch it with cas.
6207c478bd9Sstevel@tonic-gate 			 */
6217c478bd9Sstevel@tonic-gate 			timedwait = 1;
622fd6545c7Sraf 			imm_timeout =
623fd6545c7Sraf 			    atomic_cas_uint(&lwptp->lwpt_imm_timeout, 0, 0);
6247c478bd9Sstevel@tonic-gate 		}
6257c478bd9Sstevel@tonic-gate 		mutex_exit(mp);
6267c478bd9Sstevel@tonic-gate 		splx(s);
6277c478bd9Sstevel@tonic-gate 
6287c478bd9Sstevel@tonic-gate 		if (ISSIG(curthread, JUSTLOOKING) ||
629fd6545c7Sraf 		    MUSTRETURN(p, curthread) || imm_timeout)
6307c478bd9Sstevel@tonic-gate 			setrun(curthread);
6317c478bd9Sstevel@tonic-gate 		swtch();
6327c478bd9Sstevel@tonic-gate 		curthread->t_flag &= ~T_WAKEABLE;
6337c478bd9Sstevel@tonic-gate 		if (timedwait)
6347c478bd9Sstevel@tonic-gate 			tim = lwp_timer_dequeue(lwptp);
6357c478bd9Sstevel@tonic-gate 		setallwatch();
6367c478bd9Sstevel@tonic-gate 		if (ISSIG(curthread, FORREAL) || lwp->lwp_sysabort ||
6377c478bd9Sstevel@tonic-gate 		    MUSTRETURN(p, curthread))
6387c478bd9Sstevel@tonic-gate 			error = EINTR;
639fd6545c7Sraf 		else if (imm_timeout || (timedwait && tim == -1))
6407c478bd9Sstevel@tonic-gate 			error = ETIME;
6417c478bd9Sstevel@tonic-gate 		lwp->lwp_sysabort = 0;
6427c478bd9Sstevel@tonic-gate 		lwp->lwp_asleep = 0;
6437c478bd9Sstevel@tonic-gate 	} else {
6447c478bd9Sstevel@tonic-gate 		thread_unlock_nopreempt(t);
6457c478bd9Sstevel@tonic-gate 		swtch();
6467c478bd9Sstevel@tonic-gate 	}
6477c478bd9Sstevel@tonic-gate 
6487c478bd9Sstevel@tonic-gate 	return (error);
6497c478bd9Sstevel@tonic-gate }
6507c478bd9Sstevel@tonic-gate 
6517c478bd9Sstevel@tonic-gate /*
6527c478bd9Sstevel@tonic-gate  * Remove thread from specified turnstile sleep queue; retrieve its
6537c478bd9Sstevel@tonic-gate  * free turnstile; if it is the last waiter, delete the turnstile
6547c478bd9Sstevel@tonic-gate  * from the turnstile chain and if there is an inheritor, delete it
6557c478bd9Sstevel@tonic-gate  * from the inheritor's t_prioinv chain.
6567c478bd9Sstevel@tonic-gate  */
6577c478bd9Sstevel@tonic-gate static void
6587c478bd9Sstevel@tonic-gate turnstile_dequeue(kthread_t *t)
6597c478bd9Sstevel@tonic-gate {
6607c478bd9Sstevel@tonic-gate 	turnstile_t *ts = t->t_ts;
6617c478bd9Sstevel@tonic-gate 	turnstile_chain_t *tc = &TURNSTILE_CHAIN(ts->ts_sobj);
6627c478bd9Sstevel@tonic-gate 	turnstile_t *tsfree, **tspp;
6637c478bd9Sstevel@tonic-gate 
6647c478bd9Sstevel@tonic-gate 	ASSERT(DISP_LOCK_HELD(&tc->tc_lock));
6657c478bd9Sstevel@tonic-gate 	ASSERT(t->t_lockp == &tc->tc_lock);
6667c478bd9Sstevel@tonic-gate 
6677c478bd9Sstevel@tonic-gate 	if ((tsfree = ts->ts_free) != NULL) {
6687c478bd9Sstevel@tonic-gate 		ASSERT(ts->ts_waiters > 1);
6697c478bd9Sstevel@tonic-gate 		ASSERT(tsfree->ts_waiters == 0);
6707c478bd9Sstevel@tonic-gate 		t->t_ts = tsfree;
6717c478bd9Sstevel@tonic-gate 		ts->ts_free = tsfree->ts_free;
6727c478bd9Sstevel@tonic-gate 		tsfree->ts_free = NULL;
6737c478bd9Sstevel@tonic-gate 	} else {
6747c478bd9Sstevel@tonic-gate 		/*
6757c478bd9Sstevel@tonic-gate 		 * The active turnstile's freelist is empty, so this
6767c478bd9Sstevel@tonic-gate 		 * must be the last waiter.  Remove the turnstile
6777c478bd9Sstevel@tonic-gate 		 * from the hash chain and leave the now-inactive
6787c478bd9Sstevel@tonic-gate 		 * turnstile attached to the thread we're waking.
6797c478bd9Sstevel@tonic-gate 		 * Note that the ts_inheritor for the turnstile
6807c478bd9Sstevel@tonic-gate 		 * may be NULL. If one exists, its t_prioinv
6817c478bd9Sstevel@tonic-gate 		 * chain has to be updated.
6827c478bd9Sstevel@tonic-gate 		 */
6837c478bd9Sstevel@tonic-gate 		ASSERT(ts->ts_waiters == 1);
6847c478bd9Sstevel@tonic-gate 		if (ts->ts_inheritor != NULL) {
6857c478bd9Sstevel@tonic-gate 			(void) turnstile_pi_tsdelete(ts, ts->ts_inheritor);
6867c478bd9Sstevel@tonic-gate 			/*
6877c478bd9Sstevel@tonic-gate 			 * If we ever do a "disinherit" or "unboost", we need
6887c478bd9Sstevel@tonic-gate 			 * to do it only if "t" is a thread at the head of the
6897c478bd9Sstevel@tonic-gate 			 * sleep queue. Since the sleep queue is prioritized,
6907c478bd9Sstevel@tonic-gate 			 * the disinherit is necessary only if the interrupted
6917c478bd9Sstevel@tonic-gate 			 * thread is the highest priority thread.
6927c478bd9Sstevel@tonic-gate 			 * Otherwise, there is a higher priority thread blocked
6937c478bd9Sstevel@tonic-gate 			 * on the turnstile, whose inheritance cannot be
6947c478bd9Sstevel@tonic-gate 			 * disinherited. However, disinheriting is explicitly
6957c478bd9Sstevel@tonic-gate 			 * not done here, since it would require holding the
6967c478bd9Sstevel@tonic-gate 			 * inheritor's thread lock (see turnstile_unsleep()).
6977c478bd9Sstevel@tonic-gate 			 */
6987c478bd9Sstevel@tonic-gate 			ts->ts_inheritor = NULL;
6997c478bd9Sstevel@tonic-gate 		}
7007c478bd9Sstevel@tonic-gate 		tspp = &tc->tc_first;
7017c478bd9Sstevel@tonic-gate 		while (*tspp != ts)
7027c478bd9Sstevel@tonic-gate 			tspp = &(*tspp)->ts_next;
7037c478bd9Sstevel@tonic-gate 		*tspp = ts->ts_next;
7047c478bd9Sstevel@tonic-gate 		ASSERT(t->t_ts == ts);
7057c478bd9Sstevel@tonic-gate 	}
7067c478bd9Sstevel@tonic-gate 	ts->ts_waiters--;
7077c478bd9Sstevel@tonic-gate 	sleepq_dequeue(t);
7087c478bd9Sstevel@tonic-gate 	t->t_sobj_ops = NULL;
7097c478bd9Sstevel@tonic-gate 	t->t_wchan = NULL;
7107c478bd9Sstevel@tonic-gate 	t->t_wchan0 = NULL;
7117c478bd9Sstevel@tonic-gate 	ASSERT(t->t_state == TS_SLEEP);
7127c478bd9Sstevel@tonic-gate }
7137c478bd9Sstevel@tonic-gate 
7147c478bd9Sstevel@tonic-gate /*
7157c478bd9Sstevel@tonic-gate  * Wake threads that are blocked in a turnstile.
7167c478bd9Sstevel@tonic-gate  */
7177c478bd9Sstevel@tonic-gate void
7187c478bd9Sstevel@tonic-gate turnstile_wakeup(turnstile_t *ts, int qnum, int nthreads, kthread_t *owner)
7197c478bd9Sstevel@tonic-gate {
7207c478bd9Sstevel@tonic-gate 	turnstile_chain_t *tc = &TURNSTILE_CHAIN(ts->ts_sobj);
7217c478bd9Sstevel@tonic-gate 	sleepq_t *sqp = &ts->ts_sleepq[qnum];
7227c478bd9Sstevel@tonic-gate 
7237c478bd9Sstevel@tonic-gate 	ASSERT(DISP_LOCK_HELD(&tc->tc_lock));
7247c478bd9Sstevel@tonic-gate 
7257c478bd9Sstevel@tonic-gate 	/*
7267c478bd9Sstevel@tonic-gate 	 * Waive any priority we may have inherited from this turnstile.
7277c478bd9Sstevel@tonic-gate 	 */
7287c478bd9Sstevel@tonic-gate 	if (ts->ts_inheritor != NULL) {
7297c478bd9Sstevel@tonic-gate 		turnstile_pi_waive(ts);
7307c478bd9Sstevel@tonic-gate 	}
7317c478bd9Sstevel@tonic-gate 	while (nthreads-- > 0) {
7327c478bd9Sstevel@tonic-gate 		kthread_t *t = sqp->sq_first;
7337c478bd9Sstevel@tonic-gate 		ASSERT(t->t_ts == ts);
7347c478bd9Sstevel@tonic-gate 		ASSERT(ts->ts_waiters > 1 || ts->ts_inheritor == NULL);
7357c478bd9Sstevel@tonic-gate 		DTRACE_SCHED1(wakeup, kthread_t *, t);
7367c478bd9Sstevel@tonic-gate 		turnstile_dequeue(t);
7377c478bd9Sstevel@tonic-gate 		CL_WAKEUP(t); /* previous thread lock, tc_lock, not dropped */
7387c478bd9Sstevel@tonic-gate 		/*
7397c478bd9Sstevel@tonic-gate 		 * If the caller did direct handoff of ownership,
7407c478bd9Sstevel@tonic-gate 		 * make the new owner inherit from this turnstile.
7417c478bd9Sstevel@tonic-gate 		 */
7427c478bd9Sstevel@tonic-gate 		if (t == owner) {
7437c478bd9Sstevel@tonic-gate 			kthread_t *wp = ts->ts_sleepq[TS_WRITER_Q].sq_first;
7447c478bd9Sstevel@tonic-gate 			kthread_t *rp = ts->ts_sleepq[TS_READER_Q].sq_first;
7457c478bd9Sstevel@tonic-gate 			pri_t wpri = wp ? DISP_PRIO(wp) : 0;
7467c478bd9Sstevel@tonic-gate 			pri_t rpri = rp ? DISP_PRIO(rp) : 0;
7477c478bd9Sstevel@tonic-gate 			turnstile_pi_inherit(ts, t, MAX(wpri, rpri));
7487c478bd9Sstevel@tonic-gate 			owner = NULL;
7497c478bd9Sstevel@tonic-gate 		}
7507c478bd9Sstevel@tonic-gate 		thread_unlock_high(t);		/* drop run queue lock */
7517c478bd9Sstevel@tonic-gate 	}
7527c478bd9Sstevel@tonic-gate 	if (owner != NULL)
753*8793b36bSNick Todd 		panic("turnstile_wakeup: owner %p not woken", (void *)owner);
7547c478bd9Sstevel@tonic-gate 	disp_lock_exit(&tc->tc_lock);
7557c478bd9Sstevel@tonic-gate }
7567c478bd9Sstevel@tonic-gate 
7577c478bd9Sstevel@tonic-gate /*
7587c478bd9Sstevel@tonic-gate  * Change priority of a thread sleeping in a turnstile.
7597c478bd9Sstevel@tonic-gate  */
7607c478bd9Sstevel@tonic-gate void
7617c478bd9Sstevel@tonic-gate turnstile_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
7627c478bd9Sstevel@tonic-gate {
7637c478bd9Sstevel@tonic-gate 	sleepq_t *sqp = t->t_sleepq;
7647c478bd9Sstevel@tonic-gate 
7657c478bd9Sstevel@tonic-gate 	sleepq_dequeue(t);
7667c478bd9Sstevel@tonic-gate 	*t_prip = pri;
7677c478bd9Sstevel@tonic-gate 	sleepq_insert(sqp, t);
7687c478bd9Sstevel@tonic-gate }
7697c478bd9Sstevel@tonic-gate 
7707c478bd9Sstevel@tonic-gate /*
7717c478bd9Sstevel@tonic-gate  * We don't allow spurious wakeups of threads blocked in turnstiles
7727c478bd9Sstevel@tonic-gate  * for synch objects whose sobj_ops vector is initialized with the
7737c478bd9Sstevel@tonic-gate  * following routine (e.g. kernel synchronization objects).
7747c478bd9Sstevel@tonic-gate  * This is vital to the correctness of direct-handoff logic in some
7757c478bd9Sstevel@tonic-gate  * synchronization primitives, and it also simplifies the PI logic.
7767c478bd9Sstevel@tonic-gate  */
7777c478bd9Sstevel@tonic-gate /* ARGSUSED */
7787c478bd9Sstevel@tonic-gate void
7797c478bd9Sstevel@tonic-gate turnstile_stay_asleep(kthread_t *t)
7807c478bd9Sstevel@tonic-gate {
7817c478bd9Sstevel@tonic-gate }
7827c478bd9Sstevel@tonic-gate 
7837c478bd9Sstevel@tonic-gate /*
7847c478bd9Sstevel@tonic-gate  * Wake up a thread blocked in a turnstile. Used to enable interruptibility
7857c478bd9Sstevel@tonic-gate  * of threads blocked on a SOBJ_USER_PI sobj.
7867c478bd9Sstevel@tonic-gate  *
7877c478bd9Sstevel@tonic-gate  * The implications of this interface are:
7887c478bd9Sstevel@tonic-gate  *
7897c478bd9Sstevel@tonic-gate  * 1. turnstile_block() may return with an EINTR.
7907c478bd9Sstevel@tonic-gate  * 2. When the owner of an sobj releases it, but no turnstile is found (i.e.
7917c478bd9Sstevel@tonic-gate  *    no waiters), the (prior) owner must call turnstile_pi_recalc() to
7927c478bd9Sstevel@tonic-gate  *    waive any priority inherited from interrupted waiters.
7937c478bd9Sstevel@tonic-gate  *
7947c478bd9Sstevel@tonic-gate  * When a waiter is interrupted, disinheriting its willed priority from the
7957c478bd9Sstevel@tonic-gate  * inheritor would require holding the inheritor's thread lock, while also
7967c478bd9Sstevel@tonic-gate  * holding the waiter's thread lock which is a turnstile lock. If the
7977c478bd9Sstevel@tonic-gate  * inheritor's thread lock is not free, and is also a turnstile lock that
7987c478bd9Sstevel@tonic-gate  * is out of lock order, the waiter's thread lock would have to be dropped.
7997c478bd9Sstevel@tonic-gate  * This leads to complications for the caller of turnstile_unsleep(), since
8007c478bd9Sstevel@tonic-gate  * the caller holds the waiter's thread lock. So, instead of disinheriting
8017c478bd9Sstevel@tonic-gate  * on waiter interruption, the owner is required to follow rule 2 above.
8027c478bd9Sstevel@tonic-gate  *
8037c478bd9Sstevel@tonic-gate  * Avoiding disinherit on waiter interruption seems acceptable because
8047c478bd9Sstevel@tonic-gate  * the owner runs at an unnecessarily high priority only while sobj is held,
8057c478bd9Sstevel@tonic-gate  * which it would have done in any case, if the waiter had not been interrupted.
8067c478bd9Sstevel@tonic-gate  */
8077c478bd9Sstevel@tonic-gate void
8087c478bd9Sstevel@tonic-gate turnstile_unsleep(kthread_t *t)
8097c478bd9Sstevel@tonic-gate {
8107c478bd9Sstevel@tonic-gate 	turnstile_dequeue(t);
8117c478bd9Sstevel@tonic-gate 	THREAD_TRANSITION(t);
8127c478bd9Sstevel@tonic-gate 	CL_SETRUN(t);
8137c478bd9Sstevel@tonic-gate }
814