xref: /titanic_52/usr/src/uts/common/os/turnstile.c (revision c7158ae983f5a04c4a998f468ecefba6d23ba721)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Big Theory Statement for turnstiles.
31  *
32  * Turnstiles provide blocking and wakeup support, including priority
33  * inheritance, for synchronization primitives (e.g. mutexes and rwlocks).
34  * Typical usage is as follows:
35  *
36  * To block on lock 'lp' for read access in foo_enter():
37  *
38  *	ts = turnstile_lookup(lp);
39  *	[ If the lock is still held, set the waiters bit
40  *	turnstile_block(ts, TS_READER_Q, lp, &foo_sobj_ops);
41  *
42  * To wake threads waiting for write access to lock 'lp' in foo_exit():
43  *
44  *	ts = turnstile_lookup(lp);
45  *	[ Either drop the lock (change owner to NULL) or perform a direct
46  *	[ handoff (change owner to one of the threads we're about to wake).
47  *	[ If we're going to wake the last waiter, clear the waiters bit.
48  *	turnstile_wakeup(ts, TS_WRITER_Q, nwaiters, new_owner or NULL);
49  *
50  * turnstile_lookup() returns holding the turnstile hash chain lock for lp.
51  * Both turnstile_block() and turnstile_wakeup() drop the turnstile lock.
52  * To abort a turnstile operation, the client must call turnstile_exit().
53  *
54  * Requirements of the client:
55  *
56  * (1)  The lock's waiters indicator may be manipulated *only* while
57  *	holding the turnstile hash chain lock (i.e. under turnstile_lookup()).
58  *
59  * (2)	Once the lock is marked as having waiters, the owner may be
60  *	changed *only* while holding the turnstile hash chain lock.
61  *
62  * (3)	The caller must never block on an unheld lock.
63  *
64  * Consequences of these assumptions include the following:
65  *
66  * (a) It is impossible for a lock to be unheld but have waiters.
67  *
68  * (b)	The priority inheritance code can safely assume that an active
69  *	turnstile's ts_inheritor never changes until the inheritor calls
70  *	turnstile_pi_waive().
71  *
72  * These assumptions simplify the implementation of both turnstiles and
73  * their clients.
74  *
75  * Background on priority inheritance:
76  *
77  * Priority inheritance allows a thread to "will" its dispatch priority
78  * to all the threads blocking it, directly or indirectly.  This prevents
79  * situations called priority inversions in which a high-priority thread
80  * needs a lock held by a low-priority thread, which cannot run because
81  * of medium-priority threads.  Without PI, the medium-priority threads
82  * can starve out the high-priority thread indefinitely.  With PI, the
83  * low-priority thread becomes high-priority until it releases whatever
84  * synchronization object the real high-priority thread is waiting for.
85  *
86  * How turnstiles work:
87  *
88  * All active turnstiles reside in a global hash table, turnstile_table[].
89  * The address of a synchronization object determines its hash index.
90  * Each hash chain is protected by its own dispatcher lock, acquired
91  * by turnstile_lookup().  This lock protects the hash chain linkage, the
92  * contents of all turnstiles on the hash chain, and the waiters bits of
93  * every synchronization object in the system that hashes to the same chain.
94  * Giving the lock such broad scope simplifies the interactions between
95  * the turnstile code and its clients considerably.  The blocking path
96  * is rare enough that this has no impact on scalability.  (If it ever
97  * does, it's almost surely a second-order effect -- the real problem
98  * is that some synchronization object is *very* heavily contended.)
99  *
100  * Each thread has an attached turnstile in case it needs to block.
101  * A thread cannot block on more than one lock at a time, so one
102  * turnstile per thread is the most we ever need.  The first thread
103  * to block on a lock donates its attached turnstile and adds it to
104  * the appropriate hash chain in turnstile_table[].  This becomes the
105  * "active turnstile" for the lock.  Each subsequent thread that blocks
106  * on the same lock discovers that the lock already has an active
107  * turnstile, so it stashes its own turnstile on the active turnstile's
108  * freelist.  As threads wake up, the process is reversed.
109  *
110  * turnstile_block() puts the current thread to sleep on the active
111  * turnstile for the desired lock, walks the blocking chain to apply
112  * priority inheritance to everyone in its way, and yields the CPU.
113  *
114  * turnstile_wakeup() waives any priority the owner may have inherited
115  * and wakes the specified number of waiting threads.  If the caller is
116  * doing direct handoff of ownership (rather than just dropping the lock),
117  * the new owner automatically inherits priority from any existing waiters.
118  */
119 
120 #include <sys/param.h>
121 #include <sys/systm.h>
122 #include <sys/thread.h>
123 #include <sys/proc.h>
124 #include <sys/debug.h>
125 #include <sys/cpuvar.h>
126 #include <sys/turnstile.h>
127 #include <sys/t_lock.h>
128 #include <sys/disp.h>
129 #include <sys/sobject.h>
130 #include <sys/cmn_err.h>
131 #include <sys/sysmacros.h>
132 #include <sys/lockstat.h>
133 #include <sys/lwp_upimutex_impl.h>
134 #include <sys/schedctl.h>
135 #include <sys/cpu.h>
136 #include <sys/sdt.h>
137 #include <sys/cpupart.h>
138 
139 extern upib_t upimutextab[UPIMUTEX_TABSIZE];
140 
141 #define	IS_UPI(sobj)	\
142 	((uintptr_t)(sobj) - (uintptr_t)upimutextab < sizeof (upimutextab))
143 
144 /*
145  * The turnstile hash table is partitioned into two halves: the lower half
146  * is used for upimutextab[] locks, the upper half for everything else.
147  * The reason for the distinction is that SOBJ_USER_PI locks present a
148  * unique problem: the upimutextab[] lock passed to turnstile_block()
149  * cannot be dropped until the calling thread has blocked on its
150  * SOBJ_USER_PI lock and willed its priority down the blocking chain.
151  * At that point, the caller's t_lockp will be one of the turnstile locks.
152  * If mutex_exit() discovers that the upimutextab[] lock has waiters, it
153  * must wake them, which forces a lock ordering on us: the turnstile lock
154  * for the upimutextab[] lock will be acquired in mutex_vector_exit(),
155  * which will eventually call into turnstile_pi_waive(), which will then
156  * acquire the caller's thread lock, which in this case is the turnstile
157  * lock for the SOBJ_USER_PI lock.  In general, when two turnstile locks
158  * must be held at the same time, the lock order must be the address order.
159  * Therefore, to prevent deadlock in turnstile_pi_waive(), we must ensure
160  * that upimutextab[] locks *always* hash to lower addresses than any
161  * other locks.  You think this is cheesy?  Let's see you do better.
162  */
163 #define	TURNSTILE_HASH_SIZE	128		/* must be power of 2 */
164 #define	TURNSTILE_HASH_MASK	(TURNSTILE_HASH_SIZE - 1)
165 #define	TURNSTILE_SOBJ_HASH(sobj)	\
166 	((((ulong_t)sobj >> 2) + ((ulong_t)sobj >> 9)) & TURNSTILE_HASH_MASK)
167 #define	TURNSTILE_SOBJ_BUCKET(sobj)		\
168 	((IS_UPI(sobj) ? 0 : TURNSTILE_HASH_SIZE) + TURNSTILE_SOBJ_HASH(sobj))
169 #define	TURNSTILE_CHAIN(sobj)	turnstile_table[TURNSTILE_SOBJ_BUCKET(sobj)]
170 
171 typedef struct turnstile_chain {
172 	turnstile_t	*tc_first;	/* first turnstile on hash chain */
173 	disp_lock_t	tc_lock;	/* lock for this hash chain */
174 } turnstile_chain_t;
175 
176 turnstile_chain_t	turnstile_table[2 * TURNSTILE_HASH_SIZE];
177 
178 static	lock_t	turnstile_loser_lock;
179 
180 /*
181  * Make 'inheritor' inherit priority from this turnstile.
182  */
183 static void
184 turnstile_pi_inherit(turnstile_t *ts, kthread_t *inheritor, pri_t epri)
185 {
186 	ASSERT(THREAD_LOCK_HELD(inheritor));
187 	ASSERT(DISP_LOCK_HELD(&TURNSTILE_CHAIN(ts->ts_sobj).tc_lock));
188 
189 	if (epri <= inheritor->t_pri)
190 		return;
191 
192 	if (ts->ts_inheritor == NULL) {
193 		ts->ts_inheritor = inheritor;
194 		ts->ts_epri = epri;
195 		disp_lock_enter_high(&inheritor->t_pi_lock);
196 		ts->ts_prioinv = inheritor->t_prioinv;
197 		inheritor->t_prioinv = ts;
198 		disp_lock_exit_high(&inheritor->t_pi_lock);
199 	} else {
200 		/*
201 		 * 'inheritor' is already inheriting from this turnstile,
202 		 * so just adjust its priority.
203 		 */
204 		ASSERT(ts->ts_inheritor == inheritor);
205 		if (ts->ts_epri < epri)
206 			ts->ts_epri = epri;
207 	}
208 
209 	if (epri > DISP_PRIO(inheritor))
210 		thread_change_epri(inheritor, epri);
211 }
212 
213 /*
214  * If turnstile is non-NULL, remove it from inheritor's t_prioinv list.
215  * Compute new inherited priority, and return it.
216  */
217 static pri_t
218 turnstile_pi_tsdelete(turnstile_t *ts, kthread_t *inheritor)
219 {
220 	turnstile_t **tspp, *tsp;
221 	pri_t new_epri = 0;
222 
223 	disp_lock_enter_high(&inheritor->t_pi_lock);
224 	tspp = &inheritor->t_prioinv;
225 	while ((tsp = *tspp) != NULL) {
226 		if (tsp == ts)
227 			*tspp = tsp->ts_prioinv;
228 		else
229 			new_epri = MAX(new_epri, tsp->ts_epri);
230 		tspp = &tsp->ts_prioinv;
231 	}
232 	disp_lock_exit_high(&inheritor->t_pi_lock);
233 	return (new_epri);
234 }
235 
236 /*
237  * Remove turnstile from inheritor's t_prioinv list, compute
238  * new priority, and change the inheritor's effective priority if
239  * necessary. Keep in synch with turnstile_pi_recalc().
240  */
241 static void
242 turnstile_pi_waive(turnstile_t *ts)
243 {
244 	kthread_t *inheritor = ts->ts_inheritor;
245 	pri_t new_epri;
246 
247 	ASSERT(inheritor == curthread);
248 
249 	thread_lock_high(inheritor);
250 	new_epri = turnstile_pi_tsdelete(ts, inheritor);
251 	if (new_epri != DISP_PRIO(inheritor))
252 		thread_change_epri(inheritor, new_epri);
253 	ts->ts_inheritor = NULL;
254 	if (DISP_MUST_SURRENDER(inheritor))
255 		cpu_surrender(inheritor);
256 	thread_unlock_high(inheritor);
257 }
258 
259 /*
260  * Compute caller's new inherited priority, and change its effective
261  * priority if necessary. Necessary only for SOBJ_USER_PI, because of
262  * its interruptibility characteristic.
263  */
264 void
265 turnstile_pi_recalc(void)
266 {
267 	kthread_t *inheritor = curthread;
268 	pri_t new_epri;
269 
270 	thread_lock(inheritor);
271 	new_epri = turnstile_pi_tsdelete(NULL, inheritor);
272 	if (new_epri != DISP_PRIO(inheritor))
273 		thread_change_epri(inheritor, new_epri);
274 	if (DISP_MUST_SURRENDER(inheritor))
275 		cpu_surrender(inheritor);
276 	thread_unlock(inheritor);
277 }
278 
279 /*
280  * Grab the lock protecting the hash chain for sobj
281  * and return the active turnstile for sobj, if any.
282  */
283 turnstile_t *
284 turnstile_lookup(void *sobj)
285 {
286 	turnstile_t *ts;
287 	turnstile_chain_t *tc = &TURNSTILE_CHAIN(sobj);
288 
289 	disp_lock_enter(&tc->tc_lock);
290 
291 	for (ts = tc->tc_first; ts != NULL; ts = ts->ts_next)
292 		if (ts->ts_sobj == sobj)
293 			break;
294 
295 	return (ts);
296 }
297 
298 /*
299  * Drop the lock protecting the hash chain for sobj.
300  */
301 void
302 turnstile_exit(void *sobj)
303 {
304 	disp_lock_exit(&TURNSTILE_CHAIN(sobj).tc_lock);
305 }
306 
307 /*
308  * When we apply priority inheritance, we must grab the owner's thread lock
309  * while already holding the waiter's thread lock.  If both thread locks are
310  * turnstile locks, this can lead to deadlock: while we hold L1 and try to
311  * grab L2, some unrelated thread may be applying priority inheritance to
312  * some other blocking chain, holding L2 and trying to grab L1.  The most
313  * obvious solution -- do a lock_try() for the owner lock -- isn't quite
314  * sufficient because it can cause livelock: each thread may hold one lock,
315  * try to grab the other, fail, bail out, and try again, looping forever.
316  * To prevent livelock we must define a winner, i.e. define an arbitrary
317  * lock ordering on the turnstile locks.  For simplicity we declare that
318  * virtual address order defines lock order, i.e. if L1 < L2, then the
319  * correct lock ordering is L1, L2.  Thus the thread that holds L1 and
320  * wants L2 should spin until L2 is available, but the thread that holds
321  * L2 and can't get L1 on the first try must drop L2 and return failure.
322  * Moreover, the losing thread must not reacquire L2 until the winning
323  * thread has had a chance to grab it; to ensure this, the losing thread
324  * must grab L1 after dropping L2, thus spinning until the winner is done.
325  * Complicating matters further, note that the owner's thread lock pointer
326  * can change (i.e. be pointed at a different lock) while we're trying to
327  * grab it.  If that happens, we must unwind our state and try again.
328  *
329  * On success, returns 1 with both locks held.
330  * On failure, returns 0 with neither lock held.
331  */
332 static int
333 turnstile_interlock(lock_t *wlp, lock_t *volatile *olpp)
334 {
335 	ASSERT(LOCK_HELD(wlp));
336 
337 	for (;;) {
338 		volatile lock_t *olp = *olpp;
339 
340 		/*
341 		 * If the locks are identical, there's nothing to do.
342 		 */
343 		if (olp == wlp)
344 			return (1);
345 		if (lock_try((lock_t *)olp)) {
346 			/*
347 			 * If 'olp' is still the right lock, return success.
348 			 * Otherwise, drop 'olp' and try the dance again.
349 			 */
350 			if (olp == *olpp)
351 				return (1);
352 			lock_clear((lock_t *)olp);
353 		} else {
354 			hrtime_t spin_time = 0;
355 			/*
356 			 * If we're grabbing the locks out of order, we lose.
357 			 * Drop the waiter's lock, and then grab and release
358 			 * the owner's lock to ensure that we won't retry
359 			 * until the winner is done (as described above).
360 			 */
361 			if (olp >= (lock_t *)turnstile_table && olp < wlp) {
362 				lock_clear(wlp);
363 				lock_set((lock_t *)olp);
364 				lock_clear((lock_t *)olp);
365 				return (0);
366 			}
367 			/*
368 			 * We're grabbing the locks in the right order,
369 			 * so spin until the owner's lock either becomes
370 			 * available or spontaneously changes.
371 			 */
372 			spin_time =
373 			    LOCKSTAT_START_TIME(LS_TURNSTILE_INTERLOCK_SPIN);
374 			while (olp == *olpp && LOCK_HELD(olp)) {
375 				if (panicstr)
376 					return (1);
377 				SMT_PAUSE();
378 			}
379 			LOCKSTAT_RECORD_TIME(LS_TURNSTILE_INTERLOCK_SPIN,
380 			    olp, spin_time);
381 		}
382 	}
383 }
384 
385 /*
386  * Block the current thread on a synchronization object.
387  *
388  * Turnstiles implement both kernel and user-level priority inheritance.
389  * To avoid missed wakeups in the user-level case, lwp_upimutex_lock() calls
390  * turnstile_block() holding the appropriate lock in the upimutextab (see
391  * the block comment in lwp_upimutex_lock() for details).  The held lock is
392  * passed to turnstile_block() as the "mp" parameter, and will be dropped
393  * after priority has been willed, but before the thread actually sleeps
394  * (this locking behavior leads to some subtle ordering issues; see the
395  * block comment on turnstile hashing for details).  This _must_ be the only
396  * lock held when calling turnstile_block() with a SOBJ_USER_PI sobj; holding
397  * other locks can result in panics due to cycles in the blocking chain.
398  *
399  * turnstile_block() always succeeds for kernel synchronization objects.
400  * For SOBJ_USER_PI locks the possible errors are EINTR for signals, and
401  * EDEADLK for cycles in the blocking chain. A return code of zero indicates
402  * *either* that the lock is now held, or that this is a spurious wake-up, or
403  * that the lock can never be held due to an ENOTRECOVERABLE error.
404  * It is up to lwp_upimutex_lock() to sort this all out.
405  */
406 
407 int
408 turnstile_block(turnstile_t *ts, int qnum, void *sobj, sobj_ops_t *sobj_ops,
409     kmutex_t *mp, lwp_timer_t *lwptp)
410 {
411 	kthread_t *owner;
412 	kthread_t *t = curthread;
413 	proc_t *p = ttoproc(t);
414 	klwp_t *lwp = ttolwp(t);
415 	turnstile_chain_t *tc = &TURNSTILE_CHAIN(sobj);
416 	int error = 0;
417 	int loser = 0;
418 
419 	ASSERT(DISP_LOCK_HELD(&tc->tc_lock));
420 	ASSERT(mp == NULL || IS_UPI(mp));
421 	ASSERT((SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) ^ (mp == NULL));
422 
423 	thread_lock_high(t);
424 
425 	if (ts == NULL) {
426 		/*
427 		 * This is the first thread to block on this sobj.
428 		 * Take its attached turnstile and add it to the hash chain.
429 		 */
430 		ts = t->t_ts;
431 		ts->ts_sobj = sobj;
432 		ts->ts_next = tc->tc_first;
433 		tc->tc_first = ts;
434 		ASSERT(ts->ts_waiters == 0);
435 	} else {
436 		/*
437 		 * Another thread has already donated its turnstile
438 		 * to block on this sobj, so ours isn't needed.
439 		 * Stash it on the active turnstile's freelist.
440 		 */
441 		turnstile_t *myts = t->t_ts;
442 		myts->ts_free = ts->ts_free;
443 		ts->ts_free = myts;
444 		t->t_ts = ts;
445 		ASSERT(ts->ts_sobj == sobj);
446 		ASSERT(ts->ts_waiters > 0);
447 	}
448 
449 	/*
450 	 * Put the thread to sleep.
451 	 */
452 	ASSERT(t != CPU->cpu_idle_thread);
453 	ASSERT(CPU_ON_INTR(CPU) == 0);
454 	ASSERT(t->t_wchan0 == NULL && t->t_wchan == NULL);
455 	ASSERT(t->t_state == TS_ONPROC);
456 
457 	if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
458 		curthread->t_flag |= T_WAKEABLE;
459 	}
460 	CL_SLEEP(t);		/* assign kernel priority */
461 	THREAD_SLEEP(t, &tc->tc_lock);
462 	t->t_wchan = sobj;
463 	t->t_sobj_ops = sobj_ops;
464 	DTRACE_SCHED(sleep);
465 
466 	if (lwp != NULL) {
467 		lwp->lwp_ru.nvcsw++;
468 		(void) new_mstate(t, LMS_SLEEP);
469 		if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
470 			lwp->lwp_asleep = 1;
471 			lwp->lwp_sysabort = 0;
472 			/*
473 			 * make wchan0 non-zero to conform to the rule that
474 			 * threads blocking for user-level objects have a
475 			 * non-zero wchan0: this prevents spurious wake-ups
476 			 * by, for example, /proc.
477 			 */
478 			t->t_wchan0 = (caddr_t)1;
479 		}
480 	}
481 	ts->ts_waiters++;
482 	sleepq_insert(&ts->ts_sleepq[qnum], t);
483 
484 	if (SOBJ_TYPE(sobj_ops) == SOBJ_MUTEX &&
485 	    SOBJ_OWNER(sobj_ops, sobj) == NULL)
486 		panic("turnstile_block(%p): unowned mutex", (void *)ts);
487 
488 	/*
489 	 * Follow the blocking chain to its end, willing our priority to
490 	 * everyone who's in our way.
491 	 */
492 	while (t->t_sobj_ops != NULL &&
493 	    (owner = SOBJ_OWNER(t->t_sobj_ops, t->t_wchan)) != NULL) {
494 		if (owner == curthread) {
495 			if (SOBJ_TYPE(sobj_ops) != SOBJ_USER_PI) {
496 				panic("Deadlock: cycle in blocking chain");
497 			}
498 			/*
499 			 * If the cycle we've encountered ends in mp,
500 			 * then we know it isn't a 'real' cycle because
501 			 * we're going to drop mp before we go to sleep.
502 			 * Moreover, since we've come full circle we know
503 			 * that we must have willed priority to everyone
504 			 * in our way.  Therefore, we can break out now.
505 			 */
506 			if (t->t_wchan == (void *)mp)
507 				break;
508 
509 			if (loser)
510 				lock_clear(&turnstile_loser_lock);
511 			/*
512 			 * For SOBJ_USER_PI, a cycle is an application
513 			 * deadlock which needs to be communicated
514 			 * back to the application.
515 			 */
516 			thread_unlock_nopreempt(t);
517 			mutex_exit(mp);
518 			setrun(curthread);
519 			swtch(); /* necessary to transition state */
520 			curthread->t_flag &= ~T_WAKEABLE;
521 			if (lwptp->lwpt_id != 0)
522 				(void) lwp_timer_dequeue(lwptp);
523 			setallwatch();
524 			lwp->lwp_asleep = 0;
525 			lwp->lwp_sysabort = 0;
526 			return (EDEADLK);
527 		}
528 		if (!turnstile_interlock(t->t_lockp, &owner->t_lockp)) {
529 			/*
530 			 * If we failed to grab the owner's thread lock,
531 			 * turnstile_interlock() will have dropped t's
532 			 * thread lock, so at this point we don't even know
533 			 * that 't' exists anymore.  The simplest solution
534 			 * is to restart the entire priority inheritance dance
535 			 * from the beginning of the blocking chain, since
536 			 * we *do* know that 'curthread' still exists.
537 			 * Application of priority inheritance is idempotent,
538 			 * so it's OK that we're doing it more than once.
539 			 * Note also that since we've dropped our thread lock,
540 			 * we may already have been woken up; if so, our
541 			 * t_sobj_ops will be NULL, the loop will terminate,
542 			 * and the call to swtch() will be a no-op.  Phew.
543 			 *
544 			 * There is one further complication: if two (or more)
545 			 * threads keep trying to grab the turnstile locks out
546 			 * of order and keep losing the race to another thread,
547 			 * these "dueling losers" can livelock the system.
548 			 * Therefore, once we get into this rare situation,
549 			 * we serialize all the losers.
550 			 */
551 			if (loser == 0) {
552 				loser = 1;
553 				lock_set(&turnstile_loser_lock);
554 			}
555 			t = curthread;
556 			thread_lock_high(t);
557 			continue;
558 		}
559 
560 		/*
561 		 * We now have the owner's thread lock.  If we are traversing
562 		 * from non-SOBJ_USER_PI ops to SOBJ_USER_PI ops, then we know
563 		 * that we have caught the thread while in the TS_SLEEP state,
564 		 * but holding mp.  We know that this situation is transient
565 		 * (mp will be dropped before the holder actually sleeps on
566 		 * the SOBJ_USER_PI sobj), so we will spin waiting for mp to
567 		 * be dropped.  Then, as in the turnstile_interlock() failure
568 		 * case, we will restart the priority inheritance dance.
569 		 */
570 		if (SOBJ_TYPE(t->t_sobj_ops) != SOBJ_USER_PI &&
571 		    owner->t_sobj_ops != NULL &&
572 		    SOBJ_TYPE(owner->t_sobj_ops) == SOBJ_USER_PI) {
573 			kmutex_t *upi_lock = (kmutex_t *)t->t_wchan;
574 
575 			ASSERT(IS_UPI(upi_lock));
576 			ASSERT(SOBJ_TYPE(t->t_sobj_ops) == SOBJ_MUTEX);
577 
578 			if (t->t_lockp != owner->t_lockp)
579 				thread_unlock_high(owner);
580 			thread_unlock_high(t);
581 			if (loser)
582 				lock_clear(&turnstile_loser_lock);
583 
584 			while (mutex_owner(upi_lock) == owner) {
585 				SMT_PAUSE();
586 				continue;
587 			}
588 
589 			if (loser)
590 				lock_set(&turnstile_loser_lock);
591 			t = curthread;
592 			thread_lock_high(t);
593 			continue;
594 		}
595 
596 		turnstile_pi_inherit(t->t_ts, owner, DISP_PRIO(t));
597 		if (t->t_lockp != owner->t_lockp)
598 			thread_unlock_high(t);
599 		t = owner;
600 	}
601 
602 	if (loser)
603 		lock_clear(&turnstile_loser_lock);
604 
605 	/*
606 	 * Note: 't' and 'curthread' were synonymous before the loop above,
607 	 * but now they may be different.  ('t' is now the last thread in
608 	 * the blocking chain.)
609 	 */
610 	if (SOBJ_TYPE(sobj_ops) == SOBJ_USER_PI) {
611 		ushort_t s = curthread->t_oldspl;
612 		int timedwait = 0;
613 		uint_t imm_timeout = 0;
614 		clock_t tim = -1;
615 
616 		thread_unlock_high(t);
617 		if (lwptp->lwpt_id != 0) {
618 			/*
619 			 * We enqueued a timeout.  If it has already fired,
620 			 * lwptp->lwpt_imm_timeout has been set with cas,
621 			 * so fetch it with cas.
622 			 */
623 			timedwait = 1;
624 			imm_timeout =
625 			    atomic_cas_uint(&lwptp->lwpt_imm_timeout, 0, 0);
626 		}
627 		mutex_exit(mp);
628 		splx(s);
629 
630 		if (ISSIG(curthread, JUSTLOOKING) ||
631 		    MUSTRETURN(p, curthread) || imm_timeout)
632 			setrun(curthread);
633 		swtch();
634 		curthread->t_flag &= ~T_WAKEABLE;
635 		if (timedwait)
636 			tim = lwp_timer_dequeue(lwptp);
637 		setallwatch();
638 		if (ISSIG(curthread, FORREAL) || lwp->lwp_sysabort ||
639 		    MUSTRETURN(p, curthread))
640 			error = EINTR;
641 		else if (imm_timeout || (timedwait && tim == -1))
642 			error = ETIME;
643 		lwp->lwp_sysabort = 0;
644 		lwp->lwp_asleep = 0;
645 	} else {
646 		thread_unlock_nopreempt(t);
647 		swtch();
648 	}
649 
650 	return (error);
651 }
652 
653 /*
654  * Remove thread from specified turnstile sleep queue; retrieve its
655  * free turnstile; if it is the last waiter, delete the turnstile
656  * from the turnstile chain and if there is an inheritor, delete it
657  * from the inheritor's t_prioinv chain.
658  */
659 static void
660 turnstile_dequeue(kthread_t *t)
661 {
662 	turnstile_t *ts = t->t_ts;
663 	turnstile_chain_t *tc = &TURNSTILE_CHAIN(ts->ts_sobj);
664 	turnstile_t *tsfree, **tspp;
665 
666 	ASSERT(DISP_LOCK_HELD(&tc->tc_lock));
667 	ASSERT(t->t_lockp == &tc->tc_lock);
668 
669 	if ((tsfree = ts->ts_free) != NULL) {
670 		ASSERT(ts->ts_waiters > 1);
671 		ASSERT(tsfree->ts_waiters == 0);
672 		t->t_ts = tsfree;
673 		ts->ts_free = tsfree->ts_free;
674 		tsfree->ts_free = NULL;
675 	} else {
676 		/*
677 		 * The active turnstile's freelist is empty, so this
678 		 * must be the last waiter.  Remove the turnstile
679 		 * from the hash chain and leave the now-inactive
680 		 * turnstile attached to the thread we're waking.
681 		 * Note that the ts_inheritor for the turnstile
682 		 * may be NULL. If one exists, its t_prioinv
683 		 * chain has to be updated.
684 		 */
685 		ASSERT(ts->ts_waiters == 1);
686 		if (ts->ts_inheritor != NULL) {
687 			(void) turnstile_pi_tsdelete(ts, ts->ts_inheritor);
688 			/*
689 			 * If we ever do a "disinherit" or "unboost", we need
690 			 * to do it only if "t" is a thread at the head of the
691 			 * sleep queue. Since the sleep queue is prioritized,
692 			 * the disinherit is necessary only if the interrupted
693 			 * thread is the highest priority thread.
694 			 * Otherwise, there is a higher priority thread blocked
695 			 * on the turnstile, whose inheritance cannot be
696 			 * disinherited. However, disinheriting is explicitly
697 			 * not done here, since it would require holding the
698 			 * inheritor's thread lock (see turnstile_unsleep()).
699 			 */
700 			ts->ts_inheritor = NULL;
701 		}
702 		tspp = &tc->tc_first;
703 		while (*tspp != ts)
704 			tspp = &(*tspp)->ts_next;
705 		*tspp = ts->ts_next;
706 		ASSERT(t->t_ts == ts);
707 	}
708 	ts->ts_waiters--;
709 	sleepq_dequeue(t);
710 	t->t_sobj_ops = NULL;
711 	t->t_wchan = NULL;
712 	t->t_wchan0 = NULL;
713 	ASSERT(t->t_state == TS_SLEEP);
714 }
715 
716 /*
717  * Wake threads that are blocked in a turnstile.
718  */
719 void
720 turnstile_wakeup(turnstile_t *ts, int qnum, int nthreads, kthread_t *owner)
721 {
722 	turnstile_chain_t *tc = &TURNSTILE_CHAIN(ts->ts_sobj);
723 	sleepq_t *sqp = &ts->ts_sleepq[qnum];
724 
725 	ASSERT(DISP_LOCK_HELD(&tc->tc_lock));
726 
727 	/*
728 	 * Waive any priority we may have inherited from this turnstile.
729 	 */
730 	if (ts->ts_inheritor != NULL) {
731 		turnstile_pi_waive(ts);
732 	}
733 	while (nthreads-- > 0) {
734 		kthread_t *t = sqp->sq_first;
735 		ASSERT(t->t_ts == ts);
736 		ASSERT(ts->ts_waiters > 1 || ts->ts_inheritor == NULL);
737 		DTRACE_SCHED1(wakeup, kthread_t *, t);
738 		turnstile_dequeue(t);
739 		CL_WAKEUP(t); /* previous thread lock, tc_lock, not dropped */
740 		/*
741 		 * If the caller did direct handoff of ownership,
742 		 * make the new owner inherit from this turnstile.
743 		 */
744 		if (t == owner) {
745 			kthread_t *wp = ts->ts_sleepq[TS_WRITER_Q].sq_first;
746 			kthread_t *rp = ts->ts_sleepq[TS_READER_Q].sq_first;
747 			pri_t wpri = wp ? DISP_PRIO(wp) : 0;
748 			pri_t rpri = rp ? DISP_PRIO(rp) : 0;
749 			turnstile_pi_inherit(ts, t, MAX(wpri, rpri));
750 			owner = NULL;
751 		}
752 		thread_unlock_high(t);		/* drop run queue lock */
753 	}
754 	if (owner != NULL)
755 		panic("turnstile_wakeup: owner %p not woken", owner);
756 	disp_lock_exit(&tc->tc_lock);
757 }
758 
759 /*
760  * Change priority of a thread sleeping in a turnstile.
761  */
762 void
763 turnstile_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
764 {
765 	sleepq_t *sqp = t->t_sleepq;
766 
767 	sleepq_dequeue(t);
768 	*t_prip = pri;
769 	sleepq_insert(sqp, t);
770 }
771 
772 /*
773  * We don't allow spurious wakeups of threads blocked in turnstiles
774  * for synch objects whose sobj_ops vector is initialized with the
775  * following routine (e.g. kernel synchronization objects).
776  * This is vital to the correctness of direct-handoff logic in some
777  * synchronization primitives, and it also simplifies the PI logic.
778  */
779 /* ARGSUSED */
780 void
781 turnstile_stay_asleep(kthread_t *t)
782 {
783 }
784 
785 /*
786  * Wake up a thread blocked in a turnstile. Used to enable interruptibility
787  * of threads blocked on a SOBJ_USER_PI sobj.
788  *
789  * The implications of this interface are:
790  *
791  * 1. turnstile_block() may return with an EINTR.
792  * 2. When the owner of an sobj releases it, but no turnstile is found (i.e.
793  *    no waiters), the (prior) owner must call turnstile_pi_recalc() to
794  *    waive any priority inherited from interrupted waiters.
795  *
796  * When a waiter is interrupted, disinheriting its willed priority from the
797  * inheritor would require holding the inheritor's thread lock, while also
798  * holding the waiter's thread lock which is a turnstile lock. If the
799  * inheritor's thread lock is not free, and is also a turnstile lock that
800  * is out of lock order, the waiter's thread lock would have to be dropped.
801  * This leads to complications for the caller of turnstile_unsleep(), since
802  * the caller holds the waiter's thread lock. So, instead of disinheriting
803  * on waiter interruption, the owner is required to follow rule 2 above.
804  *
805  * Avoiding disinherit on waiter interruption seems acceptable because
806  * the owner runs at an unnecessarily high priority only while sobj is held,
807  * which it would have done in any case, if the waiter had not been interrupted.
808  */
809 void
810 turnstile_unsleep(kthread_t *t)
811 {
812 	turnstile_dequeue(t);
813 	THREAD_TRANSITION(t);
814 	CL_SETRUN(t);
815 }
816