/*-
 * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Berkeley Software Design Inc's name may not be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
 *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
 */

/*
 * Implementation of turnstiles used to hold queues of threads blocked on
 * non-sleepable locks.  Sleepable locks use condition variables to
 * implement their queues.  Turnstiles differ from sleep queues in that
 * turnstile queues are assigned to a lock held by an owning thread.  Thus,
 * when one thread is enqueued onto a turnstile, it can lend its priority
 * to the owning thread.
 *
 * We wish to avoid bloating locks with an embedded turnstile and we do not
 * want to use back-pointers in the locks for the same reason.  Thus, we
 * use a similar approach to that of Solaris 7 as described in Solaris
 * Internals by Jim Mauro and Richard McDougall.  Turnstiles are looked up
 * in a hash table based on the address of the lock.  Each entry in the
 * hash table is a linked list of turnstiles and is called a turnstile
 * chain.  Each chain contains a spin mutex that protects all of the
 * turnstiles in the chain.
 *
 * Each time a thread is created, a turnstile is malloc'd and attached to
 * that thread.  When a thread blocks on a lock, if it is the first thread
 * to block, it lends its turnstile to the lock.  If the lock already has
 * a turnstile, then it gives its turnstile to the free list hung off of
 * the lock's turnstile.  When a thread is woken up, it takes a turnstile
 * from the free list if there are any other waiters.  If it is the only
 * thread blocked on the lock, then it reclaims the turnstile associated
 * with the lock and removes it from the hash table.
 */
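
/*
 * A simplified, illustrative sketch (not taken verbatim from any lock
 * implementation) of how a blocking lock typically drives this interface.
 * 'lo' is assumed to be the contested lock's struct lock_object and 'owner'
 * the thread currently holding the lock.  On a failed acquire:
 *
 *	turnstile_lock(lo);
 *	if (the lock can now be acquired) {
 *		turnstile_release(lo);
 *		return;
 *	}
 *	turnstile_wait(lo, owner, TS_EXCLUSIVE_QUEUE);
 *
 * and on release of a contested lock:
 *
 *	turnstile_lock(lo);
 *	ts = turnstile_lookup(lo);
 *	turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
 *	turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
 *
 * See the actual lock implementations (e.g. kern_mutex.c) for the
 * authoritative usage.
 */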

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_turnstile_profiling.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/turnstile.h>

#ifdef DDB
#include <sys/kdb.h>
#include <ddb/ddb.h>
#endif

/*
 * Constants for the hash table of turnstile chains.  TC_SHIFT is a magic
 * number chosen because the sleep queues use the same value for the
 * shift.  Basically, we ignore the lower 8 bits of the address.
 * TC_TABLESIZE must be a power of two for TC_MASK to work properly.
 */
#define	TC_TABLESIZE	128			/* Must be power of 2. */
#define	TC_MASK		(TC_TABLESIZE - 1)
#define	TC_SHIFT	8
#define	TC_HASH(lock)	(((uintptr_t)(lock) >> TC_SHIFT) & TC_MASK)
#define	TC_LOOKUP(lock)	&turnstile_chains[TC_HASH(lock)]
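
/*
 * For example (hypothetical address): a lock object at 0xc0a12345 hashes
 * to ((0xc0a12345 >> 8) & 0x7f) == 0x23, so it is chained on
 * turnstile_chains[35].
 */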

/*
 * There are three different lists of turnstiles as follows.  The list
 * connected by ts_link entries is a per-thread list of all the turnstiles
 * attached to locks that we own.  This is used to fixup our priority when
 * a lock is released.  The other two lists use the ts_hash entries.  The
 * first of these two is the turnstile chain list that a turnstile is on
 * when it is attached to a lock.  The second list to use ts_hash is the
 * free list hung off of a turnstile that is attached to a lock.
 *
 * Each turnstile contains three lists of threads.  The two ts_blocked lists
 * are linked lists of threads blocked on the turnstile's lock.  One list is
 * for exclusive waiters, and the other is for shared waiters.  The
 * ts_pending list is a linked list of threads previously awakened by
 * turnstile_signal() or turnstile_broadcast() that are waiting to be put on
 * the run queue.
 *
 * Locking key:
 *  c - turnstile chain lock
 *  q - td_contested lock
 */
struct turnstile {
	struct threadqueue ts_blocked[2];	/* (c + q) Blocked threads. */
	struct threadqueue ts_pending;		/* (c) Pending threads. */
	LIST_ENTRY(turnstile) ts_hash;		/* (c) Chain and free list. */
	LIST_ENTRY(turnstile) ts_link;		/* (q) Contested locks. */
	LIST_HEAD(, turnstile) ts_free;		/* (c) Free turnstiles. */
	struct lock_object *ts_lockobj;		/* (c) Lock we reference. */
	struct thread *ts_owner;		/* (c + q) Who owns the lock. */
};

struct turnstile_chain {
	LIST_HEAD(, turnstile) tc_turnstiles;	/* List of turnstiles. */
	struct mtx tc_lock;			/* Spin lock for this chain. */
#ifdef TURNSTILE_PROFILING
	u_int	tc_depth;			/* Length of tc_turnstiles. */
	u_int	tc_max_depth;			/* Max length of tc_turnstiles. */
#endif
};

#ifdef TURNSTILE_PROFILING
u_int turnstile_max_depth;
SYSCTL_NODE(_debug, OID_AUTO, turnstile, CTLFLAG_RD, 0, "turnstile profiling");
SYSCTL_NODE(_debug_turnstile, OID_AUTO, chains, CTLFLAG_RD, 0,
    "turnstile chain stats");
SYSCTL_UINT(_debug_turnstile, OID_AUTO, max_depth, CTLFLAG_RD,
    &turnstile_max_depth, 0, "maximum depth achieved of a single chain");
#endif
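
/*
 * The td_contested lock protects each thread's list of owned contested
 * turnstiles (the ts_link lists) and, together with the chain locks, the
 * ordering of the ts_blocked queues; see the locking key above.
 */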
static struct mtx td_contested_lock;
static struct turnstile_chain turnstile_chains[TC_TABLESIZE];

static MALLOC_DEFINE(M_TURNSTILE, "turnstiles", "turnstiles");

/*
 * Prototypes for non-exported routines.
 */
static void	init_turnstile0(void *dummy);
#ifdef TURNSTILE_PROFILING
static void	init_turnstile_profiling(void *arg);
#endif
static void	propagate_priority(struct thread *td);
static int	turnstile_adjust_thread(struct turnstile *ts,
		    struct thread *td);
static struct thread *turnstile_first_waiter(struct turnstile *ts);
static void	turnstile_setowner(struct turnstile *ts, struct thread *owner);

/*
 * Walks the chain of turnstiles and their owners to propagate the priority
 * of the thread being blocked to all the threads holding locks that have to
 * release their locks before this thread can run again.
 */
static void
propagate_priority(struct thread *td)
{
	struct turnstile_chain *tc;
	struct turnstile *ts;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	pri = td->td_priority;
	ts = td->td_blocked;
	for (;;) {
		td = ts->ts_owner;

		if (td == NULL) {
			/*
			 * This might be a read lock with no owner.  There's
			 * not much we can do, so just bail.
			 */
			return;
		}

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		/*
		 * If the thread is asleep, then we are probably about
		 * to deadlock.  To make debugging this easier, just
		 * panic and tell the user which thread misbehaved so
		 * they can hopefully get a stack trace from the truly
		 * misbehaving thread.
		 */
		if (TD_IS_SLEEPING(td)) {
			printf(
		"Sleeping thread (tid %d, pid %d) owns a non-sleepable lock\n",
			    td->td_tid, td->td_proc->p_pid);
#ifdef DDB
			db_trace_thread(td, -1);
#endif
			panic("sleeping thread");
		}

		/*
		 * If this thread already has higher priority than the
		 * thread that is being blocked, we are finished.
		 */
		if (td->td_priority <= pri)
			return;

		/*
		 * Bump this thread's priority.
		 */
		sched_lend_prio(td, pri);

		/*
		 * If the lock holder is actually running or on the run queue
		 * then we are done.
		 */
		if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) {
			MPASS(td->td_blocked == NULL);
			return;
		}

#ifndef SMP
		/*
		 * For UP, we check to see if td is curthread (this shouldn't
		 * ever happen, however, as it would mean we are in a
		 * deadlock).
		 */
		KASSERT(td != curthread, ("Deadlock detected"));
#endif

		/*
		 * If we aren't blocked on a lock, we should be.
		 */
		KASSERT(TD_ON_LOCK(td), (
		    "thread %d(%s):%d holds %s but isn't blocked on a lock\n",
		    td->td_tid, td->td_proc->p_comm, td->td_state,
		    ts->ts_lockobj->lo_name));

		/*
		 * Pick up the lock that td is blocked on.
		 */
		ts = td->td_blocked;
		MPASS(ts != NULL);
		tc = TC_LOOKUP(ts->ts_lockobj);
		mtx_lock_spin(&tc->tc_lock);

		/* Resort td on the list if needed. */
		if (!turnstile_adjust_thread(ts, td)) {
			mtx_unlock_spin(&tc->tc_lock);
			return;
		}
		mtx_unlock_spin(&tc->tc_lock);
	}
}

/*
 * Adjust the thread's position on a turnstile after its priority has been
 * changed.
 */
static int
turnstile_adjust_thread(struct turnstile *ts, struct thread *td)
{
	struct turnstile_chain *tc;
	struct thread *td1, *td2;
	int queue;

	mtx_assert(&sched_lock, MA_OWNED);
	MPASS(TD_ON_LOCK(td));

	/*
	 * This thread may not be blocked on this turnstile anymore
	 * but instead might already be woken up on another CPU
	 * that is waiting on sched_lock in turnstile_unpend() to
	 * finish waking this thread up.  We can detect this case
	 * by checking to see if this thread has been given a
	 * turnstile by either turnstile_signal() or
	 * turnstile_broadcast().  In this case, treat the thread as
	 * if it was already running.
	 */
	if (td->td_turnstile != NULL)
		return (0);

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	tc = TC_LOOKUP(ts->ts_lockobj);
	mtx_assert(&tc->tc_lock, MA_OWNED);
	td1 = TAILQ_PREV(td, threadqueue, td_lockq);
	td2 = TAILQ_NEXT(td, td_lockq);
	if ((td1 != NULL && td->td_priority < td1->td_priority) ||
	    (td2 != NULL && td->td_priority > td2->td_priority)) {

		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		queue = td->td_tsqueue;
		MPASS(queue == TS_EXCLUSIVE_QUEUE || queue == TS_SHARED_QUEUE);
		mtx_lock_spin(&td_contested_lock);
		TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq);
		TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) {
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (td1->td_priority > td->td_priority)
				break;
		}

		if (td1 == NULL)
			TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
		else
			TAILQ_INSERT_BEFORE(td1, td, td_lockq);
		mtx_unlock_spin(&td_contested_lock);
		if (td1 == NULL)
			CTR3(KTR_LOCK,
		    "turnstile_adjust_thread: td %d put at tail on [%p] %s",
			    td->td_tid, ts->ts_lockobj, ts->ts_lockobj->lo_name);
		else
			CTR4(KTR_LOCK,
		    "turnstile_adjust_thread: td %d moved before %d on [%p] %s",
			    td->td_tid, td1->td_tid, ts->ts_lockobj,
			    ts->ts_lockobj->lo_name);
	}
	return (1);
}

/*
 * Early initialization of turnstiles.  This is not done via a SYSINIT()
 * since this needs to be initialized very early when mutexes are first
 * initialized.
 */
void
init_turnstiles(void)
{
	int i;

	for (i = 0; i < TC_TABLESIZE; i++) {
		LIST_INIT(&turnstile_chains[i].tc_turnstiles);
		mtx_init(&turnstile_chains[i].tc_lock, "turnstile chain",
		    NULL, MTX_SPIN);
	}
	mtx_init(&td_contested_lock, "td_contested", NULL, MTX_SPIN);
	LIST_INIT(&thread0.td_contested);
	thread0.td_turnstile = NULL;
}

#ifdef TURNSTILE_PROFILING
static void
init_turnstile_profiling(void *arg)
{
	struct sysctl_oid *chain_oid;
	char chain_name[10];
	int i;

	for (i = 0; i < TC_TABLESIZE; i++) {
		snprintf(chain_name, sizeof(chain_name), "%d", i);
		chain_oid = SYSCTL_ADD_NODE(NULL,
		    SYSCTL_STATIC_CHILDREN(_debug_turnstile_chains), OID_AUTO,
		    chain_name, CTLFLAG_RD, NULL, "turnstile chain stats");
		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
		    "depth", CTLFLAG_RD, &turnstile_chains[i].tc_depth, 0,
		    NULL);
		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
		    "max_depth", CTLFLAG_RD, &turnstile_chains[i].tc_max_depth,
		    0, NULL);
	}
}
SYSINIT(turnstile_profiling, SI_SUB_LOCK, SI_ORDER_ANY,
    init_turnstile_profiling, NULL);
#endif

static void
init_turnstile0(void *dummy)
{

	thread0.td_turnstile = turnstile_alloc();
}
SYSINIT(turnstile0, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile0, NULL);

/*
 * Update a thread on the turnstile list after its priority has been changed.
 * The old priority is passed in as an argument.
 */
void
turnstile_adjust(struct thread *td, u_char oldpri)
{
	struct turnstile_chain *tc;
	struct turnstile *ts;

	mtx_assert(&sched_lock, MA_OWNED);
	MPASS(TD_ON_LOCK(td));

	/*
	 * Pick up the lock that td is blocked on.
	 */
	ts = td->td_blocked;
	MPASS(ts != NULL);
	tc = TC_LOOKUP(ts->ts_lockobj);
	mtx_lock_spin(&tc->tc_lock);

	/* Resort the turnstile on the list. */
	if (!turnstile_adjust_thread(ts, td)) {
		mtx_unlock_spin(&tc->tc_lock);
		return;
	}

	/*
	 * If our priority was raised (i.e. our priority value was lowered)
	 * and we are at the head of the turnstile, then propagate our new
	 * priority up the chain.  Note that we currently don't try to
	 * revoke lent priorities when our priority value goes back up.
	 */
	MPASS(td->td_tsqueue == TS_EXCLUSIVE_QUEUE ||
	    td->td_tsqueue == TS_SHARED_QUEUE);
	if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) &&
	    td->td_priority < oldpri) {
		mtx_unlock_spin(&tc->tc_lock);
		propagate_priority(td);
	} else
		mtx_unlock_spin(&tc->tc_lock);
}

/*
 * Set the owner of the lock this turnstile is attached to.
 */
static void
turnstile_setowner(struct turnstile *ts, struct thread *owner)
{

	mtx_assert(&td_contested_lock, MA_OWNED);
	MPASS(ts->ts_owner == NULL);

	/* A shared lock might not have an owner. */
	if (owner == NULL)
		return;

	MPASS(owner->td_proc->p_magic == P_MAGIC);
	ts->ts_owner = owner;
	LIST_INSERT_HEAD(&owner->td_contested, ts, ts_link);
}

/*
 * Malloc a turnstile for a new thread, initialize it and return it.
 */
struct turnstile *
turnstile_alloc(void)
{
	struct turnstile *ts;

	ts = malloc(sizeof(struct turnstile), M_TURNSTILE, M_WAITOK | M_ZERO);
	TAILQ_INIT(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]);
	TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]);
	TAILQ_INIT(&ts->ts_pending);
	LIST_INIT(&ts->ts_free);
	return (ts);
}

/*
 * Free a turnstile when a thread is destroyed.
 */
void
turnstile_free(struct turnstile *ts)
{

	MPASS(ts != NULL);
	MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]));
	MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]));
	MPASS(TAILQ_EMPTY(&ts->ts_pending));
	free(ts, M_TURNSTILE);
}

/*
 * Lock the turnstile chain associated with the specified lock.
 */
void
turnstile_lock(struct lock_object *lock)
{
	struct turnstile_chain *tc;

	tc = TC_LOOKUP(lock);
	mtx_lock_spin(&tc->tc_lock);
}

/*
 * Look up the turnstile for a lock in the hash table.  The turnstile
 * chain associated with the lock must already be locked.  If no turnstile
 * is found in the hash table, NULL is returned.
 */
struct turnstile *
turnstile_lookup(struct lock_object *lock)
{
	struct turnstile_chain *tc;
	struct turnstile *ts;

	tc = TC_LOOKUP(lock);
	mtx_assert(&tc->tc_lock, MA_OWNED);
	LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
		if (ts->ts_lockobj == lock)
			return (ts);
	return (NULL);
}

/*
 * Unlock the turnstile chain associated with a given lock.
 */
void
turnstile_release(struct lock_object *lock)
{
	struct turnstile_chain *tc;

	tc = TC_LOOKUP(lock);
	mtx_unlock_spin(&tc->tc_lock);
}

/*
 * Return a pointer to the thread waiting on this turnstile with the
 * most important (numerically lowest) priority or NULL if the turnstile
 * has no waiters.
 */
static struct thread *
turnstile_first_waiter(struct turnstile *ts)
{
	struct thread *std, *xtd;

	std = TAILQ_FIRST(&ts->ts_blocked[TS_SHARED_QUEUE]);
	xtd = TAILQ_FIRST(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]);
	if (xtd == NULL || (std != NULL && std->td_priority < xtd->td_priority))
		return (std);
	return (xtd);
}

/*
 * Take ownership of a turnstile and adjust the priority of the new
 * owner appropriately.
 */
void
turnstile_claim(struct lock_object *lock)
{
	struct turnstile_chain *tc;
	struct turnstile *ts;
	struct thread *td, *owner;

	tc = TC_LOOKUP(lock);
	mtx_assert(&tc->tc_lock, MA_OWNED);
	ts = turnstile_lookup(lock);
	MPASS(ts != NULL);

	owner = curthread;
	mtx_lock_spin(&td_contested_lock);
	turnstile_setowner(ts, owner);
	mtx_unlock_spin(&td_contested_lock);

	td = turnstile_first_waiter(ts);
	MPASS(td != NULL);
	MPASS(td->td_proc->p_magic == P_MAGIC);
	mtx_unlock_spin(&tc->tc_lock);

	/*
	 * Update the priority of the new owner if needed.
	 */
	mtx_lock_spin(&sched_lock);
	if (td->td_priority < owner->td_priority)
		sched_lend_prio(owner, td->td_priority);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Block the current thread on the turnstile associated with 'lock'.  This
 * function will context switch and not return until this thread has been
 * woken back up.  This function must be called with the appropriate
 * turnstile chain locked and will return with it unlocked.
 */
void
turnstile_wait(struct lock_object *lock, struct thread *owner, int queue)
{
	struct turnstile_chain *tc;
	struct turnstile *ts;
	struct thread *td, *td1;

	td = curthread;
	tc = TC_LOOKUP(lock);
	mtx_assert(&tc->tc_lock, MA_OWNED);
	MPASS(td->td_turnstile != NULL);
	if (queue == TS_SHARED_QUEUE)
		MPASS(owner != NULL);
	if (owner)
		MPASS(owner->td_proc->p_magic == P_MAGIC);
	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);

	/* Look up the turnstile associated with the lock 'lock'. */
	ts = turnstile_lookup(lock);

	/*
	 * If the lock does not already have a turnstile, use this thread's
	 * turnstile.  Otherwise insert the current thread into the
	 * turnstile already in use by this lock.
	 */
	if (ts == NULL) {
#ifdef TURNSTILE_PROFILING
		tc->tc_depth++;
		if (tc->tc_depth > tc->tc_max_depth) {
			tc->tc_max_depth = tc->tc_depth;
			if (tc->tc_max_depth > turnstile_max_depth)
				turnstile_max_depth = tc->tc_max_depth;
		}
#endif
		ts = td->td_turnstile;
		LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash);
		KASSERT(TAILQ_EMPTY(&ts->ts_pending),
		    ("thread's turnstile has pending threads"));
		KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]),
		    ("thread's turnstile has exclusive waiters"));
		KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]),
		    ("thread's turnstile has shared waiters"));
		KASSERT(LIST_EMPTY(&ts->ts_free),
		    ("thread's turnstile has a non-empty free list"));
		KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer"));
		ts->ts_lockobj = lock;
		mtx_lock_spin(&td_contested_lock);
		TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
		turnstile_setowner(ts, owner);
		mtx_unlock_spin(&td_contested_lock);
	} else {
		TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq)
			if (td1->td_priority > td->td_priority)
				break;
		mtx_lock_spin(&td_contested_lock);
		if (td1 != NULL)
			TAILQ_INSERT_BEFORE(td1, td, td_lockq);
		else
			TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
		MPASS(owner == ts->ts_owner);
		mtx_unlock_spin(&td_contested_lock);
		MPASS(td->td_turnstile != NULL);
		LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash);
	}
	td->td_turnstile = NULL;
	mtx_unlock_spin(&tc->tc_lock);

	mtx_lock_spin(&sched_lock);
	/*
	 * Handle race condition where a thread on another CPU that owns
	 * lock 'lock' could have woken us in between us dropping the
	 * turnstile chain lock and acquiring the sched_lock.
	 */
	if (td->td_flags & TDF_TSNOBLOCK) {
		td->td_flags &= ~TDF_TSNOBLOCK;
		mtx_unlock_spin(&sched_lock);
		return;
	}

#ifdef notyet
	/*
	 * If we're borrowing an interrupted thread's VM context, we
	 * must clean up before going to sleep.
	 */
	if (td->td_ithd != NULL) {
		struct ithd *it = td->td_ithd;

		if (it->it_interrupted) {
			if (LOCK_LOG_TEST(lock, 0))
				CTR3(KTR_LOCK, "%s: %p interrupted %p",
				    __func__, it, it->it_interrupted);
			intr_thd_fixup(it);
		}
	}
#endif

	/* Save who we are blocked on and switch. */
	td->td_tsqueue = queue;
	td->td_blocked = ts;
	td->td_lockname = lock->lo_name;
	TD_SET_LOCK(td);
	propagate_priority(td);

	if (LOCK_LOG_TEST(lock, 0))
		CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__,
		    td->td_tid, lock, lock->lo_name);

	mi_switch(SW_VOL, NULL);

	if (LOCK_LOG_TEST(lock, 0))
		CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s",
		    __func__, td->td_tid, lock, lock->lo_name);

	mtx_unlock_spin(&sched_lock);
}

/*
 * Pick the highest priority thread on this turnstile and put it on the
 * pending list.  This must be called with the turnstile chain locked.
 */
int
turnstile_signal(struct turnstile *ts, int queue)
{
	struct turnstile_chain *tc;
	struct thread *td;
	int empty;

	MPASS(ts != NULL);
	MPASS(curthread->td_proc->p_magic == P_MAGIC);
	MPASS(ts->ts_owner == curthread ||
	    (queue == TS_EXCLUSIVE_QUEUE && ts->ts_owner == NULL));
	tc = TC_LOOKUP(ts->ts_lockobj);
	mtx_assert(&tc->tc_lock, MA_OWNED);
	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);

	/*
	 * Pick the highest priority thread blocked on this lock and
	 * move it to the pending list.
	 */
	td = TAILQ_FIRST(&ts->ts_blocked[queue]);
	MPASS(td->td_proc->p_magic == P_MAGIC);
	mtx_lock_spin(&td_contested_lock);
	TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq);
	mtx_unlock_spin(&td_contested_lock);
	TAILQ_INSERT_TAIL(&ts->ts_pending, td, td_lockq);

	/*
	 * If the turnstile is now empty, remove it from its chain and
	 * give it to the about-to-be-woken thread.  Otherwise take a
	 * turnstile from the free list and give it to the thread.
	 */
	empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) &&
	    TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]);
	if (empty) {
		MPASS(LIST_EMPTY(&ts->ts_free));
#ifdef TURNSTILE_PROFILING
		tc->tc_depth--;
#endif
	} else
		ts = LIST_FIRST(&ts->ts_free);
	MPASS(ts != NULL);
	LIST_REMOVE(ts, ts_hash);
	td->td_turnstile = ts;

	return (empty);
}

/*
 * Put all blocked threads on the pending list.  This must be called with
 * the turnstile chain locked.
 */
void
turnstile_broadcast(struct turnstile *ts, int queue)
{
	struct turnstile_chain *tc;
	struct turnstile *ts1;
	struct thread *td;

	MPASS(ts != NULL);
	MPASS(curthread->td_proc->p_magic == P_MAGIC);
	MPASS(ts->ts_owner == curthread ||
	    (queue == TS_EXCLUSIVE_QUEUE && ts->ts_owner == NULL));
	tc = TC_LOOKUP(ts->ts_lockobj);
	mtx_assert(&tc->tc_lock, MA_OWNED);
	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);

	/*
	 * Transfer the blocked list to the pending list.
	 */
	mtx_lock_spin(&td_contested_lock);
	TAILQ_CONCAT(&ts->ts_pending, &ts->ts_blocked[queue], td_lockq);
	mtx_unlock_spin(&td_contested_lock);

	/*
	 * Give a turnstile to each thread.  The last thread gets
	 * this turnstile if the turnstile is empty.
	 */
	TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) {
		if (LIST_EMPTY(&ts->ts_free)) {
			MPASS(TAILQ_NEXT(td, td_lockq) == NULL);
			ts1 = ts;
#ifdef TURNSTILE_PROFILING
			tc->tc_depth--;
#endif
		} else
			ts1 = LIST_FIRST(&ts->ts_free);
		MPASS(ts1 != NULL);
		LIST_REMOVE(ts1, ts_hash);
		td->td_turnstile = ts1;
	}
}

/*
 * Wakeup all threads on the pending list and adjust the priority of the
 * current thread appropriately.  This must be called with the turnstile
 * chain locked.
 */
void
turnstile_unpend(struct turnstile *ts, int owner_type)
{
	TAILQ_HEAD(, thread) pending_threads;
	struct turnstile_chain *tc;
	struct thread *td;
	u_char cp, pri;

	MPASS(ts != NULL);
	MPASS(ts->ts_owner == curthread ||
	    (owner_type == TS_SHARED_LOCK && ts->ts_owner == NULL));
	tc = TC_LOOKUP(ts->ts_lockobj);
	mtx_assert(&tc->tc_lock, MA_OWNED);
	MPASS(!TAILQ_EMPTY(&ts->ts_pending));

	/*
	 * Move the list of pending threads out of the turnstile and
	 * into a local variable.
	 */
	TAILQ_INIT(&pending_threads);
	TAILQ_CONCAT(&pending_threads, &ts->ts_pending, td_lockq);
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) &&
	    TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]))
		ts->ts_lockobj = NULL;
#endif

	/*
	 * Remove the turnstile from this thread's list of contested locks
	 * since this thread doesn't own it anymore.  New threads will
	 * not be blocking on the turnstile until it is claimed by a new
	 * owner.  There might not be a current owner if this is a shared
	 * lock.
	 */
	if (ts->ts_owner != NULL) {
		mtx_lock_spin(&td_contested_lock);
		ts->ts_owner = NULL;
		LIST_REMOVE(ts, ts_link);
		mtx_unlock_spin(&td_contested_lock);
	}
	critical_enter();
	mtx_unlock_spin(&tc->tc_lock);

	/*
	 * Adjust the priority of curthread based on other contested
	 * locks it owns.  Don't lower the priority below the base
	 * priority however.
	 */
	td = curthread;
	pri = PRI_MAX;
	mtx_lock_spin(&sched_lock);
	mtx_lock_spin(&td_contested_lock);
	LIST_FOREACH(ts, &td->td_contested, ts_link) {
		cp = turnstile_first_waiter(ts)->td_priority;
		if (cp < pri)
			pri = cp;
	}
	mtx_unlock_spin(&td_contested_lock);
	sched_unlend_prio(td, pri);

	/*
	 * Wake up all the pending threads.  If a thread is not blocked
	 * on a lock, then it is currently executing on another CPU in
	 * turnstile_wait() or sitting on a run queue waiting to resume
	 * in turnstile_wait().  Set a flag to force it to try to acquire
	 * the lock again instead of blocking.
	 */
	while (!TAILQ_EMPTY(&pending_threads)) {
		td = TAILQ_FIRST(&pending_threads);
		TAILQ_REMOVE(&pending_threads, td, td_lockq);
		MPASS(td->td_proc->p_magic == P_MAGIC);
		if (TD_ON_LOCK(td)) {
			td->td_blocked = NULL;
			td->td_lockname = NULL;
#ifdef INVARIANTS
			td->td_tsqueue = 0xff;
#endif
			TD_CLR_LOCK(td);
			MPASS(TD_CAN_RUN(td));
			setrunqueue(td, SRQ_BORING);
		} else {
			td->td_flags |= TDF_TSNOBLOCK;
			MPASS(TD_IS_RUNNING(td) || TD_ON_RUNQ(td));
		}
	}
	critical_exit();
	mtx_unlock_spin(&sched_lock);
}

/*
 * Give up ownership of a turnstile.  This must be called with the
 * turnstile chain locked.
 */
void
turnstile_disown(struct turnstile *ts)
{
	struct turnstile_chain *tc;
	struct thread *td;
	u_char cp, pri;

	MPASS(ts != NULL);
	MPASS(ts->ts_owner == curthread);
	tc = TC_LOOKUP(ts->ts_lockobj);
	mtx_assert(&tc->tc_lock, MA_OWNED);
	MPASS(TAILQ_EMPTY(&ts->ts_pending));
	MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) ||
	    !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]));

	/*
	 * Remove the turnstile from this thread's list of contested locks
	 * since this thread doesn't own it anymore.  New threads will
	 * not be blocking on the turnstile until it is claimed by a new
	 * owner.
	 */
	mtx_lock_spin(&td_contested_lock);
	ts->ts_owner = NULL;
	LIST_REMOVE(ts, ts_link);
	mtx_unlock_spin(&td_contested_lock);
	mtx_unlock_spin(&tc->tc_lock);

	/*
	 * Adjust the priority of curthread based on other contested
	 * locks it owns.  Don't lower the priority below the base
	 * priority however.
	 */
	td = curthread;
	pri = PRI_MAX;
	mtx_lock_spin(&sched_lock);
	mtx_lock_spin(&td_contested_lock);
	LIST_FOREACH(ts, &td->td_contested, ts_link) {
		cp = turnstile_first_waiter(ts)->td_priority;
		if (cp < pri)
			pri = cp;
	}
	mtx_unlock_spin(&td_contested_lock);
	sched_unlend_prio(td, pri);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Return the first thread in a turnstile.
 */
struct thread *
turnstile_head(struct turnstile *ts, int queue)
{
#ifdef INVARIANTS
	struct turnstile_chain *tc;

	MPASS(ts != NULL);
	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
	tc = TC_LOOKUP(ts->ts_lockobj);
	mtx_assert(&tc->tc_lock, MA_OWNED);
#endif
	return (TAILQ_FIRST(&ts->ts_blocked[queue]));
}

/*
 * Returns true if a sub-queue of a turnstile is empty.
 */
int
turnstile_empty(struct turnstile *ts, int queue)
{
#ifdef INVARIANTS
	struct turnstile_chain *tc;

	MPASS(ts != NULL);
	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
	tc = TC_LOOKUP(ts->ts_lockobj);
	mtx_assert(&tc->tc_lock, MA_OWNED);
#endif
	return (TAILQ_EMPTY(&ts->ts_blocked[queue]));
}

#ifdef DDB
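/*
 * Print a single thread's pointer, tid, pid, and name, preceded by the
 * given prefix string.
 */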
static void
print_thread(struct thread *td, const char *prefix)
{

	db_printf("%s%p (tid %d, pid %d, \"%s\")\n", prefix, td, td->td_tid,
	    td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
	    td->td_proc->p_comm);
}

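/*
 * Print every thread on a thread queue, one per line, under the given
 * header.  Prints "empty" if the queue has no threads.
 */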
static void
print_queue(struct threadqueue *queue, const char *header, const char *prefix)
{
	struct thread *td;

	db_printf("%s:\n", header);
	if (TAILQ_EMPTY(queue)) {
		db_printf("%sempty\n", prefix);
		return;
	}
	TAILQ_FOREACH(td, queue, td_lockq) {
		print_thread(td, prefix);
	}
}

DB_SHOW_COMMAND(turnstile, db_show_turnstile)
{
	struct turnstile_chain *tc;
	struct turnstile *ts;
	struct lock_object *lock;
	int i;

	if (!have_addr)
		return;

	/*
	 * First, see if there is an active turnstile for the lock indicated
	 * by the address.
	 */
	lock = (struct lock_object *)addr;
	tc = TC_LOOKUP(lock);
	LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
		if (ts->ts_lockobj == lock)
			goto found;

	/*
	 * Second, see if there is an active turnstile at the address
	 * indicated.
	 */
	for (i = 0; i < TC_TABLESIZE; i++)
		LIST_FOREACH(ts, &turnstile_chains[i].tc_turnstiles, ts_hash) {
			if (ts == (struct turnstile *)addr)
				goto found;
		}

	db_printf("Unable to locate a turnstile via %p\n", (void *)addr);
	return;
found:
	lock = ts->ts_lockobj;
	db_printf("Lock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name,
	    lock->lo_name);
	if (ts->ts_owner)
		print_thread(ts->ts_owner, "Lock Owner: ");
	else
		db_printf("Lock Owner: none\n");
	print_queue(&ts->ts_blocked[TS_SHARED_QUEUE], "Shared Waiters", "\t");
	print_queue(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE], "Exclusive Waiters",
	    "\t");
	print_queue(&ts->ts_pending, "Pending Threads", "\t");
}

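/*
 * Print the chain of threads blocking the given thread: follow each
 * thread's td_blocked turnstile to its ts_owner until a thread is found
 * that is not blocked on a turnstile (or the turnstile has no owner).
 */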
static void
print_threadchain(struct thread *td, const char *prefix)
{
	struct lock_object *lock;
	struct lock_class *class;
	struct turnstile *ts;

	/*
	 * Follow the chain.  We keep walking as long as the thread is
	 * blocked on a turnstile that has an owner.
	 */
	while (!db_pager_quit) {
		db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid,
		    td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
		    td->td_proc->p_comm);
		switch (td->td_state) {
		case TDS_INACTIVE:
			db_printf("is inactive\n");
			return;
		case TDS_CAN_RUN:
			db_printf("can run\n");
			return;
		case TDS_RUNQ:
			db_printf("is on a run queue\n");
			return;
		case TDS_RUNNING:
			db_printf("running on CPU %d\n", td->td_oncpu);
			return;
		case TDS_INHIBITED:
			if (TD_ON_LOCK(td)) {
				ts = td->td_blocked;
				lock = ts->ts_lockobj;
				class = LOCK_CLASS(lock);
				db_printf("blocked on lock %p (%s) \"%s\"\n",
				    lock, class->lc_name, lock->lo_name);
				if (ts->ts_owner == NULL)
					return;
				td = ts->ts_owner;
				break;
			}
			db_printf("inhibited\n");
			return;
		default:
			db_printf("??? (%#x)\n", td->td_state);
			return;
		}
	}
}

DB_SHOW_COMMAND(threadchain, db_show_threadchain)
{
	struct thread *td;

	/* Figure out which thread to start with. */
	if (have_addr)
		td = db_lookup_thread(addr, TRUE);
	else
		td = kdb_thread;

	print_threadchain(td, "");
}

DB_SHOW_COMMAND(allchains, db_show_allchains)
{
	struct thread *td;
	struct proc *p;
	int i;

	i = 1;
	LIST_FOREACH(p, &allproc, p_list) {
		FOREACH_THREAD_IN_PROC(p, td) {
			if (TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested)) {
				db_printf("chain %d:\n", i++);
				print_threadchain(td, " ");
			}
			if (db_pager_quit)
				return;
		}
	}
}

static void	print_waiters(struct turnstile *ts, int indent);

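/*
 * Print one waiting thread and then recurse into the turnstiles of any
 * contested locks that thread itself owns (via its td_contested list).
 */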
static void
print_waiter(struct thread *td, int indent)
{
	struct turnstile *ts;
	int i;

	if (db_pager_quit)
		return;
	for (i = 0; i < indent; i++)
		db_printf(" ");
	print_thread(td, "thread ");
	LIST_FOREACH(ts, &td->td_contested, ts_link)
		print_waiters(ts, indent + 1);
}

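/*
 * Print the lock a turnstile protects followed by all of its exclusive,
 * shared, and pending waiters, indented one space per recursion level.
 */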
static void
print_waiters(struct turnstile *ts, int indent)
{
	struct lock_object *lock;
	struct lock_class *class;
	struct thread *td;
	int i;

	if (db_pager_quit)
		return;
	lock = ts->ts_lockobj;
	class = LOCK_CLASS(lock);
	for (i = 0; i < indent; i++)
		db_printf(" ");
	db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name);
	TAILQ_FOREACH(td, &ts->ts_blocked[TS_EXCLUSIVE_QUEUE], td_lockq)
		print_waiter(td, indent + 1);
	TAILQ_FOREACH(td, &ts->ts_blocked[TS_SHARED_QUEUE], td_lockq)
		print_waiter(td, indent + 1);
	TAILQ_FOREACH(td, &ts->ts_pending, td_lockq)
		print_waiter(td, indent + 1);
}

DB_SHOW_COMMAND(lockchain, db_show_lockchain)
{
	struct lock_object *lock;
	struct lock_class *class;
	struct turnstile_chain *tc;
	struct turnstile *ts;

	if (!have_addr)
		return;
	lock = (struct lock_object *)addr;
	tc = TC_LOOKUP(lock);
	LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
		if (ts->ts_lockobj == lock)
			break;
	if (ts == NULL) {
		class = LOCK_CLASS(lock);
		db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name,
		    lock->lo_name);
	} else
		print_waiters(ts, 0);
}
#endif