xref: /freebsd/sys/kern/kern_mutex.c (revision 77b7cdf1999ee965ad494fddd184b18f532ac91a)
1 /*-
2  * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  * 3. Berkeley Software Design Inc's name may not be used to endorse or
13  *    promote products derived from this software without specific prior
14  *    written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
29  *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
30  * $FreeBSD$
31  */
32 
33 /*
34  * Machine-independent bits of the mutex implementation.
35  */
36 
37 #include "opt_adaptive_mutexes.h"
38 #include "opt_ddb.h"
39 
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/bus.h>
43 #include <sys/kernel.h>
44 #include <sys/ktr.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mutex.h>
48 #include <sys/proc.h>
49 #include <sys/resourcevar.h>
50 #include <sys/sched.h>
51 #include <sys/sbuf.h>
52 #include <sys/sysctl.h>
53 #include <sys/vmmeter.h>
54 
55 #include <machine/atomic.h>
56 #include <machine/bus.h>
57 #include <machine/clock.h>
58 #include <machine/cpu.h>
59 
60 #include <ddb/ddb.h>
61 
62 #include <vm/vm.h>
63 #include <vm/vm_extern.h>
64 
65 /*
66  * Internal utility macros.
67  */
68 #define mtx_unowned(m)	((m)->mtx_lock == MTX_UNOWNED)
69 
70 #define mtx_owner(m)	(mtx_unowned((m)) ? NULL \
71 	: (struct thread *)((m)->mtx_lock & MTX_FLAGMASK))
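/*
 * The mtx_lock word holds the owning thread pointer with the low-order
 * MTX_* flag bits (e.g. MTX_CONTESTED, MTX_RECURSED) folded in, so
 * mtx_owner() masks the flags off before casting to a thread pointer.
 */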
72 
73 /*
74  * Lock classes for sleep and spin mutexes.
75  */
76 struct lock_class lock_class_mtx_sleep = {
77 	"sleep mutex",
78 	LC_SLEEPLOCK | LC_RECURSABLE
79 };
80 struct lock_class lock_class_mtx_spin = {
81 	"spin mutex",
82 	LC_SPINLOCK | LC_RECURSABLE
83 };
84 
85 /*
86  * System-wide mutexes
87  */
88 struct mtx sched_lock;
89 struct mtx Giant;
90 
91 /*
92  * Prototypes for non-exported routines.
93  */
94 static void	propagate_priority(struct thread *);
95 
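/*
 * Propagate the priority of a thread that has just blocked on a mutex to
 * the thread owning that mutex, walking the chain of blocked-on mutexes:
 * each owner found along the way has its priority raised to at least the
 * blocked thread's priority and, if it is itself blocked, is re-sorted
 * into the blocked queue of the mutex it is waiting on.
 */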
96 static void
97 propagate_priority(struct thread *td)
98 {
99 	int pri = td->td_priority;
100 	struct mtx *m = td->td_blocked;
101 
102 	mtx_assert(&sched_lock, MA_OWNED);
103 	for (;;) {
104 		struct thread *td1;
105 
106 		td = mtx_owner(m);
107 
108 		if (td == NULL) {
109 			/*
110 			 * This really isn't quite right.  We really
111 			 * ought to bump the priority of the thread
112 			 * that next acquires the mutex.
113 			 */
114 			MPASS(m->mtx_lock == MTX_CONTESTED);
115 			return;
116 		}
117 
118 		MPASS(td->td_proc != NULL);
119 		MPASS(td->td_proc->p_magic == P_MAGIC);
120 		KASSERT(!TD_IS_SLEEPING(td), ("sleeping thread owns a mutex"));
121 		if (td->td_priority <= pri) /* lower is higher priority */
122 			return;
123 
125 		/*
126 		 * If lock holder is actually running, just bump priority.
127 		 */
128 		if (TD_IS_RUNNING(td)) {
129 			td->td_priority = pri;
130 			return;
131 		}
132 
133 #ifndef SMP
134 		/*
135 		 * On UP, check whether td is curthread (this should never
136 		 * happen, however, as it would mean we are deadlocked).
137 		 */
138 		KASSERT(td != curthread, ("Deadlock detected"));
139 #endif
140 
141 		/*
142 		 * If td is on a run queue, move it to the new run queue and
143 		 * quit.  XXXKSE this gets a lot more complicated under
144 		 * threads, but try anyhow.
145 		 */
146 		if (TD_ON_RUNQ(td)) {
147 			MPASS(td->td_blocked == NULL);
148 			sched_prio(td, pri);
149 			return;
150 		}
151 		/*
152 		 * Adjust for any other cases.
153 		 */
154 		td->td_priority = pri;
155 
156 		/*
157 		 * If we aren't blocked on a mutex, we should be.
158 		 */
159 		KASSERT(TD_ON_LOCK(td), (
160 		    "process %d(%s):%d holds %s but isn't blocked on a mutex\n",
161 		    td->td_proc->p_pid, td->td_proc->p_comm, td->td_state,
162 		    m->mtx_object.lo_name));
163 
164 		/*
165 		 * Pick up the mutex that td is blocked on.
166 		 */
167 		m = td->td_blocked;
168 		MPASS(m != NULL);
169 
170 		/*
171 		 * Check if the thread needs to be moved up on
172 		 * the blocked chain
173 		 */
174 		if (td == TAILQ_FIRST(&m->mtx_blocked)) {
175 			continue;
176 		}
177 
178 		td1 = TAILQ_PREV(td, threadqueue, td_lockq);
179 		if (td1->td_priority <= pri) {
180 			continue;
181 		}
182 
183 		/*
184 		 * Remove thread from blocked chain and determine where
185 		 * it should be moved up to.  Since we know that td1 has
186 		 * a lower priority than td, we know that at least one
187 		 * thread in the chain has a lower priority and that
188 		 * td1 will thus not be NULL after the loop.
189 		 */
190 		TAILQ_REMOVE(&m->mtx_blocked, td, td_lockq);
191 		TAILQ_FOREACH(td1, &m->mtx_blocked, td_lockq) {
192 			MPASS(td1->td_proc->p_magic == P_MAGIC);
193 			if (td1->td_priority > pri)
194 				break;
195 		}
196 
197 		MPASS(td1 != NULL);
198 		TAILQ_INSERT_BEFORE(td1, td, td_lockq);
199 		CTR4(KTR_LOCK,
200 		    "propagate_priority: p %p moved before %p on [%p] %s",
201 		    td, td1, m, m->mtx_object.lo_name);
202 	}
203 }
204 
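/*
 * Optional mutex hold-time profiling.  Each (file, line) acquisition site
 * gets a record tracking the maximum, total, and count of hold times; the
 * results are exported through the debug.mutex.prof sysctl tree below.
 */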
205 #ifdef MUTEX_PROFILING
206 SYSCTL_NODE(_debug, OID_AUTO, mutex, CTLFLAG_RD, NULL, "mutex debugging");
207 SYSCTL_NODE(_debug_mutex, OID_AUTO, prof, CTLFLAG_RD, NULL, "mutex profiling");
208 static int mutex_prof_enable = 0;
209 SYSCTL_INT(_debug_mutex_prof, OID_AUTO, enable, CTLFLAG_RW,
210     &mutex_prof_enable, 0, "Enable tracing of mutex holdtime");
211 
212 struct mutex_prof {
213 	const char	*name;
214 	const char	*file;
215 	int		line;
216 	uintmax_t	cnt_max;
217 	uintmax_t	cnt_tot;
218 	uintmax_t	cnt_cur;
219 	struct mutex_prof *next;
220 };
221 
222 /*
223  * mprof_buf is a static pool of profiling records to avoid possible
224  * reentrance of the memory allocation functions.
225  *
226  * Note: NUM_MPROF_BUFFERS must be smaller than MPROF_HASH_SIZE.
227  */
228 #define	NUM_MPROF_BUFFERS	1000
229 static struct mutex_prof mprof_buf[NUM_MPROF_BUFFERS];
230 static int first_free_mprof_buf;
231 #define	MPROF_HASH_SIZE		1009
232 static struct mutex_prof *mprof_hash[MPROF_HASH_SIZE];
233 /* SWAG: sbuf size = avg stat. line size * number of locks */
234 #define	MPROF_SBUF_SIZE		(256 * 400)
235 
236 static int mutex_prof_acquisitions;
237 SYSCTL_INT(_debug_mutex_prof, OID_AUTO, acquisitions, CTLFLAG_RD,
238     &mutex_prof_acquisitions, 0, "Number of mutex acquisitions recorded");
239 static int mutex_prof_records;
240 SYSCTL_INT(_debug_mutex_prof, OID_AUTO, records, CTLFLAG_RD,
241     &mutex_prof_records, 0, "Number of profiling records");
242 static int mutex_prof_maxrecords = NUM_MPROF_BUFFERS;
243 SYSCTL_INT(_debug_mutex_prof, OID_AUTO, maxrecords, CTLFLAG_RD,
244     &mutex_prof_maxrecords, 0, "Maximum number of profiling records");
245 static int mutex_prof_rejected;
246 SYSCTL_INT(_debug_mutex_prof, OID_AUTO, rejected, CTLFLAG_RD,
247     &mutex_prof_rejected, 0, "Number of rejected profiling records");
248 static int mutex_prof_hashsize = MPROF_HASH_SIZE;
249 SYSCTL_INT(_debug_mutex_prof, OID_AUTO, hashsize, CTLFLAG_RD,
250     &mutex_prof_hashsize, 0, "Hash size");
251 static int mutex_prof_collisions = 0;
252 SYSCTL_INT(_debug_mutex_prof, OID_AUTO, collisions, CTLFLAG_RD,
253     &mutex_prof_collisions, 0, "Number of hash collisions");
254 
255 /*
256  * mprof_mtx protects the profiling buffers and the hash.
257  */
258 static struct mtx mprof_mtx;
259 MTX_SYSINIT(mprof, &mprof_mtx, "mutex profiling lock", MTX_SPIN | MTX_QUIET);
260 
261 static u_int64_t
262 nanoseconds(void)
263 {
264 	struct timespec tv;
265 
266 	nanotime(&tv);
267 	return (tv.tv_sec * (u_int64_t)1000000000 + tv.tv_nsec);
268 }
269 
270 static int
271 dump_mutex_prof_stats(SYSCTL_HANDLER_ARGS)
272 {
273 	struct sbuf *sb;
274 	int error, i;
275 	static int multiplier = 1;
276 
277 	if (first_free_mprof_buf == 0)
278 		return (SYSCTL_OUT(req, "No locking recorded",
279 		    sizeof("No locking recorded")));
280 
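	/*
	 * Build the report into a fixed-size sbuf.  If it overflows, throw
	 * the partial report away, grow the buffer by another
	 * MPROF_SBUF_SIZE, and start over.
	 */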
281 retry_sbufops:
282 	sb = sbuf_new(NULL, NULL, MPROF_SBUF_SIZE * multiplier, SBUF_FIXEDLEN);
283 	sbuf_printf(sb, "%6s %12s %11s %5s %s\n",
284 	    "max", "total", "count", "avg", "name");
285 	/*
286 	 * XXX this spinlock seems to be by far the largest perpetrator
287 	 * of spinlock latency (1.6 msec on an Athlon1600 was recorded
288 	 * even before I pessimized it further by moving the average
289 	 * computation here).
290 	 */
291 	mtx_lock_spin(&mprof_mtx);
292 	for (i = 0; i < first_free_mprof_buf; ++i) {
293 		sbuf_printf(sb, "%6ju %12ju %11ju %5ju %s:%d (%s)\n",
294 		    mprof_buf[i].cnt_max / 1000,
295 		    mprof_buf[i].cnt_tot / 1000,
296 		    mprof_buf[i].cnt_cur,
297 		    mprof_buf[i].cnt_cur == 0 ? (uintmax_t)0 :
298 			mprof_buf[i].cnt_tot / (mprof_buf[i].cnt_cur * 1000),
299 		    mprof_buf[i].file, mprof_buf[i].line, mprof_buf[i].name);
300 		if (sbuf_overflowed(sb)) {
301 			mtx_unlock_spin(&mprof_mtx);
302 			sbuf_delete(sb);
303 			multiplier++;
304 			goto retry_sbufops;
305 		}
306 	}
307 	mtx_unlock_spin(&mprof_mtx);
308 	sbuf_finish(sb);
309 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
310 	sbuf_delete(sb);
311 	return (error);
312 }
313 SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
314     NULL, 0, dump_mutex_prof_stats, "A", "Mutex profiling statistics");
315 #endif
316 
317 /*
318  * Function versions of the inlined __mtx_* macros.  These are used by
319  * modules and can also be called from assembly language if needed.
320  */
321 void
322 _mtx_lock_flags(struct mtx *m, int opts, const char *file, int line)
323 {
324 
325 	MPASS(curthread != NULL);
326 	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_sleep,
327 	    ("mtx_lock() of spin mutex %s @ %s:%d", m->mtx_object.lo_name,
328 	    file, line));
329 	_get_sleep_lock(m, curthread, opts, file, line);
330 	LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
331 	    line);
332 	WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
333 #ifdef MUTEX_PROFILING
334 	/* don't reset the timer when/if recursing */
335 	if (m->mtx_acqtime == 0) {
336 		m->mtx_filename = file;
337 		m->mtx_lineno = line;
338 		m->mtx_acqtime = mutex_prof_enable ? nanoseconds() : 0;
339 		++mutex_prof_acquisitions;
340 	}
341 #endif
342 }
343 
344 void
345 _mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line)
346 {
347 
348 	MPASS(curthread != NULL);
349 	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_sleep,
350 	    ("mtx_unlock() of spin mutex %s @ %s:%d", m->mtx_object.lo_name,
351 	    file, line));
352 	WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
353 	LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
354 	    line);
355 	mtx_assert(m, MA_OWNED);
356 #ifdef MUTEX_PROFILING
357 	if (m->mtx_acqtime != 0) {
358 		static const char *unknown = "(unknown)";
359 		struct mutex_prof *mpp;
360 		u_int64_t acqtime, now;
361 		const char *p, *q;
362 		volatile u_int hash;
363 
364 		now = nanoseconds();
365 		acqtime = m->mtx_acqtime;
366 		m->mtx_acqtime = 0;
367 		if (now <= acqtime)
368 			goto out;
369 		for (p = m->mtx_filename;
370 		    p != NULL && strncmp(p, "../", 3) == 0; p += 3)
371 			/* nothing */ ;
372 		if (p == NULL || *p == '\0')
373 			p = unknown;
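		/*
		 * Hash the acquisition site: the line number seeds the hash
		 * and each character of the trimmed file name is folded in,
		 * modulo the hash table size.
		 */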
374 		for (hash = m->mtx_lineno, q = p; *q != '\0'; ++q)
375 			hash = (hash * 2 + *q) % MPROF_HASH_SIZE;
376 		mtx_lock_spin(&mprof_mtx);
377 		for (mpp = mprof_hash[hash]; mpp != NULL; mpp = mpp->next)
378 			if (mpp->line == m->mtx_lineno &&
379 			    strcmp(mpp->file, p) == 0)
380 				break;
381 		if (mpp == NULL) {
382 			/* Just exit if we cannot get a trace buffer */
383 			if (first_free_mprof_buf >= NUM_MPROF_BUFFERS) {
384 				++mutex_prof_rejected;
385 				goto unlock;
386 			}
387 			mpp = &mprof_buf[first_free_mprof_buf++];
388 			mpp->name = mtx_name(m);
389 			mpp->file = p;
390 			mpp->line = m->mtx_lineno;
391 			mpp->next = mprof_hash[hash];
392 			if (mprof_hash[hash] != NULL)
393 				++mutex_prof_collisions;
394 			mprof_hash[hash] = mpp;
395 			++mutex_prof_records;
396 		}
397 		/*
398 		 * Record if the mutex has been held longer now than ever
399 		 * before.
400 		 */
401 		if (now - acqtime > mpp->cnt_max)
402 			mpp->cnt_max = now - acqtime;
403 		mpp->cnt_tot += now - acqtime;
404 		mpp->cnt_cur++;
405 unlock:
406 		mtx_unlock_spin(&mprof_mtx);
407 	}
408 out:
409 #endif
410 	_rel_sleep_lock(m, curthread, opts, file, line);
411 }
412 
413 void
414 _mtx_lock_spin_flags(struct mtx *m, int opts, const char *file, int line)
415 {
416 
417 	MPASS(curthread != NULL);
418 	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_spin,
419 	    ("mtx_lock_spin() of sleep mutex %s @ %s:%d",
420 	    m->mtx_object.lo_name, file, line));
421 #if defined(SMP) || LOCK_DEBUG > 0 || 1
422 	_get_spin_lock(m, curthread, opts, file, line);
423 #else
424 	critical_enter();
425 #endif
426 	LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
427 	    line);
428 	WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
429 }
430 
431 void
432 _mtx_unlock_spin_flags(struct mtx *m, int opts, const char *file, int line)
433 {
434 
435 	MPASS(curthread != NULL);
436 	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_spin,
437 	    ("mtx_unlock_spin() of sleep mutex %s @ %s:%d",
438 	    m->mtx_object.lo_name, file, line));
439 	WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
440 	LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
441 	    line);
442 	mtx_assert(m, MA_OWNED);
443 #if defined(SMP) || LOCK_DEBUG > 0 || 1
444 	_rel_spin_lock(m);
445 #else
446 	critical_exit();
447 #endif
448 }
449 
450 /*
451  * The important part of mtx_trylock{,_flags}()
452  * Tries to acquire lock `m.' We do NOT handle recursion here.  If this
453  * function is called on a recursed mutex, it will return failure and
454  * will not recursively acquire the lock.  You are expected to know what
455  * you are doing.
456  */
457 int
458 _mtx_trylock(struct mtx *m, int opts, const char *file, int line)
459 {
460 	int rval;
461 
462 	MPASS(curthread != NULL);
463 
464 	rval = _obtain_lock(m, curthread);
465 
466 	LOCK_LOG_TRY("LOCK", &m->mtx_object, opts, rval, file, line);
467 	if (rval)
468 		WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
469 		    file, line);
470 
471 	return (rval);
472 }
473 
474 /*
475  * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
476  *
477  * We call this if the lock is either contested (i.e. we need to go to
478  * sleep waiting for it), or if we need to recurse on it.
479  */
480 void
481 _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
482 {
483 	struct thread *td = curthread;
484 	struct thread *td1;
485 #if defined(SMP) && defined(ADAPTIVE_MUTEXES)
486 	struct thread *owner;
487 #endif
488 	uintptr_t v;
489 #ifdef KTR
490 	int cont_logged = 0;
491 #endif
492 
493 	if (mtx_owned(m)) {
494 		m->mtx_recurse++;
495 		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
496 		if (LOCK_LOG_TEST(&m->mtx_object, opts))
497 			CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m);
498 		return;
499 	}
500 
501 	if (LOCK_LOG_TEST(&m->mtx_object, opts))
502 		CTR4(KTR_LOCK,
503 		    "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d",
504 		    m->mtx_object.lo_name, (void *)m->mtx_lock, file, line);
505 
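	/*
	 * Loop until the lock is acquired: each pass takes sched_lock,
	 * re-examines mtx_lock to handle races with release, and either
	 * retries, spins adaptively while the owner is running, or queues
	 * this thread on the mutex and switches away until woken.
	 */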
506 	while (!_obtain_lock(m, td)) {
507 
508 		mtx_lock_spin(&sched_lock);
509 		v = m->mtx_lock;
510 
511 		/*
512 		 * Check if the lock has been released while spinning for
513 		 * the sched_lock.
514 		 */
515 		if (v == MTX_UNOWNED) {
516 			mtx_unlock_spin(&sched_lock);
517 #ifdef __i386__
518 			ia32_pause();
519 #endif
520 			continue;
521 		}
522 
523 		/*
524 		 * The mutex was marked contested on release. This means that
525 		 * there are threads blocked on it.
526 		 */
527 		if (v == MTX_CONTESTED) {
528 			td1 = TAILQ_FIRST(&m->mtx_blocked);
529 			MPASS(td1 != NULL);
530 			m->mtx_lock = (uintptr_t)td | MTX_CONTESTED;
531 
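			/*
			 * We now own a lock that still has waiters; inherit
			 * the priority of its highest-priority waiter so it
			 * is not left blocked behind a lower-priority owner.
			 */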
532 			if (td1->td_priority < td->td_priority)
533 				td->td_priority = td1->td_priority;
534 			mtx_unlock_spin(&sched_lock);
535 			return;
536 		}
537 
538 		/*
539 		 * If the mutex isn't already contested and a failure occurs
540 		 * setting the contested bit, the mutex was either released
541 		 * or the state of the MTX_RECURSED bit changed.
542 		 */
543 		if ((v & MTX_CONTESTED) == 0 &&
544 		    !atomic_cmpset_ptr(&m->mtx_lock, (void *)v,
545 			(void *)(v | MTX_CONTESTED))) {
546 			mtx_unlock_spin(&sched_lock);
547 #ifdef __i386__
548 			ia32_pause();
549 #endif
550 			continue;
551 		}
552 
553 #if defined(SMP) && defined(ADAPTIVE_MUTEXES)
554 		/*
555 		 * If the current owner of the lock is executing on another
556 		 * CPU, spin instead of blocking.
557 		 */
558 		owner = (struct thread *)(v & MTX_FLAGMASK);
559 		if (m != &Giant && TD_IS_RUNNING(owner)) {
560 			mtx_unlock_spin(&sched_lock);
561 			while (mtx_owner(m) == owner && TD_IS_RUNNING(owner)) {
562 #ifdef __i386__
563 				ia32_pause();
564 #endif
565 			}
566 			continue;
567 		}
568 #endif	/* SMP && ADAPTIVE_MUTEXES */
569 
570 		/*
571 		 * We definitely must sleep for this lock.
572 		 */
573 		mtx_assert(m, MA_NOTOWNED);
574 
575 #ifdef notyet
576 		/*
577 		 * If we're borrowing an interrupted thread's VM context, we
578 		 * must clean up before going to sleep.
579 		 */
580 		if (td->td_ithd != NULL) {
581 			struct ithd *it = td->td_ithd;
582 
583 			if (it->it_interrupted) {
584 				if (LOCK_LOG_TEST(&m->mtx_object, opts))
585 					CTR2(KTR_LOCK,
586 				    "_mtx_lock_sleep: %p interrupted %p",
587 					    it, it->it_interrupted);
588 				intr_thd_fixup(it);
589 			}
590 		}
591 #endif
592 
593 		/*
594 		 * Put us on the list of threads blocked on this mutex.
595 		 */
596 		if (TAILQ_EMPTY(&m->mtx_blocked)) {
597 			td1 = mtx_owner(m);
598 			LIST_INSERT_HEAD(&td1->td_contested, m, mtx_contested);
599 			TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_lockq);
600 		} else {
601 			TAILQ_FOREACH(td1, &m->mtx_blocked, td_lockq)
602 				if (td1->td_priority > td->td_priority)
603 					break;
604 			if (td1)
605 				TAILQ_INSERT_BEFORE(td1, td, td_lockq);
606 			else
607 				TAILQ_INSERT_TAIL(&m->mtx_blocked, td, td_lockq);
608 		}
609 #ifdef KTR
610 		if (!cont_logged) {
611 			CTR6(KTR_CONTENTION,
612 			    "contention: %p at %s:%d wants %s, taken by %s:%d",
613 			    td, file, line, m->mtx_object.lo_name,
614 			    WITNESS_FILE(&m->mtx_object),
615 			    WITNESS_LINE(&m->mtx_object));
616 			cont_logged = 1;
617 		}
618 #endif
619 
620 		/*
621 		 * Save who we're blocked on.
622 		 */
623 		td->td_blocked = m;
624 		td->td_lockname = m->mtx_object.lo_name;
625 		TD_SET_LOCK(td);
626 		propagate_priority(td);
627 
628 		if (LOCK_LOG_TEST(&m->mtx_object, opts))
629 			CTR3(KTR_LOCK,
630 			    "_mtx_lock_sleep: p %p blocked on [%p] %s", td, m,
631 			    m->mtx_object.lo_name);
632 
633 		td->td_proc->p_stats->p_ru.ru_nvcsw++;
634 		mi_switch();
635 
636 		if (LOCK_LOG_TEST(&m->mtx_object, opts))
637 			CTR3(KTR_LOCK,
638 			  "_mtx_lock_sleep: p %p free from blocked on [%p] %s",
639 			  td, m, m->mtx_object.lo_name);
640 
641 		mtx_unlock_spin(&sched_lock);
642 	}
643 
644 #ifdef KTR
645 	if (cont_logged) {
646 		CTR4(KTR_CONTENTION,
647 		    "contention end: %s acquired by %p at %s:%d",
648 		    m->mtx_object.lo_name, td, file, line);
649 	}
650 #endif
651 	return;
652 }
653 
654 /*
655  * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock.
656  *
657  * This is only called if we need to actually spin for the lock. Recursion
658  * is handled inline.
659  */
660 void
661 _mtx_lock_spin(struct mtx *m, int opts, const char *file, int line)
662 {
663 	int i = 0;
664 
665 	if (LOCK_LOG_TEST(&m->mtx_object, opts))
666 		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m);
667 
668 	for (;;) {
669 		if (_obtain_lock(m, curthread))
670 			break;
671 
672 		/* Give interrupts a chance while we spin. */
673 		critical_exit();
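		/*
		 * Busy-wait for the first ten million iterations, then back
		 * off to DELAY(1) between checks, and finally panic if the
		 * spin lock still appears hung.
		 */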
674 		while (m->mtx_lock != MTX_UNOWNED) {
675 			if (i++ < 10000000) {
676 #ifdef __i386__
677 				ia32_pause();
678 #endif
679 				continue;
680 			}
681 			if (i < 60000000)
682 				DELAY(1);
683 #ifdef DDB
684 			else if (!db_active)
685 #else
686 			else
687 #endif
688 				panic("spin lock %s held by %p for > 5 seconds",
689 				    m->mtx_object.lo_name, (void *)m->mtx_lock);
690 #ifdef __i386__
691 			ia32_pause();
692 #endif
693 		}
694 		critical_enter();
695 	}
696 
697 	if (LOCK_LOG_TEST(&m->mtx_object, opts))
698 		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m);
699 
700 	return;
701 }
702 
703 /*
704  * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
705  *
706  * We are only called here if the lock is recursed or contested (i.e. we
707  * need to wake up a blocked thread).
708  */
709 void
710 _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
711 {
712 	struct thread *td, *td1;
713 	struct mtx *m1;
714 	int pri;
715 
716 	td = curthread;
717 
718 	if (mtx_recursed(m)) {
719 		if (--(m->mtx_recurse) == 0)
720 			atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
721 		if (LOCK_LOG_TEST(&m->mtx_object, opts))
722 			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m);
723 		return;
724 	}
725 
726 	mtx_lock_spin(&sched_lock);
727 	if (LOCK_LOG_TEST(&m->mtx_object, opts))
728 		CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m);
729 
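	/*
	 * Wake the highest-priority waiter: remove it from the blocked
	 * queue, then either release the lock outright (no more waiters) or
	 * leave it marked MTX_CONTESTED so the woken thread claims it in
	 * _mtx_lock_sleep().
	 */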
730 	td1 = TAILQ_FIRST(&m->mtx_blocked);
731 #if defined(SMP) && defined(ADAPTIVE_MUTEXES)
732 	if (td1 == NULL) {
733 		_release_lock_quick(m);
734 		if (LOCK_LOG_TEST(&m->mtx_object, opts))
735 			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m);
736 		mtx_unlock_spin(&sched_lock);
737 		return;
738 	}
739 #endif
740 	MPASS(td->td_proc->p_magic == P_MAGIC);
741 	MPASS(td1->td_proc->p_magic == P_MAGIC);
742 
743 	TAILQ_REMOVE(&m->mtx_blocked, td1, td_lockq);
744 
745 	if (TAILQ_EMPTY(&m->mtx_blocked)) {
746 		LIST_REMOVE(m, mtx_contested);
747 		_release_lock_quick(m);
748 		if (LOCK_LOG_TEST(&m->mtx_object, opts))
749 			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m);
750 	} else
751 		atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED);
752 
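	/*
	 * Recompute our priority now that the lock has been given up: fall
	 * back toward td_base_pri, but stay at least as high as the
	 * highest-priority thread still blocked on any mutex we hold.
	 */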
753 	pri = PRI_MAX;
754 	LIST_FOREACH(m1, &td->td_contested, mtx_contested) {
755 		int cp = TAILQ_FIRST(&m1->mtx_blocked)->td_priority;
756 		if (cp < pri)
757 			pri = cp;
758 	}
759 
760 	if (pri > td->td_base_pri)
761 		pri = td->td_base_pri;
762 	td->td_priority = pri;
763 
764 	if (LOCK_LOG_TEST(&m->mtx_object, opts))
765 		CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p",
766 		    m, td1);
767 
768 	td1->td_blocked = NULL;
769 	TD_CLR_LOCK(td1);
770 	if (!TD_CAN_RUN(td1)) {
771 		mtx_unlock_spin(&sched_lock);
772 		return;
773 	}
774 	setrunqueue(td1);
775 
776 	if (td->td_critnest == 1 && td1->td_priority < pri) {
777 #ifdef notyet
778 		if (td->td_ithd != NULL) {
779 			struct ithd *it = td->td_ithd;
780 
781 			if (it->it_interrupted) {
782 				if (LOCK_LOG_TEST(&m->mtx_object, opts))
783 					CTR2(KTR_LOCK,
784 				    "_mtx_unlock_sleep: %p interrupted %p",
785 					    it, it->it_interrupted);
786 				intr_thd_fixup(it);
787 			}
788 		}
789 #endif
790 		if (LOCK_LOG_TEST(&m->mtx_object, opts))
791 			CTR2(KTR_LOCK,
792 			    "_mtx_unlock_sleep: %p switching out lock=%p", m,
793 			    (void *)m->mtx_lock);
794 
795 		td->td_proc->p_stats->p_ru.ru_nivcsw++;
796 		mi_switch();
797 		if (LOCK_LOG_TEST(&m->mtx_object, opts))
798 			CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p",
799 			    m, (void *)m->mtx_lock);
800 	}
801 
802 	mtx_unlock_spin(&sched_lock);
803 
804 	return;
805 }
806 
807 /*
808  * All the unlocking of MTX_SPIN locks is done inline.
809  * See the _rel_spin_lock() macro for the details.
810  */
811 
812 /*
813  * The backing function for the INVARIANTS-enabled mtx_assert()
814  */
815 #ifdef INVARIANT_SUPPORT
816 void
817 _mtx_assert(struct mtx *m, int what, const char *file, int line)
818 {
819 
820 	if (panicstr != NULL)
821 		return;
822 	switch (what) {
823 	case MA_OWNED:
824 	case MA_OWNED | MA_RECURSED:
825 	case MA_OWNED | MA_NOTRECURSED:
826 		if (!mtx_owned(m))
827 			panic("mutex %s not owned at %s:%d",
828 			    m->mtx_object.lo_name, file, line);
829 		if (mtx_recursed(m)) {
830 			if ((what & MA_NOTRECURSED) != 0)
831 				panic("mutex %s recursed at %s:%d",
832 				    m->mtx_object.lo_name, file, line);
833 		} else if ((what & MA_RECURSED) != 0) {
834 			panic("mutex %s unrecursed at %s:%d",
835 			    m->mtx_object.lo_name, file, line);
836 		}
837 		break;
838 	case MA_NOTOWNED:
839 		if (mtx_owned(m))
840 			panic("mutex %s owned at %s:%d",
841 			    m->mtx_object.lo_name, file, line);
842 		break;
843 	default:
844 		panic("unknown mtx_assert at %s:%d", file, line);
845 	}
846 }
847 #endif
848 
849 /*
850  * The MUTEX_DEBUG-enabled mtx_validate()
851  *
852  * Most of these checks have been moved off into the LO_INITIALIZED flag
853  * maintained by the witness code.
854  */
855 #ifdef MUTEX_DEBUG
856 
857 void	mtx_validate(struct mtx *);
858 
859 void
860 mtx_validate(struct mtx *m)
861 {
862 
863 /*
864  * XXX: When kernacc() does not require Giant we can reenable this check
865  */
866 #ifdef notyet
867 /*
868  * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly
869  * we can re-enable the kernacc() checks.
870  */
871 #ifndef __alpha__
872 	/*
873 	 * Can't call kernacc() from early init386(), especially when
874 	 * initializing Giant mutex, because some stuff in kernacc()
875 	 * requires Giant itself.
876 	 */
877 	if (!cold)
878 		if (!kernacc((caddr_t)m, sizeof(m),
879 		    VM_PROT_READ | VM_PROT_WRITE))
880 			panic("Can't read and write to mutex %p", m);
881 #endif
882 #endif
883 }
884 #endif
885 
886 /*
887  * General init routine used by the MTX_SYSINIT() macro.
888  */
889 void
890 mtx_sysinit(void *arg)
891 {
892 	struct mtx_args *margs = arg;
893 
894 	mtx_init(margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts);
895 }
896 
897 /*
898  * Mutex initialization routine; initialize lock `m' with options contained
899  * in `opts' and name `name.'  The optional
900  * lock type `type' is used as a general lock category name for use with
901  * witness.
902  */
903 void
904 mtx_init(struct mtx *m, const char *name, const char *type, int opts)
905 {
906 	struct lock_object *lock;
907 
908 	MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE |
909 	    MTX_NOWITNESS | MTX_DUPOK)) == 0);
910 
911 #ifdef MUTEX_DEBUG
912 	/* Diagnostic and error correction */
913 	mtx_validate(m);
914 #endif
915 
916 	lock = &m->mtx_object;
917 	KASSERT((lock->lo_flags & LO_INITIALIZED) == 0,
918 	    ("mutex %s %p already initialized", name, m));
919 	bzero(m, sizeof(*m));
920 	if (opts & MTX_SPIN)
921 		lock->lo_class = &lock_class_mtx_spin;
922 	else
923 		lock->lo_class = &lock_class_mtx_sleep;
924 	lock->lo_name = name;
925 	lock->lo_type = type != NULL ? type : name;
926 	if (opts & MTX_QUIET)
927 		lock->lo_flags = LO_QUIET;
928 	if (opts & MTX_RECURSE)
929 		lock->lo_flags |= LO_RECURSABLE;
930 	if ((opts & MTX_NOWITNESS) == 0)
931 		lock->lo_flags |= LO_WITNESS;
932 	if (opts & MTX_DUPOK)
933 		lock->lo_flags |= LO_DUPOK;
934 
935 	m->mtx_lock = MTX_UNOWNED;
936 	TAILQ_INIT(&m->mtx_blocked);
937 
938 	LOCK_LOG_INIT(lock, opts);
939 
940 	WITNESS_INIT(lock);
941 }
942 
943 /*
944  * Destroy lock `m'.  We don't allow MTX_QUIET to be
945  * passed in as a flag here because if the corresponding mtx_init() was
946  * called with MTX_QUIET set, then it will already be set in the mutex's
947  * flags.
948  */
949 void
950 mtx_destroy(struct mtx *m)
951 {
952 
953 	LOCK_LOG_DESTROY(&m->mtx_object, 0);
954 
955 	if (!mtx_owned(m))
956 		MPASS(mtx_unowned(m));
957 	else {
958 		MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);
959 
960 		/* Tell witness this isn't locked to make it happy. */
961 		WITNESS_UNLOCK(&m->mtx_object, LOP_EXCLUSIVE, __FILE__,
962 		    __LINE__);
963 	}
964 
965 	WITNESS_DESTROY(&m->mtx_object);
966 }
967 
968 /*
969  * Initialize the mutex code and system mutexes.  This is called from the MD
970  * startup code prior to mi_startup().  The per-CPU data space needs to be
971  * set up before this is called.
972  */
973 void
974 mutex_init(void)
975 {
976 
977 	/* Setup thread0 so that mutexes work. */
978 	LIST_INIT(&thread0.td_contested);
979 
980 	/*
981 	 * Initialize mutexes.
982 	 */
983 	mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE);
984 	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
985 	mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
986 	mtx_lock(&Giant);
987 }
988