xref: /titanic_51/usr/src/lib/libc/port/threads/synch.c (revision 5d1dd9a99f22e7799cd7c62987a566ed51713c5a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/sdt.h>
30 
31 #include "lint.h"
32 #include "thr_uberdata.h"
33 
/*
 * This mutex is initialized to be held by lwp#1.
 * It is used to block a thread that has returned from a mutex_lock()
 * of a LOCK_PRIO_INHERIT mutex with an unrecoverable error.
 * The lock byte itself is taken in mutex_setup(), which also records
 * the calling thread as the owner.
 */
mutex_t	stall_mutex = DEFAULTMUTEX;

/*
 * Forward declarations of file-local (static) helpers that are
 * referenced before their definitions.
 */
static int shared_mutex_held(mutex_t *);
static int mutex_unlock_internal(mutex_t *, int);
static int mutex_queuelock_adaptive(mutex_t *);
static void mutex_wakeup_all(mutex_t *);
45 
46 /*
47  * Lock statistics support functions.
48  */
49 void
50 record_begin_hold(tdb_mutex_stats_t *msp)
51 {
52 	tdb_incr(msp->mutex_lock);
53 	msp->mutex_begin_hold = gethrtime();
54 }
55 
56 hrtime_t
57 record_hold_time(tdb_mutex_stats_t *msp)
58 {
59 	hrtime_t now = gethrtime();
60 
61 	if (msp->mutex_begin_hold)
62 		msp->mutex_hold_time += now - msp->mutex_begin_hold;
63 	msp->mutex_begin_hold = 0;
64 	return (now);
65 }
66 
67 /*
68  * Called once at library initialization.
69  */
70 void
71 mutex_setup(void)
72 {
73 	if (set_lock_byte(&stall_mutex.mutex_lockw))
74 		thr_panic("mutex_setup() cannot acquire stall_mutex");
75 	stall_mutex.mutex_owner = (uintptr_t)curthread;
76 }
77 
/*
 * The default spin count of 1000 is experimentally determined.
 * On sun4u machines with any number of processors it could be raised
 * to 10,000 but that (experimentally) makes almost no difference.
 * The environment variable:
 *	_THREAD_ADAPTIVE_SPIN=count
 * can be used to override and set the count in the range [0 .. 1,000,000].
 */
int	thread_adaptive_spin = 1000;
/*
 * NOTE(review): presumably the default limit on simultaneous spinners
 * per mutex (compare the per-thread ul_max_spinners) -- confirm against
 * the thread-creation code, which is not part of this file.
 */
uint_t	thread_max_spinners = 100;
/* When non-zero, QVERIFY() performs its expensive full queue walk. */
int	thread_queue_verify = 0;
/*
 * Cached count of online processors; filled in lazily from
 * _sysconf(_SC_NPROCESSORS_ONLN) by mutex_trylock_adaptive().
 */
static	int	ncpus;

/*
 * Distinguish spinning for queue locks from spinning for regular locks.
 * We try harder to acquire queue locks by spinning.
 * The environment variable:
 *	_THREAD_QUEUE_SPIN=count
 * can be used to override and set the count in the range [0 .. 1,000,000].
 */
int	thread_queue_spin = 10000;

/*
 * All of the attribute flag bits that may be OR-ed into a mutex type.
 * __mutex_init() masks these off to obtain the base type.
 */
#define	ALL_ATTRIBUTES				\
	(LOCK_RECURSIVE | LOCK_ERRORCHECK |	\
	LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT |	\
	LOCK_ROBUST)
104 
105 /*
106  * 'type' can be one of USYNC_THREAD, USYNC_PROCESS, or USYNC_PROCESS_ROBUST,
107  * augmented by zero or more the flags:
108  *	LOCK_RECURSIVE
109  *	LOCK_ERRORCHECK
110  *	LOCK_PRIO_INHERIT
111  *	LOCK_PRIO_PROTECT
112  *	LOCK_ROBUST
113  */
114 #pragma weak _private_mutex_init = __mutex_init
115 #pragma weak mutex_init = __mutex_init
116 #pragma weak _mutex_init = __mutex_init
117 /* ARGSUSED2 */
118 int
119 __mutex_init(mutex_t *mp, int type, void *arg)
120 {
121 	int basetype = (type & ~ALL_ATTRIBUTES);
122 	int error = 0;
123 
124 	if (basetype == USYNC_PROCESS_ROBUST) {
125 		/*
126 		 * USYNC_PROCESS_ROBUST is a deprecated historical type.
127 		 * We change it into (USYNC_PROCESS | LOCK_ROBUST) but
128 		 * retain the USYNC_PROCESS_ROBUST flag so we can return
129 		 * ELOCKUNMAPPED when necessary (only USYNC_PROCESS_ROBUST
130 		 * mutexes will ever draw ELOCKUNMAPPED).
131 		 */
132 		type |= (USYNC_PROCESS | LOCK_ROBUST);
133 		basetype = USYNC_PROCESS;
134 	}
135 
136 	if (!(basetype == USYNC_THREAD || basetype == USYNC_PROCESS) ||
137 	    (type & (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT))
138 	    == (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT)) {
139 		error = EINVAL;
140 	} else if (type & LOCK_ROBUST) {
141 		/*
142 		 * Callers of mutex_init() with the LOCK_ROBUST attribute
143 		 * are required to pass an initially all-zero mutex.
144 		 * Multiple calls to mutex_init() are allowed; all but
145 		 * the first return EBUSY.  A call to mutex_init() is
146 		 * allowed to make an inconsistent robust lock consistent
147 		 * (for historical usage, even though the proper interface
148 		 * for this is mutex_consistent()).  Note that we use
149 		 * atomic_or_16() to set the LOCK_INITED flag so as
150 		 * not to disturb surrounding bits (LOCK_OWNERDEAD, etc).
151 		 */
152 		extern void _atomic_or_16(volatile uint16_t *, uint16_t);
153 		if (!(mp->mutex_flag & LOCK_INITED)) {
154 			mp->mutex_type = (uint8_t)type;
155 			_atomic_or_16(&mp->mutex_flag, LOCK_INITED);
156 			mp->mutex_magic = MUTEX_MAGIC;
157 		} else if (type != mp->mutex_type ||
158 		    ((type & LOCK_PRIO_PROTECT) &&
159 		    mp->mutex_ceiling != (*(int *)arg))) {
160 			error = EINVAL;
161 		} else if (__mutex_consistent(mp) != 0) {
162 			error = EBUSY;
163 		}
164 		/* register a process robust mutex with the kernel */
165 		if (basetype == USYNC_PROCESS)
166 			register_lock(mp);
167 	} else {
168 		(void) _memset(mp, 0, sizeof (*mp));
169 		mp->mutex_type = (uint8_t)type;
170 		mp->mutex_flag = LOCK_INITED;
171 		mp->mutex_magic = MUTEX_MAGIC;
172 	}
173 
174 	if (error == 0 && (type & LOCK_PRIO_PROTECT))
175 		mp->mutex_ceiling = (uint8_t)(*(int *)arg);
176 
177 	return (error);
178 }
179 
180 /*
181  * Delete mp from list of ceil mutexes owned by curthread.
182  * Return 1 if the head of the chain was updated.
183  */
184 int
185 _ceil_mylist_del(mutex_t *mp)
186 {
187 	ulwp_t *self = curthread;
188 	mxchain_t **mcpp;
189 	mxchain_t *mcp;
190 
191 	mcpp = &self->ul_mxchain;
192 	while ((*mcpp)->mxchain_mx != mp)
193 		mcpp = &(*mcpp)->mxchain_next;
194 	mcp = *mcpp;
195 	*mcpp = mcp->mxchain_next;
196 	lfree(mcp, sizeof (*mcp));
197 	return (mcpp == &self->ul_mxchain);
198 }
199 
200 /*
201  * Add mp to head of list of ceil mutexes owned by curthread.
202  * Return ENOMEM if no memory could be allocated.
203  */
204 int
205 _ceil_mylist_add(mutex_t *mp)
206 {
207 	ulwp_t *self = curthread;
208 	mxchain_t *mcp;
209 
210 	if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
211 		return (ENOMEM);
212 	mcp->mxchain_mx = mp;
213 	mcp->mxchain_next = self->ul_mxchain;
214 	self->ul_mxchain = mcp;
215 	return (0);
216 }
217 
218 /*
219  * Inherit priority from ceiling.  The inheritance impacts the effective
220  * priority, not the assigned priority.  See _thread_setschedparam_main().
221  */
222 void
223 _ceil_prio_inherit(int ceil)
224 {
225 	ulwp_t *self = curthread;
226 	struct sched_param param;
227 
228 	(void) _memset(&param, 0, sizeof (param));
229 	param.sched_priority = ceil;
230 	if (_thread_setschedparam_main(self->ul_lwpid,
231 	    self->ul_policy, &param, PRIO_INHERIT)) {
232 		/*
233 		 * Panic since unclear what error code to return.
234 		 * If we do return the error codes returned by above
235 		 * called routine, update the man page...
236 		 */
237 		thr_panic("_thread_setschedparam_main() fails");
238 	}
239 }
240 
241 /*
242  * Waive inherited ceiling priority.  Inherit from head of owned ceiling locks
243  * if holding at least one ceiling lock.  If no ceiling locks are held at this
244  * point, disinherit completely, reverting back to assigned priority.
245  */
246 void
247 _ceil_prio_waive(void)
248 {
249 	ulwp_t *self = curthread;
250 	struct sched_param param;
251 
252 	(void) _memset(&param, 0, sizeof (param));
253 	if (self->ul_mxchain == NULL) {
254 		/*
255 		 * No ceil locks held.  Zero the epri, revert back to ul_pri.
256 		 * Since thread's hash lock is not held, one cannot just
257 		 * read ul_pri here...do it in the called routine...
258 		 */
259 		param.sched_priority = self->ul_pri;	/* ignored */
260 		if (_thread_setschedparam_main(self->ul_lwpid,
261 		    self->ul_policy, &param, PRIO_DISINHERIT))
262 			thr_panic("_thread_setschedparam_main() fails");
263 	} else {
264 		/*
265 		 * Set priority to that of the mutex at the head
266 		 * of the ceilmutex chain.
267 		 */
268 		param.sched_priority =
269 		    self->ul_mxchain->mxchain_mx->mutex_ceiling;
270 		if (_thread_setschedparam_main(self->ul_lwpid,
271 		    self->ul_policy, &param, PRIO_INHERIT))
272 			thr_panic("_thread_setschedparam_main() fails");
273 	}
274 }
275 
276 /*
277  * Clear the lock byte.  Retain the waiters byte and the spinners byte.
278  * Return the old value of the lock word.
279  */
280 static uint32_t
281 clear_lockbyte(volatile uint32_t *lockword)
282 {
283 	uint32_t old;
284 	uint32_t new;
285 
286 	do {
287 		old = *lockword;
288 		new = old & ~LOCKMASK;
289 	} while (atomic_cas_32(lockword, old, new) != old);
290 
291 	return (old);
292 }
293 
294 /*
295  * Increment the spinners count in the mutex lock word.
296  * Return 0 on success.  Return -1 if the count would overflow.
297  */
298 static int
299 spinners_incr(volatile uint32_t *lockword, uint8_t max_spinners)
300 {
301 	uint32_t old;
302 	uint32_t new;
303 
304 	do {
305 		old = *lockword;
306 		if (((old & SPINNERMASK) >> SPINNERSHIFT) >= max_spinners)
307 			return (-1);
308 		new = old + (1 << SPINNERSHIFT);
309 	} while (atomic_cas_32(lockword, old, new) != old);
310 
311 	return (0);
312 }
313 
314 /*
315  * Decrement the spinners count in the mutex lock word.
316  * Return the new value of the lock word.
317  */
318 static uint32_t
319 spinners_decr(volatile uint32_t *lockword)
320 {
321 	uint32_t old;
322 	uint32_t new;
323 
324 	do {
325 		new = old = *lockword;
326 		if (new & SPINNERMASK)
327 			new -= (1 << SPINNERSHIFT);
328 	} while (atomic_cas_32(lockword, old, new) != old);
329 
330 	return (new);
331 }
332 
/*
 * Non-preemptive spin locks.  Used by queue_lock().
 * No lock statistics are gathered for these locks.
 * No DTrace probes are provided for these locks.
 *
 * spin_lock_set() calls no_preempt() on the current thread and then
 * acquires mp, escalating from a single attempt, to adaptive spinning,
 * to blocking in the kernel.  On return mutex_owner is set to curthread.
 * The matching preempt() is issued by spin_lock_clear().
 */
void
spin_lock_set(mutex_t *mp)
{
	ulwp_t *self = curthread;

	no_preempt(self);
	/* Fast path: the lock byte was free. */
	if (set_lock_byte(&mp->mutex_lockw) == 0) {
		mp->mutex_owner = (uintptr_t)self;
		return;
	}
	/*
	 * Spin for a while, attempting to acquire the lock.
	 * The per-thread counters saturate at UINT_MAX rather than wrap.
	 */
	if (self->ul_spin_lock_spin != UINT_MAX)
		self->ul_spin_lock_spin++;
	if (mutex_queuelock_adaptive(mp) == 0 ||
	    set_lock_byte(&mp->mutex_lockw) == 0) {
		mp->mutex_owner = (uintptr_t)self;
		return;
	}
	/*
	 * Try harder if we were previously at a no preemption level.
	 * (ul_preempt > 1 means no_preempt() was already in effect
	 * before the call above.)
	 */
	if (self->ul_preempt > 1) {
		if (self->ul_spin_lock_spin2 != UINT_MAX)
			self->ul_spin_lock_spin2++;
		if (mutex_queuelock_adaptive(mp) == 0 ||
		    set_lock_byte(&mp->mutex_lockw) == 0) {
			mp->mutex_owner = (uintptr_t)self;
			return;
		}
	}
	/*
	 * Give up and block in the kernel for the mutex.
	 * The return value of the system call is deliberately ignored.
	 */
	if (self->ul_spin_lock_sleep != UINT_MAX)
		self->ul_spin_lock_sleep++;
	(void) ___lwp_mutex_timedlock(mp, NULL);
	mp->mutex_owner = (uintptr_t)self;
}
378 
379 void
380 spin_lock_clear(mutex_t *mp)
381 {
382 	ulwp_t *self = curthread;
383 
384 	mp->mutex_owner = 0;
385 	if (atomic_swap_32(&mp->mutex_lockword, 0) & WAITERMASK) {
386 		(void) ___lwp_mutex_wakeup(mp, 0);
387 		if (self->ul_spin_lock_wakeup != UINT_MAX)
388 			self->ul_spin_lock_wakeup++;
389 	}
390 	preempt(self);
391 }
392 
393 /*
394  * Allocate the sleep queue hash table.
395  */
396 void
397 queue_alloc(void)
398 {
399 	ulwp_t *self = curthread;
400 	uberdata_t *udp = self->ul_uberdata;
401 	mutex_t *mp;
402 	void *data;
403 	int i;
404 
405 	/*
406 	 * No locks are needed; we call here only when single-threaded.
407 	 */
408 	ASSERT(self == udp->ulwp_one);
409 	ASSERT(!udp->uberflags.uf_mt);
410 	if ((data = _private_mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
411 	    PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
412 	    == MAP_FAILED)
413 		thr_panic("cannot allocate thread queue_head table");
414 	udp->queue_head = (queue_head_t *)data;
415 	for (i = 0; i < 2 * QHASHSIZE; i++) {
416 		mp = &udp->queue_head[i].qh_lock;
417 		mp->mutex_flag = LOCK_INITED;
418 		mp->mutex_magic = MUTEX_MAGIC;
419 	}
420 }
421 
#if defined(THREAD_DEBUG)

/*
 * Debugging: verify correctness of a sleep queue.
 *
 * The cheap checks (qp lies within the queue_head[] table, the caller
 * owns the queue lock, head and tail are both NULL or both non-NULL)
 * are always made.  The full walk of the queue is made only when
 * thread_queue_verify is non-zero.
 */
void
QVERIFY(queue_head_t *qp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	ulwp_t *ulwp;
	ulwp_t *prev;
	uint_t index;
	uint32_t cnt = 0;
	char qtype;
	void *wchan;

	ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
	ASSERT((qp->qh_head != NULL && qp->qh_tail != NULL) ||
	    (qp->qh_head == NULL && qp->qh_tail == NULL));
	if (!thread_queue_verify)
		return;
	/* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
	/* the first QHASHSIZE queue heads are MX queues, the rest CV queues */
	qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
	for (prev = NULL, ulwp = qp->qh_head; ulwp != NULL;
	    prev = ulwp, ulwp = ulwp->ul_link, cnt++) {
		/* every thread on the queue must agree that it is on qp */
		ASSERT(ulwp->ul_qtype == qtype);
		ASSERT(ulwp->ul_wchan != NULL);
		ASSERT(ulwp->ul_sleepq == qp);
		/* ... and its wait channel must hash to this queue head */
		wchan = ulwp->ul_wchan;
		index = QUEUE_HASH(wchan, qtype);
		ASSERT(&udp->queue_head[index] == qp);
	}
	/* the tail pointer and length must match what the walk found */
	ASSERT(qp->qh_tail == prev);
	ASSERT(qp->qh_qlen == cnt);
}

#else	/* THREAD_DEBUG */

/* Without THREAD_DEBUG, queue verification compiles away to nothing. */
#define	QVERIFY(qp)

#endif	/* THREAD_DEBUG */
465 
466 /*
467  * Acquire a queue head.
468  */
469 queue_head_t *
470 queue_lock(void *wchan, int qtype)
471 {
472 	uberdata_t *udp = curthread->ul_uberdata;
473 	queue_head_t *qp;
474 
475 	ASSERT(qtype == MX || qtype == CV);
476 
477 	/*
478 	 * It is possible that we could be called while still single-threaded.
479 	 * If so, we call queue_alloc() to allocate the queue_head[] array.
480 	 */
481 	if ((qp = udp->queue_head) == NULL) {
482 		queue_alloc();
483 		qp = udp->queue_head;
484 	}
485 	qp += QUEUE_HASH(wchan, qtype);
486 	spin_lock_set(&qp->qh_lock);
487 	/*
488 	 * At once per nanosecond, qh_lockcount will wrap after 512 years.
489 	 * Were we to care about this, we could peg the value at UINT64_MAX.
490 	 */
491 	qp->qh_lockcount++;
492 	QVERIFY(qp);
493 	return (qp);
494 }
495 
/*
 * Release a queue head.
 * The queue is verified (a no-op unless built with THREAD_DEBUG)
 * while the lock is still held, then the spin lock taken by
 * queue_lock() is dropped.
 */
void
queue_unlock(queue_head_t *qp)
{
	QVERIFY(qp);
	spin_lock_clear(&qp->qh_lock);
}
505 
/*
 * For rwlock queueing, we must queue writers ahead of readers of the
 * same priority.  We do this by making writers appear to have a half
 * point higher priority for purposes of priority comparisons below.
 */
#define	CMP_PRIO(ulwp)	((real_priority(ulwp) << 1) + (ulwp)->ul_writer)

/*
 * Insert 'ulwp' on the sleep queue 'qp' as a waiter for 'wchan'.
 * 'qtype' is MX or CV, optionally OR-ed with FIFOQ to force FIFO
 * placement among threads of equal priority.  The caller must hold
 * the queue lock, and ulwp must not already be on this queue.
 */
void
enqueue(queue_head_t *qp, ulwp_t *ulwp, void *wchan, int qtype)
{
	ulwp_t **ulwpp;
	ulwp_t *next;
	int pri = CMP_PRIO(ulwp);
	int force_fifo = (qtype & FIFOQ);
	int do_fifo;

	/* strip the FIFOQ flag; only the base queue type is recorded */
	qtype &= ~FIFOQ;
	ASSERT(qtype == MX || qtype == CV);
	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
	ASSERT(ulwp->ul_sleepq != qp);

	/*
	 * LIFO queue ordering is unfair and can lead to starvation,
	 * but it gives better performance for heavily contended locks.
	 * We use thread_queue_fifo (range is 0..8) to determine
	 * the frequency of FIFO vs LIFO queuing:
	 *	0 : every 256th time	(almost always LIFO)
	 *	1 : every 128th time
	 *	2 : every 64th  time
	 *	3 : every 32nd  time
	 *	4 : every 16th  time	(the default value, mostly LIFO)
	 *	5 : every 8th   time
	 *	6 : every 4th   time
	 *	7 : every 2nd   time
	 *	8 : every time		(never LIFO, always FIFO)
	 * Note that there is always some degree of FIFO ordering.
	 * This breaks live lock conditions that occur in applications
	 * that are written assuming (incorrectly) that threads acquire
	 * locks fairly, that is, in roughly round-robin order.
	 * In any event, the queue is maintained in priority order.
	 *
	 * If we are given the FIFOQ flag in qtype, fifo queueing is forced.
	 * SUSV3 requires this for semaphores.
	 */
	do_fifo = (force_fifo ||
	    ((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0);

	if (qp->qh_head == NULL) {
		/*
		 * The queue is empty.  LIFO/FIFO doesn't matter.
		 */
		ASSERT(qp->qh_tail == NULL);
		ulwpp = &qp->qh_head;
	} else if (do_fifo) {
		/*
		 * Enqueue after the last thread whose priority is greater
		 * than or equal to the priority of the thread being queued.
		 * Attempt first to go directly onto the tail of the queue.
		 */
		if (pri <= CMP_PRIO(qp->qh_tail))
			ulwpp = &qp->qh_tail->ul_link;
		else {
			for (ulwpp = &qp->qh_head; (next = *ulwpp) != NULL;
			    ulwpp = &next->ul_link)
				if (pri > CMP_PRIO(next))
					break;
		}
	} else {
		/*
		 * Enqueue before the first thread whose priority is less
		 * than or equal to the priority of the thread being queued.
		 * Hopefully we can go directly onto the head of the queue.
		 */
		for (ulwpp = &qp->qh_head; (next = *ulwpp) != NULL;
		    ulwpp = &next->ul_link)
			if (pri >= CMP_PRIO(next))
				break;
	}
	/*
	 * Splice ulwp in at the chosen link.  If nothing follows it,
	 * it is the new tail of the queue.
	 */
	if ((ulwp->ul_link = *ulwpp) == NULL)
		qp->qh_tail = ulwp;
	*ulwpp = ulwp;

	ulwp->ul_sleepq = qp;
	ulwp->ul_wchan = wchan;
	ulwp->ul_qtype = qtype;
	/* maintain the current and high-water queue lengths */
	if (qp->qh_qmax < ++qp->qh_qlen)
		qp->qh_qmax = qp->qh_qlen;
}
594 
595 /*
596  * Return a pointer to the queue slot of the
597  * highest priority thread on the queue.
598  * On return, prevp, if not NULL, will contain a pointer
599  * to the thread's predecessor on the queue
600  */
601 static ulwp_t **
602 queue_slot(queue_head_t *qp, void *wchan, int *more, ulwp_t **prevp)
603 {
604 	ulwp_t **ulwpp;
605 	ulwp_t *ulwp;
606 	ulwp_t *prev = NULL;
607 	ulwp_t **suspp = NULL;
608 	ulwp_t *susprev;
609 
610 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
611 
612 	/*
613 	 * Find a waiter on the sleep queue.
614 	 */
615 	for (ulwpp = &qp->qh_head; (ulwp = *ulwpp) != NULL;
616 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
617 		if (ulwp->ul_wchan == wchan) {
618 			if (!ulwp->ul_stop)
619 				break;
620 			/*
621 			 * Try not to return a suspended thread.
622 			 * This mimics the old libthread's behavior.
623 			 */
624 			if (suspp == NULL) {
625 				suspp = ulwpp;
626 				susprev = prev;
627 			}
628 		}
629 	}
630 
631 	if (ulwp == NULL && suspp != NULL) {
632 		ulwp = *(ulwpp = suspp);
633 		prev = susprev;
634 		suspp = NULL;
635 	}
636 	if (ulwp == NULL) {
637 		if (more != NULL)
638 			*more = 0;
639 		return (NULL);
640 	}
641 
642 	if (prevp != NULL)
643 		*prevp = prev;
644 	if (more == NULL)
645 		return (ulwpp);
646 
647 	/*
648 	 * Scan the remainder of the queue for another waiter.
649 	 */
650 	if (suspp != NULL) {
651 		*more = 1;
652 		return (ulwpp);
653 	}
654 	for (ulwp = ulwp->ul_link; ulwp != NULL; ulwp = ulwp->ul_link) {
655 		if (ulwp->ul_wchan == wchan) {
656 			*more = 1;
657 			return (ulwpp);
658 		}
659 	}
660 
661 	*more = 0;
662 	return (ulwpp);
663 }
664 
665 ulwp_t *
666 queue_unlink(queue_head_t *qp, ulwp_t **ulwpp, ulwp_t *prev)
667 {
668 	ulwp_t *ulwp;
669 
670 	ulwp = *ulwpp;
671 	*ulwpp = ulwp->ul_link;
672 	ulwp->ul_link = NULL;
673 	if (qp->qh_tail == ulwp)
674 		qp->qh_tail = prev;
675 	qp->qh_qlen--;
676 	ulwp->ul_sleepq = NULL;
677 	ulwp->ul_wchan = NULL;
678 
679 	return (ulwp);
680 }
681 
682 ulwp_t *
683 dequeue(queue_head_t *qp, void *wchan, int *more)
684 {
685 	ulwp_t **ulwpp;
686 	ulwp_t *prev;
687 
688 	if ((ulwpp = queue_slot(qp, wchan, more, &prev)) == NULL)
689 		return (NULL);
690 	return (queue_unlink(qp, ulwpp, prev));
691 }
692 
693 /*
694  * Return a pointer to the highest priority thread sleeping on wchan.
695  */
696 ulwp_t *
697 queue_waiter(queue_head_t *qp, void *wchan)
698 {
699 	ulwp_t **ulwpp;
700 
701 	if ((ulwpp = queue_slot(qp, wchan, NULL, NULL)) == NULL)
702 		return (NULL);
703 	return (*ulwpp);
704 }
705 
706 uint8_t
707 dequeue_self(queue_head_t *qp, void *wchan)
708 {
709 	ulwp_t *self = curthread;
710 	ulwp_t **ulwpp;
711 	ulwp_t *ulwp;
712 	ulwp_t *prev = NULL;
713 	int found = 0;
714 	int more = 0;
715 
716 	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
717 
718 	/* find self on the sleep queue */
719 	for (ulwpp = &qp->qh_head; (ulwp = *ulwpp) != NULL;
720 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
721 		if (ulwp == self) {
722 			/* dequeue ourself */
723 			ASSERT(self->ul_wchan == wchan);
724 			(void) queue_unlink(qp, ulwpp, prev);
725 			self->ul_cvmutex = NULL;
726 			self->ul_cv_wake = 0;
727 			found = 1;
728 			break;
729 		}
730 		if (ulwp->ul_wchan == wchan)
731 			more = 1;
732 	}
733 
734 	if (!found)
735 		thr_panic("dequeue_self(): curthread not found on queue");
736 
737 	if (more)
738 		return (1);
739 
740 	/* scan the remainder of the queue for another waiter */
741 	for (ulwp = *ulwpp; ulwp != NULL; ulwp = ulwp->ul_link) {
742 		if (ulwp->ul_wchan == wchan)
743 			return (1);
744 	}
745 
746 	return (0);
747 }
748 
749 /*
750  * Called from call_user_handler() and _thrp_suspend() to take
751  * ourself off of our sleep queue so we can grab locks.
752  */
753 void
754 unsleep_self(void)
755 {
756 	ulwp_t *self = curthread;
757 	queue_head_t *qp;
758 
759 	/*
760 	 * Calling enter_critical()/exit_critical() here would lead
761 	 * to recursion.  Just manipulate self->ul_critical directly.
762 	 */
763 	self->ul_critical++;
764 	while (self->ul_sleepq != NULL) {
765 		qp = queue_lock(self->ul_wchan, self->ul_qtype);
766 		/*
767 		 * We may have been moved from a CV queue to a
768 		 * mutex queue while we were attempting queue_lock().
769 		 * If so, just loop around and try again.
770 		 * dequeue_self() clears self->ul_sleepq.
771 		 */
772 		if (qp == self->ul_sleepq) {
773 			(void) dequeue_self(qp, self->ul_wchan);
774 			self->ul_writer = 0;
775 		}
776 		queue_unlock(qp);
777 	}
778 	self->ul_critical--;
779 }
780 
/*
 * Common code for calling the ___lwp_mutex_timedlock() system call.
 * Returns with mutex_owner and mutex_ownerpid set correctly.
 *
 * 'tsp' is passed through to the system call.  'msp', if not NULL,
 * is the statistics block in which the sleep count and sleep time
 * are accumulated.
 * Returns the error from the system call; 0, EOWNERDEAD and
 * ELOCKUNMAPPED all mean that the lock was acquired.
 */
static int
mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	int mtype = mp->mutex_type;
	hrtime_t begin_sleep;
	int acquired;
	int error;

	/* publish where we are about to sleep, and on what */
	self->ul_sp = stkptr();
	self->ul_wchan = mp;
	if (__td_event_report(self, TD_SLEEP, udp)) {
		self->ul_td_evbuf.eventnum = TD_SLEEP;
		self->ul_td_evbuf.eventdata = mp;
		tdb_event(TD_SLEEP, udp);
	}
	/* begin_sleep is set, and later read, only when msp is non-NULL */
	if (msp) {
		tdb_incr(msp->mutex_sleep);
		begin_sleep = gethrtime();
	}

	DTRACE_PROBE1(plockstat, mutex__block, mp);

	for (;;) {
		/*
		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
		 * means we successfully acquired the lock.
		 */
		if ((error = ___lwp_mutex_timedlock(mp, tsp)) != 0 &&
		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
			acquired = 0;
			break;
		}

		if (mtype & USYNC_PROCESS) {
			/*
			 * Defend against forkall().  We may be the child,
			 * in which case we don't actually own the mutex.
			 * If the recorded owner pid is not ours, go around
			 * the loop and call into the kernel again.
			 */
			enter_critical(self);
			if (mp->mutex_ownerpid == udp->pid) {
				mp->mutex_owner = (uintptr_t)self;
				exit_critical(self);
				acquired = 1;
				break;
			}
			exit_critical(self);
		} else {
			mp->mutex_owner = (uintptr_t)self;
			acquired = 1;
			break;
		}
	}
	if (msp)
		msp->mutex_sleep_time += gethrtime() - begin_sleep;
	/* no longer sleeping */
	self->ul_wchan = NULL;
	self->ul_sp = 0;

	if (acquired) {
		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
	} else {
		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
	}

	return (error);
}
854 
855 /*
856  * Common code for calling the ___lwp_mutex_trylock() system call.
857  * Returns with mutex_owner and mutex_ownerpid set correctly.
858  */
859 int
860 mutex_trylock_kernel(mutex_t *mp)
861 {
862 	ulwp_t *self = curthread;
863 	uberdata_t *udp = self->ul_uberdata;
864 	int mtype = mp->mutex_type;
865 	int error;
866 	int acquired;
867 
868 	for (;;) {
869 		/*
870 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
871 		 * means we successfully acquired the lock.
872 		 */
873 		if ((error = ___lwp_mutex_trylock(mp)) != 0 &&
874 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
875 			acquired = 0;
876 			break;
877 		}
878 
879 		if (mtype & USYNC_PROCESS) {
880 			/*
881 			 * Defend against forkall().  We may be the child,
882 			 * in which case we don't actually own the mutex.
883 			 */
884 			enter_critical(self);
885 			if (mp->mutex_ownerpid == udp->pid) {
886 				mp->mutex_owner = (uintptr_t)self;
887 				exit_critical(self);
888 				acquired = 1;
889 				break;
890 			}
891 			exit_critical(self);
892 		} else {
893 			mp->mutex_owner = (uintptr_t)self;
894 			acquired = 1;
895 			break;
896 		}
897 	}
898 
899 	if (acquired) {
900 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
901 	} else if (error != EBUSY) {
902 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
903 	}
904 
905 	return (error);
906 }
907 
908 volatile sc_shared_t *
909 setup_schedctl(void)
910 {
911 	ulwp_t *self = curthread;
912 	volatile sc_shared_t *scp;
913 	sc_shared_t *tmp;
914 
915 	if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
916 	    !self->ul_vfork &&			/* not a child of vfork() */
917 	    !self->ul_schedctl_called) {	/* haven't been called before */
918 		enter_critical(self);
919 		self->ul_schedctl_called = &self->ul_uberdata->uberflags;
920 		if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
921 			self->ul_schedctl = scp = tmp;
922 		exit_critical(self);
923 	}
924 	/*
925 	 * Unless the call to setup_schedctl() is surrounded
926 	 * by enter_critical()/exit_critical(), the address
927 	 * we are returning could be invalid due to a forkall()
928 	 * having occurred in another thread.
929 	 */
930 	return (scp);
931 }
932 
933 /*
934  * Interfaces from libsched, incorporated into libc.
935  * libsched.so.1 is now a filter library onto libc.
936  */
937 #pragma weak schedctl_lookup = _schedctl_init
938 #pragma weak _schedctl_lookup = _schedctl_init
939 #pragma weak schedctl_init = _schedctl_init
940 schedctl_t *
941 _schedctl_init(void)
942 {
943 	volatile sc_shared_t *scp = setup_schedctl();
944 	return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
945 }
946 
/*
 * Part of the libsched-derived interface; in this implementation
 * it does nothing.
 */
#pragma weak schedctl_exit = _schedctl_exit
void
_schedctl_exit(void)
{
}
952 
953 /*
954  * Contract private interface for java.
955  * Set up the schedctl data if it doesn't exist yet.
956  * Return a pointer to the pointer to the schedctl data.
957  */
958 volatile sc_shared_t *volatile *
959 _thr_schedctl(void)
960 {
961 	ulwp_t *self = curthread;
962 	volatile sc_shared_t *volatile *ptr;
963 
964 	if (self->ul_vfork)
965 		return (NULL);
966 	if (*(ptr = &self->ul_schedctl) == NULL)
967 		(void) setup_schedctl();
968 	return (ptr);
969 }
970 
971 /*
972  * Block signals and attempt to block preemption.
973  * no_preempt()/preempt() must be used in pairs but can be nested.
974  */
975 void
976 no_preempt(ulwp_t *self)
977 {
978 	volatile sc_shared_t *scp;
979 
980 	if (self->ul_preempt++ == 0) {
981 		enter_critical(self);
982 		if ((scp = self->ul_schedctl) != NULL ||
983 		    (scp = setup_schedctl()) != NULL) {
984 			/*
985 			 * Save the pre-existing preempt value.
986 			 */
987 			self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
988 			scp->sc_preemptctl.sc_nopreempt = 1;
989 		}
990 	}
991 }
992 
/*
 * Undo the effects of no_preempt().
 * Only the call that brings ul_preempt back to zero restores the
 * saved schedctl no-preempt value and leaves the critical region
 * that the outermost no_preempt() entered.
 */
void
preempt(ulwp_t *self)
{
	volatile sc_shared_t *scp;

	ASSERT(self->ul_preempt > 0);
	if (--self->ul_preempt == 0) {
		if ((scp = self->ul_schedctl) != NULL) {
			/*
			 * Restore the pre-existing preempt value.
			 */
			scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
			/*
			 * If a yield was requested (sc_yield is set) and
			 * preemption is allowed again, give up the
			 * processor now.
			 */
			if (scp->sc_preemptctl.sc_yield &&
			    scp->sc_preemptctl.sc_nopreempt == 0) {
				lwp_yield();
				if (scp->sc_preemptctl.sc_yield) {
					/*
					 * Shouldn't happen.  This is either
					 * a race condition or the thread
					 * just entered the real-time class.
					 */
					lwp_yield();
					scp->sc_preemptctl.sc_yield = 0;
				}
			}
		}
		exit_critical(self);
	}
}
1025 
1026 /*
1027  * If a call to preempt() would cause the current thread to yield or to
1028  * take deferred actions in exit_critical(), then unpark the specified
1029  * lwp so it can run while we delay.  Return the original lwpid if the
1030  * unpark was not performed, else return zero.  The tests are a repeat
1031  * of some of the tests in preempt(), above.  This is a statistical
1032  * optimization solely for cond_sleep_queue(), below.
1033  */
1034 static lwpid_t
1035 preempt_unpark(ulwp_t *self, lwpid_t lwpid)
1036 {
1037 	volatile sc_shared_t *scp = self->ul_schedctl;
1038 
1039 	ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
1040 	if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
1041 	    (self->ul_curplease && self->ul_critical == 1)) {
1042 		(void) __lwp_unpark(lwpid);
1043 		lwpid = 0;
1044 	}
1045 	return (lwpid);
1046 }
1047 
/*
 * Try to acquire a process-private (non-USYNC_PROCESS) mutex without
 * going to sleep.
 *
 * One attempt to grab the lock byte is always made.  If that fails and
 * 'tryhard' is true, spin for up to self->ul_adaptive_spin iterations,
 * provided that spinners_incr() lets us register as a spinner in the
 * mutex lockword.
 *
 * Return values:
 *	0		lock acquired; mutex_owner is set to curthread
 *	EBUSY		lock not acquired; the caller decides what to do.
 *			Also returned at once, with no attempt made, when
 *			curthread already owns the mutex.
 *	EOWNERDEAD	lock acquired, but LOCK_OWNERDEAD is set in mutex_flag
 *	ENOTRECOVERABLE	LOCK_NOTRECOVERABLE is set in mutex_flag;
 *			the lock is not held on return
 */
static int
mutex_trylock_adaptive(mutex_t *mp, int tryhard)
{
	ulwp_t *self = curthread;
	int error = EBUSY;
	ulwp_t *ulwp;
	volatile sc_shared_t *scp;
	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
	volatile uint64_t *ownerp = (volatile uint64_t *)&mp->mutex_owner;
	uint32_t new_lockword;
	int count = 0;		/* stays 0 unless we enter the spin loop */
	int max_count;
	uint8_t max_spinners;

	ASSERT(!(mp->mutex_type & USYNC_PROCESS));

	/* We already own it; this path never reports anything but EBUSY. */
	if (MUTEX_OWNER(mp) == self)
		return (EBUSY);

	/* short-cut, not definitive (see below) */
	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		error = ENOTRECOVERABLE;
		goto done;
	}

	/*
	 * Make one attempt to acquire the lock before
	 * incurring the overhead of the spin loop.
	 */
	if (set_lock_byte(lockp) == 0) {
		*ownerp = (uintptr_t)self;
		error = 0;
		goto done;
	}
	if (!tryhard)
		goto done;
	/* ncpus is fetched lazily, the first time it is found to be zero. */
	if (ncpus == 0)
		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
	/*
	 * Cap the number of spinners at one less than the number of
	 * processors.  If only one processor is online, max_spinners
	 * ends up 0 and we do not spin at all.
	 */
	if ((max_spinners = self->ul_max_spinners) >= ncpus)
		max_spinners = ncpus - 1;
	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
	if (max_count == 0)
		goto done;

	/*
	 * This spin loop is unfair to lwps that have already dropped into
	 * the kernel to sleep.  They will starve on a highly-contended mutex.
	 * This is just too bad.  The adaptive spin algorithm is intended
	 * to allow programs with highly-contended locks (that is, broken
	 * programs) to execute with reasonable speed despite their contention.
	 * Being fair would reduce the speed of such programs and well-written
	 * programs will not suffer in any case.
	 */
	enter_critical(self);
	/* Too many spinners already registered: give up without spinning. */
	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1) {
		exit_critical(self);
		goto done;
	}
	DTRACE_PROBE1(plockstat, mutex__spin, mp);
	for (count = 1; ; count++) {
		/*
		 * Read the lock byte first and only attempt the atomic
		 * set_lock_byte() when the byte reads as zero.
		 */
		if (*lockp == 0 && set_lock_byte(lockp) == 0) {
			*ownerp = (uintptr_t)self;
			error = 0;
			break;
		}
		if (count == max_count)
			break;
		SMT_PAUSE();
		/*
		 * Stop spinning if the mutex owner is not running on
		 * a processor; it will not drop the lock any time soon
		 * and we would just be wasting time to keep spinning.
		 *
		 * Note that we are looking at another thread (ulwp_t)
		 * without ensuring that the other thread does not exit.
		 * The scheme relies on ulwp_t structures never being
		 * deallocated by the library (the library employs a free
		 * list of ulwp_t structs that are reused when new threads
		 * are created) and on schedctl shared memory never being
		 * deallocated once created via __schedctl().
		 *
		 * Thus, the worst that can happen when the spinning thread
		 * looks at the owner's schedctl data is that it is looking
		 * at some other thread's schedctl data.  This almost never
		 * happens and is benign when it does.
		 */
		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
		    ((scp = ulwp->ul_schedctl) == NULL ||
		    scp->sc_state != SC_ONPROC))
			break;
	}
	/* Unregister as a spinner; new_lockword is the resulting lockword. */
	new_lockword = spinners_decr(&mp->mutex_lockword);
	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
		/*
		 * We haven't yet acquired the lock, the lock
		 * is free, and there are no other spinners.
		 * Make one final attempt to acquire the lock.
		 *
		 * This isn't strictly necessary since mutex_lock_queue()
		 * (the next action this thread will take if it doesn't
		 * acquire the lock here) makes one attempt to acquire
		 * the lock before putting the thread to sleep.
		 *
		 * If the next action for this thread (on failure here)
		 * were not to call mutex_lock_queue(), this would be
		 * necessary for correctness, to avoid ending up with an
		 * unheld mutex with waiters but no one to wake them up.
		 */
		if (set_lock_byte(lockp) == 0) {
			*ownerp = (uintptr_t)self;
			error = 0;
		}
		/* The final attempt is counted as one more spin. */
		count++;
	}
	exit_critical(self);

done:
	/*
	 * Definitive LOCK_NOTRECOVERABLE check: the flag is re-examined
	 * now that we hold the lock.  If it is set, back out.
	 */
	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		/*
		 * We shouldn't own the mutex; clear the lock.
		 */
		mp->mutex_owner = 0;
		if (clear_lockbyte(&mp->mutex_lockword) & WAITERMASK)
			mutex_wakeup_all(mp);
		error = ENOTRECOVERABLE;
	}

	/*
	 * Fire the plockstat probes.  mutex__spun is reported only when
	 * the spin loop was actually entered (count != 0).
	 */
	if (error) {
		if (count) {
			DTRACE_PROBE2(plockstat, mutex__spun, 0, count);
		}
		if (error != EBUSY) {
			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
		}
	} else {
		if (count) {
			DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
		}
		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
		/* The lock is held, but tell the caller the owner died. */
		if (mp->mutex_flag & LOCK_OWNERDEAD) {
			ASSERT(mp->mutex_type & LOCK_ROBUST);
			error = EOWNERDEAD;
		}
	}

	return (error);
}
1202 
1203 /*
1204  * Same as mutex_trylock_adaptive(), except specifically for queue locks.
1205  * The owner field is not set here; the caller (spin_lock_set()) sets it.
1206  */
1207 static int
1208 mutex_queuelock_adaptive(mutex_t *mp)
1209 {
1210 	ulwp_t *ulwp;
1211 	volatile sc_shared_t *scp;
1212 	volatile uint8_t *lockp;
1213 	volatile uint64_t *ownerp;
1214 	int count = curthread->ul_queue_spin;
1215 
1216 	ASSERT(mp->mutex_type == USYNC_THREAD);
1217 
1218 	if (count == 0)
1219 		return (EBUSY);
1220 
1221 	lockp = (volatile uint8_t *)&mp->mutex_lockw;
1222 	ownerp = (volatile uint64_t *)&mp->mutex_owner;
1223 	while (--count >= 0) {
1224 		if (*lockp == 0 && set_lock_byte(lockp) == 0)
1225 			return (0);
1226 		SMT_PAUSE();
1227 		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1228 		    ((scp = ulwp->ul_schedctl) == NULL ||
1229 		    scp->sc_state != SC_ONPROC))
1230 			break;
1231 	}
1232 
1233 	return (EBUSY);
1234 }
1235 
/*
 * Like mutex_trylock_adaptive(), but for process-shared mutexes.
 * Spin for a while (if 'tryhard' is true), trying to grab the lock.
 *
 * Return values:
 *	0		lock acquired; mutex_owner is set to curthread and
 *			mutex_ownerpid is set to the current pid
 *	EBUSY		lock not acquired; the caller decides what to do.
 *			Also returned at once when shared_mutex_held()
 *			reports that the mutex is already held.
 *	EOWNERDEAD,
 *	ELOCKUNMAPPED	lock acquired, but LOCK_OWNERDEAD and/or
 *			LOCK_UNMAPPED is set in mutex_flag (see the mapping
 *			at the end of the function)
 *	ENOTRECOVERABLE	LOCK_NOTRECOVERABLE is set in mutex_flag;
 *			the lock is not held on return
 */
static int
mutex_trylock_process(mutex_t *mp, int tryhard)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	int error = EBUSY;
	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
	uint32_t new_lockword;
	int count = 0;		/* stays 0 unless we enter the spin loop */
	int max_count;
	uint8_t max_spinners;

	ASSERT(mp->mutex_type & USYNC_PROCESS);

	if (shared_mutex_held(mp))
		return (EBUSY);

	/* short-cut, not definitive (see below) */
	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		error = ENOTRECOVERABLE;
		goto done;
	}

	/*
	 * Make one attempt to acquire the lock before
	 * incurring the overhead of the spin loop.
	 *
	 * The lock byte, owner and ownerpid are all updated inside one
	 * critical section.
	 * NOTE(review): presumably this keeps the thread from being
	 * interrupted between taking the lock byte and recording the
	 * ownership -- confirm against enter_critical().
	 */
	enter_critical(self);
	if (set_lock_byte(lockp) == 0) {
		mp->mutex_owner = (uintptr_t)self;
		mp->mutex_ownerpid = udp->pid;
		exit_critical(self);
		error = 0;
		goto done;
	}
	exit_critical(self);
	if (!tryhard)
		goto done;
	/* ncpus is fetched lazily, the first time it is found to be zero. */
	if (ncpus == 0)
		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
	/*
	 * Cap the number of spinners at one less than the number of
	 * processors.  If only one processor is online, max_spinners
	 * ends up 0 and we do not spin at all.
	 */
	if ((max_spinners = self->ul_max_spinners) >= ncpus)
		max_spinners = ncpus - 1;
	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
	if (max_count == 0)
		goto done;

	/*
	 * This is a process-shared mutex.
	 * We cannot know if the owner is running on a processor.
	 * We just spin and hope that it is on a processor.
	 */
	enter_critical(self);
	/* Too many spinners already registered: give up without spinning. */
	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1) {
		exit_critical(self);
		goto done;
	}
	DTRACE_PROBE1(plockstat, mutex__spin, mp);
	for (count = 1; ; count++) {
		/*
		 * Read the lock byte first and only attempt the atomic
		 * set_lock_byte() when the byte reads as zero.
		 */
		if (*lockp == 0 && set_lock_byte(lockp) == 0) {
			mp->mutex_owner = (uintptr_t)self;
			mp->mutex_ownerpid = udp->pid;
			error = 0;
			break;
		}
		if (count == max_count)
			break;
		SMT_PAUSE();
	}
	/* Unregister as a spinner; new_lockword is the resulting lockword. */
	new_lockword = spinners_decr(&mp->mutex_lockword);
	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
		/*
		 * We haven't yet acquired the lock, the lock
		 * is free, and there are no other spinners.
		 * Make one final attempt to acquire the lock.
		 *
		 * This isn't strictly necessary since mutex_lock_kernel()
		 * (the next action this thread will take if it doesn't
		 * acquire the lock here) makes one attempt to acquire
		 * the lock before putting the thread to sleep.
		 *
		 * If the next action for this thread (on failure here)
		 * were not to call mutex_lock_kernel(), this would be
		 * necessary for correctness, to avoid ending up with an
		 * unheld mutex with waiters but no one to wake them up.
		 */
		if (set_lock_byte(lockp) == 0) {
			mp->mutex_owner = (uintptr_t)self;
			mp->mutex_ownerpid = udp->pid;
			error = 0;
		}
		/* The final attempt is counted as one more spin. */
		count++;
	}
	exit_critical(self);

done:
	/*
	 * Definitive LOCK_NOTRECOVERABLE check: the flag is re-examined
	 * now that we hold the lock.  If it is set, back out.
	 */
	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		/*
		 * We shouldn't own the mutex; clear the lock.
		 */
		mp->mutex_owner = 0;
		mp->mutex_ownerpid = 0;
		/* Waiters exist: ask the kernel to wake all of them. */
		if (clear_lockbyte(&mp->mutex_lockword) & WAITERMASK) {
			no_preempt(self);
			(void) ___lwp_mutex_wakeup(mp, 1);
			preempt(self);
		}
		error = ENOTRECOVERABLE;
	}

	/*
	 * Fire the plockstat probes.  mutex__spun is reported only when
	 * the spin loop was actually entered (count != 0).
	 */
	if (error) {
		if (count) {
			DTRACE_PROBE2(plockstat, mutex__spun, 0, count);
		}
		if (error != EBUSY) {
			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
		}
	} else {
		if (count) {
			DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
		}
		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
		/*
		 * The lock is held, but report the robust-lock condition.
		 * LOCK_OWNERDEAD takes precedence.  LOCK_UNMAPPED alone is
		 * reported as ELOCKUNMAPPED only when USYNC_PROCESS_ROBUST
		 * is set in mutex_type; otherwise it too is EOWNERDEAD.
		 */
		if (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
			ASSERT(mp->mutex_type & LOCK_ROBUST);
			if (mp->mutex_flag & LOCK_OWNERDEAD)
				error = EOWNERDEAD;
			else if (mp->mutex_type & USYNC_PROCESS_ROBUST)
				error = ELOCKUNMAPPED;
			else
				error = EOWNERDEAD;
		}
	}

	return (error);
}
1379 
1380 /*
1381  * Mutex wakeup code for releasing a USYNC_THREAD mutex.
1382  * Returns the lwpid of the thread that was dequeued, if any.
1383  * The caller of mutex_wakeup() must call __lwp_unpark(lwpid)
1384  * to wake up the specified lwp.
1385  */
1386 static lwpid_t
1387 mutex_wakeup(mutex_t *mp)
1388 {
1389 	lwpid_t lwpid = 0;
1390 	queue_head_t *qp;
1391 	ulwp_t *ulwp;
1392 	int more;
1393 
1394 	/*
1395 	 * Dequeue a waiter from the sleep queue.  Don't touch the mutex
1396 	 * waiters bit if no one was found on the queue because the mutex
1397 	 * might have been deallocated or reallocated for another purpose.
1398 	 */
1399 	qp = queue_lock(mp, MX);
1400 	if ((ulwp = dequeue(qp, mp, &more)) != NULL) {
1401 		lwpid = ulwp->ul_lwpid;
1402 		mp->mutex_waiters = (more? 1 : 0);
1403 	}
1404 	queue_unlock(qp);
1405 	return (lwpid);
1406 }
1407 
/*
 * Mutex wakeup code for releasing all waiters on a USYNC_THREAD mutex.
 *
 * Every thread on the sleep queue whose wait channel is 'mp' is unlinked
 * from the queue and then unparked.  Nothing is returned.
 */
static void
mutex_wakeup_all(mutex_t *mp)
{
	queue_head_t *qp;
	int nlwpid = 0;			/* number of entries used in lwpid[] */
	int maxlwps = MAXLWPS;		/* current capacity of lwpid[] */
	ulwp_t **ulwpp;
	ulwp_t *ulwp;
	ulwp_t *prev = NULL;
	lwpid_t buffer[MAXLWPS];	/* initial on-stack storage */
	lwpid_t *lwpid = buffer;

	/*
	 * Walk the list of waiters and prepare to wake up all of them.
	 * The waiters flag has already been cleared from the mutex.
	 *
	 * We keep track of lwpids that are to be unparked in lwpid[].
	 * __lwp_unpark_all() is called to unpark all of them after
	 * they have been removed from the sleep queue and the sleep
	 * queue lock has been dropped.  If we run out of space in our
	 * on-stack buffer, we need to allocate more but we can't call
	 * lmalloc() because we are holding a queue lock when the overflow
	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
	 * either because the application may have allocated a small
	 * stack and we don't want to overrun the stack.  So we call
	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
	 * system call directly since that path acquires no locks.
	 */
	qp = queue_lock(mp, MX);
	ulwpp = &qp->qh_head;
	while ((ulwp = *ulwpp) != NULL) {
		if (ulwp->ul_wchan != mp) {
			/* Waiting on something else: step past this entry. */
			prev = ulwp;
			ulwpp = &ulwp->ul_link;
		} else {
			if (nlwpid == maxlwps)
				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
			/*
			 * ulwpp and prev are deliberately not advanced here;
			 * the loop re-reads *ulwpp after the unlink.
			 * NOTE(review): presumably queue_unlink() replaces
			 * *ulwpp with the following entry -- confirm.
			 */
			(void) queue_unlink(qp, ulwpp, prev);
			lwpid[nlwpid++] = ulwp->ul_lwpid;
		}
	}

	if (nlwpid == 0) {
		queue_unlock(qp);
	} else {
		mp->mutex_waiters = 0;
		/*
		 * Preemption is disabled before the queue lock is dropped
		 * and re-enabled only after the unpark call has been made.
		 */
		no_preempt(curthread);
		queue_unlock(qp);
		if (nlwpid == 1)
			(void) __lwp_unpark(lwpid[0]);
		else
			(void) __lwp_unpark_all(lwpid, nlwpid);
		preempt(curthread);
	}

	/* Release the overflow buffer if alloc_lwpids() replaced ours. */
	if (lwpid != buffer)
		(void) _private_munmap(lwpid, maxlwps * sizeof (lwpid_t));
}
1469 
1470 /*
1471  * Release a process-private mutex.
1472  * As an optimization, if there are waiters but there are also spinners
1473  * attempting to acquire the mutex, then don't bother waking up a waiter;
1474  * one of the spinners will acquire the mutex soon and it would be a waste
1475  * of resources to wake up some thread just to have it spin for a while
1476  * and then possibly go back to sleep.  See mutex_trylock_adaptive().
1477  */
1478 static lwpid_t
1479 mutex_unlock_queue(mutex_t *mp, int release_all)
1480 {
1481 	lwpid_t lwpid = 0;
1482 	uint32_t old_lockword;
1483 
1484 	mp->mutex_owner = 0;
1485 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1486 	old_lockword = clear_lockbyte(&mp->mutex_lockword);
1487 	if ((old_lockword & WAITERMASK) &&
1488 	    (release_all || (old_lockword & SPINNERMASK) == 0)) {
1489 		ulwp_t *self = curthread;
1490 		no_preempt(self);	/* ensure a prompt wakeup */
1491 		if (release_all)
1492 			mutex_wakeup_all(mp);
1493 		else
1494 			lwpid = mutex_wakeup(mp);
1495 		if (lwpid == 0)
1496 			preempt(self);
1497 	}
1498 	return (lwpid);
1499 }
1500 
1501 /*
1502  * Like mutex_unlock_queue(), but for process-shared mutexes.
1503  */
1504 static void
1505 mutex_unlock_process(mutex_t *mp, int release_all)
1506 {
1507 	uint32_t old_lockword;
1508 
1509 	mp->mutex_owner = 0;
1510 	mp->mutex_ownerpid = 0;
1511 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1512 	old_lockword = clear_lockbyte(&mp->mutex_lockword);
1513 	if ((old_lockword & WAITERMASK) &&
1514 	    (release_all || (old_lockword & SPINNERMASK) == 0)) {
1515 		ulwp_t *self = curthread;
1516 		no_preempt(self);	/* ensure a prompt wakeup */
1517 		(void) ___lwp_mutex_wakeup(mp, release_all);
1518 		preempt(self);
1519 	}
1520 }
1521 
1522 /*
1523  * Return the real priority of a thread.
1524  */
1525 int
1526 real_priority(ulwp_t *ulwp)
1527 {
1528 	if (ulwp->ul_epri == 0)
1529 		return (ulwp->ul_mappedpri? ulwp->ul_mappedpri : ulwp->ul_pri);
1530 	return (ulwp->ul_emappedpri? ulwp->ul_emappedpri : ulwp->ul_epri);
1531 }
1532 
1533 void
1534 stall(void)
1535 {
1536 	for (;;)
1537 		(void) mutex_lock_kernel(&stall_mutex, NULL, NULL);
1538 }
1539 
/*
 * Acquire a USYNC_THREAD mutex via user-level sleep queues.
 * We failed set_lock_byte(&mp->mutex_lockw) before coming here.
 * If successful, returns with mutex_owner set correctly.
 *
 * Arguments:
 *	self	the calling thread
 *	msp	lock statistics for 'mp', or NULL when none are kept
 *	mp	the mutex to acquire
 *	tsp	handed unchanged to __lwp_park(); callers in this file
 *		pass NULL when no timeout applies
 *
 * Return values:
 *	0		lock acquired
 *	EOWNERDEAD	lock acquired, but LOCK_OWNERDEAD is set in mutex_flag
 *	ENOTRECOVERABLE	LOCK_NOTRECOVERABLE is set; the lock is not held
 *	other		error from __lwp_park() other than EINTR (asserted
 *			below to be EINVAL or ETIME); the lock is not held
 */
int
mutex_lock_queue(ulwp_t *self, tdb_mutex_stats_t *msp, mutex_t *mp,
	timespec_t *tsp)
{
	uberdata_t *udp = curthread->ul_uberdata;
	queue_head_t *qp;
	hrtime_t begin_sleep;	/* set and used only when msp != NULL */
	int error = 0;

	self->ul_sp = stkptr();
	/* Report a TD_SLEEP event if one has been requested. */
	if (__td_event_report(self, TD_SLEEP, udp)) {
		self->ul_wchan = mp;
		self->ul_td_evbuf.eventnum = TD_SLEEP;
		self->ul_td_evbuf.eventdata = mp;
		tdb_event(TD_SLEEP, udp);
	}
	if (msp) {
		tdb_incr(msp->mutex_sleep);
		begin_sleep = gethrtime();
	}

	DTRACE_PROBE1(plockstat, mutex__block, mp);

	/*
	 * Put ourself on the sleep queue, and while we are
	 * unable to grab the lock, go park in the kernel.
	 * Take ourself off the sleep queue after we acquire the lock.
	 * The waiter bit can be set/cleared only while holding the queue lock.
	 */
	qp = queue_lock(mp, MX);
	enqueue(qp, self, mp, MX);
	mp->mutex_waiters = 1;
	for (;;) {
		/* Each pass starts with the queue lock held and us queued. */
		if (set_lock_byte(&mp->mutex_lockw) == 0) {
			mp->mutex_owner = (uintptr_t)self;
			mp->mutex_waiters = dequeue_self(qp, mp);
			break;
		}
		set_parking_flag(self, 1);
		queue_unlock(qp);
		/*
		 * __lwp_park() will return the residual time in tsp
		 * if we are unparked before the timeout expires.
		 */
		error = __lwp_park(tsp, 0);
		set_parking_flag(self, 0);
		/*
		 * We could have taken a signal or suspended ourself.
		 * If we did, then we removed ourself from the queue.
		 * Someone else may have removed us from the queue
		 * as a consequence of mutex_unlock().  We may have
		 * gotten a timeout from __lwp_park().  Or we may still
		 * be on the queue and this is just a spurious wakeup.
		 */
		qp = queue_lock(mp, MX);
		if (self->ul_sleepq == NULL) {
			/* We are no longer on the sleep queue. */
			if (error) {
				/*
				 * Recompute the waiters bit from the queue.
				 * Any error but EINTR ends the wait; EINTR
				 * is discarded and we try again.
				 */
				mp->mutex_waiters = queue_waiter(qp, mp)? 1 : 0;
				if (error != EINTR)
					break;
				error = 0;
			}
			if (set_lock_byte(&mp->mutex_lockw) == 0) {
				mp->mutex_owner = (uintptr_t)self;
				break;
			}
			/* Still locked: get back on the queue and park again. */
			enqueue(qp, self, mp, MX);
			mp->mutex_waiters = 1;
		}
		ASSERT(self->ul_sleepq == qp &&
		    self->ul_qtype == MX &&
		    self->ul_wchan == mp);
		if (error) {
			/*
			 * We are still queued here.  Any error but EINTR
			 * ends the wait, so dequeue ourself first.
			 */
			if (error != EINTR) {
				mp->mutex_waiters = dequeue_self(qp, mp);
				break;
			}
			error = 0;
		}
	}
	/* Every exit from the loop leaves us off the queue, queue lock held. */
	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
	    self->ul_wchan == NULL);
	self->ul_sp = 0;
	queue_unlock(qp);

	if (msp)
		msp->mutex_sleep_time += gethrtime() - begin_sleep;

	ASSERT(error == 0 || error == EINVAL || error == ETIME);

	/* We hold the lock but the mutex is unusable: back out. */
	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		/*
		 * We shouldn't own the mutex; clear the lock.
		 */
		mp->mutex_owner = 0;
		if (clear_lockbyte(&mp->mutex_lockword) & WAITERMASK)
			mutex_wakeup_all(mp);
		error = ENOTRECOVERABLE;
	}

	if (error) {
		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
	} else {
		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
		/* The lock is held, but tell the caller the owner died. */
		if (mp->mutex_flag & LOCK_OWNERDEAD) {
			ASSERT(mp->mutex_type & LOCK_ROBUST);
			error = EOWNERDEAD;
		}
	}

	return (error);
}
1660 
1661 static int
1662 mutex_recursion(mutex_t *mp, int mtype, int try)
1663 {
1664 	ASSERT(mutex_is_held(mp));
1665 	ASSERT(mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK));
1666 	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
1667 
1668 	if (mtype & LOCK_RECURSIVE) {
1669 		if (mp->mutex_rcount == RECURSION_MAX) {
1670 			DTRACE_PROBE2(plockstat, mutex__error, mp, EAGAIN);
1671 			return (EAGAIN);
1672 		}
1673 		mp->mutex_rcount++;
1674 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 1, 0);
1675 		return (0);
1676 	}
1677 	if (try == MUTEX_LOCK) {
1678 		DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1679 		return (EDEADLK);
1680 	}
1681 	return (EBUSY);
1682 }
1683 
/*
 * Register this USYNC_PROCESS|LOCK_ROBUST mutex with the kernel so
 * it can apply LOCK_OWNERDEAD|LOCK_UNMAPPED if it becomes necessary.
 * We use tdb_hash_lock here and in the synch object tracking code in
 * the tdb_agent.c file.  There is no conflict between these two usages.
 *
 * Registered mutexes are remembered in udp->robustlocks, a hash table
 * of LOCKHASHSZ chains keyed by LOCK_HASH(mp), so that each mutex is
 * passed to ___lwp_mutex_register() only once.
 */
void
register_lock(mutex_t *mp)
{
	uberdata_t *udp = curthread->ul_uberdata;
	uint_t hash = LOCK_HASH(mp);
	robust_t *rlp;
	robust_t **rlpp;
	robust_t **table;

	/*
	 * Create the hash table on first use.  The pointer is tested
	 * without the lock and tested again with tdb_hash_lock held,
	 * so only one thread allocates the table.
	 */
	if ((table = udp->robustlocks) == NULL) {
		lmutex_lock(&udp->tdb_hash_lock);
		if ((table = udp->robustlocks) == NULL) {
			table = lmalloc(LOCKHASHSZ * sizeof (robust_t *));
			/* Barrier issued before the table pointer is stored. */
			_membar_producer();
			udp->robustlocks = table;
		}
		lmutex_unlock(&udp->tdb_hash_lock);
	}
	/* Consumer-side barrier before the unlocked reads of the table. */
	_membar_consumer();

	/*
	 * First search the registered table with no locks held.
	 * This is safe because the table never shrinks
	 * and we can only get a false negative.
	 */
	for (rlp = table[hash]; rlp != NULL; rlp = rlp->robust_next) {
		if (rlp->robust_lock == mp)	/* already registered */
			return;
	}

	/*
	 * The lock was not found.
	 * Repeat the operation with tdb_hash_lock held.
	 */
	lmutex_lock(&udp->tdb_hash_lock);

	/* On a miss, rlpp is left pointing at the chain's terminating link. */
	for (rlpp = &table[hash];
	    (rlp = *rlpp) != NULL;
	    rlpp = &rlp->robust_next) {
		if (rlp->robust_lock == mp) {	/* already registered */
			lmutex_unlock(&udp->tdb_hash_lock);
			return;
		}
	}

	/*
	 * The lock has never been registered.
	 * Register it now and add it to the table.
	 */
	(void) ___lwp_mutex_register(mp);
	rlp = lmalloc(sizeof (*rlp));
	rlp->robust_lock = mp;
	/* Barrier issued before the new entry is linked onto the chain. */
	_membar_producer();
	*rlpp = rlp;

	lmutex_unlock(&udp->tdb_hash_lock);
}
1747 
1748 /*
1749  * This is called in the child of fork()/forkall() to start over
1750  * with a clean slate.  (Each process must register its own locks.)
1751  * No locks are needed because all other threads are suspended or gone.
1752  */
1753 void
1754 unregister_locks(void)
1755 {
1756 	uberdata_t *udp = curthread->ul_uberdata;
1757 	uint_t hash;
1758 	robust_t **table;
1759 	robust_t *rlp;
1760 	robust_t *next;
1761 
1762 	if ((table = udp->robustlocks) != NULL) {
1763 		for (hash = 0; hash < LOCKHASHSZ; hash++) {
1764 			rlp = table[hash];
1765 			while (rlp != NULL) {
1766 				next = rlp->robust_next;
1767 				lfree(rlp, sizeof (*rlp));
1768 				rlp = next;
1769 			}
1770 		}
1771 		lfree(table, LOCKHASHSZ * sizeof (robust_t *));
1772 		udp->robustlocks = NULL;
1773 	}
1774 }
1775 
/*
 * General ("long way") lock path; handles every mutex type.
 * Returns with mutex_owner set correctly.
 *
 * Arguments:
 *	mp	the mutex to acquire
 *	tsp	handed to mutex_lock_kernel()/mutex_lock_queue() when we
 *		have to block; callers in this file pass NULL when no
 *		timeout applies
 *	try	MUTEX_TRY (do not block) or MUTEX_LOCK
 *
 * Return values:
 *	0, EOWNERDEAD, ELOCKUNMAPPED
 *		the lock is held (the latter two report a robust-lock
 *		condition)
 *	anything else
 *		the lock is not held
 */
static int
mutex_lock_internal(mutex_t *mp, timespec_t *tsp, int try)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	int mtype = mp->mutex_type;
	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
	int error = 0;
	uint8_t ceil;	/* set and used only for LOCK_PRIO_PROTECT */
	int myprio;	/* set and used only for LOCK_PRIO_PROTECT */

	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);

	if (!self->ul_schedctl_called)
		(void) setup_schedctl();

	if (msp && try == MUTEX_TRY)
		tdb_incr(msp->mutex_try);

	/* Relocking a recursive or error-checking mutex we already hold. */
	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && mutex_is_held(mp))
		return (mutex_recursion(mp, mtype, try));

	/* Error detection: an untimed mutex_lock() of a mutex we hold. */
	if (self->ul_error_detection && try == MUTEX_LOCK &&
	    tsp == NULL && mutex_is_held(mp))
		lock_error(mp, "mutex_lock", NULL, NULL);

	if (mtype & LOCK_PRIO_PROTECT) {
		/*
		 * Fail with EINVAL if our priority is above the mutex
		 * ceiling.  Otherwise add the mutex to our ceiling list
		 * and, if we are below the ceiling, inherit it.  These
		 * steps are undone below if the lock is not acquired.
		 */
		ceil = mp->mutex_ceiling;
		ASSERT(_validate_rt_prio(SCHED_FIFO, ceil) == 0);
		myprio = real_priority(self);
		if (myprio > ceil) {
			DTRACE_PROBE2(plockstat, mutex__error, mp, EINVAL);
			return (EINVAL);
		}
		if ((error = _ceil_mylist_add(mp)) != 0) {
			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
			return (error);
		}
		if (myprio < ceil)
			_ceil_prio_inherit(ceil);
	}

	/* Process-shared robust mutexes must be registered with the kernel. */
	if ((mtype & (USYNC_PROCESS | LOCK_ROBUST))
	    == (USYNC_PROCESS | LOCK_ROBUST))
		register_lock(mp);

	if (mtype & LOCK_PRIO_INHERIT) {
		/* go straight to the kernel */
		if (try == MUTEX_TRY)
			error = mutex_trylock_kernel(mp);
		else	/* MUTEX_LOCK */
			error = mutex_lock_kernel(mp, tsp, msp);
		/*
		 * The kernel never sets or clears the lock byte
		 * for LOCK_PRIO_INHERIT mutexes.
		 * Set it here for consistency.
		 */
		switch (error) {
		case 0:
			mp->mutex_lockw = LOCKSET;
			break;
		case EOWNERDEAD:
		case ELOCKUNMAPPED:
			mp->mutex_lockw = LOCKSET;
			/* FALLTHROUGH */
		case ENOTRECOVERABLE:
			ASSERT(mtype & LOCK_ROBUST);
			break;
		case EDEADLK:
			/*
			 * mutex_lock() blocks forever in stall();
			 * mutex_trylock() reports EBUSY instead.
			 */
			if (try == MUTEX_LOCK)
				stall();
			error = EBUSY;
			break;
		}
	} else if (mtype & USYNC_PROCESS) {
		/* Spin first; block in the kernel only for MUTEX_LOCK. */
		error = mutex_trylock_process(mp, try == MUTEX_LOCK);
		if (error == EBUSY && try == MUTEX_LOCK)
			error = mutex_lock_kernel(mp, tsp, msp);
	} else {	/* USYNC_THREAD */
		/* Spin first; sleep on the user-level queue for MUTEX_LOCK. */
		error = mutex_trylock_adaptive(mp, try == MUTEX_LOCK);
		if (error == EBUSY && try == MUTEX_LOCK)
			error = mutex_lock_queue(self, msp, mp, tsp);
	}

	switch (error) {
	case 0:
	case EOWNERDEAD:
	case ELOCKUNMAPPED:
		/* In all three cases we now hold the lock. */
		if (mtype & LOCK_ROBUST)
			remember_lock(mp);
		if (msp)
			record_begin_hold(msp);
		break;
	default:
		/* Not acquired: undo the LOCK_PRIO_PROTECT setup above. */
		if (mtype & LOCK_PRIO_PROTECT) {
			(void) _ceil_mylist_del(mp);
			if (myprio < ceil)
				_ceil_prio_waive();
		}
		/* Account for and report a failed trylock. */
		if (try == MUTEX_TRY) {
			if (msp)
				tdb_incr(msp->mutex_try_fail);
			if (__td_event_report(self, TD_LOCK_TRY, udp)) {
				self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
				tdb_event(TD_LOCK_TRY, udp);
			}
		}
		break;
	}

	return (error);
}
1891 
1892 int
1893 fast_process_lock(mutex_t *mp, timespec_t *tsp, int mtype, int try)
1894 {
1895 	ulwp_t *self = curthread;
1896 	uberdata_t *udp = self->ul_uberdata;
1897 
1898 	/*
1899 	 * We know that USYNC_PROCESS is set in mtype and that
1900 	 * zero, one, or both of the flags LOCK_RECURSIVE and
1901 	 * LOCK_ERRORCHECK are set, and that no other flags are set.
1902 	 */
1903 	ASSERT((mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0);
1904 	enter_critical(self);
1905 	if (set_lock_byte(&mp->mutex_lockw) == 0) {
1906 		mp->mutex_owner = (uintptr_t)self;
1907 		mp->mutex_ownerpid = udp->pid;
1908 		exit_critical(self);
1909 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1910 		return (0);
1911 	}
1912 	exit_critical(self);
1913 
1914 	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && shared_mutex_held(mp))
1915 		return (mutex_recursion(mp, mtype, try));
1916 
1917 	if (try == MUTEX_LOCK) {
1918 		if (mutex_trylock_process(mp, 1) == 0)
1919 			return (0);
1920 		return (mutex_lock_kernel(mp, tsp, NULL));
1921 	}
1922 
1923 	if (__td_event_report(self, TD_LOCK_TRY, udp)) {
1924 		self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
1925 		tdb_event(TD_LOCK_TRY, udp);
1926 	}
1927 	return (EBUSY);
1928 }
1929 
/*
 * Common implementation of mutex_lock() and its timed variants.
 *
 * 'tsp' is NULL for an untimed lock; otherwise it is handed on to the
 * routines that block.  Two fast paths are tried before falling back
 * on mutex_lock_internal().  Returns 0 on success or an error number.
 */
static int
mutex_lock_impl(mutex_t *mp, timespec_t *tsp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	uberflags_t *gflags;
	int mtype;

	/*
	 * Optimize the case of USYNC_THREAD, including
	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
	 * no error detection, no lock statistics,
	 * and the process has only a single thread.
	 * (Most likely a traditional single-threaded application.)
	 */
	if ((((mtype = mp->mutex_type) & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
	    udp->uberflags.uf_all) == 0) {
		/*
		 * Only one thread exists so we don't need an atomic operation.
		 */
		if (mp->mutex_lockw == 0) {
			mp->mutex_lockw = LOCKSET;
			mp->mutex_owner = (uintptr_t)self;
			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
			return (0);
		}
		/* mtype != 0 here means LOCK_RECURSIVE and/or LOCK_ERRORCHECK. */
		if (mtype && MUTEX_OWNER(mp) == self)
			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
		/*
		 * We have reached a deadlock, probably because the
		 * process is executing non-async-signal-safe code in
		 * a signal handler and is attempting to acquire a lock
		 * that it already owns.  This is not surprising, given
		 * bad programming practices over the years that have
		 * resulted in applications calling printf() and such
		 * in their signal handlers.  Unless the user has told
		 * us that the signal handlers are safe by setting:
		 *	export _THREAD_ASYNC_SAFE=1
		 * we return EDEADLK rather than actually deadlocking.
		 */
		if (tsp == NULL &&
		    MUTEX_OWNER(mp) == self && !self->ul_async_safe) {
			DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
			return (EDEADLK);
		}
		/* Otherwise fall through to the paths below. */
	}

	/*
	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
	 * no error detection, and no lock statistics.
	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
	 *
	 * A non-NULL self->ul_schedctl_called is used as the pointer
	 * to the flags that are tested here.
	 */
	if ((gflags = self->ul_schedctl_called) != NULL &&
	    (gflags->uf_trs_ted |
	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
		if (mtype & USYNC_PROCESS)
			return (fast_process_lock(mp, tsp, mtype, MUTEX_LOCK));
		if (set_lock_byte(&mp->mutex_lockw) == 0) {
			mp->mutex_owner = (uintptr_t)self;
			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
			return (0);
		}
		/* mtype != 0 here means LOCK_RECURSIVE and/or LOCK_ERRORCHECK. */
		if (mtype && MUTEX_OWNER(mp) == self)
			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
		/* Spin for a while; if that fails, sleep on the queue. */
		if (mutex_trylock_adaptive(mp, 1) != 0)
			return (mutex_lock_queue(self, NULL, mp, tsp));
		return (0);
	}

	/* else do it the long way */
	return (mutex_lock_internal(mp, tsp, MUTEX_LOCK));
}
2002 
2003 #pragma weak _private_mutex_lock = __mutex_lock
2004 #pragma weak mutex_lock = __mutex_lock
2005 #pragma weak _mutex_lock = __mutex_lock
2006 #pragma weak pthread_mutex_lock = __mutex_lock
2007 #pragma weak _pthread_mutex_lock = __mutex_lock
2008 int
2009 __mutex_lock(mutex_t *mp)
2010 {
2011 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2012 	return (mutex_lock_impl(mp, NULL));
2013 }
2014 
2015 #pragma weak pthread_mutex_timedlock = _pthread_mutex_timedlock
2016 int
2017 _pthread_mutex_timedlock(mutex_t *mp, const timespec_t *abstime)
2018 {
2019 	timespec_t tslocal;
2020 	int error;
2021 
2022 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2023 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
2024 	error = mutex_lock_impl(mp, &tslocal);
2025 	if (error == ETIME)
2026 		error = ETIMEDOUT;
2027 	return (error);
2028 }
2029 
2030 #pragma weak pthread_mutex_reltimedlock_np = _pthread_mutex_reltimedlock_np
2031 int
2032 _pthread_mutex_reltimedlock_np(mutex_t *mp, const timespec_t *reltime)
2033 {
2034 	timespec_t tslocal;
2035 	int error;
2036 
2037 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2038 	tslocal = *reltime;
2039 	error = mutex_lock_impl(mp, &tslocal);
2040 	if (error == ETIME)
2041 		error = ETIMEDOUT;
2042 	return (error);
2043 }
2044 
2045 #pragma weak _private_mutex_trylock = __mutex_trylock
2046 #pragma weak mutex_trylock = __mutex_trylock
2047 #pragma weak _mutex_trylock = __mutex_trylock
2048 #pragma weak pthread_mutex_trylock = __mutex_trylock
2049 #pragma weak _pthread_mutex_trylock = __mutex_trylock
/*
 * Try to lock a mutex without blocking; this is the function behind
 * mutex_trylock() and pthread_mutex_trylock().
 *
 * Two fast paths are tried before falling back on
 * mutex_lock_internal().  On the fast paths the result is 0 when the
 * lock is acquired, the result of mutex_recursion() when the caller
 * already owns a LOCK_RECURSIVE/LOCK_ERRORCHECK mutex, and EBUSY
 * otherwise.
 */
int
__mutex_trylock(mutex_t *mp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	uberflags_t *gflags;
	int mtype;

	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
	/*
	 * Optimize the case of USYNC_THREAD, including
	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
	 * no error detection, no lock statistics,
	 * and the process has only a single thread.
	 * (Most likely a traditional single-threaded application.)
	 */
	if ((((mtype = mp->mutex_type) & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
	    udp->uberflags.uf_all) == 0) {
		/*
		 * Only one thread exists so we don't need an atomic operation.
		 */
		if (mp->mutex_lockw == 0) {
			mp->mutex_lockw = LOCKSET;
			mp->mutex_owner = (uintptr_t)self;
			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
			return (0);
		}
		/* mtype != 0 here means LOCK_RECURSIVE and/or LOCK_ERRORCHECK. */
		if (mtype && MUTEX_OWNER(mp) == self)
			return (mutex_recursion(mp, mtype, MUTEX_TRY));
		return (EBUSY);
	}

	/*
	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
	 * no error detection, and no lock statistics.
	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
	 *
	 * A non-NULL self->ul_schedctl_called is used as the pointer
	 * to the flags that are tested here.
	 */
	if ((gflags = self->ul_schedctl_called) != NULL &&
	    (gflags->uf_trs_ted |
	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
		if (mtype & USYNC_PROCESS)
			return (fast_process_lock(mp, NULL, mtype, MUTEX_TRY));
		if (set_lock_byte(&mp->mutex_lockw) == 0) {
			mp->mutex_owner = (uintptr_t)self;
			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
			return (0);
		}
		/* mtype != 0 here means LOCK_RECURSIVE and/or LOCK_ERRORCHECK. */
		if (mtype && MUTEX_OWNER(mp) == self)
			return (mutex_recursion(mp, mtype, MUTEX_TRY));
		/* A failed trylock: report the event if one was requested. */
		if (__td_event_report(self, TD_LOCK_TRY, udp)) {
			self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
			tdb_event(TD_LOCK_TRY, udp);
		}
		return (EBUSY);
	}

	/* else do it the long way */
	return (mutex_lock_internal(mp, NULL, MUTEX_TRY));
}
2109 
/*
 * General-purpose unlock, used by __mutex_unlock() when its fast paths
 * do not apply and by heldlock_exit().
 *
 * retain_robust_flags: when zero, a LOCK_OWNERDEAD or LOCK_UNMAPPED flag
 * still set on a mutex that is not LOCK_PRIO_INHERIT is converted to
 * LOCK_NOTRECOVERABLE before the mutex is released; when non-zero the
 * flags are left as they are.
 *
 * Returns EPERM for a LOCK_ERRORCHECK mutex the caller does not hold,
 * the result of ___lwp_mutex_unlock() for a LOCK_PRIO_INHERIT mutex,
 * and 0 otherwise.
 */
2110 int
2111 mutex_unlock_internal(mutex_t *mp, int retain_robust_flags)
2112 {
2113 	ulwp_t *self = curthread;
2114 	uberdata_t *udp = self->ul_uberdata;
2115 	int mtype = mp->mutex_type;
2116 	tdb_mutex_stats_t *msp;
2117 	int error = 0;
2118 	int release_all;
2119 	lwpid_t lwpid;
2120 
2121 	if ((mtype & LOCK_ERRORCHECK) && !mutex_is_held(mp))
2122 		return (EPERM);
2123 
2124 	if (self->ul_error_detection && !mutex_is_held(mp))
2125 		lock_error(mp, "mutex_unlock", NULL, NULL);
2126 
	/* a nested hold of a recursive mutex only drops the recursion count */
2127 	if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2128 		mp->mutex_rcount--;
2129 		DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2130 		return (0);
2131 	}
2132 
2133 	if ((msp = MUTEX_STATS(mp, udp)) != NULL)
2134 		(void) record_hold_time(msp);
2135 
2136 	if (!retain_robust_flags && !(mtype & LOCK_PRIO_INHERIT) &&
2137 	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2138 		ASSERT(mp->mutex_type & LOCK_ROBUST);
2139 		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2140 		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
2141 	}
	/*
	 * release_all is handed to the unlock routines below; presumably it
	 * makes them wake every waiter of a not-recoverable mutex (confirm
	 * against mutex_unlock_queue() / mutex_unlock_process()).
	 */
2142 	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
2143 
2144 	if (mtype & LOCK_PRIO_INHERIT) {
2145 		no_preempt(self);
		/* owner fields are cleared before the lock byte */
2146 		mp->mutex_owner = 0;
2147 		mp->mutex_ownerpid = 0;
2148 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2149 		mp->mutex_lockw = LOCKCLEAR;
2150 		error = ___lwp_mutex_unlock(mp);
2151 		preempt(self);
2152 	} else if (mtype & USYNC_PROCESS) {
2153 		mutex_unlock_process(mp, release_all);
2154 	} else {	/* USYNC_THREAD */
2155 		if ((lwpid = mutex_unlock_queue(mp, release_all)) != 0) {
2156 			(void) __lwp_unpark(lwpid);
2157 			preempt(self);
2158 		}
2159 	}
2160 
2161 	if (mtype & LOCK_ROBUST)
2162 		forget_lock(mp);
2163 
2164 	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
2165 		_ceil_prio_waive();
2166 
2167 	return (error);
2168 }
2169 
/*
 * Release 'mp'.
 *
 * The common mutex types are handled inline.  Every other type, and any
 * call made while error detection or lock statistics are enabled, goes
 * through mutex_unlock_internal(mp, 0).
 *
 * The inline paths return 0 on success and EPERM when a LOCK_ERRORCHECK
 * mutex is not owned by the caller; the slow path returns whatever
 * mutex_unlock_internal() returns.
 */
2170 #pragma weak _private_mutex_unlock = __mutex_unlock
2171 #pragma weak mutex_unlock = __mutex_unlock
2172 #pragma weak _mutex_unlock = __mutex_unlock
2173 #pragma weak pthread_mutex_unlock = __mutex_unlock
2174 #pragma weak _pthread_mutex_unlock = __mutex_unlock
2175 int
2176 __mutex_unlock(mutex_t *mp)
2177 {
2178 	ulwp_t *self = curthread;
2179 	uberdata_t *udp = self->ul_uberdata;
2180 	uberflags_t *gflags;
2181 	lwpid_t lwpid;
2182 	int mtype;
2183 	short el;
2184 
2185 	/*
2186 	 * Optimize the case of USYNC_THREAD, including
2187 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2188 	 * no error detection, no lock statistics,
2189 	 * and the process has only a single thread.
2190 	 * (Most likely a traditional single-threaded application.)
2191 	 */
2192 	if ((((mtype = mp->mutex_type) & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2193 	    udp->uberflags.uf_all) == 0) {
2194 		if (mtype) {
2195 			/*
2196 			 * At this point we know that one or both of the
2197 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2198 			 */
2199 			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2200 				return (EPERM);
2201 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2202 				mp->mutex_rcount--;
2203 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2204 				return (0);
2205 			}
2206 		}
2207 		/*
2208 		 * Only one thread exists so we don't need an atomic operation.
2209 		 * Also, there can be no waiters.
2210 		 */
		/* note: the store is to mutex_lockword, not just mutex_lockw */
2211 		mp->mutex_owner = 0;
2212 		mp->mutex_lockword = 0;
2213 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2214 		return (0);
2215 	}
2216 
2217 	/*
2218 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2219 	 * no error detection, and no lock statistics.
2220 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2221 	 */
2222 	if ((gflags = self->ul_schedctl_called) != NULL) {
2223 		if (((el = gflags->uf_trs_ted) | mtype) == 0) {
			/*
			 * fast_unlock is also reached by goto from the
			 * LOCK_RECURSIVE/LOCK_ERRORCHECK case further down.
			 */
2224 fast_unlock:
2225 			if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2226 				(void) __lwp_unpark(lwpid);
2227 				preempt(self);
2228 			}
2229 			return (0);
2230 		}
2231 		if (el)		/* error detection or lock statistics */
2232 			goto slow_unlock;
2233 		if ((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2234 			/*
2235 			 * At this point we know that one or both of the
2236 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2237 			 */
2238 			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2239 				return (EPERM);
2240 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2241 				mp->mutex_rcount--;
2242 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2243 				return (0);
2244 			}
2245 			goto fast_unlock;
2246 		}
2247 		if ((mtype &
2248 		    ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2249 			/*
2250 			 * At this point we know that zero, one, or both of the
2251 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set and
2252 			 * that the USYNC_PROCESS flag is set.
2253 			 */
			/* ownership of a shared mutex also checks the owner pid */
2254 			if ((mtype & LOCK_ERRORCHECK) && !shared_mutex_held(mp))
2255 				return (EPERM);
2256 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2257 				mp->mutex_rcount--;
2258 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2259 				return (0);
2260 			}
2261 			mutex_unlock_process(mp, 0);
2262 			return (0);
2263 		}
2264 	}
2265 
2266 	/* else do it the long way */
2267 slow_unlock:
2268 	return (mutex_unlock_internal(mp, 0));
2269 }
2270 
2271 /*
2272  * Internally to the library, almost all mutex lock/unlock actions
2273  * go through these lmutex_ functions, to protect critical regions.
2274  * We replicate a bit of code from __mutex_lock() and __mutex_unlock()
2275  * to make these functions faster since we know that the mutex type
2276  * of all internal locks is USYNC_THREAD.  We also know that internal
2277  * locking can never fail, so we panic if it does.
 *
 * lmutex_lock() calls enter_critical() before acquiring 'mp'; the
 * matching exit_critical() is made by lmutex_unlock().
2278  */
2279 void
2280 lmutex_lock(mutex_t *mp)
2281 {
2282 	ulwp_t *self = curthread;
2283 	uberdata_t *udp = self->ul_uberdata;
2284 
2285 	ASSERT(mp->mutex_type == USYNC_THREAD);
2286 
2287 	enter_critical(self);
2288 	/*
2289 	 * Optimize the case of no lock statistics and only a single thread.
2290 	 * (Most likely a traditional single-threaded application.)
2291 	 */
2292 	if (udp->uberflags.uf_all == 0) {
2293 		/*
2294 		 * Only one thread exists; the mutex must be free.
2295 		 */
2296 		ASSERT(mp->mutex_lockw == 0);
2297 		mp->mutex_lockw = LOCKSET;
2298 		mp->mutex_owner = (uintptr_t)self;
2299 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2300 	} else {
2301 		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2302 
2303 		if (!self->ul_schedctl_called)
2304 			(void) setup_schedctl();
2305 
		/*
		 * Try the lock byte first, then mutex_trylock_adaptive(),
		 * and only if both fail go through mutex_lock_queue().
		 */
2306 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2307 			mp->mutex_owner = (uintptr_t)self;
2308 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2309 		} else if (mutex_trylock_adaptive(mp, 1) != 0) {
2310 			(void) mutex_lock_queue(self, msp, mp, NULL);
2311 		}
2312 
2313 		if (msp)
2314 			record_begin_hold(msp);
2315 	}
2316 }
2317 
/*
 * Release the libc-internal USYNC_THREAD mutex 'mp' and then call
 * exit_critical(), balancing the enter_critical() made in lmutex_lock().
 */
2318 void
2319 lmutex_unlock(mutex_t *mp)
2320 {
2321 	ulwp_t *self = curthread;
2322 	uberdata_t *udp = self->ul_uberdata;
2323 
2324 	ASSERT(mp->mutex_type == USYNC_THREAD);
2325 
2326 	/*
2327 	 * Optimize the case of no lock statistics and only a single thread.
2328 	 * (Most likely a traditional single-threaded application.)
2329 	 */
2330 	if (udp->uberflags.uf_all == 0) {
2331 		/*
2332 		 * Only one thread exists so there can be no waiters.
2333 		 */
2334 		mp->mutex_owner = 0;
2335 		mp->mutex_lockword = 0;
2336 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2337 	} else {
2338 		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2339 		lwpid_t lwpid;
2340 
2341 		if (msp)
2342 			(void) record_hold_time(msp);
		/* a non-zero lwpid from mutex_unlock_queue() is unparked here */
2343 		if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2344 			(void) __lwp_unpark(lwpid);
2345 			preempt(self);
2346 		}
2347 	}
2348 	exit_critical(self);
2349 }
2350 
2351 /*
2352  * For specialized code in libc, like the asynchronous i/o code,
2353  * the following sig_*() locking primitives are used in order
2354  * to make the code asynchronous signal safe.  Signals are
2355  * deferred while locks acquired by these functions are held.
 *
 * sig_mutex_lock() calls sigoff() on the current thread and then
 * acquires 'mp'; the result of the lock call is discarded.
2356  */
2357 void
2358 sig_mutex_lock(mutex_t *mp)
2359 {
2360 	sigoff(curthread);
2361 	(void) _private_mutex_lock(mp);
2362 }
2363 
/*
 * Counterpart of sig_mutex_lock(): release 'mp' first, then call sigon()
 * on the current thread.  The result of the unlock call is discarded.
 */
2364 void
2365 sig_mutex_unlock(mutex_t *mp)
2366 {
2367 	(void) _private_mutex_unlock(mp);
2368 	sigon(curthread);
2369 }
2370 
/*
 * Non-blocking variant of sig_mutex_lock().
 * Returns the result of _private_mutex_trylock().  When that result is
 * non-zero the sigoff() made here is undone with sigon() before
 * returning; when it is zero the sigoff() is left in effect.
 */
2371 int
2372 sig_mutex_trylock(mutex_t *mp)
2373 {
2374 	int error;
2375 
2376 	sigoff(curthread);
2377 	if ((error = _private_mutex_trylock(mp)) != 0)
2378 		sigon(curthread);
2379 	return (error);
2380 }
2381 
2382 /*
2383  * sig_cond_wait() is a cancellation point.
 *
 * The caller must already have signals deferred (ul_sigdefer != 0 is
 * asserted).  _private_testcancel() is called before and after the wait.
 * Returns the result of _cond_wait().
2384  */
2385 int
2386 sig_cond_wait(cond_t *cv, mutex_t *mp)
2387 {
2388 	int error;
2389 
2390 	ASSERT(curthread->ul_sigdefer != 0);
2391 	_private_testcancel();
2392 	error = _cond_wait(cv, mp);
	/*
	 * Interrupted with ul_cursig set: drop and retake the mutex
	 * through the sig_*() wrappers.
	 */
2393 	if (error == EINTR && curthread->ul_cursig) {
2394 		sig_mutex_unlock(mp);
2395 		/* take the deferred signal here */
2396 		sig_mutex_lock(mp);
2397 	}
2398 	_private_testcancel();
2399 	return (error);
2400 }
2401 
2402 /*
2403  * sig_cond_reltimedwait() is a cancellation point.
 *
 * Same structure as sig_cond_wait(), but waits with
 * _cond_reltimedwait(cv, mp, ts) and returns its result.
 * The caller must already have signals deferred (asserted).
2404  */
2405 int
2406 sig_cond_reltimedwait(cond_t *cv, mutex_t *mp, const timespec_t *ts)
2407 {
2408 	int error;
2409 
2410 	ASSERT(curthread->ul_sigdefer != 0);
2411 	_private_testcancel();
2412 	error = _cond_reltimedwait(cv, mp, ts);
	/*
	 * Interrupted with ul_cursig set: drop and retake the mutex
	 * through the sig_*() wrappers.
	 */
2413 	if (error == EINTR && curthread->ul_cursig) {
2414 		sig_mutex_unlock(mp);
2415 		/* take the deferred signal here */
2416 		sig_mutex_lock(mp);
2417 	}
2418 	_private_testcancel();
2419 	return (error);
2420 }
2421 
/*
 * Return non-zero if the process-shared mutex 'mparg' is owned by the
 * calling thread of this process: MUTEX_OWNED() must be true for the
 * current thread and mutex_ownerpid must equal this process's pid.
 */
2422 static int
2423 shared_mutex_held(mutex_t *mparg)
2424 {
2425 	/*
2426 	 * The 'volatile' is necessary to make sure the compiler doesn't
2427 	 * reorder the tests of the various components of the mutex.
2428 	 * They must be tested in this order:
2429 	 *	mutex_lockw
2430 	 *	mutex_owner
2431 	 *	mutex_ownerpid
2432 	 * This relies on the fact that everywhere mutex_lockw is cleared,
2433 	 * mutex_owner and mutex_ownerpid are cleared before mutex_lockw
2434 	 * is cleared, and that everywhere mutex_lockw is set, mutex_owner
2435 	 * and mutex_ownerpid are set after mutex_lockw is set, and that
2436 	 * mutex_lockw is set or cleared with a memory barrier.
2437 	 */
2438 	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2439 	ulwp_t *self = curthread;
2440 	uberdata_t *udp = self->ul_uberdata;
2441 
2442 	return (MUTEX_OWNED(mp, self) && mp->mutex_ownerpid == udp->pid);
2443 }
2444 
2445 /*
2446  * Some crufty old programs define their own version of _mutex_held()
2447  * to be simply return(1).  This breaks internal libc logic, so we
2448  * define a private version for exclusive use by libc, mutex_is_held(),
2449  * and also a new public function, __mutex_held(), to be used in new
2450  * code to circumvent these crufty old programs.
 *
 * Returns non-zero if the calling thread holds 'mparg'.  A USYNC_PROCESS
 * mutex is checked with shared_mutex_held(), which also compares the
 * owner pid; any other mutex is checked with MUTEX_OWNED() alone.
2451  */
2452 #pragma weak mutex_held = mutex_is_held
2453 #pragma weak _mutex_held = mutex_is_held
2454 #pragma weak __mutex_held = mutex_is_held
2455 int
2456 mutex_is_held(mutex_t *mparg)
2457 {
	/* volatile view, as in shared_mutex_held() */
2458 	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2459 
2460 	if (mparg->mutex_type & USYNC_PROCESS)
2461 		return (shared_mutex_held(mparg));
2462 	return (MUTEX_OWNED(mp, curthread));
2463 }
2464 
/*
 * Destroy 'mp': remove it from the caller's held-locks list if it is a
 * USYNC_PROCESS mutex, zero the whole structure, and pass it to
 * tdb_sync_obj_deregister().  Always returns 0; no check is made that
 * the mutex is unlocked.
 */
2465 #pragma weak _private_mutex_destroy = __mutex_destroy
2466 #pragma weak mutex_destroy = __mutex_destroy
2467 #pragma weak _mutex_destroy = __mutex_destroy
2468 #pragma weak pthread_mutex_destroy = __mutex_destroy
2469 #pragma weak _pthread_mutex_destroy = __mutex_destroy
2470 int
2471 __mutex_destroy(mutex_t *mp)
2472 {
	/* the type must be tested before the memset wipes it */
2473 	if (mp->mutex_type & USYNC_PROCESS)
2474 		forget_lock(mp);
2475 	(void) _memset(mp, 0, sizeof (*mp));
2476 	tdb_sync_obj_deregister(mp);
2477 	return (0);
2478 }
2479 
/*
 * Mark an inconsistent robust mutex as consistent again.
 * This succeeds (returns 0) only when the caller holds 'mp', the mutex is
 * LOCK_ROBUST and LOCK_INITED, and it carries LOCK_OWNERDEAD or
 * LOCK_UNMAPPED; those two flags are then cleared and the recursion
 * count is reset.  Every other case returns EINVAL.
 */
2480 #pragma weak mutex_consistent = __mutex_consistent
2481 #pragma weak _mutex_consistent = __mutex_consistent
2482 #pragma weak pthread_mutex_consistent_np = __mutex_consistent
2483 #pragma weak _pthread_mutex_consistent_np = __mutex_consistent
2484 int
2485 __mutex_consistent(mutex_t *mp)
2486 {
2487 	/* Anything but a held, robust, initialized mutex is rejected. */
2488 	if (!mutex_is_held(mp))
2489 		return (EINVAL);
2490 	if (!(mp->mutex_type & LOCK_ROBUST))
2491 		return (EINVAL);
2492 	if (!(mp->mutex_flag & LOCK_INITED))
2493 		return (EINVAL);
2494 	/* There is nothing to repair unless it is marked inconsistent. */
2495 	if (!(mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)))
2496 		return (EINVAL);
2497 	mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2498 	mp->mutex_rcount = 0;
2499 	return (0);
2500 }
2501 
2502 /*
2503  * Spin locks are separate from ordinary mutexes,
2504  * but we use the same data structure for them.
2505  */
2506 
/*
 * Initialize the spin lock 'lock', which is stored as a mutex_t.
 * 'pshared' equal to PTHREAD_PROCESS_SHARED selects USYNC_PROCESS;
 * any other value selects USYNC_THREAD.  Always returns 0.
 */
2507 #pragma weak pthread_spin_init = _pthread_spin_init
2508 int
2509 _pthread_spin_init(pthread_spinlock_t *lock, int pshared)
2510 {
2511 	mutex_t *spin = (mutex_t *)lock;
2512 
2513 	/* start from an all-zero mutex, then fill in the identifying fields */
2514 	(void) _memset(spin, 0, sizeof (*spin));
2515 	spin->mutex_type = (pshared == PTHREAD_PROCESS_SHARED) ?
2516 	    USYNC_PROCESS : USYNC_THREAD;
2517 	spin->mutex_flag = LOCK_INITED;
2518 	spin->mutex_magic = MUTEX_MAGIC;
2519 
2520 	return (0);
2521 }
2522 
/*
 * Destroy the spin lock by zeroing it.  Always returns 0; no check is
 * made that the lock is free.
 */
2523 #pragma weak pthread_spin_destroy = _pthread_spin_destroy
2524 int
2525 _pthread_spin_destroy(pthread_spinlock_t *lock)
2526 {
2527 	(void) _memset(lock, 0, sizeof (*lock));
2528 	return (0);
2529 }
2530 
/*
 * Make one attempt to take the spin lock.
 * Returns 0 when the lock byte was acquired (the owner, and for a
 * USYNC_PROCESS lock the owner pid, are then recorded) and EBUSY when
 * it was already set.  The attempt is bracketed by no_preempt()/preempt().
 */
2531 #pragma weak pthread_spin_trylock = _pthread_spin_trylock
2532 int
2533 _pthread_spin_trylock(pthread_spinlock_t *lock)
2534 {
2535 	mutex_t *mp = (mutex_t *)lock;
2536 	ulwp_t *self = curthread;
2537 	int result = EBUSY;
2538 
2539 	no_preempt(self);
2540 	if (set_lock_byte(&mp->mutex_lockw) == 0) {
2541 		/* the lock byte is ours; record the new owner */
2542 		mp->mutex_owner = (uintptr_t)self;
2543 		if (mp->mutex_type == USYNC_PROCESS)
2544 			mp->mutex_ownerpid = self->ul_uberdata->pid;
2545 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2546 		result = 0;
2547 	}
2548 	preempt(self);
2549 	return (result);
2550 }
2551 
/*
 * Acquire the spin lock, busy-waiting until the lock byte can be set.
 * Always returns 0.  'count' is the number of loop iterations that did
 * not obtain the lock (it stops increasing at INT_MAX) and is reported
 * through the plockstat probes.
 */
2552 #pragma weak pthread_spin_lock = _pthread_spin_lock
2553 int
2554 _pthread_spin_lock(pthread_spinlock_t *lock)
2555 {
2556 	mutex_t *mp = (mutex_t *)lock;
2557 	ulwp_t *self = curthread;
2558 	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
2559 	int count = 0;
2560 
2561 	ASSERT(!self->ul_critical || self->ul_bindflags);
2562 
2563 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
2564 
2565 	/*
2566 	 * We don't care whether the owner is running on a processor.
2567 	 * We just spin because that's what this interface requires.
2568 	 */
2569 	for (;;) {
		/* plain read first; the atomic attempt is made only if clear */
2570 		if (*lockp == 0) {	/* lock byte appears to be clear */
2571 			no_preempt(self);
			/*
			 * On success we leave the loop without calling
			 * preempt(); the no_preempt() above is balanced by
			 * the preempt() after the owner fields are set.
			 */
2572 			if (set_lock_byte(lockp) == 0)
2573 				break;
2574 			preempt(self);
2575 		}
2576 		if (count < INT_MAX)
2577 			count++;
2578 		SMT_PAUSE();
2579 	}
2580 	mp->mutex_owner = (uintptr_t)self;
2581 	if (mp->mutex_type == USYNC_PROCESS)
2582 		mp->mutex_ownerpid = self->ul_uberdata->pid;
2583 	preempt(self);
2584 	if (count) {
2585 		DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
2586 	}
2587 	DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
2588 	return (0);
2589 }
2590 
/*
 * Release the spin lock.  The owner fields are cleared first and the
 * lock word is cleared last with atomic_swap_32(), all between
 * no_preempt() and preempt().  Always returns 0; ownership is not checked.
 */
2591 #pragma weak pthread_spin_unlock = _pthread_spin_unlock
2592 int
2593 _pthread_spin_unlock(pthread_spinlock_t *lock)
2594 {
2595 	mutex_t *mp = (mutex_t *)lock;
2596 	ulwp_t *self = curthread;
2597 
2598 	no_preempt(self);
2599 	mp->mutex_owner = 0;
2600 	mp->mutex_ownerpid = 0;
2601 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2602 	(void) atomic_swap_32(&mp->mutex_lockword, 0);
2603 	preempt(self);
2604 	return (0);
2605 }
2606 
2607 #define	INITIAL_LOCKS	8	/* initial size of ul_heldlocks.array */
2608 
2609 /*
2610  * Find/allocate an entry for 'lock' in our array of held locks.
 *
 * Returns a pointer to the slot that holds 'lock'.  If 'lock' is not
 * already recorded it is stored in the first empty (NULL) slot, and the
 * storage is grown when there is no empty slot.
 *
 * While ul_heldlockcnt is zero the storage is the single slot
 * ul_heldlocks.single; afterwards it is ul_heldlocks.array with
 * ul_heldlockcnt entries.
 *
 * NOTE(review): the lmalloc() results are used without a NULL check, and
 * the unused tail of a new array is never cleared here, so this appears
 * to assume that lmalloc() cannot fail and returns zeroed memory --
 * confirm against lmalloc().
2611  */
2612 static mutex_t **
2613 find_lock_entry(mutex_t *lock)
2614 {
2615 	ulwp_t *self = curthread;
2616 	mutex_t **remembered = NULL;
2617 	mutex_t **lockptr;
2618 	uint_t nlocks;
2619 
2620 	if ((nlocks = self->ul_heldlockcnt) != 0)
2621 		lockptr = self->ul_heldlocks.array;
2622 	else {
2623 		nlocks = 1;
2624 		lockptr = &self->ul_heldlocks.single;
2625 	}
2626 
	/* look for 'lock' itself, remembering the first empty slot seen */
2627 	for (; nlocks; nlocks--, lockptr++) {
2628 		if (*lockptr == lock)
2629 			return (lockptr);
2630 		if (*lockptr == NULL && remembered == NULL)
2631 			remembered = lockptr;
2632 	}
2633 	if (remembered != NULL) {
2634 		*remembered = lock;
2635 		return (remembered);
2636 	}
2637 
2638 	/*
2639 	 * No entry available.  Allocate more space, converting
2640 	 * the single entry into an array of entries if necessary.
2641 	 */
2642 	if ((nlocks = self->ul_heldlockcnt) == 0) {
2643 		/*
2644 		 * Initial allocation of the array.
2645 		 * Convert the single entry into an array.
2646 		 */
2647 		self->ul_heldlockcnt = nlocks = INITIAL_LOCKS;
2648 		lockptr = lmalloc(nlocks * sizeof (mutex_t *));
2649 		/*
2650 		 * The single entry becomes the first entry in the array.
2651 		 */
		/* 'single' must be read before 'array' is stored below */
2652 		*lockptr = self->ul_heldlocks.single;
2653 		self->ul_heldlocks.array = lockptr;
2654 		/*
2655 		 * Return the next available entry in the array.
2656 		 */
2657 		*++lockptr = lock;
2658 		return (lockptr);
2659 	}
2660 	/*
2661 	 * Reallocate the array, double the size each time.
2662 	 */
2663 	lockptr = lmalloc(nlocks * 2 * sizeof (mutex_t *));
2664 	(void) _memcpy(lockptr, self->ul_heldlocks.array,
2665 	    nlocks * sizeof (mutex_t *));
2666 	lfree(self->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
2667 	self->ul_heldlocks.array = lockptr;
2668 	self->ul_heldlockcnt *= 2;
2669 	/*
2670 	 * Return the next available entry in the newly allocated array.
2671 	 */
	/* index 'nlocks' is the first slot past the copied entries */
2672 	*(lockptr += nlocks) = lock;
2673 	return (lockptr);
2674 }
2675 
2676 /*
2677  * Insert 'lock' into our list of held locks.
2678  * Currently only used for LOCK_ROBUST mutexes.
 *
 * find_lock_entry() does the insertion as a side effect when 'lock' is
 * not already recorded; the slot pointer it returns is not needed here.
2679  */
2680 void
2681 remember_lock(mutex_t *lock)
2682 {
2683 	(void) find_lock_entry(lock);
2684 }
2685 
2686 /*
2687  * Remove 'lock' from our list of held locks.
2688  * Currently only used for LOCK_ROBUST mutexes.
 *
 * The slot returned by find_lock_entry() is set to NULL.  If 'lock' was
 * not in the list, find_lock_entry() first adds it (possibly growing the
 * array) and that new slot is then cleared again.
2689  */
2690 void
2691 forget_lock(mutex_t *lock)
2692 {
2693 	*find_lock_entry(lock) = NULL;
2694 }
2695 
2696 /*
2697  * Give back the held-locks array of 'ulwp', if one was ever allocated,
 * then reset both the entry count and the array pointer.
2698  */
2699 void
2700 heldlock_free(ulwp_t *ulwp)
2701 {
2702 	uint_t count = ulwp->ul_heldlockcnt;
2703 
2704 	if (count != 0)
2705 		lfree(ulwp->ul_heldlocks.array, count * sizeof (mutex_t *));
2706 	ulwp->ul_heldlockcnt = 0;
2707 	ulwp->ul_heldlocks.array = NULL;
2708 }
2709 
2710 /*
2711  * Mark all held LOCK_ROBUST mutexes LOCK_OWNERDEAD.
2712  * Called from _thrp_exit() to deal with abandoned locks.
 *
 * Each qualifying mutex has its recursion count reset and is released
 * with mutex_unlock_internal(mp, 1), so the robust flags are retained
 * rather than converted to LOCK_NOTRECOVERABLE.  The held-locks storage
 * is freed afterwards.
2713  */
2714 void
2715 heldlock_exit(void)
2716 {
2717 	ulwp_t *self = curthread;
2718 	mutex_t **lockptr;
2719 	uint_t nlocks;
2720 	mutex_t *mp;
2721 
	/* same single-slot versus array selection as in find_lock_entry() */
2722 	if ((nlocks = self->ul_heldlockcnt) != 0)
2723 		lockptr = self->ul_heldlocks.array;
2724 	else {
2725 		nlocks = 1;
2726 		lockptr = &self->ul_heldlocks.single;
2727 	}
2728 
2729 	for (; nlocks; nlocks--, lockptr++) {
2730 		/*
2731 		 * The kernel takes care of transitioning held
2732 		 * LOCK_PRIO_INHERIT mutexes to LOCK_OWNERDEAD.
2733 		 * We avoid that case here.
2734 		 */
2735 		if ((mp = *lockptr) != NULL &&
2736 		    mutex_is_held(mp) &&
2737 		    (mp->mutex_type & (LOCK_ROBUST | LOCK_PRIO_INHERIT)) ==
2738 		    LOCK_ROBUST) {
2739 			mp->mutex_rcount = 0;
			/* a mutex already marked LOCK_UNMAPPED keeps that mark */
2740 			if (!(mp->mutex_flag & LOCK_UNMAPPED))
2741 				mp->mutex_flag |= LOCK_OWNERDEAD;
2742 			(void) mutex_unlock_internal(mp, 1);
2743 		}
2744 	}
2745 
2746 	heldlock_free(self);
2747 }
2748 
/*
 * Initialize the condition variable 'cvp'.
 * 'type' must be USYNC_THREAD or USYNC_PROCESS, otherwise EINVAL is
 * returned and 'cvp' is left untouched.  'arg' is not used.
 * Returns 0 on success.
 */
2748 #pragma weak cond_init = _cond_init
2749 /* ARGSUSED2 */
2750 int
2751 _cond_init(cond_t *cvp, int type, void *arg)
2752 {
2753 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
2754 		return (EINVAL);
2755 	(void) _memset(cvp, 0, sizeof (*cvp));
2756 	cvp->cond_type = (uint16_t)type;
2757 	cvp->cond_magic = COND_MAGIC;
2758 	return (0);
2759 }
2761 
2762 /*
2763  * cond_sleep_queue(): utility function for cond_wait_queue().
2764  *
2765  * Go to sleep on a condvar sleep queue, expect to be waked up
2766  * by someone calling cond_signal() or cond_broadcast() or due
2767  * to receiving a UNIX signal or being cancelled, or just simply
2768  * due to a spurious wakeup (like someone calling forkall()).
2769  *
2770  * The associated mutex is *not* reacquired before returning.
2771  * That must be done by the caller of cond_sleep_queue().
 *
 * 'tsp' is the timeout handed to __lwp_park(); NULL means no timeout.
 * Returns the result of the last __lwp_park() call (the comments below
 * name EINTR and ETIME as the error cases).
2772  */
2773 static int
2774 cond_sleep_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
2775 {
2776 	ulwp_t *self = curthread;
2777 	queue_head_t *qp;
2778 	queue_head_t *mqp;
2779 	lwpid_t lwpid;
2780 	int signalled;
2781 	int error;
2782 	int release_all;
2783 
2784 	/*
2785 	 * Put ourself on the CV sleep queue, unlock the mutex, then
2786 	 * park ourself and unpark a candidate lwp to grab the mutex.
2787 	 * We must go onto the CV sleep queue before dropping the
2788 	 * mutex in order to guarantee atomicity of the operation.
2789 	 */
2790 	self->ul_sp = stkptr();
2791 	qp = queue_lock(cvp, CV);
2792 	enqueue(qp, self, cvp, CV);
2793 	cvp->cond_waiters_user = 1;
2794 	self->ul_cvmutex = mp;
2795 	self->ul_cv_wake = (tsp != NULL);
2796 	self->ul_signalled = 0;
	/*
	 * A mutex still marked LOCK_OWNERDEAD when it is dropped here
	 * becomes LOCK_NOTRECOVERABLE.
	 */
2797 	if (mp->mutex_flag & LOCK_OWNERDEAD) {
2798 		mp->mutex_flag &= ~LOCK_OWNERDEAD;
2799 		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
2800 	}
2801 	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
2802 	lwpid = mutex_unlock_queue(mp, release_all);
	/*
	 * Each pass enters with the CV queue lock (qp) held.  The loop is
	 * left by 'break' with both qp and mqp held; they are released
	 * after the loop.
	 */
2803 	for (;;) {
2804 		set_parking_flag(self, 1);
2805 		queue_unlock(qp);
2806 		if (lwpid != 0) {
2807 			lwpid = preempt_unpark(self, lwpid);
2808 			preempt(self);
2809 		}
2810 		/*
2811 		 * We may have a deferred signal present,
2812 		 * in which case we should return EINTR.
2813 		 * Also, we may have received a SIGCANCEL; if so
2814 		 * and we are cancelable we should return EINTR.
2815 		 * We force an immediate EINTR return from
2816 		 * __lwp_park() by turning our parking flag off.
2817 		 */
2818 		if (self->ul_cursig != 0 ||
2819 		    (self->ul_cancelable && self->ul_cancel_pending))
2820 			set_parking_flag(self, 0);
2821 		/*
2822 		 * __lwp_park() will return the residual time in tsp
2823 		 * if we are unparked before the timeout expires.
2824 		 */
2825 		error = __lwp_park(tsp, lwpid);
2826 		set_parking_flag(self, 0);
2827 		lwpid = 0;	/* unpark the other lwp only once */
2828 		/*
2829 		 * We were waked up by cond_signal(), cond_broadcast(),
2830 		 * by an interrupt or timeout (EINTR or ETIME),
2831 		 * or we may just have gotten a spurious wakeup.
2832 		 */
		/* lock order: condvar queue first, then mutex queue */
2833 		qp = queue_lock(cvp, CV);
2834 		mqp = queue_lock(mp, MX);
		/* no longer on any sleep queue: the wait is over */
2835 		if (self->ul_sleepq == NULL)
2836 			break;
2837 		/*
2838 		 * We are on either the condvar sleep queue or the
2839 		 * mutex sleep queue.  Break out of the sleep if we
2840 		 * were interrupted or we timed out (EINTR or ETIME).
2841 		 * Else this is a spurious wakeup; continue the loop.
2842 		 */
2843 		if (self->ul_sleepq == mqp) {		/* mutex queue */
2844 			if (error) {
2845 				mp->mutex_waiters = dequeue_self(mqp, mp);
2846 				break;
2847 			}
2848 			tsp = NULL;	/* no more timeout */
2849 		} else if (self->ul_sleepq == qp) {	/* condvar queue */
2850 			if (error) {
2851 				cvp->cond_waiters_user = dequeue_self(qp, cvp);
2852 				break;
2853 			}
2854 			/*
2855 			 * Else a spurious wakeup on the condvar queue.
2856 			 * __lwp_park() has already adjusted the timeout.
2857 			 */
2858 		} else {
2859 			thr_panic("cond_sleep_queue(): thread not on queue");
2860 		}
		/* going around again: keep qp, drop only the mutex queue lock */
2861 		queue_unlock(mqp);
2862 	}
2863 
2864 	self->ul_sp = 0;
2865 	ASSERT(self->ul_cvmutex == NULL && self->ul_cv_wake == 0);
2866 	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
2867 	    self->ul_wchan == NULL);
2868 
	/* ul_signalled is sampled and reset while the queue locks are held */
2869 	signalled = self->ul_signalled;
2870 	self->ul_signalled = 0;
2871 	queue_unlock(qp);
2872 	queue_unlock(mqp);
2873 
2874 	/*
2875 	 * If we were concurrently cond_signal()d and any of:
2876 	 * received a UNIX signal, were cancelled, or got a timeout,
2877 	 * then perform another cond_signal() to avoid consuming it.
2878 	 */
2879 	if (error && signalled)
2880 		(void) cond_signal_internal(cvp);
2881 
2882 	return (error);
2883 }
2884 
/*
 * Wait on 'cvp' using the user-level sleep queues, then reacquire 'mp'.
 * 'tsp' is passed through to cond_sleep_queue(); NULL means no timeout.
 * Returns the result of cond_sleep_queue(), unless reacquiring the mutex
 * fails, in which case the mutex_lock_impl() error is returned instead.
 */
2885 int
2886 cond_wait_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
2887 {
2888 	ulwp_t *self = curthread;
2889 	int error;
2890 	int merror;
2891 
2892 	/*
2893 	 * The old thread library was programmed to defer signals
2894 	 * while in cond_wait() so that the associated mutex would
2895 	 * be guaranteed to be held when the application signal
2896 	 * handler was invoked.
2897 	 *
2898 	 * We do not behave this way by default; the state of the
2899 	 * associated mutex in the signal handler is undefined.
2900 	 *
2901 	 * To accommodate applications that depend on the old
2902 	 * behavior, the _THREAD_COND_WAIT_DEFER environment
2903 	 * variable can be set to 1 and we will behave in the
2904 	 * old way with respect to cond_wait().
2905 	 */
2906 	if (self->ul_cond_wait_defer)
2907 		sigoff(self);
2908 
2909 	error = cond_sleep_queue(cvp, mp, tsp);
2910 
2911 	/*
2912 	 * Reacquire the mutex.
2913 	 */
	/* a non-zero mutex error replaces the result of the sleep */
2914 	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
2915 		error = merror;
2916 
2917 	/*
2918 	 * Take any deferred signal now, after we have reacquired the mutex.
2919 	 */
2920 	if (self->ul_cond_wait_defer)
2921 		sigon(self);
2922 
2923 	return (error);
2924 }
2925 
2926 /*
2927  * cond_sleep_kernel(): utility function for cond_wait_kernel().
2928  * See the comment ahead of cond_sleep_queue(), above.
 *
 * The wait itself is done by ___lwp_cond_wait(), whose result is
 * returned.  As with cond_sleep_queue(), the mutex is not reacquired
 * here; cond_wait_kernel() does that.
2929  */
2930 static int
2931 cond_sleep_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
2932 {
2933 	int mtype = mp->mutex_type;
2934 	ulwp_t *self = curthread;
2935 	int error;
2936 
2937 	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
2938 		_ceil_prio_waive();
2939 
2940 	self->ul_sp = stkptr();
2941 	self->ul_wchan = cvp;
	/*
	 * Clear the ownership fields here.  The lock byte is cleared here
	 * only for LOCK_PRIO_INHERIT; for other types the mutex is
	 * presumably released inside ___lwp_cond_wait() -- confirm.
	 */
2942 	mp->mutex_owner = 0;
2943 	mp->mutex_ownerpid = 0;
2944 	if (mtype & LOCK_PRIO_INHERIT)
2945 		mp->mutex_lockw = LOCKCLEAR;
2946 	/*
2947 	 * ___lwp_cond_wait() returns immediately with EINTR if
2948 	 * set_parking_flag(self,0) is called on this lwp before it
2949 	 * goes to sleep in the kernel.  sigacthandler() calls this
2950 	 * when a deferred signal is noted.  This assures that we don't
2951 	 * get stuck in ___lwp_cond_wait() with all signals blocked
2952 	 * due to taking a deferred signal before going to sleep.
2953 	 */
2954 	set_parking_flag(self, 1);
2955 	if (self->ul_cursig != 0 ||
2956 	    (self->ul_cancelable && self->ul_cancel_pending))
2957 		set_parking_flag(self, 0);
2958 	error = ___lwp_cond_wait(cvp, mp, tsp, 1);
2959 	set_parking_flag(self, 0);
2960 	self->ul_sp = 0;
2961 	self->ul_wchan = NULL;
2962 	return (error);
2963 }
2964 
/*
 * Wait on 'cvp' through cond_sleep_kernel(), then reacquire 'mp'.
 * 'tsp' is passed through to cond_sleep_kernel(); NULL means no timeout.
 * Returns the result of the sleep, unless mutex_lock_impl() reports a
 * non-zero error, which then takes precedence.
 */
2965 int
2966 cond_wait_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
2967 {
2968 	ulwp_t *self = curthread;
2969 	int error;
2970 	int merror;
2971 
2972 	/*
2973 	 * See the large comment in cond_wait_queue(), above.
2974 	 */
2975 	if (self->ul_cond_wait_defer)
2976 		sigoff(self);
2977 
2978 	error = cond_sleep_kernel(cvp, mp, tsp);
2979 
2980 	/*
2981 	 * Override the return code from ___lwp_cond_wait()
2982 	 * with any non-zero return code from mutex_lock().
2983 	 * This addresses robust lock failures in particular;
2984 	 * the caller must see the EOWNERDEAD or ENOTRECOVERABLE
2985 	 * errors in order to take corrective action.
2986 	 */
2987 	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
2988 		error = merror;
2989 
2990 	/*
2991 	 * Take any deferred signal now, after we have reacquired the mutex.
2992 	 */
2993 	if (self->ul_cond_wait_defer)
2994 		sigon(self);
2995 
2996 	return (error);
2997 }
2998 
2999 /*
3000  * Common code for _cond_wait() and _cond_timedwait()
 *
 * 'tsp' is NULL for an untimed wait, otherwise the relative timeout.
 * Returns EINVAL for an invalid timeout (checked before any state is
 * touched), otherwise the result of cond_wait_kernel() or
 * cond_wait_queue().  Also handles event reporting, condvar and mutex
 * statistics, error detection, and the recursion count of 'mp'.
3001  */
3002 int
3003 cond_wait_common(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3004 {
3005 	int mtype = mp->mutex_type;
3006 	hrtime_t begin_sleep = 0;
3007 	ulwp_t *self = curthread;
3008 	uberdata_t *udp = self->ul_uberdata;
3009 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3010 	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
3011 	uint8_t rcount;
3012 	int error = 0;
3013 
3014 	/*
3015 	 * The SUSV3 Posix spec for pthread_cond_timedwait() states:
3016 	 *	Except in the case of [ETIMEDOUT], all these error checks
3017 	 *	shall act as if they were performed immediately at the
3018 	 *	beginning of processing for the function and shall cause
3019 	 *	an error return, in effect, prior to modifying the state
3020 	 *	of the mutex specified by mutex or the condition variable
3021 	 *	specified by cond.
3022 	 * Therefore, we must return EINVAL now if the timeout is invalid.
3023 	 */
	/* the unsigned cast makes a negative tv_nsec fail the range test too */
3024 	if (tsp != NULL &&
3025 	    (tsp->tv_sec < 0 || (ulong_t)tsp->tv_nsec >= NANOSEC))
3026 		return (EINVAL);
3027 
3028 	if (__td_event_report(self, TD_SLEEP, udp)) {
3029 		self->ul_sp = stkptr();
3030 		self->ul_wchan = cvp;
3031 		self->ul_td_evbuf.eventnum = TD_SLEEP;
3032 		self->ul_td_evbuf.eventdata = cvp;
3033 		tdb_event(TD_SLEEP, udp);
3034 		self->ul_sp = 0;
3035 	}
3036 	if (csp) {
3037 		if (tsp)
3038 			tdb_incr(csp->cond_timedwait);
3039 		else
3040 			tdb_incr(csp->cond_wait);
3041 	}
	/*
	 * begin_sleep is taken from record_hold_time() when mutex statistics
	 * exist, else from gethrtime() when only condvar statistics exist.
	 */
3042 	if (msp)
3043 		begin_sleep = record_hold_time(msp);
3044 	else if (csp)
3045 		begin_sleep = gethrtime();
3046 
3047 	if (self->ul_error_detection) {
3048 		if (!mutex_is_held(mp))
3049 			lock_error(mp, "cond_wait", cvp, NULL);
3050 		if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0)
3051 			lock_error(mp, "recursive mutex in cond_wait",
3052 			    cvp, NULL);
3053 		if (cvp->cond_type & USYNC_PROCESS) {
3054 			if (!(mtype & USYNC_PROCESS))
3055 				lock_error(mp, "cond_wait", cvp,
3056 				    "condvar process-shared, "
3057 				    "mutex process-private");
3058 		} else {
3059 			if (mtype & USYNC_PROCESS)
3060 				lock_error(mp, "cond_wait", cvp,
3061 				    "condvar process-private, "
3062 				    "mutex process-shared");
3063 		}
3064 	}
3065 
3066 	/*
3067 	 * We deal with recursive mutexes by completely
3068 	 * dropping the lock and restoring the recursion
3069 	 * count after waking up.  This is arguably wrong,
3070 	 * but it obeys the principle of least astonishment.
3071 	 */
3072 	rcount = mp->mutex_rcount;
3073 	mp->mutex_rcount = 0;
	/*
	 * A process-shared, priority-inheritance or priority-protect mutex,
	 * or a process-shared condvar, selects the kernel-based wait;
	 * everything else uses the user-level sleep queues.
	 */
3074 	if ((mtype &
3075 	    (USYNC_PROCESS | LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT)) |
3076 	    (cvp->cond_type & USYNC_PROCESS))
3077 		error = cond_wait_kernel(cvp, mp, tsp);
3078 	else
3079 		error = cond_wait_queue(cvp, mp, tsp);
3080 	mp->mutex_rcount = rcount;
3081 
3082 	if (csp) {
3083 		hrtime_t lapse = gethrtime() - begin_sleep;
3084 		if (tsp == NULL)
3085 			csp->cond_wait_sleep_time += lapse;
3086 		else {
3087 			csp->cond_timedwait_sleep_time += lapse;
3088 			if (error == ETIME)
3089 				tdb_incr(csp->cond_timedwait_timeout);
3090 		}
3091 	}
3092 	return (error);
3093 }
3094 
3095 /*
3096  * cond_wait() is a cancellation point but _cond_wait() is not.
3097  * System libraries call the non-cancellation version.
3098  * It is expected that only applications call the cancellation version.
 *
 * _cond_wait() waits on 'cvp' with 'mp' and no timeout.  It returns the
 * result of cond_wait_queue() on the fast path and of cond_wait_common()
 * otherwise.
3099  */
3100 int
3101 _cond_wait(cond_t *cvp, mutex_t *mp)
3102 {
3103 	ulwp_t *self = curthread;
3104 	uberdata_t *udp = self->ul_uberdata;
3105 	uberflags_t *gflags;
3106 
3107 	/*
3108 	 * Optimize the common case of USYNC_THREAD plus
3109 	 * no error detection, no lock statistics, and no event tracing.
3110 	 */
	/* all of these values OR-ed together must be zero for the fast path */
3111 	if ((gflags = self->ul_schedctl_called) != NULL &&
3112 	    (cvp->cond_type | mp->mutex_type | gflags->uf_trs_ted |
3113 	    self->ul_td_events_enable |
3114 	    udp->tdb.tdb_ev_global_mask.event_bits[0]) == 0)
3115 		return (cond_wait_queue(cvp, mp, NULL));
3116 
3117 	/*
3118 	 * Else do it the long way.
3119 	 */
3120 	return (cond_wait_common(cvp, mp, NULL));
3121 }
3122 
3123 int
3124 cond_wait(cond_t *cvp, mutex_t *mp)
3125 {
3126 	int error;
3127 
3128 	_cancelon();
3129 	error = _cond_wait(cvp, mp);
3130 	if (error == EINTR)
3131 		_canceloff();
3132 	else
3133 		_canceloff_nocancel();
3134 	return (error);
3135 }
3136 
3137 #pragma weak pthread_cond_wait = _pthread_cond_wait
3138 int
3139 _pthread_cond_wait(cond_t *cvp, mutex_t *mp)
3140 {
3141 	int error;
3142 
3143 	error = cond_wait(cvp, mp);
3144 	return ((error == EINTR)? 0 : error);
3145 }
3146 
3147 /*
3148  * cond_timedwait() is a cancellation point but _cond_timedwait() is not.
3149  * System libraries call the non-cancellation version.
3150  * It is expected that only applications call the cancellation version.
3151  */
3152 int
3153 _cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3154 {
3155 	clockid_t clock_id = cvp->cond_clockid;
3156 	timespec_t reltime;
3157 	int error;
3158 
3159 	if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_HIGHRES)
3160 		clock_id = CLOCK_REALTIME;
3161 	abstime_to_reltime(clock_id, abstime, &reltime);
3162 	error = cond_wait_common(cvp, mp, &reltime);
3163 	if (error == ETIME && clock_id == CLOCK_HIGHRES) {
3164 		/*
3165 		 * Don't return ETIME if we didn't really get a timeout.
3166 		 * This can happen if we return because someone resets
3167 		 * the system clock.  Just return zero in this case,
3168 		 * giving a spurious wakeup but not a timeout.
3169 		 */
3170 		if ((hrtime_t)(uint32_t)abstime->tv_sec * NANOSEC +
3171 		    abstime->tv_nsec > gethrtime())
3172 			error = 0;
3173 	}
3174 	return (error);
3175 }
3176 
3177 int
3178 cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3179 {
3180 	int error;
3181 
3182 	_cancelon();
3183 	error = _cond_timedwait(cvp, mp, abstime);
3184 	if (error == EINTR)
3185 		_canceloff();
3186 	else
3187 		_canceloff_nocancel();
3188 	return (error);
3189 }
3190 
3191 #pragma weak pthread_cond_timedwait = _pthread_cond_timedwait
3192 int
3193 _pthread_cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3194 {
3195 	int error;
3196 
3197 	error = cond_timedwait(cvp, mp, abstime);
3198 	if (error == ETIME)
3199 		error = ETIMEDOUT;
3200 	else if (error == EINTR)
3201 		error = 0;
3202 	return (error);
3203 }
3204 
3205 /*
3206  * cond_reltimedwait() is a cancellation point but _cond_reltimedwait()
3207  * is not.  System libraries call the non-cancellation version.
3208  * It is expected that only applications call the cancellation version.
3209  */
3210 int
3211 _cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3212 {
3213 	timespec_t tslocal = *reltime;
3214 
3215 	return (cond_wait_common(cvp, mp, &tslocal));
3216 }
3217 
3218 #pragma weak cond_reltimedwait = _cond_reltimedwait_cancel
3219 int
3220 _cond_reltimedwait_cancel(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3221 {
3222 	int error;
3223 
3224 	_cancelon();
3225 	error = _cond_reltimedwait(cvp, mp, reltime);
3226 	if (error == EINTR)
3227 		_canceloff();
3228 	else
3229 		_canceloff_nocancel();
3230 	return (error);
3231 }
3232 
3233 #pragma weak pthread_cond_reltimedwait_np = _pthread_cond_reltimedwait_np
3234 int
3235 _pthread_cond_reltimedwait_np(cond_t *cvp, mutex_t *mp,
3236 	const timespec_t *reltime)
3237 {
3238 	int error;
3239 
3240 	error = _cond_reltimedwait_cancel(cvp, mp, reltime);
3241 	if (error == ETIME)
3242 		error = ETIMEDOUT;
3243 	else if (error == EINTR)
3244 		error = 0;
3245 	return (error);
3246 }
3247 
/*
 * Wake one thread waiting on the condition variable cvp.
 *
 * If cond_waiters_kernel is set, __lwp_cond_signal() is called and its
 * return value becomes this function's return value; otherwise 0 is
 * returned.  If cond_waiters_user is set, one waiter whose ul_wchan is
 * cvp is taken off the user-level CV sleep queue, preferring a waiter
 * whose ul_stop is clear.  That waiter is either moved to the sleep
 * queue of the mutex it will reacquire (when the caller owns that mutex
 * and the waiter's ul_cv_wake is clear) or dequeued and unparked.
 */
#pragma weak pthread_cond_signal = cond_signal_internal
#pragma weak _pthread_cond_signal = cond_signal_internal
#pragma weak cond_signal = cond_signal_internal
#pragma weak _cond_signal = cond_signal_internal
int
cond_signal_internal(cond_t *cvp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
	int error = 0;
	queue_head_t *qp;
	mutex_t *mp;
	queue_head_t *mqp;
	ulwp_t **ulwpp;
	ulwp_t *ulwp;
	ulwp_t *prev = NULL;
	ulwp_t *next;
	ulwp_t **suspp = NULL;
	ulwp_t *susprev;	/* only meaningful while suspp != NULL */

	if (csp)
		tdb_incr(csp->cond_signal);

	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
		error = __lwp_cond_signal(cvp);

	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
		return (error);

	/*
	 * Move someone from the condvar sleep queue to the mutex sleep
	 * queue for the mutex that he will acquire on being waked up.
	 * We can do this only if we own the mutex he will acquire.
	 * If we do not own the mutex, or if his ul_cv_wake flag
	 * is set, just dequeue and unpark him.
	 */
	qp = queue_lock(cvp, CV);
	/*
	 * Walk the queue looking for a waiter on cvp (entries with a
	 * different ul_wchan are skipped).  ulwpp always addresses the
	 * link that points at ulwp and prev is the element before it,
	 * so the chosen waiter can be unlinked in place below.
	 */
	for (ulwpp = &qp->qh_head; (ulwp = *ulwpp) != NULL;
	    prev = ulwp, ulwpp = &ulwp->ul_link) {
		if (ulwp->ul_wchan == cvp) {
			if (!ulwp->ul_stop)
				break;
			/*
			 * Try not to dequeue a suspended thread.
			 * This mimics the old libthread's behavior.
			 */
			if (suspp == NULL) {
				suspp = ulwpp;
				susprev = prev;
			}
		}
	}
	/*
	 * Every waiter on cvp had ul_stop set: fall back to the first
	 * such waiter that was remembered during the scan.
	 */
	if (ulwp == NULL && suspp != NULL) {
		ulwp = *(ulwpp = suspp);
		prev = susprev;
		suspp = NULL;
	}
	if (ulwp == NULL) {	/* no one on the sleep queue */
		cvp->cond_waiters_user = 0;
		queue_unlock(qp);
		return (error);
	}
	/*
	 * Scan the remainder of the CV queue for another waiter.
	 */
	if (suspp != NULL) {
		/* a passed-over waiter with ul_stop set is still queued */
		next = *suspp;
	} else {
		for (next = ulwp->ul_link; next != NULL; next = next->ul_link)
			if (next->ul_wchan == cvp)
				break;
	}
	if (next == NULL)
		cvp->cond_waiters_user = 0;

	/*
	 * Inform the thread that he was the recipient of a cond_signal().
	 * This lets him deal with cond_signal() and, concurrently,
	 * one or more of a cancellation, a UNIX signal, or a timeout.
	 * These latter conditions must not consume a cond_signal().
	 */
	ulwp->ul_signalled = 1;

	/*
	 * Dequeue the waiter but leave his ul_sleepq non-NULL
	 * while we move him to the mutex queue so that he can
	 * deal properly with spurious wakeups.
	 */
	*ulwpp = ulwp->ul_link;
	ulwp->ul_link = NULL;
	if (qp->qh_tail == ulwp)
		qp->qh_tail = prev;
	qp->qh_qlen--;

	mp = ulwp->ul_cvmutex;		/* the mutex he will acquire */
	ulwp->ul_cvmutex = NULL;
	ASSERT(mp != NULL);

	if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
		/*
		 * The lwpid is captured while the queue lock is still
		 * held; ulwp is not referenced after queue_unlock().
		 */
		lwpid_t lwpid = ulwp->ul_lwpid;

		no_preempt(self);
		ulwp->ul_sleepq = NULL;
		ulwp->ul_wchan = NULL;
		ulwp->ul_cv_wake = 0;
		queue_unlock(qp);
		(void) __lwp_unpark(lwpid);
		preempt(self);
	} else {
		/*
		 * We own the mutex: requeue the waiter on the mutex's
		 * sleep queue and flag the mutex as having waiters.
		 * The MX queue lock is taken while the CV queue lock
		 * is still held and released first.
		 */
		mqp = queue_lock(mp, MX);
		enqueue(mqp, ulwp, mp, MX);
		mp->mutex_waiters = 1;
		queue_unlock(mqp);
		queue_unlock(qp);
	}

	return (error);
}
3367 
3368 /*
3369  * Utility function called by mutex_wakeup_all(), cond_broadcast(),
3370  * and rw_queue_release() to (re)allocate a big buffer to hold the
3371  * lwpids of all the threads to be set running after they are removed
3372  * from their sleep queues.  Since we are holding a queue lock, we
3373  * cannot call any function that might acquire a lock.  mmap(), munmap(),
3374  * lwp_unpark_all() are simple system calls and are safe in this regard.
3375  */
3376 lwpid_t *
3377 alloc_lwpids(lwpid_t *lwpid, int *nlwpid_ptr, int *maxlwps_ptr)
3378 {
3379 	/*
3380 	 * Allocate NEWLWPS ids on the first overflow.
3381 	 * Double the allocation each time after that.
3382 	 */
3383 	int nlwpid = *nlwpid_ptr;
3384 	int maxlwps = *maxlwps_ptr;
3385 	int first_allocation;
3386 	int newlwps;
3387 	void *vaddr;
3388 
3389 	ASSERT(nlwpid == maxlwps);
3390 
3391 	first_allocation = (maxlwps == MAXLWPS);
3392 	newlwps = first_allocation? NEWLWPS : 2 * maxlwps;
3393 	vaddr = _private_mmap(NULL, newlwps * sizeof (lwpid_t),
3394 	    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
3395 
3396 	if (vaddr == MAP_FAILED) {
3397 		/*
3398 		 * Let's hope this never happens.
3399 		 * If it does, then we have a terrible
3400 		 * thundering herd on our hands.
3401 		 */
3402 		(void) __lwp_unpark_all(lwpid, nlwpid);
3403 		*nlwpid_ptr = 0;
3404 	} else {
3405 		(void) _memcpy(vaddr, lwpid, maxlwps * sizeof (lwpid_t));
3406 		if (!first_allocation)
3407 			(void) _private_munmap(lwpid,
3408 			    maxlwps * sizeof (lwpid_t));
3409 		lwpid = vaddr;
3410 		*maxlwps_ptr = newlwps;
3411 	}
3412 
3413 	return (lwpid);
3414 }
3415 
/*
 * Wake every thread waiting on the condition variable cvp.
 *
 * If cond_waiters_kernel is set, __lwp_cond_broadcast() is called and
 * its return value becomes this function's return value; otherwise 0
 * is returned.  If cond_waiters_user is set, every waiter on the
 * user-level CV sleep queue whose ul_wchan is cvp is removed from it
 * and is either moved to the sleep queue of the mutex it will
 * reacquire or collected in lwpid[] and unparked once the CV queue
 * lock has been dropped.
 */
#pragma weak pthread_cond_broadcast = cond_broadcast_internal
#pragma weak _pthread_cond_broadcast = cond_broadcast_internal
#pragma weak cond_broadcast = cond_broadcast_internal
#pragma weak _cond_broadcast = cond_broadcast_internal
int
cond_broadcast_internal(cond_t *cvp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
	int error = 0;
	queue_head_t *qp;
	mutex_t *mp;
	mutex_t *mp_cache = NULL;	/* mutex whose MX queue is locked */
	queue_head_t *mqp = NULL;	/* that locked MX queue, if any */
	ulwp_t **ulwpp;
	ulwp_t *ulwp;
	ulwp_t *prev = NULL;
	int nlwpid = 0;			/* number of ids stored in lwpid[] */
	int maxlwps = MAXLWPS;		/* current capacity of lwpid[] */
	lwpid_t buffer[MAXLWPS];
	lwpid_t *lwpid = buffer;

	if (csp)
		tdb_incr(csp->cond_broadcast);

	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
		error = __lwp_cond_broadcast(cvp);

	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
		return (error);

	/*
	 * Move everyone from the condvar sleep queue to the mutex sleep
	 * queue for the mutex that they will acquire on being waked up.
	 * We can do this only if we own the mutex they will acquire.
	 * If we do not own the mutex, or if their ul_cv_wake flag
	 * is set, just dequeue and unpark them.
	 *
	 * We keep track of lwpids that are to be unparked in lwpid[].
	 * __lwp_unpark_all() is called to unpark all of them after
	 * they have been removed from the sleep queue and the sleep
	 * queue lock has been dropped.  If we run out of space in our
	 * on-stack buffer, we need to allocate more but we can't call
	 * lmalloc() because we are holding a queue lock when the overflow
	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
	 * either because the application may have allocated a small
	 * stack and we don't want to overrun the stack.  So we call
	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
	 * system call directly since that path acquires no locks.
	 */
	qp = queue_lock(cvp, CV);
	cvp->cond_waiters_user = 0;
	ulwpp = &qp->qh_head;
	while ((ulwp = *ulwpp) != NULL) {
		/* not waiting on cvp: leave it queued and step past it */
		if (ulwp->ul_wchan != cvp) {
			prev = ulwp;
			ulwpp = &ulwp->ul_link;
			continue;
		}
		/*
		 * Unlink ulwp in place.  ulwpp and prev are not advanced:
		 * *ulwpp now refers to the element that followed ulwp.
		 */
		*ulwpp = ulwp->ul_link;
		ulwp->ul_link = NULL;
		if (qp->qh_tail == ulwp)
			qp->qh_tail = prev;
		qp->qh_qlen--;
		mp = ulwp->ul_cvmutex;		/* his mutex */
		ulwp->ul_cvmutex = NULL;
		ASSERT(mp != NULL);
		if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
			ulwp->ul_sleepq = NULL;
			ulwp->ul_wchan = NULL;
			ulwp->ul_cv_wake = 0;
			/*
			 * Buffer full: get a bigger one.  If that fails,
			 * alloc_lwpids() unparks what has been collected
			 * so far and resets nlwpid to 0.
			 */
			if (nlwpid == maxlwps)
				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
			lwpid[nlwpid++] = ulwp->ul_lwpid;
		} else {
			/*
			 * Keep the MX queue lock across consecutive waiters
			 * that use the same mutex; switch locks only when
			 * the mutex changes.
			 */
			if (mp != mp_cache) {
				mp_cache = mp;
				if (mqp != NULL)
					queue_unlock(mqp);
				mqp = queue_lock(mp, MX);
			}
			enqueue(mqp, ulwp, mp, MX);
			mp->mutex_waiters = 1;
		}
	}
	if (mqp != NULL)
		queue_unlock(mqp);
	if (nlwpid == 0) {
		queue_unlock(qp);
	} else {
		no_preempt(self);
		queue_unlock(qp);
		if (nlwpid == 1)
			(void) __lwp_unpark(lwpid[0]);
		else
			(void) __lwp_unpark_all(lwpid, nlwpid);
		preempt(self);
	}
	/* release the overflow buffer if alloc_lwpids() replaced buffer[] */
	if (lwpid != buffer)
		(void) _private_munmap(lwpid, maxlwps * sizeof (lwpid_t));
	return (error);
}
3519 
/*
 * Destroy the condition variable cvp: clear its cond_magic field
 * and call tdb_sync_obj_deregister() on it.  Always returns 0.
 */
#pragma weak pthread_cond_destroy = _cond_destroy
#pragma weak _pthread_cond_destroy = _cond_destroy
#pragma weak cond_destroy = _cond_destroy
int
_cond_destroy(cond_t *cvp)
{
	cvp->cond_magic = 0;
	tdb_sync_obj_deregister(cvp);
	return (0);
}
3530 
#if defined(THREAD_DEBUG)
/*
 * Debug-only sanity check (compiled only when THREAD_DEBUG is defined).
 * Asserts that the calling thread's ul_critical is zero unless its
 * ul_bindflags is non-zero.
 */
void
assert_no_libc_locks_held(void)
{
	ulwp_t *self = curthread;

	ASSERT(!self->ul_critical || self->ul_bindflags);
}
#endif
3538 
/*
 * Running totals of the per-thread spin lock counters.
 * record_spin_locks() adds a thread's counters into these and
 * dump_queue_statistics() prints them.
 */
/* protected by link_lock */
uint64_t spin_lock_spin;
uint64_t spin_lock_spin2;
uint64_t spin_lock_sleep;
uint64_t spin_lock_wakeup;
3544 
3545 /*
3546  * Record spin lock statistics.
3547  * Called by a thread exiting itself in thrp_exit().
3548  * Also called via atexit() from the thread calling
3549  * exit() to do all the other threads as well.
3550  */
3551 void
3552 record_spin_locks(ulwp_t *ulwp)
3553 {
3554 	spin_lock_spin += ulwp->ul_spin_lock_spin;
3555 	spin_lock_spin2 += ulwp->ul_spin_lock_spin2;
3556 	spin_lock_sleep += ulwp->ul_spin_lock_sleep;
3557 	spin_lock_wakeup += ulwp->ul_spin_lock_wakeup;
3558 	ulwp->ul_spin_lock_spin = 0;
3559 	ulwp->ul_spin_lock_spin2 = 0;
3560 	ulwp->ul_spin_lock_sleep = 0;
3561 	ulwp->ul_spin_lock_wakeup = 0;
3562 }
3563 
3564 /*
3565  * atexit function:  dump the queue statistics to stderr.
3566  */
3567 #if !defined(__lint)
3568 #define	fprintf	_fprintf
3569 #endif
3570 #include <stdio.h>
3571 void
3572 dump_queue_statistics(void)
3573 {
3574 	uberdata_t *udp = curthread->ul_uberdata;
3575 	queue_head_t *qp;
3576 	int qn;
3577 	uint64_t spin_lock_total = 0;
3578 
3579 	if (udp->queue_head == NULL || thread_queue_dump == 0)
3580 		return;
3581 
3582 	if (fprintf(stderr, "\n%5d mutex queues:\n", QHASHSIZE) < 0 ||
3583 	    fprintf(stderr, "queue#   lockcount    max qlen\n") < 0)
3584 		return;
3585 	for (qn = 0, qp = udp->queue_head; qn < QHASHSIZE; qn++, qp++) {
3586 		if (qp->qh_lockcount == 0)
3587 			continue;
3588 		spin_lock_total += qp->qh_lockcount;
3589 		if (fprintf(stderr, "%5d %12llu%12u\n", qn,
3590 		    (u_longlong_t)qp->qh_lockcount, qp->qh_qmax) < 0)
3591 			return;
3592 	}
3593 
3594 	if (fprintf(stderr, "\n%5d condvar queues:\n", QHASHSIZE) < 0 ||
3595 	    fprintf(stderr, "queue#   lockcount    max qlen\n") < 0)
3596 		return;
3597 	for (qn = 0; qn < QHASHSIZE; qn++, qp++) {
3598 		if (qp->qh_lockcount == 0)
3599 			continue;
3600 		spin_lock_total += qp->qh_lockcount;
3601 		if (fprintf(stderr, "%5d %12llu%12u\n", qn,
3602 		    (u_longlong_t)qp->qh_lockcount, qp->qh_qmax) < 0)
3603 			return;
3604 	}
3605 
3606 	(void) fprintf(stderr, "\n  spin_lock_total  = %10llu\n",
3607 	    (u_longlong_t)spin_lock_total);
3608 	(void) fprintf(stderr, "  spin_lock_spin   = %10llu\n",
3609 	    (u_longlong_t)spin_lock_spin);
3610 	(void) fprintf(stderr, "  spin_lock_spin2  = %10llu\n",
3611 	    (u_longlong_t)spin_lock_spin2);
3612 	(void) fprintf(stderr, "  spin_lock_sleep  = %10llu\n",
3613 	    (u_longlong_t)spin_lock_sleep);
3614 	(void) fprintf(stderr, "  spin_lock_wakeup = %10llu\n",
3615 	    (u_longlong_t)spin_lock_wakeup);
3616 }
3617