xref: /titanic_44/usr/src/lib/libc/port/threads/synch.c (revision 32b87932f3ef0887d873b7f6d2d1943799b2afc0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #define	atomic_cas_64	_atomic_cas_64
30 
31 #include "lint.h"
32 #include "thr_uberdata.h"
33 #include <sys/rtpriocntl.h>
34 #include <sys/sdt.h>
35 #include <atomic.h>
36 
/*
 * Statistics counters.  These expand to real updates only when libc is
 * built with THREAD_DEBUG; otherwise they expand to nothing.
 *
 *	INCR32(x)	increment x unless it already equals UINT32_MAX
 *	INCR(x)		post-increment x
 *	DECR(x)		post-decrement x
 *	MAXINCR(m, x)	pre-increment x, then raise m to x if x exceeds m
 *
 * The arguments may be evaluated more than once, so pass only simple
 * lvalues without side effects.  Every argument is parenthesized so that
 * any lvalue expression can be passed safely.
 */
#if defined(THREAD_DEBUG)
#define	INCR32(x)	(((x) != UINT32_MAX)? (x)++ : 0)
#define	INCR(x)		((x)++)
#define	DECR(x)		((x)--)
#define	MAXINCR(m, x)	(((m) < ++(x))? ((m) = (x)) : 0)
#else
#define	INCR32(x)
#define	INCR(x)
#define	DECR(x)
#define	MAXINCR(m, x)
#endif
48 
/*
 * This mutex is initialized to be held by lwp#1.
 * It is used to block a thread that has returned from a mutex_lock()
 * of a LOCK_PRIO_INHERIT mutex with an unrecoverable error.
 * mutex_setup() acquires it once at library initialization.
 */
mutex_t	stall_mutex = DEFAULTMUTEX;

/*
 * Forward declarations of file-local helpers.
 */
static int shared_mutex_held(mutex_t *);
static int mutex_queuelock_adaptive(mutex_t *);
static void mutex_wakeup_all(mutex_t *);
59 
60 /*
61  * Lock statistics support functions.
62  */
63 void
64 record_begin_hold(tdb_mutex_stats_t *msp)
65 {
66 	tdb_incr(msp->mutex_lock);
67 	msp->mutex_begin_hold = gethrtime();
68 }
69 
70 hrtime_t
71 record_hold_time(tdb_mutex_stats_t *msp)
72 {
73 	hrtime_t now = gethrtime();
74 
75 	if (msp->mutex_begin_hold)
76 		msp->mutex_hold_time += now - msp->mutex_begin_hold;
77 	msp->mutex_begin_hold = 0;
78 	return (now);
79 }
80 
81 /*
82  * Called once at library initialization.
83  */
84 void
85 mutex_setup(void)
86 {
87 	if (set_lock_byte(&stall_mutex.mutex_lockw))
88 		thr_panic("mutex_setup() cannot acquire stall_mutex");
89 	stall_mutex.mutex_owner = (uintptr_t)curthread;
90 }
91 
92 /*
93  * The default spin count of 1000 is experimentally determined.
94  * On sun4u machines with any number of processors it could be raised
95  * to 10,000 but that (experimentally) makes almost no difference.
96  * The environment variable:
97  *	_THREAD_ADAPTIVE_SPIN=count
98  * can be used to override and set the count in the range [0 .. 1,000,000].
99  */
int	thread_adaptive_spin = 1000;
/* NOTE(review): presumably the cap on concurrent spinners; confirm users */
uint_t	thread_max_spinners = 100;
/* non-zero enables the expensive per-thread checks in QVERIFY() */
int	thread_queue_verify = 0;
/* NOTE(review): presumably a cached CPU count; not set in this part */
static	int	ncpus;
104 
105 /*
106  * Distinguish spinning for queue locks from spinning for regular locks.
107  * We try harder to acquire queue locks by spinning.
108  * The environment variable:
109  *	_THREAD_QUEUE_SPIN=count
110  * can be used to override and set the count in the range [0 .. 1,000,000].
111  */
int	thread_queue_spin = 10000;

/*
 * All attribute flags that may be OR-ed into a mutex type.
 * __mutex_init() masks these off to obtain the base type.
 */
#define	ALL_ATTRIBUTES				\
	(LOCK_RECURSIVE | LOCK_ERRORCHECK |	\
	LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT |	\
	LOCK_ROBUST)
118 
119 /*
120  * 'type' can be one of USYNC_THREAD, USYNC_PROCESS, or USYNC_PROCESS_ROBUST,
121  * augmented by zero or more the flags:
122  *	LOCK_RECURSIVE
123  *	LOCK_ERRORCHECK
124  *	LOCK_PRIO_INHERIT
125  *	LOCK_PRIO_PROTECT
126  *	LOCK_ROBUST
127  */
128 #pragma weak mutex_init = __mutex_init
129 #pragma weak _mutex_init = __mutex_init
130 /* ARGSUSED2 */
131 int
132 __mutex_init(mutex_t *mp, int type, void *arg)
133 {
134 	int basetype = (type & ~ALL_ATTRIBUTES);
135 	const pcclass_t *pccp;
136 	int error = 0;
137 	int ceil;
138 
139 	if (basetype == USYNC_PROCESS_ROBUST) {
140 		/*
141 		 * USYNC_PROCESS_ROBUST is a deprecated historical type.
142 		 * We change it into (USYNC_PROCESS | LOCK_ROBUST) but
143 		 * retain the USYNC_PROCESS_ROBUST flag so we can return
144 		 * ELOCKUNMAPPED when necessary (only USYNC_PROCESS_ROBUST
145 		 * mutexes will ever draw ELOCKUNMAPPED).
146 		 */
147 		type |= (USYNC_PROCESS | LOCK_ROBUST);
148 		basetype = USYNC_PROCESS;
149 	}
150 
151 	if (type & LOCK_PRIO_PROTECT)
152 		pccp = get_info_by_policy(SCHED_FIFO);
153 	if ((basetype != USYNC_THREAD && basetype != USYNC_PROCESS) ||
154 	    (type & (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT))
155 	    == (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT) ||
156 	    ((type & LOCK_PRIO_PROTECT) &&
157 	    ((ceil = *(int *)arg) < pccp->pcc_primin ||
158 	    ceil > pccp->pcc_primax))) {
159 		error = EINVAL;
160 	} else if (type & LOCK_ROBUST) {
161 		/*
162 		 * Callers of mutex_init() with the LOCK_ROBUST attribute
163 		 * are required to pass an initially all-zero mutex.
164 		 * Multiple calls to mutex_init() are allowed; all but
165 		 * the first return EBUSY.  A call to mutex_init() is
166 		 * allowed to make an inconsistent robust lock consistent
167 		 * (for historical usage, even though the proper interface
168 		 * for this is mutex_consistent()).  Note that we use
169 		 * atomic_or_16() to set the LOCK_INITED flag so as
170 		 * not to disturb surrounding bits (LOCK_OWNERDEAD, etc).
171 		 */
172 		extern void _atomic_or_16(volatile uint16_t *, uint16_t);
173 		if (!(mp->mutex_flag & LOCK_INITED)) {
174 			mp->mutex_type = (uint8_t)type;
175 			_atomic_or_16(&mp->mutex_flag, LOCK_INITED);
176 			mp->mutex_magic = MUTEX_MAGIC;
177 		} else if (type != mp->mutex_type ||
178 		    ((type & LOCK_PRIO_PROTECT) && mp->mutex_ceiling != ceil)) {
179 			error = EINVAL;
180 		} else if (__mutex_consistent(mp) != 0) {
181 			error = EBUSY;
182 		}
183 		/* register a process robust mutex with the kernel */
184 		if (basetype == USYNC_PROCESS)
185 			register_lock(mp);
186 	} else {
187 		(void) memset(mp, 0, sizeof (*mp));
188 		mp->mutex_type = (uint8_t)type;
189 		mp->mutex_flag = LOCK_INITED;
190 		mp->mutex_magic = MUTEX_MAGIC;
191 	}
192 
193 	if (error == 0 && (type & LOCK_PRIO_PROTECT)) {
194 		mp->mutex_ceiling = ceil;
195 	}
196 
197 	return (error);
198 }
199 
200 /*
201  * Delete mp from list of ceiling mutexes owned by curthread.
202  * Return 1 if the head of the chain was updated.
203  */
204 int
205 _ceil_mylist_del(mutex_t *mp)
206 {
207 	ulwp_t *self = curthread;
208 	mxchain_t **mcpp;
209 	mxchain_t *mcp;
210 
211 	for (mcpp = &self->ul_mxchain;
212 	    (mcp = *mcpp) != NULL;
213 	    mcpp = &mcp->mxchain_next) {
214 		if (mcp->mxchain_mx == mp) {
215 			*mcpp = mcp->mxchain_next;
216 			lfree(mcp, sizeof (*mcp));
217 			return (mcpp == &self->ul_mxchain);
218 		}
219 	}
220 	return (0);
221 }
222 
223 /*
224  * Add mp to the list of ceiling mutexes owned by curthread.
225  * Return ENOMEM if no memory could be allocated.
226  */
227 int
228 _ceil_mylist_add(mutex_t *mp)
229 {
230 	ulwp_t *self = curthread;
231 	mxchain_t *mcp;
232 
233 	if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
234 		return (ENOMEM);
235 	mcp->mxchain_mx = mp;
236 	mcp->mxchain_next = self->ul_mxchain;
237 	self->ul_mxchain = mcp;
238 	return (0);
239 }
240 
241 /*
242  * Helper function for _ceil_prio_inherit() and _ceil_prio_waive(), below.
243  */
244 static void
245 set_rt_priority(ulwp_t *self, int prio)
246 {
247 	pcparms_t pcparm;
248 
249 	pcparm.pc_cid = self->ul_rtclassid;
250 	((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
251 	((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
252 	(void) priocntl(P_LWPID, self->ul_lwpid, PC_SETPARMS, &pcparm);
253 }
254 
255 /*
256  * Inherit priority from ceiling.
257  * This changes the effective priority, not the assigned priority.
258  */
259 void
260 _ceil_prio_inherit(int prio)
261 {
262 	ulwp_t *self = curthread;
263 
264 	self->ul_epri = prio;
265 	set_rt_priority(self, prio);
266 }
267 
268 /*
269  * Waive inherited ceiling priority.  Inherit from head of owned ceiling locks
270  * if holding at least one ceiling lock.  If no ceiling locks are held at this
271  * point, disinherit completely, reverting back to assigned priority.
272  */
273 void
274 _ceil_prio_waive(void)
275 {
276 	ulwp_t *self = curthread;
277 	mxchain_t *mcp = self->ul_mxchain;
278 	int prio;
279 
280 	if (mcp == NULL) {
281 		prio = self->ul_pri;
282 		self->ul_epri = 0;
283 	} else {
284 		prio = mcp->mxchain_mx->mutex_ceiling;
285 		self->ul_epri = prio;
286 	}
287 	set_rt_priority(self, prio);
288 }
289 
290 /*
291  * Clear the lock byte.  Retain the waiters byte and the spinners byte.
292  * Return the old value of the lock word.
293  */
294 static uint32_t
295 clear_lockbyte(volatile uint32_t *lockword)
296 {
297 	uint32_t old;
298 	uint32_t new;
299 
300 	do {
301 		old = *lockword;
302 		new = old & ~LOCKMASK;
303 	} while (atomic_cas_32(lockword, old, new) != old);
304 
305 	return (old);
306 }
307 
308 /*
309  * Same as clear_lockbyte(), but operates on mutex_lockword64.
310  * The mutex_ownerpid field is cleared along with the lock byte.
311  */
312 static uint64_t
313 clear_lockbyte64(volatile uint64_t *lockword64)
314 {
315 	uint64_t old;
316 	uint64_t new;
317 
318 	do {
319 		old = *lockword64;
320 		new = old & ~LOCKMASK64;
321 	} while (atomic_cas_64(lockword64, old, new) != old);
322 
323 	return (old);
324 }
325 
326 /*
327  * Similar to set_lock_byte(), which only tries to set the lock byte.
328  * Here, we attempt to set the lock byte AND the mutex_ownerpid,
329  * keeping the remaining bytes constant.
330  */
331 static int
332 set_lock_byte64(volatile uint64_t *lockword64, pid_t ownerpid)
333 {
334 	uint64_t old;
335 	uint64_t new;
336 
337 	old = *lockword64 & ~LOCKMASK64;
338 	new = old | ((uint64_t)(uint_t)ownerpid << PIDSHIFT) | LOCKBYTE64;
339 	if (atomic_cas_64(lockword64, old, new) == old)
340 		return (LOCKCLEAR);
341 
342 	return (LOCKSET);
343 }
344 
345 /*
346  * Increment the spinners count in the mutex lock word.
347  * Return 0 on success.  Return -1 if the count would overflow.
348  */
349 static int
350 spinners_incr(volatile uint32_t *lockword, uint8_t max_spinners)
351 {
352 	uint32_t old;
353 	uint32_t new;
354 
355 	do {
356 		old = *lockword;
357 		if (((old & SPINNERMASK) >> SPINNERSHIFT) >= max_spinners)
358 			return (-1);
359 		new = old + (1 << SPINNERSHIFT);
360 	} while (atomic_cas_32(lockword, old, new) != old);
361 
362 	return (0);
363 }
364 
365 /*
366  * Decrement the spinners count in the mutex lock word.
367  * Return the new value of the lock word.
368  */
369 static uint32_t
370 spinners_decr(volatile uint32_t *lockword)
371 {
372 	uint32_t old;
373 	uint32_t new;
374 
375 	do {
376 		new = old = *lockword;
377 		if (new & SPINNERMASK)
378 			new -= (1 << SPINNERSHIFT);
379 	} while (atomic_cas_32(lockword, old, new) != old);
380 
381 	return (new);
382 }
383 
384 /*
385  * Non-preemptive spin locks.  Used by queue_lock().
386  * No lock statistics are gathered for these locks.
387  * No DTrace probes are provided for these locks.
388  */
389 void
390 spin_lock_set(mutex_t *mp)
391 {
392 	ulwp_t *self = curthread;
393 
394 	no_preempt(self);
395 	if (set_lock_byte(&mp->mutex_lockw) == 0) {
396 		mp->mutex_owner = (uintptr_t)self;
397 		return;
398 	}
399 	/*
400 	 * Spin for a while, attempting to acquire the lock.
401 	 */
402 	INCR32(self->ul_spin_lock_spin);
403 	if (mutex_queuelock_adaptive(mp) == 0 ||
404 	    set_lock_byte(&mp->mutex_lockw) == 0) {
405 		mp->mutex_owner = (uintptr_t)self;
406 		return;
407 	}
408 	/*
409 	 * Try harder if we were previously at a no premption level.
410 	 */
411 	if (self->ul_preempt > 1) {
412 		INCR32(self->ul_spin_lock_spin2);
413 		if (mutex_queuelock_adaptive(mp) == 0 ||
414 		    set_lock_byte(&mp->mutex_lockw) == 0) {
415 			mp->mutex_owner = (uintptr_t)self;
416 			return;
417 		}
418 	}
419 	/*
420 	 * Give up and block in the kernel for the mutex.
421 	 */
422 	INCR32(self->ul_spin_lock_sleep);
423 	(void) ___lwp_mutex_timedlock(mp, NULL);
424 	mp->mutex_owner = (uintptr_t)self;
425 }
426 
427 void
428 spin_lock_clear(mutex_t *mp)
429 {
430 	ulwp_t *self = curthread;
431 
432 	mp->mutex_owner = 0;
433 	if (atomic_swap_32(&mp->mutex_lockword, 0) & WAITERMASK) {
434 		(void) ___lwp_mutex_wakeup(mp, 0);
435 		INCR32(self->ul_spin_lock_wakeup);
436 	}
437 	preempt(self);
438 }
439 
440 /*
441  * Allocate the sleep queue hash table.
442  */
443 void
444 queue_alloc(void)
445 {
446 	ulwp_t *self = curthread;
447 	uberdata_t *udp = self->ul_uberdata;
448 	queue_head_t *qp;
449 	void *data;
450 	int i;
451 
452 	/*
453 	 * No locks are needed; we call here only when single-threaded.
454 	 */
455 	ASSERT(self == udp->ulwp_one);
456 	ASSERT(!udp->uberflags.uf_mt);
457 	if ((data = mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
458 	    PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
459 	    == MAP_FAILED)
460 		thr_panic("cannot allocate thread queue_head table");
461 	udp->queue_head = qp = (queue_head_t *)data;
462 	for (i = 0; i < 2 * QHASHSIZE; qp++, i++) {
463 		qp->qh_type = (i < QHASHSIZE)? MX : CV;
464 		qp->qh_lock.mutex_flag = LOCK_INITED;
465 		qp->qh_lock.mutex_magic = MUTEX_MAGIC;
466 		qp->qh_hlist = &qp->qh_def_root;
467 #if defined(THREAD_DEBUG)
468 		qp->qh_hlen = 1;
469 		qp->qh_hmax = 1;
470 #endif
471 	}
472 }
473 
#if defined(THREAD_DEBUG)

/*
 * Debugging: verify correctness of a sleep queue.
 * The caller must hold qp->qh_lock.  The cheap checks (hash list
 * length, head/tail consistency of each root, queue type) always run.
 * The per-thread walk runs only when thread_queue_verify is non-zero.
 */
void
QVERIFY(queue_head_t *qp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	queue_root_t *qrp;
	ulwp_t *ulwp;
	ulwp_t *prev;
	uint_t index;
	uint32_t cnt;
	char qtype;
	void *wchan;

	/* qp must be an element of the queue_head[] table */
	ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
	/* count the queue roots; each must be wholly empty or non-empty */
	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
		cnt++;
		ASSERT((qrp->qr_head != NULL && qrp->qr_tail != NULL) ||
		    (qrp->qr_head == NULL && qrp->qr_tail == NULL));
	}
	ASSERT(qp->qh_hlen == cnt && qp->qh_hmax >= cnt);
	/* the first half of the table is MX queues, the second half CV */
	qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
	ASSERT(qp->qh_type == qtype);
	if (!thread_queue_verify)
		return;
	/* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
		for (prev = NULL, ulwp = qrp->qr_head; ulwp != NULL;
		    prev = ulwp, ulwp = ulwp->ul_link) {
			cnt++;
			/* a writer may only follow another writer */
			if (ulwp->ul_writer)
				ASSERT(prev == NULL || prev->ul_writer);
			ASSERT(ulwp->ul_qtype == qtype);
			ASSERT(ulwp->ul_wchan != NULL);
			ASSERT(ulwp->ul_sleepq == qp);
			wchan = ulwp->ul_wchan;
			ASSERT(qrp->qr_wchan == wchan);
			/* the thread's wchan must hash to this queue head */
			index = QUEUE_HASH(wchan, qtype);
			ASSERT(&udp->queue_head[index] == qp);
		}
		ASSERT(qrp->qr_tail == prev);
	}
	/* the total number of queued threads must match qh_qlen */
	ASSERT(qp->qh_qlen == cnt);
}

#else	/* THREAD_DEBUG */

/* non-debug builds: QVERIFY() compiles away to nothing */
#define	QVERIFY(qp)

#endif	/* THREAD_DEBUG */
529 
530 /*
531  * Acquire a queue head.
532  */
533 queue_head_t *
534 queue_lock(void *wchan, int qtype)
535 {
536 	uberdata_t *udp = curthread->ul_uberdata;
537 	queue_head_t *qp;
538 	queue_root_t *qrp;
539 
540 	ASSERT(qtype == MX || qtype == CV);
541 
542 	/*
543 	 * It is possible that we could be called while still single-threaded.
544 	 * If so, we call queue_alloc() to allocate the queue_head[] array.
545 	 */
546 	if ((qp = udp->queue_head) == NULL) {
547 		queue_alloc();
548 		qp = udp->queue_head;
549 	}
550 	qp += QUEUE_HASH(wchan, qtype);
551 	spin_lock_set(&qp->qh_lock);
552 	for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next)
553 		if (qrp->qr_wchan == wchan)
554 			break;
555 	if (qrp == NULL && qp->qh_def_root.qr_head == NULL) {
556 		/* the default queue root is available; use it */
557 		qrp = &qp->qh_def_root;
558 		qrp->qr_wchan = wchan;
559 		ASSERT(qrp->qr_next == NULL);
560 		ASSERT(qrp->qr_tail == NULL &&
561 		    qrp->qr_rtcount == 0 && qrp->qr_qlen == 0);
562 	}
563 	qp->qh_wchan = wchan;	/* valid until queue_unlock() is called */
564 	qp->qh_root = qrp;	/* valid until queue_unlock() is called */
565 	INCR32(qp->qh_lockcount);
566 	QVERIFY(qp);
567 	return (qp);
568 }
569 
570 /*
571  * Release a queue head.
572  */
573 void
574 queue_unlock(queue_head_t *qp)
575 {
576 	QVERIFY(qp);
577 	spin_lock_clear(&qp->qh_lock);
578 }
579 
/*
 * For rwlock queueing, we must queue writers ahead of readers of the
 * same priority.  We do this by making writers appear to have a half
 * point higher priority for purposes of priority comparisons below.
 * The real priority is doubled and ul_writer is added in, so the
 * result is only meaningful for comparison with other CMP_PRIO() values.
 */
#define	CMP_PRIO(ulwp)	((real_priority(ulwp) << 1) + (ulwp)->ul_writer)
586 
587 void
588 enqueue(queue_head_t *qp, ulwp_t *ulwp, int force_fifo)
589 {
590 	queue_root_t *qrp;
591 	ulwp_t **ulwpp;
592 	ulwp_t *next;
593 	int pri = CMP_PRIO(ulwp);
594 
595 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
596 	ASSERT(ulwp->ul_sleepq != qp);
597 
598 	if ((qrp = qp->qh_root) == NULL) {
599 		/* use the thread's queue root for the linkage */
600 		qrp = &ulwp->ul_queue_root;
601 		qrp->qr_next = qp->qh_hlist;
602 		qrp->qr_prev = NULL;
603 		qrp->qr_head = NULL;
604 		qrp->qr_tail = NULL;
605 		qrp->qr_wchan = qp->qh_wchan;
606 		qrp->qr_rtcount = 0;
607 		qrp->qr_qlen = 0;
608 		qrp->qr_qmax = 0;
609 		qp->qh_hlist->qr_prev = qrp;
610 		qp->qh_hlist = qrp;
611 		qp->qh_root = qrp;
612 		MAXINCR(qp->qh_hmax, qp->qh_hlen);
613 	}
614 
615 	/*
616 	 * LIFO queue ordering is unfair and can lead to starvation,
617 	 * but it gives better performance for heavily contended locks.
618 	 * We use thread_queue_fifo (range is 0..8) to determine
619 	 * the frequency of FIFO vs LIFO queuing:
620 	 *	0 : every 256th time	(almost always LIFO)
621 	 *	1 : every 128th time
622 	 *	2 : every 64th  time
623 	 *	3 : every 32nd  time
624 	 *	4 : every 16th  time	(the default value, mostly LIFO)
625 	 *	5 : every 8th   time
626 	 *	6 : every 4th   time
627 	 *	7 : every 2nd   time
628 	 *	8 : every time		(never LIFO, always FIFO)
629 	 * Note that there is always some degree of FIFO ordering.
630 	 * This breaks live lock conditions that occur in applications
631 	 * that are written assuming (incorrectly) that threads acquire
632 	 * locks fairly, that is, in roughly round-robin order.
633 	 * In any event, the queue is maintained in kernel priority order.
634 	 *
635 	 * If force_fifo is non-zero, fifo queueing is forced.
636 	 * SUSV3 requires this for semaphores.
637 	 */
638 	if (qrp->qr_head == NULL) {
639 		/*
640 		 * The queue is empty.  LIFO/FIFO doesn't matter.
641 		 */
642 		ASSERT(qrp->qr_tail == NULL);
643 		ulwpp = &qrp->qr_head;
644 	} else if (force_fifo |
645 	    (((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0)) {
646 		/*
647 		 * Enqueue after the last thread whose priority is greater
648 		 * than or equal to the priority of the thread being queued.
649 		 * Attempt first to go directly onto the tail of the queue.
650 		 */
651 		if (pri <= CMP_PRIO(qrp->qr_tail))
652 			ulwpp = &qrp->qr_tail->ul_link;
653 		else {
654 			for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
655 			    ulwpp = &next->ul_link)
656 				if (pri > CMP_PRIO(next))
657 					break;
658 		}
659 	} else {
660 		/*
661 		 * Enqueue before the first thread whose priority is less
662 		 * than or equal to the priority of the thread being queued.
663 		 * Hopefully we can go directly onto the head of the queue.
664 		 */
665 		for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
666 		    ulwpp = &next->ul_link)
667 			if (pri >= CMP_PRIO(next))
668 				break;
669 	}
670 	if ((ulwp->ul_link = *ulwpp) == NULL)
671 		qrp->qr_tail = ulwp;
672 	*ulwpp = ulwp;
673 
674 	ulwp->ul_sleepq = qp;
675 	ulwp->ul_wchan = qp->qh_wchan;
676 	ulwp->ul_qtype = qp->qh_type;
677 	if ((ulwp->ul_schedctl != NULL &&
678 	    ulwp->ul_schedctl->sc_cid == ulwp->ul_rtclassid) |
679 	    ulwp->ul_pilocks) {
680 		ulwp->ul_rtqueued = 1;
681 		qrp->qr_rtcount++;
682 	}
683 	MAXINCR(qrp->qr_qmax, qrp->qr_qlen);
684 	MAXINCR(qp->qh_qmax, qp->qh_qlen);
685 }
686 
687 /*
688  * Helper function for queue_slot() and queue_slot_rt().
689  * Try to find a non-suspended thread on the queue.
690  */
691 static ulwp_t **
692 queue_slot_runnable(ulwp_t **ulwpp, ulwp_t **prevp, int rt)
693 {
694 	ulwp_t *ulwp;
695 	ulwp_t **foundpp = NULL;
696 	int priority = -1;
697 	ulwp_t *prev;
698 	int tpri;
699 
700 	for (prev = NULL;
701 	    (ulwp = *ulwpp) != NULL;
702 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
703 		if (ulwp->ul_stop)	/* skip suspended threads */
704 			continue;
705 		tpri = rt? CMP_PRIO(ulwp) : 0;
706 		if (tpri > priority) {
707 			foundpp = ulwpp;
708 			*prevp = prev;
709 			priority = tpri;
710 			if (!rt)
711 				break;
712 		}
713 	}
714 	return (foundpp);
715 }
716 
717 /*
718  * For real-time, we search the entire queue because the dispatch
719  * (kernel) priorities may have changed since enqueueing.
720  */
721 static ulwp_t **
722 queue_slot_rt(ulwp_t **ulwpp_org, ulwp_t **prevp)
723 {
724 	ulwp_t **ulwpp = ulwpp_org;
725 	ulwp_t *ulwp = *ulwpp;
726 	ulwp_t **foundpp = ulwpp;
727 	int priority = CMP_PRIO(ulwp);
728 	ulwp_t *prev;
729 	int tpri;
730 
731 	for (prev = ulwp, ulwpp = &ulwp->ul_link;
732 	    (ulwp = *ulwpp) != NULL;
733 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
734 		tpri = CMP_PRIO(ulwp);
735 		if (tpri > priority) {
736 			foundpp = ulwpp;
737 			*prevp = prev;
738 			priority = tpri;
739 		}
740 	}
741 	ulwp = *foundpp;
742 
743 	/*
744 	 * Try not to return a suspended thread.
745 	 * This mimics the old libthread's behavior.
746 	 */
747 	if (ulwp->ul_stop &&
748 	    (ulwpp = queue_slot_runnable(ulwpp_org, prevp, 1)) != NULL) {
749 		foundpp = ulwpp;
750 		ulwp = *foundpp;
751 	}
752 	ulwp->ul_rt = 1;
753 	return (foundpp);
754 }
755 
756 ulwp_t **
757 queue_slot(queue_head_t *qp, ulwp_t **prevp, int *more)
758 {
759 	queue_root_t *qrp;
760 	ulwp_t **ulwpp;
761 	ulwp_t *ulwp;
762 	int rt;
763 
764 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
765 
766 	if ((qrp = qp->qh_root) == NULL || (ulwp = qrp->qr_head) == NULL) {
767 		*more = 0;
768 		return (NULL);		/* no lwps on the queue */
769 	}
770 	rt = (qrp->qr_rtcount != 0);
771 	*prevp = NULL;
772 	if (ulwp->ul_link == NULL) {	/* only one lwp on the queue */
773 		*more = 0;
774 		ulwp->ul_rt = rt;
775 		return (&qrp->qr_head);
776 	}
777 	*more = 1;
778 
779 	if (rt)		/* real-time queue */
780 		return (queue_slot_rt(&qrp->qr_head, prevp));
781 	/*
782 	 * Try not to return a suspended thread.
783 	 * This mimics the old libthread's behavior.
784 	 */
785 	if (ulwp->ul_stop &&
786 	    (ulwpp = queue_slot_runnable(&qrp->qr_head, prevp, 0)) != NULL) {
787 		ulwp = *ulwpp;
788 		ulwp->ul_rt = 0;
789 		return (ulwpp);
790 	}
791 	/*
792 	 * The common case; just pick the first thread on the queue.
793 	 */
794 	ulwp->ul_rt = 0;
795 	return (&qrp->qr_head);
796 }
797 
798 /*
799  * Common code for unlinking an lwp from a user-level sleep queue.
800  */
801 void
802 queue_unlink(queue_head_t *qp, ulwp_t **ulwpp, ulwp_t *prev)
803 {
804 	queue_root_t *qrp = qp->qh_root;
805 	queue_root_t *nqrp;
806 	ulwp_t *ulwp = *ulwpp;
807 	ulwp_t *next;
808 
809 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
810 	ASSERT(qp->qh_wchan != NULL && ulwp->ul_wchan == qp->qh_wchan);
811 
812 	DECR(qp->qh_qlen);
813 	DECR(qrp->qr_qlen);
814 	if (ulwp->ul_rtqueued) {
815 		ulwp->ul_rtqueued = 0;
816 		qrp->qr_rtcount--;
817 	}
818 	next = ulwp->ul_link;
819 	*ulwpp = next;
820 	ulwp->ul_link = NULL;
821 	if (qrp->qr_tail == ulwp)
822 		qrp->qr_tail = prev;
823 	if (qrp == &ulwp->ul_queue_root) {
824 		/*
825 		 * We can't continue to use the unlinked thread's
826 		 * queue root for the linkage.
827 		 */
828 		queue_root_t *qr_next = qrp->qr_next;
829 		queue_root_t *qr_prev = qrp->qr_prev;
830 
831 		if (qrp->qr_tail) {
832 			/* switch to using the last thread's queue root */
833 			ASSERT(qrp->qr_qlen != 0);
834 			nqrp = &qrp->qr_tail->ul_queue_root;
835 			*nqrp = *qrp;
836 			if (qr_next)
837 				qr_next->qr_prev = nqrp;
838 			if (qr_prev)
839 				qr_prev->qr_next = nqrp;
840 			else
841 				qp->qh_hlist = nqrp;
842 			qp->qh_root = nqrp;
843 		} else {
844 			/* empty queue root; just delete from the hash list */
845 			ASSERT(qrp->qr_qlen == 0);
846 			if (qr_next)
847 				qr_next->qr_prev = qr_prev;
848 			if (qr_prev)
849 				qr_prev->qr_next = qr_next;
850 			else
851 				qp->qh_hlist = qr_next;
852 			qp->qh_root = NULL;
853 			DECR(qp->qh_hlen);
854 		}
855 	}
856 }
857 
858 ulwp_t *
859 dequeue(queue_head_t *qp, int *more)
860 {
861 	ulwp_t **ulwpp;
862 	ulwp_t *ulwp;
863 	ulwp_t *prev;
864 
865 	if ((ulwpp = queue_slot(qp, &prev, more)) == NULL)
866 		return (NULL);
867 	ulwp = *ulwpp;
868 	queue_unlink(qp, ulwpp, prev);
869 	ulwp->ul_sleepq = NULL;
870 	ulwp->ul_wchan = NULL;
871 	return (ulwp);
872 }
873 
874 /*
875  * Return a pointer to the highest priority thread sleeping on wchan.
876  */
877 ulwp_t *
878 queue_waiter(queue_head_t *qp)
879 {
880 	ulwp_t **ulwpp;
881 	ulwp_t *prev;
882 	int more;
883 
884 	if ((ulwpp = queue_slot(qp, &prev, &more)) == NULL)
885 		return (NULL);
886 	return (*ulwpp);
887 }
888 
889 int
890 dequeue_self(queue_head_t *qp)
891 {
892 	ulwp_t *self = curthread;
893 	queue_root_t *qrp;
894 	ulwp_t **ulwpp;
895 	ulwp_t *ulwp;
896 	ulwp_t *prev;
897 	int found = 0;
898 
899 	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
900 
901 	/* find self on the sleep queue */
902 	if ((qrp = qp->qh_root) != NULL) {
903 		for (prev = NULL, ulwpp = &qrp->qr_head;
904 		    (ulwp = *ulwpp) != NULL;
905 		    prev = ulwp, ulwpp = &ulwp->ul_link) {
906 			if (ulwp == self) {
907 				queue_unlink(qp, ulwpp, prev);
908 				self->ul_cvmutex = NULL;
909 				self->ul_sleepq = NULL;
910 				self->ul_wchan = NULL;
911 				found = 1;
912 				break;
913 			}
914 		}
915 	}
916 
917 	if (!found)
918 		thr_panic("dequeue_self(): curthread not found on queue");
919 
920 	return ((qrp = qp->qh_root) != NULL && qrp->qr_head != NULL);
921 }
922 
923 /*
924  * Called from call_user_handler() and _thrp_suspend() to take
925  * ourself off of our sleep queue so we can grab locks.
926  */
927 void
928 unsleep_self(void)
929 {
930 	ulwp_t *self = curthread;
931 	queue_head_t *qp;
932 
933 	/*
934 	 * Calling enter_critical()/exit_critical() here would lead
935 	 * to recursion.  Just manipulate self->ul_critical directly.
936 	 */
937 	self->ul_critical++;
938 	while (self->ul_sleepq != NULL) {
939 		qp = queue_lock(self->ul_wchan, self->ul_qtype);
940 		/*
941 		 * We may have been moved from a CV queue to a
942 		 * mutex queue while we were attempting queue_lock().
943 		 * If so, just loop around and try again.
944 		 * dequeue_self() clears self->ul_sleepq.
945 		 */
946 		if (qp == self->ul_sleepq)
947 			(void) dequeue_self(qp);
948 		queue_unlock(qp);
949 	}
950 	self->ul_writer = 0;
951 	self->ul_critical--;
952 }
953 
954 /*
955  * Common code for calling the the ___lwp_mutex_timedlock() system call.
956  * Returns with mutex_owner and mutex_ownerpid set correctly.
957  */
958 static int
959 mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
960 {
961 	ulwp_t *self = curthread;
962 	uberdata_t *udp = self->ul_uberdata;
963 	int mtype = mp->mutex_type;
964 	hrtime_t begin_sleep;
965 	int acquired;
966 	int error;
967 
968 	self->ul_sp = stkptr();
969 	self->ul_wchan = mp;
970 	if (__td_event_report(self, TD_SLEEP, udp)) {
971 		self->ul_td_evbuf.eventnum = TD_SLEEP;
972 		self->ul_td_evbuf.eventdata = mp;
973 		tdb_event(TD_SLEEP, udp);
974 	}
975 	if (msp) {
976 		tdb_incr(msp->mutex_sleep);
977 		begin_sleep = gethrtime();
978 	}
979 
980 	DTRACE_PROBE1(plockstat, mutex__block, mp);
981 
982 	for (;;) {
983 		/*
984 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
985 		 * means we successfully acquired the lock.
986 		 */
987 		if ((error = ___lwp_mutex_timedlock(mp, tsp)) != 0 &&
988 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
989 			acquired = 0;
990 			break;
991 		}
992 
993 		if (mtype & USYNC_PROCESS) {
994 			/*
995 			 * Defend against forkall().  We may be the child,
996 			 * in which case we don't actually own the mutex.
997 			 */
998 			enter_critical(self);
999 			if (mp->mutex_ownerpid == udp->pid) {
1000 				mp->mutex_owner = (uintptr_t)self;
1001 				exit_critical(self);
1002 				acquired = 1;
1003 				break;
1004 			}
1005 			exit_critical(self);
1006 		} else {
1007 			mp->mutex_owner = (uintptr_t)self;
1008 			acquired = 1;
1009 			break;
1010 		}
1011 	}
1012 	if (msp)
1013 		msp->mutex_sleep_time += gethrtime() - begin_sleep;
1014 	self->ul_wchan = NULL;
1015 	self->ul_sp = 0;
1016 
1017 	if (acquired) {
1018 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1019 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1020 	} else {
1021 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1022 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1023 	}
1024 
1025 	return (error);
1026 }
1027 
1028 /*
1029  * Common code for calling the ___lwp_mutex_trylock() system call.
1030  * Returns with mutex_owner and mutex_ownerpid set correctly.
1031  */
1032 int
1033 mutex_trylock_kernel(mutex_t *mp)
1034 {
1035 	ulwp_t *self = curthread;
1036 	uberdata_t *udp = self->ul_uberdata;
1037 	int mtype = mp->mutex_type;
1038 	int error;
1039 	int acquired;
1040 
1041 	for (;;) {
1042 		/*
1043 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
1044 		 * means we successfully acquired the lock.
1045 		 */
1046 		if ((error = ___lwp_mutex_trylock(mp)) != 0 &&
1047 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1048 			acquired = 0;
1049 			break;
1050 		}
1051 
1052 		if (mtype & USYNC_PROCESS) {
1053 			/*
1054 			 * Defend against forkall().  We may be the child,
1055 			 * in which case we don't actually own the mutex.
1056 			 */
1057 			enter_critical(self);
1058 			if (mp->mutex_ownerpid == udp->pid) {
1059 				mp->mutex_owner = (uintptr_t)self;
1060 				exit_critical(self);
1061 				acquired = 1;
1062 				break;
1063 			}
1064 			exit_critical(self);
1065 		} else {
1066 			mp->mutex_owner = (uintptr_t)self;
1067 			acquired = 1;
1068 			break;
1069 		}
1070 	}
1071 
1072 	if (acquired) {
1073 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1074 	} else if (error != EBUSY) {
1075 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1076 	}
1077 
1078 	return (error);
1079 }
1080 
1081 volatile sc_shared_t *
1082 setup_schedctl(void)
1083 {
1084 	ulwp_t *self = curthread;
1085 	volatile sc_shared_t *scp;
1086 	sc_shared_t *tmp;
1087 
1088 	if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
1089 	    !self->ul_vfork &&			/* not a child of vfork() */
1090 	    !self->ul_schedctl_called) {	/* haven't been called before */
1091 		enter_critical(self);
1092 		self->ul_schedctl_called = &self->ul_uberdata->uberflags;
1093 		if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
1094 			self->ul_schedctl = scp = tmp;
1095 		exit_critical(self);
1096 	}
1097 	/*
1098 	 * Unless the call to setup_schedctl() is surrounded
1099 	 * by enter_critical()/exit_critical(), the address
1100 	 * we are returning could be invalid due to a forkall()
1101 	 * having occurred in another thread.
1102 	 */
1103 	return (scp);
1104 }
1105 
1106 /*
1107  * Interfaces from libsched, incorporated into libc.
1108  * libsched.so.1 is now a filter library onto libc.
1109  */
1110 #pragma weak schedctl_lookup = _schedctl_init
1111 #pragma weak _schedctl_lookup = _schedctl_init
1112 #pragma weak schedctl_init = _schedctl_init
1113 schedctl_t *
1114 _schedctl_init(void)
1115 {
1116 	volatile sc_shared_t *scp = setup_schedctl();
1117 	return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
1118 }
1119 
/*
 * Counterpart of _schedctl_init(); also exported as schedctl_exit.
 * There is nothing to undo, so the body is intentionally empty.
 */
#pragma weak schedctl_exit = _schedctl_exit
void
_schedctl_exit(void)
{
	/* intentionally empty */
}
1125 
1126 /*
1127  * Contract private interface for java.
1128  * Set up the schedctl data if it doesn't exist yet.
1129  * Return a pointer to the pointer to the schedctl data.
1130  */
1131 volatile sc_shared_t *volatile *
1132 _thr_schedctl(void)
1133 {
1134 	ulwp_t *self = curthread;
1135 	volatile sc_shared_t *volatile *ptr;
1136 
1137 	if (self->ul_vfork)
1138 		return (NULL);
1139 	if (*(ptr = &self->ul_schedctl) == NULL)
1140 		(void) setup_schedctl();
1141 	return (ptr);
1142 }
1143 
1144 /*
1145  * Block signals and attempt to block preemption.
1146  * no_preempt()/preempt() must be used in pairs but can be nested.
1147  */
1148 void
1149 no_preempt(ulwp_t *self)
1150 {
1151 	volatile sc_shared_t *scp;
1152 
1153 	if (self->ul_preempt++ == 0) {
1154 		enter_critical(self);
1155 		if ((scp = self->ul_schedctl) != NULL ||
1156 		    (scp = setup_schedctl()) != NULL) {
1157 			/*
1158 			 * Save the pre-existing preempt value.
1159 			 */
1160 			self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
1161 			scp->sc_preemptctl.sc_nopreempt = 1;
1162 		}
1163 	}
1164 }
1165 
1166 /*
1167  * Undo the effects of no_preempt().
1168  */
1169 void
1170 preempt(ulwp_t *self)
1171 {
1172 	volatile sc_shared_t *scp;
1173 
1174 	ASSERT(self->ul_preempt > 0);
1175 	if (--self->ul_preempt == 0) {
1176 		if ((scp = self->ul_schedctl) != NULL) {
1177 			/*
1178 			 * Restore the pre-existing preempt value.
1179 			 */
1180 			scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
1181 			if (scp->sc_preemptctl.sc_yield &&
1182 			    scp->sc_preemptctl.sc_nopreempt == 0) {
1183 				yield();
1184 				if (scp->sc_preemptctl.sc_yield) {
1185 					/*
1186 					 * Shouldn't happen.  This is either
1187 					 * a race condition or the thread
1188 					 * just entered the real-time class.
1189 					 */
1190 					yield();
1191 					scp->sc_preemptctl.sc_yield = 0;
1192 				}
1193 			}
1194 		}
1195 		exit_critical(self);
1196 	}
1197 }
1198 
1199 /*
1200  * If a call to preempt() would cause the current thread to yield or to
1201  * take deferred actions in exit_critical(), then unpark the specified
1202  * lwp so it can run while we delay.  Return the original lwpid if the
1203  * unpark was not performed, else return zero.  The tests are a repeat
1204  * of some of the tests in preempt(), above.  This is a statistical
1205  * optimization solely for cond_sleep_queue(), below.
1206  */
1207 static lwpid_t
1208 preempt_unpark(ulwp_t *self, lwpid_t lwpid)
1209 {
1210 	volatile sc_shared_t *scp = self->ul_schedctl;
1211 
1212 	ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
1213 	if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
1214 	    (self->ul_curplease && self->ul_critical == 1)) {
1215 		(void) __lwp_unpark(lwpid);
1216 		lwpid = 0;
1217 	}
1218 	return (lwpid);
1219 }
1220 
1221 /*
1222  * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1223  * If this fails, return EBUSY and let the caller deal with it.
1224  * If this succeeds, return 0 with mutex_owner set to curthread.
1225  */
1226 static int
1227 mutex_trylock_adaptive(mutex_t *mp, int tryhard)
1228 {
1229 	ulwp_t *self = curthread;
1230 	int error = EBUSY;
1231 	ulwp_t *ulwp;
1232 	volatile sc_shared_t *scp;
1233 	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
1234 	volatile uint64_t *ownerp = (volatile uint64_t *)&mp->mutex_owner;
1235 	uint32_t new_lockword;
1236 	int count = 0;
1237 	int max_count;
1238 	uint8_t max_spinners;
1239 
1240 	ASSERT(!(mp->mutex_type & USYNC_PROCESS));
1241 
1242 	if (MUTEX_OWNER(mp) == self)
1243 		return (EBUSY);
1244 
1245 	/* short-cut, not definitive (see below) */
1246 	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1247 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1248 		error = ENOTRECOVERABLE;
1249 		goto done;
1250 	}
1251 
1252 	/*
1253 	 * Make one attempt to acquire the lock before
1254 	 * incurring the overhead of the spin loop.
1255 	 */
1256 	if (set_lock_byte(lockp) == 0) {
1257 		*ownerp = (uintptr_t)self;
1258 		error = 0;
1259 		goto done;
1260 	}
1261 	if (!tryhard)
1262 		goto done;
1263 	if (ncpus == 0)
1264 		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1265 	if ((max_spinners = self->ul_max_spinners) >= ncpus)
1266 		max_spinners = ncpus - 1;
1267 	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1268 	if (max_count == 0)
1269 		goto done;
1270 
1271 	/*
1272 	 * This spin loop is unfair to lwps that have already dropped into
1273 	 * the kernel to sleep.  They will starve on a highly-contended mutex.
1274 	 * This is just too bad.  The adaptive spin algorithm is intended
1275 	 * to allow programs with highly-contended locks (that is, broken
1276 	 * programs) to execute with reasonable speed despite their contention.
1277 	 * Being fair would reduce the speed of such programs and well-written
1278 	 * programs will not suffer in any case.
1279 	 */
1280 	enter_critical(self);
1281 	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1) {
1282 		exit_critical(self);
1283 		goto done;
1284 	}
1285 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
1286 	for (count = 1; ; count++) {
1287 		if (*lockp == 0 && set_lock_byte(lockp) == 0) {
1288 			*ownerp = (uintptr_t)self;
1289 			error = 0;
1290 			break;
1291 		}
1292 		if (count == max_count)
1293 			break;
1294 		SMT_PAUSE();
1295 		/*
1296 		 * Stop spinning if the mutex owner is not running on
1297 		 * a processor; it will not drop the lock any time soon
1298 		 * and we would just be wasting time to keep spinning.
1299 		 *
1300 		 * Note that we are looking at another thread (ulwp_t)
1301 		 * without ensuring that the other thread does not exit.
1302 		 * The scheme relies on ulwp_t structures never being
1303 		 * deallocated by the library (the library employs a free
1304 		 * list of ulwp_t structs that are reused when new threads
1305 		 * are created) and on schedctl shared memory never being
1306 		 * deallocated once created via __schedctl().
1307 		 *
1308 		 * Thus, the worst that can happen when the spinning thread
1309 		 * looks at the owner's schedctl data is that it is looking
1310 		 * at some other thread's schedctl data.  This almost never
1311 		 * happens and is benign when it does.
1312 		 */
1313 		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1314 		    ((scp = ulwp->ul_schedctl) == NULL ||
1315 		    scp->sc_state != SC_ONPROC))
1316 			break;
1317 	}
1318 	new_lockword = spinners_decr(&mp->mutex_lockword);
1319 	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1320 		/*
1321 		 * We haven't yet acquired the lock, the lock
1322 		 * is free, and there are no other spinners.
1323 		 * Make one final attempt to acquire the lock.
1324 		 *
1325 		 * This isn't strictly necessary since mutex_lock_queue()
1326 		 * (the next action this thread will take if it doesn't
1327 		 * acquire the lock here) makes one attempt to acquire
1328 		 * the lock before putting the thread to sleep.
1329 		 *
1330 		 * If the next action for this thread (on failure here)
1331 		 * were not to call mutex_lock_queue(), this would be
1332 		 * necessary for correctness, to avoid ending up with an
1333 		 * unheld mutex with waiters but no one to wake them up.
1334 		 */
1335 		if (set_lock_byte(lockp) == 0) {
1336 			*ownerp = (uintptr_t)self;
1337 			error = 0;
1338 		}
1339 		count++;
1340 	}
1341 	exit_critical(self);
1342 
1343 done:
1344 	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1345 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1346 		/*
1347 		 * We shouldn't own the mutex.
1348 		 * Just clear the lock; everyone has already been waked up.
1349 		 */
1350 		mp->mutex_owner = 0;
1351 		(void) clear_lockbyte(&mp->mutex_lockword);
1352 		error = ENOTRECOVERABLE;
1353 	}
1354 
1355 	if (error) {
1356 		if (count) {
1357 			DTRACE_PROBE2(plockstat, mutex__spun, 0, count);
1358 		}
1359 		if (error != EBUSY) {
1360 			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1361 		}
1362 	} else {
1363 		if (count) {
1364 			DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
1365 		}
1366 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
1367 		if (mp->mutex_flag & LOCK_OWNERDEAD) {
1368 			ASSERT(mp->mutex_type & LOCK_ROBUST);
1369 			error = EOWNERDEAD;
1370 		}
1371 	}
1372 
1373 	return (error);
1374 }
1375 
1376 /*
1377  * Same as mutex_trylock_adaptive(), except specifically for queue locks.
1378  * The owner field is not set here; the caller (spin_lock_set()) sets it.
1379  */
1380 static int
1381 mutex_queuelock_adaptive(mutex_t *mp)
1382 {
1383 	ulwp_t *ulwp;
1384 	volatile sc_shared_t *scp;
1385 	volatile uint8_t *lockp;
1386 	volatile uint64_t *ownerp;
1387 	int count = curthread->ul_queue_spin;
1388 
1389 	ASSERT(mp->mutex_type == USYNC_THREAD);
1390 
1391 	if (count == 0)
1392 		return (EBUSY);
1393 
1394 	lockp = (volatile uint8_t *)&mp->mutex_lockw;
1395 	ownerp = (volatile uint64_t *)&mp->mutex_owner;
1396 	while (--count >= 0) {
1397 		if (*lockp == 0 && set_lock_byte(lockp) == 0)
1398 			return (0);
1399 		SMT_PAUSE();
1400 		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1401 		    ((scp = ulwp->ul_schedctl) == NULL ||
1402 		    scp->sc_state != SC_ONPROC))
1403 			break;
1404 	}
1405 
1406 	return (EBUSY);
1407 }
1408 
1409 /*
1410  * Like mutex_trylock_adaptive(), but for process-shared mutexes.
1411  * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1412  * If this fails, return EBUSY and let the caller deal with it.
1413  * If this succeeds, return 0 with mutex_owner set to curthread
1414  * and mutex_ownerpid set to the current pid.
1415  */
1416 static int
1417 mutex_trylock_process(mutex_t *mp, int tryhard)
1418 {
1419 	ulwp_t *self = curthread;
1420 	uberdata_t *udp = self->ul_uberdata;
1421 	int error = EBUSY;
1422 	volatile uint64_t *lockp = (volatile uint64_t *)&mp->mutex_lockword64;
1423 	uint32_t new_lockword;
1424 	int count = 0;
1425 	int max_count;
1426 	uint8_t max_spinners;
1427 
1428 	ASSERT(mp->mutex_type & USYNC_PROCESS);
1429 
1430 	if (shared_mutex_held(mp))
1431 		return (EBUSY);
1432 
1433 	/* short-cut, not definitive (see below) */
1434 	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1435 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1436 		error = ENOTRECOVERABLE;
1437 		goto done;
1438 	}
1439 
1440 	/*
1441 	 * Make one attempt to acquire the lock before
1442 	 * incurring the overhead of the spin loop.
1443 	 */
1444 	enter_critical(self);
1445 	if (set_lock_byte64(lockp, udp->pid) == 0) {
1446 		mp->mutex_owner = (uintptr_t)self;
1447 		/* mp->mutex_ownerpid was set by set_lock_byte64() */
1448 		exit_critical(self);
1449 		error = 0;
1450 		goto done;
1451 	}
1452 	exit_critical(self);
1453 	if (!tryhard)
1454 		goto done;
1455 	if (ncpus == 0)
1456 		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1457 	if ((max_spinners = self->ul_max_spinners) >= ncpus)
1458 		max_spinners = ncpus - 1;
1459 	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1460 	if (max_count == 0)
1461 		goto done;
1462 
1463 	/*
1464 	 * This is a process-shared mutex.
1465 	 * We cannot know if the owner is running on a processor.
1466 	 * We just spin and hope that it is on a processor.
1467 	 */
1468 	enter_critical(self);
1469 	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1) {
1470 		exit_critical(self);
1471 		goto done;
1472 	}
1473 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
1474 	for (count = 1; ; count++) {
1475 		if ((*lockp & LOCKMASK64) == 0 &&
1476 		    set_lock_byte64(lockp, udp->pid) == 0) {
1477 			mp->mutex_owner = (uintptr_t)self;
1478 			/* mp->mutex_ownerpid was set by set_lock_byte64() */
1479 			error = 0;
1480 			break;
1481 		}
1482 		if (count == max_count)
1483 			break;
1484 		SMT_PAUSE();
1485 	}
1486 	new_lockword = spinners_decr(&mp->mutex_lockword);
1487 	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1488 		/*
1489 		 * We haven't yet acquired the lock, the lock
1490 		 * is free, and there are no other spinners.
1491 		 * Make one final attempt to acquire the lock.
1492 		 *
1493 		 * This isn't strictly necessary since mutex_lock_kernel()
1494 		 * (the next action this thread will take if it doesn't
1495 		 * acquire the lock here) makes one attempt to acquire
1496 		 * the lock before putting the thread to sleep.
1497 		 *
1498 		 * If the next action for this thread (on failure here)
1499 		 * were not to call mutex_lock_kernel(), this would be
1500 		 * necessary for correctness, to avoid ending up with an
1501 		 * unheld mutex with waiters but no one to wake them up.
1502 		 */
1503 		if (set_lock_byte64(lockp, udp->pid) == 0) {
1504 			mp->mutex_owner = (uintptr_t)self;
1505 			/* mp->mutex_ownerpid was set by set_lock_byte64() */
1506 			error = 0;
1507 		}
1508 		count++;
1509 	}
1510 	exit_critical(self);
1511 
1512 done:
1513 	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1514 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1515 		/*
1516 		 * We shouldn't own the mutex.
1517 		 * Just clear the lock; everyone has already been waked up.
1518 		 */
1519 		mp->mutex_owner = 0;
1520 		/* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1521 		(void) clear_lockbyte64(&mp->mutex_lockword64);
1522 		error = ENOTRECOVERABLE;
1523 	}
1524 
1525 	if (error) {
1526 		if (count) {
1527 			DTRACE_PROBE2(plockstat, mutex__spun, 0, count);
1528 		}
1529 		if (error != EBUSY) {
1530 			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1531 		}
1532 	} else {
1533 		if (count) {
1534 			DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
1535 		}
1536 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
1537 		if (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1538 			ASSERT(mp->mutex_type & LOCK_ROBUST);
1539 			if (mp->mutex_flag & LOCK_OWNERDEAD)
1540 				error = EOWNERDEAD;
1541 			else if (mp->mutex_type & USYNC_PROCESS_ROBUST)
1542 				error = ELOCKUNMAPPED;
1543 			else
1544 				error = EOWNERDEAD;
1545 		}
1546 	}
1547 
1548 	return (error);
1549 }
1550 
1551 /*
1552  * Mutex wakeup code for releasing a USYNC_THREAD mutex.
1553  * Returns the lwpid of the thread that was dequeued, if any.
1554  * The caller of mutex_wakeup() must call __lwp_unpark(lwpid)
1555  * to wake up the specified lwp.
1556  */
1557 static lwpid_t
1558 mutex_wakeup(mutex_t *mp)
1559 {
1560 	lwpid_t lwpid = 0;
1561 	int more;
1562 	queue_head_t *qp;
1563 	ulwp_t *ulwp;
1564 
1565 	/*
1566 	 * Dequeue a waiter from the sleep queue.  Don't touch the mutex
1567 	 * waiters bit if no one was found on the queue because the mutex
1568 	 * might have been deallocated or reallocated for another purpose.
1569 	 */
1570 	qp = queue_lock(mp, MX);
1571 	if ((ulwp = dequeue(qp, &more)) != NULL) {
1572 		lwpid = ulwp->ul_lwpid;
1573 		mp->mutex_waiters = more;
1574 	}
1575 	queue_unlock(qp);
1576 	return (lwpid);
1577 }
1578 
1579 /*
1580  * Mutex wakeup code for releasing all waiters on a USYNC_THREAD mutex.
1581  */
1582 static void
1583 mutex_wakeup_all(mutex_t *mp)
1584 {
1585 	queue_head_t *qp;
1586 	queue_root_t *qrp;
1587 	int nlwpid = 0;
1588 	int maxlwps = MAXLWPS;
1589 	ulwp_t *ulwp;
1590 	lwpid_t buffer[MAXLWPS];
1591 	lwpid_t *lwpid = buffer;
1592 
1593 	/*
1594 	 * Walk the list of waiters and prepare to wake up all of them.
1595 	 * The waiters flag has already been cleared from the mutex.
1596 	 *
1597 	 * We keep track of lwpids that are to be unparked in lwpid[].
1598 	 * __lwp_unpark_all() is called to unpark all of them after
1599 	 * they have been removed from the sleep queue and the sleep
1600 	 * queue lock has been dropped.  If we run out of space in our
1601 	 * on-stack buffer, we need to allocate more but we can't call
1602 	 * lmalloc() because we are holding a queue lock when the overflow
1603 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
1604 	 * either because the application may have allocated a small
1605 	 * stack and we don't want to overrun the stack.  So we call
1606 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
1607 	 * system call directly since that path acquires no locks.
1608 	 */
1609 	qp = queue_lock(mp, MX);
1610 	for (;;) {
1611 		if ((qrp = qp->qh_root) == NULL ||
1612 		    (ulwp = qrp->qr_head) == NULL)
1613 			break;
1614 		ASSERT(ulwp->ul_wchan == mp);
1615 		queue_unlink(qp, &qrp->qr_head, NULL);
1616 		ulwp->ul_sleepq = NULL;
1617 		ulwp->ul_wchan = NULL;
1618 		if (nlwpid == maxlwps)
1619 			lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
1620 		lwpid[nlwpid++] = ulwp->ul_lwpid;
1621 	}
1622 
1623 	if (nlwpid == 0) {
1624 		queue_unlock(qp);
1625 	} else {
1626 		mp->mutex_waiters = 0;
1627 		no_preempt(curthread);
1628 		queue_unlock(qp);
1629 		if (nlwpid == 1)
1630 			(void) __lwp_unpark(lwpid[0]);
1631 		else
1632 			(void) __lwp_unpark_all(lwpid, nlwpid);
1633 		preempt(curthread);
1634 	}
1635 
1636 	if (lwpid != buffer)
1637 		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
1638 }
1639 
1640 /*
1641  * Release a process-private mutex.
1642  * As an optimization, if there are waiters but there are also spinners
1643  * attempting to acquire the mutex, then don't bother waking up a waiter;
1644  * one of the spinners will acquire the mutex soon and it would be a waste
1645  * of resources to wake up some thread just to have it spin for a while
1646  * and then possibly go back to sleep.  See mutex_trylock_adaptive().
1647  */
1648 static lwpid_t
1649 mutex_unlock_queue(mutex_t *mp, int release_all)
1650 {
1651 	lwpid_t lwpid = 0;
1652 	uint32_t old_lockword;
1653 
1654 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1655 	mp->mutex_owner = 0;
1656 	old_lockword = clear_lockbyte(&mp->mutex_lockword);
1657 	if ((old_lockword & WAITERMASK) &&
1658 	    (release_all || (old_lockword & SPINNERMASK) == 0)) {
1659 		ulwp_t *self = curthread;
1660 		no_preempt(self);	/* ensure a prompt wakeup */
1661 		if (release_all)
1662 			mutex_wakeup_all(mp);
1663 		else
1664 			lwpid = mutex_wakeup(mp);
1665 		if (lwpid == 0)
1666 			preempt(self);
1667 	}
1668 	return (lwpid);
1669 }
1670 
1671 /*
1672  * Like mutex_unlock_queue(), but for process-shared mutexes.
1673  */
1674 static void
1675 mutex_unlock_process(mutex_t *mp, int release_all)
1676 {
1677 	uint64_t old_lockword64;
1678 
1679 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1680 	mp->mutex_owner = 0;
1681 	/* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1682 	old_lockword64 = clear_lockbyte64(&mp->mutex_lockword64);
1683 	if ((old_lockword64 & WAITERMASK64) &&
1684 	    (release_all || (old_lockword64 & SPINNERMASK64) == 0)) {
1685 		ulwp_t *self = curthread;
1686 		no_preempt(self);	/* ensure a prompt wakeup */
1687 		(void) ___lwp_mutex_wakeup(mp, release_all);
1688 		preempt(self);
1689 	}
1690 }
1691 
1692 void
1693 stall(void)
1694 {
1695 	for (;;)
1696 		(void) mutex_lock_kernel(&stall_mutex, NULL, NULL);
1697 }
1698 
1699 /*
1700  * Acquire a USYNC_THREAD mutex via user-level sleep queues.
1701  * We failed set_lock_byte(&mp->mutex_lockw) before coming here.
1702  * If successful, returns with mutex_owner set correctly.
1703  */
1704 int
1705 mutex_lock_queue(ulwp_t *self, tdb_mutex_stats_t *msp, mutex_t *mp,
1706 	timespec_t *tsp)
1707 {
1708 	uberdata_t *udp = curthread->ul_uberdata;
1709 	queue_head_t *qp;
1710 	hrtime_t begin_sleep;
1711 	int error = 0;
1712 
1713 	self->ul_sp = stkptr();
1714 	if (__td_event_report(self, TD_SLEEP, udp)) {
1715 		self->ul_wchan = mp;
1716 		self->ul_td_evbuf.eventnum = TD_SLEEP;
1717 		self->ul_td_evbuf.eventdata = mp;
1718 		tdb_event(TD_SLEEP, udp);
1719 	}
1720 	if (msp) {
1721 		tdb_incr(msp->mutex_sleep);
1722 		begin_sleep = gethrtime();
1723 	}
1724 
1725 	DTRACE_PROBE1(plockstat, mutex__block, mp);
1726 
1727 	/*
1728 	 * Put ourself on the sleep queue, and while we are
1729 	 * unable to grab the lock, go park in the kernel.
1730 	 * Take ourself off the sleep queue after we acquire the lock.
1731 	 * The waiter bit can be set/cleared only while holding the queue lock.
1732 	 */
1733 	qp = queue_lock(mp, MX);
1734 	enqueue(qp, self, 0);
1735 	mp->mutex_waiters = 1;
1736 	for (;;) {
1737 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
1738 			mp->mutex_owner = (uintptr_t)self;
1739 			mp->mutex_waiters = dequeue_self(qp);
1740 			break;
1741 		}
1742 		set_parking_flag(self, 1);
1743 		queue_unlock(qp);
1744 		/*
1745 		 * __lwp_park() will return the residual time in tsp
1746 		 * if we are unparked before the timeout expires.
1747 		 */
1748 		error = __lwp_park(tsp, 0);
1749 		set_parking_flag(self, 0);
1750 		/*
1751 		 * We could have taken a signal or suspended ourself.
1752 		 * If we did, then we removed ourself from the queue.
1753 		 * Someone else may have removed us from the queue
1754 		 * as a consequence of mutex_unlock().  We may have
1755 		 * gotten a timeout from __lwp_park().  Or we may still
1756 		 * be on the queue and this is just a spurious wakeup.
1757 		 */
1758 		qp = queue_lock(mp, MX);
1759 		if (self->ul_sleepq == NULL) {
1760 			if (error) {
1761 				mp->mutex_waiters = queue_waiter(qp)? 1 : 0;
1762 				if (error != EINTR)
1763 					break;
1764 				error = 0;
1765 			}
1766 			if (set_lock_byte(&mp->mutex_lockw) == 0) {
1767 				mp->mutex_owner = (uintptr_t)self;
1768 				break;
1769 			}
1770 			enqueue(qp, self, 0);
1771 			mp->mutex_waiters = 1;
1772 		}
1773 		ASSERT(self->ul_sleepq == qp &&
1774 		    self->ul_qtype == MX &&
1775 		    self->ul_wchan == mp);
1776 		if (error) {
1777 			if (error != EINTR) {
1778 				mp->mutex_waiters = dequeue_self(qp);
1779 				break;
1780 			}
1781 			error = 0;
1782 		}
1783 	}
1784 	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
1785 	    self->ul_wchan == NULL);
1786 	self->ul_sp = 0;
1787 	queue_unlock(qp);
1788 
1789 	if (msp)
1790 		msp->mutex_sleep_time += gethrtime() - begin_sleep;
1791 
1792 	ASSERT(error == 0 || error == EINVAL || error == ETIME);
1793 
1794 	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1795 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1796 		/*
1797 		 * We shouldn't own the mutex.
1798 		 * Just clear the lock; everyone has already been waked up.
1799 		 */
1800 		mp->mutex_owner = 0;
1801 		(void) clear_lockbyte(&mp->mutex_lockword);
1802 		error = ENOTRECOVERABLE;
1803 	}
1804 
1805 	if (error) {
1806 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1807 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1808 	} else {
1809 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1810 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1811 		if (mp->mutex_flag & LOCK_OWNERDEAD) {
1812 			ASSERT(mp->mutex_type & LOCK_ROBUST);
1813 			error = EOWNERDEAD;
1814 		}
1815 	}
1816 
1817 	return (error);
1818 }
1819 
1820 static int
1821 mutex_recursion(mutex_t *mp, int mtype, int try)
1822 {
1823 	ASSERT(mutex_is_held(mp));
1824 	ASSERT(mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK));
1825 	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
1826 
1827 	if (mtype & LOCK_RECURSIVE) {
1828 		if (mp->mutex_rcount == RECURSION_MAX) {
1829 			DTRACE_PROBE2(plockstat, mutex__error, mp, EAGAIN);
1830 			return (EAGAIN);
1831 		}
1832 		mp->mutex_rcount++;
1833 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 1, 0);
1834 		return (0);
1835 	}
1836 	if (try == MUTEX_LOCK) {
1837 		DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1838 		return (EDEADLK);
1839 	}
1840 	return (EBUSY);
1841 }
1842 
1843 /*
1844  * Register this USYNC_PROCESS|LOCK_ROBUST mutex with the kernel so
1845  * it can apply LOCK_OWNERDEAD|LOCK_UNMAPPED if it becomes necessary.
1846  * We use tdb_hash_lock here and in the synch object tracking code in
1847  * the tdb_agent.c file.  There is no conflict between these two usages.
1848  */
1849 void
1850 register_lock(mutex_t *mp)
1851 {
1852 	uberdata_t *udp = curthread->ul_uberdata;
1853 	uint_t hash = LOCK_HASH(mp);
1854 	robust_t *rlp;
1855 	robust_t **rlpp;
1856 	robust_t **table;
1857 
1858 	if ((table = udp->robustlocks) == NULL) {
1859 		lmutex_lock(&udp->tdb_hash_lock);
1860 		if ((table = udp->robustlocks) == NULL) {
1861 			table = lmalloc(LOCKHASHSZ * sizeof (robust_t *));
1862 			_membar_producer();
1863 			udp->robustlocks = table;
1864 		}
1865 		lmutex_unlock(&udp->tdb_hash_lock);
1866 	}
1867 	_membar_consumer();
1868 
1869 	/*
1870 	 * First search the registered table with no locks held.
1871 	 * This is safe because the table never shrinks
1872 	 * and we can only get a false negative.
1873 	 */
1874 	for (rlp = table[hash]; rlp != NULL; rlp = rlp->robust_next) {
1875 		if (rlp->robust_lock == mp)	/* already registered */
1876 			return;
1877 	}
1878 
1879 	/*
1880 	 * The lock was not found.
1881 	 * Repeat the operation with tdb_hash_lock held.
1882 	 */
1883 	lmutex_lock(&udp->tdb_hash_lock);
1884 
1885 	for (rlpp = &table[hash];
1886 	    (rlp = *rlpp) != NULL;
1887 	    rlpp = &rlp->robust_next) {
1888 		if (rlp->robust_lock == mp) {	/* already registered */
1889 			lmutex_unlock(&udp->tdb_hash_lock);
1890 			return;
1891 		}
1892 	}
1893 
1894 	/*
1895 	 * The lock has never been registered.
1896 	 * Register it now and add it to the table.
1897 	 */
1898 	(void) ___lwp_mutex_register(mp);
1899 	rlp = lmalloc(sizeof (*rlp));
1900 	rlp->robust_lock = mp;
1901 	_membar_producer();
1902 	*rlpp = rlp;
1903 
1904 	lmutex_unlock(&udp->tdb_hash_lock);
1905 }
1906 
1907 /*
1908  * This is called in the child of fork()/forkall() to start over
1909  * with a clean slate.  (Each process must register its own locks.)
1910  * No locks are needed because all other threads are suspended or gone.
1911  */
1912 void
1913 unregister_locks(void)
1914 {
1915 	uberdata_t *udp = curthread->ul_uberdata;
1916 	uint_t hash;
1917 	robust_t **table;
1918 	robust_t *rlp;
1919 	robust_t *next;
1920 
1921 	if ((table = udp->robustlocks) != NULL) {
1922 		for (hash = 0; hash < LOCKHASHSZ; hash++) {
1923 			rlp = table[hash];
1924 			while (rlp != NULL) {
1925 				next = rlp->robust_next;
1926 				lfree(rlp, sizeof (*rlp));
1927 				rlp = next;
1928 			}
1929 		}
1930 		lfree(table, LOCKHASHSZ * sizeof (robust_t *));
1931 		udp->robustlocks = NULL;
1932 	}
1933 }
1934 
1935 /*
1936  * Returns with mutex_owner set correctly.
1937  */
1938 int
1939 mutex_lock_internal(mutex_t *mp, timespec_t *tsp, int try)
1940 {
1941 	ulwp_t *self = curthread;
1942 	uberdata_t *udp = self->ul_uberdata;
1943 	int mtype = mp->mutex_type;
1944 	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
1945 	int error = 0;
1946 	int noceil = try & MUTEX_NOCEIL;
1947 	uint8_t ceil;
1948 	int myprio;
1949 
1950 	try &= ~MUTEX_NOCEIL;
1951 	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
1952 
1953 	if (!self->ul_schedctl_called)
1954 		(void) setup_schedctl();
1955 
1956 	if (msp && try == MUTEX_TRY)
1957 		tdb_incr(msp->mutex_try);
1958 
1959 	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && mutex_is_held(mp))
1960 		return (mutex_recursion(mp, mtype, try));
1961 
1962 	if (self->ul_error_detection && try == MUTEX_LOCK &&
1963 	    tsp == NULL && mutex_is_held(mp))
1964 		lock_error(mp, "mutex_lock", NULL, NULL);
1965 
1966 	if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
1967 		update_sched(self);
1968 		if (self->ul_cid != self->ul_rtclassid) {
1969 			DTRACE_PROBE2(plockstat, mutex__error, mp, EPERM);
1970 			return (EPERM);
1971 		}
1972 		ceil = mp->mutex_ceiling;
1973 		myprio = self->ul_epri? self->ul_epri : self->ul_pri;
1974 		if (myprio > ceil) {
1975 			DTRACE_PROBE2(plockstat, mutex__error, mp, EINVAL);
1976 			return (EINVAL);
1977 		}
1978 		if ((error = _ceil_mylist_add(mp)) != 0) {
1979 			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1980 			return (error);
1981 		}
1982 		if (myprio < ceil)
1983 			_ceil_prio_inherit(ceil);
1984 	}
1985 
1986 	if ((mtype & (USYNC_PROCESS | LOCK_ROBUST))
1987 	    == (USYNC_PROCESS | LOCK_ROBUST))
1988 		register_lock(mp);
1989 
1990 	if (mtype & LOCK_PRIO_INHERIT) {
1991 		/* go straight to the kernel */
1992 		if (try == MUTEX_TRY)
1993 			error = mutex_trylock_kernel(mp);
1994 		else	/* MUTEX_LOCK */
1995 			error = mutex_lock_kernel(mp, tsp, msp);
1996 		/*
1997 		 * The kernel never sets or clears the lock byte
1998 		 * for LOCK_PRIO_INHERIT mutexes.
1999 		 * Set it here for consistency.
2000 		 */
2001 		switch (error) {
2002 		case 0:
2003 			self->ul_pilocks++;
2004 			mp->mutex_lockw = LOCKSET;
2005 			break;
2006 		case EOWNERDEAD:
2007 		case ELOCKUNMAPPED:
2008 			self->ul_pilocks++;
2009 			mp->mutex_lockw = LOCKSET;
2010 			/* FALLTHROUGH */
2011 		case ENOTRECOVERABLE:
2012 			ASSERT(mtype & LOCK_ROBUST);
2013 			break;
2014 		case EDEADLK:
2015 			if (try == MUTEX_LOCK)
2016 				stall();
2017 			error = EBUSY;
2018 			break;
2019 		}
2020 	} else if (mtype & USYNC_PROCESS) {
2021 		error = mutex_trylock_process(mp, try == MUTEX_LOCK);
2022 		if (error == EBUSY && try == MUTEX_LOCK)
2023 			error = mutex_lock_kernel(mp, tsp, msp);
2024 	} else {	/* USYNC_THREAD */
2025 		error = mutex_trylock_adaptive(mp, try == MUTEX_LOCK);
2026 		if (error == EBUSY && try == MUTEX_LOCK)
2027 			error = mutex_lock_queue(self, msp, mp, tsp);
2028 	}
2029 
2030 	switch (error) {
2031 	case 0:
2032 	case EOWNERDEAD:
2033 	case ELOCKUNMAPPED:
2034 		if (mtype & LOCK_ROBUST)
2035 			remember_lock(mp);
2036 		if (msp)
2037 			record_begin_hold(msp);
2038 		break;
2039 	default:
2040 		if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2041 			(void) _ceil_mylist_del(mp);
2042 			if (myprio < ceil)
2043 				_ceil_prio_waive();
2044 		}
2045 		if (try == MUTEX_TRY) {
2046 			if (msp)
2047 				tdb_incr(msp->mutex_try_fail);
2048 			if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2049 				self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2050 				tdb_event(TD_LOCK_TRY, udp);
2051 			}
2052 		}
2053 		break;
2054 	}
2055 
2056 	return (error);
2057 }
2058 
2059 int
2060 fast_process_lock(mutex_t *mp, timespec_t *tsp, int mtype, int try)
2061 {
2062 	ulwp_t *self = curthread;
2063 	uberdata_t *udp = self->ul_uberdata;
2064 
2065 	/*
2066 	 * We know that USYNC_PROCESS is set in mtype and that
2067 	 * zero, one, or both of the flags LOCK_RECURSIVE and
2068 	 * LOCK_ERRORCHECK are set, and that no other flags are set.
2069 	 */
2070 	ASSERT((mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0);
2071 	enter_critical(self);
2072 	if (set_lock_byte64(&mp->mutex_lockword64, udp->pid) == 0) {
2073 		mp->mutex_owner = (uintptr_t)self;
2074 		/* mp->mutex_ownerpid was set by set_lock_byte64() */
2075 		exit_critical(self);
2076 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2077 		return (0);
2078 	}
2079 	exit_critical(self);
2080 
2081 	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && shared_mutex_held(mp))
2082 		return (mutex_recursion(mp, mtype, try));
2083 
2084 	if (try == MUTEX_LOCK) {
2085 		if (mutex_trylock_process(mp, 1) == 0)
2086 			return (0);
2087 		return (mutex_lock_kernel(mp, tsp, NULL));
2088 	}
2089 
2090 	if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2091 		self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2092 		tdb_event(TD_LOCK_TRY, udp);
2093 	}
2094 	return (EBUSY);
2095 }
2096 
2097 static int
2098 mutex_lock_impl(mutex_t *mp, timespec_t *tsp)
2099 {
2100 	ulwp_t *self = curthread;
2101 	int mtype = mp->mutex_type;
2102 	uberflags_t *gflags;
2103 
2104 	/*
2105 	 * Optimize the case of USYNC_THREAD, including
2106 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2107 	 * no error detection, no lock statistics,
2108 	 * and the process has only a single thread.
2109 	 * (Most likely a traditional single-threaded application.)
2110 	 */
2111 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2112 	    self->ul_uberdata->uberflags.uf_all) == 0) {
2113 		/*
2114 		 * Only one thread exists so we don't need an atomic operation.
2115 		 */
2116 		if (mp->mutex_lockw == 0) {
2117 			mp->mutex_lockw = LOCKSET;
2118 			mp->mutex_owner = (uintptr_t)self;
2119 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2120 			return (0);
2121 		}
2122 		if (mtype && MUTEX_OWNER(mp) == self)
2123 			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2124 		/*
2125 		 * We have reached a deadlock, probably because the
2126 		 * process is executing non-async-signal-safe code in
2127 		 * a signal handler and is attempting to acquire a lock
2128 		 * that it already owns.  This is not surprising, given
2129 		 * bad programming practices over the years that has
2130 		 * resulted in applications calling printf() and such
2131 		 * in their signal handlers.  Unless the user has told
2132 		 * us that the signal handlers are safe by setting:
2133 		 *	export _THREAD_ASYNC_SAFE=1
2134 		 * we return EDEADLK rather than actually deadlocking.
2135 		 */
2136 		if (tsp == NULL &&
2137 		    MUTEX_OWNER(mp) == self && !self->ul_async_safe) {
2138 			DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
2139 			return (EDEADLK);
2140 		}
2141 	}
2142 
2143 	/*
2144 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2145 	 * no error detection, and no lock statistics.
2146 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2147 	 */
2148 	if ((gflags = self->ul_schedctl_called) != NULL &&
2149 	    (gflags->uf_trs_ted |
2150 	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2151 		if (mtype & USYNC_PROCESS)
2152 			return (fast_process_lock(mp, tsp, mtype, MUTEX_LOCK));
2153 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2154 			mp->mutex_owner = (uintptr_t)self;
2155 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2156 			return (0);
2157 		}
2158 		if (mtype && MUTEX_OWNER(mp) == self)
2159 			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2160 		if (mutex_trylock_adaptive(mp, 1) != 0)
2161 			return (mutex_lock_queue(self, NULL, mp, tsp));
2162 		return (0);
2163 	}
2164 
2165 	/* else do it the long way */
2166 	return (mutex_lock_internal(mp, tsp, MUTEX_LOCK));
2167 }
2168 
2169 #pragma weak mutex_lock = __mutex_lock
2170 #pragma weak _mutex_lock = __mutex_lock
2171 #pragma weak pthread_mutex_lock = __mutex_lock
2172 #pragma weak _pthread_mutex_lock = __mutex_lock
2173 int
2174 __mutex_lock(mutex_t *mp)
2175 {
2176 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2177 	return (mutex_lock_impl(mp, NULL));
2178 }
2179 
2180 #pragma weak pthread_mutex_timedlock = _pthread_mutex_timedlock
2181 int
2182 _pthread_mutex_timedlock(mutex_t *mp, const timespec_t *abstime)
2183 {
2184 	timespec_t tslocal;
2185 	int error;
2186 
2187 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2188 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
2189 	error = mutex_lock_impl(mp, &tslocal);
2190 	if (error == ETIME)
2191 		error = ETIMEDOUT;
2192 	return (error);
2193 }
2194 
2195 #pragma weak pthread_mutex_reltimedlock_np = _pthread_mutex_reltimedlock_np
2196 int
2197 _pthread_mutex_reltimedlock_np(mutex_t *mp, const timespec_t *reltime)
2198 {
2199 	timespec_t tslocal;
2200 	int error;
2201 
2202 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2203 	tslocal = *reltime;
2204 	error = mutex_lock_impl(mp, &tslocal);
2205 	if (error == ETIME)
2206 		error = ETIMEDOUT;
2207 	return (error);
2208 }
2209 
2210 #pragma weak mutex_trylock = __mutex_trylock
2211 #pragma weak _mutex_trylock = __mutex_trylock
2212 #pragma weak pthread_mutex_trylock = __mutex_trylock
2213 #pragma weak _pthread_mutex_trylock = __mutex_trylock
2214 int
2215 __mutex_trylock(mutex_t *mp)
2216 {
2217 	ulwp_t *self = curthread;
2218 	uberdata_t *udp = self->ul_uberdata;
2219 	int mtype = mp->mutex_type;
2220 	uberflags_t *gflags;
2221 
2222 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2223 
2224 	/*
2225 	 * Optimize the case of USYNC_THREAD, including
2226 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2227 	 * no error detection, no lock statistics,
2228 	 * and the process has only a single thread.
2229 	 * (Most likely a traditional single-threaded application.)
2230 	 */
2231 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2232 	    udp->uberflags.uf_all) == 0) {
2233 		/*
2234 		 * Only one thread exists so we don't need an atomic operation.
2235 		 */
2236 		if (mp->mutex_lockw == 0) {
2237 			mp->mutex_lockw = LOCKSET;
2238 			mp->mutex_owner = (uintptr_t)self;
2239 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2240 			return (0);
2241 		}
2242 		if (mtype && MUTEX_OWNER(mp) == self)
2243 			return (mutex_recursion(mp, mtype, MUTEX_TRY));
2244 		return (EBUSY);
2245 	}
2246 
2247 	/*
2248 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2249 	 * no error detection, and no lock statistics.
2250 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2251 	 */
2252 	if ((gflags = self->ul_schedctl_called) != NULL &&
2253 	    (gflags->uf_trs_ted |
2254 	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2255 		if (mtype & USYNC_PROCESS)
2256 			return (fast_process_lock(mp, NULL, mtype, MUTEX_TRY));
2257 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2258 			mp->mutex_owner = (uintptr_t)self;
2259 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2260 			return (0);
2261 		}
2262 		if (mtype && MUTEX_OWNER(mp) == self)
2263 			return (mutex_recursion(mp, mtype, MUTEX_TRY));
2264 		if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2265 			self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2266 			tdb_event(TD_LOCK_TRY, udp);
2267 		}
2268 		return (EBUSY);
2269 	}
2270 
2271 	/* else do it the long way */
2272 	return (mutex_lock_internal(mp, NULL, MUTEX_TRY));
2273 }
2274 
2275 int
2276 mutex_unlock_internal(mutex_t *mp, int retain_robust_flags)
2277 {
2278 	ulwp_t *self = curthread;
2279 	uberdata_t *udp = self->ul_uberdata;
2280 	int mtype = mp->mutex_type;
2281 	tdb_mutex_stats_t *msp;
2282 	int error = 0;
2283 	int release_all;
2284 	lwpid_t lwpid;
2285 
2286 	if ((mtype & LOCK_ERRORCHECK) && !mutex_is_held(mp))
2287 		return (EPERM);
2288 
2289 	if (self->ul_error_detection && !mutex_is_held(mp))
2290 		lock_error(mp, "mutex_unlock", NULL, NULL);
2291 
2292 	if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2293 		mp->mutex_rcount--;
2294 		DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2295 		return (0);
2296 	}
2297 
2298 	if ((msp = MUTEX_STATS(mp, udp)) != NULL)
2299 		(void) record_hold_time(msp);
2300 
2301 	if (!retain_robust_flags && !(mtype & LOCK_PRIO_INHERIT) &&
2302 	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2303 		ASSERT(mp->mutex_type & LOCK_ROBUST);
2304 		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2305 		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
2306 	}
2307 	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
2308 
2309 	if (mtype & LOCK_PRIO_INHERIT) {
2310 		no_preempt(self);
2311 		mp->mutex_owner = 0;
2312 		/* mp->mutex_ownerpid is cleared by ___lwp_mutex_unlock() */
2313 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2314 		mp->mutex_lockw = LOCKCLEAR;
2315 		self->ul_pilocks--;
2316 		error = ___lwp_mutex_unlock(mp);
2317 		preempt(self);
2318 	} else if (mtype & USYNC_PROCESS) {
2319 		mutex_unlock_process(mp, release_all);
2320 	} else {	/* USYNC_THREAD */
2321 		if ((lwpid = mutex_unlock_queue(mp, release_all)) != 0) {
2322 			(void) __lwp_unpark(lwpid);
2323 			preempt(self);
2324 		}
2325 	}
2326 
2327 	if (mtype & LOCK_ROBUST)
2328 		forget_lock(mp);
2329 
2330 	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
2331 		_ceil_prio_waive();
2332 
2333 	return (error);
2334 }
2335 
2336 #pragma weak mutex_unlock = __mutex_unlock
2337 #pragma weak _mutex_unlock = __mutex_unlock
2338 #pragma weak pthread_mutex_unlock = __mutex_unlock
2339 #pragma weak _pthread_mutex_unlock = __mutex_unlock
2340 int
2341 __mutex_unlock(mutex_t *mp)
2342 {
2343 	ulwp_t *self = curthread;
2344 	int mtype = mp->mutex_type;
2345 	uberflags_t *gflags;
2346 	lwpid_t lwpid;
2347 	short el;
2348 
2349 	/*
2350 	 * Optimize the case of USYNC_THREAD, including
2351 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2352 	 * no error detection, no lock statistics,
2353 	 * and the process has only a single thread.
2354 	 * (Most likely a traditional single-threaded application.)
2355 	 */
2356 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2357 	    self->ul_uberdata->uberflags.uf_all) == 0) {
2358 		if (mtype) {
2359 			/*
2360 			 * At this point we know that one or both of the
2361 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2362 			 */
2363 			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2364 				return (EPERM);
2365 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2366 				mp->mutex_rcount--;
2367 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2368 				return (0);
2369 			}
2370 		}
2371 		/*
2372 		 * Only one thread exists so we don't need an atomic operation.
2373 		 * Also, there can be no waiters.
2374 		 */
2375 		mp->mutex_owner = 0;
2376 		mp->mutex_lockword = 0;
2377 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2378 		return (0);
2379 	}
2380 
2381 	/*
2382 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2383 	 * no error detection, and no lock statistics.
2384 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2385 	 */
2386 	if ((gflags = self->ul_schedctl_called) != NULL) {
2387 		if (((el = gflags->uf_trs_ted) | mtype) == 0) {
2388 fast_unlock:
2389 			if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2390 				(void) __lwp_unpark(lwpid);
2391 				preempt(self);
2392 			}
2393 			return (0);
2394 		}
2395 		if (el)		/* error detection or lock statistics */
2396 			goto slow_unlock;
2397 		if ((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2398 			/*
2399 			 * At this point we know that one or both of the
2400 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2401 			 */
2402 			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2403 				return (EPERM);
2404 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2405 				mp->mutex_rcount--;
2406 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2407 				return (0);
2408 			}
2409 			goto fast_unlock;
2410 		}
2411 		if ((mtype &
2412 		    ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2413 			/*
2414 			 * At this point we know that zero, one, or both of the
2415 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set and
2416 			 * that the USYNC_PROCESS flag is set.
2417 			 */
2418 			if ((mtype & LOCK_ERRORCHECK) && !shared_mutex_held(mp))
2419 				return (EPERM);
2420 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2421 				mp->mutex_rcount--;
2422 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2423 				return (0);
2424 			}
2425 			mutex_unlock_process(mp, 0);
2426 			return (0);
2427 		}
2428 	}
2429 
2430 	/* else do it the long way */
2431 slow_unlock:
2432 	return (mutex_unlock_internal(mp, 0));
2433 }
2434 
2435 /*
2436  * Internally to the library, almost all mutex lock/unlock actions
2437  * go through these lmutex_ functions, to protect critical regions.
2438  * We replicate a bit of code from __mutex_lock() and __mutex_unlock()
2439  * to make these functions faster since we know that the mutex type
2440  * of all internal locks is USYNC_THREAD.  We also know that internal
2441  * locking can never fail, so we panic if it does.
2442  */
2443 void
2444 lmutex_lock(mutex_t *mp)
2445 {
2446 	ulwp_t *self = curthread;
2447 	uberdata_t *udp = self->ul_uberdata;
2448 
2449 	ASSERT(mp->mutex_type == USYNC_THREAD);
2450 
2451 	enter_critical(self);
2452 	/*
2453 	 * Optimize the case of no lock statistics and only a single thread.
2454 	 * (Most likely a traditional single-threaded application.)
2455 	 */
2456 	if (udp->uberflags.uf_all == 0) {
2457 		/*
2458 		 * Only one thread exists; the mutex must be free.
2459 		 */
2460 		ASSERT(mp->mutex_lockw == 0);
2461 		mp->mutex_lockw = LOCKSET;
2462 		mp->mutex_owner = (uintptr_t)self;
2463 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2464 	} else {
2465 		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2466 
2467 		if (!self->ul_schedctl_called)
2468 			(void) setup_schedctl();
2469 
2470 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2471 			mp->mutex_owner = (uintptr_t)self;
2472 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2473 		} else if (mutex_trylock_adaptive(mp, 1) != 0) {
2474 			(void) mutex_lock_queue(self, msp, mp, NULL);
2475 		}
2476 
2477 		if (msp)
2478 			record_begin_hold(msp);
2479 	}
2480 }
2481 
2482 void
2483 lmutex_unlock(mutex_t *mp)
2484 {
2485 	ulwp_t *self = curthread;
2486 	uberdata_t *udp = self->ul_uberdata;
2487 
2488 	ASSERT(mp->mutex_type == USYNC_THREAD);
2489 
2490 	/*
2491 	 * Optimize the case of no lock statistics and only a single thread.
2492 	 * (Most likely a traditional single-threaded application.)
2493 	 */
2494 	if (udp->uberflags.uf_all == 0) {
2495 		/*
2496 		 * Only one thread exists so there can be no waiters.
2497 		 */
2498 		mp->mutex_owner = 0;
2499 		mp->mutex_lockword = 0;
2500 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2501 	} else {
2502 		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2503 		lwpid_t lwpid;
2504 
2505 		if (msp)
2506 			(void) record_hold_time(msp);
2507 		if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2508 			(void) __lwp_unpark(lwpid);
2509 			preempt(self);
2510 		}
2511 	}
2512 	exit_critical(self);
2513 }
2514 
2515 /*
2516  * For specialized code in libc, like the asynchronous i/o code,
2517  * the following sig_*() locking primitives are used in order
2518  * to make the code asynchronous signal safe.  Signals are
2519  * deferred while locks acquired by these functions are held.
2520  */
2521 void
2522 sig_mutex_lock(mutex_t *mp)
2523 {
2524 	sigoff(curthread);
2525 	(void) mutex_lock(mp);
2526 }
2527 
2528 void
2529 sig_mutex_unlock(mutex_t *mp)
2530 {
2531 	(void) mutex_unlock(mp);
2532 	sigon(curthread);
2533 }
2534 
2535 int
2536 sig_mutex_trylock(mutex_t *mp)
2537 {
2538 	int error;
2539 
2540 	sigoff(curthread);
2541 	if ((error = mutex_trylock(mp)) != 0)
2542 		sigon(curthread);
2543 	return (error);
2544 }
2545 
2546 /*
2547  * sig_cond_wait() is a cancellation point.
2548  */
2549 int
2550 sig_cond_wait(cond_t *cv, mutex_t *mp)
2551 {
2552 	int error;
2553 
2554 	ASSERT(curthread->ul_sigdefer != 0);
2555 	pthread_testcancel();
2556 	error = __cond_wait(cv, mp);
2557 	if (error == EINTR && curthread->ul_cursig) {
2558 		sig_mutex_unlock(mp);
2559 		/* take the deferred signal here */
2560 		sig_mutex_lock(mp);
2561 	}
2562 	pthread_testcancel();
2563 	return (error);
2564 }
2565 
2566 /*
2567  * sig_cond_reltimedwait() is a cancellation point.
2568  */
2569 int
2570 sig_cond_reltimedwait(cond_t *cv, mutex_t *mp, const timespec_t *ts)
2571 {
2572 	int error;
2573 
2574 	ASSERT(curthread->ul_sigdefer != 0);
2575 	pthread_testcancel();
2576 	error = __cond_reltimedwait(cv, mp, ts);
2577 	if (error == EINTR && curthread->ul_cursig) {
2578 		sig_mutex_unlock(mp);
2579 		/* take the deferred signal here */
2580 		sig_mutex_lock(mp);
2581 	}
2582 	pthread_testcancel();
2583 	return (error);
2584 }
2585 
2586 /*
2587  * For specialized code in libc, like the stdio code.
2588  * the following cancel_safe_*() locking primitives are used in
2589  * order to make the code cancellation-safe.  Cancellation is
2590  * deferred while locks acquired by these functions are held.
2591  */
2592 void
2593 cancel_safe_mutex_lock(mutex_t *mp)
2594 {
2595 	(void) mutex_lock(mp);
2596 	curthread->ul_libc_locks++;
2597 }
2598 
2599 int
2600 cancel_safe_mutex_trylock(mutex_t *mp)
2601 {
2602 	int error;
2603 
2604 	if ((error = mutex_trylock(mp)) == 0)
2605 		curthread->ul_libc_locks++;
2606 	return (error);
2607 }
2608 
2609 void
2610 cancel_safe_mutex_unlock(mutex_t *mp)
2611 {
2612 	ulwp_t *self = curthread;
2613 
2614 	ASSERT(self->ul_libc_locks != 0);
2615 
2616 	(void) mutex_unlock(mp);
2617 
2618 	/*
2619 	 * Decrement the count of locks held by cancel_safe_mutex_lock().
2620 	 * If we are then in a position to terminate cleanly and
2621 	 * if there is a pending cancellation and cancellation
2622 	 * is not disabled and we received EINTR from a recent
2623 	 * system call then perform the cancellation action now.
2624 	 */
2625 	if (--self->ul_libc_locks == 0 &&
2626 	    !(self->ul_vfork | self->ul_nocancel |
2627 	    self->ul_critical | self->ul_sigdefer) &&
2628 	    cancel_active())
2629 		_pthread_exit(PTHREAD_CANCELED);
2630 }
2631 
2632 static int
2633 shared_mutex_held(mutex_t *mparg)
2634 {
2635 	/*
2636 	 * The 'volatile' is necessary to make sure the compiler doesn't
2637 	 * reorder the tests of the various components of the mutex.
2638 	 * They must be tested in this order:
2639 	 *	mutex_lockw
2640 	 *	mutex_owner
2641 	 *	mutex_ownerpid
2642 	 * This relies on the fact that everywhere mutex_lockw is cleared,
2643 	 * mutex_owner and mutex_ownerpid are cleared before mutex_lockw
2644 	 * is cleared, and that everywhere mutex_lockw is set, mutex_owner
2645 	 * and mutex_ownerpid are set after mutex_lockw is set, and that
2646 	 * mutex_lockw is set or cleared with a memory barrier.
2647 	 */
2648 	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2649 	ulwp_t *self = curthread;
2650 	uberdata_t *udp = self->ul_uberdata;
2651 
2652 	return (MUTEX_OWNED(mp, self) && mp->mutex_ownerpid == udp->pid);
2653 }
2654 
2655 /*
2656  * Some crufty old programs define their own version of _mutex_held()
2657  * to be simply return(1).  This breaks internal libc logic, so we
2658  * define a private version for exclusive use by libc, mutex_is_held(),
2659  * and also a new public function, __mutex_held(), to be used in new
2660  * code to circumvent these crufty old programs.
2661  */
2662 #pragma weak mutex_held = mutex_is_held
2663 #pragma weak _mutex_held = mutex_is_held
2664 #pragma weak __mutex_held = mutex_is_held
2665 int
2666 mutex_is_held(mutex_t *mparg)
2667 {
2668 	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2669 
2670 	if (mparg->mutex_type & USYNC_PROCESS)
2671 		return (shared_mutex_held(mparg));
2672 	return (MUTEX_OWNED(mp, curthread));
2673 }
2674 
2675 #pragma weak mutex_destroy = __mutex_destroy
2676 #pragma weak _mutex_destroy = __mutex_destroy
2677 #pragma weak pthread_mutex_destroy = __mutex_destroy
2678 #pragma weak _pthread_mutex_destroy = __mutex_destroy
2679 int
2680 __mutex_destroy(mutex_t *mp)
2681 {
2682 	if (mp->mutex_type & USYNC_PROCESS)
2683 		forget_lock(mp);
2684 	(void) memset(mp, 0, sizeof (*mp));
2685 	tdb_sync_obj_deregister(mp);
2686 	return (0);
2687 }
2688 
2689 #pragma weak mutex_consistent = __mutex_consistent
2690 #pragma weak _mutex_consistent = __mutex_consistent
2691 #pragma weak pthread_mutex_consistent_np = __mutex_consistent
2692 #pragma weak _pthread_mutex_consistent_np = __mutex_consistent
2693 int
2694 __mutex_consistent(mutex_t *mp)
2695 {
2696 	/*
2697 	 * Do this only for an inconsistent, initialized robust lock
2698 	 * that we hold.  For all other cases, return EINVAL.
2699 	 */
2700 	if (mutex_is_held(mp) &&
2701 	    (mp->mutex_type & LOCK_ROBUST) &&
2702 	    (mp->mutex_flag & LOCK_INITED) &&
2703 	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2704 		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2705 		mp->mutex_rcount = 0;
2706 		return (0);
2707 	}
2708 	return (EINVAL);
2709 }
2710 
2711 /*
2712  * Spin locks are separate from ordinary mutexes,
2713  * but we use the same data structure for them.
2714  */
2715 
2716 #pragma weak pthread_spin_init = _pthread_spin_init
2717 int
2718 _pthread_spin_init(pthread_spinlock_t *lock, int pshared)
2719 {
2720 	mutex_t *mp = (mutex_t *)lock;
2721 
2722 	(void) memset(mp, 0, sizeof (*mp));
2723 	if (pshared == PTHREAD_PROCESS_SHARED)
2724 		mp->mutex_type = USYNC_PROCESS;
2725 	else
2726 		mp->mutex_type = USYNC_THREAD;
2727 	mp->mutex_flag = LOCK_INITED;
2728 	mp->mutex_magic = MUTEX_MAGIC;
2729 	return (0);
2730 }
2731 
2732 #pragma weak pthread_spin_destroy = _pthread_spin_destroy
2733 int
2734 _pthread_spin_destroy(pthread_spinlock_t *lock)
2735 {
2736 	(void) memset(lock, 0, sizeof (*lock));
2737 	return (0);
2738 }
2739 
2740 #pragma weak pthread_spin_trylock = _pthread_spin_trylock
2741 int
2742 _pthread_spin_trylock(pthread_spinlock_t *lock)
2743 {
2744 	mutex_t *mp = (mutex_t *)lock;
2745 	ulwp_t *self = curthread;
2746 	int error = 0;
2747 
2748 	no_preempt(self);
2749 	if (set_lock_byte(&mp->mutex_lockw) != 0)
2750 		error = EBUSY;
2751 	else {
2752 		mp->mutex_owner = (uintptr_t)self;
2753 		if (mp->mutex_type == USYNC_PROCESS)
2754 			mp->mutex_ownerpid = self->ul_uberdata->pid;
2755 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2756 	}
2757 	preempt(self);
2758 	return (error);
2759 }
2760 
2761 #pragma weak pthread_spin_lock = _pthread_spin_lock
2762 int
2763 _pthread_spin_lock(pthread_spinlock_t *lock)
2764 {
2765 	mutex_t *mp = (mutex_t *)lock;
2766 	ulwp_t *self = curthread;
2767 	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
2768 	int count = 0;
2769 
2770 	ASSERT(!self->ul_critical || self->ul_bindflags);
2771 
2772 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
2773 
2774 	/*
2775 	 * We don't care whether the owner is running on a processor.
2776 	 * We just spin because that's what this interface requires.
2777 	 */
2778 	for (;;) {
2779 		if (*lockp == 0) {	/* lock byte appears to be clear */
2780 			no_preempt(self);
2781 			if (set_lock_byte(lockp) == 0)
2782 				break;
2783 			preempt(self);
2784 		}
2785 		if (count < INT_MAX)
2786 			count++;
2787 		SMT_PAUSE();
2788 	}
2789 	mp->mutex_owner = (uintptr_t)self;
2790 	if (mp->mutex_type == USYNC_PROCESS)
2791 		mp->mutex_ownerpid = self->ul_uberdata->pid;
2792 	preempt(self);
2793 	if (count) {
2794 		DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
2795 	}
2796 	DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
2797 	return (0);
2798 }
2799 
2800 #pragma weak pthread_spin_unlock = _pthread_spin_unlock
2801 int
2802 _pthread_spin_unlock(pthread_spinlock_t *lock)
2803 {
2804 	mutex_t *mp = (mutex_t *)lock;
2805 	ulwp_t *self = curthread;
2806 
2807 	no_preempt(self);
2808 	mp->mutex_owner = 0;
2809 	mp->mutex_ownerpid = 0;
2810 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2811 	(void) atomic_swap_32(&mp->mutex_lockword, 0);
2812 	preempt(self);
2813 	return (0);
2814 }
2815 
2816 #define	INITIAL_LOCKS	8	/* initial size of ul_heldlocks.array */
2817 
2818 /*
2819  * Find/allocate an entry for 'lock' in our array of held locks.
2820  */
2821 static mutex_t **
2822 find_lock_entry(mutex_t *lock)
2823 {
2824 	ulwp_t *self = curthread;
2825 	mutex_t **remembered = NULL;
2826 	mutex_t **lockptr;
2827 	uint_t nlocks;
2828 
2829 	if ((nlocks = self->ul_heldlockcnt) != 0)
2830 		lockptr = self->ul_heldlocks.array;
2831 	else {
2832 		nlocks = 1;
2833 		lockptr = &self->ul_heldlocks.single;
2834 	}
2835 
2836 	for (; nlocks; nlocks--, lockptr++) {
2837 		if (*lockptr == lock)
2838 			return (lockptr);
2839 		if (*lockptr == NULL && remembered == NULL)
2840 			remembered = lockptr;
2841 	}
2842 	if (remembered != NULL) {
2843 		*remembered = lock;
2844 		return (remembered);
2845 	}
2846 
2847 	/*
2848 	 * No entry available.  Allocate more space, converting
2849 	 * the single entry into an array of entries if necessary.
2850 	 */
2851 	if ((nlocks = self->ul_heldlockcnt) == 0) {
2852 		/*
2853 		 * Initial allocation of the array.
2854 		 * Convert the single entry into an array.
2855 		 */
2856 		self->ul_heldlockcnt = nlocks = INITIAL_LOCKS;
2857 		lockptr = lmalloc(nlocks * sizeof (mutex_t *));
2858 		/*
2859 		 * The single entry becomes the first entry in the array.
2860 		 */
2861 		*lockptr = self->ul_heldlocks.single;
2862 		self->ul_heldlocks.array = lockptr;
2863 		/*
2864 		 * Return the next available entry in the array.
2865 		 */
2866 		*++lockptr = lock;
2867 		return (lockptr);
2868 	}
2869 	/*
2870 	 * Reallocate the array, double the size each time.
2871 	 */
2872 	lockptr = lmalloc(nlocks * 2 * sizeof (mutex_t *));
2873 	(void) memcpy(lockptr, self->ul_heldlocks.array,
2874 	    nlocks * sizeof (mutex_t *));
2875 	lfree(self->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
2876 	self->ul_heldlocks.array = lockptr;
2877 	self->ul_heldlockcnt *= 2;
2878 	/*
2879 	 * Return the next available entry in the newly allocated array.
2880 	 */
2881 	*(lockptr += nlocks) = lock;
2882 	return (lockptr);
2883 }
2884 
2885 /*
2886  * Insert 'lock' into our list of held locks.
2887  * Currently only used for LOCK_ROBUST mutexes.
2888  */
2889 void
2890 remember_lock(mutex_t *lock)
2891 {
2892 	(void) find_lock_entry(lock);
2893 }
2894 
2895 /*
2896  * Remove 'lock' from our list of held locks.
2897  * Currently only used for LOCK_ROBUST mutexes.
2898  */
2899 void
2900 forget_lock(mutex_t *lock)
2901 {
2902 	*find_lock_entry(lock) = NULL;
2903 }
2904 
2905 /*
2906  * Free the array of held locks.
2907  */
2908 void
2909 heldlock_free(ulwp_t *ulwp)
2910 {
2911 	uint_t nlocks;
2912 
2913 	if ((nlocks = ulwp->ul_heldlockcnt) != 0)
2914 		lfree(ulwp->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
2915 	ulwp->ul_heldlockcnt = 0;
2916 	ulwp->ul_heldlocks.array = NULL;
2917 }
2918 
2919 /*
2920  * Mark all held LOCK_ROBUST mutexes LOCK_OWNERDEAD.
2921  * Called from _thrp_exit() to deal with abandoned locks.
2922  */
2923 void
2924 heldlock_exit(void)
2925 {
2926 	ulwp_t *self = curthread;
2927 	mutex_t **lockptr;
2928 	uint_t nlocks;
2929 	mutex_t *mp;
2930 
2931 	if ((nlocks = self->ul_heldlockcnt) != 0)
2932 		lockptr = self->ul_heldlocks.array;
2933 	else {
2934 		nlocks = 1;
2935 		lockptr = &self->ul_heldlocks.single;
2936 	}
2937 
2938 	for (; nlocks; nlocks--, lockptr++) {
2939 		/*
2940 		 * The kernel takes care of transitioning held
2941 		 * LOCK_PRIO_INHERIT mutexes to LOCK_OWNERDEAD.
2942 		 * We avoid that case here.
2943 		 */
2944 		if ((mp = *lockptr) != NULL &&
2945 		    mutex_is_held(mp) &&
2946 		    (mp->mutex_type & (LOCK_ROBUST | LOCK_PRIO_INHERIT)) ==
2947 		    LOCK_ROBUST) {
2948 			mp->mutex_rcount = 0;
2949 			if (!(mp->mutex_flag & LOCK_UNMAPPED))
2950 				mp->mutex_flag |= LOCK_OWNERDEAD;
2951 			(void) mutex_unlock_internal(mp, 1);
2952 		}
2953 	}
2954 
2955 	heldlock_free(self);
2956 }
2957 
2958 #pragma weak cond_init = _cond_init
2959 /* ARGSUSED2 */
2960 int
2961 _cond_init(cond_t *cvp, int type, void *arg)
2962 {
2963 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
2964 		return (EINVAL);
2965 	(void) memset(cvp, 0, sizeof (*cvp));
2966 	cvp->cond_type = (uint16_t)type;
2967 	cvp->cond_magic = COND_MAGIC;
2968 	return (0);
2969 }
2970 
2971 /*
2972  * cond_sleep_queue(): utility function for cond_wait_queue().
2973  *
2974  * Go to sleep on a condvar sleep queue, expect to be waked up
2975  * by someone calling cond_signal() or cond_broadcast() or due
2976  * to receiving a UNIX signal or being cancelled, or just simply
2977  * due to a spurious wakeup (like someome calling forkall()).
2978  *
2979  * The associated mutex is *not* reacquired before returning.
2980  * That must be done by the caller of cond_sleep_queue().
2981  */
2982 static int
2983 cond_sleep_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
2984 {
2985 	ulwp_t *self = curthread;
2986 	queue_head_t *qp;
2987 	queue_head_t *mqp;
2988 	lwpid_t lwpid;
2989 	int signalled;
2990 	int error;
2991 	int cv_wake;
2992 	int release_all;
2993 
2994 	/*
2995 	 * Put ourself on the CV sleep queue, unlock the mutex, then
2996 	 * park ourself and unpark a candidate lwp to grab the mutex.
2997 	 * We must go onto the CV sleep queue before dropping the
2998 	 * mutex in order to guarantee atomicity of the operation.
2999 	 */
3000 	self->ul_sp = stkptr();
3001 	qp = queue_lock(cvp, CV);
3002 	enqueue(qp, self, 0);
3003 	cvp->cond_waiters_user = 1;
3004 	self->ul_cvmutex = mp;
3005 	self->ul_cv_wake = cv_wake = (tsp != NULL);
3006 	self->ul_signalled = 0;
3007 	if (mp->mutex_flag & LOCK_OWNERDEAD) {
3008 		mp->mutex_flag &= ~LOCK_OWNERDEAD;
3009 		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
3010 	}
3011 	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
3012 	lwpid = mutex_unlock_queue(mp, release_all);
3013 	for (;;) {
3014 		set_parking_flag(self, 1);
3015 		queue_unlock(qp);
3016 		if (lwpid != 0) {
3017 			lwpid = preempt_unpark(self, lwpid);
3018 			preempt(self);
3019 		}
3020 		/*
3021 		 * We may have a deferred signal present,
3022 		 * in which case we should return EINTR.
3023 		 * Also, we may have received a SIGCANCEL; if so
3024 		 * and we are cancelable we should return EINTR.
3025 		 * We force an immediate EINTR return from
3026 		 * __lwp_park() by turning our parking flag off.
3027 		 */
3028 		if (self->ul_cursig != 0 ||
3029 		    (self->ul_cancelable && self->ul_cancel_pending))
3030 			set_parking_flag(self, 0);
3031 		/*
3032 		 * __lwp_park() will return the residual time in tsp
3033 		 * if we are unparked before the timeout expires.
3034 		 */
3035 		error = __lwp_park(tsp, lwpid);
3036 		set_parking_flag(self, 0);
3037 		lwpid = 0;	/* unpark the other lwp only once */
3038 		/*
3039 		 * We were waked up by cond_signal(), cond_broadcast(),
3040 		 * by an interrupt or timeout (EINTR or ETIME),
3041 		 * or we may just have gotten a spurious wakeup.
3042 		 */
3043 		qp = queue_lock(cvp, CV);
3044 		if (!cv_wake)
3045 			mqp = queue_lock(mp, MX);
3046 		if (self->ul_sleepq == NULL)
3047 			break;
3048 		/*
3049 		 * We are on either the condvar sleep queue or the
3050 		 * mutex sleep queue.  Break out of the sleep if we
3051 		 * were interrupted or we timed out (EINTR or ETIME).
3052 		 * Else this is a spurious wakeup; continue the loop.
3053 		 */
3054 		if (!cv_wake && self->ul_sleepq == mqp) { /* mutex queue */
3055 			if (error) {
3056 				mp->mutex_waiters = dequeue_self(mqp);
3057 				break;
3058 			}
3059 			tsp = NULL;	/* no more timeout */
3060 		} else if (self->ul_sleepq == qp) {	/* condvar queue */
3061 			if (error) {
3062 				cvp->cond_waiters_user = dequeue_self(qp);
3063 				break;
3064 			}
3065 			/*
3066 			 * Else a spurious wakeup on the condvar queue.
3067 			 * __lwp_park() has already adjusted the timeout.
3068 			 */
3069 		} else {
3070 			thr_panic("cond_sleep_queue(): thread not on queue");
3071 		}
3072 		if (!cv_wake)
3073 			queue_unlock(mqp);
3074 	}
3075 
3076 	self->ul_sp = 0;
3077 	self->ul_cv_wake = 0;
3078 	ASSERT(self->ul_cvmutex == NULL);
3079 	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
3080 	    self->ul_wchan == NULL);
3081 
3082 	signalled = self->ul_signalled;
3083 	self->ul_signalled = 0;
3084 	queue_unlock(qp);
3085 	if (!cv_wake)
3086 		queue_unlock(mqp);
3087 
3088 	/*
3089 	 * If we were concurrently cond_signal()d and any of:
3090 	 * received a UNIX signal, were cancelled, or got a timeout,
3091 	 * then perform another cond_signal() to avoid consuming it.
3092 	 */
3093 	if (error && signalled)
3094 		(void) cond_signal_internal(cvp);
3095 
3096 	return (error);
3097 }
3098 
3099 int
3100 cond_wait_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3101 {
3102 	ulwp_t *self = curthread;
3103 	int error;
3104 	int merror;
3105 
3106 	/*
3107 	 * The old thread library was programmed to defer signals
3108 	 * while in cond_wait() so that the associated mutex would
3109 	 * be guaranteed to be held when the application signal
3110 	 * handler was invoked.
3111 	 *
3112 	 * We do not behave this way by default; the state of the
3113 	 * associated mutex in the signal handler is undefined.
3114 	 *
3115 	 * To accommodate applications that depend on the old
3116 	 * behavior, the _THREAD_COND_WAIT_DEFER environment
3117 	 * variable can be set to 1 and we will behave in the
3118 	 * old way with respect to cond_wait().
3119 	 */
3120 	if (self->ul_cond_wait_defer)
3121 		sigoff(self);
3122 
3123 	error = cond_sleep_queue(cvp, mp, tsp);
3124 
3125 	/*
3126 	 * Reacquire the mutex.
3127 	 */
3128 	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3129 		error = merror;
3130 
3131 	/*
3132 	 * Take any deferred signal now, after we have reacquired the mutex.
3133 	 */
3134 	if (self->ul_cond_wait_defer)
3135 		sigon(self);
3136 
3137 	return (error);
3138 }
3139 
3140 /*
3141  * cond_sleep_kernel(): utility function for cond_wait_kernel().
3142  * See the comment ahead of cond_sleep_queue(), above.
3143  */
3144 static int
3145 cond_sleep_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3146 {
3147 	int mtype = mp->mutex_type;
3148 	ulwp_t *self = curthread;
3149 	int error;
3150 
3151 	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
3152 		_ceil_prio_waive();
3153 
3154 	self->ul_sp = stkptr();
3155 	self->ul_wchan = cvp;
3156 	mp->mutex_owner = 0;
3157 	/* mp->mutex_ownerpid is cleared by ___lwp_cond_wait() */
3158 	if (mtype & LOCK_PRIO_INHERIT) {
3159 		mp->mutex_lockw = LOCKCLEAR;
3160 		self->ul_pilocks--;
3161 	}
3162 	/*
3163 	 * ___lwp_cond_wait() returns immediately with EINTR if
3164 	 * set_parking_flag(self,0) is called on this lwp before it
3165 	 * goes to sleep in the kernel.  sigacthandler() calls this
3166 	 * when a deferred signal is noted.  This assures that we don't
3167 	 * get stuck in ___lwp_cond_wait() with all signals blocked
3168 	 * due to taking a deferred signal before going to sleep.
3169 	 */
3170 	set_parking_flag(self, 1);
3171 	if (self->ul_cursig != 0 ||
3172 	    (self->ul_cancelable && self->ul_cancel_pending))
3173 		set_parking_flag(self, 0);
3174 	error = ___lwp_cond_wait(cvp, mp, tsp, 1);
3175 	set_parking_flag(self, 0);
3176 	self->ul_sp = 0;
3177 	self->ul_wchan = NULL;
3178 	return (error);
3179 }
3180 
3181 int
3182 cond_wait_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3183 {
3184 	ulwp_t *self = curthread;
3185 	int error;
3186 	int merror;
3187 
3188 	/*
3189 	 * See the large comment in cond_wait_queue(), above.
3190 	 */
3191 	if (self->ul_cond_wait_defer)
3192 		sigoff(self);
3193 
3194 	error = cond_sleep_kernel(cvp, mp, tsp);
3195 
3196 	/*
3197 	 * Override the return code from ___lwp_cond_wait()
3198 	 * with any non-zero return code from mutex_lock().
3199 	 * This addresses robust lock failures in particular;
3200 	 * the caller must see the EOWNERDEAD or ENOTRECOVERABLE
3201 	 * errors in order to take corrective action.
3202 	 */
3203 	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3204 		error = merror;
3205 
3206 	/*
3207 	 * Take any deferred signal now, after we have reacquired the mutex.
3208 	 */
3209 	if (self->ul_cond_wait_defer)
3210 		sigon(self);
3211 
3212 	return (error);
3213 }
3214 
3215 /*
3216  * Common code for _cond_wait() and _cond_timedwait()
3217  */
3218 int
3219 cond_wait_common(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3220 {
3221 	int mtype = mp->mutex_type;
3222 	hrtime_t begin_sleep = 0;
3223 	ulwp_t *self = curthread;
3224 	uberdata_t *udp = self->ul_uberdata;
3225 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3226 	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
3227 	uint8_t rcount;
3228 	int error = 0;
3229 
3230 	/*
3231 	 * The SUSV3 Posix spec for pthread_cond_timedwait() states:
3232 	 *	Except in the case of [ETIMEDOUT], all these error checks
3233 	 *	shall act as if they were performed immediately at the
3234 	 *	beginning of processing for the function and shall cause
3235 	 *	an error return, in effect, prior to modifying the state
3236 	 *	of the mutex specified by mutex or the condition variable
3237 	 *	specified by cond.
3238 	 * Therefore, we must return EINVAL now if the timout is invalid.
3239 	 */
3240 	if (tsp != NULL &&
3241 	    (tsp->tv_sec < 0 || (ulong_t)tsp->tv_nsec >= NANOSEC))
3242 		return (EINVAL);
3243 
3244 	if (__td_event_report(self, TD_SLEEP, udp)) {
3245 		self->ul_sp = stkptr();
3246 		self->ul_wchan = cvp;
3247 		self->ul_td_evbuf.eventnum = TD_SLEEP;
3248 		self->ul_td_evbuf.eventdata = cvp;
3249 		tdb_event(TD_SLEEP, udp);
3250 		self->ul_sp = 0;
3251 	}
3252 	if (csp) {
3253 		if (tsp)
3254 			tdb_incr(csp->cond_timedwait);
3255 		else
3256 			tdb_incr(csp->cond_wait);
3257 	}
3258 	if (msp)
3259 		begin_sleep = record_hold_time(msp);
3260 	else if (csp)
3261 		begin_sleep = gethrtime();
3262 
3263 	if (self->ul_error_detection) {
3264 		if (!mutex_is_held(mp))
3265 			lock_error(mp, "cond_wait", cvp, NULL);
3266 		if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0)
3267 			lock_error(mp, "recursive mutex in cond_wait",
3268 			    cvp, NULL);
3269 		if (cvp->cond_type & USYNC_PROCESS) {
3270 			if (!(mtype & USYNC_PROCESS))
3271 				lock_error(mp, "cond_wait", cvp,
3272 				    "condvar process-shared, "
3273 				    "mutex process-private");
3274 		} else {
3275 			if (mtype & USYNC_PROCESS)
3276 				lock_error(mp, "cond_wait", cvp,
3277 				    "condvar process-private, "
3278 				    "mutex process-shared");
3279 		}
3280 	}
3281 
3282 	/*
3283 	 * We deal with recursive mutexes by completely
3284 	 * dropping the lock and restoring the recursion
3285 	 * count after waking up.  This is arguably wrong,
3286 	 * but it obeys the principle of least astonishment.
3287 	 */
3288 	rcount = mp->mutex_rcount;
3289 	mp->mutex_rcount = 0;
3290 	if ((mtype &
3291 	    (USYNC_PROCESS | LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT)) |
3292 	    (cvp->cond_type & USYNC_PROCESS))
3293 		error = cond_wait_kernel(cvp, mp, tsp);
3294 	else
3295 		error = cond_wait_queue(cvp, mp, tsp);
3296 	mp->mutex_rcount = rcount;
3297 
3298 	if (csp) {
3299 		hrtime_t lapse = gethrtime() - begin_sleep;
3300 		if (tsp == NULL)
3301 			csp->cond_wait_sleep_time += lapse;
3302 		else {
3303 			csp->cond_timedwait_sleep_time += lapse;
3304 			if (error == ETIME)
3305 				tdb_incr(csp->cond_timedwait_timeout);
3306 		}
3307 	}
3308 	return (error);
3309 }
3310 
3311 /*
3312  * cond_wait() and _cond_wait() are cancellation points but __cond_wait()
3313  * is not.  Internally, libc calls the non-cancellation version.
3314  * Other libraries need to use pthread_setcancelstate(), as appropriate,
3315  * since __cond_wait() is not exported from libc.
3316  */
3317 int
3318 __cond_wait(cond_t *cvp, mutex_t *mp)
3319 {
3320 	ulwp_t *self = curthread;
3321 	uberdata_t *udp = self->ul_uberdata;
3322 	uberflags_t *gflags;
3323 
3324 	/*
3325 	 * Optimize the common case of USYNC_THREAD plus
3326 	 * no error detection, no lock statistics, and no event tracing.
3327 	 */
3328 	if ((gflags = self->ul_schedctl_called) != NULL &&
3329 	    (cvp->cond_type | mp->mutex_type | gflags->uf_trs_ted |
3330 	    self->ul_td_events_enable |
3331 	    udp->tdb.tdb_ev_global_mask.event_bits[0]) == 0)
3332 		return (cond_wait_queue(cvp, mp, NULL));
3333 
3334 	/*
3335 	 * Else do it the long way.
3336 	 */
3337 	return (cond_wait_common(cvp, mp, NULL));
3338 }
3339 
3340 #pragma weak cond_wait = _cond_wait
3341 int
3342 _cond_wait(cond_t *cvp, mutex_t *mp)
3343 {
3344 	int error;
3345 
3346 	_cancelon();
3347 	error = __cond_wait(cvp, mp);
3348 	if (error == EINTR)
3349 		_canceloff();
3350 	else
3351 		_canceloff_nocancel();
3352 	return (error);
3353 }
3354 
3355 /*
3356  * pthread_cond_wait() is a cancellation point.
3357  */
3358 #pragma weak pthread_cond_wait = _pthread_cond_wait
3359 int
3360 _pthread_cond_wait(cond_t *cvp, mutex_t *mp)
3361 {
3362 	int error;
3363 
3364 	error = _cond_wait(cvp, mp);
3365 	return ((error == EINTR)? 0 : error);
3366 }
3367 
3368 /*
3369  * cond_timedwait() and _cond_timedwait() are cancellation points
3370  * but __cond_timedwait() is not.
3371  */
3372 int
3373 __cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3374 {
3375 	clockid_t clock_id = cvp->cond_clockid;
3376 	timespec_t reltime;
3377 	int error;
3378 
3379 	if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_HIGHRES)
3380 		clock_id = CLOCK_REALTIME;
3381 	abstime_to_reltime(clock_id, abstime, &reltime);
3382 	error = cond_wait_common(cvp, mp, &reltime);
3383 	if (error == ETIME && clock_id == CLOCK_HIGHRES) {
3384 		/*
3385 		 * Don't return ETIME if we didn't really get a timeout.
3386 		 * This can happen if we return because someone resets
3387 		 * the system clock.  Just return zero in this case,
3388 		 * giving a spurious wakeup but not a timeout.
3389 		 */
3390 		if ((hrtime_t)(uint32_t)abstime->tv_sec * NANOSEC +
3391 		    abstime->tv_nsec > gethrtime())
3392 			error = 0;
3393 	}
3394 	return (error);
3395 }
3396 
3397 #pragma weak cond_timedwait = _cond_timedwait
3398 int
3399 _cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3400 {
3401 	int error;
3402 
3403 	_cancelon();
3404 	error = __cond_timedwait(cvp, mp, abstime);
3405 	if (error == EINTR)
3406 		_canceloff();
3407 	else
3408 		_canceloff_nocancel();
3409 	return (error);
3410 }
3411 
3412 /*
3413  * pthread_cond_timedwait() is a cancellation point.
3414  */
3415 #pragma weak pthread_cond_timedwait = _pthread_cond_timedwait
3416 int
3417 _pthread_cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3418 {
3419 	int error;
3420 
3421 	error = _cond_timedwait(cvp, mp, abstime);
3422 	if (error == ETIME)
3423 		error = ETIMEDOUT;
3424 	else if (error == EINTR)
3425 		error = 0;
3426 	return (error);
3427 }
3428 
3429 /*
3430  * cond_reltimedwait() and _cond_reltimedwait() are cancellation points
3431  * but __cond_reltimedwait() is not.
3432  */
3433 int
3434 __cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3435 {
3436 	timespec_t tslocal = *reltime;
3437 
3438 	return (cond_wait_common(cvp, mp, &tslocal));
3439 }
3440 
3441 #pragma weak cond_reltimedwait = _cond_reltimedwait
3442 int
3443 _cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3444 {
3445 	int error;
3446 
3447 	_cancelon();
3448 	error = __cond_reltimedwait(cvp, mp, reltime);
3449 	if (error == EINTR)
3450 		_canceloff();
3451 	else
3452 		_canceloff_nocancel();
3453 	return (error);
3454 }
3455 
3456 #pragma weak pthread_cond_reltimedwait_np = _pthread_cond_reltimedwait_np
3457 int
3458 _pthread_cond_reltimedwait_np(cond_t *cvp, mutex_t *mp,
3459 	const timespec_t *reltime)
3460 {
3461 	int error;
3462 
3463 	error = _cond_reltimedwait(cvp, mp, reltime);
3464 	if (error == ETIME)
3465 		error = ETIMEDOUT;
3466 	else if (error == EINTR)
3467 		error = 0;
3468 	return (error);
3469 }
3470 
3471 #pragma weak pthread_cond_signal = cond_signal_internal
3472 #pragma weak _pthread_cond_signal = cond_signal_internal
3473 #pragma weak cond_signal = cond_signal_internal
3474 #pragma weak _cond_signal = cond_signal_internal
3475 int
3476 cond_signal_internal(cond_t *cvp)
3477 {
3478 	ulwp_t *self = curthread;
3479 	uberdata_t *udp = self->ul_uberdata;
3480 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3481 	int error = 0;
3482 	int more;
3483 	lwpid_t lwpid;
3484 	queue_head_t *qp;
3485 	mutex_t *mp;
3486 	queue_head_t *mqp;
3487 	ulwp_t **ulwpp;
3488 	ulwp_t *ulwp;
3489 	ulwp_t *prev;
3490 
3491 	if (csp)
3492 		tdb_incr(csp->cond_signal);
3493 
3494 	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
3495 		error = __lwp_cond_signal(cvp);
3496 
3497 	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
3498 		return (error);
3499 
3500 	/*
3501 	 * Move someone from the condvar sleep queue to the mutex sleep
3502 	 * queue for the mutex that he will acquire on being waked up.
3503 	 * We can do this only if we own the mutex he will acquire.
3504 	 * If we do not own the mutex, or if his ul_cv_wake flag
3505 	 * is set, just dequeue and unpark him.
3506 	 */
3507 	qp = queue_lock(cvp, CV);
3508 	ulwpp = queue_slot(qp, &prev, &more);
3509 	cvp->cond_waiters_user = more;
3510 	if (ulwpp == NULL) {	/* no one on the sleep queue */
3511 		queue_unlock(qp);
3512 		return (error);
3513 	}
3514 	ulwp = *ulwpp;
3515 
3516 	/*
3517 	 * Inform the thread that he was the recipient of a cond_signal().
3518 	 * This lets him deal with cond_signal() and, concurrently,
3519 	 * one or more of a cancellation, a UNIX signal, or a timeout.
3520 	 * These latter conditions must not consume a cond_signal().
3521 	 */
3522 	ulwp->ul_signalled = 1;
3523 
3524 	/*
3525 	 * Dequeue the waiter but leave his ul_sleepq non-NULL
3526 	 * while we move him to the mutex queue so that he can
3527 	 * deal properly with spurious wakeups.
3528 	 */
3529 	queue_unlink(qp, ulwpp, prev);
3530 
3531 	mp = ulwp->ul_cvmutex;		/* the mutex he will acquire */
3532 	ulwp->ul_cvmutex = NULL;
3533 	ASSERT(mp != NULL);
3534 
3535 	if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3536 		/* just wake him up */
3537 		lwpid = ulwp->ul_lwpid;
3538 		no_preempt(self);
3539 		ulwp->ul_sleepq = NULL;
3540 		ulwp->ul_wchan = NULL;
3541 		queue_unlock(qp);
3542 		(void) __lwp_unpark(lwpid);
3543 		preempt(self);
3544 	} else {
3545 		/* move him to the mutex queue */
3546 		mqp = queue_lock(mp, MX);
3547 		enqueue(mqp, ulwp, 0);
3548 		mp->mutex_waiters = 1;
3549 		queue_unlock(mqp);
3550 		queue_unlock(qp);
3551 	}
3552 
3553 	return (error);
3554 }
3555 
3556 /*
3557  * Utility function called by mutex_wakeup_all(), cond_broadcast(),
3558  * and rw_queue_release() to (re)allocate a big buffer to hold the
3559  * lwpids of all the threads to be set running after they are removed
3560  * from their sleep queues.  Since we are holding a queue lock, we
3561  * cannot call any function that might acquire a lock.  mmap(), munmap(),
3562  * lwp_unpark_all() are simple system calls and are safe in this regard.
3563  */
3564 lwpid_t *
3565 alloc_lwpids(lwpid_t *lwpid, int *nlwpid_ptr, int *maxlwps_ptr)
3566 {
3567 	/*
3568 	 * Allocate NEWLWPS ids on the first overflow.
3569 	 * Double the allocation each time after that.
3570 	 */
3571 	int nlwpid = *nlwpid_ptr;
3572 	int maxlwps = *maxlwps_ptr;
3573 	int first_allocation;
3574 	int newlwps;
3575 	void *vaddr;
3576 
3577 	ASSERT(nlwpid == maxlwps);
3578 
3579 	first_allocation = (maxlwps == MAXLWPS);
3580 	newlwps = first_allocation? NEWLWPS : 2 * maxlwps;
3581 	vaddr = mmap(NULL, newlwps * sizeof (lwpid_t),
3582 	    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
3583 
3584 	if (vaddr == MAP_FAILED) {
3585 		/*
3586 		 * Let's hope this never happens.
3587 		 * If it does, then we have a terrible
3588 		 * thundering herd on our hands.
3589 		 */
3590 		(void) __lwp_unpark_all(lwpid, nlwpid);
3591 		*nlwpid_ptr = 0;
3592 	} else {
3593 		(void) memcpy(vaddr, lwpid, maxlwps * sizeof (lwpid_t));
3594 		if (!first_allocation)
3595 			(void) munmap((caddr_t)lwpid,
3596 			    maxlwps * sizeof (lwpid_t));
3597 		lwpid = vaddr;
3598 		*maxlwps_ptr = newlwps;
3599 	}
3600 
3601 	return (lwpid);
3602 }
3603 
3604 #pragma weak pthread_cond_broadcast = cond_broadcast_internal
3605 #pragma weak _pthread_cond_broadcast = cond_broadcast_internal
3606 #pragma weak cond_broadcast = cond_broadcast_internal
3607 #pragma weak _cond_broadcast = cond_broadcast_internal
3608 int
3609 cond_broadcast_internal(cond_t *cvp)
3610 {
3611 	ulwp_t *self = curthread;
3612 	uberdata_t *udp = self->ul_uberdata;
3613 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3614 	int error = 0;
3615 	queue_head_t *qp;
3616 	queue_root_t *qrp;
3617 	mutex_t *mp;
3618 	mutex_t *mp_cache = NULL;
3619 	queue_head_t *mqp = NULL;
3620 	ulwp_t *ulwp;
3621 	int nlwpid = 0;
3622 	int maxlwps = MAXLWPS;
3623 	lwpid_t buffer[MAXLWPS];
3624 	lwpid_t *lwpid = buffer;
3625 
3626 	if (csp)
3627 		tdb_incr(csp->cond_broadcast);
3628 
3629 	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
3630 		error = __lwp_cond_broadcast(cvp);
3631 
3632 	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
3633 		return (error);
3634 
3635 	/*
3636 	 * Move everyone from the condvar sleep queue to the mutex sleep
3637 	 * queue for the mutex that they will acquire on being waked up.
3638 	 * We can do this only if we own the mutex they will acquire.
3639 	 * If we do not own the mutex, or if their ul_cv_wake flag
3640 	 * is set, just dequeue and unpark them.
3641 	 *
3642 	 * We keep track of lwpids that are to be unparked in lwpid[].
3643 	 * __lwp_unpark_all() is called to unpark all of them after
3644 	 * they have been removed from the sleep queue and the sleep
3645 	 * queue lock has been dropped.  If we run out of space in our
3646 	 * on-stack buffer, we need to allocate more but we can't call
3647 	 * lmalloc() because we are holding a queue lock when the overflow
3648 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
3649 	 * either because the application may have allocated a small
3650 	 * stack and we don't want to overrun the stack.  So we call
3651 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
3652 	 * system call directly since that path acquires no locks.
3653 	 */
3654 	qp = queue_lock(cvp, CV);
3655 	cvp->cond_waiters_user = 0;
3656 	for (;;) {
3657 		if ((qrp = qp->qh_root) == NULL ||
3658 		    (ulwp = qrp->qr_head) == NULL)
3659 			break;
3660 		ASSERT(ulwp->ul_wchan == cvp);
3661 		queue_unlink(qp, &qrp->qr_head, NULL);
3662 		mp = ulwp->ul_cvmutex;		/* his mutex */
3663 		ulwp->ul_cvmutex = NULL;
3664 		ASSERT(mp != NULL);
3665 		if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3666 			/* just wake him up */
3667 			ulwp->ul_sleepq = NULL;
3668 			ulwp->ul_wchan = NULL;
3669 			if (nlwpid == maxlwps)
3670 				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
3671 			lwpid[nlwpid++] = ulwp->ul_lwpid;
3672 		} else {
3673 			/* move him to the mutex queue */
3674 			if (mp != mp_cache) {
3675 				mp_cache = mp;
3676 				if (mqp != NULL)
3677 					queue_unlock(mqp);
3678 				mqp = queue_lock(mp, MX);
3679 			}
3680 			enqueue(mqp, ulwp, 0);
3681 			mp->mutex_waiters = 1;
3682 		}
3683 	}
3684 	if (mqp != NULL)
3685 		queue_unlock(mqp);
3686 	if (nlwpid == 0) {
3687 		queue_unlock(qp);
3688 	} else {
3689 		no_preempt(self);
3690 		queue_unlock(qp);
3691 		if (nlwpid == 1)
3692 			(void) __lwp_unpark(lwpid[0]);
3693 		else
3694 			(void) __lwp_unpark_all(lwpid, nlwpid);
3695 		preempt(self);
3696 	}
3697 	if (lwpid != buffer)
3698 		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
3699 	return (error);
3700 }
3701 
3702 #pragma weak pthread_cond_destroy = _cond_destroy
3703 #pragma weak _pthread_cond_destroy = _cond_destroy
3704 #pragma weak cond_destroy = _cond_destroy
3705 int
3706 _cond_destroy(cond_t *cvp)
3707 {
3708 	cvp->cond_magic = 0;
3709 	tdb_sync_obj_deregister(cvp);
3710 	return (0);
3711 }
3712 
3713 #if defined(THREAD_DEBUG)
3714 void
3715 assert_no_libc_locks_held(void)
3716 {
3717 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
3718 }
3719 
3720 /* protected by link_lock */
3721 uint64_t spin_lock_spin;
3722 uint64_t spin_lock_spin2;
3723 uint64_t spin_lock_sleep;
3724 uint64_t spin_lock_wakeup;
3725 
3726 /*
3727  * Record spin lock statistics.
3728  * Called by a thread exiting itself in thrp_exit().
3729  * Also called via atexit() from the thread calling
3730  * exit() to do all the other threads as well.
3731  */
3732 void
3733 record_spin_locks(ulwp_t *ulwp)
3734 {
3735 	spin_lock_spin += ulwp->ul_spin_lock_spin;
3736 	spin_lock_spin2 += ulwp->ul_spin_lock_spin2;
3737 	spin_lock_sleep += ulwp->ul_spin_lock_sleep;
3738 	spin_lock_wakeup += ulwp->ul_spin_lock_wakeup;
3739 	ulwp->ul_spin_lock_spin = 0;
3740 	ulwp->ul_spin_lock_spin2 = 0;
3741 	ulwp->ul_spin_lock_sleep = 0;
3742 	ulwp->ul_spin_lock_wakeup = 0;
3743 }
3744 
3745 /*
3746  * atexit function:  dump the queue statistics to stderr.
3747  */
3748 #if !defined(__lint)
3749 #define	fprintf	_fprintf
3750 #endif
3751 #include <stdio.h>
3752 void
3753 dump_queue_statistics(void)
3754 {
3755 	uberdata_t *udp = curthread->ul_uberdata;
3756 	queue_head_t *qp;
3757 	int qn;
3758 	uint64_t spin_lock_total = 0;
3759 
3760 	if (udp->queue_head == NULL || thread_queue_dump == 0)
3761 		return;
3762 
3763 	if (fprintf(stderr, "\n%5d mutex queues:\n", QHASHSIZE) < 0 ||
3764 	    fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
3765 		return;
3766 	for (qn = 0, qp = udp->queue_head; qn < QHASHSIZE; qn++, qp++) {
3767 		if (qp->qh_lockcount == 0)
3768 			continue;
3769 		spin_lock_total += qp->qh_lockcount;
3770 		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
3771 		    (u_longlong_t)qp->qh_lockcount,
3772 		    qp->qh_qmax, qp->qh_hmax) < 0)
3773 			return;
3774 	}
3775 
3776 	if (fprintf(stderr, "\n%5d condvar queues:\n", QHASHSIZE) < 0 ||
3777 	    fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
3778 		return;
3779 	for (qn = 0; qn < QHASHSIZE; qn++, qp++) {
3780 		if (qp->qh_lockcount == 0)
3781 			continue;
3782 		spin_lock_total += qp->qh_lockcount;
3783 		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
3784 		    (u_longlong_t)qp->qh_lockcount,
3785 		    qp->qh_qmax, qp->qh_hmax) < 0)
3786 			return;
3787 	}
3788 
3789 	(void) fprintf(stderr, "\n  spin_lock_total  = %10llu\n",
3790 	    (u_longlong_t)spin_lock_total);
3791 	(void) fprintf(stderr, "  spin_lock_spin   = %10llu\n",
3792 	    (u_longlong_t)spin_lock_spin);
3793 	(void) fprintf(stderr, "  spin_lock_spin2  = %10llu\n",
3794 	    (u_longlong_t)spin_lock_spin2);
3795 	(void) fprintf(stderr, "  spin_lock_sleep  = %10llu\n",
3796 	    (u_longlong_t)spin_lock_sleep);
3797 	(void) fprintf(stderr, "  spin_lock_wakeup = %10llu\n",
3798 	    (u_longlong_t)spin_lock_wakeup);
3799 }
3800 #endif
3801