xref: /titanic_50/usr/src/lib/libc/port/threads/synch.c (revision 31925ed2254d4e4e9ce3632df5ee99a1d9b3294c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #define	atomic_cas_64	_atomic_cas_64
30 
31 #include "lint.h"
32 #include "thr_uberdata.h"
33 #include <sys/rtpriocntl.h>
34 #include <sys/sdt.h>
35 #include <atomic.h>
36 
/*
 * Statistics helper macros.
 * With THREAD_DEBUG defined they update the given counters;
 * otherwise they expand to nothing and cost nothing.
 *	INCR32(x)	increment x unless it is already UINT32_MAX
 *	INCR(x)		increment x
 *	DECR(x)		decrement x
 *	MAXINCR(m, x)	increment x, then raise m to x if x now exceeds m
 * Every macro argument is parenthesized in the expansion so that any
 * lvalue expression can be passed safely.
 */
#if defined(THREAD_DEBUG)
#define	INCR32(x)	(((x) != UINT32_MAX)? (x)++ : 0)
#define	INCR(x)		((x)++)
#define	DECR(x)		((x)--)
#define	MAXINCR(m, x)	(((m) < ++(x))? ((m) = (x)) : 0)
#else
#define	INCR32(x)
#define	INCR(x)
#define	DECR(x)
#define	MAXINCR(m, x)
#endif
48 
49 /*
50  * This mutex is initialized to be held by lwp#1.
51  * It is used to block a thread that has returned from a mutex_lock()
52  * of a LOCK_PRIO_INHERIT mutex with an unrecoverable error.
53  */
54 mutex_t	stall_mutex = DEFAULTMUTEX;
55 
56 static int shared_mutex_held(mutex_t *);
57 static int mutex_queuelock_adaptive(mutex_t *);
58 static void mutex_wakeup_all(mutex_t *);
59 
60 /*
61  * Lock statistics support functions.
62  */
63 void
64 record_begin_hold(tdb_mutex_stats_t *msp)
65 {
66 	tdb_incr(msp->mutex_lock);
67 	msp->mutex_begin_hold = gethrtime();
68 }
69 
70 hrtime_t
71 record_hold_time(tdb_mutex_stats_t *msp)
72 {
73 	hrtime_t now = gethrtime();
74 
75 	if (msp->mutex_begin_hold)
76 		msp->mutex_hold_time += now - msp->mutex_begin_hold;
77 	msp->mutex_begin_hold = 0;
78 	return (now);
79 }
80 
81 /*
82  * Called once at library initialization.
83  */
84 void
85 mutex_setup(void)
86 {
87 	if (set_lock_byte(&stall_mutex.mutex_lockw))
88 		thr_panic("mutex_setup() cannot acquire stall_mutex");
89 	stall_mutex.mutex_owner = (uintptr_t)curthread;
90 }
91 
92 /*
93  * The default spin count of 1000 is experimentally determined.
94  * On sun4u machines with any number of processors it could be raised
95  * to 10,000 but that (experimentally) makes almost no difference.
96  * The environment variable:
97  *	_THREAD_ADAPTIVE_SPIN=count
98  * can be used to override and set the count in the range [0 .. 1,000,000].
99  */
100 int	thread_adaptive_spin = 1000;
101 uint_t	thread_max_spinners = 100;
102 int	thread_queue_verify = 0;
103 static	int	ncpus;
104 
105 /*
106  * Distinguish spinning for queue locks from spinning for regular locks.
107  * We try harder to acquire queue locks by spinning.
108  * The environment variable:
109  *	_THREAD_QUEUE_SPIN=count
110  * can be used to override and set the count in the range [0 .. 1,000,000].
111  */
112 int	thread_queue_spin = 10000;
113 
114 #define	ALL_ATTRIBUTES				\
115 	(LOCK_RECURSIVE | LOCK_ERRORCHECK |	\
116 	LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT |	\
117 	LOCK_ROBUST)
118 
119 /*
120  * 'type' can be one of USYNC_THREAD, USYNC_PROCESS, or USYNC_PROCESS_ROBUST,
121  * augmented by zero or more the flags:
122  *	LOCK_RECURSIVE
123  *	LOCK_ERRORCHECK
124  *	LOCK_PRIO_INHERIT
125  *	LOCK_PRIO_PROTECT
126  *	LOCK_ROBUST
127  */
128 #pragma weak _private_mutex_init = __mutex_init
129 #pragma weak mutex_init = __mutex_init
130 #pragma weak _mutex_init = __mutex_init
131 /* ARGSUSED2 */
132 int
133 __mutex_init(mutex_t *mp, int type, void *arg)
134 {
135 	int basetype = (type & ~ALL_ATTRIBUTES);
136 	const pcclass_t *pccp;
137 	int error = 0;
138 	int ceil;
139 
140 	if (basetype == USYNC_PROCESS_ROBUST) {
141 		/*
142 		 * USYNC_PROCESS_ROBUST is a deprecated historical type.
143 		 * We change it into (USYNC_PROCESS | LOCK_ROBUST) but
144 		 * retain the USYNC_PROCESS_ROBUST flag so we can return
145 		 * ELOCKUNMAPPED when necessary (only USYNC_PROCESS_ROBUST
146 		 * mutexes will ever draw ELOCKUNMAPPED).
147 		 */
148 		type |= (USYNC_PROCESS | LOCK_ROBUST);
149 		basetype = USYNC_PROCESS;
150 	}
151 
152 	if (type & LOCK_PRIO_PROTECT)
153 		pccp = get_info_by_policy(SCHED_FIFO);
154 	if ((basetype != USYNC_THREAD && basetype != USYNC_PROCESS) ||
155 	    (type & (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT))
156 	    == (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT) ||
157 	    ((type & LOCK_PRIO_PROTECT) &&
158 	    ((ceil = *(int *)arg) < pccp->pcc_primin ||
159 	    ceil > pccp->pcc_primax))) {
160 		error = EINVAL;
161 	} else if (type & LOCK_ROBUST) {
162 		/*
163 		 * Callers of mutex_init() with the LOCK_ROBUST attribute
164 		 * are required to pass an initially all-zero mutex.
165 		 * Multiple calls to mutex_init() are allowed; all but
166 		 * the first return EBUSY.  A call to mutex_init() is
167 		 * allowed to make an inconsistent robust lock consistent
168 		 * (for historical usage, even though the proper interface
169 		 * for this is mutex_consistent()).  Note that we use
170 		 * atomic_or_16() to set the LOCK_INITED flag so as
171 		 * not to disturb surrounding bits (LOCK_OWNERDEAD, etc).
172 		 */
173 		extern void _atomic_or_16(volatile uint16_t *, uint16_t);
174 		if (!(mp->mutex_flag & LOCK_INITED)) {
175 			mp->mutex_type = (uint8_t)type;
176 			_atomic_or_16(&mp->mutex_flag, LOCK_INITED);
177 			mp->mutex_magic = MUTEX_MAGIC;
178 		} else if (type != mp->mutex_type ||
179 		    ((type & LOCK_PRIO_PROTECT) && mp->mutex_ceiling != ceil)) {
180 			error = EINVAL;
181 		} else if (__mutex_consistent(mp) != 0) {
182 			error = EBUSY;
183 		}
184 		/* register a process robust mutex with the kernel */
185 		if (basetype == USYNC_PROCESS)
186 			register_lock(mp);
187 	} else {
188 		(void) _memset(mp, 0, sizeof (*mp));
189 		mp->mutex_type = (uint8_t)type;
190 		mp->mutex_flag = LOCK_INITED;
191 		mp->mutex_magic = MUTEX_MAGIC;
192 	}
193 
194 	if (error == 0 && (type & LOCK_PRIO_PROTECT)) {
195 		mp->mutex_ceiling = ceil;
196 	}
197 
198 	return (error);
199 }
200 
201 /*
202  * Delete mp from list of ceiling mutexes owned by curthread.
203  * Return 1 if the head of the chain was updated.
204  */
205 int
206 _ceil_mylist_del(mutex_t *mp)
207 {
208 	ulwp_t *self = curthread;
209 	mxchain_t **mcpp;
210 	mxchain_t *mcp;
211 
212 	for (mcpp = &self->ul_mxchain;
213 	    (mcp = *mcpp) != NULL;
214 	    mcpp = &mcp->mxchain_next) {
215 		if (mcp->mxchain_mx == mp) {
216 			*mcpp = mcp->mxchain_next;
217 			lfree(mcp, sizeof (*mcp));
218 			return (mcpp == &self->ul_mxchain);
219 		}
220 	}
221 	return (0);
222 }
223 
224 /*
225  * Add mp to the list of ceiling mutexes owned by curthread.
226  * Return ENOMEM if no memory could be allocated.
227  */
228 int
229 _ceil_mylist_add(mutex_t *mp)
230 {
231 	ulwp_t *self = curthread;
232 	mxchain_t *mcp;
233 
234 	if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
235 		return (ENOMEM);
236 	mcp->mxchain_mx = mp;
237 	mcp->mxchain_next = self->ul_mxchain;
238 	self->ul_mxchain = mcp;
239 	return (0);
240 }
241 
242 /*
243  * Helper function for _ceil_prio_inherit() and _ceil_prio_waive(), below.
244  */
245 static void
246 set_rt_priority(ulwp_t *self, int prio)
247 {
248 	pcparms_t pcparm;
249 
250 	pcparm.pc_cid = self->ul_rtclassid;
251 	((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
252 	((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
253 	(void) _private_priocntl(P_LWPID, self->ul_lwpid, PC_SETPARMS, &pcparm);
254 }
255 
256 /*
257  * Inherit priority from ceiling.
258  * This changes the effective priority, not the assigned priority.
259  */
260 void
261 _ceil_prio_inherit(int prio)
262 {
263 	ulwp_t *self = curthread;
264 
265 	self->ul_epri = prio;
266 	set_rt_priority(self, prio);
267 }
268 
269 /*
270  * Waive inherited ceiling priority.  Inherit from head of owned ceiling locks
271  * if holding at least one ceiling lock.  If no ceiling locks are held at this
272  * point, disinherit completely, reverting back to assigned priority.
273  */
274 void
275 _ceil_prio_waive(void)
276 {
277 	ulwp_t *self = curthread;
278 	mxchain_t *mcp = self->ul_mxchain;
279 	int prio;
280 
281 	if (mcp == NULL) {
282 		prio = self->ul_pri;
283 		self->ul_epri = 0;
284 	} else {
285 		prio = mcp->mxchain_mx->mutex_ceiling;
286 		self->ul_epri = prio;
287 	}
288 	set_rt_priority(self, prio);
289 }
290 
291 /*
292  * Clear the lock byte.  Retain the waiters byte and the spinners byte.
293  * Return the old value of the lock word.
294  */
295 static uint32_t
296 clear_lockbyte(volatile uint32_t *lockword)
297 {
298 	uint32_t old;
299 	uint32_t new;
300 
301 	do {
302 		old = *lockword;
303 		new = old & ~LOCKMASK;
304 	} while (atomic_cas_32(lockword, old, new) != old);
305 
306 	return (old);
307 }
308 
309 /*
310  * Same as clear_lockbyte(), but operates on mutex_lockword64.
311  * The mutex_ownerpid field is cleared along with the lock byte.
312  */
313 static uint64_t
314 clear_lockbyte64(volatile uint64_t *lockword64)
315 {
316 	uint64_t old;
317 	uint64_t new;
318 
319 	do {
320 		old = *lockword64;
321 		new = old & ~LOCKMASK64;
322 	} while (atomic_cas_64(lockword64, old, new) != old);
323 
324 	return (old);
325 }
326 
327 /*
328  * Similar to set_lock_byte(), which only tries to set the lock byte.
329  * Here, we attempt to set the lock byte AND the mutex_ownerpid,
330  * keeping the remaining bytes constant.
331  */
332 static int
333 set_lock_byte64(volatile uint64_t *lockword64, pid_t ownerpid)
334 {
335 	uint64_t old;
336 	uint64_t new;
337 
338 	old = *lockword64 & ~LOCKMASK64;
339 	new = old | ((uint64_t)(uint_t)ownerpid << PIDSHIFT) | LOCKBYTE64;
340 	if (atomic_cas_64(lockword64, old, new) == old)
341 		return (LOCKCLEAR);
342 
343 	return (LOCKSET);
344 }
345 
346 /*
347  * Increment the spinners count in the mutex lock word.
348  * Return 0 on success.  Return -1 if the count would overflow.
349  */
350 static int
351 spinners_incr(volatile uint32_t *lockword, uint8_t max_spinners)
352 {
353 	uint32_t old;
354 	uint32_t new;
355 
356 	do {
357 		old = *lockword;
358 		if (((old & SPINNERMASK) >> SPINNERSHIFT) >= max_spinners)
359 			return (-1);
360 		new = old + (1 << SPINNERSHIFT);
361 	} while (atomic_cas_32(lockword, old, new) != old);
362 
363 	return (0);
364 }
365 
366 /*
367  * Decrement the spinners count in the mutex lock word.
368  * Return the new value of the lock word.
369  */
370 static uint32_t
371 spinners_decr(volatile uint32_t *lockword)
372 {
373 	uint32_t old;
374 	uint32_t new;
375 
376 	do {
377 		new = old = *lockword;
378 		if (new & SPINNERMASK)
379 			new -= (1 << SPINNERSHIFT);
380 	} while (atomic_cas_32(lockword, old, new) != old);
381 
382 	return (new);
383 }
384 
385 /*
386  * Non-preemptive spin locks.  Used by queue_lock().
387  * No lock statistics are gathered for these locks.
388  * No DTrace probes are provided for these locks.
389  */
390 void
391 spin_lock_set(mutex_t *mp)
392 {
393 	ulwp_t *self = curthread;
394 
395 	no_preempt(self);
396 	if (set_lock_byte(&mp->mutex_lockw) == 0) {
397 		mp->mutex_owner = (uintptr_t)self;
398 		return;
399 	}
400 	/*
401 	 * Spin for a while, attempting to acquire the lock.
402 	 */
403 	INCR32(self->ul_spin_lock_spin);
404 	if (mutex_queuelock_adaptive(mp) == 0 ||
405 	    set_lock_byte(&mp->mutex_lockw) == 0) {
406 		mp->mutex_owner = (uintptr_t)self;
407 		return;
408 	}
409 	/*
410 	 * Try harder if we were previously at a no premption level.
411 	 */
412 	if (self->ul_preempt > 1) {
413 		INCR32(self->ul_spin_lock_spin2);
414 		if (mutex_queuelock_adaptive(mp) == 0 ||
415 		    set_lock_byte(&mp->mutex_lockw) == 0) {
416 			mp->mutex_owner = (uintptr_t)self;
417 			return;
418 		}
419 	}
420 	/*
421 	 * Give up and block in the kernel for the mutex.
422 	 */
423 	INCR32(self->ul_spin_lock_sleep);
424 	(void) ___lwp_mutex_timedlock(mp, NULL);
425 	mp->mutex_owner = (uintptr_t)self;
426 }
427 
428 void
429 spin_lock_clear(mutex_t *mp)
430 {
431 	ulwp_t *self = curthread;
432 
433 	mp->mutex_owner = 0;
434 	if (atomic_swap_32(&mp->mutex_lockword, 0) & WAITERMASK) {
435 		(void) ___lwp_mutex_wakeup(mp, 0);
436 		INCR32(self->ul_spin_lock_wakeup);
437 	}
438 	preempt(self);
439 }
440 
441 /*
442  * Allocate the sleep queue hash table.
443  */
444 void
445 queue_alloc(void)
446 {
447 	ulwp_t *self = curthread;
448 	uberdata_t *udp = self->ul_uberdata;
449 	queue_head_t *qp;
450 	void *data;
451 	int i;
452 
453 	/*
454 	 * No locks are needed; we call here only when single-threaded.
455 	 */
456 	ASSERT(self == udp->ulwp_one);
457 	ASSERT(!udp->uberflags.uf_mt);
458 	if ((data = _private_mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
459 	    PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
460 	    == MAP_FAILED)
461 		thr_panic("cannot allocate thread queue_head table");
462 	udp->queue_head = qp = (queue_head_t *)data;
463 	for (i = 0; i < 2 * QHASHSIZE; qp++, i++) {
464 		qp->qh_type = (i < QHASHSIZE)? MX : CV;
465 		qp->qh_lock.mutex_flag = LOCK_INITED;
466 		qp->qh_lock.mutex_magic = MUTEX_MAGIC;
467 		qp->qh_hlist = &qp->qh_def_root;
468 #if defined(THREAD_DEBUG)
469 		qp->qh_hlen = 1;
470 		qp->qh_hmax = 1;
471 #endif
472 	}
473 }
474 
#if defined(THREAD_DEBUG)

/*
 * Debugging aid: check the internal consistency of the sleep queue qp.
 * The cheap checks (hash-list length, queue type) always run.
 * The per-thread walk of every queue runs only when
 * thread_queue_verify is non-zero.
 * The caller must hold qp->qh_lock.
 */
void
QVERIFY(queue_head_t *qp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	queue_root_t *qrp;
	ulwp_t *ulwp;
	ulwp_t *prev;
	uint_t index;
	uint32_t cnt;
	char qtype;
	void *wchan;

	ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));

	/* count the queue roots; head and tail must be both set or both NULL */
	cnt = 0;
	for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
		cnt++;
		ASSERT((qrp->qr_head != NULL && qrp->qr_tail != NULL) ||
		    (qrp->qr_head == NULL && qrp->qr_tail == NULL));
	}
	ASSERT(qp->qh_hlen == cnt && qp->qh_hmax >= cnt);

	/* the first QHASHSIZE heads are MX, the remainder CV */
	if ((qp - udp->queue_head) < QHASHSIZE)
		qtype = MX;
	else
		qtype = CV;
	ASSERT(qp->qh_type == qtype);

	if (!thread_queue_verify)
		return;

	/* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
	cnt = 0;
	for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
		prev = NULL;
		for (ulwp = qrp->qr_head; ulwp != NULL; ulwp = ulwp->ul_link) {
			cnt++;
			/* a writer may only follow other writers */
			if (ulwp->ul_writer)
				ASSERT(prev == NULL || prev->ul_writer);
			ASSERT(ulwp->ul_qtype == qtype);
			ASSERT(ulwp->ul_wchan != NULL);
			ASSERT(ulwp->ul_sleepq == qp);
			wchan = ulwp->ul_wchan;
			ASSERT(qrp->qr_wchan == wchan);
			index = QUEUE_HASH(wchan, qtype);
			ASSERT(&udp->queue_head[index] == qp);
			prev = ulwp;
		}
		ASSERT(qrp->qr_tail == prev);
	}
	ASSERT(qp->qh_qlen == cnt);
}

#else	/* THREAD_DEBUG */

#define	QVERIFY(qp)

#endif	/* THREAD_DEBUG */
530 
531 /*
532  * Acquire a queue head.
533  */
534 queue_head_t *
535 queue_lock(void *wchan, int qtype)
536 {
537 	uberdata_t *udp = curthread->ul_uberdata;
538 	queue_head_t *qp;
539 	queue_root_t *qrp;
540 
541 	ASSERT(qtype == MX || qtype == CV);
542 
543 	/*
544 	 * It is possible that we could be called while still single-threaded.
545 	 * If so, we call queue_alloc() to allocate the queue_head[] array.
546 	 */
547 	if ((qp = udp->queue_head) == NULL) {
548 		queue_alloc();
549 		qp = udp->queue_head;
550 	}
551 	qp += QUEUE_HASH(wchan, qtype);
552 	spin_lock_set(&qp->qh_lock);
553 	for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next)
554 		if (qrp->qr_wchan == wchan)
555 			break;
556 	if (qrp == NULL && qp->qh_def_root.qr_head == NULL) {
557 		/* the default queue root is available; use it */
558 		qrp = &qp->qh_def_root;
559 		qrp->qr_wchan = wchan;
560 		ASSERT(qrp->qr_next == NULL);
561 		ASSERT(qrp->qr_tail == NULL &&
562 		    qrp->qr_rtcount == 0 && qrp->qr_qlen == 0);
563 	}
564 	qp->qh_wchan = wchan;	/* valid until queue_unlock() is called */
565 	qp->qh_root = qrp;	/* valid until queue_unlock() is called */
566 	INCR32(qp->qh_lockcount);
567 	QVERIFY(qp);
568 	return (qp);
569 }
570 
571 /*
572  * Release a queue head.
573  */
574 void
575 queue_unlock(queue_head_t *qp)
576 {
577 	QVERIFY(qp);
578 	spin_lock_clear(&qp->qh_lock);
579 }
580 
581 /*
582  * For rwlock queueing, we must queue writers ahead of readers of the
583  * same priority.  We do this by making writers appear to have a half
584  * point higher priority for purposes of priority comparisons below.
585  */
586 #define	CMP_PRIO(ulwp)	((real_priority(ulwp) << 1) + (ulwp)->ul_writer)
587 
588 void
589 enqueue(queue_head_t *qp, ulwp_t *ulwp, int force_fifo)
590 {
591 	queue_root_t *qrp;
592 	ulwp_t **ulwpp;
593 	ulwp_t *next;
594 	int pri = CMP_PRIO(ulwp);
595 
596 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
597 	ASSERT(ulwp->ul_sleepq != qp);
598 
599 	if ((qrp = qp->qh_root) == NULL) {
600 		/* use the thread's queue root for the linkage */
601 		qrp = &ulwp->ul_queue_root;
602 		qrp->qr_next = qp->qh_hlist;
603 		qrp->qr_prev = NULL;
604 		qrp->qr_head = NULL;
605 		qrp->qr_tail = NULL;
606 		qrp->qr_wchan = qp->qh_wchan;
607 		qrp->qr_rtcount = 0;
608 		qrp->qr_qlen = 0;
609 		qrp->qr_qmax = 0;
610 		qp->qh_hlist->qr_prev = qrp;
611 		qp->qh_hlist = qrp;
612 		qp->qh_root = qrp;
613 		MAXINCR(qp->qh_hmax, qp->qh_hlen);
614 	}
615 
616 	/*
617 	 * LIFO queue ordering is unfair and can lead to starvation,
618 	 * but it gives better performance for heavily contended locks.
619 	 * We use thread_queue_fifo (range is 0..8) to determine
620 	 * the frequency of FIFO vs LIFO queuing:
621 	 *	0 : every 256th time	(almost always LIFO)
622 	 *	1 : every 128th time
623 	 *	2 : every 64th  time
624 	 *	3 : every 32nd  time
625 	 *	4 : every 16th  time	(the default value, mostly LIFO)
626 	 *	5 : every 8th   time
627 	 *	6 : every 4th   time
628 	 *	7 : every 2nd   time
629 	 *	8 : every time		(never LIFO, always FIFO)
630 	 * Note that there is always some degree of FIFO ordering.
631 	 * This breaks live lock conditions that occur in applications
632 	 * that are written assuming (incorrectly) that threads acquire
633 	 * locks fairly, that is, in roughly round-robin order.
634 	 * In any event, the queue is maintained in kernel priority order.
635 	 *
636 	 * If force_fifo is non-zero, fifo queueing is forced.
637 	 * SUSV3 requires this for semaphores.
638 	 */
639 	if (qrp->qr_head == NULL) {
640 		/*
641 		 * The queue is empty.  LIFO/FIFO doesn't matter.
642 		 */
643 		ASSERT(qrp->qr_tail == NULL);
644 		ulwpp = &qrp->qr_head;
645 	} else if (force_fifo |
646 	    (((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0)) {
647 		/*
648 		 * Enqueue after the last thread whose priority is greater
649 		 * than or equal to the priority of the thread being queued.
650 		 * Attempt first to go directly onto the tail of the queue.
651 		 */
652 		if (pri <= CMP_PRIO(qrp->qr_tail))
653 			ulwpp = &qrp->qr_tail->ul_link;
654 		else {
655 			for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
656 			    ulwpp = &next->ul_link)
657 				if (pri > CMP_PRIO(next))
658 					break;
659 		}
660 	} else {
661 		/*
662 		 * Enqueue before the first thread whose priority is less
663 		 * than or equal to the priority of the thread being queued.
664 		 * Hopefully we can go directly onto the head of the queue.
665 		 */
666 		for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
667 		    ulwpp = &next->ul_link)
668 			if (pri >= CMP_PRIO(next))
669 				break;
670 	}
671 	if ((ulwp->ul_link = *ulwpp) == NULL)
672 		qrp->qr_tail = ulwp;
673 	*ulwpp = ulwp;
674 
675 	ulwp->ul_sleepq = qp;
676 	ulwp->ul_wchan = qp->qh_wchan;
677 	ulwp->ul_qtype = qp->qh_type;
678 	if ((ulwp->ul_schedctl != NULL &&
679 	    ulwp->ul_schedctl->sc_cid == ulwp->ul_rtclassid) |
680 	    ulwp->ul_pilocks) {
681 		ulwp->ul_rtqueued = 1;
682 		qrp->qr_rtcount++;
683 	}
684 	MAXINCR(qrp->qr_qmax, qrp->qr_qlen);
685 	MAXINCR(qp->qh_qmax, qp->qh_qlen);
686 }
687 
688 /*
689  * Helper function for queue_slot() and queue_slot_rt().
690  * Try to find a non-suspended thread on the queue.
691  */
692 static ulwp_t **
693 queue_slot_runnable(ulwp_t **ulwpp, ulwp_t **prevp, int rt)
694 {
695 	ulwp_t *ulwp;
696 	ulwp_t **foundpp = NULL;
697 	int priority = -1;
698 	ulwp_t *prev;
699 	int tpri;
700 
701 	for (prev = NULL;
702 	    (ulwp = *ulwpp) != NULL;
703 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
704 		if (ulwp->ul_stop)	/* skip suspended threads */
705 			continue;
706 		tpri = rt? CMP_PRIO(ulwp) : 0;
707 		if (tpri > priority) {
708 			foundpp = ulwpp;
709 			*prevp = prev;
710 			priority = tpri;
711 			if (!rt)
712 				break;
713 		}
714 	}
715 	return (foundpp);
716 }
717 
718 /*
719  * For real-time, we search the entire queue because the dispatch
720  * (kernel) priorities may have changed since enqueueing.
721  */
722 static ulwp_t **
723 queue_slot_rt(ulwp_t **ulwpp_org, ulwp_t **prevp)
724 {
725 	ulwp_t **ulwpp = ulwpp_org;
726 	ulwp_t *ulwp = *ulwpp;
727 	ulwp_t **foundpp = ulwpp;
728 	int priority = CMP_PRIO(ulwp);
729 	ulwp_t *prev;
730 	int tpri;
731 
732 	for (prev = ulwp, ulwpp = &ulwp->ul_link;
733 	    (ulwp = *ulwpp) != NULL;
734 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
735 		tpri = CMP_PRIO(ulwp);
736 		if (tpri > priority) {
737 			foundpp = ulwpp;
738 			*prevp = prev;
739 			priority = tpri;
740 		}
741 	}
742 	ulwp = *foundpp;
743 
744 	/*
745 	 * Try not to return a suspended thread.
746 	 * This mimics the old libthread's behavior.
747 	 */
748 	if (ulwp->ul_stop &&
749 	    (ulwpp = queue_slot_runnable(ulwpp_org, prevp, 1)) != NULL) {
750 		foundpp = ulwpp;
751 		ulwp = *foundpp;
752 	}
753 	ulwp->ul_rt = 1;
754 	return (foundpp);
755 }
756 
757 ulwp_t **
758 queue_slot(queue_head_t *qp, ulwp_t **prevp, int *more)
759 {
760 	queue_root_t *qrp;
761 	ulwp_t **ulwpp;
762 	ulwp_t *ulwp;
763 	int rt;
764 
765 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
766 
767 	if ((qrp = qp->qh_root) == NULL || (ulwp = qrp->qr_head) == NULL) {
768 		*more = 0;
769 		return (NULL);		/* no lwps on the queue */
770 	}
771 	rt = (qrp->qr_rtcount != 0);
772 	*prevp = NULL;
773 	if (ulwp->ul_link == NULL) {	/* only one lwp on the queue */
774 		*more = 0;
775 		ulwp->ul_rt = rt;
776 		return (&qrp->qr_head);
777 	}
778 	*more = 1;
779 
780 	if (rt)		/* real-time queue */
781 		return (queue_slot_rt(&qrp->qr_head, prevp));
782 	/*
783 	 * Try not to return a suspended thread.
784 	 * This mimics the old libthread's behavior.
785 	 */
786 	if (ulwp->ul_stop &&
787 	    (ulwpp = queue_slot_runnable(&qrp->qr_head, prevp, 0)) != NULL) {
788 		ulwp = *ulwpp;
789 		ulwp->ul_rt = 0;
790 		return (ulwpp);
791 	}
792 	/*
793 	 * The common case; just pick the first thread on the queue.
794 	 */
795 	ulwp->ul_rt = 0;
796 	return (&qrp->qr_head);
797 }
798 
799 /*
800  * Common code for unlinking an lwp from a user-level sleep queue.
801  */
802 void
803 queue_unlink(queue_head_t *qp, ulwp_t **ulwpp, ulwp_t *prev)
804 {
805 	queue_root_t *qrp = qp->qh_root;
806 	queue_root_t *nqrp;
807 	ulwp_t *ulwp = *ulwpp;
808 	ulwp_t *next;
809 
810 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
811 	ASSERT(qp->qh_wchan != NULL && ulwp->ul_wchan == qp->qh_wchan);
812 
813 	DECR(qp->qh_qlen);
814 	DECR(qrp->qr_qlen);
815 	if (ulwp->ul_rtqueued) {
816 		ulwp->ul_rtqueued = 0;
817 		qrp->qr_rtcount--;
818 	}
819 	next = ulwp->ul_link;
820 	*ulwpp = next;
821 	ulwp->ul_link = NULL;
822 	if (qrp->qr_tail == ulwp)
823 		qrp->qr_tail = prev;
824 	if (qrp == &ulwp->ul_queue_root) {
825 		/*
826 		 * We can't continue to use the unlinked thread's
827 		 * queue root for the linkage.
828 		 */
829 		queue_root_t *qr_next = qrp->qr_next;
830 		queue_root_t *qr_prev = qrp->qr_prev;
831 
832 		if (qrp->qr_tail) {
833 			/* switch to using the last thread's queue root */
834 			ASSERT(qrp->qr_qlen != 0);
835 			nqrp = &qrp->qr_tail->ul_queue_root;
836 			*nqrp = *qrp;
837 			if (qr_next)
838 				qr_next->qr_prev = nqrp;
839 			if (qr_prev)
840 				qr_prev->qr_next = nqrp;
841 			else
842 				qp->qh_hlist = nqrp;
843 			qp->qh_root = nqrp;
844 		} else {
845 			/* empty queue root; just delete from the hash list */
846 			ASSERT(qrp->qr_qlen == 0);
847 			if (qr_next)
848 				qr_next->qr_prev = qr_prev;
849 			if (qr_prev)
850 				qr_prev->qr_next = qr_next;
851 			else
852 				qp->qh_hlist = qr_next;
853 			qp->qh_root = NULL;
854 			DECR(qp->qh_hlen);
855 		}
856 	}
857 }
858 
859 ulwp_t *
860 dequeue(queue_head_t *qp, int *more)
861 {
862 	ulwp_t **ulwpp;
863 	ulwp_t *ulwp;
864 	ulwp_t *prev;
865 
866 	if ((ulwpp = queue_slot(qp, &prev, more)) == NULL)
867 		return (NULL);
868 	ulwp = *ulwpp;
869 	queue_unlink(qp, ulwpp, prev);
870 	ulwp->ul_sleepq = NULL;
871 	ulwp->ul_wchan = NULL;
872 	return (ulwp);
873 }
874 
875 /*
876  * Return a pointer to the highest priority thread sleeping on wchan.
877  */
878 ulwp_t *
879 queue_waiter(queue_head_t *qp)
880 {
881 	ulwp_t **ulwpp;
882 	ulwp_t *prev;
883 	int more;
884 
885 	if ((ulwpp = queue_slot(qp, &prev, &more)) == NULL)
886 		return (NULL);
887 	return (*ulwpp);
888 }
889 
890 int
891 dequeue_self(queue_head_t *qp)
892 {
893 	ulwp_t *self = curthread;
894 	queue_root_t *qrp;
895 	ulwp_t **ulwpp;
896 	ulwp_t *ulwp;
897 	ulwp_t *prev;
898 	int found = 0;
899 
900 	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
901 
902 	/* find self on the sleep queue */
903 	if ((qrp = qp->qh_root) != NULL) {
904 		for (prev = NULL, ulwpp = &qrp->qr_head;
905 		    (ulwp = *ulwpp) != NULL;
906 		    prev = ulwp, ulwpp = &ulwp->ul_link) {
907 			if (ulwp == self) {
908 				queue_unlink(qp, ulwpp, prev);
909 				self->ul_cvmutex = NULL;
910 				self->ul_sleepq = NULL;
911 				self->ul_wchan = NULL;
912 				found = 1;
913 				break;
914 			}
915 		}
916 	}
917 
918 	if (!found)
919 		thr_panic("dequeue_self(): curthread not found on queue");
920 
921 	return ((qrp = qp->qh_root) != NULL && qrp->qr_head != NULL);
922 }
923 
924 /*
925  * Called from call_user_handler() and _thrp_suspend() to take
926  * ourself off of our sleep queue so we can grab locks.
927  */
928 void
929 unsleep_self(void)
930 {
931 	ulwp_t *self = curthread;
932 	queue_head_t *qp;
933 
934 	/*
935 	 * Calling enter_critical()/exit_critical() here would lead
936 	 * to recursion.  Just manipulate self->ul_critical directly.
937 	 */
938 	self->ul_critical++;
939 	while (self->ul_sleepq != NULL) {
940 		qp = queue_lock(self->ul_wchan, self->ul_qtype);
941 		/*
942 		 * We may have been moved from a CV queue to a
943 		 * mutex queue while we were attempting queue_lock().
944 		 * If so, just loop around and try again.
945 		 * dequeue_self() clears self->ul_sleepq.
946 		 */
947 		if (qp == self->ul_sleepq)
948 			(void) dequeue_self(qp);
949 		queue_unlock(qp);
950 	}
951 	self->ul_writer = 0;
952 	self->ul_critical--;
953 }
954 
955 /*
956  * Common code for calling the the ___lwp_mutex_timedlock() system call.
957  * Returns with mutex_owner and mutex_ownerpid set correctly.
958  */
959 static int
960 mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
961 {
962 	ulwp_t *self = curthread;
963 	uberdata_t *udp = self->ul_uberdata;
964 	int mtype = mp->mutex_type;
965 	hrtime_t begin_sleep;
966 	int acquired;
967 	int error;
968 
969 	self->ul_sp = stkptr();
970 	self->ul_wchan = mp;
971 	if (__td_event_report(self, TD_SLEEP, udp)) {
972 		self->ul_td_evbuf.eventnum = TD_SLEEP;
973 		self->ul_td_evbuf.eventdata = mp;
974 		tdb_event(TD_SLEEP, udp);
975 	}
976 	if (msp) {
977 		tdb_incr(msp->mutex_sleep);
978 		begin_sleep = gethrtime();
979 	}
980 
981 	DTRACE_PROBE1(plockstat, mutex__block, mp);
982 
983 	for (;;) {
984 		/*
985 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
986 		 * means we successfully acquired the lock.
987 		 */
988 		if ((error = ___lwp_mutex_timedlock(mp, tsp)) != 0 &&
989 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
990 			acquired = 0;
991 			break;
992 		}
993 
994 		if (mtype & USYNC_PROCESS) {
995 			/*
996 			 * Defend against forkall().  We may be the child,
997 			 * in which case we don't actually own the mutex.
998 			 */
999 			enter_critical(self);
1000 			if (mp->mutex_ownerpid == udp->pid) {
1001 				mp->mutex_owner = (uintptr_t)self;
1002 				exit_critical(self);
1003 				acquired = 1;
1004 				break;
1005 			}
1006 			exit_critical(self);
1007 		} else {
1008 			mp->mutex_owner = (uintptr_t)self;
1009 			acquired = 1;
1010 			break;
1011 		}
1012 	}
1013 	if (msp)
1014 		msp->mutex_sleep_time += gethrtime() - begin_sleep;
1015 	self->ul_wchan = NULL;
1016 	self->ul_sp = 0;
1017 
1018 	if (acquired) {
1019 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1020 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1021 	} else {
1022 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1023 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1024 	}
1025 
1026 	return (error);
1027 }
1028 
1029 /*
1030  * Common code for calling the ___lwp_mutex_trylock() system call.
1031  * Returns with mutex_owner and mutex_ownerpid set correctly.
1032  */
1033 int
1034 mutex_trylock_kernel(mutex_t *mp)
1035 {
1036 	ulwp_t *self = curthread;
1037 	uberdata_t *udp = self->ul_uberdata;
1038 	int mtype = mp->mutex_type;
1039 	int error;
1040 	int acquired;
1041 
1042 	for (;;) {
1043 		/*
1044 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
1045 		 * means we successfully acquired the lock.
1046 		 */
1047 		if ((error = ___lwp_mutex_trylock(mp)) != 0 &&
1048 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1049 			acquired = 0;
1050 			break;
1051 		}
1052 
1053 		if (mtype & USYNC_PROCESS) {
1054 			/*
1055 			 * Defend against forkall().  We may be the child,
1056 			 * in which case we don't actually own the mutex.
1057 			 */
1058 			enter_critical(self);
1059 			if (mp->mutex_ownerpid == udp->pid) {
1060 				mp->mutex_owner = (uintptr_t)self;
1061 				exit_critical(self);
1062 				acquired = 1;
1063 				break;
1064 			}
1065 			exit_critical(self);
1066 		} else {
1067 			mp->mutex_owner = (uintptr_t)self;
1068 			acquired = 1;
1069 			break;
1070 		}
1071 	}
1072 
1073 	if (acquired) {
1074 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1075 	} else if (error != EBUSY) {
1076 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1077 	}
1078 
1079 	return (error);
1080 }
1081 
1082 volatile sc_shared_t *
1083 setup_schedctl(void)
1084 {
1085 	ulwp_t *self = curthread;
1086 	volatile sc_shared_t *scp;
1087 	sc_shared_t *tmp;
1088 
1089 	if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
1090 	    !self->ul_vfork &&			/* not a child of vfork() */
1091 	    !self->ul_schedctl_called) {	/* haven't been called before */
1092 		enter_critical(self);
1093 		self->ul_schedctl_called = &self->ul_uberdata->uberflags;
1094 		if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
1095 			self->ul_schedctl = scp = tmp;
1096 		exit_critical(self);
1097 	}
1098 	/*
1099 	 * Unless the call to setup_schedctl() is surrounded
1100 	 * by enter_critical()/exit_critical(), the address
1101 	 * we are returning could be invalid due to a forkall()
1102 	 * having occurred in another thread.
1103 	 */
1104 	return (scp);
1105 }
1106 
1107 /*
1108  * Interfaces from libsched, incorporated into libc.
1109  * libsched.so.1 is now a filter library onto libc.
1110  */
1111 #pragma weak schedctl_lookup = _schedctl_init
1112 #pragma weak _schedctl_lookup = _schedctl_init
1113 #pragma weak schedctl_init = _schedctl_init
1114 schedctl_t *
1115 _schedctl_init(void)
1116 {
1117 	volatile sc_shared_t *scp = setup_schedctl();
1118 	return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
1119 }
1120 
/*
 * schedctl_exit() is a weak alias for this function.
 * The body is intentionally empty: nothing is torn down here.
 */
#pragma weak schedctl_exit = _schedctl_exit
void
_schedctl_exit(void)
{
}
1126 
1127 /*
1128  * Contract private interface for java.
1129  * Set up the schedctl data if it doesn't exist yet.
1130  * Return a pointer to the pointer to the schedctl data.
1131  */
1132 volatile sc_shared_t *volatile *
1133 _thr_schedctl(void)
1134 {
1135 	ulwp_t *self = curthread;
1136 	volatile sc_shared_t *volatile *ptr;
1137 
1138 	if (self->ul_vfork)
1139 		return (NULL);
1140 	if (*(ptr = &self->ul_schedctl) == NULL)
1141 		(void) setup_schedctl();
1142 	return (ptr);
1143 }
1144 
1145 /*
1146  * Block signals and attempt to block preemption.
1147  * no_preempt()/preempt() must be used in pairs but can be nested.
1148  */
1149 void
1150 no_preempt(ulwp_t *self)
1151 {
1152 	volatile sc_shared_t *scp;
1153 
1154 	if (self->ul_preempt++ == 0) {
1155 		enter_critical(self);
1156 		if ((scp = self->ul_schedctl) != NULL ||
1157 		    (scp = setup_schedctl()) != NULL) {
1158 			/*
1159 			 * Save the pre-existing preempt value.
1160 			 */
1161 			self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
1162 			scp->sc_preemptctl.sc_nopreempt = 1;
1163 		}
1164 	}
1165 }
1166 
1167 /*
1168  * Undo the effects of no_preempt().
1169  */
1170 void
1171 preempt(ulwp_t *self)
1172 {
1173 	volatile sc_shared_t *scp;
1174 
1175 	ASSERT(self->ul_preempt > 0);
1176 	if (--self->ul_preempt == 0) {
1177 		if ((scp = self->ul_schedctl) != NULL) {
1178 			/*
1179 			 * Restore the pre-existing preempt value.
1180 			 */
1181 			scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
1182 			if (scp->sc_preemptctl.sc_yield &&
1183 			    scp->sc_preemptctl.sc_nopreempt == 0) {
1184 				lwp_yield();
1185 				if (scp->sc_preemptctl.sc_yield) {
1186 					/*
1187 					 * Shouldn't happen.  This is either
1188 					 * a race condition or the thread
1189 					 * just entered the real-time class.
1190 					 */
1191 					lwp_yield();
1192 					scp->sc_preemptctl.sc_yield = 0;
1193 				}
1194 			}
1195 		}
1196 		exit_critical(self);
1197 	}
1198 }
1199 
1200 /*
1201  * If a call to preempt() would cause the current thread to yield or to
1202  * take deferred actions in exit_critical(), then unpark the specified
1203  * lwp so it can run while we delay.  Return the original lwpid if the
1204  * unpark was not performed, else return zero.  The tests are a repeat
1205  * of some of the tests in preempt(), above.  This is a statistical
1206  * optimization solely for cond_sleep_queue(), below.
1207  */
1208 static lwpid_t
1209 preempt_unpark(ulwp_t *self, lwpid_t lwpid)
1210 {
1211 	volatile sc_shared_t *scp = self->ul_schedctl;
1212 
1213 	ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
1214 	if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
1215 	    (self->ul_curplease && self->ul_critical == 1)) {
1216 		(void) __lwp_unpark(lwpid);
1217 		lwpid = 0;
1218 	}
1219 	return (lwpid);
1220 }
1221 
/*
 * Spin for a while (if 'tryhard' is true), trying to grab the lock.
 * If this fails, return EBUSY and let the caller deal with it.
 * If this succeeds, return 0 with mutex_owner set to curthread.
 *
 * Parameters:
 *	mp	the mutex; must not be a USYNC_PROCESS mutex (asserted).
 *	tryhard	if zero, only a single acquisition attempt is made and
 *		the spin loop is never entered.
 *
 * Return values:
 *	0		lock acquired.
 *	EOWNERDEAD	lock acquired, but LOCK_OWNERDEAD is set in
 *			mutex_flag (the lock is held on return).
 *	EBUSY		lock not acquired; also returned immediately,
 *			without firing any probe, when the calling
 *			thread is already the owner.
 *	ENOTRECOVERABLE	LOCK_NOTRECOVERABLE is set in mutex_flag; the
 *			lock is not held on return.
 */
static int
mutex_trylock_adaptive(mutex_t *mp, int tryhard)
{
	ulwp_t *self = curthread;
	int error = EBUSY;
	ulwp_t *ulwp;
	volatile sc_shared_t *scp;
	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
	volatile uint64_t *ownerp = (volatile uint64_t *)&mp->mutex_owner;
	uint32_t new_lockword;
	int count = 0;		/* spin iterations; reported by mutex__spun */
	int max_count;
	uint8_t max_spinners;

	ASSERT(!(mp->mutex_type & USYNC_PROCESS));

	/* we already own it; do not try to acquire it again */
	if (MUTEX_OWNER(mp) == self)
		return (EBUSY);

	/* short-cut, not definitive (see below) */
	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		error = ENOTRECOVERABLE;
		goto done;
	}

	/*
	 * Make one attempt to acquire the lock before
	 * incurring the overhead of the spin loop.
	 */
	if (set_lock_byte(lockp) == 0) {
		*ownerp = (uintptr_t)self;
		error = 0;
		goto done;
	}
	if (!tryhard)
		goto done;
	/* ncpus is filled in lazily, the first time it is found to be zero */
	if (ncpus == 0)
		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
	/* cap the number of spinners at one fewer than the number of cpus */
	if ((max_spinners = self->ul_max_spinners) >= ncpus)
		max_spinners = ncpus - 1;
	/* with no spinners allowed (or no spin count), do not spin at all */
	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
	if (max_count == 0)
		goto done;

	/*
	 * This spin loop is unfair to lwps that have already dropped into
	 * the kernel to sleep.  They will starve on a highly-contended mutex.
	 * This is just too bad.  The adaptive spin algorithm is intended
	 * to allow programs with highly-contended locks (that is, broken
	 * programs) to execute with reasonable speed despite their contention.
	 * Being fair would reduce the speed of such programs and well-written
	 * programs will not suffer in any case.
	 */
	enter_critical(self);
	/*
	 * A return of -1 from spinners_incr() means we do not spin --
	 * presumably the max_spinners limit has been reached
	 * (TODO confirm against spinners_incr()).
	 */
	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1) {
		exit_critical(self);
		goto done;
	}
	DTRACE_PROBE1(plockstat, mutex__spin, mp);
	for (count = 1; ; count++) {
		/* look at the lock byte first; try to set it only if clear */
		if (*lockp == 0 && set_lock_byte(lockp) == 0) {
			*ownerp = (uintptr_t)self;
			error = 0;
			break;
		}
		if (count == max_count)
			break;
		SMT_PAUSE();
		/*
		 * Stop spinning if the mutex owner is not running on
		 * a processor; it will not drop the lock any time soon
		 * and we would just be wasting time to keep spinning.
		 *
		 * Note that we are looking at another thread (ulwp_t)
		 * without ensuring that the other thread does not exit.
		 * The scheme relies on ulwp_t structures never being
		 * deallocated by the library (the library employs a free
		 * list of ulwp_t structs that are reused when new threads
		 * are created) and on schedctl shared memory never being
		 * deallocated once created via __schedctl().
		 *
		 * Thus, the worst that can happen when the spinning thread
		 * looks at the owner's schedctl data is that it is looking
		 * at some other thread's schedctl data.  This almost never
		 * happens and is benign when it does.
		 */
		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
		    ((scp = ulwp->ul_schedctl) == NULL ||
		    scp->sc_state != SC_ONPROC))
			break;
	}
	/* we are no longer a spinner; get the resulting lock word */
	new_lockword = spinners_decr(&mp->mutex_lockword);
	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
		/*
		 * We haven't yet acquired the lock, the lock
		 * is free, and there are no other spinners.
		 * Make one final attempt to acquire the lock.
		 *
		 * This isn't strictly necessary since mutex_lock_queue()
		 * (the next action this thread will take if it doesn't
		 * acquire the lock here) makes one attempt to acquire
		 * the lock before putting the thread to sleep.
		 *
		 * If the next action for this thread (on failure here)
		 * were not to call mutex_lock_queue(), this would be
		 * necessary for correctness, to avoid ending up with an
		 * unheld mutex with waiters but no one to wake them up.
		 */
		if (set_lock_byte(lockp) == 0) {
			*ownerp = (uintptr_t)self;
			error = 0;
		}
		/* the final attempt is counted as one more spin */
		count++;
	}
	exit_critical(self);

done:
	/*
	 * This is the definitive LOCK_NOTRECOVERABLE test: it is made
	 * after the lock has actually been acquired.
	 */
	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		/*
		 * We shouldn't own the mutex.
		 * Just clear the lock; everyone has already been waked up.
		 */
		mp->mutex_owner = 0;
		(void) clear_lockbyte(&mp->mutex_lockword);
		error = ENOTRECOVERABLE;
	}

	if (error) {
		/* mutex__spun fires only if the spin loop was entered */
		if (count) {
			DTRACE_PROBE2(plockstat, mutex__spun, 0, count);
		}
		/* EBUSY is an ordinary failure and is not reported */
		if (error != EBUSY) {
			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
		}
	} else {
		if (count) {
			DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
		}
		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
		/* the lock is held, but tell the caller the owner died */
		if (mp->mutex_flag & LOCK_OWNERDEAD) {
			ASSERT(mp->mutex_type & LOCK_ROBUST);
			error = EOWNERDEAD;
		}
	}

	return (error);
}
1376 
1377 /*
1378  * Same as mutex_trylock_adaptive(), except specifically for queue locks.
1379  * The owner field is not set here; the caller (spin_lock_set()) sets it.
1380  */
1381 static int
1382 mutex_queuelock_adaptive(mutex_t *mp)
1383 {
1384 	ulwp_t *ulwp;
1385 	volatile sc_shared_t *scp;
1386 	volatile uint8_t *lockp;
1387 	volatile uint64_t *ownerp;
1388 	int count = curthread->ul_queue_spin;
1389 
1390 	ASSERT(mp->mutex_type == USYNC_THREAD);
1391 
1392 	if (count == 0)
1393 		return (EBUSY);
1394 
1395 	lockp = (volatile uint8_t *)&mp->mutex_lockw;
1396 	ownerp = (volatile uint64_t *)&mp->mutex_owner;
1397 	while (--count >= 0) {
1398 		if (*lockp == 0 && set_lock_byte(lockp) == 0)
1399 			return (0);
1400 		SMT_PAUSE();
1401 		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1402 		    ((scp = ulwp->ul_schedctl) == NULL ||
1403 		    scp->sc_state != SC_ONPROC))
1404 			break;
1405 	}
1406 
1407 	return (EBUSY);
1408 }
1409 
/*
 * Like mutex_trylock_adaptive(), but for process-shared mutexes.
 * Spin for a while (if 'tryhard' is true), trying to grab the lock.
 * If this fails, return EBUSY and let the caller deal with it.
 * If this succeeds, return 0 with mutex_owner set to curthread
 * and mutex_ownerpid set to the current pid.
 *
 * Parameters:
 *	mp	the mutex; must be a USYNC_PROCESS mutex (asserted).
 *	tryhard	if zero, only a single acquisition attempt is made and
 *		the spin loop is never entered.
 *
 * Return values:
 *	0		lock acquired.
 *	EOWNERDEAD	lock acquired, but LOCK_OWNERDEAD is set, or
 *			LOCK_UNMAPPED is set on a mutex without
 *			USYNC_PROCESS_ROBUST in its type.
 *	ELOCKUNMAPPED	lock acquired, LOCK_UNMAPPED (and not
 *			LOCK_OWNERDEAD) is set and the mutex type has
 *			USYNC_PROCESS_ROBUST.
 *	EBUSY		lock not acquired; also returned immediately,
 *			without firing any probe, when
 *			shared_mutex_held() reports the mutex as held.
 *	ENOTRECOVERABLE	LOCK_NOTRECOVERABLE is set in mutex_flag; the
 *			lock is not held on return.
 */
static int
mutex_trylock_process(mutex_t *mp, int tryhard)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	int error = EBUSY;
	volatile uint64_t *lockp = (volatile uint64_t *)&mp->mutex_lockword64;
	uint32_t new_lockword;
	int count = 0;		/* spin iterations; reported by mutex__spun */
	int max_count;
	uint8_t max_spinners;

	ASSERT(mp->mutex_type & USYNC_PROCESS);

	if (shared_mutex_held(mp))
		return (EBUSY);

	/* short-cut, not definitive (see below) */
	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		error = ENOTRECOVERABLE;
		goto done;
	}

	/*
	 * Make one attempt to acquire the lock before
	 * incurring the overhead of the spin loop.
	 * The lock word and the owner field are both updated
	 * inside the critical region.
	 */
	enter_critical(self);
	if (set_lock_byte64(lockp, udp->pid) == 0) {
		mp->mutex_owner = (uintptr_t)self;
		/* mp->mutex_ownerpid was set by set_lock_byte64() */
		exit_critical(self);
		error = 0;
		goto done;
	}
	exit_critical(self);
	if (!tryhard)
		goto done;
	/* ncpus is filled in lazily, the first time it is found to be zero */
	if (ncpus == 0)
		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
	/* cap the number of spinners at one fewer than the number of cpus */
	if ((max_spinners = self->ul_max_spinners) >= ncpus)
		max_spinners = ncpus - 1;
	/* with no spinners allowed (or no spin count), do not spin at all */
	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
	if (max_count == 0)
		goto done;

	/*
	 * This is a process-shared mutex.
	 * We cannot know if the owner is running on a processor.
	 * We just spin and hope that it is on a processor.
	 */
	enter_critical(self);
	/*
	 * A return of -1 from spinners_incr() means we do not spin --
	 * presumably the max_spinners limit has been reached
	 * (TODO confirm against spinners_incr()).
	 */
	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1) {
		exit_critical(self);
		goto done;
	}
	DTRACE_PROBE1(plockstat, mutex__spin, mp);
	for (count = 1; ; count++) {
		/* look at the lock bits first; try to set only if clear */
		if ((*lockp & LOCKMASK64) == 0 &&
		    set_lock_byte64(lockp, udp->pid) == 0) {
			mp->mutex_owner = (uintptr_t)self;
			/* mp->mutex_ownerpid was set by set_lock_byte64() */
			error = 0;
			break;
		}
		if (count == max_count)
			break;
		SMT_PAUSE();
	}
	/* we are no longer a spinner; get the resulting lock word */
	new_lockword = spinners_decr(&mp->mutex_lockword);
	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
		/*
		 * We haven't yet acquired the lock, the lock
		 * is free, and there are no other spinners.
		 * Make one final attempt to acquire the lock.
		 *
		 * This isn't strictly necessary since mutex_lock_kernel()
		 * (the next action this thread will take if it doesn't
		 * acquire the lock here) makes one attempt to acquire
		 * the lock before putting the thread to sleep.
		 *
		 * If the next action for this thread (on failure here)
		 * were not to call mutex_lock_kernel(), this would be
		 * necessary for correctness, to avoid ending up with an
		 * unheld mutex with waiters but no one to wake them up.
		 */
		if (set_lock_byte64(lockp, udp->pid) == 0) {
			mp->mutex_owner = (uintptr_t)self;
			/* mp->mutex_ownerpid was set by set_lock_byte64() */
			error = 0;
		}
		/* the final attempt is counted as one more spin */
		count++;
	}
	exit_critical(self);

done:
	/*
	 * This is the definitive LOCK_NOTRECOVERABLE test: it is made
	 * after the lock has actually been acquired.
	 */
	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		/*
		 * We shouldn't own the mutex.
		 * Just clear the lock; everyone has already been waked up.
		 */
		mp->mutex_owner = 0;
		/* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
		(void) clear_lockbyte64(&mp->mutex_lockword64);
		error = ENOTRECOVERABLE;
	}

	if (error) {
		/* mutex__spun fires only if the spin loop was entered */
		if (count) {
			DTRACE_PROBE2(plockstat, mutex__spun, 0, count);
		}
		/* EBUSY is an ordinary failure and is not reported */
		if (error != EBUSY) {
			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
		}
	} else {
		if (count) {
			DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
		}
		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
		/*
		 * The lock is held, but the caller must be told about a
		 * dead owner or an unmapped lock.  LOCK_OWNERDEAD takes
		 * precedence; LOCK_UNMAPPED is reported as ELOCKUNMAPPED
		 * only for USYNC_PROCESS_ROBUST mutexes and as EOWNERDEAD
		 * for all others.
		 */
		if (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
			ASSERT(mp->mutex_type & LOCK_ROBUST);
			if (mp->mutex_flag & LOCK_OWNERDEAD)
				error = EOWNERDEAD;
			else if (mp->mutex_type & USYNC_PROCESS_ROBUST)
				error = ELOCKUNMAPPED;
			else
				error = EOWNERDEAD;
		}
	}

	return (error);
}
1551 
1552 /*
1553  * Mutex wakeup code for releasing a USYNC_THREAD mutex.
1554  * Returns the lwpid of the thread that was dequeued, if any.
1555  * The caller of mutex_wakeup() must call __lwp_unpark(lwpid)
1556  * to wake up the specified lwp.
1557  */
1558 static lwpid_t
1559 mutex_wakeup(mutex_t *mp)
1560 {
1561 	lwpid_t lwpid = 0;
1562 	int more;
1563 	queue_head_t *qp;
1564 	ulwp_t *ulwp;
1565 
1566 	/*
1567 	 * Dequeue a waiter from the sleep queue.  Don't touch the mutex
1568 	 * waiters bit if no one was found on the queue because the mutex
1569 	 * might have been deallocated or reallocated for another purpose.
1570 	 */
1571 	qp = queue_lock(mp, MX);
1572 	if ((ulwp = dequeue(qp, &more)) != NULL) {
1573 		lwpid = ulwp->ul_lwpid;
1574 		mp->mutex_waiters = more;
1575 	}
1576 	queue_unlock(qp);
1577 	return (lwpid);
1578 }
1579 
1580 /*
1581  * Mutex wakeup code for releasing all waiters on a USYNC_THREAD mutex.
1582  */
1583 static void
1584 mutex_wakeup_all(mutex_t *mp)
1585 {
1586 	queue_head_t *qp;
1587 	queue_root_t *qrp;
1588 	int nlwpid = 0;
1589 	int maxlwps = MAXLWPS;
1590 	ulwp_t *ulwp;
1591 	lwpid_t buffer[MAXLWPS];
1592 	lwpid_t *lwpid = buffer;
1593 
1594 	/*
1595 	 * Walk the list of waiters and prepare to wake up all of them.
1596 	 * The waiters flag has already been cleared from the mutex.
1597 	 *
1598 	 * We keep track of lwpids that are to be unparked in lwpid[].
1599 	 * __lwp_unpark_all() is called to unpark all of them after
1600 	 * they have been removed from the sleep queue and the sleep
1601 	 * queue lock has been dropped.  If we run out of space in our
1602 	 * on-stack buffer, we need to allocate more but we can't call
1603 	 * lmalloc() because we are holding a queue lock when the overflow
1604 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
1605 	 * either because the application may have allocated a small
1606 	 * stack and we don't want to overrun the stack.  So we call
1607 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
1608 	 * system call directly since that path acquires no locks.
1609 	 */
1610 	qp = queue_lock(mp, MX);
1611 	for (;;) {
1612 		if ((qrp = qp->qh_root) == NULL ||
1613 		    (ulwp = qrp->qr_head) == NULL)
1614 			break;
1615 		ASSERT(ulwp->ul_wchan == mp);
1616 		queue_unlink(qp, &qrp->qr_head, NULL);
1617 		ulwp->ul_sleepq = NULL;
1618 		ulwp->ul_wchan = NULL;
1619 		if (nlwpid == maxlwps)
1620 			lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
1621 		lwpid[nlwpid++] = ulwp->ul_lwpid;
1622 	}
1623 
1624 	if (nlwpid == 0) {
1625 		queue_unlock(qp);
1626 	} else {
1627 		mp->mutex_waiters = 0;
1628 		no_preempt(curthread);
1629 		queue_unlock(qp);
1630 		if (nlwpid == 1)
1631 			(void) __lwp_unpark(lwpid[0]);
1632 		else
1633 			(void) __lwp_unpark_all(lwpid, nlwpid);
1634 		preempt(curthread);
1635 	}
1636 
1637 	if (lwpid != buffer)
1638 		(void) _private_munmap(lwpid, maxlwps * sizeof (lwpid_t));
1639 }
1640 
1641 /*
1642  * Release a process-private mutex.
1643  * As an optimization, if there are waiters but there are also spinners
1644  * attempting to acquire the mutex, then don't bother waking up a waiter;
1645  * one of the spinners will acquire the mutex soon and it would be a waste
1646  * of resources to wake up some thread just to have it spin for a while
1647  * and then possibly go back to sleep.  See mutex_trylock_adaptive().
1648  */
1649 static lwpid_t
1650 mutex_unlock_queue(mutex_t *mp, int release_all)
1651 {
1652 	lwpid_t lwpid = 0;
1653 	uint32_t old_lockword;
1654 
1655 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1656 	mp->mutex_owner = 0;
1657 	old_lockword = clear_lockbyte(&mp->mutex_lockword);
1658 	if ((old_lockword & WAITERMASK) &&
1659 	    (release_all || (old_lockword & SPINNERMASK) == 0)) {
1660 		ulwp_t *self = curthread;
1661 		no_preempt(self);	/* ensure a prompt wakeup */
1662 		if (release_all)
1663 			mutex_wakeup_all(mp);
1664 		else
1665 			lwpid = mutex_wakeup(mp);
1666 		if (lwpid == 0)
1667 			preempt(self);
1668 	}
1669 	return (lwpid);
1670 }
1671 
1672 /*
1673  * Like mutex_unlock_queue(), but for process-shared mutexes.
1674  */
1675 static void
1676 mutex_unlock_process(mutex_t *mp, int release_all)
1677 {
1678 	uint64_t old_lockword64;
1679 
1680 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1681 	mp->mutex_owner = 0;
1682 	/* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1683 	old_lockword64 = clear_lockbyte64(&mp->mutex_lockword64);
1684 	if ((old_lockword64 & WAITERMASK64) &&
1685 	    (release_all || (old_lockword64 & SPINNERMASK64) == 0)) {
1686 		ulwp_t *self = curthread;
1687 		no_preempt(self);	/* ensure a prompt wakeup */
1688 		(void) ___lwp_mutex_wakeup(mp, release_all);
1689 		preempt(self);
1690 	}
1691 }
1692 
/*
 * Block the calling thread indefinitely: loop forever calling
 * mutex_lock_kernel() on stall_mutex, ignoring its return value.
 * The loop has no exit, so this function does not return.
 */
void
stall(void)
{
	for (;;)
		(void) mutex_lock_kernel(&stall_mutex, NULL, NULL);
}
1699 
/*
 * Acquire a USYNC_THREAD mutex via user-level sleep queues.
 * We failed set_lock_byte(&mp->mutex_lockw) before coming here.
 * If successful, returns with mutex_owner set correctly.
 *
 * Parameters:
 *	self	the calling thread.
 *	msp	mutex statistics to update, or NULL for none.
 *	mp	the mutex to acquire.
 *	tsp	timeout passed through to __lwp_park().
 *
 * Return values:
 *	0		lock acquired.
 *	EOWNERDEAD	lock acquired, but LOCK_OWNERDEAD is set in
 *			mutex_flag (the lock is held on return).
 *	ENOTRECOVERABLE	LOCK_NOTRECOVERABLE is set in mutex_flag; the
 *			lock is not held on return.
 *	EINVAL, ETIME	error from __lwp_park(); lock not acquired.
 *			(These are the only other values permitted by
 *			the ASSERT below.)
 */
int
mutex_lock_queue(ulwp_t *self, tdb_mutex_stats_t *msp, mutex_t *mp,
	timespec_t *tsp)
{
	uberdata_t *udp = curthread->ul_uberdata;
	queue_head_t *qp;
	hrtime_t begin_sleep;	/* set and used only when msp != NULL */
	int error = 0;

	/* ul_sp holds the stack pointer while we wait; reset to 0 below */
	self->ul_sp = stkptr();
	if (__td_event_report(self, TD_SLEEP, udp)) {
		self->ul_wchan = mp;
		self->ul_td_evbuf.eventnum = TD_SLEEP;
		self->ul_td_evbuf.eventdata = mp;
		tdb_event(TD_SLEEP, udp);
	}
	if (msp) {
		tdb_incr(msp->mutex_sleep);
		begin_sleep = gethrtime();
	}

	DTRACE_PROBE1(plockstat, mutex__block, mp);

	/*
	 * Put ourself on the sleep queue, and while we are
	 * unable to grab the lock, go park in the kernel.
	 * Take ourself off the sleep queue after we acquire the lock.
	 * The waiter bit can be set/cleared only while holding the queue lock.
	 */
	qp = queue_lock(mp, MX);
	enqueue(qp, self, 0);
	mp->mutex_waiters = 1;
	for (;;) {
		/* try for the lock each time around, queue lock held */
		if (set_lock_byte(&mp->mutex_lockw) == 0) {
			mp->mutex_owner = (uintptr_t)self;
			mp->mutex_waiters = dequeue_self(qp);
			break;
		}
		/* parking flag is set before the queue lock is dropped */
		set_parking_flag(self, 1);
		queue_unlock(qp);
		/*
		 * __lwp_park() will return the residual time in tsp
		 * if we are unparked before the timeout expires.
		 */
		error = __lwp_park(tsp, 0);
		set_parking_flag(self, 0);
		/*
		 * We could have taken a signal or suspended ourself.
		 * If we did, then we removed ourself from the queue.
		 * Someone else may have removed us from the queue
		 * as a consequence of mutex_unlock().  We may have
		 * gotten a timeout from __lwp_park().  Or we may still
		 * be on the queue and this is just a spurious wakeup.
		 */
		qp = queue_lock(mp, MX);
		if (self->ul_sleepq == NULL) {
			/* we are no longer on the sleep queue */
			if (error) {
				/* recompute the waiters bit from the queue */
				mp->mutex_waiters = queue_waiter(qp)? 1 : 0;
				/* any error except EINTR ends the wait */
				if (error != EINTR)
					break;
				error = 0;
			}
			if (set_lock_byte(&mp->mutex_lockw) == 0) {
				mp->mutex_owner = (uintptr_t)self;
				break;
			}
			/* still locked: get back on the queue and park again */
			enqueue(qp, self, 0);
			mp->mutex_waiters = 1;
		}
		ASSERT(self->ul_sleepq == qp &&
		    self->ul_qtype == MX &&
		    self->ul_wchan == mp);
		if (error) {
			/* still queued: dequeue ourself before giving up */
			if (error != EINTR) {
				mp->mutex_waiters = dequeue_self(qp);
				break;
			}
			/* EINTR is not an error here; just try again */
			error = 0;
		}
	}
	/* every exit from the loop leaves us off the sleep queue */
	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
	    self->ul_wchan == NULL);
	self->ul_sp = 0;
	queue_unlock(qp);

	if (msp)
		msp->mutex_sleep_time += gethrtime() - begin_sleep;

	ASSERT(error == 0 || error == EINVAL || error == ETIME);

	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		/*
		 * We shouldn't own the mutex.
		 * Just clear the lock; everyone has already been waked up.
		 */
		mp->mutex_owner = 0;
		(void) clear_lockbyte(&mp->mutex_lockword);
		error = ENOTRECOVERABLE;
	}

	/* the last mutex__blocked argument is 1 on success, 0 on failure */
	if (error) {
		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
	} else {
		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
		/* the lock is held, but tell the caller the owner died */
		if (mp->mutex_flag & LOCK_OWNERDEAD) {
			ASSERT(mp->mutex_type & LOCK_ROBUST);
			error = EOWNERDEAD;
		}
	}

	return (error);
}
1820 
1821 static int
1822 mutex_recursion(mutex_t *mp, int mtype, int try)
1823 {
1824 	ASSERT(mutex_is_held(mp));
1825 	ASSERT(mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK));
1826 	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
1827 
1828 	if (mtype & LOCK_RECURSIVE) {
1829 		if (mp->mutex_rcount == RECURSION_MAX) {
1830 			DTRACE_PROBE2(plockstat, mutex__error, mp, EAGAIN);
1831 			return (EAGAIN);
1832 		}
1833 		mp->mutex_rcount++;
1834 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 1, 0);
1835 		return (0);
1836 	}
1837 	if (try == MUTEX_LOCK) {
1838 		DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1839 		return (EDEADLK);
1840 	}
1841 	return (EBUSY);
1842 }
1843 
1844 /*
1845  * Register this USYNC_PROCESS|LOCK_ROBUST mutex with the kernel so
1846  * it can apply LOCK_OWNERDEAD|LOCK_UNMAPPED if it becomes necessary.
1847  * We use tdb_hash_lock here and in the synch object tracking code in
1848  * the tdb_agent.c file.  There is no conflict between these two usages.
1849  */
1850 void
1851 register_lock(mutex_t *mp)
1852 {
1853 	uberdata_t *udp = curthread->ul_uberdata;
1854 	uint_t hash = LOCK_HASH(mp);
1855 	robust_t *rlp;
1856 	robust_t **rlpp;
1857 	robust_t **table;
1858 
1859 	if ((table = udp->robustlocks) == NULL) {
1860 		lmutex_lock(&udp->tdb_hash_lock);
1861 		if ((table = udp->robustlocks) == NULL) {
1862 			table = lmalloc(LOCKHASHSZ * sizeof (robust_t *));
1863 			_membar_producer();
1864 			udp->robustlocks = table;
1865 		}
1866 		lmutex_unlock(&udp->tdb_hash_lock);
1867 	}
1868 	_membar_consumer();
1869 
1870 	/*
1871 	 * First search the registered table with no locks held.
1872 	 * This is safe because the table never shrinks
1873 	 * and we can only get a false negative.
1874 	 */
1875 	for (rlp = table[hash]; rlp != NULL; rlp = rlp->robust_next) {
1876 		if (rlp->robust_lock == mp)	/* already registered */
1877 			return;
1878 	}
1879 
1880 	/*
1881 	 * The lock was not found.
1882 	 * Repeat the operation with tdb_hash_lock held.
1883 	 */
1884 	lmutex_lock(&udp->tdb_hash_lock);
1885 
1886 	for (rlpp = &table[hash];
1887 	    (rlp = *rlpp) != NULL;
1888 	    rlpp = &rlp->robust_next) {
1889 		if (rlp->robust_lock == mp) {	/* already registered */
1890 			lmutex_unlock(&udp->tdb_hash_lock);
1891 			return;
1892 		}
1893 	}
1894 
1895 	/*
1896 	 * The lock has never been registered.
1897 	 * Register it now and add it to the table.
1898 	 */
1899 	(void) ___lwp_mutex_register(mp);
1900 	rlp = lmalloc(sizeof (*rlp));
1901 	rlp->robust_lock = mp;
1902 	_membar_producer();
1903 	*rlpp = rlp;
1904 
1905 	lmutex_unlock(&udp->tdb_hash_lock);
1906 }
1907 
1908 /*
1909  * This is called in the child of fork()/forkall() to start over
1910  * with a clean slate.  (Each process must register its own locks.)
1911  * No locks are needed because all other threads are suspended or gone.
1912  */
1913 void
1914 unregister_locks(void)
1915 {
1916 	uberdata_t *udp = curthread->ul_uberdata;
1917 	uint_t hash;
1918 	robust_t **table;
1919 	robust_t *rlp;
1920 	robust_t *next;
1921 
1922 	if ((table = udp->robustlocks) != NULL) {
1923 		for (hash = 0; hash < LOCKHASHSZ; hash++) {
1924 			rlp = table[hash];
1925 			while (rlp != NULL) {
1926 				next = rlp->robust_next;
1927 				lfree(rlp, sizeof (*rlp));
1928 				rlp = next;
1929 			}
1930 		}
1931 		lfree(table, LOCKHASHSZ * sizeof (robust_t *));
1932 		udp->robustlocks = NULL;
1933 	}
1934 }
1935 
/*
 * General mutex acquisition path: handles every mutex type and
 * dispatches to the kernel, process-shared, or user-level sleep
 * queue implementation as appropriate.
 * Returns with mutex_owner set correctly.
 *
 * Parameters:
 *	mp	the mutex to acquire.
 *	tsp	timeout handed on to the blocking routines.
 *	try	MUTEX_TRY or MUTEX_LOCK, optionally or'ed with
 *		MUTEX_NOCEIL, which suppresses the LOCK_PRIO_PROTECT
 *		ceiling processing.
 *
 * Returns 0 on success or an error number.  EOWNERDEAD and
 * ELOCKUNMAPPED are treated like success for bookkeeping purposes
 * (see the final switch below).
 */
int
mutex_lock_internal(mutex_t *mp, timespec_t *tsp, int try)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	int mtype = mp->mutex_type;
	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
	int error = 0;
	int noceil = try & MUTEX_NOCEIL;
	uint8_t ceil;	/* set and used only for LOCK_PRIO_PROTECT */
	int myprio;	/* set and used only for LOCK_PRIO_PROTECT */

	/* strip the modifier so that 'try' is exactly TRY or LOCK */
	try &= ~MUTEX_NOCEIL;
	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);

	if (!self->ul_schedctl_called)
		(void) setup_schedctl();

	if (msp && try == MUTEX_TRY)
		tdb_incr(msp->mutex_try);

	/* relocking a recursive/error-checking mutex that we hold */
	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && mutex_is_held(mp))
		return (mutex_recursion(mp, mtype, try));

	/* with error detection on, report an untimed self-deadlock */
	if (self->ul_error_detection && try == MUTEX_LOCK &&
	    tsp == NULL && mutex_is_held(mp))
		lock_error(mp, "mutex_lock", NULL, NULL);

	if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
		update_sched(self);
		/* the thread's class must match ul_rtclassid */
		if (self->ul_cid != self->ul_rtclassid) {
			DTRACE_PROBE2(plockstat, mutex__error, mp, EPERM);
			return (EPERM);
		}
		ceil = mp->mutex_ceiling;
		/* use ul_epri when it is non-zero, otherwise ul_pri */
		myprio = self->ul_epri? self->ul_epri : self->ul_pri;
		/* a priority above the ceiling is an error */
		if (myprio > ceil) {
			DTRACE_PROBE2(plockstat, mutex__error, mp, EINVAL);
			return (EINVAL);
		}
		if ((error = _ceil_mylist_add(mp)) != 0) {
			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
			return (error);
		}
		if (myprio < ceil)
			_ceil_prio_inherit(ceil);
	}

	/* process-shared robust mutexes must be registered with the kernel */
	if ((mtype & (USYNC_PROCESS | LOCK_ROBUST))
	    == (USYNC_PROCESS | LOCK_ROBUST))
		register_lock(mp);

	if (mtype & LOCK_PRIO_INHERIT) {
		/* go straight to the kernel */
		if (try == MUTEX_TRY)
			error = mutex_trylock_kernel(mp);
		else	/* MUTEX_LOCK */
			error = mutex_lock_kernel(mp, tsp, msp);
		/*
		 * The kernel never sets or clears the lock byte
		 * for LOCK_PRIO_INHERIT mutexes.
		 * Set it here for consistency.
		 */
		switch (error) {
		case 0:
			self->ul_pilocks++;
			mp->mutex_lockw = LOCKSET;
			break;
		case EOWNERDEAD:
		case ELOCKUNMAPPED:
			/* the lock is held in these cases too */
			self->ul_pilocks++;
			mp->mutex_lockw = LOCKSET;
			/* FALLTHROUGH */
		case ENOTRECOVERABLE:
			ASSERT(mtype & LOCK_ROBUST);
			break;
		case EDEADLK:
			/*
			 * For MUTEX_LOCK, stall() loops forever and does
			 * not return; for MUTEX_TRY, report EBUSY instead.
			 */
			if (try == MUTEX_LOCK)
				stall();
			error = EBUSY;
			break;
		}
	} else if (mtype & USYNC_PROCESS) {
		/* spin only for MUTEX_LOCK; block in the kernel if still busy */
		error = mutex_trylock_process(mp, try == MUTEX_LOCK);
		if (error == EBUSY && try == MUTEX_LOCK)
			error = mutex_lock_kernel(mp, tsp, msp);
	} else {	/* USYNC_THREAD */
		/* spin only for MUTEX_LOCK; sleep on the queue if still busy */
		error = mutex_trylock_adaptive(mp, try == MUTEX_LOCK);
		if (error == EBUSY && try == MUTEX_LOCK)
			error = mutex_lock_queue(self, msp, mp, tsp);
	}

	switch (error) {
	case 0:
	case EOWNERDEAD:
	case ELOCKUNMAPPED:
		/* the lock is held: do the bookkeeping for a held lock */
		if (mtype & LOCK_ROBUST)
			remember_lock(mp);
		if (msp)
			record_begin_hold(msp);
		break;
	default:
		/* the lock is not held: undo the ceiling processing above */
		if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
			(void) _ceil_mylist_del(mp);
			if (myprio < ceil)
				_ceil_prio_waive();
		}
		/* count and report the failed trylock */
		if (try == MUTEX_TRY) {
			if (msp)
				tdb_incr(msp->mutex_try_fail);
			if (__td_event_report(self, TD_LOCK_TRY, udp)) {
				self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
				tdb_event(TD_LOCK_TRY, udp);
			}
		}
		break;
	}

	return (error);
}
2059 
2060 int
2061 fast_process_lock(mutex_t *mp, timespec_t *tsp, int mtype, int try)
2062 {
2063 	ulwp_t *self = curthread;
2064 	uberdata_t *udp = self->ul_uberdata;
2065 
2066 	/*
2067 	 * We know that USYNC_PROCESS is set in mtype and that
2068 	 * zero, one, or both of the flags LOCK_RECURSIVE and
2069 	 * LOCK_ERRORCHECK are set, and that no other flags are set.
2070 	 */
2071 	ASSERT((mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0);
2072 	enter_critical(self);
2073 	if (set_lock_byte64(&mp->mutex_lockword64, udp->pid) == 0) {
2074 		mp->mutex_owner = (uintptr_t)self;
2075 		/* mp->mutex_ownerpid was set by set_lock_byte64() */
2076 		exit_critical(self);
2077 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2078 		return (0);
2079 	}
2080 	exit_critical(self);
2081 
2082 	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && shared_mutex_held(mp))
2083 		return (mutex_recursion(mp, mtype, try));
2084 
2085 	if (try == MUTEX_LOCK) {
2086 		if (mutex_trylock_process(mp, 1) == 0)
2087 			return (0);
2088 		return (mutex_lock_kernel(mp, tsp, NULL));
2089 	}
2090 
2091 	if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2092 		self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2093 		tdb_event(TD_LOCK_TRY, udp);
2094 	}
2095 	return (EBUSY);
2096 }
2097 
2098 static int
2099 mutex_lock_impl(mutex_t *mp, timespec_t *tsp)
2100 {
2101 	ulwp_t *self = curthread;
2102 	int mtype = mp->mutex_type;
2103 	uberflags_t *gflags;
2104 
2105 	/*
2106 	 * Optimize the case of USYNC_THREAD, including
2107 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2108 	 * no error detection, no lock statistics,
2109 	 * and the process has only a single thread.
2110 	 * (Most likely a traditional single-threaded application.)
2111 	 */
2112 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2113 	    self->ul_uberdata->uberflags.uf_all) == 0) {
2114 		/*
2115 		 * Only one thread exists so we don't need an atomic operation.
2116 		 */
2117 		if (mp->mutex_lockw == 0) {
2118 			mp->mutex_lockw = LOCKSET;
2119 			mp->mutex_owner = (uintptr_t)self;
2120 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2121 			return (0);
2122 		}
2123 		if (mtype && MUTEX_OWNER(mp) == self)
2124 			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2125 		/*
2126 		 * We have reached a deadlock, probably because the
2127 		 * process is executing non-async-signal-safe code in
2128 		 * a signal handler and is attempting to acquire a lock
2129 		 * that it already owns.  This is not surprising, given
2130 		 * bad programming practices over the years that has
2131 		 * resulted in applications calling printf() and such
2132 		 * in their signal handlers.  Unless the user has told
2133 		 * us that the signal handlers are safe by setting:
2134 		 *	export _THREAD_ASYNC_SAFE=1
2135 		 * we return EDEADLK rather than actually deadlocking.
2136 		 */
2137 		if (tsp == NULL &&
2138 		    MUTEX_OWNER(mp) == self && !self->ul_async_safe) {
2139 			DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
2140 			return (EDEADLK);
2141 		}
2142 	}
2143 
2144 	/*
2145 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2146 	 * no error detection, and no lock statistics.
2147 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2148 	 */
2149 	if ((gflags = self->ul_schedctl_called) != NULL &&
2150 	    (gflags->uf_trs_ted |
2151 	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2152 		if (mtype & USYNC_PROCESS)
2153 			return (fast_process_lock(mp, tsp, mtype, MUTEX_LOCK));
2154 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2155 			mp->mutex_owner = (uintptr_t)self;
2156 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2157 			return (0);
2158 		}
2159 		if (mtype && MUTEX_OWNER(mp) == self)
2160 			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2161 		if (mutex_trylock_adaptive(mp, 1) != 0)
2162 			return (mutex_lock_queue(self, NULL, mp, tsp));
2163 		return (0);
2164 	}
2165 
2166 	/* else do it the long way */
2167 	return (mutex_lock_internal(mp, tsp, MUTEX_LOCK));
2168 }
2169 
2170 /*
2171  * Of the following function names (all the same function, of course),
2172  * only _private_mutex_lock() is not exported from libc.  This means
2173  * that calling _private_mutex_lock() within libc will not invoke the
2174  * dynamic linker.  This is critical for any code called in the child
2175  * of vfork() (via posix_spawn()) because invoking the dynamic linker
2176  * in such a case would corrupt the parent's address space.  There are
2177  * other places in libc where avoiding the dynamic linker is necessary.
2178  * Of course, _private_mutex_lock() can be called in cases not requiring
2179  * the avoidance of the dynamic linker too, and often is.
2180  */
2181 #pragma weak _private_mutex_lock = __mutex_lock
2182 #pragma weak mutex_lock = __mutex_lock
2183 #pragma weak _mutex_lock = __mutex_lock
2184 #pragma weak pthread_mutex_lock = __mutex_lock
2185 #pragma weak _pthread_mutex_lock = __mutex_lock
2186 int
2187 __mutex_lock(mutex_t *mp)
2188 {
2189 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2190 	return (mutex_lock_impl(mp, NULL));
2191 }
2192 
2193 #pragma weak pthread_mutex_timedlock = _pthread_mutex_timedlock
2194 int
2195 _pthread_mutex_timedlock(mutex_t *mp, const timespec_t *abstime)
2196 {
2197 	timespec_t tslocal;
2198 	int error;
2199 
2200 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2201 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
2202 	error = mutex_lock_impl(mp, &tslocal);
2203 	if (error == ETIME)
2204 		error = ETIMEDOUT;
2205 	return (error);
2206 }
2207 
2208 #pragma weak pthread_mutex_reltimedlock_np = _pthread_mutex_reltimedlock_np
2209 int
2210 _pthread_mutex_reltimedlock_np(mutex_t *mp, const timespec_t *reltime)
2211 {
2212 	timespec_t tslocal;
2213 	int error;
2214 
2215 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2216 	tslocal = *reltime;
2217 	error = mutex_lock_impl(mp, &tslocal);
2218 	if (error == ETIME)
2219 		error = ETIMEDOUT;
2220 	return (error);
2221 }
2222 
2223 #pragma weak _private_mutex_trylock = __mutex_trylock
2224 #pragma weak mutex_trylock = __mutex_trylock
2225 #pragma weak _mutex_trylock = __mutex_trylock
2226 #pragma weak pthread_mutex_trylock = __mutex_trylock
2227 #pragma weak _pthread_mutex_trylock = __mutex_trylock
2228 int
2229 __mutex_trylock(mutex_t *mp)
2230 {
2231 	ulwp_t *self = curthread;
2232 	uberdata_t *udp = self->ul_uberdata;
2233 	int mtype = mp->mutex_type;
2234 	uberflags_t *gflags;
2235 
2236 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2237 
2238 	/*
2239 	 * Optimize the case of USYNC_THREAD, including
2240 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2241 	 * no error detection, no lock statistics,
2242 	 * and the process has only a single thread.
2243 	 * (Most likely a traditional single-threaded application.)
2244 	 */
2245 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2246 	    udp->uberflags.uf_all) == 0) {
2247 		/*
2248 		 * Only one thread exists so we don't need an atomic operation.
2249 		 */
2250 		if (mp->mutex_lockw == 0) {
2251 			mp->mutex_lockw = LOCKSET;
2252 			mp->mutex_owner = (uintptr_t)self;
2253 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2254 			return (0);
2255 		}
2256 		if (mtype && MUTEX_OWNER(mp) == self)
2257 			return (mutex_recursion(mp, mtype, MUTEX_TRY));
2258 		return (EBUSY);
2259 	}
2260 
2261 	/*
2262 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2263 	 * no error detection, and no lock statistics.
2264 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2265 	 */
2266 	if ((gflags = self->ul_schedctl_called) != NULL &&
2267 	    (gflags->uf_trs_ted |
2268 	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2269 		if (mtype & USYNC_PROCESS)
2270 			return (fast_process_lock(mp, NULL, mtype, MUTEX_TRY));
2271 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2272 			mp->mutex_owner = (uintptr_t)self;
2273 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2274 			return (0);
2275 		}
2276 		if (mtype && MUTEX_OWNER(mp) == self)
2277 			return (mutex_recursion(mp, mtype, MUTEX_TRY));
2278 		if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2279 			self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2280 			tdb_event(TD_LOCK_TRY, udp);
2281 		}
2282 		return (EBUSY);
2283 	}
2284 
2285 	/* else do it the long way */
2286 	return (mutex_lock_internal(mp, NULL, MUTEX_TRY));
2287 }
2288 
/*
 * General-purpose ("long way") mutex unlock.
 *
 * mp			the mutex to release.
 * retain_robust_flags	when zero, a LOCK_OWNERDEAD or LOCK_UNMAPPED
 *			mutex that is not LOCK_PRIO_INHERIT is marked
 *			LOCK_NOTRECOVERABLE as it is released; when
 *			non-zero those flags are left untouched.
 *
 * Returns EPERM for a LOCK_ERRORCHECK mutex that the caller does not
 * hold, the result of ___lwp_mutex_unlock() for a LOCK_PRIO_INHERIT
 * mutex, and 0 otherwise.
 */
int
mutex_unlock_internal(mutex_t *mp, int retain_robust_flags)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	int mtype = mp->mutex_type;
	tdb_mutex_stats_t *msp;
	int error = 0;
	int release_all;
	lwpid_t lwpid;

	if ((mtype & LOCK_ERRORCHECK) && !mutex_is_held(mp))
		return (EPERM);

	if (self->ul_error_detection && !mutex_is_held(mp))
		lock_error(mp, "mutex_unlock", NULL, NULL);

	/* A nested hold of a recursive mutex just drops one level. */
	if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
		mp->mutex_rcount--;
		DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
		return (0);
	}

	if ((msp = MUTEX_STATS(mp, udp)) != NULL)
		(void) record_hold_time(msp);

	/*
	 * Releasing a mutex that is still flagged LOCK_OWNERDEAD or
	 * LOCK_UNMAPPED turns it into a LOCK_NOTRECOVERABLE mutex,
	 * unless the caller asked us to retain the robust flags or
	 * the mutex is LOCK_PRIO_INHERIT.
	 */
	if (!retain_robust_flags && !(mtype & LOCK_PRIO_INHERIT) &&
	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
		ASSERT(mp->mutex_type & LOCK_ROBUST);
		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
	}
	/* release_all is passed on to the process/queue unlock routines */
	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);

	if (mtype & LOCK_PRIO_INHERIT) {
		/*
		 * The owner field and the lock byte are cleared here,
		 * then ___lwp_mutex_unlock() is called, all with
		 * preemption disabled.
		 */
		no_preempt(self);
		mp->mutex_owner = 0;
		/* mp->mutex_ownerpid is cleared by ___lwp_mutex_unlock() */
		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
		mp->mutex_lockw = LOCKCLEAR;
		self->ul_pilocks--;
		error = ___lwp_mutex_unlock(mp);
		preempt(self);
	} else if (mtype & USYNC_PROCESS) {
		mutex_unlock_process(mp, release_all);
	} else {	/* USYNC_THREAD */
		/*
		 * A non-zero return from mutex_unlock_queue() is the
		 * id of an lwp that we must unpark.
		 */
		if ((lwpid = mutex_unlock_queue(mp, release_all)) != 0) {
			(void) __lwp_unpark(lwpid);
			preempt(self);
		}
	}

	/* Robust mutexes are tracked in our held-locks list; drop this one. */
	if (mtype & LOCK_ROBUST)
		forget_lock(mp);

	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
		_ceil_prio_waive();

	return (error);
}
2349 
/*
 * Release 'mp'.
 *
 * The common configurations are handled inline by the fast paths
 * below; everything else is passed to mutex_unlock_internal().
 * The fast paths return EPERM when a LOCK_ERRORCHECK mutex is not
 * held by the caller and 0 otherwise.
 */
#pragma weak _private_mutex_unlock = __mutex_unlock
#pragma weak mutex_unlock = __mutex_unlock
#pragma weak _mutex_unlock = __mutex_unlock
#pragma weak pthread_mutex_unlock = __mutex_unlock
#pragma weak _pthread_mutex_unlock = __mutex_unlock
int
__mutex_unlock(mutex_t *mp)
{
	ulwp_t *self = curthread;
	int mtype = mp->mutex_type;
	uberflags_t *gflags;
	lwpid_t lwpid;
	short el;

	/*
	 * Optimize the case of USYNC_THREAD, including
	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
	 * no error detection, no lock statistics,
	 * and the process has only a single thread.
	 * (Most likely a traditional single-threaded application.)
	 */
	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
	    self->ul_uberdata->uberflags.uf_all) == 0) {
		if (mtype) {
			/*
			 * At this point we know that one or both of the
			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
			 */
			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
				return (EPERM);
			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
				mp->mutex_rcount--;
				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
				return (0);
			}
		}
		/*
		 * Only one thread exists so we don't need an atomic operation.
		 * Also, there can be no waiters.
		 */
		mp->mutex_owner = 0;
		mp->mutex_lockword = 0;
		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
		return (0);
	}

	/*
	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
	 * no error detection, and no lock statistics.
	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
	 */
	if ((gflags = self->ul_schedctl_called) != NULL) {
		/* 'el' caches uf_trs_ted for the tests that follow */
		if (((el = gflags->uf_trs_ted) | mtype) == 0) {
			/*
			 * Plain USYNC_THREAD mutex.  This label is also the
			 * target of the goto in the recursive/error-checking
			 * USYNC_THREAD case below.
			 */
fast_unlock:
			if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
				(void) __lwp_unpark(lwpid);
				preempt(self);
			}
			return (0);
		}
		if (el)		/* error detection or lock statistics */
			goto slow_unlock;
		if ((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
			/*
			 * At this point we know that one or both of the
			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
			 */
			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
				return (EPERM);
			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
				mp->mutex_rcount--;
				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
				return (0);
			}
			goto fast_unlock;
		}
		if ((mtype &
		    ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
			/*
			 * At this point we know that zero, one, or both of the
			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set and
			 * that the USYNC_PROCESS flag is set.
			 */
			if ((mtype & LOCK_ERRORCHECK) && !shared_mutex_held(mp))
				return (EPERM);
			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
				mp->mutex_rcount--;
				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
				return (0);
			}
			mutex_unlock_process(mp, 0);
			return (0);
		}
	}

	/* else do it the long way */
slow_unlock:
	/* the robust flags are not retained on an ordinary unlock */
	return (mutex_unlock_internal(mp, 0));
}
2449 
2450 /*
2451  * Internally to the library, almost all mutex lock/unlock actions
2452  * go through these lmutex_ functions, to protect critical regions.
2453  * We replicate a bit of code from __mutex_lock() and __mutex_unlock()
2454  * to make these functions faster since we know that the mutex type
2455  * of all internal locks is USYNC_THREAD.  We also know that internal
2456  * locking can never fail, so we panic if it does.
2457  */
2458 void
2459 lmutex_lock(mutex_t *mp)
2460 {
2461 	ulwp_t *self = curthread;
2462 	uberdata_t *udp = self->ul_uberdata;
2463 
2464 	ASSERT(mp->mutex_type == USYNC_THREAD);
2465 
2466 	enter_critical(self);
2467 	/*
2468 	 * Optimize the case of no lock statistics and only a single thread.
2469 	 * (Most likely a traditional single-threaded application.)
2470 	 */
2471 	if (udp->uberflags.uf_all == 0) {
2472 		/*
2473 		 * Only one thread exists; the mutex must be free.
2474 		 */
2475 		ASSERT(mp->mutex_lockw == 0);
2476 		mp->mutex_lockw = LOCKSET;
2477 		mp->mutex_owner = (uintptr_t)self;
2478 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2479 	} else {
2480 		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2481 
2482 		if (!self->ul_schedctl_called)
2483 			(void) setup_schedctl();
2484 
2485 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2486 			mp->mutex_owner = (uintptr_t)self;
2487 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2488 		} else if (mutex_trylock_adaptive(mp, 1) != 0) {
2489 			(void) mutex_lock_queue(self, msp, mp, NULL);
2490 		}
2491 
2492 		if (msp)
2493 			record_begin_hold(msp);
2494 	}
2495 }
2496 
2497 void
2498 lmutex_unlock(mutex_t *mp)
2499 {
2500 	ulwp_t *self = curthread;
2501 	uberdata_t *udp = self->ul_uberdata;
2502 
2503 	ASSERT(mp->mutex_type == USYNC_THREAD);
2504 
2505 	/*
2506 	 * Optimize the case of no lock statistics and only a single thread.
2507 	 * (Most likely a traditional single-threaded application.)
2508 	 */
2509 	if (udp->uberflags.uf_all == 0) {
2510 		/*
2511 		 * Only one thread exists so there can be no waiters.
2512 		 */
2513 		mp->mutex_owner = 0;
2514 		mp->mutex_lockword = 0;
2515 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2516 	} else {
2517 		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2518 		lwpid_t lwpid;
2519 
2520 		if (msp)
2521 			(void) record_hold_time(msp);
2522 		if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2523 			(void) __lwp_unpark(lwpid);
2524 			preempt(self);
2525 		}
2526 	}
2527 	exit_critical(self);
2528 }
2529 
2530 /*
2531  * For specialized code in libc, like the asynchronous i/o code,
2532  * the following sig_*() locking primitives are used in order
2533  * to make the code asynchronous signal safe.  Signals are
2534  * deferred while locks acquired by these functions are held.
2535  */
2536 void
2537 sig_mutex_lock(mutex_t *mp)
2538 {
2539 	sigoff(curthread);
2540 	(void) _private_mutex_lock(mp);
2541 }
2542 
2543 void
2544 sig_mutex_unlock(mutex_t *mp)
2545 {
2546 	(void) _private_mutex_unlock(mp);
2547 	sigon(curthread);
2548 }
2549 
2550 int
2551 sig_mutex_trylock(mutex_t *mp)
2552 {
2553 	int error;
2554 
2555 	sigoff(curthread);
2556 	if ((error = _private_mutex_trylock(mp)) != 0)
2557 		sigon(curthread);
2558 	return (error);
2559 }
2560 
2561 /*
2562  * sig_cond_wait() is a cancellation point.
2563  */
2564 int
2565 sig_cond_wait(cond_t *cv, mutex_t *mp)
2566 {
2567 	int error;
2568 
2569 	ASSERT(curthread->ul_sigdefer != 0);
2570 	_private_testcancel();
2571 	error = __cond_wait(cv, mp);
2572 	if (error == EINTR && curthread->ul_cursig) {
2573 		sig_mutex_unlock(mp);
2574 		/* take the deferred signal here */
2575 		sig_mutex_lock(mp);
2576 	}
2577 	_private_testcancel();
2578 	return (error);
2579 }
2580 
2581 /*
2582  * sig_cond_reltimedwait() is a cancellation point.
2583  */
2584 int
2585 sig_cond_reltimedwait(cond_t *cv, mutex_t *mp, const timespec_t *ts)
2586 {
2587 	int error;
2588 
2589 	ASSERT(curthread->ul_sigdefer != 0);
2590 	_private_testcancel();
2591 	error = __cond_reltimedwait(cv, mp, ts);
2592 	if (error == EINTR && curthread->ul_cursig) {
2593 		sig_mutex_unlock(mp);
2594 		/* take the deferred signal here */
2595 		sig_mutex_lock(mp);
2596 	}
2597 	_private_testcancel();
2598 	return (error);
2599 }
2600 
2601 /*
 * For specialized code in libc, like the stdio code,
2603  * the following cancel_safe_*() locking primitives are used in
2604  * order to make the code cancellation-safe.  Cancellation is
2605  * deferred while locks acquired by these functions are held.
2606  */
2607 void
2608 cancel_safe_mutex_lock(mutex_t *mp)
2609 {
2610 	(void) _private_mutex_lock(mp);
2611 	curthread->ul_libc_locks++;
2612 }
2613 
2614 int
2615 cancel_safe_mutex_trylock(mutex_t *mp)
2616 {
2617 	int error;
2618 
2619 	if ((error = _private_mutex_trylock(mp)) == 0)
2620 		curthread->ul_libc_locks++;
2621 	return (error);
2622 }
2623 
2624 void
2625 cancel_safe_mutex_unlock(mutex_t *mp)
2626 {
2627 	ulwp_t *self = curthread;
2628 
2629 	ASSERT(self->ul_libc_locks != 0);
2630 
2631 	(void) _private_mutex_unlock(mp);
2632 
2633 	/*
2634 	 * Decrement the count of locks held by cancel_safe_mutex_lock().
2635 	 * If we are then in a position to terminate cleanly and
2636 	 * if there is a pending cancellation and cancellation
2637 	 * is not disabled and we received EINTR from a recent
2638 	 * system call then perform the cancellation action now.
2639 	 */
2640 	if (--self->ul_libc_locks == 0 &&
2641 	    !(self->ul_vfork | self->ul_nocancel |
2642 	    self->ul_critical | self->ul_sigdefer) &&
2643 	    cancel_active())
2644 		_pthread_exit(PTHREAD_CANCELED);
2645 }
2646 
2647 static int
2648 shared_mutex_held(mutex_t *mparg)
2649 {
2650 	/*
2651 	 * The 'volatile' is necessary to make sure the compiler doesn't
2652 	 * reorder the tests of the various components of the mutex.
2653 	 * They must be tested in this order:
2654 	 *	mutex_lockw
2655 	 *	mutex_owner
2656 	 *	mutex_ownerpid
2657 	 * This relies on the fact that everywhere mutex_lockw is cleared,
2658 	 * mutex_owner and mutex_ownerpid are cleared before mutex_lockw
2659 	 * is cleared, and that everywhere mutex_lockw is set, mutex_owner
2660 	 * and mutex_ownerpid are set after mutex_lockw is set, and that
2661 	 * mutex_lockw is set or cleared with a memory barrier.
2662 	 */
2663 	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2664 	ulwp_t *self = curthread;
2665 	uberdata_t *udp = self->ul_uberdata;
2666 
2667 	return (MUTEX_OWNED(mp, self) && mp->mutex_ownerpid == udp->pid);
2668 }
2669 
2670 /*
2671  * Some crufty old programs define their own version of _mutex_held()
2672  * to be simply return(1).  This breaks internal libc logic, so we
2673  * define a private version for exclusive use by libc, mutex_is_held(),
2674  * and also a new public function, __mutex_held(), to be used in new
2675  * code to circumvent these crufty old programs.
2676  */
2677 #pragma weak mutex_held = mutex_is_held
2678 #pragma weak _mutex_held = mutex_is_held
2679 #pragma weak __mutex_held = mutex_is_held
2680 int
2681 mutex_is_held(mutex_t *mparg)
2682 {
2683 	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2684 
2685 	if (mparg->mutex_type & USYNC_PROCESS)
2686 		return (shared_mutex_held(mparg));
2687 	return (MUTEX_OWNED(mp, curthread));
2688 }
2689 
2690 #pragma weak _private_mutex_destroy = __mutex_destroy
2691 #pragma weak mutex_destroy = __mutex_destroy
2692 #pragma weak _mutex_destroy = __mutex_destroy
2693 #pragma weak pthread_mutex_destroy = __mutex_destroy
2694 #pragma weak _pthread_mutex_destroy = __mutex_destroy
2695 int
2696 __mutex_destroy(mutex_t *mp)
2697 {
2698 	if (mp->mutex_type & USYNC_PROCESS)
2699 		forget_lock(mp);
2700 	(void) _memset(mp, 0, sizeof (*mp));
2701 	tdb_sync_obj_deregister(mp);
2702 	return (0);
2703 }
2704 
2705 #pragma weak mutex_consistent = __mutex_consistent
2706 #pragma weak _mutex_consistent = __mutex_consistent
2707 #pragma weak pthread_mutex_consistent_np = __mutex_consistent
2708 #pragma weak _pthread_mutex_consistent_np = __mutex_consistent
2709 int
2710 __mutex_consistent(mutex_t *mp)
2711 {
2712 	/*
2713 	 * Do this only for an inconsistent, initialized robust lock
2714 	 * that we hold.  For all other cases, return EINVAL.
2715 	 */
2716 	if (mutex_is_held(mp) &&
2717 	    (mp->mutex_type & LOCK_ROBUST) &&
2718 	    (mp->mutex_flag & LOCK_INITED) &&
2719 	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2720 		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2721 		mp->mutex_rcount = 0;
2722 		return (0);
2723 	}
2724 	return (EINVAL);
2725 }
2726 
2727 /*
2728  * Spin locks are separate from ordinary mutexes,
2729  * but we use the same data structure for them.
2730  */
2731 
2732 #pragma weak pthread_spin_init = _pthread_spin_init
2733 int
2734 _pthread_spin_init(pthread_spinlock_t *lock, int pshared)
2735 {
2736 	mutex_t *mp = (mutex_t *)lock;
2737 
2738 	(void) _memset(mp, 0, sizeof (*mp));
2739 	if (pshared == PTHREAD_PROCESS_SHARED)
2740 		mp->mutex_type = USYNC_PROCESS;
2741 	else
2742 		mp->mutex_type = USYNC_THREAD;
2743 	mp->mutex_flag = LOCK_INITED;
2744 	mp->mutex_magic = MUTEX_MAGIC;
2745 	return (0);
2746 }
2747 
2748 #pragma weak pthread_spin_destroy = _pthread_spin_destroy
2749 int
2750 _pthread_spin_destroy(pthread_spinlock_t *lock)
2751 {
2752 	(void) _memset(lock, 0, sizeof (*lock));
2753 	return (0);
2754 }
2755 
2756 #pragma weak pthread_spin_trylock = _pthread_spin_trylock
2757 int
2758 _pthread_spin_trylock(pthread_spinlock_t *lock)
2759 {
2760 	mutex_t *mp = (mutex_t *)lock;
2761 	ulwp_t *self = curthread;
2762 	int error = 0;
2763 
2764 	no_preempt(self);
2765 	if (set_lock_byte(&mp->mutex_lockw) != 0)
2766 		error = EBUSY;
2767 	else {
2768 		mp->mutex_owner = (uintptr_t)self;
2769 		if (mp->mutex_type == USYNC_PROCESS)
2770 			mp->mutex_ownerpid = self->ul_uberdata->pid;
2771 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2772 	}
2773 	preempt(self);
2774 	return (error);
2775 }
2776 
2777 #pragma weak pthread_spin_lock = _pthread_spin_lock
2778 int
2779 _pthread_spin_lock(pthread_spinlock_t *lock)
2780 {
2781 	mutex_t *mp = (mutex_t *)lock;
2782 	ulwp_t *self = curthread;
2783 	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
2784 	int count = 0;
2785 
2786 	ASSERT(!self->ul_critical || self->ul_bindflags);
2787 
2788 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
2789 
2790 	/*
2791 	 * We don't care whether the owner is running on a processor.
2792 	 * We just spin because that's what this interface requires.
2793 	 */
2794 	for (;;) {
2795 		if (*lockp == 0) {	/* lock byte appears to be clear */
2796 			no_preempt(self);
2797 			if (set_lock_byte(lockp) == 0)
2798 				break;
2799 			preempt(self);
2800 		}
2801 		if (count < INT_MAX)
2802 			count++;
2803 		SMT_PAUSE();
2804 	}
2805 	mp->mutex_owner = (uintptr_t)self;
2806 	if (mp->mutex_type == USYNC_PROCESS)
2807 		mp->mutex_ownerpid = self->ul_uberdata->pid;
2808 	preempt(self);
2809 	if (count) {
2810 		DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
2811 	}
2812 	DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
2813 	return (0);
2814 }
2815 
/*
 * Release a spin lock.  Always returns 0; no check is made that the
 * caller actually owns the lock.
 */
#pragma weak pthread_spin_unlock = _pthread_spin_unlock
int
_pthread_spin_unlock(pthread_spinlock_t *lock)
{
	mutex_t *mp = (mutex_t *)lock;
	ulwp_t *self = curthread;

	no_preempt(self);
	/*
	 * The owner fields are cleared before the lock word is
	 * released; shared_mutex_held() relies on this ordering.
	 */
	mp->mutex_owner = 0;
	mp->mutex_ownerpid = 0;
	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
	/* the whole lock word is cleared with an atomic swap */
	(void) atomic_swap_32(&mp->mutex_lockword, 0);
	preempt(self);
	return (0);
}
2831 
2832 #define	INITIAL_LOCKS	8	/* initial size of ul_heldlocks.array */
2833 
2834 /*
2835  * Find/allocate an entry for 'lock' in our array of held locks.
2836  */
2837 static mutex_t **
2838 find_lock_entry(mutex_t *lock)
2839 {
2840 	ulwp_t *self = curthread;
2841 	mutex_t **remembered = NULL;
2842 	mutex_t **lockptr;
2843 	uint_t nlocks;
2844 
2845 	if ((nlocks = self->ul_heldlockcnt) != 0)
2846 		lockptr = self->ul_heldlocks.array;
2847 	else {
2848 		nlocks = 1;
2849 		lockptr = &self->ul_heldlocks.single;
2850 	}
2851 
2852 	for (; nlocks; nlocks--, lockptr++) {
2853 		if (*lockptr == lock)
2854 			return (lockptr);
2855 		if (*lockptr == NULL && remembered == NULL)
2856 			remembered = lockptr;
2857 	}
2858 	if (remembered != NULL) {
2859 		*remembered = lock;
2860 		return (remembered);
2861 	}
2862 
2863 	/*
2864 	 * No entry available.  Allocate more space, converting
2865 	 * the single entry into an array of entries if necessary.
2866 	 */
2867 	if ((nlocks = self->ul_heldlockcnt) == 0) {
2868 		/*
2869 		 * Initial allocation of the array.
2870 		 * Convert the single entry into an array.
2871 		 */
2872 		self->ul_heldlockcnt = nlocks = INITIAL_LOCKS;
2873 		lockptr = lmalloc(nlocks * sizeof (mutex_t *));
2874 		/*
2875 		 * The single entry becomes the first entry in the array.
2876 		 */
2877 		*lockptr = self->ul_heldlocks.single;
2878 		self->ul_heldlocks.array = lockptr;
2879 		/*
2880 		 * Return the next available entry in the array.
2881 		 */
2882 		*++lockptr = lock;
2883 		return (lockptr);
2884 	}
2885 	/*
2886 	 * Reallocate the array, double the size each time.
2887 	 */
2888 	lockptr = lmalloc(nlocks * 2 * sizeof (mutex_t *));
2889 	(void) _memcpy(lockptr, self->ul_heldlocks.array,
2890 	    nlocks * sizeof (mutex_t *));
2891 	lfree(self->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
2892 	self->ul_heldlocks.array = lockptr;
2893 	self->ul_heldlockcnt *= 2;
2894 	/*
2895 	 * Return the next available entry in the newly allocated array.
2896 	 */
2897 	*(lockptr += nlocks) = lock;
2898 	return (lockptr);
2899 }
2900 
/*
 * Insert 'lock' into our list of held locks.
 * Currently only used for LOCK_ROBUST mutexes.
 *
 * find_lock_entry() does all the work: it records 'lock' if it is
 * not already present.  The slot pointer it returns is not needed.
 */
void
remember_lock(mutex_t *lock)
{
	(void) find_lock_entry(lock);
}
2910 
2911 /*
2912  * Remove 'lock' from our list of held locks.
2913  * Currently only used for LOCK_ROBUST mutexes.
2914  */
2915 void
2916 forget_lock(mutex_t *lock)
2917 {
2918 	*find_lock_entry(lock) = NULL;
2919 }
2920 
2921 /*
2922  * Free the array of held locks.
2923  */
2924 void
2925 heldlock_free(ulwp_t *ulwp)
2926 {
2927 	uint_t nlocks;
2928 
2929 	if ((nlocks = ulwp->ul_heldlockcnt) != 0)
2930 		lfree(ulwp->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
2931 	ulwp->ul_heldlockcnt = 0;
2932 	ulwp->ul_heldlocks.array = NULL;
2933 }
2934 
2935 /*
2936  * Mark all held LOCK_ROBUST mutexes LOCK_OWNERDEAD.
2937  * Called from _thrp_exit() to deal with abandoned locks.
2938  */
2939 void
2940 heldlock_exit(void)
2941 {
2942 	ulwp_t *self = curthread;
2943 	mutex_t **lockptr;
2944 	uint_t nlocks;
2945 	mutex_t *mp;
2946 
2947 	if ((nlocks = self->ul_heldlockcnt) != 0)
2948 		lockptr = self->ul_heldlocks.array;
2949 	else {
2950 		nlocks = 1;
2951 		lockptr = &self->ul_heldlocks.single;
2952 	}
2953 
2954 	for (; nlocks; nlocks--, lockptr++) {
2955 		/*
2956 		 * The kernel takes care of transitioning held
2957 		 * LOCK_PRIO_INHERIT mutexes to LOCK_OWNERDEAD.
2958 		 * We avoid that case here.
2959 		 */
2960 		if ((mp = *lockptr) != NULL &&
2961 		    mutex_is_held(mp) &&
2962 		    (mp->mutex_type & (LOCK_ROBUST | LOCK_PRIO_INHERIT)) ==
2963 		    LOCK_ROBUST) {
2964 			mp->mutex_rcount = 0;
2965 			if (!(mp->mutex_flag & LOCK_UNMAPPED))
2966 				mp->mutex_flag |= LOCK_OWNERDEAD;
2967 			(void) mutex_unlock_internal(mp, 1);
2968 		}
2969 	}
2970 
2971 	heldlock_free(self);
2972 }
2973 
2974 #pragma weak cond_init = _cond_init
2975 /* ARGSUSED2 */
2976 int
2977 _cond_init(cond_t *cvp, int type, void *arg)
2978 {
2979 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
2980 		return (EINVAL);
2981 	(void) _memset(cvp, 0, sizeof (*cvp));
2982 	cvp->cond_type = (uint16_t)type;
2983 	cvp->cond_magic = COND_MAGIC;
2984 	return (0);
2985 }
2986 
2987 /*
2988  * cond_sleep_queue(): utility function for cond_wait_queue().
2989  *
2990  * Go to sleep on a condvar sleep queue, expect to be waked up
2991  * by someone calling cond_signal() or cond_broadcast() or due
2992  * to receiving a UNIX signal or being cancelled, or just simply
2993  * due to a spurious wakeup (like someome calling forkall()).
2994  *
2995  * The associated mutex is *not* reacquired before returning.
2996  * That must be done by the caller of cond_sleep_queue().
2997  */
/*
 * cvp	the condition variable to sleep on.
 * mp	the associated mutex; it is released here.
 * tsp	timeout handed to __lwp_park(), or NULL for an untimed wait.
 *
 * Returns the error value of the final __lwp_park() call.
 */
static int
cond_sleep_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
{
	ulwp_t *self = curthread;
	queue_head_t *qp;
	queue_head_t *mqp;
	lwpid_t lwpid;
	int signalled;
	int error;
	int cv_wake;
	int release_all;

	/*
	 * Put ourself on the CV sleep queue, unlock the mutex, then
	 * park ourself and unpark a candidate lwp to grab the mutex.
	 * We must go onto the CV sleep queue before dropping the
	 * mutex in order to guarantee atomicity of the operation.
	 */
	self->ul_sp = stkptr();
	qp = queue_lock(cvp, CV);
	enqueue(qp, self, 0);
	cvp->cond_waiters_user = 1;
	self->ul_cvmutex = mp;
	/* cv_wake is set only for a timed wait */
	self->ul_cv_wake = cv_wake = (tsp != NULL);
	self->ul_signalled = 0;
	/*
	 * Dropping a mutex that is still marked LOCK_OWNERDEAD
	 * makes it LOCK_NOTRECOVERABLE.
	 */
	if (mp->mutex_flag & LOCK_OWNERDEAD) {
		mp->mutex_flag &= ~LOCK_OWNERDEAD;
		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
	}
	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
	lwpid = mutex_unlock_queue(mp, release_all);
	for (;;) {
		set_parking_flag(self, 1);
		queue_unlock(qp);
		if (lwpid != 0) {
			lwpid = preempt_unpark(self, lwpid);
			preempt(self);
		}
		/*
		 * We may have a deferred signal present,
		 * in which case we should return EINTR.
		 * Also, we may have received a SIGCANCEL; if so
		 * and we are cancelable we should return EINTR.
		 * We force an immediate EINTR return from
		 * __lwp_park() by turning our parking flag off.
		 */
		if (self->ul_cursig != 0 ||
		    (self->ul_cancelable && self->ul_cancel_pending))
			set_parking_flag(self, 0);
		/*
		 * __lwp_park() will return the residual time in tsp
		 * if we are unparked before the timeout expires.
		 */
		error = __lwp_park(tsp, lwpid);
		set_parking_flag(self, 0);
		lwpid = 0;	/* unpark the other lwp only once */
		/*
		 * We were waked up by cond_signal(), cond_broadcast(),
		 * by an interrupt or timeout (EINTR or ETIME),
		 * or we may just have gotten a spurious wakeup.
		 */
		qp = queue_lock(cvp, CV);
		/*
		 * For an untimed wait (!cv_wake) the mutex sleep queue
		 * is locked as well, because we may be found on it
		 * below.  mqp is set and used only in that case.
		 */
		if (!cv_wake)
			mqp = queue_lock(mp, MX);
		/* no longer on any sleep queue: the wait is over */
		if (self->ul_sleepq == NULL)
			break;
		/*
		 * We are on either the condvar sleep queue or the
		 * mutex sleep queue.  Break out of the sleep if we
		 * were interrupted or we timed out (EINTR or ETIME).
		 * Else this is a spurious wakeup; continue the loop.
		 */
		if (!cv_wake && self->ul_sleepq == mqp) { /* mutex queue */
			if (error) {
				mp->mutex_waiters = dequeue_self(mqp);
				break;
			}
			tsp = NULL;	/* no more timeout */
		} else if (self->ul_sleepq == qp) {	/* condvar queue */
			if (error) {
				cvp->cond_waiters_user = dequeue_self(qp);
				break;
			}
			/*
			 * Else a spurious wakeup on the condvar queue.
			 * __lwp_park() has already adjusted the timeout.
			 */
		} else {
			thr_panic("cond_sleep_queue(): thread not on queue");
		}
		/*
		 * Going around again: drop the mutex queue lock here;
		 * the condvar queue lock is dropped at the top of the loop.
		 */
		if (!cv_wake)
			queue_unlock(mqp);
	}

	/* Every exit from the loop arrives here with the queue lock(s) held. */
	self->ul_sp = 0;
	self->ul_cv_wake = 0;
	/*
	 * NOTE(review): ul_cvmutex is never cleared in this function, so
	 * it is presumably cleared by whoever removes us from the sleep
	 * queue -- confirm.
	 */
	ASSERT(self->ul_cvmutex == NULL);
	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
	    self->ul_wchan == NULL);

	signalled = self->ul_signalled;
	self->ul_signalled = 0;
	queue_unlock(qp);
	if (!cv_wake)
		queue_unlock(mqp);

	/*
	 * If we were concurrently cond_signal()d and any of:
	 * received a UNIX signal, were cancelled, or got a timeout,
	 * then perform another cond_signal() to avoid consuming it.
	 */
	if (error && signalled)
		(void) cond_signal_internal(cvp);

	return (error);
}
3114 
3115 int
3116 cond_wait_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3117 {
3118 	ulwp_t *self = curthread;
3119 	int error;
3120 	int merror;
3121 
3122 	/*
3123 	 * The old thread library was programmed to defer signals
3124 	 * while in cond_wait() so that the associated mutex would
3125 	 * be guaranteed to be held when the application signal
3126 	 * handler was invoked.
3127 	 *
3128 	 * We do not behave this way by default; the state of the
3129 	 * associated mutex in the signal handler is undefined.
3130 	 *
3131 	 * To accommodate applications that depend on the old
3132 	 * behavior, the _THREAD_COND_WAIT_DEFER environment
3133 	 * variable can be set to 1 and we will behave in the
3134 	 * old way with respect to cond_wait().
3135 	 */
3136 	if (self->ul_cond_wait_defer)
3137 		sigoff(self);
3138 
3139 	error = cond_sleep_queue(cvp, mp, tsp);
3140 
3141 	/*
3142 	 * Reacquire the mutex.
3143 	 */
3144 	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3145 		error = merror;
3146 
3147 	/*
3148 	 * Take any deferred signal now, after we have reacquired the mutex.
3149 	 */
3150 	if (self->ul_cond_wait_defer)
3151 		sigon(self);
3152 
3153 	return (error);
3154 }
3155 
3156 /*
3157  * cond_sleep_kernel(): utility function for cond_wait_kernel().
3158  * See the comment ahead of cond_sleep_queue(), above.
3159  */
3160 static int
3161 cond_sleep_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3162 {
3163 	int mtype = mp->mutex_type;
3164 	ulwp_t *self = curthread;
3165 	int error;
3166 
3167 	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
3168 		_ceil_prio_waive();
3169 
3170 	self->ul_sp = stkptr();
3171 	self->ul_wchan = cvp;
3172 	mp->mutex_owner = 0;
3173 	/* mp->mutex_ownerpid is cleared by ___lwp_cond_wait() */
3174 	if (mtype & LOCK_PRIO_INHERIT) {
3175 		mp->mutex_lockw = LOCKCLEAR;
3176 		self->ul_pilocks--;
3177 	}
3178 	/*
3179 	 * ___lwp_cond_wait() returns immediately with EINTR if
3180 	 * set_parking_flag(self,0) is called on this lwp before it
3181 	 * goes to sleep in the kernel.  sigacthandler() calls this
3182 	 * when a deferred signal is noted.  This assures that we don't
3183 	 * get stuck in ___lwp_cond_wait() with all signals blocked
3184 	 * due to taking a deferred signal before going to sleep.
3185 	 */
3186 	set_parking_flag(self, 1);
3187 	if (self->ul_cursig != 0 ||
3188 	    (self->ul_cancelable && self->ul_cancel_pending))
3189 		set_parking_flag(self, 0);
3190 	error = ___lwp_cond_wait(cvp, mp, tsp, 1);
3191 	set_parking_flag(self, 0);
3192 	self->ul_sp = 0;
3193 	self->ul_wchan = NULL;
3194 	return (error);
3195 }
3196 
3197 int
3198 cond_wait_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3199 {
3200 	ulwp_t *self = curthread;
3201 	int error;
3202 	int merror;
3203 
3204 	/*
3205 	 * See the large comment in cond_wait_queue(), above.
3206 	 */
3207 	if (self->ul_cond_wait_defer)
3208 		sigoff(self);
3209 
3210 	error = cond_sleep_kernel(cvp, mp, tsp);
3211 
3212 	/*
3213 	 * Override the return code from ___lwp_cond_wait()
3214 	 * with any non-zero return code from mutex_lock().
3215 	 * This addresses robust lock failures in particular;
3216 	 * the caller must see the EOWNERDEAD or ENOTRECOVERABLE
3217 	 * errors in order to take corrective action.
3218 	 */
3219 	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3220 		error = merror;
3221 
3222 	/*
3223 	 * Take any deferred signal now, after we have reacquired the mutex.
3224 	 */
3225 	if (self->ul_cond_wait_defer)
3226 		sigon(self);
3227 
3228 	return (error);
3229 }
3230 
3231 /*
3232  * Common code for _cond_wait() and _cond_timedwait()
3233  */
3234 int
3235 cond_wait_common(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3236 {
3237 	int mtype = mp->mutex_type;
3238 	hrtime_t begin_sleep = 0;
3239 	ulwp_t *self = curthread;
3240 	uberdata_t *udp = self->ul_uberdata;
3241 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3242 	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
3243 	uint8_t rcount;
3244 	int error = 0;
3245 
3246 	/*
3247 	 * The SUSV3 Posix spec for pthread_cond_timedwait() states:
3248 	 *	Except in the case of [ETIMEDOUT], all these error checks
3249 	 *	shall act as if they were performed immediately at the
3250 	 *	beginning of processing for the function and shall cause
3251 	 *	an error return, in effect, prior to modifying the state
3252 	 *	of the mutex specified by mutex or the condition variable
3253 	 *	specified by cond.
3254 	 * Therefore, we must return EINVAL now if the timout is invalid.
3255 	 */
3256 	if (tsp != NULL &&
3257 	    (tsp->tv_sec < 0 || (ulong_t)tsp->tv_nsec >= NANOSEC))
3258 		return (EINVAL);
3259 
3260 	if (__td_event_report(self, TD_SLEEP, udp)) {
3261 		self->ul_sp = stkptr();
3262 		self->ul_wchan = cvp;
3263 		self->ul_td_evbuf.eventnum = TD_SLEEP;
3264 		self->ul_td_evbuf.eventdata = cvp;
3265 		tdb_event(TD_SLEEP, udp);
3266 		self->ul_sp = 0;
3267 	}
3268 	if (csp) {
3269 		if (tsp)
3270 			tdb_incr(csp->cond_timedwait);
3271 		else
3272 			tdb_incr(csp->cond_wait);
3273 	}
3274 	if (msp)
3275 		begin_sleep = record_hold_time(msp);
3276 	else if (csp)
3277 		begin_sleep = gethrtime();
3278 
3279 	if (self->ul_error_detection) {
3280 		if (!mutex_is_held(mp))
3281 			lock_error(mp, "cond_wait", cvp, NULL);
3282 		if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0)
3283 			lock_error(mp, "recursive mutex in cond_wait",
3284 			    cvp, NULL);
3285 		if (cvp->cond_type & USYNC_PROCESS) {
3286 			if (!(mtype & USYNC_PROCESS))
3287 				lock_error(mp, "cond_wait", cvp,
3288 				    "condvar process-shared, "
3289 				    "mutex process-private");
3290 		} else {
3291 			if (mtype & USYNC_PROCESS)
3292 				lock_error(mp, "cond_wait", cvp,
3293 				    "condvar process-private, "
3294 				    "mutex process-shared");
3295 		}
3296 	}
3297 
3298 	/*
3299 	 * We deal with recursive mutexes by completely
3300 	 * dropping the lock and restoring the recursion
3301 	 * count after waking up.  This is arguably wrong,
3302 	 * but it obeys the principle of least astonishment.
3303 	 */
3304 	rcount = mp->mutex_rcount;
3305 	mp->mutex_rcount = 0;
3306 	if ((mtype &
3307 	    (USYNC_PROCESS | LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT)) |
3308 	    (cvp->cond_type & USYNC_PROCESS))
3309 		error = cond_wait_kernel(cvp, mp, tsp);
3310 	else
3311 		error = cond_wait_queue(cvp, mp, tsp);
3312 	mp->mutex_rcount = rcount;
3313 
3314 	if (csp) {
3315 		hrtime_t lapse = gethrtime() - begin_sleep;
3316 		if (tsp == NULL)
3317 			csp->cond_wait_sleep_time += lapse;
3318 		else {
3319 			csp->cond_timedwait_sleep_time += lapse;
3320 			if (error == ETIME)
3321 				tdb_incr(csp->cond_timedwait_timeout);
3322 		}
3323 	}
3324 	return (error);
3325 }
3326 
3327 /*
3328  * cond_wait() and _cond_wait() are cancellation points but __cond_wait()
3329  * is not.  Internally, libc calls the non-cancellation version.
3330  * Other libraries need to use pthread_setcancelstate(), as appropriate,
3331  * since __cond_wait() is not exported from libc.
3332  */
3333 int
3334 __cond_wait(cond_t *cvp, mutex_t *mp)
3335 {
3336 	ulwp_t *self = curthread;
3337 	uberdata_t *udp = self->ul_uberdata;
3338 	uberflags_t *gflags;
3339 
3340 	/*
3341 	 * Optimize the common case of USYNC_THREAD plus
3342 	 * no error detection, no lock statistics, and no event tracing.
3343 	 */
3344 	if ((gflags = self->ul_schedctl_called) != NULL &&
3345 	    (cvp->cond_type | mp->mutex_type | gflags->uf_trs_ted |
3346 	    self->ul_td_events_enable |
3347 	    udp->tdb.tdb_ev_global_mask.event_bits[0]) == 0)
3348 		return (cond_wait_queue(cvp, mp, NULL));
3349 
3350 	/*
3351 	 * Else do it the long way.
3352 	 */
3353 	return (cond_wait_common(cvp, mp, NULL));
3354 }
3355 
3356 #pragma weak cond_wait = _cond_wait
3357 int
3358 _cond_wait(cond_t *cvp, mutex_t *mp)
3359 {
3360 	int error;
3361 
3362 	_cancelon();
3363 	error = __cond_wait(cvp, mp);
3364 	if (error == EINTR)
3365 		_canceloff();
3366 	else
3367 		_canceloff_nocancel();
3368 	return (error);
3369 }
3370 
3371 /*
3372  * pthread_cond_wait() is a cancellation point.
3373  */
3374 #pragma weak pthread_cond_wait = _pthread_cond_wait
3375 int
3376 _pthread_cond_wait(cond_t *cvp, mutex_t *mp)
3377 {
3378 	int error;
3379 
3380 	error = _cond_wait(cvp, mp);
3381 	return ((error == EINTR)? 0 : error);
3382 }
3383 
3384 /*
3385  * cond_timedwait() and _cond_timedwait() are cancellation points
3386  * but __cond_timedwait() is not.
3387  */
3388 int
3389 __cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3390 {
3391 	clockid_t clock_id = cvp->cond_clockid;
3392 	timespec_t reltime;
3393 	int error;
3394 
3395 	if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_HIGHRES)
3396 		clock_id = CLOCK_REALTIME;
3397 	abstime_to_reltime(clock_id, abstime, &reltime);
3398 	error = cond_wait_common(cvp, mp, &reltime);
3399 	if (error == ETIME && clock_id == CLOCK_HIGHRES) {
3400 		/*
3401 		 * Don't return ETIME if we didn't really get a timeout.
3402 		 * This can happen if we return because someone resets
3403 		 * the system clock.  Just return zero in this case,
3404 		 * giving a spurious wakeup but not a timeout.
3405 		 */
3406 		if ((hrtime_t)(uint32_t)abstime->tv_sec * NANOSEC +
3407 		    abstime->tv_nsec > gethrtime())
3408 			error = 0;
3409 	}
3410 	return (error);
3411 }
3412 
3413 #pragma weak cond_timedwait = _cond_timedwait
3414 int
3415 _cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3416 {
3417 	int error;
3418 
3419 	_cancelon();
3420 	error = __cond_timedwait(cvp, mp, abstime);
3421 	if (error == EINTR)
3422 		_canceloff();
3423 	else
3424 		_canceloff_nocancel();
3425 	return (error);
3426 }
3427 
3428 /*
3429  * pthread_cond_timedwait() is a cancellation point.
3430  */
3431 #pragma weak pthread_cond_timedwait = _pthread_cond_timedwait
3432 int
3433 _pthread_cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3434 {
3435 	int error;
3436 
3437 	error = _cond_timedwait(cvp, mp, abstime);
3438 	if (error == ETIME)
3439 		error = ETIMEDOUT;
3440 	else if (error == EINTR)
3441 		error = 0;
3442 	return (error);
3443 }
3444 
3445 /*
3446  * cond_reltimedwait() and _cond_reltimedwait() are cancellation points
3447  * but __cond_reltimedwait() is not.
3448  */
3449 int
3450 __cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3451 {
3452 	timespec_t tslocal = *reltime;
3453 
3454 	return (cond_wait_common(cvp, mp, &tslocal));
3455 }
3456 
3457 #pragma weak cond_reltimedwait = _cond_reltimedwait
3458 int
3459 _cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3460 {
3461 	int error;
3462 
3463 	_cancelon();
3464 	error = __cond_reltimedwait(cvp, mp, reltime);
3465 	if (error == EINTR)
3466 		_canceloff();
3467 	else
3468 		_canceloff_nocancel();
3469 	return (error);
3470 }
3471 
3472 #pragma weak pthread_cond_reltimedwait_np = _pthread_cond_reltimedwait_np
3473 int
3474 _pthread_cond_reltimedwait_np(cond_t *cvp, mutex_t *mp,
3475 	const timespec_t *reltime)
3476 {
3477 	int error;
3478 
3479 	error = _cond_reltimedwait(cvp, mp, reltime);
3480 	if (error == ETIME)
3481 		error = ETIMEDOUT;
3482 	else if (error == EINTR)
3483 		error = 0;
3484 	return (error);
3485 }
3486 
3487 #pragma weak pthread_cond_signal = cond_signal_internal
3488 #pragma weak _pthread_cond_signal = cond_signal_internal
3489 #pragma weak cond_signal = cond_signal_internal
3490 #pragma weak _cond_signal = cond_signal_internal
3491 int
3492 cond_signal_internal(cond_t *cvp)
3493 {
3494 	ulwp_t *self = curthread;
3495 	uberdata_t *udp = self->ul_uberdata;
3496 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3497 	int error = 0;
3498 	int more;
3499 	lwpid_t lwpid;
3500 	queue_head_t *qp;
3501 	mutex_t *mp;
3502 	queue_head_t *mqp;
3503 	ulwp_t **ulwpp;
3504 	ulwp_t *ulwp;
3505 	ulwp_t *prev;
3506 
3507 	if (csp)
3508 		tdb_incr(csp->cond_signal);
3509 
3510 	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
3511 		error = __lwp_cond_signal(cvp);
3512 
3513 	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
3514 		return (error);
3515 
3516 	/*
3517 	 * Move someone from the condvar sleep queue to the mutex sleep
3518 	 * queue for the mutex that he will acquire on being waked up.
3519 	 * We can do this only if we own the mutex he will acquire.
3520 	 * If we do not own the mutex, or if his ul_cv_wake flag
3521 	 * is set, just dequeue and unpark him.
3522 	 */
3523 	qp = queue_lock(cvp, CV);
3524 	ulwpp = queue_slot(qp, &prev, &more);
3525 	cvp->cond_waiters_user = more;
3526 	if (ulwpp == NULL) {	/* no one on the sleep queue */
3527 		queue_unlock(qp);
3528 		return (error);
3529 	}
3530 	ulwp = *ulwpp;
3531 
3532 	/*
3533 	 * Inform the thread that he was the recipient of a cond_signal().
3534 	 * This lets him deal with cond_signal() and, concurrently,
3535 	 * one or more of a cancellation, a UNIX signal, or a timeout.
3536 	 * These latter conditions must not consume a cond_signal().
3537 	 */
3538 	ulwp->ul_signalled = 1;
3539 
3540 	/*
3541 	 * Dequeue the waiter but leave his ul_sleepq non-NULL
3542 	 * while we move him to the mutex queue so that he can
3543 	 * deal properly with spurious wakeups.
3544 	 */
3545 	queue_unlink(qp, ulwpp, prev);
3546 
3547 	mp = ulwp->ul_cvmutex;		/* the mutex he will acquire */
3548 	ulwp->ul_cvmutex = NULL;
3549 	ASSERT(mp != NULL);
3550 
3551 	if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3552 		/* just wake him up */
3553 		lwpid = ulwp->ul_lwpid;
3554 		no_preempt(self);
3555 		ulwp->ul_sleepq = NULL;
3556 		ulwp->ul_wchan = NULL;
3557 		queue_unlock(qp);
3558 		(void) __lwp_unpark(lwpid);
3559 		preempt(self);
3560 	} else {
3561 		/* move him to the mutex queue */
3562 		mqp = queue_lock(mp, MX);
3563 		enqueue(mqp, ulwp, 0);
3564 		mp->mutex_waiters = 1;
3565 		queue_unlock(mqp);
3566 		queue_unlock(qp);
3567 	}
3568 
3569 	return (error);
3570 }
3571 
3572 /*
3573  * Utility function called by mutex_wakeup_all(), cond_broadcast(),
3574  * and rw_queue_release() to (re)allocate a big buffer to hold the
3575  * lwpids of all the threads to be set running after they are removed
3576  * from their sleep queues.  Since we are holding a queue lock, we
3577  * cannot call any function that might acquire a lock.  mmap(), munmap(),
3578  * lwp_unpark_all() are simple system calls and are safe in this regard.
3579  */
3580 lwpid_t *
3581 alloc_lwpids(lwpid_t *lwpid, int *nlwpid_ptr, int *maxlwps_ptr)
3582 {
3583 	/*
3584 	 * Allocate NEWLWPS ids on the first overflow.
3585 	 * Double the allocation each time after that.
3586 	 */
3587 	int nlwpid = *nlwpid_ptr;
3588 	int maxlwps = *maxlwps_ptr;
3589 	int first_allocation;
3590 	int newlwps;
3591 	void *vaddr;
3592 
3593 	ASSERT(nlwpid == maxlwps);
3594 
3595 	first_allocation = (maxlwps == MAXLWPS);
3596 	newlwps = first_allocation? NEWLWPS : 2 * maxlwps;
3597 	vaddr = _private_mmap(NULL, newlwps * sizeof (lwpid_t),
3598 	    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
3599 
3600 	if (vaddr == MAP_FAILED) {
3601 		/*
3602 		 * Let's hope this never happens.
3603 		 * If it does, then we have a terrible
3604 		 * thundering herd on our hands.
3605 		 */
3606 		(void) __lwp_unpark_all(lwpid, nlwpid);
3607 		*nlwpid_ptr = 0;
3608 	} else {
3609 		(void) _memcpy(vaddr, lwpid, maxlwps * sizeof (lwpid_t));
3610 		if (!first_allocation)
3611 			(void) _private_munmap(lwpid,
3612 			    maxlwps * sizeof (lwpid_t));
3613 		lwpid = vaddr;
3614 		*maxlwps_ptr = newlwps;
3615 	}
3616 
3617 	return (lwpid);
3618 }
3619 
3620 #pragma weak pthread_cond_broadcast = cond_broadcast_internal
3621 #pragma weak _pthread_cond_broadcast = cond_broadcast_internal
3622 #pragma weak cond_broadcast = cond_broadcast_internal
3623 #pragma weak _cond_broadcast = cond_broadcast_internal
3624 int
3625 cond_broadcast_internal(cond_t *cvp)
3626 {
3627 	ulwp_t *self = curthread;
3628 	uberdata_t *udp = self->ul_uberdata;
3629 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3630 	int error = 0;
3631 	queue_head_t *qp;
3632 	queue_root_t *qrp;
3633 	mutex_t *mp;
3634 	mutex_t *mp_cache = NULL;
3635 	queue_head_t *mqp = NULL;
3636 	ulwp_t *ulwp;
3637 	int nlwpid = 0;
3638 	int maxlwps = MAXLWPS;
3639 	lwpid_t buffer[MAXLWPS];
3640 	lwpid_t *lwpid = buffer;
3641 
3642 	if (csp)
3643 		tdb_incr(csp->cond_broadcast);
3644 
3645 	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
3646 		error = __lwp_cond_broadcast(cvp);
3647 
3648 	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
3649 		return (error);
3650 
3651 	/*
3652 	 * Move everyone from the condvar sleep queue to the mutex sleep
3653 	 * queue for the mutex that they will acquire on being waked up.
3654 	 * We can do this only if we own the mutex they will acquire.
3655 	 * If we do not own the mutex, or if their ul_cv_wake flag
3656 	 * is set, just dequeue and unpark them.
3657 	 *
3658 	 * We keep track of lwpids that are to be unparked in lwpid[].
3659 	 * __lwp_unpark_all() is called to unpark all of them after
3660 	 * they have been removed from the sleep queue and the sleep
3661 	 * queue lock has been dropped.  If we run out of space in our
3662 	 * on-stack buffer, we need to allocate more but we can't call
3663 	 * lmalloc() because we are holding a queue lock when the overflow
3664 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
3665 	 * either because the application may have allocated a small
3666 	 * stack and we don't want to overrun the stack.  So we call
3667 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
3668 	 * system call directly since that path acquires no locks.
3669 	 */
3670 	qp = queue_lock(cvp, CV);
3671 	cvp->cond_waiters_user = 0;
3672 	for (;;) {
3673 		if ((qrp = qp->qh_root) == NULL ||
3674 		    (ulwp = qrp->qr_head) == NULL)
3675 			break;
3676 		ASSERT(ulwp->ul_wchan == cvp);
3677 		queue_unlink(qp, &qrp->qr_head, NULL);
3678 		mp = ulwp->ul_cvmutex;		/* his mutex */
3679 		ulwp->ul_cvmutex = NULL;
3680 		ASSERT(mp != NULL);
3681 		if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3682 			/* just wake him up */
3683 			ulwp->ul_sleepq = NULL;
3684 			ulwp->ul_wchan = NULL;
3685 			if (nlwpid == maxlwps)
3686 				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
3687 			lwpid[nlwpid++] = ulwp->ul_lwpid;
3688 		} else {
3689 			/* move him to the mutex queue */
3690 			if (mp != mp_cache) {
3691 				mp_cache = mp;
3692 				if (mqp != NULL)
3693 					queue_unlock(mqp);
3694 				mqp = queue_lock(mp, MX);
3695 			}
3696 			enqueue(mqp, ulwp, 0);
3697 			mp->mutex_waiters = 1;
3698 		}
3699 	}
3700 	if (mqp != NULL)
3701 		queue_unlock(mqp);
3702 	if (nlwpid == 0) {
3703 		queue_unlock(qp);
3704 	} else {
3705 		no_preempt(self);
3706 		queue_unlock(qp);
3707 		if (nlwpid == 1)
3708 			(void) __lwp_unpark(lwpid[0]);
3709 		else
3710 			(void) __lwp_unpark_all(lwpid, nlwpid);
3711 		preempt(self);
3712 	}
3713 	if (lwpid != buffer)
3714 		(void) _private_munmap(lwpid, maxlwps * sizeof (lwpid_t));
3715 	return (error);
3716 }
3717 
3718 #pragma weak pthread_cond_destroy = _cond_destroy
3719 #pragma weak _pthread_cond_destroy = _cond_destroy
3720 #pragma weak cond_destroy = _cond_destroy
3721 int
3722 _cond_destroy(cond_t *cvp)
3723 {
3724 	cvp->cond_magic = 0;
3725 	tdb_sync_obj_deregister(cvp);
3726 	return (0);
3727 }
3728 
3729 #if defined(THREAD_DEBUG)
3730 void
3731 assert_no_libc_locks_held(void)
3732 {
3733 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
3734 }
3735 
3736 /* protected by link_lock */
3737 uint64_t spin_lock_spin;
3738 uint64_t spin_lock_spin2;
3739 uint64_t spin_lock_sleep;
3740 uint64_t spin_lock_wakeup;
3741 
3742 /*
3743  * Record spin lock statistics.
3744  * Called by a thread exiting itself in thrp_exit().
3745  * Also called via atexit() from the thread calling
3746  * exit() to do all the other threads as well.
3747  */
3748 void
3749 record_spin_locks(ulwp_t *ulwp)
3750 {
3751 	spin_lock_spin += ulwp->ul_spin_lock_spin;
3752 	spin_lock_spin2 += ulwp->ul_spin_lock_spin2;
3753 	spin_lock_sleep += ulwp->ul_spin_lock_sleep;
3754 	spin_lock_wakeup += ulwp->ul_spin_lock_wakeup;
3755 	ulwp->ul_spin_lock_spin = 0;
3756 	ulwp->ul_spin_lock_spin2 = 0;
3757 	ulwp->ul_spin_lock_sleep = 0;
3758 	ulwp->ul_spin_lock_wakeup = 0;
3759 }
3760 
3761 /*
3762  * atexit function:  dump the queue statistics to stderr.
3763  */
3764 #if !defined(__lint)
3765 #define	fprintf	_fprintf
3766 #endif
3767 #include <stdio.h>
3768 void
3769 dump_queue_statistics(void)
3770 {
3771 	uberdata_t *udp = curthread->ul_uberdata;
3772 	queue_head_t *qp;
3773 	int qn;
3774 	uint64_t spin_lock_total = 0;
3775 
3776 	if (udp->queue_head == NULL || thread_queue_dump == 0)
3777 		return;
3778 
3779 	if (fprintf(stderr, "\n%5d mutex queues:\n", QHASHSIZE) < 0 ||
3780 	    fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
3781 		return;
3782 	for (qn = 0, qp = udp->queue_head; qn < QHASHSIZE; qn++, qp++) {
3783 		if (qp->qh_lockcount == 0)
3784 			continue;
3785 		spin_lock_total += qp->qh_lockcount;
3786 		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
3787 		    (u_longlong_t)qp->qh_lockcount,
3788 		    qp->qh_qmax, qp->qh_hmax) < 0)
3789 			return;
3790 	}
3791 
3792 	if (fprintf(stderr, "\n%5d condvar queues:\n", QHASHSIZE) < 0 ||
3793 	    fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
3794 		return;
3795 	for (qn = 0; qn < QHASHSIZE; qn++, qp++) {
3796 		if (qp->qh_lockcount == 0)
3797 			continue;
3798 		spin_lock_total += qp->qh_lockcount;
3799 		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
3800 		    (u_longlong_t)qp->qh_lockcount,
3801 		    qp->qh_qmax, qp->qh_hmax) < 0)
3802 			return;
3803 	}
3804 
3805 	(void) fprintf(stderr, "\n  spin_lock_total  = %10llu\n",
3806 	    (u_longlong_t)spin_lock_total);
3807 	(void) fprintf(stderr, "  spin_lock_spin   = %10llu\n",
3808 	    (u_longlong_t)spin_lock_spin);
3809 	(void) fprintf(stderr, "  spin_lock_spin2  = %10llu\n",
3810 	    (u_longlong_t)spin_lock_spin2);
3811 	(void) fprintf(stderr, "  spin_lock_sleep  = %10llu\n",
3812 	    (u_longlong_t)spin_lock_sleep);
3813 	(void) fprintf(stderr, "  spin_lock_wakeup = %10llu\n",
3814 	    (u_longlong_t)spin_lock_wakeup);
3815 }
3816 #endif
3817