xref: /illumos-gate/usr/src/lib/libc/port/threads/synch.c (revision 590e0b5da08d7261161e979afc4bf4aa0f543574)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2015, Joyent, Inc.
26  * Copyright (c) 2016 by Delphix. All rights reserved.
27  * Copyright 2024 Oxide Computer Company
28  */
29 
30 #include "lint.h"
31 #include "thr_uberdata.h"
32 #include <sys/rtpriocntl.h>
33 #include <sys/sdt.h>
34 #include <atomic.h>
35 
/*
 * Counters that are maintained only in DEBUG builds.  In non-DEBUG
 * builds each macro expands to nothing, so an invocation followed by
 * a semicolon is just an empty statement.
 *
 *	INCR32(x)	increment x unless it has reached UINT32_MAX
 *	INCR(x)		increment x
 *	DECR(x)		decrement x
 *	MAXINCR(m, x)	increment x, then raise m to x if x now exceeds m
 *
 * Arguments may be evaluated more than once.
 */
#if defined(DEBUG)
#define	INCR32(x)	(((x) != UINT32_MAX)? (x)++ : 0)
#define	INCR(x)		((x)++)
#define	DECR(x)		((x)--)
#define	MAXINCR(m, x)	(((m) < ++(x))? ((m) = (x)) : 0)
#else	/* DEBUG */
#define	INCR32(x)
#define	INCR(x)
#define	DECR(x)
#define	MAXINCR(m, x)
#endif	/* DEBUG */
47 
48 /*
49  * This mutex is initialized to be held by lwp#1.
50  * It is used to block a thread that has returned from a mutex_lock()
51  * of a LOCK_PRIO_INHERIT mutex with an unrecoverable error.
52  */
/* Acquired once at library initialization by mutex_setup(), below. */
53 mutex_t	stall_mutex = DEFAULTMUTEX;
54 
/*
 * Prototypes for file-local (static) helpers that are referenced
 * before their definitions appear.
 */
55 static int shared_mutex_held(mutex_t *);
56 static int mutex_queuelock_adaptive(mutex_t *);
57 static void mutex_wakeup_all(mutex_t *);
58 
59 /*
60  * Lock statistics support functions.
61  */
62 void
63 record_begin_hold(tdb_mutex_stats_t *msp)
64 {
65 	tdb_incr(msp->mutex_lock);
66 	msp->mutex_begin_hold = gethrtime();
67 }
68 
69 hrtime_t
70 record_hold_time(tdb_mutex_stats_t *msp)
71 {
72 	hrtime_t now = gethrtime();
73 
74 	if (msp->mutex_begin_hold)
75 		msp->mutex_hold_time += now - msp->mutex_begin_hold;
76 	msp->mutex_begin_hold = 0;
77 	return (now);
78 }
79 
80 /*
81  * Called once at library initialization.
82  */
83 void
84 mutex_setup(void)
85 {
86 	if (set_lock_byte(&stall_mutex.mutex_lockw))
87 		thr_panic("mutex_setup() cannot acquire stall_mutex");
88 	stall_mutex.mutex_owner = (uintptr_t)curthread;
89 }
90 
91 /*
92  * The default spin count of 1000 is experimentally determined.
93  * On sun4u machines with any number of processors it could be raised
94  * to 10,000 but that (experimentally) makes almost no difference.
95  * The environment variable:
96  *	_THREAD_ADAPTIVE_SPIN=count
97  * can be used to override and set the count in the range [0 .. 1,000,000].
98  */
/* default adaptive spin count; see the _THREAD_ADAPTIVE_SPIN note above */
99 int	thread_adaptive_spin = 1000;
/* NOTE(review): presumably the limit handed to spinners_incr() - confirm */
100 uint_t	thread_max_spinners = 100;
/* non-zero enables the expensive per-thread checks in QVERIFY() */
101 int	thread_queue_verify = 0;
/* NOTE(review): not referenced in this part of the file; meaning not shown */
102 static	int	ncpus;
103 
104 /*
105  * Distinguish spinning for queue locks from spinning for regular locks.
106  * We try harder to acquire queue locks by spinning.
107  * The environment variable:
108  *	_THREAD_QUEUE_SPIN=count
109  * can be used to override and set the count in the range [0 .. 1,000,000].
110  */
/* default spin count for queue locks; see the _THREAD_QUEUE_SPIN note above */
111 int	thread_queue_spin = 10000;
112 
/*
 * Mask of every attribute flag that may be OR-ed into a mutex type.
 * mutex_init() clears these bits from 'type' to recover the base type.
 */
113 #define	ALL_ATTRIBUTES				\
114 	(LOCK_RECURSIVE | LOCK_ERRORCHECK |	\
115 	LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT |	\
116 	LOCK_ROBUST)
117 
118 /*
119  * 'type' can be one of USYNC_THREAD, USYNC_PROCESS, or USYNC_PROCESS_ROBUST,
120  * augmented by zero or more the flags:
121  *	LOCK_RECURSIVE
122  *	LOCK_ERRORCHECK
123  *	LOCK_PRIO_INHERIT
124  *	LOCK_PRIO_PROTECT
125  *	LOCK_ROBUST
126  */
127 #pragma weak _mutex_init = mutex_init
128 /* ARGSUSED2 */
129 int
130 mutex_init(mutex_t *mp, int type, void *arg)
131 {
132 	int basetype = (type & ~ALL_ATTRIBUTES);
133 	const pcclass_t *pccp;
134 	int error = 0;
135 	int ceil;
136 
137 	if (basetype == USYNC_PROCESS_ROBUST) {
138 		/*
139 		 * USYNC_PROCESS_ROBUST is a deprecated historical type.
140 		 * We change it into (USYNC_PROCESS | LOCK_ROBUST) but
141 		 * retain the USYNC_PROCESS_ROBUST flag so we can return
142 		 * ELOCKUNMAPPED when necessary (only USYNC_PROCESS_ROBUST
143 		 * mutexes will ever draw ELOCKUNMAPPED).
144 		 */
145 		type |= (USYNC_PROCESS | LOCK_ROBUST);
146 		basetype = USYNC_PROCESS;
147 	}
148 
149 	if (type & LOCK_PRIO_PROTECT)
150 		pccp = get_info_by_policy(SCHED_FIFO);
151 	if ((basetype != USYNC_THREAD && basetype != USYNC_PROCESS) ||
152 	    (type & (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT))
153 	    == (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT) ||
154 	    ((type & LOCK_PRIO_PROTECT) &&
155 	    ((ceil = *(int *)arg) < pccp->pcc_primin ||
156 	    ceil > pccp->pcc_primax))) {
157 		error = EINVAL;
158 	} else if (type & LOCK_ROBUST) {
159 		/*
160 		 * Callers of mutex_init() with the LOCK_ROBUST attribute
161 		 * are required to pass an initially all-zero mutex.
162 		 * Multiple calls to mutex_init() are allowed; all but
163 		 * the first return EBUSY.  A call to mutex_init() is
164 		 * allowed to make an inconsistent robust lock consistent
165 		 * (for historical usage, even though the proper interface
166 		 * for this is mutex_consistent()).  Note that we use
167 		 * atomic_or_16() to set the LOCK_INITED flag so as
168 		 * not to disturb surrounding bits (LOCK_OWNERDEAD, etc).
169 		 */
170 		if (!(mp->mutex_flag & LOCK_INITED)) {
171 			mp->mutex_type = (uint8_t)type;
172 			atomic_or_16(&mp->mutex_flag, LOCK_INITED);
173 			mp->mutex_magic = MUTEX_MAGIC;
174 		} else if (type != mp->mutex_type ||
175 		    ((type & LOCK_PRIO_PROTECT) && mp->mutex_ceiling != ceil)) {
176 			error = EINVAL;
177 		} else if (mutex_consistent(mp) != 0) {
178 			error = EBUSY;
179 		}
180 		/* register a process robust mutex with the kernel */
181 		if (basetype == USYNC_PROCESS)
182 			register_lock(mp);
183 	} else {
184 		(void) memset(mp, 0, sizeof (*mp));
185 		mp->mutex_type = (uint8_t)type;
186 		mp->mutex_flag = LOCK_INITED;
187 		mp->mutex_magic = MUTEX_MAGIC;
188 	}
189 
190 	if (error == 0 && (type & LOCK_PRIO_PROTECT)) {
191 		mp->mutex_ceiling = ceil;
192 	}
193 
194 	/*
195 	 * This should be at the beginning of the function,
196 	 * but for the sake of old broken applications that
197 	 * do not have proper alignment for their mutexes
198 	 * (and don't check the return code from mutex_init),
199 	 * we put it here, after initializing the mutex regardless.
200 	 */
201 	if (error == 0 &&
202 	    ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
203 	    curthread->ul_misaligned == 0)
204 		error = EINVAL;
205 
206 	return (error);
207 }
208 
209 /*
210  * Delete mp from list of ceiling mutexes owned by curthread.
211  * Return 1 if the head of the chain was updated.
212  */
213 int
214 _ceil_mylist_del(mutex_t *mp)
215 {
216 	ulwp_t *self = curthread;
217 	mxchain_t **mcpp;
218 	mxchain_t *mcp;
219 
220 	for (mcpp = &self->ul_mxchain;
221 	    (mcp = *mcpp) != NULL;
222 	    mcpp = &mcp->mxchain_next) {
223 		if (mcp->mxchain_mx == mp) {
224 			*mcpp = mcp->mxchain_next;
225 			lfree(mcp, sizeof (*mcp));
226 			return (mcpp == &self->ul_mxchain);
227 		}
228 	}
229 	return (0);
230 }
231 
232 /*
233  * Add mp to the list of ceiling mutexes owned by curthread.
234  * Return ENOMEM if no memory could be allocated.
235  */
236 int
237 _ceil_mylist_add(mutex_t *mp)
238 {
239 	ulwp_t *self = curthread;
240 	mxchain_t *mcp;
241 
242 	if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
243 		return (ENOMEM);
244 	mcp->mxchain_mx = mp;
245 	mcp->mxchain_next = self->ul_mxchain;
246 	self->ul_mxchain = mcp;
247 	return (0);
248 }
249 
250 /*
251  * Helper function for _ceil_prio_inherit() and _ceil_prio_waive(), below.
252  */
253 static void
254 set_rt_priority(ulwp_t *self, int prio)
255 {
256 	pcparms_t pcparm;
257 
258 	pcparm.pc_cid = self->ul_rtclassid;
259 	((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
260 	((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
261 	(void) priocntl(P_LWPID, self->ul_lwpid, PC_SETPARMS, &pcparm);
262 }
263 
264 /*
265  * Inherit priority from ceiling.
266  * This changes the effective priority, not the assigned priority.
267  */
268 void
269 _ceil_prio_inherit(int prio)
270 {
271 	ulwp_t *self = curthread;
272 
273 	self->ul_epri = prio;
274 	set_rt_priority(self, prio);
275 }
276 
277 /*
278  * Waive inherited ceiling priority.  Inherit from head of owned ceiling locks
279  * if holding at least one ceiling lock.  If no ceiling locks are held at this
280  * point, disinherit completely, reverting back to assigned priority.
281  */
282 void
283 _ceil_prio_waive(void)
284 {
285 	ulwp_t *self = curthread;
286 	mxchain_t *mcp = self->ul_mxchain;
287 	int prio;
288 
289 	if (mcp == NULL) {
290 		prio = self->ul_pri;
291 		self->ul_epri = 0;
292 	} else {
293 		prio = mcp->mxchain_mx->mutex_ceiling;
294 		self->ul_epri = prio;
295 	}
296 	set_rt_priority(self, prio);
297 }
298 
299 /*
300  * Clear the lock byte.  Retain the waiters byte and the spinners byte.
301  * Return the old value of the lock word.
302  */
303 static uint32_t
304 clear_lockbyte(volatile uint32_t *lockword)
305 {
306 	uint32_t old;
307 	uint32_t new;
308 
309 	do {
310 		old = *lockword;
311 		new = old & ~LOCKMASK;
312 	} while (atomic_cas_32(lockword, old, new) != old);
313 
314 	return (old);
315 }
316 
317 /*
318  * Same as clear_lockbyte(), but operates on mutex_lockword64.
319  * The mutex_ownerpid field is cleared along with the lock byte.
320  */
321 static uint64_t
322 clear_lockbyte64(volatile uint64_t *lockword64)
323 {
324 	uint64_t old;
325 	uint64_t new;
326 
327 	do {
328 		old = *lockword64;
329 		new = old & ~LOCKMASK64;
330 	} while (atomic_cas_64(lockword64, old, new) != old);
331 
332 	return (old);
333 }
334 
335 /*
336  * Similar to set_lock_byte(), which only tries to set the lock byte.
337  * Here, we attempt to set the lock byte AND the mutex_ownerpid, keeping
338  * the remaining bytes constant.  This atomic operation is required for the
339  * correctness of process-shared robust locks, otherwise there would be
340  * a window or vulnerability in which the lock byte had been set but the
341  * mutex_ownerpid had not yet been set.  If the process were to die in
342  * this window of vulnerability (due to some other thread calling exit()
343  * or the process receiving a fatal signal), the mutex would be left locked
344  * but without a process-ID to determine which process was holding the lock.
345  * The kernel would then be unable to mark the robust mutex as LOCK_OWNERDEAD
346  * when the process died.  For all other cases of process-shared locks, this
347  * operation is just a convenience, for the sake of common code.
348  *
349  * This operation requires process-shared robust locks to be properly
350  * aligned on an 8-byte boundary, at least on sparc machines, lest the
351  * operation incur an alignment fault.  This is automatic when locks
352  * are declared properly using the mutex_t or pthread_mutex_t data types
353  * and the application does not allocate dynamic memory on less than an
354  * 8-byte boundary.  See the 'horrible hack' comments below for cases
355  * dealing with such broken applications.
356  */
357 static int
358 set_lock_byte64(volatile uint64_t *lockword64, pid_t ownerpid)
359 {
360 	uint64_t old;
361 	uint64_t new;
362 
363 	old = *lockword64 & ~LOCKMASK64;
364 	new = old | ((uint64_t)(uint_t)ownerpid << PIDSHIFT) | LOCKBYTE64;
365 	if (atomic_cas_64(lockword64, old, new) == old)
366 		return (LOCKCLEAR);
367 
368 	return (LOCKSET);
369 }
370 
371 /*
372  * Increment the spinners count in the mutex lock word.
373  * Return 0 on success.  Return -1 if the count would overflow.
374  */
375 static int
376 spinners_incr(volatile uint32_t *lockword, uint8_t max_spinners)
377 {
378 	uint32_t old;
379 	uint32_t new;
380 
381 	do {
382 		old = *lockword;
383 		if (((old & SPINNERMASK) >> SPINNERSHIFT) >= max_spinners)
384 			return (-1);
385 		new = old + (1 << SPINNERSHIFT);
386 	} while (atomic_cas_32(lockword, old, new) != old);
387 
388 	return (0);
389 }
390 
391 /*
392  * Decrement the spinners count in the mutex lock word.
393  * Return the new value of the lock word.
394  */
395 static uint32_t
396 spinners_decr(volatile uint32_t *lockword)
397 {
398 	uint32_t old;
399 	uint32_t new;
400 
401 	do {
402 		new = old = *lockword;
403 		if (new & SPINNERMASK)
404 			new -= (1 << SPINNERSHIFT);
405 	} while (atomic_cas_32(lockword, old, new) != old);
406 
407 	return (new);
408 }
409 
410 /*
411  * Non-preemptive spin locks.  Used by queue_lock().
412  * No lock statistics are gathered for these locks.
413  * No DTrace probes are provided for these locks.
414  */
415 void
416 spin_lock_set(mutex_t *mp)
417 {
418 	ulwp_t *self = curthread;
419 
420 	no_preempt(self);
421 	if (set_lock_byte(&mp->mutex_lockw) == 0) {
422 		mp->mutex_owner = (uintptr_t)self;
423 		return;
424 	}
425 	/*
426 	 * Spin for a while, attempting to acquire the lock.
427 	 */
428 	INCR32(self->ul_spin_lock_spin);
429 	if (mutex_queuelock_adaptive(mp) == 0 ||
430 	    set_lock_byte(&mp->mutex_lockw) == 0) {
431 		mp->mutex_owner = (uintptr_t)self;
432 		return;
433 	}
434 	/*
435 	 * Try harder if we were previously at a no premption level.
436 	 */
437 	if (self->ul_preempt > 1) {
438 		INCR32(self->ul_spin_lock_spin2);
439 		if (mutex_queuelock_adaptive(mp) == 0 ||
440 		    set_lock_byte(&mp->mutex_lockw) == 0) {
441 			mp->mutex_owner = (uintptr_t)self;
442 			return;
443 		}
444 	}
445 	/*
446 	 * Give up and block in the kernel for the mutex.
447 	 */
448 	INCR32(self->ul_spin_lock_sleep);
449 	(void) ___lwp_mutex_timedlock(mp, NULL, self);
450 }
451 
452 void
453 spin_lock_clear(mutex_t *mp)
454 {
455 	ulwp_t *self = curthread;
456 
457 	mp->mutex_owner = 0;
458 	if (atomic_swap_32(&mp->mutex_lockword, 0) & WAITERMASK) {
459 		(void) ___lwp_mutex_wakeup(mp, 0);
460 		INCR32(self->ul_spin_lock_wakeup);
461 	}
462 	preempt(self);
463 }
464 
465 /*
466  * Allocate the sleep queue hash table.
467  */
468 void
469 queue_alloc(void)
470 {
471 	ulwp_t *self = curthread;
472 	uberdata_t *udp = self->ul_uberdata;
473 	queue_head_t *qp;
474 	void *data;
475 	int i;
476 
477 	/*
478 	 * No locks are needed; we call here only when single-threaded.
479 	 */
480 	ASSERT(self == udp->ulwp_one);
481 	ASSERT(!udp->uberflags.uf_mt);
482 	if ((data = mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
483 	    PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
484 	    == MAP_FAILED)
485 		thr_panic("cannot allocate thread queue_head table");
486 	udp->queue_head = qp = (queue_head_t *)data;
487 	for (i = 0; i < 2 * QHASHSIZE; qp++, i++) {
488 		qp->qh_type = (i < QHASHSIZE)? MX : CV;
489 		qp->qh_lock.mutex_flag = LOCK_INITED;
490 		qp->qh_lock.mutex_magic = MUTEX_MAGIC;
491 		qp->qh_hlist = &qp->qh_def_root;
492 #if defined(DEBUG)
493 		qp->qh_hlen = 1;
494 		qp->qh_hmax = 1;
495 #endif
496 	}
497 }
498 
#if defined(DEBUG)

/*
 * Debugging: verify correctness of a sleep queue.
 * The cheap checks (hash-list length and queue type) always run.
 * The per-thread walk of every queue root runs only when
 * thread_queue_verify is non-zero.
 */
void
QVERIFY(queue_head_t *qp)
{
	ulwp_t *self = curthread;
	uberdata_t *udp = self->ul_uberdata;
	queue_root_t *qrp;
	ulwp_t *ulwp;
	ulwp_t *prev;
	uint_t index;
	uint32_t cnt;
	char qtype;
	void *wchan;

	ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));

	/* count the queue roots; head and tail must be NULL together */
	cnt = 0;
	qrp = qp->qh_hlist;
	while (qrp != NULL) {
		cnt++;
		ASSERT((qrp->qr_head != NULL && qrp->qr_tail != NULL) ||
		    (qrp->qr_head == NULL && qrp->qr_tail == NULL));
		qrp = qrp->qr_next;
	}
	ASSERT(qp->qh_hlen == cnt && qp->qh_hmax >= cnt);

	qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
	ASSERT(qp->qh_type == qtype);

	if (!thread_queue_verify)
		return;

	/* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
	cnt = 0;
	for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
		prev = NULL;
		ulwp = qrp->qr_head;
		while (ulwp != NULL) {
			cnt++;
			if (ulwp->ul_writer) {
				ASSERT(prev == NULL || prev->ul_writer);
			}
			ASSERT(ulwp->ul_qtype == qtype);
			ASSERT(ulwp->ul_wchan != NULL);
			ASSERT(ulwp->ul_sleepq == qp);
			wchan = ulwp->ul_wchan;
			ASSERT(qrp->qr_wchan == wchan);
			index = QUEUE_HASH(wchan, qtype);
			ASSERT(&udp->queue_head[index] == qp);

			prev = ulwp;
			ulwp = ulwp->ul_link;
		}
		ASSERT(qrp->qr_tail == prev);
	}
	ASSERT(qp->qh_qlen == cnt);
}

#else	/* DEBUG */

/* Non-DEBUG builds: queue verification compiles away to nothing. */
#define	QVERIFY(qp)

#endif	/* DEBUG */
554 
555 /*
556  * Acquire a queue head.
557  */
558 queue_head_t *
559 queue_lock(void *wchan, int qtype)
560 {
561 	uberdata_t *udp = curthread->ul_uberdata;
562 	queue_head_t *qp;
563 	queue_root_t *qrp;
564 
565 	ASSERT(qtype == MX || qtype == CV);
566 
567 	/*
568 	 * It is possible that we could be called while still single-threaded.
569 	 * If so, we call queue_alloc() to allocate the queue_head[] array.
570 	 */
571 	if ((qp = udp->queue_head) == NULL) {
572 		queue_alloc();
573 		qp = udp->queue_head;
574 	}
575 	qp += QUEUE_HASH(wchan, qtype);
576 	spin_lock_set(&qp->qh_lock);
577 	for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next)
578 		if (qrp->qr_wchan == wchan)
579 			break;
580 	if (qrp == NULL && qp->qh_def_root.qr_head == NULL) {
581 		/* the default queue root is available; use it */
582 		qrp = &qp->qh_def_root;
583 		qrp->qr_wchan = wchan;
584 		ASSERT(qrp->qr_next == NULL);
585 		ASSERT(qrp->qr_tail == NULL &&
586 		    qrp->qr_rtcount == 0 && qrp->qr_qlen == 0);
587 	}
588 	qp->qh_wchan = wchan;	/* valid until queue_unlock() is called */
589 	qp->qh_root = qrp;	/* valid until queue_unlock() is called */
590 	INCR32(qp->qh_lockcount);
591 	QVERIFY(qp);
592 	return (qp);
593 }
594 
595 /*
596  * Release a queue head.
597  */
598 void
599 queue_unlock(queue_head_t *qp)
600 {
601 	QVERIFY(qp);
602 	spin_lock_clear(&qp->qh_lock);
603 }
604 
605 /*
606  * For rwlock queueing, we must queue writers ahead of readers of the
607  * same priority.  We do this by making writers appear to have a half
608  * point higher priority for purposes of priority comparisons below.
609  */
/* Note: the macro argument is evaluated twice. */
610 #define	CMP_PRIO(ulwp)	((real_priority(ulwp) << 1) + (ulwp)->ul_writer)
611 
612 void
613 enqueue(queue_head_t *qp, ulwp_t *ulwp, int force_fifo)
614 {
615 	queue_root_t *qrp;
616 	ulwp_t **ulwpp;
617 	ulwp_t *next;
618 	int pri = CMP_PRIO(ulwp);
619 
620 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
621 	ASSERT(ulwp->ul_sleepq != qp);
622 
623 	if ((qrp = qp->qh_root) == NULL) {
624 		/* use the thread's queue root for the linkage */
625 		qrp = &ulwp->ul_queue_root;
626 		qrp->qr_next = qp->qh_hlist;
627 		qrp->qr_prev = NULL;
628 		qrp->qr_head = NULL;
629 		qrp->qr_tail = NULL;
630 		qrp->qr_wchan = qp->qh_wchan;
631 		qrp->qr_rtcount = 0;
632 		qrp->qr_qlen = 0;
633 		qrp->qr_qmax = 0;
634 		qp->qh_hlist->qr_prev = qrp;
635 		qp->qh_hlist = qrp;
636 		qp->qh_root = qrp;
637 		MAXINCR(qp->qh_hmax, qp->qh_hlen);
638 	}
639 
640 	/*
641 	 * LIFO queue ordering is unfair and can lead to starvation,
642 	 * but it gives better performance for heavily contended locks.
643 	 * We use thread_queue_fifo (range is 0..8) to determine
644 	 * the frequency of FIFO vs LIFO queuing:
645 	 *	0 : every 256th time	(almost always LIFO)
646 	 *	1 : every 128th time
647 	 *	2 : every 64th  time
648 	 *	3 : every 32nd  time
649 	 *	4 : every 16th  time	(the default value, mostly LIFO)
650 	 *	5 : every 8th   time
651 	 *	6 : every 4th   time
652 	 *	7 : every 2nd   time
653 	 *	8 : every time		(never LIFO, always FIFO)
654 	 * Note that there is always some degree of FIFO ordering.
655 	 * This breaks live lock conditions that occur in applications
656 	 * that are written assuming (incorrectly) that threads acquire
657 	 * locks fairly, that is, in roughly round-robin order.
658 	 * In any event, the queue is maintained in kernel priority order.
659 	 *
660 	 * If force_fifo is non-zero, fifo queueing is forced.
661 	 * SUSV3 requires this for semaphores.
662 	 */
663 	if (qrp->qr_head == NULL) {
664 		/*
665 		 * The queue is empty.  LIFO/FIFO doesn't matter.
666 		 */
667 		ASSERT(qrp->qr_tail == NULL);
668 		ulwpp = &qrp->qr_head;
669 	} else if (force_fifo |
670 	    (((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0)) {
671 		/*
672 		 * Enqueue after the last thread whose priority is greater
673 		 * than or equal to the priority of the thread being queued.
674 		 * Attempt first to go directly onto the tail of the queue.
675 		 */
676 		if (pri <= CMP_PRIO(qrp->qr_tail))
677 			ulwpp = &qrp->qr_tail->ul_link;
678 		else {
679 			for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
680 			    ulwpp = &next->ul_link)
681 				if (pri > CMP_PRIO(next))
682 					break;
683 		}
684 	} else {
685 		/*
686 		 * Enqueue before the first thread whose priority is less
687 		 * than or equal to the priority of the thread being queued.
688 		 * Hopefully we can go directly onto the head of the queue.
689 		 */
690 		for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
691 		    ulwpp = &next->ul_link)
692 			if (pri >= CMP_PRIO(next))
693 				break;
694 	}
695 	if ((ulwp->ul_link = *ulwpp) == NULL)
696 		qrp->qr_tail = ulwp;
697 	*ulwpp = ulwp;
698 
699 	ulwp->ul_sleepq = qp;
700 	ulwp->ul_wchan = qp->qh_wchan;
701 	ulwp->ul_qtype = qp->qh_type;
702 	if ((ulwp->ul_schedctl != NULL &&
703 	    ulwp->ul_schedctl->sc_cid == ulwp->ul_rtclassid) |
704 	    ulwp->ul_pilocks) {
705 		ulwp->ul_rtqueued = 1;
706 		qrp->qr_rtcount++;
707 	}
708 	MAXINCR(qrp->qr_qmax, qrp->qr_qlen);
709 	MAXINCR(qp->qh_qmax, qp->qh_qlen);
710 }
711 
712 /*
713  * Helper function for queue_slot() and queue_slot_rt().
714  * Try to find a non-suspended thread on the queue.
715  */
716 static ulwp_t **
717 queue_slot_runnable(ulwp_t **ulwpp, ulwp_t **prevp, int rt)
718 {
719 	ulwp_t *ulwp;
720 	ulwp_t **foundpp = NULL;
721 	int priority = -1;
722 	ulwp_t *prev;
723 	int tpri;
724 
725 	for (prev = NULL;
726 	    (ulwp = *ulwpp) != NULL;
727 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
728 		if (ulwp->ul_stop)	/* skip suspended threads */
729 			continue;
730 		tpri = rt? CMP_PRIO(ulwp) : 0;
731 		if (tpri > priority) {
732 			foundpp = ulwpp;
733 			*prevp = prev;
734 			priority = tpri;
735 			if (!rt)
736 				break;
737 		}
738 	}
739 	return (foundpp);
740 }
741 
742 /*
743  * For real-time, we search the entire queue because the dispatch
744  * (kernel) priorities may have changed since enqueueing.
745  */
746 static ulwp_t **
747 queue_slot_rt(ulwp_t **ulwpp_org, ulwp_t **prevp)
748 {
749 	ulwp_t **ulwpp = ulwpp_org;
750 	ulwp_t *ulwp = *ulwpp;
751 	ulwp_t **foundpp = ulwpp;
752 	int priority = CMP_PRIO(ulwp);
753 	ulwp_t *prev;
754 	int tpri;
755 
756 	for (prev = ulwp, ulwpp = &ulwp->ul_link;
757 	    (ulwp = *ulwpp) != NULL;
758 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
759 		tpri = CMP_PRIO(ulwp);
760 		if (tpri > priority) {
761 			foundpp = ulwpp;
762 			*prevp = prev;
763 			priority = tpri;
764 		}
765 	}
766 	ulwp = *foundpp;
767 
768 	/*
769 	 * Try not to return a suspended thread.
770 	 * This mimics the old libthread's behavior.
771 	 */
772 	if (ulwp->ul_stop &&
773 	    (ulwpp = queue_slot_runnable(ulwpp_org, prevp, 1)) != NULL) {
774 		foundpp = ulwpp;
775 		ulwp = *foundpp;
776 	}
777 	ulwp->ul_rt = 1;
778 	return (foundpp);
779 }
780 
781 ulwp_t **
782 queue_slot(queue_head_t *qp, ulwp_t **prevp, int *more)
783 {
784 	queue_root_t *qrp;
785 	ulwp_t **ulwpp;
786 	ulwp_t *ulwp;
787 	int rt;
788 
789 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
790 
791 	if ((qrp = qp->qh_root) == NULL || (ulwp = qrp->qr_head) == NULL) {
792 		*more = 0;
793 		return (NULL);		/* no lwps on the queue */
794 	}
795 	rt = (qrp->qr_rtcount != 0);
796 	*prevp = NULL;
797 	if (ulwp->ul_link == NULL) {	/* only one lwp on the queue */
798 		*more = 0;
799 		ulwp->ul_rt = rt;
800 		return (&qrp->qr_head);
801 	}
802 	*more = 1;
803 
804 	if (rt)		/* real-time queue */
805 		return (queue_slot_rt(&qrp->qr_head, prevp));
806 	/*
807 	 * Try not to return a suspended thread.
808 	 * This mimics the old libthread's behavior.
809 	 */
810 	if (ulwp->ul_stop &&
811 	    (ulwpp = queue_slot_runnable(&qrp->qr_head, prevp, 0)) != NULL) {
812 		ulwp = *ulwpp;
813 		ulwp->ul_rt = 0;
814 		return (ulwpp);
815 	}
816 	/*
817 	 * The common case; just pick the first thread on the queue.
818 	 */
819 	ulwp->ul_rt = 0;
820 	return (&qrp->qr_head);
821 }
822 
823 /*
824  * Common code for unlinking an lwp from a user-level sleep queue.
825  */
826 void
827 queue_unlink(queue_head_t *qp, ulwp_t **ulwpp, ulwp_t *prev)
828 {
829 	queue_root_t *qrp = qp->qh_root;
830 	queue_root_t *nqrp;
831 	ulwp_t *ulwp = *ulwpp;
832 	ulwp_t *next;
833 
834 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
835 	ASSERT(qp->qh_wchan != NULL && ulwp->ul_wchan == qp->qh_wchan);
836 
837 	DECR(qp->qh_qlen);
838 	DECR(qrp->qr_qlen);
839 	if (ulwp->ul_rtqueued) {
840 		ulwp->ul_rtqueued = 0;
841 		qrp->qr_rtcount--;
842 	}
843 	next = ulwp->ul_link;
844 	*ulwpp = next;
845 	ulwp->ul_link = NULL;
846 	if (qrp->qr_tail == ulwp)
847 		qrp->qr_tail = prev;
848 	if (qrp == &ulwp->ul_queue_root) {
849 		/*
850 		 * We can't continue to use the unlinked thread's
851 		 * queue root for the linkage.
852 		 */
853 		queue_root_t *qr_next = qrp->qr_next;
854 		queue_root_t *qr_prev = qrp->qr_prev;
855 
856 		if (qrp->qr_tail) {
857 			/* switch to using the last thread's queue root */
858 			ASSERT(qrp->qr_qlen != 0);
859 			nqrp = &qrp->qr_tail->ul_queue_root;
860 			*nqrp = *qrp;
861 			if (qr_next)
862 				qr_next->qr_prev = nqrp;
863 			if (qr_prev)
864 				qr_prev->qr_next = nqrp;
865 			else
866 				qp->qh_hlist = nqrp;
867 			qp->qh_root = nqrp;
868 		} else {
869 			/* empty queue root; just delete from the hash list */
870 			ASSERT(qrp->qr_qlen == 0);
871 			if (qr_next)
872 				qr_next->qr_prev = qr_prev;
873 			if (qr_prev)
874 				qr_prev->qr_next = qr_next;
875 			else
876 				qp->qh_hlist = qr_next;
877 			qp->qh_root = NULL;
878 			DECR(qp->qh_hlen);
879 		}
880 	}
881 }
882 
883 ulwp_t *
884 dequeue(queue_head_t *qp, int *more)
885 {
886 	ulwp_t **ulwpp;
887 	ulwp_t *ulwp;
888 	ulwp_t *prev;
889 
890 	if ((ulwpp = queue_slot(qp, &prev, more)) == NULL)
891 		return (NULL);
892 	ulwp = *ulwpp;
893 	queue_unlink(qp, ulwpp, prev);
894 	ulwp->ul_sleepq = NULL;
895 	ulwp->ul_wchan = NULL;
896 	return (ulwp);
897 }
898 
899 /*
900  * Return a pointer to the highest priority thread sleeping on wchan.
901  */
902 ulwp_t *
903 queue_waiter(queue_head_t *qp)
904 {
905 	ulwp_t **ulwpp;
906 	ulwp_t *prev;
907 	int more;
908 
909 	if ((ulwpp = queue_slot(qp, &prev, &more)) == NULL)
910 		return (NULL);
911 	return (*ulwpp);
912 }
913 
914 int
915 dequeue_self(queue_head_t *qp)
916 {
917 	ulwp_t *self = curthread;
918 	queue_root_t *qrp;
919 	ulwp_t **ulwpp;
920 	ulwp_t *ulwp;
921 	ulwp_t *prev;
922 	int found = 0;
923 
924 	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
925 
926 	/* find self on the sleep queue */
927 	if ((qrp = qp->qh_root) != NULL) {
928 		for (prev = NULL, ulwpp = &qrp->qr_head;
929 		    (ulwp = *ulwpp) != NULL;
930 		    prev = ulwp, ulwpp = &ulwp->ul_link) {
931 			if (ulwp == self) {
932 				queue_unlink(qp, ulwpp, prev);
933 				self->ul_cvmutex = NULL;
934 				self->ul_sleepq = NULL;
935 				self->ul_wchan = NULL;
936 				found = 1;
937 				break;
938 			}
939 		}
940 	}
941 
942 	if (!found)
943 		thr_panic("dequeue_self(): curthread not found on queue");
944 
945 	return ((qrp = qp->qh_root) != NULL && qrp->qr_head != NULL);
946 }
947 
948 /*
949  * Called from call_user_handler() and _thrp_suspend() to take
950  * ourself off of our sleep queue so we can grab locks.
951  */
952 void
953 unsleep_self(void)
954 {
955 	ulwp_t *self = curthread;
956 	queue_head_t *qp;
957 
958 	/*
959 	 * Calling enter_critical()/exit_critical() here would lead
960 	 * to recursion.  Just manipulate self->ul_critical directly.
961 	 */
962 	self->ul_critical++;
963 	while (self->ul_sleepq != NULL) {
964 		qp = queue_lock(self->ul_wchan, self->ul_qtype);
965 		/*
966 		 * We may have been moved from a CV queue to a
967 		 * mutex queue while we were attempting queue_lock().
968 		 * If so, just loop around and try again.
969 		 * dequeue_self() clears self->ul_sleepq.
970 		 */
971 		if (qp == self->ul_sleepq)
972 			(void) dequeue_self(qp);
973 		queue_unlock(qp);
974 	}
975 	self->ul_writer = 0;
976 	self->ul_critical--;
977 }
978 
979 /*
980  * Common code for calling the the ___lwp_mutex_timedlock() system call.
981  * Returns with mutex_owner and mutex_ownerpid set correctly.
982  */
983 static int
984 mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
985 {
986 	ulwp_t *self = curthread;
987 	uberdata_t *udp = self->ul_uberdata;
988 	int mtype = mp->mutex_type;
989 	hrtime_t begin_sleep;
990 	int acquired;
991 	int error;
992 
993 	self->ul_sp = stkptr();
994 	self->ul_wchan = mp;
995 	if (__td_event_report(self, TD_SLEEP, udp)) {
996 		self->ul_td_evbuf.eventnum = TD_SLEEP;
997 		self->ul_td_evbuf.eventdata = mp;
998 		tdb_event(TD_SLEEP, udp);
999 	}
1000 	if (msp) {
1001 		tdb_incr(msp->mutex_sleep);
1002 		begin_sleep = gethrtime();
1003 	}
1004 
1005 	DTRACE_PROBE1(plockstat, mutex__block, mp);
1006 
1007 	for (;;) {
1008 		/*
1009 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
1010 		 * means we successfully acquired the lock.
1011 		 */
1012 		if ((error = ___lwp_mutex_timedlock(mp, tsp, self)) != 0 &&
1013 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1014 			acquired = 0;
1015 			break;
1016 		}
1017 
1018 		if (mtype & USYNC_PROCESS) {
1019 			/*
1020 			 * Defend against forkall().  We may be the child,
1021 			 * in which case we don't actually own the mutex.
1022 			 */
1023 			enter_critical(self);
1024 			if (mp->mutex_ownerpid == udp->pid) {
1025 				exit_critical(self);
1026 				acquired = 1;
1027 				break;
1028 			}
1029 			exit_critical(self);
1030 		} else {
1031 			acquired = 1;
1032 			break;
1033 		}
1034 	}
1035 
1036 	if (msp)
1037 		msp->mutex_sleep_time += gethrtime() - begin_sleep;
1038 	self->ul_wchan = NULL;
1039 	self->ul_sp = 0;
1040 
1041 	if (acquired) {
1042 		ASSERT(mp->mutex_owner == (uintptr_t)self);
1043 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1044 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1045 	} else {
1046 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1047 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1048 	}
1049 
1050 	return (error);
1051 }
1052 
1053 /*
1054  * Common code for calling the ___lwp_mutex_trylock() system call.
1055  * Returns with mutex_owner and mutex_ownerpid set correctly.
1056  */
1057 int
1058 mutex_trylock_kernel(mutex_t *mp)
1059 {
1060 	ulwp_t *self = curthread;
1061 	uberdata_t *udp = self->ul_uberdata;
1062 	int mtype = mp->mutex_type;
1063 	int error;
1064 	int acquired;
1065 
1066 	for (;;) {
1067 		/*
1068 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
1069 		 * means we successfully acquired the lock.
1070 		 */
1071 		if ((error = ___lwp_mutex_trylock(mp, self)) != 0 &&
1072 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1073 			acquired = 0;
1074 			break;
1075 		}
1076 
1077 		if (mtype & USYNC_PROCESS) {
1078 			/*
1079 			 * Defend against forkall().  We may be the child,
1080 			 * in which case we don't actually own the mutex.
1081 			 */
1082 			enter_critical(self);
1083 			if (mp->mutex_ownerpid == udp->pid) {
1084 				exit_critical(self);
1085 				acquired = 1;
1086 				break;
1087 			}
1088 			exit_critical(self);
1089 		} else {
1090 			acquired = 1;
1091 			break;
1092 		}
1093 	}
1094 
1095 	if (acquired) {
1096 		ASSERT(mp->mutex_owner == (uintptr_t)self);
1097 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1098 	} else if (error != EBUSY) {
1099 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1100 	}
1101 
1102 	return (error);
1103 }
1104 
1105 volatile sc_shared_t *
1106 setup_schedctl(void)
1107 {
1108 	ulwp_t *self = curthread;
1109 	volatile sc_shared_t *scp;
1110 	sc_shared_t *tmp;
1111 
1112 	if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
1113 	    !self->ul_vfork &&			/* not a child of vfork() */
1114 	    !self->ul_schedctl_called) {	/* haven't been called before */
1115 		enter_critical(self);
1116 		self->ul_schedctl_called = &self->ul_uberdata->uberflags;
1117 		if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
1118 			self->ul_schedctl = scp = tmp;
1119 		exit_critical(self);
1120 	}
1121 	/*
1122 	 * Unless the call to setup_schedctl() is surrounded
1123 	 * by enter_critical()/exit_critical(), the address
1124 	 * we are returning could be invalid due to a forkall()
1125 	 * having occurred in another thread.
1126 	 */
1127 	return (scp);
1128 }
1129 
1130 /*
1131  * Interfaces from libsched, incorporated into libc.
1132  * libsched.so.1 is now a filter library onto libc.
1133  */
1134 #pragma weak schedctl_lookup = schedctl_init
1135 schedctl_t *
1136 schedctl_init(void)
1137 {
1138 	volatile sc_shared_t *scp = setup_schedctl();
1139 	return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
1140 }
1141 
1142 void
1143 schedctl_exit(void)
1144 {
     	/*
     	 * Takes no arguments and returns nothing; the body is empty,
     	 * so calling this has no effect.
     	 */
1145 }
1146 
1147 /*
1148  * Contract private interface for java.
1149  * Set up the schedctl data if it doesn't exist yet.
1150  * Return a pointer to the pointer to the schedctl data.
1151  */
1152 volatile sc_shared_t *volatile *
1153 _thr_schedctl(void)
1154 {
1155 	ulwp_t *self = curthread;
1156 	volatile sc_shared_t *volatile *ptr;
1157 
1158 	if (self->ul_vfork)
1159 		return (NULL);
1160 	if (*(ptr = &self->ul_schedctl) == NULL)
1161 		(void) setup_schedctl();
1162 	return (ptr);
1163 }
1164 
1165 /*
1166  * Block signals and attempt to block preemption.
1167  * no_preempt()/preempt() must be used in pairs but can be nested.
1168  */
1169 void
1170 no_preempt(ulwp_t *self)
1171 {
1172 	volatile sc_shared_t *scp;
1173 
1174 	if (self->ul_preempt++ == 0) {
1175 		enter_critical(self);
1176 		if ((scp = self->ul_schedctl) != NULL ||
1177 		    (scp = setup_schedctl()) != NULL) {
1178 			/*
1179 			 * Save the pre-existing preempt value.
1180 			 */
1181 			self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
1182 			scp->sc_preemptctl.sc_nopreempt = 1;
1183 		}
1184 	}
1185 }
1186 
1187 /*
1188  * Undo the effects of no_preempt().
1189  */
1190 void
1191 preempt(ulwp_t *self)
1192 {
1193 	volatile sc_shared_t *scp;
1194 
1195 	ASSERT(self->ul_preempt > 0);
1196 	if (--self->ul_preempt == 0) {
1197 		if ((scp = self->ul_schedctl) != NULL) {
1198 			/*
1199 			 * Restore the pre-existing preempt value.
1200 			 */
1201 			scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
1202 			if (scp->sc_preemptctl.sc_yield &&
1203 			    scp->sc_preemptctl.sc_nopreempt == 0) {
1204 				yield();
1205 				if (scp->sc_preemptctl.sc_yield) {
1206 					/*
1207 					 * Shouldn't happen.  This is either
1208 					 * a race condition or the thread
1209 					 * just entered the real-time class.
1210 					 */
1211 					yield();
1212 					scp->sc_preemptctl.sc_yield = 0;
1213 				}
1214 			}
1215 		}
1216 		exit_critical(self);
1217 	}
1218 }
1219 
1220 /*
1221  * If a call to preempt() would cause the current thread to yield or to
1222  * take deferred actions in exit_critical(), then unpark the specified
1223  * lwp so it can run while we delay.  Return the original lwpid if the
1224  * unpark was not performed, else return zero.  The tests are a repeat
1225  * of some of the tests in preempt(), above.  This is a statistical
1226  * optimization solely for cond_sleep_queue(), below.
1227  */
1228 static lwpid_t
1229 preempt_unpark(ulwp_t *self, lwpid_t lwpid)
1230 {
1231 	volatile sc_shared_t *scp = self->ul_schedctl;
1232 
1233 	ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
1234 	if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
1235 	    (self->ul_curplease && self->ul_critical == 1)) {
1236 		(void) __lwp_unpark(lwpid);
1237 		lwpid = 0;
1238 	}
1239 	return (lwpid);
1240 }
1241 
1242 /*
1243  * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1244  * If this fails, return EBUSY and let the caller deal with it.
1245  * If this succeeds, return 0 with mutex_owner set to curthread.
      *
      * This variant is for mutexes without USYNC_PROCESS (see the ASSERT).
      * Besides 0 and EBUSY the body can also return:
      *	ENOTRECOVERABLE	LOCK_NOTRECOVERABLE is set in mutex_flag; if the
      *			lock byte had been taken it is cleared again.
      *	EOWNERDEAD	the lock was taken but LOCK_OWNERDEAD is set.
      * EBUSY is also returned immediately when the caller already owns mp.
1246  */
1247 static int
1248 mutex_trylock_adaptive(mutex_t *mp, int tryhard)
1249 {
1250 	ulwp_t *self = curthread;
1251 	int error = EBUSY;
1252 	ulwp_t *ulwp;
1253 	volatile sc_shared_t *scp;
1254 	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
1255 	volatile uint64_t *ownerp = (volatile uint64_t *)&mp->mutex_owner;
1256 	uint32_t new_lockword;
1257 	int count = 0;
1258 	int max_count;
1259 	uint8_t max_spinners;
1260 
1261 	ASSERT(!(mp->mutex_type & USYNC_PROCESS));
1262 
     	/* no point in trying for a lock the caller already holds */
1263 	if (MUTEX_OWNED(mp, self))
1264 		return (EBUSY);
1265 
1266 	enter_critical(self);
1267 
1268 	/* short-cut, not definitive (see below) */
1269 	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1270 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1271 		error = ENOTRECOVERABLE;
1272 		goto done;
1273 	}
1274 
1275 	/*
1276 	 * Make one attempt to acquire the lock before
1277 	 * incurring the overhead of the spin loop.
1278 	 */
1279 	if (set_lock_byte(lockp) == 0) {
1280 		*ownerp = (uintptr_t)self;
1281 		error = 0;
1282 		goto done;
1283 	}
     	/* without 'tryhard' the single attempt above is all we make */
1284 	if (!tryhard)
1285 		goto done;
     	/* ncpus is filled in lazily, the first time it is found to be 0 */
1286 	if (ncpus == 0)
1287 		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
     	/*
     	 * Cap the spinner limit at ncpus - 1.  A limit of zero, or a
     	 * ul_adaptive_spin of zero, means no spinning at all.
     	 */
1288 	if ((max_spinners = self->ul_max_spinners) >= ncpus)
1289 		max_spinners = ncpus - 1;
1290 	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1291 	if (max_count == 0)
1292 		goto done;
1293 
1294 	/*
1295 	 * This spin loop is unfair to lwps that have already dropped into
1296 	 * the kernel to sleep.  They will starve on a highly-contended mutex.
1297 	 * This is just too bad.  The adaptive spin algorithm is intended
1298 	 * to allow programs with highly-contended locks (that is, broken
1299 	 * programs) to execute with reasonable speed despite their contention.
1300 	 * Being fair would reduce the speed of such programs and well-written
1301 	 * programs will not suffer in any case.
1302 	 */
     	/*
     	 * A -1 result means we do not spin; presumably the limit of
     	 * max_spinners was already reached (confirm in spinners_incr()).
     	 */
1303 	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
1304 		goto done;
1305 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
1306 	for (count = 1; ; count++) {
1307 		if (*lockp == 0 && set_lock_byte(lockp) == 0) {
1308 			*ownerp = (uintptr_t)self;
1309 			error = 0;
1310 			break;
1311 		}
1312 		if (count == max_count)
1313 			break;
1314 		SMT_PAUSE();
1315 		/*
1316 		 * Stop spinning if the mutex owner is not running on
1317 		 * a processor; it will not drop the lock any time soon
1318 		 * and we would just be wasting time to keep spinning.
1319 		 *
1320 		 * Note that we are looking at another thread (ulwp_t)
1321 		 * without ensuring that the other thread does not exit.
1322 		 * The scheme relies on ulwp_t structures never being
1323 		 * deallocated by the library (the library employs a free
1324 		 * list of ulwp_t structs that are reused when new threads
1325 		 * are created) and on schedctl shared memory never being
1326 		 * deallocated once created via __schedctl().
1327 		 *
1328 		 * Thus, the worst that can happen when the spinning thread
1329 		 * looks at the owner's schedctl data is that it is looking
1330 		 * at some other thread's schedctl data.  This almost never
1331 		 * happens and is benign when it does.
1332 		 */
1333 		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1334 		    ((scp = ulwp->ul_schedctl) == NULL ||
1335 		    scp->sc_state != SC_ONPROC))
1336 			break;
1337 	}
     	/* undo spinners_incr(); the returned lock word is examined next */
1338 	new_lockword = spinners_decr(&mp->mutex_lockword);
1339 	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1340 		/*
1341 		 * We haven't yet acquired the lock, the lock
1342 		 * is free, and there are no other spinners.
1343 		 * Make one final attempt to acquire the lock.
1344 		 *
1345 		 * This isn't strictly necessary since mutex_lock_queue()
1346 		 * (the next action this thread will take if it doesn't
1347 		 * acquire the lock here) makes one attempt to acquire
1348 		 * the lock before putting the thread to sleep.
1349 		 *
1350 		 * If the next action for this thread (on failure here)
1351 		 * were not to call mutex_lock_queue(), this would be
1352 		 * necessary for correctness, to avoid ending up with an
1353 		 * unheld mutex with waiters but no one to wake them up.
1354 		 */
1355 		if (set_lock_byte(lockp) == 0) {
1356 			*ownerp = (uintptr_t)self;
1357 			error = 0;
1358 		}
1359 		count++;
1360 	}
1361 
     	/*
     	 * Every path after enter_critical() ends up here.  error is
     	 * still EBUSY unless the lock byte was taken (0) or the
     	 * short-cut above set ENOTRECOVERABLE.
     	 */
1362 done:
1363 	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1364 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1365 		/*
1366 		 * We shouldn't own the mutex.
1367 		 * Just clear the lock; everyone has already been waked up.
1368 		 */
1369 		*ownerp = 0;
1370 		(void) clear_lockbyte(&mp->mutex_lockword);
1371 		error = ENOTRECOVERABLE;
1372 	}
1373 
1374 	exit_critical(self);
1375 
     	/*
     	 * count is nonzero only if the spin loop was entered, so the
     	 * mutex__spun probe fires only for callers that actually spun.
     	 */
1376 	if (error) {
1377 		if (count) {
1378 			DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
1379 		}
1380 		if (error != EBUSY) {
1381 			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1382 		}
1383 	} else {
1384 		if (count) {
1385 			DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
1386 		}
1387 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
     		/* the lock is held, but the caller is told the owner died */
1388 		if (mp->mutex_flag & LOCK_OWNERDEAD) {
1389 			ASSERT(mp->mutex_type & LOCK_ROBUST);
1390 			error = EOWNERDEAD;
1391 		}
1392 	}
1393 
1394 	return (error);
1395 }
1396 
1397 /*
1398  * Same as mutex_trylock_adaptive(), except specifically for queue locks.
1399  * The owner field is not set here; the caller (spin_lock_set()) sets it.
1400  */
1401 static int
1402 mutex_queuelock_adaptive(mutex_t *mp)
1403 {
1404 	ulwp_t *ulwp;
1405 	volatile sc_shared_t *scp;
1406 	volatile uint8_t *lockp;
1407 	volatile uint64_t *ownerp;
1408 	int count = curthread->ul_queue_spin;
1409 
1410 	ASSERT(mp->mutex_type == USYNC_THREAD);
1411 
1412 	if (count == 0)
1413 		return (EBUSY);
1414 
1415 	lockp = (volatile uint8_t *)&mp->mutex_lockw;
1416 	ownerp = (volatile uint64_t *)&mp->mutex_owner;
1417 	while (--count >= 0) {
1418 		if (*lockp == 0 && set_lock_byte(lockp) == 0)
1419 			return (0);
1420 		SMT_PAUSE();
1421 		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1422 		    ((scp = ulwp->ul_schedctl) == NULL ||
1423 		    scp->sc_state != SC_ONPROC))
1424 			break;
1425 	}
1426 
1427 	return (EBUSY);
1428 }
1429 
1430 /*
1431  * Like mutex_trylock_adaptive(), but for process-shared mutexes.
1432  * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1433  * If this fails, return EBUSY and let the caller deal with it.
1434  * If this succeeds, return 0 with mutex_owner set to curthread
1435  * and mutex_ownerpid set to the current pid.
      *
      * Besides 0 and EBUSY the body can also return:
      *	ENOTRECOVERABLE	LOCK_NOTRECOVERABLE is set in mutex_flag; if the
      *			lock had been taken it is released again.
      *	EOWNERDEAD or	the lock was taken but LOCK_OWNERDEAD or
      *	ELOCKUNMAPPED	LOCK_UNMAPPED is set in mutex_flag.
      * EBUSY is also returned immediately if shared_mutex_held(mp) is true.
      *
      * In the 32-bit sparc build each "} else" placed just before an
      * #endif pairs with the "if" that follows the #endif, so exactly one
      * of the two ways of taking the lock is used at each attempt.
1436  */
1437 static int
1438 mutex_trylock_process(mutex_t *mp, int tryhard)
1439 {
1440 	ulwp_t *self = curthread;
1441 	uberdata_t *udp = self->ul_uberdata;
1442 	int error = EBUSY;
1443 	volatile uint64_t *lockp = (volatile uint64_t *)&mp->mutex_lockword64;
1444 	uint32_t new_lockword;
1445 	int count = 0;
1446 	int max_count;
1447 	uint8_t max_spinners;
1448 
1449 #if defined(__sparc) && !defined(_LP64)
1450 	/* horrible hack, necessary only on 32-bit sparc */
1451 	int fix_alignment_problem =
1452 	    (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
1453 	    self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST));
1454 #endif
1455 
1456 	ASSERT(mp->mutex_type & USYNC_PROCESS);
1457 
     	/* no point in trying for a lock the caller already holds */
1458 	if (shared_mutex_held(mp))
1459 		return (EBUSY);
1460 
1461 	enter_critical(self);
1462 
1463 	/* short-cut, not definitive (see below) */
1464 	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1465 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1466 		error = ENOTRECOVERABLE;
1467 		goto done;
1468 	}
1469 
1470 	/*
1471 	 * Make one attempt to acquire the lock before
1472 	 * incurring the overhead of the spin loop.
1473 	 */
1474 #if defined(__sparc) && !defined(_LP64)
1475 	/* horrible hack, necessary only on 32-bit sparc */
1476 	if (fix_alignment_problem) {
1477 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
1478 			mp->mutex_ownerpid = udp->pid;
1479 			mp->mutex_owner = (uintptr_t)self;
1480 			error = 0;
1481 			goto done;
1482 		}
1483 	} else
1484 #endif
1485 	if (set_lock_byte64(lockp, udp->pid) == 0) {
1486 		mp->mutex_owner = (uintptr_t)self;
1487 		/* mp->mutex_ownerpid was set by set_lock_byte64() */
1488 		error = 0;
1489 		goto done;
1490 	}
     	/* without 'tryhard' the single attempt above is all we make */
1491 	if (!tryhard)
1492 		goto done;
     	/* ncpus is filled in lazily, the first time it is found to be 0 */
1493 	if (ncpus == 0)
1494 		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
     	/*
     	 * Cap the spinner limit at ncpus - 1.  A limit of zero, or a
     	 * ul_adaptive_spin of zero, means no spinning at all.
     	 */
1495 	if ((max_spinners = self->ul_max_spinners) >= ncpus)
1496 		max_spinners = ncpus - 1;
1497 	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1498 	if (max_count == 0)
1499 		goto done;
1500 
1501 	/*
1502 	 * This is a process-shared mutex.
1503 	 * We cannot know if the owner is running on a processor.
1504 	 * We just spin and hope that it is on a processor.
1505 	 */
1506 	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1)
1507 		goto done;
1508 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
1509 	for (count = 1; ; count++) {
1510 #if defined(__sparc) && !defined(_LP64)
1511 		/* horrible hack, necessary only on 32-bit sparc */
1512 		if (fix_alignment_problem) {
1513 			if ((*lockp & LOCKMASK64) == 0 &&
1514 			    set_lock_byte(&mp->mutex_lockw) == 0) {
1515 				mp->mutex_ownerpid = udp->pid;
1516 				mp->mutex_owner = (uintptr_t)self;
1517 				error = 0;
1518 				break;
1519 			}
1520 		} else
1521 #endif
1522 		if ((*lockp & LOCKMASK64) == 0 &&
1523 		    set_lock_byte64(lockp, udp->pid) == 0) {
1524 			mp->mutex_owner = (uintptr_t)self;
1525 			/* mp->mutex_ownerpid was set by set_lock_byte64() */
1526 			error = 0;
1527 			break;
1528 		}
1529 		if (count == max_count)
1530 			break;
1531 		SMT_PAUSE();
1532 	}
     	/* undo spinners_incr(); the returned lock word is examined next */
1533 	new_lockword = spinners_decr(&mp->mutex_lockword);
1534 	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1535 		/*
1536 		 * We haven't yet acquired the lock, the lock
1537 		 * is free, and there are no other spinners.
1538 		 * Make one final attempt to acquire the lock.
1539 		 *
1540 		 * This isn't strictly necessary since mutex_lock_kernel()
1541 		 * (the next action this thread will take if it doesn't
1542 		 * acquire the lock here) makes one attempt to acquire
1543 		 * the lock before putting the thread to sleep.
1544 		 *
1545 		 * If the next action for this thread (on failure here)
1546 		 * were not to call mutex_lock_kernel(), this would be
1547 		 * necessary for correctness, to avoid ending up with an
1548 		 * unheld mutex with waiters but no one to wake them up.
1549 		 */
1550 #if defined(__sparc) && !defined(_LP64)
1551 		/* horrible hack, necessary only on 32-bit sparc */
1552 		if (fix_alignment_problem) {
1553 			if (set_lock_byte(&mp->mutex_lockw) == 0) {
1554 				mp->mutex_ownerpid = udp->pid;
1555 				mp->mutex_owner = (uintptr_t)self;
1556 				error = 0;
1557 			}
1558 		} else
1559 #endif
1560 		if (set_lock_byte64(lockp, udp->pid) == 0) {
1561 			mp->mutex_owner = (uintptr_t)self;
1562 			/* mp->mutex_ownerpid was set by set_lock_byte64() */
1563 			error = 0;
1564 		}
1565 		count++;
1566 	}
1567 
     	/*
     	 * Every path after enter_critical() ends up here.  error is
     	 * still EBUSY unless the lock was taken (0) or the short-cut
     	 * above set ENOTRECOVERABLE.
     	 */
1568 done:
1569 	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1570 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1571 		/*
1572 		 * We shouldn't own the mutex.
1573 		 * Just clear the lock; everyone has already been waked up.
1574 		 */
1575 		mp->mutex_owner = 0;
1576 		/* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1577 		(void) clear_lockbyte64(&mp->mutex_lockword64);
1578 		error = ENOTRECOVERABLE;
1579 	}
1580 
1581 	exit_critical(self);
1582 
     	/*
     	 * count is nonzero only if the spin loop was entered, so the
     	 * mutex__spun probe fires only for callers that actually spun.
     	 */
1583 	if (error) {
1584 		if (count) {
1585 			DTRACE_PROBE3(plockstat, mutex__spun, mp, 0, count);
1586 		}
1587 		if (error != EBUSY) {
1588 			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1589 		}
1590 	} else {
1591 		if (count) {
1592 			DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
1593 		}
1594 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
     		/*
     		 * The lock is held, but a flag is pending.  LOCK_OWNERDEAD
     		 * takes precedence; LOCK_UNMAPPED alone is reported as
     		 * ELOCKUNMAPPED only for USYNC_PROCESS_ROBUST mutexes and
     		 * as EOWNERDEAD otherwise.
     		 */
1595 		if (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1596 			ASSERT(mp->mutex_type & LOCK_ROBUST);
1597 			if (mp->mutex_flag & LOCK_OWNERDEAD)
1598 				error = EOWNERDEAD;
1599 			else if (mp->mutex_type & USYNC_PROCESS_ROBUST)
1600 				error = ELOCKUNMAPPED;
1601 			else
1602 				error = EOWNERDEAD;
1603 		}
1604 	}
1605 
1606 	return (error);
1607 }
1608 
1609 /*
1610  * Mutex wakeup code for releasing a USYNC_THREAD mutex.
1611  * Returns the lwpid of the thread that was dequeued, if any.
1612  * The caller of mutex_wakeup() must call __lwp_unpark(lwpid)
1613  * to wake up the specified lwp.
1614  */
1615 static lwpid_t
1616 mutex_wakeup(mutex_t *mp)
1617 {
1618 	lwpid_t lwpid = 0;
1619 	int more;
1620 	queue_head_t *qp;
1621 	ulwp_t *ulwp;
1622 
1623 	/*
1624 	 * Dequeue a waiter from the sleep queue.  Don't touch the mutex
1625 	 * waiters bit if no one was found on the queue because the mutex
1626 	 * might have been deallocated or reallocated for another purpose.
1627 	 */
1628 	qp = queue_lock(mp, MX);
1629 	if ((ulwp = dequeue(qp, &more)) != NULL) {
1630 		lwpid = ulwp->ul_lwpid;
1631 		mp->mutex_waiters = more;
1632 	}
1633 	queue_unlock(qp);
1634 	return (lwpid);
1635 }
1636 
1637 /*
1638  * Mutex wakeup code for releasing all waiters on a USYNC_THREAD mutex.
1639  */
1640 static void
1641 mutex_wakeup_all(mutex_t *mp)
1642 {
1643 	queue_head_t *qp;
1644 	queue_root_t *qrp;
1645 	int nlwpid = 0;
1646 	int maxlwps = MAXLWPS;
1647 	ulwp_t *ulwp;
1648 	lwpid_t buffer[MAXLWPS];
1649 	lwpid_t *lwpid = buffer;
1650 
1651 	/*
1652 	 * Walk the list of waiters and prepare to wake up all of them.
1653 	 * The waiters flag has already been cleared from the mutex.
1654 	 *
1655 	 * We keep track of lwpids that are to be unparked in lwpid[].
1656 	 * __lwp_unpark_all() is called to unpark all of them after
1657 	 * they have been removed from the sleep queue and the sleep
1658 	 * queue lock has been dropped.  If we run out of space in our
1659 	 * on-stack buffer, we need to allocate more but we can't call
1660 	 * lmalloc() because we are holding a queue lock when the overflow
1661 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
1662 	 * either because the application may have allocated a small
1663 	 * stack and we don't want to overrun the stack.  So we call
1664 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
1665 	 * system call directly since that path acquires no locks.
1666 	 */
1667 	qp = queue_lock(mp, MX);
1668 	for (;;) {
1669 		if ((qrp = qp->qh_root) == NULL ||
1670 		    (ulwp = qrp->qr_head) == NULL)
1671 			break;
1672 		ASSERT(ulwp->ul_wchan == mp);
1673 		queue_unlink(qp, &qrp->qr_head, NULL);
1674 		ulwp->ul_sleepq = NULL;
1675 		ulwp->ul_wchan = NULL;
1676 		if (nlwpid == maxlwps)
1677 			lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
1678 		lwpid[nlwpid++] = ulwp->ul_lwpid;
1679 	}
1680 
1681 	if (nlwpid == 0) {
1682 		queue_unlock(qp);
1683 	} else {
1684 		mp->mutex_waiters = 0;
1685 		no_preempt(curthread);
1686 		queue_unlock(qp);
1687 		if (nlwpid == 1)
1688 			(void) __lwp_unpark(lwpid[0]);
1689 		else
1690 			(void) __lwp_unpark_all(lwpid, nlwpid);
1691 		preempt(curthread);
1692 	}
1693 
1694 	if (lwpid != buffer)
1695 		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
1696 }
1697 
1698 /*
1699  * Release a process-private mutex.
1700  * As an optimization, if there are waiters but there are also spinners
1701  * attempting to acquire the mutex, then don't bother waking up a waiter;
1702  * one of the spinners will acquire the mutex soon and it would be a waste
1703  * of resources to wake up some thread just to have it spin for a while
1704  * and then possibly go back to sleep.  See mutex_trylock_adaptive().
1705  */
1706 static lwpid_t
1707 mutex_unlock_queue(mutex_t *mp, int release_all)
1708 {
1709 	ulwp_t *self = curthread;
1710 	lwpid_t lwpid = 0;
1711 	uint32_t old_lockword;
1712 
1713 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1714 	sigoff(self);
1715 	mp->mutex_owner = 0;
1716 	old_lockword = clear_lockbyte(&mp->mutex_lockword);
1717 	if ((old_lockword & WAITERMASK) &&
1718 	    (release_all || (old_lockword & SPINNERMASK) == 0)) {
1719 		no_preempt(self);	/* ensure a prompt wakeup */
1720 		if (release_all)
1721 			mutex_wakeup_all(mp);
1722 		else
1723 			lwpid = mutex_wakeup(mp);
1724 		if (lwpid == 0)
1725 			preempt(self);
1726 	}
1727 	sigon(self);
1728 	return (lwpid);
1729 }
1730 
1731 /*
1732  * Like mutex_unlock_queue(), but for process-shared mutexes.
1733  */
1734 static void
1735 mutex_unlock_process(mutex_t *mp, int release_all)
1736 {
1737 	ulwp_t *self = curthread;
1738 	uint64_t old_lockword64;
1739 
1740 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1741 	sigoff(self);
1742 	mp->mutex_owner = 0;
1743 #if defined(__sparc) && !defined(_LP64)
1744 	/* horrible hack, necessary only on 32-bit sparc */
1745 	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
1746 	    self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST)) {
1747 		uint32_t old_lockword;
1748 		mp->mutex_ownerpid = 0;
1749 		old_lockword = clear_lockbyte(&mp->mutex_lockword);
1750 		if ((old_lockword & WAITERMASK) &&
1751 		    (release_all || (old_lockword & SPINNERMASK) == 0)) {
1752 			no_preempt(self);	/* ensure a prompt wakeup */
1753 			(void) ___lwp_mutex_wakeup(mp, release_all);
1754 			preempt(self);
1755 		}
1756 		sigon(self);
1757 		return;
1758 	}
1759 #endif
1760 	/* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1761 	old_lockword64 = clear_lockbyte64(&mp->mutex_lockword64);
1762 	if ((old_lockword64 & WAITERMASK64) &&
1763 	    (release_all || (old_lockword64 & SPINNERMASK64) == 0)) {
1764 		no_preempt(self);	/* ensure a prompt wakeup */
1765 		(void) ___lwp_mutex_wakeup(mp, release_all);
1766 		preempt(self);
1767 	}
1768 	sigon(self);
1769 }
1770 
1771 void
1772 stall(void)
1773 {
1774 	for (;;)
1775 		(void) mutex_lock_kernel(&stall_mutex, NULL, NULL);
1776 }
1777 
1778 /*
1779  * Acquire a USYNC_THREAD mutex via user-level sleep queues.
1780  * We failed set_lock_byte(&mp->mutex_lockw) before coming here.
1781  * If successful, returns with mutex_owner set correctly.
      *
      * self is the calling thread and mp the mutex.  msp, when not NULL,
      * receives the sleep count and sleep time.  tsp is handed unchanged
      * to __lwp_park().
      *
      * Returns 0 or EOWNERDEAD with the lock held.  Returns ENOTRECOVERABLE,
      * or the EINVAL/ETIME error produced by __lwp_park(), without the lock.
      * EINTR from __lwp_park() is never returned; it causes a retry.
1782  */
1783 int
1784 mutex_lock_queue(ulwp_t *self, tdb_mutex_stats_t *msp, mutex_t *mp,
1785     timespec_t *tsp)
1786 {
1787 	uberdata_t *udp = curthread->ul_uberdata;
1788 	queue_head_t *qp;
1789 	hrtime_t begin_sleep;
1790 	int error = 0;
1791 
1792 	self->ul_sp = stkptr();
     	/* report a TD_SLEEP event when __td_event_report() asks for it */
1793 	if (__td_event_report(self, TD_SLEEP, udp)) {
1794 		self->ul_wchan = mp;
1795 		self->ul_td_evbuf.eventnum = TD_SLEEP;
1796 		self->ul_td_evbuf.eventdata = mp;
1797 		tdb_event(TD_SLEEP, udp);
1798 	}
     	/* begin_sleep is set, and later read, only when msp is not NULL */
1799 	if (msp) {
1800 		tdb_incr(msp->mutex_sleep);
1801 		begin_sleep = gethrtime();
1802 	}
1803 
1804 	DTRACE_PROBE1(plockstat, mutex__block, mp);
1805 
1806 	/*
1807 	 * Put ourself on the sleep queue, and while we are
1808 	 * unable to grab the lock, go park in the kernel.
1809 	 * Take ourself off the sleep queue after we acquire the lock.
1810 	 * The waiter bit can be set/cleared only while holding the queue lock.
1811 	 */
1812 	qp = queue_lock(mp, MX);
1813 	enqueue(qp, self, 0);
1814 	mp->mutex_waiters = 1;
1815 	for (;;) {
1816 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
1817 			mp->mutex_owner = (uintptr_t)self;
1818 			mp->mutex_waiters = dequeue_self(qp);
1819 			break;
1820 		}
1821 		set_parking_flag(self, 1);
1822 		queue_unlock(qp);
1823 		/*
1824 		 * __lwp_park() will return the residual time in tsp
1825 		 * if we are unparked before the timeout expires.
1826 		 */
1827 		error = __lwp_park(tsp, 0);
1828 		set_parking_flag(self, 0);
1829 		/*
1830 		 * We could have taken a signal or suspended ourself.
1831 		 * If we did, then we removed ourself from the queue.
1832 		 * Someone else may have removed us from the queue
1833 		 * as a consequence of mutex_unlock().  We may have
1834 		 * gotten a timeout from __lwp_park().  Or we may still
1835 		 * be on the queue and this is just a spurious wakeup.
1836 		 */
1837 		qp = queue_lock(mp, MX);
     		/*
     		 * ul_sleepq == NULL: we are no longer on the queue.
     		 * Either give up (any error but EINTR), take the lock
     		 * if it is free, or put ourself back on the queue.
     		 */
1838 		if (self->ul_sleepq == NULL) {
1839 			if (error) {
1840 				mp->mutex_waiters = queue_waiter(qp)? 1 : 0;
1841 				if (error != EINTR)
1842 					break;
1843 				error = 0;
1844 			}
1845 			if (set_lock_byte(&mp->mutex_lockw) == 0) {
1846 				mp->mutex_owner = (uintptr_t)self;
1847 				break;
1848 			}
1849 			enqueue(qp, self, 0);
1850 			mp->mutex_waiters = 1;
1851 		}
1852 		ASSERT(self->ul_sleepq == qp &&
1853 		    self->ul_qtype == MX &&
1854 		    self->ul_wchan == mp);
     		/*
     		 * Here we are on the queue.  EINTR is discarded and
     		 * the loop retried; any other error takes us off the
     		 * queue and ends the loop.
     		 */
1855 		if (error) {
1856 			if (error != EINTR) {
1857 				mp->mutex_waiters = dequeue_self(qp);
1858 				break;
1859 			}
1860 			error = 0;
1861 		}
1862 	}
     	/* the loop is always left off the queue, holding the queue lock */
1863 	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
1864 	    self->ul_wchan == NULL);
1865 	self->ul_sp = 0;
1866 
1867 	ASSERT(error == 0 || error == EINVAL || error == ETIME);
1868 
1869 	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1870 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1871 		/*
1872 		 * We shouldn't own the mutex.
1873 		 * Just clear the lock; everyone has already been waked up.
1874 		 */
1875 		mp->mutex_owner = 0;
1876 		(void) clear_lockbyte(&mp->mutex_lockword);
1877 		error = ENOTRECOVERABLE;
1878 	}
1879 
1880 	queue_unlock(qp);
1881 
1882 	if (msp)
1883 		msp->mutex_sleep_time += gethrtime() - begin_sleep;
1884 
1885 	if (error) {
1886 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1887 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1888 	} else {
1889 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1890 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
     		/* the lock is held, but the caller is told the owner died */
1891 		if (mp->mutex_flag & LOCK_OWNERDEAD) {
1892 			ASSERT(mp->mutex_type & LOCK_ROBUST);
1893 			error = EOWNERDEAD;
1894 		}
1895 	}
1896 
1897 	return (error);
1898 }
1899 
1900 static int
1901 mutex_recursion(mutex_t *mp, int mtype, int try)
1902 {
1903 	ASSERT(mutex_held(mp));
1904 	ASSERT(mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK));
1905 	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
1906 
1907 	if (mtype & LOCK_RECURSIVE) {
1908 		if (mp->mutex_rcount == RECURSION_MAX) {
1909 			DTRACE_PROBE2(plockstat, mutex__error, mp, EAGAIN);
1910 			return (EAGAIN);
1911 		}
1912 		mp->mutex_rcount++;
1913 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 1, 0);
1914 		return (0);
1915 	}
1916 	if (try == MUTEX_LOCK) {
1917 		DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1918 		return (EDEADLK);
1919 	}
1920 	return (EBUSY);
1921 }
1922 
1923 /*
1924  * Register this USYNC_PROCESS|LOCK_ROBUST mutex with the kernel so
1925  * it can apply LOCK_OWNERDEAD|LOCK_UNMAPPED if it becomes necessary.
1926  * We use tdb_hash_lock here and in the synch object tracking code in
1927  * the tdb_agent.c file.  There is no conflict between these two usages.
1928  */
1929 void
1930 register_lock(mutex_t *mp)
1931 {
1932 	uberdata_t *udp = curthread->ul_uberdata;
1933 	uint_t hash = LOCK_HASH(mp);
1934 	robust_t *rlp;
1935 	robust_t *invalid;
1936 	robust_t **rlpp;
1937 	robust_t **table;
1938 
1939 	if ((table = udp->robustlocks) == NULL) {
1940 		lmutex_lock(&udp->tdb_hash_lock);
1941 		if ((table = udp->robustlocks) == NULL) {
1942 			table = lmalloc(LOCKHASHSZ * sizeof (robust_t *));
1943 			membar_producer();
1944 			udp->robustlocks = table;
1945 		}
1946 		lmutex_unlock(&udp->tdb_hash_lock);
1947 	}
1948 	membar_consumer();
1949 
1950 	/*
1951 	 * First search the registered table with no locks held.
1952 	 * This is safe because the table never shrinks
1953 	 * and we can only get a false negative.
1954 	 */
1955 	for (rlp = table[hash]; rlp != NULL; rlp = rlp->robust_next) {
1956 		if (rlp->robust_lock == mp)	/* already registered */
1957 			return;
1958 	}
1959 
1960 	/*
1961 	 * The lock was not found.
1962 	 * Repeat the operation with tdb_hash_lock held.
1963 	 */
1964 	lmutex_lock(&udp->tdb_hash_lock);
1965 
1966 	invalid = NULL;
1967 	for (rlpp = &table[hash];
1968 	    (rlp = *rlpp) != NULL;
1969 	    rlpp = &rlp->robust_next) {
1970 		if (rlp->robust_lock == mp) {	/* already registered */
1971 			lmutex_unlock(&udp->tdb_hash_lock);
1972 			return;
1973 		}
1974 		/* remember the first invalid entry, if any */
1975 		if (rlp->robust_lock == INVALID_ADDR && invalid == NULL)
1976 			invalid = rlp;
1977 	}
1978 
1979 	/*
1980 	 * The lock has never been registered.
1981 	 * Add it to the table and register it now.
1982 	 */
1983 	if ((rlp = invalid) != NULL) {
1984 		/*
1985 		 * Reuse the invalid entry we found above.
1986 		 * The linkages are still correct.
1987 		 */
1988 		rlp->robust_lock = mp;
1989 		membar_producer();
1990 	} else {
1991 		/*
1992 		 * Allocate a new entry and add it to
1993 		 * the hash table and to the global list.
1994 		 */
1995 		rlp = lmalloc(sizeof (*rlp));
1996 		rlp->robust_lock = mp;
1997 		rlp->robust_next = NULL;
1998 		rlp->robust_list = udp->robustlist;
1999 		udp->robustlist = rlp;
2000 		membar_producer();
2001 		*rlpp = rlp;
2002 	}
2003 
2004 	lmutex_unlock(&udp->tdb_hash_lock);
2005 
2006 	(void) ___lwp_mutex_register(mp, &rlp->robust_lock);
2007 }
2008 
2009 /*
2010  * This is called in the child of fork()/forkall() to start over
2011  * with a clean slate.  (Each process must register its own locks.)
2012  * No locks are needed because all other threads are suspended or gone.
2013  */
2014 void
2015 unregister_locks(void)
2016 {
2017 	uberdata_t *udp = curthread->ul_uberdata;
2018 	robust_t **table;
2019 	robust_t *rlp;
2020 	robust_t *next;
2021 
2022 	/*
2023 	 * Do this first, before calling lfree().
2024 	 */
2025 	table = udp->robustlocks;
2026 	udp->robustlocks = NULL;
2027 	rlp = udp->robustlist;
2028 	udp->robustlist = NULL;
2029 
2030 	/*
2031 	 * Do this by traversing the global list, not the hash table.
2032 	 */
2033 	while (rlp != NULL) {
2034 		next = rlp->robust_list;
2035 		lfree(rlp, sizeof (*rlp));
2036 		rlp = next;
2037 	}
2038 	if (table != NULL)
2039 		lfree(table, LOCKHASHSZ * sizeof (robust_t *));
2040 }
2041 
2042 /*
2043  * Returns with mutex_owner set correctly.
 *
 * General-purpose lock path, used when none of the callers' fast paths
 * apply.  'tsp' is an optional relative timeout (NULL means none) and
 * 'try' is MUTEX_LOCK or MUTEX_TRY, optionally OR'ed with MUTEX_NOCEIL
 * to skip the LOCK_PRIO_PROTECT ceiling handling.
 *
 * Returns 0 or an error number.  EOWNERDEAD and ELOCKUNMAPPED are
 * handled like success for bookkeeping purposes: the lock is remembered
 * (if LOCK_ROBUST) and hold-time statistics are started.
2044  */
2045 int
2046 mutex_lock_internal(mutex_t *mp, timespec_t *tsp, int try)
2047 {
2048 	ulwp_t *self = curthread;
2049 	uberdata_t *udp = self->ul_uberdata;
2050 	int mtype = mp->mutex_type;
2051 	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2052 	int error = 0;
2053 	int noceil = try & MUTEX_NOCEIL;
2054 	uint8_t ceil;
2055 	int myprio;
2056 
	/* MUTEX_NOCEIL was captured in 'noceil' above; strip it from 'try'. */
2057 	try &= ~MUTEX_NOCEIL;
2058 	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
2059 
2060 	if (!self->ul_schedctl_called)
2061 		(void) setup_schedctl();
2062 
2063 	if (msp && try == MUTEX_TRY)
2064 		tdb_incr(msp->mutex_try);
2065 
	/* Relocking a mutex we already hold: mutex_recursion() picks the result. */
2066 	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && mutex_held(mp))
2067 		return (mutex_recursion(mp, mtype, try));
2068 
	/* With error detection on, report an untimed blocking relock. */
2069 	if (self->ul_error_detection && try == MUTEX_LOCK &&
2070 	    tsp == NULL && mutex_held(mp))
2071 		lock_error(mp, "mutex_lock", NULL, NULL);
2072 
	/*
	 * Priority-ceiling handling: the caller's scheduling class must
	 * match ul_rtclassid (else EPERM) and its priority must not exceed
	 * the mutex ceiling (else EINVAL).  The mutex is added to our
	 * ceiling list before we try to acquire it; the default case of
	 * the final switch undoes this if the acquisition fails.
	 */
2073 	if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2074 		update_sched(self);
2075 		if (self->ul_cid != self->ul_rtclassid) {
2076 			DTRACE_PROBE2(plockstat, mutex__error, mp, EPERM);
2077 			return (EPERM);
2078 		}
2079 		ceil = mp->mutex_ceiling;
2080 		myprio = self->ul_epri? self->ul_epri : self->ul_pri;
2081 		if (myprio > ceil) {
2082 			DTRACE_PROBE2(plockstat, mutex__error, mp, EINVAL);
2083 			return (EINVAL);
2084 		}
2085 		if ((error = _ceil_mylist_add(mp)) != 0) {
2086 			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
2087 			return (error);
2088 		}
2089 		if (myprio < ceil)
2090 			_ceil_prio_inherit(ceil);
2091 	}
2092 
	/* Only mutexes that are both process-shared and robust are registered. */
2093 	if ((mtype & (USYNC_PROCESS | LOCK_ROBUST))
2094 	    == (USYNC_PROCESS | LOCK_ROBUST))
2095 		register_lock(mp);
2096 
2097 	if (mtype & LOCK_PRIO_INHERIT) {
2098 		/* go straight to the kernel */
2099 		if (try == MUTEX_TRY)
2100 			error = mutex_trylock_kernel(mp);
2101 		else	/* MUTEX_LOCK */
2102 			error = mutex_lock_kernel(mp, tsp, msp);
2103 		/*
2104 		 * The kernel never sets or clears the lock byte
2105 		 * for LOCK_PRIO_INHERIT mutexes.
2106 		 * Set it here for consistency.
2107 		 */
2108 		switch (error) {
2109 		case 0:
2110 			self->ul_pilocks++;
2111 			mp->mutex_lockw = LOCKSET;
2112 			break;
2113 		case EOWNERDEAD:
2114 		case ELOCKUNMAPPED:
2115 			self->ul_pilocks++;
2116 			mp->mutex_lockw = LOCKSET;
2117 			/* FALLTHROUGH */
2118 		case ENOTRECOVERABLE:
2119 			ASSERT(mtype & LOCK_ROBUST);
2120 			break;
2121 		case EDEADLK:
			/*
			 * EDEADLK came back from the kernel call above;
			 * translate it into what the caller asked for:
			 * EBUSY for a trylock, a slept-out ETIME for a
			 * timed lock, or a real stall for a plain lock.
			 */
2122 			if (try == MUTEX_TRY) {
2123 				error = EBUSY;
2124 			} else if (tsp != NULL) {	/* simulate a timeout */
2125 				/*
2126 				 * Note: mutex_timedlock() never returns EINTR.
2127 				 */
2128 				timespec_t ts = *tsp;
2129 				timespec_t rts;
2130 
				/* restart with the remaining time after EINTR */
2131 				while (__nanosleep(&ts, &rts) == EINTR)
2132 					ts = rts;
2133 				error = ETIME;
2134 			} else {		/* simulate a deadlock */
2135 				stall();
2136 			}
2137 			break;
2138 		}
2139 	} else if (mtype & USYNC_PROCESS) {
		/* try in user mode first; block in the kernel only for MUTEX_LOCK */
2140 		error = mutex_trylock_process(mp, try == MUTEX_LOCK);
2141 		if (error == EBUSY && try == MUTEX_LOCK)
2142 			error = mutex_lock_kernel(mp, tsp, msp);
2143 	} else {	/* USYNC_THREAD */
		/* try first; queue up via mutex_lock_queue() only for MUTEX_LOCK */
2144 		error = mutex_trylock_adaptive(mp, try == MUTEX_LOCK);
2145 		if (error == EBUSY && try == MUTEX_LOCK)
2146 			error = mutex_lock_queue(self, msp, mp, tsp);
2147 	}
2148 
	/* Common post-processing for all three mutex flavors. */
2149 	switch (error) {
2150 	case 0:
2151 	case EOWNERDEAD:
2152 	case ELOCKUNMAPPED:
2153 		if (mtype & LOCK_ROBUST)
2154 			remember_lock(mp);
2155 		if (msp)
2156 			record_begin_hold(msp);
2157 		break;
2158 	default:
		/* Acquisition failed: back out the ceiling bookkeeping done above. */
2159 		if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2160 			(void) _ceil_mylist_del(mp);
2161 			if (myprio < ceil)
2162 				_ceil_prio_waive();
2163 		}
		/* A failed trylock is counted and reported as a TD_LOCK_TRY event. */
2164 		if (try == MUTEX_TRY) {
2165 			if (msp)
2166 				tdb_incr(msp->mutex_try_fail);
2167 			if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2168 				self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2169 				tdb_event(TD_LOCK_TRY, udp);
2170 			}
2171 		}
2172 		break;
2173 	}
2174 
2175 	return (error);
2176 }
2177 
/*
 * Fast path for a USYNC_PROCESS mutex whose only other type bits are
 * LOCK_RECURSIVE and/or LOCK_ERRORCHECK ('mtype' is passed in by the
 * caller).  'try' is MUTEX_LOCK or MUTEX_TRY; 'tsp' is handed to
 * mutex_lock_kernel() if we end up blocking.
 *
 * Makes one attempt to set the lock word.  If that fails: a relock by the
 * current owner goes to mutex_recursion(); MUTEX_LOCK retries through
 * mutex_trylock_process() and then mutex_lock_kernel(); MUTEX_TRY reports
 * a TD_LOCK_TRY event (when enabled) and returns EBUSY.
 */
2178 int
2179 fast_process_lock(mutex_t *mp, timespec_t *tsp, int mtype, int try)
2180 {
2181 	ulwp_t *self = curthread;
2182 	uberdata_t *udp = self->ul_uberdata;
2183 
2184 	/*
2185 	 * We know that USYNC_PROCESS is set in mtype and that
2186 	 * zero, one, or both of the flags LOCK_RECURSIVE and
2187 	 * LOCK_ERRORCHECK are set, and that no other flags are set.
2188 	 */
2189 	ASSERT((mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0);
2190 	enter_critical(self);
2191 #if defined(__sparc) && !defined(_LP64)
2192 	/* horrible hack, necessary only on 32-bit sparc */
	/*
	 * For a misaligned mutex (when ul_misaligned is set) the 64-bit
	 * lock word operation is avoided: only the lock byte is set
	 * atomically and the owner pid is stored separately.
	 */
2193 	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2194 	    self->ul_misaligned) {
2195 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2196 			mp->mutex_ownerpid = udp->pid;
2197 			mp->mutex_owner = (uintptr_t)self;
2198 			exit_critical(self);
2199 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2200 			return (0);
2201 		}
2202 	} else
2203 #endif
	/* NOTE: on 32-bit sparc this 'if' is the body of the 'else' above. */
2204 	if (set_lock_byte64(&mp->mutex_lockword64, udp->pid) == 0) {
2205 		mp->mutex_owner = (uintptr_t)self;
2206 		/* mp->mutex_ownerpid was set by set_lock_byte64() */
2207 		exit_critical(self);
2208 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2209 		return (0);
2210 	}
2211 	exit_critical(self);
2212 
	/* The single attempt failed; see whether we are the current owner. */
2213 	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && shared_mutex_held(mp))
2214 		return (mutex_recursion(mp, mtype, try));
2215 
2216 	if (try == MUTEX_LOCK) {
2217 		if (mutex_trylock_process(mp, 1) == 0)
2218 			return (0);
		/* no statistics block is passed on this fast path */
2219 		return (mutex_lock_kernel(mp, tsp, NULL));
2220 	}
2221 
	/* MUTEX_TRY: report the failed attempt if event reporting is enabled */
2222 	if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2223 		self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2224 		tdb_event(TD_LOCK_TRY, udp);
2225 	}
2226 	return (EBUSY);
2227 }
2228 
/*
 * Common body of mutex_lock() and the timed lock variants.
 * 'tsp' is NULL for an untimed lock, otherwise it points to a relative
 * timeout.  Two progressively more general fast paths are tried before
 * falling back to mutex_lock_internal().  Returns 0 or an error number.
 */
2229 static int
2230 mutex_lock_impl(mutex_t *mp, timespec_t *tsp)
2231 {
2232 	ulwp_t *self = curthread;
2233 	int mtype = mp->mutex_type;
2234 	uberflags_t *gflags;
2235 
	/* Misalignment is only reported when error detection is enabled. */
2236 	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2237 	    self->ul_error_detection && self->ul_misaligned == 0)
2238 		lock_error(mp, "mutex_lock", NULL, "mutex is misaligned");
2239 
2240 	/*
2241 	 * Optimize the case of USYNC_THREAD, including
2242 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2243 	 * no error detection, no lock statistics,
2244 	 * and the process has only a single thread.
2245 	 * (Most likely a traditional single-threaded application.)
2246 	 */
2247 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2248 	    self->ul_uberdata->uberflags.uf_all) == 0) {
2249 		/*
2250 		 * Only one thread exists so we don't need an atomic operation.
2251 		 * We do, however, need to protect against signals.
2252 		 */
2253 		if (mp->mutex_lockw == 0) {
2254 			sigoff(self);
2255 			mp->mutex_lockw = LOCKSET;
2256 			mp->mutex_owner = (uintptr_t)self;
2257 			sigon(self);
2258 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2259 			return (0);
2260 		}
		/* non-zero mtype here means RECURSIVE and/or ERRORCHECK */
2261 		if (mtype && MUTEX_OWNER(mp) == self)
2262 			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2263 		/*
2264 		 * We have reached a deadlock, probably because the
2265 		 * process is executing non-async-signal-safe code in
2266 		 * a signal handler and is attempting to acquire a lock
2267 		 * that it already owns.  This is not surprising, given
2268 		 * bad programming practices over the years that have
2269 		 * resulted in applications calling printf() and such
2270 		 * in their signal handlers.  Unless the user has told
2271 		 * us that the signal handlers are safe by setting:
2272 		 *	export _THREAD_ASYNC_SAFE=1
2273 		 * we return EDEADLK rather than actually deadlocking.
2274 		 *
2275 		 * A lock may explicitly override this with the
2276 		 * LOCK_DEADLOCK flag which is currently set for POSIX
2277 		 * NORMAL mutexes as the specification requires deadlock
2278 		 * behavior and applications _do_ rely on that for their
2279 		 * correctness guarantees.
2280 		 */
2281 		if (tsp == NULL &&
2282 		    MUTEX_OWNER(mp) == self && !self->ul_async_safe &&
2283 		    (mp->mutex_flag & LOCK_DEADLOCK) == 0) {
2284 			DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
2285 			return (EDEADLK);
2286 		}
		/* otherwise fall through and take the lock via the paths below */
2287 	}
2288 
2289 	/*
2290 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2291 	 * no error detection, and no lock statistics.
2292 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2293 	 */
2294 	if ((gflags = self->ul_schedctl_called) != NULL &&
2295 	    (gflags->uf_trs_ted |
2296 	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2297 		if (mtype & USYNC_PROCESS)
2298 			return (fast_process_lock(mp, tsp, mtype, MUTEX_LOCK));
		/* USYNC_THREAD: one atomic attempt with signals deferred */
2299 		sigoff(self);
2300 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2301 			mp->mutex_owner = (uintptr_t)self;
2302 			sigon(self);
2303 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2304 			return (0);
2305 		}
2306 		sigon(self);
2307 		if (mtype && MUTEX_OWNER(mp) == self)
2308 			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
		/* contended: try again adaptively, then queue up and wait */
2309 		if (mutex_trylock_adaptive(mp, 1) != 0)
2310 			return (mutex_lock_queue(self, NULL, mp, tsp));
2311 		return (0);
2312 	}
2313 
2314 	/* else do it the long way */
2315 	return (mutex_lock_internal(mp, tsp, MUTEX_LOCK));
2316 }
2317 
2318 #pragma weak pthread_mutex_lock = mutex_lock
2319 #pragma weak _mutex_lock = mutex_lock
2320 int
2321 mutex_lock(mutex_t *mp)
2322 {
2323 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2324 	return (mutex_lock_impl(mp, NULL));
2325 }
2326 
2327 #pragma weak pthread_mutex_enter_np = mutex_enter
2328 void
2329 mutex_enter(mutex_t *mp)
2330 {
2331 	int ret;
2332 	int attr = mp->mutex_type & ALL_ATTRIBUTES;
2333 
2334 	/*
2335 	 * Require LOCK_ERRORCHECK, accept LOCK_RECURSIVE.
2336 	 */
2337 	if (attr != LOCK_ERRORCHECK &&
2338 	    attr != (LOCK_ERRORCHECK | LOCK_RECURSIVE)) {
2339 		mutex_panic(mp, "mutex_enter: bad mutex type");
2340 	}
2341 	ret = mutex_lock(mp);
2342 	if (ret == EDEADLK) {
2343 		mutex_panic(mp, "recursive mutex_enter");
2344 	} else if (ret == EAGAIN) {
2345 		mutex_panic(mp, "excessive recursive mutex_enter");
2346 	} else if (ret != 0) {
2347 		mutex_panic(mp, "unknown mutex_enter failure");
2348 	}
2349 }
2350 
2351 int
2352 pthread_mutex_clocklock(pthread_mutex_t *restrict mp, clockid_t clock,
2353     const struct timespec *restrict abstime)
2354 {
2355 	timespec_t tslocal;
2356 	int error;
2357 
2358 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2359 
2360 	switch (clock) {
2361 	case CLOCK_REALTIME:
2362 	case CLOCK_HIGHRES:
2363 		break;
2364 	default:
2365 		return (EINVAL);
2366 	}
2367 
2368 	abstime_to_reltime(clock, abstime, &tslocal);
2369 	error = mutex_lock_impl((mutex_t *)mp, &tslocal);
2370 	if (error == ETIME)
2371 		error = ETIMEDOUT;
2372 	return (error);
2373 }
2374 
2375 int
2376 pthread_mutex_timedlock(pthread_mutex_t *restrict mp,
2377     const struct timespec *restrict abstime)
2378 {
2379 	return (pthread_mutex_clocklock(mp, CLOCK_REALTIME, abstime));
2380 }
2381 
2382 int
2383 pthread_mutex_relclocklock_np(pthread_mutex_t *restrict mp, clockid_t clock,
2384     const struct timespec *restrict reltime)
2385 {
2386 	timespec_t tslocal;
2387 	int error;
2388 
2389 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2390 
2391 	switch (clock) {
2392 	case CLOCK_REALTIME:
2393 	case CLOCK_HIGHRES:
2394 		break;
2395 	default:
2396 		return (EINVAL);
2397 	}
2398 
2399 	tslocal = *reltime;
2400 	error = mutex_lock_impl((mutex_t *)mp, &tslocal);
2401 	if (error == ETIME)
2402 		error = ETIMEDOUT;
2403 	return (error);
2404 }
2405 
2406 int
2407 pthread_mutex_reltimedlock_np(pthread_mutex_t *restrict mp,
2408     const struct timespec *restrict reltime)
2409 {
2410 	return (pthread_mutex_relclocklock_np(mp, CLOCK_REALTIME, reltime));
2411 }
2412 
2413 #pragma weak pthread_mutex_trylock = mutex_trylock
/*
 * Attempt to acquire 'mp' without blocking.
 * Returns 0 on success and EBUSY from the fast paths when the mutex is
 * busy; a relock by the owner of a LOCK_RECURSIVE/LOCK_ERRORCHECK mutex
 * returns whatever mutex_recursion() decides.  Everything not covered by
 * the two fast paths is delegated to mutex_lock_internal(..., MUTEX_TRY).
 */
2414 int
2415 mutex_trylock(mutex_t *mp)
2416 {
2417 	ulwp_t *self = curthread;
2418 	uberdata_t *udp = self->ul_uberdata;
2419 	int mtype = mp->mutex_type;
2420 	uberflags_t *gflags;
2421 
2422 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2423 
2424 	/*
2425 	 * Optimize the case of USYNC_THREAD, including
2426 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2427 	 * no error detection, no lock statistics,
2428 	 * and the process has only a single thread.
2429 	 * (Most likely a traditional single-threaded application.)
2430 	 */
2431 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2432 	    udp->uberflags.uf_all) == 0) {
2433 		/*
2434 		 * Only one thread exists so we don't need an atomic operation.
2435 		 * We do, however, need to protect against signals.
2436 		 */
2437 		if (mp->mutex_lockw == 0) {
2438 			sigoff(self);
2439 			mp->mutex_lockw = LOCKSET;
2440 			mp->mutex_owner = (uintptr_t)self;
2441 			sigon(self);
2442 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2443 			return (0);
2444 		}
		/* non-zero mtype here means RECURSIVE and/or ERRORCHECK */
2445 		if (mtype && MUTEX_OWNER(mp) == self)
2446 			return (mutex_recursion(mp, mtype, MUTEX_TRY));
		/* note: no TD_LOCK_TRY event is reported on this path */
2447 		return (EBUSY);
2448 	}
2449 
2450 	/*
2451 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2452 	 * no error detection, and no lock statistics.
2453 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2454 	 */
2455 	if ((gflags = self->ul_schedctl_called) != NULL &&
2456 	    (gflags->uf_trs_ted |
2457 	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2458 		if (mtype & USYNC_PROCESS)
2459 			return (fast_process_lock(mp, NULL, mtype, MUTEX_TRY));
		/* USYNC_THREAD: one atomic attempt with signals deferred */
2460 		sigoff(self);
2461 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2462 			mp->mutex_owner = (uintptr_t)self;
2463 			sigon(self);
2464 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2465 			return (0);
2466 		}
2467 		sigon(self);
2468 		if (mtype && MUTEX_OWNER(mp) == self)
2469 			return (mutex_recursion(mp, mtype, MUTEX_TRY));
		/* report the failed attempt if event reporting is enabled */
2470 		if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2471 			self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2472 			tdb_event(TD_LOCK_TRY, udp);
2473 		}
2474 		return (EBUSY);
2475 	}
2476 
2477 	/* else do it the long way */
2478 	return (mutex_lock_internal(mp, NULL, MUTEX_TRY));
2479 }
2480 
/*
 * General-purpose unlock path.
 *
 * When 'retain_robust_flags' is zero, a non-LOCK_PRIO_INHERIT mutex that
 * is still marked LOCK_OWNERDEAD or LOCK_UNMAPPED at unlock time is
 * converted to LOCK_NOTRECOVERABLE.  heldlock_exit() passes a non-zero
 * value so that the flags it just set are kept.
 *
 * Returns EPERM if a LOCK_ERRORCHECK or LOCK_ROBUST mutex is not held by
 * the caller, the result of ___lwp_mutex_unlock() for LOCK_PRIO_INHERIT
 * mutexes, and 0 otherwise.
 */
2481 int
2482 mutex_unlock_internal(mutex_t *mp, int retain_robust_flags)
2483 {
2484 	ulwp_t *self = curthread;
2485 	uberdata_t *udp = self->ul_uberdata;
2486 	int mtype = mp->mutex_type;
2487 	tdb_mutex_stats_t *msp;
2488 	int error = 0;
2489 	int release_all;
2490 	lwpid_t lwpid;
2491 
2492 	if ((mtype & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
2493 	    !mutex_held(mp))
2494 		return (EPERM);
2495 
2496 	if (self->ul_error_detection && !mutex_held(mp))
2497 		lock_error(mp, "mutex_unlock", NULL, NULL);
2498 
	/* Undo one level of recursion; the mutex itself stays locked. */
2499 	if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2500 		mp->mutex_rcount--;
2501 		DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2502 		return (0);
2503 	}
2504 
2505 	if ((msp = MUTEX_STATS(mp, udp)) != NULL)
2506 		(void) record_hold_time(msp);
2507 
	/*
	 * The dead-owner/unmapped flags are still set at unlock time
	 * (mutex_consistent() would have cleared them): the mutex
	 * becomes not recoverable.
	 */
2508 	if (!retain_robust_flags && !(mtype & LOCK_PRIO_INHERIT) &&
2509 	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2510 		ASSERT(mtype & LOCK_ROBUST);
2511 		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2512 		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
2513 	}
	/* passed to the unlock helpers below for not-recoverable mutexes */
2514 	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
2515 
2516 	if (mtype & LOCK_PRIO_INHERIT) {
		/*
		 * Owner and lock byte are cleared here in user mode,
		 * then the kernel is called to release the mutex.
		 */
2517 		no_preempt(self);
2518 		mp->mutex_owner = 0;
2519 		/* mp->mutex_ownerpid is cleared by ___lwp_mutex_unlock() */
2520 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2521 		mp->mutex_lockw = LOCKCLEAR;
2522 		self->ul_pilocks--;
2523 		error = ___lwp_mutex_unlock(mp);
2524 		preempt(self);
2525 	} else if (mtype & USYNC_PROCESS) {
2526 		mutex_unlock_process(mp, release_all);
2527 	} else {	/* USYNC_THREAD */
		/* a non-zero lwpid identifies a waiter that must be unparked */
2528 		if ((lwpid = mutex_unlock_queue(mp, release_all)) != 0) {
2529 			(void) __lwp_unpark(lwpid);
2530 			preempt(self);
2531 		}
2532 	}
2533 
	/* drop the mutex from this thread's list of held robust locks */
2534 	if (mtype & LOCK_ROBUST)
2535 		forget_lock(mp);
2536 
	/* waive the ceiling priority when _ceil_mylist_del() returns non-zero */
2537 	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
2538 		_ceil_prio_waive();
2539 
2540 	return (error);
2541 }
2542 
2543 #pragma weak pthread_mutex_unlock = mutex_unlock
2544 #pragma weak _mutex_unlock = mutex_unlock
/*
 * Release 'mp'.
 * Returns EPERM from the fast paths when a LOCK_ERRORCHECK mutex is not
 * owned by the caller, and 0 otherwise; cases not covered by the fast
 * paths return the result of mutex_unlock_internal().  Unlocking a
 * LOCK_RECURSIVE mutex with a non-zero recursion count only decrements
 * the count.
 */
2545 int
2546 mutex_unlock(mutex_t *mp)
2547 {
2548 	ulwp_t *self = curthread;
2549 	int mtype = mp->mutex_type;
2550 	uberflags_t *gflags;
2551 	lwpid_t lwpid;
2552 	short el;
2553 
2554 	/*
2555 	 * Optimize the case of USYNC_THREAD, including
2556 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2557 	 * no error detection, no lock statistics,
2558 	 * and the process has only a single thread.
2559 	 * (Most likely a traditional single-threaded application.)
2560 	 */
2561 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2562 	    self->ul_uberdata->uberflags.uf_all) == 0) {
2563 		if (mtype) {
2564 			/*
2565 			 * At this point we know that one or both of the
2566 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2567 			 */
2568 			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2569 				return (EPERM);
2570 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2571 				mp->mutex_rcount--;
2572 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2573 				return (0);
2574 			}
2575 		}
2576 		/*
2577 		 * Only one thread exists so we don't need an atomic operation.
2578 		 * Also, there can be no waiters.
2579 		 */
2580 		sigoff(self);
2581 		mp->mutex_owner = 0;
2582 		mp->mutex_lockword = 0;
2583 		sigon(self);
2584 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2585 		return (0);
2586 	}
2587 
2588 	/*
2589 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2590 	 * no error detection, and no lock statistics.
2591 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2592 	 */
2593 	if ((gflags = self->ul_schedctl_called) != NULL) {
		/* plain USYNC_THREAD mutex, nothing enabled: release at once */
2594 		if (((el = gflags->uf_trs_ted) | mtype) == 0) {
			/*
			 * Also entered by 'goto' from the
			 * RECURSIVE/ERRORCHECK case further down.
			 */
2595 fast_unlock:
2596 			if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2597 				(void) __lwp_unpark(lwpid);
2598 				preempt(self);
2599 			}
2600 			return (0);
2601 		}
2602 		if (el)		/* error detection or lock statistics */
2603 			goto slow_unlock;
2604 		if ((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2605 			/*
2606 			 * At this point we know that one or both of the
2607 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2608 			 */
2609 			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2610 				return (EPERM);
2611 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2612 				mp->mutex_rcount--;
2613 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2614 				return (0);
2615 			}
2616 			goto fast_unlock;
2617 		}
2618 		if ((mtype &
2619 		    ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2620 			/*
2621 			 * At this point we know that zero, one, or both of the
2622 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set and
2623 			 * that the USYNC_PROCESS flag is set.
2624 			 */
			/* ownership of a shared mutex also involves the owner pid */
2625 			if ((mtype & LOCK_ERRORCHECK) && !shared_mutex_held(mp))
2626 				return (EPERM);
2627 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2628 				mp->mutex_rcount--;
2629 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2630 				return (0);
2631 			}
2632 			mutex_unlock_process(mp, 0);
2633 			return (0);
2634 		}
2635 	}
2636 
2637 	/* else do it the long way */
2638 slow_unlock:
2639 	return (mutex_unlock_internal(mp, 0));
2640 }
2641 
2642 #pragma weak pthread_mutex_exit_np = mutex_exit
2643 void
2644 mutex_exit(mutex_t *mp)
2645 {
2646 	int ret;
2647 	int attr = mp->mutex_type & ALL_ATTRIBUTES;
2648 
2649 	if (attr != LOCK_ERRORCHECK &&
2650 	    attr != (LOCK_ERRORCHECK | LOCK_RECURSIVE)) {
2651 		mutex_panic(mp, "mutex_exit: bad mutex type");
2652 	}
2653 	ret = mutex_unlock(mp);
2654 	if (ret == EPERM) {
2655 		mutex_panic(mp, "mutex_exit: not owner");
2656 	} else if (ret != 0) {
2657 		mutex_panic(mp, "unknown mutex_exit failure");
2658 	}
2659 
2660 }
2661 
2662 /*
2663  * Internally to the library, almost all mutex lock/unlock actions
2664  * go through these lmutex_ functions, to protect critical regions.
2665  * We replicate a bit of code from mutex_lock() and mutex_unlock()
2666  * to make these functions faster since we know that the mutex type
2667  * of all internal locks is USYNC_THREAD.  We also know that internal
2668  * locking can never fail, so we panic if it does.
2669  */
2670 void
2671 lmutex_lock(mutex_t *mp)
2672 {
2673 	ulwp_t *self = curthread;
2674 	uberdata_t *udp = self->ul_uberdata;
2675 
2676 	ASSERT(mp->mutex_type == USYNC_THREAD);
2677 
2678 	enter_critical(self);
2679 	/*
2680 	 * Optimize the case of no lock statistics and only a single thread.
2681 	 * (Most likely a traditional single-threaded application.)
2682 	 */
2683 	if (udp->uberflags.uf_all == 0) {
2684 		/*
2685 		 * Only one thread exists; the mutex must be free.
2686 		 */
2687 		ASSERT(mp->mutex_lockw == 0);
2688 		mp->mutex_lockw = LOCKSET;
2689 		mp->mutex_owner = (uintptr_t)self;
2690 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2691 	} else {
2692 		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2693 
2694 		if (!self->ul_schedctl_called)
2695 			(void) setup_schedctl();
2696 
2697 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2698 			mp->mutex_owner = (uintptr_t)self;
2699 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2700 		} else if (mutex_trylock_adaptive(mp, 1) != 0) {
2701 			(void) mutex_lock_queue(self, msp, mp, NULL);
2702 		}
2703 
2704 		if (msp)
2705 			record_begin_hold(msp);
2706 	}
2707 }
2708 
2709 void
2710 lmutex_unlock(mutex_t *mp)
2711 {
2712 	ulwp_t *self = curthread;
2713 	uberdata_t *udp = self->ul_uberdata;
2714 
2715 	ASSERT(mp->mutex_type == USYNC_THREAD);
2716 
2717 	/*
2718 	 * Optimize the case of no lock statistics and only a single thread.
2719 	 * (Most likely a traditional single-threaded application.)
2720 	 */
2721 	if (udp->uberflags.uf_all == 0) {
2722 		/*
2723 		 * Only one thread exists so there can be no waiters.
2724 		 */
2725 		mp->mutex_owner = 0;
2726 		mp->mutex_lockword = 0;
2727 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2728 	} else {
2729 		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2730 		lwpid_t lwpid;
2731 
2732 		if (msp)
2733 			(void) record_hold_time(msp);
2734 		if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2735 			(void) __lwp_unpark(lwpid);
2736 			preempt(self);
2737 		}
2738 	}
2739 	exit_critical(self);
2740 }
2741 
2742 /*
2743  * For specialized code in libc, like the asynchronous i/o code,
2744  * the following sig_*() locking primitives are used in order
2745  * to make the code asynchronous signal safe.  Signals are
2746  * deferred while locks acquired by these functions are held.
2747  */
2748 void
2749 sig_mutex_lock(mutex_t *mp)
2750 {
2751 	ulwp_t *self = curthread;
2752 
2753 	sigoff(self);
2754 	(void) mutex_lock(mp);
2755 }
2756 
2757 void
2758 sig_mutex_unlock(mutex_t *mp)
2759 {
2760 	ulwp_t *self = curthread;
2761 
2762 	(void) mutex_unlock(mp);
2763 	sigon(self);
2764 }
2765 
2766 int
2767 sig_mutex_trylock(mutex_t *mp)
2768 {
2769 	ulwp_t *self = curthread;
2770 	int error;
2771 
2772 	sigoff(self);
2773 	if ((error = mutex_trylock(mp)) != 0)
2774 		sigon(self);
2775 	return (error);
2776 }
2777 
2778 /*
2779  * sig_cond_wait() is a cancellation point.
2780  */
2781 int
2782 sig_cond_wait(cond_t *cv, mutex_t *mp)
2783 {
2784 	int error;
2785 
2786 	ASSERT(curthread->ul_sigdefer != 0);
2787 	pthread_testcancel();
2788 	error = __cond_wait(cv, mp);
2789 	if (error == EINTR && curthread->ul_cursig) {
2790 		sig_mutex_unlock(mp);
2791 		/* take the deferred signal here */
2792 		sig_mutex_lock(mp);
2793 	}
2794 	pthread_testcancel();
2795 	return (error);
2796 }
2797 
2798 /*
2799  * sig_cond_reltimedwait() is a cancellation point.
2800  */
2801 int
2802 sig_cond_reltimedwait(cond_t *cv, mutex_t *mp, const timespec_t *ts)
2803 {
2804 	int error;
2805 
2806 	ASSERT(curthread->ul_sigdefer != 0);
2807 	pthread_testcancel();
2808 	error = __cond_reltimedwait(cv, mp, ts);
2809 	if (error == EINTR && curthread->ul_cursig) {
2810 		sig_mutex_unlock(mp);
2811 		/* take the deferred signal here */
2812 		sig_mutex_lock(mp);
2813 	}
2814 	pthread_testcancel();
2815 	return (error);
2816 }
2817 
2818 /*
2819  * For specialized code in libc, like the stdio code.
2820  * the following cancel_safe_*() locking primitives are used in
2821  * order to make the code cancellation-safe.  Cancellation is
2822  * deferred while locks acquired by these functions are held.
2823  */
2824 void
2825 cancel_safe_mutex_lock(mutex_t *mp)
2826 {
2827 	(void) mutex_lock(mp);
2828 	curthread->ul_libc_locks++;
2829 }
2830 
2831 int
2832 cancel_safe_mutex_trylock(mutex_t *mp)
2833 {
2834 	int error;
2835 
2836 	if ((error = mutex_trylock(mp)) == 0)
2837 		curthread->ul_libc_locks++;
2838 	return (error);
2839 }
2840 
2841 void
2842 cancel_safe_mutex_unlock(mutex_t *mp)
2843 {
2844 	ulwp_t *self = curthread;
2845 
2846 	ASSERT(self->ul_libc_locks != 0);
2847 
2848 	(void) mutex_unlock(mp);
2849 
2850 	/*
2851 	 * Decrement the count of locks held by cancel_safe_mutex_lock().
2852 	 * If we are then in a position to terminate cleanly and
2853 	 * if there is a pending cancellation and cancellation
2854 	 * is not disabled and we received EINTR from a recent
2855 	 * system call then perform the cancellation action now.
2856 	 */
2857 	if (--self->ul_libc_locks == 0 &&
2858 	    !(self->ul_vfork | self->ul_nocancel |
2859 	    self->ul_critical | self->ul_sigdefer) &&
2860 	    cancel_active())
2861 		pthread_exit(PTHREAD_CANCELED);
2862 }
2863 
/*
 * Return non-zero if the process-shared mutex 'mparg' is owned by the
 * calling thread of this process: MUTEX_OWNED() must hold for curthread
 * and the recorded owner pid must equal this process's pid.
 */
2864 static int
2865 shared_mutex_held(mutex_t *mparg)
2866 {
2867 	/*
2868 	 * The 'volatile' is necessary to make sure the compiler doesn't
2869 	 * reorder the tests of the various components of the mutex.
2870 	 * They must be tested in this order:
2871 	 *	mutex_lockw
2872 	 *	mutex_owner
2873 	 *	mutex_ownerpid
2874 	 * This relies on the fact that everywhere mutex_lockw is cleared,
2875 	 * mutex_owner and mutex_ownerpid are cleared before mutex_lockw
2876 	 * is cleared, and that everywhere mutex_lockw is set, mutex_owner
2877 	 * and mutex_ownerpid are set after mutex_lockw is set, and that
2878 	 * mutex_lockw is set or cleared with a memory barrier.
2879 	 */
2880 	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2881 	ulwp_t *self = curthread;
2882 	uberdata_t *udp = self->ul_uberdata;
2883 
	/*
	 * MUTEX_OWNED() is defined outside this file section; per the
	 * comment above it is what examines mutex_lockw and mutex_owner.
	 */
2884 	return (MUTEX_OWNED(mp, self) && mp->mutex_ownerpid == udp->pid);
2885 }
2886 
2887 #pragma weak _mutex_held = mutex_held
2888 int
2889 mutex_held(mutex_t *mparg)
2890 {
2891 	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2892 
2893 	if (mparg->mutex_type & USYNC_PROCESS)
2894 		return (shared_mutex_held(mparg));
2895 	return (MUTEX_OWNED(mp, curthread));
2896 }
2897 
2898 #pragma weak pthread_mutex_destroy = mutex_destroy
2899 #pragma weak _mutex_destroy = mutex_destroy
2900 int
2901 mutex_destroy(mutex_t *mp)
2902 {
2903 	if (mp->mutex_type & USYNC_PROCESS)
2904 		forget_lock(mp);
2905 	(void) memset(mp, 0, sizeof (*mp));
2906 	tdb_sync_obj_deregister(mp);
2907 	return (0);
2908 }
2909 
2910 #pragma weak pthread_mutex_consistent_np = mutex_consistent
2911 #pragma weak pthread_mutex_consistent = mutex_consistent
2912 int
2913 mutex_consistent(mutex_t *mp)
2914 {
2915 	/*
2916 	 * Do this only for an inconsistent, initialized robust lock
2917 	 * that we hold.  For all other cases, return EINVAL.
2918 	 */
2919 	if (mutex_held(mp) &&
2920 	    (mp->mutex_type & LOCK_ROBUST) &&
2921 	    (mp->mutex_flag & LOCK_INITED) &&
2922 	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2923 		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2924 		mp->mutex_rcount = 0;
2925 		return (0);
2926 	}
2927 	return (EINVAL);
2928 }
2929 
2930 /*
2931  * Spin locks are separate from ordinary mutexes,
2932  * but we use the same data structure for them.
2933  */
2934 
2935 int
2936 pthread_spin_init(pthread_spinlock_t *lock, int pshared)
2937 {
2938 	mutex_t *mp = (mutex_t *)lock;
2939 
2940 	(void) memset(mp, 0, sizeof (*mp));
2941 	if (pshared == PTHREAD_PROCESS_SHARED)
2942 		mp->mutex_type = USYNC_PROCESS;
2943 	else
2944 		mp->mutex_type = USYNC_THREAD;
2945 	mp->mutex_flag = LOCK_INITED;
2946 	mp->mutex_magic = MUTEX_MAGIC;
2947 
2948 	/*
2949 	 * This should be at the beginning of the function,
2950 	 * but for the sake of old broken applications that
2951 	 * do not have proper alignment for their mutexes
2952 	 * (and don't check the return code from pthread_spin_init),
2953 	 * we put it here, after initializing the mutex regardless.
2954 	 */
2955 	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2956 	    curthread->ul_misaligned == 0)
2957 		return (EINVAL);
2958 
2959 	return (0);
2960 }
2961 
2962 int
2963 pthread_spin_destroy(pthread_spinlock_t *lock)
2964 {
2965 	(void) memset(lock, 0, sizeof (*lock));
2966 	return (0);
2967 }
2968 
2969 int
2970 pthread_spin_trylock(pthread_spinlock_t *lock)
2971 {
2972 	mutex_t *mp = (mutex_t *)lock;
2973 	ulwp_t *self = curthread;
2974 	int error = 0;
2975 
2976 	no_preempt(self);
2977 	if (set_lock_byte(&mp->mutex_lockw) != 0)
2978 		error = EBUSY;
2979 	else {
2980 		mp->mutex_owner = (uintptr_t)self;
2981 		if (mp->mutex_type == USYNC_PROCESS)
2982 			mp->mutex_ownerpid = self->ul_uberdata->pid;
2983 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2984 	}
2985 	preempt(self);
2986 	return (error);
2987 }
2988 
/*
 * Acquire the spin lock, busy-waiting until the lock byte can be set.
 * The number of unsuccessful loop iterations is counted (capped at
 * INT_MAX) and passed to the plockstat DTrace probes.
 * Always returns 0.
 */
2989 int
2990 pthread_spin_lock(pthread_spinlock_t *lock)
2991 {
2992 	mutex_t *mp = (mutex_t *)lock;
2993 	ulwp_t *self = curthread;
	/* volatile so that the lock byte is re-read on every iteration */
2994 	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
2995 	int count = 0;
2996 
2997 	ASSERT(!self->ul_critical || self->ul_bindflags);
2998 
2999 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
3000 
3001 	/*
3002 	 * We don't care whether the owner is running on a processor.
3003 	 * We just spin because that's what this interface requires.
3004 	 */
3005 	for (;;) {
3006 		if (*lockp == 0) {	/* lock byte appears to be clear */
			/*
			 * no_preempt() is called before the atomic attempt;
			 * on success the loop is left without calling
			 * preempt(), which is done after the owner fields
			 * are set below.
			 */
3007 			no_preempt(self);
3008 			if (set_lock_byte(lockp) == 0)
3009 				break;
3010 			preempt(self);
3011 		}
		/* saturate rather than overflow the spin counter */
3012 		if (count < INT_MAX)
3013 			count++;
3014 		SMT_PAUSE();
3015 	}
3016 	mp->mutex_owner = (uintptr_t)self;
3017 	if (mp->mutex_type == USYNC_PROCESS)
3018 		mp->mutex_ownerpid = self->ul_uberdata->pid;
3019 	preempt(self);
	/* the mutex__spun probe fires only if we actually had to spin */
3020 	if (count) {
3021 		DTRACE_PROBE3(plockstat, mutex__spun, mp, 1, count);
3022 	}
3023 	DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
3024 	return (0);
3025 }
3026 
3027 int
3028 pthread_spin_unlock(pthread_spinlock_t *lock)
3029 {
3030 	mutex_t *mp = (mutex_t *)lock;
3031 	ulwp_t *self = curthread;
3032 
3033 	no_preempt(self);
3034 	mp->mutex_owner = 0;
3035 	mp->mutex_ownerpid = 0;
3036 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
3037 	(void) atomic_swap_32(&mp->mutex_lockword, 0);
3038 	preempt(self);
3039 	return (0);
3040 }
3041 
3042 #define	INITIAL_LOCKS	8	/* initial size of ul_heldlocks.array */
3043 
3044 /*
3045  * Find/allocate an entry for 'lock' in our array of held locks.
3046  */
3047 static mutex_t **
3048 find_lock_entry(mutex_t *lock)
3049 {
3050 	ulwp_t *self = curthread;
3051 	mutex_t **remembered = NULL;
3052 	mutex_t **lockptr;
3053 	uint_t nlocks;
3054 
3055 	if ((nlocks = self->ul_heldlockcnt) != 0)
3056 		lockptr = self->ul_heldlocks.array;
3057 	else {
3058 		nlocks = 1;
3059 		lockptr = &self->ul_heldlocks.single;
3060 	}
3061 
3062 	for (; nlocks; nlocks--, lockptr++) {
3063 		if (*lockptr == lock)
3064 			return (lockptr);
3065 		if (*lockptr == NULL && remembered == NULL)
3066 			remembered = lockptr;
3067 	}
3068 	if (remembered != NULL) {
3069 		*remembered = lock;
3070 		return (remembered);
3071 	}
3072 
3073 	/*
3074 	 * No entry available.  Allocate more space, converting
3075 	 * the single entry into an array of entries if necessary.
3076 	 */
3077 	if ((nlocks = self->ul_heldlockcnt) == 0) {
3078 		/*
3079 		 * Initial allocation of the array.
3080 		 * Convert the single entry into an array.
3081 		 */
3082 		self->ul_heldlockcnt = nlocks = INITIAL_LOCKS;
3083 		lockptr = lmalloc(nlocks * sizeof (mutex_t *));
3084 		/*
3085 		 * The single entry becomes the first entry in the array.
3086 		 */
3087 		*lockptr = self->ul_heldlocks.single;
3088 		self->ul_heldlocks.array = lockptr;
3089 		/*
3090 		 * Return the next available entry in the array.
3091 		 */
3092 		*++lockptr = lock;
3093 		return (lockptr);
3094 	}
3095 	/*
3096 	 * Reallocate the array, double the size each time.
3097 	 */
3098 	lockptr = lmalloc(nlocks * 2 * sizeof (mutex_t *));
3099 	(void) memcpy(lockptr, self->ul_heldlocks.array,
3100 	    nlocks * sizeof (mutex_t *));
3101 	lfree(self->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
3102 	self->ul_heldlocks.array = lockptr;
3103 	self->ul_heldlockcnt *= 2;
3104 	/*
3105 	 * Return the next available entry in the newly allocated array.
3106 	 */
3107 	*(lockptr += nlocks) = lock;
3108 	return (lockptr);
3109 }
3110 
3111 /*
3112  * Insert 'lock' into our list of held locks.
3113  * Currently only used for LOCK_ROBUST mutexes.
3114  */
3115 void
3116 remember_lock(mutex_t *lock)
3117 {
3118 	(void) find_lock_entry(lock);
3119 }
3120 
3121 /*
3122  * Remove 'lock' from our list of held locks.
3123  * Currently only used for LOCK_ROBUST mutexes.
3124  */
3125 void
3126 forget_lock(mutex_t *lock)
3127 {
3128 	*find_lock_entry(lock) = NULL;
3129 }
3130 
3131 /*
3132  * Free the array of held locks.
3133  */
3134 void
3135 heldlock_free(ulwp_t *ulwp)
3136 {
3137 	uint_t nlocks;
3138 
3139 	if ((nlocks = ulwp->ul_heldlockcnt) != 0)
3140 		lfree(ulwp->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
3141 	ulwp->ul_heldlockcnt = 0;
3142 	ulwp->ul_heldlocks.array = NULL;
3143 }
3144 
3145 /*
3146  * Mark all held LOCK_ROBUST mutexes LOCK_OWNERDEAD.
3147  * Called from _thrp_exit() to deal with abandoned locks.
3148  */
3149 void
3150 heldlock_exit(void)
3151 {
3152 	ulwp_t *self = curthread;
3153 	mutex_t **lockptr;
3154 	uint_t nlocks;
3155 	mutex_t *mp;
3156 
3157 	if ((nlocks = self->ul_heldlockcnt) != 0)
3158 		lockptr = self->ul_heldlocks.array;
3159 	else {
3160 		nlocks = 1;
3161 		lockptr = &self->ul_heldlocks.single;
3162 	}
3163 
3164 	for (; nlocks; nlocks--, lockptr++) {
3165 		/*
3166 		 * The kernel takes care of transitioning held
3167 		 * LOCK_PRIO_INHERIT mutexes to LOCK_OWNERDEAD.
3168 		 * We avoid that case here.
3169 		 */
3170 		if ((mp = *lockptr) != NULL &&
3171 		    mutex_held(mp) &&
3172 		    (mp->mutex_type & (LOCK_ROBUST | LOCK_PRIO_INHERIT)) ==
3173 		    LOCK_ROBUST) {
3174 			mp->mutex_rcount = 0;
3175 			if (!(mp->mutex_flag & LOCK_UNMAPPED))
3176 				mp->mutex_flag |= LOCK_OWNERDEAD;
3177 			(void) mutex_unlock_internal(mp, 1);
3178 		}
3179 	}
3180 
3181 	heldlock_free(self);
3182 }
3183 
3184 #pragma weak _cond_init = cond_init
3185 int
3186 cond_init(cond_t *cvp, int type, void *arg __unused)
3187 {
3188 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
3189 		return (EINVAL);
3190 
3191 	/*
3192 	 * This memset initializes cond_clock to CLOCK_REALTIME.
3193 	 */
3194 	(void) memset(cvp, 0, sizeof (*cvp));
3195 	cvp->cond_type = (uint16_t)type;
3196 	cvp->cond_magic = COND_MAGIC;
3197 
3198 	/*
3199 	 * This should be at the beginning of the function,
3200 	 * but for the sake of old broken applications that
3201 	 * do not have proper alignment for their condvars
3202 	 * (and don't check the return code from cond_init),
3203 	 * we put it here, after initializing the condvar regardless.
3204 	 */
3205 	if (((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1)) &&
3206 	    curthread->ul_misaligned == 0)
3207 		return (EINVAL);
3208 
3209 	return (0);
3210 }
3211 
3212 /*
3213  * cond_sleep_queue(): utility function for cond_wait_queue().
3214  *
3215  * Go to sleep on a condvar sleep queue, expect to be waked up
3216  * by someone calling cond_signal() or cond_broadcast() or due
3217  * to receiving a UNIX signal or being cancelled, or just simply
3218  * due to a spurious wakeup (like someome calling forkall()).
3219  *
3220  * The associated mutex is *not* reacquired before returning.
3221  * That must be done by the caller of cond_sleep_queue().
3222  */
3223 static int
3224 cond_sleep_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3225 {
3226 	ulwp_t *self = curthread;
3227 	queue_head_t *qp;
3228 	queue_head_t *mqp;
3229 	lwpid_t lwpid;
3230 	int signalled;
3231 	int error;
3232 	int cv_wake;
3233 	int release_all;
3234 
3235 	/*
3236 	 * Put ourself on the CV sleep queue, unlock the mutex, then
3237 	 * park ourself and unpark a candidate lwp to grab the mutex.
3238 	 * We must go onto the CV sleep queue before dropping the
3239 	 * mutex in order to guarantee atomicity of the operation.
3240 	 */
3241 	self->ul_sp = stkptr();
3242 	qp = queue_lock(cvp, CV);
3243 	enqueue(qp, self, 0);
3244 	cvp->cond_waiters_user = 1;
3245 	self->ul_cvmutex = mp;
3246 	self->ul_cv_wake = cv_wake = (tsp != NULL);
3247 	self->ul_signalled = 0;
3248 	if (mp->mutex_flag & LOCK_OWNERDEAD) {
3249 		mp->mutex_flag &= ~LOCK_OWNERDEAD;
3250 		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
3251 	}
3252 	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
3253 	lwpid = mutex_unlock_queue(mp, release_all);
3254 	for (;;) {
3255 		set_parking_flag(self, 1);
3256 		queue_unlock(qp);
3257 		if (lwpid != 0) {
3258 			lwpid = preempt_unpark(self, lwpid);
3259 			preempt(self);
3260 		}
3261 		/*
3262 		 * We may have a deferred signal present,
3263 		 * in which case we should return EINTR.
3264 		 * Also, we may have received a SIGCANCEL; if so
3265 		 * and we are cancelable we should return EINTR.
3266 		 * We force an immediate EINTR return from
3267 		 * __lwp_park() by turning our parking flag off.
3268 		 */
3269 		if (self->ul_cursig != 0 ||
3270 		    (self->ul_cancelable && self->ul_cancel_pending))
3271 			set_parking_flag(self, 0);
3272 		/*
3273 		 * __lwp_park() will return the residual time in tsp
3274 		 * if we are unparked before the timeout expires.
3275 		 */
3276 		error = __lwp_park(tsp, lwpid);
3277 		set_parking_flag(self, 0);
3278 		lwpid = 0;	/* unpark the other lwp only once */
3279 		/*
3280 		 * We were waked up by cond_signal(), cond_broadcast(),
3281 		 * by an interrupt or timeout (EINTR or ETIME),
3282 		 * or we may just have gotten a spurious wakeup.
3283 		 */
3284 		qp = queue_lock(cvp, CV);
3285 		if (!cv_wake)
3286 			mqp = queue_lock(mp, MX);
3287 		if (self->ul_sleepq == NULL)
3288 			break;
3289 		/*
3290 		 * We are on either the condvar sleep queue or the
3291 		 * mutex sleep queue.  Break out of the sleep if we
3292 		 * were interrupted or we timed out (EINTR or ETIME).
3293 		 * Else this is a spurious wakeup; continue the loop.
3294 		 */
3295 		if (!cv_wake && self->ul_sleepq == mqp) { /* mutex queue */
3296 			if (error) {
3297 				mp->mutex_waiters = dequeue_self(mqp);
3298 				break;
3299 			}
3300 			tsp = NULL;	/* no more timeout */
3301 		} else if (self->ul_sleepq == qp) {	/* condvar queue */
3302 			if (error) {
3303 				cvp->cond_waiters_user = dequeue_self(qp);
3304 				break;
3305 			}
3306 			/*
3307 			 * Else a spurious wakeup on the condvar queue.
3308 			 * __lwp_park() has already adjusted the timeout.
3309 			 */
3310 		} else {
3311 			thr_panic("cond_sleep_queue(): thread not on queue");
3312 		}
3313 		if (!cv_wake)
3314 			queue_unlock(mqp);
3315 	}
3316 
3317 	self->ul_sp = 0;
3318 	self->ul_cv_wake = 0;
3319 	ASSERT(self->ul_cvmutex == NULL);
3320 	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
3321 	    self->ul_wchan == NULL);
3322 
3323 	signalled = self->ul_signalled;
3324 	self->ul_signalled = 0;
3325 	queue_unlock(qp);
3326 	if (!cv_wake)
3327 		queue_unlock(mqp);
3328 
3329 	/*
3330 	 * If we were concurrently cond_signal()d and any of:
3331 	 * received a UNIX signal, were cancelled, or got a timeout,
3332 	 * then perform another cond_signal() to avoid consuming it.
3333 	 */
3334 	if (error && signalled)
3335 		(void) cond_signal(cvp);
3336 
3337 	return (error);
3338 }
3339 
3340 static void
3341 cond_wait_check_alignment(cond_t *cvp, mutex_t *mp)
3342 {
3343 	if ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1))
3344 		lock_error(mp, "cond_wait", cvp, "mutex is misaligned");
3345 	if ((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1))
3346 		lock_error(mp, "cond_wait", cvp, "condvar is misaligned");
3347 }
3348 
3349 int
3350 cond_wait_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3351 {
3352 	ulwp_t *self = curthread;
3353 	int error;
3354 	int merror;
3355 
3356 	if (self->ul_error_detection && self->ul_misaligned == 0)
3357 		cond_wait_check_alignment(cvp, mp);
3358 
3359 	/*
3360 	 * The old thread library was programmed to defer signals
3361 	 * while in cond_wait() so that the associated mutex would
3362 	 * be guaranteed to be held when the application signal
3363 	 * handler was invoked.
3364 	 *
3365 	 * We do not behave this way by default; the state of the
3366 	 * associated mutex in the signal handler is undefined.
3367 	 *
3368 	 * To accommodate applications that depend on the old
3369 	 * behavior, the _THREAD_COND_WAIT_DEFER environment
3370 	 * variable can be set to 1 and we will behave in the
3371 	 * old way with respect to cond_wait().
3372 	 */
3373 	if (self->ul_cond_wait_defer)
3374 		sigoff(self);
3375 
3376 	error = cond_sleep_queue(cvp, mp, tsp);
3377 
3378 	/*
3379 	 * Reacquire the mutex.
3380 	 */
3381 	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3382 		error = merror;
3383 
3384 	/*
3385 	 * Take any deferred signal now, after we have reacquired the mutex.
3386 	 */
3387 	if (self->ul_cond_wait_defer)
3388 		sigon(self);
3389 
3390 	return (error);
3391 }
3392 
3393 /*
3394  * cond_sleep_kernel(): utility function for cond_wait_kernel().
3395  * See the comment ahead of cond_sleep_queue(), above.
3396  */
3397 static int
3398 cond_sleep_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3399 {
3400 	int mtype = mp->mutex_type;
3401 	ulwp_t *self = curthread;
3402 	int error;
3403 
3404 	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
3405 		_ceil_prio_waive();
3406 
3407 	self->ul_sp = stkptr();
3408 	self->ul_wchan = cvp;
3409 	sigoff(self);
3410 	mp->mutex_owner = 0;
3411 	/* mp->mutex_ownerpid is cleared by ___lwp_cond_wait() */
3412 	if (mtype & LOCK_PRIO_INHERIT) {
3413 		mp->mutex_lockw = LOCKCLEAR;
3414 		self->ul_pilocks--;
3415 	}
3416 	/*
3417 	 * ___lwp_cond_wait() returns immediately with EINTR if
3418 	 * set_parking_flag(self,0) is called on this lwp before it
3419 	 * goes to sleep in the kernel.  sigacthandler() calls this
3420 	 * when a deferred signal is noted.  This assures that we don't
3421 	 * get stuck in ___lwp_cond_wait() with all signals blocked
3422 	 * due to taking a deferred signal before going to sleep.
3423 	 */
3424 	set_parking_flag(self, 1);
3425 	if (self->ul_cursig != 0 ||
3426 	    (self->ul_cancelable && self->ul_cancel_pending))
3427 		set_parking_flag(self, 0);
3428 	error = ___lwp_cond_wait(cvp, mp, tsp, 1);
3429 	set_parking_flag(self, 0);
3430 	sigon(self);
3431 	self->ul_sp = 0;
3432 	self->ul_wchan = NULL;
3433 	return (error);
3434 }
3435 
3436 int
3437 cond_wait_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3438 {
3439 	ulwp_t *self = curthread;
3440 	int error;
3441 	int merror;
3442 
3443 	if (self->ul_error_detection && self->ul_misaligned == 0)
3444 		cond_wait_check_alignment(cvp, mp);
3445 
3446 	/*
3447 	 * See the large comment in cond_wait_queue(), above.
3448 	 */
3449 	if (self->ul_cond_wait_defer)
3450 		sigoff(self);
3451 
3452 	error = cond_sleep_kernel(cvp, mp, tsp);
3453 
3454 	/*
3455 	 * Override the return code from ___lwp_cond_wait()
3456 	 * with any non-zero return code from mutex_lock().
3457 	 * This addresses robust lock failures in particular;
3458 	 * the caller must see the EOWNERDEAD or ENOTRECOVERABLE
3459 	 * errors in order to take corrective action.
3460 	 */
3461 	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3462 		error = merror;
3463 
3464 	/*
3465 	 * Take any deferred signal now, after we have reacquired the mutex.
3466 	 */
3467 	if (self->ul_cond_wait_defer)
3468 		sigon(self);
3469 
3470 	return (error);
3471 }
3472 
3473 /*
3474  * Common code for cond_wait() and cond_timedwait()
3475  */
3476 int
3477 cond_wait_common(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3478 {
3479 	int mtype = mp->mutex_type;
3480 	hrtime_t begin_sleep = 0;
3481 	ulwp_t *self = curthread;
3482 	uberdata_t *udp = self->ul_uberdata;
3483 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3484 	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
3485 	uint8_t rcount;
3486 	int error = 0;
3487 
3488 	/*
3489 	 * The SUSV3 Posix spec for pthread_cond_timedwait() states:
3490 	 *	Except in the case of [ETIMEDOUT], all these error checks
3491 	 *	shall act as if they were performed immediately at the
3492 	 *	beginning of processing for the function and shall cause
3493 	 *	an error return, in effect, prior to modifying the state
3494 	 *	of the mutex specified by mutex or the condition variable
3495 	 *	specified by cond.
3496 	 * Therefore, we must return EINVAL now if the timout is invalid.
3497 	 */
3498 	if (tsp != NULL &&
3499 	    (tsp->tv_sec < 0 || (ulong_t)tsp->tv_nsec >= NANOSEC))
3500 		return (EINVAL);
3501 
3502 	if (__td_event_report(self, TD_SLEEP, udp)) {
3503 		self->ul_sp = stkptr();
3504 		self->ul_wchan = cvp;
3505 		self->ul_td_evbuf.eventnum = TD_SLEEP;
3506 		self->ul_td_evbuf.eventdata = cvp;
3507 		tdb_event(TD_SLEEP, udp);
3508 		self->ul_sp = 0;
3509 	}
3510 	if (csp) {
3511 		if (tsp)
3512 			tdb_incr(csp->cond_timedwait);
3513 		else
3514 			tdb_incr(csp->cond_wait);
3515 	}
3516 	if (msp)
3517 		begin_sleep = record_hold_time(msp);
3518 	else if (csp)
3519 		begin_sleep = gethrtime();
3520 
3521 	if (self->ul_error_detection) {
3522 		if (!mutex_held(mp))
3523 			lock_error(mp, "cond_wait", cvp, NULL);
3524 		if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0)
3525 			lock_error(mp, "recursive mutex in cond_wait",
3526 			    cvp, NULL);
3527 		if (cvp->cond_type & USYNC_PROCESS) {
3528 			if (!(mtype & USYNC_PROCESS))
3529 				lock_error(mp, "cond_wait", cvp,
3530 				    "condvar process-shared, "
3531 				    "mutex process-private");
3532 		} else {
3533 			if (mtype & USYNC_PROCESS)
3534 				lock_error(mp, "cond_wait", cvp,
3535 				    "condvar process-private, "
3536 				    "mutex process-shared");
3537 		}
3538 	}
3539 
3540 	/*
3541 	 * We deal with recursive mutexes by completely
3542 	 * dropping the lock and restoring the recursion
3543 	 * count after waking up.  This is arguably wrong,
3544 	 * but it obeys the principle of least astonishment.
3545 	 */
3546 	rcount = mp->mutex_rcount;
3547 	mp->mutex_rcount = 0;
3548 	if ((mtype &
3549 	    (USYNC_PROCESS | LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT)) |
3550 	    (cvp->cond_type & USYNC_PROCESS))
3551 		error = cond_wait_kernel(cvp, mp, tsp);
3552 	else
3553 		error = cond_wait_queue(cvp, mp, tsp);
3554 	mp->mutex_rcount = rcount;
3555 
3556 	if (csp) {
3557 		hrtime_t lapse = gethrtime() - begin_sleep;
3558 		if (tsp == NULL)
3559 			csp->cond_wait_sleep_time += lapse;
3560 		else {
3561 			csp->cond_timedwait_sleep_time += lapse;
3562 			if (error == ETIME)
3563 				tdb_incr(csp->cond_timedwait_timeout);
3564 		}
3565 	}
3566 	return (error);
3567 }
3568 
3569 /*
3570  * cond_wait() is a cancellation point but __cond_wait() is not.
3571  * Internally, libc calls the non-cancellation version.
3572  * Other libraries need to use pthread_setcancelstate(), as appropriate,
3573  * since __cond_wait() is not exported from libc.
3574  */
3575 int
3576 __cond_wait(cond_t *cvp, mutex_t *mp)
3577 {
3578 	ulwp_t *self = curthread;
3579 	uberdata_t *udp = self->ul_uberdata;
3580 	uberflags_t *gflags;
3581 
3582 	if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3583 	    !mutex_held(mp))
3584 		return (EPERM);
3585 
3586 	/*
3587 	 * Optimize the common case of USYNC_THREAD plus
3588 	 * no error detection, no lock statistics, and no event tracing.
3589 	 */
3590 	if ((gflags = self->ul_schedctl_called) != NULL &&
3591 	    (cvp->cond_type | mp->mutex_type | gflags->uf_trs_ted |
3592 	    self->ul_td_events_enable |
3593 	    udp->tdb.tdb_ev_global_mask.event_bits[0]) == 0)
3594 		return (cond_wait_queue(cvp, mp, NULL));
3595 
3596 	/*
3597 	 * Else do it the long way.
3598 	 */
3599 	return (cond_wait_common(cvp, mp, NULL));
3600 }
3601 
3602 #pragma weak _cond_wait = cond_wait
3603 int
3604 cond_wait(cond_t *cvp, mutex_t *mp)
3605 {
3606 	int error;
3607 
3608 	_cancelon();
3609 	error = __cond_wait(cvp, mp);
3610 	if (error == EINTR)
3611 		_canceloff();
3612 	else
3613 		_canceloff_nocancel();
3614 	return (error);
3615 }
3616 
3617 /*
3618  * pthread_cond_wait() is a cancellation point.
3619  */
3620 int
3621 pthread_cond_wait(pthread_cond_t *restrict cvp, pthread_mutex_t *restrict mp)
3622 {
3623 	int error;
3624 
3625 	error = cond_wait((cond_t *)cvp, (mutex_t *)mp);
3626 	return ((error == EINTR)? 0 : error);
3627 }
3628 
3629 /*
3630  * cond_timedwait() is a cancellation point but __cond_timedwait() is not.
3631  */
3632 int
3633 __cond_timedwait(cond_t *cvp, mutex_t *mp, clockid_t clock_id,
3634     const timespec_t *abstime)
3635 {
3636 	timespec_t reltime;
3637 	int error;
3638 
3639 	if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3640 	    !mutex_held(mp))
3641 		return (EPERM);
3642 
3643 	if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_HIGHRES)
3644 		clock_id = CLOCK_REALTIME;
3645 	abstime_to_reltime(clock_id, abstime, &reltime);
3646 	error = cond_wait_common(cvp, mp, &reltime);
3647 	if (error == ETIME && clock_id == CLOCK_HIGHRES) {
3648 		/*
3649 		 * Don't return ETIME if we didn't really get a timeout.
3650 		 * This can happen if we return because someone resets
3651 		 * the system clock.  Just return zero in this case,
3652 		 * giving a spurious wakeup but not a timeout.
3653 		 */
3654 		if ((hrtime_t)(uint32_t)abstime->tv_sec * NANOSEC +
3655 		    abstime->tv_nsec > gethrtime())
3656 			error = 0;
3657 	}
3658 	return (error);
3659 }
3660 
3661 static int
3662 cond_clockwait(cond_t *cvp, mutex_t *mp, clockid_t clock,
3663     const timespec_t *abstime)
3664 {
3665 	int error;
3666 
3667 	_cancelon();
3668 	error = __cond_timedwait(cvp, mp, clock, abstime);
3669 	if (error == EINTR)
3670 		_canceloff();
3671 	else
3672 		_canceloff_nocancel();
3673 	return (error);
3674 }
3675 
3676 /*
3677  * This is a function internal to libc that determines the clockid to return for
3678  * a cond_t. The cond_t (and the pthreads / C equivalent) encode a clock id that
3679  * should be used as a timing source. When using the static initializers, which
3680  * set this to zero, cond_clockid will end up set to __CLOCK_REALTIME0 which
3681  * isn't really used in the system any more. Consumers of the clockid call this
3682  * to translate this. Note, we fail open such that if someone has corrupted the
3683  * clockid it will end up in a well known clock to continue the traditional
3684  * system behavior.
3685  */
3686 static clockid_t
3687 cond_clock(cond_t *cvp)
3688 {
3689 	if (cvp->cond_clockid != CLOCK_REALTIME &&
3690 	    cvp->cond_clockid != CLOCK_MONOTONIC) {
3691 		return (CLOCK_REALTIME);
3692 	}
3693 
3694 	return (cvp->cond_clockid);
3695 }
3696 
3697 int
3698 cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3699 {
3700 	return (cond_clockwait(cvp, mp, cond_clock(cvp), abstime));
3701 }
3702 
3703 /*
3704  * pthread_cond_timedwait() and pthread_cond_clockwait() are cancellation
3705  * points. We need to check for cancellation before we evaluate whether the
3706  * clock is valid.
3707  */
3708 int
3709 pthread_cond_clockwait(pthread_cond_t *restrict cvp,
3710     pthread_mutex_t *restrict mp, clockid_t clock,
3711     const struct timespec *restrict abstime)
3712 {
3713 	int error;
3714 
3715 	switch (clock) {
3716 	case CLOCK_REALTIME:
3717 	case CLOCK_HIGHRES:
3718 		break;
3719 	default:
3720 		return (EINVAL);
3721 	}
3722 
3723 	/* We need to translate between the native threads errors and POSIX */
3724 	error = cond_clockwait((cond_t *)cvp, (mutex_t *)mp, clock, abstime);
3725 	if (error == ETIME)
3726 		error = ETIMEDOUT;
3727 	else if (error == EINTR)
3728 		error = 0;
3729 	return (error);
3730 }
3731 
3732 int
3733 pthread_cond_timedwait(pthread_cond_t *restrict cvp,
3734     pthread_mutex_t *restrict mp, const struct timespec *restrict abstime)
3735 {
3736 	cond_t *cond = (cond_t *)cvp;
3737 	return (pthread_cond_clockwait(cvp, mp, cond_clock(cond), abstime));
3738 }
3739 
3740 /*
3741  * cond_reltimedwait() is a cancellation point but __cond_reltimedwait() is not.
3742  *
3743  * Note, this function does not actually consume the clock id. Internally all
3744  * waits are based upon the highres clock in the system and therefore the actual
3745  * clock used is ignored at this point.
3746  */
3747 int
3748 __cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3749 {
3750 	timespec_t tslocal = *reltime;
3751 
3752 	if ((mp->mutex_type & (LOCK_ERRORCHECK | LOCK_ROBUST)) &&
3753 	    !mutex_held(mp))
3754 		return (EPERM);
3755 
3756 	return (cond_wait_common(cvp, mp, &tslocal));
3757 }
3758 
3759 int
3760 cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3761 {
3762 	int error;
3763 
3764 	_cancelon();
3765 	error = __cond_reltimedwait(cvp, mp, reltime);
3766 	if (error == EINTR)
3767 		_canceloff();
3768 	else
3769 		_canceloff_nocancel();
3770 	return (error);
3771 }
3772 
3773 int
3774 pthread_cond_relclockwait_np(pthread_cond_t *restrict cvp,
3775     pthread_mutex_t *restrict mp, clockid_t clock,
3776     const struct timespec *restrict reltime)
3777 {
3778 	int error;
3779 
3780 	switch (clock) {
3781 	case CLOCK_REALTIME:
3782 	case CLOCK_HIGHRES:
3783 		break;
3784 	default:
3785 		return (EINVAL);
3786 	}
3787 
3788 	error = cond_reltimedwait((cond_t *)cvp, (mutex_t *)mp, reltime);
3789 	if (error == ETIME)
3790 		error = ETIMEDOUT;
3791 	else if (error == EINTR)
3792 		error = 0;
3793 	return (error);
3794 }
3795 
3796 int
3797 pthread_cond_reltimedwait_np(pthread_cond_t *restrict cvp,
3798     pthread_mutex_t *restrict mp, const struct timespec *restrict reltime)
3799 {
3800 	cond_t *cond = (cond_t *)cvp;
3801 	return (pthread_cond_relclockwait_np(cvp, mp, cond_clock(cond),
3802 	    reltime));
3803 }
3804 
3805 #pragma weak pthread_cond_signal = cond_signal
3806 #pragma weak _cond_signal = cond_signal
3807 int
3808 cond_signal(cond_t *cvp)
3809 {
3810 	ulwp_t *self = curthread;
3811 	uberdata_t *udp = self->ul_uberdata;
3812 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3813 	int error = 0;
3814 	int more;
3815 	lwpid_t lwpid;
3816 	queue_head_t *qp;
3817 	mutex_t *mp;
3818 	queue_head_t *mqp;
3819 	ulwp_t **ulwpp;
3820 	ulwp_t *ulwp;
3821 	ulwp_t *prev;
3822 
3823 	if (csp)
3824 		tdb_incr(csp->cond_signal);
3825 
3826 	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
3827 		error = _lwp_cond_signal(cvp);
3828 
3829 	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
3830 		return (error);
3831 
3832 	/*
3833 	 * Move some thread from the condvar sleep queue to the mutex sleep
3834 	 * queue for the mutex that it will acquire on being waked up.
3835 	 * We can do this only if we own the mutex it will acquire.
3836 	 * If we do not own the mutex, or if its ul_cv_wake flag
3837 	 * is set, just dequeue and unpark it.
3838 	 */
3839 	qp = queue_lock(cvp, CV);
3840 	ulwpp = queue_slot(qp, &prev, &more);
3841 	cvp->cond_waiters_user = more;
3842 	if (ulwpp == NULL) {	/* no one on the sleep queue */
3843 		queue_unlock(qp);
3844 		return (error);
3845 	}
3846 	ulwp = *ulwpp;
3847 
3848 	/*
3849 	 * Inform the thread that it was the recipient of a cond_signal().
3850 	 * This lets it deal with cond_signal() and, concurrently,
3851 	 * one or more of a cancellation, a UNIX signal, or a timeout.
3852 	 * These latter conditions must not consume a cond_signal().
3853 	 */
3854 	ulwp->ul_signalled = 1;
3855 
3856 	/*
3857 	 * Dequeue the waiter but leave its ul_sleepq non-NULL
3858 	 * while we move it to the mutex queue so that it can
3859 	 * deal properly with spurious wakeups.
3860 	 */
3861 	queue_unlink(qp, ulwpp, prev);
3862 
3863 	mp = ulwp->ul_cvmutex;		/* the mutex it will acquire */
3864 	ulwp->ul_cvmutex = NULL;
3865 	ASSERT(mp != NULL);
3866 
3867 	if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3868 		/* just wake it up */
3869 		lwpid = ulwp->ul_lwpid;
3870 		no_preempt(self);
3871 		ulwp->ul_sleepq = NULL;
3872 		ulwp->ul_wchan = NULL;
3873 		queue_unlock(qp);
3874 		(void) __lwp_unpark(lwpid);
3875 		preempt(self);
3876 	} else {
3877 		/* move it to the mutex queue */
3878 		mqp = queue_lock(mp, MX);
3879 		enqueue(mqp, ulwp, 0);
3880 		mp->mutex_waiters = 1;
3881 		queue_unlock(mqp);
3882 		queue_unlock(qp);
3883 	}
3884 
3885 	return (error);
3886 }
3887 
3888 /*
3889  * Utility function called by mutex_wakeup_all(), cond_broadcast(),
3890  * and rw_queue_release() to (re)allocate a big buffer to hold the
3891  * lwpids of all the threads to be set running after they are removed
3892  * from their sleep queues.  Since we are holding a queue lock, we
3893  * cannot call any function that might acquire a lock.  mmap(), munmap(),
3894  * lwp_unpark_all() are simple system calls and are safe in this regard.
3895  */
3896 lwpid_t *
3897 alloc_lwpids(lwpid_t *lwpid, int *nlwpid_ptr, int *maxlwps_ptr)
3898 {
3899 	/*
3900 	 * Allocate NEWLWPS ids on the first overflow.
3901 	 * Double the allocation each time after that.
3902 	 */
3903 	int nlwpid = *nlwpid_ptr;
3904 	int maxlwps = *maxlwps_ptr;
3905 	int first_allocation;
3906 	int newlwps;
3907 	void *vaddr;
3908 
3909 	ASSERT(nlwpid == maxlwps);
3910 
3911 	first_allocation = (maxlwps == MAXLWPS);
3912 	newlwps = first_allocation? NEWLWPS : 2 * maxlwps;
3913 	vaddr = mmap(NULL, newlwps * sizeof (lwpid_t),
3914 	    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
3915 
3916 	if (vaddr == MAP_FAILED) {
3917 		/*
3918 		 * Let's hope this never happens.
3919 		 * If it does, then we have a terrible
3920 		 * thundering herd on our hands.
3921 		 */
3922 		(void) __lwp_unpark_all(lwpid, nlwpid);
3923 		*nlwpid_ptr = 0;
3924 	} else {
3925 		(void) memcpy(vaddr, lwpid, maxlwps * sizeof (lwpid_t));
3926 		if (!first_allocation)
3927 			(void) munmap((caddr_t)lwpid,
3928 			    maxlwps * sizeof (lwpid_t));
3929 		lwpid = vaddr;
3930 		*maxlwps_ptr = newlwps;
3931 	}
3932 
3933 	return (lwpid);
3934 }
3935 
3936 #pragma weak pthread_cond_broadcast = cond_broadcast
3937 #pragma weak _cond_broadcast = cond_broadcast
3938 int
3939 cond_broadcast(cond_t *cvp)
3940 {
3941 	ulwp_t *self = curthread;
3942 	uberdata_t *udp = self->ul_uberdata;
3943 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3944 	int error = 0;
3945 	queue_head_t *qp;
3946 	queue_root_t *qrp;
3947 	mutex_t *mp;
3948 	mutex_t *mp_cache = NULL;
3949 	queue_head_t *mqp = NULL;
3950 	ulwp_t *ulwp;
3951 	int nlwpid = 0;
3952 	int maxlwps = MAXLWPS;
3953 	lwpid_t buffer[MAXLWPS];
3954 	lwpid_t *lwpid = buffer;
3955 
3956 	if (csp)
3957 		tdb_incr(csp->cond_broadcast);
3958 
3959 	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
3960 		error = _lwp_cond_broadcast(cvp);
3961 
3962 	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
3963 		return (error);
3964 
3965 	/*
3966 	 * Move everyone from the condvar sleep queue to the mutex sleep
3967 	 * queue for the mutex that they will acquire on being waked up.
3968 	 * We can do this only if we own the mutex they will acquire.
3969 	 * If we do not own the mutex, or if their ul_cv_wake flag
3970 	 * is set, just dequeue and unpark them.
3971 	 *
3972 	 * We keep track of lwpids that are to be unparked in lwpid[].
3973 	 * __lwp_unpark_all() is called to unpark all of them after
3974 	 * they have been removed from the sleep queue and the sleep
3975 	 * queue lock has been dropped.  If we run out of space in our
3976 	 * on-stack buffer, we need to allocate more but we can't call
3977 	 * lmalloc() because we are holding a queue lock when the overflow
3978 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
3979 	 * either because the application may have allocated a small
3980 	 * stack and we don't want to overrun the stack.  So we call
3981 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
3982 	 * system call directly since that path acquires no locks.
3983 	 */
3984 	qp = queue_lock(cvp, CV);
3985 	cvp->cond_waiters_user = 0;
3986 	for (;;) {
3987 		if ((qrp = qp->qh_root) == NULL ||
3988 		    (ulwp = qrp->qr_head) == NULL)
3989 			break;
3990 		ASSERT(ulwp->ul_wchan == cvp);
3991 		queue_unlink(qp, &qrp->qr_head, NULL);
3992 		mp = ulwp->ul_cvmutex;		/* its mutex */
3993 		ulwp->ul_cvmutex = NULL;
3994 		ASSERT(mp != NULL);
3995 		if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3996 			/* just wake it up */
3997 			ulwp->ul_sleepq = NULL;
3998 			ulwp->ul_wchan = NULL;
3999 			if (nlwpid == maxlwps)
4000 				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
4001 			lwpid[nlwpid++] = ulwp->ul_lwpid;
4002 		} else {
4003 			/* move it to the mutex queue */
4004 			if (mp != mp_cache) {
4005 				mp_cache = mp;
4006 				if (mqp != NULL)
4007 					queue_unlock(mqp);
4008 				mqp = queue_lock(mp, MX);
4009 			}
4010 			enqueue(mqp, ulwp, 0);
4011 			mp->mutex_waiters = 1;
4012 		}
4013 	}
4014 	if (mqp != NULL)
4015 		queue_unlock(mqp);
4016 	if (nlwpid == 0) {
4017 		queue_unlock(qp);
4018 	} else {
4019 		no_preempt(self);
4020 		queue_unlock(qp);
4021 		if (nlwpid == 1)
4022 			(void) __lwp_unpark(lwpid[0]);
4023 		else
4024 			(void) __lwp_unpark_all(lwpid, nlwpid);
4025 		preempt(self);
4026 	}
4027 	if (lwpid != buffer)
4028 		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
4029 	return (error);
4030 }
4031 
4032 #pragma weak pthread_cond_destroy = cond_destroy
4033 int
4034 cond_destroy(cond_t *cvp)
4035 {
4036 	cvp->cond_magic = 0;
4037 	tdb_sync_obj_deregister(cvp);
4038 	return (0);
4039 }
4040 
4041 #if defined(DEBUG)
4042 void
4043 assert_no_libc_locks_held(void)
4044 {
4045 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
4046 }
4047 
4048 /* protected by link_lock */
4049 uint64_t spin_lock_spin;
4050 uint64_t spin_lock_spin2;
4051 uint64_t spin_lock_sleep;
4052 uint64_t spin_lock_wakeup;
4053 
4054 /*
4055  * Record spin lock statistics.
4056  * Called by a thread exiting itself in thrp_exit().
4057  * Also called via atexit() from the thread calling
4058  * exit() to do all the other threads as well.
4059  */
4060 void
4061 record_spin_locks(ulwp_t *ulwp)
4062 {
4063 	spin_lock_spin += ulwp->ul_spin_lock_spin;
4064 	spin_lock_spin2 += ulwp->ul_spin_lock_spin2;
4065 	spin_lock_sleep += ulwp->ul_spin_lock_sleep;
4066 	spin_lock_wakeup += ulwp->ul_spin_lock_wakeup;
4067 	ulwp->ul_spin_lock_spin = 0;
4068 	ulwp->ul_spin_lock_spin2 = 0;
4069 	ulwp->ul_spin_lock_sleep = 0;
4070 	ulwp->ul_spin_lock_wakeup = 0;
4071 }
4072 
4073 /*
4074  * atexit function:  dump the queue statistics to stderr.
4075  */
4076 #include <stdio.h>
4077 void
4078 dump_queue_statistics(void)
4079 {
4080 	uberdata_t *udp = curthread->ul_uberdata;
4081 	queue_head_t *qp;
4082 	int qn;
4083 	uint64_t spin_lock_total = 0;
4084 
4085 	if (udp->queue_head == NULL || thread_queue_dump == 0)
4086 		return;
4087 
4088 	if (fprintf(stderr, "\n%5d mutex queues:\n", QHASHSIZE) < 0 ||
4089 	    fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
4090 		return;
4091 	for (qn = 0, qp = udp->queue_head; qn < QHASHSIZE; qn++, qp++) {
4092 		if (qp->qh_lockcount == 0)
4093 			continue;
4094 		spin_lock_total += qp->qh_lockcount;
4095 		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
4096 		    (u_longlong_t)qp->qh_lockcount,
4097 		    qp->qh_qmax, qp->qh_hmax) < 0)
4098 			return;
4099 	}
4100 
4101 	if (fprintf(stderr, "\n%5d condvar queues:\n", QHASHSIZE) < 0 ||
4102 	    fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
4103 		return;
4104 	for (qn = 0; qn < QHASHSIZE; qn++, qp++) {
4105 		if (qp->qh_lockcount == 0)
4106 			continue;
4107 		spin_lock_total += qp->qh_lockcount;
4108 		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
4109 		    (u_longlong_t)qp->qh_lockcount,
4110 		    qp->qh_qmax, qp->qh_hmax) < 0)
4111 			return;
4112 	}
4113 
4114 	(void) fprintf(stderr, "\n  spin_lock_total  = %10llu\n",
4115 	    (u_longlong_t)spin_lock_total);
4116 	(void) fprintf(stderr, "  spin_lock_spin   = %10llu\n",
4117 	    (u_longlong_t)spin_lock_spin);
4118 	(void) fprintf(stderr, "  spin_lock_spin2  = %10llu\n",
4119 	    (u_longlong_t)spin_lock_spin2);
4120 	(void) fprintf(stderr, "  spin_lock_sleep  = %10llu\n",
4121 	    (u_longlong_t)spin_lock_sleep);
4122 	(void) fprintf(stderr, "  spin_lock_wakeup = %10llu\n",
4123 	    (u_longlong_t)spin_lock_wakeup);
4124 }
4125 #endif
4126