1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include "lint.h"
30 #include "thr_uberdata.h"
31 #include <sys/rtpriocntl.h>
32 #include <sys/sdt.h>
33 #include <atomic.h>
34 
35 #if defined(THREAD_DEBUG)
36 #define	INCR32(x)	(((x) != UINT32_MAX)? (x)++ : 0)
37 #define	INCR(x)		((x)++)
38 #define	DECR(x)		((x)--)
39 #define	MAXINCR(m, x)	((m < ++x)? (m = x) : 0)
40 #else
41 #define	INCR32(x)
42 #define	INCR(x)
43 #define	DECR(x)
44 #define	MAXINCR(m, x)
45 #endif
46 
47 /*
48  * This mutex is initialized to be held by lwp#1.
49  * It is used to block a thread that has returned from a mutex_lock()
50  * of a LOCK_PRIO_INHERIT mutex with an unrecoverable error.
51  */
52 mutex_t	stall_mutex = DEFAULTMUTEX;
53 
54 static int shared_mutex_held(mutex_t *);
55 static int mutex_queuelock_adaptive(mutex_t *);
56 static void mutex_wakeup_all(mutex_t *);
57 
58 /*
59  * Lock statistics support functions.
60  */
61 void
62 record_begin_hold(tdb_mutex_stats_t *msp)
63 {
64 	tdb_incr(msp->mutex_lock);
65 	msp->mutex_begin_hold = gethrtime();
66 }
67 
68 hrtime_t
69 record_hold_time(tdb_mutex_stats_t *msp)
70 {
71 	hrtime_t now = gethrtime();
72 
73 	if (msp->mutex_begin_hold)
74 		msp->mutex_hold_time += now - msp->mutex_begin_hold;
75 	msp->mutex_begin_hold = 0;
76 	return (now);
77 }
78 
79 /*
80  * Called once at library initialization.
81  */
82 void
83 mutex_setup(void)
84 {
85 	if (set_lock_byte(&stall_mutex.mutex_lockw))
86 		thr_panic("mutex_setup() cannot acquire stall_mutex");
87 	stall_mutex.mutex_owner = (uintptr_t)curthread;
88 }
89 
90 /*
91  * The default spin count of 1000 is experimentally determined.
92  * On sun4u machines with any number of processors it could be raised
93  * to 10,000 but that (experimentally) makes almost no difference.
94  * The environment variable:
95  *	_THREAD_ADAPTIVE_SPIN=count
96  * can be used to override and set the count in the range [0 .. 1,000,000].
97  */
98 int	thread_adaptive_spin = 1000;
99 uint_t	thread_max_spinners = 100;
100 int	thread_queue_verify = 0;
101 static	int	ncpus;
102 
103 /*
104  * Distinguish spinning for queue locks from spinning for regular locks.
105  * We try harder to acquire queue locks by spinning.
106  * The environment variable:
107  *	_THREAD_QUEUE_SPIN=count
108  * can be used to override and set the count in the range [0 .. 1,000,000].
109  */
110 int	thread_queue_spin = 10000;
111 
112 #define	ALL_ATTRIBUTES				\
113 	(LOCK_RECURSIVE | LOCK_ERRORCHECK |	\
114 	LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT |	\
115 	LOCK_ROBUST)
116 
117 /*
118  * 'type' can be one of USYNC_THREAD, USYNC_PROCESS, or USYNC_PROCESS_ROBUST,
119  * augmented by zero or more of the flags:
120  *	LOCK_RECURSIVE
121  *	LOCK_ERRORCHECK
122  *	LOCK_PRIO_INHERIT
123  *	LOCK_PRIO_PROTECT
124  *	LOCK_ROBUST
125  */
126 #pragma weak _mutex_init = mutex_init
127 /* ARGSUSED2 */
128 int
129 mutex_init(mutex_t *mp, int type, void *arg)
130 {
131 	int basetype = (type & ~ALL_ATTRIBUTES);
132 	const pcclass_t *pccp;
133 	int error = 0;
134 	int ceil;
135 
136 	if (basetype == USYNC_PROCESS_ROBUST) {
137 		/*
138 		 * USYNC_PROCESS_ROBUST is a deprecated historical type.
139 		 * We change it into (USYNC_PROCESS | LOCK_ROBUST) but
140 		 * retain the USYNC_PROCESS_ROBUST flag so we can return
141 		 * ELOCKUNMAPPED when necessary (only USYNC_PROCESS_ROBUST
142 		 * mutexes will ever draw ELOCKUNMAPPED).
143 		 */
144 		type |= (USYNC_PROCESS | LOCK_ROBUST);
145 		basetype = USYNC_PROCESS;
146 	}
147 
148 	if (type & LOCK_PRIO_PROTECT)
149 		pccp = get_info_by_policy(SCHED_FIFO);
150 	if ((basetype != USYNC_THREAD && basetype != USYNC_PROCESS) ||
151 	    (type & (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT))
152 	    == (LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT) ||
153 	    ((type & LOCK_PRIO_PROTECT) &&
154 	    ((ceil = *(int *)arg) < pccp->pcc_primin ||
155 	    ceil > pccp->pcc_primax))) {
156 		error = EINVAL;
157 	} else if (type & LOCK_ROBUST) {
158 		/*
159 		 * Callers of mutex_init() with the LOCK_ROBUST attribute
160 		 * are required to pass an initially all-zero mutex.
161 		 * Multiple calls to mutex_init() are allowed; all but
162 		 * the first return EBUSY.  A call to mutex_init() is
163 		 * allowed to make an inconsistent robust lock consistent
164 		 * (for historical usage, even though the proper interface
165 		 * for this is mutex_consistent()).  Note that we use
166 		 * atomic_or_16() to set the LOCK_INITED flag so as
167 		 * not to disturb surrounding bits (LOCK_OWNERDEAD, etc).
168 		 */
169 		if (!(mp->mutex_flag & LOCK_INITED)) {
170 			mp->mutex_type = (uint8_t)type;
171 			atomic_or_16(&mp->mutex_flag, LOCK_INITED);
172 			mp->mutex_magic = MUTEX_MAGIC;
173 		} else if (type != mp->mutex_type ||
174 		    ((type & LOCK_PRIO_PROTECT) && mp->mutex_ceiling != ceil)) {
175 			error = EINVAL;
176 		} else if (mutex_consistent(mp) != 0) {
177 			error = EBUSY;
178 		}
179 		/* register a process robust mutex with the kernel */
180 		if (basetype == USYNC_PROCESS)
181 			register_lock(mp);
182 	} else {
183 		(void) memset(mp, 0, sizeof (*mp));
184 		mp->mutex_type = (uint8_t)type;
185 		mp->mutex_flag = LOCK_INITED;
186 		mp->mutex_magic = MUTEX_MAGIC;
187 	}
188 
189 	if (error == 0 && (type & LOCK_PRIO_PROTECT)) {
190 		mp->mutex_ceiling = ceil;
191 	}
192 
193 	/*
194 	 * This should be at the beginning of the function,
195 	 * but for the sake of old broken applications that
196 	 * do not have proper alignment for their mutexes
197 	 * (and don't check the return code from mutex_init),
198 	 * we put it here, after initializing the mutex regardless.
199 	 */
200 	if (error == 0 &&
201 	    ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
202 	    curthread->ul_misaligned == 0)
203 		error = EINVAL;
204 
205 	return (error);
206 }
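
/*
 * For illustration only (a sketch, not part of this file's code): given the
 * type flags documented above, an application could initialize a robust,
 * process-shared mutex in an all-zero, properly aligned shared mapping
 * roughly as follows, where 'shm' stands for such a mapping:
 *
 *	mutex_t *mp = (mutex_t *)shm;
 *	int error = mutex_init(mp, USYNC_PROCESS | LOCK_ROBUST, NULL);
 *
 * The third argument is consulted only when LOCK_PRIO_PROTECT is set,
 * in which case it must point to the desired ceiling priority.
 */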
207 
208 /*
209  * Delete mp from list of ceiling mutexes owned by curthread.
210  * Return 1 if the head of the chain was updated.
211  */
212 int
213 _ceil_mylist_del(mutex_t *mp)
214 {
215 	ulwp_t *self = curthread;
216 	mxchain_t **mcpp;
217 	mxchain_t *mcp;
218 
219 	for (mcpp = &self->ul_mxchain;
220 	    (mcp = *mcpp) != NULL;
221 	    mcpp = &mcp->mxchain_next) {
222 		if (mcp->mxchain_mx == mp) {
223 			*mcpp = mcp->mxchain_next;
224 			lfree(mcp, sizeof (*mcp));
225 			return (mcpp == &self->ul_mxchain);
226 		}
227 	}
228 	return (0);
229 }
230 
231 /*
232  * Add mp to the list of ceiling mutexes owned by curthread.
233  * Return ENOMEM if no memory could be allocated.
234  */
235 int
236 _ceil_mylist_add(mutex_t *mp)
237 {
238 	ulwp_t *self = curthread;
239 	mxchain_t *mcp;
240 
241 	if ((mcp = lmalloc(sizeof (*mcp))) == NULL)
242 		return (ENOMEM);
243 	mcp->mxchain_mx = mp;
244 	mcp->mxchain_next = self->ul_mxchain;
245 	self->ul_mxchain = mcp;
246 	return (0);
247 }
248 
249 /*
250  * Helper function for _ceil_prio_inherit() and _ceil_prio_waive(), below.
251  */
252 static void
253 set_rt_priority(ulwp_t *self, int prio)
254 {
255 	pcparms_t pcparm;
256 
257 	pcparm.pc_cid = self->ul_rtclassid;
258 	((rtparms_t *)pcparm.pc_clparms)->rt_tqnsecs = RT_NOCHANGE;
259 	((rtparms_t *)pcparm.pc_clparms)->rt_pri = prio;
260 	(void) priocntl(P_LWPID, self->ul_lwpid, PC_SETPARMS, &pcparm);
261 }
262 
263 /*
264  * Inherit priority from ceiling.
265  * This changes the effective priority, not the assigned priority.
266  */
267 void
268 _ceil_prio_inherit(int prio)
269 {
270 	ulwp_t *self = curthread;
271 
272 	self->ul_epri = prio;
273 	set_rt_priority(self, prio);
274 }
275 
276 /*
277  * Waive inherited ceiling priority.  Inherit from head of owned ceiling locks
278  * if holding at least one ceiling lock.  If no ceiling locks are held at this
279  * point, disinherit completely, reverting back to assigned priority.
280  */
281 void
282 _ceil_prio_waive(void)
283 {
284 	ulwp_t *self = curthread;
285 	mxchain_t *mcp = self->ul_mxchain;
286 	int prio;
287 
288 	if (mcp == NULL) {
289 		prio = self->ul_pri;
290 		self->ul_epri = 0;
291 	} else {
292 		prio = mcp->mxchain_mx->mutex_ceiling;
293 		self->ul_epri = prio;
294 	}
295 	set_rt_priority(self, prio);
296 }
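
/*
 * For illustration, a sketch of how the helpers above combine for a
 * LOCK_PRIO_PROTECT mutex; the actual lock and unlock paths later in
 * this file follow this general shape:
 *
 *	On acquisition:
 *		if ((error = _ceil_mylist_add(mp)) == 0)
 *			_ceil_prio_inherit(mp->mutex_ceiling);
 *	On release:
 *		if (_ceil_mylist_del(mp))
 *			_ceil_prio_waive();
 *
 * The effective priority thus tracks the ceiling of the lock at the head
 * of the chain, and is waived back (to the next ceiling lock held, or to
 * the assigned priority) whenever the head of the chain changes.
 */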
297 
298 /*
299  * Clear the lock byte.  Retain the waiters byte and the spinners byte.
300  * Return the old value of the lock word.
301  */
302 static uint32_t
303 clear_lockbyte(volatile uint32_t *lockword)
304 {
305 	uint32_t old;
306 	uint32_t new;
307 
308 	do {
309 		old = *lockword;
310 		new = old & ~LOCKMASK;
311 	} while (atomic_cas_32(lockword, old, new) != old);
312 
313 	return (old);
314 }
315 
316 /*
317  * Same as clear_lockbyte(), but operates on mutex_lockword64.
318  * The mutex_ownerpid field is cleared along with the lock byte.
319  */
320 static uint64_t
321 clear_lockbyte64(volatile uint64_t *lockword64)
322 {
323 	uint64_t old;
324 	uint64_t new;
325 
326 	do {
327 		old = *lockword64;
328 		new = old & ~LOCKMASK64;
329 	} while (atomic_cas_64(lockword64, old, new) != old);
330 
331 	return (old);
332 }
333 
334 /*
335  * Similar to set_lock_byte(), which only tries to set the lock byte.
336  * Here, we attempt to set the lock byte AND the mutex_ownerpid, keeping
337  * the remaining bytes constant.  This atomic operation is required for the
338  * correctness of process-shared robust locks, otherwise there would be
339  * a window of vulnerability in which the lock byte had been set but the
340  * mutex_ownerpid had not yet been set.  If the process were to die in
341  * this window of vulnerability (due to some other thread calling exit()
342  * or the process receiving a fatal signal), the mutex would be left locked
343  * but without a process-ID to determine which process was holding the lock.
344  * The kernel would then be unable to mark the robust mutex as LOCK_OWNERDEAD
345  * when the process died.  For all other cases of process-shared locks, this
346  * operation is just a convenience, for the sake of common code.
347  *
348  * This operation requires process-shared robust locks to be properly
349  * aligned on an 8-byte boundary, at least on sparc machines, lest the
350  * operation incur an alignment fault.  This is automatic when locks
351  * are declared properly using the mutex_t or pthread_mutex_t data types
352  * and the application does not allocate dynamic memory on less than an
353  * 8-byte boundary.  See the 'horrible hack' comments below for cases
354  * dealing with such broken applications.
355  */
356 static int
357 set_lock_byte64(volatile uint64_t *lockword64, pid_t ownerpid)
358 {
359 	uint64_t old;
360 	uint64_t new;
361 
362 	old = *lockword64 & ~LOCKMASK64;
363 	new = old | ((uint64_t)(uint_t)ownerpid << PIDSHIFT) | LOCKBYTE64;
364 	if (atomic_cas_64(lockword64, old, new) == old)
365 		return (LOCKCLEAR);
366 
367 	return (LOCKSET);
368 }
369 
370 /*
371  * Increment the spinners count in the mutex lock word.
372  * Return 0 on success.  Return -1 if the count would overflow.
373  */
374 static int
375 spinners_incr(volatile uint32_t *lockword, uint8_t max_spinners)
376 {
377 	uint32_t old;
378 	uint32_t new;
379 
380 	do {
381 		old = *lockword;
382 		if (((old & SPINNERMASK) >> SPINNERSHIFT) >= max_spinners)
383 			return (-1);
384 		new = old + (1 << SPINNERSHIFT);
385 	} while (atomic_cas_32(lockword, old, new) != old);
386 
387 	return (0);
388 }
389 
390 /*
391  * Decrement the spinners count in the mutex lock word.
392  * Return the new value of the lock word.
393  */
394 static uint32_t
395 spinners_decr(volatile uint32_t *lockword)
396 {
397 	uint32_t old;
398 	uint32_t new;
399 
400 	do {
401 		new = old = *lockword;
402 		if (new & SPINNERMASK)
403 			new -= (1 << SPINNERSHIFT);
404 	} while (atomic_cas_32(lockword, old, new) != old);
405 
406 	return (new);
407 }
408 
409 /*
410  * Non-preemptive spin locks.  Used by queue_lock().
411  * No lock statistics are gathered for these locks.
412  * No DTrace probes are provided for these locks.
413  */
414 void
415 spin_lock_set(mutex_t *mp)
416 {
417 	ulwp_t *self = curthread;
418 
419 	no_preempt(self);
420 	if (set_lock_byte(&mp->mutex_lockw) == 0) {
421 		mp->mutex_owner = (uintptr_t)self;
422 		return;
423 	}
424 	/*
425 	 * Spin for a while, attempting to acquire the lock.
426 	 */
427 	INCR32(self->ul_spin_lock_spin);
428 	if (mutex_queuelock_adaptive(mp) == 0 ||
429 	    set_lock_byte(&mp->mutex_lockw) == 0) {
430 		mp->mutex_owner = (uintptr_t)self;
431 		return;
432 	}
433 	/*
434 	 * Try harder if we were previously at a no-preemption level.
435 	 */
436 	if (self->ul_preempt > 1) {
437 		INCR32(self->ul_spin_lock_spin2);
438 		if (mutex_queuelock_adaptive(mp) == 0 ||
439 		    set_lock_byte(&mp->mutex_lockw) == 0) {
440 			mp->mutex_owner = (uintptr_t)self;
441 			return;
442 		}
443 	}
444 	/*
445 	 * Give up and block in the kernel for the mutex.
446 	 */
447 	INCR32(self->ul_spin_lock_sleep);
448 	(void) ___lwp_mutex_timedlock(mp, NULL);
449 	mp->mutex_owner = (uintptr_t)self;
450 }
451 
452 void
453 spin_lock_clear(mutex_t *mp)
454 {
455 	ulwp_t *self = curthread;
456 
457 	mp->mutex_owner = 0;
458 	if (atomic_swap_32(&mp->mutex_lockword, 0) & WAITERMASK) {
459 		(void) ___lwp_mutex_wakeup(mp, 0);
460 		INCR32(self->ul_spin_lock_wakeup);
461 	}
462 	preempt(self);
463 }
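
/*
 * For illustration, the usage pattern assumed by these spin locks (this
 * is how queue_lock() and queue_unlock(), below, use them):
 *
 *	spin_lock_set(&qp->qh_lock);
 *	... examine or modify the queue head ...
 *	spin_lock_clear(&qp->qh_lock);
 *
 * Preemption is blocked for the duration, so the critical section must
 * be short and must not block.
 */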
464 
465 /*
466  * Allocate the sleep queue hash table.
467  */
468 void
469 queue_alloc(void)
470 {
471 	ulwp_t *self = curthread;
472 	uberdata_t *udp = self->ul_uberdata;
473 	queue_head_t *qp;
474 	void *data;
475 	int i;
476 
477 	/*
478 	 * No locks are needed; we call here only when single-threaded.
479 	 */
480 	ASSERT(self == udp->ulwp_one);
481 	ASSERT(!udp->uberflags.uf_mt);
482 	if ((data = mmap(NULL, 2 * QHASHSIZE * sizeof (queue_head_t),
483 	    PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, (off_t)0))
484 	    == MAP_FAILED)
485 		thr_panic("cannot allocate thread queue_head table");
486 	udp->queue_head = qp = (queue_head_t *)data;
487 	for (i = 0; i < 2 * QHASHSIZE; qp++, i++) {
488 		qp->qh_type = (i < QHASHSIZE)? MX : CV;
489 		qp->qh_lock.mutex_flag = LOCK_INITED;
490 		qp->qh_lock.mutex_magic = MUTEX_MAGIC;
491 		qp->qh_hlist = &qp->qh_def_root;
492 #if defined(THREAD_DEBUG)
493 		qp->qh_hlen = 1;
494 		qp->qh_hmax = 1;
495 #endif
496 	}
497 }
498 
499 #if defined(THREAD_DEBUG)
500 
501 /*
502  * Debugging: verify correctness of a sleep queue.
503  */
504 void
505 QVERIFY(queue_head_t *qp)
506 {
507 	ulwp_t *self = curthread;
508 	uberdata_t *udp = self->ul_uberdata;
509 	queue_root_t *qrp;
510 	ulwp_t *ulwp;
511 	ulwp_t *prev;
512 	uint_t index;
513 	uint32_t cnt;
514 	char qtype;
515 	void *wchan;
516 
517 	ASSERT(qp >= udp->queue_head && (qp - udp->queue_head) < 2 * QHASHSIZE);
518 	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
519 	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
520 		cnt++;
521 		ASSERT((qrp->qr_head != NULL && qrp->qr_tail != NULL) ||
522 		    (qrp->qr_head == NULL && qrp->qr_tail == NULL));
523 	}
524 	ASSERT(qp->qh_hlen == cnt && qp->qh_hmax >= cnt);
525 	qtype = ((qp - udp->queue_head) < QHASHSIZE)? MX : CV;
526 	ASSERT(qp->qh_type == qtype);
527 	if (!thread_queue_verify)
528 		return;
529 	/* real expensive stuff, only for _THREAD_QUEUE_VERIFY */
530 	for (cnt = 0, qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next) {
531 		for (prev = NULL, ulwp = qrp->qr_head; ulwp != NULL;
532 		    prev = ulwp, ulwp = ulwp->ul_link) {
533 			cnt++;
534 			if (ulwp->ul_writer)
535 				ASSERT(prev == NULL || prev->ul_writer);
536 			ASSERT(ulwp->ul_qtype == qtype);
537 			ASSERT(ulwp->ul_wchan != NULL);
538 			ASSERT(ulwp->ul_sleepq == qp);
539 			wchan = ulwp->ul_wchan;
540 			ASSERT(qrp->qr_wchan == wchan);
541 			index = QUEUE_HASH(wchan, qtype);
542 			ASSERT(&udp->queue_head[index] == qp);
543 		}
544 		ASSERT(qrp->qr_tail == prev);
545 	}
546 	ASSERT(qp->qh_qlen == cnt);
547 }
548 
549 #else	/* THREAD_DEBUG */
550 
551 #define	QVERIFY(qp)
552 
553 #endif	/* THREAD_DEBUG */
554 
555 /*
556  * Acquire a queue head.
557  */
558 queue_head_t *
559 queue_lock(void *wchan, int qtype)
560 {
561 	uberdata_t *udp = curthread->ul_uberdata;
562 	queue_head_t *qp;
563 	queue_root_t *qrp;
564 
565 	ASSERT(qtype == MX || qtype == CV);
566 
567 	/*
568 	 * It is possible that we could be called while still single-threaded.
569 	 * If so, we call queue_alloc() to allocate the queue_head[] array.
570 	 */
571 	if ((qp = udp->queue_head) == NULL) {
572 		queue_alloc();
573 		qp = udp->queue_head;
574 	}
575 	qp += QUEUE_HASH(wchan, qtype);
576 	spin_lock_set(&qp->qh_lock);
577 	for (qrp = qp->qh_hlist; qrp != NULL; qrp = qrp->qr_next)
578 		if (qrp->qr_wchan == wchan)
579 			break;
580 	if (qrp == NULL && qp->qh_def_root.qr_head == NULL) {
581 		/* the default queue root is available; use it */
582 		qrp = &qp->qh_def_root;
583 		qrp->qr_wchan = wchan;
584 		ASSERT(qrp->qr_next == NULL);
585 		ASSERT(qrp->qr_tail == NULL &&
586 		    qrp->qr_rtcount == 0 && qrp->qr_qlen == 0);
587 	}
588 	qp->qh_wchan = wchan;	/* valid until queue_unlock() is called */
589 	qp->qh_root = qrp;	/* valid until queue_unlock() is called */
590 	INCR32(qp->qh_lockcount);
591 	QVERIFY(qp);
592 	return (qp);
593 }
594 
595 /*
596  * Release a queue head.
597  */
598 void
599 queue_unlock(queue_head_t *qp)
600 {
601 	QVERIFY(qp);
602 	spin_lock_clear(&qp->qh_lock);
603 }
604 
605 /*
606  * For rwlock queueing, we must queue writers ahead of readers of the
607  * same priority.  We do this by making writers appear to have a half
608  * point higher priority for purposes of priority comparisons below.
609  */
610 #define	CMP_PRIO(ulwp)	((real_priority(ulwp) << 1) + (ulwp)->ul_writer)
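
/*
 * Worked example of the comparison key above: a reader and a writer both
 * at real priority 10 get keys (10 << 1) + 0 == 20 and (10 << 1) + 1 == 21,
 * so the writer sorts ahead of the reader, while a reader at priority 11
 * (key 22) still sorts ahead of both.
 */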
611 
612 void
613 enqueue(queue_head_t *qp, ulwp_t *ulwp, int force_fifo)
614 {
615 	queue_root_t *qrp;
616 	ulwp_t **ulwpp;
617 	ulwp_t *next;
618 	int pri = CMP_PRIO(ulwp);
619 
620 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
621 	ASSERT(ulwp->ul_sleepq != qp);
622 
623 	if ((qrp = qp->qh_root) == NULL) {
624 		/* use the thread's queue root for the linkage */
625 		qrp = &ulwp->ul_queue_root;
626 		qrp->qr_next = qp->qh_hlist;
627 		qrp->qr_prev = NULL;
628 		qrp->qr_head = NULL;
629 		qrp->qr_tail = NULL;
630 		qrp->qr_wchan = qp->qh_wchan;
631 		qrp->qr_rtcount = 0;
632 		qrp->qr_qlen = 0;
633 		qrp->qr_qmax = 0;
634 		qp->qh_hlist->qr_prev = qrp;
635 		qp->qh_hlist = qrp;
636 		qp->qh_root = qrp;
637 		MAXINCR(qp->qh_hmax, qp->qh_hlen);
638 	}
639 
640 	/*
641 	 * LIFO queue ordering is unfair and can lead to starvation,
642 	 * but it gives better performance for heavily contended locks.
643 	 * We use thread_queue_fifo (range is 0..8) to determine
644 	 * the frequency of FIFO vs LIFO queuing:
645 	 *	0 : every 256th time	(almost always LIFO)
646 	 *	1 : every 128th time
647 	 *	2 : every 64th  time
648 	 *	3 : every 32nd  time
649 	 *	4 : every 16th  time	(the default value, mostly LIFO)
650 	 *	5 : every 8th   time
651 	 *	6 : every 4th   time
652 	 *	7 : every 2nd   time
653 	 *	8 : every time		(never LIFO, always FIFO)
654 	 * Note that there is always some degree of FIFO ordering.
655  * This breaks livelock conditions that occur in applications
656 	 * that are written assuming (incorrectly) that threads acquire
657 	 * locks fairly, that is, in roughly round-robin order.
658 	 * In any event, the queue is maintained in kernel priority order.
659 	 *
660 	 * If force_fifo is non-zero, fifo queueing is forced.
661 	 * SUSV3 requires this for semaphores.
662 	 */
663 	if (qrp->qr_head == NULL) {
664 		/*
665 		 * The queue is empty.  LIFO/FIFO doesn't matter.
666 		 */
667 		ASSERT(qrp->qr_tail == NULL);
668 		ulwpp = &qrp->qr_head;
669 	} else if (force_fifo |
670 	    (((++qp->qh_qcnt << curthread->ul_queue_fifo) & 0xff) == 0)) {
671 		/*
672 		 * Enqueue after the last thread whose priority is greater
673 		 * than or equal to the priority of the thread being queued.
674 		 * Attempt first to go directly onto the tail of the queue.
675 		 */
676 		if (pri <= CMP_PRIO(qrp->qr_tail))
677 			ulwpp = &qrp->qr_tail->ul_link;
678 		else {
679 			for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
680 			    ulwpp = &next->ul_link)
681 				if (pri > CMP_PRIO(next))
682 					break;
683 		}
684 	} else {
685 		/*
686 		 * Enqueue before the first thread whose priority is less
687 		 * than or equal to the priority of the thread being queued.
688 		 * Hopefully we can go directly onto the head of the queue.
689 		 */
690 		for (ulwpp = &qrp->qr_head; (next = *ulwpp) != NULL;
691 		    ulwpp = &next->ul_link)
692 			if (pri >= CMP_PRIO(next))
693 				break;
694 	}
695 	if ((ulwp->ul_link = *ulwpp) == NULL)
696 		qrp->qr_tail = ulwp;
697 	*ulwpp = ulwp;
698 
699 	ulwp->ul_sleepq = qp;
700 	ulwp->ul_wchan = qp->qh_wchan;
701 	ulwp->ul_qtype = qp->qh_type;
702 	if ((ulwp->ul_schedctl != NULL &&
703 	    ulwp->ul_schedctl->sc_cid == ulwp->ul_rtclassid) |
704 	    ulwp->ul_pilocks) {
705 		ulwp->ul_rtqueued = 1;
706 		qrp->qr_rtcount++;
707 	}
708 	MAXINCR(qrp->qr_qmax, qrp->qr_qlen);
709 	MAXINCR(qp->qh_qmax, qp->qh_qlen);
710 }
711 
712 /*
713  * Helper function for queue_slot() and queue_slot_rt().
714  * Try to find a non-suspended thread on the queue.
715  */
716 static ulwp_t **
717 queue_slot_runnable(ulwp_t **ulwpp, ulwp_t **prevp, int rt)
718 {
719 	ulwp_t *ulwp;
720 	ulwp_t **foundpp = NULL;
721 	int priority = -1;
722 	ulwp_t *prev;
723 	int tpri;
724 
725 	for (prev = NULL;
726 	    (ulwp = *ulwpp) != NULL;
727 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
728 		if (ulwp->ul_stop)	/* skip suspended threads */
729 			continue;
730 		tpri = rt? CMP_PRIO(ulwp) : 0;
731 		if (tpri > priority) {
732 			foundpp = ulwpp;
733 			*prevp = prev;
734 			priority = tpri;
735 			if (!rt)
736 				break;
737 		}
738 	}
739 	return (foundpp);
740 }
741 
742 /*
743  * For real-time, we search the entire queue because the dispatch
744  * (kernel) priorities may have changed since enqueueing.
745  */
746 static ulwp_t **
747 queue_slot_rt(ulwp_t **ulwpp_org, ulwp_t **prevp)
748 {
749 	ulwp_t **ulwpp = ulwpp_org;
750 	ulwp_t *ulwp = *ulwpp;
751 	ulwp_t **foundpp = ulwpp;
752 	int priority = CMP_PRIO(ulwp);
753 	ulwp_t *prev;
754 	int tpri;
755 
756 	for (prev = ulwp, ulwpp = &ulwp->ul_link;
757 	    (ulwp = *ulwpp) != NULL;
758 	    prev = ulwp, ulwpp = &ulwp->ul_link) {
759 		tpri = CMP_PRIO(ulwp);
760 		if (tpri > priority) {
761 			foundpp = ulwpp;
762 			*prevp = prev;
763 			priority = tpri;
764 		}
765 	}
766 	ulwp = *foundpp;
767 
768 	/*
769 	 * Try not to return a suspended thread.
770 	 * This mimics the old libthread's behavior.
771 	 */
772 	if (ulwp->ul_stop &&
773 	    (ulwpp = queue_slot_runnable(ulwpp_org, prevp, 1)) != NULL) {
774 		foundpp = ulwpp;
775 		ulwp = *foundpp;
776 	}
777 	ulwp->ul_rt = 1;
778 	return (foundpp);
779 }
780 
781 ulwp_t **
782 queue_slot(queue_head_t *qp, ulwp_t **prevp, int *more)
783 {
784 	queue_root_t *qrp;
785 	ulwp_t **ulwpp;
786 	ulwp_t *ulwp;
787 	int rt;
788 
789 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
790 
791 	if ((qrp = qp->qh_root) == NULL || (ulwp = qrp->qr_head) == NULL) {
792 		*more = 0;
793 		return (NULL);		/* no lwps on the queue */
794 	}
795 	rt = (qrp->qr_rtcount != 0);
796 	*prevp = NULL;
797 	if (ulwp->ul_link == NULL) {	/* only one lwp on the queue */
798 		*more = 0;
799 		ulwp->ul_rt = rt;
800 		return (&qrp->qr_head);
801 	}
802 	*more = 1;
803 
804 	if (rt)		/* real-time queue */
805 		return (queue_slot_rt(&qrp->qr_head, prevp));
806 	/*
807 	 * Try not to return a suspended thread.
808 	 * This mimics the old libthread's behavior.
809 	 */
810 	if (ulwp->ul_stop &&
811 	    (ulwpp = queue_slot_runnable(&qrp->qr_head, prevp, 0)) != NULL) {
812 		ulwp = *ulwpp;
813 		ulwp->ul_rt = 0;
814 		return (ulwpp);
815 	}
816 	/*
817 	 * The common case; just pick the first thread on the queue.
818 	 */
819 	ulwp->ul_rt = 0;
820 	return (&qrp->qr_head);
821 }
822 
823 /*
824  * Common code for unlinking an lwp from a user-level sleep queue.
825  */
826 void
827 queue_unlink(queue_head_t *qp, ulwp_t **ulwpp, ulwp_t *prev)
828 {
829 	queue_root_t *qrp = qp->qh_root;
830 	queue_root_t *nqrp;
831 	ulwp_t *ulwp = *ulwpp;
832 	ulwp_t *next;
833 
834 	ASSERT(MUTEX_OWNED(&qp->qh_lock, curthread));
835 	ASSERT(qp->qh_wchan != NULL && ulwp->ul_wchan == qp->qh_wchan);
836 
837 	DECR(qp->qh_qlen);
838 	DECR(qrp->qr_qlen);
839 	if (ulwp->ul_rtqueued) {
840 		ulwp->ul_rtqueued = 0;
841 		qrp->qr_rtcount--;
842 	}
843 	next = ulwp->ul_link;
844 	*ulwpp = next;
845 	ulwp->ul_link = NULL;
846 	if (qrp->qr_tail == ulwp)
847 		qrp->qr_tail = prev;
848 	if (qrp == &ulwp->ul_queue_root) {
849 		/*
850 		 * We can't continue to use the unlinked thread's
851 		 * queue root for the linkage.
852 		 */
853 		queue_root_t *qr_next = qrp->qr_next;
854 		queue_root_t *qr_prev = qrp->qr_prev;
855 
856 		if (qrp->qr_tail) {
857 			/* switch to using the last thread's queue root */
858 			ASSERT(qrp->qr_qlen != 0);
859 			nqrp = &qrp->qr_tail->ul_queue_root;
860 			*nqrp = *qrp;
861 			if (qr_next)
862 				qr_next->qr_prev = nqrp;
863 			if (qr_prev)
864 				qr_prev->qr_next = nqrp;
865 			else
866 				qp->qh_hlist = nqrp;
867 			qp->qh_root = nqrp;
868 		} else {
869 			/* empty queue root; just delete from the hash list */
870 			ASSERT(qrp->qr_qlen == 0);
871 			if (qr_next)
872 				qr_next->qr_prev = qr_prev;
873 			if (qr_prev)
874 				qr_prev->qr_next = qr_next;
875 			else
876 				qp->qh_hlist = qr_next;
877 			qp->qh_root = NULL;
878 			DECR(qp->qh_hlen);
879 		}
880 	}
881 }
882 
883 ulwp_t *
884 dequeue(queue_head_t *qp, int *more)
885 {
886 	ulwp_t **ulwpp;
887 	ulwp_t *ulwp;
888 	ulwp_t *prev;
889 
890 	if ((ulwpp = queue_slot(qp, &prev, more)) == NULL)
891 		return (NULL);
892 	ulwp = *ulwpp;
893 	queue_unlink(qp, ulwpp, prev);
894 	ulwp->ul_sleepq = NULL;
895 	ulwp->ul_wchan = NULL;
896 	return (ulwp);
897 }
898 
899 /*
900  * Return a pointer to the highest priority thread sleeping on wchan.
901  */
902 ulwp_t *
903 queue_waiter(queue_head_t *qp)
904 {
905 	ulwp_t **ulwpp;
906 	ulwp_t *prev;
907 	int more;
908 
909 	if ((ulwpp = queue_slot(qp, &prev, &more)) == NULL)
910 		return (NULL);
911 	return (*ulwpp);
912 }
913 
914 int
915 dequeue_self(queue_head_t *qp)
916 {
917 	ulwp_t *self = curthread;
918 	queue_root_t *qrp;
919 	ulwp_t **ulwpp;
920 	ulwp_t *ulwp;
921 	ulwp_t *prev;
922 	int found = 0;
923 
924 	ASSERT(MUTEX_OWNED(&qp->qh_lock, self));
925 
926 	/* find self on the sleep queue */
927 	if ((qrp = qp->qh_root) != NULL) {
928 		for (prev = NULL, ulwpp = &qrp->qr_head;
929 		    (ulwp = *ulwpp) != NULL;
930 		    prev = ulwp, ulwpp = &ulwp->ul_link) {
931 			if (ulwp == self) {
932 				queue_unlink(qp, ulwpp, prev);
933 				self->ul_cvmutex = NULL;
934 				self->ul_sleepq = NULL;
935 				self->ul_wchan = NULL;
936 				found = 1;
937 				break;
938 			}
939 		}
940 	}
941 
942 	if (!found)
943 		thr_panic("dequeue_self(): curthread not found on queue");
944 
945 	return ((qrp = qp->qh_root) != NULL && qrp->qr_head != NULL);
946 }
947 
948 /*
949  * Called from call_user_handler() and _thrp_suspend() to take
950  * ourself off of our sleep queue so we can grab locks.
951  */
952 void
953 unsleep_self(void)
954 {
955 	ulwp_t *self = curthread;
956 	queue_head_t *qp;
957 
958 	/*
959 	 * Calling enter_critical()/exit_critical() here would lead
960 	 * to recursion.  Just manipulate self->ul_critical directly.
961 	 */
962 	self->ul_critical++;
963 	while (self->ul_sleepq != NULL) {
964 		qp = queue_lock(self->ul_wchan, self->ul_qtype);
965 		/*
966 		 * We may have been moved from a CV queue to a
967 		 * mutex queue while we were attempting queue_lock().
968 		 * If so, just loop around and try again.
969 		 * dequeue_self() clears self->ul_sleepq.
970 		 */
971 		if (qp == self->ul_sleepq)
972 			(void) dequeue_self(qp);
973 		queue_unlock(qp);
974 	}
975 	self->ul_writer = 0;
976 	self->ul_critical--;
977 }
978 
979 /*
980  * Common code for calling the ___lwp_mutex_timedlock() system call.
981  * Returns with mutex_owner and mutex_ownerpid set correctly.
982  */
983 static int
984 mutex_lock_kernel(mutex_t *mp, timespec_t *tsp, tdb_mutex_stats_t *msp)
985 {
986 	ulwp_t *self = curthread;
987 	uberdata_t *udp = self->ul_uberdata;
988 	int mtype = mp->mutex_type;
989 	hrtime_t begin_sleep;
990 	int acquired;
991 	int error;
992 
993 	self->ul_sp = stkptr();
994 	self->ul_wchan = mp;
995 	if (__td_event_report(self, TD_SLEEP, udp)) {
996 		self->ul_td_evbuf.eventnum = TD_SLEEP;
997 		self->ul_td_evbuf.eventdata = mp;
998 		tdb_event(TD_SLEEP, udp);
999 	}
1000 	if (msp) {
1001 		tdb_incr(msp->mutex_sleep);
1002 		begin_sleep = gethrtime();
1003 	}
1004 
1005 	DTRACE_PROBE1(plockstat, mutex__block, mp);
1006 
1007 	for (;;) {
1008 		/*
1009 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
1010 		 * means we successfully acquired the lock.
1011 		 */
1012 		if ((error = ___lwp_mutex_timedlock(mp, tsp)) != 0 &&
1013 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1014 			acquired = 0;
1015 			break;
1016 		}
1017 
1018 		if (mtype & USYNC_PROCESS) {
1019 			/*
1020 			 * Defend against forkall().  We may be the child,
1021 			 * in which case we don't actually own the mutex.
1022 			 */
1023 			enter_critical(self);
1024 			if (mp->mutex_ownerpid == udp->pid) {
1025 				mp->mutex_owner = (uintptr_t)self;
1026 				exit_critical(self);
1027 				acquired = 1;
1028 				break;
1029 			}
1030 			exit_critical(self);
1031 		} else {
1032 			mp->mutex_owner = (uintptr_t)self;
1033 			acquired = 1;
1034 			break;
1035 		}
1036 	}
1037 	if (msp)
1038 		msp->mutex_sleep_time += gethrtime() - begin_sleep;
1039 	self->ul_wchan = NULL;
1040 	self->ul_sp = 0;
1041 
1042 	if (acquired) {
1043 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1044 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1045 	} else {
1046 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1047 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1048 	}
1049 
1050 	return (error);
1051 }
1052 
1053 /*
1054  * Common code for calling the ___lwp_mutex_trylock() system call.
1055  * Returns with mutex_owner and mutex_ownerpid set correctly.
1056  */
1057 int
1058 mutex_trylock_kernel(mutex_t *mp)
1059 {
1060 	ulwp_t *self = curthread;
1061 	uberdata_t *udp = self->ul_uberdata;
1062 	int mtype = mp->mutex_type;
1063 	int error;
1064 	int acquired;
1065 
1066 	for (;;) {
1067 		/*
1068 		 * A return value of EOWNERDEAD or ELOCKUNMAPPED
1069 		 * means we successfully acquired the lock.
1070 		 */
1071 		if ((error = ___lwp_mutex_trylock(mp)) != 0 &&
1072 		    error != EOWNERDEAD && error != ELOCKUNMAPPED) {
1073 			acquired = 0;
1074 			break;
1075 		}
1076 
1077 		if (mtype & USYNC_PROCESS) {
1078 			/*
1079 			 * Defend against forkall().  We may be the child,
1080 			 * in which case we don't actually own the mutex.
1081 			 */
1082 			enter_critical(self);
1083 			if (mp->mutex_ownerpid == udp->pid) {
1084 				mp->mutex_owner = (uintptr_t)self;
1085 				exit_critical(self);
1086 				acquired = 1;
1087 				break;
1088 			}
1089 			exit_critical(self);
1090 		} else {
1091 			mp->mutex_owner = (uintptr_t)self;
1092 			acquired = 1;
1093 			break;
1094 		}
1095 	}
1096 
1097 	if (acquired) {
1098 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1099 	} else if (error != EBUSY) {
1100 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1101 	}
1102 
1103 	return (error);
1104 }
1105 
1106 volatile sc_shared_t *
1107 setup_schedctl(void)
1108 {
1109 	ulwp_t *self = curthread;
1110 	volatile sc_shared_t *scp;
1111 	sc_shared_t *tmp;
1112 
1113 	if ((scp = self->ul_schedctl) == NULL && /* no shared state yet */
1114 	    !self->ul_vfork &&			/* not a child of vfork() */
1115 	    !self->ul_schedctl_called) {	/* haven't been called before */
1116 		enter_critical(self);
1117 		self->ul_schedctl_called = &self->ul_uberdata->uberflags;
1118 		if ((tmp = __schedctl()) != (sc_shared_t *)(-1))
1119 			self->ul_schedctl = scp = tmp;
1120 		exit_critical(self);
1121 	}
1122 	/*
1123 	 * Unless the call to setup_schedctl() is surrounded
1124 	 * by enter_critical()/exit_critical(), the address
1125 	 * we are returning could be invalid due to a forkall()
1126 	 * having occurred in another thread.
1127 	 */
1128 	return (scp);
1129 }
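
/*
 * For illustration, the bracketing pattern the comment above calls for
 * (compare the code in no_preempt(), below): a caller that needs a stable
 * pointer dereferences it only inside the critical section:
 *
 *	enter_critical(self);
 *	if ((scp = self->ul_schedctl) != NULL ||
 *	    (scp = setup_schedctl()) != NULL) {
 *		... use scp only while still in the critical section ...
 *	}
 *	exit_critical(self);
 */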
1130 
1131 /*
1132  * Interfaces from libsched, incorporated into libc.
1133  * libsched.so.1 is now a filter library onto libc.
1134  */
1135 #pragma weak schedctl_lookup = schedctl_init
1136 schedctl_t *
1137 schedctl_init(void)
1138 {
1139 	volatile sc_shared_t *scp = setup_schedctl();
1140 	return ((scp == NULL)? NULL : (schedctl_t *)&scp->sc_preemptctl);
1141 }
1142 
1143 void
1144 schedctl_exit(void)
1145 {
1146 }
1147 
1148 /*
1149  * Contract private interface for java.
1150  * Set up the schedctl data if it doesn't exist yet.
1151  * Return a pointer to the pointer to the schedctl data.
1152  */
1153 volatile sc_shared_t *volatile *
1154 _thr_schedctl(void)
1155 {
1156 	ulwp_t *self = curthread;
1157 	volatile sc_shared_t *volatile *ptr;
1158 
1159 	if (self->ul_vfork)
1160 		return (NULL);
1161 	if (*(ptr = &self->ul_schedctl) == NULL)
1162 		(void) setup_schedctl();
1163 	return (ptr);
1164 }
1165 
1166 /*
1167  * Block signals and attempt to block preemption.
1168  * no_preempt()/preempt() must be used in pairs but can be nested.
1169  */
1170 void
1171 no_preempt(ulwp_t *self)
1172 {
1173 	volatile sc_shared_t *scp;
1174 
1175 	if (self->ul_preempt++ == 0) {
1176 		enter_critical(self);
1177 		if ((scp = self->ul_schedctl) != NULL ||
1178 		    (scp = setup_schedctl()) != NULL) {
1179 			/*
1180 			 * Save the pre-existing preempt value.
1181 			 */
1182 			self->ul_savpreempt = scp->sc_preemptctl.sc_nopreempt;
1183 			scp->sc_preemptctl.sc_nopreempt = 1;
1184 		}
1185 	}
1186 }
1187 
1188 /*
1189  * Undo the effects of no_preempt().
1190  */
1191 void
1192 preempt(ulwp_t *self)
1193 {
1194 	volatile sc_shared_t *scp;
1195 
1196 	ASSERT(self->ul_preempt > 0);
1197 	if (--self->ul_preempt == 0) {
1198 		if ((scp = self->ul_schedctl) != NULL) {
1199 			/*
1200 			 * Restore the pre-existing preempt value.
1201 			 */
1202 			scp->sc_preemptctl.sc_nopreempt = self->ul_savpreempt;
1203 			if (scp->sc_preemptctl.sc_yield &&
1204 			    scp->sc_preemptctl.sc_nopreempt == 0) {
1205 				yield();
1206 				if (scp->sc_preemptctl.sc_yield) {
1207 					/*
1208 					 * Shouldn't happen.  This is either
1209 					 * a race condition or the thread
1210 					 * just entered the real-time class.
1211 					 */
1212 					yield();
1213 					scp->sc_preemptctl.sc_yield = 0;
1214 				}
1215 			}
1216 		}
1217 		exit_critical(self);
1218 	}
1219 }
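
/*
 * For illustration, the pairing described above; spin_lock_set() and
 * spin_lock_clear() are one concrete instance of it:
 *
 *	no_preempt(self);
 *	... short stretch that must not be preempted ...
 *	preempt(self);
 *
 * The pairs may nest; only the outermost preempt() re-enables preemption.
 */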
1220 
1221 /*
1222  * If a call to preempt() would cause the current thread to yield or to
1223  * take deferred actions in exit_critical(), then unpark the specified
1224  * lwp so it can run while we delay.  Return the original lwpid if the
1225  * unpark was not performed, else return zero.  The tests are a repeat
1226  * of some of the tests in preempt(), above.  This is a statistical
1227  * optimization solely for cond_sleep_queue(), below.
1228  */
1229 static lwpid_t
1230 preempt_unpark(ulwp_t *self, lwpid_t lwpid)
1231 {
1232 	volatile sc_shared_t *scp = self->ul_schedctl;
1233 
1234 	ASSERT(self->ul_preempt == 1 && self->ul_critical > 0);
1235 	if ((scp != NULL && scp->sc_preemptctl.sc_yield) ||
1236 	    (self->ul_curplease && self->ul_critical == 1)) {
1237 		(void) __lwp_unpark(lwpid);
1238 		lwpid = 0;
1239 	}
1240 	return (lwpid);
1241 }
1242 
1243 /*
1244  * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1245  * If this fails, return EBUSY and let the caller deal with it.
1246  * If this succeeds, return 0 with mutex_owner set to curthread.
1247  */
1248 static int
1249 mutex_trylock_adaptive(mutex_t *mp, int tryhard)
1250 {
1251 	ulwp_t *self = curthread;
1252 	int error = EBUSY;
1253 	ulwp_t *ulwp;
1254 	volatile sc_shared_t *scp;
1255 	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
1256 	volatile uint64_t *ownerp = (volatile uint64_t *)&mp->mutex_owner;
1257 	uint32_t new_lockword;
1258 	int count = 0;
1259 	int max_count;
1260 	uint8_t max_spinners;
1261 
1262 	ASSERT(!(mp->mutex_type & USYNC_PROCESS));
1263 
1264 	if (MUTEX_OWNER(mp) == self)
1265 		return (EBUSY);
1266 
1267 	/* short-cut, not definitive (see below) */
1268 	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1269 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1270 		error = ENOTRECOVERABLE;
1271 		goto done;
1272 	}
1273 
1274 	/*
1275 	 * Make one attempt to acquire the lock before
1276 	 * incurring the overhead of the spin loop.
1277 	 */
1278 	if (set_lock_byte(lockp) == 0) {
1279 		*ownerp = (uintptr_t)self;
1280 		error = 0;
1281 		goto done;
1282 	}
1283 	if (!tryhard)
1284 		goto done;
1285 	if (ncpus == 0)
1286 		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1287 	if ((max_spinners = self->ul_max_spinners) >= ncpus)
1288 		max_spinners = ncpus - 1;
1289 	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1290 	if (max_count == 0)
1291 		goto done;
1292 
1293 	/*
1294 	 * This spin loop is unfair to lwps that have already dropped into
1295 	 * the kernel to sleep.  They will starve on a highly-contended mutex.
1296 	 * This is just too bad.  The adaptive spin algorithm is intended
1297 	 * to allow programs with highly-contended locks (that is, broken
1298 	 * programs) to execute with reasonable speed despite their contention.
1299 	 * Being fair would reduce the speed of such programs and well-written
1300 	 * programs will not suffer in any case.
1301 	 */
1302 	enter_critical(self);
1303 	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1) {
1304 		exit_critical(self);
1305 		goto done;
1306 	}
1307 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
1308 	for (count = 1; ; count++) {
1309 		if (*lockp == 0 && set_lock_byte(lockp) == 0) {
1310 			*ownerp = (uintptr_t)self;
1311 			error = 0;
1312 			break;
1313 		}
1314 		if (count == max_count)
1315 			break;
1316 		SMT_PAUSE();
1317 		/*
1318 		 * Stop spinning if the mutex owner is not running on
1319 		 * a processor; it will not drop the lock any time soon
1320 		 * and we would just be wasting time to keep spinning.
1321 		 *
1322 		 * Note that we are looking at another thread (ulwp_t)
1323 		 * without ensuring that the other thread does not exit.
1324 		 * The scheme relies on ulwp_t structures never being
1325 		 * deallocated by the library (the library employs a free
1326 		 * list of ulwp_t structs that are reused when new threads
1327 		 * are created) and on schedctl shared memory never being
1328 		 * deallocated once created via __schedctl().
1329 		 *
1330 		 * Thus, the worst that can happen when the spinning thread
1331 		 * looks at the owner's schedctl data is that it is looking
1332 		 * at some other thread's schedctl data.  This almost never
1333 		 * happens and is benign when it does.
1334 		 */
1335 		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1336 		    ((scp = ulwp->ul_schedctl) == NULL ||
1337 		    scp->sc_state != SC_ONPROC))
1338 			break;
1339 	}
1340 	new_lockword = spinners_decr(&mp->mutex_lockword);
1341 	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1342 		/*
1343 		 * We haven't yet acquired the lock, the lock
1344 		 * is free, and there are no other spinners.
1345 		 * Make one final attempt to acquire the lock.
1346 		 *
1347 		 * This isn't strictly necessary since mutex_lock_queue()
1348 		 * (the next action this thread will take if it doesn't
1349 		 * acquire the lock here) makes one attempt to acquire
1350 		 * the lock before putting the thread to sleep.
1351 		 *
1352 		 * If the next action for this thread (on failure here)
1353 		 * were not to call mutex_lock_queue(), this would be
1354 		 * necessary for correctness, to avoid ending up with an
1355 		 * unheld mutex with waiters but no one to wake them up.
1356 		 */
1357 		if (set_lock_byte(lockp) == 0) {
1358 			*ownerp = (uintptr_t)self;
1359 			error = 0;
1360 		}
1361 		count++;
1362 	}
1363 	exit_critical(self);
1364 
1365 done:
1366 	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1367 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1368 		/*
1369 		 * We shouldn't own the mutex.
1370 		 * Just clear the lock; everyone has already been woken up.
1371 		 */
1372 		mp->mutex_owner = 0;
1373 		(void) clear_lockbyte(&mp->mutex_lockword);
1374 		error = ENOTRECOVERABLE;
1375 	}
1376 
1377 	if (error) {
1378 		if (count) {
1379 			DTRACE_PROBE2(plockstat, mutex__spun, 0, count);
1380 		}
1381 		if (error != EBUSY) {
1382 			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1383 		}
1384 	} else {
1385 		if (count) {
1386 			DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
1387 		}
1388 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
1389 		if (mp->mutex_flag & LOCK_OWNERDEAD) {
1390 			ASSERT(mp->mutex_type & LOCK_ROBUST);
1391 			error = EOWNERDEAD;
1392 		}
1393 	}
1394 
1395 	return (error);
1396 }
1397 
1398 /*
1399  * Same as mutex_trylock_adaptive(), except specifically for queue locks.
1400  * The owner field is not set here; the caller (spin_lock_set()) sets it.
1401  */
1402 static int
1403 mutex_queuelock_adaptive(mutex_t *mp)
1404 {
1405 	ulwp_t *ulwp;
1406 	volatile sc_shared_t *scp;
1407 	volatile uint8_t *lockp;
1408 	volatile uint64_t *ownerp;
1409 	int count = curthread->ul_queue_spin;
1410 
1411 	ASSERT(mp->mutex_type == USYNC_THREAD);
1412 
1413 	if (count == 0)
1414 		return (EBUSY);
1415 
1416 	lockp = (volatile uint8_t *)&mp->mutex_lockw;
1417 	ownerp = (volatile uint64_t *)&mp->mutex_owner;
1418 	while (--count >= 0) {
1419 		if (*lockp == 0 && set_lock_byte(lockp) == 0)
1420 			return (0);
1421 		SMT_PAUSE();
1422 		if ((ulwp = (ulwp_t *)(uintptr_t)*ownerp) != NULL &&
1423 		    ((scp = ulwp->ul_schedctl) == NULL ||
1424 		    scp->sc_state != SC_ONPROC))
1425 			break;
1426 	}
1427 
1428 	return (EBUSY);
1429 }
1430 
1431 /*
1432  * Like mutex_trylock_adaptive(), but for process-shared mutexes.
1433  * Spin for a while (if 'tryhard' is true), trying to grab the lock.
1434  * If this fails, return EBUSY and let the caller deal with it.
1435  * If this succeeds, return 0 with mutex_owner set to curthread
1436  * and mutex_ownerpid set to the current pid.
1437  */
1438 static int
1439 mutex_trylock_process(mutex_t *mp, int tryhard)
1440 {
1441 	ulwp_t *self = curthread;
1442 	uberdata_t *udp = self->ul_uberdata;
1443 	int error = EBUSY;
1444 	volatile uint64_t *lockp = (volatile uint64_t *)&mp->mutex_lockword64;
1445 	uint32_t new_lockword;
1446 	int count = 0;
1447 	int max_count;
1448 	uint8_t max_spinners;
1449 
1450 #if defined(__sparc) && !defined(_LP64)
1451 	/* horrible hack, necessary only on 32-bit sparc */
1452 	int fix_alignment_problem =
1453 	    (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
1454 	    self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST));
1455 #endif
1456 
1457 	ASSERT(mp->mutex_type & USYNC_PROCESS);
1458 
1459 	if (shared_mutex_held(mp))
1460 		return (EBUSY);
1461 
1462 	/* short-cut, not definitive (see below) */
1463 	if (mp->mutex_flag & LOCK_NOTRECOVERABLE) {
1464 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1465 		error = ENOTRECOVERABLE;
1466 		goto done;
1467 	}
1468 
1469 	/*
1470 	 * Make one attempt to acquire the lock before
1471 	 * incurring the overhead of the spin loop.
1472 	 */
1473 	enter_critical(self);
1474 #if defined(__sparc) && !defined(_LP64)
1475 	/* horrible hack, necessary only on 32-bit sparc */
1476 	if (fix_alignment_problem) {
1477 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
1478 			mp->mutex_ownerpid = udp->pid;
1479 			mp->mutex_owner = (uintptr_t)self;
1480 			exit_critical(self);
1481 			error = 0;
1482 			goto done;
1483 		}
1484 	} else
1485 #endif
1486 	if (set_lock_byte64(lockp, udp->pid) == 0) {
1487 		mp->mutex_owner = (uintptr_t)self;
1488 		/* mp->mutex_ownerpid was set by set_lock_byte64() */
1489 		exit_critical(self);
1490 		error = 0;
1491 		goto done;
1492 	}
1493 	exit_critical(self);
1494 	if (!tryhard)
1495 		goto done;
1496 	if (ncpus == 0)
1497 		ncpus = (int)_sysconf(_SC_NPROCESSORS_ONLN);
1498 	if ((max_spinners = self->ul_max_spinners) >= ncpus)
1499 		max_spinners = ncpus - 1;
1500 	max_count = (max_spinners != 0)? self->ul_adaptive_spin : 0;
1501 	if (max_count == 0)
1502 		goto done;
1503 
1504 	/*
1505 	 * This is a process-shared mutex.
1506 	 * We cannot know if the owner is running on a processor.
1507 	 * We just spin and hope that it is on a processor.
1508 	 */
1509 	enter_critical(self);
1510 	if (spinners_incr(&mp->mutex_lockword, max_spinners) == -1) {
1511 		exit_critical(self);
1512 		goto done;
1513 	}
1514 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
1515 	for (count = 1; ; count++) {
1516 #if defined(__sparc) && !defined(_LP64)
1517 		/* horrible hack, necessary only on 32-bit sparc */
1518 		if (fix_alignment_problem) {
1519 			if ((*lockp & LOCKMASK64) == 0 &&
1520 			    set_lock_byte(&mp->mutex_lockw) == 0) {
1521 				mp->mutex_ownerpid = udp->pid;
1522 				mp->mutex_owner = (uintptr_t)self;
1523 				error = 0;
1524 				break;
1525 			}
1526 		} else
1527 #endif
1528 		if ((*lockp & LOCKMASK64) == 0 &&
1529 		    set_lock_byte64(lockp, udp->pid) == 0) {
1530 			mp->mutex_owner = (uintptr_t)self;
1531 			/* mp->mutex_ownerpid was set by set_lock_byte64() */
1532 			error = 0;
1533 			break;
1534 		}
1535 		if (count == max_count)
1536 			break;
1537 		SMT_PAUSE();
1538 	}
1539 	new_lockword = spinners_decr(&mp->mutex_lockword);
1540 	if (error && (new_lockword & (LOCKMASK | SPINNERMASK)) == 0) {
1541 		/*
1542 		 * We haven't yet acquired the lock, the lock
1543 		 * is free, and there are no other spinners.
1544 		 * Make one final attempt to acquire the lock.
1545 		 *
1546 		 * This isn't strictly necessary since mutex_lock_kernel()
1547 		 * (the next action this thread will take if it doesn't
1548 		 * acquire the lock here) makes one attempt to acquire
1549 		 * the lock before putting the thread to sleep.
1550 		 *
1551 		 * If the next action for this thread (on failure here)
1552 		 * were not to call mutex_lock_kernel(), this would be
1553 		 * necessary for correctness, to avoid ending up with an
1554 		 * unheld mutex with waiters but no one to wake them up.
1555 		 */
1556 #if defined(__sparc) && !defined(_LP64)
1557 		/* horrible hack, necessary only on 32-bit sparc */
1558 		if (fix_alignment_problem) {
1559 			if (set_lock_byte(&mp->mutex_lockw) == 0) {
1560 				mp->mutex_ownerpid = udp->pid;
1561 				mp->mutex_owner = (uintptr_t)self;
1562 				error = 0;
1563 			}
1564 		} else
1565 #endif
1566 		if (set_lock_byte64(lockp, udp->pid) == 0) {
1567 			mp->mutex_owner = (uintptr_t)self;
1568 			/* mp->mutex_ownerpid was set by set_lock_byte64() */
1569 			error = 0;
1570 		}
1571 		count++;
1572 	}
1573 	exit_critical(self);
1574 
1575 done:
1576 	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1577 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1578 		/*
1579 		 * We shouldn't own the mutex.
1580 		 * Just clear the lock; everyone has already been woken up.
1581 		 */
1582 		mp->mutex_owner = 0;
1583 		/* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1584 		(void) clear_lockbyte64(&mp->mutex_lockword64);
1585 		error = ENOTRECOVERABLE;
1586 	}
1587 
1588 	if (error) {
1589 		if (count) {
1590 			DTRACE_PROBE2(plockstat, mutex__spun, 0, count);
1591 		}
1592 		if (error != EBUSY) {
1593 			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1594 		}
1595 	} else {
1596 		if (count) {
1597 			DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
1598 		}
1599 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
1600 		if (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1601 			ASSERT(mp->mutex_type & LOCK_ROBUST);
1602 			if (mp->mutex_flag & LOCK_OWNERDEAD)
1603 				error = EOWNERDEAD;
1604 			else if (mp->mutex_type & USYNC_PROCESS_ROBUST)
1605 				error = ELOCKUNMAPPED;
1606 			else
1607 				error = EOWNERDEAD;
1608 		}
1609 	}
1610 
1611 	return (error);
1612 }
1613 
1614 /*
1615  * Mutex wakeup code for releasing a USYNC_THREAD mutex.
1616  * Returns the lwpid of the thread that was dequeued, if any.
1617  * The caller of mutex_wakeup() must call __lwp_unpark(lwpid)
1618  * to wake up the specified lwp.
1619  */
1620 static lwpid_t
1621 mutex_wakeup(mutex_t *mp)
1622 {
1623 	lwpid_t lwpid = 0;
1624 	int more;
1625 	queue_head_t *qp;
1626 	ulwp_t *ulwp;
1627 
1628 	/*
1629 	 * Dequeue a waiter from the sleep queue.  Don't touch the mutex
1630 	 * waiters bit if no one was found on the queue because the mutex
1631 	 * might have been deallocated or reallocated for another purpose.
1632 	 */
1633 	qp = queue_lock(mp, MX);
1634 	if ((ulwp = dequeue(qp, &more)) != NULL) {
1635 		lwpid = ulwp->ul_lwpid;
1636 		mp->mutex_waiters = more;
1637 	}
1638 	queue_unlock(qp);
1639 	return (lwpid);
1640 }
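
/*
 * For illustration, the caller-side pattern implied by the comment above:
 * whoever ends up with a nonzero lwpid (for example via the return value
 * of mutex_unlock_queue(), below) completes the wakeup itself and only
 * then re-enables preemption:
 *
 *	if ((lwpid = mutex_unlock_queue(mp, release_all)) != 0) {
 *		(void) __lwp_unpark(lwpid);
 *		preempt(self);
 *	}
 */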
1641 
1642 /*
1643  * Mutex wakeup code for releasing all waiters on a USYNC_THREAD mutex.
1644  */
1645 static void
1646 mutex_wakeup_all(mutex_t *mp)
1647 {
1648 	queue_head_t *qp;
1649 	queue_root_t *qrp;
1650 	int nlwpid = 0;
1651 	int maxlwps = MAXLWPS;
1652 	ulwp_t *ulwp;
1653 	lwpid_t buffer[MAXLWPS];
1654 	lwpid_t *lwpid = buffer;
1655 
1656 	/*
1657 	 * Walk the list of waiters and prepare to wake up all of them.
1658 	 * The waiters flag has already been cleared from the mutex.
1659 	 *
1660 	 * We keep track of lwpids that are to be unparked in lwpid[].
1661 	 * __lwp_unpark_all() is called to unpark all of them after
1662 	 * they have been removed from the sleep queue and the sleep
1663 	 * queue lock has been dropped.  If we run out of space in our
1664 	 * on-stack buffer, we need to allocate more but we can't call
1665 	 * lmalloc() because we are holding a queue lock when the overflow
1666 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
1667 	 * either because the application may have allocated a small
1668 	 * stack and we don't want to overrun the stack.  So we call
1669 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
1670 	 * system call directly since that path acquires no locks.
1671 	 */
1672 	qp = queue_lock(mp, MX);
1673 	for (;;) {
1674 		if ((qrp = qp->qh_root) == NULL ||
1675 		    (ulwp = qrp->qr_head) == NULL)
1676 			break;
1677 		ASSERT(ulwp->ul_wchan == mp);
1678 		queue_unlink(qp, &qrp->qr_head, NULL);
1679 		ulwp->ul_sleepq = NULL;
1680 		ulwp->ul_wchan = NULL;
1681 		if (nlwpid == maxlwps)
1682 			lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
1683 		lwpid[nlwpid++] = ulwp->ul_lwpid;
1684 	}
1685 
1686 	if (nlwpid == 0) {
1687 		queue_unlock(qp);
1688 	} else {
1689 		mp->mutex_waiters = 0;
1690 		no_preempt(curthread);
1691 		queue_unlock(qp);
1692 		if (nlwpid == 1)
1693 			(void) __lwp_unpark(lwpid[0]);
1694 		else
1695 			(void) __lwp_unpark_all(lwpid, nlwpid);
1696 		preempt(curthread);
1697 	}
1698 
1699 	if (lwpid != buffer)
1700 		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
1701 }
1702 
1703 /*
1704  * Release a process-private mutex.
1705  * As an optimization, if there are waiters but there are also spinners
1706  * attempting to acquire the mutex, then don't bother waking up a waiter;
1707  * one of the spinners will acquire the mutex soon and it would be a waste
1708  * of resources to wake up some thread just to have it spin for a while
1709  * and then possibly go back to sleep.  See mutex_trylock_adaptive().
1710  */
1711 static lwpid_t
1712 mutex_unlock_queue(mutex_t *mp, int release_all)
1713 {
1714 	lwpid_t lwpid = 0;
1715 	uint32_t old_lockword;
1716 
1717 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1718 	mp->mutex_owner = 0;
1719 	old_lockword = clear_lockbyte(&mp->mutex_lockword);
1720 	if ((old_lockword & WAITERMASK) &&
1721 	    (release_all || (old_lockword & SPINNERMASK) == 0)) {
1722 		ulwp_t *self = curthread;
1723 		no_preempt(self);	/* ensure a prompt wakeup */
1724 		if (release_all)
1725 			mutex_wakeup_all(mp);
1726 		else
1727 			lwpid = mutex_wakeup(mp);
1728 		if (lwpid == 0)
1729 			preempt(self);
1730 	}
1731 	return (lwpid);
1732 }
1733 
1734 /*
1735  * Like mutex_unlock_queue(), but for process-shared mutexes.
1736  */
1737 static void
1738 mutex_unlock_process(mutex_t *mp, int release_all)
1739 {
1740 	ulwp_t *self = curthread;
1741 	uint64_t old_lockword64;
1742 
1743 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
1744 	mp->mutex_owner = 0;
1745 #if defined(__sparc) && !defined(_LP64)
1746 	/* horrible hack, necessary only on 32-bit sparc */
1747 	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
1748 	    self->ul_misaligned && !(mp->mutex_type & LOCK_ROBUST)) {
1749 		uint32_t old_lockword;
1750 		mp->mutex_ownerpid = 0;
1751 		old_lockword = clear_lockbyte(&mp->mutex_lockword);
1752 		if ((old_lockword & WAITERMASK) &&
1753 		    (release_all || (old_lockword & SPINNERMASK) == 0)) {
1754 			no_preempt(self);	/* ensure a prompt wakeup */
1755 			(void) ___lwp_mutex_wakeup(mp, release_all);
1756 			preempt(self);
1757 		}
1758 		return;
1759 	}
1760 #endif
1761 	/* mp->mutex_ownerpid is cleared by clear_lockbyte64() */
1762 	old_lockword64 = clear_lockbyte64(&mp->mutex_lockword64);
1763 	if ((old_lockword64 & WAITERMASK64) &&
1764 	    (release_all || (old_lockword64 & SPINNERMASK64) == 0)) {
1765 		no_preempt(self);	/* ensure a prompt wakeup */
1766 		(void) ___lwp_mutex_wakeup(mp, release_all);
1767 		preempt(self);
1768 	}
1769 }
1770 
1771 void
1772 stall(void)
1773 {
1774 	for (;;)
1775 		(void) mutex_lock_kernel(&stall_mutex, NULL, NULL);
1776 }
1777 
1778 /*
1779  * Acquire a USYNC_THREAD mutex via user-level sleep queues.
1780  * We failed set_lock_byte(&mp->mutex_lockw) before coming here.
1781  * If successful, returns with mutex_owner set correctly.
1782  */
1783 int
1784 mutex_lock_queue(ulwp_t *self, tdb_mutex_stats_t *msp, mutex_t *mp,
1785 	timespec_t *tsp)
1786 {
1787 	uberdata_t *udp = curthread->ul_uberdata;
1788 	queue_head_t *qp;
1789 	hrtime_t begin_sleep;
1790 	int error = 0;
1791 
1792 	self->ul_sp = stkptr();
1793 	if (__td_event_report(self, TD_SLEEP, udp)) {
1794 		self->ul_wchan = mp;
1795 		self->ul_td_evbuf.eventnum = TD_SLEEP;
1796 		self->ul_td_evbuf.eventdata = mp;
1797 		tdb_event(TD_SLEEP, udp);
1798 	}
1799 	if (msp) {
1800 		tdb_incr(msp->mutex_sleep);
1801 		begin_sleep = gethrtime();
1802 	}
1803 
1804 	DTRACE_PROBE1(plockstat, mutex__block, mp);
1805 
1806 	/*
1807 	 * Put ourself on the sleep queue, and while we are
1808 	 * unable to grab the lock, go park in the kernel.
1809 	 * Take ourself off the sleep queue after we acquire the lock.
1810 	 * The waiter bit can be set/cleared only while holding the queue lock.
1811 	 */
1812 	qp = queue_lock(mp, MX);
1813 	enqueue(qp, self, 0);
1814 	mp->mutex_waiters = 1;
1815 	for (;;) {
1816 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
1817 			mp->mutex_owner = (uintptr_t)self;
1818 			mp->mutex_waiters = dequeue_self(qp);
1819 			break;
1820 		}
1821 		set_parking_flag(self, 1);
1822 		queue_unlock(qp);
1823 		/*
1824 		 * __lwp_park() will return the residual time in tsp
1825 		 * if we are unparked before the timeout expires.
1826 		 */
1827 		error = __lwp_park(tsp, 0);
1828 		set_parking_flag(self, 0);
1829 		/*
1830 		 * We could have taken a signal or suspended ourself.
1831 		 * If we did, then we removed ourself from the queue.
1832 		 * Someone else may have removed us from the queue
1833 		 * as a consequence of mutex_unlock().  We may have
1834 		 * gotten a timeout from __lwp_park().  Or we may still
1835 		 * be on the queue and this is just a spurious wakeup.
1836 		 */
1837 		qp = queue_lock(mp, MX);
1838 		if (self->ul_sleepq == NULL) {
1839 			if (error) {
1840 				mp->mutex_waiters = queue_waiter(qp)? 1 : 0;
1841 				if (error != EINTR)
1842 					break;
1843 				error = 0;
1844 			}
1845 			if (set_lock_byte(&mp->mutex_lockw) == 0) {
1846 				mp->mutex_owner = (uintptr_t)self;
1847 				break;
1848 			}
1849 			enqueue(qp, self, 0);
1850 			mp->mutex_waiters = 1;
1851 		}
1852 		ASSERT(self->ul_sleepq == qp &&
1853 		    self->ul_qtype == MX &&
1854 		    self->ul_wchan == mp);
1855 		if (error) {
1856 			if (error != EINTR) {
1857 				mp->mutex_waiters = dequeue_self(qp);
1858 				break;
1859 			}
1860 			error = 0;
1861 		}
1862 	}
1863 	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
1864 	    self->ul_wchan == NULL);
1865 	self->ul_sp = 0;
1866 	queue_unlock(qp);
1867 
1868 	if (msp)
1869 		msp->mutex_sleep_time += gethrtime() - begin_sleep;
1870 
1871 	ASSERT(error == 0 || error == EINVAL || error == ETIME);
1872 
1873 	if (error == 0 && (mp->mutex_flag & LOCK_NOTRECOVERABLE)) {
1874 		ASSERT(mp->mutex_type & LOCK_ROBUST);
1875 		/*
1876 		 * We shouldn't own the mutex.
1877 		 * Just clear the lock; everyone has already been waked up.
1878 		 */
1879 		mp->mutex_owner = 0;
1880 		(void) clear_lockbyte(&mp->mutex_lockword);
1881 		error = ENOTRECOVERABLE;
1882 	}
1883 
1884 	if (error) {
1885 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 0);
1886 		DTRACE_PROBE2(plockstat, mutex__error, mp, error);
1887 	} else {
1888 		DTRACE_PROBE2(plockstat, mutex__blocked, mp, 1);
1889 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
1890 		if (mp->mutex_flag & LOCK_OWNERDEAD) {
1891 			ASSERT(mp->mutex_type & LOCK_ROBUST);
1892 			error = EOWNERDEAD;
1893 		}
1894 	}
1895 
1896 	return (error);
1897 }
1898 
1899 static int
1900 mutex_recursion(mutex_t *mp, int mtype, int try)
1901 {
1902 	ASSERT(mutex_held(mp));
1903 	ASSERT(mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK));
1904 	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
1905 
1906 	if (mtype & LOCK_RECURSIVE) {
1907 		if (mp->mutex_rcount == RECURSION_MAX) {
1908 			DTRACE_PROBE2(plockstat, mutex__error, mp, EAGAIN);
1909 			return (EAGAIN);
1910 		}
1911 		mp->mutex_rcount++;
1912 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 1, 0);
1913 		return (0);
1914 	}
1915 	if (try == MUTEX_LOCK) {
1916 		DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
1917 		return (EDEADLK);
1918 	}
1919 	return (EBUSY);
1920 }
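
/*
 * Example of what the logic above means to a caller, assuming the
 * standard pthread mutex types map onto LOCK_ERRORCHECK and
 * LOCK_RECURSIVE (a minimal sketch, not part of libc):
 *
 *	pthread_mutex_t em, rm;
 *	pthread_mutexattr_t a;
 *
 *	(void) pthread_mutexattr_init(&a);
 *	(void) pthread_mutexattr_settype(&a, PTHREAD_MUTEX_ERRORCHECK);
 *	(void) pthread_mutex_init(&em, &a);
 *	(void) pthread_mutexattr_settype(&a, PTHREAD_MUTEX_RECURSIVE);
 *	(void) pthread_mutex_init(&rm, &a);
 *
 *	(void) pthread_mutex_lock(&em);
 *	error = pthread_mutex_lock(&em);	now EDEADLK (MUTEX_LOCK)
 *	error = pthread_mutex_trylock(&em);	now EBUSY (MUTEX_TRY)
 *
 *	(void) pthread_mutex_lock(&rm);
 *	error = pthread_mutex_lock(&rm);	0; mutex_rcount is now 1
 *	(void) pthread_mutex_unlock(&rm);
 *	(void) pthread_mutex_unlock(&rm);
 */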
1921 
1922 /*
1923  * Register this USYNC_PROCESS|LOCK_ROBUST mutex with the kernel so
1924  * it can apply LOCK_OWNERDEAD|LOCK_UNMAPPED if it becomes necessary.
1925  * We use tdb_hash_lock here and in the synch object tracking code in
1926  * the tdb_agent.c file.  There is no conflict between these two usages.
1927  */
1928 void
1929 register_lock(mutex_t *mp)
1930 {
1931 	uberdata_t *udp = curthread->ul_uberdata;
1932 	uint_t hash = LOCK_HASH(mp);
1933 	robust_t *rlp;
1934 	robust_t **rlpp;
1935 	robust_t **table;
1936 
1937 	if ((table = udp->robustlocks) == NULL) {
1938 		lmutex_lock(&udp->tdb_hash_lock);
1939 		if ((table = udp->robustlocks) == NULL) {
1940 			table = lmalloc(LOCKHASHSZ * sizeof (robust_t *));
1941 			membar_producer();
1942 			udp->robustlocks = table;
1943 		}
1944 		lmutex_unlock(&udp->tdb_hash_lock);
1945 	}
1946 	membar_consumer();
1947 
1948 	/*
1949 	 * First search the registered table with no locks held.
1950 	 * This is safe because the table never shrinks
1951 	 * and we can only get a false negative.
1952 	 */
1953 	for (rlp = table[hash]; rlp != NULL; rlp = rlp->robust_next) {
1954 		if (rlp->robust_lock == mp)	/* already registered */
1955 			return;
1956 	}
1957 
1958 	/*
1959 	 * The lock was not found.
1960 	 * Repeat the operation with tdb_hash_lock held.
1961 	 */
1962 	lmutex_lock(&udp->tdb_hash_lock);
1963 
1964 	for (rlpp = &table[hash];
1965 	    (rlp = *rlpp) != NULL;
1966 	    rlpp = &rlp->robust_next) {
1967 		if (rlp->robust_lock == mp) {	/* already registered */
1968 			lmutex_unlock(&udp->tdb_hash_lock);
1969 			return;
1970 		}
1971 	}
1972 
1973 	/*
1974 	 * The lock has never been registered.
1975 	 * Register it now and add it to the table.
1976 	 */
1977 	(void) ___lwp_mutex_register(mp);
1978 	rlp = lmalloc(sizeof (*rlp));
1979 	rlp->robust_lock = mp;
1980 	membar_producer();
1981 	*rlpp = rlp;
1982 
1983 	lmutex_unlock(&udp->tdb_hash_lock);
1984 }
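
/*
 * For illustration, the kind of application setup that causes a mutex
 * to be registered here on its first acquisition (a sketch; the
 * attribute calls shown are the usual POSIX/Solaris interfaces and are
 * an assumption of this example, not something defined in this file):
 *
 *	pthread_mutexattr_t a;
 *	pthread_mutex_t *mp = mmap(NULL, sizeof (pthread_mutex_t),
 *	    PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, (off_t)0);
 *
 *	(void) pthread_mutexattr_init(&a);
 *	(void) pthread_mutexattr_setpshared(&a, PTHREAD_PROCESS_SHARED);
 *	(void) pthread_mutexattr_setrobust_np(&a, PTHREAD_MUTEX_ROBUST_NP);
 *	(void) pthread_mutex_init(mp, &a);
 *
 *	(void) pthread_mutex_lock(mp);	first lock registers with the kernel
 */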
1985 
1986 /*
1987  * This is called in the child of fork()/forkall() to start over
1988  * with a clean slate.  (Each process must register its own locks.)
1989  * No locks are needed because all other threads are suspended or gone.
1990  */
1991 void
1992 unregister_locks(void)
1993 {
1994 	uberdata_t *udp = curthread->ul_uberdata;
1995 	uint_t hash;
1996 	robust_t **table;
1997 	robust_t *rlp;
1998 	robust_t *next;
1999 
2000 	if ((table = udp->robustlocks) != NULL) {
2001 		for (hash = 0; hash < LOCKHASHSZ; hash++) {
2002 			rlp = table[hash];
2003 			while (rlp != NULL) {
2004 				next = rlp->robust_next;
2005 				lfree(rlp, sizeof (*rlp));
2006 				rlp = next;
2007 			}
2008 		}
2009 		lfree(table, LOCKHASHSZ * sizeof (robust_t *));
2010 		udp->robustlocks = NULL;
2011 	}
2012 }
2013 
2014 /*
2015  * Returns with mutex_owner set correctly.
2016  */
2017 int
2018 mutex_lock_internal(mutex_t *mp, timespec_t *tsp, int try)
2019 {
2020 	ulwp_t *self = curthread;
2021 	uberdata_t *udp = self->ul_uberdata;
2022 	int mtype = mp->mutex_type;
2023 	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2024 	int error = 0;
2025 	int noceil = try & MUTEX_NOCEIL;
2026 	uint8_t ceil;
2027 	int myprio;
2028 
2029 	try &= ~MUTEX_NOCEIL;
2030 	ASSERT(try == MUTEX_TRY || try == MUTEX_LOCK);
2031 
2032 	if (!self->ul_schedctl_called)
2033 		(void) setup_schedctl();
2034 
2035 	if (msp && try == MUTEX_TRY)
2036 		tdb_incr(msp->mutex_try);
2037 
2038 	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && mutex_held(mp))
2039 		return (mutex_recursion(mp, mtype, try));
2040 
2041 	if (self->ul_error_detection && try == MUTEX_LOCK &&
2042 	    tsp == NULL && mutex_held(mp))
2043 		lock_error(mp, "mutex_lock", NULL, NULL);
2044 
2045 	if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2046 		update_sched(self);
2047 		if (self->ul_cid != self->ul_rtclassid) {
2048 			DTRACE_PROBE2(plockstat, mutex__error, mp, EPERM);
2049 			return (EPERM);
2050 		}
2051 		ceil = mp->mutex_ceiling;
2052 		myprio = self->ul_epri? self->ul_epri : self->ul_pri;
2053 		if (myprio > ceil) {
2054 			DTRACE_PROBE2(plockstat, mutex__error, mp, EINVAL);
2055 			return (EINVAL);
2056 		}
2057 		if ((error = _ceil_mylist_add(mp)) != 0) {
2058 			DTRACE_PROBE2(plockstat, mutex__error, mp, error);
2059 			return (error);
2060 		}
2061 		if (myprio < ceil)
2062 			_ceil_prio_inherit(ceil);
2063 	}
2064 
2065 	if ((mtype & (USYNC_PROCESS | LOCK_ROBUST))
2066 	    == (USYNC_PROCESS | LOCK_ROBUST))
2067 		register_lock(mp);
2068 
2069 	if (mtype & LOCK_PRIO_INHERIT) {
2070 		/* go straight to the kernel */
2071 		if (try == MUTEX_TRY)
2072 			error = mutex_trylock_kernel(mp);
2073 		else	/* MUTEX_LOCK */
2074 			error = mutex_lock_kernel(mp, tsp, msp);
2075 		/*
2076 		 * The kernel never sets or clears the lock byte
2077 		 * for LOCK_PRIO_INHERIT mutexes.
2078 		 * Set it here for consistency.
2079 		 */
2080 		switch (error) {
2081 		case 0:
2082 			self->ul_pilocks++;
2083 			mp->mutex_lockw = LOCKSET;
2084 			break;
2085 		case EOWNERDEAD:
2086 		case ELOCKUNMAPPED:
2087 			self->ul_pilocks++;
2088 			mp->mutex_lockw = LOCKSET;
2089 			/* FALLTHROUGH */
2090 		case ENOTRECOVERABLE:
2091 			ASSERT(mtype & LOCK_ROBUST);
2092 			break;
2093 		case EDEADLK:
2094 			if (try == MUTEX_LOCK)
2095 				stall();
2096 			error = EBUSY;
2097 			break;
2098 		}
2099 	} else if (mtype & USYNC_PROCESS) {
2100 		error = mutex_trylock_process(mp, try == MUTEX_LOCK);
2101 		if (error == EBUSY && try == MUTEX_LOCK)
2102 			error = mutex_lock_kernel(mp, tsp, msp);
2103 	} else {	/* USYNC_THREAD */
2104 		error = mutex_trylock_adaptive(mp, try == MUTEX_LOCK);
2105 		if (error == EBUSY && try == MUTEX_LOCK)
2106 			error = mutex_lock_queue(self, msp, mp, tsp);
2107 	}
2108 
2109 	switch (error) {
2110 	case 0:
2111 	case EOWNERDEAD:
2112 	case ELOCKUNMAPPED:
2113 		if (mtype & LOCK_ROBUST)
2114 			remember_lock(mp);
2115 		if (msp)
2116 			record_begin_hold(msp);
2117 		break;
2118 	default:
2119 		if ((mtype & LOCK_PRIO_PROTECT) && noceil == 0) {
2120 			(void) _ceil_mylist_del(mp);
2121 			if (myprio < ceil)
2122 				_ceil_prio_waive();
2123 		}
2124 		if (try == MUTEX_TRY) {
2125 			if (msp)
2126 				tdb_incr(msp->mutex_try_fail);
2127 			if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2128 				self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2129 				tdb_event(TD_LOCK_TRY, udp);
2130 			}
2131 		}
2132 		break;
2133 	}
2134 
2135 	return (error);
2136 }
2137 
2138 int
2139 fast_process_lock(mutex_t *mp, timespec_t *tsp, int mtype, int try)
2140 {
2141 	ulwp_t *self = curthread;
2142 	uberdata_t *udp = self->ul_uberdata;
2143 
2144 	/*
2145 	 * We know that USYNC_PROCESS is set in mtype and that
2146 	 * zero, one, or both of the flags LOCK_RECURSIVE and
2147 	 * LOCK_ERRORCHECK are set, and that no other flags are set.
2148 	 */
2149 	ASSERT((mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0);
2150 	enter_critical(self);
2151 #if defined(__sparc) && !defined(_LP64)
2152 	/* horrible hack, necessary only on 32-bit sparc */
2153 	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2154 	    self->ul_misaligned) {
2155 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2156 			mp->mutex_ownerpid = udp->pid;
2157 			mp->mutex_owner = (uintptr_t)self;
2158 			exit_critical(self);
2159 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2160 			return (0);
2161 		}
2162 	} else
2163 #endif
2164 	if (set_lock_byte64(&mp->mutex_lockword64, udp->pid) == 0) {
2165 		mp->mutex_owner = (uintptr_t)self;
2166 		/* mp->mutex_ownerpid was set by set_lock_byte64() */
2167 		exit_critical(self);
2168 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2169 		return (0);
2170 	}
2171 	exit_critical(self);
2172 
2173 	if ((mtype & (LOCK_RECURSIVE|LOCK_ERRORCHECK)) && shared_mutex_held(mp))
2174 		return (mutex_recursion(mp, mtype, try));
2175 
2176 	if (try == MUTEX_LOCK) {
2177 		if (mutex_trylock_process(mp, 1) == 0)
2178 			return (0);
2179 		return (mutex_lock_kernel(mp, tsp, NULL));
2180 	}
2181 
2182 	if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2183 		self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2184 		tdb_event(TD_LOCK_TRY, udp);
2185 	}
2186 	return (EBUSY);
2187 }
2188 
2189 static int
2190 mutex_lock_impl(mutex_t *mp, timespec_t *tsp)
2191 {
2192 	ulwp_t *self = curthread;
2193 	int mtype = mp->mutex_type;
2194 	uberflags_t *gflags;
2195 
2196 	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2197 	    self->ul_error_detection && self->ul_misaligned == 0)
2198 		lock_error(mp, "mutex_lock", NULL, "mutex is misaligned");
2199 
2200 	/*
2201 	 * Optimize the case of USYNC_THREAD, including
2202 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2203 	 * no error detection, no lock statistics,
2204 	 * and the process has only a single thread.
2205 	 * (Most likely a traditional single-threaded application.)
2206 	 */
2207 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2208 	    self->ul_uberdata->uberflags.uf_all) == 0) {
2209 		/*
2210 		 * Only one thread exists so we don't need an atomic operation.
2211 		 */
2212 		if (mp->mutex_lockw == 0) {
2213 			mp->mutex_lockw = LOCKSET;
2214 			mp->mutex_owner = (uintptr_t)self;
2215 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2216 			return (0);
2217 		}
2218 		if (mtype && MUTEX_OWNER(mp) == self)
2219 			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2220 		/*
2221 		 * We have reached a deadlock, probably because the
2222 		 * process is executing non-async-signal-safe code in
2223 		 * a signal handler and is attempting to acquire a lock
2224 		 * that it already owns.  This is not surprising, given
2225 		 * bad programming practices over the years that have
2226 		 * resulted in applications calling printf() and such
2227 		 * in their signal handlers.  Unless the user has told
2228 		 * us that the signal handlers are safe by setting:
2229 		 *	export _THREAD_ASYNC_SAFE=1
2230 		 * we return EDEADLK rather than actually deadlocking.
2231 		 */
2232 		if (tsp == NULL &&
2233 		    MUTEX_OWNER(mp) == self && !self->ul_async_safe) {
2234 			DTRACE_PROBE2(plockstat, mutex__error, mp, EDEADLK);
2235 			return (EDEADLK);
2236 		}
2237 	}
2238 
2239 	/*
2240 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2241 	 * no error detection, and no lock statistics.
2242 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2243 	 */
2244 	if ((gflags = self->ul_schedctl_called) != NULL &&
2245 	    (gflags->uf_trs_ted |
2246 	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2247 		if (mtype & USYNC_PROCESS)
2248 			return (fast_process_lock(mp, tsp, mtype, MUTEX_LOCK));
2249 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2250 			mp->mutex_owner = (uintptr_t)self;
2251 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2252 			return (0);
2253 		}
2254 		if (mtype && MUTEX_OWNER(mp) == self)
2255 			return (mutex_recursion(mp, mtype, MUTEX_LOCK));
2256 		if (mutex_trylock_adaptive(mp, 1) != 0)
2257 			return (mutex_lock_queue(self, NULL, mp, tsp));
2258 		return (0);
2259 	}
2260 
2261 	/* else do it the long way */
2262 	return (mutex_lock_internal(mp, tsp, MUTEX_LOCK));
2263 }
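
/*
 * The self-deadlock avoidance described above is visible even to a
 * single-threaded program (a minimal sketch of the observable
 * behavior):
 *
 *	static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
 *
 *	(void) pthread_mutex_lock(&m);
 *	error = pthread_mutex_lock(&m);
 *
 * The second call returns EDEADLK rather than hanging, unless the
 * application has set _THREAD_ASYNC_SAFE=1, in which case the thread
 * really does deadlock.
 */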
2264 
2265 #pragma weak pthread_mutex_lock = mutex_lock
2266 #pragma weak _mutex_lock = mutex_lock
2267 int
2268 mutex_lock(mutex_t *mp)
2269 {
2270 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2271 	return (mutex_lock_impl(mp, NULL));
2272 }
2273 
2274 int
2275 pthread_mutex_timedlock(pthread_mutex_t *_RESTRICT_KYWD mp,
2276 	const struct timespec *_RESTRICT_KYWD abstime)
2277 {
2278 	timespec_t tslocal;
2279 	int error;
2280 
2281 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2282 	abstime_to_reltime(CLOCK_REALTIME, abstime, &tslocal);
2283 	error = mutex_lock_impl((mutex_t *)mp, &tslocal);
2284 	if (error == ETIME)
2285 		error = ETIMEDOUT;
2286 	return (error);
2287 }
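
/*
 * A caller passes an absolute CLOCK_REALTIME deadline; the ETIME
 * produced internally is mapped to ETIMEDOUT above (a minimal sketch):
 *
 *	struct timespec abst;
 *
 *	(void) clock_gettime(CLOCK_REALTIME, &abst);
 *	abst.tv_sec += 5;
 *	error = pthread_mutex_timedlock(&m, &abst);
 *
 * ETIMEDOUT here means the lock could not be acquired within roughly
 * five seconds; 0 means the lock is now held.
 */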
2288 
2289 int
2290 pthread_mutex_reltimedlock_np(pthread_mutex_t *_RESTRICT_KYWD mp,
2291 	const struct timespec *_RESTRICT_KYWD reltime)
2292 {
2293 	timespec_t tslocal;
2294 	int error;
2295 
2296 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2297 	tslocal = *reltime;
2298 	error = mutex_lock_impl((mutex_t *)mp, &tslocal);
2299 	if (error == ETIME)
2300 		error = ETIMEDOUT;
2301 	return (error);
2302 }
2303 
2304 #pragma weak pthread_mutex_trylock = mutex_trylock
2305 int
2306 mutex_trylock(mutex_t *mp)
2307 {
2308 	ulwp_t *self = curthread;
2309 	uberdata_t *udp = self->ul_uberdata;
2310 	int mtype = mp->mutex_type;
2311 	uberflags_t *gflags;
2312 
2313 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
2314 
2315 	/*
2316 	 * Optimize the case of USYNC_THREAD, including
2317 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2318 	 * no error detection, no lock statistics,
2319 	 * and the process has only a single thread.
2320 	 * (Most likely a traditional single-threaded application.)
2321 	 */
2322 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2323 	    udp->uberflags.uf_all) == 0) {
2324 		/*
2325 		 * Only one thread exists so we don't need an atomic operation.
2326 		 */
2327 		if (mp->mutex_lockw == 0) {
2328 			mp->mutex_lockw = LOCKSET;
2329 			mp->mutex_owner = (uintptr_t)self;
2330 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2331 			return (0);
2332 		}
2333 		if (mtype && MUTEX_OWNER(mp) == self)
2334 			return (mutex_recursion(mp, mtype, MUTEX_TRY));
2335 		return (EBUSY);
2336 	}
2337 
2338 	/*
2339 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2340 	 * no error detection, and no lock statistics.
2341 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2342 	 */
2343 	if ((gflags = self->ul_schedctl_called) != NULL &&
2344 	    (gflags->uf_trs_ted |
2345 	    (mtype & ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK))) == 0) {
2346 		if (mtype & USYNC_PROCESS)
2347 			return (fast_process_lock(mp, NULL, mtype, MUTEX_TRY));
2348 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2349 			mp->mutex_owner = (uintptr_t)self;
2350 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2351 			return (0);
2352 		}
2353 		if (mtype && MUTEX_OWNER(mp) == self)
2354 			return (mutex_recursion(mp, mtype, MUTEX_TRY));
2355 		if (__td_event_report(self, TD_LOCK_TRY, udp)) {
2356 			self->ul_td_evbuf.eventnum = TD_LOCK_TRY;
2357 			tdb_event(TD_LOCK_TRY, udp);
2358 		}
2359 		return (EBUSY);
2360 	}
2361 
2362 	/* else do it the long way */
2363 	return (mutex_lock_internal(mp, NULL, MUTEX_TRY));
2364 }
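
/*
 * The non-blocking pattern mutex_trylock() is meant for; EBUSY means
 * only that the lock was held at that instant (a sketch; shared_count
 * and deferred are illustrative variables):
 *
 *	if (pthread_mutex_trylock(&m) == 0) {
 *		shared_count++;
 *		(void) pthread_mutex_unlock(&m);
 *	} else {
 *		deferred = 1;	could not get the lock without blocking
 *	}
 */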
2365 
2366 int
2367 mutex_unlock_internal(mutex_t *mp, int retain_robust_flags)
2368 {
2369 	ulwp_t *self = curthread;
2370 	uberdata_t *udp = self->ul_uberdata;
2371 	int mtype = mp->mutex_type;
2372 	tdb_mutex_stats_t *msp;
2373 	int error = 0;
2374 	int release_all;
2375 	lwpid_t lwpid;
2376 
2377 	if ((mtype & LOCK_ERRORCHECK) && !mutex_held(mp))
2378 		return (EPERM);
2379 
2380 	if (self->ul_error_detection && !mutex_held(mp))
2381 		lock_error(mp, "mutex_unlock", NULL, NULL);
2382 
2383 	if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2384 		mp->mutex_rcount--;
2385 		DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2386 		return (0);
2387 	}
2388 
2389 	if ((msp = MUTEX_STATS(mp, udp)) != NULL)
2390 		(void) record_hold_time(msp);
2391 
2392 	if (!retain_robust_flags && !(mtype & LOCK_PRIO_INHERIT) &&
2393 	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2394 		ASSERT(mp->mutex_type & LOCK_ROBUST);
2395 		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2396 		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
2397 	}
2398 	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
2399 
2400 	if (mtype & LOCK_PRIO_INHERIT) {
2401 		no_preempt(self);
2402 		mp->mutex_owner = 0;
2403 		/* mp->mutex_ownerpid is cleared by ___lwp_mutex_unlock() */
2404 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2405 		mp->mutex_lockw = LOCKCLEAR;
2406 		self->ul_pilocks--;
2407 		error = ___lwp_mutex_unlock(mp);
2408 		preempt(self);
2409 	} else if (mtype & USYNC_PROCESS) {
2410 		mutex_unlock_process(mp, release_all);
2411 	} else {	/* USYNC_THREAD */
2412 		if ((lwpid = mutex_unlock_queue(mp, release_all)) != 0) {
2413 			(void) __lwp_unpark(lwpid);
2414 			preempt(self);
2415 		}
2416 	}
2417 
2418 	if (mtype & LOCK_ROBUST)
2419 		forget_lock(mp);
2420 
2421 	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
2422 		_ceil_prio_waive();
2423 
2424 	return (error);
2425 }
2426 
2427 #pragma weak pthread_mutex_unlock = mutex_unlock
2428 #pragma weak _mutex_unlock = mutex_unlock
2429 int
2430 mutex_unlock(mutex_t *mp)
2431 {
2432 	ulwp_t *self = curthread;
2433 	int mtype = mp->mutex_type;
2434 	uberflags_t *gflags;
2435 	lwpid_t lwpid;
2436 	short el;
2437 
2438 	/*
2439 	 * Optimize the case of USYNC_THREAD, including
2440 	 * the LOCK_RECURSIVE and LOCK_ERRORCHECK cases,
2441 	 * no error detection, no lock statistics,
2442 	 * and the process has only a single thread.
2443 	 * (Most likely a traditional single-threaded application.)
2444 	 */
2445 	if (((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) |
2446 	    self->ul_uberdata->uberflags.uf_all) == 0) {
2447 		if (mtype) {
2448 			/*
2449 			 * At this point we know that one or both of the
2450 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2451 			 */
2452 			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2453 				return (EPERM);
2454 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2455 				mp->mutex_rcount--;
2456 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2457 				return (0);
2458 			}
2459 		}
2460 		/*
2461 		 * Only one thread exists so we don't need an atomic operation.
2462 		 * Also, there can be no waiters.
2463 		 */
2464 		mp->mutex_owner = 0;
2465 		mp->mutex_lockword = 0;
2466 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2467 		return (0);
2468 	}
2469 
2470 	/*
2471 	 * Optimize the common cases of USYNC_THREAD or USYNC_PROCESS,
2472 	 * no error detection, and no lock statistics.
2473 	 * Include LOCK_RECURSIVE and LOCK_ERRORCHECK cases.
2474 	 */
2475 	if ((gflags = self->ul_schedctl_called) != NULL) {
2476 		if (((el = gflags->uf_trs_ted) | mtype) == 0) {
2477 fast_unlock:
2478 			if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2479 				(void) __lwp_unpark(lwpid);
2480 				preempt(self);
2481 			}
2482 			return (0);
2483 		}
2484 		if (el)		/* error detection or lock statistics */
2485 			goto slow_unlock;
2486 		if ((mtype & ~(LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2487 			/*
2488 			 * At this point we know that one or both of the
2489 			 * flags LOCK_RECURSIVE or LOCK_ERRORCHECK is set.
2490 			 */
2491 			if ((mtype & LOCK_ERRORCHECK) && !MUTEX_OWNED(mp, self))
2492 				return (EPERM);
2493 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2494 				mp->mutex_rcount--;
2495 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2496 				return (0);
2497 			}
2498 			goto fast_unlock;
2499 		}
2500 		if ((mtype &
2501 		    ~(USYNC_PROCESS|LOCK_RECURSIVE|LOCK_ERRORCHECK)) == 0) {
2502 			/*
2503 			 * At this point we know that zero, one, or both of the
2504 			 * flags LOCK_RECURSIVE and LOCK_ERRORCHECK may be set and
2505 			 * that the USYNC_PROCESS flag is set.
2506 			 */
2507 			if ((mtype & LOCK_ERRORCHECK) && !shared_mutex_held(mp))
2508 				return (EPERM);
2509 			if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0) {
2510 				mp->mutex_rcount--;
2511 				DTRACE_PROBE2(plockstat, mutex__release, mp, 1);
2512 				return (0);
2513 			}
2514 			mutex_unlock_process(mp, 0);
2515 			return (0);
2516 		}
2517 	}
2518 
2519 	/* else do it the long way */
2520 slow_unlock:
2521 	return (mutex_unlock_internal(mp, 0));
2522 }
2523 
2524 /*
2525  * Internally to the library, almost all mutex lock/unlock actions
2526  * go through these lmutex_ functions, to protect critical regions.
2527  * We replicate a bit of code from mutex_lock() and mutex_unlock()
2528  * to make these functions faster since we know that the mutex type
2529  * of all internal locks is USYNC_THREAD.  We also know that internal
2530  * locking can never fail, so we panic if it does.
2531  */
2532 void
2533 lmutex_lock(mutex_t *mp)
2534 {
2535 	ulwp_t *self = curthread;
2536 	uberdata_t *udp = self->ul_uberdata;
2537 
2538 	ASSERT(mp->mutex_type == USYNC_THREAD);
2539 
2540 	enter_critical(self);
2541 	/*
2542 	 * Optimize the case of no lock statistics and only a single thread.
2543 	 * (Most likely a traditional single-threaded application.)
2544 	 */
2545 	if (udp->uberflags.uf_all == 0) {
2546 		/*
2547 		 * Only one thread exists; the mutex must be free.
2548 		 */
2549 		ASSERT(mp->mutex_lockw == 0);
2550 		mp->mutex_lockw = LOCKSET;
2551 		mp->mutex_owner = (uintptr_t)self;
2552 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2553 	} else {
2554 		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2555 
2556 		if (!self->ul_schedctl_called)
2557 			(void) setup_schedctl();
2558 
2559 		if (set_lock_byte(&mp->mutex_lockw) == 0) {
2560 			mp->mutex_owner = (uintptr_t)self;
2561 			DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2562 		} else if (mutex_trylock_adaptive(mp, 1) != 0) {
2563 			(void) mutex_lock_queue(self, msp, mp, NULL);
2564 		}
2565 
2566 		if (msp)
2567 			record_begin_hold(msp);
2568 	}
2569 }
2570 
2571 void
2572 lmutex_unlock(mutex_t *mp)
2573 {
2574 	ulwp_t *self = curthread;
2575 	uberdata_t *udp = self->ul_uberdata;
2576 
2577 	ASSERT(mp->mutex_type == USYNC_THREAD);
2578 
2579 	/*
2580 	 * Optimize the case of no lock statistics and only a single thread.
2581 	 * (Most likely a traditional single-threaded application.)
2582 	 */
2583 	if (udp->uberflags.uf_all == 0) {
2584 		/*
2585 		 * Only one thread exists so there can be no waiters.
2586 		 */
2587 		mp->mutex_owner = 0;
2588 		mp->mutex_lockword = 0;
2589 		DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2590 	} else {
2591 		tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
2592 		lwpid_t lwpid;
2593 
2594 		if (msp)
2595 			(void) record_hold_time(msp);
2596 		if ((lwpid = mutex_unlock_queue(mp, 0)) != 0) {
2597 			(void) __lwp_unpark(lwpid);
2598 			preempt(self);
2599 		}
2600 	}
2601 	exit_critical(self);
2602 }
2603 
2604 /*
2605  * For specialized code in libc, like the asynchronous i/o code,
2606  * the following sig_*() locking primitives are used in order
2607  * to make the code asynchronous signal safe.  Signals are
2608  * deferred while locks acquired by these functions are held.
2609  */
2610 void
2611 sig_mutex_lock(mutex_t *mp)
2612 {
2613 	sigoff(curthread);
2614 	(void) mutex_lock(mp);
2615 }
2616 
2617 void
2618 sig_mutex_unlock(mutex_t *mp)
2619 {
2620 	(void) mutex_unlock(mp);
2621 	sigon(curthread);
2622 }
2623 
2624 int
2625 sig_mutex_trylock(mutex_t *mp)
2626 {
2627 	int error;
2628 
2629 	sigoff(curthread);
2630 	if ((error = mutex_trylock(mp)) != 0)
2631 		sigon(curthread);
2632 	return (error);
2633 }
2634 
2635 /*
2636  * sig_cond_wait() is a cancellation point.
2637  */
2638 int
2639 sig_cond_wait(cond_t *cv, mutex_t *mp)
2640 {
2641 	int error;
2642 
2643 	ASSERT(curthread->ul_sigdefer != 0);
2644 	pthread_testcancel();
2645 	error = __cond_wait(cv, mp);
2646 	if (error == EINTR && curthread->ul_cursig) {
2647 		sig_mutex_unlock(mp);
2648 		/* take the deferred signal here */
2649 		sig_mutex_lock(mp);
2650 	}
2651 	pthread_testcancel();
2652 	return (error);
2653 }
2654 
2655 /*
2656  * sig_cond_reltimedwait() is a cancellation point.
2657  */
2658 int
2659 sig_cond_reltimedwait(cond_t *cv, mutex_t *mp, const timespec_t *ts)
2660 {
2661 	int error;
2662 
2663 	ASSERT(curthread->ul_sigdefer != 0);
2664 	pthread_testcancel();
2665 	error = __cond_reltimedwait(cv, mp, ts);
2666 	if (error == EINTR && curthread->ul_cursig) {
2667 		sig_mutex_unlock(mp);
2668 		/* take the deferred signal here */
2669 		sig_mutex_lock(mp);
2670 	}
2671 	pthread_testcancel();
2672 	return (error);
2673 }
2674 
2675 /*
2676  * For specialized code in libc, like the stdio code,
2677  * the following cancel_safe_*() locking primitives are used in
2678  * order to make the code cancellation-safe.  Cancellation is
2679  * deferred while locks acquired by these functions are held.
2680  */
2681 void
2682 cancel_safe_mutex_lock(mutex_t *mp)
2683 {
2684 	(void) mutex_lock(mp);
2685 	curthread->ul_libc_locks++;
2686 }
2687 
2688 int
2689 cancel_safe_mutex_trylock(mutex_t *mp)
2690 {
2691 	int error;
2692 
2693 	if ((error = mutex_trylock(mp)) == 0)
2694 		curthread->ul_libc_locks++;
2695 	return (error);
2696 }
2697 
2698 void
2699 cancel_safe_mutex_unlock(mutex_t *mp)
2700 {
2701 	ulwp_t *self = curthread;
2702 
2703 	ASSERT(self->ul_libc_locks != 0);
2704 
2705 	(void) mutex_unlock(mp);
2706 
2707 	/*
2708 	 * Decrement the count of locks held by cancel_safe_mutex_lock().
2709 	 * If we are then in a position to terminate cleanly, and
2710 	 * there is a pending cancellation, and cancellation is not
2711 	 * disabled, and we received EINTR from a recent system call,
2712 	 * then perform the cancellation action now.
2713 	 */
2714 	if (--self->ul_libc_locks == 0 &&
2715 	    !(self->ul_vfork | self->ul_nocancel |
2716 	    self->ul_critical | self->ul_sigdefer) &&
2717 	    cancel_active())
2718 		pthread_exit(PTHREAD_CANCELED);
2719 }
2720 
2721 static int
2722 shared_mutex_held(mutex_t *mparg)
2723 {
2724 	/*
2725 	 * The 'volatile' is necessary to make sure the compiler doesn't
2726 	 * reorder the tests of the various components of the mutex.
2727 	 * They must be tested in this order:
2728 	 *	mutex_lockw
2729 	 *	mutex_owner
2730 	 *	mutex_ownerpid
2731 	 * This relies on the fact that everywhere mutex_lockw is cleared,
2732 	 * mutex_owner and mutex_ownerpid are cleared before mutex_lockw
2733 	 * is cleared, and that everywhere mutex_lockw is set, mutex_owner
2734 	 * and mutex_ownerpid are set after mutex_lockw is set, and that
2735 	 * mutex_lockw is set or cleared with a memory barrier.
2736 	 */
2737 	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2738 	ulwp_t *self = curthread;
2739 	uberdata_t *udp = self->ul_uberdata;
2740 
2741 	return (MUTEX_OWNED(mp, self) && mp->mutex_ownerpid == udp->pid);
2742 }
2743 
2744 #pragma weak _mutex_held = mutex_held
2745 int
2746 mutex_held(mutex_t *mparg)
2747 {
2748 	volatile mutex_t *mp = (volatile mutex_t *)mparg;
2749 
2750 	if (mparg->mutex_type & USYNC_PROCESS)
2751 		return (shared_mutex_held(mparg));
2752 	return (MUTEX_OWNED(mp, curthread));
2753 }
2754 
2755 #pragma weak pthread_mutex_destroy = mutex_destroy
2756 #pragma weak _mutex_destroy = mutex_destroy
2757 int
2758 mutex_destroy(mutex_t *mp)
2759 {
2760 	if (mp->mutex_type & USYNC_PROCESS)
2761 		forget_lock(mp);
2762 	(void) memset(mp, 0, sizeof (*mp));
2763 	tdb_sync_obj_deregister(mp);
2764 	return (0);
2765 }
2766 
2767 #pragma weak pthread_mutex_consistent_np = mutex_consistent
2768 int
2769 mutex_consistent(mutex_t *mp)
2770 {
2771 	/*
2772 	 * Do this only for an inconsistent, initialized robust lock
2773 	 * that we hold.  For all other cases, return EINVAL.
2774 	 */
2775 	if (mutex_held(mp) &&
2776 	    (mp->mutex_type & LOCK_ROBUST) &&
2777 	    (mp->mutex_flag & LOCK_INITED) &&
2778 	    (mp->mutex_flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED))) {
2779 		mp->mutex_flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2780 		mp->mutex_rcount = 0;
2781 		return (0);
2782 	}
2783 	return (EINVAL);
2784 }
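
/*
 * The recovery protocol that mutex_consistent() completes, as seen by
 * an application holding a robust mutex whose previous owner died
 * (a minimal sketch):
 *
 *	error = pthread_mutex_lock(mp);
 *	if (error == EOWNERDEAD) {
 *		repair_shared_state();		application-specific repair
 *		(void) pthread_mutex_consistent_np(mp);
 *		error = 0;
 *	}
 *	if (error == 0)
 *		(void) pthread_mutex_unlock(mp);
 *
 * If a holder unlocks without calling pthread_mutex_consistent_np(),
 * the lock becomes LOCK_NOTRECOVERABLE and all later lock attempts
 * return ENOTRECOVERABLE.  repair_shared_state() is hypothetical.
 */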
2785 
2786 /*
2787  * Spin locks are separate from ordinary mutexes,
2788  * but we use the same data structure for them.
2789  */
2790 
2791 int
2792 pthread_spin_init(pthread_spinlock_t *lock, int pshared)
2793 {
2794 	mutex_t *mp = (mutex_t *)lock;
2795 
2796 	(void) memset(mp, 0, sizeof (*mp));
2797 	if (pshared == PTHREAD_PROCESS_SHARED)
2798 		mp->mutex_type = USYNC_PROCESS;
2799 	else
2800 		mp->mutex_type = USYNC_THREAD;
2801 	mp->mutex_flag = LOCK_INITED;
2802 	mp->mutex_magic = MUTEX_MAGIC;
2803 
2804 	/*
2805 	 * This should be at the beginning of the function,
2806 	 * but for the sake of old broken applications that
2807 	 * do not have proper alignment for their mutexes
2808 	 * (and don't check the return code from pthread_spin_init),
2809 	 * we put it here, after initializing the mutex regardless.
2810 	 */
2811 	if (((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1)) &&
2812 	    curthread->ul_misaligned == 0)
2813 		return (EINVAL);
2814 
2815 	return (0);
2816 }
2817 
2818 int
2819 pthread_spin_destroy(pthread_spinlock_t *lock)
2820 {
2821 	(void) memset(lock, 0, sizeof (*lock));
2822 	return (0);
2823 }
2824 
2825 int
2826 pthread_spin_trylock(pthread_spinlock_t *lock)
2827 {
2828 	mutex_t *mp = (mutex_t *)lock;
2829 	ulwp_t *self = curthread;
2830 	int error = 0;
2831 
2832 	no_preempt(self);
2833 	if (set_lock_byte(&mp->mutex_lockw) != 0)
2834 		error = EBUSY;
2835 	else {
2836 		mp->mutex_owner = (uintptr_t)self;
2837 		if (mp->mutex_type == USYNC_PROCESS)
2838 			mp->mutex_ownerpid = self->ul_uberdata->pid;
2839 		DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, 0);
2840 	}
2841 	preempt(self);
2842 	return (error);
2843 }
2844 
2845 int
2846 pthread_spin_lock(pthread_spinlock_t *lock)
2847 {
2848 	mutex_t *mp = (mutex_t *)lock;
2849 	ulwp_t *self = curthread;
2850 	volatile uint8_t *lockp = (volatile uint8_t *)&mp->mutex_lockw;
2851 	int count = 0;
2852 
2853 	ASSERT(!self->ul_critical || self->ul_bindflags);
2854 
2855 	DTRACE_PROBE1(plockstat, mutex__spin, mp);
2856 
2857 	/*
2858 	 * We don't care whether the owner is running on a processor.
2859 	 * We just spin because that's what this interface requires.
2860 	 */
2861 	for (;;) {
2862 		if (*lockp == 0) {	/* lock byte appears to be clear */
2863 			no_preempt(self);
2864 			if (set_lock_byte(lockp) == 0)
2865 				break;
2866 			preempt(self);
2867 		}
2868 		if (count < INT_MAX)
2869 			count++;
2870 		SMT_PAUSE();
2871 	}
2872 	mp->mutex_owner = (uintptr_t)self;
2873 	if (mp->mutex_type == USYNC_PROCESS)
2874 		mp->mutex_ownerpid = self->ul_uberdata->pid;
2875 	preempt(self);
2876 	if (count) {
2877 		DTRACE_PROBE2(plockstat, mutex__spun, 1, count);
2878 	}
2879 	DTRACE_PROBE3(plockstat, mutex__acquire, mp, 0, count);
2880 	return (0);
2881 }
2882 
2883 int
2884 pthread_spin_unlock(pthread_spinlock_t *lock)
2885 {
2886 	mutex_t *mp = (mutex_t *)lock;
2887 	ulwp_t *self = curthread;
2888 
2889 	no_preempt(self);
2890 	mp->mutex_owner = 0;
2891 	mp->mutex_ownerpid = 0;
2892 	DTRACE_PROBE2(plockstat, mutex__release, mp, 0);
2893 	(void) atomic_swap_32(&mp->mutex_lockword, 0);
2894 	preempt(self);
2895 	return (0);
2896 }
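
/*
 * The intended usage of the spin lock interfaces above; the critical
 * section should be very short since contending threads spin rather
 * than sleep (a minimal sketch, with counter standing in for the
 * protected state):
 *
 *	pthread_spinlock_t sl;
 *
 *	(void) pthread_spin_init(&sl, PTHREAD_PROCESS_PRIVATE);
 *	(void) pthread_spin_lock(&sl);
 *	counter++;			keep the held region tiny
 *	(void) pthread_spin_unlock(&sl);
 *	(void) pthread_spin_destroy(&sl);
 */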
2897 
2898 #define	INITIAL_LOCKS	8	/* initial size of ul_heldlocks.array */
2899 
2900 /*
2901  * Find/allocate an entry for 'lock' in our array of held locks.
2902  */
2903 static mutex_t **
2904 find_lock_entry(mutex_t *lock)
2905 {
2906 	ulwp_t *self = curthread;
2907 	mutex_t **remembered = NULL;
2908 	mutex_t **lockptr;
2909 	uint_t nlocks;
2910 
2911 	if ((nlocks = self->ul_heldlockcnt) != 0)
2912 		lockptr = self->ul_heldlocks.array;
2913 	else {
2914 		nlocks = 1;
2915 		lockptr = &self->ul_heldlocks.single;
2916 	}
2917 
2918 	for (; nlocks; nlocks--, lockptr++) {
2919 		if (*lockptr == lock)
2920 			return (lockptr);
2921 		if (*lockptr == NULL && remembered == NULL)
2922 			remembered = lockptr;
2923 	}
2924 	if (remembered != NULL) {
2925 		*remembered = lock;
2926 		return (remembered);
2927 	}
2928 
2929 	/*
2930 	 * No entry available.  Allocate more space, converting
2931 	 * the single entry into an array of entries if necessary.
2932 	 */
2933 	if ((nlocks = self->ul_heldlockcnt) == 0) {
2934 		/*
2935 		 * Initial allocation of the array.
2936 		 * Convert the single entry into an array.
2937 		 */
2938 		self->ul_heldlockcnt = nlocks = INITIAL_LOCKS;
2939 		lockptr = lmalloc(nlocks * sizeof (mutex_t *));
2940 		/*
2941 		 * The single entry becomes the first entry in the array.
2942 		 */
2943 		*lockptr = self->ul_heldlocks.single;
2944 		self->ul_heldlocks.array = lockptr;
2945 		/*
2946 		 * Return the next available entry in the array.
2947 		 */
2948 		*++lockptr = lock;
2949 		return (lockptr);
2950 	}
2951 	/*
2952 	 * Reallocate the array, doubling the size each time.
2953 	 */
2954 	lockptr = lmalloc(nlocks * 2 * sizeof (mutex_t *));
2955 	(void) memcpy(lockptr, self->ul_heldlocks.array,
2956 	    nlocks * sizeof (mutex_t *));
2957 	lfree(self->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
2958 	self->ul_heldlocks.array = lockptr;
2959 	self->ul_heldlockcnt *= 2;
2960 	/*
2961 	 * Return the next available entry in the newly allocated array.
2962 	 */
2963 	*(lockptr += nlocks) = lock;
2964 	return (lockptr);
2965 }
2966 
2967 /*
2968  * Insert 'lock' into our list of held locks.
2969  * Currently only used for LOCK_ROBUST mutexes.
2970  */
2971 void
2972 remember_lock(mutex_t *lock)
2973 {
2974 	(void) find_lock_entry(lock);
2975 }
2976 
2977 /*
2978  * Remove 'lock' from our list of held locks.
2979  * Currently only used for LOCK_ROBUST mutexes.
2980  */
2981 void
2982 forget_lock(mutex_t *lock)
2983 {
2984 	*find_lock_entry(lock) = NULL;
2985 }
2986 
2987 /*
2988  * Free the array of held locks.
2989  */
2990 void
2991 heldlock_free(ulwp_t *ulwp)
2992 {
2993 	uint_t nlocks;
2994 
2995 	if ((nlocks = ulwp->ul_heldlockcnt) != 0)
2996 		lfree(ulwp->ul_heldlocks.array, nlocks * sizeof (mutex_t *));
2997 	ulwp->ul_heldlockcnt = 0;
2998 	ulwp->ul_heldlocks.array = NULL;
2999 }
3000 
3001 /*
3002  * Mark all held LOCK_ROBUST mutexes LOCK_OWNERDEAD.
3003  * Called from _thrp_exit() to deal with abandoned locks.
3004  */
3005 void
3006 heldlock_exit(void)
3007 {
3008 	ulwp_t *self = curthread;
3009 	mutex_t **lockptr;
3010 	uint_t nlocks;
3011 	mutex_t *mp;
3012 
3013 	if ((nlocks = self->ul_heldlockcnt) != 0)
3014 		lockptr = self->ul_heldlocks.array;
3015 	else {
3016 		nlocks = 1;
3017 		lockptr = &self->ul_heldlocks.single;
3018 	}
3019 
3020 	for (; nlocks; nlocks--, lockptr++) {
3021 		/*
3022 		 * The kernel takes care of transitioning held
3023 		 * LOCK_PRIO_INHERIT mutexes to LOCK_OWNERDEAD.
3024 		 * We avoid that case here.
3025 		 */
3026 		if ((mp = *lockptr) != NULL &&
3027 		    mutex_held(mp) &&
3028 		    (mp->mutex_type & (LOCK_ROBUST | LOCK_PRIO_INHERIT)) ==
3029 		    LOCK_ROBUST) {
3030 			mp->mutex_rcount = 0;
3031 			if (!(mp->mutex_flag & LOCK_UNMAPPED))
3032 				mp->mutex_flag |= LOCK_OWNERDEAD;
3033 			(void) mutex_unlock_internal(mp, 1);
3034 		}
3035 	}
3036 
3037 	heldlock_free(self);
3038 }
3039 
3040 #pragma weak _cond_init = cond_init
3041 /* ARGSUSED2 */
3042 int
3043 cond_init(cond_t *cvp, int type, void *arg)
3044 {
3045 	if (type != USYNC_THREAD && type != USYNC_PROCESS)
3046 		return (EINVAL);
3047 	(void) memset(cvp, 0, sizeof (*cvp));
3048 	cvp->cond_type = (uint16_t)type;
3049 	cvp->cond_magic = COND_MAGIC;
3050 
3051 	/*
3052 	 * This should be at the beginning of the function,
3053 	 * but for the sake of old broken applications that
3054 	 * do not have proper alignment for their condvars
3055 	 * (and don't check the return code from cond_init),
3056 	 * we put it here, after initializing the condvar regardless.
3057 	 */
3058 	if (((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1)) &&
3059 	    curthread->ul_misaligned == 0)
3060 		return (EINVAL);
3061 
3062 	return (0);
3063 }
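
/*
 * For illustration, the pthread-level initialization that typically
 * reaches cond_init() with type USYNC_PROCESS (a sketch; the condattr
 * calls are the standard POSIX interfaces, assumed here rather than
 * defined in this file, and cvp must point into shared memory):
 *
 *	pthread_condattr_t ca;
 *
 *	(void) pthread_condattr_init(&ca);
 *	(void) pthread_condattr_setpshared(&ca, PTHREAD_PROCESS_SHARED);
 *	(void) pthread_cond_init(cvp, &ca);
 */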
3064 
3065 /*
3066  * cond_sleep_queue(): utility function for cond_wait_queue().
3067  *
3068  * Go to sleep on a condvar sleep queue, expect to be waked up
3069  * by someone calling cond_signal() or cond_broadcast() or due
3070  * to receiving a UNIX signal or being cancelled, or simply due
3071  * to a spurious wakeup (like someone calling forkall()).
3072  *
3073  * The associated mutex is *not* reacquired before returning.
3074  * That must be done by the caller of cond_sleep_queue().
3075  */
3076 static int
3077 cond_sleep_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3078 {
3079 	ulwp_t *self = curthread;
3080 	queue_head_t *qp;
3081 	queue_head_t *mqp;
3082 	lwpid_t lwpid;
3083 	int signalled;
3084 	int error;
3085 	int cv_wake;
3086 	int release_all;
3087 
3088 	/*
3089 	 * Put ourself on the CV sleep queue, unlock the mutex, then
3090 	 * park ourself and unpark a candidate lwp to grab the mutex.
3091 	 * We must go onto the CV sleep queue before dropping the
3092 	 * mutex in order to guarantee atomicity of the operation.
3093 	 */
3094 	self->ul_sp = stkptr();
3095 	qp = queue_lock(cvp, CV);
3096 	enqueue(qp, self, 0);
3097 	cvp->cond_waiters_user = 1;
3098 	self->ul_cvmutex = mp;
3099 	self->ul_cv_wake = cv_wake = (tsp != NULL);
3100 	self->ul_signalled = 0;
3101 	if (mp->mutex_flag & LOCK_OWNERDEAD) {
3102 		mp->mutex_flag &= ~LOCK_OWNERDEAD;
3103 		mp->mutex_flag |= LOCK_NOTRECOVERABLE;
3104 	}
3105 	release_all = ((mp->mutex_flag & LOCK_NOTRECOVERABLE) != 0);
3106 	lwpid = mutex_unlock_queue(mp, release_all);
3107 	for (;;) {
3108 		set_parking_flag(self, 1);
3109 		queue_unlock(qp);
3110 		if (lwpid != 0) {
3111 			lwpid = preempt_unpark(self, lwpid);
3112 			preempt(self);
3113 		}
3114 		/*
3115 		 * We may have a deferred signal present,
3116 		 * in which case we should return EINTR.
3117 		 * Also, we may have received a SIGCANCEL; if so
3118 		 * and we are cancelable we should return EINTR.
3119 		 * We force an immediate EINTR return from
3120 		 * __lwp_park() by turning our parking flag off.
3121 		 */
3122 		if (self->ul_cursig != 0 ||
3123 		    (self->ul_cancelable && self->ul_cancel_pending))
3124 			set_parking_flag(self, 0);
3125 		/*
3126 		 * __lwp_park() will return the residual time in tsp
3127 		 * if we are unparked before the timeout expires.
3128 		 */
3129 		error = __lwp_park(tsp, lwpid);
3130 		set_parking_flag(self, 0);
3131 		lwpid = 0;	/* unpark the other lwp only once */
3132 		/*
3133 		 * We were waked up by cond_signal(), cond_broadcast(),
3134 		 * by an interrupt or timeout (EINTR or ETIME),
3135 		 * or we may just have gotten a spurious wakeup.
3136 		 */
3137 		qp = queue_lock(cvp, CV);
3138 		if (!cv_wake)
3139 			mqp = queue_lock(mp, MX);
3140 		if (self->ul_sleepq == NULL)
3141 			break;
3142 		/*
3143 		 * We are on either the condvar sleep queue or the
3144 		 * mutex sleep queue.  Break out of the sleep if we
3145 		 * were interrupted or we timed out (EINTR or ETIME).
3146 		 * Else this is a spurious wakeup; continue the loop.
3147 		 */
3148 		if (!cv_wake && self->ul_sleepq == mqp) { /* mutex queue */
3149 			if (error) {
3150 				mp->mutex_waiters = dequeue_self(mqp);
3151 				break;
3152 			}
3153 			tsp = NULL;	/* no more timeout */
3154 		} else if (self->ul_sleepq == qp) {	/* condvar queue */
3155 			if (error) {
3156 				cvp->cond_waiters_user = dequeue_self(qp);
3157 				break;
3158 			}
3159 			/*
3160 			 * Else a spurious wakeup on the condvar queue.
3161 			 * __lwp_park() has already adjusted the timeout.
3162 			 */
3163 		} else {
3164 			thr_panic("cond_sleep_queue(): thread not on queue");
3165 		}
3166 		if (!cv_wake)
3167 			queue_unlock(mqp);
3168 	}
3169 
3170 	self->ul_sp = 0;
3171 	self->ul_cv_wake = 0;
3172 	ASSERT(self->ul_cvmutex == NULL);
3173 	ASSERT(self->ul_sleepq == NULL && self->ul_link == NULL &&
3174 	    self->ul_wchan == NULL);
3175 
3176 	signalled = self->ul_signalled;
3177 	self->ul_signalled = 0;
3178 	queue_unlock(qp);
3179 	if (!cv_wake)
3180 		queue_unlock(mqp);
3181 
3182 	/*
3183 	 * If we were concurrently cond_signal()d and we also received
3184 	 * a UNIX signal, were cancelled, or got a timeout, then issue
3185 	 * another cond_signal() so that the original one is not consumed.
3186 	 */
3187 	if (error && signalled)
3188 		(void) cond_signal(cvp);
3189 
3190 	return (error);
3191 }
3192 
3193 static void
3194 cond_wait_check_alignment(cond_t *cvp, mutex_t *mp)
3195 {
3196 	if ((uintptr_t)mp & (_LONG_LONG_ALIGNMENT - 1))
3197 		lock_error(mp, "cond_wait", cvp, "mutex is misaligned");
3198 	if ((uintptr_t)cvp & (_LONG_LONG_ALIGNMENT - 1))
3199 		lock_error(mp, "cond_wait", cvp, "condvar is misaligned");
3200 }
3201 
3202 int
3203 cond_wait_queue(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3204 {
3205 	ulwp_t *self = curthread;
3206 	int error;
3207 	int merror;
3208 
3209 	if (self->ul_error_detection && self->ul_misaligned == 0)
3210 		cond_wait_check_alignment(cvp, mp);
3211 
3212 	/*
3213 	 * The old thread library was programmed to defer signals
3214 	 * while in cond_wait() so that the associated mutex would
3215 	 * be guaranteed to be held when the application signal
3216 	 * handler was invoked.
3217 	 *
3218 	 * We do not behave this way by default; the state of the
3219 	 * associated mutex in the signal handler is undefined.
3220 	 *
3221 	 * To accommodate applications that depend on the old
3222 	 * behavior, the _THREAD_COND_WAIT_DEFER environment
3223 	 * variable can be set to 1 and we will behave in the
3224 	 * old way with respect to cond_wait().
3225 	 */
3226 	if (self->ul_cond_wait_defer)
3227 		sigoff(self);
3228 
3229 	error = cond_sleep_queue(cvp, mp, tsp);
3230 
3231 	/*
3232 	 * Reacquire the mutex.
3233 	 */
3234 	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3235 		error = merror;
3236 
3237 	/*
3238 	 * Take any deferred signal now, after we have reacquired the mutex.
3239 	 */
3240 	if (self->ul_cond_wait_defer)
3241 		sigon(self);
3242 
3243 	return (error);
3244 }
3245 
3246 /*
3247  * cond_sleep_kernel(): utility function for cond_wait_kernel().
3248  * See the comment ahead of cond_sleep_queue(), above.
3249  */
3250 static int
3251 cond_sleep_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3252 {
3253 	int mtype = mp->mutex_type;
3254 	ulwp_t *self = curthread;
3255 	int error;
3256 
3257 	if ((mtype & LOCK_PRIO_PROTECT) && _ceil_mylist_del(mp))
3258 		_ceil_prio_waive();
3259 
3260 	self->ul_sp = stkptr();
3261 	self->ul_wchan = cvp;
3262 	mp->mutex_owner = 0;
3263 	/* mp->mutex_ownerpid is cleared by ___lwp_cond_wait() */
3264 	if (mtype & LOCK_PRIO_INHERIT) {
3265 		mp->mutex_lockw = LOCKCLEAR;
3266 		self->ul_pilocks--;
3267 	}
3268 	/*
3269 	 * ___lwp_cond_wait() returns immediately with EINTR if
3270 	 * set_parking_flag(self,0) is called on this lwp before it
3271 	 * goes to sleep in the kernel.  sigacthandler() calls this
3272 	 * when a deferred signal is noted.  This assures that we don't
3273 	 * get stuck in ___lwp_cond_wait() with all signals blocked
3274 	 * due to taking a deferred signal before going to sleep.
3275 	 */
3276 	set_parking_flag(self, 1);
3277 	if (self->ul_cursig != 0 ||
3278 	    (self->ul_cancelable && self->ul_cancel_pending))
3279 		set_parking_flag(self, 0);
3280 	error = ___lwp_cond_wait(cvp, mp, tsp, 1);
3281 	set_parking_flag(self, 0);
3282 	self->ul_sp = 0;
3283 	self->ul_wchan = NULL;
3284 	return (error);
3285 }
3286 
3287 int
3288 cond_wait_kernel(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3289 {
3290 	ulwp_t *self = curthread;
3291 	int error;
3292 	int merror;
3293 
3294 	if (self->ul_error_detection && self->ul_misaligned == 0)
3295 		cond_wait_check_alignment(cvp, mp);
3296 
3297 	/*
3298 	 * See the large comment in cond_wait_queue(), above.
3299 	 */
3300 	if (self->ul_cond_wait_defer)
3301 		sigoff(self);
3302 
3303 	error = cond_sleep_kernel(cvp, mp, tsp);
3304 
3305 	/*
3306 	 * Override the return code from ___lwp_cond_wait()
3307 	 * with any non-zero return code from mutex_lock().
3308 	 * This addresses robust lock failures in particular;
3309 	 * the caller must see the EOWNERDEAD or ENOTRECOVERABLE
3310 	 * errors in order to take corrective action.
3311 	 */
3312 	if ((merror = mutex_lock_impl(mp, NULL)) != 0)
3313 		error = merror;
3314 
3315 	/*
3316 	 * Take any deferred signal now, after we have reacquired the mutex.
3317 	 */
3318 	if (self->ul_cond_wait_defer)
3319 		sigon(self);
3320 
3321 	return (error);
3322 }
3323 
3324 /*
3325  * Common code for cond_wait() and cond_timedwait()
3326  */
3327 int
3328 cond_wait_common(cond_t *cvp, mutex_t *mp, timespec_t *tsp)
3329 {
3330 	int mtype = mp->mutex_type;
3331 	hrtime_t begin_sleep = 0;
3332 	ulwp_t *self = curthread;
3333 	uberdata_t *udp = self->ul_uberdata;
3334 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3335 	tdb_mutex_stats_t *msp = MUTEX_STATS(mp, udp);
3336 	uint8_t rcount;
3337 	int error = 0;
3338 
3339 	/*
3340 	 * The SUSV3 Posix spec for pthread_cond_timedwait() states:
3341 	 *	Except in the case of [ETIMEDOUT], all these error checks
3342 	 *	shall act as if they were performed immediately at the
3343 	 *	beginning of processing for the function and shall cause
3344 	 *	an error return, in effect, prior to modifying the state
3345 	 *	of the mutex specified by mutex or the condition variable
3346 	 *	specified by cond.
3347 	 * Therefore, we must return EINVAL now if the timout is invalid.
3348 	 */
3349 	if (tsp != NULL &&
3350 	    (tsp->tv_sec < 0 || (ulong_t)tsp->tv_nsec >= NANOSEC))
3351 		return (EINVAL);
3352 
3353 	if (__td_event_report(self, TD_SLEEP, udp)) {
3354 		self->ul_sp = stkptr();
3355 		self->ul_wchan = cvp;
3356 		self->ul_td_evbuf.eventnum = TD_SLEEP;
3357 		self->ul_td_evbuf.eventdata = cvp;
3358 		tdb_event(TD_SLEEP, udp);
3359 		self->ul_sp = 0;
3360 	}
3361 	if (csp) {
3362 		if (tsp)
3363 			tdb_incr(csp->cond_timedwait);
3364 		else
3365 			tdb_incr(csp->cond_wait);
3366 	}
3367 	if (msp)
3368 		begin_sleep = record_hold_time(msp);
3369 	else if (csp)
3370 		begin_sleep = gethrtime();
3371 
3372 	if (self->ul_error_detection) {
3373 		if (!mutex_held(mp))
3374 			lock_error(mp, "cond_wait", cvp, NULL);
3375 		if ((mtype & LOCK_RECURSIVE) && mp->mutex_rcount != 0)
3376 			lock_error(mp, "recursive mutex in cond_wait",
3377 			    cvp, NULL);
3378 		if (cvp->cond_type & USYNC_PROCESS) {
3379 			if (!(mtype & USYNC_PROCESS))
3380 				lock_error(mp, "cond_wait", cvp,
3381 				    "condvar process-shared, "
3382 				    "mutex process-private");
3383 		} else {
3384 			if (mtype & USYNC_PROCESS)
3385 				lock_error(mp, "cond_wait", cvp,
3386 				    "condvar process-private, "
3387 				    "mutex process-shared");
3388 		}
3389 	}
3390 
3391 	/*
3392 	 * We deal with recursive mutexes by completely
3393 	 * dropping the lock and restoring the recursion
3394 	 * count after waking up.  This is arguably wrong,
3395 	 * but it obeys the principle of least astonishment.
3396 	 */
3397 	rcount = mp->mutex_rcount;
3398 	mp->mutex_rcount = 0;
3399 	if ((mtype &
3400 	    (USYNC_PROCESS | LOCK_PRIO_INHERIT | LOCK_PRIO_PROTECT)) |
3401 	    (cvp->cond_type & USYNC_PROCESS))
3402 		error = cond_wait_kernel(cvp, mp, tsp);
3403 	else
3404 		error = cond_wait_queue(cvp, mp, tsp);
3405 	mp->mutex_rcount = rcount;
3406 
3407 	if (csp) {
3408 		hrtime_t lapse = gethrtime() - begin_sleep;
3409 		if (tsp == NULL)
3410 			csp->cond_wait_sleep_time += lapse;
3411 		else {
3412 			csp->cond_timedwait_sleep_time += lapse;
3413 			if (error == ETIME)
3414 				tdb_incr(csp->cond_timedwait_timeout);
3415 		}
3416 	}
3417 	return (error);
3418 }
3419 
3420 /*
3421  * cond_wait() is a cancellation point but __cond_wait() is not.
3422  * Internally, libc calls the non-cancellation version.
3423  * Other libraries need to use pthread_setcancelstate(), as appropriate,
3424  * since __cond_wait() is not exported from libc.
3425  */
3426 int
3427 __cond_wait(cond_t *cvp, mutex_t *mp)
3428 {
3429 	ulwp_t *self = curthread;
3430 	uberdata_t *udp = self->ul_uberdata;
3431 	uberflags_t *gflags;
3432 
3433 	/*
3434 	 * Optimize the common case of USYNC_THREAD plus
3435 	 * no error detection, no lock statistics, and no event tracing.
3436 	 */
3437 	if ((gflags = self->ul_schedctl_called) != NULL &&
3438 	    (cvp->cond_type | mp->mutex_type | gflags->uf_trs_ted |
3439 	    self->ul_td_events_enable |
3440 	    udp->tdb.tdb_ev_global_mask.event_bits[0]) == 0)
3441 		return (cond_wait_queue(cvp, mp, NULL));
3442 
3443 	/*
3444 	 * Else do it the long way.
3445 	 */
3446 	return (cond_wait_common(cvp, mp, NULL));
3447 }
3448 
3449 #pragma weak _cond_wait = cond_wait
3450 int
3451 cond_wait(cond_t *cvp, mutex_t *mp)
3452 {
3453 	int error;
3454 
3455 	_cancelon();
3456 	error = __cond_wait(cvp, mp);
3457 	if (error == EINTR)
3458 		_canceloff();
3459 	else
3460 		_canceloff_nocancel();
3461 	return (error);
3462 }
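
/*
 * Because a waiter can return from the sleep queue on a spurious
 * wakeup, callers must re-check their predicate in a loop; this is the
 * standard condvar idiom (a minimal sketch, with 'ready' standing in
 * for the application's predicate):
 *
 *	(void) pthread_mutex_lock(&m);
 *	while (!ready)
 *		(void) pthread_cond_wait(&cv, &m);
 *	use the state protected by m here
 *	(void) pthread_mutex_unlock(&m);
 */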
3463 
3464 /*
3465  * pthread_cond_wait() is a cancellation point.
3466  */
3467 int
3468 pthread_cond_wait(pthread_cond_t *_RESTRICT_KYWD cvp,
3469 	pthread_mutex_t *_RESTRICT_KYWD mp)
3470 {
3471 	int error;
3472 
3473 	error = cond_wait((cond_t *)cvp, (mutex_t *)mp);
3474 	return ((error == EINTR)? 0 : error);
3475 }
3476 
3477 /*
3478  * cond_timedwait() is a cancellation point but __cond_timedwait() is not.
3479  */
3480 int
3481 __cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3482 {
3483 	clockid_t clock_id = cvp->cond_clockid;
3484 	timespec_t reltime;
3485 	int error;
3486 
3487 	if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_HIGHRES)
3488 		clock_id = CLOCK_REALTIME;
3489 	abstime_to_reltime(clock_id, abstime, &reltime);
3490 	error = cond_wait_common(cvp, mp, &reltime);
3491 	if (error == ETIME && clock_id == CLOCK_HIGHRES) {
3492 		/*
3493 		 * Don't return ETIME if we didn't really get a timeout.
3494 		 * This can happen if we return because someone resets
3495 		 * the system clock.  Just return zero in this case,
3496 		 * giving a spurious wakeup but not a timeout.
3497 		 */
3498 		if ((hrtime_t)(uint32_t)abstime->tv_sec * NANOSEC +
3499 		    abstime->tv_nsec > gethrtime())
3500 			error = 0;
3501 	}
3502 	return (error);
3503 }
3504 
3505 int
3506 cond_timedwait(cond_t *cvp, mutex_t *mp, const timespec_t *abstime)
3507 {
3508 	int error;
3509 
3510 	_cancelon();
3511 	error = __cond_timedwait(cvp, mp, abstime);
3512 	if (error == EINTR)
3513 		_canceloff();
3514 	else
3515 		_canceloff_nocancel();
3516 	return (error);
3517 }
3518 
3519 /*
3520  * pthread_cond_timedwait() is a cancellation point.
3521  */
3522 int
3523 pthread_cond_timedwait(pthread_cond_t *_RESTRICT_KYWD cvp,
3524 	pthread_mutex_t *_RESTRICT_KYWD mp,
3525 	const struct timespec *_RESTRICT_KYWD abstime)
3526 {
3527 	int error;
3528 
3529 	error = cond_timedwait((cond_t *)cvp, (mutex_t *)mp, abstime);
3530 	if (error == ETIME)
3531 		error = ETIMEDOUT;
3532 	else if (error == EINTR)
3533 		error = 0;
3534 	return (error);
3535 }
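
/*
 * The absolute-time form, as used by an application; the ETIME that
 * cond_wait_common() produces on timeout is mapped to ETIMEDOUT above
 * (a minimal sketch, again with 'ready' as the predicate):
 *
 *	struct timespec abst;
 *
 *	(void) clock_gettime(CLOCK_REALTIME, &abst);
 *	abst.tv_sec += 2;
 *	(void) pthread_mutex_lock(&m);
 *	while (!ready) {
 *		if (pthread_cond_timedwait(&cv, &m, &abst) == ETIMEDOUT)
 *			break;			the deadline passed
 *	}
 *	(void) pthread_mutex_unlock(&m);
 */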
3536 
3537 /*
3538  * cond_reltimedwait() is a cancellation point but __cond_reltimedwait() is not.
3539  */
3540 int
3541 __cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3542 {
3543 	timespec_t tslocal = *reltime;
3544 
3545 	return (cond_wait_common(cvp, mp, &tslocal));
3546 }
3547 
3548 int
3549 cond_reltimedwait(cond_t *cvp, mutex_t *mp, const timespec_t *reltime)
3550 {
3551 	int error;
3552 
3553 	_cancelon();
3554 	error = __cond_reltimedwait(cvp, mp, reltime);
3555 	if (error == EINTR)
3556 		_canceloff();
3557 	else
3558 		_canceloff_nocancel();
3559 	return (error);
3560 }
3561 
3562 int
3563 pthread_cond_reltimedwait_np(pthread_cond_t *_RESTRICT_KYWD cvp,
3564 	pthread_mutex_t *_RESTRICT_KYWD mp,
3565 	const struct timespec *_RESTRICT_KYWD reltime)
3566 {
3567 	int error;
3568 
3569 	error = cond_reltimedwait((cond_t *)cvp, (mutex_t *)mp, reltime);
3570 	if (error == ETIME)
3571 		error = ETIMEDOUT;
3572 	else if (error == EINTR)
3573 		error = 0;
3574 	return (error);
3575 }
3576 
3577 #pragma weak pthread_cond_signal = cond_signal
3578 #pragma weak _cond_signal = cond_signal
3579 int
3580 cond_signal(cond_t *cvp)
3581 {
3582 	ulwp_t *self = curthread;
3583 	uberdata_t *udp = self->ul_uberdata;
3584 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3585 	int error = 0;
3586 	int more;
3587 	lwpid_t lwpid;
3588 	queue_head_t *qp;
3589 	mutex_t *mp;
3590 	queue_head_t *mqp;
3591 	ulwp_t **ulwpp;
3592 	ulwp_t *ulwp;
3593 	ulwp_t *prev;
3594 
3595 	if (csp)
3596 		tdb_incr(csp->cond_signal);
3597 
3598 	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
3599 		error = _lwp_cond_signal(cvp);
3600 
3601 	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
3602 		return (error);
3603 
3604 	/*
3605 	 * Move someone from the condvar sleep queue to the mutex sleep
3606 	 * queue for the mutex that he will acquire on being waked up.
3607 	 * We can do this only if we own the mutex he will acquire.
3608 	 * If we do not own the mutex, or if his ul_cv_wake flag
3609 	 * is set, just dequeue and unpark him.
3610 	 */
3611 	qp = queue_lock(cvp, CV);
3612 	ulwpp = queue_slot(qp, &prev, &more);
3613 	cvp->cond_waiters_user = more;
3614 	if (ulwpp == NULL) {	/* no one on the sleep queue */
3615 		queue_unlock(qp);
3616 		return (error);
3617 	}
3618 	ulwp = *ulwpp;
3619 
3620 	/*
3621 	 * Inform the thread that he was the recipient of a cond_signal().
3622 	 * This lets him deal with cond_signal() and, concurrently,
3623 	 * one or more of a cancellation, a UNIX signal, or a timeout.
3624 	 * These latter conditions must not consume a cond_signal().
3625 	 */
3626 	ulwp->ul_signalled = 1;
3627 
3628 	/*
3629 	 * Dequeue the waiter but leave his ul_sleepq non-NULL
3630 	 * while we move him to the mutex queue so that he can
3631 	 * deal properly with spurious wakeups.
3632 	 */
3633 	queue_unlink(qp, ulwpp, prev);
3634 
3635 	mp = ulwp->ul_cvmutex;		/* the mutex he will acquire */
3636 	ulwp->ul_cvmutex = NULL;
3637 	ASSERT(mp != NULL);
3638 
3639 	if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3640 		/* just wake him up */
3641 		lwpid = ulwp->ul_lwpid;
3642 		no_preempt(self);
3643 		ulwp->ul_sleepq = NULL;
3644 		ulwp->ul_wchan = NULL;
3645 		queue_unlock(qp);
3646 		(void) __lwp_unpark(lwpid);
3647 		preempt(self);
3648 	} else {
3649 		/* move him to the mutex queue */
3650 		mqp = queue_lock(mp, MX);
3651 		enqueue(mqp, ulwp, 0);
3652 		mp->mutex_waiters = 1;
3653 		queue_unlock(mqp);
3654 		queue_unlock(qp);
3655 	}
3656 
3657 	return (error);
3658 }
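
/*
 * The wakeup side that pairs with the wait loop shown earlier;
 * signalling while still holding the mutex is what lets the code above
 * move the waiter directly to the mutex sleep queue instead of waking
 * him only to block again (a minimal sketch):
 *
 *	(void) pthread_mutex_lock(&m);
 *	ready = 1;
 *	(void) pthread_cond_signal(&cv);
 *	(void) pthread_mutex_unlock(&m);
 */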
3659 
3660 /*
3661  * Utility function called by mutex_wakeup_all(), cond_broadcast(),
3662  * and rw_queue_release() to (re)allocate a big buffer to hold the
3663  * lwpids of all the threads to be set running after they are removed
3664  * from their sleep queues.  Since we are holding a queue lock, we
3665  * cannot call any function that might acquire a lock.  mmap(), munmap(),
3666  * lwp_unpark_all() are simple system calls and are safe in this regard.
3667  */
3668 lwpid_t *
3669 alloc_lwpids(lwpid_t *lwpid, int *nlwpid_ptr, int *maxlwps_ptr)
3670 {
3671 	/*
3672 	 * Allocate NEWLWPS ids on the first overflow.
3673 	 * Double the allocation each time after that.
3674 	 */
3675 	int nlwpid = *nlwpid_ptr;
3676 	int maxlwps = *maxlwps_ptr;
3677 	int first_allocation;
3678 	int newlwps;
3679 	void *vaddr;
3680 
3681 	ASSERT(nlwpid == maxlwps);
3682 
3683 	first_allocation = (maxlwps == MAXLWPS);
3684 	newlwps = first_allocation? NEWLWPS : 2 * maxlwps;
3685 	vaddr = mmap(NULL, newlwps * sizeof (lwpid_t),
3686 	    PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, (off_t)0);
3687 
3688 	if (vaddr == MAP_FAILED) {
3689 		/*
3690 		 * Let's hope this never happens.
3691 		 * If it does, then we have a terrible
3692 		 * thundering herd on our hands.
3693 		 */
3694 		(void) __lwp_unpark_all(lwpid, nlwpid);
3695 		*nlwpid_ptr = 0;
3696 	} else {
3697 		(void) memcpy(vaddr, lwpid, maxlwps * sizeof (lwpid_t));
3698 		if (!first_allocation)
3699 			(void) munmap((caddr_t)lwpid,
3700 			    maxlwps * sizeof (lwpid_t));
3701 		lwpid = vaddr;
3702 		*maxlwps_ptr = newlwps;
3703 	}
3704 
3705 	return (lwpid);
3706 }
3707 
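/*
 * Illustrative sketch (not a function in libc) of the calling
 * convention described above, mirroring the way cond_broadcast()
 * below and rw_queue_release() use alloc_lwpids().  The helper
 * next_waiter_lwpid() is hypothetical and stands for "dequeue the
 * next waiter and return its lwpid, or 0 when the queue is empty";
 * the no_preempt()/preempt() bracketing done by the real callers
 * is omitted for brevity.
 */
static lwpid_t next_waiter_lwpid(queue_head_t *);	/* hypothetical */

static void
wakeup_all_sketch(queue_head_t *qp)
{
	lwpid_t buffer[MAXLWPS];
	lwpid_t *lwpid = buffer;
	int nlwpid = 0;
	int maxlwps = MAXLWPS;
	lwpid_t id;

	/* the queue lock is held here, so no lmalloc() and no alloca() */
	while ((id = next_waiter_lwpid(qp)) != 0) {
		if (nlwpid == maxlwps)
			lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
		lwpid[nlwpid++] = id;
	}
	queue_unlock(qp);		/* drop the queue lock first ... */
	if (nlwpid != 0)		/* ... then set everyone running */
		(void) __lwp_unpark_all(lwpid, nlwpid);
	if (lwpid != buffer)		/* the buffer was grown via mmap() */
		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
}
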
3708 #pragma weak pthread_cond_broadcast = cond_broadcast
3709 #pragma weak _cond_broadcast = cond_broadcast
3710 int
3711 cond_broadcast(cond_t *cvp)
3712 {
3713 	ulwp_t *self = curthread;
3714 	uberdata_t *udp = self->ul_uberdata;
3715 	tdb_cond_stats_t *csp = COND_STATS(cvp, udp);
3716 	int error = 0;
3717 	queue_head_t *qp;
3718 	queue_root_t *qrp;
3719 	mutex_t *mp;
3720 	mutex_t *mp_cache = NULL;
3721 	queue_head_t *mqp = NULL;
3722 	ulwp_t *ulwp;
3723 	int nlwpid = 0;
3724 	int maxlwps = MAXLWPS;
3725 	lwpid_t buffer[MAXLWPS];
3726 	lwpid_t *lwpid = buffer;
3727 
3728 	if (csp)
3729 		tdb_incr(csp->cond_broadcast);
3730 
3731 	if (cvp->cond_waiters_kernel)	/* someone sleeping in the kernel? */
3732 		error = _lwp_cond_broadcast(cvp);
3733 
3734 	if (!cvp->cond_waiters_user)	/* no one sleeping at user-level */
3735 		return (error);
3736 
3737 	/*
3738 	 * Move everyone from the condvar sleep queue to the mutex sleep
3739 	 * queue for the mutex that they will acquire when they wake up.
3740 	 * We can do this only if we own the mutex they will acquire.
3741 	 * If we do not own the mutex, or if their ul_cv_wake flag
3742 	 * is set, just dequeue and unpark them.
3743 	 *
3744 	 * We keep track of lwpids that are to be unparked in lwpid[].
3745 	 * __lwp_unpark_all() is called to unpark all of them after
3746 	 * they have been removed from the sleep queue and the sleep
3747 	 * queue lock has been dropped.  If we run out of space in our
3748 	 * on-stack buffer, we need to allocate more but we can't call
3749 	 * lmalloc() because we are holding a queue lock when the overflow
3750 	 * occurs and lmalloc() acquires a lock.  We can't use alloca()
3751 	 * either because the application may have allocated a small
3752 	 * stack and we don't want to overrun the stack.  So we call
3753 	 * alloc_lwpids() to allocate a bigger buffer using the mmap()
3754 	 * system call directly since that path acquires no locks.
3755 	 */
3756 	qp = queue_lock(cvp, CV);
3757 	cvp->cond_waiters_user = 0;
3758 	for (;;) {
3759 		if ((qrp = qp->qh_root) == NULL ||
3760 		    (ulwp = qrp->qr_head) == NULL)
3761 			break;
3762 		ASSERT(ulwp->ul_wchan == cvp);
3763 		queue_unlink(qp, &qrp->qr_head, NULL);
3764 		mp = ulwp->ul_cvmutex;		/* his mutex */
3765 		ulwp->ul_cvmutex = NULL;
3766 		ASSERT(mp != NULL);
3767 		if (ulwp->ul_cv_wake || !MUTEX_OWNED(mp, self)) {
3768 			/* just wake him up */
3769 			ulwp->ul_sleepq = NULL;
3770 			ulwp->ul_wchan = NULL;
3771 			if (nlwpid == maxlwps)
3772 				lwpid = alloc_lwpids(lwpid, &nlwpid, &maxlwps);
3773 			lwpid[nlwpid++] = ulwp->ul_lwpid;
3774 		} else {
3775 			/* move him to the mutex queue */
3776 			if (mp != mp_cache) {
3777 				mp_cache = mp;
3778 				if (mqp != NULL)
3779 					queue_unlock(mqp);
3780 				mqp = queue_lock(mp, MX);
3781 			}
3782 			enqueue(mqp, ulwp, 0);
3783 			mp->mutex_waiters = 1;
3784 		}
3785 	}
3786 	if (mqp != NULL)
3787 		queue_unlock(mqp);
3788 	if (nlwpid == 0) {
3789 		queue_unlock(qp);
3790 	} else {
3791 		no_preempt(self);
3792 		queue_unlock(qp);
3793 		if (nlwpid == 1)
3794 			(void) __lwp_unpark(lwpid[0]);
3795 		else
3796 			(void) __lwp_unpark_all(lwpid, nlwpid);
3797 		preempt(self);
3798 	}
3799 	if (lwpid != buffer)
3800 		(void) munmap((caddr_t)lwpid, maxlwps * sizeof (lwpid_t));
3801 	return (error);
3802 }
3803 
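/*
 * Illustrative caller-side sketch (standard POSIX API, not code from
 * this file) of broadcast usage: every awakened waiter must reacquire
 * the same mutex and re-check its predicate, which is why the code
 * above can move all of them onto that mutex's sleep queue when the
 * broadcasting thread owns it.  The names open_gate(), wait_for_gate(),
 * 'gate_lock', 'gate_cv', and 'gate_open' are hypothetical.
 */
#include <pthread.h>

static pthread_mutex_t gate_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t gate_cv = PTHREAD_COND_INITIALIZER;
static int gate_open;

void
open_gate(void)
{
	(void) pthread_mutex_lock(&gate_lock);
	gate_open = 1;
	(void) pthread_cond_broadcast(&gate_cv);	/* wake all waiters */
	(void) pthread_mutex_unlock(&gate_lock);
}

void
wait_for_gate(void)
{
	(void) pthread_mutex_lock(&gate_lock);
	while (!gate_open)	/* each waiter re-checks after reacquiring */
		(void) pthread_cond_wait(&gate_cv, &gate_lock);
	(void) pthread_mutex_unlock(&gate_lock);
}
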
3804 #pragma weak pthread_cond_destroy = cond_destroy
3805 int
3806 cond_destroy(cond_t *cvp)
3807 {
3808 	cvp->cond_magic = 0;
3809 	tdb_sync_obj_deregister(cvp);
3810 	return (0);
3811 }
3812 
3813 #if defined(THREAD_DEBUG)
3814 void
3815 assert_no_libc_locks_held(void)
3816 {
3817 	ASSERT(!curthread->ul_critical || curthread->ul_bindflags);
3818 }
3819 
3820 /* protected by link_lock */
3821 uint64_t spin_lock_spin;
3822 uint64_t spin_lock_spin2;
3823 uint64_t spin_lock_sleep;
3824 uint64_t spin_lock_wakeup;
3825 
3826 /*
3827  * Record spin lock statistics.
3828  * Called by an exiting thread on itself, in thrp_exit().
3829  * Also called via atexit() from the thread calling
3830  * exit() to record the statistics for all the other threads as well.
3831  */
3832 void
3833 record_spin_locks(ulwp_t *ulwp)
3834 {
3835 	spin_lock_spin += ulwp->ul_spin_lock_spin;
3836 	spin_lock_spin2 += ulwp->ul_spin_lock_spin2;
3837 	spin_lock_sleep += ulwp->ul_spin_lock_sleep;
3838 	spin_lock_wakeup += ulwp->ul_spin_lock_wakeup;
3839 	ulwp->ul_spin_lock_spin = 0;
3840 	ulwp->ul_spin_lock_spin2 = 0;
3841 	ulwp->ul_spin_lock_sleep = 0;
3842 	ulwp->ul_spin_lock_wakeup = 0;
3843 }
3844 
3845 /*
3846  * atexit function:  dump the queue statistics to stderr.
3847  */
3848 #include <stdio.h>
3849 void
3850 dump_queue_statistics(void)
3851 {
3852 	uberdata_t *udp = curthread->ul_uberdata;
3853 	queue_head_t *qp;
3854 	int qn;
3855 	uint64_t spin_lock_total = 0;
3856 
3857 	if (udp->queue_head == NULL || thread_queue_dump == 0)
3858 		return;
3859 
3860 	if (fprintf(stderr, "\n%5d mutex queues:\n", QHASHSIZE) < 0 ||
3861 	    fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
3862 		return;
3863 	for (qn = 0, qp = udp->queue_head; qn < QHASHSIZE; qn++, qp++) {
3864 		if (qp->qh_lockcount == 0)
3865 			continue;
3866 		spin_lock_total += qp->qh_lockcount;
3867 		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
3868 		    (u_longlong_t)qp->qh_lockcount,
3869 		    qp->qh_qmax, qp->qh_hmax) < 0)
3870 			return;
3871 	}
3872 
3873 	if (fprintf(stderr, "\n%5d condvar queues:\n", QHASHSIZE) < 0 ||
3874 	    fprintf(stderr, "queue#   lockcount    max qlen    max hlen\n") < 0)
3875 		return;
3876 	for (qn = 0; qn < QHASHSIZE; qn++, qp++) {
3877 		if (qp->qh_lockcount == 0)
3878 			continue;
3879 		spin_lock_total += qp->qh_lockcount;
3880 		if (fprintf(stderr, "%5d %12llu%12u%12u\n", qn,
3881 		    (u_longlong_t)qp->qh_lockcount,
3882 		    qp->qh_qmax, qp->qh_hmax) < 0)
3883 			return;
3884 	}
3885 
3886 	(void) fprintf(stderr, "\n  spin_lock_total  = %10llu\n",
3887 	    (u_longlong_t)spin_lock_total);
3888 	(void) fprintf(stderr, "  spin_lock_spin   = %10llu\n",
3889 	    (u_longlong_t)spin_lock_spin);
3890 	(void) fprintf(stderr, "  spin_lock_spin2  = %10llu\n",
3891 	    (u_longlong_t)spin_lock_spin2);
3892 	(void) fprintf(stderr, "  spin_lock_sleep  = %10llu\n",
3893 	    (u_longlong_t)spin_lock_sleep);
3894 	(void) fprintf(stderr, "  spin_lock_wakeup = %10llu\n",
3895 	    (u_longlong_t)spin_lock_wakeup);
3896 }
3897 #endif
3898