xref: /titanic_50/usr/src/uts/common/syscall/lwp_sobj.c (revision 41efec2219526a9b3ecce26f97aba761ef1e1d0d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved	*/
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/user.h>
38 #include <sys/errno.h>
39 #include <sys/file.h>
40 #include <sys/proc.h>
41 #include <sys/prsystm.h>
42 #include <sys/kmem.h>
43 #include <sys/sobject.h>
44 #include <sys/fault.h>
45 #include <sys/procfs.h>
46 #include <sys/watchpoint.h>
47 #include <sys/time.h>
48 #include <sys/cmn_err.h>
49 #include <sys/machlock.h>
50 #include <sys/debug.h>
51 #include <sys/synch.h>
52 #include <sys/synch32.h>
53 #include <sys/mman.h>
54 #include <sys/class.h>
55 #include <sys/schedctl.h>
56 #include <sys/sleepq.h>
57 #include <sys/policy.h>
58 #include <sys/tnf_probe.h>
59 #include <sys/lwpchan_impl.h>
60 #include <sys/turnstile.h>
61 #include <sys/atomic.h>
62 #include <sys/lwp_timer_impl.h>
63 #include <sys/lwp_upimutex_impl.h>
64 #include <vm/as.h>
65 #include <sys/sdt.h>
66 
67 static kthread_t *lwpsobj_owner(caddr_t);
68 static void lwp_unsleep(kthread_t *t);
69 static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
70 static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);
71 
72 extern int lwp_cond_signal(lwp_cond_t *cv);
73 
74 /*
75  * Maximum number of user prio inheritance locks that can be held by a thread.
76  * Used to limit kmem for each thread. This is a per-thread limit that
77  * can be administered on a system wide basis (using /etc/system).
78  *
79  * Also, when a limit, say maxlwps, is added for the number of lwps within a
80  * process, the per-thread limit automatically becomes a process-wide limit
81  * on the maximum number of held upi locks within a process:
82  *      maxheldupimx = maxnestupimx * maxlwps;
83  */
84 static uint32_t maxnestupimx = 2000;
85 
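/*
 * Illustrative sketch of the tuning described above (the numbers are
 * hypothetical): an administrator could raise the per-thread limit from
 * /etc/system, assuming the symbol is resolvable at boot:
 *
 *	set maxnestupimx = 4000
 *
 * With a hypothetical per-process lwp limit of maxlwps = 8, the implied
 * process-wide bound would then be maxheldupimx = 4000 * 8 = 32000
 * held upi locks.
 */
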
86 /*
87  * The sobj_ops vector exports a set of functions needed when a thread
88  * is asleep on a synchronization object of this type.
89  */
90 static sobj_ops_t lwp_sobj_ops = {
91 	SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
92 };
93 
94 static kthread_t *lwpsobj_pi_owner(upimutex_t *up);
95 
96 static sobj_ops_t lwp_sobj_pi_ops = {
97 	SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
98 	turnstile_change_pri
99 };
100 
101 static sleepq_head_t	lwpsleepq[NSLEEPQ];
102 upib_t			upimutextab[UPIMUTEX_TABSIZE];
103 
104 #define	LWPCHAN_LOCK_SHIFT	10	/* 1024 locks for each pool */
105 #define	LWPCHAN_LOCK_SIZE	(1 << LWPCHAN_LOCK_SHIFT)
106 
107 /*
108  * We know that both lc_wchan and lc_wchan0 are addresses that most
109  * likely are 8-byte aligned, so we shift off the low-order 3 bits.
110  * 'pool' is either 0 or 1.
111  */
112 #define	LWPCHAN_LOCK_HASH(X, pool) \
113 	(((((X) >> 3) ^ ((X) >> (LWPCHAN_LOCK_SHIFT + 3))) & \
114 	(LWPCHAN_LOCK_SIZE - 1)) + ((pool)? LWPCHAN_LOCK_SIZE : 0))
115 
116 static kmutex_t		lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
117 
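/*
 * Worked example of LWPCHAN_LOCK_HASH() (illustrative, with a made-up
 * value): suppose the XOR of lc_wchan and lc_wchan0 is X = 0x8010.
 * Then (X >> 3) = 0x1002 and (X >> (LWPCHAN_LOCK_SHIFT + 3)) = 0x4,
 * so their XOR is 0x1006; masking with (LWPCHAN_LOCK_SIZE - 1) = 0x3ff
 * leaves index 6.  Pool 0 therefore uses lwpchanlock[6] and pool 1 uses
 * lwpchanlock[6 + LWPCHAN_LOCK_SIZE] = lwpchanlock[0x406].
 */
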
118 /*
119  * Is this a POSIX threads user-level lock requiring priority inheritance?
120  */
121 #define	UPIMUTEX(type)	((type) & LOCK_PRIO_INHERIT)
122 
123 static sleepq_head_t *
124 lwpsqhash(lwpchan_t *lwpchan)
125 {
126 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
127 	return (&lwpsleepq[SQHASHINDEX(x)]);
128 }
129 
130 /*
131  * Lock an lwpchan.
132  * Keep this in sync with lwpchan_unlock(), below.
133  */
134 static void
135 lwpchan_lock(lwpchan_t *lwpchan, int pool)
136 {
137 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
138 	mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
139 }
140 
141 /*
142  * Unlock an lwpchan.
143  * Keep this in sync with lwpchan_lock(), above.
144  */
145 static void
146 lwpchan_unlock(lwpchan_t *lwpchan, int pool)
147 {
148 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
149 	mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
150 }
151 
152 /*
153  * Delete mappings from the lwpchan cache for pages that are being
154  * unmapped by as_unmap().  Given a range of addresses, "start" to "end",
155  * all mappings within the range are deleted from the lwpchan cache.
156  */
157 void
158 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
159 {
160 	lwpchan_data_t *lcp;
161 	lwpchan_hashbucket_t *hashbucket;
162 	lwpchan_hashbucket_t *endbucket;
163 	lwpchan_entry_t *ent;
164 	lwpchan_entry_t **prev;
165 	caddr_t addr;
166 
167 	mutex_enter(&p->p_lcp_lock);
168 	lcp = p->p_lcp;
169 	hashbucket = lcp->lwpchan_cache;
170 	endbucket = hashbucket + lcp->lwpchan_size;
171 	for (; hashbucket < endbucket; hashbucket++) {
172 		if (hashbucket->lwpchan_chain == NULL)
173 			continue;
174 		mutex_enter(&hashbucket->lwpchan_lock);
175 		prev = &hashbucket->lwpchan_chain;
176 		/* check entire chain */
177 		while ((ent = *prev) != NULL) {
178 			addr = ent->lwpchan_addr;
179 			if (start <= addr && addr < end) {
180 				*prev = ent->lwpchan_next;
181 				if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
182 				    (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
183 					lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
184 				kmem_free(ent, sizeof (*ent));
185 				atomic_add_32(&lcp->lwpchan_entries, -1);
186 			} else {
187 				prev = &ent->lwpchan_next;
188 			}
189 		}
190 		mutex_exit(&hashbucket->lwpchan_lock);
191 	}
192 	mutex_exit(&p->p_lcp_lock);
193 }
194 
195 /*
196  * Given an lwpchan cache pointer and a process virtual address,
197  * return a pointer to the corresponding lwpchan hash bucket.
198  */
199 static lwpchan_hashbucket_t *
200 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
201 {
202 	uint_t i;
203 
204 	/*
205 	 * All user-level sync object addresses are 8-byte aligned.
206 	 * Ignore the lowest 3 bits of the address and use the
207 	 * higher-order 2*lwpchan_bits bits for the hash index.
208 	 */
209 	addr >>= 3;
210 	i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
211 	return (lcp->lwpchan_cache + i);
212 }
213 
214 /*
215  * (Re)allocate the per-process lwpchan cache.
216  */
217 static void
218 lwpchan_alloc_cache(proc_t *p, uint_t bits)
219 {
220 	lwpchan_data_t *lcp;
221 	lwpchan_data_t *old_lcp;
222 	lwpchan_hashbucket_t *hashbucket;
223 	lwpchan_hashbucket_t *endbucket;
224 	lwpchan_hashbucket_t *newbucket;
225 	lwpchan_entry_t *ent;
226 	lwpchan_entry_t *next;
227 	uint_t count;
228 
229 	ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
230 
231 	lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
232 	lcp->lwpchan_bits = bits;
233 	lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
234 	lcp->lwpchan_mask = lcp->lwpchan_size - 1;
235 	lcp->lwpchan_entries = 0;
236 	lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
237 		sizeof (lwpchan_hashbucket_t), KM_SLEEP);
238 	lcp->lwpchan_next_data = NULL;
239 
240 	mutex_enter(&p->p_lcp_lock);
241 	if ((old_lcp = p->p_lcp) != NULL) {
242 		if (old_lcp->lwpchan_bits >= bits) {
243 			/* someone beat us to it */
244 			mutex_exit(&p->p_lcp_lock);
245 			kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
246 				sizeof (lwpchan_hashbucket_t));
247 			kmem_free(lcp, sizeof (lwpchan_data_t));
248 			return;
249 		}
250 		/*
251 		 * Acquire all of the old hash table locks.
252 		 */
253 		hashbucket = old_lcp->lwpchan_cache;
254 		endbucket = hashbucket + old_lcp->lwpchan_size;
255 		for (; hashbucket < endbucket; hashbucket++)
256 			mutex_enter(&hashbucket->lwpchan_lock);
257 		/*
258 		 * Move all of the old hash table entries to the
259 		 * new hash table.  The new hash table has not yet
260 		 * been installed so we don't need any of its locks.
261 		 */
262 		count = 0;
263 		hashbucket = old_lcp->lwpchan_cache;
264 		for (; hashbucket < endbucket; hashbucket++) {
265 			ent = hashbucket->lwpchan_chain;
266 			while (ent != NULL) {
267 				next = ent->lwpchan_next;
268 				newbucket = lwpchan_bucket(lcp,
269 					(uintptr_t)ent->lwpchan_addr);
270 				ent->lwpchan_next = newbucket->lwpchan_chain;
271 				newbucket->lwpchan_chain = ent;
272 				ent = next;
273 				count++;
274 			}
275 			hashbucket->lwpchan_chain = NULL;
276 		}
277 		lcp->lwpchan_entries = count;
278 	}
279 
280 	/*
281 	 * Retire the old hash table.  We can't actually kmem_free() it
282 	 * now because someone may still have a pointer to it.  Instead,
283 	 * we link it onto the new hash table's list of retired hash tables.
284 	 * The new hash table is double the size of the previous one, so
285 	 * the total size of all retired hash tables is less than the size
286 	 * of the new one.  exit() and exec() free the retired hash tables
287 	 * (see lwpchan_destroy_cache(), below).
288 	 */
289 	lcp->lwpchan_next_data = old_lcp;
290 
291 	/*
292 	 * As soon as we store the new lcp, future locking operations will
293 	 * use it.  Therefore, we must ensure that all the state we've just
294 	 * established reaches global visibility before the new lcp does.
295 	 */
296 	membar_producer();
297 	p->p_lcp = lcp;
298 
299 	if (old_lcp != NULL) {
300 		/*
301 		 * Release all of the old hash table locks.
302 		 */
303 		hashbucket = old_lcp->lwpchan_cache;
304 		for (; hashbucket < endbucket; hashbucket++)
305 			mutex_exit(&hashbucket->lwpchan_lock);
306 	}
307 	mutex_exit(&p->p_lcp_lock);
308 }
309 
310 /*
311  * Deallocate the lwpchan cache, and any dynamically allocated mappings.
312  * Called when the process exits or execs.  All lwps except one have
313  * exited so we need no locks here.
314  */
315 void
316 lwpchan_destroy_cache(int exec)
317 {
318 	proc_t *p = curproc;
319 	lwpchan_hashbucket_t *hashbucket;
320 	lwpchan_hashbucket_t *endbucket;
321 	lwpchan_data_t *lcp;
322 	lwpchan_entry_t *ent;
323 	lwpchan_entry_t *next;
324 	uint16_t lockflg;
325 
326 	lcp = p->p_lcp;
327 	p->p_lcp = NULL;
328 
329 	lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
330 	hashbucket = lcp->lwpchan_cache;
331 	endbucket = hashbucket + lcp->lwpchan_size;
332 	for (; hashbucket < endbucket; hashbucket++) {
333 		ent = hashbucket->lwpchan_chain;
334 		hashbucket->lwpchan_chain = NULL;
335 		while (ent != NULL) {
336 			next = ent->lwpchan_next;
337 			if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
338 			    (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
339 				lwp_mutex_cleanup(ent, lockflg);
340 			kmem_free(ent, sizeof (*ent));
341 			ent = next;
342 		}
343 	}
344 
345 	while (lcp != NULL) {
346 		lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
347 		kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
348 			sizeof (lwpchan_hashbucket_t));
349 		kmem_free(lcp, sizeof (lwpchan_data_t));
350 		lcp = next_lcp;
351 	}
352 }
353 
354 /*
355  * Return zero when there is an entry in the lwpchan cache for the
356  * given process virtual address and non-zero when there is not.
357  * The returned non-zero value is the current length of the
358  * hash chain plus one.  The caller holds the hash bucket lock.
359  */
360 static uint_t
361 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
362 	lwpchan_hashbucket_t *hashbucket)
363 {
364 	lwpchan_entry_t *ent;
365 	uint_t count = 1;
366 
367 	for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
368 		if (ent->lwpchan_addr == addr) {
369 			if (ent->lwpchan_type != type ||
370 			    ent->lwpchan_pool != pool) {
371 				/*
372 				 * This shouldn't happen, but might if the
373 				 * process reuses its memory for different
374 				 * types of sync objects.  We test first
375 				 * to avoid grabbing the memory cache line.
376 				 */
377 				ent->lwpchan_type = (uint16_t)type;
378 				ent->lwpchan_pool = (uint16_t)pool;
379 			}
380 			*lwpchan = ent->lwpchan_lwpchan;
381 			return (0);
382 		}
383 		count++;
384 	}
385 	return (count);
386 }
387 
388 /*
389  * Return the cached lwpchan mapping if cached, otherwise insert
390  * a virtual address to lwpchan mapping into the cache.
391  */
392 static int
393 lwpchan_get_mapping(struct as *as, caddr_t addr,
394 	int type, lwpchan_t *lwpchan, int pool)
395 {
396 	proc_t *p = curproc;
397 	lwpchan_data_t *lcp;
398 	lwpchan_hashbucket_t *hashbucket;
399 	lwpchan_entry_t *ent;
400 	memid_t	memid;
401 	uint_t count;
402 	uint_t bits;
403 
404 top:
405 	/* initialize the lwpchan cache, if necessary */
406 	if ((lcp = p->p_lcp) == NULL) {
407 		lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
408 		goto top;
409 	}
410 	hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
411 	mutex_enter(&hashbucket->lwpchan_lock);
412 	if (lcp != p->p_lcp) {
413 		/* someone resized the lwpchan cache; start over */
414 		mutex_exit(&hashbucket->lwpchan_lock);
415 		goto top;
416 	}
417 	if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
418 		/* it's in the cache */
419 		mutex_exit(&hashbucket->lwpchan_lock);
420 		return (1);
421 	}
422 	mutex_exit(&hashbucket->lwpchan_lock);
423 	if (as_getmemid(as, addr, &memid) != 0)
424 		return (0);
425 	lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
426 	lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
427 	ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
428 	mutex_enter(&hashbucket->lwpchan_lock);
429 	if (lcp != p->p_lcp) {
430 		/* someone resized the lwpchan cache; start over */
431 		mutex_exit(&hashbucket->lwpchan_lock);
432 		kmem_free(ent, sizeof (*ent));
433 		goto top;
434 	}
435 	count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
436 	if (count == 0) {
437 		/* someone else added this entry to the cache */
438 		mutex_exit(&hashbucket->lwpchan_lock);
439 		kmem_free(ent, sizeof (*ent));
440 		return (1);
441 	}
442 	if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
443 	    (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
444 		/* hash chain too long; reallocate the hash table */
445 		mutex_exit(&hashbucket->lwpchan_lock);
446 		kmem_free(ent, sizeof (*ent));
447 		lwpchan_alloc_cache(p, bits + 1);
448 		goto top;
449 	}
450 	ent->lwpchan_addr = addr;
451 	ent->lwpchan_type = (uint16_t)type;
452 	ent->lwpchan_pool = (uint16_t)pool;
453 	ent->lwpchan_lwpchan = *lwpchan;
454 	ent->lwpchan_next = hashbucket->lwpchan_chain;
455 	hashbucket->lwpchan_chain = ent;
456 	atomic_add_32(&lcp->lwpchan_entries, 1);
457 	mutex_exit(&hashbucket->lwpchan_lock);
458 	return (1);
459 }
460 
461 /*
462  * Return a unique pair of identifiers that corresponds to a
463  * synchronization object's virtual address.  Process-shared
464  * sync objects usually get vnode/offset from as_getmemid().
465  */
466 static int
467 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
468 {
469 	/*
470 	 * If the lwp synch object is defined to be process-private,
471 	 * we just make the first field of the lwpchan be 'as' and
472 	 * the second field be the synch object's virtual address.
473 	 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
474 	 * The lwpchan cache is used only for process-shared objects.
475 	 */
476 	if ((type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) == 0) {
477 		lwpchan->lc_wchan0 = (caddr_t)as;
478 		lwpchan->lc_wchan = addr;
479 		return (1);
480 	}
481 	/* check the lwpchan cache for mapping */
482 	return (lwpchan_get_mapping(as, addr, type, lwpchan, pool));
483 }
484 
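/*
 * Worked illustration of get_lwpchan() (values are made up): for a
 * USYNC_THREAD (process-private) mutex at user address A, the lwpchan
 * is simply { curproc->p_as, A }, so only threads in this process can
 * ever hash to it.  For a USYNC_PROCESS mutex backed by a shared
 * mapping, as_getmemid() supplies an identity derived from the backing
 * object (e.g. something like a vnode/offset pair), so every process
 * mapping that object computes the same lwpchan and they all sleep and
 * wake on the same queue.
 */
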
485 static void
486 lwp_block(lwpchan_t *lwpchan)
487 {
488 	kthread_t *t = curthread;
489 	klwp_t *lwp = ttolwp(t);
490 	sleepq_head_t *sqh;
491 
492 	thread_lock(t);
493 	t->t_flag |= T_WAKEABLE;
494 	t->t_lwpchan = *lwpchan;
495 	t->t_sobj_ops = &lwp_sobj_ops;
496 	t->t_release = 0;
497 	sqh = lwpsqhash(lwpchan);
498 	disp_lock_enter_high(&sqh->sq_lock);
499 	CL_SLEEP(t);
500 	DTRACE_SCHED(sleep);
501 	THREAD_SLEEP(t, &sqh->sq_lock);
502 	sleepq_insert(&sqh->sq_queue, t);
503 	thread_unlock(t);
504 	lwp->lwp_asleep = 1;
505 	lwp->lwp_sysabort = 0;
506 	lwp->lwp_ru.nvcsw++;
507 	(void) new_mstate(curthread, LMS_SLEEP);
508 }
509 
510 static kthread_t *
511 lwpsobj_pi_owner(upimutex_t *up)
512 {
513 	return (up->upi_owner);
514 }
515 
516 static struct upimutex *
517 upi_get(upib_t *upibp, lwpchan_t *lcp)
518 {
519 	struct upimutex *upip;
520 
521 	for (upip = upibp->upib_first; upip != NULL;
522 	    upip = upip->upi_nextchain) {
523 		if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
524 		    upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
525 			break;
526 	}
527 	return (upip);
528 }
529 
530 static void
531 upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
532 {
533 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
534 
535 	/*
536 	 * Insert upimutex at front of list. Maybe a bit unfair
537 	 * but assume that not many lwpchans hash to the same
538 	 * upimutextab bucket, i.e. the list of upimutexes from
539 	 * upib_first is not too long.
540 	 */
541 	upimutex->upi_nextchain = upibp->upib_first;
542 	upibp->upib_first = upimutex;
543 }
544 
545 static void
546 upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
547 {
548 	struct upimutex **prev;
549 
550 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
551 
552 	prev = &upibp->upib_first;
553 	while (*prev != upimutex) {
554 		prev = &(*prev)->upi_nextchain;
555 	}
556 	*prev = upimutex->upi_nextchain;
557 	upimutex->upi_nextchain = NULL;
558 }
559 
560 /*
561  * Add upimutex to chain of upimutexes held by curthread.
562  * Returns number of upimutexes held by curthread.
563  */
564 static uint32_t
565 upi_mylist_add(struct upimutex *upimutex)
566 {
567 	kthread_t *t = curthread;
568 
569 	/*
570 	 * Insert upimutex at front of list of upimutexes owned by t. This
571 	 * would match typical LIFO order in which nested locks are acquired
572 	 * and released.
573 	 */
574 	upimutex->upi_nextowned = t->t_upimutex;
575 	t->t_upimutex = upimutex;
576 	t->t_nupinest++;
577 	ASSERT(t->t_nupinest > 0);
578 	return (t->t_nupinest);
579 }
580 
581 /*
582  * Delete upimutex from list of upimutexes owned by curthread.
583  */
584 static void
585 upi_mylist_del(struct upimutex *upimutex)
586 {
587 	kthread_t *t = curthread;
588 	struct upimutex **prev;
589 
590 	/*
591 	 * Since the order in which nested locks are acquired and released
592 	 * is typically LIFO, and typical nesting levels are not too deep, the
593 	 * following should not be expensive in the general case.
594 	 */
595 	prev = &t->t_upimutex;
596 	while (*prev != upimutex) {
597 		prev = &(*prev)->upi_nextowned;
598 	}
599 	*prev = upimutex->upi_nextowned;
600 	upimutex->upi_nextowned = NULL;
601 	ASSERT(t->t_nupinest > 0);
602 	t->t_nupinest--;
603 }
604 
605 /*
606  * Returns true if upimutex is owned. Should be called only when upim points
607  * to kmem which cannot disappear from underneath.
608  */
609 static int
610 upi_owned(upimutex_t *upim)
611 {
612 	return (upim->upi_owner == curthread);
613 }
614 
615 /*
616  * Returns pointer to kernel object (upimutex_t *) if lp is owned.
617  */
618 static struct upimutex *
619 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
620 {
621 	lwpchan_t lwpchan;
622 	upib_t *upibp;
623 	struct upimutex *upimutex;
624 
625 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
626 	    &lwpchan, LWPCHAN_MPPOOL))
627 		return (NULL);
628 
629 	upibp = &UPI_CHAIN(lwpchan);
630 	mutex_enter(&upibp->upib_lock);
631 	upimutex = upi_get(upibp, &lwpchan);
632 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
633 		mutex_exit(&upibp->upib_lock);
634 		return (NULL);
635 	}
636 	mutex_exit(&upibp->upib_lock);
637 	return (upimutex);
638 }
639 
640 /*
641  * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
642  * no lock hand-off occurs.
643  */
644 static void
645 upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
646 {
647 	turnstile_t *ts;
648 	upib_t *upibp;
649 	kthread_t *newowner;
650 
651 	upi_mylist_del(upimutex);
652 	upibp = upimutex->upi_upibp;
653 	mutex_enter(&upibp->upib_lock);
654 	if (upimutex->upi_waiter != 0) { /* if waiters */
655 		ts = turnstile_lookup(upimutex);
656 		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
657 			/* hand-off lock to highest prio waiter */
658 			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
659 			upimutex->upi_owner = newowner;
660 			if (ts->ts_waiters == 1)
661 				upimutex->upi_waiter = 0;
662 			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
663 			mutex_exit(&upibp->upib_lock);
664 			return;
665 		} else if (ts != NULL) {
666 			/* LOCK_NOTRECOVERABLE: wakeup all */
667 			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
668 		} else {
669 			/*
670 			 * Misleading w bit. Waiters might have been
671 			 * interrupted. No need to clear the w bit (upimutex
672 			 * will soon be freed). Re-calculate PI from existing
673 			 * waiters.
674 			 */
675 			turnstile_exit(upimutex);
676 			turnstile_pi_recalc();
677 		}
678 	}
679 	/*
680 	 * no waiters, or LOCK_NOTRECOVERABLE.
681 	 * remove from the bucket chain of upi mutexes.
682 	 * de-allocate kernel memory (upimutex).
683 	 */
684 	upi_chain_del(upimutex->upi_upibp, upimutex);
685 	mutex_exit(&upibp->upib_lock);
686 	kmem_free(upimutex, sizeof (upimutex_t));
687 }
688 
689 static int
690 lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
691 {
692 	label_t ljb;
693 	int error = 0;
694 	lwpchan_t lwpchan;
695 	uint16_t flag;
696 	upib_t *upibp;
697 	volatile struct upimutex *upimutex = NULL;
698 	turnstile_t *ts;
699 	uint32_t nupinest;
700 	volatile int upilocked = 0;
701 
702 	if (on_fault(&ljb)) {
703 		if (upilocked)
704 			upimutex_unlock((upimutex_t *)upimutex, 0);
705 		error = EFAULT;
706 		goto out;
707 	}
708 	/*
709 	 * The apparent assumption made in implementing other _lwp_* synch
710 	 * primitives is that get_lwpchan() does not return a unique cookie
711 	 * for the case where 2 processes (one forked from the other) point
712 	 * at the same underlying object, which is typed USYNC_PROCESS, but
713 	 * mapped MAP_PRIVATE, since the object has not yet been written to,
714 	 * in the child process.
715 	 *
716 	 * Since get_lwpchan() has been fixed, it is not necessary to do the
717 	 * dummy writes to force a COW fault as in other places (which should
718 	 * be fixed).
719 	 */
720 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
721 	    &lwpchan, LWPCHAN_MPPOOL)) {
722 		error = EFAULT;
723 		goto out;
724 	}
725 	upibp = &UPI_CHAIN(lwpchan);
726 retry:
727 	mutex_enter(&upibp->upib_lock);
728 	upimutex = upi_get(upibp, &lwpchan);
729 	if (upimutex == NULL)  {
730 		/* lock available since lwpchan has no upimutex */
731 		upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
732 		upi_chain_add(upibp, (upimutex_t *)upimutex);
733 		upimutex->upi_owner = curthread; /* grab lock */
734 		upimutex->upi_upibp = upibp;
735 		upimutex->upi_vaddr = lp;
736 		upimutex->upi_lwpchan = lwpchan;
737 		mutex_exit(&upibp->upib_lock);
738 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
739 		upilocked = 1;
740 		fuword16_noerr(&lp->mutex_flag, &flag);
741 		if (nupinest > maxnestupimx &&
742 		    secpolicy_resource(CRED()) != 0) {
743 			upimutex_unlock((upimutex_t *)upimutex, flag);
744 			error = ENOMEM;
745 			goto out;
746 		}
747 		if (flag & LOCK_OWNERDEAD) {
748 			/*
749 			 * Return with upimutex held.
750 			 */
751 			error = EOWNERDEAD;
752 		} else if (flag & LOCK_NOTRECOVERABLE) {
753 			/*
754 			 * Since the setting of LOCK_NOTRECOVERABLE
755 			 * was done under the high-level upi mutex,
756 			 * in lwp_upimutex_unlock(), this flag needs to
757 			 * be checked while holding the upi mutex.
758 		 * If set, this thread should return without
759 			 * the lock held, and with the right error
760 			 * code.
761 			 */
762 			upimutex_unlock((upimutex_t *)upimutex, flag);
763 			upilocked = 0;
764 			error = ENOTRECOVERABLE;
765 		}
766 		goto out;
767 	}
768 	/*
769 	 * If a upimutex object exists, it must have an owner.
770 	 * This is due to lock hand-off, and release of upimutex when no
771 	 * waiters are present at unlock time.
772 	 */
773 	ASSERT(upimutex->upi_owner != NULL);
774 	if (upimutex->upi_owner == curthread) {
775 		/*
776 		 * The user wrapper can check if the mutex type is
777 		 * ERRORCHECK: if not, it should stall at user-level.
778 		 * If so, it should return the error code.
779 		 */
780 		mutex_exit(&upibp->upib_lock);
781 		error = EDEADLK;
782 		goto out;
783 	}
784 	if (try == UPIMUTEX_TRY) {
785 		mutex_exit(&upibp->upib_lock);
786 		error = EBUSY;
787 		goto out;
788 	}
789 	/*
790 	 * Block for the lock.
791 	 * Put the lwp in an orderly state for debugging.
792 	 * Calling prstop() has to be done here, and not in
793 	 * turnstile_block(), since the preceding call to
794 	 * turnstile_lookup() raises the PIL to a level
795 	 * at which calls to prstop() should not be made.
796 	 */
797 	if ((error = lwptp->lwpt_time_error) != 0) {
798 		/*
799 		 * The SUSV3 Posix spec is very clear that we
800 		 * should get no error from validating the
801 		 * timer until we would actually sleep.
802 		 */
803 		mutex_exit(&upibp->upib_lock);
804 		goto out;
805 	}
806 	prstop(PR_REQUESTED, 0);
807 	if (lwptp->lwpt_tsp != NULL) {
808 		/*
809 		 * If we successfully queue the timeout
810 		 * (lwp_timer_enqueue() returns zero),
811 		 * then don't drop t_delay_lock until we are
812 		 * on the sleep queue (in turnstile_block()).
813 		 * Otherwise we will get an immediate timeout
814 		 * when we attempt to sleep in turnstile_block().
815 		 */
816 		mutex_enter(&curthread->t_delay_lock);
817 		if (lwp_timer_enqueue(lwptp) != 0)
818 			mutex_exit(&curthread->t_delay_lock);
819 	}
820 	/*
821 	 * Now, set the waiter bit and block for the lock in turnstile_block().
822 	 * No need to preserve the previous wbit since a lock try is not
823 	 * attempted after setting the wait bit. Wait bit is set under
824 	 * the upib_lock, which is not released until the turnstile lock
825 	 * is acquired. Say, the upimutex is L:
826 	 *
827 	 * 1. upib_lock is held so the waiter does not have to retry L after
828 	 *    setting the wait bit: since the owner has to grab the upib_lock
829 	 *    to unlock L, it will certainly see the wait bit set.
830 	 * 2. upib_lock is not released until the turnstile lock is acquired.
831 	 *    This is the key to preventing a missed wake-up. Otherwise, the
832 	 *    owner could acquire the upib_lock, and the tc_lock, to call
833 	 *    turnstile_wakeup(). All this, before the waiter gets tc_lock
834 	 *    to sleep in turnstile_block(). turnstile_wakeup() will then not
835 	 *    find this waiter, resulting in the missed wakeup.
836 	 * 3. The upib_lock, being a kernel mutex, cannot be released while
837 	 *    holding the tc_lock (since mutex_exit() could need to acquire
838 	 *    the same tc_lock)...and so is held when calling turnstile_block().
839 	 *    The address of upib_lock is passed to turnstile_block() which
840 	 *    releases it after releasing all turnstile locks, and before going
841 	 *    to sleep in swtch().
842 	 * 4. The waiter value cannot be a count of waiters, because a waiter
843 	 *    can be interrupted. The interrupt occurs under the tc_lock, at
844 	 *    which point, the upib_lock cannot be locked, to decrement waiter
845 	 *    count. So, just treat the waiter state as a bit, not a count.
846 	 */
847 	ts = turnstile_lookup((upimutex_t *)upimutex);
848 	upimutex->upi_waiter = 1;
849 	error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
850 	    &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
851 	/*
852 	 * Hand-off implies that we wakeup holding the lock, except when:
853 	 *	- deadlock is detected
854 	 *	- lock is not recoverable
855 	 *	- we got an interrupt or timeout
856 	 * If we wake up due to an interrupt or timeout, we may
857 	 * or may not be holding the lock due to mutex hand-off.
858 	 * Use lwp_upimutex_owned() to check if we do hold the lock.
859 	 */
860 	if (error != 0) {
861 		if ((error == EINTR || error == ETIME) &&
862 		    (upimutex = lwp_upimutex_owned(lp, type))) {
863 			/*
864 			 * Unlock and return - the re-startable syscall will
865 			 * try the lock again if we got EINTR.
866 			 */
867 			(void) upi_mylist_add((upimutex_t *)upimutex);
868 			upimutex_unlock((upimutex_t *)upimutex, 0);
869 		}
870 		/*
871 		 * The only other possible error is EDEADLK.  If so, upimutex
872 		 * is valid, since its owner is deadlocked with curthread.
873 		 */
874 		ASSERT(error == EINTR || error == ETIME ||
875 		    (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
876 		ASSERT(!lwp_upimutex_owned(lp, type));
877 		goto out;
878 	}
879 	if (lwp_upimutex_owned(lp, type)) {
880 		ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
881 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
882 		upilocked = 1;
883 	}
884 	/*
885 	 * Now, need to read the user-level lp->mutex_flag to do the following:
886 	 *
887 	 * - if lock is held, check if EOWNERDEAD should be returned
888 	 * - if lock isn't held, check if ENOTRECOVERABLE should be returned
889 	 *
890 	 * Now, either lp->mutex_flag is readable or it's not. If not
891 	 * readable, the on_fault path will cause a return with EFAULT as
892 	 * it should. If it is readable, the state of the flag encodes the
893 	 * robustness state of the lock:
894 	 *
895 	 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD setting
896 	 * will influence the return code appropriately. If the upimutex is
897 	 * not locked here, this could be due to a spurious wake-up or a
898 	 * NOTRECOVERABLE event. The flag's setting can be used to distinguish
899 	 * between these two events.
900 	 */
901 	fuword16_noerr(&lp->mutex_flag, &flag);
902 	if (upilocked) {
903 		/*
904 		 * If the thread wakes up from turnstile_block with the lock
905 		 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
906 		 * since it would not have been handed-off the lock.
907 		 * So, no need to check for this case.
908 		 */
909 		if (nupinest > maxnestupimx &&
910 		    secpolicy_resource(CRED()) != 0) {
911 			upimutex_unlock((upimutex_t *)upimutex, flag);
912 			upilocked = 0;
913 			error = ENOMEM;
914 		} else if (flag & LOCK_OWNERDEAD) {
915 			error = EOWNERDEAD;
916 		}
917 	} else {
918 		/*
919 		 * Wake-up without the upimutex held. Either this is a
920 		 * spurious wake-up (due to signals, forkall(), whatever), or
921 		 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
922 		 * of the mutex flag can be used to distinguish between the
923 		 * two events.
924 		 */
925 		if (flag & LOCK_NOTRECOVERABLE) {
926 			error = ENOTRECOVERABLE;
927 		} else {
928 			/*
929 			 * Here, the flag could be set to LOCK_OWNERDEAD or
930 			 * not. In both cases, this is a spurious wakeup,
931 			 * since the upi lock is not held, but the thread
932 			 * has returned from turnstile_block().
933 			 *
934 			 * The user flag could be LOCK_OWNERDEAD if, at the
935 			 * same time as curthread having been woken up
936 			 * spuriously, the owner (say Tdead) has died, marked
937 			 * the mutex flag accordingly, and handed off the lock
938 			 * to some other waiter (say Tnew). curthread just
939 			 * happened to read the flag while Tnew has yet to deal
940 			 * with the owner-dead event.
941 			 *
942 			 * In this event, curthread should retry the lock.
943 			 * If Tnew is able to clean up the lock, curthread
944 			 * will eventually get the lock with a zero error code.
945 			 * If Tnew is unable to clean up, its eventual call to
946 			 * unlock the lock will result in the mutex flag being
947 			 * set to LOCK_NOTRECOVERABLE, and the wake-up of
948 			 * all waiters, including curthread, which will then
949 			 * eventually return ENOTRECOVERABLE due to the above
950 			 * check.
951 			 *
952 			 * Of course, if the user-flag is not set with
953 			 * LOCK_OWNERDEAD, retrying is the thing to do, since
954 			 * this is definitely a spurious wakeup.
955 			 */
956 			goto retry;
957 		}
958 	}
959 
960 out:
961 	no_fault();
962 	return (error);
963 }
964 
965 
966 static int
967 lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
968 {
969 	label_t ljb;
970 	int error = 0;
971 	lwpchan_t lwpchan;
972 	uint16_t flag;
973 	upib_t *upibp;
974 	volatile struct upimutex *upimutex = NULL;
975 	volatile int upilocked = 0;
976 
977 	if (on_fault(&ljb)) {
978 		if (upilocked)
979 			upimutex_unlock((upimutex_t *)upimutex, 0);
980 		error = EFAULT;
981 		goto out;
982 	}
983 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
984 	    &lwpchan, LWPCHAN_MPPOOL)) {
985 		error = EFAULT;
986 		goto out;
987 	}
988 	upibp = &UPI_CHAIN(lwpchan);
989 	mutex_enter(&upibp->upib_lock);
990 	upimutex = upi_get(upibp, &lwpchan);
991 	/*
992 	 * If the lock is not held, or the owner is not curthread, return
993 	 * error. The user-level wrapper can return this error or stall,
994 	 * depending on whether mutex is of ERRORCHECK type or not.
995 	 */
996 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
997 		mutex_exit(&upibp->upib_lock);
998 		error = EPERM;
999 		goto out;
1000 	}
1001 	mutex_exit(&upibp->upib_lock); /* release for user memory access */
1002 	upilocked = 1;
1003 	fuword16_noerr(&lp->mutex_flag, &flag);
1004 	if (flag & LOCK_OWNERDEAD) {
1005 		/*
1006 		 * transition mutex to the LOCK_NOTRECOVERABLE state.
1007 		 */
1008 		flag &= ~LOCK_OWNERDEAD;
1009 		flag |= LOCK_NOTRECOVERABLE;
1010 		suword16_noerr(&lp->mutex_flag, flag);
1011 	}
1012 	upimutex_unlock((upimutex_t *)upimutex, flag);
1013 	upilocked = 0;
1014 out:
1015 	no_fault();
1016 	return (error);
1017 }
1018 
1019 /*
1020  * Mark user mutex state, corresponding to kernel upimutex, as LOCK_OWNERDEAD.
1021  */
1022 static int
1023 upi_dead(upimutex_t *upip)
1024 {
1025 	label_t ljb;
1026 	int error = 0;
1027 	lwp_mutex_t *lp;
1028 	uint16_t flag;
1029 
1030 	if (on_fault(&ljb)) {
1031 		error = EFAULT;
1032 		goto out;
1033 	}
1034 
1035 	lp = upip->upi_vaddr;
1036 	fuword16_noerr(&lp->mutex_flag, &flag);
1037 	flag |= LOCK_OWNERDEAD;
1038 	suword16_noerr(&lp->mutex_flag, flag);
1039 out:
1040 	no_fault();
1041 	return (error);
1042 }
1043 
1044 /*
1045  * Unlock all upimutexes held by curthread, since curthread is dying.
1046  * For each upimutex, attempt to mark its corresponding user mutex object as
1047  * dead.
1048  */
1049 void
1050 upimutex_cleanup()
1051 {
1052 	kthread_t *t = curthread;
1053 	struct upimutex *upip;
1054 
1055 	while ((upip = t->t_upimutex) != NULL) {
1056 		if (upi_dead(upip) != 0) {
1057 			/*
1058 			 * If the user object associated with this upimutex is
1059 			 * unmapped, unlock upimutex with the
1060 			 * LOCK_NOTRECOVERABLE flag, so that all waiters are
1061 			 * woken up. Since the user object is unmapped, it could
1062 			 * not be marked as dead or notrecoverable.
1063 			 * The waiters will now all wake up and return
1064 			 * ENOTRECOVERABLE, since they would find that the lock
1065 			 * has not been handed-off to them.
1066 			 * See lwp_upimutex_lock().
1067 			 */
1068 			upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
1069 		} else {
1070 			/*
1071 			 * The user object has been updated as dead.
1072 			 * Unlock the upimutex: if no waiters, upip kmem will
1073 			 * be freed. If there is a waiter, the lock will be
1074 			 * handed off. If exit() is in progress, each existing
1075 			 * waiter will successively get the lock, as owners
1076 			 * die, and each new owner will call this routine as
1077 			 * it dies. The last owner will free kmem, since
1078 			 * it will find the upimutex has no waiters. So,
1079 			 * eventually, the kmem is guaranteed to be freed.
1080 			 */
1081 			upimutex_unlock(upip, 0);
1082 		}
1083 		/*
1084 		 * Note that the call to upimutex_unlock() above will delete
1085 		 * upimutex from the t_upimutexes chain. And so the
1086 		 * while loop will eventually terminate.
1087 		 */
1088 	}
1089 }
1090 
1091 int
1092 lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp)
1093 {
1094 	kthread_t *t = curthread;
1095 	klwp_t *lwp = ttolwp(t);
1096 	proc_t *p = ttoproc(t);
1097 	lwp_timer_t lwpt;
1098 	caddr_t timedwait;
1099 	int error = 0;
1100 	int time_error;
1101 	clock_t tim = -1;
1102 	uchar_t waiters;
1103 	volatile int locked = 0;
1104 	volatile int watched = 0;
1105 	label_t ljb;
1106 	volatile uint8_t type = 0;
1107 	lwpchan_t lwpchan;
1108 	sleepq_head_t *sqh;
1109 	static int iswanted();
1110 	uint16_t flag;
1111 	int imm_timeout = 0;
1112 
1113 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1114 		return (set_errno(EFAULT));
1115 
1116 	timedwait = (caddr_t)tsp;
1117 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
1118 	    lwpt.lwpt_imm_timeout) {
1119 		imm_timeout = 1;
1120 		timedwait = NULL;
1121 	}
1122 
1123 	/*
1124 	 * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
1125 	 * this micro state is really a run state. If the thread indeed blocks,
1126 	 * this state becomes valid. If not, the state is converted back to
1127 	 * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
1128 	 * when blocking.
1129 	 */
1130 	(void) new_mstate(t, LMS_USER_LOCK);
1131 	if (on_fault(&ljb)) {
1132 		if (locked)
1133 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1134 		error = EFAULT;
1135 		goto out;
1136 	}
1137 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1138 	if (UPIMUTEX(type)) {
1139 		no_fault();
1140 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
1141 		if ((error == 0 || error == EOWNERDEAD) &&
1142 		    (type & USYNC_PROCESS))
1143 			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
1144 		if (tsp && !time_error)	/* copyout the residual time left */
1145 			error = lwp_timer_copyout(&lwpt, error);
1146 		if (error)
1147 			return (set_errno(error));
1148 		return (0);
1149 	}
1150 	/*
1151 	 * Force Copy-on-write fault if lwp_mutex_t object is
1152 	 * defined to be MAP_PRIVATE and it was initialized to
1153 	 * USYNC_PROCESS.
1154 	 */
1155 	suword8_noerr(&lp->mutex_type, type);
1156 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1157 	    &lwpchan, LWPCHAN_MPPOOL)) {
1158 		error = EFAULT;
1159 		goto out;
1160 	}
1161 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1162 	locked = 1;
1163 	fuword8_noerr(&lp->mutex_waiters, &waiters);
1164 	suword8_noerr(&lp->mutex_waiters, 1);
1165 	if (type & USYNC_PROCESS_ROBUST) {
1166 		fuword16_noerr(&lp->mutex_flag, &flag);
1167 		if (flag & LOCK_NOTRECOVERABLE) {
1168 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1169 			error = ENOTRECOVERABLE;
1170 			goto out;
1171 		}
1172 	}
1173 
1174 	/*
1175 	 * If watchpoints are set, they need to be restored, since
1176 	 * atomic accesses of memory such as the call to ulock_try()
1177 	 * below cannot be watched.
1178 	 */
1179 
1180 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1181 
1182 	while (!ulock_try(&lp->mutex_lockw)) {
1183 		if (time_error) {
1184 			/*
1185 			 * The SUSV3 Posix spec is very clear that we
1186 			 * should get no error from validating the
1187 			 * timer until we would actually sleep.
1188 			 */
1189 			error = time_error;
1190 			break;
1191 		}
1192 
1193 		if (watched) {
1194 			watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1195 			watched = 0;
1196 		}
1197 
1198 		/*
1199 		 * Put the lwp in an orderly state for debugging.
1200 		 */
1201 		prstop(PR_REQUESTED, 0);
1202 		if (timedwait) {
1203 			/*
1204 			 * If we successfully queue the timeout,
1205 			 * then don't drop t_delay_lock until
1206 			 * we are on the sleep queue (below).
1207 			 */
1208 			mutex_enter(&t->t_delay_lock);
1209 			if (lwp_timer_enqueue(&lwpt) != 0) {
1210 				mutex_exit(&t->t_delay_lock);
1211 				imm_timeout = 1;
1212 				timedwait = NULL;
1213 			}
1214 		}
1215 		lwp_block(&lwpchan);
1216 		/*
1217 		 * Nothing should happen to cause the lwp to go to
1218 		 * sleep again until after it returns from swtch().
1219 		 */
1220 		if (timedwait)
1221 			mutex_exit(&t->t_delay_lock);
1222 		locked = 0;
1223 		lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1224 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
1225 			setrun(t);
1226 		swtch();
1227 		t->t_flag &= ~T_WAKEABLE;
1228 		if (timedwait)
1229 			tim = lwp_timer_dequeue(&lwpt);
1230 		setallwatch();
1231 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
1232 			error = EINTR;
1233 		else if (imm_timeout || (timedwait && tim == -1))
1234 			error = ETIME;
1235 		if (error) {
1236 			lwp->lwp_asleep = 0;
1237 			lwp->lwp_sysabort = 0;
1238 			watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1239 			    S_WRITE);
1240 
1241 			/*
1242 			 * Need to re-compute waiters bit. The waiters field in
1243 			 * the lock is not reliable. Either of two things could
1244 			 * have occurred: no lwp may have called lwp_release()
1245 			 * for me but I have woken up due to a signal or
1246 			 * timeout.  In this case, the waiter bit is incorrect
1247 			 * since it is still set to 1, set above.
1248 			 * OR an lwp_release() did occur for some other lwp on
1249 			 * the same lwpchan. In this case, the waiter bit is
1250 			 * correct.  But which event occurred, one can't tell.
1251 			 * So, recompute.
1252 			 */
1253 			lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1254 			locked = 1;
1255 			sqh = lwpsqhash(&lwpchan);
1256 			disp_lock_enter(&sqh->sq_lock);
1257 			waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
1258 			disp_lock_exit(&sqh->sq_lock);
1259 			break;
1260 		}
1261 		lwp->lwp_asleep = 0;
1262 		watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1263 		    S_WRITE);
1264 		lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1265 		locked = 1;
1266 		fuword8_noerr(&lp->mutex_waiters, &waiters);
1267 		suword8_noerr(&lp->mutex_waiters, 1);
1268 		if (type & USYNC_PROCESS_ROBUST) {
1269 			fuword16_noerr(&lp->mutex_flag, &flag);
1270 			if (flag & LOCK_NOTRECOVERABLE) {
1271 				error = ENOTRECOVERABLE;
1272 				break;
1273 			}
1274 		}
1275 	}
1276 
1277 	if (t->t_mstate == LMS_USER_LOCK)
1278 		(void) new_mstate(t, LMS_SYSTEM);
1279 
1280 	if (!error && (type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST))) {
1281 		suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
1282 		if (type & USYNC_PROCESS_ROBUST) {
1283 			fuword16_noerr(&lp->mutex_flag, &flag);
1284 			if (flag & LOCK_OWNERDEAD)
1285 				error = EOWNERDEAD;
1286 			else if (flag & LOCK_UNMAPPED)
1287 				error = ELOCKUNMAPPED;
1288 		}
1289 	}
1290 	suword8_noerr(&lp->mutex_waiters, waiters);
1291 	locked = 0;
1292 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1293 out:
1294 	no_fault();
1295 	if (watched)
1296 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1297 	if (tsp && !time_error)		/* copyout the residual time left */
1298 		error = lwp_timer_copyout(&lwpt, error);
1299 	if (error)
1300 		return (set_errno(error));
1301 	return (0);
1302 }
1303 
1304 /*
1305  * Obsolete lwp_mutex_lock() interface, no longer called from libc.
1306  * libc now calls lwp_mutex_timedlock(lp, NULL).
1307  * This system call trap continues to exist solely for the benefit
1308  * of old statically-linked binaries from Solaris 9 and before.
1309  * It should be removed from the system when we no longer care
1310  * about such applications.
1311  */
1312 int
1313 lwp_mutex_lock(lwp_mutex_t *lp)
1314 {
1315 	return (lwp_mutex_timedlock(lp, NULL));
1316 }
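
/*
 * Illustrative sketch of how a user-level wrapper might interpret the
 * return values of the timed-lock path above.  The wrapper name and its
 * error-return convention are hypothetical; the error codes themselves
 * are the ones produced by lwp_mutex_timedlock() and lwp_upimutex_lock().
 *
 *	timespec_t ts = { 0, 500000000 };	// wait at most half a second
 *	switch (my_mutex_timedlock(mp, &ts)) {	// hypothetical wrapper
 *	case 0:			// lock acquired
 *	case EOWNERDEAD:	// acquired; previous owner died (robust)
 *	case ELOCKUNMAPPED:	// acquired; lock memory was unmapped (robust)
 *		break;		// caller now owns the lock
 *	case ETIME:		// timeout expired before acquisition
 *	case EINTR:		// interrupted by a signal
 *	case EDEADLK:		// PI mutex: caller already owns it
 *	case ENOTRECOVERABLE:	// robust lock marked unrecoverable
 *	default:
 *		break;		// lock not held; handle the failure
 *	}
 */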
1317 
1318 static int
1319 iswanted(kthread_t *t, lwpchan_t *lwpchan)
1320 {
1321 	/*
1322 	 * The caller holds the dispatcher lock on the sleep queue.
1323 	 */
1324 	while (t != NULL) {
1325 		if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1326 		    t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1327 			return (1);
1328 		t = t->t_link;
1329 	}
1330 	return (0);
1331 }
1332 
1333 /*
1334  * Return the highest priority thread sleeping on this lwpchan.
1335  */
1336 static kthread_t *
1337 lwp_queue_waiter(lwpchan_t *lwpchan)
1338 {
1339 	sleepq_head_t *sqh;
1340 	kthread_t *tp;
1341 
1342 	sqh = lwpsqhash(lwpchan);
1343 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1344 	for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
1345 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1346 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1347 			break;
1348 	}
1349 	disp_lock_exit(&sqh->sq_lock);
1350 	return (tp);
1351 }
1352 
1353 static int
1354 lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
1355 {
1356 	sleepq_head_t *sqh;
1357 	kthread_t *tp;
1358 	kthread_t **tpp;
1359 
1360 	sqh = lwpsqhash(lwpchan);
1361 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1362 	tpp = &sqh->sq_queue.sq_first;
1363 	while ((tp = *tpp) != NULL) {
1364 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1365 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1366 			/*
1367 			 * The following is typically false. It could be true
1368 			 * only if lwp_release() is called from
1369 			 * lwp_mutex_wakeup() after reading the waiters field
1370 			 * from memory in which the lwp lock used to be, but has
1371 			 * since been re-used to hold a lwp cv or lwp semaphore.
1372 			 * The thread "tp" found to match the lwp lock's wchan
1373 			 * is actually sleeping for the cv or semaphore which
1374 			 * now has the same wchan. In this case, lwp_release()
1375 			 * should return failure.
1376 			 */
1377 			if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
1378 				ASSERT(sync_type == 0);
1379 				/*
1380 				 * assert that this can happen only for mutexes
1381 				 * i.e. sync_type == 0, for correctly written
1382 				 * user programs.
1383 				 */
1384 				disp_lock_exit(&sqh->sq_lock);
1385 				return (0);
1386 			}
1387 			*waiters = iswanted(tp->t_link, lwpchan);
1388 			sleepq_unlink(tpp, tp);
1389 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1390 			tp->t_wchan0 = NULL;
1391 			tp->t_wchan = NULL;
1392 			tp->t_sobj_ops = NULL;
1393 			tp->t_release = 1;
1394 			THREAD_TRANSITION(tp);	/* drops sleepq lock */
1395 			CL_WAKEUP(tp);
1396 			thread_unlock(tp);	/* drop run queue lock */
1397 			return (1);
1398 		}
1399 		tpp = &tp->t_link;
1400 	}
1401 	*waiters = 0;
1402 	disp_lock_exit(&sqh->sq_lock);
1403 	return (0);
1404 }
1405 
1406 static void
1407 lwp_release_all(lwpchan_t *lwpchan)
1408 {
1409 	sleepq_head_t	*sqh;
1410 	kthread_t *tp;
1411 	kthread_t **tpp;
1412 
1413 	sqh = lwpsqhash(lwpchan);
1414 	disp_lock_enter(&sqh->sq_lock);		/* lock sleep q queue */
1415 	tpp = &sqh->sq_queue.sq_first;
1416 	while ((tp = *tpp) != NULL) {
1417 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1418 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1419 			sleepq_unlink(tpp, tp);
1420 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1421 			tp->t_wchan0 = NULL;
1422 			tp->t_wchan = NULL;
1423 			tp->t_sobj_ops = NULL;
1424 			CL_WAKEUP(tp);
1425 			thread_unlock_high(tp);	/* release run queue lock */
1426 		} else {
1427 			tpp = &tp->t_link;
1428 		}
1429 	}
1430 	disp_lock_exit(&sqh->sq_lock);		/* drop sleep q lock */
1431 }
1432 
1433 /*
1434  * Unblock an lwp that is trying to acquire this mutex.  The blocked
1435  * lwp resumes and retries to acquire the lock.
1436  */
1437 int
1438 lwp_mutex_wakeup(lwp_mutex_t *lp)
1439 {
1440 	proc_t *p = ttoproc(curthread);
1441 	lwpchan_t lwpchan;
1442 	uchar_t waiters;
1443 	volatile int locked = 0;
1444 	volatile int watched = 0;
1445 	volatile uint8_t type = 0;
1446 	label_t ljb;
1447 	int error = 0;
1448 
1449 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1450 		return (set_errno(EFAULT));
1451 
1452 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1453 
1454 	if (on_fault(&ljb)) {
1455 		if (locked)
1456 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1457 		error = EFAULT;
1458 		goto out;
1459 	}
1460 	/*
1461 	 * Force Copy-on-write fault if lwp_mutex_t object is
1462 	 * defined to be MAP_PRIVATE, and type is USYNC_PROCESS
1463 	 */
1464 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1465 	suword8_noerr(&lp->mutex_type, type);
1466 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1467 	    &lwpchan, LWPCHAN_MPPOOL)) {
1468 		error = EFAULT;
1469 		goto out;
1470 	}
1471 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1472 	locked = 1;
1473 	/*
1474 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
1475 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
1476 	 * may fail.  If it fails, do not write into the waiter bit.
1477 	 * The call to lwp_release() might fail due to one of three reasons:
1478 	 *
1479 	 * 	1. due to the thread which set the waiter bit not actually
1480 	 *	   sleeping since it got the lock on the re-try. The waiter
1481 	 *	   bit will then be correctly updated by that thread. This
1482 	 *	   window may be closed by reading the wait bit again here
1483 	 *	   and not calling lwp_release() at all if it is zero.
1484 	 *	2. the thread which set the waiter bit and went to sleep
1485 	 *	   was woken up by a signal. This time, the waiter recomputes
1486 	 *	   the wait bit in the return with EINTR code.
1487 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
1488 	 *	   memory that has been re-used after the lock was dropped.
1489 	 *	   In this case, writing into the waiter bit would cause data
1490 	 *	   corruption.
1491 	 */
1492 	if (lwp_release(&lwpchan, &waiters, 0) == 1) {
1493 		suword8_noerr(&lp->mutex_waiters, waiters);
1494 	}
1495 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1496 out:
1497 	no_fault();
1498 	if (watched)
1499 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1500 	if (error)
1501 		return (set_errno(error));
1502 	return (0);
1503 }
1504 
1505 /*
1506  * lwp_cond_wait() has four arguments, a pointer to a condition variable,
1507  * a pointer to a mutex, a pointer to a timespec for a timed wait and
1508  * a flag telling the kernel whether or not to honor the kernel/user
1509  * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
1510  * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
1511  * lwpchan, returned by get_lwpchan().  If the timespec pointer is non-NULL,
1512  * it is used as an in/out parameter.  On entry, it contains the relative
1513  * time until timeout.  On exit, we copyout the residual time left to it.
1514  */
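/*
 * Illustrative sketch of the in/out timespec behaviour described above
 * (the wrapper name is hypothetical; only the copyin/copyout semantics
 * of lwp_cond_wait() are being shown):
 *
 *	timespec_t ts = { 2, 0 };		// wait up to 2 seconds
 *	error = my_cond_timedwait(cv, mp, &ts);	// hypothetical wrapper
 *
 * On a wakeup before the timeout, ts is updated with the unconsumed
 * portion of the original 2 seconds (the residual time); on timeout the
 * call fails with ETIME, and on a signal it fails with EINTR.  In every
 * case the caller re-acquires the mutex on return to user level.
 */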
1515 int
1516 lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
1517 {
1518 	kthread_t *t = curthread;
1519 	klwp_t *lwp = ttolwp(t);
1520 	proc_t *p = ttoproc(t);
1521 	lwp_timer_t lwpt;
1522 	lwpchan_t cv_lwpchan;
1523 	lwpchan_t m_lwpchan;
1524 	caddr_t timedwait;
1525 	volatile uint16_t type = 0;
1526 	volatile uint8_t mtype = 0;
1527 	uchar_t waiters;
1528 	volatile int error;
1529 	clock_t tim = -1;
1530 	volatile int locked = 0;
1531 	volatile int m_locked = 0;
1532 	volatile int cvwatched = 0;
1533 	volatile int mpwatched = 0;
1534 	label_t ljb;
1535 	volatile int no_lwpchan = 1;
1536 	int imm_timeout = 0;
1537 	int imm_unpark = 0;
1538 
1539 	if ((caddr_t)cv >= p->p_as->a_userlimit ||
1540 	    (caddr_t)mp >= p->p_as->a_userlimit)
1541 		return (set_errno(EFAULT));
1542 
1543 	timedwait = (caddr_t)tsp;
1544 	if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
1545 		return (set_errno(error));
1546 	if (lwpt.lwpt_imm_timeout) {
1547 		imm_timeout = 1;
1548 		timedwait = NULL;
1549 	}
1550 
1551 	(void) new_mstate(t, LMS_USER_LOCK);
1552 
1553 	if (on_fault(&ljb)) {
1554 		if (no_lwpchan) {
1555 			error = EFAULT;
1556 			goto out;
1557 		}
1558 		if (m_locked) {
1559 			m_locked = 0;
1560 			lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1561 		}
1562 		if (locked) {
1563 			locked = 0;
1564 			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1565 		}
1566 		/*
1567 		 * set up another on_fault() for a possible fault
1568 		 * on the user lock accessed at "efault"
1569 		 */
1570 		if (on_fault(&ljb)) {
1571 			if (m_locked) {
1572 				m_locked = 0;
1573 				lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1574 			}
1575 			goto out;
1576 		}
1577 		error = EFAULT;
1578 		goto efault;
1579 	}
1580 
1581 	/*
1582 	 * Force Copy-on-write fault if lwp_cond_t and lwp_mutex_t
1583 	 * objects are defined to be MAP_PRIVATE, and are USYNC_PROCESS
1584 	 */
1585 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
1586 	if (UPIMUTEX(mtype) == 0) {
1587 		suword8_noerr(&mp->mutex_type, mtype);
1588 		/* convert user level mutex, "mp", to a unique lwpchan */
1589 		/* check if mtype is ok to use below, instead of type from cv */
1590 		if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
1591 		    &m_lwpchan, LWPCHAN_MPPOOL)) {
1592 			error = EFAULT;
1593 			goto out;
1594 		}
1595 	}
1596 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1597 	suword16_noerr(&cv->cond_type, type);
1598 	/* convert user level condition variable, "cv", to a unique lwpchan */
1599 	if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
1600 	    &cv_lwpchan, LWPCHAN_CVPOOL)) {
1601 		error = EFAULT;
1602 		goto out;
1603 	}
1604 	no_lwpchan = 0;
1605 	cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1606 	if (UPIMUTEX(mtype) == 0)
1607 		mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
1608 		    S_WRITE);
1609 
1610 	/*
1611 	 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
1612 	 * with respect to a possible wakeup which is a result of either
1613 	 * an lwp_cond_signal() or an lwp_cond_broadcast().
1614 	 *
1615 	 * What's misleading is that the lwp is put to sleep after the
1616 	 * condition variable's mutex is released.  This is OK as long as
1617 	 * the release operation is also done while holding lwpchan_lock.
1618 	 * The lwp is then put to sleep when the possibility of pagefaulting
1619 	 * or sleeping is completely eliminated.
1620 	 */
1621 	lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
1622 	locked = 1;
1623 	if (UPIMUTEX(mtype) == 0) {
1624 		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1625 		m_locked = 1;
1626 		suword8_noerr(&cv->cond_waiters_kernel, 1);
1627 		/*
1628 		 * unlock the condition variable's mutex. (pagefaults are
1629 		 * possible here.)
1630 		 */
1631 		ulock_clear(&mp->mutex_lockw);
1632 		fuword8_noerr(&mp->mutex_waiters, &waiters);
1633 		if (waiters != 0) {
1634 			/*
1635 			 * Given the locking of lwpchan_lock around the release
1636 			 * of the mutex and checking for waiters, the following
1637 			 * call to lwp_release() can fail ONLY if the lock
1638 			 * acquirer is interrupted after setting the waiter bit,
1639 			 * calling lwp_block() and releasing lwpchan_lock.
1640 			 * In this case, it could get pulled off the lwp sleep
1641 			 * q (via setrun()) before the following call to
1642 			 * lwp_release() occurs. In this case, the lock
1643 			 * requestor will update the waiter bit correctly by
1644 			 * re-evaluating it.
1645 			 */
1646 			if (lwp_release(&m_lwpchan, &waiters, 0) > 0)
1647 				suword8_noerr(&mp->mutex_waiters, waiters);
1648 		}
1649 		m_locked = 0;
1650 		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1651 	} else {
1652 		suword8_noerr(&cv->cond_waiters_kernel, 1);
1653 		error = lwp_upimutex_unlock(mp, mtype);
1654 		if (error) {	/* if the upimutex unlock failed */
1655 			locked = 0;
1656 			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1657 			goto out;
1658 		}
1659 	}
1660 	no_fault();
1661 
1662 	if (mpwatched) {
1663 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1664 		mpwatched = 0;
1665 	}
1666 	if (cvwatched) {
1667 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1668 		cvwatched = 0;
1669 	}
1670 
1671 	/*
1672 	 * Put the lwp in an orderly state for debugging.
1673 	 */
1674 	prstop(PR_REQUESTED, 0);
1675 	if (check_park && (!schedctl_is_park() || t->t_unpark)) {
1676 		/*
1677 		 * We received a signal at user-level before calling here
1678 		 * or another thread wants us to return immediately
1679 		 * with EINTR.  See lwp_unpark().
1680 		 */
1681 		imm_unpark = 1;
1682 		t->t_unpark = 0;
1683 		timedwait = NULL;
1684 	} else if (timedwait) {
1685 		/*
1686 		 * If we successfully queue the timeout,
1687 		 * then don't drop t_delay_lock until
1688 		 * we are on the sleep queue (below).
1689 		 */
1690 		mutex_enter(&t->t_delay_lock);
1691 		if (lwp_timer_enqueue(&lwpt) != 0) {
1692 			mutex_exit(&t->t_delay_lock);
1693 			imm_timeout = 1;
1694 			timedwait = NULL;
1695 		}
1696 	}
1697 	t->t_flag |= T_WAITCVSEM;
1698 	lwp_block(&cv_lwpchan);
1699 	/*
1700 	 * Nothing should happen to cause the lwp to go to sleep
1701 	 * until after it returns from swtch().
1702 	 */
1703 	if (timedwait)
1704 		mutex_exit(&t->t_delay_lock);
1705 	locked = 0;
1706 	lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
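	/*
	 * If a signal is pending, a forced return is required, or an
	 * immediate timeout/unpark was detected above, make this lwp
	 * runnable again right away so that the following swtch()
	 * returns without a real sleep.
	 */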
1707 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
1708 	    (imm_timeout | imm_unpark))
1709 		setrun(t);
1710 	swtch();
1711 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
1712 	if (timedwait)
1713 		tim = lwp_timer_dequeue(&lwpt);
1714 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
1715 	    MUSTRETURN(p, t) || imm_unpark)
1716 		error = EINTR;
1717 	else if (imm_timeout || (timedwait && tim == -1))
1718 		error = ETIME;
1719 	lwp->lwp_asleep = 0;
1720 	lwp->lwp_sysabort = 0;
1721 	setallwatch();
1722 
1723 	if (t->t_mstate == LMS_USER_LOCK)
1724 		(void) new_mstate(t, LMS_SYSTEM);
1725 
1726 	if (tsp && check_park)		/* copyout the residual time left */
1727 		error = lwp_timer_copyout(&lwpt, error);
1728 
1729 	/* the mutex is reacquired by the caller on return to user level */
1730 	if (error) {
1731 		/*
1732 		 * If we were concurrently lwp_cond_signal()d and we
1733 		 * received a UNIX signal or got a timeout, then perform
1734 		 * another lwp_cond_signal() to avoid consuming the wakeup.
1735 		 */
1736 		if (t->t_release)
1737 			(void) lwp_cond_signal(cv);
1738 		return (set_errno(error));
1739 	}
1740 	return (0);
1741 
1742 efault:
1743 	/*
1744 	 * make sure that the user level lock is dropped before
1745 	 * returning to caller, since the caller always re-acquires it.
1746 	 */
1747 	if (UPIMUTEX(mtype) == 0) {
1748 		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1749 		m_locked = 1;
1750 		ulock_clear(&mp->mutex_lockw);
1751 		fuword8_noerr(&mp->mutex_waiters, &waiters);
1752 		if (waiters != 0) {
1753 			/*
1754 			 * See comment above on lock clearing and lwp_release()
1755 			 * success/failure.
1756 			 */
1757 			if (lwp_release(&m_lwpchan, &waiters, 0) > 0)
1758 				suword8_noerr(&mp->mutex_waiters, waiters);
1759 		}
1760 		m_locked = 0;
1761 		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1762 	} else {
1763 		(void) lwp_upimutex_unlock(mp, mtype);
1764 	}
1765 out:
1766 	no_fault();
1767 	if (mpwatched)
1768 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1769 	if (cvwatched)
1770 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1771 	if (t->t_mstate == LMS_USER_LOCK)
1772 		(void) new_mstate(t, LMS_SYSTEM);
1773 	return (set_errno(error));
1774 }
1775 
1776 /*
1777  * Wake up one lwp that is blocked on this condition variable.
1778  */
1779 int
1780 lwp_cond_signal(lwp_cond_t *cv)
1781 {
1782 	proc_t *p = ttoproc(curthread);
1783 	lwpchan_t lwpchan;
1784 	uchar_t waiters;
1785 	volatile uint16_t type = 0;
1786 	volatile int locked = 0;
1787 	volatile int watched = 0;
1788 	label_t ljb;
1789 	int error = 0;
1790 
1791 	if ((caddr_t)cv >= p->p_as->a_userlimit)
1792 		return (set_errno(EFAULT));
1793 
1794 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1795 
1796 	if (on_fault(&ljb)) {
1797 		if (locked)
1798 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1799 		error = EFAULT;
1800 		goto out;
1801 	}
1802 	/*
1803 	 * Force Copy-on-write fault if lwp_cond_t object is
1804 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1805 	 */
1806 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1807 	suword16_noerr(&cv->cond_type, type);
1808 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1809 	    &lwpchan, LWPCHAN_CVPOOL)) {
1810 		error = EFAULT;
1811 		goto out;
1812 	}
1813 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1814 	locked = 1;
1815 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1816 	if (waiters != 0) {
1817 		/*
1818 		 * The following call to lwp_release() might fail but it is
1819 		 * OK to write into the waiters bit below, since the memory
1820 		 * could not have been re-used or unmapped (for correctly
1821 		 * written user programs) as in the case of lwp_mutex_wakeup().
1822 		 * For an incorrect program, we should not care about data
1823 		 * corruption since this is just one instance of other places
1824 		 * where corruption can occur for such a program. Of course
1825 		 * if the memory is unmapped, normal fault recovery occurs.
1826 		 */
1827 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1828 		suword8_noerr(&cv->cond_waiters_kernel, waiters);
1829 	}
1830 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1831 out:
1832 	no_fault();
1833 	if (watched)
1834 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1835 	if (error)
1836 		return (set_errno(error));
1837 	return (0);
1838 }
1839 
1840 /*
1841  * Wake up every lwp that is blocked on this condition variable.
1842  */
1843 int
1844 lwp_cond_broadcast(lwp_cond_t *cv)
1845 {
1846 	proc_t *p = ttoproc(curthread);
1847 	lwpchan_t lwpchan;
1848 	volatile uint16_t type = 0;
1849 	volatile int locked = 0;
1850 	volatile int watched = 0;
1851 	label_t ljb;
1852 	uchar_t waiters;
1853 	int error = 0;
1854 
1855 	if ((caddr_t)cv >= p->p_as->a_userlimit)
1856 		return (set_errno(EFAULT));
1857 
1858 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1859 
1860 	if (on_fault(&ljb)) {
1861 		if (locked)
1862 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1863 		error = EFAULT;
1864 		goto out;
1865 	}
1866 	/*
1867 	 * Force Copy-on-write fault if lwp_cond_t object is
1868 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1869 	 */
1870 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1871 	suword16_noerr(&cv->cond_type, type);
1872 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1873 	    &lwpchan, LWPCHAN_CVPOOL)) {
1874 		error = EFAULT;
1875 		goto out;
1876 	}
1877 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1878 	locked = 1;
1879 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1880 	if (waiters != 0) {
1881 		lwp_release_all(&lwpchan);
1882 		suword8_noerr(&cv->cond_waiters_kernel, 0);
1883 	}
1884 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1885 out:
1886 	no_fault();
1887 	if (watched)
1888 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1889 	if (error)
1890 		return (set_errno(error));
1891 	return (0);
1892 }
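
/*
 * A note on the two wakeup paths above: lwp_cond_signal() wakes at most
 * one sleeper and writes back whatever residual waiter indication
 * lwp_release() leaves in "waiters", while lwp_cond_broadcast() empties
 * the queue with lwp_release_all() and simply clears the waiters byte.
 */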
1893 
1894 int
1895 lwp_sema_trywait(lwp_sema_t *sp)
1896 {
1897 	kthread_t *t = curthread;
1898 	proc_t *p = ttoproc(t);
1899 	label_t ljb;
1900 	volatile int locked = 0;
1901 	volatile int watched = 0;
1902 	volatile uint16_t type = 0;
1903 	int count;
1904 	lwpchan_t lwpchan;
1905 	uchar_t waiters;
1906 	int error = 0;
1907 
1908 	if ((caddr_t)sp >= p->p_as->a_userlimit)
1909 		return (set_errno(EFAULT));
1910 
1911 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1912 
1913 	if (on_fault(&ljb)) {
1914 		if (locked)
1915 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1916 		error = EFAULT;
1917 		goto out;
1918 	}
1919 	/*
1920 	 * Force Copy-on-write fault if lwp_sema_t object is
1921 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1922 	 */
1923 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
1924 	suword16_noerr((void *)&sp->sema_type, type);
1925 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
1926 	    &lwpchan, LWPCHAN_CVPOOL)) {
1927 		error = EFAULT;
1928 		goto out;
1929 	}
1930 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1931 	locked = 1;
1932 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
1933 	if (count == 0)
1934 		error = EBUSY;
1935 	else
1936 		suword32_noerr((void *)&sp->sema_count, --count);
1937 	if (count != 0) {
1938 		fuword8_noerr(&sp->sema_waiters, &waiters);
1939 		if (waiters != 0) {
1940 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1941 			suword8_noerr(&sp->sema_waiters, waiters);
1942 		}
1943 	}
1944 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1945 out:
1946 	no_fault();
1947 	if (watched)
1948 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1949 	if (error)
1950 		return (set_errno(error));
1951 	return (0);
1952 }
1953 
1954 /*
1955  * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
1956  */
1957 int
1958 lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
1959 {
1960 	kthread_t *t = curthread;
1961 	klwp_t *lwp = ttolwp(t);
1962 	proc_t *p = ttoproc(t);
1963 	lwp_timer_t lwpt;
1964 	caddr_t timedwait;
1965 	clock_t tim = -1;
1966 	label_t ljb;
1967 	volatile int locked = 0;
1968 	volatile int watched = 0;
1969 	volatile uint16_t type = 0;
1970 	int count;
1971 	lwpchan_t lwpchan;
1972 	uchar_t waiters;
1973 	int error = 0;
1974 	int time_error;
1975 	int imm_timeout = 0;
1976 	int imm_unpark = 0;
1977 
1978 	if ((caddr_t)sp >= p->p_as->a_userlimit)
1979 		return (set_errno(EFAULT));
1980 
1981 	timedwait = (caddr_t)tsp;
1982 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
1983 	    lwpt.lwpt_imm_timeout) {
1984 		imm_timeout = 1;
1985 		timedwait = NULL;
1986 	}
1987 
1988 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1989 
1990 	if (on_fault(&ljb)) {
1991 		if (locked)
1992 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1993 		error = EFAULT;
1994 		goto out;
1995 	}
1996 	/*
1997 	 * Force Copy-on-write fault if lwp_sema_t object is
1998 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1999 	 */
2000 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
2001 	suword16_noerr((void *)&sp->sema_type, type);
2002 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
2003 	    &lwpchan, LWPCHAN_CVPOOL)) {
2004 		error = EFAULT;
2005 		goto out;
2006 	}
2007 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2008 	locked = 1;
2009 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
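	/*
	 * Classic semaphore wait loop: while the count is zero (and no
	 * error has occurred), advertise ourselves in sema_waiters, block
	 * on the lwpchan, and re-read the count after every wakeup.  The
	 * decrement below happens only once a non-zero count is observed
	 * with no pending error.
	 */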
2010 	while (error == 0 && count == 0) {
2011 		if (time_error) {
2012 			/*
2013 			 * The SUSV3 POSIX spec is very clear that we
2014 			 * should get no error from validating the
2015 			 * timer until we would actually sleep.
2016 			 */
2017 			error = time_error;
2018 			break;
2019 		}
2020 		suword8_noerr(&sp->sema_waiters, 1);
2021 		if (watched)
2022 			watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2023 		/*
2024 		 * Put the lwp in an orderly state for debugging.
2025 		 */
2026 		prstop(PR_REQUESTED, 0);
2027 		if (check_park && (!schedctl_is_park() || t->t_unpark)) {
2028 			/*
2029 			 * We received a signal at user-level before calling
2030 			 * here or another thread wants us to return
2031 			 * immediately with EINTR.  See lwp_unpark().
2032 			 */
2033 			imm_unpark = 1;
2034 			t->t_unpark = 0;
2035 			timedwait = NULL;
2036 		} else if (timedwait) {
2037 			/*
2038 			 * If we successfully queue the timeout,
2039 			 * then don't drop t_delay_lock until
2040 			 * we are on the sleep queue (below).
2041 			 */
2042 			mutex_enter(&t->t_delay_lock);
2043 			if (lwp_timer_enqueue(&lwpt) != 0) {
2044 				mutex_exit(&t->t_delay_lock);
2045 				imm_timeout = 1;
2046 				timedwait = NULL;
2047 			}
2048 		}
2049 		t->t_flag |= T_WAITCVSEM;
2050 		lwp_block(&lwpchan);
2051 		/*
2052 		 * Nothing should happen to cause the lwp to sleep
2053 		 * again until after it returns from swtch().
2054 		 */
2055 		if (timedwait)
2056 			mutex_exit(&t->t_delay_lock);
2057 		locked = 0;
2058 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2059 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
2060 		    (imm_timeout | imm_unpark))
2061 			setrun(t);
2062 		swtch();
2063 		t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2064 		if (timedwait)
2065 			tim = lwp_timer_dequeue(&lwpt);
2066 		setallwatch();
2067 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
2068 		    MUSTRETURN(p, t) || imm_unpark)
2069 			error = EINTR;
2070 		else if (imm_timeout || (timedwait && tim == -1))
2071 			error = ETIME;
2072 		lwp->lwp_asleep = 0;
2073 		lwp->lwp_sysabort = 0;
2074 		watched = watch_disable_addr((caddr_t)sp,
2075 		    sizeof (*sp), S_WRITE);
2076 		lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2077 		locked = 1;
2078 		fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2079 	}
2080 	if (error == 0)
2081 		suword32_noerr((void *)&sp->sema_count, --count);
2082 	if (count != 0) {
		fuword8_noerr(&sp->sema_waiters, &waiters);
		if (waiters != 0) {
2083 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2084 			suword8_noerr(&sp->sema_waiters, waiters);
		}
2085 	}
2086 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2087 out:
2088 	no_fault();
2089 	if (watched)
2090 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2091 	if (tsp && check_park && !time_error)
2092 		error = lwp_timer_copyout(&lwpt, error);
2093 	if (error)
2094 		return (set_errno(error));
2095 	return (0);
2096 }
2097 
2098 /*
2099  * Obsolete lwp_sema_wait() interface, no longer called from libc.
2100  * libc now calls lwp_sema_timedwait().
2101  * This system call trap exists solely for the benefit of old
2102  * statically linked applications from Solaris 9 and before.
2103  * It should be removed when we no longer care about such applications.
2104  */
2105 int
2106 lwp_sema_wait(lwp_sema_t *sp)
2107 {
2108 	return (lwp_sema_timedwait(sp, NULL, 0));
2109 }
2110 
2111 int
2112 lwp_sema_post(lwp_sema_t *sp)
2113 {
2114 	proc_t *p = ttoproc(curthread);
2115 	label_t ljb;
2116 	volatile int locked = 0;
2117 	volatile int watched = 0;
2118 	volatile uint16_t type = 0;
2119 	int count;
2120 	lwpchan_t lwpchan;
2121 	uchar_t waiters;
2122 	int error = 0;
2123 
2124 	if ((caddr_t)sp >= p->p_as->a_userlimit)
2125 		return (set_errno(EFAULT));
2126 
2127 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2128 
2129 	if (on_fault(&ljb)) {
2130 		if (locked)
2131 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2132 		error = EFAULT;
2133 		goto out;
2134 	}
2135 	/*
2136 	 * Force Copy-on-write fault if lwp_sema_t object is
2137 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
2138 	 */
2139 	fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
2140 	suword16_noerr(&sp->sema_type, type);
2141 	if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
2142 	    &lwpchan, LWPCHAN_CVPOOL)) {
2143 		error = EFAULT;
2144 		goto out;
2145 	}
2146 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2147 	locked = 1;
2148 	fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
2149 	if (count == _SEM_VALUE_MAX)
2150 		error = EOVERFLOW;
2151 	else
2152 		suword32_noerr(&sp->sema_count, ++count);
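	/*
	 * Waiters can be present only if the count was zero before this
	 * post, i.e. if the count is now exactly 1; only then do we need
	 * to look at sema_waiters and wake one sleeper.
	 */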
2153 	if (count == 1) {
2154 		fuword8_noerr(&sp->sema_waiters, &waiters);
2155 		if (waiters) {
2156 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2157 			suword8_noerr(&sp->sema_waiters, waiters);
2158 		}
2159 	}
2160 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2161 out:
2162 	no_fault();
2163 	if (watched)
2164 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2165 	if (error)
2166 		return (set_errno(error));
2167 	return (0);
2168 }
2169 
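/*
 * The t_writer field is used as a handshake between lwp_rwlock_lock()
 * and lwp_rwlock_release(): a blocking writer advertises itself with
 * TRW_WANT_WRITE, and the releasing thread sets TRW_LOCK_GRANTED on
 * each thread it hands the lock to before waking it.
 */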
2170 #define	TRW_WANT_WRITE		0x1
2171 #define	TRW_LOCK_GRANTED	0x2
2172 
2173 #define	READ_LOCK		0
2174 #define	WRITE_LOCK		1
2175 #define	TRY_FLAG		0x10
2176 #define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
2177 #define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)
2178 
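/*
 * A note on the rwlock_readers word ("rwstate") as manipulated below:
 * the low-order bits covered by URW_READERS_MASK hold the count of
 * active readers (hence the rwstate++/rwstate-- arithmetic), while
 * URW_WRITE_LOCKED marks a held write lock and URW_HAS_WAITERS marks
 * queued waiters.  The actual bit assignments live with the URW_*
 * definitions elsewhere; this summary only reflects how the code here
 * uses them.
 */
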
2179 /*
2180  * Release one writer or one or more readers. Compute the rwstate word to
2181  * reflect the new state of the queue. For a safe hand-off we copy the new
2182  * rwstate value back to userland before we wake any of the new lock holders.
2183  *
2184  * Note that sleepq_insert() implements a prioritized FIFO (with writers
2185  * being given precedence over readers of the same priority).
2186  *
2187  * If the first thread is a reader we scan the queue releasing all readers
2188  * until we hit a writer or the end of the queue. If the first thread is a
2189  * writer we still need to check for another writer.
2190  */
2191 void
2192 lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
2193 {
2194 	sleepq_head_t *sqh;
2195 	kthread_t *tp;
2196 	kthread_t **tpp;
2197 	kthread_t *tpnext;
2198 	kthread_t *wakelist = NULL;
2199 	uint32_t rwstate = 0;
2200 	int wcount = 0;
2201 	int rcount = 0;
2202 
2203 	sqh = lwpsqhash(lwpchan);
2204 	disp_lock_enter(&sqh->sq_lock);
2205 	tpp = &sqh->sq_queue.sq_first;
2206 	while ((tp = *tpp) != NULL) {
2207 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
2208 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
2209 			if (tp->t_writer & TRW_WANT_WRITE) {
2210 				if ((wcount++ == 0) && (rcount == 0)) {
2211 					rwstate |= URW_WRITE_LOCKED;
2212 
2213 					/* Just one writer to wake. */
2214 					sleepq_unlink(tpp, tp);
2215 					wakelist = tp;
2216 
2217 					/* tpp already set for next thread. */
2218 					continue;
2219 				} else {
2220 					rwstate |= URW_HAS_WAITERS;
2221 					/* We need look no further. */
2222 					break;
2223 				}
2224 			} else {
2225 				rcount++;
2226 				if (wcount == 0) {
2227 					rwstate++;
2228 
2229 					/* Add reader to wake list. */
2230 					sleepq_unlink(tpp, tp);
2231 					tp->t_link = wakelist;
2232 					wakelist = tp;
2233 
2234 					/* tpp already set for next thread. */
2235 					continue;
2236 				} else {
2237 					rwstate |= URW_HAS_WAITERS;
2238 					/* We need look no further. */
2239 					break;
2240 				}
2241 			}
2242 		}
2243 		tpp = &tp->t_link;
2244 	}
2245 
2246 	/* Copy the new rwstate back to userland. */
2247 	suword32_noerr(&rw->rwlock_readers, rwstate);
2248 
2249 	/* Wake the new lock holder(s) up. */
2250 	tp = wakelist;
2251 	while (tp != NULL) {
2252 		DTRACE_SCHED1(wakeup, kthread_t *, tp);
2253 		tp->t_wchan0 = NULL;
2254 		tp->t_wchan = NULL;
2255 		tp->t_sobj_ops = NULL;
2256 		tp->t_writer |= TRW_LOCK_GRANTED;
2257 		tpnext = tp->t_link;
2258 		tp->t_link = NULL;
2259 		CL_WAKEUP(tp);
2260 		thread_unlock_high(tp);
2261 		tp = tpnext;
2262 	}
2263 
2264 	disp_lock_exit(&sqh->sq_lock);
2265 }
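
/*
 * Illustration of the scan above (not code): if the queue for this
 * lwpchan holds reader, reader, writer, reader in that order, the two
 * leading readers are unlinked and woken, the writer and the trailing
 * reader stay queued, and the rwstate written back is a reader count
 * of 2 with URW_HAS_WAITERS set, all before any woken thread runs.
 */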
2266 
2267 /*
2268  * We enter here holding the user-level mutex, which we must release before
2269  * returning or blocking. Based on lwp_cond_wait().
2270  */
2271 static int
2272 lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
2273 {
2274 	lwp_mutex_t *mp = NULL;
2275 	kthread_t *t = curthread;
2276 	kthread_t *tp;
2277 	klwp_t *lwp = ttolwp(t);
2278 	proc_t *p = ttoproc(t);
2279 	lwp_timer_t lwpt;
2280 	lwpchan_t lwpchan;
2281 	lwpchan_t mlwpchan;
2282 	caddr_t timedwait;
2283 	volatile uint16_t type = 0;
2284 	volatile uint8_t mtype = 0;
2285 	uchar_t mwaiters;
2286 	volatile int error = 0;
2287 	int time_error;
2288 	clock_t tim = -1;
2289 	volatile int locked = 0;
2290 	volatile int mlocked = 0;
2291 	volatile int watched = 0;
2292 	volatile int mwatched = 0;
2293 	label_t ljb;
2294 	volatile int no_lwpchan = 1;
2295 	int imm_timeout = 0;
2296 	int try_flag;
2297 	uint32_t rwstate;
2298 	int acquired = 0;
2299 
2300 	/* We only check rw because the mutex is included in it. */
2301 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2302 		return (set_errno(EFAULT));
2303 
2304 	/* We must only report this error if we are about to sleep (later). */
2305 	timedwait = (caddr_t)tsp;
2306 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2307 	    lwpt.lwpt_imm_timeout) {
2308 		imm_timeout = 1;
2309 		timedwait = NULL;
2310 	}
2311 
2312 	(void) new_mstate(t, LMS_USER_LOCK);
2313 
2314 	if (on_fault(&ljb)) {
2315 		if (no_lwpchan) {
2316 			error = EFAULT;
2317 			goto out_nodrop;
2318 		}
2319 		if (mlocked) {
2320 			mlocked = 0;
2321 			lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2322 		}
2323 		if (locked) {
2324 			locked = 0;
2325 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2326 		}
2327 		/*
2328 		 * Set up another on_fault() for a possible fault
2329 		 * on the user lock accessed at "out_drop".
2330 		 */
2331 		if (on_fault(&ljb)) {
2332 			if (mlocked) {
2333 				mlocked = 0;
2334 				lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2335 			}
2336 			error = EFAULT;
2337 			goto out_nodrop;
2338 		}
2339 		error = EFAULT;
2340 		goto out_nodrop;
2341 	}
2342 
2343 	/* Process rd_wr (including sanity check). */
2344 	try_flag = (rd_wr & TRY_FLAG);
2345 	rd_wr &= ~TRY_FLAG;
2346 	if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
2347 		error = EINVAL;
2348 		goto out_nodrop;
2349 	}
2350 
2351 	/* We can only continue for simple USYNC_PROCESS locks. */
2352 	mp = &rw->mutex;
2353 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
2354 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2355 	if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
2356 		error = EINVAL;
2357 		goto out_nodrop;
2358 	}
2359 
2360 	/* Force Copy-on-write fault in case objects are MAP_PRIVATE. */
2361 	suword8_noerr(&mp->mutex_type, mtype);
2362 	suword16_noerr(&rw->rwlock_type, type);
2363 
2364 	/* Convert user level mutex, "mp", to a unique lwpchan. */
2365 	if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
2366 	    &mlwpchan, LWPCHAN_MPPOOL)) {
2367 		error = EFAULT;
2368 		goto out_nodrop;
2369 	}
2370 
2371 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2372 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2373 	    &lwpchan, LWPCHAN_CVPOOL)) {
2374 		error = EFAULT;
2375 		goto out_nodrop;
2376 	}
2377 
2378 	no_lwpchan = 0;
2379 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2380 	mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2381 
2382 	/*
2383 	 * lwpchan_lock() ensures that the calling LWP is put to sleep
2384 	 * atomically with respect to a possible wakeup which is a result
2385 	 * of lwp_rwlock_unlock().
2386 	 *
2387 	 * What's misleading is that the LWP is put to sleep after the
2388 	 * rwlock's mutex is released. This is OK as long as the release
2389 	 * operation is also done while holding mlwpchan. The LWP is then
2390 	 * put to sleep when the possibility of pagefaulting or sleeping
2391 	 * has been completely eliminated.
2392 	 */
2393 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2394 	locked = 1;
2395 	lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2396 	mlocked = 1;
2397 
2398 	/*
2399 	 * Fetch the current rwlock state.
2400 	 *
2401 	 * The possibility of spurious wake-ups or killed waiters means
2402 	 * rwstate's URW_HAS_WAITERS bit may indicate false positives.
2403 	 * We only fix these if they are important to us.
2404 	 *
2405 	 * Although various error states can be observed here (e.g. the lock
2406 	 * is not held, but there are waiters) we assume these are application
2407 	 * errors and so we take no corrective action.
2408 	 */
2409 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2410 	/*
2411 	 * We cannot legitimately get here from user-level
2412 	 * without URW_HAS_WAITERS being set.
2413 	 * Set it now to guard against user-level error.
2414 	 */
2415 	rwstate |= URW_HAS_WAITERS;
2416 
2417 	/*
2418 	 * We can try only if the lock isn't held by a writer.
2419 	 */
2420 	if (!(rwstate & URW_WRITE_LOCKED)) {
2421 		tp = lwp_queue_waiter(&lwpchan);
2422 		if (tp == NULL) {
2423 			/*
2424 			 * Hmmm, rwstate indicates waiters but there are
2425 			 * none queued. This could just be the result of a
2426 			 * spurious wakeup, so let's ignore it.
2427 			 *
2428 			 * We now have a chance to acquire the lock
2429 			 * uncontended, but this is the last chance for
2430 			 * a writer to acquire the lock without blocking.
2431 			 */
2432 			if (rd_wr == READ_LOCK) {
2433 				rwstate++;
2434 				acquired = 1;
2435 			} else if ((rwstate & URW_READERS_MASK) == 0) {
2436 				rwstate |= URW_WRITE_LOCKED;
2437 				acquired = 1;
2438 			}
2439 		} else if (rd_wr == READ_LOCK) {
2440 			/*
2441 			 * This is the last chance for a reader to acquire
2442 			 * the lock now, but it can only do so if there is
2443 			 * no writer of equal or greater priority at the
2444 			 * head of the queue.
2445 			 *
2446 			 * It is also just possible that there is a reader
2447 			 * at the head of the queue. This may be the result
2448 			 * of a spurious wakeup or an application failure.
2449 			 * In this case we only acquire the lock if we have
2450 			 * equal or greater priority. It is not our job to
2451 			 * release spurious waiters.
2452 			 */
2453 			pri_t our_pri = DISP_PRIO(t);
2454 			pri_t his_pri = DISP_PRIO(tp);
2455 
2456 			if ((our_pri > his_pri) || ((our_pri == his_pri) &&
2457 			    !(tp->t_writer & TRW_WANT_WRITE))) {
2458 				rwstate++;
2459 				acquired = 1;
2460 			}
2461 		}
2462 	}
2463 
2464 	if (acquired || try_flag || time_error) {
2465 		/*
2466 		 * We're not going to block this time.
2467 		 */
2468 		suword32_noerr(&rw->rwlock_readers, rwstate);
2469 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2470 		locked = 0;
2471 
2472 		if (acquired) {
2473 			/*
2474 			 * Got the lock!
2475 			 */
2476 			error = 0;
2477 
2478 		} else if (try_flag) {
2479 			/*
2480 			 * We didn't get the lock and we're about to block.
2481 			 * We didn't get the lock, and this is a trylock:
2482 			 * return EBUSY rather than blocking.
2483 			error = EBUSY;
2484 
2485 		} else if (time_error) {
2486 			/*
2487 			 * The SUSV3 POSIX spec is very clear that we should
2488 			 * get no error from validating the timer (above)
2489 			 * until we would actually sleep.
2490 			 */
2491 			error = time_error;
2492 		}
2493 
2494 		goto out_drop;
2495 	}
2496 
2497 	/*
2498 	 * We're about to block, so indicate what kind of waiter we are.
2499 	 */
2500 	t->t_writer = 0;
2501 	if (rd_wr == WRITE_LOCK)
2502 		t->t_writer = TRW_WANT_WRITE;
2503 	suword32_noerr(&rw->rwlock_readers, rwstate);
2504 
2505 	/*
2506 	 * Unlock the rwlock's mutex (pagefaults are possible here).
2507 	 */
2508 	suword32_noerr((uint32_t *)&mp->mutex_owner, 0);
2509 	suword32_noerr((uint32_t *)&mp->mutex_owner + 1, 0);
2510 	suword32_noerr(&mp->mutex_ownerpid, 0);
2511 	ulock_clear(&mp->mutex_lockw);
2512 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2513 	if (mwaiters != 0) {
2514 		/*
2515 		 * Given the locking of mlwpchan around the release of
2516 		 * the mutex and checking for waiters, the following
2517 		 * call to lwp_release() can fail ONLY if the lock
2518 		 * acquirer is interrupted after setting the waiter bit,
2519 		 * calling lwp_block() and releasing mlwpchan.
2520 		 * In this case, it could get pulled off the LWP sleep
2521 		 * queue (via setrun()) before the following call to
2522 		 * lwp_release() occurs, and the lock requestor will
2523 		 * update the waiter bit correctly by re-evaluating it.
2524 		 */
2525 		if (lwp_release(&mlwpchan, &mwaiters, 0) > 0)
2526 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2527 	}
2528 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2529 	mlocked = 0;
2530 	no_fault();
2531 
2532 	if (mwatched) {
2533 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2534 		mwatched = 0;
2535 	}
2536 	if (watched) {
2537 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2538 		watched = 0;
2539 	}
2540 
2541 	/*
2542 	 * Put the LWP in an orderly state for debugging.
2543 	 */
2544 	prstop(PR_REQUESTED, 0);
2545 	if (timedwait) {
2546 		/*
2547 		 * If we successfully queue the timeout,
2548 		 * then don't drop t_delay_lock until
2549 		 * we are on the sleep queue (below).
2550 		 */
2551 		mutex_enter(&t->t_delay_lock);
2552 		if (lwp_timer_enqueue(&lwpt) != 0) {
2553 			mutex_exit(&t->t_delay_lock);
2554 			imm_timeout = 1;
2555 			timedwait = NULL;
2556 		}
2557 	}
2558 	t->t_flag |= T_WAITCVSEM;
2559 	lwp_block(&lwpchan);
2560 
2561 	/*
2562 	 * Nothing should happen to cause the LWP to go to sleep until after
2563 	 * it returns from swtch().
2564 	 */
2565 	if (timedwait)
2566 		mutex_exit(&t->t_delay_lock);
2567 	locked = 0;
2568 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2569 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t))
2570 		setrun(t);
2571 	swtch();
2572 
2573 	/*
2574 	 * We're back, but we need to work out why. Were we interrupted? Did
2575 	 * we time out? Were we granted the lock?
2576 	 */
2577 	error = EAGAIN;
2578 	acquired = (t->t_writer & TRW_LOCK_GRANTED);
2579 	t->t_writer = 0;
2580 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2581 	if (timedwait)
2582 		tim = lwp_timer_dequeue(&lwpt);
2583 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
2584 		error = EINTR;
2585 	else if (imm_timeout || (timedwait && tim == -1))
2586 		error = ETIME;
2587 	lwp->lwp_asleep = 0;
2588 	lwp->lwp_sysabort = 0;
2589 	setallwatch();
2590 
2591 	/*
2592 	 * If we were granted the lock we don't care about EINTR or ETIME.
2593 	 */
2594 	if (acquired)
2595 		error = 0;
2596 
2597 	if (t->t_mstate == LMS_USER_LOCK)
2598 		(void) new_mstate(t, LMS_SYSTEM);
2599 
2600 	if (error)
2601 		return (set_errno(error));
2602 	return (0);
2603 
2604 out_drop:
2605 	/*
2606 	 * Make sure that the user level lock is dropped before returning
2607 	 * to the caller.
2608 	 */
2609 	if (!mlocked) {
2610 		lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2611 		mlocked = 1;
2612 	}
2613 	suword32_noerr((uint32_t *)&mp->mutex_owner, 0);
2614 	suword32_noerr((uint32_t *)&mp->mutex_owner + 1, 0);
2615 	suword32_noerr(&mp->mutex_ownerpid, 0);
2616 	ulock_clear(&mp->mutex_lockw);
2617 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2618 	if (mwaiters != 0) {
2619 		/*
2620 		 * See comment above on lock clearing and lwp_release()
2621 		 * success/failure.
2622 		 */
2623 		if (lwp_release(&mlwpchan, &mwaiters, 0) > 0)
2624 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2625 	}
2626 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2627 	mlocked = 0;
2628 
2629 out_nodrop:
2630 	no_fault();
2631 	if (mwatched)
2632 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2633 	if (watched)
2634 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2635 	if (t->t_mstate == LMS_USER_LOCK)
2636 		(void) new_mstate(t, LMS_SYSTEM);
2637 	if (error)
2638 		return (set_errno(error));
2639 	return (0);
2640 }
2641 
2642 /*
2643  * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
2644  * we never drop the lock.
2645  */
2646 static int
2647 lwp_rwlock_unlock(lwp_rwlock_t *rw)
2648 {
2649 	kthread_t *t = curthread;
2650 	proc_t *p = ttoproc(t);
2651 	lwpchan_t lwpchan;
2652 	volatile uint16_t type = 0;
2653 	volatile int error = 0;
2654 	volatile int locked = 0;
2655 	volatile int watched = 0;
2656 	label_t ljb;
2657 	volatile int no_lwpchan = 1;
2658 	uint32_t rwstate;
2659 
2660 	/* We only check rw because the mutex is included in it. */
2661 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2662 		return (set_errno(EFAULT));
2663 
2664 	if (on_fault(&ljb)) {
2665 		if (no_lwpchan) {
2666 			error = EFAULT;
2667 			goto out_nodrop;
2668 		}
2669 		if (locked) {
2670 			locked = 0;
2671 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2672 		}
2673 		error = EFAULT;
2674 		goto out_nodrop;
2675 	}
2676 
2677 	/* We can only continue for simple USYNC_PROCESS locks. */
2678 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2679 	if (type != USYNC_PROCESS) {
2680 		error = EINVAL;
2681 		goto out_nodrop;
2682 	}
2683 
2684 	/* Force Copy-on-write fault in case objects are MAP_PRIVATE. */
2685 	suword16_noerr(&rw->rwlock_type, type);
2686 
2687 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2688 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2689 	    &lwpchan, LWPCHAN_CVPOOL)) {
2690 		error = EFAULT;
2691 		goto out_nodrop;
2692 	}
2693 
2694 	no_lwpchan = 0;
2695 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2696 
2697 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2698 	locked = 1;
2699 
2700 	/*
2701 	 * We can resolve multiple readers (except the last reader) here.
2702 	 * For the last reader or a writer we need lwp_rwlock_release(),
2703 	 * to which we also delegate the task of copying the new rwstate
2704 	 * back to userland (see the comment there).
2705 	 */
2706 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2707 	if (rwstate & URW_WRITE_LOCKED)
2708 		lwp_rwlock_release(&lwpchan, rw);
2709 	else if ((rwstate & URW_READERS_MASK) > 0) {
2710 		rwstate--;
2711 		if ((rwstate & URW_READERS_MASK) == 0)
2712 			lwp_rwlock_release(&lwpchan, rw);
2713 		else
2714 			suword32_noerr(&rw->rwlock_readers, rwstate);
2715 	}
2716 
2717 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2718 	locked = 0;
2719 	error = 0;
2720 
2721 out_nodrop:
2722 	no_fault();
2723 	if (watched)
2724 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2725 	if (error)
2726 		return (set_errno(error));
2727 	return (0);
2728 }
2729 
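/*
 * System call entry point for the rwlock operations above.  The subcode
 * selects the operation: 0 = read lock, 1 = write lock, 2 = read trylock,
 * 3 = write trylock, 4 = unlock; the timeout is honored only by the
 * blocking (non-try) lock forms.  Any other subcode fails with EINVAL.
 */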
2730 int
2731 lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
2732 {
2733 	switch (subcode) {
2734 	case 0:
2735 		return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
2736 	case 1:
2737 		return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
2738 	case 2:
2739 		return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
2740 	case 3:
2741 		return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
2742 	case 4:
2743 		return (lwp_rwlock_unlock(rwlp));
2744 	}
2745 	return (set_errno(EINVAL));
2746 }
2747 
2748 /*
2749  * Return the owner of the user-level s-object.
2750  * Since we can't really do this, return NULL.
2751  */
2752 /* ARGSUSED */
2753 static kthread_t *
2754 lwpsobj_owner(caddr_t sobj)
2755 {
2756 	return ((kthread_t *)NULL);
2757 }
2758 
2759 /*
2760  * Wake up a thread asleep on a user-level synchronization
2761  * object.
2762  */
2763 static void
2764 lwp_unsleep(kthread_t *t)
2765 {
2766 	ASSERT(THREAD_LOCK_HELD(t));
2767 	if (t->t_wchan0 != NULL) {
2768 		sleepq_head_t *sqh;
2769 		sleepq_t *sqp = t->t_sleepq;
2770 
2771 		if (sqp != NULL) {
2772 			sqh = lwpsqhash(&t->t_lwpchan);
2773 			ASSERT(&sqh->sq_queue == sqp);
2774 			sleepq_unsleep(t);
2775 			disp_lock_exit_high(&sqh->sq_lock);
2776 			CL_SETRUN(t);
2777 			return;
2778 		}
2779 	}
2780 	panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
2781 }
2782 
2783 /*
2784  * Change the priority of a thread asleep on a user-level
2785  * synchronization object. To maintain proper priority order,
2786  * we:
2787  *	o dequeue the thread.
2788  *	o change its priority.
2789  *	o re-enqueue the thread.
2790  * Assumption: the thread is locked on entry.
2791  */
2792 static void
2793 lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
2794 {
2795 	ASSERT(THREAD_LOCK_HELD(t));
2796 	if (t->t_wchan0 != NULL) {
2797 		sleepq_t   *sqp = t->t_sleepq;
2798 
2799 		sleepq_dequeue(t);
2800 		*t_prip = pri;
2801 		sleepq_insert(sqp, t);
2802 	} else
2803 		panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
2804 }
2805 
2806 /*
2807  * Clean up a locked robust mutex owned by the current process.
2808  */
2809 static void
2810 lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
2811 {
2812 	uint16_t flag;
2813 	uchar_t waiters;
2814 	label_t ljb;
2815 	pid_t owner_pid;
2816 	lwp_mutex_t *lp;
2817 	volatile int locked = 0;
2818 	volatile int watched = 0;
2819 
2820 	ASSERT(ent->lwpchan_type & USYNC_PROCESS_ROBUST);
2821 
2822 	lp = (lwp_mutex_t *)ent->lwpchan_addr;
2823 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2824 	if (on_fault(&ljb)) {
2825 		if (locked)
2826 			lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2827 		goto out;
2828 	}
2829 	fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
2830 	if (owner_pid != curproc->p_pid) {
2831 		goto out;
2832 	}
2833 	lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2834 	locked = 1;
2835 	fuword16_noerr(&lp->mutex_flag, &flag);
2836 	if ((flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) == 0) {
2837 		flag |= lockflg;
2838 		suword16_noerr(&lp->mutex_flag, flag);
2839 	}
2840 	suword32_noerr(&lp->mutex_ownerpid, 0);
2841 	ulock_clear(&lp->mutex_lockw);
2842 	fuword8_noerr(&lp->mutex_waiters, &waiters);
2843 	if (waiters && lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
2844 		suword8_noerr(&lp->mutex_waiters, waiters);
2845 	lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2846 out:
2847 	no_fault();
2848 	if (watched)
2849 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2850 }
2851 
2852 /*
2853  * Register the mutex and initialize it if it is not already initialized.
2854  */
2855 int
2856 lwp_mutex_init(lwp_mutex_t *lp, int type)
2857 {
2858 	proc_t *p = curproc;
2859 	int error = 0;
2860 	volatile int locked = 0;
2861 	volatile int watched = 0;
2862 	label_t ljb;
2863 	uint16_t flag;
2864 	lwpchan_t lwpchan;
2865 	pid_t owner_pid;
2866 
2867 	if ((caddr_t)lp >= (caddr_t)USERLIMIT)
2868 		return (set_errno(EFAULT));
2869 
2870 	if (type != USYNC_PROCESS_ROBUST)
2871 		return (set_errno(EINVAL));
2872 
2873 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2874 
2875 	if (on_fault(&ljb)) {
2876 		if (locked)
2877 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
2878 		error = EFAULT;
2879 		goto out;
2880 	}
2881 	/*
2882 	 * Force Copy-on-write fault if lwp_mutex_t object is
2883 	 * defined to be MAP_PRIVATE and it was initialized to
2884 	 * USYNC_PROCESS.
2885 	 */
2886 	suword8_noerr(&lp->mutex_type, type);
2887 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
2888 	    &lwpchan, LWPCHAN_MPPOOL)) {
2889 		error = EFAULT;
2890 		goto out;
2891 	}
2892 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
2893 	locked = 1;
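	/*
	 * If the mutex is already initialized, the only repair performed
	 * here is to clear a stale OWNERDEAD/UNMAPPED indication left
	 * behind by this same process; otherwise the caller gets EBUSY.
	 * An uninitialized mutex is set up fresh: no waiters, not locked,
	 * no owner, and marked LOCK_INITED.
	 */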
2894 	fuword16_noerr(&lp->mutex_flag, &flag);
2895 	if (flag & LOCK_INITED) {
2896 		if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
2897 			fuword32_noerr(&lp->mutex_ownerpid,
2898 			    (uint32_t *)&owner_pid);
2899 			if (owner_pid == p->p_pid) {
2900 				flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2901 				suword16_noerr(&lp->mutex_flag, flag);
2902 				locked = 0;
2903 				lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
2904 				goto out;
2905 			}
2906 		}
2907 		error = EBUSY;
2908 	} else {
2909 		suword8_noerr(&lp->mutex_waiters, 0);
2910 		suword8_noerr(&lp->mutex_lockw, 0);
2911 		suword16_noerr(&lp->mutex_flag, LOCK_INITED);
2912 		suword32_noerr(&lp->mutex_ownerpid, 0);
2913 	}
2914 	locked = 0;
2915 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
2916 out:
2917 	no_fault();
2918 	if (watched)
2919 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2920 	if (error)
2921 		return (set_errno(error));
2922 	return (0);
2923 }
2924 
2925 int
2926 lwp_mutex_trylock(lwp_mutex_t *lp)
2927 {
2928 	kthread_t *t = curthread;
2929 	proc_t *p = ttoproc(t);
2930 	int error = 0;
2931 	volatile int locked = 0;
2932 	volatile int watched = 0;
2933 	label_t ljb;
2934 	volatile uint8_t type = 0;
2935 	uint16_t flag;
2936 	lwpchan_t lwpchan;
2937 
2938 	if ((caddr_t)lp >= p->p_as->a_userlimit)
2939 		return (set_errno(EFAULT));
2940 
2941 	(void) new_mstate(t, LMS_USER_LOCK);
2942 
2943 	if (on_fault(&ljb)) {
2944 		if (locked)
2945 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
2946 		error = EFAULT;
2947 		goto out;
2948 	}
2949 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
2950 	if (UPIMUTEX(type)) {
2951 		no_fault();
2952 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
2953 		if ((error == 0 || error == EOWNERDEAD) &&
2954 		    (type & USYNC_PROCESS))
2955 			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
2956 		if (error)
2957 			return (set_errno(error));
2958 		return (0);
2959 	}
2960 	/*
2961 	 * Force Copy-on-write fault if lwp_mutex_t object is
2962 	 * defined to be MAP_PRIVATE and it was initialized to
2963 	 * USYNC_PROCESS.
2964 	 */
2965 	suword8_noerr(&lp->mutex_type, type);
2966 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
2967 	    &lwpchan, LWPCHAN_MPPOOL)) {
2968 		error = EFAULT;
2969 		goto out;
2970 	}
2971 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
2972 	locked = 1;
2973 	if (type & USYNC_PROCESS_ROBUST) {
2974 		fuword16_noerr((uint16_t *)(&lp->mutex_flag), &flag);
2975 		if (flag & LOCK_NOTRECOVERABLE) {
2976 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
2977 			error = ENOTRECOVERABLE;
2978 			goto out;
2979 		}
2980 	}
2981 
2982 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2983 
2984 	if (!ulock_try(&lp->mutex_lockw))
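	/*
	 * Attempt to grab the user-level lock word without blocking.  On
	 * success, record our pid for process-shared types; for robust
	 * mutexes we still return EOWNERDEAD or ELOCKUNMAPPED so that the
	 * caller knows the lock was acquired in an inconsistent state.
	 */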
2985 		error = EBUSY;
2986 	else if (type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
2987 		suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
2988 		if (type & USYNC_PROCESS_ROBUST) {
2989 			if (flag & LOCK_OWNERDEAD)
2990 				error = EOWNERDEAD;
2991 			else if (flag & LOCK_UNMAPPED)
2992 				error = ELOCKUNMAPPED;
2993 		}
2994 	}
2995 	locked = 0;
2996 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
2997 out:
2998 
2999 	if (t->t_mstate == LMS_USER_LOCK)
3000 		(void) new_mstate(t, LMS_SYSTEM);
3001 
3002 	no_fault();
3003 	if (watched)
3004 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3005 	if (error)
3006 		return (set_errno(error));
3007 	return (0);
3008 }
3009 
3010 /*
3011  * Unlock the mutex and unblock any lwps that are trying to acquire it.
3012  * The blocked lwps resume and retry acquiring the lock.
3013  */
3014 int
3015 lwp_mutex_unlock(lwp_mutex_t *lp)
3016 {
3017 	proc_t *p = ttoproc(curthread);
3018 	lwpchan_t lwpchan;
3019 	uchar_t waiters;
3020 	volatile int locked = 0;
3021 	volatile int watched = 0;
3022 	volatile uint8_t type = 0;
3023 	label_t ljb;
3024 	uint16_t flag;
3025 	int error = 0;
3026 
3027 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3028 		return (set_errno(EFAULT));
3029 
3030 	if (on_fault(&ljb)) {
3031 		if (locked)
3032 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3033 		error = EFAULT;
3034 		goto out;
3035 	}
3036 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3037 	if (UPIMUTEX(type)) {
3038 		no_fault();
3039 		error = lwp_upimutex_unlock(lp, type);
3040 		if (error)
3041 			return (set_errno(error));
3042 		return (0);
3043 	}
3044 
3045 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3046 
3047 	/*
3048 	 * Force Copy-on-write fault if lwp_mutex_t object is
3049 	 * defined to be MAP_PRIVATE, and type is USYNC_PROCESS
3050 	 */
3051 	suword8_noerr(&lp->mutex_type, type);
3052 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3053 	    &lwpchan, LWPCHAN_MPPOOL)) {
3054 		error = EFAULT;
3055 		goto out;
3056 	}
3057 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3058 	locked = 1;
3059 	if (type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
3060 		if (type & USYNC_PROCESS_ROBUST) {
3061 			fuword16_noerr(&lp->mutex_flag, &flag);
3062 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3063 				flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
3064 				flag |= LOCK_NOTRECOVERABLE;
3065 				suword16_noerr(&lp->mutex_flag, flag);
3066 			}
3067 		}
3068 		suword32_noerr(&lp->mutex_ownerpid, 0);
3069 	}
3070 	ulock_clear(&lp->mutex_lockw);
3071 	/*
3072 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
3073 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
3074 	 * may fail; if it fails, do not write into the waiter bit.
3075 	 * The call to lwp_release() can fail for one of three reasons:
3076 	 *
3077 	 *	1. the thread which set the waiter bit is not actually
3078 	 *	   sleeping, because it got the lock on its re-try. The
3079 	 *	   waiter bit will then be correctly updated by that thread.
3080 	 *	   This window could be closed by re-reading the waiter bit
3081 	 *	   here and not calling lwp_release() at all if it is zero.
3082 	 *	2. the thread which set the waiter bit and went to sleep
3083 	 *	   was woken up by a signal. In that case the waiter itself
3084 	 *	   recomputes the waiter bit on its EINTR return path.
3085 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
3086 	 *	   memory that has been re-used after the lock was dropped.
3087 	 *	   In this case, writing into the waiter bit would cause data
3088 	 *	   corruption.
3089 	 */
3090 	fuword8_noerr(&lp->mutex_waiters, &waiters);
3091 	if (waiters) {
3092 		if ((type & USYNC_PROCESS_ROBUST) &&
3093 		    (flag & LOCK_NOTRECOVERABLE)) {
3094 			lwp_release_all(&lwpchan);
3095 			suword8_noerr(&lp->mutex_waiters, 0);
3096 		} else if (lwp_release(&lwpchan, &waiters, 0) == 1) {
3097 			suword8_noerr(&lp->mutex_waiters, waiters);
3098 		}
3099 	}
3100 
3101 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3102 out:
3103 	no_fault();
3104 	if (watched)
3105 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3106 	if (error)
3107 		return (set_errno(error));
3108 	return (0);
3109 }
3110