xref: /titanic_50/usr/src/uts/common/syscall/lwp_sobj.c (revision 20ae46ebaff1237662e05edf9db61538aa85d448)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved	*/
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/user.h>
38 #include <sys/errno.h>
39 #include <sys/file.h>
40 #include <sys/proc.h>
41 #include <sys/prsystm.h>
42 #include <sys/kmem.h>
43 #include <sys/sobject.h>
44 #include <sys/fault.h>
45 #include <sys/procfs.h>
46 #include <sys/watchpoint.h>
47 #include <sys/time.h>
48 #include <sys/cmn_err.h>
49 #include <sys/machlock.h>
50 #include <sys/debug.h>
51 #include <sys/synch.h>
52 #include <sys/synch32.h>
53 #include <sys/mman.h>
54 #include <sys/class.h>
55 #include <sys/schedctl.h>
56 #include <sys/sleepq.h>
57 #include <sys/policy.h>
58 #include <sys/tnf_probe.h>
59 #include <sys/lwpchan_impl.h>
60 #include <sys/turnstile.h>
61 #include <sys/atomic.h>
62 #include <sys/lwp_timer_impl.h>
63 #include <sys/lwp_upimutex_impl.h>
64 #include <vm/as.h>
65 #include <sys/sdt.h>
66 
/*
 * Forward declarations for static functions that are referenced below
 * (by the lwp_sobj_ops vector and by the lwpchan cache code) before
 * their definitions appear.
 */
static kthread_t *lwpsobj_owner(caddr_t);
static void lwp_unsleep(kthread_t *t);
static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);

/* Non-static entry point declared here for use within this file. */
extern int lwp_cond_signal(lwp_cond_t *cv);

/*
 * Maximum number of user prio inheritance locks that can be held by a thread.
 * Used to limit kmem for each thread. This is a per-thread limit that
 * can be administered on a system wide basis (using /etc/system).
 *
 * Also, when a limit, say maxlwps is added for numbers of lwps within a
 * process, the per-thread limit automatically becomes a process-wide limit
 * of maximum number of held upi locks within a process:
 *      maxheldupimx = maxnestupimx * maxlwps;
 *
 * lwp_upimutex_lock() returns ENOMEM when a thread exceeds this limit
 * and secpolicy_resource() returns non-zero for its credentials.
 */
static uint32_t maxnestupimx = 2000;

/*
 * The sobj_ops vector exports a set of functions needed when a thread
 * is asleep on a synchronization object of this type.
 * lwp_block() installs this vector in t_sobj_ops before sleeping.
 */
static sobj_ops_t lwp_sobj_ops = {
	SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
};

static kthread_t *lwpsobj_pi_owner(upimutex_t *up);

/*
 * The sobj_ops vector for priority-inheriting user-level mutexes.
 * lwp_upimutex_lock() passes it to turnstile_block().
 */
static sobj_ops_t lwp_sobj_pi_ops = {
	SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
	turnstile_change_pri
};

/* Sleep queue heads used by lwp_block(); indexed through lwpsqhash(). */
static sleepq_head_t	lwpsleepq[NSLEEPQ];
/*
 * Table of upimutex hash buckets.
 * NOTE(review): presumably the table behind the UPI_CHAIN() macro --
 * confirm against <sys/lwp_upimutex_impl.h>.
 */
upib_t			upimutextab[UPIMUTEX_TABSIZE];

#define	LWPCHAN_LOCK_SHIFT	10	/* 1024 locks for each pool */
#define	LWPCHAN_LOCK_SIZE	(1 << LWPCHAN_LOCK_SHIFT)

/*
 * We know that both lc_wchan and lc_wchan0 are addresses that most
 * likely are 8-byte aligned, so we shift off the low-order 3 bits.
 * 'pool' is either 0 or 1.  A non-zero pool selects the upper
 * LWPCHAN_LOCK_SIZE entries of lwpchanlock[].
 */
#define	LWPCHAN_LOCK_HASH(X, pool) \
	(((((X) >> 3) ^ ((X) >> (LWPCHAN_LOCK_SHIFT + 3))) & \
	(LWPCHAN_LOCK_SIZE - 1)) + ((pool)? LWPCHAN_LOCK_SIZE : 0))

/*
 * Two pools of LWPCHAN_LOCK_SIZE mutexes each; lwpchan_lock() and
 * lwpchan_unlock() select an entry with LWPCHAN_LOCK_HASH().
 */
static kmutex_t		lwpchanlock[2 * LWPCHAN_LOCK_SIZE];

/*
 * Is this a POSIX threads user-level lock requiring priority inheritance?
 * Evaluates to non-zero when LOCK_PRIO_INHERIT is set in 'type'.
 */
#define	UPIMUTEX(type)	((type) & LOCK_PRIO_INHERIT)
122 
123 static sleepq_head_t *
124 lwpsqhash(lwpchan_t *lwpchan)
125 {
126 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
127 	return (&lwpsleepq[SQHASHINDEX(x)]);
128 }
129 
130 /*
131  * Lock an lwpchan.
132  * Keep this in sync with lwpchan_unlock(), below.
133  */
134 static void
135 lwpchan_lock(lwpchan_t *lwpchan, int pool)
136 {
137 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
138 	mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
139 }
140 
141 /*
142  * Unlock an lwpchan.
143  * Keep this in sync with lwpchan_lock(), above.
144  */
145 static void
146 lwpchan_unlock(lwpchan_t *lwpchan, int pool)
147 {
148 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
149 	mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
150 }
151 
152 /*
153  * Delete mappings from the lwpchan cache for pages that are being
154  * unmapped by as_unmap().  Given a range of addresses, "start" to "end",
155  * all mappings within the range are deleted from the lwpchan cache.
156  */
157 void
158 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
159 {
160 	lwpchan_data_t *lcp;
161 	lwpchan_hashbucket_t *hashbucket;
162 	lwpchan_hashbucket_t *endbucket;
163 	lwpchan_entry_t *ent;
164 	lwpchan_entry_t **prev;
165 	caddr_t addr;
166 
167 	mutex_enter(&p->p_lcp_lock);
168 	lcp = p->p_lcp;
169 	hashbucket = lcp->lwpchan_cache;
170 	endbucket = hashbucket + lcp->lwpchan_size;
171 	for (; hashbucket < endbucket; hashbucket++) {
172 		if (hashbucket->lwpchan_chain == NULL)
173 			continue;
174 		mutex_enter(&hashbucket->lwpchan_lock);
175 		prev = &hashbucket->lwpchan_chain;
176 		/* check entire chain */
177 		while ((ent = *prev) != NULL) {
178 			addr = ent->lwpchan_addr;
179 			if (start <= addr && addr < end) {
180 				*prev = ent->lwpchan_next;
181 				/*
182 				 * We do this only for the obsolete type
183 				 * USYNC_PROCESS_ROBUST.  Otherwise robust
184 				 * locks do not draw ELOCKUNMAPPED or
185 				 * EOWNERDEAD due to being unmapped.
186 				 */
187 				if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
188 				    (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
189 					lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
190 				kmem_free(ent, sizeof (*ent));
191 				atomic_add_32(&lcp->lwpchan_entries, -1);
192 			} else {
193 				prev = &ent->lwpchan_next;
194 			}
195 		}
196 		mutex_exit(&hashbucket->lwpchan_lock);
197 	}
198 	mutex_exit(&p->p_lcp_lock);
199 }
200 
201 /*
202  * Given an lwpchan cache pointer and a process virtual address,
203  * return a pointer to the corresponding lwpchan hash bucket.
204  */
205 static lwpchan_hashbucket_t *
206 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
207 {
208 	uint_t i;
209 
210 	/*
211 	 * All user-level sync object addresses are 8-byte aligned.
212 	 * Ignore the lowest 3 bits of the address and use the
213 	 * higher-order 2*lwpchan_bits bits for the hash index.
214 	 */
215 	addr >>= 3;
216 	i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
217 	return (lcp->lwpchan_cache + i);
218 }
219 
220 /*
221  * (Re)allocate the per-process lwpchan cache.
222  */
223 static void
224 lwpchan_alloc_cache(proc_t *p, uint_t bits)
225 {
226 	lwpchan_data_t *lcp;
227 	lwpchan_data_t *old_lcp;
228 	lwpchan_hashbucket_t *hashbucket;
229 	lwpchan_hashbucket_t *endbucket;
230 	lwpchan_hashbucket_t *newbucket;
231 	lwpchan_entry_t *ent;
232 	lwpchan_entry_t *next;
233 	uint_t count;
234 
235 	ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
236 
237 	lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
238 	lcp->lwpchan_bits = bits;
239 	lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
240 	lcp->lwpchan_mask = lcp->lwpchan_size - 1;
241 	lcp->lwpchan_entries = 0;
242 	lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
243 	    sizeof (lwpchan_hashbucket_t), KM_SLEEP);
244 	lcp->lwpchan_next_data = NULL;
245 
246 	mutex_enter(&p->p_lcp_lock);
247 	if ((old_lcp = p->p_lcp) != NULL) {
248 		if (old_lcp->lwpchan_bits >= bits) {
249 			/* someone beat us to it */
250 			mutex_exit(&p->p_lcp_lock);
251 			kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
252 			    sizeof (lwpchan_hashbucket_t));
253 			kmem_free(lcp, sizeof (lwpchan_data_t));
254 			return;
255 		}
256 		/*
257 		 * Acquire all of the old hash table locks.
258 		 */
259 		hashbucket = old_lcp->lwpchan_cache;
260 		endbucket = hashbucket + old_lcp->lwpchan_size;
261 		for (; hashbucket < endbucket; hashbucket++)
262 			mutex_enter(&hashbucket->lwpchan_lock);
263 		/*
264 		 * Move all of the old hash table entries to the
265 		 * new hash table.  The new hash table has not yet
266 		 * been installed so we don't need any of its locks.
267 		 */
268 		count = 0;
269 		hashbucket = old_lcp->lwpchan_cache;
270 		for (; hashbucket < endbucket; hashbucket++) {
271 			ent = hashbucket->lwpchan_chain;
272 			while (ent != NULL) {
273 				next = ent->lwpchan_next;
274 				newbucket = lwpchan_bucket(lcp,
275 				    (uintptr_t)ent->lwpchan_addr);
276 				ent->lwpchan_next = newbucket->lwpchan_chain;
277 				newbucket->lwpchan_chain = ent;
278 				ent = next;
279 				count++;
280 			}
281 			hashbucket->lwpchan_chain = NULL;
282 		}
283 		lcp->lwpchan_entries = count;
284 	}
285 
286 	/*
287 	 * Retire the old hash table.  We can't actually kmem_free() it
288 	 * now because someone may still have a pointer to it.  Instead,
289 	 * we link it onto the new hash table's list of retired hash tables.
290 	 * The new hash table is double the size of the previous one, so
291 	 * the total size of all retired hash tables is less than the size
292 	 * of the new one.  exit() and exec() free the retired hash tables
293 	 * (see lwpchan_destroy_cache(), below).
294 	 */
295 	lcp->lwpchan_next_data = old_lcp;
296 
297 	/*
298 	 * As soon as we store the new lcp, future locking operations will
299 	 * use it.  Therefore, we must ensure that all the state we've just
300 	 * established reaches global visibility before the new lcp does.
301 	 */
302 	membar_producer();
303 	p->p_lcp = lcp;
304 
305 	if (old_lcp != NULL) {
306 		/*
307 		 * Release all of the old hash table locks.
308 		 */
309 		hashbucket = old_lcp->lwpchan_cache;
310 		for (; hashbucket < endbucket; hashbucket++)
311 			mutex_exit(&hashbucket->lwpchan_lock);
312 	}
313 	mutex_exit(&p->p_lcp_lock);
314 }
315 
316 /*
317  * Deallocate the lwpchan cache, and any dynamically allocated mappings.
318  * Called when the process exits or execs.  All lwps except one have
319  * exited so we need no locks here.
320  */
321 void
322 lwpchan_destroy_cache(int exec)
323 {
324 	proc_t *p = curproc;
325 	lwpchan_hashbucket_t *hashbucket;
326 	lwpchan_hashbucket_t *endbucket;
327 	lwpchan_data_t *lcp;
328 	lwpchan_entry_t *ent;
329 	lwpchan_entry_t *next;
330 	uint16_t lockflg;
331 
332 	lcp = p->p_lcp;
333 	p->p_lcp = NULL;
334 
335 	lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
336 	hashbucket = lcp->lwpchan_cache;
337 	endbucket = hashbucket + lcp->lwpchan_size;
338 	for (; hashbucket < endbucket; hashbucket++) {
339 		ent = hashbucket->lwpchan_chain;
340 		hashbucket->lwpchan_chain = NULL;
341 		while (ent != NULL) {
342 			next = ent->lwpchan_next;
343 			if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
344 			    (ent->lwpchan_type & LOCK_ROBUST))
345 				lwp_mutex_cleanup(ent, lockflg);
346 			kmem_free(ent, sizeof (*ent));
347 			ent = next;
348 		}
349 	}
350 
351 	while (lcp != NULL) {
352 		lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
353 		kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
354 		    sizeof (lwpchan_hashbucket_t));
355 		kmem_free(lcp, sizeof (lwpchan_data_t));
356 		lcp = next_lcp;
357 	}
358 }
359 
360 /*
361  * Return zero when there is an entry in the lwpchan cache for the
362  * given process virtual address and non-zero when there is not.
363  * The returned non-zero value is the current length of the
364  * hash chain plus one.  The caller holds the hash bucket lock.
365  */
366 static uint_t
367 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
368 	lwpchan_hashbucket_t *hashbucket)
369 {
370 	lwpchan_entry_t *ent;
371 	uint_t count = 1;
372 
373 	for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
374 		if (ent->lwpchan_addr == addr) {
375 			if (ent->lwpchan_type != type ||
376 			    ent->lwpchan_pool != pool) {
377 				/*
378 				 * This shouldn't happen, but might if the
379 				 * process reuses its memory for different
380 				 * types of sync objects.  We test first
381 				 * to avoid grabbing the memory cache line.
382 				 */
383 				ent->lwpchan_type = (uint16_t)type;
384 				ent->lwpchan_pool = (uint16_t)pool;
385 			}
386 			*lwpchan = ent->lwpchan_lwpchan;
387 			return (0);
388 		}
389 		count++;
390 	}
391 	return (count);
392 }
393 
394 /*
395  * Return the cached lwpchan mapping if cached, otherwise insert
396  * a virtual address to lwpchan mapping into the cache.
397  */
398 static int
399 lwpchan_get_mapping(struct as *as, caddr_t addr,
400 	int type, lwpchan_t *lwpchan, int pool)
401 {
402 	proc_t *p = curproc;
403 	lwpchan_data_t *lcp;
404 	lwpchan_hashbucket_t *hashbucket;
405 	lwpchan_entry_t *ent;
406 	memid_t	memid;
407 	uint_t count;
408 	uint_t bits;
409 
410 top:
411 	/* initialize the lwpchan cache, if necesary */
412 	if ((lcp = p->p_lcp) == NULL) {
413 		lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
414 		goto top;
415 	}
416 	hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
417 	mutex_enter(&hashbucket->lwpchan_lock);
418 	if (lcp != p->p_lcp) {
419 		/* someone resized the lwpchan cache; start over */
420 		mutex_exit(&hashbucket->lwpchan_lock);
421 		goto top;
422 	}
423 	if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
424 		/* it's in the cache */
425 		mutex_exit(&hashbucket->lwpchan_lock);
426 		return (1);
427 	}
428 	mutex_exit(&hashbucket->lwpchan_lock);
429 	if (as_getmemid(as, addr, &memid) != 0)
430 		return (0);
431 	lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
432 	lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
433 	ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
434 	mutex_enter(&hashbucket->lwpchan_lock);
435 	if (lcp != p->p_lcp) {
436 		/* someone resized the lwpchan cache; start over */
437 		mutex_exit(&hashbucket->lwpchan_lock);
438 		kmem_free(ent, sizeof (*ent));
439 		goto top;
440 	}
441 	count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
442 	if (count == 0) {
443 		/* someone else added this entry to the cache */
444 		mutex_exit(&hashbucket->lwpchan_lock);
445 		kmem_free(ent, sizeof (*ent));
446 		return (1);
447 	}
448 	if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
449 	    (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
450 		/* hash chain too long; reallocate the hash table */
451 		mutex_exit(&hashbucket->lwpchan_lock);
452 		kmem_free(ent, sizeof (*ent));
453 		lwpchan_alloc_cache(p, bits + 1);
454 		goto top;
455 	}
456 	ent->lwpchan_addr = addr;
457 	ent->lwpchan_type = (uint16_t)type;
458 	ent->lwpchan_pool = (uint16_t)pool;
459 	ent->lwpchan_lwpchan = *lwpchan;
460 	ent->lwpchan_next = hashbucket->lwpchan_chain;
461 	hashbucket->lwpchan_chain = ent;
462 	atomic_add_32(&lcp->lwpchan_entries, 1);
463 	mutex_exit(&hashbucket->lwpchan_lock);
464 	return (1);
465 }
466 
467 /*
468  * Return a unique pair of identifiers that corresponds to a
469  * synchronization object's virtual address.  Process-shared
470  * sync objects usually get vnode/offset from as_getmemid().
471  */
472 static int
473 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
474 {
475 	/*
476 	 * If the lwp synch object is defined to be process-private,
477 	 * we just make the first field of the lwpchan be 'as' and
478 	 * the second field be the synch object's virtual address.
479 	 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
480 	 * The lwpchan cache is used only for process-shared objects.
481 	 */
482 	if (!(type & USYNC_PROCESS)) {
483 		lwpchan->lc_wchan0 = (caddr_t)as;
484 		lwpchan->lc_wchan = addr;
485 		return (1);
486 	}
487 
488 	return (lwpchan_get_mapping(as, addr, type, lwpchan, pool));
489 }
490 
491 static void
492 lwp_block(lwpchan_t *lwpchan)
493 {
494 	kthread_t *t = curthread;
495 	klwp_t *lwp = ttolwp(t);
496 	sleepq_head_t *sqh;
497 
498 	thread_lock(t);
499 	t->t_flag |= T_WAKEABLE;
500 	t->t_lwpchan = *lwpchan;
501 	t->t_sobj_ops = &lwp_sobj_ops;
502 	t->t_release = 0;
503 	sqh = lwpsqhash(lwpchan);
504 	disp_lock_enter_high(&sqh->sq_lock);
505 	CL_SLEEP(t);
506 	DTRACE_SCHED(sleep);
507 	THREAD_SLEEP(t, &sqh->sq_lock);
508 	sleepq_insert(&sqh->sq_queue, t);
509 	thread_unlock(t);
510 	lwp->lwp_asleep = 1;
511 	lwp->lwp_sysabort = 0;
512 	lwp->lwp_ru.nvcsw++;
513 	(void) new_mstate(curthread, LMS_SLEEP);
514 }
515 
516 static kthread_t *
517 lwpsobj_pi_owner(upimutex_t *up)
518 {
519 	return (up->upi_owner);
520 }
521 
522 static struct upimutex *
523 upi_get(upib_t *upibp, lwpchan_t *lcp)
524 {
525 	struct upimutex *upip;
526 
527 	for (upip = upibp->upib_first; upip != NULL;
528 	    upip = upip->upi_nextchain) {
529 		if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
530 		    upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
531 			break;
532 	}
533 	return (upip);
534 }
535 
536 static void
537 upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
538 {
539 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
540 
541 	/*
542 	 * Insert upimutex at front of list. Maybe a bit unfair
543 	 * but assume that not many lwpchans hash to the same
544 	 * upimutextab bucket, i.e. the list of upimutexes from
545 	 * upib_first is not too long.
546 	 */
547 	upimutex->upi_nextchain = upibp->upib_first;
548 	upibp->upib_first = upimutex;
549 }
550 
551 static void
552 upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
553 {
554 	struct upimutex **prev;
555 
556 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
557 
558 	prev = &upibp->upib_first;
559 	while (*prev != upimutex) {
560 		prev = &(*prev)->upi_nextchain;
561 	}
562 	*prev = upimutex->upi_nextchain;
563 	upimutex->upi_nextchain = NULL;
564 }
565 
566 /*
567  * Add upimutex to chain of upimutexes held by curthread.
568  * Returns number of upimutexes held by curthread.
569  */
570 static uint32_t
571 upi_mylist_add(struct upimutex *upimutex)
572 {
573 	kthread_t *t = curthread;
574 
575 	/*
576 	 * Insert upimutex at front of list of upimutexes owned by t. This
577 	 * would match typical LIFO order in which nested locks are acquired
578 	 * and released.
579 	 */
580 	upimutex->upi_nextowned = t->t_upimutex;
581 	t->t_upimutex = upimutex;
582 	t->t_nupinest++;
583 	ASSERT(t->t_nupinest > 0);
584 	return (t->t_nupinest);
585 }
586 
587 /*
588  * Delete upimutex from list of upimutexes owned by curthread.
589  */
590 static void
591 upi_mylist_del(struct upimutex *upimutex)
592 {
593 	kthread_t *t = curthread;
594 	struct upimutex **prev;
595 
596 	/*
597 	 * Since the order in which nested locks are acquired and released,
598 	 * is typically LIFO, and typical nesting levels are not too deep, the
599 	 * following should not be expensive in the general case.
600 	 */
601 	prev = &t->t_upimutex;
602 	while (*prev != upimutex) {
603 		prev = &(*prev)->upi_nextowned;
604 	}
605 	*prev = upimutex->upi_nextowned;
606 	upimutex->upi_nextowned = NULL;
607 	ASSERT(t->t_nupinest > 0);
608 	t->t_nupinest--;
609 }
610 
611 /*
612  * Returns true if upimutex is owned. Should be called only when upim points
613  * to kmem which cannot disappear from underneath.
614  */
615 static int
616 upi_owned(upimutex_t *upim)
617 {
618 	return (upim->upi_owner == curthread);
619 }
620 
621 /*
622  * Returns pointer to kernel object (upimutex_t *) if lp is owned.
623  */
624 static struct upimutex *
625 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
626 {
627 	lwpchan_t lwpchan;
628 	upib_t *upibp;
629 	struct upimutex *upimutex;
630 
631 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
632 	    &lwpchan, LWPCHAN_MPPOOL))
633 		return (NULL);
634 
635 	upibp = &UPI_CHAIN(lwpchan);
636 	mutex_enter(&upibp->upib_lock);
637 	upimutex = upi_get(upibp, &lwpchan);
638 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
639 		mutex_exit(&upibp->upib_lock);
640 		return (NULL);
641 	}
642 	mutex_exit(&upibp->upib_lock);
643 	return (upimutex);
644 }
645 
646 /*
647  * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
648  * no lock hand-off occurrs.
649  */
650 static void
651 upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
652 {
653 	turnstile_t *ts;
654 	upib_t *upibp;
655 	kthread_t *newowner;
656 
657 	upi_mylist_del(upimutex);
658 	upibp = upimutex->upi_upibp;
659 	mutex_enter(&upibp->upib_lock);
660 	if (upimutex->upi_waiter != 0) { /* if waiters */
661 		ts = turnstile_lookup(upimutex);
662 		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
663 			/* hand-off lock to highest prio waiter */
664 			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
665 			upimutex->upi_owner = newowner;
666 			if (ts->ts_waiters == 1)
667 				upimutex->upi_waiter = 0;
668 			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
669 			mutex_exit(&upibp->upib_lock);
670 			return;
671 		} else if (ts != NULL) {
672 			/* LOCK_NOTRECOVERABLE: wakeup all */
673 			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
674 		} else {
675 			/*
676 			 * Misleading w bit. Waiters might have been
677 			 * interrupted. No need to clear the w bit (upimutex
678 			 * will soon be freed). Re-calculate PI from existing
679 			 * waiters.
680 			 */
681 			turnstile_exit(upimutex);
682 			turnstile_pi_recalc();
683 		}
684 	}
685 	/*
686 	 * no waiters, or LOCK_NOTRECOVERABLE.
687 	 * remove from the bucket chain of upi mutexes.
688 	 * de-allocate kernel memory (upimutex).
689 	 */
690 	upi_chain_del(upimutex->upi_upibp, upimutex);
691 	mutex_exit(&upibp->upib_lock);
692 	kmem_free(upimutex, sizeof (upimutex_t));
693 }
694 
695 static int
696 lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
697 {
698 	label_t ljb;
699 	int error = 0;
700 	lwpchan_t lwpchan;
701 	uint16_t flag;
702 	upib_t *upibp;
703 	volatile struct upimutex *upimutex = NULL;
704 	turnstile_t *ts;
705 	uint32_t nupinest;
706 	volatile int upilocked = 0;
707 
708 	if (on_fault(&ljb)) {
709 		if (upilocked)
710 			upimutex_unlock((upimutex_t *)upimutex, 0);
711 		error = EFAULT;
712 		goto out;
713 	}
714 	/*
715 	 * The apparent assumption made in implementing other _lwp_* synch
716 	 * primitives, is that get_lwpchan() does not return a unique cookie
717 	 * for the case where 2 processes (one forked from the other) point
718 	 * at the same underlying object, which is typed USYNC_PROCESS, but
719 	 * mapped MAP_PRIVATE, since the object has not yet been written to,
720 	 * in the child process.
721 	 *
722 	 * Since get_lwpchan() has been fixed, it is not necessary to do the
723 	 * dummy writes to force a COW fault as in other places (which should
724 	 * be fixed).
725 	 */
726 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
727 	    &lwpchan, LWPCHAN_MPPOOL)) {
728 		error = EFAULT;
729 		goto out;
730 	}
731 	upibp = &UPI_CHAIN(lwpchan);
732 retry:
733 	mutex_enter(&upibp->upib_lock);
734 	upimutex = upi_get(upibp, &lwpchan);
735 	if (upimutex == NULL)  {
736 		/* lock available since lwpchan has no upimutex */
737 		upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
738 		upi_chain_add(upibp, (upimutex_t *)upimutex);
739 		upimutex->upi_owner = curthread; /* grab lock */
740 		upimutex->upi_upibp = upibp;
741 		upimutex->upi_vaddr = lp;
742 		upimutex->upi_lwpchan = lwpchan;
743 		mutex_exit(&upibp->upib_lock);
744 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
745 		upilocked = 1;
746 		fuword16_noerr(&lp->mutex_flag, &flag);
747 		if (nupinest > maxnestupimx &&
748 		    secpolicy_resource(CRED()) != 0) {
749 			upimutex_unlock((upimutex_t *)upimutex, flag);
750 			error = ENOMEM;
751 			goto out;
752 		}
753 		if (flag & LOCK_NOTRECOVERABLE) {
754 			/*
755 			 * Since the setting of LOCK_NOTRECOVERABLE
756 			 * was done under the high-level upi mutex,
757 			 * in lwp_upimutex_unlock(), this flag needs to
758 			 * be checked while holding the upi mutex.
759 			 * If set, this thread should return without
760 			 * the lock held, and with the right error code.
761 			 */
762 			upimutex_unlock((upimutex_t *)upimutex, flag);
763 			upilocked = 0;
764 			error = ENOTRECOVERABLE;
765 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
766 			if (flag & LOCK_OWNERDEAD)
767 				error = EOWNERDEAD;
768 			else if (type & USYNC_PROCESS_ROBUST)
769 				error = ELOCKUNMAPPED;
770 			else
771 				error = EOWNERDEAD;
772 		}
773 		goto out;
774 	}
775 	/*
776 	 * If a upimutex object exists, it must have an owner.
777 	 * This is due to lock hand-off, and release of upimutex when no
778 	 * waiters are present at unlock time,
779 	 */
780 	ASSERT(upimutex->upi_owner != NULL);
781 	if (upimutex->upi_owner == curthread) {
782 		/*
783 		 * The user wrapper can check if the mutex type is
784 		 * ERRORCHECK: if not, it should stall at user-level.
785 		 * If so, it should return the error code.
786 		 */
787 		mutex_exit(&upibp->upib_lock);
788 		error = EDEADLK;
789 		goto out;
790 	}
791 	if (try == UPIMUTEX_TRY) {
792 		mutex_exit(&upibp->upib_lock);
793 		error = EBUSY;
794 		goto out;
795 	}
796 	/*
797 	 * Block for the lock.
798 	 * Put the lwp in an orderly state for debugging.
799 	 * Calling prstop() has to be done here, and not in
800 	 * turnstile_block(), since the preceding call to
801 	 * turnstile_lookup() raises the PIL to a level
802 	 * at which calls to prstop() should not be made.
803 	 */
804 	if ((error = lwptp->lwpt_time_error) != 0) {
805 		/*
806 		 * The SUSV3 Posix spec is very clear that we
807 		 * should get no error from validating the
808 		 * timer until we would actually sleep.
809 		 */
810 		mutex_exit(&upibp->upib_lock);
811 		goto out;
812 	}
813 	prstop(PR_REQUESTED, 0);
814 	if (lwptp->lwpt_tsp != NULL) {
815 		/*
816 		 * If we successfully queue the timeout
817 		 * (lwp_timer_enqueue() returns zero),
818 		 * then don't drop t_delay_lock until we are
819 		 * on the sleep queue (in turnstile_block()).
820 		 * Otherwise we will get an immediate timeout
821 		 * when we attempt to sleep in turnstile_block().
822 		 */
823 		mutex_enter(&curthread->t_delay_lock);
824 		if (lwp_timer_enqueue(lwptp) != 0)
825 			mutex_exit(&curthread->t_delay_lock);
826 	}
827 	/*
828 	 * Now, set the waiter bit and block for the lock in turnstile_block().
829 	 * No need to preserve the previous wbit since a lock try is not
830 	 * attempted after setting the wait bit. Wait bit is set under
831 	 * the upib_lock, which is not released until the turnstile lock
832 	 * is acquired. Say, the upimutex is L:
833 	 *
834 	 * 1. upib_lock is held so the waiter does not have to retry L after
835 	 *    setting the wait bit: since the owner has to grab the upib_lock
836 	 *    to unlock L, it will certainly see the wait bit set.
837 	 * 2. upib_lock is not released until the turnstile lock is acquired.
838 	 *    This is the key to preventing a missed wake-up. Otherwise, the
839 	 *    owner could acquire the upib_lock, and the tc_lock, to call
840 	 *    turnstile_wakeup(). All this, before the waiter gets tc_lock
841 	 *    to sleep in turnstile_block(). turnstile_wakeup() will then not
842 	 *    find this waiter, resulting in the missed wakeup.
843 	 * 3. The upib_lock, being a kernel mutex, cannot be released while
844 	 *    holding the tc_lock (since mutex_exit() could need to acquire
845 	 *    the same tc_lock)...and so is held when calling turnstile_block().
846 	 *    The address of upib_lock is passed to turnstile_block() which
847 	 *    releases it after releasing all turnstile locks, and before going
848 	 *    to sleep in swtch().
849 	 * 4. The waiter value cannot be a count of waiters, because a waiter
850 	 *    can be interrupted. The interrupt occurs under the tc_lock, at
851 	 *    which point, the upib_lock cannot be locked, to decrement waiter
852 	 *    count. So, just treat the waiter state as a bit, not a count.
853 	 */
854 	ts = turnstile_lookup((upimutex_t *)upimutex);
855 	upimutex->upi_waiter = 1;
856 	error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
857 	    &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
858 	/*
859 	 * Hand-off implies that we wakeup holding the lock, except when:
860 	 *	- deadlock is detected
861 	 *	- lock is not recoverable
862 	 *	- we got an interrupt or timeout
863 	 * If we wake up due to an interrupt or timeout, we may
864 	 * or may not be holding the lock due to mutex hand-off.
865 	 * Use lwp_upimutex_owned() to check if we do hold the lock.
866 	 */
867 	if (error != 0) {
868 		if ((error == EINTR || error == ETIME) &&
869 		    (upimutex = lwp_upimutex_owned(lp, type))) {
870 			/*
871 			 * Unlock and return - the re-startable syscall will
872 			 * try the lock again if we got EINTR.
873 			 */
874 			(void) upi_mylist_add((upimutex_t *)upimutex);
875 			upimutex_unlock((upimutex_t *)upimutex, 0);
876 		}
877 		/*
878 		 * The only other possible error is EDEADLK.  If so, upimutex
879 		 * is valid, since its owner is deadlocked with curthread.
880 		 */
881 		ASSERT(error == EINTR || error == ETIME ||
882 		    (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
883 		ASSERT(!lwp_upimutex_owned(lp, type));
884 		goto out;
885 	}
886 	if (lwp_upimutex_owned(lp, type)) {
887 		ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
888 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
889 		upilocked = 1;
890 	}
891 	/*
892 	 * Now, need to read the user-level lp->mutex_flag to do the following:
893 	 *
894 	 * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED
895 	 *   should be returned.
896 	 * - if lock isn't held, check if ENOTRECOVERABLE should
897 	 *   be returned.
898 	 *
899 	 * Now, either lp->mutex_flag is readable or it's not. If not
900 	 * readable, the on_fault path will cause a return with EFAULT
901 	 * as it should.  If it is readable, the state of the flag
902 	 * encodes the robustness state of the lock:
903 	 *
904 	 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD
905 	 * or LOCK_UNMAPPED setting will influence the return code
906 	 * appropriately.  If the upimutex is not locked here, this
907 	 * could be due to a spurious wake-up or a NOTRECOVERABLE
908 	 * event.  The flag's setting can be used to distinguish
909 	 * between these two events.
910 	 */
911 	fuword16_noerr(&lp->mutex_flag, &flag);
912 	if (upilocked) {
913 		/*
914 		 * If the thread wakes up from turnstile_block with the lock
915 		 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
916 		 * since it would not have been handed-off the lock.
917 		 * So, no need to check for this case.
918 		 */
919 		if (nupinest > maxnestupimx &&
920 		    secpolicy_resource(CRED()) != 0) {
921 			upimutex_unlock((upimutex_t *)upimutex, flag);
922 			upilocked = 0;
923 			error = ENOMEM;
924 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
925 			if (flag & LOCK_OWNERDEAD)
926 				error = EOWNERDEAD;
927 			else if (type & USYNC_PROCESS_ROBUST)
928 				error = ELOCKUNMAPPED;
929 			else
930 				error = EOWNERDEAD;
931 		}
932 	} else {
933 		/*
934 		 * Wake-up without the upimutex held. Either this is a
935 		 * spurious wake-up (due to signals, forkall(), whatever), or
936 		 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
937 		 * of the mutex flag can be used to distinguish between the
938 		 * two events.
939 		 */
940 		if (flag & LOCK_NOTRECOVERABLE) {
941 			error = ENOTRECOVERABLE;
942 		} else {
943 			/*
944 			 * Here, the flag could be set to LOCK_OWNERDEAD or
945 			 * not. In both cases, this is a spurious wakeup,
946 			 * since the upi lock is not held, but the thread
947 			 * has returned from turnstile_block().
948 			 *
949 			 * The user flag could be LOCK_OWNERDEAD if, at the
950 			 * same time as curthread having been woken up
951 			 * spuriously, the owner (say Tdead) has died, marked
952 			 * the mutex flag accordingly, and handed off the lock
953 			 * to some other waiter (say Tnew). curthread just
954 			 * happened to read the flag while Tnew has yet to deal
955 			 * with the owner-dead event.
956 			 *
957 			 * In this event, curthread should retry the lock.
958 			 * If Tnew is able to cleanup the lock, curthread
959 			 * will eventually get the lock with a zero error code,
960 			 * If Tnew is unable to cleanup, its eventual call to
961 			 * unlock the lock will result in the mutex flag being
962 			 * set to LOCK_NOTRECOVERABLE, and the wake-up of
963 			 * all waiters, including curthread, which will then
964 			 * eventually return ENOTRECOVERABLE due to the above
965 			 * check.
966 			 *
967 			 * Of course, if the user-flag is not set with
968 			 * LOCK_OWNERDEAD, retrying is the thing to do, since
969 			 * this is definitely a spurious wakeup.
970 			 */
971 			goto retry;
972 		}
973 	}
974 
975 out:
976 	no_fault();
977 	return (error);
978 }
979 
980 
/*
 * Release the kernel upimutex that backs the user-level mutex at "lp",
 * on behalf of curthread.
 *
 * Returns 0 on success, EPERM if no upimutex is found for lp's lwpchan or
 * its owner is not curthread, and EFAULT if get_lwpchan() fails or an
 * access to the user-level mutex faults.
 */
981 static int
982 lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
983 {
984 	label_t ljb;
985 	int error = 0;
986 	lwpchan_t lwpchan;
987 	uint16_t flag;
988 	upib_t *upibp;
	/* The next two are examined by the on_fault() handler below. */
989 	volatile struct upimutex *upimutex = NULL;
990 	volatile int upilocked = 0;
991 
	/*
	 * Fault recovery: if the fault hit after ownership was verified
	 * (upilocked set), the upimutex is still released, with a flag
	 * value of 0, before EFAULT is returned.
	 */
992 	if (on_fault(&ljb)) {
993 		if (upilocked)
994 			upimutex_unlock((upimutex_t *)upimutex, 0);
995 		error = EFAULT;
996 		goto out;
997 	}
998 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
999 	    &lwpchan, LWPCHAN_MPPOOL)) {
1000 		error = EFAULT;
1001 		goto out;
1002 	}
	/* Look the upimutex up in its hash chain, under the chain lock. */
1003 	upibp = &UPI_CHAIN(lwpchan);
1004 	mutex_enter(&upibp->upib_lock);
1005 	upimutex = upi_get(upibp, &lwpchan);
1006 	/*
1007 	 * If the lock is not held, or the owner is not curthread, return
1008 	 * error. The user-level wrapper can return this error or stall,
1009 	 * depending on whether mutex is of ERRORCHECK type or not.
1010 	 */
1011 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
1012 		mutex_exit(&upibp->upib_lock);
1013 		error = EPERM;
1014 		goto out;
1015 	}
1016 	mutex_exit(&upibp->upib_lock); /* release for user memory access */
1017 	upilocked = 1;
1018 	fuword16_noerr(&lp->mutex_flag, &flag);
1019 	if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1020 		/*
1021 		 * transition mutex to the LOCK_NOTRECOVERABLE state.
1022 		 */
1023 		flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
1024 		flag |= LOCK_NOTRECOVERABLE;
1025 		suword16_noerr(&lp->mutex_flag, flag);
1026 	}
	/* Process-shared mutexes also have their owner pid cleared. */
1027 	if (type & USYNC_PROCESS)
1028 		suword32_noerr(&lp->mutex_ownerpid, 0);
	/* The (possibly updated) user flag is passed along to the unlock. */
1029 	upimutex_unlock((upimutex_t *)upimutex, flag);
1030 	upilocked = 0;
1031 out:
1032 	no_fault();
1033 	return (error);
1034 }
1035 
1036 /*
1037  * Clear the contents of a user-level mutex; return the flags.
1038  * Used only by upi_dead() and lwp_mutex_cleanup(), below.
1039  */
1040 static uint16_t
1041 lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg)
1042 {
1043 	uint16_t flag;
1044 
1045 	fuword16_noerr(&lp->mutex_flag, &flag);
1046 	if ((flag &
1047 	    (LOCK_OWNERDEAD | LOCK_UNMAPPED | LOCK_NOTRECOVERABLE)) == 0) {
1048 		flag |= lockflg;
1049 		suword16_noerr(&lp->mutex_flag, flag);
1050 	}
1051 	suword32_noerr((uint32_t *)&lp->mutex_owner, 0);
1052 	suword32_noerr((uint32_t *)&lp->mutex_owner + 1, 0);
1053 	suword32_noerr(&lp->mutex_ownerpid, 0);
1054 	suword8_noerr(&lp->mutex_rcount, 0);
1055 
1056 	return (flag);
1057 }
1058 
1059 /*
1060  * Mark user mutex state, corresponding to kernel upimutex,
1061  * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate
1062  */
1063 static int
1064 upi_dead(upimutex_t *upip, uint16_t lockflg)
1065 {
1066 	label_t ljb;
1067 	int error = 0;
1068 	lwp_mutex_t *lp;
1069 
1070 	if (on_fault(&ljb)) {
1071 		error = EFAULT;
1072 		goto out;
1073 	}
1074 
1075 	lp = upip->upi_vaddr;
1076 	(void) lwp_clear_mutex(lp, lockflg);
1077 	suword8_noerr(&lp->mutex_lockw, 0);
1078 out:
1079 	no_fault();
1080 	return (error);
1081 }
1082 
1083 /*
1084  * Unlock all upimutexes held by curthread, since curthread is dying.
1085  * For each upimutex, attempt to mark its corresponding user mutex object as
1086  * dead.
1087  */
1088 void
1089 upimutex_cleanup()
1090 {
1091 	kthread_t *t = curthread;
1092 	uint16_t lockflg = (ttoproc(t)->p_proc_flag & P_PR_EXEC)?
1093 	    LOCK_UNMAPPED : LOCK_OWNERDEAD;
1094 	struct upimutex *upip;
1095 
1096 	while ((upip = t->t_upimutex) != NULL) {
1097 		if (upi_dead(upip, lockflg) != 0) {
1098 			/*
1099 			 * If the user object associated with this upimutex is
1100 			 * unmapped, unlock upimutex with the
1101 			 * LOCK_NOTRECOVERABLE flag, so that all waiters are
1102 			 * woken up. Since user object is unmapped, it could
1103 			 * not be marked as dead or notrecoverable.
1104 			 * The waiters will now all wake up and return
1105 			 * ENOTRECOVERABLE, since they would find that the lock
1106 			 * has not been handed-off to them.
1107 			 * See lwp_upimutex_lock().
1108 			 */
1109 			upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
1110 		} else {
1111 			/*
1112 			 * The user object has been updated as dead.
1113 			 * Unlock the upimutex: if no waiters, upip kmem will
1114 			 * be freed. If there is a waiter, the lock will be
1115 			 * handed off. If exit() is in progress, each existing
1116 			 * waiter will successively get the lock, as owners
1117 			 * die, and each new owner will call this routine as
1118 			 * it dies. The last owner will free kmem, since
1119 			 * it will find the upimutex has no waiters. So,
1120 			 * eventually, the kmem is guaranteed to be freed.
1121 			 */
1122 			upimutex_unlock(upip, 0);
1123 		}
1124 		/*
1125 		 * Note that the call to upimutex_unlock() above will delete
1126 		 * upimutex from the t_upimutexes chain. And so the
1127 		 * while loop will eventually terminate.
1128 		 */
1129 	}
1130 }
1131 
1132 int
1133 lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp)
1134 {
1135 	kthread_t *t = curthread;
1136 	klwp_t *lwp = ttolwp(t);
1137 	proc_t *p = ttoproc(t);
1138 	lwp_timer_t lwpt;
1139 	caddr_t timedwait;
1140 	int error = 0;
1141 	int time_error;
1142 	clock_t tim = -1;
1143 	uchar_t waiters;
1144 	volatile int locked = 0;
1145 	volatile int watched = 0;
1146 	label_t ljb;
1147 	volatile uint8_t type = 0;
1148 	lwpchan_t lwpchan;
1149 	sleepq_head_t *sqh;
1150 	static int iswanted();
1151 	uint16_t flag;
1152 	int imm_timeout = 0;
1153 
1154 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1155 		return (set_errno(EFAULT));
1156 
1157 	timedwait = (caddr_t)tsp;
1158 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
1159 	    lwpt.lwpt_imm_timeout) {
1160 		imm_timeout = 1;
1161 		timedwait = NULL;
1162 	}
1163 
1164 	/*
1165 	 * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
1166 	 * this micro state is really a run state. If the thread indeed blocks,
1167 	 * this state becomes valid. If not, the state is converted back to
1168 	 * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
1169 	 * when blocking.
1170 	 */
1171 	(void) new_mstate(t, LMS_USER_LOCK);
1172 	if (on_fault(&ljb)) {
1173 		if (locked)
1174 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1175 		error = EFAULT;
1176 		goto out;
1177 	}
1178 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1179 	if (UPIMUTEX(type)) {
1180 		no_fault();
1181 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
1182 		if ((type & USYNC_PROCESS) &&
1183 		    (error == 0 ||
1184 		    error == EOWNERDEAD || error == ELOCKUNMAPPED))
1185 			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
1186 		if (tsp && !time_error)	/* copyout the residual time left */
1187 			error = lwp_timer_copyout(&lwpt, error);
1188 		if (error)
1189 			return (set_errno(error));
1190 		return (0);
1191 	}
1192 	/*
1193 	 * Force Copy-on-write fault if lwp_mutex_t object is
1194 	 * defined to be MAP_PRIVATE and it was initialized to
1195 	 * USYNC_PROCESS.
1196 	 */
1197 	suword8_noerr(&lp->mutex_type, type);
1198 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1199 	    &lwpchan, LWPCHAN_MPPOOL)) {
1200 		error = EFAULT;
1201 		goto out;
1202 	}
1203 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1204 	locked = 1;
1205 	if (type & LOCK_ROBUST) {
1206 		fuword16_noerr(&lp->mutex_flag, &flag);
1207 		if (flag & LOCK_NOTRECOVERABLE) {
1208 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1209 			error = ENOTRECOVERABLE;
1210 			goto out;
1211 		}
1212 	}
1213 	fuword8_noerr(&lp->mutex_waiters, &waiters);
1214 	suword8_noerr(&lp->mutex_waiters, 1);
1215 
1216 	/*
1217 	 * If watchpoints are set, they need to be restored, since
1218 	 * atomic accesses of memory such as the call to ulock_try()
1219 	 * below cannot be watched.
1220 	 */
1221 
1222 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1223 
1224 	while (!ulock_try(&lp->mutex_lockw)) {
1225 		if (time_error) {
1226 			/*
1227 			 * The SUSV3 Posix spec is very clear that we
1228 			 * should get no error from validating the
1229 			 * timer until we would actually sleep.
1230 			 */
1231 			error = time_error;
1232 			break;
1233 		}
1234 
1235 		if (watched) {
1236 			watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1237 			watched = 0;
1238 		}
1239 
1240 		/*
1241 		 * Put the lwp in an orderly state for debugging.
1242 		 */
1243 		prstop(PR_REQUESTED, 0);
1244 		if (timedwait) {
1245 			/*
1246 			 * If we successfully queue the timeout,
1247 			 * then don't drop t_delay_lock until
1248 			 * we are on the sleep queue (below).
1249 			 */
1250 			mutex_enter(&t->t_delay_lock);
1251 			if (lwp_timer_enqueue(&lwpt) != 0) {
1252 				mutex_exit(&t->t_delay_lock);
1253 				imm_timeout = 1;
1254 				timedwait = NULL;
1255 			}
1256 		}
1257 		lwp_block(&lwpchan);
1258 		/*
1259 		 * Nothing should happen to cause the lwp to go to
1260 		 * sleep again until after it returns from swtch().
1261 		 */
1262 		if (timedwait)
1263 			mutex_exit(&t->t_delay_lock);
1264 		locked = 0;
1265 		lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1266 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
1267 			setrun(t);
1268 		swtch();
1269 		t->t_flag &= ~T_WAKEABLE;
1270 		if (timedwait)
1271 			tim = lwp_timer_dequeue(&lwpt);
1272 		setallwatch();
1273 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
1274 			error = EINTR;
1275 		else if (imm_timeout || (timedwait && tim == -1))
1276 			error = ETIME;
1277 		if (error) {
1278 			lwp->lwp_asleep = 0;
1279 			lwp->lwp_sysabort = 0;
1280 			watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1281 			    S_WRITE);
1282 
1283 			/*
1284 			 * Need to re-compute waiters bit. The waiters field in
1285 			 * the lock is not reliable. Either of two things could
1286 			 * have occurred: no lwp may have called lwp_release()
1287 			 * for me but I have woken up due to a signal or
1288 			 * timeout.  In this case, the waiter bit is incorrect
1289 			 * since it is still set to 1, set above.
1290 			 * OR an lwp_release() did occur for some other lwp on
1291 			 * the same lwpchan. In this case, the waiter bit is
1292 			 * correct.  But which event occurred, one can't tell.
1293 			 * So, recompute.
1294 			 */
1295 			lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1296 			locked = 1;
1297 			sqh = lwpsqhash(&lwpchan);
1298 			disp_lock_enter(&sqh->sq_lock);
1299 			waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
1300 			disp_lock_exit(&sqh->sq_lock);
1301 			break;
1302 		}
1303 		lwp->lwp_asleep = 0;
1304 		watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1305 		    S_WRITE);
1306 		lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1307 		locked = 1;
1308 		fuword8_noerr(&lp->mutex_waiters, &waiters);
1309 		suword8_noerr(&lp->mutex_waiters, 1);
1310 		if (type & LOCK_ROBUST) {
1311 			fuword16_noerr(&lp->mutex_flag, &flag);
1312 			if (flag & LOCK_NOTRECOVERABLE) {
1313 				error = ENOTRECOVERABLE;
1314 				break;
1315 			}
1316 		}
1317 	}
1318 
1319 	if (t->t_mstate == LMS_USER_LOCK)
1320 		(void) new_mstate(t, LMS_SYSTEM);
1321 
1322 	if (error == 0) {
1323 		if (type & USYNC_PROCESS)
1324 			suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
1325 		if (type & LOCK_ROBUST) {
1326 			fuword16_noerr(&lp->mutex_flag, &flag);
1327 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1328 				if (flag & LOCK_OWNERDEAD)
1329 					error = EOWNERDEAD;
1330 				else if (type & USYNC_PROCESS_ROBUST)
1331 					error = ELOCKUNMAPPED;
1332 				else
1333 					error = EOWNERDEAD;
1334 			}
1335 		}
1336 	}
1337 	suword8_noerr(&lp->mutex_waiters, waiters);
1338 	locked = 0;
1339 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1340 out:
1341 	no_fault();
1342 	if (watched)
1343 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1344 	if (tsp && !time_error)		/* copyout the residual time left */
1345 		error = lwp_timer_copyout(&lwpt, error);
1346 	if (error)
1347 		return (set_errno(error));
1348 	return (0);
1349 }
1350 
1351 /*
1352  * Obsolete lwp_mutex_lock() interface, no longer called from libc.
1353  * libc now calls lwp_mutex_timedlock(lp, NULL).
1354  * This system call trap continues to exist solely for the benefit
1355  * of old statically-linked binaries from Solaris 9 and before.
1356  * It should be removed from the system when we no longer care
1357  * about such applications.
1358  */
1359 int
1360 lwp_mutex_lock(lwp_mutex_t *lp)
1361 {
1362 	return (lwp_mutex_timedlock(lp, NULL));
1363 }
1364 
1365 static int
1366 iswanted(kthread_t *t, lwpchan_t *lwpchan)
1367 {
1368 	/*
1369 	 * The caller holds the dispatcher lock on the sleep queue.
1370 	 */
1371 	while (t != NULL) {
1372 		if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1373 		    t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1374 			return (1);
1375 		t = t->t_link;
1376 	}
1377 	return (0);
1378 }
1379 
1380 /*
1381  * Return the highest priority thread sleeping on this lwpchan.
1382  */
1383 static kthread_t *
1384 lwp_queue_waiter(lwpchan_t *lwpchan)
1385 {
1386 	sleepq_head_t *sqh;
1387 	kthread_t *tp;
1388 
1389 	sqh = lwpsqhash(lwpchan);
1390 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1391 	for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
1392 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1393 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1394 			break;
1395 	}
1396 	disp_lock_exit(&sqh->sq_lock);
1397 	return (tp);
1398 }
1399 
/*
 * Wake up the first thread on lwpchan's sleep queue that is waiting on
 * "lwpchan".
 *
 * lwpchan	the channel to search for.
 * waiters	out: set to iswanted()'s result for the rest of the queue
 *		when a thread is woken, and to 0 when no thread matches.
 *		Left untouched on the sync_type mismatch path.
 * sync_type	0 for a mutex, T_WAITCVSEM for a condition variable or
 *		semaphore; it must equal the waiter's T_WAITCVSEM flag bit.
 *
 * Returns 1 if a thread was woken, 0 otherwise.
 */
1400 static int
1401 lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
1402 {
1403 	sleepq_head_t *sqh;
1404 	kthread_t *tp;
1405 	kthread_t **tpp;
1406 
1407 	sqh = lwpsqhash(lwpchan);
1408 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
	/* tpp always addresses the link that points at tp, for unlinking */
1409 	tpp = &sqh->sq_queue.sq_first;
1410 	while ((tp = *tpp) != NULL) {
1411 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1412 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1413 			/*
1414 			 * The following is typically false. It could be true
1415 			 * only if lwp_release() is called from
1416 			 * lwp_mutex_wakeup() after reading the waiters field
1417 			 * from memory in which the lwp lock used to be, but has
1418 			 * since been re-used to hold a lwp cv or lwp semaphore.
1419 			 * The thread "tp" found to match the lwp lock's wchan
1420 			 * is actually sleeping for the cv or semaphore which
1421 			 * now has the same wchan. In this case, lwp_release()
1422 			 * should return failure.
1423 			 */
1424 			if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
1425 				ASSERT(sync_type == 0);
1426 				/*
1427 				 * assert that this can happen only for mutexes
1428 				 * i.e. sync_type == 0, for correctly written
1429 				 * user programs.
1430 				 */
1431 				disp_lock_exit(&sqh->sq_lock);
1432 				return (0);
1433 			}
			/* only threads queued after tp are examined here */
1434 			*waiters = iswanted(tp->t_link, lwpchan);
1435 			sleepq_unlink(tpp, tp);
1436 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1437 			tp->t_wchan0 = NULL;
1438 			tp->t_wchan = NULL;
1439 			tp->t_sobj_ops = NULL;
			/* lwp_cond_wait() tests t_release after an error */
1440 			tp->t_release = 1;
1441 			THREAD_TRANSITION(tp);	/* drops sleepq lock */
1442 			CL_WAKEUP(tp);
1443 			thread_unlock(tp);	/* drop run queue lock */
1444 			return (1);
1445 		}
1446 		tpp = &tp->t_link;
1447 	}
	/* nobody is sleeping on this lwpchan */
1448 	*waiters = 0;
1449 	disp_lock_exit(&sqh->sq_lock);
1450 	return (0);
1451 }
1452 
1453 static void
1454 lwp_release_all(lwpchan_t *lwpchan)
1455 {
1456 	sleepq_head_t	*sqh;
1457 	kthread_t *tp;
1458 	kthread_t **tpp;
1459 
1460 	sqh = lwpsqhash(lwpchan);
1461 	disp_lock_enter(&sqh->sq_lock);		/* lock sleep q queue */
1462 	tpp = &sqh->sq_queue.sq_first;
1463 	while ((tp = *tpp) != NULL) {
1464 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1465 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1466 			sleepq_unlink(tpp, tp);
1467 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1468 			tp->t_wchan0 = NULL;
1469 			tp->t_wchan = NULL;
1470 			tp->t_sobj_ops = NULL;
1471 			CL_WAKEUP(tp);
1472 			thread_unlock_high(tp);	/* release run queue lock */
1473 		} else {
1474 			tpp = &tp->t_link;
1475 		}
1476 	}
1477 	disp_lock_exit(&sqh->sq_lock);		/* drop sleep q lock */
1478 }
1479 
1480 /*
1481  * unblock a lwp that is trying to acquire this mutex. the blocked
1482  * lwp resumes and retries to acquire the lock.
1483  */
1484 int
1485 lwp_mutex_wakeup(lwp_mutex_t *lp, int release_all)
1486 {
1487 	proc_t *p = ttoproc(curthread);
1488 	lwpchan_t lwpchan;
1489 	uchar_t waiters;
1490 	volatile int locked = 0;
1491 	volatile int watched = 0;
1492 	volatile uint8_t type = 0;
1493 	label_t ljb;
1494 	int error = 0;
1495 
1496 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1497 		return (set_errno(EFAULT));
1498 
1499 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1500 
1501 	if (on_fault(&ljb)) {
1502 		if (locked)
1503 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1504 		error = EFAULT;
1505 		goto out;
1506 	}
1507 	/*
1508 	 * Force Copy-on-write fault if lwp_mutex_t object is
1509 	 * defined to be MAP_PRIVATE, and type is USYNC_PROCESS
1510 	 */
1511 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1512 	suword8_noerr(&lp->mutex_type, type);
1513 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1514 	    &lwpchan, LWPCHAN_MPPOOL)) {
1515 		error = EFAULT;
1516 		goto out;
1517 	}
1518 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1519 	locked = 1;
1520 	/*
1521 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
1522 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
1523 	 * may fail.  If it fails, do not write into the waiter bit.
1524 	 * The call to lwp_release() might fail due to one of three reasons:
1525 	 *
1526 	 * 	1. due to the thread which set the waiter bit not actually
1527 	 *	   sleeping since it got the lock on the re-try. The waiter
1528 	 *	   bit will then be correctly updated by that thread. This
1529 	 *	   window may be closed by reading the wait bit again here
1530 	 *	   and not calling lwp_release() at all if it is zero.
1531 	 *	2. the thread which set the waiter bit and went to sleep
1532 	 *	   was woken up by a signal. This time, the waiter recomputes
1533 	 *	   the wait bit in the return with EINTR code.
1534 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
1535 	 *	   memory that has been re-used after the lock was dropped.
1536 	 *	   In this case, writing into the waiter bit would cause data
1537 	 *	   corruption.
1538 	 */
1539 	if (release_all)
1540 		lwp_release_all(&lwpchan);
1541 	else if (lwp_release(&lwpchan, &waiters, 0))
1542 		suword8_noerr(&lp->mutex_waiters, waiters);
1543 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1544 out:
1545 	no_fault();
1546 	if (watched)
1547 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1548 	if (error)
1549 		return (set_errno(error));
1550 	return (0);
1551 }
1552 
1553 /*
1554  * lwp_cond_wait() has four arguments, a pointer to a condition variable,
1555  * a pointer to a mutex, a pointer to a timespec for a timed wait and
1556  * a flag telling the kernel whether or not to honor the kernel/user
1557  * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
1558  * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
1559  * lwpchan, returned by get_lwpchan().  If the timespec pointer is non-NULL,
1560  * it is used an an in/out parameter.  On entry, it contains the relative
1561  * time until timeout.  On exit, we copyout the residual time left to it.
1562  */
1563 int
1564 lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
1565 {
1566 	kthread_t *t = curthread;
1567 	klwp_t *lwp = ttolwp(t);
1568 	proc_t *p = ttoproc(t);
1569 	lwp_timer_t lwpt;
1570 	lwpchan_t cv_lwpchan;
1571 	lwpchan_t m_lwpchan;
1572 	caddr_t timedwait;
	/*
	 * The volatile locals below are written after on_fault() has been
	 * set up and are read again by the fault handlers or on the exit
	 * paths those handlers jump to.
	 */
1573 	volatile uint16_t type = 0;
1574 	volatile uint8_t mtype = 0;
1575 	uchar_t waiters;
1576 	volatile int error;
1577 	clock_t tim = -1;
1578 	volatile int locked = 0;
1579 	volatile int m_locked = 0;
1580 	volatile int cvwatched = 0;
1581 	volatile int mpwatched = 0;
1582 	label_t ljb;
1583 	volatile int no_lwpchan = 1;
1584 	int imm_timeout = 0;
1585 	int imm_unpark = 0;
1586 
1587 	if ((caddr_t)cv >= p->p_as->a_userlimit ||
1588 	    (caddr_t)mp >= p->p_as->a_userlimit)
1589 		return (set_errno(EFAULT));
1590 
	/*
	 * Unlike lwp_mutex_timedlock(), an error from lwp_timer_copyin()
	 * fails the call immediately.
	 */
1591 	timedwait = (caddr_t)tsp;
1592 	if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
1593 		return (set_errno(error));
1594 	if (lwpt.lwpt_imm_timeout) {
1595 		imm_timeout = 1;
1596 		timedwait = NULL;
1597 	}
1598 
1599 	(void) new_mstate(t, LMS_USER_LOCK);
1600 
1601 	if (on_fault(&ljb)) {
		/*
		 * Faulted before both lwpchans were obtained: nothing is
		 * locked yet, so just fail.
		 */
1602 		if (no_lwpchan) {
1603 			error = EFAULT;
1604 			goto out;
1605 		}
		/* Drop whichever lwpchan locks were held at the fault. */
1606 		if (m_locked) {
1607 			m_locked = 0;
1608 			lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1609 		}
1610 		if (locked) {
1611 			locked = 0;
1612 			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1613 		}
1614 		/*
1615 		 * set up another on_fault() for a possible fault
1616 		 * on the user lock accessed at "efault"
1617 		 */
1618 		if (on_fault(&ljb)) {
1619 			if (m_locked) {
1620 				m_locked = 0;
1621 				lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1622 			}
			/* error is already EFAULT here (set before "efault") */
1623 			goto out;
1624 		}
1625 		error = EFAULT;
1626 		goto efault;
1627 	}
1628 
1629 	/*
1630 	 * Force Copy-on-write fault if lwp_cond_t and lwp_mutex_t
1631 	 * objects are defined to be MAP_PRIVATE, and are USYNC_PROCESS
1632 	 */
1633 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
	/* m_lwpchan is only set up, and only used, for non-upimutex types */
1634 	if (UPIMUTEX(mtype) == 0) {
1635 		suword8_noerr(&mp->mutex_type, mtype);
1636 		/* convert user level mutex, "mp", to a unique lwpchan */
1637 		/* check if mtype is ok to use below, instead of type from cv */
1638 		if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
1639 		    &m_lwpchan, LWPCHAN_MPPOOL)) {
1640 			error = EFAULT;
1641 			goto out;
1642 		}
1643 	}
1644 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1645 	suword16_noerr(&cv->cond_type, type);
1646 	/* convert user level condition variable, "cv", to a unique lwpchan */
1647 	if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
1648 	    &cv_lwpchan, LWPCHAN_CVPOOL)) {
1649 		error = EFAULT;
1650 		goto out;
1651 	}
	/* from here on a fault must also release the user-level mutex */
1652 	no_lwpchan = 0;
1653 	cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1654 	if (UPIMUTEX(mtype) == 0)
1655 		mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
1656 		    S_WRITE);
1657 
1658 	/*
1659 	 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
1660 	 * with respect to a possible wakeup which is a result of either
1661 	 * an lwp_cond_signal() or an lwp_cond_broadcast().
1662 	 *
1663 	 * What's misleading, is that the lwp is put to sleep after the
1664 	 * condition variable's mutex is released.  This is OK as long as
1665 	 * the release operation is also done while holding lwpchan_lock.
1666 	 * The lwp is then put to sleep when the possibility of pagefaulting
1667 	 * or sleeping is completely eliminated.
1668 	 */
1669 	lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
1670 	locked = 1;
1671 	if (UPIMUTEX(mtype) == 0) {
1672 		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1673 		m_locked = 1;
1674 		suword8_noerr(&cv->cond_waiters_kernel, 1);
1675 		/*
1676 		 * unlock the condition variable's mutex. (pagefaults are
1677 		 * possible here.)
1678 		 */
1679 		if (mtype & USYNC_PROCESS)
1680 			suword32_noerr(&mp->mutex_ownerpid, 0);
1681 		ulock_clear(&mp->mutex_lockw);
1682 		fuword8_noerr(&mp->mutex_waiters, &waiters);
1683 		if (waiters != 0) {
1684 			/*
1685 			 * Given the locking of lwpchan_lock around the release
1686 			 * of the mutex and checking for waiters, the following
1687 			 * call to lwp_release() can fail ONLY if the lock
1688 			 * acquirer is interrupted after setting the waiter bit,
1689 			 * calling lwp_block() and releasing lwpchan_lock.
1690 			 * In this case, it could get pulled off the lwp sleep
1691 			 * q (via setrun()) before the following call to
1692 			 * lwp_release() occurs. In this case, the lock
1693 			 * requestor will update the waiter bit correctly by
1694 			 * re-evaluating it.
1695 			 */
1696 			if (lwp_release(&m_lwpchan, &waiters, 0))
1697 				suword8_noerr(&mp->mutex_waiters, waiters);
1698 		}
1699 		m_locked = 0;
1700 		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1701 	} else {
1702 		suword8_noerr(&cv->cond_waiters_kernel, 1);
1703 		error = lwp_upimutex_unlock(mp, mtype);
1704 		if (error) {	/* if the upimutex unlock failed */
1705 			locked = 0;
1706 			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1707 			goto out;
1708 		}
1709 	}
	/* no user memory is touched between here and the sleep */
1710 	no_fault();
1711 
1712 	if (mpwatched) {
1713 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1714 		mpwatched = 0;
1715 	}
1716 	if (cvwatched) {
1717 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1718 		cvwatched = 0;
1719 	}
1720 
1721 	/*
1722 	 * Put the lwp in an orderly state for debugging.
1723 	 */
1724 	prstop(PR_REQUESTED, 0);
1725 	if (check_park && (!schedctl_is_park() || t->t_unpark)) {
1726 		/*
1727 		 * We received a signal at user-level before calling here
1728 		 * or another thread wants us to return immediately
1729 		 * with EINTR.  See lwp_unpark().
1730 		 */
1731 		imm_unpark = 1;
1732 		t->t_unpark = 0;
1733 		timedwait = NULL;
1734 	} else if (timedwait) {
1735 		/*
1736 		 * If we successfully queue the timeout,
1737 		 * then don't drop t_delay_lock until
1738 		 * we are on the sleep queue (below).
1739 		 */
1740 		mutex_enter(&t->t_delay_lock);
1741 		if (lwp_timer_enqueue(&lwpt) != 0) {
1742 			mutex_exit(&t->t_delay_lock);
1743 			imm_timeout = 1;
1744 			timedwait = NULL;
1745 		}
1746 	}
	/* lwp_release() matches this flag bit against its sync_type */
1747 	t->t_flag |= T_WAITCVSEM;
1748 	lwp_block(&cv_lwpchan);
1749 	/*
1750 	 * Nothing should happen to cause the lwp to go to sleep
1751 	 * until after it returns from swtch().
1752 	 */
1753 	if (timedwait)
1754 		mutex_exit(&t->t_delay_lock);
1755 	locked = 0;
1756 	lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
	/* make ourselves runnable again if we must not actually sleep */
1757 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
1758 	    (imm_timeout | imm_unpark))
1759 		setrun(t);
1760 	swtch();
1761 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
1762 	if (timedwait)
1763 		tim = lwp_timer_dequeue(&lwpt);
	/* classify the wakeup; error is still 0 for a normal wakeup */
1764 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
1765 	    MUSTRETURN(p, t) || imm_unpark)
1766 		error = EINTR;
1767 	else if (imm_timeout || (timedwait && tim == -1))
1768 		error = ETIME;
1769 	lwp->lwp_asleep = 0;
1770 	lwp->lwp_sysabort = 0;
1771 	setallwatch();
1772 
1773 	if (t->t_mstate == LMS_USER_LOCK)
1774 		(void) new_mstate(t, LMS_SYSTEM);
1775 
	/* note: the copyout is performed only when check_park is non-zero */
1776 	if (tsp && check_park)		/* copyout the residual time left */
1777 		error = lwp_timer_copyout(&lwpt, error);
1778 
1779 	/* the mutex is reacquired by the caller on return to user level */
1780 	if (error) {
1781 		/*
1782 		 * If we were concurrently lwp_cond_signal()d and we
1783 		 * received a UNIX signal or got a timeout, then perform
1784 		 * another lwp_cond_signal() to avoid consuming the wakeup.
1785 		 */
1786 		if (t->t_release)
1787 			(void) lwp_cond_signal(cv);
1788 		return (set_errno(error));
1789 	}
1790 	return (0);
1791 
	/* reached only from the first-level on_fault() handler above */
1792 efault:
1793 	/*
1794 	 * make sure that the user level lock is dropped before
1795 	 * returning to caller, since the caller always re-acquires it.
1796 	 */
1797 	if (UPIMUTEX(mtype) == 0) {
1798 		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1799 		m_locked = 1;
1800 		if (mtype & USYNC_PROCESS)
1801 			suword32_noerr(&mp->mutex_ownerpid, 0);
1802 		ulock_clear(&mp->mutex_lockw);
1803 		fuword8_noerr(&mp->mutex_waiters, &waiters);
1804 		if (waiters != 0) {
1805 			/*
1806 			 * See comment above on lock clearing and lwp_release()
1807 			 * success/failure.
1808 			 */
1809 			if (lwp_release(&m_lwpchan, &waiters, 0))
1810 				suword8_noerr(&mp->mutex_waiters, waiters);
1811 		}
1812 		m_locked = 0;
1813 		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1814 	} else {
1815 		(void) lwp_upimutex_unlock(mp, mtype);
1816 	}
	/* common failure exit; every path reaching it has set error */
1817 out:
1818 	no_fault();
1819 	if (mpwatched)
1820 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1821 	if (cvwatched)
1822 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1823 	if (t->t_mstate == LMS_USER_LOCK)
1824 		(void) new_mstate(t, LMS_SYSTEM);
1825 	return (set_errno(error));
1826 }
1827 
1828 /*
1829  * wakeup one lwp that's blocked on this condition variable.
1830  */
1831 int
1832 lwp_cond_signal(lwp_cond_t *cv)
1833 {
1834 	proc_t *p = ttoproc(curthread);
1835 	lwpchan_t lwpchan;
1836 	uchar_t waiters;
1837 	volatile uint16_t type = 0;
1838 	volatile int locked = 0;
1839 	volatile int watched = 0;
1840 	label_t ljb;
1841 	int error = 0;
1842 
1843 	if ((caddr_t)cv >= p->p_as->a_userlimit)
1844 		return (set_errno(EFAULT));
1845 
1846 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1847 
1848 	if (on_fault(&ljb)) {
1849 		if (locked)
1850 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1851 		error = EFAULT;
1852 		goto out;
1853 	}
1854 	/*
1855 	 * Force Copy-on-write fault if lwp_cond_t object is
1856 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1857 	 */
1858 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1859 	suword16_noerr(&cv->cond_type, type);
1860 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1861 	    &lwpchan, LWPCHAN_CVPOOL)) {
1862 		error = EFAULT;
1863 		goto out;
1864 	}
1865 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1866 	locked = 1;
1867 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1868 	if (waiters != 0) {
1869 		/*
1870 		 * The following call to lwp_release() might fail but it is
1871 		 * OK to write into the waiters bit below, since the memory
1872 		 * could not have been re-used or unmapped (for correctly
1873 		 * written user programs) as in the case of lwp_mutex_wakeup().
1874 		 * For an incorrect program, we should not care about data
1875 		 * corruption since this is just one instance of other places
1876 		 * where corruption can occur for such a program. Of course
1877 		 * if the memory is unmapped, normal fault recovery occurs.
1878 		 */
1879 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1880 		suword8_noerr(&cv->cond_waiters_kernel, waiters);
1881 	}
1882 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1883 out:
1884 	no_fault();
1885 	if (watched)
1886 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1887 	if (error)
1888 		return (set_errno(error));
1889 	return (0);
1890 }
1891 
1892 /*
1893  * wakeup every lwp that's blocked on this condition variable.
1894  */
1895 int
1896 lwp_cond_broadcast(lwp_cond_t *cv)
1897 {
1898 	proc_t *p = ttoproc(curthread);
1899 	lwpchan_t lwpchan;
1900 	volatile uint16_t type = 0;
1901 	volatile int locked = 0;
1902 	volatile int watched = 0;
1903 	label_t ljb;
1904 	uchar_t waiters;
1905 	int error = 0;
1906 
1907 	if ((caddr_t)cv >= p->p_as->a_userlimit)
1908 		return (set_errno(EFAULT));
1909 
1910 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1911 
1912 	if (on_fault(&ljb)) {
1913 		if (locked)
1914 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1915 		error = EFAULT;
1916 		goto out;
1917 	}
1918 	/*
1919 	 * Force Copy-on-write fault if lwp_cond_t object is
1920 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1921 	 */
1922 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1923 	suword16_noerr(&cv->cond_type, type);
1924 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1925 	    &lwpchan, LWPCHAN_CVPOOL)) {
1926 		error = EFAULT;
1927 		goto out;
1928 	}
1929 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1930 	locked = 1;
1931 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1932 	if (waiters != 0) {
1933 		lwp_release_all(&lwpchan);
1934 		suword8_noerr(&cv->cond_waiters_kernel, 0);
1935 	}
1936 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1937 out:
1938 	no_fault();
1939 	if (watched)
1940 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1941 	if (error)
1942 		return (set_errno(error));
1943 	return (0);
1944 }
1945 
1946 int
1947 lwp_sema_trywait(lwp_sema_t *sp)
1948 {
1949 	kthread_t *t = curthread;
1950 	proc_t *p = ttoproc(t);
1951 	label_t ljb;
1952 	volatile int locked = 0;
1953 	volatile int watched = 0;
1954 	volatile uint16_t type = 0;
1955 	int count;
1956 	lwpchan_t lwpchan;
1957 	uchar_t waiters;
1958 	int error = 0;
1959 
1960 	if ((caddr_t)sp >= p->p_as->a_userlimit)
1961 		return (set_errno(EFAULT));
1962 
1963 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1964 
1965 	if (on_fault(&ljb)) {
1966 		if (locked)
1967 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1968 		error = EFAULT;
1969 		goto out;
1970 	}
1971 	/*
1972 	 * Force Copy-on-write fault if lwp_sema_t object is
1973 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1974 	 */
1975 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
1976 	suword16_noerr((void *)&sp->sema_type, type);
1977 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
1978 	    &lwpchan, LWPCHAN_CVPOOL)) {
1979 		error = EFAULT;
1980 		goto out;
1981 	}
1982 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1983 	locked = 1;
1984 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
1985 	if (count == 0)
1986 		error = EBUSY;
1987 	else
1988 		suword32_noerr((void *)&sp->sema_count, --count);
1989 	if (count != 0) {
1990 		fuword8_noerr(&sp->sema_waiters, &waiters);
1991 		if (waiters != 0) {
1992 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1993 			suword8_noerr(&sp->sema_waiters, waiters);
1994 		}
1995 	}
1996 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1997 out:
1998 	no_fault();
1999 	if (watched)
2000 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2001 	if (error)
2002 		return (set_errno(error));
2003 	return (0);
2004 }
2005 
/*
 * Wait on the user-level semaphore 'sp', optionally with a timeout.
 *
 * While the count read from user memory is zero, the caller marks the
 * semaphore as having waiters, queues itself on the semaphore's lwpchan
 * with lwp_block() and calls swtch(); on return the count is re-read.
 * On success the count is decremented and stored back; if the resulting
 * count is still non-zero, lwp_release() is called on the lwpchan.
 *
 *	sp		user address of the semaphore
 *	tsp		user address of the timeout, or NULL
 *	check_park	see below
 *
 * Returns 0 on success.  Otherwise errno is set to EFAULT (bad address
 * or a fault on user memory), EINTR, ETIME, or the error returned by
 * lwp_timer_copyin(), which is only reported if we would have slept.
 * When 'tsp' and 'check_park' are both set and the timer was valid, the
 * final error is whatever lwp_timer_copyout() returns.
 *
 * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
 */
int
lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	lwp_timer_t lwpt;
	caddr_t timedwait;
	clock_t tim = -1;
	label_t ljb;
	volatile int locked = 0;
	volatile int watched = 0;
	volatile uint16_t type = 0;
	int count;
	lwpchan_t lwpchan;
	uchar_t waiters;
	int error = 0;
	int time_error;
	int imm_timeout = 0;
	int imm_unpark = 0;

	if ((caddr_t)sp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	/*
	 * 'timedwait' is non-NULL only while a real timer may still be
	 * queued; an immediate timeout is tracked in 'imm_timeout' instead.
	 */
	timedwait = (caddr_t)tsp;
	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
	    lwpt.lwpt_imm_timeout) {
		imm_timeout = 1;
		timedwait = NULL;
	}

	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);

	/* Fault handler for the *_noerr() user-memory accesses below. */
	if (on_fault(&ljb)) {
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write fault if lwp_sema_t object is
	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
	 */
	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
	suword16_noerr((void *)&sp->sema_type, type);
	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
	    &lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
	while (error == 0 && count == 0) {
		if (time_error) {
			/*
			 * The SUSV3 Posix spec is very clear that we
			 * should get no error from validating the
			 * timer until we would actually sleep.
			 */
			error = time_error;
			break;
		}
		/* Tell posters that somebody is (about to be) asleep. */
		suword8_noerr(&sp->sema_waiters, 1);
		if (watched)
			watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
		/*
		 * Put the lwp in an orderly state for debugging.
		 */
		prstop(PR_REQUESTED, 0);
		if (check_park && (!schedctl_is_park() || t->t_unpark)) {
			/*
			 * We received a signal at user-level before calling
			 * here or another thread wants us to return
			 * immediately with EINTR.  See lwp_unpark().
			 */
			imm_unpark = 1;
			t->t_unpark = 0;
			timedwait = NULL;
		} else if (timedwait) {
			/*
			 * If we successfully queue the timeout,
			 * then don't drop t_delay_lock until
			 * we are on the sleep queue (below).
			 */
			mutex_enter(&t->t_delay_lock);
			if (lwp_timer_enqueue(&lwpt) != 0) {
				mutex_exit(&t->t_delay_lock);
				imm_timeout = 1;
				timedwait = NULL;
			}
		}
		t->t_flag |= T_WAITCVSEM;
		lwp_block(&lwpchan);
		/*
		 * Nothing should happen to cause the lwp to sleep
		 * again until after it returns from swtch().
		 */
		if (timedwait)
			mutex_exit(&t->t_delay_lock);
		locked = 0;
		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		/*
		 * If there is already a reason not to sleep (pending
		 * signal, must-return condition, immediate timeout or
		 * unpark), make ourselves runnable before switching.
		 */
		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
		    (imm_timeout | imm_unpark))
			setrun(t);
		swtch();
		t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
		if (timedwait)
			tim = lwp_timer_dequeue(&lwpt);
		setallwatch();
		/* Work out why we came back; EINTR wins over ETIME. */
		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
		    MUSTRETURN(p, t) || imm_unpark)
			error = EINTR;
		else if (imm_timeout || (timedwait && tim == -1))
			error = ETIME;
		lwp->lwp_asleep = 0;
		lwp->lwp_sysabort = 0;
		watched = watch_disable_addr((caddr_t)sp,
		    sizeof (*sp), S_WRITE);
		lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
		locked = 1;
		/* Re-read the count; the loop exits on error or count != 0. */
		fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
	}
	if (error == 0)
		suword32_noerr((void *)&sp->sema_count, --count);
	if (count != 0) {
		/* lwp_release() fills in 'waiters', which is stored back. */
		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
		suword8_noerr(&sp->sema_waiters, waiters);
	}
	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
out:
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
	if (tsp && check_park && !time_error)
		error = lwp_timer_copyout(&lwpt, error);
	if (error)
		return (set_errno(error));
	return (0);
}
2149 
2150 /*
2151  * Obsolete lwp_sema_wait() interface, no longer called from libc.
2152  * libc now calls lwp_sema_timedwait().
2153  * This system call trap exists solely for the benefit of old
2154  * statically linked applications from Solaris 9 and before.
2155  * It should be removed when we no longer care about such applications.
2156  */
2157 int
2158 lwp_sema_wait(lwp_sema_t *sp)
2159 {
2160 	return (lwp_sema_timedwait(sp, NULL, 0));
2161 }
2162 
2163 int
2164 lwp_sema_post(lwp_sema_t *sp)
2165 {
2166 	proc_t *p = ttoproc(curthread);
2167 	label_t ljb;
2168 	volatile int locked = 0;
2169 	volatile int watched = 0;
2170 	volatile uint16_t type = 0;
2171 	int count;
2172 	lwpchan_t lwpchan;
2173 	uchar_t waiters;
2174 	int error = 0;
2175 
2176 	if ((caddr_t)sp >= p->p_as->a_userlimit)
2177 		return (set_errno(EFAULT));
2178 
2179 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2180 
2181 	if (on_fault(&ljb)) {
2182 		if (locked)
2183 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2184 		error = EFAULT;
2185 		goto out;
2186 	}
2187 	/*
2188 	 * Force Copy-on-write fault if lwp_sema_t object is
2189 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
2190 	 */
2191 	fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
2192 	suword16_noerr(&sp->sema_type, type);
2193 	if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
2194 	    &lwpchan, LWPCHAN_CVPOOL)) {
2195 		error = EFAULT;
2196 		goto out;
2197 	}
2198 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2199 	locked = 1;
2200 	fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
2201 	if (count == _SEM_VALUE_MAX)
2202 		error = EOVERFLOW;
2203 	else
2204 		suword32_noerr(&sp->sema_count, ++count);
2205 	if (count == 1) {
2206 		fuword8_noerr(&sp->sema_waiters, &waiters);
2207 		if (waiters) {
2208 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2209 			suword8_noerr(&sp->sema_waiters, waiters);
2210 		}
2211 	}
2212 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2213 out:
2214 	no_fault();
2215 	if (watched)
2216 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2217 	if (error)
2218 		return (set_errno(error));
2219 	return (0);
2220 }
2221 
/*
 * Bits kept in a thread's t_writer field by the rwlock code below:
 * TRW_WANT_WRITE is set by lwp_rwlock_lock() before a writer blocks,
 * TRW_LOCK_GRANTED is set by lwp_rwlock_release() on each thread it wakes.
 */
#define	TRW_WANT_WRITE		0x1
#define	TRW_LOCK_GRANTED	0x2

/*
 * Values of the 'rd_wr' argument to lwp_rwlock_lock().  TRY_FLAG is
 * or'ed into the base request to ask for a non-blocking attempt.
 */
#define	READ_LOCK		0
#define	WRITE_LOCK		1
#define	TRY_FLAG		0x10
#define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
#define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)
2230 
2231 /*
2232  * Release one writer or one or more readers. Compute the rwstate word to
2233  * reflect the new state of the queue. For a safe hand-off we copy the new
2234  * rwstate value back to userland before we wake any of the new lock holders.
2235  *
2236  * Note that sleepq_insert() implements a prioritized FIFO (with writers
2237  * being given precedence over readers of the same priority).
2238  *
2239  * If the first thread is a reader we scan the queue releasing all readers
2240  * until we hit a writer or the end of the queue. If the first thread is a
2241  * writer we still need to check for another writer.
2242  */
2243 void
2244 lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
2245 {
2246 	sleepq_head_t *sqh;
2247 	kthread_t *tp;
2248 	kthread_t **tpp;
2249 	kthread_t *tpnext;
2250 	kthread_t *wakelist = NULL;
2251 	uint32_t rwstate = 0;
2252 	int wcount = 0;
2253 	int rcount = 0;
2254 
2255 	sqh = lwpsqhash(lwpchan);
2256 	disp_lock_enter(&sqh->sq_lock);
2257 	tpp = &sqh->sq_queue.sq_first;
2258 	while ((tp = *tpp) != NULL) {
2259 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
2260 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
2261 			if (tp->t_writer & TRW_WANT_WRITE) {
2262 				if ((wcount++ == 0) && (rcount == 0)) {
2263 					rwstate |= URW_WRITE_LOCKED;
2264 
2265 					/* Just one writer to wake. */
2266 					sleepq_unlink(tpp, tp);
2267 					wakelist = tp;
2268 
2269 					/* tpp already set for next thread. */
2270 					continue;
2271 				} else {
2272 					rwstate |= URW_HAS_WAITERS;
2273 					/* We need look no further. */
2274 					break;
2275 				}
2276 			} else {
2277 				rcount++;
2278 				if (wcount == 0) {
2279 					rwstate++;
2280 
2281 					/* Add reader to wake list. */
2282 					sleepq_unlink(tpp, tp);
2283 					tp->t_link = wakelist;
2284 					wakelist = tp;
2285 
2286 					/* tpp already set for next thread. */
2287 					continue;
2288 				} else {
2289 					rwstate |= URW_HAS_WAITERS;
2290 					/* We need look no further. */
2291 					break;
2292 				}
2293 			}
2294 		}
2295 		tpp = &tp->t_link;
2296 	}
2297 
2298 	/* Copy the new rwstate back to userland. */
2299 	suword32_noerr(&rw->rwlock_readers, rwstate);
2300 
2301 	/* Wake the new lock holder(s) up. */
2302 	tp = wakelist;
2303 	while (tp != NULL) {
2304 		DTRACE_SCHED1(wakeup, kthread_t *, tp);
2305 		tp->t_wchan0 = NULL;
2306 		tp->t_wchan = NULL;
2307 		tp->t_sobj_ops = NULL;
2308 		tp->t_writer |= TRW_LOCK_GRANTED;
2309 		tpnext = tp->t_link;
2310 		tp->t_link = NULL;
2311 		CL_WAKEUP(tp);
2312 		thread_unlock_high(tp);
2313 		tp = tpnext;
2314 	}
2315 
2316 	disp_lock_exit(&sqh->sq_lock);
2317 }
2318 
2319 /*
2320  * We enter here holding the user-level mutex, which we must release before
2321  * returning or blocking. Based on lwp_cond_wait().
2322  */
2323 static int
2324 lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
2325 {
2326 	lwp_mutex_t *mp = NULL;
2327 	kthread_t *t = curthread;
2328 	kthread_t *tp;
2329 	klwp_t *lwp = ttolwp(t);
2330 	proc_t *p = ttoproc(t);
2331 	lwp_timer_t lwpt;
2332 	lwpchan_t lwpchan;
2333 	lwpchan_t mlwpchan;
2334 	caddr_t timedwait;
2335 	volatile uint16_t type = 0;
2336 	volatile uint8_t mtype = 0;
2337 	uchar_t mwaiters;
2338 	volatile int error = 0;
2339 	int time_error;
2340 	clock_t tim = -1;
2341 	volatile int locked = 0;
2342 	volatile int mlocked = 0;
2343 	volatile int watched = 0;
2344 	volatile int mwatched = 0;
2345 	label_t ljb;
2346 	volatile int no_lwpchan = 1;
2347 	int imm_timeout = 0;
2348 	int try_flag;
2349 	uint32_t rwstate;
2350 	int acquired = 0;
2351 
2352 	/* We only check rw because the mutex is included in it. */
2353 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2354 		return (set_errno(EFAULT));
2355 
2356 	/* We must only report this error if we are about to sleep (later). */
2357 	timedwait = (caddr_t)tsp;
2358 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2359 	    lwpt.lwpt_imm_timeout) {
2360 		imm_timeout = 1;
2361 		timedwait = NULL;
2362 	}
2363 
2364 	(void) new_mstate(t, LMS_USER_LOCK);
2365 
2366 	if (on_fault(&ljb)) {
2367 		if (no_lwpchan) {
2368 			error = EFAULT;
2369 			goto out_nodrop;
2370 		}
2371 		if (mlocked) {
2372 			mlocked = 0;
2373 			lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2374 		}
2375 		if (locked) {
2376 			locked = 0;
2377 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2378 		}
2379 		/*
2380 		 * Set up another on_fault() for a possible fault
2381 		 * on the user lock accessed at "out_drop".
2382 		 */
2383 		if (on_fault(&ljb)) {
2384 			if (mlocked) {
2385 				mlocked = 0;
2386 				lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2387 			}
2388 			error = EFAULT;
2389 			goto out_nodrop;
2390 		}
2391 		error = EFAULT;
2392 		goto out_nodrop;
2393 	}
2394 
2395 	/* Process rd_wr (including sanity check). */
2396 	try_flag = (rd_wr & TRY_FLAG);
2397 	rd_wr &= ~TRY_FLAG;
2398 	if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
2399 		error = EINVAL;
2400 		goto out_nodrop;
2401 	}
2402 
2403 	/* We can only continue for simple USYNC_PROCESS locks. */
2404 	mp = &rw->mutex;
2405 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
2406 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2407 	if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
2408 		error = EINVAL;
2409 		goto out_nodrop;
2410 	}
2411 
2412 	/* Force Copy-on-write fault incase objects are MAP_PRIVATE. */
2413 	suword8_noerr(&mp->mutex_type, mtype);
2414 	suword16_noerr(&rw->rwlock_type, type);
2415 
2416 	/* Convert user level mutex, "mp", to a unique lwpchan. */
2417 	if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
2418 	    &mlwpchan, LWPCHAN_MPPOOL)) {
2419 		error = EFAULT;
2420 		goto out_nodrop;
2421 	}
2422 
2423 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2424 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2425 	    &lwpchan, LWPCHAN_CVPOOL)) {
2426 		error = EFAULT;
2427 		goto out_nodrop;
2428 	}
2429 
2430 	no_lwpchan = 0;
2431 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2432 	mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2433 
2434 	/*
2435 	 * lwpchan_lock() ensures that the calling LWP is put to sleep
2436 	 * atomically with respect to a possible wakeup which is a result
2437 	 * of lwp_rwlock_unlock().
2438 	 *
2439 	 * What's misleading is that the LWP is put to sleep after the
2440 	 * rwlock's mutex is released. This is OK as long as the release
2441 	 * operation is also done while holding mlwpchan. The LWP is then
2442 	 * put to sleep when the possibility of pagefaulting or sleeping
2443 	 * has been completely eliminated.
2444 	 */
2445 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2446 	locked = 1;
2447 	lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2448 	mlocked = 1;
2449 
2450 	/*
2451 	 * Fetch the current rwlock state.
2452 	 *
2453 	 * The possibility of spurious wake-ups or killed waiters means
2454 	 * rwstate's URW_HAS_WAITERS bit may indicate false positives.
2455 	 * We only fix these if they are important to us.
2456 	 *
2457 	 * Although various error states can be observed here (e.g. the lock
2458 	 * is not held, but there are waiters) we assume these are applicaton
2459 	 * errors and so we take no corrective action.
2460 	 */
2461 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2462 	/*
2463 	 * We cannot legitimately get here from user-level
2464 	 * without URW_HAS_WAITERS being set.
2465 	 * Set it now to guard against user-level error.
2466 	 */
2467 	rwstate |= URW_HAS_WAITERS;
2468 
2469 	/*
2470 	 * We can try only if the lock isn't held by a writer.
2471 	 */
2472 	if (!(rwstate & URW_WRITE_LOCKED)) {
2473 		tp = lwp_queue_waiter(&lwpchan);
2474 		if (tp == NULL) {
2475 			/*
2476 			 * Hmmm, rwstate indicates waiters but there are
2477 			 * none queued. This could just be the result of a
2478 			 * spurious wakeup, so let's ignore it.
2479 			 *
2480 			 * We now have a chance to acquire the lock
2481 			 * uncontended, but this is the last chance for
2482 			 * a writer to acquire the lock without blocking.
2483 			 */
2484 			if (rd_wr == READ_LOCK) {
2485 				rwstate++;
2486 				acquired = 1;
2487 			} else if ((rwstate & URW_READERS_MASK) == 0) {
2488 				rwstate |= URW_WRITE_LOCKED;
2489 				acquired = 1;
2490 			}
2491 		} else if (rd_wr == READ_LOCK) {
2492 			/*
2493 			 * This is the last chance for a reader to acquire
2494 			 * the lock now, but it can only do so if there is
2495 			 * no writer of equal or greater priority at the
2496 			 * head of the queue .
2497 			 *
2498 			 * It is also just possible that there is a reader
2499 			 * at the head of the queue. This may be the result
2500 			 * of a spurious wakeup or an application failure.
2501 			 * In this case we only acquire the lock if we have
2502 			 * equal or greater priority. It is not our job to
2503 			 * release spurious waiters.
2504 			 */
2505 			pri_t our_pri = DISP_PRIO(t);
2506 			pri_t his_pri = DISP_PRIO(tp);
2507 
2508 			if ((our_pri > his_pri) || ((our_pri == his_pri) &&
2509 			    !(tp->t_writer & TRW_WANT_WRITE))) {
2510 				rwstate++;
2511 				acquired = 1;
2512 			}
2513 		}
2514 	}
2515 
2516 	if (acquired || try_flag || time_error) {
2517 		/*
2518 		 * We're not going to block this time.
2519 		 */
2520 		suword32_noerr(&rw->rwlock_readers, rwstate);
2521 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2522 		locked = 0;
2523 
2524 		if (acquired) {
2525 			/*
2526 			 * Got the lock!
2527 			 */
2528 			error = 0;
2529 
2530 		} else if (try_flag) {
2531 			/*
2532 			 * We didn't get the lock and we're about to block.
2533 			 * If we're doing a trylock, return EBUSY instead.
2534 			 */
2535 			error = EBUSY;
2536 
2537 		} else if (time_error) {
2538 			/*
2539 			 * The SUSV3 POSIX spec is very clear that we should
2540 			 * get no error from validating the timer (above)
2541 			 * until we would actually sleep.
2542 			 */
2543 			error = time_error;
2544 		}
2545 
2546 		goto out_drop;
2547 	}
2548 
2549 	/*
2550 	 * We're about to block, so indicate what kind of waiter we are.
2551 	 */
2552 	t->t_writer = 0;
2553 	if (rd_wr == WRITE_LOCK)
2554 		t->t_writer = TRW_WANT_WRITE;
2555 	suword32_noerr(&rw->rwlock_readers, rwstate);
2556 
2557 	/*
2558 	 * Unlock the rwlock's mutex (pagefaults are possible here).
2559 	 */
2560 	suword32_noerr((uint32_t *)&mp->mutex_owner, 0);
2561 	suword32_noerr((uint32_t *)&mp->mutex_owner + 1, 0);
2562 	suword32_noerr(&mp->mutex_ownerpid, 0);
2563 	ulock_clear(&mp->mutex_lockw);
2564 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2565 	if (mwaiters != 0) {
2566 		/*
2567 		 * Given the locking of mlwpchan around the release of
2568 		 * the mutex and checking for waiters, the following
2569 		 * call to lwp_release() can fail ONLY if the lock
2570 		 * acquirer is interrupted after setting the waiter bit,
2571 		 * calling lwp_block() and releasing mlwpchan.
2572 		 * In this case, it could get pulled off the LWP sleep
2573 		 * queue (via setrun()) before the following call to
2574 		 * lwp_release() occurs, and the lock requestor will
2575 		 * update the waiter bit correctly by re-evaluating it.
2576 		 */
2577 		if (lwp_release(&mlwpchan, &mwaiters, 0))
2578 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2579 	}
2580 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2581 	mlocked = 0;
2582 	no_fault();
2583 
2584 	if (mwatched) {
2585 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2586 		mwatched = 0;
2587 	}
2588 	if (watched) {
2589 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2590 		watched = 0;
2591 	}
2592 
2593 	/*
2594 	 * Put the LWP in an orderly state for debugging.
2595 	 */
2596 	prstop(PR_REQUESTED, 0);
2597 	if (timedwait) {
2598 		/*
2599 		 * If we successfully queue the timeout,
2600 		 * then don't drop t_delay_lock until
2601 		 * we are on the sleep queue (below).
2602 		 */
2603 		mutex_enter(&t->t_delay_lock);
2604 		if (lwp_timer_enqueue(&lwpt) != 0) {
2605 			mutex_exit(&t->t_delay_lock);
2606 			imm_timeout = 1;
2607 			timedwait = NULL;
2608 		}
2609 	}
2610 	t->t_flag |= T_WAITCVSEM;
2611 	lwp_block(&lwpchan);
2612 
2613 	/*
2614 	 * Nothing should happen to cause the LWp to go to sleep until after
2615 	 * it returns from swtch().
2616 	 */
2617 	if (timedwait)
2618 		mutex_exit(&t->t_delay_lock);
2619 	locked = 0;
2620 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2621 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t))
2622 		setrun(t);
2623 	swtch();
2624 
2625 	/*
2626 	 * We're back, but we need to work out why. Were we interrupted? Did
2627 	 * we timeout? Were we granted the lock?
2628 	 */
2629 	error = EAGAIN;
2630 	acquired = (t->t_writer & TRW_LOCK_GRANTED);
2631 	t->t_writer = 0;
2632 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2633 	if (timedwait)
2634 		tim = lwp_timer_dequeue(&lwpt);
2635 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
2636 		error = EINTR;
2637 	else if (imm_timeout || (timedwait && tim == -1))
2638 		error = ETIME;
2639 	lwp->lwp_asleep = 0;
2640 	lwp->lwp_sysabort = 0;
2641 	setallwatch();
2642 
2643 	/*
2644 	 * If we were granted the lock we don't care about EINTR or ETIME.
2645 	 */
2646 	if (acquired)
2647 		error = 0;
2648 
2649 	if (t->t_mstate == LMS_USER_LOCK)
2650 		(void) new_mstate(t, LMS_SYSTEM);
2651 
2652 	if (error)
2653 		return (set_errno(error));
2654 	return (0);
2655 
2656 out_drop:
2657 	/*
2658 	 * Make sure that the user level lock is dropped before returning
2659 	 * to the caller.
2660 	 */
2661 	if (!mlocked) {
2662 		lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2663 		mlocked = 1;
2664 	}
2665 	suword32_noerr((uint32_t *)&mp->mutex_owner, 0);
2666 	suword32_noerr((uint32_t *)&mp->mutex_owner + 1, 0);
2667 	suword32_noerr(&mp->mutex_ownerpid, 0);
2668 	ulock_clear(&mp->mutex_lockw);
2669 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2670 	if (mwaiters != 0) {
2671 		/*
2672 		 * See comment above on lock clearing and lwp_release()
2673 		 * success/failure.
2674 		 */
2675 		if (lwp_release(&mlwpchan, &mwaiters, 0))
2676 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2677 	}
2678 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2679 	mlocked = 0;
2680 
2681 out_nodrop:
2682 	no_fault();
2683 	if (mwatched)
2684 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2685 	if (watched)
2686 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2687 	if (t->t_mstate == LMS_USER_LOCK)
2688 		(void) new_mstate(t, LMS_SYSTEM);
2689 	if (error)
2690 		return (set_errno(error));
2691 	return (0);
2692 }
2693 
2694 /*
2695  * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
2696  * we never drop the lock.
2697  */
2698 static int
2699 lwp_rwlock_unlock(lwp_rwlock_t *rw)
2700 {
2701 	kthread_t *t = curthread;
2702 	proc_t *p = ttoproc(t);
2703 	lwpchan_t lwpchan;
2704 	volatile uint16_t type = 0;
2705 	volatile int error = 0;
2706 	volatile int locked = 0;
2707 	volatile int watched = 0;
2708 	label_t ljb;
2709 	volatile int no_lwpchan = 1;
2710 	uint32_t rwstate;
2711 
2712 	/* We only check rw because the mutex is included in it. */
2713 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2714 		return (set_errno(EFAULT));
2715 
2716 	if (on_fault(&ljb)) {
2717 		if (no_lwpchan) {
2718 			error = EFAULT;
2719 			goto out_nodrop;
2720 		}
2721 		if (locked) {
2722 			locked = 0;
2723 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2724 		}
2725 		error = EFAULT;
2726 		goto out_nodrop;
2727 	}
2728 
2729 	/* We can only continue for simple USYNC_PROCESS locks. */
2730 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2731 	if (type != USYNC_PROCESS) {
2732 		error = EINVAL;
2733 		goto out_nodrop;
2734 	}
2735 
2736 	/* Force Copy-on-write fault incase objects are MAP_PRIVATE. */
2737 	suword16_noerr(&rw->rwlock_type, type);
2738 
2739 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2740 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2741 	    &lwpchan, LWPCHAN_CVPOOL)) {
2742 		error = EFAULT;
2743 		goto out_nodrop;
2744 	}
2745 
2746 	no_lwpchan = 0;
2747 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2748 
2749 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2750 	locked = 1;
2751 
2752 	/*
2753 	 * We can resolve multiple readers (except the last reader) here.
2754 	 * For the last reader or a writer we need lwp_rwlock_release(),
2755 	 * to which we also delegate the task of copying the new rwstate
2756 	 * back to userland (see the comment there).
2757 	 */
2758 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2759 	if (rwstate & URW_WRITE_LOCKED)
2760 		lwp_rwlock_release(&lwpchan, rw);
2761 	else if ((rwstate & URW_READERS_MASK) > 0) {
2762 		rwstate--;
2763 		if ((rwstate & URW_READERS_MASK) == 0)
2764 			lwp_rwlock_release(&lwpchan, rw);
2765 		else
2766 			suword32_noerr(&rw->rwlock_readers, rwstate);
2767 	}
2768 
2769 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2770 	locked = 0;
2771 	error = 0;
2772 
2773 out_nodrop:
2774 	no_fault();
2775 	if (watched)
2776 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2777 	if (error)
2778 		return (set_errno(error));
2779 	return (0);
2780 }
2781 
2782 int
2783 lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
2784 {
2785 	switch (subcode) {
2786 	case 0:
2787 		return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
2788 	case 1:
2789 		return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
2790 	case 2:
2791 		return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
2792 	case 3:
2793 		return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
2794 	case 4:
2795 		return (lwp_rwlock_unlock(rwlp));
2796 	}
2797 	return (set_errno(EINVAL));
2798 }
2799 
2800 /*
2801  * Return the owner of the user-level s-object.
2802  * Since we can't really do this, return NULL.
2803  */
2804 /* ARGSUSED */
2805 static kthread_t *
2806 lwpsobj_owner(caddr_t sobj)
2807 {
2808 	return ((kthread_t *)NULL);
2809 }
2810 
2811 /*
2812  * Wake up a thread asleep on a user-level synchronization
2813  * object.
2814  */
2815 static void
2816 lwp_unsleep(kthread_t *t)
2817 {
2818 	ASSERT(THREAD_LOCK_HELD(t));
2819 	if (t->t_wchan0 != NULL) {
2820 		sleepq_head_t *sqh;
2821 		sleepq_t *sqp = t->t_sleepq;
2822 
2823 		if (sqp != NULL) {
2824 			sqh = lwpsqhash(&t->t_lwpchan);
2825 			ASSERT(&sqh->sq_queue == sqp);
2826 			sleepq_unsleep(t);
2827 			disp_lock_exit_high(&sqh->sq_lock);
2828 			CL_SETRUN(t);
2829 			return;
2830 		}
2831 	}
2832 	panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
2833 }
2834 
2835 /*
2836  * Change the priority of a thread asleep on a user-level
2837  * synchronization object. To maintain proper priority order,
2838  * we:
2839  *	o dequeue the thread.
2840  *	o change its priority.
2841  *	o re-enqueue the thread.
2842  * Assumption: the thread is locked on entry.
2843  */
2844 static void
2845 lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
2846 {
2847 	ASSERT(THREAD_LOCK_HELD(t));
2848 	if (t->t_wchan0 != NULL) {
2849 		sleepq_t   *sqp = t->t_sleepq;
2850 
2851 		sleepq_dequeue(t);
2852 		*t_prip = pri;
2853 		sleepq_insert(sqp, t);
2854 	} else
2855 		panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
2856 }
2857 
/*
 * Clean up a locked robust mutex
 *
 *	ent	lwpchan cache entry describing the mutex; its type must
 *		include LOCK_ROBUST and its lwpchan_addr is used as the
 *		address of the lwp_mutex_t.
 *	lockflg	flag value handed on to lwp_clear_mutex().
 *
 * Nothing is returned.  A fault on user memory, or a USYNC_PROCESS mutex
 * whose recorded owner pid is some other process, simply ends the
 * cleanup early.
 */
static void
lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
{
	uint16_t flag;
	uchar_t waiters;
	label_t ljb;
	pid_t owner_pid;
	lwp_mutex_t *lp;
	volatile int locked = 0;
	volatile int watched = 0;
	volatile struct upimutex *upimutex = NULL;
	volatile int upilocked = 0;

	ASSERT(ent->lwpchan_type & LOCK_ROBUST);

	lp = (lwp_mutex_t *)ent->lwpchan_addr;
	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	/* Fault handler: release whatever we hold, then give up. */
	if (on_fault(&ljb)) {
		if (locked)
			lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
		if (upilocked)
			upimutex_unlock((upimutex_t *)upimutex, 0);
		goto out;
	}
	if (ent->lwpchan_type & USYNC_PROCESS) {
		/*
		 * Skip the mutex if it records an owner pid other than
		 * ours.  For a non-upimutex, an owner pid of zero is
		 * let through and handled as "no owner" below.
		 */
		fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
		if ((UPIMUTEX(ent->lwpchan_type) || owner_pid != 0) &&
		    owner_pid != curproc->p_pid)
			goto out;
	}
	if (UPIMUTEX(ent->lwpchan_type)) {
		lwpchan_t lwpchan = ent->lwpchan_lwpchan;
		upib_t *upibp = &UPI_CHAIN(lwpchan);

		/* Only proceed if the current thread owns the upimutex. */
		mutex_enter(&upibp->upib_lock);
		upimutex = upi_get(upibp, &lwpchan);
		if (upimutex == NULL || upimutex->upi_owner != curthread) {
			mutex_exit(&upibp->upib_lock);
			goto out;
		}
		mutex_exit(&upibp->upib_lock);
		upilocked = 1;
		flag = lwp_clear_mutex(lp, lockflg);
		suword8_noerr(&lp->mutex_lockw, 0);
		upimutex_unlock((upimutex_t *)upimutex, flag);
	} else {
		lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
		locked = 1;
		if ((ent->lwpchan_type & USYNC_PROCESS) && owner_pid == 0) {
			/*
			 * There is no owner.  If there are waiters,
			 * we should wake up one or all of them.
			 * It doesn't hurt to wake them up in error
			 * since they will just retry the lock and
			 * go to sleep again if necessary.
			 */
			fuword8_noerr(&lp->mutex_waiters, &waiters);
			if (waiters != 0) {	/* there are waiters */
				fuword16_noerr(&lp->mutex_flag, &flag);
				if (flag & LOCK_NOTRECOVERABLE) {
					lwp_release_all(&ent->lwpchan_lwpchan);
					suword8_noerr(&lp->mutex_waiters, 0);
				} else if (lwp_release(&ent->lwpchan_lwpchan,
				    &waiters, 0)) {
					suword8_noerr(&lp->mutex_waiters,
					    waiters);
				}
			}
		} else {
			/*
			 * Clear the mutex and its lock word, then
			 * release one waiter if there is any.
			 */
			(void) lwp_clear_mutex(lp, lockflg);
			ulock_clear(&lp->mutex_lockw);
			fuword8_noerr(&lp->mutex_waiters, &waiters);
			if (waiters &&
			    lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
				suword8_noerr(&lp->mutex_waiters, waiters);
		}
		lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
	}
out:
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
}
2944 
2945 /*
2946  * Register a process-shared robust mutex in the lwpchan cache.
 *
 * lp is the user address of the mutex.  The mutex_type byte read from
 * it must have both USYNC_PROCESS and LOCK_ROBUST set.
 *
 * Returns 0 on success.  Otherwise returns set_errno() of:
 *	EFAULT	lp is at or above USERLIMIT, an access to the mutex
 *		faulted, or get_lwpchan() failed
 *	EINVAL	mutex_type lacks USYNC_PROCESS or LOCK_ROBUST
2947  */
2948 int
2949 lwp_mutex_register(lwp_mutex_t *lp)
2950 {
2951 	int error = 0;
2952 	volatile int watched;
2953 	label_t ljb;
2954 	uint8_t type;
2955 	lwpchan_t lwpchan;
2956 
	/* Reject addresses outside user space before touching anything. */
2957 	if ((caddr_t)lp >= (caddr_t)USERLIMIT)
2958 		return (set_errno(EFAULT));
2959 
	/*
	 * Remember what watch_disable_addr() returned; the matching
	 * watch_enable_addr() at the end is made only when it is nonzero.
	 */
2960 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2961 
	/*
	 * A nonzero return from on_fault() is treated as a fault taken by
	 * one of the *_noerr accesses in the else branch.  (on_fault() is
	 * defined outside this part of the file.)
	 */
2962 	if (on_fault(&ljb)) {
2963 		error = EFAULT;
2964 	} else {
2965 		fuword8_noerr(&lp->mutex_type, &type);
		/* Both bits are required, not just either one. */
2966 		if ((type & (USYNC_PROCESS|LOCK_ROBUST))
2967 		    != (USYNC_PROCESS|LOCK_ROBUST)) {
2968 			error = EINVAL;
2969 		} else {
2970 			/*
2971 			 * Force Copy-on-write fault if lwp_mutex_t object is
2972 			 * defined to be MAP_PRIVATE and it was initialized to
2973 			 * USYNC_PROCESS.
2974 			 */
2975 			suword8_noerr(&lp->mutex_type, type);
			/*
			 * The lwpchan filled in here is not used again in
			 * this function; only the success or failure of the
			 * lookup matters.
			 */
2976 			if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
2977 			    &lwpchan, LWPCHAN_MPPOOL))
2978 				error = EFAULT;
2979 		}
2980 	}
	/* Common exit: drop the fault handler, then undo the watch disable. */
2981 	no_fault();
2982 	if (watched)
2983 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2984 	if (error)
2985 		return (set_errno(error));
2986 	return (0);
2987 }
2988 
/*
 * Make a single attempt to acquire the user-level mutex at lp.  There is
 * no wait path in this function: if ulock_try() fails the caller gets
 * EBUSY.
 *
 * Returns 0 on success.  Otherwise returns set_errno() of:
 *	EFAULT		lp is at or above the address space's a_userlimit,
 *			an access to the mutex faulted, or get_lwpchan()
 *			failed
 *	EBUSY		ulock_try() on mutex_lockw failed
 *	ENOTRECOVERABLE	a LOCK_ROBUST mutex has LOCK_NOTRECOVERABLE set
 *	EOWNERDEAD,
 *	ELOCKUNMAPPED	a LOCK_ROBUST mutex was acquired but its flag word
 *			has LOCK_OWNERDEAD or LOCK_UNMAPPED set; nothing in
 *			this function releases mutex_lockw in that case
 * For UPIMUTEX() types the result of lwp_upimutex_lock() with
 * UPIMUTEX_TRY is returned instead.
 */
2989 int
2990 lwp_mutex_trylock(lwp_mutex_t *lp)
2991 {
2992 	kthread_t *t = curthread;
2993 	proc_t *p = ttoproc(t);
2994 	int error = 0;
2995 	volatile int locked = 0;
2996 	volatile int watched = 0;
2997 	label_t ljb;
2998 	volatile uint8_t type = 0;
2999 	uint16_t flag;
3000 	lwpchan_t lwpchan;
3001 
3002 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3003 		return (set_errno(EFAULT));
3004 
	/*
	 * Enter the LMS_USER_LOCK microstate; it is switched back to
	 * LMS_SYSTEM at 'out' if it is still the current state.
	 */
3005 	(void) new_mstate(t, LMS_USER_LOCK);
3006 
	/*
	 * Fault path for the *_noerr accesses below.  'locked' says whether
	 * the lwpchan lock is held at the time of the fault; it is declared
	 * volatile, presumably so that its value is reliable after the
	 * non-local return into this branch.
	 */
3007 	if (on_fault(&ljb)) {
3008 		if (locked)
3009 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3010 		error = EFAULT;
3011 		goto out;
3012 	}
3013 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3014 	if (UPIMUTEX(type)) {
		/*
		 * These types are handed off entirely to
		 * lwp_upimutex_lock(); the fault handler is removed first.
		 *
		 * NOTE(review): this branch returns directly and does not
		 * pass through 'out', so the LMS_USER_LOCK -> LMS_SYSTEM
		 * switch there is not performed on this path -- confirm
		 * that is intended.
		 */
3015 		no_fault();
3016 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
		/*
		 * For process-shared mutexes, store our pid as owner on
		 * success and on EOWNERDEAD/ELOCKUNMAPPED (presumably the
		 * cases in which the lock was actually obtained).  The
		 * result of the store is deliberately ignored.
		 */
3017 		if ((type & USYNC_PROCESS) &&
3018 		    (error == 0 ||
3019 		    error == EOWNERDEAD || error == ELOCKUNMAPPED))
3020 			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
3021 		if (error)
3022 			return (set_errno(error));
3023 		return (0);
3024 	}
3025 	/*
3026 	 * Force Copy-on-write fault if lwp_mutex_t object is
3027 	 * defined to be MAP_PRIVATE and it was initialized to
3028 	 * USYNC_PROCESS.
3029 	 */
3030 	suword8_noerr(&lp->mutex_type, type);
3031 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3032 	    &lwpchan, LWPCHAN_MPPOOL)) {
3033 		error = EFAULT;
3034 		goto out;
3035 	}
	/* From here until 'locked' is cleared the fault path must unlock. */
3036 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3037 	locked = 1;
3038 	if (type & LOCK_ROBUST) {
3039 		fuword16_noerr(&lp->mutex_flag, &flag);
3040 		if (flag & LOCK_NOTRECOVERABLE) {
			/*
			 * Drop the channel lock by hand before bailing out;
			 * the 'out' path does not unlock.
			 */
3041 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3042 			error =  ENOTRECOVERABLE;
3043 			goto out;
3044 		}
3045 	}
3046 
	/* Nonzero 'watched' makes 'out' call watch_enable_addr() again. */
3047 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3048 
3049 	if (!ulock_try(&lp->mutex_lockw))
3050 		error = EBUSY;
3051 	else {
		/* Lock word obtained: record ownership, then report state. */
3052 		if (type & USYNC_PROCESS)
3053 			suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
3054 		if (type & LOCK_ROBUST) {
3055 			fuword16_noerr(&lp->mutex_flag, &flag);
			/*
			 * LOCK_OWNERDEAD takes precedence.  LOCK_UNMAPPED on
			 * its own is reported as ELOCKUNMAPPED only when the
			 * type has USYNC_PROCESS_ROBUST; otherwise it too is
			 * reported as EOWNERDEAD.
			 */
3056 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3057 				if (flag & LOCK_OWNERDEAD)
3058 					error = EOWNERDEAD;
3059 				else if (type & USYNC_PROCESS_ROBUST)
3060 					error = ELOCKUNMAPPED;
3061 				else
3062 					error = EOWNERDEAD;
3063 			}
3064 		}
3065 	}
	/*
	 * Clear 'locked' before releasing, so the fault path never sees it
	 * set once the channel lock has been dropped.
	 */
3066 	locked = 0;
3067 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3068 out:
3069 
3070 	if (t->t_mstate == LMS_USER_LOCK)
3071 		(void) new_mstate(t, LMS_SYSTEM);
3072 
3073 	no_fault();
3074 	if (watched)
3075 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3076 	if (error)
3077 		return (set_errno(error));
3078 	return (0);
3079 }
3080 
3081 /*
3082  * Unlock the mutex and unblock an lwp that is trying to acquire it.
3083  * The unblocked lwp resumes and retries acquiring the lock.
 *
 * For a LOCK_ROBUST mutex whose flag word still has LOCK_OWNERDEAD or
 * LOCK_UNMAPPED set at unlock time, those bits are replaced by
 * LOCK_NOTRECOVERABLE and every waiter is released rather than one.
 *
 * For non-UPIMUTEX types this function performs no check of who owns
 * the mutex before clearing mutex_lockw.
 *
 * Returns 0 on success.  Otherwise returns set_errno() of:
 *	EFAULT	lp is at or above the address space's a_userlimit, an
 *		access to the mutex faulted, or get_lwpchan() failed
 * For UPIMUTEX() types the result of lwp_upimutex_unlock() is returned
 * instead.
3084  */
3085 int
3086 lwp_mutex_unlock(lwp_mutex_t *lp)
3087 {
3088 	proc_t *p = ttoproc(curthread);
3089 	lwpchan_t lwpchan;
3090 	uchar_t waiters;
3091 	volatile int locked = 0;
3092 	volatile int watched = 0;
3093 	volatile uint8_t type = 0;
3094 	label_t ljb;
3095 	uint16_t flag;
3096 	int error = 0;
3097 
3098 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3099 		return (set_errno(EFAULT));
3100 
	/*
	 * Fault path for the *_noerr accesses below: release the lwpchan
	 * lock if it is held at the time, then leave through 'out'.
	 */
3101 	if (on_fault(&ljb)) {
3102 		if (locked)
3103 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3104 		error = EFAULT;
3105 		goto out;
3106 	}
3107 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3108 	if (UPIMUTEX(type)) {
		/*
		 * These types are handed off entirely to
		 * lwp_upimutex_unlock(); the fault handler is removed first
		 * and 'out' is bypassed ('watched' is still 0 here).
		 */
3109 		no_fault();
3110 		error = lwp_upimutex_unlock(lp, type);
3111 		if (error)
3112 			return (set_errno(error));
3113 		return (0);
3114 	}
3115 
	/* Nonzero 'watched' makes 'out' call watch_enable_addr() again. */
3116 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3117 
3118 	/*
3119 	 * Force Copy-on-write fault if lwp_mutex_t object is
3120 	 * defined to be MAP_PRIVATE, and type is USYNC_PROCESS
3121 	 */
3122 	suword8_noerr(&lp->mutex_type, type);
3123 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3124 	    &lwpchan, LWPCHAN_MPPOOL)) {
3125 		error = EFAULT;
3126 		goto out;
3127 	}
	/* From here to the final lwpchan_unlock() the fault path must unlock. */
3128 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3129 	locked = 1;
3130 	if (type & LOCK_ROBUST) {
3131 		fuword16_noerr(&lp->mutex_flag, &flag);
		/*
		 * The mutex is being released while still marked
		 * OWNERDEAD/UNMAPPED: convert that marking into
		 * LOCK_NOTRECOVERABLE, both in user memory and in the local
		 * copy of 'flag' that is tested again below.
		 */
3132 		if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3133 			flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
3134 			flag |= LOCK_NOTRECOVERABLE;
3135 			suword16_noerr(&lp->mutex_flag, flag);
3136 		}
3137 	}
	/* Clear the recorded owner pid before clearing the lock word itself. */
3138 	if (type & USYNC_PROCESS)
3139 		suword32_noerr(&lp->mutex_ownerpid, 0);
3140 	ulock_clear(&lp->mutex_lockw);
3141 	/*
3142 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
3143 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
3144 	 * may fail.  If it fails, do not write into the waiter bit.
3145 	 * The call to lwp_release() might fail due to one of three reasons:
3146 	 *
3147 	 * 	1. due to the thread which set the waiter bit not actually
3148 	 *	   sleeping since it got the lock on the re-try. The waiter
3149 	 *	   bit will then be correctly updated by that thread. This
3150 	 *	   window may be closed by reading the wait bit again here
3151 	 *	   and not calling lwp_release() at all if it is zero.
3152 	 *	2. the thread which set the waiter bit and went to sleep
3153 	 *	   was woken up by a signal. This time, the waiter recomputes
3154 	 *	   the wait bit in the return with EINTR code.
3155 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
3156 	 *	   memory that has been re-used after the lock was dropped.
3157 	 *	   In this case, writing into the waiter bit would cause data
3158 	 *	   corruption.
3159 	 */
3160 	fuword8_noerr(&lp->mutex_waiters, &waiters);
3161 	if (waiters) {
		/*
		 * 'flag' holds a fetched value only when LOCK_ROBUST is set
		 * (it is read under the same test above), so the type check
		 * has to come first in this condition.
		 */
3162 		if ((type & LOCK_ROBUST) &&
3163 		    (flag & LOCK_NOTRECOVERABLE)) {
3164 			lwp_release_all(&lwpchan);
3165 			suword8_noerr(&lp->mutex_waiters, 0);
3166 		} else if (lwp_release(&lwpchan, &waiters, 0)) {
			/* Store back the waiters value updated by lwp_release(). */
3167 			suword8_noerr(&lp->mutex_waiters, waiters);
3168 		}
3169 	}
3170 
3171 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3172 out:
3173 	no_fault();
3174 	if (watched)
3175 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3176 	if (error)
3177 		return (set_errno(error));
3178 	return (0);
3179 }
3180