xref: /titanic_44/usr/src/uts/common/syscall/lwp_sobj.c (revision ad09f8b827db90c9a0093f0b6382803fa64a5fd1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved	*/
29 
30 #include <sys/param.h>
31 #include <sys/types.h>
32 #include <sys/sysmacros.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/user.h>
36 #include <sys/errno.h>
37 #include <sys/file.h>
38 #include <sys/proc.h>
39 #include <sys/prsystm.h>
40 #include <sys/kmem.h>
41 #include <sys/sobject.h>
42 #include <sys/fault.h>
43 #include <sys/procfs.h>
44 #include <sys/watchpoint.h>
45 #include <sys/time.h>
46 #include <sys/cmn_err.h>
47 #include <sys/machlock.h>
48 #include <sys/debug.h>
49 #include <sys/synch.h>
50 #include <sys/synch32.h>
51 #include <sys/mman.h>
52 #include <sys/class.h>
53 #include <sys/schedctl.h>
54 #include <sys/sleepq.h>
55 #include <sys/policy.h>
56 #include <sys/tnf_probe.h>
57 #include <sys/lwpchan_impl.h>
58 #include <sys/turnstile.h>
59 #include <sys/atomic.h>
60 #include <sys/lwp_timer_impl.h>
61 #include <sys/lwp_upimutex_impl.h>
62 #include <vm/as.h>
63 #include <sys/sdt.h>
64 
65 static kthread_t *lwpsobj_owner(caddr_t);
66 static void lwp_unsleep(kthread_t *t);
67 static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
68 static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);
69 static void lwp_mutex_unregister(void *uaddr);
70 static void set_owner_pid(lwp_mutex_t *, uintptr_t, pid_t);
71 static int iswanted(kthread_t *, lwpchan_t *);
72 
73 extern int lwp_cond_signal(lwp_cond_t *cv);
74 
75 /*
76  * Maximum number of user prio inheritance locks that can be held by a thread.
77  * Used to limit kmem for each thread. This is a per-thread limit that
78  * can be administered on a system wide basis (using /etc/system).
79  *
80  * Also, when a limit, say maxlwps is added for numbers of lwps within a
81  * process, the per-thread limit automatically becomes a process-wide limit
82  * of maximum number of held upi locks within a process:
83  *      maxheldupimx = maxnestupimx * maxlwps;
84  */
85 static uint32_t maxnestupimx = 2000;
86 
87 /*
88  * The sobj_ops vector exports a set of functions needed when a thread
89  * is asleep on a synchronization object of this type.
90  */
91 static sobj_ops_t lwp_sobj_ops = {
92 	SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
93 };
94 
95 static kthread_t *lwpsobj_pi_owner(upimutex_t *up);
96 
97 static sobj_ops_t lwp_sobj_pi_ops = {
98 	SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
99 	turnstile_change_pri
100 };
101 
102 static sleepq_head_t	lwpsleepq[NSLEEPQ];
103 upib_t			upimutextab[UPIMUTEX_TABSIZE];
104 
/* Each lock pool holds 1 << LWPCHAN_LOCK_SHIFT (1024) locks. */
#define	LWPCHAN_LOCK_SHIFT	10
#define	LWPCHAN_LOCK_SIZE	(1 << LWPCHAN_LOCK_SHIFT)

/*
 * Hash X into the lock array.  lc_wchan and lc_wchan0 are addresses
 * that are most likely 8-byte aligned, so the low-order 3 bits are
 * shifted off before folding.  'pool' is either 0 or 1; a non-zero
 * pool selects the second run of LWPCHAN_LOCK_SIZE locks.
 * Note that X is evaluated twice.
 */
#define	LWPCHAN_LOCK_HASH(X, pool) \
	(((((X) >> 3) ^ ((X) >> (3 + LWPCHAN_LOCK_SHIFT))) & \
	(LWPCHAN_LOCK_SIZE - 1)) + ((pool) ? LWPCHAN_LOCK_SIZE : 0))
116 
117 static kmutex_t		lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
118 
119 /*
120  * Is this a POSIX threads user-level lock requiring priority inheritance?
121  */
122 #define	UPIMUTEX(type)	((type) & LOCK_PRIO_INHERIT)
123 
124 static sleepq_head_t *
125 lwpsqhash(lwpchan_t *lwpchan)
126 {
127 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
128 	return (&lwpsleepq[SQHASHINDEX(x)]);
129 }
130 
131 /*
132  * Lock an lwpchan.
133  * Keep this in sync with lwpchan_unlock(), below.
134  */
135 static void
136 lwpchan_lock(lwpchan_t *lwpchan, int pool)
137 {
138 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
139 	mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
140 }
141 
142 /*
143  * Unlock an lwpchan.
144  * Keep this in sync with lwpchan_lock(), above.
145  */
146 static void
147 lwpchan_unlock(lwpchan_t *lwpchan, int pool)
148 {
149 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
150 	mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
151 }
152 
153 /*
154  * Delete mappings from the lwpchan cache for pages that are being
155  * unmapped by as_unmap().  Given a range of addresses, "start" to "end",
156  * all mappings within the range are deleted from the lwpchan cache.
157  */
158 void
159 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
160 {
161 	lwpchan_data_t *lcp;
162 	lwpchan_hashbucket_t *hashbucket;
163 	lwpchan_hashbucket_t *endbucket;
164 	lwpchan_entry_t *ent;
165 	lwpchan_entry_t **prev;
166 	caddr_t addr;
167 
168 	mutex_enter(&p->p_lcp_lock);
169 	lcp = p->p_lcp;
170 	hashbucket = lcp->lwpchan_cache;
171 	endbucket = hashbucket + lcp->lwpchan_size;
172 	for (; hashbucket < endbucket; hashbucket++) {
173 		if (hashbucket->lwpchan_chain == NULL)
174 			continue;
175 		mutex_enter(&hashbucket->lwpchan_lock);
176 		prev = &hashbucket->lwpchan_chain;
177 		/* check entire chain */
178 		while ((ent = *prev) != NULL) {
179 			addr = ent->lwpchan_addr;
180 			if (start <= addr && addr < end) {
181 				*prev = ent->lwpchan_next;
182 				/*
183 				 * We do this only for the obsolete type
184 				 * USYNC_PROCESS_ROBUST.  Otherwise robust
185 				 * locks do not draw ELOCKUNMAPPED or
186 				 * EOWNERDEAD due to being unmapped.
187 				 */
188 				if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
189 				    (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
190 					lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
191 				/*
192 				 * If there is a user-level robust lock
193 				 * registration, mark it as invalid.
194 				 */
195 				if ((addr = ent->lwpchan_uaddr) != NULL)
196 					lwp_mutex_unregister(addr);
197 				kmem_free(ent, sizeof (*ent));
198 				atomic_add_32(&lcp->lwpchan_entries, -1);
199 			} else {
200 				prev = &ent->lwpchan_next;
201 			}
202 		}
203 		mutex_exit(&hashbucket->lwpchan_lock);
204 	}
205 	mutex_exit(&p->p_lcp_lock);
206 }
207 
208 /*
209  * Given an lwpchan cache pointer and a process virtual address,
210  * return a pointer to the corresponding lwpchan hash bucket.
211  */
212 static lwpchan_hashbucket_t *
213 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
214 {
215 	uint_t i;
216 
217 	/*
218 	 * All user-level sync object addresses are 8-byte aligned.
219 	 * Ignore the lowest 3 bits of the address and use the
220 	 * higher-order 2*lwpchan_bits bits for the hash index.
221 	 */
222 	addr >>= 3;
223 	i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
224 	return (lcp->lwpchan_cache + i);
225 }
226 
227 /*
228  * (Re)allocate the per-process lwpchan cache.
229  */
230 static void
231 lwpchan_alloc_cache(proc_t *p, uint_t bits)
232 {
233 	lwpchan_data_t *lcp;
234 	lwpchan_data_t *old_lcp;
235 	lwpchan_hashbucket_t *hashbucket;
236 	lwpchan_hashbucket_t *endbucket;
237 	lwpchan_hashbucket_t *newbucket;
238 	lwpchan_entry_t *ent;
239 	lwpchan_entry_t *next;
240 	uint_t count;
241 
242 	ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
243 
244 	lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
245 	lcp->lwpchan_bits = bits;
246 	lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
247 	lcp->lwpchan_mask = lcp->lwpchan_size - 1;
248 	lcp->lwpchan_entries = 0;
249 	lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
250 	    sizeof (lwpchan_hashbucket_t), KM_SLEEP);
251 	lcp->lwpchan_next_data = NULL;
252 
253 	mutex_enter(&p->p_lcp_lock);
254 	if ((old_lcp = p->p_lcp) != NULL) {
255 		if (old_lcp->lwpchan_bits >= bits) {
256 			/* someone beat us to it */
257 			mutex_exit(&p->p_lcp_lock);
258 			kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
259 			    sizeof (lwpchan_hashbucket_t));
260 			kmem_free(lcp, sizeof (lwpchan_data_t));
261 			return;
262 		}
263 		/*
264 		 * Acquire all of the old hash table locks.
265 		 */
266 		hashbucket = old_lcp->lwpchan_cache;
267 		endbucket = hashbucket + old_lcp->lwpchan_size;
268 		for (; hashbucket < endbucket; hashbucket++)
269 			mutex_enter(&hashbucket->lwpchan_lock);
270 		/*
271 		 * Move all of the old hash table entries to the
272 		 * new hash table.  The new hash table has not yet
273 		 * been installed so we don't need any of its locks.
274 		 */
275 		count = 0;
276 		hashbucket = old_lcp->lwpchan_cache;
277 		for (; hashbucket < endbucket; hashbucket++) {
278 			ent = hashbucket->lwpchan_chain;
279 			while (ent != NULL) {
280 				next = ent->lwpchan_next;
281 				newbucket = lwpchan_bucket(lcp,
282 				    (uintptr_t)ent->lwpchan_addr);
283 				ent->lwpchan_next = newbucket->lwpchan_chain;
284 				newbucket->lwpchan_chain = ent;
285 				ent = next;
286 				count++;
287 			}
288 			hashbucket->lwpchan_chain = NULL;
289 		}
290 		lcp->lwpchan_entries = count;
291 	}
292 
293 	/*
294 	 * Retire the old hash table.  We can't actually kmem_free() it
295 	 * now because someone may still have a pointer to it.  Instead,
296 	 * we link it onto the new hash table's list of retired hash tables.
297 	 * The new hash table is double the size of the previous one, so
298 	 * the total size of all retired hash tables is less than the size
299 	 * of the new one.  exit() and exec() free the retired hash tables
300 	 * (see lwpchan_destroy_cache(), below).
301 	 */
302 	lcp->lwpchan_next_data = old_lcp;
303 
304 	/*
305 	 * As soon as we store the new lcp, future locking operations will
306 	 * use it.  Therefore, we must ensure that all the state we've just
307 	 * established reaches global visibility before the new lcp does.
308 	 */
309 	membar_producer();
310 	p->p_lcp = lcp;
311 
312 	if (old_lcp != NULL) {
313 		/*
314 		 * Release all of the old hash table locks.
315 		 */
316 		hashbucket = old_lcp->lwpchan_cache;
317 		for (; hashbucket < endbucket; hashbucket++)
318 			mutex_exit(&hashbucket->lwpchan_lock);
319 	}
320 	mutex_exit(&p->p_lcp_lock);
321 }
322 
323 /*
324  * Deallocate the lwpchan cache, and any dynamically allocated mappings.
325  * Called when the process exits or execs.  All lwps except one have
326  * exited so we need no locks here.
327  */
328 void
329 lwpchan_destroy_cache(int exec)
330 {
331 	proc_t *p = curproc;
332 	lwpchan_hashbucket_t *hashbucket;
333 	lwpchan_hashbucket_t *endbucket;
334 	lwpchan_data_t *lcp;
335 	lwpchan_entry_t *ent;
336 	lwpchan_entry_t *next;
337 	uint16_t lockflg;
338 
339 	lcp = p->p_lcp;
340 	p->p_lcp = NULL;
341 
342 	lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
343 	hashbucket = lcp->lwpchan_cache;
344 	endbucket = hashbucket + lcp->lwpchan_size;
345 	for (; hashbucket < endbucket; hashbucket++) {
346 		ent = hashbucket->lwpchan_chain;
347 		hashbucket->lwpchan_chain = NULL;
348 		while (ent != NULL) {
349 			next = ent->lwpchan_next;
350 			if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
351 			    (ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
352 			    == (USYNC_PROCESS | LOCK_ROBUST))
353 				lwp_mutex_cleanup(ent, lockflg);
354 			kmem_free(ent, sizeof (*ent));
355 			ent = next;
356 		}
357 	}
358 
359 	while (lcp != NULL) {
360 		lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
361 		kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
362 		    sizeof (lwpchan_hashbucket_t));
363 		kmem_free(lcp, sizeof (lwpchan_data_t));
364 		lcp = next_lcp;
365 	}
366 }
367 
368 /*
369  * Return zero when there is an entry in the lwpchan cache for the
370  * given process virtual address and non-zero when there is not.
371  * The returned non-zero value is the current length of the
372  * hash chain plus one.  The caller holds the hash bucket lock.
373  */
374 static uint_t
375 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
376 	lwpchan_hashbucket_t *hashbucket)
377 {
378 	lwpchan_entry_t *ent;
379 	uint_t count = 1;
380 
381 	for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
382 		if (ent->lwpchan_addr == addr) {
383 			if (ent->lwpchan_type != type ||
384 			    ent->lwpchan_pool != pool) {
385 				/*
386 				 * This shouldn't happen, but might if the
387 				 * process reuses its memory for different
388 				 * types of sync objects.  We test first
389 				 * to avoid grabbing the memory cache line.
390 				 */
391 				ent->lwpchan_type = (uint16_t)type;
392 				ent->lwpchan_pool = (uint16_t)pool;
393 			}
394 			*lwpchan = ent->lwpchan_lwpchan;
395 			return (0);
396 		}
397 		count++;
398 	}
399 	return (count);
400 }
401 
402 /*
403  * Return the cached lwpchan mapping if cached, otherwise insert
404  * a virtual address to lwpchan mapping into the cache.
405  */
406 static int
407 lwpchan_get_mapping(struct as *as, caddr_t addr, caddr_t uaddr,
408 	int type, lwpchan_t *lwpchan, int pool)
409 {
410 	proc_t *p = curproc;
411 	lwpchan_data_t *lcp;
412 	lwpchan_hashbucket_t *hashbucket;
413 	lwpchan_entry_t *ent;
414 	memid_t	memid;
415 	uint_t count;
416 	uint_t bits;
417 
418 top:
419 	/* initialize the lwpchan cache, if necesary */
420 	if ((lcp = p->p_lcp) == NULL) {
421 		lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
422 		goto top;
423 	}
424 	hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
425 	mutex_enter(&hashbucket->lwpchan_lock);
426 	if (lcp != p->p_lcp) {
427 		/* someone resized the lwpchan cache; start over */
428 		mutex_exit(&hashbucket->lwpchan_lock);
429 		goto top;
430 	}
431 	if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
432 		/* it's in the cache */
433 		mutex_exit(&hashbucket->lwpchan_lock);
434 		return (1);
435 	}
436 	mutex_exit(&hashbucket->lwpchan_lock);
437 	if (as_getmemid(as, addr, &memid) != 0)
438 		return (0);
439 	lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
440 	lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
441 	ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
442 	mutex_enter(&hashbucket->lwpchan_lock);
443 	if (lcp != p->p_lcp) {
444 		/* someone resized the lwpchan cache; start over */
445 		mutex_exit(&hashbucket->lwpchan_lock);
446 		kmem_free(ent, sizeof (*ent));
447 		goto top;
448 	}
449 	count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
450 	if (count == 0) {
451 		/* someone else added this entry to the cache */
452 		mutex_exit(&hashbucket->lwpchan_lock);
453 		kmem_free(ent, sizeof (*ent));
454 		return (1);
455 	}
456 	if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
457 	    (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
458 		/* hash chain too long; reallocate the hash table */
459 		mutex_exit(&hashbucket->lwpchan_lock);
460 		kmem_free(ent, sizeof (*ent));
461 		lwpchan_alloc_cache(p, bits + 1);
462 		goto top;
463 	}
464 	ent->lwpchan_addr = addr;
465 	ent->lwpchan_uaddr = uaddr;
466 	ent->lwpchan_type = (uint16_t)type;
467 	ent->lwpchan_pool = (uint16_t)pool;
468 	ent->lwpchan_lwpchan = *lwpchan;
469 	ent->lwpchan_next = hashbucket->lwpchan_chain;
470 	hashbucket->lwpchan_chain = ent;
471 	atomic_add_32(&lcp->lwpchan_entries, 1);
472 	mutex_exit(&hashbucket->lwpchan_lock);
473 	return (1);
474 }
475 
476 /*
477  * Return a unique pair of identifiers that corresponds to a
478  * synchronization object's virtual address.  Process-shared
479  * sync objects usually get vnode/offset from as_getmemid().
480  */
481 static int
482 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
483 {
484 	/*
485 	 * If the lwp synch object is defined to be process-private,
486 	 * we just make the first field of the lwpchan be 'as' and
487 	 * the second field be the synch object's virtual address.
488 	 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
489 	 * The lwpchan cache is used only for process-shared objects.
490 	 */
491 	if (!(type & USYNC_PROCESS)) {
492 		lwpchan->lc_wchan0 = (caddr_t)as;
493 		lwpchan->lc_wchan = addr;
494 		return (1);
495 	}
496 
497 	return (lwpchan_get_mapping(as, addr, NULL, type, lwpchan, pool));
498 }
499 
500 static void
501 lwp_block(lwpchan_t *lwpchan)
502 {
503 	kthread_t *t = curthread;
504 	klwp_t *lwp = ttolwp(t);
505 	sleepq_head_t *sqh;
506 
507 	thread_lock(t);
508 	t->t_flag |= T_WAKEABLE;
509 	t->t_lwpchan = *lwpchan;
510 	t->t_sobj_ops = &lwp_sobj_ops;
511 	t->t_release = 0;
512 	sqh = lwpsqhash(lwpchan);
513 	disp_lock_enter_high(&sqh->sq_lock);
514 	CL_SLEEP(t);
515 	DTRACE_SCHED(sleep);
516 	THREAD_SLEEP(t, &sqh->sq_lock);
517 	sleepq_insert(&sqh->sq_queue, t);
518 	thread_unlock(t);
519 	lwp->lwp_asleep = 1;
520 	lwp->lwp_sysabort = 0;
521 	lwp->lwp_ru.nvcsw++;
522 	(void) new_mstate(curthread, LMS_SLEEP);
523 }
524 
525 static kthread_t *
526 lwpsobj_pi_owner(upimutex_t *up)
527 {
528 	return (up->upi_owner);
529 }
530 
531 static struct upimutex *
532 upi_get(upib_t *upibp, lwpchan_t *lcp)
533 {
534 	struct upimutex *upip;
535 
536 	for (upip = upibp->upib_first; upip != NULL;
537 	    upip = upip->upi_nextchain) {
538 		if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
539 		    upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
540 			break;
541 	}
542 	return (upip);
543 }
544 
545 static void
546 upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
547 {
548 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
549 
550 	/*
551 	 * Insert upimutex at front of list. Maybe a bit unfair
552 	 * but assume that not many lwpchans hash to the same
553 	 * upimutextab bucket, i.e. the list of upimutexes from
554 	 * upib_first is not too long.
555 	 */
556 	upimutex->upi_nextchain = upibp->upib_first;
557 	upibp->upib_first = upimutex;
558 }
559 
560 static void
561 upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
562 {
563 	struct upimutex **prev;
564 
565 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
566 
567 	prev = &upibp->upib_first;
568 	while (*prev != upimutex) {
569 		prev = &(*prev)->upi_nextchain;
570 	}
571 	*prev = upimutex->upi_nextchain;
572 	upimutex->upi_nextchain = NULL;
573 }
574 
575 /*
576  * Add upimutex to chain of upimutexes held by curthread.
577  * Returns number of upimutexes held by curthread.
578  */
579 static uint32_t
580 upi_mylist_add(struct upimutex *upimutex)
581 {
582 	kthread_t *t = curthread;
583 
584 	/*
585 	 * Insert upimutex at front of list of upimutexes owned by t. This
586 	 * would match typical LIFO order in which nested locks are acquired
587 	 * and released.
588 	 */
589 	upimutex->upi_nextowned = t->t_upimutex;
590 	t->t_upimutex = upimutex;
591 	t->t_nupinest++;
592 	ASSERT(t->t_nupinest > 0);
593 	return (t->t_nupinest);
594 }
595 
596 /*
597  * Delete upimutex from list of upimutexes owned by curthread.
598  */
599 static void
600 upi_mylist_del(struct upimutex *upimutex)
601 {
602 	kthread_t *t = curthread;
603 	struct upimutex **prev;
604 
605 	/*
606 	 * Since the order in which nested locks are acquired and released,
607 	 * is typically LIFO, and typical nesting levels are not too deep, the
608 	 * following should not be expensive in the general case.
609 	 */
610 	prev = &t->t_upimutex;
611 	while (*prev != upimutex) {
612 		prev = &(*prev)->upi_nextowned;
613 	}
614 	*prev = upimutex->upi_nextowned;
615 	upimutex->upi_nextowned = NULL;
616 	ASSERT(t->t_nupinest > 0);
617 	t->t_nupinest--;
618 }
619 
620 /*
621  * Returns true if upimutex is owned. Should be called only when upim points
622  * to kmem which cannot disappear from underneath.
623  */
624 static int
625 upi_owned(upimutex_t *upim)
626 {
627 	return (upim->upi_owner == curthread);
628 }
629 
630 /*
631  * Returns pointer to kernel object (upimutex_t *) if lp is owned.
632  */
633 static struct upimutex *
634 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
635 {
636 	lwpchan_t lwpchan;
637 	upib_t *upibp;
638 	struct upimutex *upimutex;
639 
640 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
641 	    &lwpchan, LWPCHAN_MPPOOL))
642 		return (NULL);
643 
644 	upibp = &UPI_CHAIN(lwpchan);
645 	mutex_enter(&upibp->upib_lock);
646 	upimutex = upi_get(upibp, &lwpchan);
647 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
648 		mutex_exit(&upibp->upib_lock);
649 		return (NULL);
650 	}
651 	mutex_exit(&upibp->upib_lock);
652 	return (upimutex);
653 }
654 
655 /*
656  * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
657  * no lock hand-off occurrs.
658  */
659 static void
660 upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
661 {
662 	turnstile_t *ts;
663 	upib_t *upibp;
664 	kthread_t *newowner;
665 
666 	upi_mylist_del(upimutex);
667 	upibp = upimutex->upi_upibp;
668 	mutex_enter(&upibp->upib_lock);
669 	if (upimutex->upi_waiter != 0) { /* if waiters */
670 		ts = turnstile_lookup(upimutex);
671 		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
672 			/* hand-off lock to highest prio waiter */
673 			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
674 			upimutex->upi_owner = newowner;
675 			if (ts->ts_waiters == 1)
676 				upimutex->upi_waiter = 0;
677 			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
678 			mutex_exit(&upibp->upib_lock);
679 			return;
680 		} else if (ts != NULL) {
681 			/* LOCK_NOTRECOVERABLE: wakeup all */
682 			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
683 		} else {
684 			/*
685 			 * Misleading w bit. Waiters might have been
686 			 * interrupted. No need to clear the w bit (upimutex
687 			 * will soon be freed). Re-calculate PI from existing
688 			 * waiters.
689 			 */
690 			turnstile_exit(upimutex);
691 			turnstile_pi_recalc();
692 		}
693 	}
694 	/*
695 	 * no waiters, or LOCK_NOTRECOVERABLE.
696 	 * remove from the bucket chain of upi mutexes.
697 	 * de-allocate kernel memory (upimutex).
698 	 */
699 	upi_chain_del(upimutex->upi_upibp, upimutex);
700 	mutex_exit(&upibp->upib_lock);
701 	kmem_free(upimutex, sizeof (upimutex_t));
702 }
703 
704 static int
705 lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
706 {
707 	label_t ljb;
708 	int error = 0;
709 	lwpchan_t lwpchan;
710 	uint16_t flag;
711 	upib_t *upibp;
712 	volatile struct upimutex *upimutex = NULL;
713 	turnstile_t *ts;
714 	uint32_t nupinest;
715 	volatile int upilocked = 0;
716 
717 	if (on_fault(&ljb)) {
718 		if (upilocked)
719 			upimutex_unlock((upimutex_t *)upimutex, 0);
720 		error = EFAULT;
721 		goto out;
722 	}
723 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
724 	    &lwpchan, LWPCHAN_MPPOOL)) {
725 		error = EFAULT;
726 		goto out;
727 	}
728 	upibp = &UPI_CHAIN(lwpchan);
729 retry:
730 	mutex_enter(&upibp->upib_lock);
731 	upimutex = upi_get(upibp, &lwpchan);
732 	if (upimutex == NULL)  {
733 		/* lock available since lwpchan has no upimutex */
734 		upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
735 		upi_chain_add(upibp, (upimutex_t *)upimutex);
736 		upimutex->upi_owner = curthread; /* grab lock */
737 		upimutex->upi_upibp = upibp;
738 		upimutex->upi_vaddr = lp;
739 		upimutex->upi_lwpchan = lwpchan;
740 		mutex_exit(&upibp->upib_lock);
741 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
742 		upilocked = 1;
743 		fuword16_noerr(&lp->mutex_flag, &flag);
744 		if (nupinest > maxnestupimx &&
745 		    secpolicy_resource(CRED()) != 0) {
746 			upimutex_unlock((upimutex_t *)upimutex, flag);
747 			error = ENOMEM;
748 			goto out;
749 		}
750 		if (flag & LOCK_NOTRECOVERABLE) {
751 			/*
752 			 * Since the setting of LOCK_NOTRECOVERABLE
753 			 * was done under the high-level upi mutex,
754 			 * in lwp_upimutex_unlock(), this flag needs to
755 			 * be checked while holding the upi mutex.
756 			 * If set, this thread should return without
757 			 * the lock held, and with the right error code.
758 			 */
759 			upimutex_unlock((upimutex_t *)upimutex, flag);
760 			upilocked = 0;
761 			error = ENOTRECOVERABLE;
762 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
763 			if (flag & LOCK_OWNERDEAD)
764 				error = EOWNERDEAD;
765 			else if (type & USYNC_PROCESS_ROBUST)
766 				error = ELOCKUNMAPPED;
767 			else
768 				error = EOWNERDEAD;
769 		}
770 		goto out;
771 	}
772 	/*
773 	 * If a upimutex object exists, it must have an owner.
774 	 * This is due to lock hand-off, and release of upimutex when no
775 	 * waiters are present at unlock time,
776 	 */
777 	ASSERT(upimutex->upi_owner != NULL);
778 	if (upimutex->upi_owner == curthread) {
779 		/*
780 		 * The user wrapper can check if the mutex type is
781 		 * ERRORCHECK: if not, it should stall at user-level.
782 		 * If so, it should return the error code.
783 		 */
784 		mutex_exit(&upibp->upib_lock);
785 		error = EDEADLK;
786 		goto out;
787 	}
788 	if (try == UPIMUTEX_TRY) {
789 		mutex_exit(&upibp->upib_lock);
790 		error = EBUSY;
791 		goto out;
792 	}
793 	/*
794 	 * Block for the lock.
795 	 */
796 	if ((error = lwptp->lwpt_time_error) != 0) {
797 		/*
798 		 * The SUSV3 Posix spec is very clear that we
799 		 * should get no error from validating the
800 		 * timer until we would actually sleep.
801 		 */
802 		mutex_exit(&upibp->upib_lock);
803 		goto out;
804 	}
805 	if (lwptp->lwpt_tsp != NULL) {
806 		/*
807 		 * Unlike the protocol for other lwp timedwait operations,
808 		 * we must drop t_delay_lock before going to sleep in
809 		 * turnstile_block() for a upi mutex.
810 		 * See the comments below and in turnstile.c
811 		 */
812 		mutex_enter(&curthread->t_delay_lock);
813 		(void) lwp_timer_enqueue(lwptp);
814 		mutex_exit(&curthread->t_delay_lock);
815 	}
816 	/*
817 	 * Now, set the waiter bit and block for the lock in turnstile_block().
818 	 * No need to preserve the previous wbit since a lock try is not
819 	 * attempted after setting the wait bit. Wait bit is set under
820 	 * the upib_lock, which is not released until the turnstile lock
821 	 * is acquired. Say, the upimutex is L:
822 	 *
823 	 * 1. upib_lock is held so the waiter does not have to retry L after
824 	 *    setting the wait bit: since the owner has to grab the upib_lock
825 	 *    to unlock L, it will certainly see the wait bit set.
826 	 * 2. upib_lock is not released until the turnstile lock is acquired.
827 	 *    This is the key to preventing a missed wake-up. Otherwise, the
828 	 *    owner could acquire the upib_lock, and the tc_lock, to call
829 	 *    turnstile_wakeup(). All this, before the waiter gets tc_lock
830 	 *    to sleep in turnstile_block(). turnstile_wakeup() will then not
831 	 *    find this waiter, resulting in the missed wakeup.
832 	 * 3. The upib_lock, being a kernel mutex, cannot be released while
833 	 *    holding the tc_lock (since mutex_exit() could need to acquire
834 	 *    the same tc_lock)...and so is held when calling turnstile_block().
835 	 *    The address of upib_lock is passed to turnstile_block() which
836 	 *    releases it after releasing all turnstile locks, and before going
837 	 *    to sleep in swtch().
838 	 * 4. The waiter value cannot be a count of waiters, because a waiter
839 	 *    can be interrupted. The interrupt occurs under the tc_lock, at
840 	 *    which point, the upib_lock cannot be locked, to decrement waiter
841 	 *    count. So, just treat the waiter state as a bit, not a count.
842 	 */
843 	ts = turnstile_lookup((upimutex_t *)upimutex);
844 	upimutex->upi_waiter = 1;
845 	error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
846 	    &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
847 	/*
848 	 * Hand-off implies that we wakeup holding the lock, except when:
849 	 *	- deadlock is detected
850 	 *	- lock is not recoverable
851 	 *	- we got an interrupt or timeout
852 	 * If we wake up due to an interrupt or timeout, we may
853 	 * or may not be holding the lock due to mutex hand-off.
854 	 * Use lwp_upimutex_owned() to check if we do hold the lock.
855 	 */
856 	if (error != 0) {
857 		if ((error == EINTR || error == ETIME) &&
858 		    (upimutex = lwp_upimutex_owned(lp, type))) {
859 			/*
860 			 * Unlock and return - the re-startable syscall will
861 			 * try the lock again if we got EINTR.
862 			 */
863 			(void) upi_mylist_add((upimutex_t *)upimutex);
864 			upimutex_unlock((upimutex_t *)upimutex, 0);
865 		}
866 		/*
867 		 * The only other possible error is EDEADLK.  If so, upimutex
868 		 * is valid, since its owner is deadlocked with curthread.
869 		 */
870 		ASSERT(error == EINTR || error == ETIME ||
871 		    (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
872 		ASSERT(!lwp_upimutex_owned(lp, type));
873 		goto out;
874 	}
875 	if (lwp_upimutex_owned(lp, type)) {
876 		ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
877 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
878 		upilocked = 1;
879 	}
880 	/*
881 	 * Now, need to read the user-level lp->mutex_flag to do the following:
882 	 *
883 	 * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED
884 	 *   should be returned.
885 	 * - if lock isn't held, check if ENOTRECOVERABLE should
886 	 *   be returned.
887 	 *
888 	 * Now, either lp->mutex_flag is readable or it's not. If not
889 	 * readable, the on_fault path will cause a return with EFAULT
890 	 * as it should.  If it is readable, the state of the flag
891 	 * encodes the robustness state of the lock:
892 	 *
893 	 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD
894 	 * or LOCK_UNMAPPED setting will influence the return code
895 	 * appropriately.  If the upimutex is not locked here, this
896 	 * could be due to a spurious wake-up or a NOTRECOVERABLE
897 	 * event.  The flag's setting can be used to distinguish
898 	 * between these two events.
899 	 */
900 	fuword16_noerr(&lp->mutex_flag, &flag);
901 	if (upilocked) {
902 		/*
903 		 * If the thread wakes up from turnstile_block with the lock
904 		 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
905 		 * since it would not have been handed-off the lock.
906 		 * So, no need to check for this case.
907 		 */
908 		if (nupinest > maxnestupimx &&
909 		    secpolicy_resource(CRED()) != 0) {
910 			upimutex_unlock((upimutex_t *)upimutex, flag);
911 			upilocked = 0;
912 			error = ENOMEM;
913 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
914 			if (flag & LOCK_OWNERDEAD)
915 				error = EOWNERDEAD;
916 			else if (type & USYNC_PROCESS_ROBUST)
917 				error = ELOCKUNMAPPED;
918 			else
919 				error = EOWNERDEAD;
920 		}
921 	} else {
922 		/*
923 		 * Wake-up without the upimutex held. Either this is a
924 		 * spurious wake-up (due to signals, forkall(), whatever), or
925 		 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
926 		 * of the mutex flag can be used to distinguish between the
927 		 * two events.
928 		 */
929 		if (flag & LOCK_NOTRECOVERABLE) {
930 			error = ENOTRECOVERABLE;
931 		} else {
932 			/*
933 			 * Here, the flag could be set to LOCK_OWNERDEAD or
934 			 * not. In both cases, this is a spurious wakeup,
935 			 * since the upi lock is not held, but the thread
936 			 * has returned from turnstile_block().
937 			 *
938 			 * The user flag could be LOCK_OWNERDEAD if, at the
939 			 * same time as curthread having been woken up
940 			 * spuriously, the owner (say Tdead) has died, marked
941 			 * the mutex flag accordingly, and handed off the lock
942 			 * to some other waiter (say Tnew). curthread just
943 			 * happened to read the flag while Tnew has yet to deal
944 			 * with the owner-dead event.
945 			 *
946 			 * In this event, curthread should retry the lock.
947 			 * If Tnew is able to cleanup the lock, curthread
			 * will eventually get the lock with a zero error code.
			 * If Tnew is unable to cleanup, its eventual call to
950 			 * unlock the lock will result in the mutex flag being
951 			 * set to LOCK_NOTRECOVERABLE, and the wake-up of
952 			 * all waiters, including curthread, which will then
953 			 * eventually return ENOTRECOVERABLE due to the above
954 			 * check.
955 			 *
956 			 * Of course, if the user-flag is not set with
957 			 * LOCK_OWNERDEAD, retrying is the thing to do, since
958 			 * this is definitely a spurious wakeup.
959 			 */
960 			goto retry;
961 		}
962 	}
963 
964 out:
965 	no_fault();
966 	return (error);
967 }
968 
969 
970 static int
971 lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
972 {
973 	label_t ljb;
974 	int error = 0;
975 	lwpchan_t lwpchan;
976 	uint16_t flag;
977 	upib_t *upibp;
978 	volatile struct upimutex *upimutex = NULL;
979 	volatile int upilocked = 0;
980 
981 	if (on_fault(&ljb)) {
982 		if (upilocked)
983 			upimutex_unlock((upimutex_t *)upimutex, 0);
984 		error = EFAULT;
985 		goto out;
986 	}
987 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
988 	    &lwpchan, LWPCHAN_MPPOOL)) {
989 		error = EFAULT;
990 		goto out;
991 	}
992 	upibp = &UPI_CHAIN(lwpchan);
993 	mutex_enter(&upibp->upib_lock);
994 	upimutex = upi_get(upibp, &lwpchan);
995 	/*
996 	 * If the lock is not held, or the owner is not curthread, return
997 	 * error. The user-level wrapper can return this error or stall,
998 	 * depending on whether mutex is of ERRORCHECK type or not.
999 	 */
1000 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
1001 		mutex_exit(&upibp->upib_lock);
1002 		error = EPERM;
1003 		goto out;
1004 	}
1005 	mutex_exit(&upibp->upib_lock); /* release for user memory access */
1006 	upilocked = 1;
1007 	fuword16_noerr(&lp->mutex_flag, &flag);
1008 	if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1009 		/*
1010 		 * transition mutex to the LOCK_NOTRECOVERABLE state.
1011 		 */
1012 		flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
1013 		flag |= LOCK_NOTRECOVERABLE;
1014 		suword16_noerr(&lp->mutex_flag, flag);
1015 	}
1016 	set_owner_pid(lp, 0, 0);
1017 	upimutex_unlock((upimutex_t *)upimutex, flag);
1018 	upilocked = 0;
1019 out:
1020 	no_fault();
1021 	return (error);
1022 }
1023 
1024 /*
1025  * Set the owner and ownerpid fields of a user-level mutex.
1026  */
1027 static void
1028 set_owner_pid(lwp_mutex_t *lp, uintptr_t owner, pid_t pid)
1029 {
1030 	union {
1031 		uint64_t word64;
1032 		uint32_t word32[2];
1033 	} un;
1034 
1035 	un.word64 = (uint64_t)owner;
1036 
1037 	suword32_noerr(&lp->mutex_ownerpid, pid);
1038 #if defined(_LP64)
1039 	if (((uintptr_t)lp & (_LONG_LONG_ALIGNMENT - 1)) == 0) { /* aligned */
1040 		suword64_noerr(&lp->mutex_owner, un.word64);
1041 		return;
1042 	}
1043 #endif
1044 	/* mutex is unaligned or we are running on a 32-bit kernel */
1045 	suword32_noerr((uint32_t *)&lp->mutex_owner, un.word32[0]);
1046 	suword32_noerr((uint32_t *)&lp->mutex_owner + 1, un.word32[1]);
1047 }
1048 
1049 /*
1050  * Clear the contents of a user-level mutex; return the flags.
1051  * Used only by upi_dead() and lwp_mutex_cleanup(), below.
1052  */
1053 static uint16_t
1054 lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg)
1055 {
1056 	uint16_t flag;
1057 
1058 	fuword16_noerr(&lp->mutex_flag, &flag);
1059 	if ((flag &
1060 	    (LOCK_OWNERDEAD | LOCK_UNMAPPED | LOCK_NOTRECOVERABLE)) == 0) {
1061 		flag |= lockflg;
1062 		suword16_noerr(&lp->mutex_flag, flag);
1063 	}
1064 	set_owner_pid(lp, 0, 0);
1065 	suword8_noerr(&lp->mutex_rcount, 0);
1066 
1067 	return (flag);
1068 }
1069 
1070 /*
1071  * Mark user mutex state, corresponding to kernel upimutex,
1072  * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate
1073  */
1074 static int
1075 upi_dead(upimutex_t *upip, uint16_t lockflg)
1076 {
1077 	label_t ljb;
1078 	int error = 0;
1079 	lwp_mutex_t *lp;
1080 
1081 	if (on_fault(&ljb)) {
1082 		error = EFAULT;
1083 		goto out;
1084 	}
1085 
1086 	lp = upip->upi_vaddr;
1087 	(void) lwp_clear_mutex(lp, lockflg);
1088 	suword8_noerr(&lp->mutex_lockw, 0);
1089 out:
1090 	no_fault();
1091 	return (error);
1092 }
1093 
1094 /*
1095  * Unlock all upimutexes held by curthread, since curthread is dying.
1096  * For each upimutex, attempt to mark its corresponding user mutex object as
1097  * dead.
1098  */
1099 void
1100 upimutex_cleanup()
1101 {
1102 	kthread_t *t = curthread;
1103 	uint16_t lockflg = (ttoproc(t)->p_proc_flag & P_PR_EXEC)?
1104 	    LOCK_UNMAPPED : LOCK_OWNERDEAD;
1105 	struct upimutex *upip;
1106 
1107 	while ((upip = t->t_upimutex) != NULL) {
1108 		if (upi_dead(upip, lockflg) != 0) {
1109 			/*
1110 			 * If the user object associated with this upimutex is
1111 			 * unmapped, unlock upimutex with the
1112 			 * LOCK_NOTRECOVERABLE flag, so that all waiters are
1113 			 * woken up. Since user object is unmapped, it could
1114 			 * not be marked as dead or notrecoverable.
1115 			 * The waiters will now all wake up and return
1116 			 * ENOTRECOVERABLE, since they would find that the lock
1117 			 * has not been handed-off to them.
1118 			 * See lwp_upimutex_lock().
1119 			 */
1120 			upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
1121 		} else {
1122 			/*
1123 			 * The user object has been updated as dead.
1124 			 * Unlock the upimutex: if no waiters, upip kmem will
1125 			 * be freed. If there is a waiter, the lock will be
1126 			 * handed off. If exit() is in progress, each existing
1127 			 * waiter will successively get the lock, as owners
1128 			 * die, and each new owner will call this routine as
1129 			 * it dies. The last owner will free kmem, since
1130 			 * it will find the upimutex has no waiters. So,
1131 			 * eventually, the kmem is guaranteed to be freed.
1132 			 */
1133 			upimutex_unlock(upip, 0);
1134 		}
1135 		/*
1136 		 * Note that the call to upimutex_unlock() above will delete
1137 		 * upimutex from the t_upimutexes chain. And so the
1138 		 * while loop will eventually terminate.
1139 		 */
1140 	}
1141 }
1142 
1143 int
1144 lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp, uintptr_t owner)
1145 {
1146 	kthread_t *t = curthread;
1147 	klwp_t *lwp = ttolwp(t);
1148 	proc_t *p = ttoproc(t);
1149 	lwp_timer_t lwpt;
1150 	caddr_t timedwait;
1151 	int error = 0;
1152 	int time_error;
1153 	clock_t tim = -1;
1154 	uchar_t waiters;
1155 	volatile int locked = 0;
1156 	volatile int watched = 0;
1157 	label_t ljb;
1158 	volatile uint8_t type = 0;
1159 	lwpchan_t lwpchan;
1160 	sleepq_head_t *sqh;
1161 	uint16_t flag;
1162 	int imm_timeout = 0;
1163 
1164 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1165 		return (set_errno(EFAULT));
1166 
1167 	/*
1168 	 * Put the lwp in an orderly state for debugging,
1169 	 * in case we are stopped while sleeping, below.
1170 	 */
1171 	prstop(PR_REQUESTED, 0);
1172 
1173 	timedwait = (caddr_t)tsp;
1174 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
1175 	    lwpt.lwpt_imm_timeout) {
1176 		imm_timeout = 1;
1177 		timedwait = NULL;
1178 	}
1179 
1180 	/*
1181 	 * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
1182 	 * this micro state is really a run state. If the thread indeed blocks,
1183 	 * this state becomes valid. If not, the state is converted back to
1184 	 * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
1185 	 * when blocking.
1186 	 */
1187 	(void) new_mstate(t, LMS_USER_LOCK);
1188 	if (on_fault(&ljb)) {
1189 		if (locked)
1190 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1191 		error = EFAULT;
1192 		goto out;
1193 	}
1194 	/*
1195 	 * Force Copy-on-write if necessary and ensure that the
1196 	 * synchronization object resides in read/write memory.
1197 	 * Cause an EFAULT return now if this is not so.
1198 	 */
1199 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1200 	suword8_noerr(&lp->mutex_type, type);
1201 	if (UPIMUTEX(type)) {
1202 		no_fault();
1203 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
1204 		if (error == 0 || error == EOWNERDEAD || error == ELOCKUNMAPPED)
1205 			set_owner_pid(lp, owner,
1206 			    (type & USYNC_PROCESS)? p->p_pid : 0);
1207 		if (tsp && !time_error)	/* copyout the residual time left */
1208 			error = lwp_timer_copyout(&lwpt, error);
1209 		if (error)
1210 			return (set_errno(error));
1211 		return (0);
1212 	}
1213 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1214 	    &lwpchan, LWPCHAN_MPPOOL)) {
1215 		error = EFAULT;
1216 		goto out;
1217 	}
1218 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1219 	locked = 1;
1220 	if (type & LOCK_ROBUST) {
1221 		fuword16_noerr(&lp->mutex_flag, &flag);
1222 		if (flag & LOCK_NOTRECOVERABLE) {
1223 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1224 			error = ENOTRECOVERABLE;
1225 			goto out;
1226 		}
1227 	}
1228 	fuword8_noerr(&lp->mutex_waiters, &waiters);
1229 	suword8_noerr(&lp->mutex_waiters, 1);
1230 
1231 	/*
1232 	 * If watchpoints are set, they need to be restored, since
1233 	 * atomic accesses of memory such as the call to ulock_try()
1234 	 * below cannot be watched.
1235 	 */
1236 
1237 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1238 
1239 	while (!ulock_try(&lp->mutex_lockw)) {
1240 		if (time_error) {
1241 			/*
1242 			 * The SUSV3 Posix spec is very clear that we
1243 			 * should get no error from validating the
1244 			 * timer until we would actually sleep.
1245 			 */
1246 			error = time_error;
1247 			break;
1248 		}
1249 
1250 		if (watched) {
1251 			watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1252 			watched = 0;
1253 		}
1254 
1255 		if (timedwait) {
1256 			/*
1257 			 * If we successfully queue the timeout,
1258 			 * then don't drop t_delay_lock until
1259 			 * we are on the sleep queue (below).
1260 			 */
1261 			mutex_enter(&t->t_delay_lock);
1262 			if (lwp_timer_enqueue(&lwpt) != 0) {
1263 				mutex_exit(&t->t_delay_lock);
1264 				imm_timeout = 1;
1265 				timedwait = NULL;
1266 			}
1267 		}
1268 		lwp_block(&lwpchan);
1269 		/*
1270 		 * Nothing should happen to cause the lwp to go to
1271 		 * sleep again until after it returns from swtch().
1272 		 */
1273 		if (timedwait)
1274 			mutex_exit(&t->t_delay_lock);
1275 		locked = 0;
1276 		lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1277 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
1278 			setrun(t);
1279 		swtch();
1280 		t->t_flag &= ~T_WAKEABLE;
1281 		if (timedwait)
1282 			tim = lwp_timer_dequeue(&lwpt);
1283 		setallwatch();
1284 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
1285 			error = EINTR;
1286 		else if (imm_timeout || (timedwait && tim == -1))
1287 			error = ETIME;
1288 		if (error) {
1289 			lwp->lwp_asleep = 0;
1290 			lwp->lwp_sysabort = 0;
1291 			watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1292 			    S_WRITE);
1293 
1294 			/*
1295 			 * Need to re-compute waiters bit. The waiters field in
1296 			 * the lock is not reliable. Either of two things could
1297 			 * have occurred: no lwp may have called lwp_release()
1298 			 * for me but I have woken up due to a signal or
1299 			 * timeout.  In this case, the waiter bit is incorrect
1300 			 * since it is still set to 1, set above.
1301 			 * OR an lwp_release() did occur for some other lwp on
1302 			 * the same lwpchan. In this case, the waiter bit is
1303 			 * correct.  But which event occurred, one can't tell.
1304 			 * So, recompute.
1305 			 */
1306 			lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1307 			locked = 1;
1308 			sqh = lwpsqhash(&lwpchan);
1309 			disp_lock_enter(&sqh->sq_lock);
1310 			waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
1311 			disp_lock_exit(&sqh->sq_lock);
1312 			break;
1313 		}
1314 		lwp->lwp_asleep = 0;
1315 		watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1316 		    S_WRITE);
1317 		lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1318 		locked = 1;
1319 		fuword8_noerr(&lp->mutex_waiters, &waiters);
1320 		suword8_noerr(&lp->mutex_waiters, 1);
1321 		if (type & LOCK_ROBUST) {
1322 			fuword16_noerr(&lp->mutex_flag, &flag);
1323 			if (flag & LOCK_NOTRECOVERABLE) {
1324 				error = ENOTRECOVERABLE;
1325 				break;
1326 			}
1327 		}
1328 	}
1329 
1330 	if (t->t_mstate == LMS_USER_LOCK)
1331 		(void) new_mstate(t, LMS_SYSTEM);
1332 
1333 	if (error == 0) {
1334 		set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0);
1335 		if (type & LOCK_ROBUST) {
1336 			fuword16_noerr(&lp->mutex_flag, &flag);
1337 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1338 				if (flag & LOCK_OWNERDEAD)
1339 					error = EOWNERDEAD;
1340 				else if (type & USYNC_PROCESS_ROBUST)
1341 					error = ELOCKUNMAPPED;
1342 				else
1343 					error = EOWNERDEAD;
1344 			}
1345 		}
1346 	}
1347 	suword8_noerr(&lp->mutex_waiters, waiters);
1348 	locked = 0;
1349 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1350 out:
1351 	no_fault();
1352 	if (watched)
1353 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1354 	if (tsp && !time_error)		/* copyout the residual time left */
1355 		error = lwp_timer_copyout(&lwpt, error);
1356 	if (error)
1357 		return (set_errno(error));
1358 	return (0);
1359 }
1360 
1361 /*
1362  * Obsolete lwp_mutex_lock() interface, no longer called from libc.
1363  * libc now calls lwp_mutex_timedlock(lp, NULL, NULL).
1364  * This system call trap continues to exist solely for the benefit
1365  * of old statically-linked binaries from Solaris 9 and before.
1366  * It should be removed from the system when we no longer care
1367  * about such applications.
1368  */
1369 int
1370 lwp_mutex_lock(lwp_mutex_t *lp)
1371 {
1372 	return (lwp_mutex_timedlock(lp, NULL, NULL));
1373 }
1374 
1375 static int
1376 iswanted(kthread_t *t, lwpchan_t *lwpchan)
1377 {
1378 	/*
1379 	 * The caller holds the dispatcher lock on the sleep queue.
1380 	 */
1381 	while (t != NULL) {
1382 		if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1383 		    t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1384 			return (1);
1385 		t = t->t_link;
1386 	}
1387 	return (0);
1388 }
1389 
1390 /*
1391  * Return the highest priority thread sleeping on this lwpchan.
1392  */
1393 static kthread_t *
1394 lwp_queue_waiter(lwpchan_t *lwpchan)
1395 {
1396 	sleepq_head_t *sqh;
1397 	kthread_t *tp;
1398 
1399 	sqh = lwpsqhash(lwpchan);
1400 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1401 	for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
1402 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1403 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1404 			break;
1405 	}
1406 	disp_lock_exit(&sqh->sq_lock);
1407 	return (tp);
1408 }
1409 
1410 static int
1411 lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
1412 {
1413 	sleepq_head_t *sqh;
1414 	kthread_t *tp;
1415 	kthread_t **tpp;
1416 
1417 	sqh = lwpsqhash(lwpchan);
1418 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1419 	tpp = &sqh->sq_queue.sq_first;
1420 	while ((tp = *tpp) != NULL) {
1421 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1422 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1423 			/*
1424 			 * The following is typically false. It could be true
1425 			 * only if lwp_release() is called from
1426 			 * lwp_mutex_wakeup() after reading the waiters field
1427 			 * from memory in which the lwp lock used to be, but has
1428 			 * since been re-used to hold a lwp cv or lwp semaphore.
1429 			 * The thread "tp" found to match the lwp lock's wchan
1430 			 * is actually sleeping for the cv or semaphore which
1431 			 * now has the same wchan. In this case, lwp_release()
1432 			 * should return failure.
1433 			 */
1434 			if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
1435 				ASSERT(sync_type == 0);
1436 				/*
1437 				 * assert that this can happen only for mutexes
1438 				 * i.e. sync_type == 0, for correctly written
1439 				 * user programs.
1440 				 */
1441 				disp_lock_exit(&sqh->sq_lock);
1442 				return (0);
1443 			}
1444 			*waiters = iswanted(tp->t_link, lwpchan);
1445 			sleepq_unlink(tpp, tp);
1446 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1447 			tp->t_wchan0 = NULL;
1448 			tp->t_wchan = NULL;
1449 			tp->t_sobj_ops = NULL;
1450 			tp->t_release = 1;
1451 			THREAD_TRANSITION(tp);	/* drops sleepq lock */
1452 			CL_WAKEUP(tp);
1453 			thread_unlock(tp);	/* drop run queue lock */
1454 			return (1);
1455 		}
1456 		tpp = &tp->t_link;
1457 	}
1458 	*waiters = 0;
1459 	disp_lock_exit(&sqh->sq_lock);
1460 	return (0);
1461 }
1462 
1463 static void
1464 lwp_release_all(lwpchan_t *lwpchan)
1465 {
1466 	sleepq_head_t	*sqh;
1467 	kthread_t *tp;
1468 	kthread_t **tpp;
1469 
1470 	sqh = lwpsqhash(lwpchan);
1471 	disp_lock_enter(&sqh->sq_lock);		/* lock sleep q queue */
1472 	tpp = &sqh->sq_queue.sq_first;
1473 	while ((tp = *tpp) != NULL) {
1474 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1475 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1476 			sleepq_unlink(tpp, tp);
1477 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1478 			tp->t_wchan0 = NULL;
1479 			tp->t_wchan = NULL;
1480 			tp->t_sobj_ops = NULL;
1481 			CL_WAKEUP(tp);
1482 			thread_unlock_high(tp);	/* release run queue lock */
1483 		} else {
1484 			tpp = &tp->t_link;
1485 		}
1486 	}
1487 	disp_lock_exit(&sqh->sq_lock);		/* drop sleep q lock */
1488 }
1489 
/*
 * unblock a lwp that is trying to acquire this mutex. the blocked
 * lwp resumes and retries to acquire the lock.
 *
 * lp		user address of the lwp_mutex_t
 * release_all	if non-zero, wake every lwp sleeping on the mutex's
 *		lwpchan; otherwise wake at most one
 *
 * Returns 0 on success, or set_errno(EFAULT) if "lp" is not below the
 * address space's user limit, get_lwpchan() fails, or a fault is taken
 * while accessing the user-level mutex.
 */
int
lwp_mutex_wakeup(lwp_mutex_t *lp, int release_all)
{
	proc_t *p = ttoproc(curthread);
	lwpchan_t lwpchan;
	uchar_t waiters;
	/* volatile: modified after on_fault() and read in the fault path */
	volatile int locked = 0;
	volatile int watched = 0;
	volatile uint8_t type = 0;
	label_t ljb;
	int error = 0;

	if ((caddr_t)lp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);

	if (on_fault(&ljb)) {
		/* fault on user memory: drop the lwpchan lock if we hold it */
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write if necessary and ensure that the
	 * synchronization object resides in read/write memory.
	 * Cause an EFAULT return now if this is not so.
	 */
	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
	suword8_noerr(&lp->mutex_type, type);
	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
	    &lwpchan, LWPCHAN_MPPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
	locked = 1;
	/*
	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
	 * may fail.  If it fails, do not write into the waiter bit.
	 * The call to lwp_release() might fail due to one of three reasons:
	 *
	 *	1. due to the thread which set the waiter bit not actually
	 *	   sleeping since it got the lock on the re-try. The waiter
	 *	   bit will then be correctly updated by that thread. This
	 *	   window may be closed by reading the wait bit again here
	 *	   and not calling lwp_release() at all if it is zero.
	 *	2. the thread which set the waiter bit and went to sleep
	 *	   was woken up by a signal. This time, the waiter recomputes
	 *	   the wait bit in the return with EINTR code.
	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
	 *	   memory that has been re-used after the lock was dropped.
	 *	   In this case, writing into the waiter bit would cause data
	 *	   corruption.
	 */
	if (release_all)
		lwp_release_all(&lwpchan);
	else if (lwp_release(&lwpchan, &waiters, 0))
		suword8_noerr(&lp->mutex_waiters, waiters);
	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
out:
	no_fault();
	/* re-enable the watchpoints disabled on entry, if any */
	if (watched)
		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}
1563 
/*
 * lwp_cond_wait() has four arguments, a pointer to a condition variable,
 * a pointer to a mutex, a pointer to a timespec for a timed wait and
 * a flag telling the kernel whether or not to honor the kernel/user
 * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
 * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
 * lwpchan, returned by get_lwpchan().  If the timespec pointer is non-NULL,
 * it is used as an in/out parameter.  On entry, it contains the relative
 * time until timeout.  On exit, we copyout the residual time left to it
 * (in this code, only when check_park is also set).
 *
 * The mutex is released here before sleeping; it is not re-acquired
 * by this routine (the caller does that on return to user level).
 *
 * Returns 0 if the wait completed without error; otherwise returns
 * set_errno() of the error: EFAULT for a bad address or a fault on the
 * user objects, EINTR, ETIME, or an error from lwp_timer_copyin(),
 * lwp_timer_copyout() or lwp_upimutex_unlock().
 */
int
lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	lwp_timer_t lwpt;
	lwpchan_t cv_lwpchan;
	lwpchan_t m_lwpchan;
	caddr_t timedwait;
	/*
	 * The volatile locals below are modified after on_fault() is armed
	 * and are read again in the fault-recovery paths.
	 */
	volatile uint16_t type = 0;
	volatile uint8_t mtype = 0;
	uchar_t waiters;
	volatile int error;
	clock_t tim = -1;
	volatile int locked = 0;	/* cv_lwpchan lock held */
	volatile int m_locked = 0;	/* m_lwpchan lock held */
	volatile int cvwatched = 0;
	volatile int mpwatched = 0;
	label_t ljb;
	volatile int no_lwpchan = 1;	/* cleared once both lwpchans exist */
	int imm_timeout = 0;
	int imm_unpark = 0;

	if ((caddr_t)cv >= p->p_as->a_userlimit ||
	    (caddr_t)mp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	/*
	 * Put the lwp in an orderly state for debugging,
	 * in case we are stopped while sleeping, below.
	 */
	prstop(PR_REQUESTED, 0);

	timedwait = (caddr_t)tsp;
	if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
		return (set_errno(error));
	if (lwpt.lwpt_imm_timeout) {
		imm_timeout = 1;
		timedwait = NULL;
	}

	(void) new_mstate(t, LMS_USER_LOCK);

	if (on_fault(&ljb)) {
		/*
		 * A fault was taken on user memory.  If the lwpchans were
		 * never set up there is nothing to undo.
		 */
		if (no_lwpchan) {
			error = EFAULT;
			goto out;
		}
		if (m_locked) {
			m_locked = 0;
			lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
		}
		if (locked) {
			locked = 0;
			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
		}
		/*
		 * set up another on_fault() for a possible fault
		 * on the user lock accessed at "efault"
		 */
		if (on_fault(&ljb)) {
			if (m_locked) {
				m_locked = 0;
				lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
			}
			goto out;
		}
		error = EFAULT;
		goto efault;
	}

	/*
	 * Force Copy-on-write if necessary and ensure that the
	 * synchronization object resides in read/write memory.
	 * Cause an EFAULT return now if this is not so.
	 */
	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
	suword8_noerr(&mp->mutex_type, mtype);
	if (UPIMUTEX(mtype) == 0) {
		/* convert user level mutex, "mp", to a unique lwpchan */
		/* check if mtype is ok to use below, instead of type from cv */
		if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
		    &m_lwpchan, LWPCHAN_MPPOOL)) {
			error = EFAULT;
			goto out;
		}
	}
	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
	suword16_noerr(&cv->cond_type, type);
	/* convert user level condition variable, "cv", to a unique lwpchan */
	if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
	    &cv_lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	no_lwpchan = 0;
	cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (UPIMUTEX(mtype) == 0)
		mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
		    S_WRITE);

	/*
	 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
	 * with respect to a possible wakeup which is a result of either
	 * an lwp_cond_signal() or an lwp_cond_broadcast().
	 *
	 * What's misleading, is that the lwp is put to sleep after the
	 * condition variable's mutex is released.  This is OK as long as
	 * the release operation is also done while holding lwpchan_lock.
	 * The lwp is then put to sleep when the possibility of pagefaulting
	 * or sleeping is completely eliminated.
	 */
	lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	if (UPIMUTEX(mtype) == 0) {
		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
		m_locked = 1;
		suword8_noerr(&cv->cond_waiters_kernel, 1);
		/*
		 * unlock the condition variable's mutex. (pagefaults are
		 * possible here.)
		 */
		set_owner_pid(mp, 0, 0);
		ulock_clear(&mp->mutex_lockw);
		fuword8_noerr(&mp->mutex_waiters, &waiters);
		if (waiters != 0) {
			/*
			 * Given the locking of lwpchan_lock around the release
			 * of the mutex and checking for waiters, the following
			 * call to lwp_release() can fail ONLY if the lock
			 * acquirer is interrupted after setting the waiter bit,
			 * calling lwp_block() and releasing lwpchan_lock.
			 * In this case, it could get pulled off the lwp sleep
			 * q (via setrun()) before the following call to
			 * lwp_release() occurs. In this case, the lock
			 * requestor will update the waiter bit correctly by
			 * re-evaluating it.
			 */
			if (lwp_release(&m_lwpchan, &waiters, 0))
				suword8_noerr(&mp->mutex_waiters, waiters);
		}
		m_locked = 0;
		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
	} else {
		/* priority-inheriting mutex: release it via the upimutex code */
		suword8_noerr(&cv->cond_waiters_kernel, 1);
		error = lwp_upimutex_unlock(mp, mtype);
		if (error) {	/* if the upimutex unlock failed */
			locked = 0;
			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
			goto out;
		}
	}
	/* no further user-memory accesses are made before sleeping */
	no_fault();

	if (mpwatched) {
		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
		mpwatched = 0;
	}
	if (cvwatched) {
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
		cvwatched = 0;
	}

	if (check_park && (!schedctl_is_park() || t->t_unpark)) {
		/*
		 * We received a signal at user-level before calling here
		 * or another thread wants us to return immediately
		 * with EINTR.  See lwp_unpark().
		 */
		imm_unpark = 1;
		t->t_unpark = 0;
		timedwait = NULL;
	} else if (timedwait) {
		/*
		 * If we successfully queue the timeout,
		 * then don't drop t_delay_lock until
		 * we are on the sleep queue (below).
		 */
		mutex_enter(&t->t_delay_lock);
		if (lwp_timer_enqueue(&lwpt) != 0) {
			mutex_exit(&t->t_delay_lock);
			imm_timeout = 1;
			timedwait = NULL;
		}
	}
	/* mark this sleep as a cv/semaphore wait; lwp_release() checks it */
	t->t_flag |= T_WAITCVSEM;
	lwp_block(&cv_lwpchan);
	/*
	 * Nothing should happen to cause the lwp to go to sleep
	 * until after it returns from swtch().
	 */
	if (timedwait)
		mutex_exit(&t->t_delay_lock);
	locked = 0;
	lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
	/* if we must not actually sleep, make ourselves runnable first */
	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
	    (imm_timeout | imm_unpark))
		setrun(t);
	swtch();
	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
	if (timedwait)
		tim = lwp_timer_dequeue(&lwpt);
	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
	    MUSTRETURN(p, t) || imm_unpark)
		error = EINTR;
	else if (imm_timeout || (timedwait && tim == -1))
		error = ETIME;
	lwp->lwp_asleep = 0;
	lwp->lwp_sysabort = 0;
	setallwatch();

	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);

	if (tsp && check_park)		/* copyout the residual time left */
		error = lwp_timer_copyout(&lwpt, error);

	/* the mutex is reacquired by the caller on return to user level */
	if (error) {
		/*
		 * If we were concurrently lwp_cond_signal()d and we
		 * received a UNIX signal or got a timeout, then perform
		 * another lwp_cond_signal() to avoid consuming the wakeup.
		 */
		if (t->t_release)
			(void) lwp_cond_signal(cv);
		return (set_errno(error));
	}
	return (0);

efault:
	/*
	 * make sure that the user level lock is dropped before
	 * returning to caller, since the caller always re-acquires it.
	 */
	if (UPIMUTEX(mtype) == 0) {
		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
		m_locked = 1;
		set_owner_pid(mp, 0, 0);
		ulock_clear(&mp->mutex_lockw);
		fuword8_noerr(&mp->mutex_waiters, &waiters);
		if (waiters != 0) {
			/*
			 * See comment above on lock clearing and lwp_release()
			 * success/failure.
			 */
			if (lwp_release(&m_lwpchan, &waiters, 0))
				suword8_noerr(&mp->mutex_waiters, waiters);
		}
		m_locked = 0;
		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
	} else {
		(void) lwp_upimutex_unlock(mp, mtype);
	}
out:
	/* common error exit: disarm fault handling, restore watchpoints */
	no_fault();
	if (mpwatched)
		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
	if (cvwatched)
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);
	return (set_errno(error));
}
1839 
/*
 * wakeup one lwp that's blocked on this condition variable.
 *
 * cv	user address of the lwp_cond_t
 *
 * Returns 0 on success, or set_errno(EFAULT) if "cv" is not below the
 * address space's user limit, get_lwpchan() fails, or a fault is taken
 * while accessing the user-level condition variable.
 */
int
lwp_cond_signal(lwp_cond_t *cv)
{
	proc_t *p = ttoproc(curthread);
	lwpchan_t lwpchan;
	uchar_t waiters;
	/* volatile: modified after on_fault() and read in the fault path */
	volatile uint16_t type = 0;
	volatile int locked = 0;
	volatile int watched = 0;
	label_t ljb;
	int error = 0;

	if ((caddr_t)cv >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);

	if (on_fault(&ljb)) {
		/* fault on user memory: drop the lwpchan lock if we hold it */
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write if necessary and ensure that the
	 * synchronization object resides in read/write memory.
	 * Cause an EFAULT return now if this is not so.
	 */
	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
	suword16_noerr(&cv->cond_type, type);
	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
	    &lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
	if (waiters != 0) {
		/*
		 * The following call to lwp_release() might fail but it is
		 * OK to write into the waiters bit below, since the memory
		 * could not have been re-used or unmapped (for correctly
		 * written user programs) as in the case of lwp_mutex_wakeup().
		 * For an incorrect program, we should not care about data
		 * corruption since this is just one instance of other places
		 * where corruption can occur for such a program. Of course
		 * if the memory is unmapped, normal fault recovery occurs.
		 */
		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
		suword8_noerr(&cv->cond_waiters_kernel, waiters);
	}
	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
out:
	no_fault();
	/* re-enable the watchpoints disabled on entry, if any */
	if (watched)
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}
1904 
/*
 * wakeup every lwp that's blocked on this condition variable.
 *
 * cv	user address of the lwp_cond_t
 *
 * Returns 0 on success, or set_errno(EFAULT) if "cv" is not below the
 * address space's user limit, get_lwpchan() fails, or a fault is taken
 * while accessing the user-level condition variable.
 */
int
lwp_cond_broadcast(lwp_cond_t *cv)
{
	proc_t *p = ttoproc(curthread);
	lwpchan_t lwpchan;
	/* volatile: modified after on_fault() and read in the fault path */
	volatile uint16_t type = 0;
	volatile int locked = 0;
	volatile int watched = 0;
	label_t ljb;
	uchar_t waiters;
	int error = 0;

	if ((caddr_t)cv >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);

	if (on_fault(&ljb)) {
		/* fault on user memory: drop the lwpchan lock if we hold it */
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write if necessary and ensure that the
	 * synchronization object resides in read/write memory.
	 * Cause an EFAULT return now if this is not so.
	 */
	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
	suword16_noerr(&cv->cond_type, type);
	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
	    &lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
	if (waiters != 0) {
		/* wake all sleepers, then record that none remain */
		lwp_release_all(&lwpchan);
		suword8_noerr(&cv->cond_waiters_kernel, 0);
	}
	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
out:
	no_fault();
	/* re-enable the watchpoints disabled on entry, if any */
	if (watched)
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}
1959 
1960 int
1961 lwp_sema_trywait(lwp_sema_t *sp)
1962 {
1963 	kthread_t *t = curthread;
1964 	proc_t *p = ttoproc(t);
1965 	label_t ljb;
1966 	volatile int locked = 0;
1967 	volatile int watched = 0;
1968 	volatile uint16_t type = 0;
1969 	int count;
1970 	lwpchan_t lwpchan;
1971 	uchar_t waiters;
1972 	int error = 0;
1973 
1974 	if ((caddr_t)sp >= p->p_as->a_userlimit)
1975 		return (set_errno(EFAULT));
1976 
1977 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1978 
1979 	if (on_fault(&ljb)) {
1980 		if (locked)
1981 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1982 		error = EFAULT;
1983 		goto out;
1984 	}
1985 	/*
1986 	 * Force Copy-on-write if necessary and ensure that the
1987 	 * synchronization object resides in read/write memory.
1988 	 * Cause an EFAULT return now if this is not so.
1989 	 */
1990 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
1991 	suword16_noerr((void *)&sp->sema_type, type);
1992 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
1993 	    &lwpchan, LWPCHAN_CVPOOL)) {
1994 		error = EFAULT;
1995 		goto out;
1996 	}
1997 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1998 	locked = 1;
1999 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2000 	if (count == 0)
2001 		error = EBUSY;
2002 	else
2003 		suword32_noerr((void *)&sp->sema_count, --count);
2004 	if (count != 0) {
2005 		fuword8_noerr(&sp->sema_waiters, &waiters);
2006 		if (waiters != 0) {
2007 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2008 			suword8_noerr(&sp->sema_waiters, waiters);
2009 		}
2010 	}
2011 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2012 out:
2013 	no_fault();
2014 	if (watched)
2015 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2016 	if (error)
2017 		return (set_errno(error));
2018 	return (0);
2019 }
2020 
/*
 * Wait on the user-level semaphore 'sp': sleep for as long as its count is
 * zero, then decrement the count provided no error has occurred.
 *
 * 'tsp', when non-NULL, is a user-supplied timeout that is read with
 * lwp_timer_copyin(); an error from that call is reported only if we
 * would otherwise have had to sleep.
 * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
 *
 * Returns 0 on success.  Otherwise returns set_errno() of EFAULT, EINTR,
 * ETIME, the error from lwp_timer_copyin(), or, when 'tsp' and
 * 'check_park' are both set and the timer was valid, whatever
 * lwp_timer_copyout() returns.
 */
int
lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	lwp_timer_t lwpt;
	caddr_t timedwait;
	clock_t tim = -1;
	label_t ljb;
	volatile int locked = 0;
	volatile int watched = 0;
	volatile uint16_t type = 0;
	int count;
	lwpchan_t lwpchan;
	uchar_t waiters;
	int error = 0;
	int time_error;
	int imm_timeout = 0;
	int imm_unpark = 0;

	if ((caddr_t)sp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	/*
	 * Put the lwp in an orderly state for debugging,
	 * in case we are stopped while sleeping, below.
	 */
	prstop(PR_REQUESTED, 0);

	/*
	 * 'timedwait' is non-NULL only while a timer still has to be queued.
	 * A valid timer that asks for an immediate timeout is never queued;
	 * imm_timeout makes the sleep below return at once with ETIME.
	 */
	timedwait = (caddr_t)tsp;
	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
	    lwpt.lwpt_imm_timeout) {
		imm_timeout = 1;
		timedwait = NULL;
	}

	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);

	/* Taken when one of the user accesses below faults. */
	if (on_fault(&ljb)) {
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write if necessary and ensure that the
	 * synchronization object resides in read/write memory.
	 * Cause an EFAULT return now if this is not so.
	 */
	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
	suword16_noerr((void *)&sp->sema_type, type);
	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
	    &lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
	/* Sleep until the count is non-zero or an error ends the wait. */
	while (error == 0 && count == 0) {
		if (time_error) {
			/*
			 * The SUSV3 Posix spec is very clear that we
			 * should get no error from validating the
			 * timer until we would actually sleep.
			 */
			error = time_error;
			break;
		}
		/* Record in user memory that there is now a waiter. */
		suword8_noerr(&sp->sema_waiters, 1);
		if (watched)
			watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
		if (check_park && (!schedctl_is_park() || t->t_unpark)) {
			/*
			 * We received a signal at user-level before calling
			 * here or another thread wants us to return
			 * immediately with EINTR.  See lwp_unpark().
			 */
			imm_unpark = 1;
			t->t_unpark = 0;
			timedwait = NULL;
		} else if (timedwait) {
			/*
			 * If we successfully queue the timeout,
			 * then don't drop t_delay_lock until
			 * we are on the sleep queue (below).
			 */
			mutex_enter(&t->t_delay_lock);
			if (lwp_timer_enqueue(&lwpt) != 0) {
				mutex_exit(&t->t_delay_lock);
				imm_timeout = 1;
				timedwait = NULL;
			}
		}
		t->t_flag |= T_WAITCVSEM;
		lwp_block(&lwpchan);
		/*
		 * Nothing should happen to cause the lwp to sleep
		 * again until after it returns from swtch().
		 */
		if (timedwait)
			mutex_exit(&t->t_delay_lock);
		locked = 0;
		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		/*
		 * Make ourselves runnable again straight away if there is
		 * already a reason not to sleep.
		 */
		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
		    (imm_timeout | imm_unpark))
			setrun(t);
		swtch();
		t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
		if (timedwait)
			tim = lwp_timer_dequeue(&lwpt);
		setallwatch();
		/* Work out why we are running again; EINTR beats ETIME. */
		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
		    MUSTRETURN(p, t) || imm_unpark)
			error = EINTR;
		else if (imm_timeout || (timedwait && tim == -1))
			error = ETIME;
		lwp->lwp_asleep = 0;
		lwp->lwp_sysabort = 0;
		watched = watch_disable_addr((caddr_t)sp,
		    sizeof (*sp), S_WRITE);
		lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
		locked = 1;
		/* Re-read the count; the loop condition decides what next. */
		fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
	}
	/* Take one unit of the semaphore unless the wait ended in error. */
	if (error == 0)
		suword32_noerr((void *)&sp->sema_count, --count);
	/*
	 * The count is still non-zero, so try to release another waiter and
	 * store back the waiters value that lwp_release() computed.
	 */
	if (count != 0) {
		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
		suword8_noerr(&sp->sema_waiters, waiters);
	}
	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
out:
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
	/* lwp_timer_copyout() is handed the error and supplies the result. */
	if (tsp && check_park && !time_error)
		error = lwp_timer_copyout(&lwpt, error);
	if (error)
		return (set_errno(error));
	return (0);
}
2167 
2168 /*
2169  * Obsolete lwp_sema_wait() interface, no longer called from libc.
2170  * libc now calls lwp_sema_timedwait().
2171  * This system call trap exists solely for the benefit of old
2172  * statically linked applications from Solaris 9 and before.
2173  * It should be removed when we no longer care about such applications.
2174  */
2175 int
2176 lwp_sema_wait(lwp_sema_t *sp)
2177 {
2178 	return (lwp_sema_timedwait(sp, NULL, 0));
2179 }
2180 
2181 int
2182 lwp_sema_post(lwp_sema_t *sp)
2183 {
2184 	proc_t *p = ttoproc(curthread);
2185 	label_t ljb;
2186 	volatile int locked = 0;
2187 	volatile int watched = 0;
2188 	volatile uint16_t type = 0;
2189 	int count;
2190 	lwpchan_t lwpchan;
2191 	uchar_t waiters;
2192 	int error = 0;
2193 
2194 	if ((caddr_t)sp >= p->p_as->a_userlimit)
2195 		return (set_errno(EFAULT));
2196 
2197 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2198 
2199 	if (on_fault(&ljb)) {
2200 		if (locked)
2201 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2202 		error = EFAULT;
2203 		goto out;
2204 	}
2205 	/*
2206 	 * Force Copy-on-write if necessary and ensure that the
2207 	 * synchronization object resides in read/write memory.
2208 	 * Cause an EFAULT return now if this is not so.
2209 	 */
2210 	fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
2211 	suword16_noerr(&sp->sema_type, type);
2212 	if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
2213 	    &lwpchan, LWPCHAN_CVPOOL)) {
2214 		error = EFAULT;
2215 		goto out;
2216 	}
2217 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2218 	locked = 1;
2219 	fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
2220 	if (count == _SEM_VALUE_MAX)
2221 		error = EOVERFLOW;
2222 	else
2223 		suword32_noerr(&sp->sema_count, ++count);
2224 	if (count == 1) {
2225 		fuword8_noerr(&sp->sema_waiters, &waiters);
2226 		if (waiters) {
2227 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2228 			suword8_noerr(&sp->sema_waiters, waiters);
2229 		}
2230 	}
2231 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2232 out:
2233 	no_fault();
2234 	if (watched)
2235 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2236 	if (error)
2237 		return (set_errno(error));
2238 	return (0);
2239 }
2240 
/*
 * Bits kept in a thread's t_writer field while it waits for an rwlock:
 * lwp_rwlock_lock() sets TRW_WANT_WRITE before a writer blocks, and
 * lwp_rwlock_release() sets TRW_LOCK_GRANTED on each thread it wakes.
 */
#define	TRW_WANT_WRITE		0x1
#define	TRW_LOCK_GRANTED	0x2

/*
 * Values of the 'rd_wr' argument of lwp_rwlock_lock().  TRY_FLAG is OR-ed
 * into a lock type to make the request a trylock, which gets EBUSY
 * instead of blocking.
 */
#define	READ_LOCK		0
#define	WRITE_LOCK		1
#define	TRY_FLAG		0x10
#define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
#define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)
2249 
/*
 * Release one writer or one or more readers. Compute the rwstate word to
 * reflect the new state of the queue. For a safe hand-off we copy the new
 * rwstate value back to userland before we wake any of the new lock holders.
 *
 * Note that sleepq_insert() implements a prioritized FIFO (with writers
 * being given precedence over readers of the same priority).
 *
 * If the first thread is a reader we scan the queue releasing all readers
 * until we hit a writer or the end of the queue. If the first thread is a
 * writer we still need to check for another writer.
 *
 * 'lwpchan' identifies the rwlock's sleep queue entries and 'rw' is the
 * user-level rwlock whose rwlock_readers word receives the new state.
 * The state is written with suword32_noerr(), so presumably the caller
 * must already have fault protection in place (lwp_rwlock_unlock() calls
 * this under on_fault()).
 */
void
lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
{
	sleepq_head_t *sqh;
	kthread_t *tp;
	kthread_t **tpp;
	kthread_t *tpnext;
	kthread_t *wakelist = NULL;	/* threads being granted the lock */
	uint32_t rwstate = 0;		/* new state, built from scratch */
	int wcount = 0;			/* matching writers seen so far */
	int rcount = 0;			/* matching readers seen so far */

	sqh = lwpsqhash(lwpchan);
	disp_lock_enter(&sqh->sq_lock);
	tpp = &sqh->sq_queue.sq_first;
	while ((tp = *tpp) != NULL) {
		/* Only threads sleeping on this very lwpchan are of interest. */
		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
			if (tp->t_writer & TRW_WANT_WRITE) {
				if ((wcount++ == 0) && (rcount == 0)) {
					rwstate |= URW_WRITE_LOCKED;

					/* Just one writer to wake. */
					sleepq_unlink(tpp, tp);
					wakelist = tp;

					/* tpp already set for next thread. */
					continue;
				} else {
					rwstate |= URW_HAS_WAITERS;
					/* We need look no further. */
					break;
				}
			} else {
				rcount++;
				if (wcount == 0) {
					/* One more reader holds the lock. */
					rwstate++;

					/* Add reader to wake list. */
					sleepq_unlink(tpp, tp);
					tp->t_link = wakelist;
					wakelist = tp;

					/* tpp already set for next thread. */
					continue;
				} else {
					rwstate |= URW_HAS_WAITERS;
					/* We need look no further. */
					break;
				}
			}
		}
		tpp = &tp->t_link;
	}

	/* Copy the new rwstate back to userland. */
	suword32_noerr(&rw->rwlock_readers, rwstate);

	/* Wake the new lock holder(s) up. */
	tp = wakelist;
	while (tp != NULL) {
		DTRACE_SCHED1(wakeup, kthread_t *, tp);
		tp->t_wchan0 = NULL;
		tp->t_wchan = NULL;
		tp->t_sobj_ops = NULL;
		/* lwp_rwlock_lock() tests this bit once the thread runs. */
		tp->t_writer |= TRW_LOCK_GRANTED;
		/* Fetch the successor before the link is cleared. */
		tpnext = tp->t_link;
		tp->t_link = NULL;
		CL_WAKEUP(tp);
		thread_unlock_high(tp);
		tp = tpnext;
	}

	disp_lock_exit(&sqh->sq_lock);
}
2337 
/*
 * We enter here holding the user-level mutex, which we must release before
 * returning or blocking. Based on lwp_cond_wait().
 *
 * 'rw' is the user-level rwlock (its embedded mutex is rw->mutex), 'tsp' an
 * optional user-supplied timeout, and 'rd_wr' is READ_LOCK or WRITE_LOCK,
 * optionally OR-ed with TRY_FLAG for a trylock.
 *
 * Returns 0 when the lock was acquired. Otherwise returns set_errno() of:
 *	EFAULT	'rw' is not a usable user address or an access faulted
 *	EINVAL	bad 'rd_wr', or the rwlock or its mutex is not USYNC_PROCESS
 *	EBUSY	a trylock could not be satisfied without blocking
 *	EINTR	the sleep was interrupted before the lock was granted
 *	ETIME	the timeout expired before the lock was granted
 *	EAGAIN	we were woken without having been granted the lock
 * or the error from lwp_timer_copyin() if we would have had to sleep.
 */
static int
lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
{
	lwp_mutex_t *mp = NULL;
	kthread_t *t = curthread;
	kthread_t *tp;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	lwp_timer_t lwpt;
	lwpchan_t lwpchan;
	lwpchan_t mlwpchan;
	caddr_t timedwait;
	volatile uint16_t type = 0;
	volatile uint8_t mtype = 0;
	uchar_t mwaiters;
	volatile int error = 0;
	int time_error;
	clock_t tim = -1;
	volatile int locked = 0;
	volatile int mlocked = 0;
	volatile int watched = 0;
	volatile int mwatched = 0;
	label_t ljb;
	volatile int no_lwpchan = 1;
	int imm_timeout = 0;
	int try_flag;
	uint32_t rwstate;
	int acquired = 0;

	/* We only check rw because the mutex is included in it. */
	if ((caddr_t)rw >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	/*
	 * Put the lwp in an orderly state for debugging,
	 * in case we are stopped while sleeping, below.
	 */
	prstop(PR_REQUESTED, 0);

	/* We must only report this error if we are about to sleep (later). */
	timedwait = (caddr_t)tsp;
	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
	    lwpt.lwpt_imm_timeout) {
		imm_timeout = 1;
		timedwait = NULL;
	}

	(void) new_mstate(t, LMS_USER_LOCK);

	/* Taken when one of the user accesses below faults. */
	if (on_fault(&ljb)) {
		if (no_lwpchan) {
			error = EFAULT;
			goto out_nodrop;
		}
		if (mlocked) {
			mlocked = 0;
			lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
		}
		if (locked) {
			locked = 0;
			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		}
		/*
		 * Set up another on_fault() for a possible fault
		 * on the user lock accessed at "out_drop".
		 *
		 * NOTE(review): despite the sentence above, both exits below
		 * go to out_nodrop, which calls no_fault() straight away and
		 * never touches the user-level mutex, so on this path the
		 * mutex is not dropped and the nested on_fault() protects
		 * nothing. lwp_cond_wait() is said to be the model for this
		 * function; confirm whether "goto out_drop" was intended.
		 */
		if (on_fault(&ljb)) {
			if (mlocked) {
				mlocked = 0;
				lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
			}
			error = EFAULT;
			goto out_nodrop;
		}
		error = EFAULT;
		goto out_nodrop;
	}

	/* Process rd_wr (including sanity check). */
	try_flag = (rd_wr & TRY_FLAG);
	rd_wr &= ~TRY_FLAG;
	if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
		error = EINVAL;
		goto out_nodrop;
	}

	/*
	 * Force Copy-on-write if necessary and ensure that the
	 * synchronization object resides in read/write memory.
	 * Cause an EFAULT return now if this is not so.
	 */
	mp = &rw->mutex;
	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
	suword8_noerr(&mp->mutex_type, mtype);
	suword16_noerr(&rw->rwlock_type, type);

	/* We can only continue for simple USYNC_PROCESS locks. */
	if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
		error = EINVAL;
		goto out_nodrop;
	}

	/* Convert user level mutex, "mp", to a unique lwpchan. */
	if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
	    &mlwpchan, LWPCHAN_MPPOOL)) {
		error = EFAULT;
		goto out_nodrop;
	}

	/* Convert user level rwlock, "rw", to a unique lwpchan. */
	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
	    &lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out_nodrop;
	}

	/* Both lwpchans are valid from here on; the fault handler may use them. */
	no_lwpchan = 0;
	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
	mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);

	/*
	 * lwpchan_lock() ensures that the calling LWP is put to sleep
	 * atomically with respect to a possible wakeup which is a result
	 * of lwp_rwlock_unlock().
	 *
	 * What's misleading is that the LWP is put to sleep after the
	 * rwlock's mutex is released. This is OK as long as the release
	 * operation is also done while holding mlwpchan. The LWP is then
	 * put to sleep when the possibility of pagefaulting or sleeping
	 * has been completely eliminated.
	 */
	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
	mlocked = 1;

	/*
	 * Fetch the current rwlock state.
	 *
	 * The possibility of spurious wake-ups or killed waiters means
	 * rwstate's URW_HAS_WAITERS bit may indicate false positives.
	 * We only fix these if they are important to us.
	 *
	 * Although various error states can be observed here (e.g. the lock
	 * is not held, but there are waiters) we assume these are application
	 * errors and so we take no corrective action.
	 */
	fuword32_noerr(&rw->rwlock_readers, &rwstate);
	/*
	 * We cannot legitimately get here from user-level
	 * without URW_HAS_WAITERS being set.
	 * Set it now to guard against user-level error.
	 */
	rwstate |= URW_HAS_WAITERS;

	/*
	 * We can try only if the lock isn't held by a writer.
	 */
	if (!(rwstate & URW_WRITE_LOCKED)) {
		tp = lwp_queue_waiter(&lwpchan);
		if (tp == NULL) {
			/*
			 * Hmmm, rwstate indicates waiters but there are
			 * none queued. This could just be the result of a
			 * spurious wakeup, so let's ignore it.
			 *
			 * We now have a chance to acquire the lock
			 * uncontended, but this is the last chance for
			 * a writer to acquire the lock without blocking.
			 */
			if (rd_wr == READ_LOCK) {
				rwstate++;
				acquired = 1;
			} else if ((rwstate & URW_READERS_MASK) == 0) {
				rwstate |= URW_WRITE_LOCKED;
				acquired = 1;
			}
		} else if (rd_wr == READ_LOCK) {
			/*
			 * This is the last chance for a reader to acquire
			 * the lock now, but it can only do so if there is
			 * no writer of equal or greater priority at the
			 * head of the queue.
			 *
			 * It is also just possible that there is a reader
			 * at the head of the queue. This may be the result
			 * of a spurious wakeup or an application failure.
			 * In this case we only acquire the lock if we have
			 * equal or greater priority. It is not our job to
			 * release spurious waiters.
			 */
			pri_t our_pri = DISP_PRIO(t);
			pri_t his_pri = DISP_PRIO(tp);

			if ((our_pri > his_pri) || ((our_pri == his_pri) &&
			    !(tp->t_writer & TRW_WANT_WRITE))) {
				rwstate++;
				acquired = 1;
			}
		}
	}

	if (acquired || try_flag || time_error) {
		/*
		 * We're not going to block this time.
		 */
		suword32_noerr(&rw->rwlock_readers, rwstate);
		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		locked = 0;

		if (acquired) {
			/*
			 * Got the lock!
			 */
			error = 0;

		} else if (try_flag) {
			/*
			 * We didn't get the lock and we're about to block.
			 * If we're doing a trylock, return EBUSY instead.
			 */
			error = EBUSY;

		} else if (time_error) {
			/*
			 * The SUSV3 POSIX spec is very clear that we should
			 * get no error from validating the timer (above)
			 * until we would actually sleep.
			 */
			error = time_error;
		}

		goto out_drop;
	}

	/*
	 * We're about to block, so indicate what kind of waiter we are.
	 */
	t->t_writer = 0;
	if (rd_wr == WRITE_LOCK)
		t->t_writer = TRW_WANT_WRITE;
	suword32_noerr(&rw->rwlock_readers, rwstate);

	/*
	 * Unlock the rwlock's mutex (pagefaults are possible here).
	 */
	set_owner_pid(mp, 0, 0);
	ulock_clear(&mp->mutex_lockw);
	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
	if (mwaiters != 0) {
		/*
		 * Given the locking of mlwpchan around the release of
		 * the mutex and checking for waiters, the following
		 * call to lwp_release() can fail ONLY if the lock
		 * acquirer is interrupted after setting the waiter bit,
		 * calling lwp_block() and releasing mlwpchan.
		 * In this case, it could get pulled off the LWP sleep
		 * queue (via setrun()) before the following call to
		 * lwp_release() occurs, and the lock requestor will
		 * update the waiter bit correctly by re-evaluating it.
		 */
		if (lwp_release(&mlwpchan, &mwaiters, 0))
			suword8_noerr(&mp->mutex_waiters, mwaiters);
	}
	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
	mlocked = 0;
	/* No user memory is touched between here and the final return. */
	no_fault();

	if (mwatched) {
		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
		mwatched = 0;
	}
	if (watched) {
		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
		watched = 0;
	}

	if (timedwait) {
		/*
		 * If we successfully queue the timeout,
		 * then don't drop t_delay_lock until
		 * we are on the sleep queue (below).
		 */
		mutex_enter(&t->t_delay_lock);
		if (lwp_timer_enqueue(&lwpt) != 0) {
			mutex_exit(&t->t_delay_lock);
			imm_timeout = 1;
			timedwait = NULL;
		}
	}
	t->t_flag |= T_WAITCVSEM;
	lwp_block(&lwpchan);

	/*
	 * Nothing should happen to cause the LWP to go to sleep until after
	 * it returns from swtch().
	 */
	if (timedwait)
		mutex_exit(&t->t_delay_lock);
	locked = 0;
	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
	/* Don't really sleep if there is already a reason to return. */
	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
		setrun(t);
	swtch();

	/*
	 * We're back, but we need to work out why. Were we interrupted? Did
	 * we timeout? Were we granted the lock?
	 */
	error = EAGAIN;
	acquired = (t->t_writer & TRW_LOCK_GRANTED);
	t->t_writer = 0;
	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
	if (timedwait)
		tim = lwp_timer_dequeue(&lwpt);
	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
		error = EINTR;
	else if (imm_timeout || (timedwait && tim == -1))
		error = ETIME;
	lwp->lwp_asleep = 0;
	lwp->lwp_sysabort = 0;
	setallwatch();

	/*
	 * If we were granted the lock we don't care about EINTR or ETIME.
	 */
	if (acquired)
		error = 0;

	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);

	if (error)
		return (set_errno(error));
	return (0);

out_drop:
	/*
	 * Make sure that the user level lock is dropped before returning
	 * to the caller.
	 */
	if (!mlocked) {
		lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
		mlocked = 1;
	}
	set_owner_pid(mp, 0, 0);
	ulock_clear(&mp->mutex_lockw);
	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
	if (mwaiters != 0) {
		/*
		 * See comment above on lock clearing and lwp_release()
		 * success/failure.
		 */
		if (lwp_release(&mlwpchan, &mwaiters, 0))
			suword8_noerr(&mp->mutex_waiters, mwaiters);
	}
	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
	mlocked = 0;

out_nodrop:
	/* Common exit for the paths that do not go to sleep. */
	no_fault();
	if (mwatched)
		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
	if (watched)
		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);
	if (error)
		return (set_errno(error));
	return (0);
}
2714 
2715 /*
2716  * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
2717  * we never drop the lock.
2718  */
2719 static int
2720 lwp_rwlock_unlock(lwp_rwlock_t *rw)
2721 {
2722 	kthread_t *t = curthread;
2723 	proc_t *p = ttoproc(t);
2724 	lwpchan_t lwpchan;
2725 	volatile uint16_t type = 0;
2726 	volatile int error = 0;
2727 	volatile int locked = 0;
2728 	volatile int watched = 0;
2729 	label_t ljb;
2730 	volatile int no_lwpchan = 1;
2731 	uint32_t rwstate;
2732 
2733 	/* We only check rw because the mutex is included in it. */
2734 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2735 		return (set_errno(EFAULT));
2736 
2737 	if (on_fault(&ljb)) {
2738 		if (no_lwpchan) {
2739 			error = EFAULT;
2740 			goto out_nodrop;
2741 		}
2742 		if (locked) {
2743 			locked = 0;
2744 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2745 		}
2746 		error = EFAULT;
2747 		goto out_nodrop;
2748 	}
2749 
2750 	/*
2751 	 * Force Copy-on-write if necessary and ensure that the
2752 	 * synchronization object resides in read/write memory.
2753 	 * Cause an EFAULT return now if this is not so.
2754 	 */
2755 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2756 	suword16_noerr(&rw->rwlock_type, type);
2757 
2758 	/* We can only continue for simple USYNC_PROCESS locks. */
2759 	if (type != USYNC_PROCESS) {
2760 		error = EINVAL;
2761 		goto out_nodrop;
2762 	}
2763 
2764 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2765 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2766 	    &lwpchan, LWPCHAN_CVPOOL)) {
2767 		error = EFAULT;
2768 		goto out_nodrop;
2769 	}
2770 
2771 	no_lwpchan = 0;
2772 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2773 
2774 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2775 	locked = 1;
2776 
2777 	/*
2778 	 * We can resolve multiple readers (except the last reader) here.
2779 	 * For the last reader or a writer we need lwp_rwlock_release(),
2780 	 * to which we also delegate the task of copying the new rwstate
2781 	 * back to userland (see the comment there).
2782 	 */
2783 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2784 	if (rwstate & URW_WRITE_LOCKED)
2785 		lwp_rwlock_release(&lwpchan, rw);
2786 	else if ((rwstate & URW_READERS_MASK) > 0) {
2787 		rwstate--;
2788 		if ((rwstate & URW_READERS_MASK) == 0)
2789 			lwp_rwlock_release(&lwpchan, rw);
2790 		else
2791 			suword32_noerr(&rw->rwlock_readers, rwstate);
2792 	}
2793 
2794 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2795 	locked = 0;
2796 	error = 0;
2797 
2798 out_nodrop:
2799 	no_fault();
2800 	if (watched)
2801 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2802 	if (error)
2803 		return (set_errno(error));
2804 	return (0);
2805 }
2806 
2807 int
2808 lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
2809 {
2810 	switch (subcode) {
2811 	case 0:
2812 		return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
2813 	case 1:
2814 		return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
2815 	case 2:
2816 		return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
2817 	case 3:
2818 		return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
2819 	case 4:
2820 		return (lwp_rwlock_unlock(rwlp));
2821 	}
2822 	return (set_errno(EINVAL));
2823 }
2824 
2825 /*
2826  * Return the owner of the user-level s-object.
2827  * Since we can't really do this, return NULL.
2828  */
2829 /* ARGSUSED */
2830 static kthread_t *
2831 lwpsobj_owner(caddr_t sobj)
2832 {
2833 	return ((kthread_t *)NULL);
2834 }
2835 
2836 /*
2837  * Wake up a thread asleep on a user-level synchronization
2838  * object.
2839  */
2840 static void
2841 lwp_unsleep(kthread_t *t)
2842 {
2843 	ASSERT(THREAD_LOCK_HELD(t));
2844 	if (t->t_wchan0 != NULL) {
2845 		sleepq_head_t *sqh;
2846 		sleepq_t *sqp = t->t_sleepq;
2847 
2848 		if (sqp != NULL) {
2849 			sqh = lwpsqhash(&t->t_lwpchan);
2850 			ASSERT(&sqh->sq_queue == sqp);
2851 			sleepq_unsleep(t);
2852 			disp_lock_exit_high(&sqh->sq_lock);
2853 			CL_SETRUN(t);
2854 			return;
2855 		}
2856 	}
2857 	panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
2858 }
2859 
2860 /*
2861  * Change the priority of a thread asleep on a user-level
2862  * synchronization object. To maintain proper priority order,
2863  * we:
2864  *	o dequeue the thread.
2865  *	o change its priority.
2866  *	o re-enqueue the thread.
2867  * Assumption: the thread is locked on entry.
2868  */
2869 static void
2870 lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
2871 {
2872 	ASSERT(THREAD_LOCK_HELD(t));
2873 	if (t->t_wchan0 != NULL) {
2874 		sleepq_t   *sqp = t->t_sleepq;
2875 
2876 		sleepq_dequeue(t);
2877 		*t_prip = pri;
2878 		sleepq_insert(sqp, t);
2879 	} else
2880 		panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
2881 }
2882 
/*
 * Clean up a left-over process-shared robust mutex
 *
 * 'ent' is the lwpchan cache entry describing the mutex; nothing is done
 * unless its type has both USYNC_PROCESS and LOCK_ROBUST set.  'lockflg'
 * is handed on to lwp_clear_mutex() when the mutex turns out to be owned
 * by the current process.
 *
 * There is no return value: a fault on the user-level mutex simply ends
 * the cleanup after any lock taken here has been released.
 */
static void
lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
{
	uint16_t flag;
	uchar_t waiters;
	label_t ljb;
	pid_t owner_pid;
	lwp_mutex_t *lp;
	volatile int locked = 0;
	volatile int watched = 0;
	volatile struct upimutex *upimutex = NULL;
	volatile int upilocked = 0;

	if ((ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
	    != (USYNC_PROCESS | LOCK_ROBUST))
		return;

	lp = (lwp_mutex_t *)ent->lwpchan_addr;
	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	/* Taken when one of the user accesses below faults. */
	if (on_fault(&ljb)) {
		if (locked)
			lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
		if (upilocked)
			upimutex_unlock((upimutex_t *)upimutex, 0);
		goto out;
	}

	fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);

	if (UPIMUTEX(ent->lwpchan_type)) {
		lwpchan_t lwpchan = ent->lwpchan_lwpchan;
		upib_t *upibp = &UPI_CHAIN(lwpchan);

		/*
		 * A upimutex is only cleaned up when this process is the
		 * recorded owner and the current thread owns the upimutex
		 * found for this lwpchan.
		 */
		if (owner_pid != curproc->p_pid)
			goto out;
		mutex_enter(&upibp->upib_lock);
		upimutex = upi_get(upibp, &lwpchan);
		if (upimutex == NULL || upimutex->upi_owner != curthread) {
			mutex_exit(&upibp->upib_lock);
			goto out;
		}
		mutex_exit(&upibp->upib_lock);
		/* From here on the fault handler must unlock the upimutex. */
		upilocked = 1;
		flag = lwp_clear_mutex(lp, lockflg);
		suword8_noerr(&lp->mutex_lockw, 0);
		upimutex_unlock((upimutex_t *)upimutex, flag);
	} else {
		lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
		locked = 1;
		/*
		 * Clear the spinners count because one of our
		 * threads could have been spinning for this lock
		 * at user level when the process was suddenly killed.
		 * There is no harm in this since user-level libc code
		 * will adapt to the sudden change in the spinner count.
		 */
		suword8_noerr(&lp->mutex_spinners, 0);
		if (owner_pid != curproc->p_pid) {
			/*
			 * We are not the owner.  There may or may not be one.
			 * If there are waiters, we wake up one or all of them.
			 * It doesn't hurt to wake them up in error since
			 * they will just retry the lock and go to sleep
			 * again if necessary.
			 */
			fuword8_noerr(&lp->mutex_waiters, &waiters);
			if (waiters != 0) {	/* there are waiters */
				fuword16_noerr(&lp->mutex_flag, &flag);
				if (flag & LOCK_NOTRECOVERABLE) {
					/* Wake them all; none is left waiting. */
					lwp_release_all(&ent->lwpchan_lwpchan);
					suword8_noerr(&lp->mutex_waiters, 0);
				} else if (lwp_release(&ent->lwpchan_lwpchan,
				    &waiters, 0)) {
					/* One was woken; store the new value. */
					suword8_noerr(&lp->mutex_waiters,
					    waiters);
				}
			}
		} else {
			/*
			 * We are the owner.  Release it.
			 */
			(void) lwp_clear_mutex(lp, lockflg);
			ulock_clear(&lp->mutex_lockw);
			fuword8_noerr(&lp->mutex_waiters, &waiters);
			if (waiters &&
			    lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
				suword8_noerr(&lp->mutex_waiters, waiters);
		}
		lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
	}
out:
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
}
2981 
2982 /*
2983  * Register a process-shared robust mutex in the lwpchan cache.
2984  */
2985 int
2986 lwp_mutex_register(lwp_mutex_t *lp, caddr_t uaddr)
2987 {
2988 	int error = 0;
2989 	volatile int watched;
2990 	label_t ljb;
2991 	uint8_t type;
2992 	lwpchan_t lwpchan;
2993 
2994 	if ((caddr_t)lp >= (caddr_t)USERLIMIT)
2995 		return (set_errno(EFAULT));
2996 
2997 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2998 
2999 	if (on_fault(&ljb)) {
3000 		error = EFAULT;
3001 	} else {
3002 		/*
3003 		 * Force Copy-on-write if necessary and ensure that the
3004 		 * synchronization object resides in read/write memory.
3005 		 * Cause an EFAULT return now if this is not so.
3006 		 */
3007 		fuword8_noerr(&lp->mutex_type, &type);
3008 		suword8_noerr(&lp->mutex_type, type);
3009 		if ((type & (USYNC_PROCESS|LOCK_ROBUST))
3010 		    != (USYNC_PROCESS|LOCK_ROBUST)) {
3011 			error = EINVAL;
3012 		} else if (!lwpchan_get_mapping(curproc->p_as, (caddr_t)lp,
3013 		    uaddr, type, &lwpchan, LWPCHAN_MPPOOL)) {
3014 			error = EFAULT;
3015 		}
3016 	}
3017 	no_fault();
3018 	if (watched)
3019 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3020 	if (error)
3021 		return (set_errno(error));
3022 	return (0);
3023 }
3024 
3025 /*
3026  * There is a user-level robust lock registration in libc.
3027  * Mark it as invalid by storing -1 into the location of the pointer.
3028  */
3029 static void
3030 lwp_mutex_unregister(void *uaddr)
3031 {
3032 	if (get_udatamodel() == DATAMODEL_NATIVE) {
3033 		(void) sulword(uaddr, (ulong_t)-1);
3034 #ifdef _SYSCALL32_IMPL
3035 	} else {
3036 		(void) suword32(uaddr, (uint32_t)-1);
3037 #endif
3038 	}
3039 }
3040 
/*
 * Make a single attempt to acquire the user-level mutex at lp.
 *
 * lp    - user address of the mutex; must be below the address space's
 *         a_userlimit.
 * owner - value handed to set_owner_pid() when the lock is obtained.
 *
 * Returns 0 on success, otherwise the result of set_errno() for:
 *	EFAULT		 lp is out of range, a user-memory access faulted,
 *			 or get_lwpchan() failed;
 *	EBUSY		 ulock_try() did not get the lock word;
 *	ENOTRECOVERABLE	 robust mutex already flagged LOCK_NOTRECOVERABLE;
 *	EOWNERDEAD,
 *	ELOCKUNMAPPED	 ulock_try() succeeded on a robust mutex whose flag
 *			 has LOCK_OWNERDEAD or LOCK_UNMAPPED set.
 * Mutexes for which UPIMUTEX(type) is true are handed to
 * lwp_upimutex_lock() with UPIMUTEX_TRY and its error is returned.
 */
3041 int
3042 lwp_mutex_trylock(lwp_mutex_t *lp, uintptr_t owner)
3043 {
3044 	kthread_t *t = curthread;
3045 	proc_t *p = ttoproc(t);
3046 	int error = 0;
3047 	volatile int locked = 0;
3048 	volatile int watched = 0;
3049 	label_t ljb;
3050 	volatile uint8_t type = 0;
3051 	uint16_t flag;
3052 	lwpchan_t lwpchan;
3053 
3054 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3055 		return (set_errno(EFAULT));
3056 
	/*
	 * Switch to the LMS_USER_LOCK microstate; it is switched back to
	 * LMS_SYSTEM at out:.  The UPIMUTEX path below returns directly
	 * and does not perform that switch here.
	 */
3057 	(void) new_mstate(t, LMS_USER_LOCK);
3058 
	/*
	 * Fault recovery: this branch presumably runs when one of the
	 * *_noerr user accesses below faults (TODO confirm on_fault()
	 * semantics).  'locked' and 'watched' are volatile so that the
	 * cleanup here and at out: sees their current values.
	 */
3059 	if (on_fault(&ljb)) {
3060 		if (locked)
3061 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3062 		error = EFAULT;
3063 		goto out;
3064 	}
3065 	/*
3066 	 * Force Copy-on-write if necessary and ensure that the
3067 	 * synchronization object resides in read/write memory.
3068 	 * Cause an EFAULT return now if this is not so.
3069 	 */
3070 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3071 	suword8_noerr(&lp->mutex_type, type);
3072 	if (UPIMUTEX(type)) {
		/*
		 * Delegate to lwp_upimutex_lock() after removing our
		 * fault handler.  The owner is recorded for a clean
		 * success and for the EOWNERDEAD / ELOCKUNMAPPED results;
		 * the pid is stored only for USYNC_PROCESS mutexes.
		 */
3073 		no_fault();
3074 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
3075 		if (error == 0 || error == EOWNERDEAD || error == ELOCKUNMAPPED)
3076 			set_owner_pid(lp, owner,
3077 			    (type & USYNC_PROCESS)? p->p_pid : 0);
3078 		if (error)
3079 			return (set_errno(error));
3080 		return (0);
3081 	}
3082 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3083 	    &lwpchan, LWPCHAN_MPPOOL)) {
3084 		error = EFAULT;
3085 		goto out;
3086 	}
3087 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3088 	locked = 1;
3089 	if (type & LOCK_ROBUST) {
		/* A robust mutex marked not-recoverable can never be taken. */
3090 		fuword16_noerr(&lp->mutex_flag, &flag);
3091 		if (flag & LOCK_NOTRECOVERABLE) {
3092 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3093 			error =  ENOTRECOVERABLE;
3094 			goto out;
3095 		}
3096 	}
3097 
	/*
	 * 'watched' records the result of watch_disable_addr(); when it
	 * is non-zero, watch_enable_addr() is called at out:.
	 */
3098 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3099 
3100 	if (!ulock_try(&lp->mutex_lockw))
3101 		error = EBUSY;
3102 	else {
		/*
		 * Lock word obtained.  Record the owner, then for a robust
		 * mutex report the state left in the flag word:
		 * LOCK_OWNERDEAD gives EOWNERDEAD; LOCK_UNMAPPED alone gives
		 * ELOCKUNMAPPED only for USYNC_PROCESS_ROBUST types, and
		 * EOWNERDEAD otherwise.
		 */
3103 		set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0);
3104 		if (type & LOCK_ROBUST) {
3105 			fuword16_noerr(&lp->mutex_flag, &flag);
3106 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3107 				if (flag & LOCK_OWNERDEAD)
3108 					error = EOWNERDEAD;
3109 				else if (type & USYNC_PROCESS_ROBUST)
3110 					error = ELOCKUNMAPPED;
3111 				else
3112 					error = EOWNERDEAD;
3113 			}
3114 		}
3115 	}
	/* Clear 'locked' before unlocking so the fault path cannot unlock twice. */
3116 	locked = 0;
3117 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3118 out:
3119 
3120 	if (t->t_mstate == LMS_USER_LOCK)
3121 		(void) new_mstate(t, LMS_SYSTEM);
3122 
3123 	no_fault();
3124 	if (watched)
3125 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3126 	if (error)
3127 		return (set_errno(error));
3128 	return (0);
3129 }
3130 
3131 /*
3132  * Unlock the mutex and unblock an lwp that is trying to acquire it.
3133  * The unblocked lwp resumes and retries the acquisition of the lock.
 *
 * lp - user address of the mutex; must be below the address space's
 *      a_userlimit.
 *
 * Returns 0 on success, otherwise the result of set_errno() with EFAULT
 * (lp out of range, a user-memory access faulted, or get_lwpchan()
 * failed).  Mutexes for which UPIMUTEX(type) is true are handed to
 * lwp_upimutex_unlock() and its error is returned.
3134  */
3135 int
3136 lwp_mutex_unlock(lwp_mutex_t *lp)
3137 {
3138 	proc_t *p = ttoproc(curthread);
3139 	lwpchan_t lwpchan;
3140 	uchar_t waiters;
3141 	volatile int locked = 0;
3142 	volatile int watched = 0;
3143 	volatile uint8_t type = 0;
3144 	label_t ljb;
3145 	uint16_t flag;
3146 	int error = 0;
3147 
3148 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3149 		return (set_errno(EFAULT));
3150 
	/*
	 * Fault recovery: this branch presumably runs when one of the
	 * *_noerr user accesses below faults (TODO confirm on_fault()
	 * semantics).  It drops the lwpchan lock if held and fails with
	 * EFAULT; 'locked' is volatile so its current value is seen here.
	 */
3151 	if (on_fault(&ljb)) {
3152 		if (locked)
3153 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3154 		error = EFAULT;
3155 		goto out;
3156 	}
3157 
3158 	/*
3159 	 * Force Copy-on-write if necessary and ensure that the
3160 	 * synchronization object resides in read/write memory.
3161 	 * Cause an EFAULT return now if this is not so.
3162 	 */
3163 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3164 	suword8_noerr(&lp->mutex_type, type);
3165 
3166 	if (UPIMUTEX(type)) {
		/* Delegate to lwp_upimutex_unlock() after removing our fault handler. */
3167 		no_fault();
3168 		error = lwp_upimutex_unlock(lp, type);
3169 		if (error)
3170 			return (set_errno(error));
3171 		return (0);
3172 	}
3173 
	/*
	 * 'watched' records the result of watch_disable_addr(); when it
	 * is non-zero, watch_enable_addr() is called at out:.
	 */
3174 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3175 
3176 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3177 	    &lwpchan, LWPCHAN_MPPOOL)) {
3178 		error = EFAULT;
3179 		goto out;
3180 	}
3181 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3182 	locked = 1;
3183 	if (type & LOCK_ROBUST) {
		/*
		 * Unlocking a robust mutex whose flag still has
		 * LOCK_OWNERDEAD or LOCK_UNMAPPED set replaces those bits
		 * with LOCK_NOTRECOVERABLE.  'flag' is read only on this
		 * path; the later test of 'flag' is guarded by the same
		 * LOCK_ROBUST check, so it is never read uninitialized.
		 */
3184 		fuword16_noerr(&lp->mutex_flag, &flag);
3185 		if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3186 			flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
3187 			flag |= LOCK_NOTRECOVERABLE;
3188 			suword16_noerr(&lp->mutex_flag, flag);
3189 		}
3190 	}
	/* Reset the recorded owner, then release the lock word. */
3191 	set_owner_pid(lp, 0, 0);
3192 	ulock_clear(&lp->mutex_lockw);
3193 	/*
3194 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
3195 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
3196 	 * may fail.  If it fails, do not write into the waiter bit.
3197 	 * The call to lwp_release() might fail due to one of three reasons:
3198 	 *
3199 	 * 	1. due to the thread which set the waiter bit not actually
3200 	 *	   sleeping since it got the lock on the re-try. The waiter
3201 	 *	   bit will then be correctly updated by that thread. This
3202 	 *	   window may be closed by reading the wait bit again here
3203 	 *	   and not calling lwp_release() at all if it is zero.
3204 	 *	2. the thread which set the waiter bit and went to sleep
3205 	 *	   was woken up by a signal. This time, the waiter recomputes
3206 	 *	   the wait bit in the return with EINTR code.
3207 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
3208 	 *	   memory that has been re-used after the lock was dropped.
3209 	 *	   In this case, writing into the waiter bit would cause data
3210 	 *	   corruption.
3211 	 */
3212 	fuword8_noerr(&lp->mutex_waiters, &waiters);
3213 	if (waiters) {
		/*
		 * A robust mutex marked LOCK_NOTRECOVERABLE releases every
		 * waiter and zeroes the waiters byte; otherwise
		 * lwp_release() is called and, only if it returns non-zero,
		 * the value it left in 'waiters' is stored back.
		 */
3214 		if ((type & LOCK_ROBUST) &&
3215 		    (flag & LOCK_NOTRECOVERABLE)) {
3216 			lwp_release_all(&lwpchan);
3217 			suword8_noerr(&lp->mutex_waiters, 0);
3218 		} else if (lwp_release(&lwpchan, &waiters, 0)) {
3219 			suword8_noerr(&lp->mutex_waiters, waiters);
3220 		}
3221 	}
3222 
3223 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3224 out:
3225 	no_fault();
3226 	if (watched)
3227 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3228 	if (error)
3229 		return (set_errno(error));
3230 	return (0);
3231 }
3232