xref: /illumos-gate/usr/src/uts/common/syscall/lwp_sobj.c (revision 796b8631498f69a3e21b5c35aee280499f64420e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2015 Joyent, Inc.
26  */
27 
28 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
29 /*	  All Rights Reserved	*/
30 
31 #include <sys/param.h>
32 #include <sys/types.h>
33 #include <sys/sysmacros.h>
34 #include <sys/systm.h>
35 #include <sys/cred.h>
36 #include <sys/user.h>
37 #include <sys/errno.h>
38 #include <sys/file.h>
39 #include <sys/proc.h>
40 #include <sys/prsystm.h>
41 #include <sys/kmem.h>
42 #include <sys/sobject.h>
43 #include <sys/fault.h>
44 #include <sys/procfs.h>
45 #include <sys/watchpoint.h>
46 #include <sys/time.h>
47 #include <sys/cmn_err.h>
48 #include <sys/machlock.h>
49 #include <sys/debug.h>
50 #include <sys/synch.h>
51 #include <sys/synch32.h>
52 #include <sys/mman.h>
53 #include <sys/class.h>
54 #include <sys/schedctl.h>
55 #include <sys/sleepq.h>
56 #include <sys/policy.h>
57 #include <sys/tnf_probe.h>
58 #include <sys/lwpchan_impl.h>
59 #include <sys/turnstile.h>
60 #include <sys/atomic.h>
61 #include <sys/lwp_timer_impl.h>
62 #include <sys/lwp_upimutex_impl.h>
63 #include <vm/as.h>
64 #include <sys/sdt.h>
65 
/*
 * Forward declarations for file-local routines that are referenced
 * before their definitions.  lwpsobj_owner(), lwp_unsleep() and
 * lwp_change_pri() are the callbacks installed in lwp_sobj_ops below;
 * lwp_mutex_cleanup() and lwp_mutex_unregister() are used by the
 * lwpchan cache teardown paths.
 */
66 static kthread_t *lwpsobj_owner(caddr_t);
67 static void lwp_unsleep(kthread_t *t);
68 static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
69 static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);
70 static void lwp_mutex_unregister(void *uaddr);
71 static void set_owner_pid(lwp_mutex_t *, uintptr_t, pid_t);
72 static int iswanted(kthread_t *, lwpchan_t *);
73 
74 extern int lwp_cond_signal(lwp_cond_t *cv);
75 
76 /*
77  * Maximum number of user prio inheritance locks that can be held by a thread.
78  * Used to limit kmem for each thread. This is a per-thread limit that
79  * can be administered on a system wide basis (using /etc/system).
80  *
81  * Also, when a limit, say maxlwps is added for numbers of lwps within a
82  * process, the per-thread limit automatically becomes a process-wide limit
83  * of maximum number of held upi locks within a process:
84  *      maxheldupimx = maxnestupimx * maxlwps;
85  */
/*
 * lwp_upimutex_lock() enforces the limit: exceeding it yields ENOMEM
 * unless secpolicy_resource() returns zero for the caller's credentials.
 */
86 static uint32_t maxnestupimx = 2000;
87 
88 /*
89  * The sobj_ops vector exports a set of functions needed when a thread
90  * is asleep on a synchronization object of this type.
91  */
/* Installed in t_sobj_ops by lwp_block() for ordinary (non-PI) sleeps. */
92 static sobj_ops_t lwp_sobj_ops = {
93 	SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
94 };
95 
96 static kthread_t *lwpsobj_pi_owner(upimutex_t *up);
97 
/*
 * sobj_ops vector for threads blocked on a priority-inheritance mutex;
 * lwp_upimutex_lock() hands it to turnstile_block().
 */
98 static sobj_ops_t lwp_sobj_pi_ops = {
99 	SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
100 	turnstile_change_pri
101 };
102 
/*
 * lwpsleepq[] holds the sleep queue heads used by lwp_block(); the
 * entry is selected by lwpsqhash().
 * upimutextab[] holds the chains of kernel upimutex objects.
 * NOTE(review): presumably indexed through UPI_CHAIN(), which is
 * defined outside this file -- confirm.
 */
103 static sleepq_head_t	lwpsleepq[NSLEEPQ];
104 upib_t			upimutextab[UPIMUTEX_TABSIZE];
105 
106 #define	LWPCHAN_LOCK_SHIFT	10	/* 1024 locks for each pool */
107 #define	LWPCHAN_LOCK_SIZE	(1 << LWPCHAN_LOCK_SHIFT)
108 
109 /*
110  * We know that both lc_wchan and lc_wchan0 are addresses that most
111  * likely are 8-byte aligned, so we shift off the low-order 3 bits.
112  * 'pool' is either 0 or 1.
113  */
/*
 * X is the XOR of lc_wchan and lc_wchan0 as computed by lwpchan_lock()
 * and lwpchan_unlock().  The result is an index into lwpchanlock[]:
 * pool 0 maps to the first LWPCHAN_LOCK_SIZE entries and a non-zero
 * pool to the second LWPCHAN_LOCK_SIZE entries.  X is expanded twice,
 * so it must be free of side effects.
 */
114 #define	LWPCHAN_LOCK_HASH(X, pool) \
115 	(((((X) >> 3) ^ ((X) >> (LWPCHAN_LOCK_SHIFT + 3))) & \
116 	(LWPCHAN_LOCK_SIZE - 1)) + ((pool)? LWPCHAN_LOCK_SIZE : 0))
117 
/* Two pools of LWPCHAN_LOCK_SIZE mutexes, indexed by LWPCHAN_LOCK_HASH(). */
118 static kmutex_t		lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
119 
120 /*
121  * Is this a POSIX threads user-level lock requiring priority inheritance?
122  */
123 #define	UPIMUTEX(type)	((type) & LOCK_PRIO_INHERIT)
124 
125 static sleepq_head_t *
126 lwpsqhash(lwpchan_t *lwpchan)
127 {
128 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
129 	return (&lwpsleepq[SQHASHINDEX(x)]);
130 }
131 
132 /*
133  * Lock an lwpchan.
134  * Keep this in sync with lwpchan_unlock(), below.
135  */
136 static void
137 lwpchan_lock(lwpchan_t *lwpchan, int pool)
138 {
139 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
140 	mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
141 }
142 
143 /*
144  * Unlock an lwpchan.
145  * Keep this in sync with lwpchan_lock(), above.
146  */
147 static void
148 lwpchan_unlock(lwpchan_t *lwpchan, int pool)
149 {
150 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
151 	mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
152 }
153 
154 /*
155  * Delete mappings from the lwpchan cache for pages that are being
156  * unmapped by as_unmap().  Given a range of addresses, "start" to "end",
157  * all mappings within the range are deleted from the lwpchan cache.
158  */
159 void
160 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
161 {
162 	lwpchan_data_t *lcp;
163 	lwpchan_hashbucket_t *hashbucket;
164 	lwpchan_hashbucket_t *endbucket;
165 	lwpchan_entry_t *ent;
166 	lwpchan_entry_t **prev;
167 	caddr_t addr;
168 
169 	mutex_enter(&p->p_lcp_lock);
170 	lcp = p->p_lcp;
171 	hashbucket = lcp->lwpchan_cache;
172 	endbucket = hashbucket + lcp->lwpchan_size;
173 	for (; hashbucket < endbucket; hashbucket++) {
174 		if (hashbucket->lwpchan_chain == NULL)
175 			continue;
176 		mutex_enter(&hashbucket->lwpchan_lock);
177 		prev = &hashbucket->lwpchan_chain;
178 		/* check entire chain */
179 		while ((ent = *prev) != NULL) {
180 			addr = ent->lwpchan_addr;
181 			if (start <= addr && addr < end) {
182 				*prev = ent->lwpchan_next;
183 				/*
184 				 * We do this only for the obsolete type
185 				 * USYNC_PROCESS_ROBUST.  Otherwise robust
186 				 * locks do not draw ELOCKUNMAPPED or
187 				 * EOWNERDEAD due to being unmapped.
188 				 */
189 				if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
190 				    (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
191 					lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
192 				/*
193 				 * If there is a user-level robust lock
194 				 * registration, mark it as invalid.
195 				 */
196 				if ((addr = ent->lwpchan_uaddr) != NULL)
197 					lwp_mutex_unregister(addr);
198 				kmem_free(ent, sizeof (*ent));
199 				atomic_dec_32(&lcp->lwpchan_entries);
200 			} else {
201 				prev = &ent->lwpchan_next;
202 			}
203 		}
204 		mutex_exit(&hashbucket->lwpchan_lock);
205 	}
206 	mutex_exit(&p->p_lcp_lock);
207 }
208 
209 /*
210  * Given an lwpchan cache pointer and a process virtual address,
211  * return a pointer to the corresponding lwpchan hash bucket.
212  */
213 static lwpchan_hashbucket_t *
214 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
215 {
216 	uint_t i;
217 
218 	/*
219 	 * All user-level sync object addresses are 8-byte aligned.
220 	 * Ignore the lowest 3 bits of the address and use the
221 	 * higher-order 2*lwpchan_bits bits for the hash index.
222 	 */
223 	addr >>= 3;
224 	i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
225 	return (lcp->lwpchan_cache + i);
226 }
227 
228 /*
229  * (Re)allocate the per-process lwpchan cache.
230  */
231 static void
232 lwpchan_alloc_cache(proc_t *p, uint_t bits)
233 {
234 	lwpchan_data_t *lcp;
235 	lwpchan_data_t *old_lcp;
236 	lwpchan_hashbucket_t *hashbucket;
237 	lwpchan_hashbucket_t *endbucket;
238 	lwpchan_hashbucket_t *newbucket;
239 	lwpchan_entry_t *ent;
240 	lwpchan_entry_t *next;
241 	uint_t count;
242 
243 	ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
244 
245 	lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
246 	lcp->lwpchan_bits = bits;
247 	lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
248 	lcp->lwpchan_mask = lcp->lwpchan_size - 1;
249 	lcp->lwpchan_entries = 0;
250 	lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
251 	    sizeof (lwpchan_hashbucket_t), KM_SLEEP);
252 	lcp->lwpchan_next_data = NULL;
253 
254 	mutex_enter(&p->p_lcp_lock);
255 	if ((old_lcp = p->p_lcp) != NULL) {
256 		if (old_lcp->lwpchan_bits >= bits) {
257 			/* someone beat us to it */
258 			mutex_exit(&p->p_lcp_lock);
259 			kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
260 			    sizeof (lwpchan_hashbucket_t));
261 			kmem_free(lcp, sizeof (lwpchan_data_t));
262 			return;
263 		}
264 		/*
265 		 * Acquire all of the old hash table locks.
266 		 */
267 		hashbucket = old_lcp->lwpchan_cache;
268 		endbucket = hashbucket + old_lcp->lwpchan_size;
269 		for (; hashbucket < endbucket; hashbucket++)
270 			mutex_enter(&hashbucket->lwpchan_lock);
271 		/*
272 		 * Move all of the old hash table entries to the
273 		 * new hash table.  The new hash table has not yet
274 		 * been installed so we don't need any of its locks.
275 		 */
276 		count = 0;
277 		hashbucket = old_lcp->lwpchan_cache;
278 		for (; hashbucket < endbucket; hashbucket++) {
279 			ent = hashbucket->lwpchan_chain;
280 			while (ent != NULL) {
281 				next = ent->lwpchan_next;
282 				newbucket = lwpchan_bucket(lcp,
283 				    (uintptr_t)ent->lwpchan_addr);
284 				ent->lwpchan_next = newbucket->lwpchan_chain;
285 				newbucket->lwpchan_chain = ent;
286 				ent = next;
287 				count++;
288 			}
289 			hashbucket->lwpchan_chain = NULL;
290 		}
291 		lcp->lwpchan_entries = count;
292 	}
293 
294 	/*
295 	 * Retire the old hash table.  We can't actually kmem_free() it
296 	 * now because someone may still have a pointer to it.  Instead,
297 	 * we link it onto the new hash table's list of retired hash tables.
298 	 * The new hash table is double the size of the previous one, so
299 	 * the total size of all retired hash tables is less than the size
300 	 * of the new one.  exit() and exec() free the retired hash tables
301 	 * (see lwpchan_destroy_cache(), below).
302 	 */
303 	lcp->lwpchan_next_data = old_lcp;
304 
305 	/*
306 	 * As soon as we store the new lcp, future locking operations will
307 	 * use it.  Therefore, we must ensure that all the state we've just
308 	 * established reaches global visibility before the new lcp does.
309 	 */
310 	membar_producer();
311 	p->p_lcp = lcp;
312 
313 	if (old_lcp != NULL) {
314 		/*
315 		 * Release all of the old hash table locks.
316 		 */
317 		hashbucket = old_lcp->lwpchan_cache;
318 		for (; hashbucket < endbucket; hashbucket++)
319 			mutex_exit(&hashbucket->lwpchan_lock);
320 	}
321 	mutex_exit(&p->p_lcp_lock);
322 }
323 
324 /*
325  * Deallocate the lwpchan cache, and any dynamically allocated mappings.
326  * Called when the process exits or execs.  All lwps except one have
327  * exited so we need no locks here.
328  */
329 void
330 lwpchan_destroy_cache(int exec)
331 {
332 	proc_t *p = curproc;
333 	lwpchan_hashbucket_t *hashbucket;
334 	lwpchan_hashbucket_t *endbucket;
335 	lwpchan_data_t *lcp;
336 	lwpchan_entry_t *ent;
337 	lwpchan_entry_t *next;
338 	uint16_t lockflg;
339 
340 	lcp = p->p_lcp;
341 	p->p_lcp = NULL;
342 
343 	lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
344 	hashbucket = lcp->lwpchan_cache;
345 	endbucket = hashbucket + lcp->lwpchan_size;
346 	for (; hashbucket < endbucket; hashbucket++) {
347 		ent = hashbucket->lwpchan_chain;
348 		hashbucket->lwpchan_chain = NULL;
349 		while (ent != NULL) {
350 			next = ent->lwpchan_next;
351 			if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
352 			    (ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
353 			    == (USYNC_PROCESS | LOCK_ROBUST))
354 				lwp_mutex_cleanup(ent, lockflg);
355 			kmem_free(ent, sizeof (*ent));
356 			ent = next;
357 		}
358 	}
359 
360 	while (lcp != NULL) {
361 		lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
362 		kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
363 		    sizeof (lwpchan_hashbucket_t));
364 		kmem_free(lcp, sizeof (lwpchan_data_t));
365 		lcp = next_lcp;
366 	}
367 }
368 
369 /*
370  * Return zero when there is an entry in the lwpchan cache for the
371  * given process virtual address and non-zero when there is not.
372  * The returned non-zero value is the current length of the
373  * hash chain plus one.  The caller holds the hash bucket lock.
374  */
375 static uint_t
376 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
377 	lwpchan_hashbucket_t *hashbucket)
378 {
379 	lwpchan_entry_t *ent;
380 	uint_t count = 1;
381 
382 	for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
383 		if (ent->lwpchan_addr == addr) {
384 			if (ent->lwpchan_type != type ||
385 			    ent->lwpchan_pool != pool) {
386 				/*
387 				 * This shouldn't happen, but might if the
388 				 * process reuses its memory for different
389 				 * types of sync objects.  We test first
390 				 * to avoid grabbing the memory cache line.
391 				 */
392 				ent->lwpchan_type = (uint16_t)type;
393 				ent->lwpchan_pool = (uint16_t)pool;
394 			}
395 			*lwpchan = ent->lwpchan_lwpchan;
396 			return (0);
397 		}
398 		count++;
399 	}
400 	return (count);
401 }
402 
403 /*
404  * Return the cached lwpchan mapping if cached, otherwise insert
405  * a virtual address to lwpchan mapping into the cache.
406  */
407 static int
408 lwpchan_get_mapping(struct as *as, caddr_t addr, caddr_t uaddr,
409 	int type, lwpchan_t *lwpchan, int pool)
410 {
411 	proc_t *p = curproc;
412 	lwpchan_data_t *lcp;
413 	lwpchan_hashbucket_t *hashbucket;
414 	lwpchan_entry_t *ent;
415 	memid_t	memid;
416 	uint_t count;
417 	uint_t bits;
418 
419 top:
420 	/* initialize the lwpchan cache, if necesary */
421 	if ((lcp = p->p_lcp) == NULL) {
422 		lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
423 		goto top;
424 	}
425 	hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
426 	mutex_enter(&hashbucket->lwpchan_lock);
427 	if (lcp != p->p_lcp) {
428 		/* someone resized the lwpchan cache; start over */
429 		mutex_exit(&hashbucket->lwpchan_lock);
430 		goto top;
431 	}
432 	if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
433 		/* it's in the cache */
434 		mutex_exit(&hashbucket->lwpchan_lock);
435 		return (1);
436 	}
437 	mutex_exit(&hashbucket->lwpchan_lock);
438 	if (as_getmemid(as, addr, &memid) != 0)
439 		return (0);
440 	lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
441 	lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
442 	ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
443 	mutex_enter(&hashbucket->lwpchan_lock);
444 	if (lcp != p->p_lcp) {
445 		/* someone resized the lwpchan cache; start over */
446 		mutex_exit(&hashbucket->lwpchan_lock);
447 		kmem_free(ent, sizeof (*ent));
448 		goto top;
449 	}
450 	count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
451 	if (count == 0) {
452 		/* someone else added this entry to the cache */
453 		mutex_exit(&hashbucket->lwpchan_lock);
454 		kmem_free(ent, sizeof (*ent));
455 		return (1);
456 	}
457 	if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
458 	    (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
459 		/* hash chain too long; reallocate the hash table */
460 		mutex_exit(&hashbucket->lwpchan_lock);
461 		kmem_free(ent, sizeof (*ent));
462 		lwpchan_alloc_cache(p, bits + 1);
463 		goto top;
464 	}
465 	ent->lwpchan_addr = addr;
466 	ent->lwpchan_uaddr = uaddr;
467 	ent->lwpchan_type = (uint16_t)type;
468 	ent->lwpchan_pool = (uint16_t)pool;
469 	ent->lwpchan_lwpchan = *lwpchan;
470 	ent->lwpchan_next = hashbucket->lwpchan_chain;
471 	hashbucket->lwpchan_chain = ent;
472 	atomic_inc_32(&lcp->lwpchan_entries);
473 	mutex_exit(&hashbucket->lwpchan_lock);
474 	return (1);
475 }
476 
477 /*
478  * Return a unique pair of identifiers that corresponds to a
479  * synchronization object's virtual address.  Process-shared
480  * sync objects usually get vnode/offset from as_getmemid().
481  */
482 static int
483 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
484 {
485 	/*
486 	 * If the lwp synch object is defined to be process-private,
487 	 * we just make the first field of the lwpchan be 'as' and
488 	 * the second field be the synch object's virtual address.
489 	 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
490 	 * The lwpchan cache is used only for process-shared objects.
491 	 */
492 	if (!(type & USYNC_PROCESS)) {
493 		lwpchan->lc_wchan0 = (caddr_t)as;
494 		lwpchan->lc_wchan = addr;
495 		return (1);
496 	}
497 
498 	return (lwpchan_get_mapping(as, addr, NULL, type, lwpchan, pool));
499 }
500 
501 static void
502 lwp_block(lwpchan_t *lwpchan)
503 {
504 	kthread_t *t = curthread;
505 	klwp_t *lwp = ttolwp(t);
506 	sleepq_head_t *sqh;
507 
508 	thread_lock(t);
509 	t->t_flag |= T_WAKEABLE;
510 	t->t_lwpchan = *lwpchan;
511 	t->t_sobj_ops = &lwp_sobj_ops;
512 	t->t_release = 0;
513 	sqh = lwpsqhash(lwpchan);
514 	disp_lock_enter_high(&sqh->sq_lock);
515 	CL_SLEEP(t);
516 	DTRACE_SCHED(sleep);
517 	THREAD_SLEEP(t, &sqh->sq_lock);
518 	sleepq_insert(&sqh->sq_queue, t);
519 	thread_unlock(t);
520 	lwp->lwp_asleep = 1;
521 	lwp->lwp_sysabort = 0;
522 	lwp->lwp_ru.nvcsw++;
523 	(void) new_mstate(curthread, LMS_SLEEP);
524 }
525 
526 static kthread_t *
527 lwpsobj_pi_owner(upimutex_t *up)
528 {
529 	return (up->upi_owner);
530 }
531 
532 static struct upimutex *
533 upi_get(upib_t *upibp, lwpchan_t *lcp)
534 {
535 	struct upimutex *upip;
536 
537 	for (upip = upibp->upib_first; upip != NULL;
538 	    upip = upip->upi_nextchain) {
539 		if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
540 		    upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
541 			break;
542 	}
543 	return (upip);
544 }
545 
546 static void
547 upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
548 {
549 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
550 
551 	/*
552 	 * Insert upimutex at front of list. Maybe a bit unfair
553 	 * but assume that not many lwpchans hash to the same
554 	 * upimutextab bucket, i.e. the list of upimutexes from
555 	 * upib_first is not too long.
556 	 */
557 	upimutex->upi_nextchain = upibp->upib_first;
558 	upibp->upib_first = upimutex;
559 }
560 
561 static void
562 upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
563 {
564 	struct upimutex **prev;
565 
566 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
567 
568 	prev = &upibp->upib_first;
569 	while (*prev != upimutex) {
570 		prev = &(*prev)->upi_nextchain;
571 	}
572 	*prev = upimutex->upi_nextchain;
573 	upimutex->upi_nextchain = NULL;
574 }
575 
576 /*
577  * Add upimutex to chain of upimutexes held by curthread.
578  * Returns number of upimutexes held by curthread.
579  */
580 static uint32_t
581 upi_mylist_add(struct upimutex *upimutex)
582 {
583 	kthread_t *t = curthread;
584 
585 	/*
586 	 * Insert upimutex at front of list of upimutexes owned by t. This
587 	 * would match typical LIFO order in which nested locks are acquired
588 	 * and released.
589 	 */
590 	upimutex->upi_nextowned = t->t_upimutex;
591 	t->t_upimutex = upimutex;
592 	t->t_nupinest++;
593 	ASSERT(t->t_nupinest > 0);
594 	return (t->t_nupinest);
595 }
596 
597 /*
598  * Delete upimutex from list of upimutexes owned by curthread.
599  */
600 static void
601 upi_mylist_del(struct upimutex *upimutex)
602 {
603 	kthread_t *t = curthread;
604 	struct upimutex **prev;
605 
606 	/*
607 	 * Since the order in which nested locks are acquired and released,
608 	 * is typically LIFO, and typical nesting levels are not too deep, the
609 	 * following should not be expensive in the general case.
610 	 */
611 	prev = &t->t_upimutex;
612 	while (*prev != upimutex) {
613 		prev = &(*prev)->upi_nextowned;
614 	}
615 	*prev = upimutex->upi_nextowned;
616 	upimutex->upi_nextowned = NULL;
617 	ASSERT(t->t_nupinest > 0);
618 	t->t_nupinest--;
619 }
620 
621 /*
622  * Returns true if upimutex is owned. Should be called only when upim points
623  * to kmem which cannot disappear from underneath.
624  */
625 static int
626 upi_owned(upimutex_t *upim)
627 {
628 	return (upim->upi_owner == curthread);
629 }
630 
631 /*
632  * Returns pointer to kernel object (upimutex_t *) if lp is owned.
633  */
634 static struct upimutex *
635 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
636 {
637 	lwpchan_t lwpchan;
638 	upib_t *upibp;
639 	struct upimutex *upimutex;
640 
641 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
642 	    &lwpchan, LWPCHAN_MPPOOL))
643 		return (NULL);
644 
645 	upibp = &UPI_CHAIN(lwpchan);
646 	mutex_enter(&upibp->upib_lock);
647 	upimutex = upi_get(upibp, &lwpchan);
648 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
649 		mutex_exit(&upibp->upib_lock);
650 		return (NULL);
651 	}
652 	mutex_exit(&upibp->upib_lock);
653 	return (upimutex);
654 }
655 
656 /*
657  * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
658  * no lock hand-off occurrs.
659  */
660 static void
661 upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
662 {
663 	turnstile_t *ts;
664 	upib_t *upibp;
665 	kthread_t *newowner;
666 
667 	upi_mylist_del(upimutex);
668 	upibp = upimutex->upi_upibp;
669 	mutex_enter(&upibp->upib_lock);
670 	if (upimutex->upi_waiter != 0) { /* if waiters */
671 		ts = turnstile_lookup(upimutex);
672 		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
673 			/* hand-off lock to highest prio waiter */
674 			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
675 			upimutex->upi_owner = newowner;
676 			if (ts->ts_waiters == 1)
677 				upimutex->upi_waiter = 0;
678 			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
679 			mutex_exit(&upibp->upib_lock);
680 			return;
681 		} else if (ts != NULL) {
682 			/* LOCK_NOTRECOVERABLE: wakeup all */
683 			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
684 		} else {
685 			/*
686 			 * Misleading w bit. Waiters might have been
687 			 * interrupted. No need to clear the w bit (upimutex
688 			 * will soon be freed). Re-calculate PI from existing
689 			 * waiters.
690 			 */
691 			turnstile_exit(upimutex);
692 			turnstile_pi_recalc();
693 		}
694 	}
695 	/*
696 	 * no waiters, or LOCK_NOTRECOVERABLE.
697 	 * remove from the bucket chain of upi mutexes.
698 	 * de-allocate kernel memory (upimutex).
699 	 */
700 	upi_chain_del(upimutex->upi_upibp, upimutex);
701 	mutex_exit(&upibp->upib_lock);
702 	kmem_free(upimutex, sizeof (upimutex_t));
703 }
704 
/*
 * Acquire the user-level priority-inheritance mutex at user address
 * 'lp' on behalf of curthread.
 *
 *	lp	user address of the lwp_mutex_t
 *	type	mutex type bits, passed to get_lwpchan() and tested for
 *		USYNC_PROCESS_ROBUST when choosing an error code
 *	try	UPIMUTEX_TRY to fail with EBUSY instead of blocking
 *	lwptp	timer state; dereferenced only on the blocking path, so
 *		it is never touched when try == UPIMUTEX_TRY
 *
 * Returns 0 when the lock was acquired cleanly, otherwise one of:
 *	EFAULT		  user memory fault, or get_lwpchan() failed
 *	ENOMEM		  nesting limit (maxnestupimx) exceeded and
 *			  secpolicy_resource() refused the caller
 *	ENOTRECOVERABLE	  mutex_flag has LOCK_NOTRECOVERABLE set
 *	EOWNERDEAD,
 *	ELOCKUNMAPPED	  mutex_flag has LOCK_OWNERDEAD or LOCK_UNMAPPED
 *			  set; the lock is NOT released on these paths
 *	EDEADLK		  curthread already owns the lock, or
 *			  turnstile_block() reported a deadlock
 *	EBUSY		  try == UPIMUTEX_TRY and another thread owns it
 *	EINTR, ETIME	  from turnstile_block()
 *	lwpt_time_error	  pending timer validation error from lwptp
 */
705 static int
706 lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
707 {
708 	label_t ljb;
709 	int error = 0;
710 	lwpchan_t lwpchan;
711 	uint16_t flag;
712 	upib_t *upibp;
	/*
	 * upimutex and upilocked are volatile because the on_fault()
	 * branch below reads them after a fault has unwound back here.
	 */
713 	volatile struct upimutex *upimutex = NULL;
714 	turnstile_t *ts;
715 	uint32_t nupinest;
716 	volatile int upilocked = 0;
717 
	/*
	 * Fault handler: taken when a user-memory access below faults.
	 * If the lock had already been acquired, give it back first.
	 */
718 	if (on_fault(&ljb)) {
719 		if (upilocked)
720 			upimutex_unlock((upimutex_t *)upimutex, 0);
721 		error = EFAULT;
722 		goto out;
723 	}
724 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
725 	    &lwpchan, LWPCHAN_MPPOOL)) {
726 		error = EFAULT;
727 		goto out;
728 	}
729 	upibp = &UPI_CHAIN(lwpchan);
730 retry:
731 	mutex_enter(&upibp->upib_lock);
732 	upimutex = upi_get(upibp, &lwpchan);
733 	if (upimutex == NULL)  {
734 		/* lock available since lwpchan has no upimutex */
735 		upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
736 		upi_chain_add(upibp, (upimutex_t *)upimutex);
737 		upimutex->upi_owner = curthread; /* grab lock */
738 		upimutex->upi_upibp = upibp;
739 		upimutex->upi_vaddr = lp;
740 		upimutex->upi_lwpchan = lwpchan;
741 		mutex_exit(&upibp->upib_lock);
742 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
		/*
		 * Set upilocked before touching user memory so that the
		 * on_fault() branch releases the lock if the read faults.
		 */
743 		upilocked = 1;
744 		fuword16_noerr(&lp->mutex_flag, &flag);
		/*
		 * NOTE(review): unlike the other unlock sites, upilocked
		 * is not cleared on this ENOMEM path.  Control goes
		 * straight to 'out', which calls no_fault(), so the stale
		 * value does not appear to be consulted -- confirm.
		 */
745 		if (nupinest > maxnestupimx &&
746 		    secpolicy_resource(CRED()) != 0) {
747 			upimutex_unlock((upimutex_t *)upimutex, flag);
748 			error = ENOMEM;
749 			goto out;
750 		}
751 		if (flag & LOCK_NOTRECOVERABLE) {
752 			/*
753 			 * Since the setting of LOCK_NOTRECOVERABLE
754 			 * was done under the high-level upi mutex,
755 			 * in lwp_upimutex_unlock(), this flag needs to
756 			 * be checked while holding the upi mutex.
757 			 * If set, this thread should return without
758 			 * the lock held, and with the right error code.
759 			 */
760 			upimutex_unlock((upimutex_t *)upimutex, flag);
761 			upilocked = 0;
762 			error = ENOTRECOVERABLE;
763 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
			/* the lock stays held; only the error code is set */
764 			if (flag & LOCK_OWNERDEAD)
765 				error = EOWNERDEAD;
766 			else if (type & USYNC_PROCESS_ROBUST)
767 				error = ELOCKUNMAPPED;
768 			else
769 				error = EOWNERDEAD;
770 		}
771 		goto out;
772 	}
773 	/*
774 	 * If a upimutex object exists, it must have an owner.
775 	 * This is due to lock hand-off, and release of upimutex when no
776 	 * waiters are present at unlock time,
777 	 */
778 	ASSERT(upimutex->upi_owner != NULL);
779 	if (upimutex->upi_owner == curthread) {
780 		/*
781 		 * The user wrapper can check if the mutex type is
782 		 * ERRORCHECK: if not, it should stall at user-level.
783 		 * If so, it should return the error code.
784 		 */
785 		mutex_exit(&upibp->upib_lock);
786 		error = EDEADLK;
787 		goto out;
788 	}
789 	if (try == UPIMUTEX_TRY) {
790 		mutex_exit(&upibp->upib_lock);
791 		error = EBUSY;
792 		goto out;
793 	}
794 	/*
795 	 * Block for the lock.
796 	 */
797 	if ((error = lwptp->lwpt_time_error) != 0) {
798 		/*
799 		 * The SUSV3 Posix spec is very clear that we
800 		 * should get no error from validating the
801 		 * timer until we would actually sleep.
802 		 */
803 		mutex_exit(&upibp->upib_lock);
804 		goto out;
805 	}
806 	if (lwptp->lwpt_tsp != NULL) {
807 		/*
808 		 * Unlike the protocol for other lwp timedwait operations,
809 		 * we must drop t_delay_lock before going to sleep in
810 		 * turnstile_block() for a upi mutex.
811 		 * See the comments below and in turnstile.c
812 		 */
813 		mutex_enter(&curthread->t_delay_lock);
814 		(void) lwp_timer_enqueue(lwptp);
815 		mutex_exit(&curthread->t_delay_lock);
816 	}
817 	/*
818 	 * Now, set the waiter bit and block for the lock in turnstile_block().
819 	 * No need to preserve the previous wbit since a lock try is not
820 	 * attempted after setting the wait bit. Wait bit is set under
821 	 * the upib_lock, which is not released until the turnstile lock
822 	 * is acquired. Say, the upimutex is L:
823 	 *
824 	 * 1. upib_lock is held so the waiter does not have to retry L after
825 	 *    setting the wait bit: since the owner has to grab the upib_lock
826 	 *    to unlock L, it will certainly see the wait bit set.
827 	 * 2. upib_lock is not released until the turnstile lock is acquired.
828 	 *    This is the key to preventing a missed wake-up. Otherwise, the
829 	 *    owner could acquire the upib_lock, and the tc_lock, to call
830 	 *    turnstile_wakeup(). All this, before the waiter gets tc_lock
831 	 *    to sleep in turnstile_block(). turnstile_wakeup() will then not
832 	 *    find this waiter, resulting in the missed wakeup.
833 	 * 3. The upib_lock, being a kernel mutex, cannot be released while
834 	 *    holding the tc_lock (since mutex_exit() could need to acquire
835 	 *    the same tc_lock)...and so is held when calling turnstile_block().
836 	 *    The address of upib_lock is passed to turnstile_block() which
837 	 *    releases it after releasing all turnstile locks, and before going
838 	 *    to sleep in swtch().
839 	 * 4. The waiter value cannot be a count of waiters, because a waiter
840 	 *    can be interrupted. The interrupt occurs under the tc_lock, at
841 	 *    which point, the upib_lock cannot be locked, to decrement waiter
842 	 *    count. So, just treat the waiter state as a bit, not a count.
843 	 */
844 	ts = turnstile_lookup((upimutex_t *)upimutex);
845 	upimutex->upi_waiter = 1;
846 	error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
847 	    &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
848 	/*
849 	 * Hand-off implies that we wakeup holding the lock, except when:
850 	 *	- deadlock is detected
851 	 *	- lock is not recoverable
852 	 *	- we got an interrupt or timeout
853 	 * If we wake up due to an interrupt or timeout, we may
854 	 * or may not be holding the lock due to mutex hand-off.
855 	 * Use lwp_upimutex_owned() to check if we do hold the lock.
856 	 */
857 	if (error != 0) {
858 		if ((error == EINTR || error == ETIME) &&
859 		    (upimutex = lwp_upimutex_owned(lp, type))) {
860 			/*
861 			 * Unlock and return - the re-startable syscall will
862 			 * try the lock again if we got EINTR.
863 			 */
			/*
			 * upimutex_unlock() begins by removing the mutex
			 * from curthread's owned list, so it has to be put
			 * on that list before it can be unlocked.
			 */
864 			(void) upi_mylist_add((upimutex_t *)upimutex);
865 			upimutex_unlock((upimutex_t *)upimutex, 0);
866 		}
867 		/*
868 		 * The only other possible error is EDEADLK.  If so, upimutex
869 		 * is valid, since its owner is deadlocked with curthread.
870 		 */
871 		ASSERT(error == EINTR || error == ETIME ||
872 		    (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
873 		ASSERT(!lwp_upimutex_owned(lp, type));
874 		goto out;
875 	}
	/*
	 * turnstile_block() returned 0.  If the lock was handed off to
	 * us, record it on curthread's owned list; nupinest is assigned
	 * only on this path and is read below only when upilocked is set.
	 */
876 	if (lwp_upimutex_owned(lp, type)) {
877 		ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
878 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
879 		upilocked = 1;
880 	}
881 	/*
882 	 * Now, need to read the user-level lp->mutex_flag to do the following:
883 	 *
884 	 * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED
885 	 *   should be returned.
886 	 * - if lock isn't held, check if ENOTRECOVERABLE should
887 	 *   be returned.
888 	 *
889 	 * Now, either lp->mutex_flag is readable or it's not. If not
890 	 * readable, the on_fault path will cause a return with EFAULT
891 	 * as it should.  If it is readable, the state of the flag
892 	 * encodes the robustness state of the lock:
893 	 *
894 	 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD
895 	 * or LOCK_UNMAPPED setting will influence the return code
896 	 * appropriately.  If the upimutex is not locked here, this
897 	 * could be due to a spurious wake-up or a NOTRECOVERABLE
898 	 * event.  The flag's setting can be used to distinguish
899 	 * between these two events.
900 	 */
901 	fuword16_noerr(&lp->mutex_flag, &flag);
902 	if (upilocked) {
903 		/*
904 		 * If the thread wakes up from turnstile_block with the lock
905 		 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
906 		 * since it would not have been handed-off the lock.
907 		 * So, no need to check for this case.
908 		 */
909 		if (nupinest > maxnestupimx &&
910 		    secpolicy_resource(CRED()) != 0) {
911 			upimutex_unlock((upimutex_t *)upimutex, flag);
912 			upilocked = 0;
913 			error = ENOMEM;
914 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
915 			if (flag & LOCK_OWNERDEAD)
916 				error = EOWNERDEAD;
917 			else if (type & USYNC_PROCESS_ROBUST)
918 				error = ELOCKUNMAPPED;
919 			else
920 				error = EOWNERDEAD;
921 		}
922 	} else {
923 		/*
924 		 * Wake-up without the upimutex held. Either this is a
925 		 * spurious wake-up (due to signals, forkall(), whatever), or
926 		 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
927 		 * of the mutex flag can be used to distinguish between the
928 		 * two events.
929 		 */
930 		if (flag & LOCK_NOTRECOVERABLE) {
931 			error = ENOTRECOVERABLE;
932 		} else {
933 			/*
934 			 * Here, the flag could be set to LOCK_OWNERDEAD or
935 			 * not. In both cases, this is a spurious wakeup,
936 			 * since the upi lock is not held, but the thread
937 			 * has returned from turnstile_block().
938 			 *
939 			 * The user flag could be LOCK_OWNERDEAD if, at the
940 			 * same time as curthread having been woken up
941 			 * spuriously, the owner (say Tdead) has died, marked
942 			 * the mutex flag accordingly, and handed off the lock
943 			 * to some other waiter (say Tnew). curthread just
944 			 * happened to read the flag while Tnew has yet to deal
945 			 * with the owner-dead event.
946 			 *
947 			 * In this event, curthread should retry the lock.
948 			 * If Tnew is able to cleanup the lock, curthread
949 			 * will eventually get the lock with a zero error code,
950 			 * If Tnew is unable to cleanup, its eventual call to
951 			 * unlock the lock will result in the mutex flag being
952 			 * set to LOCK_NOTRECOVERABLE, and the wake-up of
953 			 * all waiters, including curthread, which will then
954 			 * eventually return ENOTRECOVERABLE due to the above
955 			 * check.
956 			 *
957 			 * Of course, if the user-flag is not set with
958 			 * LOCK_OWNERDEAD, retrying is the thing to do, since
959 			 * this is definitely a spurious wakeup.
960 			 */
961 			goto retry;
962 		}
963 	}
964 
	/* Every exit path funnels through here to disarm the on_fault() handler. */
965 out:
966 	no_fault();
967 	return (error);
968 }
969 
970 
971 static int
972 lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
973 {
974 	label_t ljb;
975 	int error = 0;
976 	lwpchan_t lwpchan;
977 	uint16_t flag;
978 	upib_t *upibp;
979 	volatile struct upimutex *upimutex = NULL;
980 	volatile int upilocked = 0;
981 
982 	if (on_fault(&ljb)) {
983 		if (upilocked)
984 			upimutex_unlock((upimutex_t *)upimutex, 0);
985 		error = EFAULT;
986 		goto out;
987 	}
988 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
989 	    &lwpchan, LWPCHAN_MPPOOL)) {
990 		error = EFAULT;
991 		goto out;
992 	}
993 	upibp = &UPI_CHAIN(lwpchan);
994 	mutex_enter(&upibp->upib_lock);
995 	upimutex = upi_get(upibp, &lwpchan);
996 	/*
997 	 * If the lock is not held, or the owner is not curthread, return
998 	 * error. The user-level wrapper can return this error or stall,
999 	 * depending on whether mutex is of ERRORCHECK type or not.
1000 	 */
1001 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
1002 		mutex_exit(&upibp->upib_lock);
1003 		error = EPERM;
1004 		goto out;
1005 	}
1006 	mutex_exit(&upibp->upib_lock); /* release for user memory access */
1007 	upilocked = 1;
1008 	fuword16_noerr(&lp->mutex_flag, &flag);
1009 	if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1010 		/*
1011 		 * transition mutex to the LOCK_NOTRECOVERABLE state.
1012 		 */
1013 		flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
1014 		flag |= LOCK_NOTRECOVERABLE;
1015 		suword16_noerr(&lp->mutex_flag, flag);
1016 	}
1017 	set_owner_pid(lp, 0, 0);
1018 	upimutex_unlock((upimutex_t *)upimutex, flag);
1019 	upilocked = 0;
1020 out:
1021 	no_fault();
1022 	return (error);
1023 }
1024 
1025 /*
1026  * Set the owner and ownerpid fields of a user-level mutex. Note, this function
1027  * uses the suword*_noerr routines which must be called between
1028  * on_fault/no_fault. However, this routine itself does not do the
1029  * on_fault/no_fault and it is assumed all the callers will do so instead!
1030  */
1031 static void
1032 set_owner_pid(lwp_mutex_t *lp, uintptr_t owner, pid_t pid)
1033 {
1034 	union {
1035 		uint64_t word64;
1036 		uint32_t word32[2];
1037 	} un;
1038 
1039 	un.word64 = (uint64_t)owner;
1040 
1041 	suword32_noerr(&lp->mutex_ownerpid, pid);
1042 #if defined(_LP64)
1043 	if (((uintptr_t)lp & (_LONG_LONG_ALIGNMENT - 1)) == 0) { /* aligned */
1044 		suword64_noerr(&lp->mutex_owner, un.word64);
1045 		return;
1046 	}
1047 #endif
1048 	/* mutex is unaligned or we are running on a 32-bit kernel */
1049 	suword32_noerr((uint32_t *)&lp->mutex_owner, un.word32[0]);
1050 	suword32_noerr((uint32_t *)&lp->mutex_owner + 1, un.word32[1]);
1051 }
1052 
1053 /*
1054  * Clear the contents of a user-level mutex; return the flags.
1055  * Used only by upi_dead() and lwp_mutex_cleanup(), below.
1056  */
1057 static uint16_t
1058 lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg)
1059 {
1060 	uint16_t flag;
1061 
1062 	fuword16_noerr(&lp->mutex_flag, &flag);
1063 	if ((flag &
1064 	    (LOCK_OWNERDEAD | LOCK_UNMAPPED | LOCK_NOTRECOVERABLE)) == 0) {
1065 		flag |= lockflg;
1066 		suword16_noerr(&lp->mutex_flag, flag);
1067 	}
1068 	set_owner_pid(lp, 0, 0);
1069 	suword8_noerr(&lp->mutex_rcount, 0);
1070 
1071 	return (flag);
1072 }
1073 
1074 /*
1075  * Mark user mutex state, corresponding to kernel upimutex,
1076  * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate
1077  */
1078 static int
1079 upi_dead(upimutex_t *upip, uint16_t lockflg)
1080 {
1081 	label_t ljb;
1082 	int error = 0;
1083 	lwp_mutex_t *lp;
1084 
1085 	if (on_fault(&ljb)) {
1086 		error = EFAULT;
1087 		goto out;
1088 	}
1089 
1090 	lp = upip->upi_vaddr;
1091 	(void) lwp_clear_mutex(lp, lockflg);
1092 	suword8_noerr(&lp->mutex_lockw, 0);
1093 out:
1094 	no_fault();
1095 	return (error);
1096 }
1097 
1098 /*
1099  * Unlock all upimutexes held by curthread, since curthread is dying.
1100  * For each upimutex, attempt to mark its corresponding user mutex object as
1101  * dead.
1102  */
1103 void
1104 upimutex_cleanup()
1105 {
1106 	kthread_t *t = curthread;
1107 	uint16_t lockflg = (ttoproc(t)->p_proc_flag & P_PR_EXEC)?
1108 	    LOCK_UNMAPPED : LOCK_OWNERDEAD;
1109 	struct upimutex *upip;
1110 
1111 	while ((upip = t->t_upimutex) != NULL) {
1112 		if (upi_dead(upip, lockflg) != 0) {
1113 			/*
1114 			 * If the user object associated with this upimutex is
1115 			 * unmapped, unlock upimutex with the
1116 			 * LOCK_NOTRECOVERABLE flag, so that all waiters are
1117 			 * woken up. Since user object is unmapped, it could
1118 			 * not be marked as dead or notrecoverable.
1119 			 * The waiters will now all wake up and return
1120 			 * ENOTRECOVERABLE, since they would find that the lock
1121 			 * has not been handed-off to them.
1122 			 * See lwp_upimutex_lock().
1123 			 */
1124 			upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
1125 		} else {
1126 			/*
1127 			 * The user object has been updated as dead.
1128 			 * Unlock the upimutex: if no waiters, upip kmem will
1129 			 * be freed. If there is a waiter, the lock will be
1130 			 * handed off. If exit() is in progress, each existing
1131 			 * waiter will successively get the lock, as owners
1132 			 * die, and each new owner will call this routine as
1133 			 * it dies. The last owner will free kmem, since
1134 			 * it will find the upimutex has no waiters. So,
1135 			 * eventually, the kmem is guaranteed to be freed.
1136 			 */
1137 			upimutex_unlock(upip, 0);
1138 		}
1139 		/*
1140 		 * Note that the call to upimutex_unlock() above will delete
1141 		 * upimutex from the t_upimutexes chain. And so the
1142 		 * while loop will eventually terminate.
1143 		 */
1144 	}
1145 }
1146 
/*
 * Acquire the user-level mutex "lp" for the calling lwp, sleeping in the
 * kernel until the lock word can be taken, the optional timeout expires,
 * or the sleep is interrupted.
 *
 *	lp	user address of the mutex
 *	tsp	optional user timespec; if non-NULL and it was copied in
 *		without error, the residual time is copied back out on return
 *	owner	value stored in the mutex's owner field once the lock is held
 *
 * Mutexes for which UPIMUTEX(type) is true are handed to
 * lwp_upimutex_lock(); all others are handled here using the lwpchan
 * sleep queue.
 *
 * Returns 0 on success, otherwise the result of set_errno() for one of:
 * EFAULT, EINTR, ETIME, ENOTRECOVERABLE, EOWNERDEAD, ELOCKUNMAPPED, the
 * error reported by lwp_timer_copyin() (deferred until the lwp would
 * actually have to sleep), or whatever lwp_timer_copyout() returns.
 * For EOWNERDEAD and ELOCKUNMAPPED the lock has nevertheless been acquired.
 */
int
lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp, uintptr_t owner)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	lwp_timer_t lwpt;
	caddr_t timedwait;
	int error = 0;
	int time_error;
	clock_t tim = -1;
	uchar_t waiters;
	volatile int locked = 0;	/* lwpchan lock held; read on fault */
	volatile int watched = 0;	/* watchpoints on *lp are disabled */
	label_t ljb;
	volatile uint8_t type = 0;
	lwpchan_t lwpchan;
	sleepq_head_t *sqh;
	uint16_t flag;
	int imm_timeout = 0;

	if ((caddr_t)lp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	/*
	 * Put the lwp in an orderly state for debugging,
	 * in case we are stopped while sleeping, below.
	 */
	prstop(PR_REQUESTED, 0);

	/*
	 * A copyin error is remembered in time_error rather than returned
	 * now; it is only reported if the lock cannot be taken immediately.
	 */
	timedwait = (caddr_t)tsp;
	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
	    lwpt.lwpt_imm_timeout) {
		imm_timeout = 1;
		timedwait = NULL;
	}

	/*
	 * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
	 * this micro state is really a run state. If the thread indeed blocks,
	 * this state becomes valid. If not, the state is converted back to
	 * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
	 * when blocking.
	 */
	(void) new_mstate(t, LMS_USER_LOCK);
	if (on_fault(&ljb)) {
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write if necessary and ensure that the
	 * synchronization object resides in read/write memory.
	 * Cause an EFAULT return now if this is not so.
	 */
	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
	suword8_noerr(&lp->mutex_type, type);
	if (UPIMUTEX(type)) {
		no_fault();
		error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
		if (error == 0 || error == EOWNERDEAD ||
		    error == ELOCKUNMAPPED) {
			/*
			 * NOTE(review): this block-scoped "locked" shadows
			 * the function-level one and is non-zero only for
			 * EOWNERDEAD/ELOCKUNMAPPED.  If set_owner_pid()
			 * faults after a clean (error == 0) acquisition,
			 * EFAULT is returned without releasing the upimutex
			 * here -- confirm that this is intended.
			 */
			volatile int locked = error != 0;
			if (on_fault(&ljb)) {
				if (locked != 0)
					error = lwp_upimutex_unlock(lp, type);
				else
					error = EFAULT;
				goto upierr;
			}
			set_owner_pid(lp, owner,
			    (type & USYNC_PROCESS)? p->p_pid : 0);
			no_fault();
		}
upierr:
		if (tsp && !time_error)	/* copyout the residual time left */
			error = lwp_timer_copyout(&lwpt, error);
		if (error)
			return (set_errno(error));
		return (0);
	}
	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
	    &lwpchan, LWPCHAN_MPPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
	locked = 1;
	if (type & LOCK_ROBUST) {
		fuword16_noerr(&lp->mutex_flag, &flag);
		if (flag & LOCK_NOTRECOVERABLE) {
			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
			error = ENOTRECOVERABLE;
			goto out;
		}
	}
	/*
	 * Remember the current waiters byte so it can be written back on
	 * the way out, and advertise ourselves as a waiter meanwhile.
	 */
	fuword8_noerr(&lp->mutex_waiters, &waiters);
	suword8_noerr(&lp->mutex_waiters, 1);

	/*
	 * If watchpoints are set, they need to be restored, since
	 * atomic accesses of memory such as the call to ulock_try()
	 * below cannot be watched.
	 */

	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);

	while (!ulock_try(&lp->mutex_lockw)) {
		if (time_error) {
			/*
			 * The SUSV3 Posix spec is very clear that we
			 * should get no error from validating the
			 * timer until we would actually sleep.
			 */
			error = time_error;
			break;
		}

		if (watched) {
			watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
			watched = 0;
		}

		if (timedwait) {
			/*
			 * If we successfully queue the timeout,
			 * then don't drop t_delay_lock until
			 * we are on the sleep queue (below).
			 */
			mutex_enter(&t->t_delay_lock);
			if (lwp_timer_enqueue(&lwpt) != 0) {
				mutex_exit(&t->t_delay_lock);
				imm_timeout = 1;
				timedwait = NULL;
			}
		}
		lwp_block(&lwpchan);
		/*
		 * Nothing should happen to cause the lwp to go to
		 * sleep again until after it returns from swtch().
		 */
		if (timedwait)
			mutex_exit(&t->t_delay_lock);
		locked = 0;
		lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
		/*
		 * With a signal pending, a forced return, or an immediate
		 * timeout, make ourselves runnable before switching away.
		 */
		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
			setrun(t);
		swtch();
		t->t_flag &= ~T_WAKEABLE;
		if (timedwait)
			tim = lwp_timer_dequeue(&lwpt);
		setallwatch();
		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
			error = EINTR;
		else if (imm_timeout || (timedwait && tim == -1))
			error = ETIME;
		if (error) {
			lwp->lwp_asleep = 0;
			lwp->lwp_sysabort = 0;
			watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
			    S_WRITE);

			/*
			 * Need to re-compute waiters bit. The waiters field in
			 * the lock is not reliable. Either of two things could
			 * have occurred: no lwp may have called lwp_release()
			 * for me but I have woken up due to a signal or
			 * timeout.  In this case, the waiter bit is incorrect
			 * since it is still set to 1, set above.
			 * OR an lwp_release() did occur for some other lwp on
			 * the same lwpchan. In this case, the waiter bit is
			 * correct.  But which event occurred, one can't tell.
			 * So, recompute.
			 */
			lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
			locked = 1;
			sqh = lwpsqhash(&lwpchan);
			disp_lock_enter(&sqh->sq_lock);
			waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
			disp_lock_exit(&sqh->sq_lock);
			break;
		}
		/* woken without error: retake the lwpchan lock and retry */
		lwp->lwp_asleep = 0;
		watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
		    S_WRITE);
		lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
		locked = 1;
		fuword8_noerr(&lp->mutex_waiters, &waiters);
		suword8_noerr(&lp->mutex_waiters, 1);
		if (type & LOCK_ROBUST) {
			fuword16_noerr(&lp->mutex_flag, &flag);
			if (flag & LOCK_NOTRECOVERABLE) {
				error = ENOTRECOVERABLE;
				break;
			}
		}
	}

	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);

	if (error == 0) {
		/* lock word acquired: record the owner, then report state */
		set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0);
		if (type & LOCK_ROBUST) {
			fuword16_noerr(&lp->mutex_flag, &flag);
			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
				if (flag & LOCK_OWNERDEAD)
					error = EOWNERDEAD;
				else if (type & USYNC_PROCESS_ROBUST)
					error = ELOCKUNMAPPED;
				else
					error = EOWNERDEAD;
			}
		}
	}
	/* write back the saved or recomputed waiters value */
	suword8_noerr(&lp->mutex_waiters, waiters);
	locked = 0;
	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
out:
	no_fault();
	if (watched)
		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	if (tsp && !time_error)		/* copyout the residual time left */
		error = lwp_timer_copyout(&lwpt, error);
	if (error)
		return (set_errno(error));
	return (0);
}
1376 
1377 static int
1378 iswanted(kthread_t *t, lwpchan_t *lwpchan)
1379 {
1380 	/*
1381 	 * The caller holds the dispatcher lock on the sleep queue.
1382 	 */
1383 	while (t != NULL) {
1384 		if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1385 		    t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1386 			return (1);
1387 		t = t->t_link;
1388 	}
1389 	return (0);
1390 }
1391 
1392 /*
1393  * Return the highest priority thread sleeping on this lwpchan.
1394  */
1395 static kthread_t *
1396 lwp_queue_waiter(lwpchan_t *lwpchan)
1397 {
1398 	sleepq_head_t *sqh;
1399 	kthread_t *tp;
1400 
1401 	sqh = lwpsqhash(lwpchan);
1402 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1403 	for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
1404 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1405 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1406 			break;
1407 	}
1408 	disp_lock_exit(&sqh->sq_lock);
1409 	return (tp);
1410 }
1411 
1412 static int
1413 lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
1414 {
1415 	sleepq_head_t *sqh;
1416 	kthread_t *tp;
1417 	kthread_t **tpp;
1418 
1419 	sqh = lwpsqhash(lwpchan);
1420 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1421 	tpp = &sqh->sq_queue.sq_first;
1422 	while ((tp = *tpp) != NULL) {
1423 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1424 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1425 			/*
1426 			 * The following is typically false. It could be true
1427 			 * only if lwp_release() is called from
1428 			 * lwp_mutex_wakeup() after reading the waiters field
1429 			 * from memory in which the lwp lock used to be, but has
1430 			 * since been re-used to hold a lwp cv or lwp semaphore.
1431 			 * The thread "tp" found to match the lwp lock's wchan
1432 			 * is actually sleeping for the cv or semaphore which
1433 			 * now has the same wchan. In this case, lwp_release()
1434 			 * should return failure.
1435 			 */
1436 			if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
1437 				ASSERT(sync_type == 0);
1438 				/*
1439 				 * assert that this can happen only for mutexes
1440 				 * i.e. sync_type == 0, for correctly written
1441 				 * user programs.
1442 				 */
1443 				disp_lock_exit(&sqh->sq_lock);
1444 				return (0);
1445 			}
1446 			*waiters = iswanted(tp->t_link, lwpchan);
1447 			sleepq_unlink(tpp, tp);
1448 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1449 			tp->t_wchan0 = NULL;
1450 			tp->t_wchan = NULL;
1451 			tp->t_sobj_ops = NULL;
1452 			tp->t_release = 1;
1453 			THREAD_TRANSITION(tp);	/* drops sleepq lock */
1454 			CL_WAKEUP(tp);
1455 			thread_unlock(tp);	/* drop run queue lock */
1456 			return (1);
1457 		}
1458 		tpp = &tp->t_link;
1459 	}
1460 	*waiters = 0;
1461 	disp_lock_exit(&sqh->sq_lock);
1462 	return (0);
1463 }
1464 
1465 static void
1466 lwp_release_all(lwpchan_t *lwpchan)
1467 {
1468 	sleepq_head_t	*sqh;
1469 	kthread_t *tp;
1470 	kthread_t **tpp;
1471 
1472 	sqh = lwpsqhash(lwpchan);
1473 	disp_lock_enter(&sqh->sq_lock);		/* lock sleep q queue */
1474 	tpp = &sqh->sq_queue.sq_first;
1475 	while ((tp = *tpp) != NULL) {
1476 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1477 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1478 			sleepq_unlink(tpp, tp);
1479 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1480 			tp->t_wchan0 = NULL;
1481 			tp->t_wchan = NULL;
1482 			tp->t_sobj_ops = NULL;
1483 			CL_WAKEUP(tp);
1484 			thread_unlock_high(tp);	/* release run queue lock */
1485 		} else {
1486 			tpp = &tp->t_link;
1487 		}
1488 	}
1489 	disp_lock_exit(&sqh->sq_lock);		/* drop sleep q lock */
1490 }
1491 
1492 /*
1493  * unblock a lwp that is trying to acquire this mutex. the blocked
1494  * lwp resumes and retries to acquire the lock.
1495  */
1496 int
1497 lwp_mutex_wakeup(lwp_mutex_t *lp, int release_all)
1498 {
1499 	proc_t *p = ttoproc(curthread);
1500 	lwpchan_t lwpchan;
1501 	uchar_t waiters;
1502 	volatile int locked = 0;
1503 	volatile int watched = 0;
1504 	volatile uint8_t type = 0;
1505 	label_t ljb;
1506 	int error = 0;
1507 
1508 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1509 		return (set_errno(EFAULT));
1510 
1511 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1512 
1513 	if (on_fault(&ljb)) {
1514 		if (locked)
1515 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1516 		error = EFAULT;
1517 		goto out;
1518 	}
1519 	/*
1520 	 * Force Copy-on-write if necessary and ensure that the
1521 	 * synchronization object resides in read/write memory.
1522 	 * Cause an EFAULT return now if this is not so.
1523 	 */
1524 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1525 	suword8_noerr(&lp->mutex_type, type);
1526 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1527 	    &lwpchan, LWPCHAN_MPPOOL)) {
1528 		error = EFAULT;
1529 		goto out;
1530 	}
1531 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1532 	locked = 1;
1533 	/*
1534 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
1535 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
1536 	 * may fail.  If it fails, do not write into the waiter bit.
1537 	 * The call to lwp_release() might fail due to one of three reasons:
1538 	 *
1539 	 * 	1. due to the thread which set the waiter bit not actually
1540 	 *	   sleeping since it got the lock on the re-try. The waiter
1541 	 *	   bit will then be correctly updated by that thread. This
1542 	 *	   window may be closed by reading the wait bit again here
1543 	 *	   and not calling lwp_release() at all if it is zero.
1544 	 *	2. the thread which set the waiter bit and went to sleep
1545 	 *	   was woken up by a signal. This time, the waiter recomputes
1546 	 *	   the wait bit in the return with EINTR code.
1547 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
1548 	 *	   memory that has been re-used after the lock was dropped.
1549 	 *	   In this case, writing into the waiter bit would cause data
1550 	 *	   corruption.
1551 	 */
1552 	if (release_all)
1553 		lwp_release_all(&lwpchan);
1554 	else if (lwp_release(&lwpchan, &waiters, 0))
1555 		suword8_noerr(&lp->mutex_waiters, waiters);
1556 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1557 out:
1558 	no_fault();
1559 	if (watched)
1560 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1561 	if (error)
1562 		return (set_errno(error));
1563 	return (0);
1564 }
1565 
/*
 * lwp_cond_wait() has four arguments, a pointer to a condition variable,
 * a pointer to a mutex, a pointer to a timespec for a timed wait and
 * a flag telling the kernel whether or not to honor the kernel/user
 * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
 * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
 * lwpchan, returned by get_lwpchan().  If the timespec pointer is non-NULL,
 * it is used as an in/out parameter.  On entry, it contains the relative
 * time until timeout.  On exit, we copyout the residual time left to it
 * (the copyout is performed only when check_park is non-zero).
 *
 * The mutex is released here before sleeping; it is not reacquired by this
 * function (the caller does that on return to user level).
 *
 * Returns 0 when woken without error, otherwise the result of set_errno()
 * for EFAULT, EINTR, ETIME, an error from lwp_timer_copyin() or
 * lwp_timer_copyout(), or an error from lwp_upimutex_unlock().
 */
int
lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	lwp_timer_t lwpt;
	lwpchan_t cv_lwpchan;
	lwpchan_t m_lwpchan;
	caddr_t timedwait;
	volatile uint16_t type = 0;
	volatile uint8_t mtype = 0;
	uchar_t waiters;
	volatile int error;
	clock_t tim = -1;
	volatile int locked = 0;	/* cv lwpchan lock is held */
	volatile int m_locked = 0;	/* mutex lwpchan lock is held */
	volatile int cvwatched = 0;
	volatile int mpwatched = 0;
	label_t ljb;
	volatile int no_lwpchan = 1;	/* lwpchans not yet both obtained */
	int imm_timeout = 0;
	int imm_unpark = 0;

	if ((caddr_t)cv >= p->p_as->a_userlimit ||
	    (caddr_t)mp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	/*
	 * Put the lwp in an orderly state for debugging,
	 * in case we are stopped while sleeping, below.
	 */
	prstop(PR_REQUESTED, 0);

	timedwait = (caddr_t)tsp;
	if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
		return (set_errno(error));
	if (lwpt.lwpt_imm_timeout) {
		imm_timeout = 1;
		timedwait = NULL;
	}

	(void) new_mstate(t, LMS_USER_LOCK);

	if (on_fault(&ljb)) {
		/*
		 * A fault before both lwpchans were set up: nothing is
		 * locked and the user mutex has not been touched.
		 */
		if (no_lwpchan) {
			error = EFAULT;
			goto out;
		}
		if (m_locked) {
			m_locked = 0;
			lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
		}
		if (locked) {
			locked = 0;
			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
		}
		/*
		 * set up another on_fault() for a possible fault
		 * on the user lock accessed at "efault"
		 */
		if (on_fault(&ljb)) {
			if (m_locked) {
				m_locked = 0;
				lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
			}
			goto out;
		}
		error = EFAULT;
		goto efault;
	}

	/*
	 * Force Copy-on-write if necessary and ensure that the
	 * synchronization object resides in read/write memory.
	 * Cause an EFAULT return now if this is not so.
	 */
	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
	suword8_noerr(&mp->mutex_type, mtype);
	if (UPIMUTEX(mtype) == 0) {
		/* convert user level mutex, "mp", to a unique lwpchan */
		/* check if mtype is ok to use below, instead of type from cv */
		if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
		    &m_lwpchan, LWPCHAN_MPPOOL)) {
			error = EFAULT;
			goto out;
		}
	}
	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
	suword16_noerr(&cv->cond_type, type);
	/* convert user level condition variable, "cv", to a unique lwpchan */
	if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
	    &cv_lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	no_lwpchan = 0;
	cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (UPIMUTEX(mtype) == 0)
		mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
		    S_WRITE);

	/*
	 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
	 * with respect to a possible wakeup which is a result of either
	 * an lwp_cond_signal() or an lwp_cond_broadcast().
	 *
	 * What's misleading, is that the lwp is put to sleep after the
	 * condition variable's mutex is released.  This is OK as long as
	 * the release operation is also done while holding lwpchan_lock.
	 * The lwp is then put to sleep when the possibility of pagefaulting
	 * or sleeping is completely eliminated.
	 */
	lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	if (UPIMUTEX(mtype) == 0) {
		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
		m_locked = 1;
		suword8_noerr(&cv->cond_waiters_kernel, 1);
		/*
		 * unlock the condition variable's mutex. (pagefaults are
		 * possible here.)
		 */
		set_owner_pid(mp, 0, 0);
		ulock_clear(&mp->mutex_lockw);
		fuword8_noerr(&mp->mutex_waiters, &waiters);
		if (waiters != 0) {
			/*
			 * Given the locking of lwpchan_lock around the release
			 * of the mutex and checking for waiters, the following
			 * call to lwp_release() can fail ONLY if the lock
			 * acquirer is interrupted after setting the waiter bit,
			 * calling lwp_block() and releasing lwpchan_lock.
			 * In this case, it could get pulled off the lwp sleep
			 * q (via setrun()) before the following call to
			 * lwp_release() occurs. In this case, the lock
			 * requestor will update the waiter bit correctly by
			 * re-evaluating it.
			 */
			if (lwp_release(&m_lwpchan, &waiters, 0))
				suword8_noerr(&mp->mutex_waiters, waiters);
		}
		m_locked = 0;
		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
	} else {
		suword8_noerr(&cv->cond_waiters_kernel, 1);
		error = lwp_upimutex_unlock(mp, mtype);
		if (error) {	/* if the upimutex unlock failed */
			locked = 0;
			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
			goto out;
		}
	}
	/* no further user memory accesses are made before sleeping */
	no_fault();

	if (mpwatched) {
		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
		mpwatched = 0;
	}
	if (cvwatched) {
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
		cvwatched = 0;
	}

	if (check_park && (!schedctl_is_park() || t->t_unpark)) {
		/*
		 * We received a signal at user-level before calling here
		 * or another thread wants us to return immediately
		 * with EINTR.  See lwp_unpark().
		 */
		imm_unpark = 1;
		t->t_unpark = 0;
		timedwait = NULL;
	} else if (timedwait) {
		/*
		 * If we successfully queue the timeout,
		 * then don't drop t_delay_lock until
		 * we are on the sleep queue (below).
		 */
		mutex_enter(&t->t_delay_lock);
		if (lwp_timer_enqueue(&lwpt) != 0) {
			mutex_exit(&t->t_delay_lock);
			imm_timeout = 1;
			timedwait = NULL;
		}
	}
	/* mark this sleep as a cv/semaphore wait; see lwp_release() */
	t->t_flag |= T_WAITCVSEM;
	lwp_block(&cv_lwpchan);
	/*
	 * Nothing should happen to cause the lwp to go to sleep
	 * until after it returns from swtch().
	 */
	if (timedwait)
		mutex_exit(&t->t_delay_lock);
	locked = 0;
	lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
	    (imm_timeout | imm_unpark))
		setrun(t);
	swtch();
	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
	if (timedwait)
		tim = lwp_timer_dequeue(&lwpt);
	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
	    MUSTRETURN(p, t) || imm_unpark)
		error = EINTR;
	else if (imm_timeout || (timedwait && tim == -1))
		error = ETIME;
	lwp->lwp_asleep = 0;
	lwp->lwp_sysabort = 0;
	setallwatch();

	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);

	if (tsp && check_park)		/* copyout the residual time left */
		error = lwp_timer_copyout(&lwpt, error);

	/* the mutex is reacquired by the caller on return to user level */
	if (error) {
		/*
		 * If we were concurrently lwp_cond_signal()d and we
		 * received a UNIX signal or got a timeout, then perform
		 * another lwp_cond_signal() to avoid consuming the wakeup.
		 */
		if (t->t_release)
			(void) lwp_cond_signal(cv);
		return (set_errno(error));
	}
	return (0);

efault:
	/*
	 * make sure that the user level lock is dropped before
	 * returning to caller, since the caller always re-acquires it.
	 */
	if (UPIMUTEX(mtype) == 0) {
		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
		m_locked = 1;
		set_owner_pid(mp, 0, 0);
		ulock_clear(&mp->mutex_lockw);
		fuword8_noerr(&mp->mutex_waiters, &waiters);
		if (waiters != 0) {
			/*
			 * See comment above on lock clearing and lwp_release()
			 * success/failure.
			 */
			if (lwp_release(&m_lwpchan, &waiters, 0))
				suword8_noerr(&mp->mutex_waiters, waiters);
		}
		m_locked = 0;
		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
	} else {
		(void) lwp_upimutex_unlock(mp, mtype);
	}
out:
	/* common error exit: every path reaching here has error set */
	no_fault();
	if (mpwatched)
		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
	if (cvwatched)
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);
	return (set_errno(error));
}
1841 
1842 /*
1843  * wakeup one lwp that's blocked on this condition variable.
1844  */
1845 int
1846 lwp_cond_signal(lwp_cond_t *cv)
1847 {
1848 	proc_t *p = ttoproc(curthread);
1849 	lwpchan_t lwpchan;
1850 	uchar_t waiters;
1851 	volatile uint16_t type = 0;
1852 	volatile int locked = 0;
1853 	volatile int watched = 0;
1854 	label_t ljb;
1855 	int error = 0;
1856 
1857 	if ((caddr_t)cv >= p->p_as->a_userlimit)
1858 		return (set_errno(EFAULT));
1859 
1860 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1861 
1862 	if (on_fault(&ljb)) {
1863 		if (locked)
1864 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1865 		error = EFAULT;
1866 		goto out;
1867 	}
1868 	/*
1869 	 * Force Copy-on-write if necessary and ensure that the
1870 	 * synchronization object resides in read/write memory.
1871 	 * Cause an EFAULT return now if this is not so.
1872 	 */
1873 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1874 	suword16_noerr(&cv->cond_type, type);
1875 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1876 	    &lwpchan, LWPCHAN_CVPOOL)) {
1877 		error = EFAULT;
1878 		goto out;
1879 	}
1880 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1881 	locked = 1;
1882 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1883 	if (waiters != 0) {
1884 		/*
1885 		 * The following call to lwp_release() might fail but it is
1886 		 * OK to write into the waiters bit below, since the memory
1887 		 * could not have been re-used or unmapped (for correctly
1888 		 * written user programs) as in the case of lwp_mutex_wakeup().
1889 		 * For an incorrect program, we should not care about data
1890 		 * corruption since this is just one instance of other places
1891 		 * where corruption can occur for such a program. Of course
1892 		 * if the memory is unmapped, normal fault recovery occurs.
1893 		 */
1894 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1895 		suword8_noerr(&cv->cond_waiters_kernel, waiters);
1896 	}
1897 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1898 out:
1899 	no_fault();
1900 	if (watched)
1901 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1902 	if (error)
1903 		return (set_errno(error));
1904 	return (0);
1905 }
1906 
1907 /*
1908  * wakeup every lwp that's blocked on this condition variable.
1909  */
1910 int
1911 lwp_cond_broadcast(lwp_cond_t *cv)
1912 {
1913 	proc_t *p = ttoproc(curthread);
1914 	lwpchan_t lwpchan;
1915 	volatile uint16_t type = 0;
1916 	volatile int locked = 0;
1917 	volatile int watched = 0;
1918 	label_t ljb;
1919 	uchar_t waiters;
1920 	int error = 0;
1921 
1922 	if ((caddr_t)cv >= p->p_as->a_userlimit)
1923 		return (set_errno(EFAULT));
1924 
1925 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1926 
1927 	if (on_fault(&ljb)) {
1928 		if (locked)
1929 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1930 		error = EFAULT;
1931 		goto out;
1932 	}
1933 	/*
1934 	 * Force Copy-on-write if necessary and ensure that the
1935 	 * synchronization object resides in read/write memory.
1936 	 * Cause an EFAULT return now if this is not so.
1937 	 */
1938 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1939 	suword16_noerr(&cv->cond_type, type);
1940 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1941 	    &lwpchan, LWPCHAN_CVPOOL)) {
1942 		error = EFAULT;
1943 		goto out;
1944 	}
1945 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1946 	locked = 1;
1947 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1948 	if (waiters != 0) {
1949 		lwp_release_all(&lwpchan);
1950 		suword8_noerr(&cv->cond_waiters_kernel, 0);
1951 	}
1952 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1953 out:
1954 	no_fault();
1955 	if (watched)
1956 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1957 	if (error)
1958 		return (set_errno(error));
1959 	return (0);
1960 }
1961 
1962 int
1963 lwp_sema_trywait(lwp_sema_t *sp)
1964 {
1965 	kthread_t *t = curthread;
1966 	proc_t *p = ttoproc(t);
1967 	label_t ljb;
1968 	volatile int locked = 0;
1969 	volatile int watched = 0;
1970 	volatile uint16_t type = 0;
1971 	int count;
1972 	lwpchan_t lwpchan;
1973 	uchar_t waiters;
1974 	int error = 0;
1975 
1976 	if ((caddr_t)sp >= p->p_as->a_userlimit)
1977 		return (set_errno(EFAULT));
1978 
1979 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1980 
1981 	if (on_fault(&ljb)) {
1982 		if (locked)
1983 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1984 		error = EFAULT;
1985 		goto out;
1986 	}
1987 	/*
1988 	 * Force Copy-on-write if necessary and ensure that the
1989 	 * synchronization object resides in read/write memory.
1990 	 * Cause an EFAULT return now if this is not so.
1991 	 */
1992 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
1993 	suword16_noerr((void *)&sp->sema_type, type);
1994 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
1995 	    &lwpchan, LWPCHAN_CVPOOL)) {
1996 		error = EFAULT;
1997 		goto out;
1998 	}
1999 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2000 	locked = 1;
2001 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2002 	if (count == 0)
2003 		error = EBUSY;
2004 	else
2005 		suword32_noerr((void *)&sp->sema_count, --count);
2006 	if (count != 0) {
2007 		fuword8_noerr(&sp->sema_waiters, &waiters);
2008 		if (waiters != 0) {
2009 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2010 			suword8_noerr(&sp->sema_waiters, waiters);
2011 		}
2012 	}
2013 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2014 out:
2015 	no_fault();
2016 	if (watched)
2017 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2018 	if (error)
2019 		return (set_errno(error));
2020 	return (0);
2021 }
2022 
2023 /*
2024  * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
2025  */
2026 int
2027 lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
2028 {
2029 	kthread_t *t = curthread;
2030 	klwp_t *lwp = ttolwp(t);
2031 	proc_t *p = ttoproc(t);
2032 	lwp_timer_t lwpt;
2033 	caddr_t timedwait;
2034 	clock_t tim = -1;
2035 	label_t ljb;
2036 	volatile int locked = 0;
2037 	volatile int watched = 0;
2038 	volatile uint16_t type = 0;
2039 	int count;
2040 	lwpchan_t lwpchan;
2041 	uchar_t waiters;
2042 	int error = 0;
2043 	int time_error;
2044 	int imm_timeout = 0;
2045 	int imm_unpark = 0;
2046 
2047 	if ((caddr_t)sp >= p->p_as->a_userlimit)
2048 		return (set_errno(EFAULT));
2049 
2050 	/*
2051 	 * Put the lwp in an orderly state for debugging,
2052 	 * in case we are stopped while sleeping, below.
2053 	 */
2054 	prstop(PR_REQUESTED, 0);
2055 
2056 	timedwait = (caddr_t)tsp;
2057 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2058 	    lwpt.lwpt_imm_timeout) {
2059 		imm_timeout = 1;
2060 		timedwait = NULL;
2061 	}
2062 
2063 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2064 
2065 	if (on_fault(&ljb)) {
2066 		if (locked)
2067 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2068 		error = EFAULT;
2069 		goto out;
2070 	}
2071 	/*
2072 	 * Force Copy-on-write if necessary and ensure that the
2073 	 * synchronization object resides in read/write memory.
2074 	 * Cause an EFAULT return now if this is not so.
2075 	 */
2076 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
2077 	suword16_noerr((void *)&sp->sema_type, type);
2078 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
2079 	    &lwpchan, LWPCHAN_CVPOOL)) {
2080 		error = EFAULT;
2081 		goto out;
2082 	}
2083 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2084 	locked = 1;
2085 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2086 	while (error == 0 && count == 0) {
2087 		if (time_error) {
2088 			/*
2089 			 * The SUSV3 Posix spec is very clear that we
2090 			 * should get no error from validating the
2091 			 * timer until we would actually sleep.
2092 			 */
2093 			error = time_error;
2094 			break;
2095 		}
2096 		suword8_noerr(&sp->sema_waiters, 1);
2097 		if (watched)
2098 			watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2099 		if (check_park && (!schedctl_is_park() || t->t_unpark)) {
2100 			/*
2101 			 * We received a signal at user-level before calling
2102 			 * here or another thread wants us to return
2103 			 * immediately with EINTR.  See lwp_unpark().
2104 			 */
2105 			imm_unpark = 1;
2106 			t->t_unpark = 0;
2107 			timedwait = NULL;
2108 		} else if (timedwait) {
2109 			/*
2110 			 * If we successfully queue the timeout,
2111 			 * then don't drop t_delay_lock until
2112 			 * we are on the sleep queue (below).
2113 			 */
2114 			mutex_enter(&t->t_delay_lock);
2115 			if (lwp_timer_enqueue(&lwpt) != 0) {
2116 				mutex_exit(&t->t_delay_lock);
2117 				imm_timeout = 1;
2118 				timedwait = NULL;
2119 			}
2120 		}
2121 		t->t_flag |= T_WAITCVSEM;
2122 		lwp_block(&lwpchan);
2123 		/*
2124 		 * Nothing should happen to cause the lwp to sleep
2125 		 * again until after it returns from swtch().
2126 		 */
2127 		if (timedwait)
2128 			mutex_exit(&t->t_delay_lock);
2129 		locked = 0;
2130 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2131 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
2132 		    (imm_timeout | imm_unpark))
2133 			setrun(t);
2134 		swtch();
2135 		t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2136 		if (timedwait)
2137 			tim = lwp_timer_dequeue(&lwpt);
2138 		setallwatch();
2139 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
2140 		    MUSTRETURN(p, t) || imm_unpark)
2141 			error = EINTR;
2142 		else if (imm_timeout || (timedwait && tim == -1))
2143 			error = ETIME;
2144 		lwp->lwp_asleep = 0;
2145 		lwp->lwp_sysabort = 0;
2146 		watched = watch_disable_addr((caddr_t)sp,
2147 		    sizeof (*sp), S_WRITE);
2148 		lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2149 		locked = 1;
2150 		fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2151 	}
2152 	if (error == 0)
2153 		suword32_noerr((void *)&sp->sema_count, --count);
2154 	if (count != 0) {
2155 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2156 		suword8_noerr(&sp->sema_waiters, waiters);
2157 	}
2158 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2159 out:
2160 	no_fault();
2161 	if (watched)
2162 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2163 	if (tsp && check_park && !time_error)
2164 		error = lwp_timer_copyout(&lwpt, error);
2165 	if (error)
2166 		return (set_errno(error));
2167 	return (0);
2168 }
2169 
2170 int
2171 lwp_sema_post(lwp_sema_t *sp)
2172 {
2173 	proc_t *p = ttoproc(curthread);
2174 	label_t ljb;
2175 	volatile int locked = 0;
2176 	volatile int watched = 0;
2177 	volatile uint16_t type = 0;
2178 	int count;
2179 	lwpchan_t lwpchan;
2180 	uchar_t waiters;
2181 	int error = 0;
2182 
2183 	if ((caddr_t)sp >= p->p_as->a_userlimit)
2184 		return (set_errno(EFAULT));
2185 
2186 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2187 
2188 	if (on_fault(&ljb)) {
2189 		if (locked)
2190 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2191 		error = EFAULT;
2192 		goto out;
2193 	}
2194 	/*
2195 	 * Force Copy-on-write if necessary and ensure that the
2196 	 * synchronization object resides in read/write memory.
2197 	 * Cause an EFAULT return now if this is not so.
2198 	 */
2199 	fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
2200 	suword16_noerr(&sp->sema_type, type);
2201 	if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
2202 	    &lwpchan, LWPCHAN_CVPOOL)) {
2203 		error = EFAULT;
2204 		goto out;
2205 	}
2206 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2207 	locked = 1;
2208 	fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
2209 	if (count == _SEM_VALUE_MAX)
2210 		error = EOVERFLOW;
2211 	else
2212 		suword32_noerr(&sp->sema_count, ++count);
2213 	if (count == 1) {
2214 		fuword8_noerr(&sp->sema_waiters, &waiters);
2215 		if (waiters) {
2216 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2217 			suword8_noerr(&sp->sema_waiters, waiters);
2218 		}
2219 	}
2220 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2221 out:
2222 	no_fault();
2223 	if (watched)
2224 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2225 	if (error)
2226 		return (set_errno(error));
2227 	return (0);
2228 }
2229 
/*
 * Bits kept in a thread's t_writer field while it waits on a user-level
 * rwlock: TRW_WANT_WRITE marks the sleeper as a writer (readers leave it
 * clear); TRW_LOCK_GRANTED is set by lwp_rwlock_release() on each thread
 * it wakes and is checked by lwp_rwlock_lock() after it resumes.
 */
#define	TRW_WANT_WRITE		0x1
#define	TRW_LOCK_GRANTED	0x2

/*
 * Values for the 'rd_wr' argument of lwp_rwlock_lock().  TRY_FLAG is
 * OR-ed onto READ_LOCK or WRITE_LOCK to request a non-blocking attempt
 * and is stripped off again before the lock kind is examined.
 */
#define	READ_LOCK		0
#define	WRITE_LOCK		1
#define	TRY_FLAG		0x10
#define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
#define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)
2238 
2239 /*
2240  * Release one writer or one or more readers. Compute the rwstate word to
2241  * reflect the new state of the queue. For a safe hand-off we copy the new
2242  * rwstate value back to userland before we wake any of the new lock holders.
2243  *
2244  * Note that sleepq_insert() implements a prioritized FIFO (with writers
2245  * being given precedence over readers of the same priority).
2246  *
2247  * If the first thread is a reader we scan the queue releasing all readers
2248  * until we hit a writer or the end of the queue. If the first thread is a
2249  * writer we still need to check for another writer.
2250  */
2251 void
2252 lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
2253 {
2254 	sleepq_head_t *sqh;
2255 	kthread_t *tp;
2256 	kthread_t **tpp;
2257 	kthread_t *tpnext;
2258 	kthread_t *wakelist = NULL;
2259 	uint32_t rwstate = 0;
2260 	int wcount = 0;
2261 	int rcount = 0;
2262 
2263 	sqh = lwpsqhash(lwpchan);
2264 	disp_lock_enter(&sqh->sq_lock);
2265 	tpp = &sqh->sq_queue.sq_first;
2266 	while ((tp = *tpp) != NULL) {
2267 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
2268 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
2269 			if (tp->t_writer & TRW_WANT_WRITE) {
2270 				if ((wcount++ == 0) && (rcount == 0)) {
2271 					rwstate |= URW_WRITE_LOCKED;
2272 
2273 					/* Just one writer to wake. */
2274 					sleepq_unlink(tpp, tp);
2275 					wakelist = tp;
2276 
2277 					/* tpp already set for next thread. */
2278 					continue;
2279 				} else {
2280 					rwstate |= URW_HAS_WAITERS;
2281 					/* We need look no further. */
2282 					break;
2283 				}
2284 			} else {
2285 				rcount++;
2286 				if (wcount == 0) {
2287 					rwstate++;
2288 
2289 					/* Add reader to wake list. */
2290 					sleepq_unlink(tpp, tp);
2291 					tp->t_link = wakelist;
2292 					wakelist = tp;
2293 
2294 					/* tpp already set for next thread. */
2295 					continue;
2296 				} else {
2297 					rwstate |= URW_HAS_WAITERS;
2298 					/* We need look no further. */
2299 					break;
2300 				}
2301 			}
2302 		}
2303 		tpp = &tp->t_link;
2304 	}
2305 
2306 	/* Copy the new rwstate back to userland. */
2307 	suword32_noerr(&rw->rwlock_readers, rwstate);
2308 
2309 	/* Wake the new lock holder(s) up. */
2310 	tp = wakelist;
2311 	while (tp != NULL) {
2312 		DTRACE_SCHED1(wakeup, kthread_t *, tp);
2313 		tp->t_wchan0 = NULL;
2314 		tp->t_wchan = NULL;
2315 		tp->t_sobj_ops = NULL;
2316 		tp->t_writer |= TRW_LOCK_GRANTED;
2317 		tpnext = tp->t_link;
2318 		tp->t_link = NULL;
2319 		CL_WAKEUP(tp);
2320 		thread_unlock_high(tp);
2321 		tp = tpnext;
2322 	}
2323 
2324 	disp_lock_exit(&sqh->sq_lock);
2325 }
2326 
2327 /*
2328  * We enter here holding the user-level mutex, which we must release before
2329  * returning or blocking. Based on lwp_cond_wait().
2330  */
2331 static int
2332 lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
2333 {
2334 	lwp_mutex_t *mp = NULL;
2335 	kthread_t *t = curthread;
2336 	kthread_t *tp;
2337 	klwp_t *lwp = ttolwp(t);
2338 	proc_t *p = ttoproc(t);
2339 	lwp_timer_t lwpt;
2340 	lwpchan_t lwpchan;
2341 	lwpchan_t mlwpchan;
2342 	caddr_t timedwait;
2343 	volatile uint16_t type = 0;
2344 	volatile uint8_t mtype = 0;
2345 	uchar_t mwaiters;
2346 	volatile int error = 0;
2347 	int time_error;
2348 	clock_t tim = -1;
2349 	volatile int locked = 0;
2350 	volatile int mlocked = 0;
2351 	volatile int watched = 0;
2352 	volatile int mwatched = 0;
2353 	label_t ljb;
2354 	volatile int no_lwpchan = 1;
2355 	int imm_timeout = 0;
2356 	int try_flag;
2357 	uint32_t rwstate;
2358 	int acquired = 0;
2359 
2360 	/* We only check rw because the mutex is included in it. */
2361 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2362 		return (set_errno(EFAULT));
2363 
2364 	/*
2365 	 * Put the lwp in an orderly state for debugging,
2366 	 * in case we are stopped while sleeping, below.
2367 	 */
2368 	prstop(PR_REQUESTED, 0);
2369 
2370 	/* We must only report this error if we are about to sleep (later). */
2371 	timedwait = (caddr_t)tsp;
2372 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2373 	    lwpt.lwpt_imm_timeout) {
2374 		imm_timeout = 1;
2375 		timedwait = NULL;
2376 	}
2377 
2378 	(void) new_mstate(t, LMS_USER_LOCK);
2379 
2380 	if (on_fault(&ljb)) {
2381 		if (no_lwpchan) {
2382 			error = EFAULT;
2383 			goto out_nodrop;
2384 		}
2385 		if (mlocked) {
2386 			mlocked = 0;
2387 			lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2388 		}
2389 		if (locked) {
2390 			locked = 0;
2391 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2392 		}
2393 		/*
2394 		 * Set up another on_fault() for a possible fault
2395 		 * on the user lock accessed at "out_drop".
2396 		 */
2397 		if (on_fault(&ljb)) {
2398 			if (mlocked) {
2399 				mlocked = 0;
2400 				lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2401 			}
2402 			error = EFAULT;
2403 			goto out_nodrop;
2404 		}
2405 		error = EFAULT;
2406 		goto out_nodrop;
2407 	}
2408 
2409 	/* Process rd_wr (including sanity check). */
2410 	try_flag = (rd_wr & TRY_FLAG);
2411 	rd_wr &= ~TRY_FLAG;
2412 	if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
2413 		error = EINVAL;
2414 		goto out_nodrop;
2415 	}
2416 
2417 	/*
2418 	 * Force Copy-on-write if necessary and ensure that the
2419 	 * synchronization object resides in read/write memory.
2420 	 * Cause an EFAULT return now if this is not so.
2421 	 */
2422 	mp = &rw->mutex;
2423 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
2424 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2425 	suword8_noerr(&mp->mutex_type, mtype);
2426 	suword16_noerr(&rw->rwlock_type, type);
2427 
2428 	/* We can only continue for simple USYNC_PROCESS locks. */
2429 	if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
2430 		error = EINVAL;
2431 		goto out_nodrop;
2432 	}
2433 
2434 	/* Convert user level mutex, "mp", to a unique lwpchan. */
2435 	if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
2436 	    &mlwpchan, LWPCHAN_MPPOOL)) {
2437 		error = EFAULT;
2438 		goto out_nodrop;
2439 	}
2440 
2441 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2442 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2443 	    &lwpchan, LWPCHAN_CVPOOL)) {
2444 		error = EFAULT;
2445 		goto out_nodrop;
2446 	}
2447 
2448 	no_lwpchan = 0;
2449 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2450 	mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2451 
2452 	/*
2453 	 * lwpchan_lock() ensures that the calling LWP is put to sleep
2454 	 * atomically with respect to a possible wakeup which is a result
2455 	 * of lwp_rwlock_unlock().
2456 	 *
2457 	 * What's misleading is that the LWP is put to sleep after the
2458 	 * rwlock's mutex is released. This is OK as long as the release
2459 	 * operation is also done while holding mlwpchan. The LWP is then
2460 	 * put to sleep when the possibility of pagefaulting or sleeping
2461 	 * has been completely eliminated.
2462 	 */
2463 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2464 	locked = 1;
2465 	lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2466 	mlocked = 1;
2467 
2468 	/*
2469 	 * Fetch the current rwlock state.
2470 	 *
2471 	 * The possibility of spurious wake-ups or killed waiters means
2472 	 * rwstate's URW_HAS_WAITERS bit may indicate false positives.
2473 	 * We only fix these if they are important to us.
2474 	 *
2475 	 * Although various error states can be observed here (e.g. the lock
2476 	 * is not held, but there are waiters) we assume these are applicaton
2477 	 * errors and so we take no corrective action.
2478 	 */
2479 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2480 	/*
2481 	 * We cannot legitimately get here from user-level
2482 	 * without URW_HAS_WAITERS being set.
2483 	 * Set it now to guard against user-level error.
2484 	 */
2485 	rwstate |= URW_HAS_WAITERS;
2486 
2487 	/*
2488 	 * We can try only if the lock isn't held by a writer.
2489 	 */
2490 	if (!(rwstate & URW_WRITE_LOCKED)) {
2491 		tp = lwp_queue_waiter(&lwpchan);
2492 		if (tp == NULL) {
2493 			/*
2494 			 * Hmmm, rwstate indicates waiters but there are
2495 			 * none queued. This could just be the result of a
2496 			 * spurious wakeup, so let's ignore it.
2497 			 *
2498 			 * We now have a chance to acquire the lock
2499 			 * uncontended, but this is the last chance for
2500 			 * a writer to acquire the lock without blocking.
2501 			 */
2502 			if (rd_wr == READ_LOCK) {
2503 				rwstate++;
2504 				acquired = 1;
2505 			} else if ((rwstate & URW_READERS_MASK) == 0) {
2506 				rwstate |= URW_WRITE_LOCKED;
2507 				acquired = 1;
2508 			}
2509 		} else if (rd_wr == READ_LOCK) {
2510 			/*
2511 			 * This is the last chance for a reader to acquire
2512 			 * the lock now, but it can only do so if there is
2513 			 * no writer of equal or greater priority at the
2514 			 * head of the queue .
2515 			 *
2516 			 * It is also just possible that there is a reader
2517 			 * at the head of the queue. This may be the result
2518 			 * of a spurious wakeup or an application failure.
2519 			 * In this case we only acquire the lock if we have
2520 			 * equal or greater priority. It is not our job to
2521 			 * release spurious waiters.
2522 			 */
2523 			pri_t our_pri = DISP_PRIO(t);
2524 			pri_t his_pri = DISP_PRIO(tp);
2525 
2526 			if ((our_pri > his_pri) || ((our_pri == his_pri) &&
2527 			    !(tp->t_writer & TRW_WANT_WRITE))) {
2528 				rwstate++;
2529 				acquired = 1;
2530 			}
2531 		}
2532 	}
2533 
2534 	if (acquired || try_flag || time_error) {
2535 		/*
2536 		 * We're not going to block this time.
2537 		 */
2538 		suword32_noerr(&rw->rwlock_readers, rwstate);
2539 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2540 		locked = 0;
2541 
2542 		if (acquired) {
2543 			/*
2544 			 * Got the lock!
2545 			 */
2546 			error = 0;
2547 
2548 		} else if (try_flag) {
2549 			/*
2550 			 * We didn't get the lock and we're about to block.
2551 			 * If we're doing a trylock, return EBUSY instead.
2552 			 */
2553 			error = EBUSY;
2554 
2555 		} else if (time_error) {
2556 			/*
2557 			 * The SUSV3 POSIX spec is very clear that we should
2558 			 * get no error from validating the timer (above)
2559 			 * until we would actually sleep.
2560 			 */
2561 			error = time_error;
2562 		}
2563 
2564 		goto out_drop;
2565 	}
2566 
2567 	/*
2568 	 * We're about to block, so indicate what kind of waiter we are.
2569 	 */
2570 	t->t_writer = 0;
2571 	if (rd_wr == WRITE_LOCK)
2572 		t->t_writer = TRW_WANT_WRITE;
2573 	suword32_noerr(&rw->rwlock_readers, rwstate);
2574 
2575 	/*
2576 	 * Unlock the rwlock's mutex (pagefaults are possible here).
2577 	 */
2578 	set_owner_pid(mp, 0, 0);
2579 	ulock_clear(&mp->mutex_lockw);
2580 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2581 	if (mwaiters != 0) {
2582 		/*
2583 		 * Given the locking of mlwpchan around the release of
2584 		 * the mutex and checking for waiters, the following
2585 		 * call to lwp_release() can fail ONLY if the lock
2586 		 * acquirer is interrupted after setting the waiter bit,
2587 		 * calling lwp_block() and releasing mlwpchan.
2588 		 * In this case, it could get pulled off the LWP sleep
2589 		 * queue (via setrun()) before the following call to
2590 		 * lwp_release() occurs, and the lock requestor will
2591 		 * update the waiter bit correctly by re-evaluating it.
2592 		 */
2593 		if (lwp_release(&mlwpchan, &mwaiters, 0))
2594 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2595 	}
2596 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2597 	mlocked = 0;
2598 	no_fault();
2599 
2600 	if (mwatched) {
2601 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2602 		mwatched = 0;
2603 	}
2604 	if (watched) {
2605 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2606 		watched = 0;
2607 	}
2608 
2609 	if (timedwait) {
2610 		/*
2611 		 * If we successfully queue the timeout,
2612 		 * then don't drop t_delay_lock until
2613 		 * we are on the sleep queue (below).
2614 		 */
2615 		mutex_enter(&t->t_delay_lock);
2616 		if (lwp_timer_enqueue(&lwpt) != 0) {
2617 			mutex_exit(&t->t_delay_lock);
2618 			imm_timeout = 1;
2619 			timedwait = NULL;
2620 		}
2621 	}
2622 	t->t_flag |= T_WAITCVSEM;
2623 	lwp_block(&lwpchan);
2624 
2625 	/*
2626 	 * Nothing should happen to cause the LWp to go to sleep until after
2627 	 * it returns from swtch().
2628 	 */
2629 	if (timedwait)
2630 		mutex_exit(&t->t_delay_lock);
2631 	locked = 0;
2632 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2633 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
2634 		setrun(t);
2635 	swtch();
2636 
2637 	/*
2638 	 * We're back, but we need to work out why. Were we interrupted? Did
2639 	 * we timeout? Were we granted the lock?
2640 	 */
2641 	error = EAGAIN;
2642 	acquired = (t->t_writer & TRW_LOCK_GRANTED);
2643 	t->t_writer = 0;
2644 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2645 	if (timedwait)
2646 		tim = lwp_timer_dequeue(&lwpt);
2647 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
2648 		error = EINTR;
2649 	else if (imm_timeout || (timedwait && tim == -1))
2650 		error = ETIME;
2651 	lwp->lwp_asleep = 0;
2652 	lwp->lwp_sysabort = 0;
2653 	setallwatch();
2654 
2655 	/*
2656 	 * If we were granted the lock we don't care about EINTR or ETIME.
2657 	 */
2658 	if (acquired)
2659 		error = 0;
2660 
2661 	if (t->t_mstate == LMS_USER_LOCK)
2662 		(void) new_mstate(t, LMS_SYSTEM);
2663 
2664 	if (error)
2665 		return (set_errno(error));
2666 	return (0);
2667 
2668 out_drop:
2669 	/*
2670 	 * Make sure that the user level lock is dropped before returning
2671 	 * to the caller.
2672 	 */
2673 	if (!mlocked) {
2674 		lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2675 		mlocked = 1;
2676 	}
2677 	set_owner_pid(mp, 0, 0);
2678 	ulock_clear(&mp->mutex_lockw);
2679 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2680 	if (mwaiters != 0) {
2681 		/*
2682 		 * See comment above on lock clearing and lwp_release()
2683 		 * success/failure.
2684 		 */
2685 		if (lwp_release(&mlwpchan, &mwaiters, 0))
2686 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2687 	}
2688 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2689 	mlocked = 0;
2690 
2691 out_nodrop:
2692 	no_fault();
2693 	if (mwatched)
2694 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2695 	if (watched)
2696 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2697 	if (t->t_mstate == LMS_USER_LOCK)
2698 		(void) new_mstate(t, LMS_SYSTEM);
2699 	if (error)
2700 		return (set_errno(error));
2701 	return (0);
2702 }
2703 
2704 /*
2705  * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
2706  * we never drop the lock.
2707  */
2708 static int
2709 lwp_rwlock_unlock(lwp_rwlock_t *rw)
2710 {
2711 	kthread_t *t = curthread;
2712 	proc_t *p = ttoproc(t);
2713 	lwpchan_t lwpchan;
2714 	volatile uint16_t type = 0;
2715 	volatile int error = 0;
2716 	volatile int locked = 0;
2717 	volatile int watched = 0;
2718 	label_t ljb;
2719 	volatile int no_lwpchan = 1;
2720 	uint32_t rwstate;
2721 
2722 	/* We only check rw because the mutex is included in it. */
2723 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2724 		return (set_errno(EFAULT));
2725 
2726 	if (on_fault(&ljb)) {
2727 		if (no_lwpchan) {
2728 			error = EFAULT;
2729 			goto out_nodrop;
2730 		}
2731 		if (locked) {
2732 			locked = 0;
2733 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2734 		}
2735 		error = EFAULT;
2736 		goto out_nodrop;
2737 	}
2738 
2739 	/*
2740 	 * Force Copy-on-write if necessary and ensure that the
2741 	 * synchronization object resides in read/write memory.
2742 	 * Cause an EFAULT return now if this is not so.
2743 	 */
2744 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2745 	suword16_noerr(&rw->rwlock_type, type);
2746 
2747 	/* We can only continue for simple USYNC_PROCESS locks. */
2748 	if (type != USYNC_PROCESS) {
2749 		error = EINVAL;
2750 		goto out_nodrop;
2751 	}
2752 
2753 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2754 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2755 	    &lwpchan, LWPCHAN_CVPOOL)) {
2756 		error = EFAULT;
2757 		goto out_nodrop;
2758 	}
2759 
2760 	no_lwpchan = 0;
2761 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2762 
2763 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2764 	locked = 1;
2765 
2766 	/*
2767 	 * We can resolve multiple readers (except the last reader) here.
2768 	 * For the last reader or a writer we need lwp_rwlock_release(),
2769 	 * to which we also delegate the task of copying the new rwstate
2770 	 * back to userland (see the comment there).
2771 	 */
2772 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2773 	if (rwstate & URW_WRITE_LOCKED)
2774 		lwp_rwlock_release(&lwpchan, rw);
2775 	else if ((rwstate & URW_READERS_MASK) > 0) {
2776 		rwstate--;
2777 		if ((rwstate & URW_READERS_MASK) == 0)
2778 			lwp_rwlock_release(&lwpchan, rw);
2779 		else
2780 			suword32_noerr(&rw->rwlock_readers, rwstate);
2781 	}
2782 
2783 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2784 	locked = 0;
2785 	error = 0;
2786 
2787 out_nodrop:
2788 	no_fault();
2789 	if (watched)
2790 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2791 	if (error)
2792 		return (set_errno(error));
2793 	return (0);
2794 }
2795 
2796 int
2797 lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
2798 {
2799 	switch (subcode) {
2800 	case 0:
2801 		return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
2802 	case 1:
2803 		return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
2804 	case 2:
2805 		return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
2806 	case 3:
2807 		return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
2808 	case 4:
2809 		return (lwp_rwlock_unlock(rwlp));
2810 	}
2811 	return (set_errno(EINVAL));
2812 }
2813 
2814 /*
2815  * Return the owner of the user-level s-object.
2816  * Since we can't really do this, return NULL.
2817  */
2818 /* ARGSUSED */
2819 static kthread_t *
2820 lwpsobj_owner(caddr_t sobj)
2821 {
2822 	return ((kthread_t *)NULL);
2823 }
2824 
2825 /*
2826  * Wake up a thread asleep on a user-level synchronization
2827  * object.
2828  */
2829 static void
2830 lwp_unsleep(kthread_t *t)
2831 {
2832 	ASSERT(THREAD_LOCK_HELD(t));
2833 	if (t->t_wchan0 != NULL) {
2834 		sleepq_head_t *sqh;
2835 		sleepq_t *sqp = t->t_sleepq;
2836 
2837 		if (sqp != NULL) {
2838 			sqh = lwpsqhash(&t->t_lwpchan);
2839 			ASSERT(&sqh->sq_queue == sqp);
2840 			sleepq_unsleep(t);
2841 			disp_lock_exit_high(&sqh->sq_lock);
2842 			CL_SETRUN(t);
2843 			return;
2844 		}
2845 	}
2846 	panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
2847 }
2848 
2849 /*
2850  * Change the priority of a thread asleep on a user-level
2851  * synchronization object. To maintain proper priority order,
2852  * we:
2853  *	o dequeue the thread.
2854  *	o change its priority.
2855  *	o re-enqueue the thread.
2856  * Assumption: the thread is locked on entry.
2857  */
2858 static void
2859 lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
2860 {
2861 	ASSERT(THREAD_LOCK_HELD(t));
2862 	if (t->t_wchan0 != NULL) {
2863 		sleepq_t   *sqp = t->t_sleepq;
2864 
2865 		sleepq_dequeue(t);
2866 		*t_prip = pri;
2867 		sleepq_insert(sqp, t);
2868 	} else
2869 		panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
2870 }
2871 
2872 /*
2873  * Clean up a left-over process-shared robust mutex
2874  */
2875 static void
2876 lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
2877 {
2878 	uint16_t flag;
2879 	uchar_t waiters;
2880 	label_t ljb;
2881 	pid_t owner_pid;
2882 	lwp_mutex_t *lp;
2883 	volatile int locked = 0;
2884 	volatile int watched = 0;
2885 	volatile struct upimutex *upimutex = NULL;
2886 	volatile int upilocked = 0;
2887 
2888 	if ((ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
2889 	    != (USYNC_PROCESS | LOCK_ROBUST))
2890 		return;
2891 
2892 	lp = (lwp_mutex_t *)ent->lwpchan_addr;
2893 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2894 	if (on_fault(&ljb)) {
2895 		if (locked)
2896 			lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2897 		if (upilocked)
2898 			upimutex_unlock((upimutex_t *)upimutex, 0);
2899 		goto out;
2900 	}
2901 
2902 	fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
2903 
2904 	if (UPIMUTEX(ent->lwpchan_type)) {
2905 		lwpchan_t lwpchan = ent->lwpchan_lwpchan;
2906 		upib_t *upibp = &UPI_CHAIN(lwpchan);
2907 
2908 		if (owner_pid != curproc->p_pid)
2909 			goto out;
2910 		mutex_enter(&upibp->upib_lock);
2911 		upimutex = upi_get(upibp, &lwpchan);
2912 		if (upimutex == NULL || upimutex->upi_owner != curthread) {
2913 			mutex_exit(&upibp->upib_lock);
2914 			goto out;
2915 		}
2916 		mutex_exit(&upibp->upib_lock);
2917 		upilocked = 1;
2918 		flag = lwp_clear_mutex(lp, lockflg);
2919 		suword8_noerr(&lp->mutex_lockw, 0);
2920 		upimutex_unlock((upimutex_t *)upimutex, flag);
2921 	} else {
2922 		lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2923 		locked = 1;
2924 		/*
2925 		 * Clear the spinners count because one of our
2926 		 * threads could have been spinning for this lock
2927 		 * at user level when the process was suddenly killed.
2928 		 * There is no harm in this since user-level libc code
2929 		 * will adapt to the sudden change in the spinner count.
2930 		 */
2931 		suword8_noerr(&lp->mutex_spinners, 0);
2932 		if (owner_pid != curproc->p_pid) {
2933 			/*
2934 			 * We are not the owner.  There may or may not be one.
2935 			 * If there are waiters, we wake up one or all of them.
2936 			 * It doesn't hurt to wake them up in error since
2937 			 * they will just retry the lock and go to sleep
2938 			 * again if necessary.
2939 			 */
2940 			fuword8_noerr(&lp->mutex_waiters, &waiters);
2941 			if (waiters != 0) {	/* there are waiters */
2942 				fuword16_noerr(&lp->mutex_flag, &flag);
2943 				if (flag & LOCK_NOTRECOVERABLE) {
2944 					lwp_release_all(&ent->lwpchan_lwpchan);
2945 					suword8_noerr(&lp->mutex_waiters, 0);
2946 				} else if (lwp_release(&ent->lwpchan_lwpchan,
2947 				    &waiters, 0)) {
2948 					suword8_noerr(&lp->mutex_waiters,
2949 					    waiters);
2950 				}
2951 			}
2952 		} else {
2953 			/*
2954 			 * We are the owner.  Release it.
2955 			 */
2956 			(void) lwp_clear_mutex(lp, lockflg);
2957 			ulock_clear(&lp->mutex_lockw);
2958 			fuword8_noerr(&lp->mutex_waiters, &waiters);
2959 			if (waiters &&
2960 			    lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
2961 				suword8_noerr(&lp->mutex_waiters, waiters);
2962 		}
2963 		lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2964 	}
2965 out:
2966 	no_fault();
2967 	if (watched)
2968 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2969 }
2970 
2971 /*
2972  * Register a process-shared robust mutex in the lwpchan cache.
2973  */
2974 int
2975 lwp_mutex_register(lwp_mutex_t *lp, caddr_t uaddr)
2976 {
2977 	int error = 0;
2978 	volatile int watched;
2979 	label_t ljb;
2980 	uint8_t type;
2981 	lwpchan_t lwpchan;
2982 
2983 	if ((caddr_t)lp >= (caddr_t)USERLIMIT)
2984 		return (set_errno(EFAULT));
2985 
2986 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2987 
2988 	if (on_fault(&ljb)) {
2989 		error = EFAULT;
2990 	} else {
2991 		/*
2992 		 * Force Copy-on-write if necessary and ensure that the
2993 		 * synchronization object resides in read/write memory.
2994 		 * Cause an EFAULT return now if this is not so.
2995 		 */
2996 		fuword8_noerr(&lp->mutex_type, &type);
2997 		suword8_noerr(&lp->mutex_type, type);
2998 		if ((type & (USYNC_PROCESS|LOCK_ROBUST))
2999 		    != (USYNC_PROCESS|LOCK_ROBUST)) {
3000 			error = EINVAL;
3001 		} else if (!lwpchan_get_mapping(curproc->p_as, (caddr_t)lp,
3002 		    uaddr, type, &lwpchan, LWPCHAN_MPPOOL)) {
3003 			error = EFAULT;
3004 		}
3005 	}
3006 	no_fault();
3007 	if (watched)
3008 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3009 	if (error)
3010 		return (set_errno(error));
3011 	return (0);
3012 }
3013 
3014 /*
3015  * There is a user-level robust lock registration in libc.
3016  * Mark it as invalid by storing -1 into the location of the pointer.
3017  */
3018 static void
3019 lwp_mutex_unregister(void *uaddr)
3020 {
3021 	if (get_udatamodel() == DATAMODEL_NATIVE) {
3022 		(void) sulword(uaddr, (ulong_t)-1);
3023 #ifdef _SYSCALL32_IMPL
3024 	} else {
3025 		(void) suword32(uaddr, (uint32_t)-1);
3026 #endif
3027 	}
3028 }
3029 
3030 int
3031 lwp_mutex_trylock(lwp_mutex_t *lp, uintptr_t owner)
3032 {
3033 	kthread_t *t = curthread;
3034 	proc_t *p = ttoproc(t);
3035 	int error = 0;
3036 	volatile int locked = 0;
3037 	volatile int watched = 0;
3038 	label_t ljb;
3039 	volatile uint8_t type = 0;
3040 	uint16_t flag;
3041 	lwpchan_t lwpchan;
3042 
3043 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3044 		return (set_errno(EFAULT));
3045 
3046 	(void) new_mstate(t, LMS_USER_LOCK);
3047 
3048 	if (on_fault(&ljb)) {
3049 		if (locked)
3050 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3051 		error = EFAULT;
3052 		goto out;
3053 	}
3054 	/*
3055 	 * Force Copy-on-write if necessary and ensure that the
3056 	 * synchronization object resides in read/write memory.
3057 	 * Cause an EFAULT return now if this is not so.
3058 	 */
3059 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3060 	suword8_noerr(&lp->mutex_type, type);
3061 	if (UPIMUTEX(type)) {
3062 		no_fault();
3063 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
3064 		if (error == 0 || error == EOWNERDEAD ||
3065 		    error == ELOCKUNMAPPED) {
3066 			volatile int locked = error != 0;
3067 			if (on_fault(&ljb)) {
3068 				if (locked != 0)
3069 					error = lwp_upimutex_unlock(lp, type);
3070 				else
3071 					error = EFAULT;
3072 				goto upierr;
3073 			}
3074 			set_owner_pid(lp, owner,
3075 			    (type & USYNC_PROCESS)? p->p_pid : 0);
3076 			no_fault();
3077 		}
3078 
3079 upierr:
3080 		if (error)
3081 			return (set_errno(error));
3082 		return (0);
3083 	}
3084 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3085 	    &lwpchan, LWPCHAN_MPPOOL)) {
3086 		error = EFAULT;
3087 		goto out;
3088 	}
3089 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3090 	locked = 1;
3091 	if (type & LOCK_ROBUST) {
3092 		fuword16_noerr(&lp->mutex_flag, &flag);
3093 		if (flag & LOCK_NOTRECOVERABLE) {
3094 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3095 			error =  ENOTRECOVERABLE;
3096 			goto out;
3097 		}
3098 	}
3099 
3100 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3101 
3102 	if (!ulock_try(&lp->mutex_lockw))
3103 		error = EBUSY;
3104 	else {
3105 		set_owner_pid(lp, owner, (type & USYNC_PROCESS)? p->p_pid : 0);
3106 		if (type & LOCK_ROBUST) {
3107 			fuword16_noerr(&lp->mutex_flag, &flag);
3108 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3109 				if (flag & LOCK_OWNERDEAD)
3110 					error = EOWNERDEAD;
3111 				else if (type & USYNC_PROCESS_ROBUST)
3112 					error = ELOCKUNMAPPED;
3113 				else
3114 					error = EOWNERDEAD;
3115 			}
3116 		}
3117 	}
3118 	locked = 0;
3119 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3120 out:
3121 
3122 	if (t->t_mstate == LMS_USER_LOCK)
3123 		(void) new_mstate(t, LMS_SYSTEM);
3124 
3125 	no_fault();
3126 	if (watched)
3127 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3128 	if (error)
3129 		return (set_errno(error));
3130 	return (0);
3131 }
3132 
3133 /*
3134  * unlock the mutex and unblock lwps that is trying to acquire this mutex.
3135  * the blocked lwp resumes and retries to acquire the lock.
3136  */
3137 int
3138 lwp_mutex_unlock(lwp_mutex_t *lp)
3139 {
3140 	proc_t *p = ttoproc(curthread);
3141 	lwpchan_t lwpchan;
3142 	uchar_t waiters;
3143 	volatile int locked = 0;
3144 	volatile int watched = 0;
3145 	volatile uint8_t type = 0;
3146 	label_t ljb;
3147 	uint16_t flag;
3148 	int error = 0;
3149 
3150 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3151 		return (set_errno(EFAULT));
3152 
3153 	if (on_fault(&ljb)) {
3154 		if (locked)
3155 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3156 		error = EFAULT;
3157 		goto out;
3158 	}
3159 
3160 	/*
3161 	 * Force Copy-on-write if necessary and ensure that the
3162 	 * synchronization object resides in read/write memory.
3163 	 * Cause an EFAULT return now if this is not so.
3164 	 */
3165 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3166 	suword8_noerr(&lp->mutex_type, type);
3167 
3168 	if (UPIMUTEX(type)) {
3169 		no_fault();
3170 		error = lwp_upimutex_unlock(lp, type);
3171 		if (error)
3172 			return (set_errno(error));
3173 		return (0);
3174 	}
3175 
3176 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3177 
3178 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3179 	    &lwpchan, LWPCHAN_MPPOOL)) {
3180 		error = EFAULT;
3181 		goto out;
3182 	}
3183 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3184 	locked = 1;
3185 	if (type & LOCK_ROBUST) {
3186 		fuword16_noerr(&lp->mutex_flag, &flag);
3187 		if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3188 			flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
3189 			flag |= LOCK_NOTRECOVERABLE;
3190 			suword16_noerr(&lp->mutex_flag, flag);
3191 		}
3192 	}
3193 	set_owner_pid(lp, 0, 0);
3194 	ulock_clear(&lp->mutex_lockw);
3195 	/*
3196 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
3197 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
3198 	 * may fail.  If it fails, do not write into the waiter bit.
3199 	 * The call to lwp_release() might fail due to one of three reasons:
3200 	 *
3201 	 * 	1. due to the thread which set the waiter bit not actually
3202 	 *	   sleeping since it got the lock on the re-try. The waiter
3203 	 *	   bit will then be correctly updated by that thread. This
3204 	 *	   window may be closed by reading the wait bit again here
3205 	 *	   and not calling lwp_release() at all if it is zero.
3206 	 *	2. the thread which set the waiter bit and went to sleep
3207 	 *	   was woken up by a signal. This time, the waiter recomputes
3208 	 *	   the wait bit in the return with EINTR code.
3209 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
3210 	 *	   memory that has been re-used after the lock was dropped.
3211 	 *	   In this case, writing into the waiter bit would cause data
3212 	 *	   corruption.
3213 	 */
3214 	fuword8_noerr(&lp->mutex_waiters, &waiters);
3215 	if (waiters) {
3216 		if ((type & LOCK_ROBUST) &&
3217 		    (flag & LOCK_NOTRECOVERABLE)) {
3218 			lwp_release_all(&lwpchan);
3219 			suword8_noerr(&lp->mutex_waiters, 0);
3220 		} else if (lwp_release(&lwpchan, &waiters, 0)) {
3221 			suword8_noerr(&lp->mutex_waiters, waiters);
3222 		}
3223 	}
3224 
3225 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3226 out:
3227 	no_fault();
3228 	if (watched)
3229 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3230 	if (error)
3231 		return (set_errno(error));
3232 	return (0);
3233 }
3234