xref: /titanic_41/usr/src/uts/common/syscall/lwp_sobj.c (revision 0eb822a1c0c2bea495647510b75f77f0e57633eb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
23 /*	  All Rights Reserved	*/
24 
25 
26 /*
27  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
28  * Use is subject to license terms.
29  */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 #include <sys/param.h>
34 #include <sys/types.h>
35 #include <sys/sysmacros.h>
36 #include <sys/systm.h>
37 #include <sys/cred.h>
38 #include <sys/user.h>
39 #include <sys/errno.h>
40 #include <sys/file.h>
41 #include <sys/proc.h>
42 #include <sys/prsystm.h>
43 #include <sys/kmem.h>
44 #include <sys/sobject.h>
45 #include <sys/fault.h>
46 #include <sys/procfs.h>
47 #include <sys/watchpoint.h>
48 #include <sys/time.h>
49 #include <sys/cmn_err.h>
50 #include <sys/machlock.h>
51 #include <sys/debug.h>
52 #include <sys/synch.h>
53 #include <sys/synch32.h>
54 #include <sys/mman.h>
55 #include <sys/class.h>
56 #include <sys/schedctl.h>
57 #include <sys/sleepq.h>
58 #include <sys/policy.h>
59 #include <sys/tnf_probe.h>
60 #include <sys/lwpchan_impl.h>
61 #include <sys/turnstile.h>
62 #include <sys/atomic.h>
63 #include <sys/lwp_timer_impl.h>
64 #include <sys/lwp_upimutex_impl.h>
65 #include <vm/as.h>
66 #include <sys/sdt.h>
67 
68 static kthread_t *lwpsobj_owner(caddr_t);
69 static void lwp_unsleep(kthread_t *t);
70 static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
71 static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);
72 
73 extern int lwp_cond_signal(lwp_cond_t *cv);
74 
75 /*
76  * Maximum number of user prio inheritance locks that can be held by a thread.
77  * Used to limit kmem for each thread. This is a per-thread limit that
78  * can be administered on a system wide basis (using /etc/system).
79  *
80  * Also, when a limit, say maxlwps is added for numbers of lwps within a
81  * process, the per-thread limit automatically becomes a process-wide limit
82  * of maximum number of held upi locks within a process:
83  *      maxheldupimx = maxnestupimx * maxlwps;
84  */
85 static uint32_t maxnestupimx = 2000;
86 
87 /*
88  * The sobj_ops vector exports a set of functions needed when a thread
89  * is asleep on a synchronization object of this type.
90  */
91 static sobj_ops_t lwp_sobj_ops = {
92 	SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
93 };
94 
95 static kthread_t *lwpsobj_pi_owner(upimutex_t *up);
96 
97 static sobj_ops_t lwp_sobj_pi_ops = {
98 	SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
99 	turnstile_change_pri
100 };
101 
102 static sleepq_head_t	lwpsleepq[NSLEEPQ];
103 upib_t			upimutextab[UPIMUTEX_TABSIZE];
104 
/*
 * Each of the two lock pools holds 2^LWPCHAN_LOCK_SHIFT (1024) locks.
 */
#define	LWPCHAN_LOCK_SHIFT	10
#define	LWPCHAN_LOCK_SIZE	(1 << LWPCHAN_LOCK_SHIFT)

/*
 * Map a hash key X to an index into the lock array.
 *
 * Both lc_wchan and lc_wchan0 are most likely 8-byte-aligned addresses,
 * so the three low-order bits are shifted away before folding the key
 * down to LWPCHAN_LOCK_SHIFT bits.  'pool' is 0 or 1; a non-zero pool
 * selects the second half of the array.
 */
#define	LWPCHAN_LOCK_HASH(X, pool) \
	((((pool) != 0) ? LWPCHAN_LOCK_SIZE : 0) + \
	((((X) >> 3) ^ ((X) >> (3 + LWPCHAN_LOCK_SHIFT))) & \
	(LWPCHAN_LOCK_SIZE - 1)))
116 
117 static kmutex_t		lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
118 
119 /*
120  * Is this a POSIX threads user-level lock requiring priority inheritance?
121  */
122 #define	UPIMUTEX(type)	((type) & LOCK_PRIO_INHERIT)
123 
124 static sleepq_head_t *
125 lwpsqhash(lwpchan_t *lwpchan)
126 {
127 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
128 	return (&lwpsleepq[SQHASHINDEX(x)]);
129 }
130 
131 /*
132  * Lock an lwpchan.
133  * Keep this in sync with lwpchan_unlock(), below.
134  */
135 static void
136 lwpchan_lock(lwpchan_t *lwpchan, int pool)
137 {
138 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
139 	mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
140 }
141 
142 /*
143  * Unlock an lwpchan.
144  * Keep this in sync with lwpchan_lock(), above.
145  */
146 static void
147 lwpchan_unlock(lwpchan_t *lwpchan, int pool)
148 {
149 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
150 	mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
151 }
152 
153 /*
154  * Delete mappings from the lwpchan cache for pages that are being
155  * unmapped by as_unmap().  Given a range of addresses, "start" to "end",
156  * all mappings within the range are deleted from the lwpchan cache.
157  */
158 void
159 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
160 {
161 	lwpchan_data_t *lcp;
162 	lwpchan_hashbucket_t *hashbucket;
163 	lwpchan_hashbucket_t *endbucket;
164 	lwpchan_entry_t *ent;
165 	lwpchan_entry_t **prev;
166 	caddr_t addr;
167 
168 	mutex_enter(&p->p_lcp_lock);
169 	lcp = p->p_lcp;
170 	hashbucket = lcp->lwpchan_cache;
171 	endbucket = hashbucket + lcp->lwpchan_size;
172 	for (; hashbucket < endbucket; hashbucket++) {
173 		if (hashbucket->lwpchan_chain == NULL)
174 			continue;
175 		mutex_enter(&hashbucket->lwpchan_lock);
176 		prev = &hashbucket->lwpchan_chain;
177 		/* check entire chain */
178 		while ((ent = *prev) != NULL) {
179 			addr = ent->lwpchan_addr;
180 			if (start <= addr && addr < end) {
181 				*prev = ent->lwpchan_next;
182 				if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
183 				    (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
184 					lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
185 				kmem_free(ent, sizeof (*ent));
186 				atomic_add_32(&lcp->lwpchan_entries, -1);
187 			} else {
188 				prev = &ent->lwpchan_next;
189 			}
190 		}
191 		mutex_exit(&hashbucket->lwpchan_lock);
192 	}
193 	mutex_exit(&p->p_lcp_lock);
194 }
195 
196 /*
197  * Given an lwpchan cache pointer and a process virtual address,
198  * return a pointer to the corresponding lwpchan hash bucket.
199  */
200 static lwpchan_hashbucket_t *
201 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
202 {
203 	uint_t i;
204 
205 	/*
206 	 * All user-level sync object addresses are 8-byte aligned.
207 	 * Ignore the lowest 3 bits of the address and use the
208 	 * higher-order 2*lwpchan_bits bits for the hash index.
209 	 */
210 	addr >>= 3;
211 	i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
212 	return (lcp->lwpchan_cache + i);
213 }
214 
215 /*
216  * (Re)allocate the per-process lwpchan cache.
217  */
218 static void
219 lwpchan_alloc_cache(proc_t *p, uint_t bits)
220 {
221 	lwpchan_data_t *lcp;
222 	lwpchan_data_t *old_lcp;
223 	lwpchan_hashbucket_t *hashbucket;
224 	lwpchan_hashbucket_t *endbucket;
225 	lwpchan_hashbucket_t *newbucket;
226 	lwpchan_entry_t *ent;
227 	lwpchan_entry_t *next;
228 	uint_t count;
229 
230 	ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
231 
232 	lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
233 	lcp->lwpchan_bits = bits;
234 	lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
235 	lcp->lwpchan_mask = lcp->lwpchan_size - 1;
236 	lcp->lwpchan_entries = 0;
237 	lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
238 		sizeof (lwpchan_hashbucket_t), KM_SLEEP);
239 	lcp->lwpchan_next_data = NULL;
240 
241 	mutex_enter(&p->p_lcp_lock);
242 	if ((old_lcp = p->p_lcp) != NULL) {
243 		if (old_lcp->lwpchan_bits >= bits) {
244 			/* someone beat us to it */
245 			mutex_exit(&p->p_lcp_lock);
246 			kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
247 				sizeof (lwpchan_hashbucket_t));
248 			kmem_free(lcp, sizeof (lwpchan_data_t));
249 			return;
250 		}
251 		/*
252 		 * Acquire all of the old hash table locks.
253 		 */
254 		hashbucket = old_lcp->lwpchan_cache;
255 		endbucket = hashbucket + old_lcp->lwpchan_size;
256 		for (; hashbucket < endbucket; hashbucket++)
257 			mutex_enter(&hashbucket->lwpchan_lock);
258 		/*
259 		 * Move all of the old hash table entries to the
260 		 * new hash table.  The new hash table has not yet
261 		 * been installed so we don't need any of its locks.
262 		 */
263 		count = 0;
264 		hashbucket = old_lcp->lwpchan_cache;
265 		for (; hashbucket < endbucket; hashbucket++) {
266 			ent = hashbucket->lwpchan_chain;
267 			while (ent != NULL) {
268 				next = ent->lwpchan_next;
269 				newbucket = lwpchan_bucket(lcp,
270 					(uintptr_t)ent->lwpchan_addr);
271 				ent->lwpchan_next = newbucket->lwpchan_chain;
272 				newbucket->lwpchan_chain = ent;
273 				ent = next;
274 				count++;
275 			}
276 			hashbucket->lwpchan_chain = NULL;
277 		}
278 		lcp->lwpchan_entries = count;
279 	}
280 
281 	/*
282 	 * Retire the old hash table.  We can't actually kmem_free() it
283 	 * now because someone may still have a pointer to it.  Instead,
284 	 * we link it onto the new hash table's list of retired hash tables.
285 	 * The new hash table is double the size of the previous one, so
286 	 * the total size of all retired hash tables is less than the size
287 	 * of the new one.  exit() and exec() free the retired hash tables
288 	 * (see lwpchan_destroy_cache(), below).
289 	 */
290 	lcp->lwpchan_next_data = old_lcp;
291 
292 	/*
293 	 * As soon as we store the new lcp, future locking operations will
294 	 * use it.  Therefore, we must ensure that all the state we've just
295 	 * established reaches global visibility before the new lcp does.
296 	 */
297 	membar_producer();
298 	p->p_lcp = lcp;
299 
300 	if (old_lcp != NULL) {
301 		/*
302 		 * Release all of the old hash table locks.
303 		 */
304 		hashbucket = old_lcp->lwpchan_cache;
305 		for (; hashbucket < endbucket; hashbucket++)
306 			mutex_exit(&hashbucket->lwpchan_lock);
307 	}
308 	mutex_exit(&p->p_lcp_lock);
309 }
310 
311 /*
312  * Deallocate the lwpchan cache, and any dynamically allocated mappings.
313  * Called when the process exits or execs.  All lwps except one have
314  * exited so we need no locks here.
315  */
316 void
317 lwpchan_destroy_cache(int exec)
318 {
319 	proc_t *p = curproc;
320 	lwpchan_hashbucket_t *hashbucket;
321 	lwpchan_hashbucket_t *endbucket;
322 	lwpchan_data_t *lcp;
323 	lwpchan_entry_t *ent;
324 	lwpchan_entry_t *next;
325 	uint16_t lockflg;
326 
327 	lcp = p->p_lcp;
328 	p->p_lcp = NULL;
329 
330 	lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
331 	hashbucket = lcp->lwpchan_cache;
332 	endbucket = hashbucket + lcp->lwpchan_size;
333 	for (; hashbucket < endbucket; hashbucket++) {
334 		ent = hashbucket->lwpchan_chain;
335 		hashbucket->lwpchan_chain = NULL;
336 		while (ent != NULL) {
337 			next = ent->lwpchan_next;
338 			if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
339 			    (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
340 				lwp_mutex_cleanup(ent, lockflg);
341 			kmem_free(ent, sizeof (*ent));
342 			ent = next;
343 		}
344 	}
345 
346 	while (lcp != NULL) {
347 		lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
348 		kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
349 			sizeof (lwpchan_hashbucket_t));
350 		kmem_free(lcp, sizeof (lwpchan_data_t));
351 		lcp = next_lcp;
352 	}
353 }
354 
355 /*
356  * Return zero when there is an entry in the lwpchan cache for the
357  * given process virtual address and non-zero when there is not.
358  * The returned non-zero value is the current length of the
359  * hash chain plus one.  The caller holds the hash bucket lock.
360  */
361 static uint_t
362 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
363 	lwpchan_hashbucket_t *hashbucket)
364 {
365 	lwpchan_entry_t *ent;
366 	uint_t count = 1;
367 
368 	for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
369 		if (ent->lwpchan_addr == addr) {
370 			if (ent->lwpchan_type != type ||
371 			    ent->lwpchan_pool != pool) {
372 				/*
373 				 * This shouldn't happen, but might if the
374 				 * process reuses its memory for different
375 				 * types of sync objects.  We test first
376 				 * to avoid grabbing the memory cache line.
377 				 */
378 				ent->lwpchan_type = (uint16_t)type;
379 				ent->lwpchan_pool = (uint16_t)pool;
380 			}
381 			*lwpchan = ent->lwpchan_lwpchan;
382 			return (0);
383 		}
384 		count++;
385 	}
386 	return (count);
387 }
388 
389 /*
390  * Return the cached lwpchan mapping if cached, otherwise insert
391  * a virtual address to lwpchan mapping into the cache.
392  */
393 static int
394 lwpchan_get_mapping(struct as *as, caddr_t addr,
395 	int type, lwpchan_t *lwpchan, int pool)
396 {
397 	proc_t *p = curproc;
398 	lwpchan_data_t *lcp;
399 	lwpchan_hashbucket_t *hashbucket;
400 	lwpchan_entry_t *ent;
401 	memid_t	memid;
402 	uint_t count;
403 	uint_t bits;
404 
405 top:
406 	/* initialize the lwpchan cache, if necesary */
407 	if ((lcp = p->p_lcp) == NULL) {
408 		lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
409 		goto top;
410 	}
411 	hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
412 	mutex_enter(&hashbucket->lwpchan_lock);
413 	if (lcp != p->p_lcp) {
414 		/* someone resized the lwpchan cache; start over */
415 		mutex_exit(&hashbucket->lwpchan_lock);
416 		goto top;
417 	}
418 	if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
419 		/* it's in the cache */
420 		mutex_exit(&hashbucket->lwpchan_lock);
421 		return (1);
422 	}
423 	mutex_exit(&hashbucket->lwpchan_lock);
424 	if (as_getmemid(as, addr, &memid) != 0)
425 		return (0);
426 	lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
427 	lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
428 	ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
429 	mutex_enter(&hashbucket->lwpchan_lock);
430 	if (lcp != p->p_lcp) {
431 		/* someone resized the lwpchan cache; start over */
432 		mutex_exit(&hashbucket->lwpchan_lock);
433 		kmem_free(ent, sizeof (*ent));
434 		goto top;
435 	}
436 	count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
437 	if (count == 0) {
438 		/* someone else added this entry to the cache */
439 		mutex_exit(&hashbucket->lwpchan_lock);
440 		kmem_free(ent, sizeof (*ent));
441 		return (1);
442 	}
443 	if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
444 	    (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
445 		/* hash chain too long; reallocate the hash table */
446 		mutex_exit(&hashbucket->lwpchan_lock);
447 		kmem_free(ent, sizeof (*ent));
448 		lwpchan_alloc_cache(p, bits + 1);
449 		goto top;
450 	}
451 	ent->lwpchan_addr = addr;
452 	ent->lwpchan_type = (uint16_t)type;
453 	ent->lwpchan_pool = (uint16_t)pool;
454 	ent->lwpchan_lwpchan = *lwpchan;
455 	ent->lwpchan_next = hashbucket->lwpchan_chain;
456 	hashbucket->lwpchan_chain = ent;
457 	atomic_add_32(&lcp->lwpchan_entries, 1);
458 	mutex_exit(&hashbucket->lwpchan_lock);
459 	return (1);
460 }
461 
462 /*
463  * Return a unique pair of identifiers that corresponds to a
464  * synchronization object's virtual address.  Process-shared
465  * sync objects usually get vnode/offset from as_getmemid().
466  */
467 static int
468 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
469 {
470 	/*
471 	 * If the lwp synch object is defined to be process-private,
472 	 * we just make the first field of the lwpchan be 'as' and
473 	 * the second field be the synch object's virtual address.
474 	 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
475 	 * The lwpchan cache is used only for process-shared objects.
476 	 */
477 	if ((type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) == 0) {
478 		lwpchan->lc_wchan0 = (caddr_t)as;
479 		lwpchan->lc_wchan = addr;
480 		return (1);
481 	}
482 	/* check the lwpchan cache for mapping */
483 	return (lwpchan_get_mapping(as, addr, type, lwpchan, pool));
484 }
485 
486 static void
487 lwp_block(lwpchan_t *lwpchan)
488 {
489 	kthread_t *t = curthread;
490 	klwp_t *lwp = ttolwp(t);
491 	sleepq_head_t *sqh;
492 
493 	thread_lock(t);
494 	t->t_flag |= T_WAKEABLE;
495 	t->t_lwpchan = *lwpchan;
496 	t->t_sobj_ops = &lwp_sobj_ops;
497 	t->t_release = 0;
498 	sqh = lwpsqhash(lwpchan);
499 	disp_lock_enter_high(&sqh->sq_lock);
500 	CL_SLEEP(t);
501 	DTRACE_SCHED(sleep);
502 	THREAD_SLEEP(t, &sqh->sq_lock);
503 	sleepq_insert(&sqh->sq_queue, t);
504 	thread_unlock(t);
505 	lwp->lwp_asleep = 1;
506 	lwp->lwp_sysabort = 0;
507 	lwp->lwp_ru.nvcsw++;
508 	(void) new_mstate(curthread, LMS_SLEEP);
509 }
510 
511 static kthread_t *
512 lwpsobj_pi_owner(upimutex_t *up)
513 {
514 	return (up->upi_owner);
515 }
516 
517 static struct upimutex *
518 upi_get(upib_t *upibp, lwpchan_t *lcp)
519 {
520 	struct upimutex *upip;
521 
522 	for (upip = upibp->upib_first; upip != NULL;
523 	    upip = upip->upi_nextchain) {
524 		if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
525 		    upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
526 			break;
527 	}
528 	return (upip);
529 }
530 
531 static void
532 upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
533 {
534 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
535 
536 	/*
537 	 * Insert upimutex at front of list. Maybe a bit unfair
538 	 * but assume that not many lwpchans hash to the same
539 	 * upimutextab bucket, i.e. the list of upimutexes from
540 	 * upib_first is not too long.
541 	 */
542 	upimutex->upi_nextchain = upibp->upib_first;
543 	upibp->upib_first = upimutex;
544 }
545 
546 static void
547 upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
548 {
549 	struct upimutex **prev;
550 
551 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
552 
553 	prev = &upibp->upib_first;
554 	while (*prev != upimutex) {
555 		prev = &(*prev)->upi_nextchain;
556 	}
557 	*prev = upimutex->upi_nextchain;
558 	upimutex->upi_nextchain = NULL;
559 }
560 
561 /*
562  * Add upimutex to chain of upimutexes held by curthread.
563  * Returns number of upimutexes held by curthread.
564  */
565 static uint32_t
566 upi_mylist_add(struct upimutex *upimutex)
567 {
568 	kthread_t *t = curthread;
569 
570 	/*
571 	 * Insert upimutex at front of list of upimutexes owned by t. This
572 	 * would match typical LIFO order in which nested locks are acquired
573 	 * and released.
574 	 */
575 	upimutex->upi_nextowned = t->t_upimutex;
576 	t->t_upimutex = upimutex;
577 	t->t_nupinest++;
578 	ASSERT(t->t_nupinest > 0);
579 	return (t->t_nupinest);
580 }
581 
582 /*
583  * Delete upimutex from list of upimutexes owned by curthread.
584  */
585 static void
586 upi_mylist_del(struct upimutex *upimutex)
587 {
588 	kthread_t *t = curthread;
589 	struct upimutex **prev;
590 
591 	/*
592 	 * Since the order in which nested locks are acquired and released,
593 	 * is typically LIFO, and typical nesting levels are not too deep, the
594 	 * following should not be expensive in the general case.
595 	 */
596 	prev = &t->t_upimutex;
597 	while (*prev != upimutex) {
598 		prev = &(*prev)->upi_nextowned;
599 	}
600 	*prev = upimutex->upi_nextowned;
601 	upimutex->upi_nextowned = NULL;
602 	ASSERT(t->t_nupinest > 0);
603 	t->t_nupinest--;
604 }
605 
606 /*
607  * Returns true if upimutex is owned. Should be called only when upim points
608  * to kmem which cannot disappear from underneath.
609  */
610 static int
611 upi_owned(upimutex_t *upim)
612 {
613 	return (upim->upi_owner == curthread);
614 }
615 
616 /*
617  * Returns pointer to kernel object (upimutex_t *) if lp is owned.
618  */
619 static struct upimutex *
620 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
621 {
622 	lwpchan_t lwpchan;
623 	upib_t *upibp;
624 	struct upimutex *upimutex;
625 
626 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
627 	    &lwpchan, LWPCHAN_MPPOOL))
628 		return (NULL);
629 
630 	upibp = &UPI_CHAIN(lwpchan);
631 	mutex_enter(&upibp->upib_lock);
632 	upimutex = upi_get(upibp, &lwpchan);
633 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
634 		mutex_exit(&upibp->upib_lock);
635 		return (NULL);
636 	}
637 	mutex_exit(&upibp->upib_lock);
638 	return (upimutex);
639 }
640 
641 /*
642  * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
643  * no lock hand-off occurrs.
644  */
645 static void
646 upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
647 {
648 	turnstile_t *ts;
649 	upib_t *upibp;
650 	kthread_t *newowner;
651 
652 	upi_mylist_del(upimutex);
653 	upibp = upimutex->upi_upibp;
654 	mutex_enter(&upibp->upib_lock);
655 	if (upimutex->upi_waiter != 0) { /* if waiters */
656 		ts = turnstile_lookup(upimutex);
657 		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
658 			/* hand-off lock to highest prio waiter */
659 			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
660 			upimutex->upi_owner = newowner;
661 			if (ts->ts_waiters == 1)
662 				upimutex->upi_waiter = 0;
663 			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
664 			mutex_exit(&upibp->upib_lock);
665 			return;
666 		} else if (ts != NULL) {
667 			/* LOCK_NOTRECOVERABLE: wakeup all */
668 			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
669 		} else {
670 			/*
671 			 * Misleading w bit. Waiters might have been
672 			 * interrupted. No need to clear the w bit (upimutex
673 			 * will soon be freed). Re-calculate PI from existing
674 			 * waiters.
675 			 */
676 			turnstile_exit(upimutex);
677 			turnstile_pi_recalc();
678 		}
679 	}
680 	/*
681 	 * no waiters, or LOCK_NOTRECOVERABLE.
682 	 * remove from the bucket chain of upi mutexes.
683 	 * de-allocate kernel memory (upimutex).
684 	 */
685 	upi_chain_del(upimutex->upi_upibp, upimutex);
686 	mutex_exit(&upibp->upib_lock);
687 	kmem_free(upimutex, sizeof (upimutex_t));
688 }
689 
690 static int
691 lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
692 {
693 	label_t ljb;
694 	int error = 0;
695 	lwpchan_t lwpchan;
696 	uint16_t flag;
697 	upib_t *upibp;
698 	volatile struct upimutex *upimutex = NULL;
699 	turnstile_t *ts;
700 	uint32_t nupinest;
701 	volatile int upilocked = 0;
702 
703 	if (on_fault(&ljb)) {
704 		if (upilocked)
705 			upimutex_unlock((upimutex_t *)upimutex, 0);
706 		error = EFAULT;
707 		goto out;
708 	}
709 	/*
710 	 * The apparent assumption made in implementing other _lwp_* synch
711 	 * primitives, is that get_lwpchan() does not return a unique cookie
712 	 * for the case where 2 processes (one forked from the other) point
713 	 * at the same underlying object, which is typed USYNC_PROCESS, but
714 	 * mapped MAP_PRIVATE, since the object has not yet been written to,
715 	 * in the child process.
716 	 *
717 	 * Since get_lwpchan() has been fixed, it is not necessary to do the
718 	 * dummy writes to force a COW fault as in other places (which should
719 	 * be fixed).
720 	 */
721 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
722 	    &lwpchan, LWPCHAN_MPPOOL)) {
723 		error = EFAULT;
724 		goto out;
725 	}
726 	upibp = &UPI_CHAIN(lwpchan);
727 retry:
728 	mutex_enter(&upibp->upib_lock);
729 	upimutex = upi_get(upibp, &lwpchan);
730 	if (upimutex == NULL)  {
731 		/* lock available since lwpchan has no upimutex */
732 		upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
733 		upi_chain_add(upibp, (upimutex_t *)upimutex);
734 		upimutex->upi_owner = curthread; /* grab lock */
735 		upimutex->upi_upibp = upibp;
736 		upimutex->upi_vaddr = lp;
737 		upimutex->upi_lwpchan = lwpchan;
738 		mutex_exit(&upibp->upib_lock);
739 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
740 		upilocked = 1;
741 		fuword16_noerr(&lp->mutex_flag, &flag);
742 		if (nupinest > maxnestupimx &&
743 		    secpolicy_resource(CRED()) != 0) {
744 			upimutex_unlock((upimutex_t *)upimutex, flag);
745 			error = ENOMEM;
746 			goto out;
747 		}
748 		if (flag & LOCK_OWNERDEAD) {
749 			/*
750 			 * Return with upimutex held.
751 			 */
752 			error = EOWNERDEAD;
753 		} else if (flag & LOCK_NOTRECOVERABLE) {
754 			/*
755 			 * Since the setting of LOCK_NOTRECOVERABLE
756 			 * was done under the high-level upi mutex,
757 			 * in lwp_upimutex_unlock(), this flag needs to
758 			 * be checked while holding the upi mutex.
759 			 * If set, this thread should  return without
760 			 * the lock held, and with the right error
761 			 * code.
762 			 */
763 			upimutex_unlock((upimutex_t *)upimutex, flag);
764 			upilocked = 0;
765 			error = ENOTRECOVERABLE;
766 		}
767 		goto out;
768 	}
769 	/*
770 	 * If a upimutex object exists, it must have an owner.
771 	 * This is due to lock hand-off, and release of upimutex when no
772 	 * waiters are present at unlock time,
773 	 */
774 	ASSERT(upimutex->upi_owner != NULL);
775 	if (upimutex->upi_owner == curthread) {
776 		/*
777 		 * The user wrapper can check if the mutex type is
778 		 * ERRORCHECK: if not, it should stall at user-level.
779 		 * If so, it should return the error code.
780 		 */
781 		mutex_exit(&upibp->upib_lock);
782 		error = EDEADLK;
783 		goto out;
784 	}
785 	if (try == UPIMUTEX_TRY) {
786 		mutex_exit(&upibp->upib_lock);
787 		error = EBUSY;
788 		goto out;
789 	}
790 	/*
791 	 * Block for the lock.
792 	 * Put the lwp in an orderly state for debugging.
793 	 * Calling prstop() has to be done here, and not in
794 	 * turnstile_block(), since the preceding call to
795 	 * turnstile_lookup() raises the PIL to a level
796 	 * at which calls to prstop() should not be made.
797 	 */
798 	if ((error = lwptp->lwpt_time_error) != 0) {
799 		/*
800 		 * The SUSV3 Posix spec is very clear that we
801 		 * should get no error from validating the
802 		 * timer until we would actually sleep.
803 		 */
804 		mutex_exit(&upibp->upib_lock);
805 		goto out;
806 	}
807 	prstop(PR_REQUESTED, 0);
808 	if (lwptp->lwpt_tsp != NULL) {
809 		/*
810 		 * If we successfully queue the timeout
811 		 * (lwp_timer_enqueue() returns zero),
812 		 * then don't drop t_delay_lock until we are
813 		 * on the sleep queue (in turnstile_block()).
814 		 * Otherwise we will get an immediate timeout
815 		 * when we attempt to sleep in turnstile_block().
816 		 */
817 		mutex_enter(&curthread->t_delay_lock);
818 		if (lwp_timer_enqueue(lwptp) != 0)
819 			mutex_exit(&curthread->t_delay_lock);
820 	}
821 	/*
822 	 * Now, set the waiter bit and block for the lock in turnstile_block().
823 	 * No need to preserve the previous wbit since a lock try is not
824 	 * attempted after setting the wait bit. Wait bit is set under
825 	 * the upib_lock, which is not released until the turnstile lock
826 	 * is acquired. Say, the upimutex is L:
827 	 *
828 	 * 1. upib_lock is held so the waiter does not have to retry L after
829 	 *    setting the wait bit: since the owner has to grab the upib_lock
830 	 *    to unlock L, it will certainly see the wait bit set.
831 	 * 2. upib_lock is not released until the turnstile lock is acquired.
832 	 *    This is the key to preventing a missed wake-up. Otherwise, the
833 	 *    owner could acquire the upib_lock, and the tc_lock, to call
834 	 *    turnstile_wakeup(). All this, before the waiter gets tc_lock
835 	 *    to sleep in turnstile_block(). turnstile_wakeup() will then not
836 	 *    find this waiter, resulting in the missed wakeup.
837 	 * 3. The upib_lock, being a kernel mutex, cannot be released while
838 	 *    holding the tc_lock (since mutex_exit() could need to acquire
839 	 *    the same tc_lock)...and so is held when calling turnstile_block().
840 	 *    The address of upib_lock is passed to turnstile_block() which
841 	 *    releases it after releasing all turnstile locks, and before going
842 	 *    to sleep in swtch().
843 	 * 4. The waiter value cannot be a count of waiters, because a waiter
844 	 *    can be interrupted. The interrupt occurs under the tc_lock, at
845 	 *    which point, the upib_lock cannot be locked, to decrement waiter
846 	 *    count. So, just treat the waiter state as a bit, not a count.
847 	 */
848 	ts = turnstile_lookup((upimutex_t *)upimutex);
849 	upimutex->upi_waiter = 1;
850 	error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
851 	    &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
852 	/*
853 	 * Hand-off implies that we wakeup holding the lock, except when:
854 	 *	- deadlock is detected
855 	 *	- lock is not recoverable
856 	 *	- we got an interrupt or timeout
857 	 * If we wake up due to an interrupt or timeout, we may
858 	 * or may not be holding the lock due to mutex hand-off.
859 	 * Use lwp_upimutex_owned() to check if we do hold the lock.
860 	 */
861 	if (error != 0) {
862 		if ((error == EINTR || error == ETIME) &&
863 		    (upimutex = lwp_upimutex_owned(lp, type))) {
864 			/*
865 			 * Unlock and return - the re-startable syscall will
866 			 * try the lock again if we got EINTR.
867 			 */
868 			(void) upi_mylist_add((upimutex_t *)upimutex);
869 			upimutex_unlock((upimutex_t *)upimutex, 0);
870 		}
871 		/*
872 		 * The only other possible error is EDEADLK.  If so, upimutex
873 		 * is valid, since its owner is deadlocked with curthread.
874 		 */
875 		ASSERT(error == EINTR || error == ETIME ||
876 		    (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
877 		ASSERT(!lwp_upimutex_owned(lp, type));
878 		goto out;
879 	}
880 	if (lwp_upimutex_owned(lp, type)) {
881 		ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
882 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
883 		upilocked = 1;
884 	}
885 	/*
886 	 * Now, need to read the user-level lp->mutex_flag to do the following:
887 	 *
888 	 * - if lock is held, check if EOWNERDEAD should be returned
889 	 * - if lock isn't held, check if ENOTRECOVERABLE should be returned
890 	 *
891 	 * Now, either lp->mutex_flag is readable or it's not. If not
892 	 * readable, the on_fault path will cause a return with EFAULT as
893 	 * it should. If it is readable, the state of the flag encodes the
894 	 * robustness state of the lock:
895 	 *
896 	 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD setting
897 	 * will influence the return code appropriately. If the upimutex is
898 	 * not locked here, this could be due to a spurious wake-up or a
899 	 * NOTRECOVERABLE event. The flag's setting can be used to distinguish
900 	 * between these two events.
901 	 */
902 	fuword16_noerr(&lp->mutex_flag, &flag);
903 	if (upilocked) {
904 		/*
905 		 * If the thread wakes up from turnstile_block with the lock
906 		 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
907 		 * since it would not have been handed-off the lock.
908 		 * So, no need to check for this case.
909 		 */
910 		if (nupinest > maxnestupimx &&
911 		    secpolicy_resource(CRED()) != 0) {
912 			upimutex_unlock((upimutex_t *)upimutex, flag);
913 			upilocked = 0;
914 			error = ENOMEM;
915 		} else if (flag & LOCK_OWNERDEAD) {
916 			error = EOWNERDEAD;
917 		}
918 	} else {
919 		/*
920 		 * Wake-up without the upimutex held. Either this is a
921 		 * spurious wake-up (due to signals, forkall(), whatever), or
922 		 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
923 		 * of the mutex flag can be used to distinguish between the
924 		 * two events.
925 		 */
926 		if (flag & LOCK_NOTRECOVERABLE) {
927 			error = ENOTRECOVERABLE;
928 		} else {
929 			/*
930 			 * Here, the flag could be set to LOCK_OWNERDEAD or
931 			 * not. In both cases, this is a spurious wakeup,
932 			 * since the upi lock is not held, but the thread
933 			 * has returned from turnstile_block().
934 			 *
935 			 * The user flag could be LOCK_OWNERDEAD if, at the
936 			 * same time as curthread having been woken up
937 			 * spuriously, the owner (say Tdead) has died, marked
938 			 * the mutex flag accordingly, and handed off the lock
939 			 * to some other waiter (say Tnew). curthread just
940 			 * happened to read the flag while Tnew has yet to deal
941 			 * with the owner-dead event.
942 			 *
943 			 * In this event, curthread should retry the lock.
944 			 * If Tnew is able to cleanup the lock, curthread
945 			 * will eventually get the lock with a zero error code,
946 			 * If Tnew is unable to cleanup, its eventual call to
947 			 * unlock the lock will result in the mutex flag being
948 			 * set to LOCK_NOTRECOVERABLE, and the wake-up of
949 			 * all waiters, including curthread, which will then
950 			 * eventually return ENOTRECOVERABLE due to the above
951 			 * check.
952 			 *
953 			 * Of course, if the user-flag is not set with
954 			 * LOCK_OWNERDEAD, retrying is the thing to do, since
955 			 * this is definitely a spurious wakeup.
956 			 */
957 			goto retry;
958 		}
959 	}
960 
961 out:
962 	no_fault();
963 	return (error);
964 }
965 
966 
967 static int
968 lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
969 {
970 	label_t ljb;
971 	int error = 0;
972 	lwpchan_t lwpchan;
973 	uint16_t flag;
974 	upib_t *upibp;
975 	volatile struct upimutex *upimutex = NULL;
976 	volatile int upilocked = 0;
977 
978 	if (on_fault(&ljb)) {
979 		if (upilocked)
980 			upimutex_unlock((upimutex_t *)upimutex, 0);
981 		error = EFAULT;
982 		goto out;
983 	}
984 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
985 	    &lwpchan, LWPCHAN_MPPOOL)) {
986 		error = EFAULT;
987 		goto out;
988 	}
989 	upibp = &UPI_CHAIN(lwpchan);
990 	mutex_enter(&upibp->upib_lock);
991 	upimutex = upi_get(upibp, &lwpchan);
992 	/*
993 	 * If the lock is not held, or the owner is not curthread, return
994 	 * error. The user-level wrapper can return this error or stall,
995 	 * depending on whether mutex is of ERRORCHECK type or not.
996 	 */
997 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
998 		mutex_exit(&upibp->upib_lock);
999 		error = EPERM;
1000 		goto out;
1001 	}
1002 	mutex_exit(&upibp->upib_lock); /* release for user memory access */
1003 	upilocked = 1;
1004 	fuword16_noerr(&lp->mutex_flag, &flag);
1005 	if (flag & LOCK_OWNERDEAD) {
1006 		/*
1007 		 * transition mutex to the LOCK_NOTRECOVERABLE state.
1008 		 */
1009 		flag &= ~LOCK_OWNERDEAD;
1010 		flag |= LOCK_NOTRECOVERABLE;
1011 		suword16_noerr(&lp->mutex_flag, flag);
1012 	}
1013 	upimutex_unlock((upimutex_t *)upimutex, flag);
1014 	upilocked = 0;
1015 out:
1016 	no_fault();
1017 	return (error);
1018 }
1019 
1020 /*
1021  * Mark user mutex state, corresponding to kernel upimutex, as LOCK_OWNERDEAD.
1022  */
1023 static int
1024 upi_dead(upimutex_t *upip)
1025 {
1026 	label_t ljb;
1027 	int error = 0;
1028 	lwp_mutex_t *lp;
1029 	uint16_t flag;
1030 
1031 	if (on_fault(&ljb)) {
1032 		error = EFAULT;
1033 		goto out;
1034 	}
1035 
1036 	lp = upip->upi_vaddr;
1037 	fuword16_noerr(&lp->mutex_flag, &flag);
1038 	flag |= LOCK_OWNERDEAD;
1039 	suword16_noerr(&lp->mutex_flag, flag);
1040 out:
1041 	no_fault();
1042 	return (error);
1043 }
1044 
1045 /*
1046  * Unlock all upimutexes held by curthread, since curthread is dying.
1047  * For each upimutex, attempt to mark its corresponding user mutex object as
1048  * dead.
1049  */
1050 void
1051 upimutex_cleanup()
1052 {
1053 	kthread_t *t = curthread;
1054 	struct upimutex *upip;
1055 
1056 	while ((upip = t->t_upimutex) != NULL) {
1057 		if (upi_dead(upip) != 0) {
1058 			/*
1059 			 * If the user object associated with this upimutex is
1060 			 * unmapped, unlock upimutex with the
1061 			 * LOCK_NOTRECOVERABLE flag, so that all waiters are
1062 			 * woken up. Since user object is unmapped, it could
1063 			 * not be marked as dead or notrecoverable.
1064 			 * The waiters will now all wake up and return
1065 			 * ENOTRECOVERABLE, since they would find that the lock
1066 			 * has not been handed-off to them.
1067 			 * See lwp_upimutex_lock().
1068 			 */
1069 			upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
1070 		} else {
1071 			/*
1072 			 * The user object has been updated as dead.
1073 			 * Unlock the upimutex: if no waiters, upip kmem will
1074 			 * be freed. If there is a waiter, the lock will be
1075 			 * handed off. If exit() is in progress, each existing
1076 			 * waiter will successively get the lock, as owners
1077 			 * die, and each new owner will call this routine as
1078 			 * it dies. The last owner will free kmem, since
1079 			 * it will find the upimutex has no waiters. So,
1080 			 * eventually, the kmem is guaranteed to be freed.
1081 			 */
1082 			upimutex_unlock(upip, 0);
1083 		}
1084 		/*
1085 		 * Note that the call to upimutex_unlock() above will delete
1086 		 * upimutex from the t_upimutexes chain. And so the
1087 		 * while loop will eventually terminate.
1088 		 */
1089 	}
1090 }
1091 
1092 int
1093 lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp)
1094 {
1095 	kthread_t *t = curthread;
1096 	klwp_t *lwp = ttolwp(t);
1097 	proc_t *p = ttoproc(t);
1098 	lwp_timer_t lwpt;
1099 	caddr_t timedwait;
1100 	int error = 0;
1101 	int time_error;
1102 	clock_t tim = -1;
1103 	uchar_t waiters;
1104 	volatile int locked = 0;
1105 	volatile int watched = 0;
1106 	label_t ljb;
1107 	volatile uint8_t type = 0;
1108 	lwpchan_t lwpchan;
1109 	sleepq_head_t *sqh;
1110 	static int iswanted();
1111 	uint16_t flag;
1112 	int imm_timeout = 0;
1113 
1114 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1115 		return (set_errno(EFAULT));
1116 
1117 	timedwait = (caddr_t)tsp;
1118 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
1119 	    lwpt.lwpt_imm_timeout) {
1120 		imm_timeout = 1;
1121 		timedwait = NULL;
1122 	}
1123 
1124 	/*
1125 	 * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
1126 	 * this micro state is really a run state. If the thread indeed blocks,
1127 	 * this state becomes valid. If not, the state is converted back to
1128 	 * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
1129 	 * when blocking.
1130 	 */
1131 	(void) new_mstate(t, LMS_USER_LOCK);
1132 	if (on_fault(&ljb)) {
1133 		if (locked)
1134 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1135 		error = EFAULT;
1136 		goto out;
1137 	}
1138 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1139 	if (UPIMUTEX(type)) {
1140 		no_fault();
1141 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
1142 		if ((error == 0 || error == EOWNERDEAD) &&
1143 		    (type & USYNC_PROCESS))
1144 			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
1145 		if (tsp && !time_error)	/* copyout the residual time left */
1146 			error = lwp_timer_copyout(&lwpt, error);
1147 		if (error)
1148 			return (set_errno(error));
1149 		return (0);
1150 	}
1151 	/*
1152 	 * Force Copy-on-write fault if lwp_mutex_t object is
1153 	 * defined to be MAP_PRIVATE and it was initialized to
1154 	 * USYNC_PROCESS.
1155 	 */
1156 	suword8_noerr(&lp->mutex_type, type);
1157 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1158 	    &lwpchan, LWPCHAN_MPPOOL)) {
1159 		error = EFAULT;
1160 		goto out;
1161 	}
1162 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1163 	locked = 1;
1164 	fuword8_noerr(&lp->mutex_waiters, &waiters);
1165 	suword8_noerr(&lp->mutex_waiters, 1);
1166 	if (type & USYNC_PROCESS_ROBUST) {
1167 		fuword16_noerr(&lp->mutex_flag, &flag);
1168 		if (flag & LOCK_NOTRECOVERABLE) {
1169 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1170 			error = ENOTRECOVERABLE;
1171 			goto out;
1172 		}
1173 	}
1174 
1175 	/*
1176 	 * If watchpoints are set, they need to be restored, since
1177 	 * atomic accesses of memory such as the call to ulock_try()
1178 	 * below cannot be watched.
1179 	 */
1180 
1181 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1182 
1183 	while (!ulock_try(&lp->mutex_lockw)) {
1184 		if (time_error) {
1185 			/*
1186 			 * The SUSV3 Posix spec is very clear that we
1187 			 * should get no error from validating the
1188 			 * timer until we would actually sleep.
1189 			 */
1190 			error = time_error;
1191 			break;
1192 		}
1193 
1194 		if (watched) {
1195 			watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1196 			watched = 0;
1197 		}
1198 
1199 		/*
1200 		 * Put the lwp in an orderly state for debugging.
1201 		 */
1202 		prstop(PR_REQUESTED, 0);
1203 		if (timedwait) {
1204 			/*
1205 			 * If we successfully queue the timeout,
1206 			 * then don't drop t_delay_lock until
1207 			 * we are on the sleep queue (below).
1208 			 */
1209 			mutex_enter(&t->t_delay_lock);
1210 			if (lwp_timer_enqueue(&lwpt) != 0) {
1211 				mutex_exit(&t->t_delay_lock);
1212 				imm_timeout = 1;
1213 				timedwait = NULL;
1214 			}
1215 		}
1216 		lwp_block(&lwpchan);
1217 		/*
1218 		 * Nothing should happen to cause the lwp to go to
1219 		 * sleep again until after it returns from swtch().
1220 		 */
1221 		if (timedwait)
1222 			mutex_exit(&t->t_delay_lock);
1223 		locked = 0;
1224 		lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1225 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
1226 			setrun(t);
1227 		swtch();
1228 		t->t_flag &= ~T_WAKEABLE;
1229 		if (timedwait)
1230 			tim = lwp_timer_dequeue(&lwpt);
1231 		setallwatch();
1232 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
1233 			error = EINTR;
1234 		else if (imm_timeout || (timedwait && tim == -1))
1235 			error = ETIME;
1236 		if (error) {
1237 			lwp->lwp_asleep = 0;
1238 			lwp->lwp_sysabort = 0;
1239 			watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1240 			    S_WRITE);
1241 
1242 			/*
1243 			 * Need to re-compute waiters bit. The waiters field in
1244 			 * the lock is not reliable. Either of two things could
1245 			 * have occurred: no lwp may have called lwp_release()
1246 			 * for me but I have woken up due to a signal or
1247 			 * timeout.  In this case, the waiter bit is incorrect
1248 			 * since it is still set to 1, set above.
1249 			 * OR an lwp_release() did occur for some other lwp on
1250 			 * the same lwpchan. In this case, the waiter bit is
1251 			 * correct.  But which event occurred, one can't tell.
1252 			 * So, recompute.
1253 			 */
1254 			lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1255 			locked = 1;
1256 			sqh = lwpsqhash(&lwpchan);
1257 			disp_lock_enter(&sqh->sq_lock);
1258 			waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
1259 			disp_lock_exit(&sqh->sq_lock);
1260 			break;
1261 		}
1262 		lwp->lwp_asleep = 0;
1263 		watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1264 		    S_WRITE);
1265 		lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1266 		locked = 1;
1267 		fuword8_noerr(&lp->mutex_waiters, &waiters);
1268 		suword8_noerr(&lp->mutex_waiters, 1);
1269 		if (type & USYNC_PROCESS_ROBUST) {
1270 			fuword16_noerr(&lp->mutex_flag, &flag);
1271 			if (flag & LOCK_NOTRECOVERABLE) {
1272 				error = ENOTRECOVERABLE;
1273 				break;
1274 			}
1275 		}
1276 	}
1277 
1278 	if (t->t_mstate == LMS_USER_LOCK)
1279 		(void) new_mstate(t, LMS_SYSTEM);
1280 
1281 	if (!error && (type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST))) {
1282 		suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
1283 		if (type & USYNC_PROCESS_ROBUST) {
1284 			fuword16_noerr(&lp->mutex_flag, &flag);
1285 			if (flag & LOCK_OWNERDEAD)
1286 				error = EOWNERDEAD;
1287 			else if (flag & LOCK_UNMAPPED)
1288 				error = ELOCKUNMAPPED;
1289 		}
1290 	}
1291 	suword8_noerr(&lp->mutex_waiters, waiters);
1292 	locked = 0;
1293 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1294 out:
1295 	no_fault();
1296 	if (watched)
1297 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1298 	if (tsp && !time_error)		/* copyout the residual time left */
1299 		error = lwp_timer_copyout(&lwpt, error);
1300 	if (error)
1301 		return (set_errno(error));
1302 	return (0);
1303 }
1304 
1305 /*
1306  * Obsolete lwp_mutex_lock() interface, no longer called from libc.
1307  * libc now calls lwp_mutex_timedlock(lp, NULL).
1308  * This system call trap continues to exist solely for the benefit
1309  * of old statically-linked binaries from Solaris 9 and before.
1310  * It should be removed from the system when we no longer care
1311  * about such applications.
1312  */
1313 int
1314 lwp_mutex_lock(lwp_mutex_t *lp)
1315 {
1316 	return (lwp_mutex_timedlock(lp, NULL));
1317 }
1318 
1319 static int
1320 iswanted(kthread_t *t, lwpchan_t *lwpchan)
1321 {
1322 	/*
1323 	 * The caller holds the dispatcher lock on the sleep queue.
1324 	 */
1325 	while (t != NULL) {
1326 		if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1327 		    t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1328 			return (1);
1329 		t = t->t_link;
1330 	}
1331 	return (0);
1332 }
1333 
1334 /*
1335  * Return the highest priority thread sleeping on this lwpchan.
1336  */
1337 static kthread_t *
1338 lwp_queue_waiter(lwpchan_t *lwpchan)
1339 {
1340 	sleepq_head_t *sqh;
1341 	kthread_t *tp;
1342 
1343 	sqh = lwpsqhash(lwpchan);
1344 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1345 	for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
1346 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1347 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1348 			break;
1349 	}
1350 	disp_lock_exit(&sqh->sq_lock);
1351 	return (tp);
1352 }
1353 
1354 static int
1355 lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
1356 {
1357 	sleepq_head_t *sqh;
1358 	kthread_t *tp;
1359 	kthread_t **tpp;
1360 
1361 	sqh = lwpsqhash(lwpchan);
1362 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1363 	tpp = &sqh->sq_queue.sq_first;
1364 	while ((tp = *tpp) != NULL) {
1365 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1366 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1367 			/*
1368 			 * The following is typically false. It could be true
1369 			 * only if lwp_release() is called from
1370 			 * lwp_mutex_wakeup() after reading the waiters field
1371 			 * from memory in which the lwp lock used to be, but has
1372 			 * since been re-used to hold a lwp cv or lwp semaphore.
1373 			 * The thread "tp" found to match the lwp lock's wchan
1374 			 * is actually sleeping for the cv or semaphore which
1375 			 * now has the same wchan. In this case, lwp_release()
1376 			 * should return failure.
1377 			 */
1378 			if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
1379 				ASSERT(sync_type == 0);
1380 				/*
1381 				 * assert that this can happen only for mutexes
1382 				 * i.e. sync_type == 0, for correctly written
1383 				 * user programs.
1384 				 */
1385 				disp_lock_exit(&sqh->sq_lock);
1386 				return (0);
1387 			}
1388 			*waiters = iswanted(tp->t_link, lwpchan);
1389 			sleepq_unlink(tpp, tp);
1390 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1391 			tp->t_wchan0 = NULL;
1392 			tp->t_wchan = NULL;
1393 			tp->t_sobj_ops = NULL;
1394 			tp->t_release = 1;
1395 			THREAD_TRANSITION(tp);	/* drops sleepq lock */
1396 			CL_WAKEUP(tp);
1397 			thread_unlock(tp);	/* drop run queue lock */
1398 			return (1);
1399 		}
1400 		tpp = &tp->t_link;
1401 	}
1402 	*waiters = 0;
1403 	disp_lock_exit(&sqh->sq_lock);
1404 	return (0);
1405 }
1406 
1407 static void
1408 lwp_release_all(lwpchan_t *lwpchan)
1409 {
1410 	sleepq_head_t	*sqh;
1411 	kthread_t *tp;
1412 	kthread_t **tpp;
1413 
1414 	sqh = lwpsqhash(lwpchan);
1415 	disp_lock_enter(&sqh->sq_lock);		/* lock sleep q queue */
1416 	tpp = &sqh->sq_queue.sq_first;
1417 	while ((tp = *tpp) != NULL) {
1418 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1419 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1420 			sleepq_unlink(tpp, tp);
1421 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1422 			tp->t_wchan0 = NULL;
1423 			tp->t_wchan = NULL;
1424 			tp->t_sobj_ops = NULL;
1425 			CL_WAKEUP(tp);
1426 			thread_unlock_high(tp);	/* release run queue lock */
1427 		} else {
1428 			tpp = &tp->t_link;
1429 		}
1430 	}
1431 	disp_lock_exit(&sqh->sq_lock);		/* drop sleep q lock */
1432 }
1433 
1434 /*
1435  * unblock a lwp that is trying to acquire this mutex. the blocked
1436  * lwp resumes and retries to acquire the lock.
1437  */
1438 int
1439 lwp_mutex_wakeup(lwp_mutex_t *lp)
1440 {
1441 	proc_t *p = ttoproc(curthread);
1442 	lwpchan_t lwpchan;
1443 	uchar_t waiters;
1444 	volatile int locked = 0;
1445 	volatile int watched = 0;
1446 	volatile uint8_t type = 0;
1447 	label_t ljb;
1448 	int error = 0;
1449 
1450 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1451 		return (set_errno(EFAULT));
1452 
1453 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1454 
1455 	if (on_fault(&ljb)) {
1456 		if (locked)
1457 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1458 		error = EFAULT;
1459 		goto out;
1460 	}
1461 	/*
1462 	 * Force Copy-on-write fault if lwp_mutex_t object is
1463 	 * defined to be MAP_PRIVATE, and type is USYNC_PROCESS
1464 	 */
1465 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1466 	suword8_noerr(&lp->mutex_type, type);
1467 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1468 	    &lwpchan, LWPCHAN_MPPOOL)) {
1469 		error = EFAULT;
1470 		goto out;
1471 	}
1472 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1473 	locked = 1;
1474 	/*
1475 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
1476 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
1477 	 * may fail.  If it fails, do not write into the waiter bit.
1478 	 * The call to lwp_release() might fail due to one of three reasons:
1479 	 *
1480 	 * 	1. due to the thread which set the waiter bit not actually
1481 	 *	   sleeping since it got the lock on the re-try. The waiter
1482 	 *	   bit will then be correctly updated by that thread. This
1483 	 *	   window may be closed by reading the wait bit again here
1484 	 *	   and not calling lwp_release() at all if it is zero.
1485 	 *	2. the thread which set the waiter bit and went to sleep
1486 	 *	   was woken up by a signal. This time, the waiter recomputes
1487 	 *	   the wait bit in the return with EINTR code.
1488 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
1489 	 *	   memory that has been re-used after the lock was dropped.
1490 	 *	   In this case, writing into the waiter bit would cause data
1491 	 *	   corruption.
1492 	 */
1493 	if (lwp_release(&lwpchan, &waiters, 0) == 1) {
1494 		suword8_noerr(&lp->mutex_waiters, waiters);
1495 	}
1496 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1497 out:
1498 	no_fault();
1499 	if (watched)
1500 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1501 	if (error)
1502 		return (set_errno(error));
1503 	return (0);
1504 }
1505 
1506 /*
1507  * lwp_cond_wait() has four arguments, a pointer to a condition variable,
1508  * a pointer to a mutex, a pointer to a timespec for a timed wait and
1509  * a flag telling the kernel whether or not to honor the kernel/user
1510  * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
1511  * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
1512  * lwpchan, returned by get_lwpchan().  If the timespec pointer is non-NULL,
1513  * it is used an an in/out parameter.  On entry, it contains the relative
1514  * time until timeout.  On exit, we copyout the residual time left to it.
1515  */
1516 int
1517 lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
1518 {
1519 	kthread_t *t = curthread;
1520 	klwp_t *lwp = ttolwp(t);
1521 	proc_t *p = ttoproc(t);
1522 	lwp_timer_t lwpt;
1523 	lwpchan_t cv_lwpchan;
1524 	lwpchan_t m_lwpchan;
1525 	caddr_t timedwait;
1526 	volatile uint16_t type = 0;
1527 	volatile uint8_t mtype = 0;
1528 	uchar_t waiters;
1529 	volatile int error;
1530 	clock_t tim = -1;
1531 	volatile int locked = 0;
1532 	volatile int m_locked = 0;
1533 	volatile int cvwatched = 0;
1534 	volatile int mpwatched = 0;
1535 	label_t ljb;
1536 	volatile int no_lwpchan = 1;
1537 	int imm_timeout = 0;
1538 	int imm_unpark = 0;
1539 
1540 	if ((caddr_t)cv >= p->p_as->a_userlimit ||
1541 	    (caddr_t)mp >= p->p_as->a_userlimit)
1542 		return (set_errno(EFAULT));
1543 
1544 	timedwait = (caddr_t)tsp;
1545 	if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
1546 		return (set_errno(error));
1547 	if (lwpt.lwpt_imm_timeout) {
1548 		imm_timeout = 1;
1549 		timedwait = NULL;
1550 	}
1551 
1552 	(void) new_mstate(t, LMS_USER_LOCK);
1553 
1554 	if (on_fault(&ljb)) {
1555 		if (no_lwpchan) {
1556 			error = EFAULT;
1557 			goto out;
1558 		}
1559 		if (m_locked) {
1560 			m_locked = 0;
1561 			lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1562 		}
1563 		if (locked) {
1564 			locked = 0;
1565 			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1566 		}
1567 		/*
1568 		 * set up another on_fault() for a possible fault
1569 		 * on the user lock accessed at "efault"
1570 		 */
1571 		if (on_fault(&ljb)) {
1572 			if (m_locked) {
1573 				m_locked = 0;
1574 				lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1575 			}
1576 			goto out;
1577 		}
1578 		error = EFAULT;
1579 		goto efault;
1580 	}
1581 
1582 	/*
1583 	 * Force Copy-on-write fault if lwp_cond_t and lwp_mutex_t
1584 	 * objects are defined to be MAP_PRIVATE, and are USYNC_PROCESS
1585 	 */
1586 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
1587 	if (UPIMUTEX(mtype) == 0) {
1588 		suword8_noerr(&mp->mutex_type, mtype);
1589 		/* convert user level mutex, "mp", to a unique lwpchan */
1590 		/* check if mtype is ok to use below, instead of type from cv */
1591 		if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
1592 		    &m_lwpchan, LWPCHAN_MPPOOL)) {
1593 			error = EFAULT;
1594 			goto out;
1595 		}
1596 	}
1597 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1598 	suword16_noerr(&cv->cond_type, type);
1599 	/* convert user level condition variable, "cv", to a unique lwpchan */
1600 	if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
1601 	    &cv_lwpchan, LWPCHAN_CVPOOL)) {
1602 		error = EFAULT;
1603 		goto out;
1604 	}
1605 	no_lwpchan = 0;
1606 	cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1607 	if (UPIMUTEX(mtype) == 0)
1608 		mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
1609 		    S_WRITE);
1610 
1611 	/*
1612 	 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
1613 	 * with respect to a possible wakeup which is a result of either
1614 	 * an lwp_cond_signal() or an lwp_cond_broadcast().
1615 	 *
1616 	 * What's misleading, is that the lwp is put to sleep after the
1617 	 * condition variable's mutex is released.  This is OK as long as
1618 	 * the release operation is also done while holding lwpchan_lock.
1619 	 * The lwp is then put to sleep when the possibility of pagefaulting
1620 	 * or sleeping is completely eliminated.
1621 	 */
1622 	lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
1623 	locked = 1;
1624 	if (UPIMUTEX(mtype) == 0) {
1625 		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1626 		m_locked = 1;
1627 		suword8_noerr(&cv->cond_waiters_kernel, 1);
1628 		/*
1629 		 * unlock the condition variable's mutex. (pagefaults are
1630 		 * possible here.)
1631 		 */
1632 		ulock_clear(&mp->mutex_lockw);
1633 		fuword8_noerr(&mp->mutex_waiters, &waiters);
1634 		if (waiters != 0) {
1635 			/*
1636 			 * Given the locking of lwpchan_lock around the release
1637 			 * of the mutex and checking for waiters, the following
1638 			 * call to lwp_release() can fail ONLY if the lock
1639 			 * acquirer is interrupted after setting the waiter bit,
1640 			 * calling lwp_block() and releasing lwpchan_lock.
1641 			 * In this case, it could get pulled off the lwp sleep
1642 			 * q (via setrun()) before the following call to
1643 			 * lwp_release() occurs. In this case, the lock
1644 			 * requestor will update the waiter bit correctly by
1645 			 * re-evaluating it.
1646 			 */
1647 			if (lwp_release(&m_lwpchan, &waiters, 0) > 0)
1648 				suword8_noerr(&mp->mutex_waiters, waiters);
1649 		}
1650 		m_locked = 0;
1651 		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1652 	} else {
1653 		suword8_noerr(&cv->cond_waiters_kernel, 1);
1654 		error = lwp_upimutex_unlock(mp, mtype);
1655 		if (error) {	/* if the upimutex unlock failed */
1656 			locked = 0;
1657 			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1658 			goto out;
1659 		}
1660 	}
1661 	no_fault();
1662 
1663 	if (mpwatched) {
1664 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1665 		mpwatched = 0;
1666 	}
1667 	if (cvwatched) {
1668 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1669 		cvwatched = 0;
1670 	}
1671 
1672 	/*
1673 	 * Put the lwp in an orderly state for debugging.
1674 	 */
1675 	prstop(PR_REQUESTED, 0);
1676 	if (check_park && (!schedctl_is_park() || t->t_unpark)) {
1677 		/*
1678 		 * We received a signal at user-level before calling here
1679 		 * or another thread wants us to return immediately
1680 		 * with EINTR.  See lwp_unpark().
1681 		 */
1682 		imm_unpark = 1;
1683 		t->t_unpark = 0;
1684 		timedwait = NULL;
1685 	} else if (timedwait) {
1686 		/*
1687 		 * If we successfully queue the timeout,
1688 		 * then don't drop t_delay_lock until
1689 		 * we are on the sleep queue (below).
1690 		 */
1691 		mutex_enter(&t->t_delay_lock);
1692 		if (lwp_timer_enqueue(&lwpt) != 0) {
1693 			mutex_exit(&t->t_delay_lock);
1694 			imm_timeout = 1;
1695 			timedwait = NULL;
1696 		}
1697 	}
1698 	t->t_flag |= T_WAITCVSEM;
1699 	lwp_block(&cv_lwpchan);
1700 	/*
1701 	 * Nothing should happen to cause the lwp to go to sleep
1702 	 * until after it returns from swtch().
1703 	 */
1704 	if (timedwait)
1705 		mutex_exit(&t->t_delay_lock);
1706 	locked = 0;
1707 	lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1708 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
1709 	    (imm_timeout | imm_unpark))
1710 		setrun(t);
1711 	swtch();
1712 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
1713 	if (timedwait)
1714 		tim = lwp_timer_dequeue(&lwpt);
1715 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
1716 	    MUSTRETURN(p, t) || imm_unpark)
1717 		error = EINTR;
1718 	else if (imm_timeout || (timedwait && tim == -1))
1719 		error = ETIME;
1720 	lwp->lwp_asleep = 0;
1721 	lwp->lwp_sysabort = 0;
1722 	setallwatch();
1723 
1724 	if (t->t_mstate == LMS_USER_LOCK)
1725 		(void) new_mstate(t, LMS_SYSTEM);
1726 
1727 	if (tsp && check_park)		/* copyout the residual time left */
1728 		error = lwp_timer_copyout(&lwpt, error);
1729 
1730 	/* the mutex is reacquired by the caller on return to user level */
1731 	if (error) {
1732 		/*
1733 		 * If we were concurrently lwp_cond_signal()d and we
1734 		 * received a UNIX signal or got a timeout, then perform
1735 		 * another lwp_cond_signal() to avoid consuming the wakeup.
1736 		 */
1737 		if (t->t_release)
1738 			(void) lwp_cond_signal(cv);
1739 		return (set_errno(error));
1740 	}
1741 	return (0);
1742 
1743 efault:
1744 	/*
1745 	 * make sure that the user level lock is dropped before
1746 	 * returning to caller, since the caller always re-acquires it.
1747 	 */
1748 	if (UPIMUTEX(mtype) == 0) {
1749 		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1750 		m_locked = 1;
1751 		ulock_clear(&mp->mutex_lockw);
1752 		fuword8_noerr(&mp->mutex_waiters, &waiters);
1753 		if (waiters != 0) {
1754 			/*
1755 			 * See comment above on lock clearing and lwp_release()
1756 			 * success/failure.
1757 			 */
1758 			if (lwp_release(&m_lwpchan, &waiters, 0) > 0)
1759 				suword8_noerr(&mp->mutex_waiters, waiters);
1760 		}
1761 		m_locked = 0;
1762 		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1763 	} else {
1764 		(void) lwp_upimutex_unlock(mp, mtype);
1765 	}
1766 out:
1767 	no_fault();
1768 	if (mpwatched)
1769 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1770 	if (cvwatched)
1771 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1772 	if (t->t_mstate == LMS_USER_LOCK)
1773 		(void) new_mstate(t, LMS_SYSTEM);
1774 	return (set_errno(error));
1775 }
1776 
1777 /*
1778  * wakeup one lwp that's blocked on this condition variable.
1779  */
1780 int
1781 lwp_cond_signal(lwp_cond_t *cv)
1782 {
1783 	proc_t *p = ttoproc(curthread);
1784 	lwpchan_t lwpchan;
1785 	uchar_t waiters;
1786 	volatile uint16_t type = 0;
1787 	volatile int locked = 0;
1788 	volatile int watched = 0;
1789 	label_t ljb;
1790 	int error = 0;
1791 
1792 	if ((caddr_t)cv >= p->p_as->a_userlimit)
1793 		return (set_errno(EFAULT));
1794 
1795 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1796 
1797 	if (on_fault(&ljb)) {
1798 		if (locked)
1799 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1800 		error = EFAULT;
1801 		goto out;
1802 	}
1803 	/*
1804 	 * Force Copy-on-write fault if lwp_cond_t object is
1805 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1806 	 */
1807 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1808 	suword16_noerr(&cv->cond_type, type);
1809 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1810 	    &lwpchan, LWPCHAN_CVPOOL)) {
1811 		error = EFAULT;
1812 		goto out;
1813 	}
1814 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1815 	locked = 1;
1816 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1817 	if (waiters != 0) {
1818 		/*
1819 		 * The following call to lwp_release() might fail but it is
1820 		 * OK to write into the waiters bit below, since the memory
1821 		 * could not have been re-used or unmapped (for correctly
1822 		 * written user programs) as in the case of lwp_mutex_wakeup().
1823 		 * For an incorrect program, we should not care about data
1824 		 * corruption since this is just one instance of other places
1825 		 * where corruption can occur for such a program. Of course
1826 		 * if the memory is unmapped, normal fault recovery occurs.
1827 		 */
1828 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1829 		suword8_noerr(&cv->cond_waiters_kernel, waiters);
1830 	}
1831 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1832 out:
1833 	no_fault();
1834 	if (watched)
1835 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1836 	if (error)
1837 		return (set_errno(error));
1838 	return (0);
1839 }
1840 
1841 /*
1842  * wakeup every lwp that's blocked on this condition variable.
1843  */
1844 int
1845 lwp_cond_broadcast(lwp_cond_t *cv)
1846 {
1847 	proc_t *p = ttoproc(curthread);
1848 	lwpchan_t lwpchan;
1849 	volatile uint16_t type = 0;
1850 	volatile int locked = 0;
1851 	volatile int watched = 0;
1852 	label_t ljb;
1853 	uchar_t waiters;
1854 	int error = 0;
1855 
1856 	if ((caddr_t)cv >= p->p_as->a_userlimit)
1857 		return (set_errno(EFAULT));
1858 
1859 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1860 
1861 	if (on_fault(&ljb)) {
1862 		if (locked)
1863 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1864 		error = EFAULT;
1865 		goto out;
1866 	}
1867 	/*
1868 	 * Force Copy-on-write fault if lwp_cond_t object is
1869 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1870 	 */
1871 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1872 	suword16_noerr(&cv->cond_type, type);
1873 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1874 	    &lwpchan, LWPCHAN_CVPOOL)) {
1875 		error = EFAULT;
1876 		goto out;
1877 	}
1878 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1879 	locked = 1;
1880 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1881 	if (waiters != 0) {
1882 		lwp_release_all(&lwpchan);
1883 		suword8_noerr(&cv->cond_waiters_kernel, 0);
1884 	}
1885 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1886 out:
1887 	no_fault();
1888 	if (watched)
1889 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1890 	if (error)
1891 		return (set_errno(error));
1892 	return (0);
1893 }
1894 
1895 int
1896 lwp_sema_trywait(lwp_sema_t *sp)
1897 {
1898 	kthread_t *t = curthread;
1899 	proc_t *p = ttoproc(t);
1900 	label_t ljb;
1901 	volatile int locked = 0;
1902 	volatile int watched = 0;
1903 	volatile uint16_t type = 0;
1904 	int count;
1905 	lwpchan_t lwpchan;
1906 	uchar_t waiters;
1907 	int error = 0;
1908 
1909 	if ((caddr_t)sp >= p->p_as->a_userlimit)
1910 		return (set_errno(EFAULT));
1911 
1912 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1913 
1914 	if (on_fault(&ljb)) {
1915 		if (locked)
1916 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1917 		error = EFAULT;
1918 		goto out;
1919 	}
1920 	/*
1921 	 * Force Copy-on-write fault if lwp_sema_t object is
1922 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1923 	 */
1924 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
1925 	suword16_noerr((void *)&sp->sema_type, type);
1926 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
1927 	    &lwpchan, LWPCHAN_CVPOOL)) {
1928 		error = EFAULT;
1929 		goto out;
1930 	}
1931 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1932 	locked = 1;
1933 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
1934 	if (count == 0)
1935 		error = EBUSY;
1936 	else
1937 		suword32_noerr((void *)&sp->sema_count, --count);
1938 	if (count != 0) {
1939 		fuword8_noerr(&sp->sema_waiters, &waiters);
1940 		if (waiters != 0) {
1941 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1942 			suword8_noerr(&sp->sema_waiters, waiters);
1943 		}
1944 	}
1945 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1946 out:
1947 	no_fault();
1948 	if (watched)
1949 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1950 	if (error)
1951 		return (set_errno(error));
1952 	return (0);
1953 }
1954 
1955 /*
1956  * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
 *
 * Wait on the user-level semaphore 'sp' until its count is non-zero and
 * then decrement it.  'tsp' is passed to lwp_timer_copyin() to set up an
 * optional timeout.
 *
 * Returns 0 on success.  Otherwise returns set_errno() of:
 *	EFAULT	a user address faulted or get_lwpchan() failed
 *	EINTR	signal, lwp_sysabort, MUSTRETURN() or an immediate unpark
 *	ETIME	the timeout expired (or was already expired on entry)
 *	or the error from lwp_timer_copyin(), which is reported only once
 *	the caller would actually have had to sleep.
 * When 'tsp' and 'check_park' are both set and the timespec was copied in
 * successfully, the final error is filtered through lwp_timer_copyout().
1957  */
1958 int
1959 lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
1960 {
1961 	kthread_t *t = curthread;
1962 	klwp_t *lwp = ttolwp(t);
1963 	proc_t *p = ttoproc(t);
1964 	lwp_timer_t lwpt;
1965 	caddr_t timedwait;
1966 	clock_t tim = -1;
1967 	label_t ljb;
1968 	volatile int locked = 0;
1969 	volatile int watched = 0;
1970 	volatile uint16_t type = 0;
1971 	int count;
1972 	lwpchan_t lwpchan;
1973 	uchar_t waiters;
1974 	int error = 0;
1975 	int time_error;
1976 	int imm_timeout = 0;
1977 	int imm_unpark = 0;
1978 
1979 	if ((caddr_t)sp >= p->p_as->a_userlimit)
1980 		return (set_errno(EFAULT));
1981 
	/*
	 * 'timedwait' stays non-NULL only while a timeout still has to be
	 * queued/dequeued; it is cleared for an immediate timeout, an
	 * immediate unpark, or a failed lwp_timer_enqueue().
	 */
1982 	timedwait = (caddr_t)tsp;
1983 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
1984 	    lwpt.lwpt_imm_timeout) {
1985 		imm_timeout = 1;
1986 		timedwait = NULL;
1987 	}
1988 
1989 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1990 
	/* Fault handler for the *_noerr() user accesses below. */
1991 	if (on_fault(&ljb)) {
1992 		if (locked)
1993 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1994 		error = EFAULT;
1995 		goto out;
1996 	}
1997 	/*
1998 	 * Force Copy-on-write fault if lwp_sema_t object is
1999 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
2000 	 */
2001 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
2002 	suword16_noerr((void *)&sp->sema_type, type);
2003 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
2004 	    &lwpchan, LWPCHAN_CVPOOL)) {
2005 		error = EFAULT;
2006 		goto out;
2007 	}
2008 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2009 	locked = 1;
2010 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
	/* Block until the count is non-zero or an error ends the wait. */
2011 	while (error == 0 && count == 0) {
2012 		if (time_error) {
2013 			/*
2014 			 * The SUSV3 Posix spec is very clear that we
2015 			 * should get no error from validating the
2016 			 * timer until we would actually sleep.
2017 			 */
2018 			error = time_error;
2019 			break;
2020 		}
		/* Record in the user-level object that a waiter exists. */
2021 		suword8_noerr(&sp->sema_waiters, 1);
2022 		if (watched)
2023 			watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2024 		/*
2025 		 * Put the lwp in an orderly state for debugging.
2026 		 */
2027 		prstop(PR_REQUESTED, 0);
2028 		if (check_park && (!schedctl_is_park() || t->t_unpark)) {
2029 			/*
2030 			 * We received a signal at user-level before calling
2031 			 * here or another thread wants us to return
2032 			 * immediately with EINTR.  See lwp_unpark().
2033 			 */
2034 			imm_unpark = 1;
2035 			t->t_unpark = 0;
2036 			timedwait = NULL;
2037 		} else if (timedwait) {
2038 			/*
2039 			 * If we successfully queue the timeout,
2040 			 * then don't drop t_delay_lock until
2041 			 * we are on the sleep queue (below).
2042 			 */
2043 			mutex_enter(&t->t_delay_lock);
2044 			if (lwp_timer_enqueue(&lwpt) != 0) {
2045 				mutex_exit(&t->t_delay_lock);
2046 				imm_timeout = 1;
2047 				timedwait = NULL;
2048 			}
2049 		}
2050 		t->t_flag |= T_WAITCVSEM;
2051 		lwp_block(&lwpchan);
2052 		/*
2053 		 * Nothing should happen to cause the lwp to sleep
2054 		 * again until after it returns from swtch().
2055 		 */
2056 		if (timedwait)
2057 			mutex_exit(&t->t_delay_lock);
2058 		locked = 0;
2059 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		/*
		 * A pending signal, a must-return condition, or an
		 * immediate timeout/unpark: make ourselves runnable
		 * before calling swtch().
		 */
2060 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
2061 		    (imm_timeout | imm_unpark))
2062 			setrun(t);
2063 		swtch();
		/* Back from swtch(): work out why the wait ended. */
2064 		t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2065 		if (timedwait)
2066 			tim = lwp_timer_dequeue(&lwpt);
2067 		setallwatch();
2068 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
2069 		    MUSTRETURN(p, t) || imm_unpark)
2070 			error = EINTR;
2071 		else if (imm_timeout || (timedwait && tim == -1))
2072 			error = ETIME;
2073 		lwp->lwp_asleep = 0;
2074 		lwp->lwp_sysabort = 0;
		/* Re-take the watchpoint bypass and channel lock, then re-read. */
2075 		watched = watch_disable_addr((caddr_t)sp,
2076 		    sizeof (*sp), S_WRITE);
2077 		lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2078 		locked = 1;
2079 		fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2080 	}
	/* The loop only exits with error == 0 when count != 0: take a unit. */
2081 	if (error == 0)
2082 		suword32_noerr((void *)&sp->sema_count, --count);
	/*
	 * The count is still non-zero (after our decrement, or because we
	 * are leaving with an error): call lwp_release() for a T_WAITCVSEM
	 * waiter and store back the waiters value it reports.
	 */
2083 	if (count != 0) {
2084 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2085 		suword8_noerr(&sp->sema_waiters, waiters);
2086 	}
2087 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2088 out:
2089 	no_fault();
2090 	if (watched)
2091 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
	/* Only when a timespec was supplied, copied in cleanly, and check_park is set. */
2092 	if (tsp && check_park && !time_error)
2093 		error = lwp_timer_copyout(&lwpt, error);
2094 	if (error)
2095 		return (set_errno(error));
2096 	return (0);
2097 }
2098 
2099 /*
2100  * Obsolete lwp_sema_wait() interface, no longer called from libc.
2101  * libc now calls lwp_sema_timedwait().
2102  * This system call trap exists solely for the benefit of old
2103  * statically linked applications from Solaris 9 and before.
2104  * It should be removed when we no longer care about such applications.
2105  */
2106 int
2107 lwp_sema_wait(lwp_sema_t *sp)
2108 {
2109 	return (lwp_sema_timedwait(sp, NULL, 0));
2110 }
2111 
2112 int
2113 lwp_sema_post(lwp_sema_t *sp)
2114 {
2115 	proc_t *p = ttoproc(curthread);
2116 	label_t ljb;
2117 	volatile int locked = 0;
2118 	volatile int watched = 0;
2119 	volatile uint16_t type = 0;
2120 	int count;
2121 	lwpchan_t lwpchan;
2122 	uchar_t waiters;
2123 	int error = 0;
2124 
2125 	if ((caddr_t)sp >= p->p_as->a_userlimit)
2126 		return (set_errno(EFAULT));
2127 
2128 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2129 
2130 	if (on_fault(&ljb)) {
2131 		if (locked)
2132 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2133 		error = EFAULT;
2134 		goto out;
2135 	}
2136 	/*
2137 	 * Force Copy-on-write fault if lwp_sema_t object is
2138 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
2139 	 */
2140 	fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
2141 	suword16_noerr(&sp->sema_type, type);
2142 	if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
2143 	    &lwpchan, LWPCHAN_CVPOOL)) {
2144 		error = EFAULT;
2145 		goto out;
2146 	}
2147 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2148 	locked = 1;
2149 	fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
2150 	if (count == _SEM_VALUE_MAX)
2151 		error = EOVERFLOW;
2152 	else
2153 		suword32_noerr(&sp->sema_count, ++count);
2154 	if (count == 1) {
2155 		fuword8_noerr(&sp->sema_waiters, &waiters);
2156 		if (waiters) {
2157 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2158 			suword8_noerr(&sp->sema_waiters, waiters);
2159 		}
2160 	}
2161 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2162 out:
2163 	no_fault();
2164 	if (watched)
2165 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2166 	if (error)
2167 		return (set_errno(error));
2168 	return (0);
2169 }
2170 
/*
 * Bits kept in a thread's t_writer field while it waits on a user-level
 * rwlock: TRW_WANT_WRITE is set by a blocking writer in lwp_rwlock_lock(),
 * TRW_LOCK_GRANTED is set by lwp_rwlock_release() when it hands the lock
 * to the thread it is waking.
 */
2171 #define	TRW_WANT_WRITE		0x1
2172 #define	TRW_LOCK_GRANTED	0x2
2173 
/*
 * 'rd_wr' request codes for lwp_rwlock_lock().  TRY_FLAG is OR-ed in to
 * ask for a non-blocking attempt (EBUSY instead of sleeping).
 */
2174 #define	READ_LOCK		0
2175 #define	WRITE_LOCK		1
2176 #define	TRY_FLAG		0x10
2177 #define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
2178 #define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)
2179 
2180 /*
2181  * Release one writer or one or more readers. Compute the rwstate word to
2182  * reflect the new state of the queue. For a safe hand-off we copy the new
2183  * rwstate value back to userland before we wake any of the new lock holders.
2184  *
2185  * Note that sleepq_insert() implements a prioritized FIFO (with writers
2186  * being given precedence over readers of the same priority).
2187  *
2188  * If the first thread is a reader we scan the queue releasing all readers
2189  * until we hit a writer or the end of the queue. If the first thread is a
2190  * writer we still need to check for another writer (i.e. URW_WRITE_WANTED).
 *
 * 'lwpchan' selects the sleep queue (via lwpsqhash()) and identifies which
 * queued threads belong to this rwlock; 'rw' is the user-level rwlock whose
 * rwlock_readers word receives the new state.  The store to userland uses
 * suword32_noerr(), so the caller is expected to have on_fault() protection
 * in place (lwp_rwlock_unlock() does).
2191  */
2192 void
2193 lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
2194 {
2195 	sleepq_head_t *sqh;
2196 	kthread_t *tp;
2197 	kthread_t **tpp;
2198 	kthread_t *tpnext;
2199 	kthread_t *wakelist = NULL;
2200 	uint32_t rwstate = 0;
2201 	int wcount = 0;
2202 	int rcount = 0;
2203 
2204 	sqh = lwpsqhash(lwpchan);
2205 	disp_lock_enter(&sqh->sq_lock);
	/*
	 * Walk the queue with a pointer-to-link so that unlinking a thread
	 * leaves 'tpp' already referring to its successor.
	 */
2206 	tpp = &sqh->sq_queue.sq_first;
2207 	while ((tp = *tpp) != NULL) {
		/* Only threads sleeping on this exact lwpchan are considered. */
2208 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
2209 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
2210 			if (tp->t_writer & TRW_WANT_WRITE) {
				/* First writer seen and no reader seen before it. */
2211 				if ((wcount++ == 0) && (rcount == 0)) {
2212 					rwstate |= URW_WRITE_LOCKED;
2213 
2214 					/* Just one writer to wake. */
2215 					sleepq_unlink(tpp, tp);
2216 					wakelist = tp;
2217 
2218 					/* tpp already set for next thread. */
2219 					continue;
2220 				} else {
2221 					rwstate |=
2222 					    (URW_WRITE_WANTED|URW_HAS_WAITERS);
2223 
2224 					/* We need look no further. */
2225 					break;
2226 				}
2227 			} else {
2228 				rcount++;
				/* Readers are granted only until a writer is seen. */
2229 				if (wcount == 0) {
					/* Reader count occupies the low bits. */
2230 					rwstate++;
2231 
2232 					/* Add reader to wake list. */
2233 					sleepq_unlink(tpp, tp);
2234 					tp->t_link = wakelist;
2235 					wakelist = tp;
2236 
2237 					/* tpp already set for next thread. */
2238 					continue;
2239 				} else
2240 					rwstate |= URW_HAS_WAITERS;
2241 			}
2242 		}
2243 		tpp = &tp->t_link;
2244 	}
2245 
2246 	/* Copy the new rwstate back to userland. */
2247 	suword32_noerr(&rw->rwlock_readers, rwstate);
2248 
2249 	/* Wake the new lock holder(s) up. */
2250 	tp = wakelist;
2251 	while (tp != NULL) {
2252 		DTRACE_SCHED1(wakeup, kthread_t *, tp);
2253 		tp->t_wchan0 = NULL;
2254 		tp->t_wchan = NULL;
2255 		tp->t_sobj_ops = NULL;
		/* lwp_rwlock_lock() checks this bit after it returns from swtch(). */
2256 		tp->t_writer |= TRW_LOCK_GRANTED;
		/* Save the successor before the link is cleared. */
2257 		tpnext = tp->t_link;
2258 		tp->t_link = NULL;
2259 		CL_WAKEUP(tp);
2260 		thread_unlock_high(tp);
2261 		tp = tpnext;
2262 	}
2263 
2264 	disp_lock_exit(&sqh->sq_lock);
2265 }
2266 
2267 /*
2268  * We enter here holding the user-level mutex, which we must release before
2269  * returning or blocking. Based on lwp_cond_wait().
 *
 * 'rw' is the user-level rwlock (its embedded mutex is rw->mutex), 'tsp' an
 * optional timeout handed to lwp_timer_copyin(), and 'rd_wr' one of
 * READ_LOCK / WRITE_LOCK, optionally OR-ed with TRY_FLAG.
 *
 * Returns 0 when the lock was acquired.  Otherwise returns set_errno() of:
 *	EFAULT	a user address faulted or get_lwpchan() failed
 *	EINVAL	bad 'rd_wr', or the rwlock/mutex is not USYNC_PROCESS
 *	EBUSY	TRY_FLAG was given and the lock could not be taken at once
 *	EINTR	interrupted while sleeping
 *	ETIME	timed out while sleeping
 *	EAGAIN	woken without the lock having been granted
 *	or the error from lwp_timer_copyin(), reported only at the point
 *	where the caller would have had to sleep.
2270  */
2271 static int
2272 lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
2273 {
2274 	lwp_mutex_t *mp = NULL;
2275 	kthread_t *t = curthread;
2276 	kthread_t *tp;
2277 	klwp_t *lwp = ttolwp(t);
2278 	proc_t *p = ttoproc(t);
2279 	lwp_timer_t lwpt;
2280 	lwpchan_t lwpchan;
2281 	lwpchan_t mlwpchan;
2282 	caddr_t timedwait;
2283 	volatile uint16_t type = 0;
2284 	volatile uint8_t mtype = 0;
2285 	uchar_t mwaiters;
2286 	volatile int error = 0;
2287 	int time_error;
2288 	clock_t tim = -1;
2289 	volatile int locked = 0;
2290 	volatile int mlocked = 0;
2291 	volatile int watched = 0;
2292 	volatile int mwatched = 0;
2293 	label_t ljb;
2294 	volatile int no_lwpchan = 1;
2295 	int imm_timeout = 0;
2296 	int try_flag;
2297 	uint32_t rwstate;
2298 	int acquired = 0;
2299 
2300 	/* We only check rw because the mutex is included in it. */
2301 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2302 		return (set_errno(EFAULT));
2303 
2304 	/* We must only report this error if we are about to sleep (later). */
2305 	timedwait = (caddr_t)tsp;
2306 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2307 	    lwpt.lwpt_imm_timeout) {
2308 		imm_timeout = 1;
2309 		timedwait = NULL;
2310 	}
2311 
2312 	(void) new_mstate(t, LMS_USER_LOCK);
2313 
	/* Fault handler for the *_noerr() user accesses below. */
2314 	if (on_fault(&ljb)) {
		/* Faulted before either lwpchan existed: nothing to undo. */
2315 		if (no_lwpchan) {
2316 			error = EFAULT;
2317 			goto out_nodrop;
2318 		}
2319 		if (mlocked) {
2320 			mlocked = 0;
2321 			lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2322 		}
2323 		if (locked) {
2324 			locked = 0;
2325 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2326 		}
2327 		/*
2328 		 * Set up another on_fault() for a possible fault
2329 		 * on the user lock accessed at "out_drop".
2330 		 */
		/*
		 * NOTE(review): both exits below go to out_nodrop, which
		 * calls no_fault() straight away, so this nested handler
		 * does not appear to cover any user access, and the
		 * user-level mutex is not dropped on this path.  The
		 * comment above suggests "out_drop" may have been the
		 * intended target -- confirm before changing.
		 */
2331 		if (on_fault(&ljb)) {
2332 			if (mlocked) {
2333 				mlocked = 0;
2334 				lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2335 			}
2336 			error = EFAULT;
2337 			goto out_nodrop;
2338 		}
2339 		error = EFAULT;
2340 		goto out_nodrop;
2341 	}
2342 
2343 	/* Process rd_wr (including sanity check). */
2344 	try_flag = (rd_wr & TRY_FLAG);
2345 	rd_wr &= ~TRY_FLAG;
2346 	if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
2347 		error = EINVAL;
2348 		goto out_nodrop;
2349 	}
2350 
2351 	/* We can only continue for simple USYNC_PROCESS locks. */
2352 	mp = &rw->mutex;
2353 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
2354 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2355 	if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
2356 		error = EINVAL;
2357 		goto out_nodrop;
2358 	}
2359 
2360 	/* Force Copy-on-write fault in case objects are MAP_PRIVATE. */
2361 	suword8_noerr(&mp->mutex_type, mtype);
2362 	suword16_noerr(&rw->rwlock_type, type);
2363 
2364 	/* Convert user level mutex, "mp", to a unique lwpchan. */
2365 	if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
2366 	    &mlwpchan, LWPCHAN_MPPOOL)) {
2367 		error = EFAULT;
2368 		goto out_nodrop;
2369 	}
2370 
2371 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2372 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2373 	    &lwpchan, LWPCHAN_CVPOOL)) {
2374 		error = EFAULT;
2375 		goto out_nodrop;
2376 	}
2377 
	/* From here on the fault handler must undo locks that are held. */
2378 	no_lwpchan = 0;
2379 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2380 	mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2381 
2382 	/*
2383 	 * lwpchan_lock() ensures that the calling LWP is put to sleep
2384 	 * atomically with respect to a possible wakeup which is a result
2385 	 * of lwp_rwlock_unlock().
2386 	 *
2387 	 * What's misleading is that the LWP is put to sleep after the
2388 	 * rwlock's mutex is released. This is OK as long as the release
2389 	 * operation is also done while holding mlwpchan. The LWP is then
2390 	 * put to sleep when the possibility of pagefaulting or sleeping
2391 	 * has been completely eliminated.
2392 	 */
2393 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2394 	locked = 1;
2395 	lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2396 	mlocked = 1;
2397 
2398 	/*
2399 	 * Fetch the current rwlock state.
2400 	 *
2401 	 * The possibility of spurious wake-ups or killed waiters means that
2402 	 * rwstate's URW_HAS_WAITERS and URW_WRITE_WANTED bits may indicate
2403 	 * false positives. We only fix these if they are important to us.
2404 	 *
2405 	 * Although various error states can be observed here (e.g. the lock
2406 	 * is not held, but there are waiters) we assume these are application
2407 	 * errors and so we take no corrective action.
2408 	 */
2409 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2410 
2411 	/*
2412 	 * If the lock is uncontended we can acquire it here. These tests
2413 	 * should have already been done at user-level, we just need to be
2414 	 * sure.
2415 	 */
2416 	if (rd_wr == READ_LOCK) {
		/* No flag bits set (only a reader count): add one reader. */
2417 		if ((rwstate & ~URW_READERS_MASK) == 0) {
2418 			rwstate++;
2419 			acquired = 1;
2420 		}
2421 	} else if (rwstate == 0) {
2422 		rwstate = URW_WRITE_LOCKED;
2423 		acquired = 1;
2424 	}
2425 
2426 	/*
2427 	 * We can only try harder if the lock isn't held by a writer.
2428 	 */
2429 	if (!acquired && !(rwstate & URW_WRITE_LOCKED)) {
2430 		tp = lwp_queue_waiter(&lwpchan);
2431 		if (tp == NULL) {
2432 			/*
2433 			 * Hmmm, rwstate indicates waiters but there are
2434 			 * none queued. This could just be the result of a
2435 			 * spurious wakeup, so let's fix it.
2436 			 */
2437 			rwstate &= URW_READERS_MASK;
2438 
2439 			/*
2440 			 * We now have another chance to acquire the lock
2441 			 * uncontended, but this is the last chance for a
2442 			 * writer to acquire the lock without blocking.
2443 			 */
2444 			if (rd_wr == READ_LOCK) {
2445 				rwstate++;
2446 				acquired = 1;
2447 			} else if (rwstate == 0) {
2448 				rwstate = URW_WRITE_LOCKED;
2449 				acquired = 1;
2450 			}
2451 		} else if (rd_wr == READ_LOCK) {
2452 			/*
2453 			 * This is the last chance for a reader to acquire
2454 			 * the lock now, but it can only do so if there is
2455 			 * no writer of equal or greater priority at the
2456 			 * head of the queue.
2457 			 *
2458 			 * It is also just possible that there is a reader
2459 			 * at the head of the queue. This may be the result
2460 			 * of a spurious wakeup or an application failure.
2461 			 * In this case we only acquire the lock if we have
2462 			 * equal or greater priority. It is not our job to
2463 			 * release spurious waiters.
2464 			 */
2465 			pri_t our_pri = DISP_PRIO(t);
2466 			pri_t his_pri = DISP_PRIO(tp);
2467 
2468 			if ((our_pri > his_pri) || ((our_pri == his_pri) &&
2469 			    !(tp->t_writer & TRW_WANT_WRITE))) {
2470 				rwstate++;
2471 				acquired = 1;
2472 			}
2473 		}
2474 	}
2475 
2476 	if (acquired || try_flag || time_error) {
2477 		/*
2478 		 * We're not going to block this time!
2479 		 */
2480 		suword32_noerr(&rw->rwlock_readers, rwstate);
2481 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2482 		locked = 0;
2483 
2484 		if (acquired) {
2485 			/*
2486 			 * Got the lock!
2487 			 */
2488 			error = 0;
2489 
2490 		} else if (try_flag) {
2491 			/*
2492 			 * We didn't get the lock and we're about to block.
2493 			 * If we're doing a trylock, return EBUSY instead.
2494 			 */
2495 			error = EBUSY;
2496 
2497 		} else if (time_error) {
2498 			/*
2499 			 * The SUSV3 POSIX spec is very clear that we should
2500 			 * get no error from validating the timer (above)
2501 			 * until we would actually sleep.
2502 			 */
2503 			error = time_error;
2504 		}
2505 
		/* mlwpchan is still held here; out_drop releases the mutex. */
2506 		goto out_drop;
2507 	}
2508 
2509 	/*
2510 	 * We're about to block, so indicate what kind of waiter we are.
2511 	 */
2512 	t->t_writer = 0;
2513 	rwstate |= URW_HAS_WAITERS;
2514 	if (rd_wr == WRITE_LOCK) {
2515 		t->t_writer = TRW_WANT_WRITE;
2516 		rwstate |= URW_WRITE_WANTED;
2517 	}
2518 	suword32_noerr(&rw->rwlock_readers, rwstate);
2519 
2520 	/*
2521 	 * Unlock the rwlock's mutex (pagefaults are possible here).
2522 	 */
	/*
	 * NOTE(review): unlike the out_drop path, mp->mutex_ownerpid is not
	 * cleared before ulock_clear() here -- confirm this asymmetry is
	 * intentional.
	 */
2523 	ulock_clear(&mp->mutex_lockw);
2524 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2525 	if (mwaiters != 0) {
2526 		/*
2527 		 * Given the locking of mlwpchan around the release of
2528 		 * the mutex and checking for waiters, the following
2529 		 * call to lwp_release() can fail ONLY if the lock
2530 		 * acquirer is interrupted after setting the waiter bit,
2531 		 * calling lwp_block() and releasing mlwpchan.
2532 		 * In this case, it could get pulled off the LWP sleep
2533 		 * queue (via setrun()) before the following call to
2534 		 * lwp_release() occurs, and the lock requestor will
2535 		 * update the waiter bit correctly by re-evaluating it.
2536 		 */
2537 		if (lwp_release(&mlwpchan, &mwaiters, 0) > 0)
2538 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2539 	}
2540 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2541 	mlocked = 0;
	/* No further user-memory accesses are made on the blocking path. */
2542 	no_fault();
2543 
2544 	if (mwatched) {
2545 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2546 		mwatched = 0;
2547 	}
2548 	if (watched) {
2549 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2550 		watched = 0;
2551 	}
2552 
2553 	/*
2554 	 * Put the LWP in an orderly state for debugging.
2555 	 */
2556 	prstop(PR_REQUESTED, 0);
2557 	if (timedwait) {
2558 		/*
2559 		 * If we successfully queue the timeout,
2560 		 * then don't drop t_delay_lock until
2561 		 * we are on the sleep queue (below).
2562 		 */
2563 		mutex_enter(&t->t_delay_lock);
2564 		if (lwp_timer_enqueue(&lwpt) != 0) {
2565 			mutex_exit(&t->t_delay_lock);
2566 			imm_timeout = 1;
2567 			timedwait = NULL;
2568 		}
2569 	}
2570 	t->t_flag |= T_WAITCVSEM;
2571 	lwp_block(&lwpchan);
2572 
2573 	/*
2574 	 * Nothing should happen to cause the LWP to go to sleep until after
2575 	 * it returns from swtch().
2576 	 */
2577 	if (timedwait)
2578 		mutex_exit(&t->t_delay_lock);
2579 	locked = 0;
2580 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
	/* Pending signal or must-return condition: make ourselves runnable. */
2581 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t))
2582 		setrun(t);
2583 	swtch();
2584 
2585 	/*
2586 	 * We're back, but we need to work out why. Were we interrupted? Did
2587 	 * we timeout? Were we granted the lock?
2588 	 */
	/* EAGAIN stands unless a more specific reason or a grant is found. */
2589 	error = EAGAIN;
	/* TRW_LOCK_GRANTED is set by lwp_rwlock_release() on hand-off. */
2590 	acquired = (t->t_writer & TRW_LOCK_GRANTED);
2591 	t->t_writer = 0;
2592 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2593 	if (timedwait)
2594 		tim = lwp_timer_dequeue(&lwpt);
2595 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
2596 		error = EINTR;
2597 	else if (imm_timeout || (timedwait && tim == -1))
2598 		error = ETIME;
2599 	lwp->lwp_asleep = 0;
2600 	lwp->lwp_sysabort = 0;
2601 	setallwatch();
2602 
2603 	/*
2604 	 * If we were granted the lock we don't care about EINTR or ETIME.
2605 	 */
2606 	if (acquired)
2607 		error = 0;
2608 
2609 	if (t->t_mstate == LMS_USER_LOCK)
2610 		(void) new_mstate(t, LMS_SYSTEM);
2611 
2612 	if (error)
2613 		return (set_errno(error));
2614 	return (0);
2615 
2616 out_drop:
2617 	/*
2618 	 * Make sure that the user level lock is dropped before returning
2619 	 * to the caller.
2620 	 */
2621 	if (!mlocked) {
2622 		lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2623 		mlocked = 1;
2624 	}
2625 	suword32_noerr(&mp->mutex_ownerpid, 0);
2626 	ulock_clear(&mp->mutex_lockw);
2627 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2628 	if (mwaiters != 0) {
2629 		/*
2630 		 * See comment above on lock clearing and lwp_release()
2631 		 * success/failure.
2632 		 */
2633 		if (lwp_release(&mlwpchan, &mwaiters, 0) > 0)
2634 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2635 	}
2636 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2637 	mlocked = 0;
2638 
2639 out_nodrop:
	/* Common exit: the user-level mutex is left untouched from here on. */
2640 	no_fault();
2641 	if (mwatched)
2642 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2643 	if (watched)
2644 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2645 	if (t->t_mstate == LMS_USER_LOCK)
2646 		(void) new_mstate(t, LMS_SYSTEM);
2647 	if (error)
2648 		return (set_errno(error));
2649 	return (0);
2650 }
2651 
2652 /*
2653  * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
2654  * we never drop the lock.
2655  */
2656 static int
2657 lwp_rwlock_unlock(lwp_rwlock_t *rw)
2658 {
2659 	kthread_t *t = curthread;
2660 	proc_t *p = ttoproc(t);
2661 	lwpchan_t lwpchan;
2662 	volatile uint16_t type = 0;
2663 	volatile int error = 0;
2664 	volatile int locked = 0;
2665 	volatile int watched = 0;
2666 	label_t ljb;
2667 	volatile int no_lwpchan = 1;
2668 	uint32_t rwstate;
2669 
2670 	/* We only check rw because the mutex is included in it. */
2671 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2672 		return (set_errno(EFAULT));
2673 
2674 	if (on_fault(&ljb)) {
2675 		if (no_lwpchan) {
2676 			error = EFAULT;
2677 			goto out_nodrop;
2678 		}
2679 		if (locked) {
2680 			locked = 0;
2681 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2682 		}
2683 		error = EFAULT;
2684 		goto out_nodrop;
2685 	}
2686 
2687 	/* We can only continue for simple USYNC_PROCESS locks. */
2688 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2689 	if (type != USYNC_PROCESS) {
2690 		error = EINVAL;
2691 		goto out_nodrop;
2692 	}
2693 
2694 	/* Force Copy-on-write fault incase objects are MAP_PRIVATE. */
2695 	suword16_noerr(&rw->rwlock_type, type);
2696 
2697 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2698 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2699 	    &lwpchan, LWPCHAN_CVPOOL)) {
2700 		error = EFAULT;
2701 		goto out_nodrop;
2702 	}
2703 
2704 	no_lwpchan = 0;
2705 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2706 
2707 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2708 	locked = 1;
2709 
2710 	/*
2711 	 * We can resolve multiple readers (except the last reader) here.
2712 	 * For the last reader or a writer we need lwp_rwlock_release(),
2713 	 * to which we also delegate the task of copying the new rwstate
2714 	 * back to userland (see the comment there).
2715 	 */
2716 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2717 	if (rwstate & URW_WRITE_LOCKED)
2718 		lwp_rwlock_release(&lwpchan, rw);
2719 	else if ((rwstate & URW_READERS_MASK) > 0) {
2720 		rwstate--;
2721 		if ((rwstate & URW_READERS_MASK) == 0)
2722 			lwp_rwlock_release(&lwpchan, rw);
2723 		else
2724 			suword32_noerr(&rw->rwlock_readers, rwstate);
2725 	}
2726 
2727 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2728 	locked = 0;
2729 	error = 0;
2730 
2731 out_nodrop:
2732 	no_fault();
2733 	if (watched)
2734 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2735 	if (error)
2736 		return (set_errno(error));
2737 	return (0);
2738 }
2739 
2740 int
2741 lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
2742 {
2743 	switch (subcode) {
2744 	case 0:
2745 		return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
2746 	case 1:
2747 		return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
2748 	case 2:
2749 		return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
2750 	case 3:
2751 		return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
2752 	case 4:
2753 		return (lwp_rwlock_unlock(rwlp));
2754 	}
2755 	return (set_errno(EINVAL));
2756 }
2757 
2758 /*
2759  * Return the owner of the user-level s-object.
2760  * Since we can't really do this, return NULL.
2761  */
2762 /* ARGSUSED */
2763 static kthread_t *
2764 lwpsobj_owner(caddr_t sobj)
2765 {
2766 	return ((kthread_t *)NULL);
2767 }
2768 
2769 /*
2770  * Wake up a thread asleep on a user-level synchronization
2771  * object.
2772  */
2773 static void
2774 lwp_unsleep(kthread_t *t)
2775 {
2776 	ASSERT(THREAD_LOCK_HELD(t));
2777 	if (t->t_wchan0 != NULL) {
2778 		sleepq_head_t *sqh;
2779 		sleepq_t *sqp = t->t_sleepq;
2780 
2781 		if (sqp != NULL) {
2782 			sqh = lwpsqhash(&t->t_lwpchan);
2783 			ASSERT(&sqh->sq_queue == sqp);
2784 			sleepq_unsleep(t);
2785 			disp_lock_exit_high(&sqh->sq_lock);
2786 			CL_SETRUN(t);
2787 			return;
2788 		}
2789 	}
2790 	panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
2791 }
2792 
2793 /*
2794  * Change the priority of a thread asleep on a user-level
2795  * synchronization object. To maintain proper priority order,
2796  * we:
2797  *	o dequeue the thread.
2798  *	o change its priority.
2799  *	o re-enqueue the thread.
2800  * Assumption: the thread is locked on entry.
2801  */
2802 static void
2803 lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
2804 {
2805 	ASSERT(THREAD_LOCK_HELD(t));
2806 	if (t->t_wchan0 != NULL) {
2807 		sleepq_t   *sqp = t->t_sleepq;
2808 
2809 		sleepq_dequeue(t);
2810 		*t_prip = pri;
2811 		sleepq_insert(sqp, t);
2812 	} else
2813 		panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
2814 }
2815 
2816 /*
2817  * Clean up a locked a robust mutex
2818  */
2819 static void
2820 lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
2821 {
2822 	uint16_t flag;
2823 	uchar_t waiters;
2824 	label_t ljb;
2825 	pid_t owner_pid;
2826 	lwp_mutex_t *lp;
2827 	volatile int locked = 0;
2828 	volatile int watched = 0;
2829 
2830 	ASSERT(ent->lwpchan_type & USYNC_PROCESS_ROBUST);
2831 
2832 	lp = (lwp_mutex_t *)ent->lwpchan_addr;
2833 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2834 	if (on_fault(&ljb)) {
2835 		if (locked)
2836 			lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2837 		goto out;
2838 	}
2839 	fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
2840 	if (owner_pid != curproc->p_pid) {
2841 		goto out;
2842 	}
2843 	lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2844 	locked = 1;
2845 	fuword16_noerr(&lp->mutex_flag, &flag);
2846 	if ((flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) == 0) {
2847 		flag |= lockflg;
2848 		suword16_noerr(&lp->mutex_flag, flag);
2849 	}
2850 	suword32_noerr(&lp->mutex_ownerpid, 0);
2851 	ulock_clear(&lp->mutex_lockw);
2852 	fuword8_noerr(&lp->mutex_waiters, &waiters);
2853 	if (waiters && lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
2854 		suword8_noerr(&lp->mutex_waiters, waiters);
2855 	lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2856 out:
2857 	no_fault();
2858 	if (watched)
2859 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2860 }
2861 
2862 /*
2863  * Register the mutex and initialize the mutex if it is not already
2864  */
2865 int
2866 lwp_mutex_init(lwp_mutex_t *lp, int type)
2867 {
2868 	proc_t *p = curproc;
2869 	int error = 0;
2870 	volatile int locked = 0;
2871 	volatile int watched = 0;
2872 	label_t ljb;
2873 	uint16_t flag;
2874 	lwpchan_t lwpchan;
2875 	pid_t owner_pid;
2876 
2877 	if ((caddr_t)lp >= (caddr_t)USERLIMIT)
2878 		return (set_errno(EFAULT));
2879 
2880 	if (type != USYNC_PROCESS_ROBUST)
2881 		return (set_errno(EINVAL));
2882 
2883 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2884 
2885 	if (on_fault(&ljb)) {
2886 		if (locked)
2887 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
2888 		error = EFAULT;
2889 		goto out;
2890 	}
2891 	/*
2892 	 * Force Copy-on-write fault if lwp_mutex_t object is
2893 	 * defined to be MAP_PRIVATE and it was initialized to
2894 	 * USYNC_PROCESS.
2895 	 */
2896 	suword8_noerr(&lp->mutex_type, type);
2897 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
2898 	    &lwpchan, LWPCHAN_MPPOOL)) {
2899 		error = EFAULT;
2900 		goto out;
2901 	}
2902 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
2903 	locked = 1;
2904 	fuword16_noerr(&lp->mutex_flag, &flag);
2905 	if (flag & LOCK_INITED) {
2906 		if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
2907 			fuword32_noerr(&lp->mutex_ownerpid,
2908 			    (uint32_t *)&owner_pid);
2909 			if (owner_pid == p->p_pid) {
2910 				flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
2911 				suword16_noerr(&lp->mutex_flag, flag);
2912 				locked = 0;
2913 				lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
2914 				goto out;
2915 			}
2916 		}
2917 		error = EBUSY;
2918 	} else {
2919 		suword8_noerr(&lp->mutex_waiters, 0);
2920 		suword8_noerr(&lp->mutex_lockw, 0);
2921 		suword16_noerr(&lp->mutex_flag, LOCK_INITED);
2922 		suword32_noerr(&lp->mutex_ownerpid, 0);
2923 	}
2924 	locked = 0;
2925 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
2926 out:
2927 	no_fault();
2928 	if (watched)
2929 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2930 	if (error)
2931 		return (set_errno(error));
2932 	return (0);
2933 }
2934 
/*
 * Make a single attempt to acquire the user-level mutex at lp.
 *
 * lp is a user-space address; an address at or above the process's
 * a_userlimit is rejected with EFAULT before anything else is done.
 *
 * Returns 0 when the lock was taken and there is nothing to report.
 * Otherwise returns set_errno(error), where error is one of:
 *	EFAULT		lp out of range, get_lwpchan() failed, or the
 *			on_fault() handler below was entered
 *	EBUSY		ulock_try() on mutex_lockw failed
 *	ENOTRECOVERABLE	robust mutex with LOCK_NOTRECOVERABLE set
 *	EOWNERDEAD,
 *	ELOCKUNMAPPED	robust mutex with LOCK_OWNERDEAD / LOCK_UNMAPPED
 *			set; reported on the path where ulock_try()
 *			succeeded and mutex_ownerpid was written
 * or, for a UPIMUTEX type, whatever lwp_upimutex_lock() returned.
 */
2935 int
2936 lwp_mutex_trylock(lwp_mutex_t *lp)
2937 {
2938 	kthread_t *t = curthread;
2939 	proc_t *p = ttoproc(t);
2940 	int error = 0;
	/*
	 * NOTE(review): locked, watched and type are volatile, presumably
	 * so that their values are reliable inside the on_fault() handler
	 * and at "out" after a fault -- confirm.
	 */
2941 	volatile int locked = 0;
2942 	volatile int watched = 0;
2943 	label_t ljb;
2944 	volatile uint8_t type = 0;
2945 	uint16_t flag;
2946 	lwpchan_t lwpchan;
2947 
2948 	if ((caddr_t)lp >= p->p_as->a_userlimit)
2949 		return (set_errno(EFAULT));
2950 
2951 	(void) new_mstate(t, LMS_USER_LOCK);
2952 
	/*
	 * Fault handler: taken when on_fault() returns nonzero (presumably
	 * after one of the *_noerr user accesses below faults).  Drop the
	 * lwpchan lock if it is held and fail with EFAULT.
	 */
2953 	if (on_fault(&ljb)) {
2954 		if (locked)
2955 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
2956 		error = EFAULT;
2957 		goto out;
2958 	}
2959 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
2960 	if (UPIMUTEX(type)) {
		/*
		 * UPIMUTEX types are handed to lwp_upimutex_lock() in
		 * UPIMUTEX_TRY mode after on_fault protection is removed.
		 * For USYNC_PROCESS types the caller's pid is stored in
		 * mutex_ownerpid on success and on EOWNERDEAD; the result
		 * of that suword32() is deliberately ignored.
		 *
		 * NOTE(review): this path returns directly and so skips the
		 * LMS_USER_LOCK -> LMS_SYSTEM switch done at "out" --
		 * confirm that is intended.
		 */
2961 		no_fault();
2962 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
2963 		if ((error == 0 || error == EOWNERDEAD) &&
2964 		    (type & USYNC_PROCESS))
2965 			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
2966 		if (error)
2967 			return (set_errno(error));
2968 		return (0);
2969 	}
2970 	/*
2971 	 * Force Copy-on-write fault if lwp_mutex_t object is
2972 	 * defined to be MAP_PRIVATE and it was initialized to
2973 	 * USYNC_PROCESS.
2974 	 */
2975 	suword8_noerr(&lp->mutex_type, type);
2976 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
2977 	    &lwpchan, LWPCHAN_MPPOOL)) {
2978 		error = EFAULT;
2979 		goto out;
2980 	}
	/* From here on the fault handler must release the lwpchan lock. */
2981 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
2982 	locked = 1;
2983 	if (type & USYNC_PROCESS_ROBUST) {
2984 		fuword16_noerr((uint16_t *)(&lp->mutex_flag), &flag);
2985 		if (flag & LOCK_NOTRECOVERABLE) {
			/*
			 * NOTE(review): locked is left set here although the
			 * lwpchan lock is dropped; the fault handler would
			 * unlock a second time if a fault were taken before
			 * no_fault() at "out" -- confirm none can occur.
			 */
2986 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
2987 			error =  ENOTRECOVERABLE;
2988 			goto out;
2989 		}
2990 	}
2991 
	/* A nonzero result is remembered so "out" re-enables the watch. */
2992 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2993 
2994 	if (!ulock_try(&lp->mutex_lockw))
2995 		error = EBUSY;
2996 	else if (type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
		/*
		 * ulock_try() succeeded: record the owner pid.  For robust
		 * mutexes, flag was fetched above under the same
		 * USYNC_PROCESS_ROBUST test and selects the error reported.
		 */
2997 		suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
2998 		if (type & USYNC_PROCESS_ROBUST) {
2999 			if (flag & LOCK_OWNERDEAD)
3000 				error = EOWNERDEAD;
3001 			else if (flag & LOCK_UNMAPPED)
3002 				error = ELOCKUNMAPPED;
3003 		}
3004 	}
3005 	locked = 0;
3006 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3007 out:
3008 
	/* Leave LMS_USER_LOCK only if the thread is still in that state. */
3009 	if (t->t_mstate == LMS_USER_LOCK)
3010 		(void) new_mstate(t, LMS_SYSTEM);
3011 
3012 	no_fault();
3013 	if (watched)
3014 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3015 	if (error)
3016 		return (set_errno(error));
3017 	return (0);
3018 }
3019 
3020 /*
3021  * Unlock the mutex and unblock lwps that are trying to acquire this mutex.
3022  * A blocked lwp resumes and retries to acquire the lock.
 *
 * lp is a user-space address; an address at or above the process's
 * a_userlimit is rejected with EFAULT before anything else is done.
 *
 * Returns 0 on success.  Otherwise returns set_errno(error), where error
 * is EFAULT (get_lwpchan() failed or the on_fault() handler was entered)
 * or, for a UPIMUTEX type, whatever lwp_upimutex_unlock() returned.
3023  */
3024 int
3025 lwp_mutex_unlock(lwp_mutex_t *lp)
3026 {
3027 	proc_t *p = ttoproc(curthread);
3028 	lwpchan_t lwpchan;
3029 	uchar_t waiters;
3030 	volatile int locked = 0;
3031 	volatile int watched = 0;
3032 	volatile uint8_t type = 0;
3033 	label_t ljb;
3034 	uint16_t flag;
3035 	int error = 0;
3036 
3037 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3038 		return (set_errno(EFAULT));
3039 
	/*
	 * Fault handler: taken when on_fault() returns nonzero (presumably
	 * after one of the *_noerr user accesses below faults).  Drop the
	 * lwpchan lock if it is held and fail with EFAULT.
	 */
3040 	if (on_fault(&ljb)) {
3041 		if (locked)
3042 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3043 		error = EFAULT;
3044 		goto out;
3045 	}
3046 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3047 	if (UPIMUTEX(type)) {
		/*
		 * UPIMUTEX types are handed to lwp_upimutex_unlock() after
		 * on_fault protection is removed; its result is returned.
		 */
3048 		no_fault();
3049 		error = lwp_upimutex_unlock(lp, type);
3050 		if (error)
3051 			return (set_errno(error));
3052 		return (0);
3053 	}
3054 
	/* A nonzero result is remembered so "out" re-enables the watch. */
3055 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3056 
3057 	/*
3058 	 * Force Copy-on-write fault if lwp_mutex_t object is
3059 	 * defined to be MAP_PRIVATE, and type is USYNC_PROCESS
3060 	 */
3061 	suword8_noerr(&lp->mutex_type, type);
3062 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3063 	    &lwpchan, LWPCHAN_MPPOOL)) {
3064 		error = EFAULT;
3065 		goto out;
3066 	}
	/* From here on the fault handler must release the lwpchan lock. */
3067 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3068 	locked = 1;
3069 	if (type & (USYNC_PROCESS | USYNC_PROCESS_ROBUST)) {
3070 		if (type & USYNC_PROCESS_ROBUST) {
			/*
			 * If LOCK_OWNERDEAD or LOCK_UNMAPPED is still set at
			 * unlock time, replace those bits with
			 * LOCK_NOTRECOVERABLE in the user-visible flag word.
			 */
3071 			fuword16_noerr(&lp->mutex_flag, &flag);
3072 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3073 				flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
3074 				flag |= LOCK_NOTRECOVERABLE;
3075 				suword16_noerr(&lp->mutex_flag, flag);
3076 			}
3077 		}
		/* Process-shared mutexes: clear the recorded owner pid. */
3078 		suword32_noerr(&lp->mutex_ownerpid, 0);
3079 	}
3080 	ulock_clear(&lp->mutex_lockw);
3081 	/*
3082 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
3083 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
3084 	 * may fail.  If it fails, do not write into the waiter bit.
3085 	 * The call to lwp_release() might fail due to one of three reasons:
3086 	 *
3087 	 * 	1. due to the thread which set the waiter bit not actually
3088 	 *	   sleeping since it got the lock on the re-try. The waiter
3089 	 *	   bit will then be correctly updated by that thread. This
3090 	 *	   window may be closed by reading the wait bit again here
3091 	 *	   and not calling lwp_release() at all if it is zero.
3092 	 *	2. the thread which set the waiter bit and went to sleep
3093 	 *	   was woken up by a signal. This time, the waiter recomputes
3094 	 *	   the wait bit in the return with EINTR code.
3095 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
3096 	 *	   memory that has been re-used after the lock was dropped.
3097 	 *	   In this case, writing into the waiter bit would cause data
3098 	 *	   corruption.
3099 	 */
3100 	fuword8_noerr(&lp->mutex_waiters, &waiters);
3101 	if (waiters) {
		/*
		 * flag is only examined for robust mutexes, for which it was
		 * fetched above.  A mutex marked LOCK_NOTRECOVERABLE releases
		 * every waiter and zeroes the waiters byte; otherwise
		 * lwp_release() is called once and the waiters value it
		 * leaves behind is stored back only when it returns 1.
		 */
3102 		if ((type & USYNC_PROCESS_ROBUST) &&
3103 		    (flag & LOCK_NOTRECOVERABLE)) {
3104 			lwp_release_all(&lwpchan);
3105 			suword8_noerr(&lp->mutex_waiters, 0);
3106 		} else if (lwp_release(&lwpchan, &waiters, 0) == 1) {
3107 			suword8_noerr(&lp->mutex_waiters, waiters);
3108 		}
3109 	}
3110 
3111 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3112 out:
3113 	no_fault();
3114 	if (watched)
3115 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3116 	if (error)
3117 		return (set_errno(error));
3118 	return (0);
3119 }
3120