xref: /titanic_52/usr/src/uts/common/syscall/lwp_sobj.c (revision b02e9a2d4d2071d770e5aa9ae8f83f2bbe1f2ced)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved	*/
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/cred.h>
37 #include <sys/user.h>
38 #include <sys/errno.h>
39 #include <sys/file.h>
40 #include <sys/proc.h>
41 #include <sys/prsystm.h>
42 #include <sys/kmem.h>
43 #include <sys/sobject.h>
44 #include <sys/fault.h>
45 #include <sys/procfs.h>
46 #include <sys/watchpoint.h>
47 #include <sys/time.h>
48 #include <sys/cmn_err.h>
49 #include <sys/machlock.h>
50 #include <sys/debug.h>
51 #include <sys/synch.h>
52 #include <sys/synch32.h>
53 #include <sys/mman.h>
54 #include <sys/class.h>
55 #include <sys/schedctl.h>
56 #include <sys/sleepq.h>
57 #include <sys/policy.h>
58 #include <sys/tnf_probe.h>
59 #include <sys/lwpchan_impl.h>
60 #include <sys/turnstile.h>
61 #include <sys/atomic.h>
62 #include <sys/lwp_timer_impl.h>
63 #include <sys/lwp_upimutex_impl.h>
64 #include <vm/as.h>
65 #include <sys/sdt.h>
66 
/*
 * Forward declarations.  The first three are the entries of the
 * lwp_sobj_ops vector; lwp_mutex_cleanup() is called by the lwpchan
 * cache code when an entry for a robust mutex is discarded.
 */
static kthread_t *lwpsobj_owner(caddr_t);
static void lwp_unsleep(kthread_t *t);
static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);

/* Not static: declared extern here rather than taken from a header. */
extern int lwp_cond_signal(lwp_cond_t *cv);
73 
/*
 * Maximum number of user priority-inheritance locks that may be held by
 * a single thread at one time.  It bounds the kmem consumed per thread
 * (one upimutex_t is allocated per held lock).  This is a per-thread
 * limit that can be administered on a system-wide basis (using
 * /etc/system).  lwp_upimutex_lock() enforces it, returning ENOMEM when
 * it is exceeded and secpolicy_resource() does not grant an exemption.
 *
 * Also, when a limit, say maxlwps, is added for the number of lwps within
 * a process, the per-thread limit automatically becomes a process-wide
 * limit on the maximum number of held upi locks within a process:
 *      maxheldupimx = maxnestupimx * maxlwps;
 */
static uint32_t maxnestupimx = 2000;
85 
/*
 * The sobj_ops vector exports a set of functions needed when a thread
 * is asleep on a synchronization object of this type.
 *
 * lwp_sobj_ops is installed on a thread by lwp_block() before it is
 * placed on one of the lwpsleepq[] sleep queues.
 */
static sobj_ops_t lwp_sobj_ops = {
	SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
};

static kthread_t *lwpsobj_pi_owner(upimutex_t *up);

/*
 * Variant used for user-level priority-inheritance mutexes.  It is the
 * vector handed to turnstile_block() by lwp_upimutex_lock(); the
 * unsleep and change-priority entries are the turnstile routines.
 */
static sobj_ops_t lwp_sobj_pi_ops = {
	SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
	turnstile_change_pri
};
100 
/* Sleep queue heads for lwp sync objects; indexed via lwpsqhash(). */
static sleepq_head_t	lwpsleepq[NSLEEPQ];
/*
 * Hash table of upimutex buckets.  Presumably this is what UPI_CHAIN()
 * indexes (the macro is defined outside this file) -- TODO confirm.
 */
upib_t			upimutextab[UPIMUTEX_TABSIZE];
103 
/*
 * There are two pools of lwpchan locks, each holding
 * (1 << LWPCHAN_LOCK_SHIFT) == 1024 locks.
 */
#define	LWPCHAN_LOCK_SHIFT	10
#define	LWPCHAN_LOCK_SIZE	(1 << LWPCHAN_LOCK_SHIFT)

/*
 * Compute the index of the lock for hash value X in the given pool.
 *
 * Both lc_wchan and lc_wchan0 are addresses that are most likely
 * 8-byte aligned, so the low-order 3 bits carry no information and
 * are shifted off.  The next LWPCHAN_LOCK_SHIFT bits are folded with
 * the LWPCHAN_LOCK_SHIFT bits above them to pick a slot within a pool.
 * 'pool' is either 0 or 1; a non-zero pool selects the second half of
 * the lock array.
 */
#define	LWPCHAN_LOCK_HASH(X, pool) \
	(((pool) ? LWPCHAN_LOCK_SIZE : 0) + \
	((((X) >> 3) ^ ((X) >> (3 + LWPCHAN_LOCK_SHIFT))) & \
	(LWPCHAN_LOCK_SIZE - 1)))
115 
/*
 * The lwpchan locks: two pools of LWPCHAN_LOCK_SIZE mutexes each,
 * indexed by LWPCHAN_LOCK_HASH() from lwpchan_lock()/lwpchan_unlock().
 */
static kmutex_t		lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
117 
/*
 * Is this a POSIX threads user-level lock requiring priority inheritance?
 * Evaluates to non-zero (the masked LOCK_PRIO_INHERIT bit, not 1) when
 * the bit is set in 'type', and to zero otherwise.
 */
#define	UPIMUTEX(type)	((type) & LOCK_PRIO_INHERIT)
122 
123 static sleepq_head_t *
124 lwpsqhash(lwpchan_t *lwpchan)
125 {
126 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
127 	return (&lwpsleepq[SQHASHINDEX(x)]);
128 }
129 
130 /*
131  * Lock an lwpchan.
132  * Keep this in sync with lwpchan_unlock(), below.
133  */
134 static void
135 lwpchan_lock(lwpchan_t *lwpchan, int pool)
136 {
137 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
138 	mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
139 }
140 
141 /*
142  * Unlock an lwpchan.
143  * Keep this in sync with lwpchan_lock(), above.
144  */
145 static void
146 lwpchan_unlock(lwpchan_t *lwpchan, int pool)
147 {
148 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
149 	mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
150 }
151 
152 /*
153  * Delete mappings from the lwpchan cache for pages that are being
154  * unmapped by as_unmap().  Given a range of addresses, "start" to "end",
155  * all mappings within the range are deleted from the lwpchan cache.
156  */
157 void
158 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
159 {
160 	lwpchan_data_t *lcp;
161 	lwpchan_hashbucket_t *hashbucket;
162 	lwpchan_hashbucket_t *endbucket;
163 	lwpchan_entry_t *ent;
164 	lwpchan_entry_t **prev;
165 	caddr_t addr;
166 
167 	mutex_enter(&p->p_lcp_lock);
168 	lcp = p->p_lcp;
169 	hashbucket = lcp->lwpchan_cache;
170 	endbucket = hashbucket + lcp->lwpchan_size;
171 	for (; hashbucket < endbucket; hashbucket++) {
172 		if (hashbucket->lwpchan_chain == NULL)
173 			continue;
174 		mutex_enter(&hashbucket->lwpchan_lock);
175 		prev = &hashbucket->lwpchan_chain;
176 		/* check entire chain */
177 		while ((ent = *prev) != NULL) {
178 			addr = ent->lwpchan_addr;
179 			if (start <= addr && addr < end) {
180 				*prev = ent->lwpchan_next;
181 				if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
182 				    (ent->lwpchan_type & LOCK_ROBUST))
183 					lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
184 				kmem_free(ent, sizeof (*ent));
185 				atomic_add_32(&lcp->lwpchan_entries, -1);
186 			} else {
187 				prev = &ent->lwpchan_next;
188 			}
189 		}
190 		mutex_exit(&hashbucket->lwpchan_lock);
191 	}
192 	mutex_exit(&p->p_lcp_lock);
193 }
194 
195 /*
196  * Given an lwpchan cache pointer and a process virtual address,
197  * return a pointer to the corresponding lwpchan hash bucket.
198  */
199 static lwpchan_hashbucket_t *
200 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
201 {
202 	uint_t i;
203 
204 	/*
205 	 * All user-level sync object addresses are 8-byte aligned.
206 	 * Ignore the lowest 3 bits of the address and use the
207 	 * higher-order 2*lwpchan_bits bits for the hash index.
208 	 */
209 	addr >>= 3;
210 	i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
211 	return (lcp->lwpchan_cache + i);
212 }
213 
214 /*
215  * (Re)allocate the per-process lwpchan cache.
216  */
217 static void
218 lwpchan_alloc_cache(proc_t *p, uint_t bits)
219 {
220 	lwpchan_data_t *lcp;
221 	lwpchan_data_t *old_lcp;
222 	lwpchan_hashbucket_t *hashbucket;
223 	lwpchan_hashbucket_t *endbucket;
224 	lwpchan_hashbucket_t *newbucket;
225 	lwpchan_entry_t *ent;
226 	lwpchan_entry_t *next;
227 	uint_t count;
228 
229 	ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
230 
231 	lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
232 	lcp->lwpchan_bits = bits;
233 	lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
234 	lcp->lwpchan_mask = lcp->lwpchan_size - 1;
235 	lcp->lwpchan_entries = 0;
236 	lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
237 		sizeof (lwpchan_hashbucket_t), KM_SLEEP);
238 	lcp->lwpchan_next_data = NULL;
239 
240 	mutex_enter(&p->p_lcp_lock);
241 	if ((old_lcp = p->p_lcp) != NULL) {
242 		if (old_lcp->lwpchan_bits >= bits) {
243 			/* someone beat us to it */
244 			mutex_exit(&p->p_lcp_lock);
245 			kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
246 				sizeof (lwpchan_hashbucket_t));
247 			kmem_free(lcp, sizeof (lwpchan_data_t));
248 			return;
249 		}
250 		/*
251 		 * Acquire all of the old hash table locks.
252 		 */
253 		hashbucket = old_lcp->lwpchan_cache;
254 		endbucket = hashbucket + old_lcp->lwpchan_size;
255 		for (; hashbucket < endbucket; hashbucket++)
256 			mutex_enter(&hashbucket->lwpchan_lock);
257 		/*
258 		 * Move all of the old hash table entries to the
259 		 * new hash table.  The new hash table has not yet
260 		 * been installed so we don't need any of its locks.
261 		 */
262 		count = 0;
263 		hashbucket = old_lcp->lwpchan_cache;
264 		for (; hashbucket < endbucket; hashbucket++) {
265 			ent = hashbucket->lwpchan_chain;
266 			while (ent != NULL) {
267 				next = ent->lwpchan_next;
268 				newbucket = lwpchan_bucket(lcp,
269 					(uintptr_t)ent->lwpchan_addr);
270 				ent->lwpchan_next = newbucket->lwpchan_chain;
271 				newbucket->lwpchan_chain = ent;
272 				ent = next;
273 				count++;
274 			}
275 			hashbucket->lwpchan_chain = NULL;
276 		}
277 		lcp->lwpchan_entries = count;
278 	}
279 
280 	/*
281 	 * Retire the old hash table.  We can't actually kmem_free() it
282 	 * now because someone may still have a pointer to it.  Instead,
283 	 * we link it onto the new hash table's list of retired hash tables.
284 	 * The new hash table is double the size of the previous one, so
285 	 * the total size of all retired hash tables is less than the size
286 	 * of the new one.  exit() and exec() free the retired hash tables
287 	 * (see lwpchan_destroy_cache(), below).
288 	 */
289 	lcp->lwpchan_next_data = old_lcp;
290 
291 	/*
292 	 * As soon as we store the new lcp, future locking operations will
293 	 * use it.  Therefore, we must ensure that all the state we've just
294 	 * established reaches global visibility before the new lcp does.
295 	 */
296 	membar_producer();
297 	p->p_lcp = lcp;
298 
299 	if (old_lcp != NULL) {
300 		/*
301 		 * Release all of the old hash table locks.
302 		 */
303 		hashbucket = old_lcp->lwpchan_cache;
304 		for (; hashbucket < endbucket; hashbucket++)
305 			mutex_exit(&hashbucket->lwpchan_lock);
306 	}
307 	mutex_exit(&p->p_lcp_lock);
308 }
309 
310 /*
311  * Deallocate the lwpchan cache, and any dynamically allocated mappings.
312  * Called when the process exits or execs.  All lwps except one have
313  * exited so we need no locks here.
314  */
315 void
316 lwpchan_destroy_cache(int exec)
317 {
318 	proc_t *p = curproc;
319 	lwpchan_hashbucket_t *hashbucket;
320 	lwpchan_hashbucket_t *endbucket;
321 	lwpchan_data_t *lcp;
322 	lwpchan_entry_t *ent;
323 	lwpchan_entry_t *next;
324 	uint16_t lockflg;
325 
326 	lcp = p->p_lcp;
327 	p->p_lcp = NULL;
328 
329 	lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
330 	hashbucket = lcp->lwpchan_cache;
331 	endbucket = hashbucket + lcp->lwpchan_size;
332 	for (; hashbucket < endbucket; hashbucket++) {
333 		ent = hashbucket->lwpchan_chain;
334 		hashbucket->lwpchan_chain = NULL;
335 		while (ent != NULL) {
336 			next = ent->lwpchan_next;
337 			if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
338 			    (ent->lwpchan_type & LOCK_ROBUST))
339 				lwp_mutex_cleanup(ent, lockflg);
340 			kmem_free(ent, sizeof (*ent));
341 			ent = next;
342 		}
343 	}
344 
345 	while (lcp != NULL) {
346 		lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
347 		kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
348 			sizeof (lwpchan_hashbucket_t));
349 		kmem_free(lcp, sizeof (lwpchan_data_t));
350 		lcp = next_lcp;
351 	}
352 }
353 
354 /*
355  * Return zero when there is an entry in the lwpchan cache for the
356  * given process virtual address and non-zero when there is not.
357  * The returned non-zero value is the current length of the
358  * hash chain plus one.  The caller holds the hash bucket lock.
359  */
360 static uint_t
361 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
362 	lwpchan_hashbucket_t *hashbucket)
363 {
364 	lwpchan_entry_t *ent;
365 	uint_t count = 1;
366 
367 	for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
368 		if (ent->lwpchan_addr == addr) {
369 			if (ent->lwpchan_type != type ||
370 			    ent->lwpchan_pool != pool) {
371 				/*
372 				 * This shouldn't happen, but might if the
373 				 * process reuses its memory for different
374 				 * types of sync objects.  We test first
375 				 * to avoid grabbing the memory cache line.
376 				 */
377 				ent->lwpchan_type = (uint16_t)type;
378 				ent->lwpchan_pool = (uint16_t)pool;
379 			}
380 			*lwpchan = ent->lwpchan_lwpchan;
381 			return (0);
382 		}
383 		count++;
384 	}
385 	return (count);
386 }
387 
388 /*
389  * Return the cached lwpchan mapping if cached, otherwise insert
390  * a virtual address to lwpchan mapping into the cache.
391  */
392 static int
393 lwpchan_get_mapping(struct as *as, caddr_t addr,
394 	int type, lwpchan_t *lwpchan, int pool)
395 {
396 	proc_t *p = curproc;
397 	lwpchan_data_t *lcp;
398 	lwpchan_hashbucket_t *hashbucket;
399 	lwpchan_entry_t *ent;
400 	memid_t	memid;
401 	uint_t count;
402 	uint_t bits;
403 
404 top:
405 	/* initialize the lwpchan cache, if necesary */
406 	if ((lcp = p->p_lcp) == NULL) {
407 		lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
408 		goto top;
409 	}
410 	hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
411 	mutex_enter(&hashbucket->lwpchan_lock);
412 	if (lcp != p->p_lcp) {
413 		/* someone resized the lwpchan cache; start over */
414 		mutex_exit(&hashbucket->lwpchan_lock);
415 		goto top;
416 	}
417 	if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
418 		/* it's in the cache */
419 		mutex_exit(&hashbucket->lwpchan_lock);
420 		return (1);
421 	}
422 	mutex_exit(&hashbucket->lwpchan_lock);
423 	if (as_getmemid(as, addr, &memid) != 0)
424 		return (0);
425 	lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
426 	lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
427 	ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
428 	mutex_enter(&hashbucket->lwpchan_lock);
429 	if (lcp != p->p_lcp) {
430 		/* someone resized the lwpchan cache; start over */
431 		mutex_exit(&hashbucket->lwpchan_lock);
432 		kmem_free(ent, sizeof (*ent));
433 		goto top;
434 	}
435 	count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
436 	if (count == 0) {
437 		/* someone else added this entry to the cache */
438 		mutex_exit(&hashbucket->lwpchan_lock);
439 		kmem_free(ent, sizeof (*ent));
440 		return (1);
441 	}
442 	if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
443 	    (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
444 		/* hash chain too long; reallocate the hash table */
445 		mutex_exit(&hashbucket->lwpchan_lock);
446 		kmem_free(ent, sizeof (*ent));
447 		lwpchan_alloc_cache(p, bits + 1);
448 		goto top;
449 	}
450 	ent->lwpchan_addr = addr;
451 	ent->lwpchan_type = (uint16_t)type;
452 	ent->lwpchan_pool = (uint16_t)pool;
453 	ent->lwpchan_lwpchan = *lwpchan;
454 	ent->lwpchan_next = hashbucket->lwpchan_chain;
455 	hashbucket->lwpchan_chain = ent;
456 	atomic_add_32(&lcp->lwpchan_entries, 1);
457 	mutex_exit(&hashbucket->lwpchan_lock);
458 	return (1);
459 }
460 
461 /*
462  * Return a unique pair of identifiers that corresponds to a
463  * synchronization object's virtual address.  Process-shared
464  * sync objects usually get vnode/offset from as_getmemid().
465  */
466 static int
467 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
468 {
469 	/*
470 	 * If the lwp synch object is defined to be process-private,
471 	 * we just make the first field of the lwpchan be 'as' and
472 	 * the second field be the synch object's virtual address.
473 	 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
474 	 * The lwpchan cache is used only for process-shared objects.
475 	 */
476 	if (!(type & USYNC_PROCESS)) {
477 		lwpchan->lc_wchan0 = (caddr_t)as;
478 		lwpchan->lc_wchan = addr;
479 		return (1);
480 	}
481 
482 	return (lwpchan_get_mapping(as, addr, type, lwpchan, pool));
483 }
484 
/*
 * Put the current thread onto the sleep queue for 'lwpchan' and mark
 * it and its lwp as sleeping.  No call that switches away from the
 * thread appears here; that is left to the caller.
 */
static void
lwp_block(lwpchan_t *lwpchan)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	sleepq_head_t *sqh;

	thread_lock(t);
	/* record what we are sleeping on and which ops vector applies */
	t->t_flag |= T_WAKEABLE;
	t->t_lwpchan = *lwpchan;
	t->t_sobj_ops = &lwp_sobj_ops;
	t->t_release = 0;
	sqh = lwpsqhash(lwpchan);
	disp_lock_enter_high(&sqh->sq_lock);
	CL_SLEEP(t);
	DTRACE_SCHED(sleep);
	/* change the thread's state to sleeping, passing the queue lock */
	THREAD_SLEEP(t, &sqh->sq_lock);
	sleepq_insert(&sqh->sq_queue, t);
	thread_unlock(t);
	/* per-lwp bookkeeping: asleep, no pending abort, one more vcsw */
	lwp->lwp_asleep = 1;
	lwp->lwp_sysabort = 0;
	lwp->lwp_ru.nvcsw++;
	(void) new_mstate(curthread, LMS_SLEEP);
}
509 
/*
 * Owner entry of the lwp_sobj_pi_ops vector: return the thread that
 * currently owns the given upimutex.
 */
static kthread_t *
lwpsobj_pi_owner(upimutex_t *up)
{
	return (up->upi_owner);
}
515 
516 static struct upimutex *
517 upi_get(upib_t *upibp, lwpchan_t *lcp)
518 {
519 	struct upimutex *upip;
520 
521 	for (upip = upibp->upib_first; upip != NULL;
522 	    upip = upip->upi_nextchain) {
523 		if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
524 		    upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
525 			break;
526 	}
527 	return (upip);
528 }
529 
/*
 * Link 'upimutex' onto the chain of bucket 'upibp'.
 * The caller must hold upibp->upib_lock.
 */
static void
upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
{
	ASSERT(MUTEX_HELD(&upibp->upib_lock));

	/*
	 * Insert upimutex at front of list. Maybe a bit unfair
	 * but assume that not many lwpchans hash to the same
	 * upimutextab bucket, i.e. the list of upimutexes from
	 * upib_first is not too long.
	 */
	upimutex->upi_nextchain = upibp->upib_first;
	upibp->upib_first = upimutex;
}
544 
545 static void
546 upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
547 {
548 	struct upimutex **prev;
549 
550 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
551 
552 	prev = &upibp->upib_first;
553 	while (*prev != upimutex) {
554 		prev = &(*prev)->upi_nextchain;
555 	}
556 	*prev = upimutex->upi_nextchain;
557 	upimutex->upi_nextchain = NULL;
558 }
559 
560 /*
561  * Add upimutex to chain of upimutexes held by curthread.
562  * Returns number of upimutexes held by curthread.
563  */
564 static uint32_t
565 upi_mylist_add(struct upimutex *upimutex)
566 {
567 	kthread_t *t = curthread;
568 
569 	/*
570 	 * Insert upimutex at front of list of upimutexes owned by t. This
571 	 * would match typical LIFO order in which nested locks are acquired
572 	 * and released.
573 	 */
574 	upimutex->upi_nextowned = t->t_upimutex;
575 	t->t_upimutex = upimutex;
576 	t->t_nupinest++;
577 	ASSERT(t->t_nupinest > 0);
578 	return (t->t_nupinest);
579 }
580 
581 /*
582  * Delete upimutex from list of upimutexes owned by curthread.
583  */
584 static void
585 upi_mylist_del(struct upimutex *upimutex)
586 {
587 	kthread_t *t = curthread;
588 	struct upimutex **prev;
589 
590 	/*
591 	 * Since the order in which nested locks are acquired and released,
592 	 * is typically LIFO, and typical nesting levels are not too deep, the
593 	 * following should not be expensive in the general case.
594 	 */
595 	prev = &t->t_upimutex;
596 	while (*prev != upimutex) {
597 		prev = &(*prev)->upi_nextowned;
598 	}
599 	*prev = upimutex->upi_nextowned;
600 	upimutex->upi_nextowned = NULL;
601 	ASSERT(t->t_nupinest > 0);
602 	t->t_nupinest--;
603 }
604 
/*
 * Returns true (non-zero) if the upimutex is owned by curthread, and
 * zero otherwise.  Should be called only when upim points to kmem
 * which cannot disappear from underneath the caller: no lock is taken
 * here to keep the upimutex alive.
 */
static int
upi_owned(upimutex_t *upim)
{
	return (upim->upi_owner == curthread);
}
614 
615 /*
616  * Returns pointer to kernel object (upimutex_t *) if lp is owned.
617  */
618 static struct upimutex *
619 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
620 {
621 	lwpchan_t lwpchan;
622 	upib_t *upibp;
623 	struct upimutex *upimutex;
624 
625 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
626 	    &lwpchan, LWPCHAN_MPPOOL))
627 		return (NULL);
628 
629 	upibp = &UPI_CHAIN(lwpchan);
630 	mutex_enter(&upibp->upib_lock);
631 	upimutex = upi_get(upibp, &lwpchan);
632 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
633 		mutex_exit(&upibp->upib_lock);
634 		return (NULL);
635 	}
636 	mutex_exit(&upibp->upib_lock);
637 	return (upimutex);
638 }
639 
640 /*
641  * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
642  * no lock hand-off occurrs.
643  */
644 static void
645 upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
646 {
647 	turnstile_t *ts;
648 	upib_t *upibp;
649 	kthread_t *newowner;
650 
651 	upi_mylist_del(upimutex);
652 	upibp = upimutex->upi_upibp;
653 	mutex_enter(&upibp->upib_lock);
654 	if (upimutex->upi_waiter != 0) { /* if waiters */
655 		ts = turnstile_lookup(upimutex);
656 		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
657 			/* hand-off lock to highest prio waiter */
658 			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
659 			upimutex->upi_owner = newowner;
660 			if (ts->ts_waiters == 1)
661 				upimutex->upi_waiter = 0;
662 			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
663 			mutex_exit(&upibp->upib_lock);
664 			return;
665 		} else if (ts != NULL) {
666 			/* LOCK_NOTRECOVERABLE: wakeup all */
667 			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
668 		} else {
669 			/*
670 			 * Misleading w bit. Waiters might have been
671 			 * interrupted. No need to clear the w bit (upimutex
672 			 * will soon be freed). Re-calculate PI from existing
673 			 * waiters.
674 			 */
675 			turnstile_exit(upimutex);
676 			turnstile_pi_recalc();
677 		}
678 	}
679 	/*
680 	 * no waiters, or LOCK_NOTRECOVERABLE.
681 	 * remove from the bucket chain of upi mutexes.
682 	 * de-allocate kernel memory (upimutex).
683 	 */
684 	upi_chain_del(upimutex->upi_upibp, upimutex);
685 	mutex_exit(&upibp->upib_lock);
686 	kmem_free(upimutex, sizeof (upimutex_t));
687 }
688 
689 static int
690 lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
691 {
692 	label_t ljb;
693 	int error = 0;
694 	lwpchan_t lwpchan;
695 	uint16_t flag;
696 	upib_t *upibp;
697 	volatile struct upimutex *upimutex = NULL;
698 	turnstile_t *ts;
699 	uint32_t nupinest;
700 	volatile int upilocked = 0;
701 
702 	if (on_fault(&ljb)) {
703 		if (upilocked)
704 			upimutex_unlock((upimutex_t *)upimutex, 0);
705 		error = EFAULT;
706 		goto out;
707 	}
708 	/*
709 	 * The apparent assumption made in implementing other _lwp_* synch
710 	 * primitives, is that get_lwpchan() does not return a unique cookie
711 	 * for the case where 2 processes (one forked from the other) point
712 	 * at the same underlying object, which is typed USYNC_PROCESS, but
713 	 * mapped MAP_PRIVATE, since the object has not yet been written to,
714 	 * in the child process.
715 	 *
716 	 * Since get_lwpchan() has been fixed, it is not necessary to do the
717 	 * dummy writes to force a COW fault as in other places (which should
718 	 * be fixed).
719 	 */
720 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
721 	    &lwpchan, LWPCHAN_MPPOOL)) {
722 		error = EFAULT;
723 		goto out;
724 	}
725 	upibp = &UPI_CHAIN(lwpchan);
726 retry:
727 	mutex_enter(&upibp->upib_lock);
728 	upimutex = upi_get(upibp, &lwpchan);
729 	if (upimutex == NULL)  {
730 		/* lock available since lwpchan has no upimutex */
731 		upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
732 		upi_chain_add(upibp, (upimutex_t *)upimutex);
733 		upimutex->upi_owner = curthread; /* grab lock */
734 		upimutex->upi_upibp = upibp;
735 		upimutex->upi_vaddr = lp;
736 		upimutex->upi_lwpchan = lwpchan;
737 		mutex_exit(&upibp->upib_lock);
738 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
739 		upilocked = 1;
740 		fuword16_noerr(&lp->mutex_flag, &flag);
741 		if (nupinest > maxnestupimx &&
742 		    secpolicy_resource(CRED()) != 0) {
743 			upimutex_unlock((upimutex_t *)upimutex, flag);
744 			error = ENOMEM;
745 			goto out;
746 		}
747 		if (flag & LOCK_NOTRECOVERABLE) {
748 			/*
749 			 * Since the setting of LOCK_NOTRECOVERABLE
750 			 * was done under the high-level upi mutex,
751 			 * in lwp_upimutex_unlock(), this flag needs to
752 			 * be checked while holding the upi mutex.
753 			 * If set, this thread should return without
754 			 * the lock held, and with the right error code.
755 			 */
756 			upimutex_unlock((upimutex_t *)upimutex, flag);
757 			upilocked = 0;
758 			error = ENOTRECOVERABLE;
759 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
760 			if (flag & LOCK_OWNERDEAD)
761 				error = EOWNERDEAD;
762 			else if (type & USYNC_PROCESS_ROBUST)
763 				error = ELOCKUNMAPPED;
764 			else
765 				error = EOWNERDEAD;
766 		}
767 		goto out;
768 	}
769 	/*
770 	 * If a upimutex object exists, it must have an owner.
771 	 * This is due to lock hand-off, and release of upimutex when no
772 	 * waiters are present at unlock time,
773 	 */
774 	ASSERT(upimutex->upi_owner != NULL);
775 	if (upimutex->upi_owner == curthread) {
776 		/*
777 		 * The user wrapper can check if the mutex type is
778 		 * ERRORCHECK: if not, it should stall at user-level.
779 		 * If so, it should return the error code.
780 		 */
781 		mutex_exit(&upibp->upib_lock);
782 		error = EDEADLK;
783 		goto out;
784 	}
785 	if (try == UPIMUTEX_TRY) {
786 		mutex_exit(&upibp->upib_lock);
787 		error = EBUSY;
788 		goto out;
789 	}
790 	/*
791 	 * Block for the lock.
792 	 * Put the lwp in an orderly state for debugging.
793 	 * Calling prstop() has to be done here, and not in
794 	 * turnstile_block(), since the preceding call to
795 	 * turnstile_lookup() raises the PIL to a level
796 	 * at which calls to prstop() should not be made.
797 	 */
798 	if ((error = lwptp->lwpt_time_error) != 0) {
799 		/*
800 		 * The SUSV3 Posix spec is very clear that we
801 		 * should get no error from validating the
802 		 * timer until we would actually sleep.
803 		 */
804 		mutex_exit(&upibp->upib_lock);
805 		goto out;
806 	}
807 	prstop(PR_REQUESTED, 0);
808 	if (lwptp->lwpt_tsp != NULL) {
809 		/*
810 		 * If we successfully queue the timeout
811 		 * (lwp_timer_enqueue() returns zero),
812 		 * then don't drop t_delay_lock until we are
813 		 * on the sleep queue (in turnstile_block()).
814 		 * Otherwise we will get an immediate timeout
815 		 * when we attempt to sleep in turnstile_block().
816 		 */
817 		mutex_enter(&curthread->t_delay_lock);
818 		if (lwp_timer_enqueue(lwptp) != 0)
819 			mutex_exit(&curthread->t_delay_lock);
820 	}
821 	/*
822 	 * Now, set the waiter bit and block for the lock in turnstile_block().
823 	 * No need to preserve the previous wbit since a lock try is not
824 	 * attempted after setting the wait bit. Wait bit is set under
825 	 * the upib_lock, which is not released until the turnstile lock
826 	 * is acquired. Say, the upimutex is L:
827 	 *
828 	 * 1. upib_lock is held so the waiter does not have to retry L after
829 	 *    setting the wait bit: since the owner has to grab the upib_lock
830 	 *    to unlock L, it will certainly see the wait bit set.
831 	 * 2. upib_lock is not released until the turnstile lock is acquired.
832 	 *    This is the key to preventing a missed wake-up. Otherwise, the
833 	 *    owner could acquire the upib_lock, and the tc_lock, to call
834 	 *    turnstile_wakeup(). All this, before the waiter gets tc_lock
835 	 *    to sleep in turnstile_block(). turnstile_wakeup() will then not
836 	 *    find this waiter, resulting in the missed wakeup.
837 	 * 3. The upib_lock, being a kernel mutex, cannot be released while
838 	 *    holding the tc_lock (since mutex_exit() could need to acquire
839 	 *    the same tc_lock)...and so is held when calling turnstile_block().
840 	 *    The address of upib_lock is passed to turnstile_block() which
841 	 *    releases it after releasing all turnstile locks, and before going
842 	 *    to sleep in swtch().
843 	 * 4. The waiter value cannot be a count of waiters, because a waiter
844 	 *    can be interrupted. The interrupt occurs under the tc_lock, at
845 	 *    which point, the upib_lock cannot be locked, to decrement waiter
846 	 *    count. So, just treat the waiter state as a bit, not a count.
847 	 */
848 	ts = turnstile_lookup((upimutex_t *)upimutex);
849 	upimutex->upi_waiter = 1;
850 	error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
851 	    &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
852 	/*
853 	 * Hand-off implies that we wakeup holding the lock, except when:
854 	 *	- deadlock is detected
855 	 *	- lock is not recoverable
856 	 *	- we got an interrupt or timeout
857 	 * If we wake up due to an interrupt or timeout, we may
858 	 * or may not be holding the lock due to mutex hand-off.
859 	 * Use lwp_upimutex_owned() to check if we do hold the lock.
860 	 */
861 	if (error != 0) {
862 		if ((error == EINTR || error == ETIME) &&
863 		    (upimutex = lwp_upimutex_owned(lp, type))) {
864 			/*
865 			 * Unlock and return - the re-startable syscall will
866 			 * try the lock again if we got EINTR.
867 			 */
868 			(void) upi_mylist_add((upimutex_t *)upimutex);
869 			upimutex_unlock((upimutex_t *)upimutex, 0);
870 		}
871 		/*
872 		 * The only other possible error is EDEADLK.  If so, upimutex
873 		 * is valid, since its owner is deadlocked with curthread.
874 		 */
875 		ASSERT(error == EINTR || error == ETIME ||
876 		    (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
877 		ASSERT(!lwp_upimutex_owned(lp, type));
878 		goto out;
879 	}
880 	if (lwp_upimutex_owned(lp, type)) {
881 		ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
882 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
883 		upilocked = 1;
884 	}
885 	/*
886 	 * Now, need to read the user-level lp->mutex_flag to do the following:
887 	 *
888 	 * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED
889 	 *   should be returned.
890 	 * - if lock isn't held, check if ENOTRECOVERABLE should
891 	 *   be returned.
892 	 *
893 	 * Now, either lp->mutex_flag is readable or it's not. If not
894 	 * readable, the on_fault path will cause a return with EFAULT
895 	 * as it should.  If it is readable, the state of the flag
896 	 * encodes the robustness state of the lock:
897 	 *
898 	 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD
899 	 * or LOCK_UNMAPPED setting will influence the return code
900 	 * appropriately.  If the upimutex is not locked here, this
901 	 * could be due to a spurious wake-up or a NOTRECOVERABLE
902 	 * event.  The flag's setting can be used to distinguish
903 	 * between these two events.
904 	 */
905 	fuword16_noerr(&lp->mutex_flag, &flag);
906 	if (upilocked) {
907 		/*
908 		 * If the thread wakes up from turnstile_block with the lock
909 		 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
910 		 * since it would not have been handed-off the lock.
911 		 * So, no need to check for this case.
912 		 */
913 		if (nupinest > maxnestupimx &&
914 		    secpolicy_resource(CRED()) != 0) {
915 			upimutex_unlock((upimutex_t *)upimutex, flag);
916 			upilocked = 0;
917 			error = ENOMEM;
918 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
919 			if (flag & LOCK_OWNERDEAD)
920 				error = EOWNERDEAD;
921 			else if (type & USYNC_PROCESS_ROBUST)
922 				error = ELOCKUNMAPPED;
923 			else
924 				error = EOWNERDEAD;
925 		}
926 	} else {
927 		/*
928 		 * Wake-up without the upimutex held. Either this is a
929 		 * spurious wake-up (due to signals, forkall(), whatever), or
930 		 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
931 		 * of the mutex flag can be used to distinguish between the
932 		 * two events.
933 		 */
934 		if (flag & LOCK_NOTRECOVERABLE) {
935 			error = ENOTRECOVERABLE;
936 		} else {
937 			/*
938 			 * Here, the flag could be set to LOCK_OWNERDEAD or
939 			 * not. In both cases, this is a spurious wakeup,
940 			 * since the upi lock is not held, but the thread
941 			 * has returned from turnstile_block().
942 			 *
943 			 * The user flag could be LOCK_OWNERDEAD if, at the
944 			 * same time as curthread having been woken up
945 			 * spuriously, the owner (say Tdead) has died, marked
946 			 * the mutex flag accordingly, and handed off the lock
947 			 * to some other waiter (say Tnew). curthread just
948 			 * happened to read the flag while Tnew has yet to deal
949 			 * with the owner-dead event.
950 			 *
951 			 * In this event, curthread should retry the lock.
952 			 * If Tnew is able to cleanup the lock, curthread
953 			 * will eventually get the lock with a zero error code,
954 			 * If Tnew is unable to cleanup, its eventual call to
955 			 * unlock the lock will result in the mutex flag being
956 			 * set to LOCK_NOTRECOVERABLE, and the wake-up of
957 			 * all waiters, including curthread, which will then
958 			 * eventually return ENOTRECOVERABLE due to the above
959 			 * check.
960 			 *
961 			 * Of course, if the user-flag is not set with
962 			 * LOCK_OWNERDEAD, retrying is the thing to do, since
963 			 * this is definitely a spurious wakeup.
964 			 */
965 			goto retry;
966 		}
967 	}
968 
969 out:
970 	no_fault();
971 	return (error);
972 }
973 
974 
975 static int
976 lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
977 {
978 	label_t ljb;
979 	int error = 0;
980 	lwpchan_t lwpchan;
981 	uint16_t flag;
982 	upib_t *upibp;
983 	volatile struct upimutex *upimutex = NULL;
984 	volatile int upilocked = 0;
985 
986 	if (on_fault(&ljb)) {
987 		if (upilocked)
988 			upimutex_unlock((upimutex_t *)upimutex, 0);
989 		error = EFAULT;
990 		goto out;
991 	}
992 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
993 	    &lwpchan, LWPCHAN_MPPOOL)) {
994 		error = EFAULT;
995 		goto out;
996 	}
997 	upibp = &UPI_CHAIN(lwpchan);
998 	mutex_enter(&upibp->upib_lock);
999 	upimutex = upi_get(upibp, &lwpchan);
1000 	/*
1001 	 * If the lock is not held, or the owner is not curthread, return
1002 	 * error. The user-level wrapper can return this error or stall,
1003 	 * depending on whether mutex is of ERRORCHECK type or not.
1004 	 */
1005 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
1006 		mutex_exit(&upibp->upib_lock);
1007 		error = EPERM;
1008 		goto out;
1009 	}
1010 	mutex_exit(&upibp->upib_lock); /* release for user memory access */
1011 	upilocked = 1;
1012 	fuword16_noerr(&lp->mutex_flag, &flag);
1013 	if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1014 		/*
1015 		 * transition mutex to the LOCK_NOTRECOVERABLE state.
1016 		 */
1017 		flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
1018 		flag |= LOCK_NOTRECOVERABLE;
1019 		suword16_noerr(&lp->mutex_flag, flag);
1020 	}
1021 	if (type & USYNC_PROCESS)
1022 		suword32_noerr(&lp->mutex_ownerpid, 0);
1023 	upimutex_unlock((upimutex_t *)upimutex, flag);
1024 	upilocked = 0;
1025 out:
1026 	no_fault();
1027 	return (error);
1028 }
1029 
1030 /*
1031  * Clear the contents of a user-level mutex; return the flags.
1032  * Used only by upi_dead() and lwp_mutex_cleanup(), below.
1033  */
1034 static uint16_t
1035 lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg)
1036 {
1037 	uint16_t flag;
1038 
1039 	fuword16_noerr(&lp->mutex_flag, &flag);
1040 	if ((flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) == 0) {
1041 		flag |= lockflg;
1042 		suword16_noerr(&lp->mutex_flag, flag);
1043 	}
1044 	suword32_noerr((uint32_t *)&lp->mutex_owner, 0);
1045 	suword32_noerr((uint32_t *)&lp->mutex_owner + 1, 0);
1046 	suword32_noerr(&lp->mutex_ownerpid, 0);
1047 	suword8_noerr(&lp->mutex_rcount, 0);
1048 
1049 	return (flag);
1050 }
1051 
1052 /*
1053  * Mark user mutex state, corresponding to kernel upimutex,
1054  * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate
1055  */
1056 static int
1057 upi_dead(upimutex_t *upip, uint16_t lockflg)
1058 {
1059 	label_t ljb;
1060 	int error = 0;
1061 	lwp_mutex_t *lp;
1062 
1063 	if (on_fault(&ljb)) {
1064 		error = EFAULT;
1065 		goto out;
1066 	}
1067 
1068 	lp = upip->upi_vaddr;
1069 	(void) lwp_clear_mutex(lp, lockflg);
1070 	suword8_noerr(&lp->mutex_lockw, 0);
1071 out:
1072 	no_fault();
1073 	return (error);
1074 }
1075 
1076 /*
1077  * Unlock all upimutexes held by curthread, since curthread is dying.
1078  * For each upimutex, attempt to mark its corresponding user mutex object as
1079  * dead.
1080  */
1081 void
1082 upimutex_cleanup()
1083 {
1084 	kthread_t *t = curthread;
1085 	uint16_t lockflg = (ttoproc(t)->p_proc_flag & P_PR_EXEC)?
1086 	    LOCK_UNMAPPED : LOCK_OWNERDEAD;
1087 	struct upimutex *upip;
1088 
1089 	while ((upip = t->t_upimutex) != NULL) {
1090 		if (upi_dead(upip, lockflg) != 0) {
1091 			/*
1092 			 * If the user object associated with this upimutex is
1093 			 * unmapped, unlock upimutex with the
1094 			 * LOCK_NOTRECOVERABLE flag, so that all waiters are
1095 			 * woken up. Since user object is unmapped, it could
1096 			 * not be marked as dead or notrecoverable.
1097 			 * The waiters will now all wake up and return
1098 			 * ENOTRECOVERABLE, since they would find that the lock
1099 			 * has not been handed-off to them.
1100 			 * See lwp_upimutex_lock().
1101 			 */
1102 			upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
1103 		} else {
1104 			/*
1105 			 * The user object has been updated as dead.
1106 			 * Unlock the upimutex: if no waiters, upip kmem will
1107 			 * be freed. If there is a waiter, the lock will be
1108 			 * handed off. If exit() is in progress, each existing
1109 			 * waiter will successively get the lock, as owners
1110 			 * die, and each new owner will call this routine as
1111 			 * it dies. The last owner will free kmem, since
1112 			 * it will find the upimutex has no waiters. So,
1113 			 * eventually, the kmem is guaranteed to be freed.
1114 			 */
1115 			upimutex_unlock(upip, 0);
1116 		}
1117 		/*
1118 		 * Note that the call to upimutex_unlock() above will delete
1119 		 * upimutex from the t_upimutexes chain. And so the
1120 		 * while loop will eventually terminate.
1121 		 */
1122 	}
1123 }
1124 
1125 int
1126 lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp)
1127 {
1128 	kthread_t *t = curthread;
1129 	klwp_t *lwp = ttolwp(t);
1130 	proc_t *p = ttoproc(t);
1131 	lwp_timer_t lwpt;
1132 	caddr_t timedwait;
1133 	int error = 0;
1134 	int time_error;
1135 	clock_t tim = -1;
1136 	uchar_t waiters;
1137 	volatile int locked = 0;
1138 	volatile int watched = 0;
1139 	label_t ljb;
1140 	volatile uint8_t type = 0;
1141 	lwpchan_t lwpchan;
1142 	sleepq_head_t *sqh;
1143 	static int iswanted();
1144 	uint16_t flag;
1145 	int imm_timeout = 0;
1146 
1147 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1148 		return (set_errno(EFAULT));
1149 
1150 	timedwait = (caddr_t)tsp;
1151 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
1152 	    lwpt.lwpt_imm_timeout) {
1153 		imm_timeout = 1;
1154 		timedwait = NULL;
1155 	}
1156 
1157 	/*
1158 	 * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
1159 	 * this micro state is really a run state. If the thread indeed blocks,
1160 	 * this state becomes valid. If not, the state is converted back to
1161 	 * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
1162 	 * when blocking.
1163 	 */
1164 	(void) new_mstate(t, LMS_USER_LOCK);
1165 	if (on_fault(&ljb)) {
1166 		if (locked)
1167 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1168 		error = EFAULT;
1169 		goto out;
1170 	}
1171 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1172 	if (UPIMUTEX(type)) {
1173 		no_fault();
1174 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
1175 		if ((type & USYNC_PROCESS) &&
1176 		    (error == 0 ||
1177 		    error == EOWNERDEAD || error == ELOCKUNMAPPED))
1178 			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
1179 		if (tsp && !time_error)	/* copyout the residual time left */
1180 			error = lwp_timer_copyout(&lwpt, error);
1181 		if (error)
1182 			return (set_errno(error));
1183 		return (0);
1184 	}
1185 	/*
1186 	 * Force Copy-on-write fault if lwp_mutex_t object is
1187 	 * defined to be MAP_PRIVATE and it was initialized to
1188 	 * USYNC_PROCESS.
1189 	 */
1190 	suword8_noerr(&lp->mutex_type, type);
1191 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1192 	    &lwpchan, LWPCHAN_MPPOOL)) {
1193 		error = EFAULT;
1194 		goto out;
1195 	}
1196 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1197 	locked = 1;
1198 	if (type & LOCK_ROBUST) {
1199 		fuword16_noerr(&lp->mutex_flag, &flag);
1200 		if (flag & LOCK_NOTRECOVERABLE) {
1201 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1202 			error = ENOTRECOVERABLE;
1203 			goto out;
1204 		}
1205 	}
1206 	fuword8_noerr(&lp->mutex_waiters, &waiters);
1207 	suword8_noerr(&lp->mutex_waiters, 1);
1208 
1209 	/*
1210 	 * If watchpoints are set, they need to be restored, since
1211 	 * atomic accesses of memory such as the call to ulock_try()
1212 	 * below cannot be watched.
1213 	 */
1214 
1215 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1216 
1217 	while (!ulock_try(&lp->mutex_lockw)) {
1218 		if (time_error) {
1219 			/*
1220 			 * The SUSV3 Posix spec is very clear that we
1221 			 * should get no error from validating the
1222 			 * timer until we would actually sleep.
1223 			 */
1224 			error = time_error;
1225 			break;
1226 		}
1227 
1228 		if (watched) {
1229 			watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1230 			watched = 0;
1231 		}
1232 
1233 		/*
1234 		 * Put the lwp in an orderly state for debugging.
1235 		 */
1236 		prstop(PR_REQUESTED, 0);
1237 		if (timedwait) {
1238 			/*
1239 			 * If we successfully queue the timeout,
1240 			 * then don't drop t_delay_lock until
1241 			 * we are on the sleep queue (below).
1242 			 */
1243 			mutex_enter(&t->t_delay_lock);
1244 			if (lwp_timer_enqueue(&lwpt) != 0) {
1245 				mutex_exit(&t->t_delay_lock);
1246 				imm_timeout = 1;
1247 				timedwait = NULL;
1248 			}
1249 		}
1250 		lwp_block(&lwpchan);
1251 		/*
1252 		 * Nothing should happen to cause the lwp to go to
1253 		 * sleep again until after it returns from swtch().
1254 		 */
1255 		if (timedwait)
1256 			mutex_exit(&t->t_delay_lock);
1257 		locked = 0;
1258 		lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1259 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
1260 			setrun(t);
1261 		swtch();
1262 		t->t_flag &= ~T_WAKEABLE;
1263 		if (timedwait)
1264 			tim = lwp_timer_dequeue(&lwpt);
1265 		setallwatch();
1266 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
1267 			error = EINTR;
1268 		else if (imm_timeout || (timedwait && tim == -1))
1269 			error = ETIME;
1270 		if (error) {
1271 			lwp->lwp_asleep = 0;
1272 			lwp->lwp_sysabort = 0;
1273 			watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1274 			    S_WRITE);
1275 
1276 			/*
1277 			 * Need to re-compute waiters bit. The waiters field in
1278 			 * the lock is not reliable. Either of two things could
1279 			 * have occurred: no lwp may have called lwp_release()
1280 			 * for me but I have woken up due to a signal or
1281 			 * timeout.  In this case, the waiter bit is incorrect
1282 			 * since it is still set to 1, set above.
1283 			 * OR an lwp_release() did occur for some other lwp on
1284 			 * the same lwpchan. In this case, the waiter bit is
1285 			 * correct.  But which event occurred, one can't tell.
1286 			 * So, recompute.
1287 			 */
1288 			lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1289 			locked = 1;
1290 			sqh = lwpsqhash(&lwpchan);
1291 			disp_lock_enter(&sqh->sq_lock);
1292 			waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
1293 			disp_lock_exit(&sqh->sq_lock);
1294 			break;
1295 		}
1296 		lwp->lwp_asleep = 0;
1297 		watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1298 		    S_WRITE);
1299 		lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1300 		locked = 1;
1301 		fuword8_noerr(&lp->mutex_waiters, &waiters);
1302 		suword8_noerr(&lp->mutex_waiters, 1);
1303 		if (type & LOCK_ROBUST) {
1304 			fuword16_noerr(&lp->mutex_flag, &flag);
1305 			if (flag & LOCK_NOTRECOVERABLE) {
1306 				error = ENOTRECOVERABLE;
1307 				break;
1308 			}
1309 		}
1310 	}
1311 
1312 	if (t->t_mstate == LMS_USER_LOCK)
1313 		(void) new_mstate(t, LMS_SYSTEM);
1314 
1315 	if (error == 0) {
1316 		if (type & USYNC_PROCESS)
1317 			suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
1318 		if (type & LOCK_ROBUST) {
1319 			fuword16_noerr(&lp->mutex_flag, &flag);
1320 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1321 				if (flag & LOCK_OWNERDEAD)
1322 					error = EOWNERDEAD;
1323 				else if (type & USYNC_PROCESS_ROBUST)
1324 					error = ELOCKUNMAPPED;
1325 				else
1326 					error = EOWNERDEAD;
1327 			}
1328 		}
1329 	}
1330 	suword8_noerr(&lp->mutex_waiters, waiters);
1331 	locked = 0;
1332 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1333 out:
1334 	no_fault();
1335 	if (watched)
1336 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1337 	if (tsp && !time_error)		/* copyout the residual time left */
1338 		error = lwp_timer_copyout(&lwpt, error);
1339 	if (error)
1340 		return (set_errno(error));
1341 	return (0);
1342 }
1343 
1344 /*
1345  * Obsolete lwp_mutex_lock() interface, no longer called from libc.
1346  * libc now calls lwp_mutex_timedlock(lp, NULL).
1347  * This system call trap continues to exist solely for the benefit
1348  * of old statically-linked binaries from Solaris 9 and before.
1349  * It should be removed from the system when we no longer care
1350  * about such applications.
1351  */
1352 int
1353 lwp_mutex_lock(lwp_mutex_t *lp)
1354 {
1355 	return (lwp_mutex_timedlock(lp, NULL));
1356 }
1357 
1358 static int
1359 iswanted(kthread_t *t, lwpchan_t *lwpchan)
1360 {
1361 	/*
1362 	 * The caller holds the dispatcher lock on the sleep queue.
1363 	 */
1364 	while (t != NULL) {
1365 		if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1366 		    t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1367 			return (1);
1368 		t = t->t_link;
1369 	}
1370 	return (0);
1371 }
1372 
1373 /*
1374  * Return the highest priority thread sleeping on this lwpchan.
1375  */
1376 static kthread_t *
1377 lwp_queue_waiter(lwpchan_t *lwpchan)
1378 {
1379 	sleepq_head_t *sqh;
1380 	kthread_t *tp;
1381 
1382 	sqh = lwpsqhash(lwpchan);
1383 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1384 	for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
1385 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1386 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1387 			break;
1388 	}
1389 	disp_lock_exit(&sqh->sq_lock);
1390 	return (tp);
1391 }
1392 
1393 static int
1394 lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
1395 {
1396 	sleepq_head_t *sqh;
1397 	kthread_t *tp;
1398 	kthread_t **tpp;
1399 
1400 	sqh = lwpsqhash(lwpchan);
1401 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1402 	tpp = &sqh->sq_queue.sq_first;
1403 	while ((tp = *tpp) != NULL) {
1404 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1405 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1406 			/*
1407 			 * The following is typically false. It could be true
1408 			 * only if lwp_release() is called from
1409 			 * lwp_mutex_wakeup() after reading the waiters field
1410 			 * from memory in which the lwp lock used to be, but has
1411 			 * since been re-used to hold a lwp cv or lwp semaphore.
1412 			 * The thread "tp" found to match the lwp lock's wchan
1413 			 * is actually sleeping for the cv or semaphore which
1414 			 * now has the same wchan. In this case, lwp_release()
1415 			 * should return failure.
1416 			 */
1417 			if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
1418 				ASSERT(sync_type == 0);
1419 				/*
1420 				 * assert that this can happen only for mutexes
1421 				 * i.e. sync_type == 0, for correctly written
1422 				 * user programs.
1423 				 */
1424 				disp_lock_exit(&sqh->sq_lock);
1425 				return (0);
1426 			}
1427 			*waiters = iswanted(tp->t_link, lwpchan);
1428 			sleepq_unlink(tpp, tp);
1429 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1430 			tp->t_wchan0 = NULL;
1431 			tp->t_wchan = NULL;
1432 			tp->t_sobj_ops = NULL;
1433 			tp->t_release = 1;
1434 			THREAD_TRANSITION(tp);	/* drops sleepq lock */
1435 			CL_WAKEUP(tp);
1436 			thread_unlock(tp);	/* drop run queue lock */
1437 			return (1);
1438 		}
1439 		tpp = &tp->t_link;
1440 	}
1441 	*waiters = 0;
1442 	disp_lock_exit(&sqh->sq_lock);
1443 	return (0);
1444 }
1445 
1446 static void
1447 lwp_release_all(lwpchan_t *lwpchan)
1448 {
1449 	sleepq_head_t	*sqh;
1450 	kthread_t *tp;
1451 	kthread_t **tpp;
1452 
1453 	sqh = lwpsqhash(lwpchan);
1454 	disp_lock_enter(&sqh->sq_lock);		/* lock sleep q queue */
1455 	tpp = &sqh->sq_queue.sq_first;
1456 	while ((tp = *tpp) != NULL) {
1457 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1458 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1459 			sleepq_unlink(tpp, tp);
1460 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1461 			tp->t_wchan0 = NULL;
1462 			tp->t_wchan = NULL;
1463 			tp->t_sobj_ops = NULL;
1464 			CL_WAKEUP(tp);
1465 			thread_unlock_high(tp);	/* release run queue lock */
1466 		} else {
1467 			tpp = &tp->t_link;
1468 		}
1469 	}
1470 	disp_lock_exit(&sqh->sq_lock);		/* drop sleep q lock */
1471 }
1472 
1473 /*
1474  * unblock a lwp that is trying to acquire this mutex. the blocked
1475  * lwp resumes and retries to acquire the lock.
1476  */
1477 int
1478 lwp_mutex_wakeup(lwp_mutex_t *lp, int release_all)
1479 {
1480 	proc_t *p = ttoproc(curthread);
1481 	lwpchan_t lwpchan;
1482 	uchar_t waiters;
1483 	volatile int locked = 0;
1484 	volatile int watched = 0;
1485 	volatile uint8_t type = 0;
1486 	label_t ljb;
1487 	int error = 0;
1488 
1489 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1490 		return (set_errno(EFAULT));
1491 
1492 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1493 
1494 	if (on_fault(&ljb)) {
1495 		if (locked)
1496 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1497 		error = EFAULT;
1498 		goto out;
1499 	}
1500 	/*
1501 	 * Force Copy-on-write fault if lwp_mutex_t object is
1502 	 * defined to be MAP_PRIVATE, and type is USYNC_PROCESS
1503 	 */
1504 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1505 	suword8_noerr(&lp->mutex_type, type);
1506 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1507 	    &lwpchan, LWPCHAN_MPPOOL)) {
1508 		error = EFAULT;
1509 		goto out;
1510 	}
1511 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1512 	locked = 1;
1513 	/*
1514 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
1515 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
1516 	 * may fail.  If it fails, do not write into the waiter bit.
1517 	 * The call to lwp_release() might fail due to one of three reasons:
1518 	 *
1519 	 * 	1. due to the thread which set the waiter bit not actually
1520 	 *	   sleeping since it got the lock on the re-try. The waiter
1521 	 *	   bit will then be correctly updated by that thread. This
1522 	 *	   window may be closed by reading the wait bit again here
1523 	 *	   and not calling lwp_release() at all if it is zero.
1524 	 *	2. the thread which set the waiter bit and went to sleep
1525 	 *	   was woken up by a signal. This time, the waiter recomputes
1526 	 *	   the wait bit in the return with EINTR code.
1527 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
1528 	 *	   memory that has been re-used after the lock was dropped.
1529 	 *	   In this case, writing into the waiter bit would cause data
1530 	 *	   corruption.
1531 	 */
1532 	if (release_all)
1533 		lwp_release_all(&lwpchan);
1534 	else if (lwp_release(&lwpchan, &waiters, 0) == 1)
1535 		suword8_noerr(&lp->mutex_waiters, waiters);
1536 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1537 out:
1538 	no_fault();
1539 	if (watched)
1540 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1541 	if (error)
1542 		return (set_errno(error));
1543 	return (0);
1544 }
1545 
1546 /*
1547  * lwp_cond_wait() has four arguments, a pointer to a condition variable,
1548  * a pointer to a mutex, a pointer to a timespec for a timed wait and
1549  * a flag telling the kernel whether or not to honor the kernel/user
1550  * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
1551  * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
1552  * lwpchan, returned by get_lwpchan().  If the timespec pointer is non-NULL,
1553  * it is used an an in/out parameter.  On entry, it contains the relative
1554  * time until timeout.  On exit, we copyout the residual time left to it.
1555  */
1556 int
1557 lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
1558 {
1559 	kthread_t *t = curthread;
1560 	klwp_t *lwp = ttolwp(t);
1561 	proc_t *p = ttoproc(t);
1562 	lwp_timer_t lwpt;
1563 	lwpchan_t cv_lwpchan;
1564 	lwpchan_t m_lwpchan;
1565 	caddr_t timedwait;
1566 	volatile uint16_t type = 0;
1567 	volatile uint8_t mtype = 0;
1568 	uchar_t waiters;
1569 	volatile int error;
1570 	clock_t tim = -1;
1571 	volatile int locked = 0;
1572 	volatile int m_locked = 0;
1573 	volatile int cvwatched = 0;
1574 	volatile int mpwatched = 0;
1575 	label_t ljb;
1576 	volatile int no_lwpchan = 1;
1577 	int imm_timeout = 0;
1578 	int imm_unpark = 0;
1579 
1580 	if ((caddr_t)cv >= p->p_as->a_userlimit ||
1581 	    (caddr_t)mp >= p->p_as->a_userlimit)
1582 		return (set_errno(EFAULT));
1583 
1584 	timedwait = (caddr_t)tsp;
1585 	if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
1586 		return (set_errno(error));
1587 	if (lwpt.lwpt_imm_timeout) {
1588 		imm_timeout = 1;
1589 		timedwait = NULL;
1590 	}
1591 
1592 	(void) new_mstate(t, LMS_USER_LOCK);
1593 
1594 	if (on_fault(&ljb)) {
1595 		if (no_lwpchan) {
1596 			error = EFAULT;
1597 			goto out;
1598 		}
1599 		if (m_locked) {
1600 			m_locked = 0;
1601 			lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1602 		}
1603 		if (locked) {
1604 			locked = 0;
1605 			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1606 		}
1607 		/*
1608 		 * set up another on_fault() for a possible fault
1609 		 * on the user lock accessed at "efault"
1610 		 */
1611 		if (on_fault(&ljb)) {
1612 			if (m_locked) {
1613 				m_locked = 0;
1614 				lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1615 			}
1616 			goto out;
1617 		}
1618 		error = EFAULT;
1619 		goto efault;
1620 	}
1621 
1622 	/*
1623 	 * Force Copy-on-write fault if lwp_cond_t and lwp_mutex_t
1624 	 * objects are defined to be MAP_PRIVATE, and are USYNC_PROCESS
1625 	 */
1626 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
1627 	if (UPIMUTEX(mtype) == 0) {
1628 		suword8_noerr(&mp->mutex_type, mtype);
1629 		/* convert user level mutex, "mp", to a unique lwpchan */
1630 		/* check if mtype is ok to use below, instead of type from cv */
1631 		if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
1632 		    &m_lwpchan, LWPCHAN_MPPOOL)) {
1633 			error = EFAULT;
1634 			goto out;
1635 		}
1636 	}
1637 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1638 	suword16_noerr(&cv->cond_type, type);
1639 	/* convert user level condition variable, "cv", to a unique lwpchan */
1640 	if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
1641 	    &cv_lwpchan, LWPCHAN_CVPOOL)) {
1642 		error = EFAULT;
1643 		goto out;
1644 	}
1645 	no_lwpchan = 0;
1646 	cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1647 	if (UPIMUTEX(mtype) == 0)
1648 		mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
1649 		    S_WRITE);
1650 
1651 	/*
1652 	 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
1653 	 * with respect to a possible wakeup which is a result of either
1654 	 * an lwp_cond_signal() or an lwp_cond_broadcast().
1655 	 *
1656 	 * What's misleading, is that the lwp is put to sleep after the
1657 	 * condition variable's mutex is released.  This is OK as long as
1658 	 * the release operation is also done while holding lwpchan_lock.
1659 	 * The lwp is then put to sleep when the possibility of pagefaulting
1660 	 * or sleeping is completely eliminated.
1661 	 */
1662 	lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
1663 	locked = 1;
1664 	if (UPIMUTEX(mtype) == 0) {
1665 		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1666 		m_locked = 1;
1667 		suword8_noerr(&cv->cond_waiters_kernel, 1);
1668 		/*
1669 		 * unlock the condition variable's mutex. (pagefaults are
1670 		 * possible here.)
1671 		 */
1672 		ulock_clear(&mp->mutex_lockw);
1673 		fuword8_noerr(&mp->mutex_waiters, &waiters);
1674 		if (waiters != 0) {
1675 			/*
1676 			 * Given the locking of lwpchan_lock around the release
1677 			 * of the mutex and checking for waiters, the following
1678 			 * call to lwp_release() can fail ONLY if the lock
1679 			 * acquirer is interrupted after setting the waiter bit,
1680 			 * calling lwp_block() and releasing lwpchan_lock.
1681 			 * In this case, it could get pulled off the lwp sleep
1682 			 * q (via setrun()) before the following call to
1683 			 * lwp_release() occurs. In this case, the lock
1684 			 * requestor will update the waiter bit correctly by
1685 			 * re-evaluating it.
1686 			 */
1687 			if (lwp_release(&m_lwpchan, &waiters, 0) > 0)
1688 				suword8_noerr(&mp->mutex_waiters, waiters);
1689 		}
1690 		m_locked = 0;
1691 		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1692 	} else {
1693 		suword8_noerr(&cv->cond_waiters_kernel, 1);
1694 		error = lwp_upimutex_unlock(mp, mtype);
1695 		if (error) {	/* if the upimutex unlock failed */
1696 			locked = 0;
1697 			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1698 			goto out;
1699 		}
1700 	}
1701 	no_fault();
1702 
1703 	if (mpwatched) {
1704 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1705 		mpwatched = 0;
1706 	}
1707 	if (cvwatched) {
1708 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1709 		cvwatched = 0;
1710 	}
1711 
1712 	/*
1713 	 * Put the lwp in an orderly state for debugging.
1714 	 */
1715 	prstop(PR_REQUESTED, 0);
1716 	if (check_park && (!schedctl_is_park() || t->t_unpark)) {
1717 		/*
1718 		 * We received a signal at user-level before calling here
1719 		 * or another thread wants us to return immediately
1720 		 * with EINTR.  See lwp_unpark().
1721 		 */
1722 		imm_unpark = 1;
1723 		t->t_unpark = 0;
1724 		timedwait = NULL;
1725 	} else if (timedwait) {
1726 		/*
1727 		 * If we successfully queue the timeout,
1728 		 * then don't drop t_delay_lock until
1729 		 * we are on the sleep queue (below).
1730 		 */
1731 		mutex_enter(&t->t_delay_lock);
1732 		if (lwp_timer_enqueue(&lwpt) != 0) {
1733 			mutex_exit(&t->t_delay_lock);
1734 			imm_timeout = 1;
1735 			timedwait = NULL;
1736 		}
1737 	}
1738 	t->t_flag |= T_WAITCVSEM;
1739 	lwp_block(&cv_lwpchan);
1740 	/*
1741 	 * Nothing should happen to cause the lwp to go to sleep
1742 	 * until after it returns from swtch().
1743 	 */
1744 	if (timedwait)
1745 		mutex_exit(&t->t_delay_lock);
1746 	locked = 0;
1747 	lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1748 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
1749 	    (imm_timeout | imm_unpark))
1750 		setrun(t);
1751 	swtch();
1752 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
1753 	if (timedwait)
1754 		tim = lwp_timer_dequeue(&lwpt);
1755 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
1756 	    MUSTRETURN(p, t) || imm_unpark)
1757 		error = EINTR;
1758 	else if (imm_timeout || (timedwait && tim == -1))
1759 		error = ETIME;
1760 	lwp->lwp_asleep = 0;
1761 	lwp->lwp_sysabort = 0;
1762 	setallwatch();
1763 
1764 	if (t->t_mstate == LMS_USER_LOCK)
1765 		(void) new_mstate(t, LMS_SYSTEM);
1766 
1767 	if (tsp && check_park)		/* copyout the residual time left */
1768 		error = lwp_timer_copyout(&lwpt, error);
1769 
1770 	/* the mutex is reacquired by the caller on return to user level */
1771 	if (error) {
1772 		/*
1773 		 * If we were concurrently lwp_cond_signal()d and we
1774 		 * received a UNIX signal or got a timeout, then perform
1775 		 * another lwp_cond_signal() to avoid consuming the wakeup.
1776 		 */
1777 		if (t->t_release)
1778 			(void) lwp_cond_signal(cv);
1779 		return (set_errno(error));
1780 	}
1781 	return (0);
1782 
1783 efault:
1784 	/*
1785 	 * make sure that the user level lock is dropped before
1786 	 * returning to caller, since the caller always re-acquires it.
1787 	 */
1788 	if (UPIMUTEX(mtype) == 0) {
1789 		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1790 		m_locked = 1;
1791 		ulock_clear(&mp->mutex_lockw);
1792 		fuword8_noerr(&mp->mutex_waiters, &waiters);
1793 		if (waiters != 0) {
1794 			/*
1795 			 * See comment above on lock clearing and lwp_release()
1796 			 * success/failure.
1797 			 */
1798 			if (lwp_release(&m_lwpchan, &waiters, 0) > 0)
1799 				suword8_noerr(&mp->mutex_waiters, waiters);
1800 		}
1801 		m_locked = 0;
1802 		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1803 	} else {
1804 		(void) lwp_upimutex_unlock(mp, mtype);
1805 	}
1806 out:
1807 	no_fault();
1808 	if (mpwatched)
1809 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1810 	if (cvwatched)
1811 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1812 	if (t->t_mstate == LMS_USER_LOCK)
1813 		(void) new_mstate(t, LMS_SYSTEM);
1814 	return (set_errno(error));
1815 }
1816 
1817 /*
1818  * wakeup one lwp that's blocked on this condition variable.
1819  */
1820 int
1821 lwp_cond_signal(lwp_cond_t *cv)
1822 {
1823 	proc_t *p = ttoproc(curthread);
1824 	lwpchan_t lwpchan;
1825 	uchar_t waiters;
1826 	volatile uint16_t type = 0;
1827 	volatile int locked = 0;
1828 	volatile int watched = 0;
1829 	label_t ljb;
1830 	int error = 0;
1831 
1832 	if ((caddr_t)cv >= p->p_as->a_userlimit)
1833 		return (set_errno(EFAULT));
1834 
1835 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1836 
1837 	if (on_fault(&ljb)) {
1838 		if (locked)
1839 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1840 		error = EFAULT;
1841 		goto out;
1842 	}
1843 	/*
1844 	 * Force Copy-on-write fault if lwp_cond_t object is
1845 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1846 	 */
1847 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1848 	suword16_noerr(&cv->cond_type, type);
1849 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1850 	    &lwpchan, LWPCHAN_CVPOOL)) {
1851 		error = EFAULT;
1852 		goto out;
1853 	}
1854 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1855 	locked = 1;
1856 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1857 	if (waiters != 0) {
1858 		/*
1859 		 * The following call to lwp_release() might fail but it is
1860 		 * OK to write into the waiters bit below, since the memory
1861 		 * could not have been re-used or unmapped (for correctly
1862 		 * written user programs) as in the case of lwp_mutex_wakeup().
1863 		 * For an incorrect program, we should not care about data
1864 		 * corruption since this is just one instance of other places
1865 		 * where corruption can occur for such a program. Of course
1866 		 * if the memory is unmapped, normal fault recovery occurs.
1867 		 */
1868 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1869 		suword8_noerr(&cv->cond_waiters_kernel, waiters);
1870 	}
1871 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1872 out:
1873 	no_fault();
1874 	if (watched)
1875 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1876 	if (error)
1877 		return (set_errno(error));
1878 	return (0);
1879 }
1880 
1881 /*
1882  * wakeup every lwp that's blocked on this condition variable.
1883  */
1884 int
1885 lwp_cond_broadcast(lwp_cond_t *cv)
1886 {
1887 	proc_t *p = ttoproc(curthread);
1888 	lwpchan_t lwpchan;
1889 	volatile uint16_t type = 0;
1890 	volatile int locked = 0;
1891 	volatile int watched = 0;
1892 	label_t ljb;
1893 	uchar_t waiters;
1894 	int error = 0;
1895 
1896 	if ((caddr_t)cv >= p->p_as->a_userlimit)
1897 		return (set_errno(EFAULT));
1898 
1899 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1900 
1901 	if (on_fault(&ljb)) {
1902 		if (locked)
1903 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1904 		error = EFAULT;
1905 		goto out;
1906 	}
1907 	/*
1908 	 * Force Copy-on-write fault if lwp_cond_t object is
1909 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1910 	 */
1911 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1912 	suword16_noerr(&cv->cond_type, type);
1913 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1914 	    &lwpchan, LWPCHAN_CVPOOL)) {
1915 		error = EFAULT;
1916 		goto out;
1917 	}
1918 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1919 	locked = 1;
1920 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1921 	if (waiters != 0) {
1922 		lwp_release_all(&lwpchan);
1923 		suword8_noerr(&cv->cond_waiters_kernel, 0);
1924 	}
1925 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1926 out:
1927 	no_fault();
1928 	if (watched)
1929 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1930 	if (error)
1931 		return (set_errno(error));
1932 	return (0);
1933 }
1934 
1935 int
1936 lwp_sema_trywait(lwp_sema_t *sp)
1937 {
1938 	kthread_t *t = curthread;
1939 	proc_t *p = ttoproc(t);
1940 	label_t ljb;
1941 	volatile int locked = 0;
1942 	volatile int watched = 0;
1943 	volatile uint16_t type = 0;
1944 	int count;
1945 	lwpchan_t lwpchan;
1946 	uchar_t waiters;
1947 	int error = 0;
1948 
1949 	if ((caddr_t)sp >= p->p_as->a_userlimit)
1950 		return (set_errno(EFAULT));
1951 
1952 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1953 
1954 	if (on_fault(&ljb)) {
1955 		if (locked)
1956 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1957 		error = EFAULT;
1958 		goto out;
1959 	}
1960 	/*
1961 	 * Force Copy-on-write fault if lwp_sema_t object is
1962 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
1963 	 */
1964 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
1965 	suword16_noerr((void *)&sp->sema_type, type);
1966 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
1967 	    &lwpchan, LWPCHAN_CVPOOL)) {
1968 		error = EFAULT;
1969 		goto out;
1970 	}
1971 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1972 	locked = 1;
1973 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
1974 	if (count == 0)
1975 		error = EBUSY;
1976 	else
1977 		suword32_noerr((void *)&sp->sema_count, --count);
1978 	if (count != 0) {
1979 		fuword8_noerr(&sp->sema_waiters, &waiters);
1980 		if (waiters != 0) {
1981 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1982 			suword8_noerr(&sp->sema_waiters, waiters);
1983 		}
1984 	}
1985 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1986 out:
1987 	no_fault();
1988 	if (watched)
1989 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1990 	if (error)
1991 		return (set_errno(error));
1992 	return (0);
1993 }
1994 
1995 /*
1996  * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
1997  */
1998 int
1999 lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
2000 {
2001 	kthread_t *t = curthread;
2002 	klwp_t *lwp = ttolwp(t);
2003 	proc_t *p = ttoproc(t);
2004 	lwp_timer_t lwpt;
2005 	caddr_t timedwait;
2006 	clock_t tim = -1;
2007 	label_t ljb;
2008 	volatile int locked = 0;
2009 	volatile int watched = 0;
2010 	volatile uint16_t type = 0;
2011 	int count;
2012 	lwpchan_t lwpchan;
2013 	uchar_t waiters;
2014 	int error = 0;
2015 	int time_error;
2016 	int imm_timeout = 0;
2017 	int imm_unpark = 0;
2018 
2019 	if ((caddr_t)sp >= p->p_as->a_userlimit)
2020 		return (set_errno(EFAULT));
2021 
2022 	timedwait = (caddr_t)tsp;
2023 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2024 	    lwpt.lwpt_imm_timeout) {
2025 		imm_timeout = 1;
2026 		timedwait = NULL;
2027 	}
2028 
2029 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2030 
2031 	if (on_fault(&ljb)) {
2032 		if (locked)
2033 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2034 		error = EFAULT;
2035 		goto out;
2036 	}
2037 	/*
2038 	 * Force Copy-on-write fault if lwp_sema_t object is
2039 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
2040 	 */
2041 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
2042 	suword16_noerr((void *)&sp->sema_type, type);
2043 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
2044 	    &lwpchan, LWPCHAN_CVPOOL)) {
2045 		error = EFAULT;
2046 		goto out;
2047 	}
2048 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2049 	locked = 1;
2050 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2051 	while (error == 0 && count == 0) {
2052 		if (time_error) {
2053 			/*
2054 			 * The SUSV3 Posix spec is very clear that we
2055 			 * should get no error from validating the
2056 			 * timer until we would actually sleep.
2057 			 */
2058 			error = time_error;
2059 			break;
2060 		}
2061 		suword8_noerr(&sp->sema_waiters, 1);
2062 		if (watched)
2063 			watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2064 		/*
2065 		 * Put the lwp in an orderly state for debugging.
2066 		 */
2067 		prstop(PR_REQUESTED, 0);
2068 		if (check_park && (!schedctl_is_park() || t->t_unpark)) {
2069 			/*
2070 			 * We received a signal at user-level before calling
2071 			 * here or another thread wants us to return
2072 			 * immediately with EINTR.  See lwp_unpark().
2073 			 */
2074 			imm_unpark = 1;
2075 			t->t_unpark = 0;
2076 			timedwait = NULL;
2077 		} else if (timedwait) {
2078 			/*
2079 			 * If we successfully queue the timeout,
2080 			 * then don't drop t_delay_lock until
2081 			 * we are on the sleep queue (below).
2082 			 */
2083 			mutex_enter(&t->t_delay_lock);
2084 			if (lwp_timer_enqueue(&lwpt) != 0) {
2085 				mutex_exit(&t->t_delay_lock);
2086 				imm_timeout = 1;
2087 				timedwait = NULL;
2088 			}
2089 		}
2090 		t->t_flag |= T_WAITCVSEM;
2091 		lwp_block(&lwpchan);
2092 		/*
2093 		 * Nothing should happen to cause the lwp to sleep
2094 		 * again until after it returns from swtch().
2095 		 */
2096 		if (timedwait)
2097 			mutex_exit(&t->t_delay_lock);
2098 		locked = 0;
2099 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2100 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
2101 		    (imm_timeout | imm_unpark))
2102 			setrun(t);
2103 		swtch();
2104 		t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2105 		if (timedwait)
2106 			tim = lwp_timer_dequeue(&lwpt);
2107 		setallwatch();
2108 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
2109 		    MUSTRETURN(p, t) || imm_unpark)
2110 			error = EINTR;
2111 		else if (imm_timeout || (timedwait && tim == -1))
2112 			error = ETIME;
2113 		lwp->lwp_asleep = 0;
2114 		lwp->lwp_sysabort = 0;
2115 		watched = watch_disable_addr((caddr_t)sp,
2116 		    sizeof (*sp), S_WRITE);
2117 		lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2118 		locked = 1;
2119 		fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2120 	}
2121 	if (error == 0)
2122 		suword32_noerr((void *)&sp->sema_count, --count);
2123 	if (count != 0) {
2124 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2125 		suword8_noerr(&sp->sema_waiters, waiters);
2126 	}
2127 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2128 out:
2129 	no_fault();
2130 	if (watched)
2131 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2132 	if (tsp && check_park && !time_error)
2133 		error = lwp_timer_copyout(&lwpt, error);
2134 	if (error)
2135 		return (set_errno(error));
2136 	return (0);
2137 }
2138 
2139 /*
2140  * Obsolete lwp_sema_wait() interface, no longer called from libc.
2141  * libc now calls lwp_sema_timedwait().
2142  * This system call trap exists solely for the benefit of old
2143  * statically linked applications from Solaris 9 and before.
2144  * It should be removed when we no longer care about such applications.
2145  */
2146 int
2147 lwp_sema_wait(lwp_sema_t *sp)
2148 {
2149 	return (lwp_sema_timedwait(sp, NULL, 0));
2150 }
2151 
2152 int
2153 lwp_sema_post(lwp_sema_t *sp)
2154 {
2155 	proc_t *p = ttoproc(curthread);
2156 	label_t ljb;
2157 	volatile int locked = 0;
2158 	volatile int watched = 0;
2159 	volatile uint16_t type = 0;
2160 	int count;
2161 	lwpchan_t lwpchan;
2162 	uchar_t waiters;
2163 	int error = 0;
2164 
2165 	if ((caddr_t)sp >= p->p_as->a_userlimit)
2166 		return (set_errno(EFAULT));
2167 
2168 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2169 
2170 	if (on_fault(&ljb)) {
2171 		if (locked)
2172 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2173 		error = EFAULT;
2174 		goto out;
2175 	}
2176 	/*
2177 	 * Force Copy-on-write fault if lwp_sema_t object is
2178 	 * defined to be MAP_PRIVATE, and is USYNC_PROCESS.
2179 	 */
2180 	fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
2181 	suword16_noerr(&sp->sema_type, type);
2182 	if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
2183 	    &lwpchan, LWPCHAN_CVPOOL)) {
2184 		error = EFAULT;
2185 		goto out;
2186 	}
2187 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2188 	locked = 1;
2189 	fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
2190 	if (count == _SEM_VALUE_MAX)
2191 		error = EOVERFLOW;
2192 	else
2193 		suword32_noerr(&sp->sema_count, ++count);
2194 	if (count == 1) {
2195 		fuword8_noerr(&sp->sema_waiters, &waiters);
2196 		if (waiters) {
2197 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2198 			suword8_noerr(&sp->sema_waiters, waiters);
2199 		}
2200 	}
2201 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2202 out:
2203 	no_fault();
2204 	if (watched)
2205 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2206 	if (error)
2207 		return (set_errno(error));
2208 	return (0);
2209 }
2210 
/*
 * Bits kept in a thread's t_writer while it waits on a user-level
 * rwlock: the waiter wants the write lock / the lock was handed to it.
 */
#define	TRW_WANT_WRITE		0x1
#define	TRW_LOCK_GRANTED	0x2

/*
 * rd_wr request codes for lwp_rwlock_lock().  TRY_FLAG may be or'ed
 * into either lock kind to ask for a trylock.
 */
#define	READ_LOCK		0
#define	WRITE_LOCK		1
#define	TRY_FLAG		0x10
#define	READ_LOCK_TRY		(TRY_FLAG | READ_LOCK)
#define	WRITE_LOCK_TRY		(TRY_FLAG | WRITE_LOCK)
2219 
2220 /*
2221  * Release one writer or one or more readers. Compute the rwstate word to
2222  * reflect the new state of the queue. For a safe hand-off we copy the new
2223  * rwstate value back to userland before we wake any of the new lock holders.
2224  *
2225  * Note that sleepq_insert() implements a prioritized FIFO (with writers
2226  * being given precedence over readers of the same priority).
2227  *
2228  * If the first thread is a reader we scan the queue releasing all readers
2229  * until we hit a writer or the end of the queue. If the first thread is a
2230  * writer we still need to check for another writer.
2231  */
2232 void
2233 lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
2234 {
2235 	sleepq_head_t *sqh;
2236 	kthread_t *tp;
2237 	kthread_t **tpp;
2238 	kthread_t *tpnext;
2239 	kthread_t *wakelist = NULL;
2240 	uint32_t rwstate = 0;
2241 	int wcount = 0;
2242 	int rcount = 0;
2243 
2244 	sqh = lwpsqhash(lwpchan);
2245 	disp_lock_enter(&sqh->sq_lock);
2246 	tpp = &sqh->sq_queue.sq_first;
2247 	while ((tp = *tpp) != NULL) {
2248 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
2249 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
2250 			if (tp->t_writer & TRW_WANT_WRITE) {
2251 				if ((wcount++ == 0) && (rcount == 0)) {
2252 					rwstate |= URW_WRITE_LOCKED;
2253 
2254 					/* Just one writer to wake. */
2255 					sleepq_unlink(tpp, tp);
2256 					wakelist = tp;
2257 
2258 					/* tpp already set for next thread. */
2259 					continue;
2260 				} else {
2261 					rwstate |= URW_HAS_WAITERS;
2262 					/* We need look no further. */
2263 					break;
2264 				}
2265 			} else {
2266 				rcount++;
2267 				if (wcount == 0) {
2268 					rwstate++;
2269 
2270 					/* Add reader to wake list. */
2271 					sleepq_unlink(tpp, tp);
2272 					tp->t_link = wakelist;
2273 					wakelist = tp;
2274 
2275 					/* tpp already set for next thread. */
2276 					continue;
2277 				} else {
2278 					rwstate |= URW_HAS_WAITERS;
2279 					/* We need look no further. */
2280 					break;
2281 				}
2282 			}
2283 		}
2284 		tpp = &tp->t_link;
2285 	}
2286 
2287 	/* Copy the new rwstate back to userland. */
2288 	suword32_noerr(&rw->rwlock_readers, rwstate);
2289 
2290 	/* Wake the new lock holder(s) up. */
2291 	tp = wakelist;
2292 	while (tp != NULL) {
2293 		DTRACE_SCHED1(wakeup, kthread_t *, tp);
2294 		tp->t_wchan0 = NULL;
2295 		tp->t_wchan = NULL;
2296 		tp->t_sobj_ops = NULL;
2297 		tp->t_writer |= TRW_LOCK_GRANTED;
2298 		tpnext = tp->t_link;
2299 		tp->t_link = NULL;
2300 		CL_WAKEUP(tp);
2301 		thread_unlock_high(tp);
2302 		tp = tpnext;
2303 	}
2304 
2305 	disp_lock_exit(&sqh->sq_lock);
2306 }
2307 
2308 /*
2309  * We enter here holding the user-level mutex, which we must release before
2310  * returning or blocking. Based on lwp_cond_wait().
2311  */
2312 static int
2313 lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
2314 {
2315 	lwp_mutex_t *mp = NULL;
2316 	kthread_t *t = curthread;
2317 	kthread_t *tp;
2318 	klwp_t *lwp = ttolwp(t);
2319 	proc_t *p = ttoproc(t);
2320 	lwp_timer_t lwpt;
2321 	lwpchan_t lwpchan;
2322 	lwpchan_t mlwpchan;
2323 	caddr_t timedwait;
2324 	volatile uint16_t type = 0;
2325 	volatile uint8_t mtype = 0;
2326 	uchar_t mwaiters;
2327 	volatile int error = 0;
2328 	int time_error;
2329 	clock_t tim = -1;
2330 	volatile int locked = 0;
2331 	volatile int mlocked = 0;
2332 	volatile int watched = 0;
2333 	volatile int mwatched = 0;
2334 	label_t ljb;
2335 	volatile int no_lwpchan = 1;
2336 	int imm_timeout = 0;
2337 	int try_flag;
2338 	uint32_t rwstate;
2339 	int acquired = 0;
2340 
2341 	/* We only check rw because the mutex is included in it. */
2342 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2343 		return (set_errno(EFAULT));
2344 
2345 	/* We must only report this error if we are about to sleep (later). */
2346 	timedwait = (caddr_t)tsp;
2347 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2348 	    lwpt.lwpt_imm_timeout) {
2349 		imm_timeout = 1;
2350 		timedwait = NULL;
2351 	}
2352 
2353 	(void) new_mstate(t, LMS_USER_LOCK);
2354 
2355 	if (on_fault(&ljb)) {
2356 		if (no_lwpchan) {
2357 			error = EFAULT;
2358 			goto out_nodrop;
2359 		}
2360 		if (mlocked) {
2361 			mlocked = 0;
2362 			lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2363 		}
2364 		if (locked) {
2365 			locked = 0;
2366 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2367 		}
2368 		/*
2369 		 * Set up another on_fault() for a possible fault
2370 		 * on the user lock accessed at "out_drop".
2371 		 */
2372 		if (on_fault(&ljb)) {
2373 			if (mlocked) {
2374 				mlocked = 0;
2375 				lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2376 			}
2377 			error = EFAULT;
2378 			goto out_nodrop;
2379 		}
2380 		error = EFAULT;
2381 		goto out_nodrop;
2382 	}
2383 
2384 	/* Process rd_wr (including sanity check). */
2385 	try_flag = (rd_wr & TRY_FLAG);
2386 	rd_wr &= ~TRY_FLAG;
2387 	if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
2388 		error = EINVAL;
2389 		goto out_nodrop;
2390 	}
2391 
2392 	/* We can only continue for simple USYNC_PROCESS locks. */
2393 	mp = &rw->mutex;
2394 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
2395 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2396 	if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
2397 		error = EINVAL;
2398 		goto out_nodrop;
2399 	}
2400 
2401 	/* Force Copy-on-write fault incase objects are MAP_PRIVATE. */
2402 	suword8_noerr(&mp->mutex_type, mtype);
2403 	suword16_noerr(&rw->rwlock_type, type);
2404 
2405 	/* Convert user level mutex, "mp", to a unique lwpchan. */
2406 	if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
2407 	    &mlwpchan, LWPCHAN_MPPOOL)) {
2408 		error = EFAULT;
2409 		goto out_nodrop;
2410 	}
2411 
2412 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2413 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2414 	    &lwpchan, LWPCHAN_CVPOOL)) {
2415 		error = EFAULT;
2416 		goto out_nodrop;
2417 	}
2418 
2419 	no_lwpchan = 0;
2420 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2421 	mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2422 
2423 	/*
2424 	 * lwpchan_lock() ensures that the calling LWP is put to sleep
2425 	 * atomically with respect to a possible wakeup which is a result
2426 	 * of lwp_rwlock_unlock().
2427 	 *
2428 	 * What's misleading is that the LWP is put to sleep after the
2429 	 * rwlock's mutex is released. This is OK as long as the release
2430 	 * operation is also done while holding mlwpchan. The LWP is then
2431 	 * put to sleep when the possibility of pagefaulting or sleeping
2432 	 * has been completely eliminated.
2433 	 */
2434 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2435 	locked = 1;
2436 	lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2437 	mlocked = 1;
2438 
2439 	/*
2440 	 * Fetch the current rwlock state.
2441 	 *
2442 	 * The possibility of spurious wake-ups or killed waiters means
2443 	 * rwstate's URW_HAS_WAITERS bit may indicate false positives.
2444 	 * We only fix these if they are important to us.
2445 	 *
2446 	 * Although various error states can be observed here (e.g. the lock
2447 	 * is not held, but there are waiters) we assume these are applicaton
2448 	 * errors and so we take no corrective action.
2449 	 */
2450 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2451 	/*
2452 	 * We cannot legitimately get here from user-level
2453 	 * without URW_HAS_WAITERS being set.
2454 	 * Set it now to guard against user-level error.
2455 	 */
2456 	rwstate |= URW_HAS_WAITERS;
2457 
2458 	/*
2459 	 * We can try only if the lock isn't held by a writer.
2460 	 */
2461 	if (!(rwstate & URW_WRITE_LOCKED)) {
2462 		tp = lwp_queue_waiter(&lwpchan);
2463 		if (tp == NULL) {
2464 			/*
2465 			 * Hmmm, rwstate indicates waiters but there are
2466 			 * none queued. This could just be the result of a
2467 			 * spurious wakeup, so let's ignore it.
2468 			 *
2469 			 * We now have a chance to acquire the lock
2470 			 * uncontended, but this is the last chance for
2471 			 * a writer to acquire the lock without blocking.
2472 			 */
2473 			if (rd_wr == READ_LOCK) {
2474 				rwstate++;
2475 				acquired = 1;
2476 			} else if ((rwstate & URW_READERS_MASK) == 0) {
2477 				rwstate |= URW_WRITE_LOCKED;
2478 				acquired = 1;
2479 			}
2480 		} else if (rd_wr == READ_LOCK) {
2481 			/*
2482 			 * This is the last chance for a reader to acquire
2483 			 * the lock now, but it can only do so if there is
2484 			 * no writer of equal or greater priority at the
2485 			 * head of the queue .
2486 			 *
2487 			 * It is also just possible that there is a reader
2488 			 * at the head of the queue. This may be the result
2489 			 * of a spurious wakeup or an application failure.
2490 			 * In this case we only acquire the lock if we have
2491 			 * equal or greater priority. It is not our job to
2492 			 * release spurious waiters.
2493 			 */
2494 			pri_t our_pri = DISP_PRIO(t);
2495 			pri_t his_pri = DISP_PRIO(tp);
2496 
2497 			if ((our_pri > his_pri) || ((our_pri == his_pri) &&
2498 			    !(tp->t_writer & TRW_WANT_WRITE))) {
2499 				rwstate++;
2500 				acquired = 1;
2501 			}
2502 		}
2503 	}
2504 
2505 	if (acquired || try_flag || time_error) {
2506 		/*
2507 		 * We're not going to block this time.
2508 		 */
2509 		suword32_noerr(&rw->rwlock_readers, rwstate);
2510 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2511 		locked = 0;
2512 
2513 		if (acquired) {
2514 			/*
2515 			 * Got the lock!
2516 			 */
2517 			error = 0;
2518 
2519 		} else if (try_flag) {
2520 			/*
2521 			 * We didn't get the lock and we're about to block.
2522 			 * If we're doing a trylock, return EBUSY instead.
2523 			 */
2524 			error = EBUSY;
2525 
2526 		} else if (time_error) {
2527 			/*
2528 			 * The SUSV3 POSIX spec is very clear that we should
2529 			 * get no error from validating the timer (above)
2530 			 * until we would actually sleep.
2531 			 */
2532 			error = time_error;
2533 		}
2534 
2535 		goto out_drop;
2536 	}
2537 
2538 	/*
2539 	 * We're about to block, so indicate what kind of waiter we are.
2540 	 */
2541 	t->t_writer = 0;
2542 	if (rd_wr == WRITE_LOCK)
2543 		t->t_writer = TRW_WANT_WRITE;
2544 	suword32_noerr(&rw->rwlock_readers, rwstate);
2545 
2546 	/*
2547 	 * Unlock the rwlock's mutex (pagefaults are possible here).
2548 	 */
2549 	suword32_noerr((uint32_t *)&mp->mutex_owner, 0);
2550 	suword32_noerr((uint32_t *)&mp->mutex_owner + 1, 0);
2551 	suword32_noerr(&mp->mutex_ownerpid, 0);
2552 	ulock_clear(&mp->mutex_lockw);
2553 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2554 	if (mwaiters != 0) {
2555 		/*
2556 		 * Given the locking of mlwpchan around the release of
2557 		 * the mutex and checking for waiters, the following
2558 		 * call to lwp_release() can fail ONLY if the lock
2559 		 * acquirer is interrupted after setting the waiter bit,
2560 		 * calling lwp_block() and releasing mlwpchan.
2561 		 * In this case, it could get pulled off the LWP sleep
2562 		 * queue (via setrun()) before the following call to
2563 		 * lwp_release() occurs, and the lock requestor will
2564 		 * update the waiter bit correctly by re-evaluating it.
2565 		 */
2566 		if (lwp_release(&mlwpchan, &mwaiters, 0) > 0)
2567 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2568 	}
2569 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2570 	mlocked = 0;
2571 	no_fault();
2572 
2573 	if (mwatched) {
2574 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2575 		mwatched = 0;
2576 	}
2577 	if (watched) {
2578 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2579 		watched = 0;
2580 	}
2581 
2582 	/*
2583 	 * Put the LWP in an orderly state for debugging.
2584 	 */
2585 	prstop(PR_REQUESTED, 0);
2586 	if (timedwait) {
2587 		/*
2588 		 * If we successfully queue the timeout,
2589 		 * then don't drop t_delay_lock until
2590 		 * we are on the sleep queue (below).
2591 		 */
2592 		mutex_enter(&t->t_delay_lock);
2593 		if (lwp_timer_enqueue(&lwpt) != 0) {
2594 			mutex_exit(&t->t_delay_lock);
2595 			imm_timeout = 1;
2596 			timedwait = NULL;
2597 		}
2598 	}
2599 	t->t_flag |= T_WAITCVSEM;
2600 	lwp_block(&lwpchan);
2601 
2602 	/*
2603 	 * Nothing should happen to cause the LWp to go to sleep until after
2604 	 * it returns from swtch().
2605 	 */
2606 	if (timedwait)
2607 		mutex_exit(&t->t_delay_lock);
2608 	locked = 0;
2609 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2610 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t))
2611 		setrun(t);
2612 	swtch();
2613 
2614 	/*
2615 	 * We're back, but we need to work out why. Were we interrupted? Did
2616 	 * we timeout? Were we granted the lock?
2617 	 */
2618 	error = EAGAIN;
2619 	acquired = (t->t_writer & TRW_LOCK_GRANTED);
2620 	t->t_writer = 0;
2621 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2622 	if (timedwait)
2623 		tim = lwp_timer_dequeue(&lwpt);
2624 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
2625 		error = EINTR;
2626 	else if (imm_timeout || (timedwait && tim == -1))
2627 		error = ETIME;
2628 	lwp->lwp_asleep = 0;
2629 	lwp->lwp_sysabort = 0;
2630 	setallwatch();
2631 
2632 	/*
2633 	 * If we were granted the lock we don't care about EINTR or ETIME.
2634 	 */
2635 	if (acquired)
2636 		error = 0;
2637 
2638 	if (t->t_mstate == LMS_USER_LOCK)
2639 		(void) new_mstate(t, LMS_SYSTEM);
2640 
2641 	if (error)
2642 		return (set_errno(error));
2643 	return (0);
2644 
2645 out_drop:
2646 	/*
2647 	 * Make sure that the user level lock is dropped before returning
2648 	 * to the caller.
2649 	 */
2650 	if (!mlocked) {
2651 		lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2652 		mlocked = 1;
2653 	}
2654 	suword32_noerr((uint32_t *)&mp->mutex_owner, 0);
2655 	suword32_noerr((uint32_t *)&mp->mutex_owner + 1, 0);
2656 	suword32_noerr(&mp->mutex_ownerpid, 0);
2657 	ulock_clear(&mp->mutex_lockw);
2658 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2659 	if (mwaiters != 0) {
2660 		/*
2661 		 * See comment above on lock clearing and lwp_release()
2662 		 * success/failure.
2663 		 */
2664 		if (lwp_release(&mlwpchan, &mwaiters, 0) > 0)
2665 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2666 	}
2667 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2668 	mlocked = 0;
2669 
2670 out_nodrop:
2671 	no_fault();
2672 	if (mwatched)
2673 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2674 	if (watched)
2675 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2676 	if (t->t_mstate == LMS_USER_LOCK)
2677 		(void) new_mstate(t, LMS_SYSTEM);
2678 	if (error)
2679 		return (set_errno(error));
2680 	return (0);
2681 }
2682 
2683 /*
2684  * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
2685  * we never drop the lock.
2686  */
2687 static int
2688 lwp_rwlock_unlock(lwp_rwlock_t *rw)
2689 {
2690 	kthread_t *t = curthread;
2691 	proc_t *p = ttoproc(t);
2692 	lwpchan_t lwpchan;
2693 	volatile uint16_t type = 0;
2694 	volatile int error = 0;
2695 	volatile int locked = 0;
2696 	volatile int watched = 0;
2697 	label_t ljb;
2698 	volatile int no_lwpchan = 1;
2699 	uint32_t rwstate;
2700 
2701 	/* We only check rw because the mutex is included in it. */
2702 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2703 		return (set_errno(EFAULT));
2704 
2705 	if (on_fault(&ljb)) {
2706 		if (no_lwpchan) {
2707 			error = EFAULT;
2708 			goto out_nodrop;
2709 		}
2710 		if (locked) {
2711 			locked = 0;
2712 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2713 		}
2714 		error = EFAULT;
2715 		goto out_nodrop;
2716 	}
2717 
2718 	/* We can only continue for simple USYNC_PROCESS locks. */
2719 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2720 	if (type != USYNC_PROCESS) {
2721 		error = EINVAL;
2722 		goto out_nodrop;
2723 	}
2724 
2725 	/* Force Copy-on-write fault incase objects are MAP_PRIVATE. */
2726 	suword16_noerr(&rw->rwlock_type, type);
2727 
2728 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2729 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2730 	    &lwpchan, LWPCHAN_CVPOOL)) {
2731 		error = EFAULT;
2732 		goto out_nodrop;
2733 	}
2734 
2735 	no_lwpchan = 0;
2736 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2737 
2738 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2739 	locked = 1;
2740 
2741 	/*
2742 	 * We can resolve multiple readers (except the last reader) here.
2743 	 * For the last reader or a writer we need lwp_rwlock_release(),
2744 	 * to which we also delegate the task of copying the new rwstate
2745 	 * back to userland (see the comment there).
2746 	 */
2747 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2748 	if (rwstate & URW_WRITE_LOCKED)
2749 		lwp_rwlock_release(&lwpchan, rw);
2750 	else if ((rwstate & URW_READERS_MASK) > 0) {
2751 		rwstate--;
2752 		if ((rwstate & URW_READERS_MASK) == 0)
2753 			lwp_rwlock_release(&lwpchan, rw);
2754 		else
2755 			suword32_noerr(&rw->rwlock_readers, rwstate);
2756 	}
2757 
2758 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2759 	locked = 0;
2760 	error = 0;
2761 
2762 out_nodrop:
2763 	no_fault();
2764 	if (watched)
2765 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2766 	if (error)
2767 		return (set_errno(error));
2768 	return (0);
2769 }
2770 
2771 int
2772 lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
2773 {
2774 	switch (subcode) {
2775 	case 0:
2776 		return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
2777 	case 1:
2778 		return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
2779 	case 2:
2780 		return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
2781 	case 3:
2782 		return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
2783 	case 4:
2784 		return (lwp_rwlock_unlock(rwlp));
2785 	}
2786 	return (set_errno(EINVAL));
2787 }
2788 
2789 /*
2790  * Return the owner of the user-level s-object.
2791  * Since we can't really do this, return NULL.
2792  */
2793 /* ARGSUSED */
2794 static kthread_t *
2795 lwpsobj_owner(caddr_t sobj)
2796 {
2797 	return ((kthread_t *)NULL);
2798 }
2799 
2800 /*
2801  * Wake up a thread asleep on a user-level synchronization
2802  * object.
2803  */
2804 static void
2805 lwp_unsleep(kthread_t *t)
2806 {
2807 	ASSERT(THREAD_LOCK_HELD(t));
2808 	if (t->t_wchan0 != NULL) {
2809 		sleepq_head_t *sqh;
2810 		sleepq_t *sqp = t->t_sleepq;
2811 
2812 		if (sqp != NULL) {
2813 			sqh = lwpsqhash(&t->t_lwpchan);
2814 			ASSERT(&sqh->sq_queue == sqp);
2815 			sleepq_unsleep(t);
2816 			disp_lock_exit_high(&sqh->sq_lock);
2817 			CL_SETRUN(t);
2818 			return;
2819 		}
2820 	}
2821 	panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
2822 }
2823 
2824 /*
2825  * Change the priority of a thread asleep on a user-level
2826  * synchronization object. To maintain proper priority order,
2827  * we:
2828  *	o dequeue the thread.
2829  *	o change its priority.
2830  *	o re-enqueue the thread.
2831  * Assumption: the thread is locked on entry.
2832  */
2833 static void
2834 lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
2835 {
2836 	ASSERT(THREAD_LOCK_HELD(t));
2837 	if (t->t_wchan0 != NULL) {
2838 		sleepq_t   *sqp = t->t_sleepq;
2839 
2840 		sleepq_dequeue(t);
2841 		*t_prip = pri;
2842 		sleepq_insert(sqp, t);
2843 	} else
2844 		panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
2845 }
2846 
2847 /*
2848  * Clean up a locked robust mutex
2849  */
2850 static void
2851 lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
2852 {
2853 	uint16_t flag;
2854 	uchar_t waiters;
2855 	label_t ljb;
2856 	pid_t owner_pid;
2857 	lwp_mutex_t *lp;
2858 	volatile int locked = 0;
2859 	volatile int watched = 0;
2860 	volatile struct upimutex *upimutex = NULL;
2861 	volatile int upilocked = 0;
2862 
2863 	ASSERT(ent->lwpchan_type & LOCK_ROBUST);
2864 
2865 	lp = (lwp_mutex_t *)ent->lwpchan_addr;
2866 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2867 	if (on_fault(&ljb)) {
2868 		if (locked)
2869 			lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2870 		if (upilocked)
2871 			upimutex_unlock((upimutex_t *)upimutex, 0);
2872 		goto out;
2873 	}
2874 	if (ent->lwpchan_type & USYNC_PROCESS) {
2875 		fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
2876 		if (owner_pid != curproc->p_pid)
2877 			goto out;
2878 	}
2879 	if (UPIMUTEX(ent->lwpchan_type)) {
2880 		lwpchan_t lwpchan = ent->lwpchan_lwpchan;
2881 		upib_t *upibp = &UPI_CHAIN(lwpchan);
2882 
2883 		mutex_enter(&upibp->upib_lock);
2884 		upimutex = upi_get(upibp, &lwpchan);
2885 		if (upimutex == NULL || upimutex->upi_owner != curthread) {
2886 			mutex_exit(&upibp->upib_lock);
2887 			goto out;
2888 		}
2889 		mutex_exit(&upibp->upib_lock);
2890 		upilocked = 1;
2891 		flag = lwp_clear_mutex(lp, lockflg);
2892 		suword8_noerr(&lp->mutex_lockw, 0);
2893 		upimutex_unlock((upimutex_t *)upimutex, flag);
2894 	} else {
2895 		lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2896 		locked = 1;
2897 		(void) lwp_clear_mutex(lp, lockflg);
2898 		ulock_clear(&lp->mutex_lockw);
2899 		fuword8_noerr(&lp->mutex_waiters, &waiters);
2900 		if (waiters && lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
2901 			suword8_noerr(&lp->mutex_waiters, waiters);
2902 		lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2903 	}
2904 out:
2905 	no_fault();
2906 	if (watched)
2907 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2908 }
2909 
2910 /*
2911  * Register a process-shared robust mutex in the lwpchan cache.
2912  */
2913 int
2914 lwp_mutex_register(lwp_mutex_t *lp)
2915 {
2916 	int error = 0;
2917 	volatile int watched;
2918 	label_t ljb;
2919 	uint8_t type;
2920 	lwpchan_t lwpchan;
2921 
2922 	if ((caddr_t)lp >= (caddr_t)USERLIMIT)
2923 		return (set_errno(EFAULT));
2924 
2925 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2926 
2927 	if (on_fault(&ljb)) {
2928 		error = EFAULT;
2929 	} else {
2930 		fuword8_noerr(&lp->mutex_type, &type);
2931 		if ((type & (USYNC_PROCESS|LOCK_ROBUST))
2932 		    != (USYNC_PROCESS|LOCK_ROBUST)) {
2933 			error = EINVAL;
2934 		} else {
2935 			/*
2936 			 * Force Copy-on-write fault if lwp_mutex_t object is
2937 			 * defined to be MAP_PRIVATE and it was initialized to
2938 			 * USYNC_PROCESS.
2939 			 */
2940 			suword8_noerr(&lp->mutex_type, type);
2941 			if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
2942 			    &lwpchan, LWPCHAN_MPPOOL))
2943 				error = EFAULT;
2944 		}
2945 	}
2946 	no_fault();
2947 	if (watched)
2948 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2949 	if (error)
2950 		return (set_errno(error));
2951 	return (0);
2952 }
2953 
2954 int
2955 lwp_mutex_trylock(lwp_mutex_t *lp)
2956 {
2957 	kthread_t *t = curthread;
2958 	proc_t *p = ttoproc(t);
2959 	int error = 0;
2960 	volatile int locked = 0;
2961 	volatile int watched = 0;
2962 	label_t ljb;
2963 	volatile uint8_t type = 0;
2964 	uint16_t flag;
2965 	lwpchan_t lwpchan;
2966 
2967 	if ((caddr_t)lp >= p->p_as->a_userlimit)
2968 		return (set_errno(EFAULT));
2969 
2970 	(void) new_mstate(t, LMS_USER_LOCK);
2971 
2972 	if (on_fault(&ljb)) {
2973 		if (locked)
2974 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
2975 		error = EFAULT;
2976 		goto out;
2977 	}
2978 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
2979 	if (UPIMUTEX(type)) {
2980 		no_fault();
2981 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
2982 		if ((type & USYNC_PROCESS) &&
2983 		    (error == 0 ||
2984 		    error == EOWNERDEAD || error == ELOCKUNMAPPED))
2985 			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
2986 		if (error)
2987 			return (set_errno(error));
2988 		return (0);
2989 	}
2990 	/*
2991 	 * Force Copy-on-write fault if lwp_mutex_t object is
2992 	 * defined to be MAP_PRIVATE and it was initialized to
2993 	 * USYNC_PROCESS.
2994 	 */
2995 	suword8_noerr(&lp->mutex_type, type);
2996 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
2997 	    &lwpchan, LWPCHAN_MPPOOL)) {
2998 		error = EFAULT;
2999 		goto out;
3000 	}
3001 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3002 	locked = 1;
3003 	if (type & LOCK_ROBUST) {
3004 		fuword16_noerr(&lp->mutex_flag, &flag);
3005 		if (flag & LOCK_NOTRECOVERABLE) {
3006 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3007 			error =  ENOTRECOVERABLE;
3008 			goto out;
3009 		}
3010 	}
3011 
3012 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3013 
3014 	if (!ulock_try(&lp->mutex_lockw))
3015 		error = EBUSY;
3016 	else {
3017 		if (type & USYNC_PROCESS)
3018 			suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
3019 		if (type & LOCK_ROBUST) {
3020 			fuword16_noerr(&lp->mutex_flag, &flag);
3021 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3022 				if (flag & LOCK_OWNERDEAD)
3023 					error = EOWNERDEAD;
3024 				else if (type & USYNC_PROCESS_ROBUST)
3025 					error = ELOCKUNMAPPED;
3026 				else
3027 					error = EOWNERDEAD;
3028 			}
3029 		}
3030 	}
3031 	locked = 0;
3032 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3033 out:
3034 
3035 	if (t->t_mstate == LMS_USER_LOCK)
3036 		(void) new_mstate(t, LMS_SYSTEM);
3037 
3038 	no_fault();
3039 	if (watched)
3040 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3041 	if (error)
3042 		return (set_errno(error));
3043 	return (0);
3044 }
3045 
3046 /*
3047  * unlock the mutex and unblock lwps that is trying to acquire this mutex.
3048  * the blocked lwp resumes and retries to acquire the lock.
3049  */
3050 int
3051 lwp_mutex_unlock(lwp_mutex_t *lp)
3052 {
3053 	proc_t *p = ttoproc(curthread);
3054 	lwpchan_t lwpchan;
3055 	uchar_t waiters;
3056 	volatile int locked = 0;
3057 	volatile int watched = 0;
3058 	volatile uint8_t type = 0;
3059 	label_t ljb;
3060 	uint16_t flag;
3061 	int error = 0;
3062 
3063 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3064 		return (set_errno(EFAULT));
3065 
3066 	if (on_fault(&ljb)) {
3067 		if (locked)
3068 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3069 		error = EFAULT;
3070 		goto out;
3071 	}
3072 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3073 	if (UPIMUTEX(type)) {
3074 		no_fault();
3075 		error = lwp_upimutex_unlock(lp, type);
3076 		if (error)
3077 			return (set_errno(error));
3078 		return (0);
3079 	}
3080 
3081 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3082 
3083 	/*
3084 	 * Force Copy-on-write fault if lwp_mutex_t object is
3085 	 * defined to be MAP_PRIVATE, and type is USYNC_PROCESS
3086 	 */
3087 	suword8_noerr(&lp->mutex_type, type);
3088 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3089 	    &lwpchan, LWPCHAN_MPPOOL)) {
3090 		error = EFAULT;
3091 		goto out;
3092 	}
3093 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3094 	locked = 1;
3095 	if (type & LOCK_ROBUST) {
3096 		fuword16_noerr(&lp->mutex_flag, &flag);
3097 		if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3098 			flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
3099 			flag |= LOCK_NOTRECOVERABLE;
3100 			suword16_noerr(&lp->mutex_flag, flag);
3101 		}
3102 	}
3103 	if (type & USYNC_PROCESS)
3104 		suword32_noerr(&lp->mutex_ownerpid, 0);
3105 	ulock_clear(&lp->mutex_lockw);
3106 	/*
3107 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
3108 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
3109 	 * may fail.  If it fails, do not write into the waiter bit.
3110 	 * The call to lwp_release() might fail due to one of three reasons:
3111 	 *
3112 	 * 	1. due to the thread which set the waiter bit not actually
3113 	 *	   sleeping since it got the lock on the re-try. The waiter
3114 	 *	   bit will then be correctly updated by that thread. This
3115 	 *	   window may be closed by reading the wait bit again here
3116 	 *	   and not calling lwp_release() at all if it is zero.
3117 	 *	2. the thread which set the waiter bit and went to sleep
3118 	 *	   was woken up by a signal. This time, the waiter recomputes
3119 	 *	   the wait bit in the return with EINTR code.
3120 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
3121 	 *	   memory that has been re-used after the lock was dropped.
3122 	 *	   In this case, writing into the waiter bit would cause data
3123 	 *	   corruption.
3124 	 */
3125 	fuword8_noerr(&lp->mutex_waiters, &waiters);
3126 	if (waiters) {
3127 		if ((type & LOCK_ROBUST) &&
3128 		    (flag & LOCK_NOTRECOVERABLE)) {
3129 			lwp_release_all(&lwpchan);
3130 			suword8_noerr(&lp->mutex_waiters, 0);
3131 		} else if (lwp_release(&lwpchan, &waiters, 0) == 1) {
3132 			suword8_noerr(&lp->mutex_waiters, waiters);
3133 		}
3134 	}
3135 
3136 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3137 out:
3138 	no_fault();
3139 	if (watched)
3140 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3141 	if (error)
3142 		return (set_errno(error));
3143 	return (0);
3144 }
3145