xref: /titanic_50/usr/src/uts/common/syscall/lwp_sobj.c (revision a24e89c4a1eec8361718d94a6275e6720643284e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved	*/
29 
30 #include <sys/param.h>
31 #include <sys/types.h>
32 #include <sys/sysmacros.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/user.h>
36 #include <sys/errno.h>
37 #include <sys/file.h>
38 #include <sys/proc.h>
39 #include <sys/prsystm.h>
40 #include <sys/kmem.h>
41 #include <sys/sobject.h>
42 #include <sys/fault.h>
43 #include <sys/procfs.h>
44 #include <sys/watchpoint.h>
45 #include <sys/time.h>
46 #include <sys/cmn_err.h>
47 #include <sys/machlock.h>
48 #include <sys/debug.h>
49 #include <sys/synch.h>
50 #include <sys/synch32.h>
51 #include <sys/mman.h>
52 #include <sys/class.h>
53 #include <sys/schedctl.h>
54 #include <sys/sleepq.h>
55 #include <sys/policy.h>
56 #include <sys/tnf_probe.h>
57 #include <sys/lwpchan_impl.h>
58 #include <sys/turnstile.h>
59 #include <sys/atomic.h>
60 #include <sys/lwp_timer_impl.h>
61 #include <sys/lwp_upimutex_impl.h>
62 #include <vm/as.h>
63 #include <sys/sdt.h>
64 
65 static kthread_t *lwpsobj_owner(caddr_t);
66 static void lwp_unsleep(kthread_t *t);
67 static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
68 static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);
69 static void lwp_mutex_unregister(void *uaddr);
70 
71 extern int lwp_cond_signal(lwp_cond_t *cv);
72 
73 /*
74  * Maximum number of user prio inheritance locks that can be held by a thread.
75  * Used to limit kmem for each thread. This is a per-thread limit that
76  * can be administered on a system wide basis (using /etc/system).
77  *
78  * Also, when a limit, say maxlwps is added for numbers of lwps within a
79  * process, the per-thread limit automatically becomes a process-wide limit
80  * of maximum number of held upi locks within a process:
81  *      maxheldupimx = maxnestupimx * maxlwps;
82  */
83 static uint32_t maxnestupimx = 2000;
84 
85 /*
86  * The sobj_ops vector exports a set of functions needed when a thread
87  * is asleep on a synchronization object of this type.
88  */
89 static sobj_ops_t lwp_sobj_ops = {
90 	SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
91 };
92 
93 static kthread_t *lwpsobj_pi_owner(upimutex_t *up);
94 
95 static sobj_ops_t lwp_sobj_pi_ops = {
96 	SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
97 	turnstile_change_pri
98 };
99 
100 static sleepq_head_t	lwpsleepq[NSLEEPQ];
101 upib_t			upimutextab[UPIMUTEX_TABSIZE];
102 
/*
 * Each lwpchan lock pool holds 2^LWPCHAN_LOCK_SHIFT (1024) locks.
 */
#define	LWPCHAN_LOCK_SHIFT	10
#define	LWPCHAN_LOCK_SIZE	(1 << LWPCHAN_LOCK_SHIFT)

/*
 * Hash a value derived from lc_wchan and lc_wchan0 to a lock index.
 * Those addresses are most likely 8-byte aligned, so the low-order
 * 3 bits are shifted off before two LWPCHAN_LOCK_SHIFT-wide slices are
 * folded together.  'pool' is either 0 or 1; a non-zero pool selects
 * the second group of LWPCHAN_LOCK_SIZE indices.
 */
#define	LWPCHAN_LOCK_HASH(X, pool)					\
	(((pool) ? LWPCHAN_LOCK_SIZE : 0) +				\
	((((X) >> 3) ^ ((X) >> (3 + LWPCHAN_LOCK_SHIFT))) &		\
	(LWPCHAN_LOCK_SIZE - 1)))
114 
115 static kmutex_t		lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
116 
117 /*
118  * Is this a POSIX threads user-level lock requiring priority inheritance?
119  */
120 #define	UPIMUTEX(type)	((type) & LOCK_PRIO_INHERIT)
121 
122 static sleepq_head_t *
123 lwpsqhash(lwpchan_t *lwpchan)
124 {
125 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
126 	return (&lwpsleepq[SQHASHINDEX(x)]);
127 }
128 
129 /*
130  * Lock an lwpchan.
131  * Keep this in sync with lwpchan_unlock(), below.
132  */
133 static void
134 lwpchan_lock(lwpchan_t *lwpchan, int pool)
135 {
136 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
137 	mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
138 }
139 
140 /*
141  * Unlock an lwpchan.
142  * Keep this in sync with lwpchan_lock(), above.
143  */
144 static void
145 lwpchan_unlock(lwpchan_t *lwpchan, int pool)
146 {
147 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
148 	mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
149 }
150 
151 /*
152  * Delete mappings from the lwpchan cache for pages that are being
153  * unmapped by as_unmap().  Given a range of addresses, "start" to "end",
154  * all mappings within the range are deleted from the lwpchan cache.
155  */
156 void
157 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
158 {
159 	lwpchan_data_t *lcp;
160 	lwpchan_hashbucket_t *hashbucket;
161 	lwpchan_hashbucket_t *endbucket;
162 	lwpchan_entry_t *ent;
163 	lwpchan_entry_t **prev;
164 	caddr_t addr;
165 
166 	mutex_enter(&p->p_lcp_lock);
167 	lcp = p->p_lcp;
168 	hashbucket = lcp->lwpchan_cache;
169 	endbucket = hashbucket + lcp->lwpchan_size;
170 	for (; hashbucket < endbucket; hashbucket++) {
171 		if (hashbucket->lwpchan_chain == NULL)
172 			continue;
173 		mutex_enter(&hashbucket->lwpchan_lock);
174 		prev = &hashbucket->lwpchan_chain;
175 		/* check entire chain */
176 		while ((ent = *prev) != NULL) {
177 			addr = ent->lwpchan_addr;
178 			if (start <= addr && addr < end) {
179 				*prev = ent->lwpchan_next;
180 				/*
181 				 * We do this only for the obsolete type
182 				 * USYNC_PROCESS_ROBUST.  Otherwise robust
183 				 * locks do not draw ELOCKUNMAPPED or
184 				 * EOWNERDEAD due to being unmapped.
185 				 */
186 				if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
187 				    (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
188 					lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
189 				/*
190 				 * If there is a user-level robust lock
191 				 * registration, mark it as invalid.
192 				 */
193 				if ((addr = ent->lwpchan_uaddr) != NULL)
194 					lwp_mutex_unregister(addr);
195 				kmem_free(ent, sizeof (*ent));
196 				atomic_add_32(&lcp->lwpchan_entries, -1);
197 			} else {
198 				prev = &ent->lwpchan_next;
199 			}
200 		}
201 		mutex_exit(&hashbucket->lwpchan_lock);
202 	}
203 	mutex_exit(&p->p_lcp_lock);
204 }
205 
206 /*
207  * Given an lwpchan cache pointer and a process virtual address,
208  * return a pointer to the corresponding lwpchan hash bucket.
209  */
210 static lwpchan_hashbucket_t *
211 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
212 {
213 	uint_t i;
214 
215 	/*
216 	 * All user-level sync object addresses are 8-byte aligned.
217 	 * Ignore the lowest 3 bits of the address and use the
218 	 * higher-order 2*lwpchan_bits bits for the hash index.
219 	 */
220 	addr >>= 3;
221 	i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
222 	return (lcp->lwpchan_cache + i);
223 }
224 
225 /*
226  * (Re)allocate the per-process lwpchan cache.
227  */
228 static void
229 lwpchan_alloc_cache(proc_t *p, uint_t bits)
230 {
231 	lwpchan_data_t *lcp;
232 	lwpchan_data_t *old_lcp;
233 	lwpchan_hashbucket_t *hashbucket;
234 	lwpchan_hashbucket_t *endbucket;
235 	lwpchan_hashbucket_t *newbucket;
236 	lwpchan_entry_t *ent;
237 	lwpchan_entry_t *next;
238 	uint_t count;
239 
240 	ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);
241 
242 	lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
243 	lcp->lwpchan_bits = bits;
244 	lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
245 	lcp->lwpchan_mask = lcp->lwpchan_size - 1;
246 	lcp->lwpchan_entries = 0;
247 	lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
248 	    sizeof (lwpchan_hashbucket_t), KM_SLEEP);
249 	lcp->lwpchan_next_data = NULL;
250 
251 	mutex_enter(&p->p_lcp_lock);
252 	if ((old_lcp = p->p_lcp) != NULL) {
253 		if (old_lcp->lwpchan_bits >= bits) {
254 			/* someone beat us to it */
255 			mutex_exit(&p->p_lcp_lock);
256 			kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
257 			    sizeof (lwpchan_hashbucket_t));
258 			kmem_free(lcp, sizeof (lwpchan_data_t));
259 			return;
260 		}
261 		/*
262 		 * Acquire all of the old hash table locks.
263 		 */
264 		hashbucket = old_lcp->lwpchan_cache;
265 		endbucket = hashbucket + old_lcp->lwpchan_size;
266 		for (; hashbucket < endbucket; hashbucket++)
267 			mutex_enter(&hashbucket->lwpchan_lock);
268 		/*
269 		 * Move all of the old hash table entries to the
270 		 * new hash table.  The new hash table has not yet
271 		 * been installed so we don't need any of its locks.
272 		 */
273 		count = 0;
274 		hashbucket = old_lcp->lwpchan_cache;
275 		for (; hashbucket < endbucket; hashbucket++) {
276 			ent = hashbucket->lwpchan_chain;
277 			while (ent != NULL) {
278 				next = ent->lwpchan_next;
279 				newbucket = lwpchan_bucket(lcp,
280 				    (uintptr_t)ent->lwpchan_addr);
281 				ent->lwpchan_next = newbucket->lwpchan_chain;
282 				newbucket->lwpchan_chain = ent;
283 				ent = next;
284 				count++;
285 			}
286 			hashbucket->lwpchan_chain = NULL;
287 		}
288 		lcp->lwpchan_entries = count;
289 	}
290 
291 	/*
292 	 * Retire the old hash table.  We can't actually kmem_free() it
293 	 * now because someone may still have a pointer to it.  Instead,
294 	 * we link it onto the new hash table's list of retired hash tables.
295 	 * The new hash table is double the size of the previous one, so
296 	 * the total size of all retired hash tables is less than the size
297 	 * of the new one.  exit() and exec() free the retired hash tables
298 	 * (see lwpchan_destroy_cache(), below).
299 	 */
300 	lcp->lwpchan_next_data = old_lcp;
301 
302 	/*
303 	 * As soon as we store the new lcp, future locking operations will
304 	 * use it.  Therefore, we must ensure that all the state we've just
305 	 * established reaches global visibility before the new lcp does.
306 	 */
307 	membar_producer();
308 	p->p_lcp = lcp;
309 
310 	if (old_lcp != NULL) {
311 		/*
312 		 * Release all of the old hash table locks.
313 		 */
314 		hashbucket = old_lcp->lwpchan_cache;
315 		for (; hashbucket < endbucket; hashbucket++)
316 			mutex_exit(&hashbucket->lwpchan_lock);
317 	}
318 	mutex_exit(&p->p_lcp_lock);
319 }
320 
321 /*
322  * Deallocate the lwpchan cache, and any dynamically allocated mappings.
323  * Called when the process exits or execs.  All lwps except one have
324  * exited so we need no locks here.
325  */
326 void
327 lwpchan_destroy_cache(int exec)
328 {
329 	proc_t *p = curproc;
330 	lwpchan_hashbucket_t *hashbucket;
331 	lwpchan_hashbucket_t *endbucket;
332 	lwpchan_data_t *lcp;
333 	lwpchan_entry_t *ent;
334 	lwpchan_entry_t *next;
335 	uint16_t lockflg;
336 
337 	lcp = p->p_lcp;
338 	p->p_lcp = NULL;
339 
340 	lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
341 	hashbucket = lcp->lwpchan_cache;
342 	endbucket = hashbucket + lcp->lwpchan_size;
343 	for (; hashbucket < endbucket; hashbucket++) {
344 		ent = hashbucket->lwpchan_chain;
345 		hashbucket->lwpchan_chain = NULL;
346 		while (ent != NULL) {
347 			next = ent->lwpchan_next;
348 			if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
349 			    (ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
350 			    == (USYNC_PROCESS | LOCK_ROBUST))
351 				lwp_mutex_cleanup(ent, lockflg);
352 			kmem_free(ent, sizeof (*ent));
353 			ent = next;
354 		}
355 	}
356 
357 	while (lcp != NULL) {
358 		lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
359 		kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
360 		    sizeof (lwpchan_hashbucket_t));
361 		kmem_free(lcp, sizeof (lwpchan_data_t));
362 		lcp = next_lcp;
363 	}
364 }
365 
366 /*
367  * Return zero when there is an entry in the lwpchan cache for the
368  * given process virtual address and non-zero when there is not.
369  * The returned non-zero value is the current length of the
370  * hash chain plus one.  The caller holds the hash bucket lock.
371  */
372 static uint_t
373 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
374 	lwpchan_hashbucket_t *hashbucket)
375 {
376 	lwpchan_entry_t *ent;
377 	uint_t count = 1;
378 
379 	for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
380 		if (ent->lwpchan_addr == addr) {
381 			if (ent->lwpchan_type != type ||
382 			    ent->lwpchan_pool != pool) {
383 				/*
384 				 * This shouldn't happen, but might if the
385 				 * process reuses its memory for different
386 				 * types of sync objects.  We test first
387 				 * to avoid grabbing the memory cache line.
388 				 */
389 				ent->lwpchan_type = (uint16_t)type;
390 				ent->lwpchan_pool = (uint16_t)pool;
391 			}
392 			*lwpchan = ent->lwpchan_lwpchan;
393 			return (0);
394 		}
395 		count++;
396 	}
397 	return (count);
398 }
399 
400 /*
401  * Return the cached lwpchan mapping if cached, otherwise insert
402  * a virtual address to lwpchan mapping into the cache.
403  */
404 static int
405 lwpchan_get_mapping(struct as *as, caddr_t addr, caddr_t uaddr,
406 	int type, lwpchan_t *lwpchan, int pool)
407 {
408 	proc_t *p = curproc;
409 	lwpchan_data_t *lcp;
410 	lwpchan_hashbucket_t *hashbucket;
411 	lwpchan_entry_t *ent;
412 	memid_t	memid;
413 	uint_t count;
414 	uint_t bits;
415 
416 top:
417 	/* initialize the lwpchan cache, if necesary */
418 	if ((lcp = p->p_lcp) == NULL) {
419 		lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
420 		goto top;
421 	}
422 	hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
423 	mutex_enter(&hashbucket->lwpchan_lock);
424 	if (lcp != p->p_lcp) {
425 		/* someone resized the lwpchan cache; start over */
426 		mutex_exit(&hashbucket->lwpchan_lock);
427 		goto top;
428 	}
429 	if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
430 		/* it's in the cache */
431 		mutex_exit(&hashbucket->lwpchan_lock);
432 		return (1);
433 	}
434 	mutex_exit(&hashbucket->lwpchan_lock);
435 	if (as_getmemid(as, addr, &memid) != 0)
436 		return (0);
437 	lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
438 	lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
439 	ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
440 	mutex_enter(&hashbucket->lwpchan_lock);
441 	if (lcp != p->p_lcp) {
442 		/* someone resized the lwpchan cache; start over */
443 		mutex_exit(&hashbucket->lwpchan_lock);
444 		kmem_free(ent, sizeof (*ent));
445 		goto top;
446 	}
447 	count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
448 	if (count == 0) {
449 		/* someone else added this entry to the cache */
450 		mutex_exit(&hashbucket->lwpchan_lock);
451 		kmem_free(ent, sizeof (*ent));
452 		return (1);
453 	}
454 	if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
455 	    (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
456 		/* hash chain too long; reallocate the hash table */
457 		mutex_exit(&hashbucket->lwpchan_lock);
458 		kmem_free(ent, sizeof (*ent));
459 		lwpchan_alloc_cache(p, bits + 1);
460 		goto top;
461 	}
462 	ent->lwpchan_addr = addr;
463 	ent->lwpchan_uaddr = uaddr;
464 	ent->lwpchan_type = (uint16_t)type;
465 	ent->lwpchan_pool = (uint16_t)pool;
466 	ent->lwpchan_lwpchan = *lwpchan;
467 	ent->lwpchan_next = hashbucket->lwpchan_chain;
468 	hashbucket->lwpchan_chain = ent;
469 	atomic_add_32(&lcp->lwpchan_entries, 1);
470 	mutex_exit(&hashbucket->lwpchan_lock);
471 	return (1);
472 }
473 
474 /*
475  * Return a unique pair of identifiers that corresponds to a
476  * synchronization object's virtual address.  Process-shared
477  * sync objects usually get vnode/offset from as_getmemid().
478  */
479 static int
480 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
481 {
482 	/*
483 	 * If the lwp synch object is defined to be process-private,
484 	 * we just make the first field of the lwpchan be 'as' and
485 	 * the second field be the synch object's virtual address.
486 	 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
487 	 * The lwpchan cache is used only for process-shared objects.
488 	 */
489 	if (!(type & USYNC_PROCESS)) {
490 		lwpchan->lc_wchan0 = (caddr_t)as;
491 		lwpchan->lc_wchan = addr;
492 		return (1);
493 	}
494 
495 	return (lwpchan_get_mapping(as, addr, NULL, type, lwpchan, pool));
496 }
497 
498 static void
499 lwp_block(lwpchan_t *lwpchan)
500 {
501 	kthread_t *t = curthread;
502 	klwp_t *lwp = ttolwp(t);
503 	sleepq_head_t *sqh;
504 
505 	thread_lock(t);
506 	t->t_flag |= T_WAKEABLE;
507 	t->t_lwpchan = *lwpchan;
508 	t->t_sobj_ops = &lwp_sobj_ops;
509 	t->t_release = 0;
510 	sqh = lwpsqhash(lwpchan);
511 	disp_lock_enter_high(&sqh->sq_lock);
512 	CL_SLEEP(t);
513 	DTRACE_SCHED(sleep);
514 	THREAD_SLEEP(t, &sqh->sq_lock);
515 	sleepq_insert(&sqh->sq_queue, t);
516 	thread_unlock(t);
517 	lwp->lwp_asleep = 1;
518 	lwp->lwp_sysabort = 0;
519 	lwp->lwp_ru.nvcsw++;
520 	(void) new_mstate(curthread, LMS_SLEEP);
521 }
522 
523 static kthread_t *
524 lwpsobj_pi_owner(upimutex_t *up)
525 {
526 	return (up->upi_owner);
527 }
528 
529 static struct upimutex *
530 upi_get(upib_t *upibp, lwpchan_t *lcp)
531 {
532 	struct upimutex *upip;
533 
534 	for (upip = upibp->upib_first; upip != NULL;
535 	    upip = upip->upi_nextchain) {
536 		if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
537 		    upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
538 			break;
539 	}
540 	return (upip);
541 }
542 
543 static void
544 upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
545 {
546 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
547 
548 	/*
549 	 * Insert upimutex at front of list. Maybe a bit unfair
550 	 * but assume that not many lwpchans hash to the same
551 	 * upimutextab bucket, i.e. the list of upimutexes from
552 	 * upib_first is not too long.
553 	 */
554 	upimutex->upi_nextchain = upibp->upib_first;
555 	upibp->upib_first = upimutex;
556 }
557 
558 static void
559 upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
560 {
561 	struct upimutex **prev;
562 
563 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
564 
565 	prev = &upibp->upib_first;
566 	while (*prev != upimutex) {
567 		prev = &(*prev)->upi_nextchain;
568 	}
569 	*prev = upimutex->upi_nextchain;
570 	upimutex->upi_nextchain = NULL;
571 }
572 
573 /*
574  * Add upimutex to chain of upimutexes held by curthread.
575  * Returns number of upimutexes held by curthread.
576  */
577 static uint32_t
578 upi_mylist_add(struct upimutex *upimutex)
579 {
580 	kthread_t *t = curthread;
581 
582 	/*
583 	 * Insert upimutex at front of list of upimutexes owned by t. This
584 	 * would match typical LIFO order in which nested locks are acquired
585 	 * and released.
586 	 */
587 	upimutex->upi_nextowned = t->t_upimutex;
588 	t->t_upimutex = upimutex;
589 	t->t_nupinest++;
590 	ASSERT(t->t_nupinest > 0);
591 	return (t->t_nupinest);
592 }
593 
594 /*
595  * Delete upimutex from list of upimutexes owned by curthread.
596  */
597 static void
598 upi_mylist_del(struct upimutex *upimutex)
599 {
600 	kthread_t *t = curthread;
601 	struct upimutex **prev;
602 
603 	/*
604 	 * Since the order in which nested locks are acquired and released,
605 	 * is typically LIFO, and typical nesting levels are not too deep, the
606 	 * following should not be expensive in the general case.
607 	 */
608 	prev = &t->t_upimutex;
609 	while (*prev != upimutex) {
610 		prev = &(*prev)->upi_nextowned;
611 	}
612 	*prev = upimutex->upi_nextowned;
613 	upimutex->upi_nextowned = NULL;
614 	ASSERT(t->t_nupinest > 0);
615 	t->t_nupinest--;
616 }
617 
618 /*
619  * Returns true if upimutex is owned. Should be called only when upim points
620  * to kmem which cannot disappear from underneath.
621  */
622 static int
623 upi_owned(upimutex_t *upim)
624 {
625 	return (upim->upi_owner == curthread);
626 }
627 
628 /*
629  * Returns pointer to kernel object (upimutex_t *) if lp is owned.
630  */
631 static struct upimutex *
632 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
633 {
634 	lwpchan_t lwpchan;
635 	upib_t *upibp;
636 	struct upimutex *upimutex;
637 
638 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
639 	    &lwpchan, LWPCHAN_MPPOOL))
640 		return (NULL);
641 
642 	upibp = &UPI_CHAIN(lwpchan);
643 	mutex_enter(&upibp->upib_lock);
644 	upimutex = upi_get(upibp, &lwpchan);
645 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
646 		mutex_exit(&upibp->upib_lock);
647 		return (NULL);
648 	}
649 	mutex_exit(&upibp->upib_lock);
650 	return (upimutex);
651 }
652 
653 /*
654  * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
655  * no lock hand-off occurrs.
656  */
657 static void
658 upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
659 {
660 	turnstile_t *ts;
661 	upib_t *upibp;
662 	kthread_t *newowner;
663 
664 	upi_mylist_del(upimutex);
665 	upibp = upimutex->upi_upibp;
666 	mutex_enter(&upibp->upib_lock);
667 	if (upimutex->upi_waiter != 0) { /* if waiters */
668 		ts = turnstile_lookup(upimutex);
669 		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
670 			/* hand-off lock to highest prio waiter */
671 			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
672 			upimutex->upi_owner = newowner;
673 			if (ts->ts_waiters == 1)
674 				upimutex->upi_waiter = 0;
675 			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
676 			mutex_exit(&upibp->upib_lock);
677 			return;
678 		} else if (ts != NULL) {
679 			/* LOCK_NOTRECOVERABLE: wakeup all */
680 			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
681 		} else {
682 			/*
683 			 * Misleading w bit. Waiters might have been
684 			 * interrupted. No need to clear the w bit (upimutex
685 			 * will soon be freed). Re-calculate PI from existing
686 			 * waiters.
687 			 */
688 			turnstile_exit(upimutex);
689 			turnstile_pi_recalc();
690 		}
691 	}
692 	/*
693 	 * no waiters, or LOCK_NOTRECOVERABLE.
694 	 * remove from the bucket chain of upi mutexes.
695 	 * de-allocate kernel memory (upimutex).
696 	 */
697 	upi_chain_del(upimutex->upi_upibp, upimutex);
698 	mutex_exit(&upibp->upib_lock);
699 	kmem_free(upimutex, sizeof (upimutex_t));
700 }
701 
702 static int
703 lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
704 {
705 	label_t ljb;
706 	int error = 0;
707 	lwpchan_t lwpchan;
708 	uint16_t flag;
709 	upib_t *upibp;
710 	volatile struct upimutex *upimutex = NULL;
711 	turnstile_t *ts;
712 	uint32_t nupinest;
713 	volatile int upilocked = 0;
714 
715 	if (on_fault(&ljb)) {
716 		if (upilocked)
717 			upimutex_unlock((upimutex_t *)upimutex, 0);
718 		error = EFAULT;
719 		goto out;
720 	}
721 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
722 	    &lwpchan, LWPCHAN_MPPOOL)) {
723 		error = EFAULT;
724 		goto out;
725 	}
726 	upibp = &UPI_CHAIN(lwpchan);
727 retry:
728 	mutex_enter(&upibp->upib_lock);
729 	upimutex = upi_get(upibp, &lwpchan);
730 	if (upimutex == NULL)  {
731 		/* lock available since lwpchan has no upimutex */
732 		upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
733 		upi_chain_add(upibp, (upimutex_t *)upimutex);
734 		upimutex->upi_owner = curthread; /* grab lock */
735 		upimutex->upi_upibp = upibp;
736 		upimutex->upi_vaddr = lp;
737 		upimutex->upi_lwpchan = lwpchan;
738 		mutex_exit(&upibp->upib_lock);
739 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
740 		upilocked = 1;
741 		fuword16_noerr(&lp->mutex_flag, &flag);
742 		if (nupinest > maxnestupimx &&
743 		    secpolicy_resource(CRED()) != 0) {
744 			upimutex_unlock((upimutex_t *)upimutex, flag);
745 			error = ENOMEM;
746 			goto out;
747 		}
748 		if (flag & LOCK_NOTRECOVERABLE) {
749 			/*
750 			 * Since the setting of LOCK_NOTRECOVERABLE
751 			 * was done under the high-level upi mutex,
752 			 * in lwp_upimutex_unlock(), this flag needs to
753 			 * be checked while holding the upi mutex.
754 			 * If set, this thread should return without
755 			 * the lock held, and with the right error code.
756 			 */
757 			upimutex_unlock((upimutex_t *)upimutex, flag);
758 			upilocked = 0;
759 			error = ENOTRECOVERABLE;
760 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
761 			if (flag & LOCK_OWNERDEAD)
762 				error = EOWNERDEAD;
763 			else if (type & USYNC_PROCESS_ROBUST)
764 				error = ELOCKUNMAPPED;
765 			else
766 				error = EOWNERDEAD;
767 		}
768 		goto out;
769 	}
770 	/*
771 	 * If a upimutex object exists, it must have an owner.
772 	 * This is due to lock hand-off, and release of upimutex when no
773 	 * waiters are present at unlock time,
774 	 */
775 	ASSERT(upimutex->upi_owner != NULL);
776 	if (upimutex->upi_owner == curthread) {
777 		/*
778 		 * The user wrapper can check if the mutex type is
779 		 * ERRORCHECK: if not, it should stall at user-level.
780 		 * If so, it should return the error code.
781 		 */
782 		mutex_exit(&upibp->upib_lock);
783 		error = EDEADLK;
784 		goto out;
785 	}
786 	if (try == UPIMUTEX_TRY) {
787 		mutex_exit(&upibp->upib_lock);
788 		error = EBUSY;
789 		goto out;
790 	}
791 	/*
792 	 * Block for the lock.
793 	 * Put the lwp in an orderly state for debugging.
794 	 * Calling prstop() has to be done here, and not in
795 	 * turnstile_block(), since the preceding call to
796 	 * turnstile_lookup() raises the PIL to a level
797 	 * at which calls to prstop() should not be made.
798 	 */
799 	if ((error = lwptp->lwpt_time_error) != 0) {
800 		/*
801 		 * The SUSV3 Posix spec is very clear that we
802 		 * should get no error from validating the
803 		 * timer until we would actually sleep.
804 		 */
805 		mutex_exit(&upibp->upib_lock);
806 		goto out;
807 	}
808 	prstop(PR_REQUESTED, 0);
809 	if (lwptp->lwpt_tsp != NULL) {
810 		/*
811 		 * Unlike the protocol for other lwp timedwait operations,
812 		 * we must drop t_delay_lock before going to sleep in
813 		 * turnstile_block() for a upi mutex.
814 		 * See the comments below and in turnstile.c
815 		 */
816 		mutex_enter(&curthread->t_delay_lock);
817 		(void) lwp_timer_enqueue(lwptp);
818 		mutex_exit(&curthread->t_delay_lock);
819 	}
820 	/*
821 	 * Now, set the waiter bit and block for the lock in turnstile_block().
822 	 * No need to preserve the previous wbit since a lock try is not
823 	 * attempted after setting the wait bit. Wait bit is set under
824 	 * the upib_lock, which is not released until the turnstile lock
825 	 * is acquired. Say, the upimutex is L:
826 	 *
827 	 * 1. upib_lock is held so the waiter does not have to retry L after
828 	 *    setting the wait bit: since the owner has to grab the upib_lock
829 	 *    to unlock L, it will certainly see the wait bit set.
830 	 * 2. upib_lock is not released until the turnstile lock is acquired.
831 	 *    This is the key to preventing a missed wake-up. Otherwise, the
832 	 *    owner could acquire the upib_lock, and the tc_lock, to call
833 	 *    turnstile_wakeup(). All this, before the waiter gets tc_lock
834 	 *    to sleep in turnstile_block(). turnstile_wakeup() will then not
835 	 *    find this waiter, resulting in the missed wakeup.
836 	 * 3. The upib_lock, being a kernel mutex, cannot be released while
837 	 *    holding the tc_lock (since mutex_exit() could need to acquire
838 	 *    the same tc_lock)...and so is held when calling turnstile_block().
839 	 *    The address of upib_lock is passed to turnstile_block() which
840 	 *    releases it after releasing all turnstile locks, and before going
841 	 *    to sleep in swtch().
842 	 * 4. The waiter value cannot be a count of waiters, because a waiter
843 	 *    can be interrupted. The interrupt occurs under the tc_lock, at
844 	 *    which point, the upib_lock cannot be locked, to decrement waiter
845 	 *    count. So, just treat the waiter state as a bit, not a count.
846 	 */
847 	ts = turnstile_lookup((upimutex_t *)upimutex);
848 	upimutex->upi_waiter = 1;
849 	error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
850 	    &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
851 	/*
852 	 * Hand-off implies that we wakeup holding the lock, except when:
853 	 *	- deadlock is detected
854 	 *	- lock is not recoverable
855 	 *	- we got an interrupt or timeout
856 	 * If we wake up due to an interrupt or timeout, we may
857 	 * or may not be holding the lock due to mutex hand-off.
858 	 * Use lwp_upimutex_owned() to check if we do hold the lock.
859 	 */
860 	if (error != 0) {
861 		if ((error == EINTR || error == ETIME) &&
862 		    (upimutex = lwp_upimutex_owned(lp, type))) {
863 			/*
864 			 * Unlock and return - the re-startable syscall will
865 			 * try the lock again if we got EINTR.
866 			 */
867 			(void) upi_mylist_add((upimutex_t *)upimutex);
868 			upimutex_unlock((upimutex_t *)upimutex, 0);
869 		}
870 		/*
871 		 * The only other possible error is EDEADLK.  If so, upimutex
872 		 * is valid, since its owner is deadlocked with curthread.
873 		 */
874 		ASSERT(error == EINTR || error == ETIME ||
875 		    (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
876 		ASSERT(!lwp_upimutex_owned(lp, type));
877 		goto out;
878 	}
879 	if (lwp_upimutex_owned(lp, type)) {
880 		ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
881 		nupinest = upi_mylist_add((upimutex_t *)upimutex);
882 		upilocked = 1;
883 	}
884 	/*
885 	 * Now, need to read the user-level lp->mutex_flag to do the following:
886 	 *
887 	 * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED
888 	 *   should be returned.
889 	 * - if lock isn't held, check if ENOTRECOVERABLE should
890 	 *   be returned.
891 	 *
892 	 * Now, either lp->mutex_flag is readable or it's not. If not
893 	 * readable, the on_fault path will cause a return with EFAULT
894 	 * as it should.  If it is readable, the state of the flag
895 	 * encodes the robustness state of the lock:
896 	 *
897 	 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD
898 	 * or LOCK_UNMAPPED setting will influence the return code
899 	 * appropriately.  If the upimutex is not locked here, this
900 	 * could be due to a spurious wake-up or a NOTRECOVERABLE
901 	 * event.  The flag's setting can be used to distinguish
902 	 * between these two events.
903 	 */
904 	fuword16_noerr(&lp->mutex_flag, &flag);
905 	if (upilocked) {
906 		/*
907 		 * If the thread wakes up from turnstile_block with the lock
908 		 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
909 		 * since it would not have been handed-off the lock.
910 		 * So, no need to check for this case.
911 		 */
912 		if (nupinest > maxnestupimx &&
913 		    secpolicy_resource(CRED()) != 0) {
914 			upimutex_unlock((upimutex_t *)upimutex, flag);
915 			upilocked = 0;
916 			error = ENOMEM;
917 		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
918 			if (flag & LOCK_OWNERDEAD)
919 				error = EOWNERDEAD;
920 			else if (type & USYNC_PROCESS_ROBUST)
921 				error = ELOCKUNMAPPED;
922 			else
923 				error = EOWNERDEAD;
924 		}
925 	} else {
926 		/*
927 		 * Wake-up without the upimutex held. Either this is a
928 		 * spurious wake-up (due to signals, forkall(), whatever), or
929 		 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
930 		 * of the mutex flag can be used to distinguish between the
931 		 * two events.
932 		 */
933 		if (flag & LOCK_NOTRECOVERABLE) {
934 			error = ENOTRECOVERABLE;
935 		} else {
936 			/*
937 			 * Here, the flag could be set to LOCK_OWNERDEAD or
938 			 * not. In both cases, this is a spurious wakeup,
939 			 * since the upi lock is not held, but the thread
940 			 * has returned from turnstile_block().
941 			 *
942 			 * The user flag could be LOCK_OWNERDEAD if, at the
943 			 * same time as curthread having been woken up
944 			 * spuriously, the owner (say Tdead) has died, marked
945 			 * the mutex flag accordingly, and handed off the lock
946 			 * to some other waiter (say Tnew). curthread just
947 			 * happened to read the flag while Tnew has yet to deal
948 			 * with the owner-dead event.
949 			 *
950 			 * In this event, curthread should retry the lock.
951 			 * If Tnew is able to cleanup the lock, curthread
952 			 * will eventually get the lock with a zero error code,
953 			 * If Tnew is unable to cleanup, its eventual call to
954 			 * unlock the lock will result in the mutex flag being
955 			 * set to LOCK_NOTRECOVERABLE, and the wake-up of
956 			 * all waiters, including curthread, which will then
957 			 * eventually return ENOTRECOVERABLE due to the above
958 			 * check.
959 			 *
960 			 * Of course, if the user-flag is not set with
961 			 * LOCK_OWNERDEAD, retrying is the thing to do, since
962 			 * this is definitely a spurious wakeup.
963 			 */
964 			goto retry;
965 		}
966 	}
967 
968 out:
969 	no_fault();
970 	return (error);
971 }
972 
973 
/*
 * Unlock the upimutex that backs the user-level mutex "lp" on behalf of
 * curthread.
 *
 * If the user-level mutex_flag has LOCK_OWNERDEAD or LOCK_UNMAPPED set,
 * those bits are replaced by LOCK_NOTRECOVERABLE in user memory before the
 * upimutex is released, and the updated flag is passed to upimutex_unlock().
 *
 * Returns 0 on success; EPERM if no upimutex exists for lp's lwpchan or
 * curthread is not its owner; EFAULT if get_lwpchan() fails or a user
 * memory access faults.
 */
974 static int
975 lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
976 {
977 	label_t ljb;
978 	int error = 0;
979 	lwpchan_t lwpchan;
980 	uint16_t flag;
981 	upib_t *upibp;
982 	volatile struct upimutex *upimutex = NULL;
983 	volatile int upilocked = 0;	/* examined by the on_fault() handler */
984 
	/*
	 * Fault handler for the *_noerr() user accesses below.  Once
	 * upilocked has been set we are committed to the unlock, so the
	 * upimutex is still released (with a zero flag) before EFAULT is
	 * returned.
	 */
985 	if (on_fault(&ljb)) {
986 		if (upilocked)
987 			upimutex_unlock((upimutex_t *)upimutex, 0);
988 		error = EFAULT;
989 		goto out;
990 	}
991 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
992 	    &lwpchan, LWPCHAN_MPPOOL)) {
993 		error = EFAULT;
994 		goto out;
995 	}
	/* look up the kernel upimutex for this lwpchan under its chain lock */
996 	upibp = &UPI_CHAIN(lwpchan);
997 	mutex_enter(&upibp->upib_lock);
998 	upimutex = upi_get(upibp, &lwpchan);
999 	/*
1000 	 * If the lock is not held, or the owner is not curthread, return
1001 	 * error. The user-level wrapper can return this error or stall,
1002 	 * depending on whether mutex is of ERRORCHECK type or not.
1003 	 */
1004 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
1005 		mutex_exit(&upibp->upib_lock);
1006 		error = EPERM;
1007 		goto out;
1008 	}
1009 	mutex_exit(&upibp->upib_lock); /* release for user memory access */
1010 	upilocked = 1;	/* from here on a fault must still drop the upimutex */
1011 	fuword16_noerr(&lp->mutex_flag, &flag);
1012 	if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1013 		/*
1014 		 * transition mutex to the LOCK_NOTRECOVERABLE state.
1015 		 */
1016 		flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
1017 		flag |= LOCK_NOTRECOVERABLE;
1018 		suword16_noerr(&lp->mutex_flag, flag);
1019 	}
	/* process-shared mutexes also carry the owner's pid; clear it */
1020 	if (type & USYNC_PROCESS)
1021 		suword32_noerr(&lp->mutex_ownerpid, 0);
1022 	upimutex_unlock((upimutex_t *)upimutex, flag);
1023 	upilocked = 0;
1024 out:
1025 	no_fault();
1026 	return (error);
1027 }
1028 
1029 /*
1030  * Clear the contents of a user-level mutex; return the flags.
1031  * Used only by upi_dead() and lwp_mutex_cleanup(), below.
1032  */
1033 static uint16_t
1034 lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg)
1035 {
1036 	uint16_t flag;
1037 
1038 	fuword16_noerr(&lp->mutex_flag, &flag);
1039 	if ((flag &
1040 	    (LOCK_OWNERDEAD | LOCK_UNMAPPED | LOCK_NOTRECOVERABLE)) == 0) {
1041 		flag |= lockflg;
1042 		suword16_noerr(&lp->mutex_flag, flag);
1043 	}
1044 	suword32_noerr((uint32_t *)&lp->mutex_owner, 0);
1045 	suword32_noerr((uint32_t *)&lp->mutex_owner + 1, 0);
1046 	suword32_noerr(&lp->mutex_ownerpid, 0);
1047 	suword8_noerr(&lp->mutex_rcount, 0);
1048 
1049 	return (flag);
1050 }
1051 
1052 /*
1053  * Mark user mutex state, corresponding to kernel upimutex,
1054  * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate
1055  */
1056 static int
1057 upi_dead(upimutex_t *upip, uint16_t lockflg)
1058 {
1059 	label_t ljb;
1060 	int error = 0;
1061 	lwp_mutex_t *lp;
1062 
1063 	if (on_fault(&ljb)) {
1064 		error = EFAULT;
1065 		goto out;
1066 	}
1067 
1068 	lp = upip->upi_vaddr;
1069 	(void) lwp_clear_mutex(lp, lockflg);
1070 	suword8_noerr(&lp->mutex_lockw, 0);
1071 out:
1072 	no_fault();
1073 	return (error);
1074 }
1075 
1076 /*
1077  * Unlock all upimutexes held by curthread, since curthread is dying.
1078  * For each upimutex, attempt to mark its corresponding user mutex object as
1079  * dead.
1080  */
1081 void
1082 upimutex_cleanup()
1083 {
1084 	kthread_t *t = curthread;
1085 	uint16_t lockflg = (ttoproc(t)->p_proc_flag & P_PR_EXEC)?
1086 	    LOCK_UNMAPPED : LOCK_OWNERDEAD;
1087 	struct upimutex *upip;
1088 
1089 	while ((upip = t->t_upimutex) != NULL) {
1090 		if (upi_dead(upip, lockflg) != 0) {
1091 			/*
1092 			 * If the user object associated with this upimutex is
1093 			 * unmapped, unlock upimutex with the
1094 			 * LOCK_NOTRECOVERABLE flag, so that all waiters are
1095 			 * woken up. Since user object is unmapped, it could
1096 			 * not be marked as dead or notrecoverable.
1097 			 * The waiters will now all wake up and return
1098 			 * ENOTRECOVERABLE, since they would find that the lock
1099 			 * has not been handed-off to them.
1100 			 * See lwp_upimutex_lock().
1101 			 */
1102 			upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
1103 		} else {
1104 			/*
1105 			 * The user object has been updated as dead.
1106 			 * Unlock the upimutex: if no waiters, upip kmem will
1107 			 * be freed. If there is a waiter, the lock will be
1108 			 * handed off. If exit() is in progress, each existing
1109 			 * waiter will successively get the lock, as owners
1110 			 * die, and each new owner will call this routine as
1111 			 * it dies. The last owner will free kmem, since
1112 			 * it will find the upimutex has no waiters. So,
1113 			 * eventually, the kmem is guaranteed to be freed.
1114 			 */
1115 			upimutex_unlock(upip, 0);
1116 		}
1117 		/*
1118 		 * Note that the call to upimutex_unlock() above will delete
1119 		 * upimutex from the t_upimutexes chain. And so the
1120 		 * while loop will eventually terminate.
1121 		 */
1122 	}
1123 }
1124 
1125 static int iswanted();
/*
 * Acquire the user-level mutex "lp", sleeping until it becomes available,
 * the wait is interrupted, or the optional relative timeout "tsp" expires.
 *
 * Mutexes for which UPIMUTEX(type) is true are handed to
 * lwp_upimutex_lock().  For all others the caller sets mutex_waiters and
 * sleeps on the mutex's lwpchan until ulock_try() on mutex_lockw succeeds.
 *
 * Returns 0 on success.  Otherwise returns set_errno(error), where error is:
 *   EFAULT           lp is not below a_userlimit, get_lwpchan() failed, or
 *                    a user memory access faulted
 *   EINTR            ISSIG(t, FORREAL), lwp_sysabort or MUSTRETURN() was
 *                    true after waking up
 *   ETIME            the timeout expired (or was immediate)
 *   ENOTRECOVERABLE  a LOCK_ROBUST mutex has LOCK_NOTRECOVERABLE set
 *   EOWNERDEAD or    the lock WAS acquired, but the LOCK_ROBUST mutex has
 *   ELOCKUNMAPPED    LOCK_OWNERDEAD or LOCK_UNMAPPED set
 *   or the error from lwp_timer_copyin(), reported only once the thread
 *   would actually have to sleep.
 * If tsp is non-NULL and was copied in successfully, the final error is
 * whatever lwp_timer_copyout() returns after copying out the residual time.
 */
1126 int
1127 lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp)
1128 {
1129 	kthread_t *t = curthread;
1130 	klwp_t *lwp = ttolwp(t);
1131 	proc_t *p = ttoproc(t);
1132 	lwp_timer_t lwpt;
1133 	caddr_t timedwait;
1134 	int error = 0;
1135 	int time_error;
1136 	clock_t tim = -1;
1137 	uchar_t waiters;
1138 	volatile int locked = 0;	/* lwpchan lock held; read on fault */
1139 	volatile int watched = 0;
1140 	label_t ljb;
1141 	volatile uint8_t type = 0;
1142 	lwpchan_t lwpchan;
1143 	sleepq_head_t *sqh;
1144 	uint16_t flag;
1145 	int imm_timeout = 0;
1146 
1147 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1148 		return (set_errno(EFAULT));
1149 
	/*
	 * timedwait stays non-NULL only while a real (non-immediate) timeout
	 * still has to be armed.  A failure from lwp_timer_copyin() is only
	 * remembered in time_error here; see the top of the loop below.
	 */
1150 	timedwait = (caddr_t)tsp;
1151 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
1152 	    lwpt.lwpt_imm_timeout) {
1153 		imm_timeout = 1;
1154 		timedwait = NULL;
1155 	}
1156 
1157 	/*
1158 	 * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
1159 	 * this micro state is really a run state. If the thread indeed blocks,
1160 	 * this state becomes valid. If not, the state is converted back to
1161 	 * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
1162 	 * when blocking.
1163 	 */
1164 	(void) new_mstate(t, LMS_USER_LOCK);
	/*
	 * Fault handler for the *_noerr() user accesses below: drop the
	 * lwpchan lock if it is held and fail with EFAULT.
	 */
1165 	if (on_fault(&ljb)) {
1166 		if (locked)
1167 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1168 		error = EFAULT;
1169 		goto out;
1170 	}
1171 	/*
1172 	 * Force Copy-on-write if necessary and ensure that the
1173 	 * synchronization object resides in read/write memory.
1174 	 * Cause an EFAULT return now if this is not so.
1175 	 */
1176 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1177 	suword8_noerr(&lp->mutex_type, type);
	/*
	 * upimutexes are handled by lwp_upimutex_lock().  This branch
	 * removes the fault handler first, returns directly and never
	 * reaches the "out" label.
	 */
1178 	if (UPIMUTEX(type)) {
1179 		no_fault();
1180 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
		/*
		 * The lock is held in all three of these cases, so record
		 * our pid for a process-shared mutex.  The result of the
		 * store is deliberately ignored.
		 */
1181 		if ((type & USYNC_PROCESS) &&
1182 		    (error == 0 ||
1183 		    error == EOWNERDEAD || error == ELOCKUNMAPPED))
1184 			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
1185 		if (tsp && !time_error)	/* copyout the residual time left */
1186 			error = lwp_timer_copyout(&lwpt, error);
1187 		if (error)
1188 			return (set_errno(error));
1189 		return (0);
1190 	}
1191 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1192 	    &lwpchan, LWPCHAN_MPPOOL)) {
1193 		error = EFAULT;
1194 		goto out;
1195 	}
1196 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1197 	locked = 1;
1198 	if (type & LOCK_ROBUST) {
1199 		fuword16_noerr(&lp->mutex_flag, &flag);
1200 		if (flag & LOCK_NOTRECOVERABLE) {
1201 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1202 			error = ENOTRECOVERABLE;
1203 			goto out;
1204 		}
1205 	}
	/*
	 * Remember the current waiters value (it is written back after the
	 * loop) and advertise ourselves as a waiter before trying the lock.
	 */
1206 	fuword8_noerr(&lp->mutex_waiters, &waiters);
1207 	suword8_noerr(&lp->mutex_waiters, 1);
1208 
1209 	/*
1210 	 * If watchpoints are set, they need to be restored, since
1211 	 * atomic accesses of memory such as the call to ulock_try()
1212 	 * below cannot be watched.
1213 	 */
1214 
1215 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1216 
	/*
	 * Each iteration is entered with the lwpchan lock held and
	 * watchpoints on *lp disabled.  A failed ulock_try() puts the lwp
	 * to sleep on the lwpchan; after waking up it either breaks out
	 * with an error or re-takes the lock and retries.
	 */
1217 	while (!ulock_try(&lp->mutex_lockw)) {
1218 		if (time_error) {
1219 			/*
1220 			 * The SUSV3 Posix spec is very clear that we
1221 			 * should get no error from validating the
1222 			 * timer until we would actually sleep.
1223 			 */
1224 			error = time_error;
1225 			break;
1226 		}
1227 
1228 		if (watched) {
1229 			watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1230 			watched = 0;
1231 		}
1232 
1233 		/*
1234 		 * Put the lwp in an orderly state for debugging.
1235 		 */
1236 		prstop(PR_REQUESTED, 0);
1237 		if (timedwait) {
1238 			/*
1239 			 * If we successfully queue the timeout,
1240 			 * then don't drop t_delay_lock until
1241 			 * we are on the sleep queue (below).
1242 			 */
1243 			mutex_enter(&t->t_delay_lock);
1244 			if (lwp_timer_enqueue(&lwpt) != 0) {
				/* could not arm the timer: time out at once */
1245 				mutex_exit(&t->t_delay_lock);
1246 				imm_timeout = 1;
1247 				timedwait = NULL;
1248 			}
1249 		}
1250 		lwp_block(&lwpchan);
1251 		/*
1252 		 * Nothing should happen to cause the lwp to go to
1253 		 * sleep again until after it returns from swtch().
1254 		 */
1255 		if (timedwait)
1256 			mutex_exit(&t->t_delay_lock);
1257 		locked = 0;
1258 		lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
		/*
		 * If there is already a reason not to wait (pending signal,
		 * must-return condition, or immediate timeout), setrun()
		 * ourselves before giving up the CPU.
		 */
1259 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
1260 			setrun(t);
1261 		swtch();
1262 		t->t_flag &= ~T_WAKEABLE;
1263 		if (timedwait)
1264 			tim = lwp_timer_dequeue(&lwpt);
1265 		setallwatch();
		/* a dequeue result of -1 is treated as "the timer expired" */
1266 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
1267 			error = EINTR;
1268 		else if (imm_timeout || (timedwait && tim == -1))
1269 			error = ETIME;
1270 		if (error) {
1271 			lwp->lwp_asleep = 0;
1272 			lwp->lwp_sysabort = 0;
1273 			watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1274 			    S_WRITE);
1275 
1276 			/*
1277 			 * Need to re-compute waiters bit. The waiters field in
1278 			 * the lock is not reliable. Either of two things could
1279 			 * have occurred: no lwp may have called lwp_release()
1280 			 * for me but I have woken up due to a signal or
1281 			 * timeout.  In this case, the waiter bit is incorrect
1282 			 * since it is still set to 1, set above.
1283 			 * OR an lwp_release() did occur for some other lwp on
1284 			 * the same lwpchan. In this case, the waiter bit is
1285 			 * correct.  But which event occurred, one can't tell.
1286 			 * So, recompute.
1287 			 */
1288 			lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1289 			locked = 1;
1290 			sqh = lwpsqhash(&lwpchan);
1291 			disp_lock_enter(&sqh->sq_lock);
1292 			waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
1293 			disp_lock_exit(&sqh->sq_lock);
1294 			break;
1295 		}
		/* normal wakeup: re-take the lwpchan lock and try again */
1296 		lwp->lwp_asleep = 0;
1297 		watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1298 		    S_WRITE);
1299 		lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1300 		locked = 1;
1301 		fuword8_noerr(&lp->mutex_waiters, &waiters);
1302 		suword8_noerr(&lp->mutex_waiters, 1);
1303 		if (type & LOCK_ROBUST) {
1304 			fuword16_noerr(&lp->mutex_flag, &flag);
1305 			if (flag & LOCK_NOTRECOVERABLE) {
1306 				error = ENOTRECOVERABLE;
1307 				break;
1308 			}
1309 		}
1310 	}
1311 
1312 	if (t->t_mstate == LMS_USER_LOCK)
1313 		(void) new_mstate(t, LMS_SYSTEM);
1314 
	/*
	 * error == 0 here means ulock_try() succeeded, i.e. the lock is now
	 * held.  Record ownership and report any robustness condition; the
	 * lock stays held even when EOWNERDEAD/ELOCKUNMAPPED is returned.
	 */
1315 	if (error == 0) {
1316 		if (type & USYNC_PROCESS)
1317 			suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
1318 		if (type & LOCK_ROBUST) {
1319 			fuword16_noerr(&lp->mutex_flag, &flag);
1320 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1321 				if (flag & LOCK_OWNERDEAD)
1322 					error = EOWNERDEAD;
1323 				else if (type & USYNC_PROCESS_ROBUST)
1324 					error = ELOCKUNMAPPED;
1325 				else
1326 					error = EOWNERDEAD;
1327 			}
1328 		}
1329 	}
	/* publish the waiters value read or recomputed above */
1330 	suword8_noerr(&lp->mutex_waiters, waiters);
1331 	locked = 0;
1332 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1333 out:
1334 	no_fault();
1335 	if (watched)
1336 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1337 	if (tsp && !time_error)		/* copyout the residual time left */
1338 		error = lwp_timer_copyout(&lwpt, error);
1339 	if (error)
1340 		return (set_errno(error));
1341 	return (0);
1342 }
1343 
1344 /*
1345  * Obsolete lwp_mutex_lock() interface, no longer called from libc.
1346  * libc now calls lwp_mutex_timedlock(lp, NULL).
1347  * This system call trap continues to exist solely for the benefit
1348  * of old statically-linked binaries from Solaris 9 and before.
1349  * It should be removed from the system when we no longer care
1350  * about such applications.
1351  */
1352 int
1353 lwp_mutex_lock(lwp_mutex_t *lp)
1354 {
1355 	return (lwp_mutex_timedlock(lp, NULL));
1356 }
1357 
1358 static int
1359 iswanted(kthread_t *t, lwpchan_t *lwpchan)
1360 {
1361 	/*
1362 	 * The caller holds the dispatcher lock on the sleep queue.
1363 	 */
1364 	while (t != NULL) {
1365 		if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1366 		    t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1367 			return (1);
1368 		t = t->t_link;
1369 	}
1370 	return (0);
1371 }
1372 
1373 /*
1374  * Return the highest priority thread sleeping on this lwpchan.
1375  */
1376 static kthread_t *
1377 lwp_queue_waiter(lwpchan_t *lwpchan)
1378 {
1379 	sleepq_head_t *sqh;
1380 	kthread_t *tp;
1381 
1382 	sqh = lwpsqhash(lwpchan);
1383 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1384 	for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
1385 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1386 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1387 			break;
1388 	}
1389 	disp_lock_exit(&sqh->sq_lock);
1390 	return (tp);
1391 }
1392 
/*
 * Wake up the first thread found on the sleep queue that is sleeping on
 * "lwpchan".
 *
 * sync_type is compared against the sleeper's T_WAITCVSEM flag bit; the
 * callers in this file pass 0 when releasing a mutex waiter and
 * T_WAITCVSEM when releasing a condition variable waiter.
 *
 * Returns 1 if a thread was woken; *waiters then says whether another
 * thread further down the queue is still sleeping on the same lwpchan.
 * Returns 0 with *waiters set to 0 if no thread is sleeping on lwpchan.
 * Returns 0 with *waiters left untouched if the first matching sleeper
 * has a different sync_type.
 */
1393 static int
1394 lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
1395 {
1396 	sleepq_head_t *sqh;
1397 	kthread_t *tp;
1398 	kthread_t **tpp;	/* address of the link that points at tp */
1399 
1400 	sqh = lwpsqhash(lwpchan);
1401 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1402 	tpp = &sqh->sq_queue.sq_first;
1403 	while ((tp = *tpp) != NULL) {
1404 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1405 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1406 			/*
1407 			 * The following is typically false. It could be true
1408 			 * only if lwp_release() is called from
1409 			 * lwp_mutex_wakeup() after reading the waiters field
1410 			 * from memory in which the lwp lock used to be, but has
1411 			 * since been re-used to hold a lwp cv or lwp semaphore.
1412 			 * The thread "tp" found to match the lwp lock's wchan
1413 			 * is actually sleeping for the cv or semaphore which
1414 			 * now has the same wchan. In this case, lwp_release()
1415 			 * should return failure.
1416 			 */
1417 			if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
1418 				ASSERT(sync_type == 0);
1419 				/*
1420 				 * assert that this can happen only for mutexes
1421 				 * i.e. sync_type == 0, for correctly written
1422 				 * user programs.
1423 				 */
1424 				disp_lock_exit(&sqh->sq_lock);
1425 				return (0);
1426 			}
			/*
			 * Compute the remaining-waiters answer from the rest
			 * of the chain before tp is unlinked from it.
			 */
1427 			*waiters = iswanted(tp->t_link, lwpchan);
1428 			sleepq_unlink(tpp, tp);
1429 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1430 			tp->t_wchan0 = NULL;
1431 			tp->t_wchan = NULL;
1432 			tp->t_sobj_ops = NULL;
			/* mark the wakeup as an explicit release; see lwp_cond_wait() */
1433 			tp->t_release = 1;
1434 			THREAD_TRANSITION(tp);	/* drops sleepq lock */
1435 			CL_WAKEUP(tp);
1436 			thread_unlock(tp);	/* drop run queue lock */
1437 			return (1);
1438 		}
1439 		tpp = &tp->t_link;
1440 	}
	/* nobody is sleeping on this lwpchan */
1441 	*waiters = 0;
1442 	disp_lock_exit(&sqh->sq_lock);
1443 	return (0);
1444 }
1445 
/*
 * Wake up every thread on the sleep queue that is sleeping on "lwpchan".
 *
 * Unlike lwp_release(), this routine does not look at T_WAITCVSEM, does
 * not set t_release on the woken threads, and reports nothing back to
 * the caller.
 */
1446 static void
1447 lwp_release_all(lwpchan_t *lwpchan)
1448 {
1449 	sleepq_head_t	*sqh;
1450 	kthread_t *tp;
1451 	kthread_t **tpp;	/* address of the link that points at tp */
1452 
1453 	sqh = lwpsqhash(lwpchan);
1454 	disp_lock_enter(&sqh->sq_lock);		/* lock sleep q queue */
1455 	tpp = &sqh->sq_queue.sq_first;
1456 	while ((tp = *tpp) != NULL) {
1457 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1458 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
			/*
			 * tpp is not advanced after an unlink: *tpp is
			 * re-read at the top of the loop to fetch the next
			 * candidate.
			 */
1459 			sleepq_unlink(tpp, tp);
1460 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1461 			tp->t_wchan0 = NULL;
1462 			tp->t_wchan = NULL;
1463 			tp->t_sobj_ops = NULL;
1464 			CL_WAKEUP(tp);
1465 			thread_unlock_high(tp);	/* release run queue lock */
1466 		} else {
1467 			tpp = &tp->t_link;
1468 		}
1469 	}
1470 	disp_lock_exit(&sqh->sq_lock);		/* drop sleep q lock */
1471 }
1472 
1473 /*
1474  * unblock a lwp that is trying to acquire this mutex. the blocked
1475  * lwp resumes and retries to acquire the lock.
 *
 * If release_all is non-zero, every lwp sleeping on the mutex's lwpchan is
 * woken via lwp_release_all() and mutex_waiters is not written.  Otherwise
 * at most one lwp is woken via lwp_release(), and mutex_waiters is updated
 * only when that call succeeds.
 *
 * Returns 0 on success, or set_errno(EFAULT) if lp is not below
 * a_userlimit, get_lwpchan() fails, or a user memory access faults.
1476  */
1477 int
1478 lwp_mutex_wakeup(lwp_mutex_t *lp, int release_all)
1479 {
1480 	proc_t *p = ttoproc(curthread);
1481 	lwpchan_t lwpchan;
1482 	uchar_t waiters;
1483 	volatile int locked = 0;	/* lwpchan lock held; read on fault */
1484 	volatile int watched = 0;
1485 	volatile uint8_t type = 0;
1486 	label_t ljb;
1487 	int error = 0;
1488 
1489 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1490 		return (set_errno(EFAULT));
1491 
1492 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1493 
	/* fault handler for the *_noerr() user accesses below */
1494 	if (on_fault(&ljb)) {
1495 		if (locked)
1496 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1497 		error = EFAULT;
1498 		goto out;
1499 	}
1500 	/*
1501 	 * Force Copy-on-write if necessary and ensure that the
1502 	 * synchronization object resides in read/write memory.
1503 	 * Cause an EFAULT return now if this is not so.
1504 	 */
1505 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1506 	suword8_noerr(&lp->mutex_type, type);
1507 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1508 	    &lwpchan, LWPCHAN_MPPOOL)) {
1509 		error = EFAULT;
1510 		goto out;
1511 	}
1512 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1513 	locked = 1;
1514 	/*
1515 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
1516 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
1517 	 * may fail.  If it fails, do not write into the waiter bit.
1518 	 * The call to lwp_release() might fail due to one of three reasons:
1519 	 *
1520 	 * 	1. due to the thread which set the waiter bit not actually
1521 	 *	   sleeping since it got the lock on the re-try. The waiter
1522 	 *	   bit will then be correctly updated by that thread. This
1523 	 *	   window may be closed by reading the wait bit again here
1524 	 *	   and not calling lwp_release() at all if it is zero.
1525 	 *	2. the thread which set the waiter bit and went to sleep
1526 	 *	   was woken up by a signal. This time, the waiter recomputes
1527 	 *	   the wait bit in the return with EINTR code.
1528 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
1529 	 *	   memory that has been re-used after the lock was dropped.
1530 	 *	   In this case, writing into the waiter bit would cause data
1531 	 *	   corruption.
1532 	 */
1533 	if (release_all)
1534 		lwp_release_all(&lwpchan);
1535 	else if (lwp_release(&lwpchan, &waiters, 0))
1536 		suword8_noerr(&lp->mutex_waiters, waiters);
1537 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1538 out:
1539 	no_fault();
1540 	if (watched)
1541 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1542 	if (error)
1543 		return (set_errno(error));
1544 	return (0);
1545 }
1546 
1547 /*
1548  * lwp_cond_wait() has four arguments, a pointer to a condition variable,
1549  * a pointer to a mutex, a pointer to a timespec for a timed wait and
1550  * a flag telling the kernel whether or not to honor the kernel/user
1551  * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
1552  * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
1553  * lwpchan, returned by get_lwpchan().  If the timespec pointer is non-NULL,
1554  * it is used as an in/out parameter.  On entry, it contains the relative
1555  * time until timeout.  On exit, we copyout the residual time left to it.
 *
 * The mutex "mp" is released here before sleeping but is NOT reacquired:
 * the caller does that on return to user level.
 *
 * Returns 0 if woken normally.  Otherwise returns set_errno(error) with
 * EINTR (signal, abort, must-return condition or immediate unpark), ETIME
 * (timeout), EFAULT (bad address or faulting user access), the error from
 * lwp_timer_copyin(), or the error from lwp_upimutex_unlock().  Note that
 * the residual time is only copied out when check_park is non-zero.
1556  */
1557 int
1558 lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
1559 {
1560 	kthread_t *t = curthread;
1561 	klwp_t *lwp = ttolwp(t);
1562 	proc_t *p = ttoproc(t);
1563 	lwp_timer_t lwpt;
1564 	lwpchan_t cv_lwpchan;
1565 	lwpchan_t m_lwpchan;
1566 	caddr_t timedwait;
1567 	volatile uint16_t type = 0;
1568 	volatile uint8_t mtype = 0;
1569 	uchar_t waiters;
1570 	volatile int error;
1571 	clock_t tim = -1;
1572 	volatile int locked = 0;	/* cv_lwpchan lock is held */
1573 	volatile int m_locked = 0;	/* m_lwpchan lock is held */
1574 	volatile int cvwatched = 0;
1575 	volatile int mpwatched = 0;
1576 	label_t ljb;
1577 	volatile int no_lwpchan = 1;	/* cleared once both lwpchans exist */
1578 	int imm_timeout = 0;
1579 	int imm_unpark = 0;
1580 
1581 	if ((caddr_t)cv >= p->p_as->a_userlimit ||
1582 	    (caddr_t)mp >= p->p_as->a_userlimit)
1583 		return (set_errno(EFAULT));
1584 
	/*
	 * Unlike lwp_mutex_timedlock(), a bad timespec is reported
	 * immediately.  timedwait stays non-NULL only while a real timeout
	 * still has to be armed.
	 */
1585 	timedwait = (caddr_t)tsp;
1586 	if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
1587 		return (set_errno(error));
1588 	if (lwpt.lwpt_imm_timeout) {
1589 		imm_timeout = 1;
1590 		timedwait = NULL;
1591 	}
1592 
1593 	(void) new_mstate(t, LMS_USER_LOCK);
1594 
	/*
	 * Fault handler for the *_noerr() user accesses made before the
	 * no_fault() call below.  If the lwpchans were never set up we just
	 * fail with EFAULT.  Otherwise any held lwpchan locks are dropped
	 * and control goes to "efault" to release the user-level mutex,
	 * under a second fault handler in case that access faults as well.
	 */
1595 	if (on_fault(&ljb)) {
1596 		if (no_lwpchan) {
1597 			error = EFAULT;
1598 			goto out;
1599 		}
1600 		if (m_locked) {
1601 			m_locked = 0;
1602 			lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1603 		}
1604 		if (locked) {
1605 			locked = 0;
1606 			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1607 		}
1608 		/*
1609 		 * set up another on_fault() for a possible fault
1610 		 * on the user lock accessed at "efault"
1611 		 */
1612 		if (on_fault(&ljb)) {
			/* error already holds EFAULT from the first fault */
1613 			if (m_locked) {
1614 				m_locked = 0;
1615 				lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1616 			}
1617 			goto out;
1618 		}
1619 		error = EFAULT;
1620 		goto efault;
1621 	}
1622 
1623 	/*
1624 	 * Force Copy-on-write if necessary and ensure that the
1625 	 * synchronization object resides in read/write memory.
1626 	 * Cause an EFAULT return now if this is not so.
1627 	 */
1628 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
1629 	suword8_noerr(&mp->mutex_type, mtype);
1630 	if (UPIMUTEX(mtype) == 0) {
1631 		/* convert user level mutex, "mp", to a unique lwpchan */
1632 		/* check if mtype is ok to use below, instead of type from cv */
1633 		if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
1634 		    &m_lwpchan, LWPCHAN_MPPOOL)) {
1635 			error = EFAULT;
1636 			goto out;
1637 		}
1638 	}
1639 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1640 	suword16_noerr(&cv->cond_type, type);
1641 	/* convert user level condition variable, "cv", to a unique lwpchan */
1642 	if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
1643 	    &cv_lwpchan, LWPCHAN_CVPOOL)) {
1644 		error = EFAULT;
1645 		goto out;
1646 	}
1647 	no_lwpchan = 0;
1648 	cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1649 	if (UPIMUTEX(mtype) == 0)
1650 		mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
1651 		    S_WRITE);
1652 
1653 	/*
1654 	 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
1655 	 * with respect to a possible wakeup which is a result of either
1656 	 * an lwp_cond_signal() or an lwp_cond_broadcast().
1657 	 *
1658 	 * What's misleading, is that the lwp is put to sleep after the
1659 	 * condition variable's mutex is released.  This is OK as long as
1660 	 * the release operation is also done while holding lwpchan_lock.
1661 	 * The lwp is then put to sleep when the possibility of pagefaulting
1662 	 * or sleeping is completely eliminated.
1663 	 */
1664 	lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
1665 	locked = 1;
1666 	if (UPIMUTEX(mtype) == 0) {
1667 		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1668 		m_locked = 1;
		/* advertise a kernel-level waiter on the cv before unlocking mp */
1669 		suword8_noerr(&cv->cond_waiters_kernel, 1);
1670 		/*
1671 		 * unlock the condition variable's mutex. (pagefaults are
1672 		 * possible here.)
1673 		 */
1674 		if (mtype & USYNC_PROCESS)
1675 			suword32_noerr(&mp->mutex_ownerpid, 0);
1676 		ulock_clear(&mp->mutex_lockw);
1677 		fuword8_noerr(&mp->mutex_waiters, &waiters);
1678 		if (waiters != 0) {
1679 			/*
1680 			 * Given the locking of lwpchan_lock around the release
1681 			 * of the mutex and checking for waiters, the following
1682 			 * call to lwp_release() can fail ONLY if the lock
1683 			 * acquirer is interrupted after setting the waiter bit,
1684 			 * calling lwp_block() and releasing lwpchan_lock.
1685 			 * In this case, it could get pulled off the lwp sleep
1686 			 * q (via setrun()) before the following call to
1687 			 * lwp_release() occurs. In this case, the lock
1688 			 * requestor will update the waiter bit correctly by
1689 			 * re-evaluating it.
1690 			 */
1691 			if (lwp_release(&m_lwpchan, &waiters, 0))
1692 				suword8_noerr(&mp->mutex_waiters, waiters);
1693 		}
1694 		m_locked = 0;
1695 		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1696 	} else {
1697 		suword8_noerr(&cv->cond_waiters_kernel, 1);
1698 		error = lwp_upimutex_unlock(mp, mtype);
1699 		if (error) {	/* if the upimutex unlock failed */
1700 			locked = 0;
1701 			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
1702 			goto out;
1703 		}
1704 	}
	/* no *_noerr() user accesses are made between here and the returns */
1705 	no_fault();
1706 
1707 	if (mpwatched) {
1708 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1709 		mpwatched = 0;
1710 	}
1711 	if (cvwatched) {
1712 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1713 		cvwatched = 0;
1714 	}
1715 
1716 	/*
1717 	 * Put the lwp in an orderly state for debugging.
1718 	 */
1719 	prstop(PR_REQUESTED, 0);
1720 	if (check_park && (!schedctl_is_park() || t->t_unpark)) {
1721 		/*
1722 		 * We received a signal at user-level before calling here
1723 		 * or another thread wants us to return immediately
1724 		 * with EINTR.  See lwp_unpark().
1725 		 */
1726 		imm_unpark = 1;
1727 		t->t_unpark = 0;
1728 		timedwait = NULL;
1729 	} else if (timedwait) {
1730 		/*
1731 		 * If we successfully queue the timeout,
1732 		 * then don't drop t_delay_lock until
1733 		 * we are on the sleep queue (below).
1734 		 */
1735 		mutex_enter(&t->t_delay_lock);
1736 		if (lwp_timer_enqueue(&lwpt) != 0) {
			/* could not arm the timer: time out at once */
1737 			mutex_exit(&t->t_delay_lock);
1738 			imm_timeout = 1;
1739 			timedwait = NULL;
1740 		}
1741 	}
	/* T_WAITCVSEM is what lwp_release() matches against its sync_type */
1742 	t->t_flag |= T_WAITCVSEM;
1743 	lwp_block(&cv_lwpchan);
1744 	/*
1745 	 * Nothing should happen to cause the lwp to go to sleep
1746 	 * until after it returns from swtch().
1747 	 */
1748 	if (timedwait)
1749 		mutex_exit(&t->t_delay_lock);
1750 	locked = 0;
1751 	lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
	/*
	 * If there is already a reason not to wait, setrun() ourselves
	 * before giving up the CPU.  (imm_timeout | imm_unpark) is a bitwise
	 * OR of two 0/1 flags.
	 */
1752 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
1753 	    (imm_timeout | imm_unpark))
1754 		setrun(t);
1755 	swtch();
1756 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
1757 	if (timedwait)
1758 		tim = lwp_timer_dequeue(&lwpt);
	/* a dequeue result of -1 is treated as "the timer expired" */
1759 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
1760 	    MUSTRETURN(p, t) || imm_unpark)
1761 		error = EINTR;
1762 	else if (imm_timeout || (timedwait && tim == -1))
1763 		error = ETIME;
1764 	lwp->lwp_asleep = 0;
1765 	lwp->lwp_sysabort = 0;
1766 	setallwatch();
1767 
1768 	if (t->t_mstate == LMS_USER_LOCK)
1769 		(void) new_mstate(t, LMS_SYSTEM);
1770 
1771 	if (tsp && check_park)		/* copyout the residual time left */
1772 		error = lwp_timer_copyout(&lwpt, error);
1773 
1774 	/* the mutex is reacquired by the caller on return to user level */
1775 	if (error) {
1776 		/*
1777 		 * If we were concurrently lwp_cond_signal()d and we
1778 		 * received a UNIX signal or got a timeout, then perform
1779 		 * another lwp_cond_signal() to avoid consuming the wakeup.
1780 		 */
1781 		if (t->t_release)
1782 			(void) lwp_cond_signal(cv);
1783 		return (set_errno(error));
1784 	}
1785 	return (0);
1786 
	/* reached only from the first on_fault() handler above */
1787 efault:
1788 	/*
1789 	 * make sure that the user level lock is dropped before
1790 	 * returning to caller, since the caller always re-acquires it.
1791 	 */
1792 	if (UPIMUTEX(mtype) == 0) {
1793 		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
1794 		m_locked = 1;
1795 		if (mtype & USYNC_PROCESS)
1796 			suword32_noerr(&mp->mutex_ownerpid, 0);
1797 		ulock_clear(&mp->mutex_lockw);
1798 		fuword8_noerr(&mp->mutex_waiters, &waiters);
1799 		if (waiters != 0) {
1800 			/*
1801 			 * See comment above on lock clearing and lwp_release()
1802 			 * success/failure.
1803 			 */
1804 			if (lwp_release(&m_lwpchan, &waiters, 0))
1805 				suword8_noerr(&mp->mutex_waiters, waiters);
1806 		}
1807 		m_locked = 0;
1808 		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
1809 	} else {
1810 		(void) lwp_upimutex_unlock(mp, mtype);
1811 	}
	/* common failure exit: error is always non-zero when we get here */
1812 out:
1813 	no_fault();
1814 	if (mpwatched)
1815 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
1816 	if (cvwatched)
1817 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1818 	if (t->t_mstate == LMS_USER_LOCK)
1819 		(void) new_mstate(t, LMS_SYSTEM);
1820 	return (set_errno(error));
1821 }
1822 
1823 /*
1824  * wakeup one lwp that's blocked on this condition variable.
 *
 * Nothing is done unless cond_waiters_kernel is non-zero.  In that case
 * lwp_release() is asked to wake one T_WAITCVSEM sleeper on the cv's
 * lwpchan, and cond_waiters_kernel is rewritten with the waiters value
 * that call left behind, whether or not it succeeded.
 *
 * Returns 0 on success, or set_errno(EFAULT) if cv is not below
 * a_userlimit, get_lwpchan() fails, or a user memory access faults.
1825  */
1826 int
1827 lwp_cond_signal(lwp_cond_t *cv)
1828 {
1829 	proc_t *p = ttoproc(curthread);
1830 	lwpchan_t lwpchan;
1831 	uchar_t waiters;
1832 	volatile uint16_t type = 0;
1833 	volatile int locked = 0;	/* lwpchan lock held; read on fault */
1834 	volatile int watched = 0;
1835 	label_t ljb;
1836 	int error = 0;
1837 
1838 	if ((caddr_t)cv >= p->p_as->a_userlimit)
1839 		return (set_errno(EFAULT));
1840 
1841 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1842 
	/* fault handler for the *_noerr() user accesses below */
1843 	if (on_fault(&ljb)) {
1844 		if (locked)
1845 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1846 		error = EFAULT;
1847 		goto out;
1848 	}
1849 	/*
1850 	 * Force Copy-on-write if necessary and ensure that the
1851 	 * synchronization object resides in read/write memory.
1852 	 * Cause an EFAULT return now if this is not so.
1853 	 */
1854 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1855 	suword16_noerr(&cv->cond_type, type);
1856 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1857 	    &lwpchan, LWPCHAN_CVPOOL)) {
1858 		error = EFAULT;
1859 		goto out;
1860 	}
1861 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1862 	locked = 1;
1863 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1864 	if (waiters != 0) {
1865 		/*
1866 		 * The following call to lwp_release() might fail but it is
1867 		 * OK to write into the waiters bit below, since the memory
1868 		 * could not have been re-used or unmapped (for correctly
1869 		 * written user programs) as in the case of lwp_mutex_wakeup().
1870 		 * For an incorrect program, we should not care about data
1871 		 * corruption since this is just one instance of other places
1872 		 * where corruption can occur for such a program. Of course
1873 		 * if the memory is unmapped, normal fault recovery occurs.
1874 		 */
1875 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1876 		suword8_noerr(&cv->cond_waiters_kernel, waiters);
1877 	}
1878 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1879 out:
1880 	no_fault();
1881 	if (watched)
1882 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1883 	if (error)
1884 		return (set_errno(error));
1885 	return (0);
1886 }
1887 
1888 /*
1889  * wakeup every lwp that's blocked on this condition variable.
 *
 * Nothing is done unless cond_waiters_kernel is non-zero.  In that case
 * every sleeper on the cv's lwpchan is woken via lwp_release_all() and
 * cond_waiters_kernel is reset to 0.
 *
 * Returns 0 on success, or set_errno(EFAULT) if cv is not below
 * a_userlimit, get_lwpchan() fails, or a user memory access faults.
1890  */
1891 int
1892 lwp_cond_broadcast(lwp_cond_t *cv)
1893 {
1894 	proc_t *p = ttoproc(curthread);
1895 	lwpchan_t lwpchan;
1896 	volatile uint16_t type = 0;
1897 	volatile int locked = 0;	/* lwpchan lock held; read on fault */
1898 	volatile int watched = 0;
1899 	label_t ljb;
1900 	uchar_t waiters;
1901 	int error = 0;
1902 
1903 	if ((caddr_t)cv >= p->p_as->a_userlimit)
1904 		return (set_errno(EFAULT));
1905 
1906 	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1907 
	/* fault handler for the *_noerr() user accesses below */
1908 	if (on_fault(&ljb)) {
1909 		if (locked)
1910 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1911 		error = EFAULT;
1912 		goto out;
1913 	}
1914 	/*
1915 	 * Force Copy-on-write if necessary and ensure that the
1916 	 * synchronization object resides in read/write memory.
1917 	 * Cause an EFAULT return now if this is not so.
1918 	 */
1919 	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
1920 	suword16_noerr(&cv->cond_type, type);
1921 	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
1922 	    &lwpchan, LWPCHAN_CVPOOL)) {
1923 		error = EFAULT;
1924 		goto out;
1925 	}
1926 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1927 	locked = 1;
1928 	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
1929 	if (waiters != 0) {
1930 		lwp_release_all(&lwpchan);
1931 		suword8_noerr(&cv->cond_waiters_kernel, 0);
1932 	}
1933 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1934 out:
1935 	no_fault();
1936 	if (watched)
1937 		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
1938 	if (error)
1939 		return (set_errno(error));
1940 	return (0);
1941 }
1942 
/*
 * Try to decrement the count of the user-level semaphore *sp without
 * blocking.
 *
 * Returns 0 if the count was nonzero and has been decremented.  Otherwise
 * the error is reported through set_errno(): EBUSY if the count was zero,
 * EFAULT if sp is at or above the address space's user limit, the
 * on_fault() recovery path is taken, or get_lwpchan() fails.
 */
1943 int
1944 lwp_sema_trywait(lwp_sema_t *sp)
1945 {
1946 	kthread_t *t = curthread;
1947 	proc_t *p = ttoproc(t);
1948 	label_t ljb;
1949 	volatile int locked = 0;
1950 	volatile int watched = 0;
1951 	volatile uint16_t type = 0;
1952 	int count;
1953 	lwpchan_t lwpchan;
1954 	uchar_t waiters;
1955 	int error = 0;
1956 
1957 	if ((caddr_t)sp >= p->p_as->a_userlimit)
1958 		return (set_errno(EFAULT));
1959 
1960 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1961 
	/*
	 * Recovery path, taken when on_fault() returns nonzero: drop the
	 * lwpchan lock if it is held and fail with EFAULT.
	 */
1962 	if (on_fault(&ljb)) {
1963 		if (locked)
1964 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1965 		error = EFAULT;
1966 		goto out;
1967 	}
1968 	/*
1969 	 * Force Copy-on-write if necessary and ensure that the
1970 	 * synchronization object resides in read/write memory.
1971 	 * Cause an EFAULT return now if this is not so.
1972 	 */
1973 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
1974 	suword16_noerr((void *)&sp->sema_type, type);
1975 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
1976 	    &lwpchan, LWPCHAN_CVPOOL)) {
1977 		error = EFAULT;
1978 		goto out;
1979 	}
1980 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1981 	locked = 1;
	/* Take one unit if there is one; otherwise report EBUSY. */
1982 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
1983 	if (count == 0)
1984 		error = EBUSY;
1985 	else
1986 		suword32_noerr((void *)&sp->sema_count, --count);
	/*
	 * If the count is still nonzero after our decrement and the
	 * semaphore records waiters, release one of them and store back
	 * the waiters value computed by lwp_release().
	 */
1987 	if (count != 0) {
1988 		fuword8_noerr(&sp->sema_waiters, &waiters);
1989 		if (waiters != 0) {
1990 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1991 			suword8_noerr(&sp->sema_waiters, waiters);
1992 		}
1993 	}
1994 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1995 out:
1996 	no_fault();
1997 	if (watched)
1998 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1999 	if (error)
2000 		return (set_errno(error));
2001 	return (0);
2002 }
2003 
2004 /*
2005  * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
 *
 * Wait until the count of the user-level semaphore *sp is nonzero, then
 * decrement it.
 *
 * sp:         user-level semaphore.
 * tsp:        optional timeout handed to lwp_timer_copyin(); lwp_sema_wait()
 *             passes NULL.
 * check_park: when nonzero, the schedctl_is_park()/t_unpark check below can
 *             force an immediate EINTR return, and lwp_timer_copyout() is
 *             called on exit when tsp is non-NULL and the timer was valid.
 *
 * Returns 0 on success, otherwise set_errno() of: EFAULT (bad address,
 * on_fault() recovery, get_lwpchan() failure), the lwp_timer_copyin()
 * error (only once we would have to sleep), EINTR, ETIME, or whatever
 * lwp_timer_copyout() returns.
2006  */
2007 int
2008 lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
2009 {
2010 	kthread_t *t = curthread;
2011 	klwp_t *lwp = ttolwp(t);
2012 	proc_t *p = ttoproc(t);
2013 	lwp_timer_t lwpt;
2014 	caddr_t timedwait;
2015 	clock_t tim = -1;
2016 	label_t ljb;
2017 	volatile int locked = 0;
2018 	volatile int watched = 0;
2019 	volatile uint16_t type = 0;
2020 	int count;
2021 	lwpchan_t lwpchan;
2022 	uchar_t waiters;
2023 	int error = 0;
2024 	int time_error;
2025 	int imm_timeout = 0;
2026 	int imm_unpark = 0;
2027 
2028 	if ((caddr_t)sp >= p->p_as->a_userlimit)
2029 		return (set_errno(EFAULT));
2030 
	/*
	 * timedwait is non-NULL only while a timer still has to be queued.
	 * A timer that lwp_timer_copyin() flags as an immediate timeout is
	 * remembered in imm_timeout instead.  A copyin error is kept in
	 * time_error and only reported if we would actually sleep.
	 */
2031 	timedwait = (caddr_t)tsp;
2032 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2033 	    lwpt.lwpt_imm_timeout) {
2034 		imm_timeout = 1;
2035 		timedwait = NULL;
2036 	}
2037 
2038 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2039 
	/*
	 * Recovery path, taken when on_fault() returns nonzero: drop the
	 * lwpchan lock if it is held and fail with EFAULT.
	 */
2040 	if (on_fault(&ljb)) {
2041 		if (locked)
2042 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2043 		error = EFAULT;
2044 		goto out;
2045 	}
2046 	/*
2047 	 * Force Copy-on-write if necessary and ensure that the
2048 	 * synchronization object resides in read/write memory.
2049 	 * Cause an EFAULT return now if this is not so.
2050 	 */
2051 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
2052 	suword16_noerr((void *)&sp->sema_type, type);
2053 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
2054 	    &lwpchan, LWPCHAN_CVPOOL)) {
2055 		error = EFAULT;
2056 		goto out;
2057 	}
2058 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2059 	locked = 1;
2060 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
	/*
	 * Sleep for as long as the count is zero and no error has been
	 * recorded.  Each iteration blocks once and re-reads the count
	 * with the lwpchan lock held again.
	 */
2061 	while (error == 0 && count == 0) {
2062 		if (time_error) {
2063 			/*
2064 			 * The SUSV3 Posix spec is very clear that we
2065 			 * should get no error from validating the
2066 			 * timer until we would actually sleep.
2067 			 */
2068 			error = time_error;
2069 			break;
2070 		}
		/* Advertise a waiter before going to sleep. */
2071 		suword8_noerr(&sp->sema_waiters, 1);
2072 		if (watched)
2073 			watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2074 		/*
2075 		 * Put the lwp in an orderly state for debugging.
2076 		 */
2077 		prstop(PR_REQUESTED, 0);
2078 		if (check_park && (!schedctl_is_park() || t->t_unpark)) {
2079 			/*
2080 			 * We received a signal at user-level before calling
2081 			 * here or another thread wants us to return
2082 			 * immediately with EINTR.  See lwp_unpark().
2083 			 */
2084 			imm_unpark = 1;
2085 			t->t_unpark = 0;
2086 			timedwait = NULL;
2087 		} else if (timedwait) {
2088 			/*
2089 			 * If we successfully queue the timeout,
2090 			 * then don't drop t_delay_lock until
2091 			 * we are on the sleep queue (below).
2092 			 */
2093 			mutex_enter(&t->t_delay_lock);
2094 			if (lwp_timer_enqueue(&lwpt) != 0) {
2095 				mutex_exit(&t->t_delay_lock);
2096 				imm_timeout = 1;
2097 				timedwait = NULL;
2098 			}
2099 		}
2100 		t->t_flag |= T_WAITCVSEM;
2101 		lwp_block(&lwpchan);
2102 		/*
2103 		 * Nothing should happen to cause the lwp to sleep
2104 		 * again until after it returns from swtch().
2105 		 */
2106 		if (timedwait)
2107 			mutex_exit(&t->t_delay_lock);
2108 		locked = 0;
2109 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		/*
		 * If a signal is pending, we must return, or an immediate
		 * timeout/unpark was noted above, make ourselves runnable
		 * again before calling swtch().
		 */
2110 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
2111 		    (imm_timeout | imm_unpark))
2112 			setrun(t);
2113 		swtch();
		/* Back from swtch(): work out why. */
2114 		t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2115 		if (timedwait)
2116 			tim = lwp_timer_dequeue(&lwpt);
2117 		setallwatch();
2118 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
2119 		    MUSTRETURN(p, t) || imm_unpark)
2120 			error = EINTR;
2121 		else if (imm_timeout || (timedwait && tim == -1))
2122 			error = ETIME;
2123 		lwp->lwp_asleep = 0;
2124 		lwp->lwp_sysabort = 0;
2125 		watched = watch_disable_addr((caddr_t)sp,
2126 		    sizeof (*sp), S_WRITE);
2127 		lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2128 		locked = 1;
2129 		fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2130 	}
	/* Take our unit only if we are leaving without an error. */
2131 	if (error == 0)
2132 		suword32_noerr((void *)&sp->sema_count, --count);
	/*
	 * If units remain, release one more waiter.
	 *
	 * NOTE(review): unlike lwp_sema_trywait() and lwp_sema_post(),
	 * sema_waiters is not fetched first, so the value stored below is
	 * whatever lwp_release() leaves in 'waiters'.  Confirm that
	 * lwp_release() always writes it.
	 */
2133 	if (count != 0) {
2134 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2135 		suword8_noerr(&sp->sema_waiters, waiters);
2136 	}
2137 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2138 out:
2139 	no_fault();
2140 	if (watched)
2141 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
	/*
	 * lwp_timer_copyout() is given the current error and its return
	 * value replaces it.
	 */
2142 	if (tsp && check_park && !time_error)
2143 		error = lwp_timer_copyout(&lwpt, error);
2144 	if (error)
2145 		return (set_errno(error));
2146 	return (0);
2147 }
2148 
2149 /*
2150  * Obsolete lwp_sema_wait() interface, no longer called from libc.
2151  * libc now calls lwp_sema_timedwait().
2152  * This system call trap exists solely for the benefit of old
2153  * statically linked applications from Solaris 9 and before.
2154  * It should be removed when we no longer care about such applications.
2155  */
2156 int
2157 lwp_sema_wait(lwp_sema_t *sp)
2158 {
2159 	return (lwp_sema_timedwait(sp, NULL, 0));
2160 }
2161 
/*
 * Increment the count of the user-level semaphore *sp and, when the count
 * becomes 1, release one waiter if sema_waiters says there is one.
 *
 * Returns 0 on success.  Otherwise the error is reported through
 * set_errno(): EOVERFLOW if the count already equals _SEM_VALUE_MAX,
 * EFAULT if sp is at or above the address space's user limit, the
 * on_fault() recovery path is taken, or get_lwpchan() fails.
 */
2162 int
2163 lwp_sema_post(lwp_sema_t *sp)
2164 {
2165 	proc_t *p = ttoproc(curthread);
2166 	label_t ljb;
2167 	volatile int locked = 0;
2168 	volatile int watched = 0;
2169 	volatile uint16_t type = 0;
2170 	int count;
2171 	lwpchan_t lwpchan;
2172 	uchar_t waiters;
2173 	int error = 0;
2174 
2175 	if ((caddr_t)sp >= p->p_as->a_userlimit)
2176 		return (set_errno(EFAULT));
2177 
2178 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2179 
	/*
	 * Recovery path, taken when on_fault() returns nonzero: drop the
	 * lwpchan lock if it is held and fail with EFAULT.
	 */
2180 	if (on_fault(&ljb)) {
2181 		if (locked)
2182 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2183 		error = EFAULT;
2184 		goto out;
2185 	}
2186 	/*
2187 	 * Force Copy-on-write if necessary and ensure that the
2188 	 * synchronization object resides in read/write memory.
2189 	 * Cause an EFAULT return now if this is not so.
2190 	 */
2191 	fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
2192 	suword16_noerr(&sp->sema_type, type);
2193 	if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
2194 	    &lwpchan, LWPCHAN_CVPOOL)) {
2195 		error = EFAULT;
2196 		goto out;
2197 	}
2198 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2199 	locked = 1;
	/* Refuse to go past _SEM_VALUE_MAX; otherwise store count + 1. */
2200 	fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
2201 	if (count == _SEM_VALUE_MAX)
2202 		error = EOVERFLOW;
2203 	else
2204 		suword32_noerr(&sp->sema_count, ++count);
	/*
	 * A waiter is only released when the count is exactly 1; the
	 * waiters value computed by lwp_release() is stored back.
	 */
2205 	if (count == 1) {
2206 		fuword8_noerr(&sp->sema_waiters, &waiters);
2207 		if (waiters) {
2208 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2209 			suword8_noerr(&sp->sema_waiters, waiters);
2210 		}
2211 	}
2212 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2213 out:
2214 	no_fault();
2215 	if (watched)
2216 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2217 	if (error)
2218 		return (set_errno(error));
2219 	return (0);
2220 }
2221 
/*
 * Bits kept in a thread's t_writer field while it waits on a rwlock:
 * TRW_WANT_WRITE is set by lwp_rwlock_lock() for a blocking writer, and
 * TRW_LOCK_GRANTED is set by lwp_rwlock_release() on each thread it wakes.
 */
2222 #define	TRW_WANT_WRITE		0x1
2223 #define	TRW_LOCK_GRANTED	0x2
2224 
/*
 * Values of the rd_wr argument of lwp_rwlock_lock().  TRY_FLAG is or'ed
 * in for the non-blocking variants and masked off again by
 * lwp_rwlock_lock() before the READ_LOCK/WRITE_LOCK sanity check.
 */
2225 #define	READ_LOCK		0
2226 #define	WRITE_LOCK		1
2227 #define	TRY_FLAG		0x10
2228 #define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
2229 #define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)
2230 
2231 /*
2232  * Release one writer or one or more readers. Compute the rwstate word to
2233  * reflect the new state of the queue. For a safe hand-off we copy the new
2234  * rwstate value back to userland before we wake any of the new lock holders.
2235  *
2236  * Note that sleepq_insert() implements a prioritized FIFO (with writers
2237  * being given precedence over readers of the same priority).
2238  *
2239  * If the first thread is a reader we scan the queue releasing all readers
2240  * until we hit a writer or the end of the queue. If the first thread is a
2241  * writer we still need to check for another writer.
 *
 * lwpchan identifies the sleepers to consider (both lc_wchan0 and lc_wchan
 * must match); rw is the user-level rwlock whose rwlock_readers word
 * receives the new state.  The store to userland uses suword32_noerr(),
 * so the caller is expected to have fault handling armed (as
 * lwp_rwlock_unlock() does).
2242  */
2243 void
2244 lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
2245 {
2246 	sleepq_head_t *sqh;
2247 	kthread_t *tp;
2248 	kthread_t **tpp;
2249 	kthread_t *tpnext;
2250 	kthread_t *wakelist = NULL;
2251 	uint32_t rwstate = 0;
2252 	int wcount = 0;
2253 	int rcount = 0;
2254 
2255 	sqh = lwpsqhash(lwpchan);
2256 	disp_lock_enter(&sqh->sq_lock);
	/*
	 * Walk the hashed sleep queue.  tpp always points at the link that
	 * refers to the thread under inspection, so unlinking a thread
	 * leaves tpp ready for the next one.
	 */
2257 	tpp = &sqh->sq_queue.sq_first;
2258 	while ((tp = *tpp) != NULL) {
2259 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
2260 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
2261 			if (tp->t_writer & TRW_WANT_WRITE) {
				/*
				 * Only the first writer, and only if no
				 * reader was seen before it, gets the lock.
				 */
2262 				if ((wcount++ == 0) && (rcount == 0)) {
2263 					rwstate |= URW_WRITE_LOCKED;
2264 
2265 					/* Just one writer to wake. */
2266 					sleepq_unlink(tpp, tp);
2267 					wakelist = tp;
2268 
2269 					/* tpp already set for next thread. */
2270 					continue;
2271 				} else {
2272 					rwstate |= URW_HAS_WAITERS;
2273 					/* We need look no further. */
2274 					break;
2275 				}
2276 			} else {
2277 				rcount++;
				/*
				 * Readers are granted the lock (one count
				 * each in rwstate) as long as no writer has
				 * been seen.
				 */
2278 				if (wcount == 0) {
2279 					rwstate++;
2280 
2281 					/* Add reader to wake list. */
2282 					sleepq_unlink(tpp, tp);
2283 					tp->t_link = wakelist;
2284 					wakelist = tp;
2285 
2286 					/* tpp already set for next thread. */
2287 					continue;
2288 				} else {
2289 					rwstate |= URW_HAS_WAITERS;
2290 					/* We need look no further. */
2291 					break;
2292 				}
2293 			}
2294 		}
2295 		tpp = &tp->t_link;
2296 	}
2297 
2298 	/* Copy the new rwstate back to userland. */
2299 	suword32_noerr(&rw->rwlock_readers, rwstate);
2300 
2301 	/* Wake the new lock holder(s) up. */
2302 	tp = wakelist;
2303 	while (tp != NULL) {
2304 		DTRACE_SCHED1(wakeup, kthread_t *, tp);
2305 		tp->t_wchan0 = NULL;
2306 		tp->t_wchan = NULL;
2307 		tp->t_sobj_ops = NULL;
		/* Tells lwp_rwlock_lock() that the wakeup carries the lock. */
2308 		tp->t_writer |= TRW_LOCK_GRANTED;
		/* Save the successor before the link is cleared. */
2309 		tpnext = tp->t_link;
2310 		tp->t_link = NULL;
2311 		CL_WAKEUP(tp);
2312 		thread_unlock_high(tp);
2313 		tp = tpnext;
2314 	}
2315 
2316 	disp_lock_exit(&sqh->sq_lock);
2317 }
2318 
2319 /*
2320  * We enter here holding the user-level mutex, which we must release before
2321  * returning or blocking. Based on lwp_cond_wait().
 *
 * rw:    user-level rwlock; its embedded mutex (rw->mutex) is the
 *        user-level mutex referred to above.
 * tsp:   optional timeout, read by lwp_timer_copyin().
 * rd_wr: READ_LOCK or WRITE_LOCK, optionally or'ed with TRY_FLAG.
 *
 * Returns 0 when the lock is acquired, otherwise set_errno() of one of
 * the errors assigned below: EFAULT, EINVAL, EBUSY (TRY_FLAG variants),
 * the lwp_timer_copyin() error, EINTR, ETIME, or EAGAIN (we were woken
 * without TRW_LOCK_GRANTED and none of the other conditions applied).
2322  */
2323 static int
2324 lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
2325 {
2326 	lwp_mutex_t *mp = NULL;
2327 	kthread_t *t = curthread;
2328 	kthread_t *tp;
2329 	klwp_t *lwp = ttolwp(t);
2330 	proc_t *p = ttoproc(t);
2331 	lwp_timer_t lwpt;
2332 	lwpchan_t lwpchan;
2333 	lwpchan_t mlwpchan;
2334 	caddr_t timedwait;
2335 	volatile uint16_t type = 0;
2336 	volatile uint8_t mtype = 0;
2337 	uchar_t mwaiters;
2338 	volatile int error = 0;
2339 	int time_error;
2340 	clock_t tim = -1;
2341 	volatile int locked = 0;
2342 	volatile int mlocked = 0;
2343 	volatile int watched = 0;
2344 	volatile int mwatched = 0;
2345 	label_t ljb;
2346 	volatile int no_lwpchan = 1;
2347 	int imm_timeout = 0;
2348 	int try_flag;
2349 	uint32_t rwstate;
2350 	int acquired = 0;
2351 
2352 	/* We only check rw because the mutex is included in it. */
2353 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2354 		return (set_errno(EFAULT));
2355 
2356 	/* We must only report this error if we are about to sleep (later). */
2357 	timedwait = (caddr_t)tsp;
2358 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2359 	    lwpt.lwpt_imm_timeout) {
2360 		imm_timeout = 1;
2361 		timedwait = NULL;
2362 	}
2363 
2364 	(void) new_mstate(t, LMS_USER_LOCK);
2365 
	/*
	 * Recovery path, taken when on_fault() returns nonzero.  Until both
	 * lwpchans have been obtained (no_lwpchan still set) there is
	 * nothing to undo; afterwards any lwpchan locks that are held are
	 * dropped first.
	 */
2366 	if (on_fault(&ljb)) {
2367 		if (no_lwpchan) {
2368 			error = EFAULT;
2369 			goto out_nodrop;
2370 		}
2371 		if (mlocked) {
2372 			mlocked = 0;
2373 			lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2374 		}
2375 		if (locked) {
2376 			locked = 0;
2377 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2378 		}
2379 		/*
2380 		 * Set up another on_fault() for a possible fault
2381 		 * on the user lock accessed at "out_drop".
2382 		 */
2383 		if (on_fault(&ljb)) {
2384 			if (mlocked) {
2385 				mlocked = 0;
2386 				lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2387 			}
2388 			error = EFAULT;
2389 			goto out_nodrop;
2390 		}
		/*
		 * NOTE(review): the handler armed just above is documented
		 * as covering a fault at "out_drop", yet the goto below
		 * targets "out_nodrop", so out_drop is never reached from
		 * here and the user-level mutex is not released on this
		 * path.  Confirm whether "goto out_drop" was intended.
		 */
2391 		error = EFAULT;
2392 		goto out_nodrop;
2393 	}
2394 
2395 	/* Process rd_wr (including sanity check). */
2396 	try_flag = (rd_wr & TRY_FLAG);
2397 	rd_wr &= ~TRY_FLAG;
2398 	if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
2399 		error = EINVAL;
2400 		goto out_nodrop;
2401 	}
2402 
2403 	/*
2404 	 * Force Copy-on-write if necessary and ensure that the
2405 	 * synchronization object resides in read/write memory.
2406 	 * Cause an EFAULT return now if this is not so.
2407 	 */
2408 	mp = &rw->mutex;
2409 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
2410 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2411 	suword8_noerr(&mp->mutex_type, mtype);
2412 	suword16_noerr(&rw->rwlock_type, type);
2413 
2414 	/* We can only continue for simple USYNC_PROCESS locks. */
2415 	if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
2416 		error = EINVAL;
2417 		goto out_nodrop;
2418 	}
2419 
2420 	/* Convert user level mutex, "mp", to a unique lwpchan. */
2421 	if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
2422 	    &mlwpchan, LWPCHAN_MPPOOL)) {
2423 		error = EFAULT;
2424 		goto out_nodrop;
2425 	}
2426 
2427 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2428 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2429 	    &lwpchan, LWPCHAN_CVPOOL)) {
2430 		error = EFAULT;
2431 		goto out_nodrop;
2432 	}
2433 
2434 	no_lwpchan = 0;
2435 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2436 	mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2437 
2438 	/*
2439 	 * lwpchan_lock() ensures that the calling LWP is put to sleep
2440 	 * atomically with respect to a possible wakeup which is a result
2441 	 * of lwp_rwlock_unlock().
2442 	 *
2443 	 * What's misleading is that the LWP is put to sleep after the
2444 	 * rwlock's mutex is released. This is OK as long as the release
2445 	 * operation is also done while holding mlwpchan. The LWP is then
2446 	 * put to sleep when the possibility of pagefaulting or sleeping
2447 	 * has been completely eliminated.
2448 	 */
2449 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2450 	locked = 1;
2451 	lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2452 	mlocked = 1;
2453 
2454 	/*
2455 	 * Fetch the current rwlock state.
2456 	 *
2457 	 * The possibility of spurious wake-ups or killed waiters means
2458 	 * rwstate's URW_HAS_WAITERS bit may indicate false positives.
2459 	 * We only fix these if they are important to us.
2460 	 *
2461 	 * Although various error states can be observed here (e.g. the lock
2462 	 * is not held, but there are waiters) we assume these are application
2463 	 * errors and so we take no corrective action.
2464 	 */
2465 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2466 	/*
2467 	 * We cannot legitimately get here from user-level
2468 	 * without URW_HAS_WAITERS being set.
2469 	 * Set it now to guard against user-level error.
2470 	 */
2471 	rwstate |= URW_HAS_WAITERS;
2472 
2473 	/*
2474 	 * We can try only if the lock isn't held by a writer.
2475 	 */
2476 	if (!(rwstate & URW_WRITE_LOCKED)) {
2477 		tp = lwp_queue_waiter(&lwpchan);
2478 		if (tp == NULL) {
2479 			/*
2480 			 * Hmmm, rwstate indicates waiters but there are
2481 			 * none queued. This could just be the result of a
2482 			 * spurious wakeup, so let's ignore it.
2483 			 *
2484 			 * We now have a chance to acquire the lock
2485 			 * uncontended, but this is the last chance for
2486 			 * a writer to acquire the lock without blocking.
2487 			 */
2488 			if (rd_wr == READ_LOCK) {
2489 				rwstate++;
2490 				acquired = 1;
2491 			} else if ((rwstate & URW_READERS_MASK) == 0) {
2492 				rwstate |= URW_WRITE_LOCKED;
2493 				acquired = 1;
2494 			}
2495 		} else if (rd_wr == READ_LOCK) {
2496 			/*
2497 			 * This is the last chance for a reader to acquire
2498 			 * the lock now, but it can only do so if there is
2499 			 * no writer of equal or greater priority at the
2500 			 * head of the queue.
2501 			 *
2502 			 * It is also just possible that there is a reader
2503 			 * at the head of the queue. This may be the result
2504 			 * of a spurious wakeup or an application failure.
2505 			 * In this case we only acquire the lock if we have
2506 			 * equal or greater priority. It is not our job to
2507 			 * release spurious waiters.
2508 			 */
2509 			pri_t our_pri = DISP_PRIO(t);
2510 			pri_t his_pri = DISP_PRIO(tp);
2511 
2512 			if ((our_pri > his_pri) || ((our_pri == his_pri) &&
2513 			    !(tp->t_writer & TRW_WANT_WRITE))) {
2514 				rwstate++;
2515 				acquired = 1;
2516 			}
2517 		}
2518 	}
2519 
2520 	if (acquired || try_flag || time_error) {
2521 		/*
2522 		 * We're not going to block this time.
2523 		 */
2524 		suword32_noerr(&rw->rwlock_readers, rwstate);
2525 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2526 		locked = 0;
2527 
2528 		if (acquired) {
2529 			/*
2530 			 * Got the lock!
2531 			 */
2532 			error = 0;
2533 
2534 		} else if (try_flag) {
2535 			/*
2536 			 * We didn't get the lock and we're about to block.
2537 			 * If we're doing a trylock, return EBUSY instead.
2538 			 */
2539 			error = EBUSY;
2540 
2541 		} else if (time_error) {
2542 			/*
2543 			 * The SUSV3 POSIX spec is very clear that we should
2544 			 * get no error from validating the timer (above)
2545 			 * until we would actually sleep.
2546 			 */
2547 			error = time_error;
2548 		}
2549 
		/* mlwpchan is still locked here; out_drop releases it. */
2550 		goto out_drop;
2551 	}
2552 
2553 	/*
2554 	 * We're about to block, so indicate what kind of waiter we are.
2555 	 */
2556 	t->t_writer = 0;
2557 	if (rd_wr == WRITE_LOCK)
2558 		t->t_writer = TRW_WANT_WRITE;
2559 	suword32_noerr(&rw->rwlock_readers, rwstate);
2560 
2561 	/*
2562 	 * Unlock the rwlock's mutex (pagefaults are possible here).
2563 	 */
2564 	suword32_noerr((uint32_t *)&mp->mutex_owner, 0);
2565 	suword32_noerr((uint32_t *)&mp->mutex_owner + 1, 0);
2566 	suword32_noerr(&mp->mutex_ownerpid, 0);
2567 	ulock_clear(&mp->mutex_lockw);
2568 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2569 	if (mwaiters != 0) {
2570 		/*
2571 		 * Given the locking of mlwpchan around the release of
2572 		 * the mutex and checking for waiters, the following
2573 		 * call to lwp_release() can fail ONLY if the lock
2574 		 * acquirer is interrupted after setting the waiter bit,
2575 		 * calling lwp_block() and releasing mlwpchan.
2576 		 * In this case, it could get pulled off the LWP sleep
2577 		 * queue (via setrun()) before the following call to
2578 		 * lwp_release() occurs, and the lock requestor will
2579 		 * update the waiter bit correctly by re-evaluating it.
2580 		 */
2581 		if (lwp_release(&mlwpchan, &mwaiters, 0))
2582 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2583 	}
2584 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2585 	mlocked = 0;
	/* No further *_noerr accesses follow on the blocking path. */
2586 	no_fault();
2587 
2588 	if (mwatched) {
2589 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2590 		mwatched = 0;
2591 	}
2592 	if (watched) {
2593 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2594 		watched = 0;
2595 	}
2596 
2597 	/*
2598 	 * Put the LWP in an orderly state for debugging.
2599 	 */
2600 	prstop(PR_REQUESTED, 0);
2601 	if (timedwait) {
2602 		/*
2603 		 * If we successfully queue the timeout,
2604 		 * then don't drop t_delay_lock until
2605 		 * we are on the sleep queue (below).
2606 		 */
2607 		mutex_enter(&t->t_delay_lock);
2608 		if (lwp_timer_enqueue(&lwpt) != 0) {
2609 			mutex_exit(&t->t_delay_lock);
2610 			imm_timeout = 1;
2611 			timedwait = NULL;
2612 		}
2613 	}
2614 	t->t_flag |= T_WAITCVSEM;
2615 	lwp_block(&lwpchan);
2616 
2617 	/*
2618 	 * Nothing should happen to cause the LWP to go to sleep until after
2619 	 * it returns from swtch().
2620 	 */
2621 	if (timedwait)
2622 		mutex_exit(&t->t_delay_lock);
2623 	locked = 0;
2624 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2625 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
2626 		setrun(t);
2627 	swtch();
2628 
2629 	/*
2630 	 * We're back, but we need to work out why. Were we interrupted? Did
2631 	 * we timeout? Were we granted the lock?
2632 	 */
2633 	error = EAGAIN;
2634 	acquired = (t->t_writer & TRW_LOCK_GRANTED);
2635 	t->t_writer = 0;
2636 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2637 	if (timedwait)
2638 		tim = lwp_timer_dequeue(&lwpt);
2639 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
2640 		error = EINTR;
2641 	else if (imm_timeout || (timedwait && tim == -1))
2642 		error = ETIME;
2643 	lwp->lwp_asleep = 0;
2644 	lwp->lwp_sysabort = 0;
2645 	setallwatch();
2646 
2647 	/*
2648 	 * If we were granted the lock we don't care about EINTR or ETIME.
2649 	 */
2650 	if (acquired)
2651 		error = 0;
2652 
2653 	if (t->t_mstate == LMS_USER_LOCK)
2654 		(void) new_mstate(t, LMS_SYSTEM);
2655 
2656 	if (error)
2657 		return (set_errno(error));
2658 	return (0);
2659 
2660 out_drop:
2661 	/*
2662 	 * Make sure that the user level lock is dropped before returning
2663 	 * to the caller.
2664 	 */
2665 	if (!mlocked) {
2666 		lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2667 		mlocked = 1;
2668 	}
2669 	suword32_noerr((uint32_t *)&mp->mutex_owner, 0);
2670 	suword32_noerr((uint32_t *)&mp->mutex_owner + 1, 0);
2671 	suword32_noerr(&mp->mutex_ownerpid, 0);
2672 	ulock_clear(&mp->mutex_lockw);
2673 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2674 	if (mwaiters != 0) {
2675 		/*
2676 		 * See comment above on lock clearing and lwp_release()
2677 		 * success/failure.
2678 		 */
2679 		if (lwp_release(&mlwpchan, &mwaiters, 0))
2680 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2681 	}
2682 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2683 	mlocked = 0;
2684 
2685 out_nodrop:
	/*
	 * Exit without touching the user-level mutex: disarm on_fault(),
	 * undo any watch_disable_addr() calls that returned nonzero, and
	 * leave the LMS_USER_LOCK microstate if we are still in it.
	 */
2686 	no_fault();
2687 	if (mwatched)
2688 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2689 	if (watched)
2690 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2691 	if (t->t_mstate == LMS_USER_LOCK)
2692 		(void) new_mstate(t, LMS_SYSTEM);
2693 	if (error)
2694 		return (set_errno(error));
2695 	return (0);
2696 }
2697 
2698 /*
2699  * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
2700  * we never drop the lock.
 *
 * rw is the user-level rwlock being released.  Returns 0 on success,
 * otherwise set_errno() of EFAULT (bad address, on_fault() recovery,
 * get_lwpchan() failure) or EINVAL (rwlock_type is not USYNC_PROCESS).
2701  */
2702 static int
2703 lwp_rwlock_unlock(lwp_rwlock_t *rw)
2704 {
2705 	kthread_t *t = curthread;
2706 	proc_t *p = ttoproc(t);
2707 	lwpchan_t lwpchan;
2708 	volatile uint16_t type = 0;
2709 	volatile int error = 0;
2710 	volatile int locked = 0;
2711 	volatile int watched = 0;
2712 	label_t ljb;
2713 	volatile int no_lwpchan = 1;
2714 	uint32_t rwstate;
2715 
2716 	/* We only check rw because the mutex is included in it. */
2717 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2718 		return (set_errno(EFAULT));
2719 
	/*
	 * Recovery path, taken when on_fault() returns nonzero: drop the
	 * lwpchan lock if it is held, then fail with EFAULT.
	 */
2720 	if (on_fault(&ljb)) {
2721 		if (no_lwpchan) {
2722 			error = EFAULT;
2723 			goto out_nodrop;
2724 		}
2725 		if (locked) {
2726 			locked = 0;
2727 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2728 		}
2729 		error = EFAULT;
2730 		goto out_nodrop;
2731 	}
2732 
2733 	/*
2734 	 * Force Copy-on-write if necessary and ensure that the
2735 	 * synchronization object resides in read/write memory.
2736 	 * Cause an EFAULT return now if this is not so.
2737 	 */
2738 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2739 	suword16_noerr(&rw->rwlock_type, type);
2740 
2741 	/* We can only continue for simple USYNC_PROCESS locks. */
2742 	if (type != USYNC_PROCESS) {
2743 		error = EINVAL;
2744 		goto out_nodrop;
2745 	}
2746 
2747 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2748 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2749 	    &lwpchan, LWPCHAN_CVPOOL)) {
2750 		error = EFAULT;
2751 		goto out_nodrop;
2752 	}
2753 
2754 	no_lwpchan = 0;
2755 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2756 
2757 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2758 	locked = 1;
2759 
2760 	/*
2761 	 * We can resolve multiple readers (except the last reader) here.
2762 	 * For the last reader or a writer we need lwp_rwlock_release(),
2763 	 * to which we also delegate the task of copying the new rwstate
2764 	 * back to userland (see the comment there).
	 *
	 * If the state shows neither a writer nor any readers, nothing is
	 * changed and 0 is still returned.
2765 	 */
2766 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2767 	if (rwstate & URW_WRITE_LOCKED)
2768 		lwp_rwlock_release(&lwpchan, rw);
2769 	else if ((rwstate & URW_READERS_MASK) > 0) {
2770 		rwstate--;
2771 		if ((rwstate & URW_READERS_MASK) == 0)
2772 			lwp_rwlock_release(&lwpchan, rw);
2773 		else
2774 			suword32_noerr(&rw->rwlock_readers, rwstate);
2775 	}
2776 
2777 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2778 	locked = 0;
2779 	error = 0;
2780 
2781 out_nodrop:
2782 	no_fault();
2783 	if (watched)
2784 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2785 	if (error)
2786 		return (set_errno(error));
2787 	return (0);
2788 }
2789 
2790 int
2791 lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
2792 {
2793 	switch (subcode) {
2794 	case 0:
2795 		return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
2796 	case 1:
2797 		return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
2798 	case 2:
2799 		return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
2800 	case 3:
2801 		return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
2802 	case 4:
2803 		return (lwp_rwlock_unlock(rwlp));
2804 	}
2805 	return (set_errno(EINVAL));
2806 }
2807 
2808 /*
2809  * Return the owner of the user-level s-object.
2810  * Since we can't really do this, return NULL.
2811  */
2812 /* ARGSUSED */
2813 static kthread_t *
2814 lwpsobj_owner(caddr_t sobj)
2815 {
2816 	return ((kthread_t *)NULL);
2817 }
2818 
2819 /*
2820  * Wake up a thread asleep on a user-level synchronization
2821  * object.
2822  */
2823 static void
2824 lwp_unsleep(kthread_t *t)
2825 {
2826 	ASSERT(THREAD_LOCK_HELD(t));
2827 	if (t->t_wchan0 != NULL) {
2828 		sleepq_head_t *sqh;
2829 		sleepq_t *sqp = t->t_sleepq;
2830 
2831 		if (sqp != NULL) {
2832 			sqh = lwpsqhash(&t->t_lwpchan);
2833 			ASSERT(&sqh->sq_queue == sqp);
2834 			sleepq_unsleep(t);
2835 			disp_lock_exit_high(&sqh->sq_lock);
2836 			CL_SETRUN(t);
2837 			return;
2838 		}
2839 	}
2840 	panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
2841 }
2842 
2843 /*
2844  * Change the priority of a thread asleep on a user-level
2845  * synchronization object. To maintain proper priority order,
2846  * we:
2847  *	o dequeue the thread.
2848  *	o change its priority.
2849  *	o re-enqueue the thread.
2850  * Assumption: the thread is locked on entry.
2851  */
2852 static void
2853 lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
2854 {
2855 	ASSERT(THREAD_LOCK_HELD(t));
2856 	if (t->t_wchan0 != NULL) {
2857 		sleepq_t   *sqp = t->t_sleepq;
2858 
2859 		sleepq_dequeue(t);
2860 		*t_prip = pri;
2861 		sleepq_insert(sqp, t);
2862 	} else
2863 		panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
2864 }
2865 
2866 /*
2867  * Clean up a left-over process-shared robust mutex
 *
 * ent is the lwpchan cache entry describing the mutex (its user address
 * is ent->lwpchan_addr); lockflg is handed on to lwp_clear_mutex().
 * Entries whose type does not have both USYNC_PROCESS and LOCK_ROBUST set
 * are ignored.  Nothing is returned: if the on_fault() recovery path is
 * taken, any lock held is released and the function just returns.
2868  */
2869 static void
2870 lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
2871 {
2872 	uint16_t flag;
2873 	uchar_t waiters;
2874 	label_t ljb;
2875 	pid_t owner_pid;
2876 	lwp_mutex_t *lp;
2877 	volatile int locked = 0;
2878 	volatile int watched = 0;
2879 	volatile struct upimutex *upimutex = NULL;
2880 	volatile int upilocked = 0;
2881 
2882 	if ((ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
2883 	    != (USYNC_PROCESS | LOCK_ROBUST))
2884 		return;
2885 
2886 	lp = (lwp_mutex_t *)ent->lwpchan_addr;
2887 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	/*
	 * Recovery path, taken when on_fault() returns nonzero: release
	 * whichever of the lwpchan lock or the upimutex we hold.
	 */
2888 	if (on_fault(&ljb)) {
2889 		if (locked)
2890 			lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2891 		if (upilocked)
2892 			upimutex_unlock((upimutex_t *)upimutex, 0);
2893 		goto out;
2894 	}
2895 
2896 	fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
2897 
2898 	if (UPIMUTEX(ent->lwpchan_type)) {
2899 		lwpchan_t lwpchan = ent->lwpchan_lwpchan;
2900 		upib_t *upibp = &UPI_CHAIN(lwpchan);
2901 
		/*
		 * Only act if this process is the recorded owner and the
		 * kernel upimutex found for the lwpchan is owned by the
		 * current thread.
		 */
2902 		if (owner_pid != curproc->p_pid)
2903 			goto out;
2904 		mutex_enter(&upibp->upib_lock);
2905 		upimutex = upi_get(upibp, &lwpchan);
2906 		if (upimutex == NULL || upimutex->upi_owner != curthread) {
2907 			mutex_exit(&upibp->upib_lock);
2908 			goto out;
2909 		}
2910 		mutex_exit(&upibp->upib_lock);
2911 		upilocked = 1;
2912 		flag = lwp_clear_mutex(lp, lockflg);
2913 		suword8_noerr(&lp->mutex_lockw, 0);
2914 		upimutex_unlock((upimutex_t *)upimutex, flag);
2915 	} else {
2916 		lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2917 		locked = 1;
2918 		/*
2919 		 * Clear the spinners count because one of our
2920 		 * threads could have been spinning for this lock
2921 		 * at user level when the process was suddenly killed.
2922 		 * There is no harm in this since user-level libc code
2923 		 * will adapt to the sudden change in the spinner count.
2924 		 */
2925 		suword8_noerr(&lp->mutex_spinners, 0);
2926 		if (owner_pid != curproc->p_pid) {
2927 			/*
2928 			 * We are not the owner.  There may or may not be one.
2929 			 * If there are waiters, we wake up one or all of them.
2930 			 * It doesn't hurt to wake them up in error since
2931 			 * they will just retry the lock and go to sleep
2932 			 * again if necessary.
2933 			 */
2934 			fuword8_noerr(&lp->mutex_waiters, &waiters);
2935 			if (waiters != 0) {	/* there are waiters */
2936 				fuword16_noerr(&lp->mutex_flag, &flag);
				/*
				 * LOCK_NOTRECOVERABLE: wake everyone;
				 * otherwise wake a single waiter.
				 */
2937 				if (flag & LOCK_NOTRECOVERABLE) {
2938 					lwp_release_all(&ent->lwpchan_lwpchan);
2939 					suword8_noerr(&lp->mutex_waiters, 0);
2940 				} else if (lwp_release(&ent->lwpchan_lwpchan,
2941 				    &waiters, 0)) {
2942 					suword8_noerr(&lp->mutex_waiters,
2943 					    waiters);
2944 				}
2945 			}
2946 		} else {
2947 			/*
2948 			 * We are the owner.  Release it.
2949 			 */
2950 			(void) lwp_clear_mutex(lp, lockflg);
2951 			ulock_clear(&lp->mutex_lockw);
2952 			fuword8_noerr(&lp->mutex_waiters, &waiters);
2953 			if (waiters &&
2954 			    lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
2955 				suword8_noerr(&lp->mutex_waiters, waiters);
2956 		}
2957 		lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2958 	}
2959 out:
2960 	no_fault();
2961 	if (watched)
2962 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2963 }
2964 
2965 /*
2966  * Register a process-shared robust mutex in the lwpchan cache.
2967  */
2968 int
2969 lwp_mutex_register(lwp_mutex_t *lp, caddr_t uaddr)
2970 {
2971 	int error = 0;
2972 	volatile int watched;
2973 	label_t ljb;
2974 	uint8_t type;
2975 	lwpchan_t lwpchan;
2976 
2977 	if ((caddr_t)lp >= (caddr_t)USERLIMIT)
2978 		return (set_errno(EFAULT));
2979 
2980 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2981 
2982 	if (on_fault(&ljb)) {
2983 		error = EFAULT;
2984 	} else {
2985 		/*
2986 		 * Force Copy-on-write if necessary and ensure that the
2987 		 * synchronization object resides in read/write memory.
2988 		 * Cause an EFAULT return now if this is not so.
2989 		 */
2990 		fuword8_noerr(&lp->mutex_type, &type);
2991 		suword8_noerr(&lp->mutex_type, type);
2992 		if ((type & (USYNC_PROCESS|LOCK_ROBUST))
2993 		    != (USYNC_PROCESS|LOCK_ROBUST)) {
2994 			error = EINVAL;
2995 		} else if (!lwpchan_get_mapping(curproc->p_as, (caddr_t)lp,
2996 		    uaddr, type, &lwpchan, LWPCHAN_MPPOOL)) {
2997 			error = EFAULT;
2998 		}
2999 	}
3000 	no_fault();
3001 	if (watched)
3002 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3003 	if (error)
3004 		return (set_errno(error));
3005 	return (0);
3006 }
3007 
3008 /*
3009  * There is a user-level robust lock registration in libc.
3010  * Mark it as invalid by storing -1 into the location of the pointer.
3011  */
3012 static void
3013 lwp_mutex_unregister(void *uaddr)
3014 {
3015 	if (get_udatamodel() == DATAMODEL_NATIVE) {
3016 		(void) sulword(uaddr, (ulong_t)-1);
3017 #ifdef _SYSCALL32_IMPL
3018 	} else {
3019 		(void) suword32(uaddr, (uint32_t)-1);
3020 #endif
3021 	}
3022 }
3023 
3024 int
3025 lwp_mutex_trylock(lwp_mutex_t *lp)
3026 {
3027 	kthread_t *t = curthread;
3028 	proc_t *p = ttoproc(t);
3029 	int error = 0;
3030 	volatile int locked = 0;
3031 	volatile int watched = 0;
3032 	label_t ljb;
3033 	volatile uint8_t type = 0;
3034 	uint16_t flag;
3035 	lwpchan_t lwpchan;
3036 
3037 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3038 		return (set_errno(EFAULT));
3039 
3040 	(void) new_mstate(t, LMS_USER_LOCK);
3041 
3042 	if (on_fault(&ljb)) {
3043 		if (locked)
3044 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3045 		error = EFAULT;
3046 		goto out;
3047 	}
3048 	/*
3049 	 * Force Copy-on-write if necessary and ensure that the
3050 	 * synchronization object resides in read/write memory.
3051 	 * Cause an EFAULT return now if this is not so.
3052 	 */
3053 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3054 	suword8_noerr(&lp->mutex_type, type);
3055 	if (UPIMUTEX(type)) {
3056 		no_fault();
3057 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
3058 		if ((type & USYNC_PROCESS) &&
3059 		    (error == 0 ||
3060 		    error == EOWNERDEAD || error == ELOCKUNMAPPED))
3061 			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
3062 		if (error)
3063 			return (set_errno(error));
3064 		return (0);
3065 	}
3066 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3067 	    &lwpchan, LWPCHAN_MPPOOL)) {
3068 		error = EFAULT;
3069 		goto out;
3070 	}
3071 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3072 	locked = 1;
3073 	if (type & LOCK_ROBUST) {
3074 		fuword16_noerr(&lp->mutex_flag, &flag);
3075 		if (flag & LOCK_NOTRECOVERABLE) {
3076 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3077 			error =  ENOTRECOVERABLE;
3078 			goto out;
3079 		}
3080 	}
3081 
3082 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3083 
3084 	if (!ulock_try(&lp->mutex_lockw))
3085 		error = EBUSY;
3086 	else {
3087 		if (type & USYNC_PROCESS)
3088 			suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
3089 		if (type & LOCK_ROBUST) {
3090 			fuword16_noerr(&lp->mutex_flag, &flag);
3091 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3092 				if (flag & LOCK_OWNERDEAD)
3093 					error = EOWNERDEAD;
3094 				else if (type & USYNC_PROCESS_ROBUST)
3095 					error = ELOCKUNMAPPED;
3096 				else
3097 					error = EOWNERDEAD;
3098 			}
3099 		}
3100 	}
3101 	locked = 0;
3102 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3103 out:
3104 
3105 	if (t->t_mstate == LMS_USER_LOCK)
3106 		(void) new_mstate(t, LMS_SYSTEM);
3107 
3108 	no_fault();
3109 	if (watched)
3110 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3111 	if (error)
3112 		return (set_errno(error));
3113 	return (0);
3114 }
3115 
3116 /*
3117  * unlock the mutex and unblock lwps that is trying to acquire this mutex.
3118  * the blocked lwp resumes and retries to acquire the lock.
3119  */
3120 int
3121 lwp_mutex_unlock(lwp_mutex_t *lp)
3122 {
3123 	proc_t *p = ttoproc(curthread);
3124 	lwpchan_t lwpchan;
3125 	uchar_t waiters;
3126 	volatile int locked = 0;
3127 	volatile int watched = 0;
3128 	volatile uint8_t type = 0;
3129 	label_t ljb;
3130 	uint16_t flag;
3131 	int error = 0;
3132 
3133 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3134 		return (set_errno(EFAULT));
3135 
3136 	if (on_fault(&ljb)) {
3137 		if (locked)
3138 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3139 		error = EFAULT;
3140 		goto out;
3141 	}
3142 
3143 	/*
3144 	 * Force Copy-on-write if necessary and ensure that the
3145 	 * synchronization object resides in read/write memory.
3146 	 * Cause an EFAULT return now if this is not so.
3147 	 */
3148 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3149 	suword8_noerr(&lp->mutex_type, type);
3150 
3151 	if (UPIMUTEX(type)) {
3152 		no_fault();
3153 		error = lwp_upimutex_unlock(lp, type);
3154 		if (error)
3155 			return (set_errno(error));
3156 		return (0);
3157 	}
3158 
3159 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3160 
3161 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3162 	    &lwpchan, LWPCHAN_MPPOOL)) {
3163 		error = EFAULT;
3164 		goto out;
3165 	}
3166 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3167 	locked = 1;
3168 	if (type & LOCK_ROBUST) {
3169 		fuword16_noerr(&lp->mutex_flag, &flag);
3170 		if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3171 			flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
3172 			flag |= LOCK_NOTRECOVERABLE;
3173 			suword16_noerr(&lp->mutex_flag, flag);
3174 		}
3175 	}
3176 	if (type & USYNC_PROCESS)
3177 		suword32_noerr(&lp->mutex_ownerpid, 0);
3178 	ulock_clear(&lp->mutex_lockw);
3179 	/*
3180 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
3181 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
3182 	 * may fail.  If it fails, do not write into the waiter bit.
3183 	 * The call to lwp_release() might fail due to one of three reasons:
3184 	 *
3185 	 * 	1. due to the thread which set the waiter bit not actually
3186 	 *	   sleeping since it got the lock on the re-try. The waiter
3187 	 *	   bit will then be correctly updated by that thread. This
3188 	 *	   window may be closed by reading the wait bit again here
3189 	 *	   and not calling lwp_release() at all if it is zero.
3190 	 *	2. the thread which set the waiter bit and went to sleep
3191 	 *	   was woken up by a signal. This time, the waiter recomputes
3192 	 *	   the wait bit in the return with EINTR code.
3193 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
3194 	 *	   memory that has been re-used after the lock was dropped.
3195 	 *	   In this case, writing into the waiter bit would cause data
3196 	 *	   corruption.
3197 	 */
3198 	fuword8_noerr(&lp->mutex_waiters, &waiters);
3199 	if (waiters) {
3200 		if ((type & LOCK_ROBUST) &&
3201 		    (flag & LOCK_NOTRECOVERABLE)) {
3202 			lwp_release_all(&lwpchan);
3203 			suword8_noerr(&lp->mutex_waiters, 0);
3204 		} else if (lwp_release(&lwpchan, &waiters, 0)) {
3205 			suword8_noerr(&lp->mutex_waiters, waiters);
3206 		}
3207 	}
3208 
3209 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3210 out:
3211 	no_fault();
3212 	if (watched)
3213 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3214 	if (error)
3215 		return (set_errno(error));
3216 	return (0);
3217 }
3218