xref: /titanic_50/usr/src/uts/common/syscall/lwp_sobj.c (revision 3fbe3e2827948b5ff8ffec94d18c232af100ea3c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
28 /*	  All Rights Reserved	*/
29 
30 #include <sys/param.h>
31 #include <sys/types.h>
32 #include <sys/sysmacros.h>
33 #include <sys/systm.h>
34 #include <sys/cred.h>
35 #include <sys/user.h>
36 #include <sys/errno.h>
37 #include <sys/file.h>
38 #include <sys/proc.h>
39 #include <sys/prsystm.h>
40 #include <sys/kmem.h>
41 #include <sys/sobject.h>
42 #include <sys/fault.h>
43 #include <sys/procfs.h>
44 #include <sys/watchpoint.h>
45 #include <sys/time.h>
46 #include <sys/cmn_err.h>
47 #include <sys/machlock.h>
48 #include <sys/debug.h>
49 #include <sys/synch.h>
50 #include <sys/synch32.h>
51 #include <sys/mman.h>
52 #include <sys/class.h>
53 #include <sys/schedctl.h>
54 #include <sys/sleepq.h>
55 #include <sys/policy.h>
56 #include <sys/tnf_probe.h>
57 #include <sys/lwpchan_impl.h>
58 #include <sys/turnstile.h>
59 #include <sys/atomic.h>
60 #include <sys/lwp_timer_impl.h>
61 #include <sys/lwp_upimutex_impl.h>
62 #include <vm/as.h>
63 #include <sys/sdt.h>
64 
/*
 * Forward declarations.  lwpsobj_owner(), lwp_unsleep() and
 * lwp_change_pri() are the callbacks installed in lwp_sobj_ops;
 * lwp_mutex_cleanup() is used when lwpchan cache entries are discarded.
 */
static kthread_t *lwpsobj_owner(caddr_t);
static void lwp_unsleep(kthread_t *t);
static void lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip);
static void lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg);

extern int lwp_cond_signal(lwp_cond_t *cv);
/*
 * Maximum number of user prio inheritance locks that can be held by a thread.
 * Used to limit kmem for each thread. This is a per-thread limit that
 * can be administered on a system wide basis (using /etc/system).
 *
 * Also, when a limit, say maxlwps is added for numbers of lwps within a
 * process, the per-thread limit automatically becomes a process-wide limit
 * of maximum number of held upi locks within a process:
 *      maxheldupimx = maxnestupimx * maxlwps;
 *
 * lwp_upimutex_lock() enforces the limit: when the thread's nesting count
 * exceeds it and secpolicy_resource() does not permit the excess, the
 * lock is released again and ENOMEM is returned.
 */
static uint32_t maxnestupimx = 2000;
83 
/*
 * The sobj_ops vector exports a set of functions needed when a thread
 * is asleep on a synchronization object of this type.
 * lwp_block() stores a pointer to this vector in t_sobj_ops.
 */
static sobj_ops_t lwp_sobj_ops = {
	SOBJ_USER, lwpsobj_owner, lwp_unsleep, lwp_change_pri
};

static kthread_t *lwpsobj_pi_owner(upimutex_t *up);

/*
 * The sobj_ops vector used for priority-inheriting user mutexes.
 * lwp_upimutex_lock() hands it to turnstile_block(); unsleep and
 * priority changes are delegated to the turnstile routines.
 */
static sobj_ops_t lwp_sobj_pi_ops = {
	SOBJ_USER_PI, lwpsobj_pi_owner, turnstile_unsleep,
	turnstile_change_pri
};
98 
/* Sleep queue heads used by lwp_block(); lwpsqhash() selects the element. */
static sleepq_head_t	lwpsleepq[NSLEEPQ];
/*
 * Buckets of kernel upimutex objects.
 * NOTE(review): presumably what UPI_CHAIN() indexes -- the macro is
 * defined outside this file, confirm there.
 */
upib_t			upimutextab[UPIMUTEX_TABSIZE];
101 
/*
 * Each lwpchan lock pool contains (1 << LWPCHAN_LOCK_SHIFT) locks.
 */
#define	LWPCHAN_LOCK_SHIFT	10	/* 1024 locks for each pool */
#define	LWPCHAN_LOCK_SIZE	(1 << LWPCHAN_LOCK_SHIFT)

/*
 * Compute an index into the lock array for hash seed X.
 *
 * X is derived from lc_wchan and lc_wchan0, addresses that are most
 * likely 8-byte aligned, so the low-order 3 bits are shifted away.
 * The next two LWPCHAN_LOCK_SHIFT-bit fields are folded together to
 * give an index within a pool; a non-zero 'pool' then selects the
 * second pool by adding LWPCHAN_LOCK_SIZE.  'pool' is either 0 or 1.
 */
#define	LWPCHAN_LOCK_HASH(X, pool) \
	(((pool)? LWPCHAN_LOCK_SIZE : 0) + \
	((((X) >> (LWPCHAN_LOCK_SHIFT + 3)) ^ ((X) >> 3)) & \
	(LWPCHAN_LOCK_SIZE - 1)))
113 
/*
 * Two pools of LWPCHAN_LOCK_SIZE mutexes each.  lwpchan_lock() and
 * lwpchan_unlock() choose an element using LWPCHAN_LOCK_HASH().
 */
static kmutex_t		lwpchanlock[2 * LWPCHAN_LOCK_SIZE];
115 
/*
 * Is this a POSIX threads user-level lock requiring priority inheritance?
 * Evaluates to non-zero when LOCK_PRIO_INHERIT is set in 'type'.
 */
#define	UPIMUTEX(type)	((type) & LOCK_PRIO_INHERIT)
120 
121 static sleepq_head_t *
122 lwpsqhash(lwpchan_t *lwpchan)
123 {
124 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
125 	return (&lwpsleepq[SQHASHINDEX(x)]);
126 }
127 
128 /*
129  * Lock an lwpchan.
130  * Keep this in sync with lwpchan_unlock(), below.
131  */
132 static void
133 lwpchan_lock(lwpchan_t *lwpchan, int pool)
134 {
135 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
136 	mutex_enter(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
137 }
138 
139 /*
140  * Unlock an lwpchan.
141  * Keep this in sync with lwpchan_lock(), above.
142  */
143 static void
144 lwpchan_unlock(lwpchan_t *lwpchan, int pool)
145 {
146 	uint_t x = (uintptr_t)lwpchan->lc_wchan ^ (uintptr_t)lwpchan->lc_wchan0;
147 	mutex_exit(&lwpchanlock[LWPCHAN_LOCK_HASH(x, pool)]);
148 }
149 
150 /*
151  * Delete mappings from the lwpchan cache for pages that are being
152  * unmapped by as_unmap().  Given a range of addresses, "start" to "end",
153  * all mappings within the range are deleted from the lwpchan cache.
154  */
155 void
156 lwpchan_delete_mapping(proc_t *p, caddr_t start, caddr_t end)
157 {
158 	lwpchan_data_t *lcp;
159 	lwpchan_hashbucket_t *hashbucket;
160 	lwpchan_hashbucket_t *endbucket;
161 	lwpchan_entry_t *ent;
162 	lwpchan_entry_t **prev;
163 	caddr_t addr;
164 
165 	mutex_enter(&p->p_lcp_lock);
166 	lcp = p->p_lcp;
167 	hashbucket = lcp->lwpchan_cache;
168 	endbucket = hashbucket + lcp->lwpchan_size;
169 	for (; hashbucket < endbucket; hashbucket++) {
170 		if (hashbucket->lwpchan_chain == NULL)
171 			continue;
172 		mutex_enter(&hashbucket->lwpchan_lock);
173 		prev = &hashbucket->lwpchan_chain;
174 		/* check entire chain */
175 		while ((ent = *prev) != NULL) {
176 			addr = ent->lwpchan_addr;
177 			if (start <= addr && addr < end) {
178 				*prev = ent->lwpchan_next;
179 				/*
180 				 * We do this only for the obsolete type
181 				 * USYNC_PROCESS_ROBUST.  Otherwise robust
182 				 * locks do not draw ELOCKUNMAPPED or
183 				 * EOWNERDEAD due to being unmapped.
184 				 */
185 				if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
186 				    (ent->lwpchan_type & USYNC_PROCESS_ROBUST))
187 					lwp_mutex_cleanup(ent, LOCK_UNMAPPED);
188 				kmem_free(ent, sizeof (*ent));
189 				atomic_add_32(&lcp->lwpchan_entries, -1);
190 			} else {
191 				prev = &ent->lwpchan_next;
192 			}
193 		}
194 		mutex_exit(&hashbucket->lwpchan_lock);
195 	}
196 	mutex_exit(&p->p_lcp_lock);
197 }
198 
199 /*
200  * Given an lwpchan cache pointer and a process virtual address,
201  * return a pointer to the corresponding lwpchan hash bucket.
202  */
203 static lwpchan_hashbucket_t *
204 lwpchan_bucket(lwpchan_data_t *lcp, uintptr_t addr)
205 {
206 	uint_t i;
207 
208 	/*
209 	 * All user-level sync object addresses are 8-byte aligned.
210 	 * Ignore the lowest 3 bits of the address and use the
211 	 * higher-order 2*lwpchan_bits bits for the hash index.
212 	 */
213 	addr >>= 3;
214 	i = (addr ^ (addr >> lcp->lwpchan_bits)) & lcp->lwpchan_mask;
215 	return (lcp->lwpchan_cache + i);
216 }
217 
/*
 * (Re)allocate the per-process lwpchan cache.
 *
 * Builds a hash table of (1 << bits) buckets and installs it as p->p_lcp.
 * If a table already exists, its entries are moved into the new table and
 * the old table is linked onto the new table's lwpchan_next_data list
 * instead of being freed.  If the existing table already has at least
 * 'bits' bits, the new allocation is discarded and nothing changes.
 */
static void
lwpchan_alloc_cache(proc_t *p, uint_t bits)
{
	lwpchan_data_t *lcp;
	lwpchan_data_t *old_lcp;
	lwpchan_hashbucket_t *hashbucket;
	lwpchan_hashbucket_t *endbucket;
	lwpchan_hashbucket_t *newbucket;
	lwpchan_entry_t *ent;
	lwpchan_entry_t *next;
	uint_t count;

	ASSERT(bits >= LWPCHAN_INITIAL_BITS && bits <= LWPCHAN_MAX_BITS);

	/* The new table is built completely before p_lcp_lock is taken. */
	lcp = kmem_alloc(sizeof (lwpchan_data_t), KM_SLEEP);
	lcp->lwpchan_bits = bits;
	lcp->lwpchan_size = 1 << lcp->lwpchan_bits;
	lcp->lwpchan_mask = lcp->lwpchan_size - 1;
	lcp->lwpchan_entries = 0;
	lcp->lwpchan_cache = kmem_zalloc(lcp->lwpchan_size *
	    sizeof (lwpchan_hashbucket_t), KM_SLEEP);
	lcp->lwpchan_next_data = NULL;

	mutex_enter(&p->p_lcp_lock);
	if ((old_lcp = p->p_lcp) != NULL) {
		if (old_lcp->lwpchan_bits >= bits) {
			/* someone beat us to it */
			mutex_exit(&p->p_lcp_lock);
			kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
			    sizeof (lwpchan_hashbucket_t));
			kmem_free(lcp, sizeof (lwpchan_data_t));
			return;
		}
		/*
		 * Acquire all of the old hash table locks.
		 */
		hashbucket = old_lcp->lwpchan_cache;
		endbucket = hashbucket + old_lcp->lwpchan_size;
		for (; hashbucket < endbucket; hashbucket++)
			mutex_enter(&hashbucket->lwpchan_lock);
		/*
		 * Move all of the old hash table entries to the
		 * new hash table.  The new hash table has not yet
		 * been installed so we don't need any of its locks.
		 */
		count = 0;
		hashbucket = old_lcp->lwpchan_cache;
		for (; hashbucket < endbucket; hashbucket++) {
			ent = hashbucket->lwpchan_chain;
			while (ent != NULL) {
				/* save the link; ent is re-chained below */
				next = ent->lwpchan_next;
				newbucket = lwpchan_bucket(lcp,
				    (uintptr_t)ent->lwpchan_addr);
				ent->lwpchan_next = newbucket->lwpchan_chain;
				newbucket->lwpchan_chain = ent;
				ent = next;
				count++;
			}
			hashbucket->lwpchan_chain = NULL;
		}
		lcp->lwpchan_entries = count;
	}

	/*
	 * Retire the old hash table.  We can't actually kmem_free() it
	 * now because someone may still have a pointer to it.  Instead,
	 * we link it onto the new hash table's list of retired hash tables.
	 * The new hash table is double the size of the previous one, so
	 * the total size of all retired hash tables is less than the size
	 * of the new one.  exit() and exec() free the retired hash tables
	 * (see lwpchan_destroy_cache(), below).
	 */
	lcp->lwpchan_next_data = old_lcp;

	/*
	 * As soon as we store the new lcp, future locking operations will
	 * use it.  Therefore, we must ensure that all the state we've just
	 * established reaches global visibility before the new lcp does.
	 */
	membar_producer();
	p->p_lcp = lcp;

	if (old_lcp != NULL) {
		/*
		 * Release all of the old hash table locks.
		 * endbucket still marks the end of the old table here.
		 */
		hashbucket = old_lcp->lwpchan_cache;
		for (; hashbucket < endbucket; hashbucket++)
			mutex_exit(&hashbucket->lwpchan_lock);
	}
	mutex_exit(&p->p_lcp_lock);
}
313 
314 /*
315  * Deallocate the lwpchan cache, and any dynamically allocated mappings.
316  * Called when the process exits or execs.  All lwps except one have
317  * exited so we need no locks here.
318  */
319 void
320 lwpchan_destroy_cache(int exec)
321 {
322 	proc_t *p = curproc;
323 	lwpchan_hashbucket_t *hashbucket;
324 	lwpchan_hashbucket_t *endbucket;
325 	lwpchan_data_t *lcp;
326 	lwpchan_entry_t *ent;
327 	lwpchan_entry_t *next;
328 	uint16_t lockflg;
329 
330 	lcp = p->p_lcp;
331 	p->p_lcp = NULL;
332 
333 	lockflg = exec? LOCK_UNMAPPED : LOCK_OWNERDEAD;
334 	hashbucket = lcp->lwpchan_cache;
335 	endbucket = hashbucket + lcp->lwpchan_size;
336 	for (; hashbucket < endbucket; hashbucket++) {
337 		ent = hashbucket->lwpchan_chain;
338 		hashbucket->lwpchan_chain = NULL;
339 		while (ent != NULL) {
340 			next = ent->lwpchan_next;
341 			if (ent->lwpchan_pool == LWPCHAN_MPPOOL &&
342 			    (ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
343 			    == (USYNC_PROCESS | LOCK_ROBUST))
344 				lwp_mutex_cleanup(ent, lockflg);
345 			kmem_free(ent, sizeof (*ent));
346 			ent = next;
347 		}
348 	}
349 
350 	while (lcp != NULL) {
351 		lwpchan_data_t *next_lcp = lcp->lwpchan_next_data;
352 		kmem_free(lcp->lwpchan_cache, lcp->lwpchan_size *
353 		    sizeof (lwpchan_hashbucket_t));
354 		kmem_free(lcp, sizeof (lwpchan_data_t));
355 		lcp = next_lcp;
356 	}
357 }
358 
359 /*
360  * Return zero when there is an entry in the lwpchan cache for the
361  * given process virtual address and non-zero when there is not.
362  * The returned non-zero value is the current length of the
363  * hash chain plus one.  The caller holds the hash bucket lock.
364  */
365 static uint_t
366 lwpchan_cache_mapping(caddr_t addr, int type, int pool, lwpchan_t *lwpchan,
367 	lwpchan_hashbucket_t *hashbucket)
368 {
369 	lwpchan_entry_t *ent;
370 	uint_t count = 1;
371 
372 	for (ent = hashbucket->lwpchan_chain; ent; ent = ent->lwpchan_next) {
373 		if (ent->lwpchan_addr == addr) {
374 			if (ent->lwpchan_type != type ||
375 			    ent->lwpchan_pool != pool) {
376 				/*
377 				 * This shouldn't happen, but might if the
378 				 * process reuses its memory for different
379 				 * types of sync objects.  We test first
380 				 * to avoid grabbing the memory cache line.
381 				 */
382 				ent->lwpchan_type = (uint16_t)type;
383 				ent->lwpchan_pool = (uint16_t)pool;
384 			}
385 			*lwpchan = ent->lwpchan_lwpchan;
386 			return (0);
387 		}
388 		count++;
389 	}
390 	return (count);
391 }
392 
/*
 * Return the cached lwpchan mapping if cached, otherwise insert
 * a virtual address to lwpchan mapping into the cache.
 *
 * Returns 1 with *lwpchan filled in on success, or 0 when as_getmemid()
 * fails for 'addr'.  The bucket lock is dropped around as_getmemid() and
 * the allocation, so the table pointer and the chain are both re-checked
 * after the lock is re-acquired.
 */
static int
lwpchan_get_mapping(struct as *as, caddr_t addr,
	int type, lwpchan_t *lwpchan, int pool)
{
	proc_t *p = curproc;
	lwpchan_data_t *lcp;
	lwpchan_hashbucket_t *hashbucket;
	lwpchan_entry_t *ent;
	memid_t	memid;
	uint_t count;
	uint_t bits;

top:
	/* initialize the lwpchan cache, if necessary */
	if ((lcp = p->p_lcp) == NULL) {
		lwpchan_alloc_cache(p, LWPCHAN_INITIAL_BITS);
		goto top;
	}
	hashbucket = lwpchan_bucket(lcp, (uintptr_t)addr);
	mutex_enter(&hashbucket->lwpchan_lock);
	if (lcp != p->p_lcp) {
		/* someone resized the lwpchan cache; start over */
		mutex_exit(&hashbucket->lwpchan_lock);
		goto top;
	}
	if (lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket) == 0) {
		/* it's in the cache */
		mutex_exit(&hashbucket->lwpchan_lock);
		return (1);
	}
	/* cache miss: drop the bucket lock while computing the mapping */
	mutex_exit(&hashbucket->lwpchan_lock);
	if (as_getmemid(as, addr, &memid) != 0)
		return (0);
	lwpchan->lc_wchan0 = (caddr_t)(uintptr_t)memid.val[0];
	lwpchan->lc_wchan = (caddr_t)(uintptr_t)memid.val[1];
	ent = kmem_alloc(sizeof (lwpchan_entry_t), KM_SLEEP);
	mutex_enter(&hashbucket->lwpchan_lock);
	if (lcp != p->p_lcp) {
		/* someone resized the lwpchan cache; start over */
		mutex_exit(&hashbucket->lwpchan_lock);
		kmem_free(ent, sizeof (*ent));
		goto top;
	}
	count = lwpchan_cache_mapping(addr, type, pool, lwpchan, hashbucket);
	if (count == 0) {
		/* someone else added this entry to the cache */
		mutex_exit(&hashbucket->lwpchan_lock);
		kmem_free(ent, sizeof (*ent));
		return (1);
	}
	if (count > lcp->lwpchan_bits + 2 && /* larger table, longer chains */
	    (bits = lcp->lwpchan_bits) < LWPCHAN_MAX_BITS) {
		/* hash chain too long; reallocate the hash table */
		mutex_exit(&hashbucket->lwpchan_lock);
		kmem_free(ent, sizeof (*ent));
		lwpchan_alloc_cache(p, bits + 1);
		goto top;
	}
	/* insert the new entry at the head of the chain */
	ent->lwpchan_addr = addr;
	ent->lwpchan_type = (uint16_t)type;
	ent->lwpchan_pool = (uint16_t)pool;
	ent->lwpchan_lwpchan = *lwpchan;
	ent->lwpchan_next = hashbucket->lwpchan_chain;
	hashbucket->lwpchan_chain = ent;
	atomic_add_32(&lcp->lwpchan_entries, 1);
	mutex_exit(&hashbucket->lwpchan_lock);
	return (1);
}
465 
466 /*
467  * Return a unique pair of identifiers that corresponds to a
468  * synchronization object's virtual address.  Process-shared
469  * sync objects usually get vnode/offset from as_getmemid().
470  */
471 static int
472 get_lwpchan(struct as *as, caddr_t addr, int type, lwpchan_t *lwpchan, int pool)
473 {
474 	/*
475 	 * If the lwp synch object is defined to be process-private,
476 	 * we just make the first field of the lwpchan be 'as' and
477 	 * the second field be the synch object's virtual address.
478 	 * (segvn_getmemid() does the same for MAP_PRIVATE mappings.)
479 	 * The lwpchan cache is used only for process-shared objects.
480 	 */
481 	if (!(type & USYNC_PROCESS)) {
482 		lwpchan->lc_wchan0 = (caddr_t)as;
483 		lwpchan->lc_wchan = addr;
484 		return (1);
485 	}
486 
487 	return (lwpchan_get_mapping(as, addr, type, lwpchan, pool));
488 }
489 
/*
 * Prepare curthread to sleep on 'lwpchan': record the lwpchan and the
 * lwp_sobj_ops vector in the thread, insert the thread on the sleep
 * queue chosen by lwpsqhash(), and update the lwp's sleep bookkeeping.
 * This function does not itself switch away from the thread.
 */
static void
lwp_block(lwpchan_t *lwpchan)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	sleepq_head_t *sqh;

	thread_lock(t);
	t->t_flag |= T_WAKEABLE;
	t->t_lwpchan = *lwpchan;
	t->t_sobj_ops = &lwp_sobj_ops;
	t->t_release = 0;
	sqh = lwpsqhash(lwpchan);
	disp_lock_enter_high(&sqh->sq_lock);
	CL_SLEEP(t);
	DTRACE_SCHED(sleep);
	/*
	 * NOTE(review): THREAD_SLEEP() presumably makes sq_lock the
	 * thread's lock, so that thread_unlock() below releases it --
	 * confirm against the macro definition.
	 */
	THREAD_SLEEP(t, &sqh->sq_lock);
	sleepq_insert(&sqh->sq_queue, t);
	thread_unlock(t);
	lwp->lwp_asleep = 1;
	lwp->lwp_sysabort = 0;
	lwp->lwp_ru.nvcsw++;
	(void) new_mstate(curthread, LMS_SLEEP);
}
514 
515 static kthread_t *
516 lwpsobj_pi_owner(upimutex_t *up)
517 {
518 	return (up->upi_owner);
519 }
520 
521 static struct upimutex *
522 upi_get(upib_t *upibp, lwpchan_t *lcp)
523 {
524 	struct upimutex *upip;
525 
526 	for (upip = upibp->upib_first; upip != NULL;
527 	    upip = upip->upi_nextchain) {
528 		if (upip->upi_lwpchan.lc_wchan0 == lcp->lc_wchan0 &&
529 		    upip->upi_lwpchan.lc_wchan == lcp->lc_wchan)
530 			break;
531 	}
532 	return (upip);
533 }
534 
535 static void
536 upi_chain_add(upib_t *upibp, struct upimutex *upimutex)
537 {
538 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
539 
540 	/*
541 	 * Insert upimutex at front of list. Maybe a bit unfair
542 	 * but assume that not many lwpchans hash to the same
543 	 * upimutextab bucket, i.e. the list of upimutexes from
544 	 * upib_first is not too long.
545 	 */
546 	upimutex->upi_nextchain = upibp->upib_first;
547 	upibp->upib_first = upimutex;
548 }
549 
550 static void
551 upi_chain_del(upib_t *upibp, struct upimutex *upimutex)
552 {
553 	struct upimutex **prev;
554 
555 	ASSERT(MUTEX_HELD(&upibp->upib_lock));
556 
557 	prev = &upibp->upib_first;
558 	while (*prev != upimutex) {
559 		prev = &(*prev)->upi_nextchain;
560 	}
561 	*prev = upimutex->upi_nextchain;
562 	upimutex->upi_nextchain = NULL;
563 }
564 
565 /*
566  * Add upimutex to chain of upimutexes held by curthread.
567  * Returns number of upimutexes held by curthread.
568  */
569 static uint32_t
570 upi_mylist_add(struct upimutex *upimutex)
571 {
572 	kthread_t *t = curthread;
573 
574 	/*
575 	 * Insert upimutex at front of list of upimutexes owned by t. This
576 	 * would match typical LIFO order in which nested locks are acquired
577 	 * and released.
578 	 */
579 	upimutex->upi_nextowned = t->t_upimutex;
580 	t->t_upimutex = upimutex;
581 	t->t_nupinest++;
582 	ASSERT(t->t_nupinest > 0);
583 	return (t->t_nupinest);
584 }
585 
586 /*
587  * Delete upimutex from list of upimutexes owned by curthread.
588  */
589 static void
590 upi_mylist_del(struct upimutex *upimutex)
591 {
592 	kthread_t *t = curthread;
593 	struct upimutex **prev;
594 
595 	/*
596 	 * Since the order in which nested locks are acquired and released,
597 	 * is typically LIFO, and typical nesting levels are not too deep, the
598 	 * following should not be expensive in the general case.
599 	 */
600 	prev = &t->t_upimutex;
601 	while (*prev != upimutex) {
602 		prev = &(*prev)->upi_nextowned;
603 	}
604 	*prev = upimutex->upi_nextowned;
605 	upimutex->upi_nextowned = NULL;
606 	ASSERT(t->t_nupinest > 0);
607 	t->t_nupinest--;
608 }
609 
610 /*
611  * Returns true if upimutex is owned. Should be called only when upim points
612  * to kmem which cannot disappear from underneath.
613  */
614 static int
615 upi_owned(upimutex_t *upim)
616 {
617 	return (upim->upi_owner == curthread);
618 }
619 
620 /*
621  * Returns pointer to kernel object (upimutex_t *) if lp is owned.
622  */
623 static struct upimutex *
624 lwp_upimutex_owned(lwp_mutex_t *lp, uint8_t type)
625 {
626 	lwpchan_t lwpchan;
627 	upib_t *upibp;
628 	struct upimutex *upimutex;
629 
630 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
631 	    &lwpchan, LWPCHAN_MPPOOL))
632 		return (NULL);
633 
634 	upibp = &UPI_CHAIN(lwpchan);
635 	mutex_enter(&upibp->upib_lock);
636 	upimutex = upi_get(upibp, &lwpchan);
637 	if (upimutex == NULL || upimutex->upi_owner != curthread) {
638 		mutex_exit(&upibp->upib_lock);
639 		return (NULL);
640 	}
641 	mutex_exit(&upibp->upib_lock);
642 	return (upimutex);
643 }
644 
/*
 * Unlocks upimutex, waking up waiters if any. upimutex kmem is freed if
 * no lock hand-off occurs.
 *
 * The upimutex is first removed from curthread's owned list.  If the
 * waiter bit is set, a turnstile is found, and 'flag' does not contain
 * LOCK_NOTRECOVERABLE, ownership is handed to the first thread on the
 * turnstile's writer queue and the upimutex stays allocated.  In every
 * other case the upimutex is unlinked from its bucket chain and freed.
 */
static void
upimutex_unlock(struct upimutex *upimutex, uint16_t flag)
{
	turnstile_t *ts;
	upib_t *upibp;
	kthread_t *newowner;

	upi_mylist_del(upimutex);
	upibp = upimutex->upi_upibp;
	mutex_enter(&upibp->upib_lock);
	if (upimutex->upi_waiter != 0) { /* if waiters */
		ts = turnstile_lookup(upimutex);
		if (ts != NULL && !(flag & LOCK_NOTRECOVERABLE)) {
			/* hand-off lock to highest prio waiter */
			newowner = ts->ts_sleepq[TS_WRITER_Q].sq_first;
			upimutex->upi_owner = newowner;
			/* last waiter is being woken: clear the w bit */
			if (ts->ts_waiters == 1)
				upimutex->upi_waiter = 0;
			turnstile_wakeup(ts, TS_WRITER_Q, 1, newowner);
			mutex_exit(&upibp->upib_lock);
			return;
		} else if (ts != NULL) {
			/* LOCK_NOTRECOVERABLE: wakeup all */
			turnstile_wakeup(ts, TS_WRITER_Q, ts->ts_waiters, NULL);
		} else {
			/*
			 * Misleading w bit. Waiters might have been
			 * interrupted. No need to clear the w bit (upimutex
			 * will soon be freed). Re-calculate PI from existing
			 * waiters.
			 */
			turnstile_exit(upimutex);
			turnstile_pi_recalc();
		}
	}
	/*
	 * no waiters, or LOCK_NOTRECOVERABLE.
	 * remove from the bucket chain of upi mutexes.
	 * de-allocate kernel memory (upimutex).
	 */
	upi_chain_del(upimutex->upi_upibp, upimutex);
	mutex_exit(&upibp->upib_lock);
	kmem_free(upimutex, sizeof (upimutex_t));
}
693 
/*
 * Acquire the priority-inheriting user-level mutex at user address 'lp'.
 *
 * 'type' is the lock type passed on to get_lwpchan().  If 'try' equals
 * UPIMUTEX_TRY the call returns EBUSY instead of blocking.  'lwptp'
 * supplies the optional timeout (lwpt_tsp) and any deferred timer
 * validation error (lwpt_time_error).
 *
 * Returns 0 with the lock held, or an errno value:
 *	EFAULT		 the lwpchan could not be obtained, or user memory
 *			 faulted (any lock already taken is released)
 *	EDEADLK		 curthread already owns the lock, or as reported
 *			 by turnstile_block()
 *	EBUSY		 lock is owned by another thread and 'try' was set
 *	ENOMEM		 nesting limit maxnestupimx exceeded (lock released)
 *	ENOTRECOVERABLE	 LOCK_NOTRECOVERABLE is set in the user flag
 *	EOWNERDEAD,
 *	ELOCKUNMAPPED	 returned with the lock still held
 *	EINTR, ETIME	 from turnstile_block(); the lock is not held
 *	lwpt_time_error	 the timer was invalid and we would have slept
 */
static int
lwp_upimutex_lock(lwp_mutex_t *lp, uint8_t type, int try, lwp_timer_t *lwptp)
{
	label_t ljb;
	int error = 0;
	lwpchan_t lwpchan;
	uint16_t flag;
	upib_t *upibp;
	/*
	 * upimutex and upilocked are read in the on_fault() branch below.
	 * NOTE(review): presumably volatile so their values are reliable
	 * when control arrives there after a fault -- confirm.
	 */
	volatile struct upimutex *upimutex = NULL;
	turnstile_t *ts;
	uint32_t nupinest;
	volatile int upilocked = 0;

	if (on_fault(&ljb)) {
		/* fault on user memory: give back the lock if we hold it */
		if (upilocked)
			upimutex_unlock((upimutex_t *)upimutex, 0);
		error = EFAULT;
		goto out;
	}
	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
	    &lwpchan, LWPCHAN_MPPOOL)) {
		error = EFAULT;
		goto out;
	}
	upibp = &UPI_CHAIN(lwpchan);
retry:
	mutex_enter(&upibp->upib_lock);
	upimutex = upi_get(upibp, &lwpchan);
	if (upimutex == NULL)  {
		/* lock available since lwpchan has no upimutex */
		upimutex = kmem_zalloc(sizeof (upimutex_t), KM_SLEEP);
		upi_chain_add(upibp, (upimutex_t *)upimutex);
		upimutex->upi_owner = curthread; /* grab lock */
		upimutex->upi_upibp = upibp;
		upimutex->upi_vaddr = lp;
		upimutex->upi_lwpchan = lwpchan;
		mutex_exit(&upibp->upib_lock);
		nupinest = upi_mylist_add((upimutex_t *)upimutex);
		upilocked = 1;
		fuword16_noerr(&lp->mutex_flag, &flag);
		if (nupinest > maxnestupimx &&
		    secpolicy_resource(CRED()) != 0) {
			upimutex_unlock((upimutex_t *)upimutex, flag);
			error = ENOMEM;
			goto out;
		}
		if (flag & LOCK_NOTRECOVERABLE) {
			/*
			 * Since the setting of LOCK_NOTRECOVERABLE
			 * was done under the high-level upi mutex,
			 * in lwp_upimutex_unlock(), this flag needs to
			 * be checked while holding the upi mutex.
			 * If set, this thread should return without
			 * the lock held, and with the right error code.
			 */
			upimutex_unlock((upimutex_t *)upimutex, flag);
			upilocked = 0;
			error = ENOTRECOVERABLE;
		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
			if (flag & LOCK_OWNERDEAD)
				error = EOWNERDEAD;
			else if (type & USYNC_PROCESS_ROBUST)
				error = ELOCKUNMAPPED;
			else
				error = EOWNERDEAD;
		}
		goto out;
	}
	/*
	 * If a upimutex object exists, it must have an owner.
	 * This is due to lock hand-off, and release of upimutex when no
	 * waiters are present at unlock time.
	 */
	ASSERT(upimutex->upi_owner != NULL);
	if (upimutex->upi_owner == curthread) {
		/*
		 * The user wrapper can check if the mutex type is
		 * ERRORCHECK: if not, it should stall at user-level.
		 * If so, it should return the error code.
		 */
		mutex_exit(&upibp->upib_lock);
		error = EDEADLK;
		goto out;
	}
	if (try == UPIMUTEX_TRY) {
		mutex_exit(&upibp->upib_lock);
		error = EBUSY;
		goto out;
	}
	/*
	 * Block for the lock.
	 * Put the lwp in an orderly state for debugging.
	 * Calling prstop() has to be done here, and not in
	 * turnstile_block(), since the preceding call to
	 * turnstile_lookup() raises the PIL to a level
	 * at which calls to prstop() should not be made.
	 */
	if ((error = lwptp->lwpt_time_error) != 0) {
		/*
		 * The SUSV3 Posix spec is very clear that we
		 * should get no error from validating the
		 * timer until we would actually sleep.
		 */
		mutex_exit(&upibp->upib_lock);
		goto out;
	}
	prstop(PR_REQUESTED, 0);
	if (lwptp->lwpt_tsp != NULL) {
		/*
		 * Unlike the protocol for other lwp timedwait operations,
		 * we must drop t_delay_lock before going to sleep in
		 * turnstile_block() for a upi mutex.
		 * See the comments below and in turnstile.c
		 */
		mutex_enter(&curthread->t_delay_lock);
		(void) lwp_timer_enqueue(lwptp);
		mutex_exit(&curthread->t_delay_lock);
	}
	/*
	 * Now, set the waiter bit and block for the lock in turnstile_block().
	 * No need to preserve the previous wbit since a lock try is not
	 * attempted after setting the wait bit. Wait bit is set under
	 * the upib_lock, which is not released until the turnstile lock
	 * is acquired. Say, the upimutex is L:
	 *
	 * 1. upib_lock is held so the waiter does not have to retry L after
	 *    setting the wait bit: since the owner has to grab the upib_lock
	 *    to unlock L, it will certainly see the wait bit set.
	 * 2. upib_lock is not released until the turnstile lock is acquired.
	 *    This is the key to preventing a missed wake-up. Otherwise, the
	 *    owner could acquire the upib_lock, and the tc_lock, to call
	 *    turnstile_wakeup(). All this, before the waiter gets tc_lock
	 *    to sleep in turnstile_block(). turnstile_wakeup() will then not
	 *    find this waiter, resulting in the missed wakeup.
	 * 3. The upib_lock, being a kernel mutex, cannot be released while
	 *    holding the tc_lock (since mutex_exit() could need to acquire
	 *    the same tc_lock)...and so is held when calling turnstile_block().
	 *    The address of upib_lock is passed to turnstile_block() which
	 *    releases it after releasing all turnstile locks, and before going
	 *    to sleep in swtch().
	 * 4. The waiter value cannot be a count of waiters, because a waiter
	 *    can be interrupted. The interrupt occurs under the tc_lock, at
	 *    which point, the upib_lock cannot be locked, to decrement waiter
	 *    count. So, just treat the waiter state as a bit, not a count.
	 */
	ts = turnstile_lookup((upimutex_t *)upimutex);
	upimutex->upi_waiter = 1;
	error = turnstile_block(ts, TS_WRITER_Q, (upimutex_t *)upimutex,
	    &lwp_sobj_pi_ops, &upibp->upib_lock, lwptp);
	/*
	 * Hand-off implies that we wakeup holding the lock, except when:
	 *	- deadlock is detected
	 *	- lock is not recoverable
	 *	- we got an interrupt or timeout
	 * If we wake up due to an interrupt or timeout, we may
	 * or may not be holding the lock due to mutex hand-off.
	 * Use lwp_upimutex_owned() to check if we do hold the lock.
	 */
	if (error != 0) {
		if ((error == EINTR || error == ETIME) &&
		    (upimutex = lwp_upimutex_owned(lp, type))) {
			/*
			 * Unlock and return - the re-startable syscall will
			 * try the lock again if we got EINTR.
			 * The upimutex is put on our owned list first
			 * because upimutex_unlock() removes it from there.
			 */
			(void) upi_mylist_add((upimutex_t *)upimutex);
			upimutex_unlock((upimutex_t *)upimutex, 0);
		}
		/*
		 * The only other possible error is EDEADLK.  If so, upimutex
		 * is valid, since its owner is deadlocked with curthread.
		 */
		ASSERT(error == EINTR || error == ETIME ||
		    (error == EDEADLK && !upi_owned((upimutex_t *)upimutex)));
		ASSERT(!lwp_upimutex_owned(lp, type));
		goto out;
	}
	if (lwp_upimutex_owned(lp, type)) {
		ASSERT(lwp_upimutex_owned(lp, type) == upimutex);
		nupinest = upi_mylist_add((upimutex_t *)upimutex);
		upilocked = 1;
	}
	/*
	 * Now, need to read the user-level lp->mutex_flag to do the following:
	 *
	 * - if lock is held, check if EOWNERDEAD or ELOCKUNMAPPED
	 *   should be returned.
	 * - if lock isn't held, check if ENOTRECOVERABLE should
	 *   be returned.
	 *
	 * Now, either lp->mutex_flag is readable or it's not. If not
	 * readable, the on_fault path will cause a return with EFAULT
	 * as it should.  If it is readable, the state of the flag
	 * encodes the robustness state of the lock:
	 *
	 * If the upimutex is locked here, the flag's LOCK_OWNERDEAD
	 * or LOCK_UNMAPPED setting will influence the return code
	 * appropriately.  If the upimutex is not locked here, this
	 * could be due to a spurious wake-up or a NOTRECOVERABLE
	 * event.  The flag's setting can be used to distinguish
	 * between these two events.
	 */
	fuword16_noerr(&lp->mutex_flag, &flag);
	if (upilocked) {
		/*
		 * If the thread wakes up from turnstile_block with the lock
		 * held, the flag could not be set to LOCK_NOTRECOVERABLE,
		 * since it would not have been handed-off the lock.
		 * So, no need to check for this case.
		 */
		if (nupinest > maxnestupimx &&
		    secpolicy_resource(CRED()) != 0) {
			upimutex_unlock((upimutex_t *)upimutex, flag);
			upilocked = 0;
			error = ENOMEM;
		} else if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
			if (flag & LOCK_OWNERDEAD)
				error = EOWNERDEAD;
			else if (type & USYNC_PROCESS_ROBUST)
				error = ELOCKUNMAPPED;
			else
				error = EOWNERDEAD;
		}
	} else {
		/*
		 * Wake-up without the upimutex held. Either this is a
		 * spurious wake-up (due to signals, forkall(), whatever), or
		 * it is a LOCK_NOTRECOVERABLE robustness event. The setting
		 * of the mutex flag can be used to distinguish between the
		 * two events.
		 */
		if (flag & LOCK_NOTRECOVERABLE) {
			error = ENOTRECOVERABLE;
		} else {
			/*
			 * Here, the flag could be set to LOCK_OWNERDEAD or
			 * not. In both cases, this is a spurious wakeup,
			 * since the upi lock is not held, but the thread
			 * has returned from turnstile_block().
			 *
			 * The user flag could be LOCK_OWNERDEAD if, at the
			 * same time as curthread having been woken up
			 * spuriously, the owner (say Tdead) has died, marked
			 * the mutex flag accordingly, and handed off the lock
			 * to some other waiter (say Tnew). curthread just
			 * happened to read the flag while Tnew has yet to deal
			 * with the owner-dead event.
			 *
			 * In this event, curthread should retry the lock.
			 * If Tnew is able to cleanup the lock, curthread
			 * will eventually get the lock with a zero error code.
			 * If Tnew is unable to cleanup, its eventual call to
			 * unlock the lock will result in the mutex flag being
			 * set to LOCK_NOTRECOVERABLE, and the wake-up of
			 * all waiters, including curthread, which will then
			 * eventually return ENOTRECOVERABLE due to the above
			 * check.
			 *
			 * Of course, if the user-flag is not set with
			 * LOCK_OWNERDEAD, retrying is the thing to do, since
			 * this is definitely a spurious wakeup.
			 */
			goto retry;
		}
	}

out:
	/* every return path leaves through here to undo on_fault() */
	no_fault();
	return (error);
}
964 
965 
/*
 * Release the kernel upimutex that backs the user-level mutex "lp",
 * provided that curthread is its owner.
 *
 *	lp	user address of the lwp_mutex_t
 *	type	mutex type; handed to get_lwpchan() and tested for
 *		USYNC_PROCESS
 *
 * Returns 0 on success, EPERM if no upimutex exists for the lwpchan or
 * curthread does not own it, and EFAULT if get_lwpchan() fails or a
 * fault is taken while accessing the user-level mutex.
 */
static int
lwp_upimutex_unlock(lwp_mutex_t *lp, uint8_t type)
{
	label_t ljb;
	int error = 0;
	lwpchan_t lwpchan;
	uint16_t flag;
	upib_t *upibp;
	volatile struct upimutex *upimutex = NULL;
	volatile int upilocked = 0;

	if (on_fault(&ljb)) {
		/*
		 * A fault was taken on the user mutex.  upilocked is set
		 * only after ownership has been verified below, so the
		 * upimutex is released here only if it is known to be ours.
		 */
		if (upilocked)
			upimutex_unlock((upimutex_t *)upimutex, 0);
		error = EFAULT;
		goto out;
	}
	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
	    &lwpchan, LWPCHAN_MPPOOL)) {
		error = EFAULT;
		goto out;
	}
	upibp = &UPI_CHAIN(lwpchan);
	mutex_enter(&upibp->upib_lock);
	upimutex = upi_get(upibp, &lwpchan);
	/*
	 * If the lock is not held, or the owner is not curthread, return
	 * error. The user-level wrapper can return this error or stall,
	 * depending on whether mutex is of ERRORCHECK type or not.
	 */
	if (upimutex == NULL || upimutex->upi_owner != curthread) {
		mutex_exit(&upibp->upib_lock);
		error = EPERM;
		goto out;
	}
	mutex_exit(&upibp->upib_lock); /* release for user memory access */
	upilocked = 1;
	fuword16_noerr(&lp->mutex_flag, &flag);
	if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
		/*
		 * transition mutex to the LOCK_NOTRECOVERABLE state.
		 */
		flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
		flag |= LOCK_NOTRECOVERABLE;
		suword16_noerr(&lp->mutex_flag, flag);
	}
	/* a USYNC_PROCESS mutex also records its owner's pid; clear it */
	if (type & USYNC_PROCESS)
		suword32_noerr(&lp->mutex_ownerpid, 0);
	/* the (possibly updated) user flag is passed along with the unlock */
	upimutex_unlock((upimutex_t *)upimutex, flag);
	upilocked = 0;
out:
	no_fault();
	return (error);
}
1020 
1021 /*
1022  * Clear the contents of a user-level mutex; return the flags.
1023  * Used only by upi_dead() and lwp_mutex_cleanup(), below.
1024  */
1025 static uint16_t
1026 lwp_clear_mutex(lwp_mutex_t *lp, uint16_t lockflg)
1027 {
1028 	uint16_t flag;
1029 
1030 	fuword16_noerr(&lp->mutex_flag, &flag);
1031 	if ((flag &
1032 	    (LOCK_OWNERDEAD | LOCK_UNMAPPED | LOCK_NOTRECOVERABLE)) == 0) {
1033 		flag |= lockflg;
1034 		suword16_noerr(&lp->mutex_flag, flag);
1035 	}
1036 	suword32_noerr((uint32_t *)&lp->mutex_owner, 0);
1037 	suword32_noerr((uint32_t *)&lp->mutex_owner + 1, 0);
1038 	suword32_noerr(&lp->mutex_ownerpid, 0);
1039 	suword8_noerr(&lp->mutex_rcount, 0);
1040 
1041 	return (flag);
1042 }
1043 
1044 /*
1045  * Mark user mutex state, corresponding to kernel upimutex,
1046  * as LOCK_UNMAPPED or LOCK_OWNERDEAD, as appropriate
1047  */
1048 static int
1049 upi_dead(upimutex_t *upip, uint16_t lockflg)
1050 {
1051 	label_t ljb;
1052 	int error = 0;
1053 	lwp_mutex_t *lp;
1054 
1055 	if (on_fault(&ljb)) {
1056 		error = EFAULT;
1057 		goto out;
1058 	}
1059 
1060 	lp = upip->upi_vaddr;
1061 	(void) lwp_clear_mutex(lp, lockflg);
1062 	suword8_noerr(&lp->mutex_lockw, 0);
1063 out:
1064 	no_fault();
1065 	return (error);
1066 }
1067 
1068 /*
1069  * Unlock all upimutexes held by curthread, since curthread is dying.
1070  * For each upimutex, attempt to mark its corresponding user mutex object as
1071  * dead.
1072  */
1073 void
1074 upimutex_cleanup()
1075 {
1076 	kthread_t *t = curthread;
1077 	uint16_t lockflg = (ttoproc(t)->p_proc_flag & P_PR_EXEC)?
1078 	    LOCK_UNMAPPED : LOCK_OWNERDEAD;
1079 	struct upimutex *upip;
1080 
1081 	while ((upip = t->t_upimutex) != NULL) {
1082 		if (upi_dead(upip, lockflg) != 0) {
1083 			/*
1084 			 * If the user object associated with this upimutex is
1085 			 * unmapped, unlock upimutex with the
1086 			 * LOCK_NOTRECOVERABLE flag, so that all waiters are
1087 			 * woken up. Since user object is unmapped, it could
1088 			 * not be marked as dead or notrecoverable.
1089 			 * The waiters will now all wake up and return
1090 			 * ENOTRECOVERABLE, since they would find that the lock
1091 			 * has not been handed-off to them.
1092 			 * See lwp_upimutex_lock().
1093 			 */
1094 			upimutex_unlock(upip, LOCK_NOTRECOVERABLE);
1095 		} else {
1096 			/*
1097 			 * The user object has been updated as dead.
1098 			 * Unlock the upimutex: if no waiters, upip kmem will
1099 			 * be freed. If there is a waiter, the lock will be
1100 			 * handed off. If exit() is in progress, each existing
1101 			 * waiter will successively get the lock, as owners
1102 			 * die, and each new owner will call this routine as
1103 			 * it dies. The last owner will free kmem, since
1104 			 * it will find the upimutex has no waiters. So,
1105 			 * eventually, the kmem is guaranteed to be freed.
1106 			 */
1107 			upimutex_unlock(upip, 0);
1108 		}
1109 		/*
1110 		 * Note that the call to upimutex_unlock() above will delete
1111 		 * upimutex from the t_upimutexes chain. And so the
1112 		 * while loop will eventually terminate.
1113 		 */
1114 	}
1115 }
1116 
1117 int
1118 lwp_mutex_timedlock(lwp_mutex_t *lp, timespec_t *tsp)
1119 {
1120 	kthread_t *t = curthread;
1121 	klwp_t *lwp = ttolwp(t);
1122 	proc_t *p = ttoproc(t);
1123 	lwp_timer_t lwpt;
1124 	caddr_t timedwait;
1125 	int error = 0;
1126 	int time_error;
1127 	clock_t tim = -1;
1128 	uchar_t waiters;
1129 	volatile int locked = 0;
1130 	volatile int watched = 0;
1131 	label_t ljb;
1132 	volatile uint8_t type = 0;
1133 	lwpchan_t lwpchan;
1134 	sleepq_head_t *sqh;
1135 	static int iswanted();
1136 	uint16_t flag;
1137 	int imm_timeout = 0;
1138 
1139 	if ((caddr_t)lp >= p->p_as->a_userlimit)
1140 		return (set_errno(EFAULT));
1141 
1142 	timedwait = (caddr_t)tsp;
1143 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
1144 	    lwpt.lwpt_imm_timeout) {
1145 		imm_timeout = 1;
1146 		timedwait = NULL;
1147 	}
1148 
1149 	/*
1150 	 * Although LMS_USER_LOCK implies "asleep waiting for user-mode lock",
1151 	 * this micro state is really a run state. If the thread indeed blocks,
1152 	 * this state becomes valid. If not, the state is converted back to
1153 	 * LMS_SYSTEM. So, it is OK to set the mstate here, instead of just
1154 	 * when blocking.
1155 	 */
1156 	(void) new_mstate(t, LMS_USER_LOCK);
1157 	if (on_fault(&ljb)) {
1158 		if (locked)
1159 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1160 		error = EFAULT;
1161 		goto out;
1162 	}
1163 	/*
1164 	 * Force Copy-on-write if necessary and ensure that the
1165 	 * synchronization object resides in read/write memory.
1166 	 * Cause an EFAULT return now if this is not so.
1167 	 */
1168 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
1169 	suword8_noerr(&lp->mutex_type, type);
1170 	if (UPIMUTEX(type)) {
1171 		no_fault();
1172 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_BLOCK, &lwpt);
1173 		if ((type & USYNC_PROCESS) &&
1174 		    (error == 0 ||
1175 		    error == EOWNERDEAD || error == ELOCKUNMAPPED))
1176 			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
1177 		if (tsp && !time_error)	/* copyout the residual time left */
1178 			error = lwp_timer_copyout(&lwpt, error);
1179 		if (error)
1180 			return (set_errno(error));
1181 		return (0);
1182 	}
1183 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
1184 	    &lwpchan, LWPCHAN_MPPOOL)) {
1185 		error = EFAULT;
1186 		goto out;
1187 	}
1188 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1189 	locked = 1;
1190 	if (type & LOCK_ROBUST) {
1191 		fuword16_noerr(&lp->mutex_flag, &flag);
1192 		if (flag & LOCK_NOTRECOVERABLE) {
1193 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1194 			error = ENOTRECOVERABLE;
1195 			goto out;
1196 		}
1197 	}
1198 	fuword8_noerr(&lp->mutex_waiters, &waiters);
1199 	suword8_noerr(&lp->mutex_waiters, 1);
1200 
1201 	/*
1202 	 * If watchpoints are set, they need to be restored, since
1203 	 * atomic accesses of memory such as the call to ulock_try()
1204 	 * below cannot be watched.
1205 	 */
1206 
1207 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1208 
1209 	while (!ulock_try(&lp->mutex_lockw)) {
1210 		if (time_error) {
1211 			/*
1212 			 * The SUSV3 Posix spec is very clear that we
1213 			 * should get no error from validating the
1214 			 * timer until we would actually sleep.
1215 			 */
1216 			error = time_error;
1217 			break;
1218 		}
1219 
1220 		if (watched) {
1221 			watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1222 			watched = 0;
1223 		}
1224 
1225 		/*
1226 		 * Put the lwp in an orderly state for debugging.
1227 		 */
1228 		prstop(PR_REQUESTED, 0);
1229 		if (timedwait) {
1230 			/*
1231 			 * If we successfully queue the timeout,
1232 			 * then don't drop t_delay_lock until
1233 			 * we are on the sleep queue (below).
1234 			 */
1235 			mutex_enter(&t->t_delay_lock);
1236 			if (lwp_timer_enqueue(&lwpt) != 0) {
1237 				mutex_exit(&t->t_delay_lock);
1238 				imm_timeout = 1;
1239 				timedwait = NULL;
1240 			}
1241 		}
1242 		lwp_block(&lwpchan);
1243 		/*
1244 		 * Nothing should happen to cause the lwp to go to
1245 		 * sleep again until after it returns from swtch().
1246 		 */
1247 		if (timedwait)
1248 			mutex_exit(&t->t_delay_lock);
1249 		locked = 0;
1250 		lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1251 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
1252 			setrun(t);
1253 		swtch();
1254 		t->t_flag &= ~T_WAKEABLE;
1255 		if (timedwait)
1256 			tim = lwp_timer_dequeue(&lwpt);
1257 		setallwatch();
1258 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
1259 			error = EINTR;
1260 		else if (imm_timeout || (timedwait && tim == -1))
1261 			error = ETIME;
1262 		if (error) {
1263 			lwp->lwp_asleep = 0;
1264 			lwp->lwp_sysabort = 0;
1265 			watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1266 			    S_WRITE);
1267 
1268 			/*
1269 			 * Need to re-compute waiters bit. The waiters field in
1270 			 * the lock is not reliable. Either of two things could
1271 			 * have occurred: no lwp may have called lwp_release()
1272 			 * for me but I have woken up due to a signal or
1273 			 * timeout.  In this case, the waiter bit is incorrect
1274 			 * since it is still set to 1, set above.
1275 			 * OR an lwp_release() did occur for some other lwp on
1276 			 * the same lwpchan. In this case, the waiter bit is
1277 			 * correct.  But which event occurred, one can't tell.
1278 			 * So, recompute.
1279 			 */
1280 			lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1281 			locked = 1;
1282 			sqh = lwpsqhash(&lwpchan);
1283 			disp_lock_enter(&sqh->sq_lock);
1284 			waiters = iswanted(sqh->sq_queue.sq_first, &lwpchan);
1285 			disp_lock_exit(&sqh->sq_lock);
1286 			break;
1287 		}
1288 		lwp->lwp_asleep = 0;
1289 		watched = watch_disable_addr((caddr_t)lp, sizeof (*lp),
1290 		    S_WRITE);
1291 		lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
1292 		locked = 1;
1293 		fuword8_noerr(&lp->mutex_waiters, &waiters);
1294 		suword8_noerr(&lp->mutex_waiters, 1);
1295 		if (type & LOCK_ROBUST) {
1296 			fuword16_noerr(&lp->mutex_flag, &flag);
1297 			if (flag & LOCK_NOTRECOVERABLE) {
1298 				error = ENOTRECOVERABLE;
1299 				break;
1300 			}
1301 		}
1302 	}
1303 
1304 	if (t->t_mstate == LMS_USER_LOCK)
1305 		(void) new_mstate(t, LMS_SYSTEM);
1306 
1307 	if (error == 0) {
1308 		if (type & USYNC_PROCESS)
1309 			suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
1310 		if (type & LOCK_ROBUST) {
1311 			fuword16_noerr(&lp->mutex_flag, &flag);
1312 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
1313 				if (flag & LOCK_OWNERDEAD)
1314 					error = EOWNERDEAD;
1315 				else if (type & USYNC_PROCESS_ROBUST)
1316 					error = ELOCKUNMAPPED;
1317 				else
1318 					error = EOWNERDEAD;
1319 			}
1320 		}
1321 	}
1322 	suword8_noerr(&lp->mutex_waiters, waiters);
1323 	locked = 0;
1324 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
1325 out:
1326 	no_fault();
1327 	if (watched)
1328 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
1329 	if (tsp && !time_error)		/* copyout the residual time left */
1330 		error = lwp_timer_copyout(&lwpt, error);
1331 	if (error)
1332 		return (set_errno(error));
1333 	return (0);
1334 }
1335 
/*
 * Obsolete lwp_mutex_lock() interface, no longer called from libc.
 * libc now calls lwp_mutex_timedlock(lp, NULL).
 * This system call trap continues to exist solely for the benefit
 * of old statically-linked binaries from Solaris 9 and before.
 * It should be removed from the system when we no longer care
 * about such applications.
 *
 * The request is forwarded to lwp_mutex_timedlock() with a NULL timespec
 * pointer, and that function's return value is handed back unchanged.
 */
int
lwp_mutex_lock(lwp_mutex_t *lp)
{
	return (lwp_mutex_timedlock(lp, NULL));
}
1349 
1350 static int
1351 iswanted(kthread_t *t, lwpchan_t *lwpchan)
1352 {
1353 	/*
1354 	 * The caller holds the dispatcher lock on the sleep queue.
1355 	 */
1356 	while (t != NULL) {
1357 		if (t->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1358 		    t->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1359 			return (1);
1360 		t = t->t_link;
1361 	}
1362 	return (0);
1363 }
1364 
1365 /*
1366  * Return the highest priority thread sleeping on this lwpchan.
1367  */
1368 static kthread_t *
1369 lwp_queue_waiter(lwpchan_t *lwpchan)
1370 {
1371 	sleepq_head_t *sqh;
1372 	kthread_t *tp;
1373 
1374 	sqh = lwpsqhash(lwpchan);
1375 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1376 	for (tp = sqh->sq_queue.sq_first; tp != NULL; tp = tp->t_link) {
1377 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1378 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan)
1379 			break;
1380 	}
1381 	disp_lock_exit(&sqh->sq_lock);
1382 	return (tp);
1383 }
1384 
1385 static int
1386 lwp_release(lwpchan_t *lwpchan, uchar_t *waiters, int sync_type)
1387 {
1388 	sleepq_head_t *sqh;
1389 	kthread_t *tp;
1390 	kthread_t **tpp;
1391 
1392 	sqh = lwpsqhash(lwpchan);
1393 	disp_lock_enter(&sqh->sq_lock);		/* lock the sleep queue */
1394 	tpp = &sqh->sq_queue.sq_first;
1395 	while ((tp = *tpp) != NULL) {
1396 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1397 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1398 			/*
1399 			 * The following is typically false. It could be true
1400 			 * only if lwp_release() is called from
1401 			 * lwp_mutex_wakeup() after reading the waiters field
1402 			 * from memory in which the lwp lock used to be, but has
1403 			 * since been re-used to hold a lwp cv or lwp semaphore.
1404 			 * The thread "tp" found to match the lwp lock's wchan
1405 			 * is actually sleeping for the cv or semaphore which
1406 			 * now has the same wchan. In this case, lwp_release()
1407 			 * should return failure.
1408 			 */
1409 			if (sync_type != (tp->t_flag & T_WAITCVSEM)) {
1410 				ASSERT(sync_type == 0);
1411 				/*
1412 				 * assert that this can happen only for mutexes
1413 				 * i.e. sync_type == 0, for correctly written
1414 				 * user programs.
1415 				 */
1416 				disp_lock_exit(&sqh->sq_lock);
1417 				return (0);
1418 			}
1419 			*waiters = iswanted(tp->t_link, lwpchan);
1420 			sleepq_unlink(tpp, tp);
1421 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1422 			tp->t_wchan0 = NULL;
1423 			tp->t_wchan = NULL;
1424 			tp->t_sobj_ops = NULL;
1425 			tp->t_release = 1;
1426 			THREAD_TRANSITION(tp);	/* drops sleepq lock */
1427 			CL_WAKEUP(tp);
1428 			thread_unlock(tp);	/* drop run queue lock */
1429 			return (1);
1430 		}
1431 		tpp = &tp->t_link;
1432 	}
1433 	*waiters = 0;
1434 	disp_lock_exit(&sqh->sq_lock);
1435 	return (0);
1436 }
1437 
1438 static void
1439 lwp_release_all(lwpchan_t *lwpchan)
1440 {
1441 	sleepq_head_t	*sqh;
1442 	kthread_t *tp;
1443 	kthread_t **tpp;
1444 
1445 	sqh = lwpsqhash(lwpchan);
1446 	disp_lock_enter(&sqh->sq_lock);		/* lock sleep q queue */
1447 	tpp = &sqh->sq_queue.sq_first;
1448 	while ((tp = *tpp) != NULL) {
1449 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
1450 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
1451 			sleepq_unlink(tpp, tp);
1452 			DTRACE_SCHED1(wakeup, kthread_t *, tp);
1453 			tp->t_wchan0 = NULL;
1454 			tp->t_wchan = NULL;
1455 			tp->t_sobj_ops = NULL;
1456 			CL_WAKEUP(tp);
1457 			thread_unlock_high(tp);	/* release run queue lock */
1458 		} else {
1459 			tpp = &tp->t_link;
1460 		}
1461 	}
1462 	disp_lock_exit(&sqh->sq_lock);		/* drop sleep q lock */
1463 }
1464 
/*
 * unblock a lwp that is trying to acquire this mutex. the blocked
 * lwp resumes and retries to acquire the lock.
 *
 *	lp		user address of the mutex
 *	release_all	if non-zero, every lwp sleeping on the mutex's
 *			lwpchan is woken with lwp_release_all() and the
 *			waiters byte is not written; otherwise at most one
 *			lwp is woken with lwp_release()
 *
 * Returns 0 on success.  Otherwise errno is set to EFAULT (lp is at or
 * above a_userlimit, get_lwpchan() failed, or a fault was taken while
 * accessing the mutex) and the set_errno() value is returned.
 */
int
lwp_mutex_wakeup(lwp_mutex_t *lp, int release_all)
{
	proc_t *p = ttoproc(curthread);
	lwpchan_t lwpchan;
	uchar_t waiters;
	volatile int locked = 0;
	volatile int watched = 0;
	volatile uint8_t type = 0;
	label_t ljb;
	int error = 0;

	if ((caddr_t)lp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);

	if (on_fault(&ljb)) {
		/* "locked" tracks whether the lwpchan lock is currently held */
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write if necessary and ensure that the
	 * synchronization object resides in read/write memory.
	 * Cause an EFAULT return now if this is not so.
	 */
	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
	suword8_noerr(&lp->mutex_type, type);
	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
	    &lwpchan, LWPCHAN_MPPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
	locked = 1;
	/*
	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
	 * may fail.  If it fails, do not write into the waiter bit.
	 * The call to lwp_release() might fail due to one of three reasons:
	 *
	 * 	1. due to the thread which set the waiter bit not actually
	 *	   sleeping since it got the lock on the re-try. The waiter
	 *	   bit will then be correctly updated by that thread. This
	 *	   window may be closed by reading the wait bit again here
	 *	   and not calling lwp_release() at all if it is zero.
	 *	2. the thread which set the waiter bit and went to sleep
	 *	   was woken up by a signal. This time, the waiter recomputes
	 *	   the wait bit in the return with EINTR code.
	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
	 *	   memory that has been re-used after the lock was dropped.
	 *	   In this case, writing into the waiter bit would cause data
	 *	   corruption.
	 */
	if (release_all)
		lwp_release_all(&lwpchan);
	else if (lwp_release(&lwpchan, &waiters, 0))
		suword8_noerr(&lp->mutex_waiters, waiters);
	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
out:
	no_fault();
	/* re-arm any watchpoint that was disabled on entry */
	if (watched)
		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}
1538 
/*
 * lwp_cond_wait() has four arguments, a pointer to a condition variable,
 * a pointer to a mutex, a pointer to a timespec for a timed wait and
 * a flag telling the kernel whether or not to honor the kernel/user
 * schedctl parking protocol (see schedctl_is_park() in schedctl.c).
 * The kernel puts the lwp to sleep on a unique pair of caddr_t's called an
 * lwpchan, returned by get_lwpchan().  If the timespec pointer is non-NULL,
 * it is used as an in/out parameter.  On entry, it contains the relative
 * time until timeout.  On exit, we copyout the residual time left to it.
 *
 * The mutex is released here before sleeping; it is NOT re-acquired in
 * the kernel, the caller does that on return to user level.
 *
 * Returns 0 when the wait completed without error.  Otherwise errno is
 * set and the set_errno() value returned; the errors produced here are
 * EFAULT (bad address or fault on cv/mp), EINTR (signal, abort or
 * immediate unpark), ETIME (timeout), plus any error reported by
 * lwp_timer_copyin() or lwp_upimutex_unlock().
 */
int
lwp_cond_wait(lwp_cond_t *cv, lwp_mutex_t *mp, timespec_t *tsp, int check_park)
{
	kthread_t *t = curthread;
	klwp_t *lwp = ttolwp(t);
	proc_t *p = ttoproc(t);
	lwp_timer_t lwpt;
	lwpchan_t cv_lwpchan;
	lwpchan_t m_lwpchan;
	caddr_t timedwait;
	volatile uint16_t type = 0;
	volatile uint8_t mtype = 0;
	uchar_t waiters;
	volatile int error;
	clock_t tim = -1;
	volatile int locked = 0;
	volatile int m_locked = 0;
	volatile int cvwatched = 0;
	volatile int mpwatched = 0;
	label_t ljb;
	volatile int no_lwpchan = 1;
	int imm_timeout = 0;
	int imm_unpark = 0;

	if ((caddr_t)cv >= p->p_as->a_userlimit ||
	    (caddr_t)mp >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	timedwait = (caddr_t)tsp;
	if ((error = lwp_timer_copyin(&lwpt, tsp)) != 0)
		return (set_errno(error));
	if (lwpt.lwpt_imm_timeout) {
		imm_timeout = 1;
		timedwait = NULL;
	}

	(void) new_mstate(t, LMS_USER_LOCK);

	if (on_fault(&ljb)) {
		/*
		 * no_lwpchan stays set until both lwpchans have been
		 * obtained.  A fault before that point returns EFAULT
		 * without going through "efault", i.e. without dropping
		 * the user-level mutex.
		 */
		if (no_lwpchan) {
			error = EFAULT;
			goto out;
		}
		/* release whichever lwpchan locks were held at the fault */
		if (m_locked) {
			m_locked = 0;
			lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
		}
		if (locked) {
			locked = 0;
			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
		}
		/*
		 * set up another on_fault() for a possible fault
		 * on the user lock accessed at "efault"
		 */
		if (on_fault(&ljb)) {
			if (m_locked) {
				m_locked = 0;
				lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
			}
			goto out;
		}
		error = EFAULT;
		goto efault;
	}

	/*
	 * Force Copy-on-write if necessary and ensure that the
	 * synchronization object resides in read/write memory.
	 * Cause an EFAULT return now if this is not so.
	 */
	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
	suword8_noerr(&mp->mutex_type, mtype);
	if (UPIMUTEX(mtype) == 0) {
		/* convert user level mutex, "mp", to a unique lwpchan */
		/* check if mtype is ok to use below, instead of type from cv */
		if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
		    &m_lwpchan, LWPCHAN_MPPOOL)) {
			error = EFAULT;
			goto out;
		}
	}
	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
	suword16_noerr(&cv->cond_type, type);
	/* convert user level condition variable, "cv", to a unique lwpchan */
	if (!get_lwpchan(p->p_as, (caddr_t)cv, type,
	    &cv_lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	no_lwpchan = 0;
	cvwatched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (UPIMUTEX(mtype) == 0)
		mpwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp),
		    S_WRITE);

	/*
	 * lwpchan_lock ensures that the calling lwp is put to sleep atomically
	 * with respect to a possible wakeup which is a result of either
	 * an lwp_cond_signal() or an lwp_cond_broadcast().
	 *
	 * What's misleading, is that the lwp is put to sleep after the
	 * condition variable's mutex is released.  This is OK as long as
	 * the release operation is also done while holding lwpchan_lock.
	 * The lwp is then put to sleep when the possibility of pagefaulting
	 * or sleeping is completely eliminated.
	 */
	lwpchan_lock(&cv_lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	if (UPIMUTEX(mtype) == 0) {
		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
		m_locked = 1;
		suword8_noerr(&cv->cond_waiters_kernel, 1);
		/*
		 * unlock the condition variable's mutex. (pagefaults are
		 * possible here.)
		 */
		if (mtype & USYNC_PROCESS)
			suword32_noerr(&mp->mutex_ownerpid, 0);
		ulock_clear(&mp->mutex_lockw);
		fuword8_noerr(&mp->mutex_waiters, &waiters);
		if (waiters != 0) {
			/*
			 * Given the locking of lwpchan_lock around the release
			 * of the mutex and checking for waiters, the following
			 * call to lwp_release() can fail ONLY if the lock
			 * acquirer is interrupted after setting the waiter bit,
			 * calling lwp_block() and releasing lwpchan_lock.
			 * In this case, it could get pulled off the lwp sleep
			 * q (via setrun()) before the following call to
			 * lwp_release() occurs. In this case, the lock
			 * requestor will update the waiter bit correctly by
			 * re-evaluating it.
			 */
			if (lwp_release(&m_lwpchan, &waiters, 0))
				suword8_noerr(&mp->mutex_waiters, waiters);
		}
		m_locked = 0;
		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
	} else {
		suword8_noerr(&cv->cond_waiters_kernel, 1);
		error = lwp_upimutex_unlock(mp, mtype);
		if (error) {	/* if the upimutex unlock failed */
			locked = 0;
			lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
			goto out;
		}
	}
	/* no user memory is touched from here until the sleep is over */
	no_fault();

	if (mpwatched) {
		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
		mpwatched = 0;
	}
	if (cvwatched) {
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
		cvwatched = 0;
	}

	/*
	 * Put the lwp in an orderly state for debugging.
	 */
	prstop(PR_REQUESTED, 0);
	if (check_park && (!schedctl_is_park() || t->t_unpark)) {
		/*
		 * We received a signal at user-level before calling here
		 * or another thread wants us to return immediately
		 * with EINTR.  See lwp_unpark().
		 */
		imm_unpark = 1;
		t->t_unpark = 0;
		timedwait = NULL;
	} else if (timedwait) {
		/*
		 * If we successfully queue the timeout,
		 * then don't drop t_delay_lock until
		 * we are on the sleep queue (below).
		 */
		mutex_enter(&t->t_delay_lock);
		if (lwp_timer_enqueue(&lwpt) != 0) {
			mutex_exit(&t->t_delay_lock);
			imm_timeout = 1;
			timedwait = NULL;
		}
	}
	/* mark the sleep as a cv/semaphore wait; lwp_release() checks this */
	t->t_flag |= T_WAITCVSEM;
	lwp_block(&cv_lwpchan);
	/*
	 * Nothing should happen to cause the lwp to go to sleep
	 * until after it returns from swtch().
	 */
	if (timedwait)
		mutex_exit(&t->t_delay_lock);
	locked = 0;
	lwpchan_unlock(&cv_lwpchan, LWPCHAN_CVPOOL);
	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
	    (imm_timeout | imm_unpark))
		setrun(t);
	swtch();
	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
	if (timedwait)
		tim = lwp_timer_dequeue(&lwpt);
	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
	    MUSTRETURN(p, t) || imm_unpark)
		error = EINTR;
	else if (imm_timeout || (timedwait && tim == -1))
		error = ETIME;
	lwp->lwp_asleep = 0;
	lwp->lwp_sysabort = 0;
	setallwatch();

	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);

	if (tsp && check_park)		/* copyout the residual time left */
		error = lwp_timer_copyout(&lwpt, error);

	/* the mutex is reacquired by the caller on return to user level */
	if (error) {
		/*
		 * If we were concurrently lwp_cond_signal()d and we
		 * received a UNIX signal or got a timeout, then perform
		 * another lwp_cond_signal() to avoid consuming the wakeup.
		 */
		if (t->t_release)
			(void) lwp_cond_signal(cv);
		return (set_errno(error));
	}
	return (0);

efault:
	/*
	 * make sure that the user level lock is dropped before
	 * returning to caller, since the caller always re-acquires it.
	 */
	if (UPIMUTEX(mtype) == 0) {
		lwpchan_lock(&m_lwpchan, LWPCHAN_MPPOOL);
		m_locked = 1;
		if (mtype & USYNC_PROCESS)
			suword32_noerr(&mp->mutex_ownerpid, 0);
		ulock_clear(&mp->mutex_lockw);
		fuword8_noerr(&mp->mutex_waiters, &waiters);
		if (waiters != 0) {
			/*
			 * See comment above on lock clearing and lwp_release()
			 * success/failure.
			 */
			if (lwp_release(&m_lwpchan, &waiters, 0))
				suword8_noerr(&mp->mutex_waiters, waiters);
		}
		m_locked = 0;
		lwpchan_unlock(&m_lwpchan, LWPCHAN_MPPOOL);
	} else {
		(void) lwp_upimutex_unlock(mp, mtype);
	}
out:
	/* common error exit: every path reaching here returns an error */
	no_fault();
	if (mpwatched)
		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
	if (cvwatched)
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (t->t_mstate == LMS_USER_LOCK)
		(void) new_mstate(t, LMS_SYSTEM);
	return (set_errno(error));
}
1814 
/*
 * wakeup one lwp that's blocked on this condition variable.
 *
 *	cv	user address of the condition variable
 *
 * If the cv's cond_waiters_kernel byte is non-zero, lwp_release() is
 * called on the cv's lwpchan and the byte is rewritten with the value
 * lwp_release() computed.
 *
 * Returns 0 on success.  Otherwise errno is set to EFAULT (cv is at or
 * above a_userlimit, get_lwpchan() failed, or a fault was taken while
 * accessing the cv) and the set_errno() value is returned.
 */
int
lwp_cond_signal(lwp_cond_t *cv)
{
	proc_t *p = ttoproc(curthread);
	lwpchan_t lwpchan;
	uchar_t waiters;
	volatile uint16_t type = 0;
	volatile int locked = 0;
	volatile int watched = 0;
	label_t ljb;
	int error = 0;

	if ((caddr_t)cv >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);

	if (on_fault(&ljb)) {
		/* "locked" tracks whether the lwpchan lock is currently held */
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write if necessary and ensure that the
	 * synchronization object resides in read/write memory.
	 * Cause an EFAULT return now if this is not so.
	 */
	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
	suword16_noerr(&cv->cond_type, type);
	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
	    &lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
	if (waiters != 0) {
		/*
		 * The following call to lwp_release() might fail but it is
		 * OK to write into the waiters bit below, since the memory
		 * could not have been re-used or unmapped (for correctly
		 * written user programs) as in the case of lwp_mutex_wakeup().
		 * For an incorrect program, we should not care about data
		 * corruption since this is just one instance of other places
		 * where corruption can occur for such a program. Of course
		 * if the memory is unmapped, normal fault recovery occurs.
		 */
		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
		suword8_noerr(&cv->cond_waiters_kernel, waiters);
	}
	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
out:
	no_fault();
	/* re-arm any watchpoint that was disabled on entry */
	if (watched)
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}
1879 
/*
 * wakeup every lwp that's blocked on this condition variable.
 *
 *	cv	user address of the condition variable
 *
 * If the cv's cond_waiters_kernel byte is non-zero, lwp_release_all()
 * is called on the cv's lwpchan and the byte is reset to 0.
 *
 * Returns 0 on success.  Otherwise errno is set to EFAULT (cv is at or
 * above a_userlimit, get_lwpchan() failed, or a fault was taken while
 * accessing the cv) and the set_errno() value is returned.
 */
int
lwp_cond_broadcast(lwp_cond_t *cv)
{
	proc_t *p = ttoproc(curthread);
	lwpchan_t lwpchan;
	volatile uint16_t type = 0;
	volatile int locked = 0;
	volatile int watched = 0;
	label_t ljb;
	uchar_t waiters;
	int error = 0;

	if ((caddr_t)cv >= p->p_as->a_userlimit)
		return (set_errno(EFAULT));

	watched = watch_disable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);

	if (on_fault(&ljb)) {
		/* "locked" tracks whether the lwpchan lock is currently held */
		if (locked)
			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		error = EFAULT;
		goto out;
	}
	/*
	 * Force Copy-on-write if necessary and ensure that the
	 * synchronization object resides in read/write memory.
	 * Cause an EFAULT return now if this is not so.
	 */
	fuword16_noerr(&cv->cond_type, (uint16_t *)&type);
	suword16_noerr(&cv->cond_type, type);
	if (!get_lwpchan(curproc->p_as, (caddr_t)cv, type,
	    &lwpchan, LWPCHAN_CVPOOL)) {
		error = EFAULT;
		goto out;
	}
	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
	locked = 1;
	fuword8_noerr(&cv->cond_waiters_kernel, &waiters);
	if (waiters != 0) {
		/* everyone is being woken, so no kernel waiters remain */
		lwp_release_all(&lwpchan);
		suword8_noerr(&cv->cond_waiters_kernel, 0);
	}
	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
out:
	no_fault();
	/* re-arm any watchpoint that was disabled on entry */
	if (watched)
		watch_enable_addr((caddr_t)cv, sizeof (*cv), S_WRITE);
	if (error)
		return (set_errno(error));
	return (0);
}
1934 
1935 int
1936 lwp_sema_trywait(lwp_sema_t *sp)
1937 {
1938 	kthread_t *t = curthread;
1939 	proc_t *p = ttoproc(t);
1940 	label_t ljb;
1941 	volatile int locked = 0;
1942 	volatile int watched = 0;
1943 	volatile uint16_t type = 0;
1944 	int count;
1945 	lwpchan_t lwpchan;
1946 	uchar_t waiters;
1947 	int error = 0;
1948 
1949 	if ((caddr_t)sp >= p->p_as->a_userlimit)
1950 		return (set_errno(EFAULT));
1951 
1952 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1953 
1954 	if (on_fault(&ljb)) {
1955 		if (locked)
1956 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1957 		error = EFAULT;
1958 		goto out;
1959 	}
1960 	/*
1961 	 * Force Copy-on-write if necessary and ensure that the
1962 	 * synchronization object resides in read/write memory.
1963 	 * Cause an EFAULT return now if this is not so.
1964 	 */
1965 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
1966 	suword16_noerr((void *)&sp->sema_type, type);
1967 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
1968 	    &lwpchan, LWPCHAN_CVPOOL)) {
1969 		error = EFAULT;
1970 		goto out;
1971 	}
1972 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
1973 	locked = 1;
1974 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
1975 	if (count == 0)
1976 		error = EBUSY;
1977 	else
1978 		suword32_noerr((void *)&sp->sema_count, --count);
1979 	if (count != 0) {
1980 		fuword8_noerr(&sp->sema_waiters, &waiters);
1981 		if (waiters != 0) {
1982 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
1983 			suword8_noerr(&sp->sema_waiters, waiters);
1984 		}
1985 	}
1986 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
1987 out:
1988 	no_fault();
1989 	if (watched)
1990 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
1991 	if (error)
1992 		return (set_errno(error));
1993 	return (0);
1994 }
1995 
1996 /*
1997  * See lwp_cond_wait(), above, for an explanation of the 'check_park' argument.
 *
 * Decrement the user-level semaphore 'sp', sleeping while its count is zero.
 *
 *	sp		user address of the semaphore.
 *	tsp		user address of a timeout handed to lwp_timer_copyin();
 *			lwp_sema_wait() passes NULL here.
 *	check_park	when non-zero, the schedctl park state and t_unpark
 *			are checked before sleeping, and the timer is copied
 *			back out with lwp_timer_copyout() on return.
 *
 * Returns 0 once the count has been decremented.  Otherwise fails through
 * set_errno() with EFAULT (bad address or faulting access), EINTR
 * (signal, lwp_sysabort, MUSTRETURN() or an immediate unpark), ETIME
 * (timeout), or the error from validating the timer, which is only
 * reported when we would actually have had to sleep.
1998  */
1999 int
2000 lwp_sema_timedwait(lwp_sema_t *sp, timespec_t *tsp, int check_park)
2001 {
2002 	kthread_t *t = curthread;
2003 	klwp_t *lwp = ttolwp(t);
2004 	proc_t *p = ttoproc(t);
2005 	lwp_timer_t lwpt;
2006 	caddr_t timedwait;
2007 	clock_t tim = -1;
2008 	label_t ljb;
2009 	volatile int locked = 0;
2010 	volatile int watched = 0;
2011 	volatile uint16_t type = 0;
2012 	int count;
2013 	lwpchan_t lwpchan;
2014 	uchar_t waiters;
2015 	int error = 0;
2016 	int time_error;
2017 	int imm_timeout = 0;
2018 	int imm_unpark = 0;
2019
2020 	if ((caddr_t)sp >= p->p_as->a_userlimit)
2021 		return (set_errno(EFAULT));
2022
	/*
	 * Copy the timeout in now, but only remember a failure in
	 * time_error; it is reported from the wait loop below.  An
	 * already-expired timeout is turned into imm_timeout with no
	 * timer to queue.
	 */
2023 	timedwait = (caddr_t)tsp;
2024 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2025 	    lwpt.lwpt_imm_timeout) {
2026 		imm_timeout = 1;
2027 		timedwait = NULL;
2028 	}
2029
2030 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2031
	/*
	 * Fault recovery path: drop the channel lock if we hold it and
	 * fail with EFAULT.
	 */
2032 	if (on_fault(&ljb)) {
2033 		if (locked)
2034 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2035 		error = EFAULT;
2036 		goto out;
2037 	}
2038 	/*
2039 	 * Force Copy-on-write if necessary and ensure that the
2040 	 * synchronization object resides in read/write memory.
2041 	 * Cause an EFAULT return now if this is not so.
2042 	 */
2043 	fuword16_noerr((void *)&sp->sema_type, (uint16_t *)&type);
2044 	suword16_noerr((void *)&sp->sema_type, type);
2045 	if (!get_lwpchan(p->p_as, (caddr_t)sp, type,
2046 	    &lwpchan, LWPCHAN_CVPOOL)) {
2047 		error = EFAULT;
2048 		goto out;
2049 	}
2050 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2051 	locked = 1;
2052 	fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
	/*
	 * Sleep until the count becomes non-zero or an error ends the wait.
	 * Each pass re-reads the count with the channel lock held.
	 */
2053 	while (error == 0 && count == 0) {
2054 		if (time_error) {
2055 			/*
2056 			 * The SUSV3 Posix spec is very clear that we
2057 			 * should get no error from validating the
2058 			 * timer until we would actually sleep.
2059 			 */
2060 			error = time_error;
2061 			break;
2062 		}
		/* Tell posters that somebody is waiting. */
2063 		suword8_noerr(&sp->sema_waiters, 1);
2064 		if (watched)
2065 			watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2066 		/*
2067 		 * Put the lwp in an orderly state for debugging.
2068 		 */
2069 		prstop(PR_REQUESTED, 0);
2070 		if (check_park && (!schedctl_is_park() || t->t_unpark)) {
2071 			/*
2072 			 * We received a signal at user-level before calling
2073 			 * here or another thread wants us to return
2074 			 * immediately with EINTR.  See lwp_unpark().
2075 			 */
2076 			imm_unpark = 1;
2077 			t->t_unpark = 0;
2078 			timedwait = NULL;
2079 		} else if (timedwait) {
2080 			/*
2081 			 * If we successfully queue the timeout,
2082 			 * then don't drop t_delay_lock until
2083 			 * we are on the sleep queue (below).
2084 			 */
2085 			mutex_enter(&t->t_delay_lock);
2086 			if (lwp_timer_enqueue(&lwpt) != 0) {
2087 				mutex_exit(&t->t_delay_lock);
2088 				imm_timeout = 1;
2089 				timedwait = NULL;
2090 			}
2091 		}
2092 		t->t_flag |= T_WAITCVSEM;
2093 		lwp_block(&lwpchan);
2094 		/*
2095 		 * Nothing should happen to cause the lwp to sleep
2096 		 * again until after it returns from swtch().
2097 		 */
2098 		if (timedwait)
2099 			mutex_exit(&t->t_delay_lock);
2100 		locked = 0;
2101 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
		/*
		 * If there is already a reason not to sleep (pending
		 * signal, MUSTRETURN(), immediate timeout or unpark), make
		 * ourselves runnable again before switching away.
		 */
2102 		if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) ||
2103 		    (imm_timeout | imm_unpark))
2104 			setrun(t);
2105 		swtch();
		/* We are running again: work out why the wait ended. */
2106 		t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2107 		if (timedwait)
2108 			tim = lwp_timer_dequeue(&lwpt);
2109 		setallwatch();
2110 		if (ISSIG(t, FORREAL) || lwp->lwp_sysabort ||
2111 		    MUSTRETURN(p, t) || imm_unpark)
2112 			error = EINTR;
2113 		else if (imm_timeout || (timedwait && tim == -1))
2114 			error = ETIME;
2115 		lwp->lwp_asleep = 0;
2116 		lwp->lwp_sysabort = 0;
		/* Re-take what we dropped before sleeping and look again. */
2117 		watched = watch_disable_addr((caddr_t)sp,
2118 		    sizeof (*sp), S_WRITE);
2119 		lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2120 		locked = 1;
2121 		fuword32_noerr((void *)&sp->sema_count, (uint32_t *)&count);
2122 	}
	/* On success take one unit of the semaphore. */
2123 	if (error == 0)
2124 		suword32_noerr((void *)&sp->sema_count, --count);
	/*
	 * While the count is still non-zero, release one waiter and store
	 * back the waiters byte (presumably filled in by lwp_release()).
	 */
2125 	if (count != 0) {
2126 		(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2127 		suword8_noerr(&sp->sema_waiters, waiters);
2128 	}
2129 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2130 out:
2131 	no_fault();
2132 	if (watched)
2133 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
	/*
	 * Only callers that passed both a timeout and check_park get the
	 * timer copied back out; lwp_timer_copyout() supplies the final
	 * error value in that case.
	 */
2134 	if (tsp && check_park && !time_error)
2135 		error = lwp_timer_copyout(&lwpt, error);
2136 	if (error)
2137 		return (set_errno(error));
2138 	return (0);
2139 }
2140 
2141 /*
2142  * Obsolete lwp_sema_wait() interface, no longer called from libc.
2143  * libc now calls lwp_sema_timedwait().
2144  * This system call trap exists solely for the benefit of old
2145  * statically linked applications from Solaris 9 and before.
2146  * It should be removed when we no longer care about such applications.
2147  */
2148 int
2149 lwp_sema_wait(lwp_sema_t *sp)
2150 {
2151 	return (lwp_sema_timedwait(sp, NULL, 0));
2152 }
2153 
2154 int
2155 lwp_sema_post(lwp_sema_t *sp)
2156 {
2157 	proc_t *p = ttoproc(curthread);
2158 	label_t ljb;
2159 	volatile int locked = 0;
2160 	volatile int watched = 0;
2161 	volatile uint16_t type = 0;
2162 	int count;
2163 	lwpchan_t lwpchan;
2164 	uchar_t waiters;
2165 	int error = 0;
2166 
2167 	if ((caddr_t)sp >= p->p_as->a_userlimit)
2168 		return (set_errno(EFAULT));
2169 
2170 	watched = watch_disable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2171 
2172 	if (on_fault(&ljb)) {
2173 		if (locked)
2174 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2175 		error = EFAULT;
2176 		goto out;
2177 	}
2178 	/*
2179 	 * Force Copy-on-write if necessary and ensure that the
2180 	 * synchronization object resides in read/write memory.
2181 	 * Cause an EFAULT return now if this is not so.
2182 	 */
2183 	fuword16_noerr(&sp->sema_type, (uint16_t *)&type);
2184 	suword16_noerr(&sp->sema_type, type);
2185 	if (!get_lwpchan(curproc->p_as, (caddr_t)sp, type,
2186 	    &lwpchan, LWPCHAN_CVPOOL)) {
2187 		error = EFAULT;
2188 		goto out;
2189 	}
2190 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2191 	locked = 1;
2192 	fuword32_noerr(&sp->sema_count, (uint32_t *)&count);
2193 	if (count == _SEM_VALUE_MAX)
2194 		error = EOVERFLOW;
2195 	else
2196 		suword32_noerr(&sp->sema_count, ++count);
2197 	if (count == 1) {
2198 		fuword8_noerr(&sp->sema_waiters, &waiters);
2199 		if (waiters) {
2200 			(void) lwp_release(&lwpchan, &waiters, T_WAITCVSEM);
2201 			suword8_noerr(&sp->sema_waiters, waiters);
2202 		}
2203 	}
2204 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2205 out:
2206 	no_fault();
2207 	if (watched)
2208 		watch_enable_addr((caddr_t)sp, sizeof (*sp), S_WRITE);
2209 	if (error)
2210 		return (set_errno(error));
2211 	return (0);
2212 }
2213 
/*
 * Bits kept in a thread's t_writer field while it waits for an rwlock:
 * lwp_rwlock_lock() sets TRW_WANT_WRITE for a waiting writer, and
 * lwp_rwlock_release() sets TRW_LOCK_GRANTED on every thread it wakes
 * as a new lock holder.
 */
2214 #define	TRW_WANT_WRITE		0x1
2215 #define	TRW_LOCK_GRANTED	0x2
2216
/*
 * Values of the rd_wr argument of lwp_rwlock_lock(): the kind of lock
 * wanted, optionally OR-ed with TRY_FLAG to ask for a non-blocking attempt.
 */
2217 #define	READ_LOCK		0
2218 #define	WRITE_LOCK		1
2219 #define	TRY_FLAG		0x10
2220 #define	READ_LOCK_TRY		(READ_LOCK | TRY_FLAG)
2221 #define	WRITE_LOCK_TRY		(WRITE_LOCK | TRY_FLAG)
2222 
2223 /*
2224  * Release one writer or one or more readers. Compute the rwstate word to
2225  * reflect the new state of the queue. For a safe hand-off we copy the new
2226  * rwstate value back to userland before we wake any of the new lock holders.
2227  *
2228  * Note that sleepq_insert() implements a prioritized FIFO (with writers
2229  * being given precedence over readers of the same priority).
2230  *
2231  * If the first thread is a reader we scan the queue releasing all readers
2232  * until we hit a writer or the end of the queue. If the first thread is a
2233  * writer we still need to check for another writer.
 *
 *	lwpchan	channel identifying the rwlock; only threads whose
 *		t_lwpchan matches both lc_wchan0 and lc_wchan are considered.
 *	rw	user address of the rwlock; the new state is stored into
 *		rw->rwlock_readers with suword32_noerr().
 *
 * The caller in this file (lwp_rwlock_unlock()) holds the lwpchan lock and
 * has fault recovery armed when it calls here.
2234  */
2235 void
2236 lwp_rwlock_release(lwpchan_t *lwpchan, lwp_rwlock_t *rw)
2237 {
2238 	sleepq_head_t *sqh;
2239 	kthread_t *tp;
2240 	kthread_t **tpp;
2241 	kthread_t *tpnext;
2242 	kthread_t *wakelist = NULL;
2243 	uint32_t rwstate = 0;
2244 	int wcount = 0;
2245 	int rcount = 0;
2246
	/*
	 * Walk the sleep queue bucket for this channel.  tpp always points
	 * at the link that refers to the thread under examination, so a
	 * thread can be unlinked without restarting the walk.
	 */
2247 	sqh = lwpsqhash(lwpchan);
2248 	disp_lock_enter(&sqh->sq_lock);
2249 	tpp = &sqh->sq_queue.sq_first;
2250 	while ((tp = *tpp) != NULL) {
2251 		if (tp->t_lwpchan.lc_wchan0 == lwpchan->lc_wchan0 &&
2252 		    tp->t_lwpchan.lc_wchan == lwpchan->lc_wchan) {
2253 			if (tp->t_writer & TRW_WANT_WRITE) {
				/*
				 * A writer is granted the lock only when it
				 * is the first matching waiter seen (no
				 * readers, no earlier writer).
				 */
2254 				if ((wcount++ == 0) && (rcount == 0)) {
2255 					rwstate |= URW_WRITE_LOCKED;
2256
2257 					/* Just one writer to wake. */
2258 					sleepq_unlink(tpp, tp);
2259 					wakelist = tp;
2260
2261 					/* tpp already set for next thread. */
2262 					continue;
2263 				} else {
2264 					rwstate |= URW_HAS_WAITERS;
2265 					/* We need look no further. */
2266 					break;
2267 				}
2268 			} else {
2269 				rcount++;
2270 				if (wcount == 0) {
					/* One more reader holds the lock. */
2271 					rwstate++;
2272
2273 					/* Add reader to wake list. */
2274 					sleepq_unlink(tpp, tp);
2275 					tp->t_link = wakelist;
2276 					wakelist = tp;
2277
2278 					/* tpp already set for next thread. */
2279 					continue;
2280 				} else {
					/* A reader queued behind the writer. */
2281 					rwstate |= URW_HAS_WAITERS;
2282 					/* We need look no further. */
2283 					break;
2284 				}
2285 			}
2286 		}
		/* Not unlinked: step to the next thread in the bucket. */
2287 		tpp = &tp->t_link;
2288 	}
2289
2290 	/* Copy the new rwstate back to userland. */
2291 	suword32_noerr(&rw->rwlock_readers, rwstate);
2292
2293 	/* Wake the new lock holder(s) up. */
	/*
	 * The wake list is chained through t_link.  Each thread is marked
	 * TRW_LOCK_GRANTED so that lwp_rwlock_lock() can tell, once it runs
	 * again, that it now owns the lock.
	 */
2294 	tp = wakelist;
2295 	while (tp != NULL) {
2296 		DTRACE_SCHED1(wakeup, kthread_t *, tp);
2297 		tp->t_wchan0 = NULL;
2298 		tp->t_wchan = NULL;
2299 		tp->t_sobj_ops = NULL;
2300 		tp->t_writer |= TRW_LOCK_GRANTED;
2301 		tpnext = tp->t_link;
2302 		tp->t_link = NULL;
2303 		CL_WAKEUP(tp);
2304 		thread_unlock_high(tp);
2305 		tp = tpnext;
2306 	}
2307
2308 	disp_lock_exit(&sqh->sq_lock);
2309 }
2310 
2311 /*
2312  * We enter here holding the user-level mutex, which we must release before
2313  * returning or blocking. Based on lwp_cond_wait().
2314  */
2315 static int
2316 lwp_rwlock_lock(lwp_rwlock_t *rw, timespec_t *tsp, int rd_wr)
2317 {
2318 	lwp_mutex_t *mp = NULL;
2319 	kthread_t *t = curthread;
2320 	kthread_t *tp;
2321 	klwp_t *lwp = ttolwp(t);
2322 	proc_t *p = ttoproc(t);
2323 	lwp_timer_t lwpt;
2324 	lwpchan_t lwpchan;
2325 	lwpchan_t mlwpchan;
2326 	caddr_t timedwait;
2327 	volatile uint16_t type = 0;
2328 	volatile uint8_t mtype = 0;
2329 	uchar_t mwaiters;
2330 	volatile int error = 0;
2331 	int time_error;
2332 	clock_t tim = -1;
2333 	volatile int locked = 0;
2334 	volatile int mlocked = 0;
2335 	volatile int watched = 0;
2336 	volatile int mwatched = 0;
2337 	label_t ljb;
2338 	volatile int no_lwpchan = 1;
2339 	int imm_timeout = 0;
2340 	int try_flag;
2341 	uint32_t rwstate;
2342 	int acquired = 0;
2343 
2344 	/* We only check rw because the mutex is included in it. */
2345 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2346 		return (set_errno(EFAULT));
2347 
2348 	/* We must only report this error if we are about to sleep (later). */
2349 	timedwait = (caddr_t)tsp;
2350 	if ((time_error = lwp_timer_copyin(&lwpt, tsp)) == 0 &&
2351 	    lwpt.lwpt_imm_timeout) {
2352 		imm_timeout = 1;
2353 		timedwait = NULL;
2354 	}
2355 
2356 	(void) new_mstate(t, LMS_USER_LOCK);
2357 
2358 	if (on_fault(&ljb)) {
2359 		if (no_lwpchan) {
2360 			error = EFAULT;
2361 			goto out_nodrop;
2362 		}
2363 		if (mlocked) {
2364 			mlocked = 0;
2365 			lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2366 		}
2367 		if (locked) {
2368 			locked = 0;
2369 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2370 		}
2371 		/*
2372 		 * Set up another on_fault() for a possible fault
2373 		 * on the user lock accessed at "out_drop".
2374 		 */
2375 		if (on_fault(&ljb)) {
2376 			if (mlocked) {
2377 				mlocked = 0;
2378 				lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2379 			}
2380 			error = EFAULT;
2381 			goto out_nodrop;
2382 		}
2383 		error = EFAULT;
2384 		goto out_nodrop;
2385 	}
2386 
2387 	/* Process rd_wr (including sanity check). */
2388 	try_flag = (rd_wr & TRY_FLAG);
2389 	rd_wr &= ~TRY_FLAG;
2390 	if ((rd_wr != READ_LOCK) && (rd_wr != WRITE_LOCK)) {
2391 		error = EINVAL;
2392 		goto out_nodrop;
2393 	}
2394 
2395 	/*
2396 	 * Force Copy-on-write if necessary and ensure that the
2397 	 * synchronization object resides in read/write memory.
2398 	 * Cause an EFAULT return now if this is not so.
2399 	 */
2400 	mp = &rw->mutex;
2401 	fuword8_noerr(&mp->mutex_type, (uint8_t *)&mtype);
2402 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2403 	suword8_noerr(&mp->mutex_type, mtype);
2404 	suword16_noerr(&rw->rwlock_type, type);
2405 
2406 	/* We can only continue for simple USYNC_PROCESS locks. */
2407 	if ((mtype != USYNC_PROCESS) || (type != USYNC_PROCESS)) {
2408 		error = EINVAL;
2409 		goto out_nodrop;
2410 	}
2411 
2412 	/* Convert user level mutex, "mp", to a unique lwpchan. */
2413 	if (!get_lwpchan(p->p_as, (caddr_t)mp, mtype,
2414 	    &mlwpchan, LWPCHAN_MPPOOL)) {
2415 		error = EFAULT;
2416 		goto out_nodrop;
2417 	}
2418 
2419 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2420 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2421 	    &lwpchan, LWPCHAN_CVPOOL)) {
2422 		error = EFAULT;
2423 		goto out_nodrop;
2424 	}
2425 
2426 	no_lwpchan = 0;
2427 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2428 	mwatched = watch_disable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2429 
2430 	/*
2431 	 * lwpchan_lock() ensures that the calling LWP is put to sleep
2432 	 * atomically with respect to a possible wakeup which is a result
2433 	 * of lwp_rwlock_unlock().
2434 	 *
2435 	 * What's misleading is that the LWP is put to sleep after the
2436 	 * rwlock's mutex is released. This is OK as long as the release
2437 	 * operation is also done while holding mlwpchan. The LWP is then
2438 	 * put to sleep when the possibility of pagefaulting or sleeping
2439 	 * has been completely eliminated.
2440 	 */
2441 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2442 	locked = 1;
2443 	lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2444 	mlocked = 1;
2445 
2446 	/*
2447 	 * Fetch the current rwlock state.
2448 	 *
2449 	 * The possibility of spurious wake-ups or killed waiters means
2450 	 * rwstate's URW_HAS_WAITERS bit may indicate false positives.
2451 	 * We only fix these if they are important to us.
2452 	 *
2453 	 * Although various error states can be observed here (e.g. the lock
2454 	 * is not held, but there are waiters) we assume these are applicaton
2455 	 * errors and so we take no corrective action.
2456 	 */
2457 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2458 	/*
2459 	 * We cannot legitimately get here from user-level
2460 	 * without URW_HAS_WAITERS being set.
2461 	 * Set it now to guard against user-level error.
2462 	 */
2463 	rwstate |= URW_HAS_WAITERS;
2464 
2465 	/*
2466 	 * We can try only if the lock isn't held by a writer.
2467 	 */
2468 	if (!(rwstate & URW_WRITE_LOCKED)) {
2469 		tp = lwp_queue_waiter(&lwpchan);
2470 		if (tp == NULL) {
2471 			/*
2472 			 * Hmmm, rwstate indicates waiters but there are
2473 			 * none queued. This could just be the result of a
2474 			 * spurious wakeup, so let's ignore it.
2475 			 *
2476 			 * We now have a chance to acquire the lock
2477 			 * uncontended, but this is the last chance for
2478 			 * a writer to acquire the lock without blocking.
2479 			 */
2480 			if (rd_wr == READ_LOCK) {
2481 				rwstate++;
2482 				acquired = 1;
2483 			} else if ((rwstate & URW_READERS_MASK) == 0) {
2484 				rwstate |= URW_WRITE_LOCKED;
2485 				acquired = 1;
2486 			}
2487 		} else if (rd_wr == READ_LOCK) {
2488 			/*
2489 			 * This is the last chance for a reader to acquire
2490 			 * the lock now, but it can only do so if there is
2491 			 * no writer of equal or greater priority at the
2492 			 * head of the queue .
2493 			 *
2494 			 * It is also just possible that there is a reader
2495 			 * at the head of the queue. This may be the result
2496 			 * of a spurious wakeup or an application failure.
2497 			 * In this case we only acquire the lock if we have
2498 			 * equal or greater priority. It is not our job to
2499 			 * release spurious waiters.
2500 			 */
2501 			pri_t our_pri = DISP_PRIO(t);
2502 			pri_t his_pri = DISP_PRIO(tp);
2503 
2504 			if ((our_pri > his_pri) || ((our_pri == his_pri) &&
2505 			    !(tp->t_writer & TRW_WANT_WRITE))) {
2506 				rwstate++;
2507 				acquired = 1;
2508 			}
2509 		}
2510 	}
2511 
2512 	if (acquired || try_flag || time_error) {
2513 		/*
2514 		 * We're not going to block this time.
2515 		 */
2516 		suword32_noerr(&rw->rwlock_readers, rwstate);
2517 		lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2518 		locked = 0;
2519 
2520 		if (acquired) {
2521 			/*
2522 			 * Got the lock!
2523 			 */
2524 			error = 0;
2525 
2526 		} else if (try_flag) {
2527 			/*
2528 			 * We didn't get the lock and we're about to block.
2529 			 * If we're doing a trylock, return EBUSY instead.
2530 			 */
2531 			error = EBUSY;
2532 
2533 		} else if (time_error) {
2534 			/*
2535 			 * The SUSV3 POSIX spec is very clear that we should
2536 			 * get no error from validating the timer (above)
2537 			 * until we would actually sleep.
2538 			 */
2539 			error = time_error;
2540 		}
2541 
2542 		goto out_drop;
2543 	}
2544 
2545 	/*
2546 	 * We're about to block, so indicate what kind of waiter we are.
2547 	 */
2548 	t->t_writer = 0;
2549 	if (rd_wr == WRITE_LOCK)
2550 		t->t_writer = TRW_WANT_WRITE;
2551 	suword32_noerr(&rw->rwlock_readers, rwstate);
2552 
2553 	/*
2554 	 * Unlock the rwlock's mutex (pagefaults are possible here).
2555 	 */
2556 	suword32_noerr((uint32_t *)&mp->mutex_owner, 0);
2557 	suword32_noerr((uint32_t *)&mp->mutex_owner + 1, 0);
2558 	suword32_noerr(&mp->mutex_ownerpid, 0);
2559 	ulock_clear(&mp->mutex_lockw);
2560 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2561 	if (mwaiters != 0) {
2562 		/*
2563 		 * Given the locking of mlwpchan around the release of
2564 		 * the mutex and checking for waiters, the following
2565 		 * call to lwp_release() can fail ONLY if the lock
2566 		 * acquirer is interrupted after setting the waiter bit,
2567 		 * calling lwp_block() and releasing mlwpchan.
2568 		 * In this case, it could get pulled off the LWP sleep
2569 		 * queue (via setrun()) before the following call to
2570 		 * lwp_release() occurs, and the lock requestor will
2571 		 * update the waiter bit correctly by re-evaluating it.
2572 		 */
2573 		if (lwp_release(&mlwpchan, &mwaiters, 0))
2574 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2575 	}
2576 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2577 	mlocked = 0;
2578 	no_fault();
2579 
2580 	if (mwatched) {
2581 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2582 		mwatched = 0;
2583 	}
2584 	if (watched) {
2585 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2586 		watched = 0;
2587 	}
2588 
2589 	/*
2590 	 * Put the LWP in an orderly state for debugging.
2591 	 */
2592 	prstop(PR_REQUESTED, 0);
2593 	if (timedwait) {
2594 		/*
2595 		 * If we successfully queue the timeout,
2596 		 * then don't drop t_delay_lock until
2597 		 * we are on the sleep queue (below).
2598 		 */
2599 		mutex_enter(&t->t_delay_lock);
2600 		if (lwp_timer_enqueue(&lwpt) != 0) {
2601 			mutex_exit(&t->t_delay_lock);
2602 			imm_timeout = 1;
2603 			timedwait = NULL;
2604 		}
2605 	}
2606 	t->t_flag |= T_WAITCVSEM;
2607 	lwp_block(&lwpchan);
2608 
2609 	/*
2610 	 * Nothing should happen to cause the LWp to go to sleep until after
2611 	 * it returns from swtch().
2612 	 */
2613 	if (timedwait)
2614 		mutex_exit(&t->t_delay_lock);
2615 	locked = 0;
2616 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2617 	if (ISSIG(t, JUSTLOOKING) || MUSTRETURN(p, t) || imm_timeout)
2618 		setrun(t);
2619 	swtch();
2620 
2621 	/*
2622 	 * We're back, but we need to work out why. Were we interrupted? Did
2623 	 * we timeout? Were we granted the lock?
2624 	 */
2625 	error = EAGAIN;
2626 	acquired = (t->t_writer & TRW_LOCK_GRANTED);
2627 	t->t_writer = 0;
2628 	t->t_flag &= ~(T_WAITCVSEM | T_WAKEABLE);
2629 	if (timedwait)
2630 		tim = lwp_timer_dequeue(&lwpt);
2631 	if (ISSIG(t, FORREAL) || lwp->lwp_sysabort || MUSTRETURN(p, t))
2632 		error = EINTR;
2633 	else if (imm_timeout || (timedwait && tim == -1))
2634 		error = ETIME;
2635 	lwp->lwp_asleep = 0;
2636 	lwp->lwp_sysabort = 0;
2637 	setallwatch();
2638 
2639 	/*
2640 	 * If we were granted the lock we don't care about EINTR or ETIME.
2641 	 */
2642 	if (acquired)
2643 		error = 0;
2644 
2645 	if (t->t_mstate == LMS_USER_LOCK)
2646 		(void) new_mstate(t, LMS_SYSTEM);
2647 
2648 	if (error)
2649 		return (set_errno(error));
2650 	return (0);
2651 
2652 out_drop:
2653 	/*
2654 	 * Make sure that the user level lock is dropped before returning
2655 	 * to the caller.
2656 	 */
2657 	if (!mlocked) {
2658 		lwpchan_lock(&mlwpchan, LWPCHAN_MPPOOL);
2659 		mlocked = 1;
2660 	}
2661 	suword32_noerr((uint32_t *)&mp->mutex_owner, 0);
2662 	suword32_noerr((uint32_t *)&mp->mutex_owner + 1, 0);
2663 	suword32_noerr(&mp->mutex_ownerpid, 0);
2664 	ulock_clear(&mp->mutex_lockw);
2665 	fuword8_noerr(&mp->mutex_waiters, &mwaiters);
2666 	if (mwaiters != 0) {
2667 		/*
2668 		 * See comment above on lock clearing and lwp_release()
2669 		 * success/failure.
2670 		 */
2671 		if (lwp_release(&mlwpchan, &mwaiters, 0))
2672 			suword8_noerr(&mp->mutex_waiters, mwaiters);
2673 	}
2674 	lwpchan_unlock(&mlwpchan, LWPCHAN_MPPOOL);
2675 	mlocked = 0;
2676 
2677 out_nodrop:
2678 	no_fault();
2679 	if (mwatched)
2680 		watch_enable_addr((caddr_t)mp, sizeof (*mp), S_WRITE);
2681 	if (watched)
2682 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2683 	if (t->t_mstate == LMS_USER_LOCK)
2684 		(void) new_mstate(t, LMS_SYSTEM);
2685 	if (error)
2686 		return (set_errno(error));
2687 	return (0);
2688 }
2689 
2690 /*
2691  * We enter here holding the user-level mutex but, unlike lwp_rwlock_lock(),
2692  * we never drop the lock.
2693  */
2694 static int
2695 lwp_rwlock_unlock(lwp_rwlock_t *rw)
2696 {
2697 	kthread_t *t = curthread;
2698 	proc_t *p = ttoproc(t);
2699 	lwpchan_t lwpchan;
2700 	volatile uint16_t type = 0;
2701 	volatile int error = 0;
2702 	volatile int locked = 0;
2703 	volatile int watched = 0;
2704 	label_t ljb;
2705 	volatile int no_lwpchan = 1;
2706 	uint32_t rwstate;
2707 
2708 	/* We only check rw because the mutex is included in it. */
2709 	if ((caddr_t)rw >= p->p_as->a_userlimit)
2710 		return (set_errno(EFAULT));
2711 
2712 	if (on_fault(&ljb)) {
2713 		if (no_lwpchan) {
2714 			error = EFAULT;
2715 			goto out_nodrop;
2716 		}
2717 		if (locked) {
2718 			locked = 0;
2719 			lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2720 		}
2721 		error = EFAULT;
2722 		goto out_nodrop;
2723 	}
2724 
2725 	/*
2726 	 * Force Copy-on-write if necessary and ensure that the
2727 	 * synchronization object resides in read/write memory.
2728 	 * Cause an EFAULT return now if this is not so.
2729 	 */
2730 	fuword16_noerr(&rw->rwlock_type, (uint16_t *)&type);
2731 	suword16_noerr(&rw->rwlock_type, type);
2732 
2733 	/* We can only continue for simple USYNC_PROCESS locks. */
2734 	if (type != USYNC_PROCESS) {
2735 		error = EINVAL;
2736 		goto out_nodrop;
2737 	}
2738 
2739 	/* Convert user level rwlock, "rw", to a unique lwpchan. */
2740 	if (!get_lwpchan(p->p_as, (caddr_t)rw, type,
2741 	    &lwpchan, LWPCHAN_CVPOOL)) {
2742 		error = EFAULT;
2743 		goto out_nodrop;
2744 	}
2745 
2746 	no_lwpchan = 0;
2747 	watched = watch_disable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2748 
2749 	lwpchan_lock(&lwpchan, LWPCHAN_CVPOOL);
2750 	locked = 1;
2751 
2752 	/*
2753 	 * We can resolve multiple readers (except the last reader) here.
2754 	 * For the last reader or a writer we need lwp_rwlock_release(),
2755 	 * to which we also delegate the task of copying the new rwstate
2756 	 * back to userland (see the comment there).
2757 	 */
2758 	fuword32_noerr(&rw->rwlock_readers, &rwstate);
2759 	if (rwstate & URW_WRITE_LOCKED)
2760 		lwp_rwlock_release(&lwpchan, rw);
2761 	else if ((rwstate & URW_READERS_MASK) > 0) {
2762 		rwstate--;
2763 		if ((rwstate & URW_READERS_MASK) == 0)
2764 			lwp_rwlock_release(&lwpchan, rw);
2765 		else
2766 			suword32_noerr(&rw->rwlock_readers, rwstate);
2767 	}
2768 
2769 	lwpchan_unlock(&lwpchan, LWPCHAN_CVPOOL);
2770 	locked = 0;
2771 	error = 0;
2772 
2773 out_nodrop:
2774 	no_fault();
2775 	if (watched)
2776 		watch_enable_addr((caddr_t)rw, sizeof (*rw), S_WRITE);
2777 	if (error)
2778 		return (set_errno(error));
2779 	return (0);
2780 }
2781 
2782 int
2783 lwp_rwlock_sys(int subcode, lwp_rwlock_t *rwlp, timespec_t *tsp)
2784 {
2785 	switch (subcode) {
2786 	case 0:
2787 		return (lwp_rwlock_lock(rwlp, tsp, READ_LOCK));
2788 	case 1:
2789 		return (lwp_rwlock_lock(rwlp, tsp, WRITE_LOCK));
2790 	case 2:
2791 		return (lwp_rwlock_lock(rwlp, NULL, READ_LOCK_TRY));
2792 	case 3:
2793 		return (lwp_rwlock_lock(rwlp, NULL, WRITE_LOCK_TRY));
2794 	case 4:
2795 		return (lwp_rwlock_unlock(rwlp));
2796 	}
2797 	return (set_errno(EINVAL));
2798 }
2799 
2800 /*
2801  * Return the owner of the user-level s-object.
2802  * Since we can't really do this, return NULL.
2803  */
2804 /* ARGSUSED */
2805 static kthread_t *
2806 lwpsobj_owner(caddr_t sobj)
2807 {
2808 	return ((kthread_t *)NULL);
2809 }
2810 
2811 /*
2812  * Wake up a thread asleep on a user-level synchronization
2813  * object.
2814  */
2815 static void
2816 lwp_unsleep(kthread_t *t)
2817 {
2818 	ASSERT(THREAD_LOCK_HELD(t));
2819 	if (t->t_wchan0 != NULL) {
2820 		sleepq_head_t *sqh;
2821 		sleepq_t *sqp = t->t_sleepq;
2822 
2823 		if (sqp != NULL) {
2824 			sqh = lwpsqhash(&t->t_lwpchan);
2825 			ASSERT(&sqh->sq_queue == sqp);
2826 			sleepq_unsleep(t);
2827 			disp_lock_exit_high(&sqh->sq_lock);
2828 			CL_SETRUN(t);
2829 			return;
2830 		}
2831 	}
2832 	panic("lwp_unsleep: thread %p not on sleepq", (void *)t);
2833 }
2834 
2835 /*
2836  * Change the priority of a thread asleep on a user-level
2837  * synchronization object. To maintain proper priority order,
2838  * we:
2839  *	o dequeue the thread.
2840  *	o change its priority.
2841  *	o re-enqueue the thread.
2842  * Assumption: the thread is locked on entry.
2843  */
2844 static void
2845 lwp_change_pri(kthread_t *t, pri_t pri, pri_t *t_prip)
2846 {
2847 	ASSERT(THREAD_LOCK_HELD(t));
2848 	if (t->t_wchan0 != NULL) {
2849 		sleepq_t   *sqp = t->t_sleepq;
2850 
2851 		sleepq_dequeue(t);
2852 		*t_prip = pri;
2853 		sleepq_insert(sqp, t);
2854 	} else
2855 		panic("lwp_change_pri: %p not on a sleep queue", (void *)t);
2856 }
2857 
2858 /*
2859  * Clean up a left-over process-shared robust mutex
 *
 *	ent	lwpchan cache entry describing the mutex; its lwpchan_addr
 *		is the user address of the lwp_mutex_t and its
 *		lwpchan_lwpchan is the channel used for locking/wakeups.
 *	lockflg	passed through unchanged to lwp_clear_mutex().
 *
 * Entries whose type does not have both USYNC_PROCESS and LOCK_ROBUST set
 * are ignored.  Nothing is returned; a fault on the user mutex simply ends
 * the cleanup after any lock taken here has been released.
2860  */
2861 static void
2862 lwp_mutex_cleanup(lwpchan_entry_t *ent, uint16_t lockflg)
2863 {
2864 	uint16_t flag;
2865 	uchar_t waiters;
2866 	label_t ljb;
2867 	pid_t owner_pid;
2868 	lwp_mutex_t *lp;
2869 	volatile int locked = 0;
2870 	volatile int watched = 0;
2871 	volatile struct upimutex *upimutex = NULL;
2872 	volatile int upilocked = 0;
2873
2874 	if ((ent->lwpchan_type & (USYNC_PROCESS | LOCK_ROBUST))
2875 	    != (USYNC_PROCESS | LOCK_ROBUST))
2876 		return;
2877
2878 	lp = (lwp_mutex_t *)ent->lwpchan_addr;
2879 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
	/*
	 * Fault recovery path: release whichever lock (lwpchan lock or
	 * upimutex) we were holding when the user access faulted.
	 */
2880 	if (on_fault(&ljb)) {
2881 		if (locked)
2882 			lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2883 		if (upilocked)
2884 			upimutex_unlock((upimutex_t *)upimutex, 0);
2885 		goto out;
2886 	}
2887
2888 	fuword32_noerr(&lp->mutex_ownerpid, (uint32_t *)&owner_pid);
2889
2890 	if (UPIMUTEX(ent->lwpchan_type)) {
2891 		lwpchan_t lwpchan = ent->lwpchan_lwpchan;
2892 		upib_t *upibp = &UPI_CHAIN(lwpchan);
2893
		/*
		 * For this kind of mutex we only act when this process is
		 * recorded as the owner and the kernel upimutex found for
		 * the channel is owned by the current thread.
		 */
2894 		if (owner_pid != curproc->p_pid)
2895 			goto out;
2896 		mutex_enter(&upibp->upib_lock);
2897 		upimutex = upi_get(upibp, &lwpchan);
2898 		if (upimutex == NULL || upimutex->upi_owner != curthread) {
2899 			mutex_exit(&upibp->upib_lock);
2900 			goto out;
2901 		}
2902 		mutex_exit(&upibp->upib_lock);
		/* From here on a fault must also unlock the upimutex. */
2903 		upilocked = 1;
2904 		flag = lwp_clear_mutex(lp, lockflg);
2905 		suword8_noerr(&lp->mutex_lockw, 0);
2906 		upimutex_unlock((upimutex_t *)upimutex, flag);
2907 	} else {
2908 		lwpchan_lock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2909 		locked = 1;
2910 		/*
2911 		 * Clear the spinners count because one of our
2912 		 * threads could have been spinning for this lock
2913 		 * at user level when the process was suddenly killed.
2914 		 * There is no harm in this since user-level libc code
2915 		 * will adapt to the sudden change in the spinner count.
2916 		 */
2917 		suword8_noerr(&lp->mutex_spinners, 0);
2918 		if (owner_pid != curproc->p_pid) {
2919 			/*
2920 			 * We are not the owner.  There may or may not be one.
2921 			 * If there are waiters, we wake up one or all of them.
2922 			 * It doesn't hurt to wake them up in error since
2923 			 * they will just retry the lock and go to sleep
2924 			 * again if necessary.
2925 			 */
2926 			fuword8_noerr(&lp->mutex_waiters, &waiters);
2927 			if (waiters != 0) {	/* there are waiters */
2928 				fuword16_noerr(&lp->mutex_flag, &flag);
				/*
				 * LOCK_NOTRECOVERABLE: wake every waiter.
				 * Otherwise wake one and store back the
				 * updated waiters byte.
				 */
2929 				if (flag & LOCK_NOTRECOVERABLE) {
2930 					lwp_release_all(&ent->lwpchan_lwpchan);
2931 					suword8_noerr(&lp->mutex_waiters, 0);
2932 				} else if (lwp_release(&ent->lwpchan_lwpchan,
2933 				    &waiters, 0)) {
2934 					suword8_noerr(&lp->mutex_waiters,
2935 					    waiters);
2936 				}
2937 			}
2938 		} else {
2939 			/*
2940 			 * We are the owner.  Release it.
2941 			 */
2942 			(void) lwp_clear_mutex(lp, lockflg);
2943 			ulock_clear(&lp->mutex_lockw);
2944 			fuword8_noerr(&lp->mutex_waiters, &waiters);
2945 			if (waiters &&
2946 			    lwp_release(&ent->lwpchan_lwpchan, &waiters, 0))
2947 				suword8_noerr(&lp->mutex_waiters, waiters);
2948 		}
2949 		lwpchan_unlock(&ent->lwpchan_lwpchan, LWPCHAN_MPPOOL);
2950 	}
2951 out:
2952 	no_fault();
2953 	if (watched)
2954 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2955 }
2956 
2957 /*
2958  * Register a process-shared robust mutex in the lwpchan cache.
 *
 * lp is a user-space address.  The mutex_type byte is read and written
 * back in place, the type is validated, and get_lwpchan() is then called
 * for it with LWPCHAN_MPPOOL.
 *
 * Returns 0 on success.  Otherwise returns set_errno() of:
 *	EFAULT	lp is at or above USERLIMIT, on_fault() returned nonzero,
 *		or get_lwpchan() failed.
 *	EINVAL	mutex_type does not have both USYNC_PROCESS and
 *		LOCK_ROBUST set.
2959  */
2960 int
2961 lwp_mutex_register(lwp_mutex_t *lp)
2962 {
2963 	int error = 0;
2964 	volatile int watched;
2965 	label_t ljb;
2966 	uint8_t type;
2967 	lwpchan_t lwpchan;
2968 
	/* Reject addresses outside the user range before touching them. */
2969 	if ((caddr_t)lp >= (caddr_t)USERLIMIT)
2970 		return (set_errno(EFAULT));
2971 
	/*
	 * Remember what watch_disable_addr() reports so that the matching
	 * watch_enable_addr() call is made on the way out only if needed.
	 */
2972 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2973 
	/*
	 * The nonzero branch is the fault path.  Presumably on_fault()
	 * returns nonzero when one of the _noerr accesses in the else
	 * branch faults -- confirm against on_fault().
	 */
2974 	if (on_fault(&ljb)) {
2975 		error = EFAULT;
2976 	} else {
2977 		/*
2978 		 * Force Copy-on-write if necessary and ensure that the
2979 		 * synchronization object resides in read/write memory.
2980 		 * Cause an EFAULT return now if this is not so.
2981 		 */
2982 		fuword8_noerr(&lp->mutex_type, &type);
2983 		suword8_noerr(&lp->mutex_type, type);
		/* Both type bits must be set; anything else is EINVAL. */
2984 		if ((type & (USYNC_PROCESS|LOCK_ROBUST))
2985 		    != (USYNC_PROCESS|LOCK_ROBUST)) {
2986 			error = EINVAL;
2987 		} else if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
2988 		    &lwpchan, LWPCHAN_MPPOOL)) {
			/* lwpchan itself is not used after this call. */
2989 			error = EFAULT;
2990 		}
2991 	}
	/* Common exit for the success, EINVAL and EFAULT outcomes. */
2992 	no_fault();
2993 	if (watched)
2994 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
2995 	if (error)
2996 		return (set_errno(error));
2997 	return (0);
2998 }
2999 
/*
 * Make one attempt to acquire the user-level mutex at lp.
 *
 * UPIMUTEX() types are handed to lwp_upimutex_lock() with UPIMUTEX_TRY.
 * All other types are attempted with ulock_try() on mutex_lockw while
 * lwpchan_lock(..., LWPCHAN_MPPOOL) is held.
 *
 * Returns 0 on success, otherwise set_errno() of:
 *	EFAULT		lp is at or above the address space's a_userlimit,
 *			on_fault() returned nonzero, or get_lwpchan()
 *			failed.
 *	EBUSY		ulock_try() failed.
 *	ENOTRECOVERABLE	a LOCK_ROBUST mutex has LOCK_NOTRECOVERABLE set;
 *			the lock word is not attempted.
 *	EOWNERDEAD or
 *	ELOCKUNMAPPED	ulock_try() succeeded on a LOCK_ROBUST mutex whose
 *			flag has LOCK_OWNERDEAD or LOCK_UNMAPPED set.
 * For UPIMUTEX() types the error is whatever lwp_upimutex_lock() returned.
 */
3000 int
3001 lwp_mutex_trylock(lwp_mutex_t *lp)
3002 {
3003 	kthread_t *t = curthread;
3004 	proc_t *p = ttoproc(t);
3005 	int error = 0;
	/*
	 * locked is read in the on_fault() branch and watched at out:.
	 * Presumably they are volatile so their values survive the
	 * non-local return into on_fault() -- confirm.
	 */
3006 	volatile int locked = 0;
3007 	volatile int watched = 0;
3008 	label_t ljb;
3009 	volatile uint8_t type = 0;
3010 	uint16_t flag;
3011 	lwpchan_t lwpchan;
3012 
3013 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3014 		return (set_errno(EFAULT));
3015 
	/*
	 * Account this thread under LMS_USER_LOCK; the out: path switches
	 * it to LMS_SYSTEM if that state is still current.
	 */
3016 	(void) new_mstate(t, LMS_USER_LOCK);
3017 
	/* Fault path: drop the lwpchan lock if it is held, fail EFAULT. */
3018 	if (on_fault(&ljb)) {
3019 		if (locked)
3020 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3021 		error = EFAULT;
3022 		goto out;
3023 	}
3024 	/*
3025 	 * Force Copy-on-write if necessary and ensure that the
3026 	 * synchronization object resides in read/write memory.
3027 	 * Cause an EFAULT return now if this is not so.
3028 	 */
3029 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3030 	suword8_noerr(&lp->mutex_type, type);
	/*
	 * UPIMUTEX() types: fault protection is dropped and the work is
	 * delegated.  The owner pid is stored for USYNC_PROCESS mutexes
	 * when the call returned 0, EOWNERDEAD or ELOCKUNMAPPED; the
	 * result of that suword32() is deliberately discarded.
	 *
	 * NOTE(review): this branch returns directly and never reaches
	 * out:, so the LMS_USER_LOCK to LMS_SYSTEM switch made there is
	 * skipped on this path -- confirm that is intended.
	 */
3031 	if (UPIMUTEX(type)) {
3032 		no_fault();
3033 		error = lwp_upimutex_lock(lp, type, UPIMUTEX_TRY, NULL);
3034 		if ((type & USYNC_PROCESS) &&
3035 		    (error == 0 ||
3036 		    error == EOWNERDEAD || error == ELOCKUNMAPPED))
3037 			(void) suword32(&lp->mutex_ownerpid, p->p_pid);
3038 		if (error)
3039 			return (set_errno(error));
3040 		return (0);
3041 	}
3042 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3043 	    &lwpchan, LWPCHAN_MPPOOL)) {
3044 		error = EFAULT;
3045 		goto out;
3046 	}
3047 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3048 	locked = 1;
3049 	if (type & LOCK_ROBUST) {
3050 		fuword16_noerr(&lp->mutex_flag, &flag);
3051 		if (flag & LOCK_NOTRECOVERABLE) {
			/*
			 * NOTE(review): locked is still 1 after this
			 * unlock.  That is only safe if nothing between
			 * here and no_fault() at out: can fault.
			 */
3052 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3053 			error =  ENOTRECOVERABLE;
3054 			goto out;
3055 		}
3056 	}
3057 
	/* Re-enabled at out: only when this returns nonzero. */
3058 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3059 
3060 	if (!ulock_try(&lp->mutex_lockw))
3061 		error = EBUSY;
3062 	else {
		/*
		 * The lock word was taken.  Record the owner pid for
		 * USYNC_PROCESS mutexes, then for LOCK_ROBUST mutexes
		 * report the state found in mutex_flag:
		 *	LOCK_OWNERDEAD set		EOWNERDEAD
		 *	only LOCK_UNMAPPED set		ELOCKUNMAPPED if type
		 *					has USYNC_PROCESS_ROBUST,
		 *					else EOWNERDEAD
		 * The lock word is not released again on these errors.
		 */
3063 		if (type & USYNC_PROCESS)
3064 			suword32_noerr(&lp->mutex_ownerpid, p->p_pid);
3065 		if (type & LOCK_ROBUST) {
3066 			fuword16_noerr(&lp->mutex_flag, &flag);
3067 			if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3068 				if (flag & LOCK_OWNERDEAD)
3069 					error = EOWNERDEAD;
3070 				else if (type & USYNC_PROCESS_ROBUST)
3071 					error = ELOCKUNMAPPED;
3072 				else
3073 					error = EOWNERDEAD;
3074 			}
3075 		}
3076 	}
	/*
	 * With locked cleared first, the on_fault() branch above would
	 * not call lwpchan_unlock() a second time.
	 */
3077 	locked = 0;
3078 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3079 out:
3080 
3081 	if (t->t_mstate == LMS_USER_LOCK)
3082 		(void) new_mstate(t, LMS_SYSTEM);
3083 
3084 	no_fault();
3085 	if (watched)
3086 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3087 	if (error)
3088 		return (set_errno(error));
3089 	return (0);
3090 }
3091 
3092 /*
3093  * Unlock the mutex and unblock lwps that are trying to acquire this mutex.
3094  * A blocked lwp resumes and retries to acquire the lock.
 *
 * UPIMUTEX() types are handed to lwp_upimutex_unlock().  For all other
 * types the lock word is cleared under lwpchan_lock(..., LWPCHAN_MPPOOL);
 * no ownership check is made on that path.
 *
 * Returns 0 on success, otherwise set_errno() of EFAULT (lp is at or above
 * the address space's a_userlimit, on_fault() returned nonzero, or
 * get_lwpchan() failed) or of the error from lwp_upimutex_unlock().
3095  */
3096 int
3097 lwp_mutex_unlock(lwp_mutex_t *lp)
3098 {
3099 	proc_t *p = ttoproc(curthread);
3100 	lwpchan_t lwpchan;
3101 	uchar_t waiters;
3102 	volatile int locked = 0;
3103 	volatile int watched = 0;
3104 	volatile uint8_t type = 0;
3105 	label_t ljb;
3106 	uint16_t flag;
3107 	int error = 0;
3108 
3109 	if ((caddr_t)lp >= p->p_as->a_userlimit)
3110 		return (set_errno(EFAULT));
3111 
	/* Fault path: drop the lwpchan lock if it is held, fail EFAULT. */
3112 	if (on_fault(&ljb)) {
3113 		if (locked)
3114 			lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3115 		error = EFAULT;
3116 		goto out;
3117 	}
3118 
3119 	/*
3120 	 * Force Copy-on-write if necessary and ensure that the
3121 	 * synchronization object resides in read/write memory.
3122 	 * Cause an EFAULT return now if this is not so.
3123 	 */
3124 	fuword8_noerr(&lp->mutex_type, (uint8_t *)&type);
3125 	suword8_noerr(&lp->mutex_type, type);
3126 
	/*
	 * UPIMUTEX() types: drop fault protection and delegate.  This
	 * branch returns before watch_disable_addr() is called, so there
	 * is nothing to re-enable.
	 */
3127 	if (UPIMUTEX(type)) {
3128 		no_fault();
3129 		error = lwp_upimutex_unlock(lp, type);
3130 		if (error)
3131 			return (set_errno(error));
3132 		return (0);
3133 	}
3134 
3135 	watched = watch_disable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3136 
3137 	if (!get_lwpchan(curproc->p_as, (caddr_t)lp, type,
3138 	    &lwpchan, LWPCHAN_MPPOOL)) {
3139 		error = EFAULT;
3140 		goto out;
3141 	}
3142 	lwpchan_lock(&lwpchan, LWPCHAN_MPPOOL);
3143 	locked = 1;
	/*
	 * A LOCK_ROBUST mutex that still has LOCK_OWNERDEAD or
	 * LOCK_UNMAPPED set at unlock time has those bits replaced by
	 * LOCK_NOTRECOVERABLE (presumably because the holder is releasing
	 * it without having repaired the state -- confirm).
	 *
	 * flag is assigned only inside this LOCK_ROBUST branch; the later
	 * read of flag is guarded by the same type test.
	 */
3144 	if (type & LOCK_ROBUST) {
3145 		fuword16_noerr(&lp->mutex_flag, &flag);
3146 		if (flag & (LOCK_OWNERDEAD | LOCK_UNMAPPED)) {
3147 			flag &= ~(LOCK_OWNERDEAD | LOCK_UNMAPPED);
3148 			flag |= LOCK_NOTRECOVERABLE;
3149 			suword16_noerr(&lp->mutex_flag, flag);
3150 		}
3151 	}
	/* The owner pid is cleared before the lock word is released. */
3152 	if (type & USYNC_PROCESS)
3153 		suword32_noerr(&lp->mutex_ownerpid, 0);
3154 	ulock_clear(&lp->mutex_lockw);
3155 	/*
3156 	 * Always wake up an lwp (if any) waiting on lwpchan. The woken lwp will
3157 	 * re-try the lock in lwp_mutex_timedlock(). The call to lwp_release()
3158 	 * may fail.  If it fails, do not write into the waiter bit.
3159 	 * The call to lwp_release() might fail due to one of three reasons:
3160 	 *
3161 	 * 	1. due to the thread which set the waiter bit not actually
3162 	 *	   sleeping since it got the lock on the re-try. The waiter
3163 	 *	   bit will then be correctly updated by that thread. This
3164 	 *	   window may be closed by reading the wait bit again here
3165 	 *	   and not calling lwp_release() at all if it is zero.
3166 	 *	2. the thread which set the waiter bit and went to sleep
3167 	 *	   was woken up by a signal. This time, the waiter recomputes
3168 	 *	   the wait bit in the return with EINTR code.
3169 	 *	3. the waiter bit read by lwp_mutex_wakeup() was in
3170 	 *	   memory that has been re-used after the lock was dropped.
3171 	 *	   In this case, writing into the waiter bit would cause data
3172 	 *	   corruption.
3173 	 */
3174 	fuword8_noerr(&lp->mutex_waiters, &waiters);
3175 	if (waiters) {
		/*
		 * An unrecoverable robust mutex releases every waiter and
		 * zeroes mutex_waiters.  Otherwise lwp_release() is handed
		 * &waiters (which it presumably updates) and the value is
		 * written back only when lwp_release() returns nonzero.
		 */
3176 		if ((type & LOCK_ROBUST) &&
3177 		    (flag & LOCK_NOTRECOVERABLE)) {
3178 			lwp_release_all(&lwpchan);
3179 			suword8_noerr(&lp->mutex_waiters, 0);
3180 		} else if (lwp_release(&lwpchan, &waiters, 0)) {
3181 			suword8_noerr(&lp->mutex_waiters, waiters);
3182 		}
3183 	}
3184 
	/*
	 * NOTE(review): unlike lwp_mutex_trylock(), locked is not cleared
	 * before this unlock.  That is only safe if nothing between here
	 * and no_fault() at out: can fault.
	 */
3185 	lwpchan_unlock(&lwpchan, LWPCHAN_MPPOOL);
3186 out:
3187 	no_fault();
3188 	if (watched)
3189 		watch_enable_addr((caddr_t)lp, sizeof (*lp), S_WRITE);
3190 	if (error)
3191 		return (set_errno(error));
3192 	return (0);
3193 }
3194