xref: /freebsd/sys/kern/kern_umtx.c (revision db612abe8df3355d1eb23bb3b50fdd97bc21e979)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/mutex.h>
38 #include <sys/priv.h>
39 #include <sys/proc.h>
40 #include <sys/sched.h>
41 #include <sys/smp.h>
42 #include <sys/sysctl.h>
43 #include <sys/sysent.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/eventhandler.h>
47 #include <sys/umtx.h>
48 
49 #include <vm/vm.h>
50 #include <vm/vm_param.h>
51 #include <vm/pmap.h>
52 #include <vm/vm_map.h>
53 #include <vm/vm_object.h>
54 
55 #include <machine/cpu.h>
56 
57 #ifdef COMPAT_IA32
58 #include <compat/freebsd32/freebsd32_proto.h>
59 #endif
60 
61 #define TYPE_SIMPLE_LOCK	0
62 #define TYPE_SIMPLE_WAIT	1
63 #define TYPE_NORMAL_UMUTEX	2
64 #define TYPE_PI_UMUTEX		3
65 #define TYPE_PP_UMUTEX		4
66 #define TYPE_CV			5
67 #define TYPE_RWLOCK		6
68 
69 /* Key to uniquely identify a userland synchronization object */
70 struct umtx_key {
71 	int	hash;
72 	int	type;
73 	int	shared;
74 	union {
75 		struct {
76 			vm_object_t	object;
77 			uintptr_t	offset;
78 		} shared;
79 		struct {
80 			struct vmspace	*vs;
81 			uintptr_t	addr;
82 		} private;
83 		struct {
84 			void		*a;
85 			uintptr_t	b;
86 		} both;
87 	} info;
88 };
89 
90 /* Priority inheritance mutex info. */
91 struct umtx_pi {
92 	/* Owner thread */
93 	struct thread		*pi_owner;
94 
95 	/* Reference count */
96 	int			pi_refcount;
97 
98 	/* List entry linking the PI mutexes held by a thread */
99 	TAILQ_ENTRY(umtx_pi)	pi_link;
100 
101 	/* List entry in hash */
102 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
103 
104 	/* List for waiters */
105 	TAILQ_HEAD(,umtx_q)	pi_blocked;
106 
107 	/* Identify a userland lock object */
108 	struct umtx_key		pi_key;
109 };
110 
111 /* A user of a userland synchronization object. */
112 struct umtx_q {
113 	/* Linked list for the hash. */
114 	TAILQ_ENTRY(umtx_q)	uq_link;
115 
116 	/* Umtx key. */
117 	struct umtx_key		uq_key;
118 
119 	/* Umtx flags. */
120 	int			uq_flags;
121 #define UQF_UMTXQ	0x0001
122 
123 	/* The waiting thread this entry belongs to. */
124 	struct thread		*uq_thread;
125 
126 	/*
127 	 * The PI mutex this thread is blocked on.  Reads can hold
128 	 * either the chain lock or umtx_lock; writes must hold both
129 	 * the chain lock and umtx_lock.
130 	 */
131 	struct umtx_pi		*uq_pi_blocked;
132 
133 	/* On blocked list */
134 	TAILQ_ENTRY(umtx_q)	uq_lockq;
135 
136 	/* PI mutexes owned by us that other threads contend for */
137 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
138 
139 	/* Inherited priority from PP mutex */
140 	u_char			uq_inherited_pri;
141 };
142 
143 TAILQ_HEAD(umtxq_head, umtx_q);
144 
145 /* Userland lock object's wait-queue chain */
146 struct umtxq_chain {
147 	/* Lock for this chain. */
148 	struct mtx		uc_lock;
149 
150 	/* List of sleep queues. */
151 	struct umtxq_head	uc_queue[2];
152 #define UMTX_SHARED_QUEUE	0
153 #define UMTX_EXCLUSIVE_QUEUE	1
154 
155 	/* Busy flag */
156 	char			uc_busy;
157 
158 	/* Chain lock waiters */
159 	int			uc_waiters;
160 
161 	/* All PI in the list */
162 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
163 };
164 
165 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
166 
167 /*
168  * Don't propagate time-sharing priority.  There is a security
169  * reason: a user can create a PI mutex, let thread A lock it,
170  * and let another thread B block on it.  Because B is sleeping,
171  * its priority is boosted; priority propagation then boosts A's
172  * priority too, and it would never be lowered even if A used
173  * 100% CPU, which is unfair to other processes.
174  */
175 
176 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
177 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
178 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
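/*
 * Worked example of the clamp above: both a nice -20 and a nice +20
 * time-sharing waiter contribute PRI_MAX_TIMESHARE, so no time-sharing
 * thread can boost a PI-mutex owner, while a real-time waiter still
 * contributes its true priority.
 */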
179 
180 #define	GOLDEN_RATIO_PRIME	2654404609U
181 #define	UMTX_CHAINS		128
182 #define	UMTX_SHIFTS		(__WORD_BIT - 7)
183 
184 #define THREAD_SHARE		0
185 #define PROCESS_SHARE		1
186 #define AUTO_SHARE		2
187 
188 #define	GET_SHARE(flags)	\
189     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
190 
191 #define BUSY_SPINS		200
192 
193 static uma_zone_t		umtx_pi_zone;
194 static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
195 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
196 static int			umtx_pi_allocated;
197 
198 SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
199 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
200     &umtx_pi_allocated, 0, "Allocated umtx_pi");
201 
202 static void umtxq_sysinit(void *);
203 static void umtxq_hash(struct umtx_key *key);
204 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
205 static void umtxq_lock(struct umtx_key *key);
206 static void umtxq_unlock(struct umtx_key *key);
207 static void umtxq_busy(struct umtx_key *key);
208 static void umtxq_unbusy(struct umtx_key *key);
209 static void umtxq_insert_queue(struct umtx_q *uq, int q);
210 static void umtxq_remove_queue(struct umtx_q *uq, int q);
211 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
212 static int umtxq_count(struct umtx_key *key);
213 static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
214 static int umtx_key_get(void *addr, int type, int share,
215 	struct umtx_key *key);
216 static void umtx_key_release(struct umtx_key *key);
217 static struct umtx_pi *umtx_pi_alloc(int);
218 static void umtx_pi_free(struct umtx_pi *pi);
219 static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
220 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
221 static void umtx_thread_cleanup(struct thread *td);
222 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
223 	struct image_params *imgp __unused);
224 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
225 
226 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
227 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
228 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
229 
230 static struct mtx umtx_lock;
231 
232 static void
233 umtxq_sysinit(void *arg __unused)
234 {
235 	int i;
236 
237 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
238 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
239 	for (i = 0; i < UMTX_CHAINS; ++i) {
240 		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
241 			 MTX_DEF | MTX_DUPOK);
242 		TAILQ_INIT(&umtxq_chains[i].uc_queue[0]);
243 		TAILQ_INIT(&umtxq_chains[i].uc_queue[1]);
244 		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
245 		umtxq_chains[i].uc_busy = 0;
246 		umtxq_chains[i].uc_waiters = 0;
247 	}
248 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
249 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
250 	    EVENTHANDLER_PRI_ANY);
251 }
252 
253 struct umtx_q *
254 umtxq_alloc(void)
255 {
256 	struct umtx_q *uq;
257 
258 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
259 	TAILQ_INIT(&uq->uq_pi_contested);
260 	uq->uq_inherited_pri = PRI_MAX;
261 	return (uq);
262 }
263 
264 void
265 umtxq_free(struct umtx_q *uq)
266 {
267 	free(uq, M_UMTX);
268 }
269 
270 static inline void
271 umtxq_hash(struct umtx_key *key)
272 {
273 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
274 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
275 }
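/*
 * A standalone sketch (not part of this file) of the multiplicative
 * Fibonacci hash above, assuming __WORD_BIT == 32 so that
 * UMTX_SHIFTS == 25; the sample addresses are arbitrary.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(32 - 7)

static int
chain_index(uintptr_t a, uintptr_t b)
{
	unsigned n = (unsigned)(a + b);	/* truncates as in umtxq_hash() */

	return (((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS);
}

int
main(void)
{
	/* Nearby lock addresses should usually land on different chains. */
	printf("%d %d\n", chain_index(0x28070000, 0),
	    chain_index(0x28070040, 0));
	return (0);
}
#endif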
276 
277 static inline int
278 umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
279 {
280 	return (k1->type == k2->type &&
281 		k1->info.both.a == k2->info.both.a &&
282 	        k1->info.both.b == k2->info.both.b);
283 }
284 
285 static inline struct umtxq_chain *
286 umtxq_getchain(struct umtx_key *key)
287 {
288 	return (&umtxq_chains[key->hash]);
289 }
290 
291 /*
292  * Lock a chain.
293  */
294 static inline void
295 umtxq_lock(struct umtx_key *key)
296 {
297 	struct umtxq_chain *uc;
298 
299 	uc = umtxq_getchain(key);
300 	mtx_lock(&uc->uc_lock);
301 }
302 
303 /*
304  * Unlock a chain.
305  */
306 static inline void
307 umtxq_unlock(struct umtx_key *key)
308 {
309 	struct umtxq_chain *uc;
310 
311 	uc = umtxq_getchain(key);
312 	mtx_unlock(&uc->uc_lock);
313 }
314 
315 /*
316  * Set the chain to the busy state when a following operation
317  * may block (a kernel mutex cannot be held across it).
318  */
319 static inline void
320 umtxq_busy(struct umtx_key *key)
321 {
322 	struct umtxq_chain *uc;
323 
324 	uc = umtxq_getchain(key);
325 	mtx_assert(&uc->uc_lock, MA_OWNED);
326 	if (uc->uc_busy) {
327 #ifdef SMP
328 		if (smp_cpus > 1) {
329 			int count = BUSY_SPINS;
330 			if (count > 0) {
331 				umtxq_unlock(key);
332 				while (uc->uc_busy && --count > 0)
333 					cpu_spinwait();
334 				umtxq_lock(key);
335 			}
336 		}
337 #endif
338 		while (uc->uc_busy) {
339 			uc->uc_waiters++;
340 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
341 			uc->uc_waiters--;
342 		}
343 	}
344 	uc->uc_busy = 1;
345 }
346 
347 /*
348  * Unbusy a chain.
349  */
350 static inline void
351 umtxq_unbusy(struct umtx_key *key)
352 {
353 	struct umtxq_chain *uc;
354 
355 	uc = umtxq_getchain(key);
356 	mtx_assert(&uc->uc_lock, MA_OWNED);
357 	KASSERT(uc->uc_busy != 0, ("not busy"));
358 	uc->uc_busy = 0;
359 	if (uc->uc_waiters)
360 		wakeup_one(uc);
361 }
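/*
 * Note: the busy flag serializes whole lock/unlock operations that must
 * drop uc_lock in the middle, typically to fault on user memory:
 *
 *	umtxq_lock(key); umtxq_busy(key); umtxq_unlock(key);
 *	... access userland memory ...
 *	umtxq_lock(key); umtxq_unbusy(key); umtxq_unlock(key);
 */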
362 
363 static inline void
364 umtxq_insert_queue(struct umtx_q *uq, int q)
365 {
366 	struct umtxq_chain *uc;
367 
368 	uc = umtxq_getchain(&uq->uq_key);
369 	UMTXQ_LOCKED_ASSERT(uc);
370 	TAILQ_INSERT_TAIL(&uc->uc_queue[q], uq, uq_link);
371 	uq->uq_flags |= UQF_UMTXQ;
372 }
373 
374 static inline void
375 umtxq_remove_queue(struct umtx_q *uq, int q)
376 {
377 	struct umtxq_chain *uc;
378 
379 	uc = umtxq_getchain(&uq->uq_key);
380 	UMTXQ_LOCKED_ASSERT(uc);
381 	if (uq->uq_flags & UQF_UMTXQ) {
382 		TAILQ_REMOVE(&uc->uc_queue[q], uq, uq_link);
383 		uq->uq_flags &= ~UQF_UMTXQ;
384 	}
385 }
386 
387 /*
388  * Check if there are multiple waiters
389  */
390 static int
391 umtxq_count(struct umtx_key *key)
392 {
393 	struct umtxq_chain *uc;
394 	struct umtx_q *uq;
395 	int count = 0;
396 
397 	uc = umtxq_getchain(key);
398 	UMTXQ_LOCKED_ASSERT(uc);
399 	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
400 		if (umtx_key_match(&uq->uq_key, key)) {
401 			if (++count > 1)
402 				break;
403 		}
404 	}
405 	return (count);
406 }
407 
408 /*
409  * Check if there are multiple PI waiters and return the first
410  * waiter.
411  */
412 static int
413 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
414 {
415 	struct umtxq_chain *uc;
416 	struct umtx_q *uq;
417 	int count = 0;
418 
419 	*first = NULL;
420 	uc = umtxq_getchain(key);
421 	UMTXQ_LOCKED_ASSERT(uc);
422 	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
423 		if (umtx_key_match(&uq->uq_key, key)) {
424 			if (++count > 1)
425 				break;
426 			*first = uq;
427 		}
428 	}
429 	return (count);
430 }
431 
432 /*
433  * Wake up threads waiting on a userland object.
434  */
435 
436 static int
437 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
438 {
439 	struct umtxq_chain *uc;
440 	struct umtx_q *uq, *next;
441 	int ret;
442 
443 	ret = 0;
444 	uc = umtxq_getchain(key);
445 	UMTXQ_LOCKED_ASSERT(uc);
446 	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue[q], uq_link, next) {
447 		if (umtx_key_match(&uq->uq_key, key)) {
448 			umtxq_remove_queue(uq, q);
449 			wakeup(uq);
450 			if (++ret >= n_wake)
451 				break;
452 		}
453 	}
454 	return (ret);
455 }
456 
457 
458 /*
459  * Wake up specified thread.
460  */
461 static inline void
462 umtxq_signal_thread(struct umtx_q *uq)
463 {
464 	struct umtxq_chain *uc;
465 
466 	uc = umtxq_getchain(&uq->uq_key);
467 	UMTXQ_LOCKED_ASSERT(uc);
468 	umtxq_remove(uq);
469 	wakeup(uq);
470 }
471 
472 /*
473  * Put the thread into a sleep state; before sleeping, check if
474  * the thread was removed from the umtx queue.
475  */
476 static inline int
477 umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
478 {
479 	struct umtxq_chain *uc;
480 	int error;
481 
482 	uc = umtxq_getchain(&uq->uq_key);
483 	UMTXQ_LOCKED_ASSERT(uc);
484 	if (!(uq->uq_flags & UQF_UMTXQ))
485 		return (0);
486 	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
487 	if (error == EWOULDBLOCK)
488 		error = ETIMEDOUT;
489 	return (error);
490 }
491 
492 /*
493  * Convert a userspace address into a unique logical address.
494  */
495 static int
496 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
497 {
498 	struct thread *td = curthread;
499 	vm_map_t map;
500 	vm_map_entry_t entry;
501 	vm_pindex_t pindex;
502 	vm_prot_t prot;
503 	boolean_t wired;
504 
505 	key->type = type;
506 	if (share == THREAD_SHARE) {
507 		key->shared = 0;
508 		key->info.private.vs = td->td_proc->p_vmspace;
509 		key->info.private.addr = (uintptr_t)addr;
510 	} else {
511 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
512 		map = &td->td_proc->p_vmspace->vm_map;
513 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
514 		    &entry, &key->info.shared.object, &pindex, &prot,
515 		    &wired) != KERN_SUCCESS) {
516 			return EFAULT;
517 		}
518 
519 		if ((share == PROCESS_SHARE) ||
520 		    (share == AUTO_SHARE &&
521 		     VM_INHERIT_SHARE == entry->inheritance)) {
522 			key->shared = 1;
523 			key->info.shared.offset = entry->offset + entry->start -
524 				(vm_offset_t)addr;
525 			vm_object_reference(key->info.shared.object);
526 		} else {
527 			key->shared = 0;
528 			key->info.private.vs = td->td_proc->p_vmspace;
529 			key->info.private.addr = (uintptr_t)addr;
530 		}
531 		vm_map_lookup_done(map, entry);
532 	}
533 
534 	umtxq_hash(key);
535 	return (0);
536 }
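/*
 * A userland sketch (hypothetical setup, not part of this file) of why
 * the shared key form matters: two processes mapping the same object at
 * different addresses derive matching keys, because a shared key is
 * (vm object, offset) rather than (vmspace, address).
 */
#if 0
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/umtx.h>
#include <fcntl.h>
#include <unistd.h>

static u_long *
map_shared_word(void)
{
	int fd = shm_open("/umtx_demo", O_RDWR | O_CREAT, 0600);

	ftruncate(fd, sizeof(u_long));
	return (mmap(NULL, sizeof(u_long), PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0));
}

/* Process A sleeps while the word is still 0 ... */
static void
wait_side(u_long *w)
{
	_umtx_op(w, UMTX_OP_WAIT, 0, NULL, NULL);
}

/* ... and process B, through its own mapping, wakes one waiter. */
static void
wake_side(u_long *w)
{
	_umtx_op(w, UMTX_OP_WAKE, 1, NULL, NULL);
}
#endif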
537 
538 /*
539  * Release key.
540  */
541 static inline void
542 umtx_key_release(struct umtx_key *key)
543 {
544 	if (key->shared)
545 		vm_object_deallocate(key->info.shared.object);
546 }
547 
548 /*
549  * Lock a umtx object.
550  */
551 static int
552 _do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
553 {
554 	struct umtx_q *uq;
555 	u_long owner;
556 	u_long old;
557 	int error = 0;
558 
559 	uq = td->td_umtxq;
560 
561 	/*
562 	 * Care must be exercised when dealing with the umtx structure. It
563 	 * can fault on any access.
564 	 */
565 	for (;;) {
566 		/*
567 		 * Try the uncontested case.  This should be done in userland.
568 		 */
569 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
570 
571 		/* The acquire succeeded. */
572 		if (owner == UMTX_UNOWNED)
573 			return (0);
574 
575 		/* The address was invalid. */
576 		if (owner == -1)
577 			return (EFAULT);
578 
579 		/* If no one owns it but it is contested try to acquire it. */
580 		if (owner == UMTX_CONTESTED) {
581 			owner = casuword(&umtx->u_owner,
582 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
583 
584 			if (owner == UMTX_CONTESTED)
585 				return (0);
586 
587 			/* The address was invalid. */
588 			if (owner == -1)
589 				return (EFAULT);
590 
591 			/* If this failed the lock has changed, restart. */
592 			continue;
593 		}
594 
595 		/*
596 		 * If we caught a signal, we have already retried and now
597 		 * exit immediately.
598 		 */
599 		if (error != 0)
600 			return (error);
601 
602 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
603 			AUTO_SHARE, &uq->uq_key)) != 0)
604 			return (error);
605 
606 		umtxq_lock(&uq->uq_key);
607 		umtxq_busy(&uq->uq_key);
608 		umtxq_insert(uq);
609 		umtxq_unbusy(&uq->uq_key);
610 		umtxq_unlock(&uq->uq_key);
611 
612 		/*
613 		 * Set the contested bit so that a release in user space
614 		 * knows to use the system call for unlock.  If this fails
615 		 * either someone else has acquired the lock or it has been
616 		 * released.
617 		 */
618 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
619 
620 		/* The address was invalid. */
621 		if (old == -1) {
622 			umtxq_lock(&uq->uq_key);
623 			umtxq_remove(uq);
624 			umtxq_unlock(&uq->uq_key);
625 			umtx_key_release(&uq->uq_key);
626 			return (EFAULT);
627 		}
628 
629 		/*
630 		 * If we set the contested bit, sleep; otherwise the lock
631 		 * changed and we need to retry, or we lost a race to the
632 		 * thread unlocking the umtx.
633 		 */
634 		umtxq_lock(&uq->uq_key);
635 		if (old == owner)
636 			error = umtxq_sleep(uq, "umtx", timo);
637 		umtxq_remove(uq);
638 		umtxq_unlock(&uq->uq_key);
639 		umtx_key_release(&uq->uq_key);
640 	}
641 
642 	return (0);
643 }
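/*
 * The "uncontested case" above is normally taken in userland before the
 * system call is made; a minimal sketch, assuming only the _umtx_op(2)
 * wrapper declared in <sys/umtx.h>:
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>

static int
umtx_lock_sketch(struct umtx *u, u_long tid)
{
	/* Fast path: UMTX_UNOWNED -> tid without entering the kernel. */
	if (atomic_cmpset_acq_long(&u->u_owner, UMTX_UNOWNED, tid))
		return (0);
	/* Slow path: the kernel queues us and sets UMTX_CONTESTED. */
	return (_umtx_op(u, UMTX_OP_LOCK, tid, NULL, NULL));
}
#endif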
644 
645 /*
646  * Lock a umtx object.
647  */
648 static int
649 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
650 	struct timespec *timeout)
651 {
652 	struct timespec ts, ts2, ts3;
653 	struct timeval tv;
654 	int error;
655 
656 	if (timeout == NULL) {
657 		error = _do_lock_umtx(td, umtx, id, 0);
658 		/* Mutex locking is restarted if it is interrupted. */
659 		if (error == EINTR)
660 			error = ERESTART;
661 	} else {
662 		getnanouptime(&ts);
663 		timespecadd(&ts, timeout);
664 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
665 		for (;;) {
666 			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
667 			if (error != ETIMEDOUT)
668 				break;
669 			getnanouptime(&ts2);
670 			if (timespeccmp(&ts2, &ts, >=)) {
671 				error = ETIMEDOUT;
672 				break;
673 			}
674 			ts3 = ts;
675 			timespecsub(&ts3, &ts2);
676 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
677 		}
678 		/* Timed-locking is not restarted. */
679 		if (error == ERESTART)
680 			error = EINTR;
681 	}
682 	return (error);
683 }
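/*
 * The timeout above is relative: it is turned into an absolute uptime
 * deadline and the tick-based sleep is re-armed after every early
 * wakeup.  A hedged userland sketch of a timed lock (this assumes the
 * relative timeout travels in _umtx_op()'s second address argument):
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <errno.h>
#include <time.h>

static int
umtx_timedlock_sketch(struct umtx *u, u_long tid)
{
	struct timespec to = { 1, 0 };		/* relative, one second */

	if (_umtx_op(u, UMTX_OP_LOCK, tid, NULL, &to) == -1)
		return (errno);			/* e.g. ETIMEDOUT */
	return (0);
}
#endif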
684 
685 /*
686  * Unlock a umtx object.
687  */
688 static int
689 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
690 {
691 	struct umtx_key key;
692 	u_long owner;
693 	u_long old;
694 	int error;
695 	int count;
696 
697 	/*
698 	 * Make sure we own this mtx.
699 	 */
700 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
701 	if (owner == -1)
702 		return (EFAULT);
703 
704 	if ((owner & ~UMTX_CONTESTED) != id)
705 		return (EPERM);
706 
707 	/* This should be done in userland */
708 	if ((owner & UMTX_CONTESTED) == 0) {
709 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
710 		if (old == -1)
711 			return (EFAULT);
712 		if (old == owner)
713 			return (0);
714 		owner = old;
715 	}
716 
717 	/* We should only ever be in here for contested locks */
718 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
719 		&key)) != 0)
720 		return (error);
721 
722 	umtxq_lock(&key);
723 	umtxq_busy(&key);
724 	count = umtxq_count(&key);
725 	umtxq_unlock(&key);
726 
727 	/*
728 	 * When unlocking the umtx, it must be marked as unowned if
729 	 * there is at most one thread waiting for it.
730 	 * Otherwise, it must be marked as contested.
731 	 */
732 	old = casuword(&umtx->u_owner, owner,
733 		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
734 	umtxq_lock(&key);
735 	umtxq_signal(&key,1);
736 	umtxq_unbusy(&key);
737 	umtxq_unlock(&key);
738 	umtx_key_release(&key);
739 	if (old == -1)
740 		return (EFAULT);
741 	if (old != owner)
742 		return (EINVAL);
743 	return (0);
744 }
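/*
 * The matching userland unlock sketch: only a lock whose contested bit
 * was set by the kernel needs the system call.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>

static int
umtx_unlock_sketch(struct umtx *u, u_long tid)
{
	/* Fast path: tid -> UMTX_UNOWNED; fails once the bit is set. */
	if (atomic_cmpset_rel_long(&u->u_owner, tid, UMTX_UNOWNED))
		return (0);
	return (_umtx_op(u, UMTX_OP_UNLOCK, tid, NULL, NULL));
}
#endif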
745 
746 #ifdef COMPAT_IA32
747 
748 /*
749  * Lock a umtx object.
750  */
751 static int
752 _do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
753 {
754 	struct umtx_q *uq;
755 	uint32_t owner;
756 	uint32_t old;
757 	int error = 0;
758 
759 	uq = td->td_umtxq;
760 
761 	/*
762 	 * Care must be exercised when dealing with the umtx structure. It
763 	 * can fault on any access.
764 	 */
765 	for (;;) {
766 		/*
767 		 * Try the uncontested case.  This should be done in userland.
768 		 */
769 		owner = casuword32(m, UMUTEX_UNOWNED, id);
770 
771 		/* The acquire succeeded. */
772 		if (owner == UMUTEX_UNOWNED)
773 			return (0);
774 
775 		/* The address was invalid. */
776 		if (owner == -1)
777 			return (EFAULT);
778 
779 		/* If no one owns it but it is contested try to acquire it. */
780 		if (owner == UMUTEX_CONTESTED) {
781 			owner = casuword32(m,
782 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
783 			if (owner == UMUTEX_CONTESTED)
784 				return (0);
785 
786 			/* The address was invalid. */
787 			if (owner == -1)
788 				return (EFAULT);
789 
790 			/* If this failed the lock has changed, restart. */
791 			continue;
792 		}
793 
794 		/*
795 		 * If we caught a signal, we have already retried and now
796 		 * exit immediately.
797 		 */
798 		if (error != 0)
799 			return (error);
800 
801 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
802 			AUTO_SHARE, &uq->uq_key)) != 0)
803 			return (error);
804 
805 		umtxq_lock(&uq->uq_key);
806 		umtxq_busy(&uq->uq_key);
807 		umtxq_insert(uq);
808 		umtxq_unbusy(&uq->uq_key);
809 		umtxq_unlock(&uq->uq_key);
810 
811 		/*
812 		 * Set the contested bit so that a release in user space
813 		 * knows to use the system call for unlock.  If this fails
814 		 * either someone else has acquired the lock or it has been
815 		 * released.
816 		 */
817 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
818 
819 		/* The address was invalid. */
820 		if (old == -1) {
821 			umtxq_lock(&uq->uq_key);
822 			umtxq_remove(uq);
823 			umtxq_unlock(&uq->uq_key);
824 			umtx_key_release(&uq->uq_key);
825 			return (EFAULT);
826 		}
827 
828 		/*
829 		 * If we set the contested bit, sleep; otherwise the lock
830 		 * changed and we need to retry, or we lost a race to the
831 		 * thread unlocking the umtx.
832 		 */
833 		umtxq_lock(&uq->uq_key);
834 		if (old == owner)
835 			error = umtxq_sleep(uq, "umtx", timo);
836 		umtxq_remove(uq);
837 		umtxq_unlock(&uq->uq_key);
838 		umtx_key_release(&uq->uq_key);
839 	}
840 
841 	return (0);
842 }
843 
844 /*
845  * Lock a umtx object.
846  */
847 static int
848 do_lock_umtx32(struct thread *td, void *m, uint32_t id,
849 	struct timespec *timeout)
850 {
851 	struct timespec ts, ts2, ts3;
852 	struct timeval tv;
853 	int error;
854 
855 	if (timeout == NULL) {
856 		error = _do_lock_umtx32(td, m, id, 0);
857 		/* Mutex locking is restarted if it is interrupted. */
858 		if (error == EINTR)
859 			error = ERESTART;
860 	} else {
861 		getnanouptime(&ts);
862 		timespecadd(&ts, timeout);
863 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
864 		for (;;) {
865 			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
866 			if (error != ETIMEDOUT)
867 				break;
868 			getnanouptime(&ts2);
869 			if (timespeccmp(&ts2, &ts, >=)) {
870 				error = ETIMEDOUT;
871 				break;
872 			}
873 			ts3 = ts;
874 			timespecsub(&ts3, &ts2);
875 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
876 		}
877 		/* Timed-locking is not restarted. */
878 		if (error == ERESTART)
879 			error = EINTR;
880 	}
881 	return (error);
882 }
883 
884 /*
885  * Unlock a umtx object.
886  */
887 static int
888 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
889 {
890 	struct umtx_key key;
891 	uint32_t owner;
892 	uint32_t old;
893 	int error;
894 	int count;
895 
896 	/*
897 	 * Make sure we own this mtx.
898 	 */
899 	owner = fuword32(m);
900 	if (owner == -1)
901 		return (EFAULT);
902 
903 	if ((owner & ~UMUTEX_CONTESTED) != id)
904 		return (EPERM);
905 
906 	/* This should be done in userland */
907 	if ((owner & UMUTEX_CONTESTED) == 0) {
908 		old = casuword32(m, owner, UMUTEX_UNOWNED);
909 		if (old == -1)
910 			return (EFAULT);
911 		if (old == owner)
912 			return (0);
913 		owner = old;
914 	}
915 
916 	/* We should only ever be in here for contested locks */
917 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
918 		&key)) != 0)
919 		return (error);
920 
921 	umtxq_lock(&key);
922 	umtxq_busy(&key);
923 	count = umtxq_count(&key);
924 	umtxq_unlock(&key);
925 
926 	/*
927 	 * When unlocking the umtx, it must be marked as unowned if
928 	 * there is at most one thread waiting for it.
929 	 * Otherwise, it must be marked as contested.
930 	 */
931 	old = casuword32(m, owner,
932 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
933 	umtxq_lock(&key);
934 	umtxq_signal(&key,1);
935 	umtxq_unbusy(&key);
936 	umtxq_unlock(&key);
937 	umtx_key_release(&key);
938 	if (old == -1)
939 		return (EFAULT);
940 	if (old != owner)
941 		return (EINVAL);
942 	return (0);
943 }
944 #endif
945 
946 /*
947  * Fetch and compare the value; sleep on the address if the value is unchanged.
948  */
949 static int
950 do_wait(struct thread *td, void *addr, u_long id,
951 	struct timespec *timeout, int compat32)
952 {
953 	struct umtx_q *uq;
954 	struct timespec ts, ts2, ts3;
955 	struct timeval tv;
956 	u_long tmp;
957 	int error = 0;
958 
959 	uq = td->td_umtxq;
960 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
961 	    &uq->uq_key)) != 0)
962 		return (error);
963 
964 	umtxq_lock(&uq->uq_key);
965 	umtxq_insert(uq);
966 	umtxq_unlock(&uq->uq_key);
967 	if (compat32 == 0)
968 		tmp = fuword(addr);
969 	else
970 		tmp = fuword32(addr);
971 	if (tmp != id) {
972 		umtxq_lock(&uq->uq_key);
973 		umtxq_remove(uq);
974 		umtxq_unlock(&uq->uq_key);
975 	} else if (timeout == NULL) {
976 		umtxq_lock(&uq->uq_key);
977 		error = umtxq_sleep(uq, "uwait", 0);
978 		umtxq_remove(uq);
979 		umtxq_unlock(&uq->uq_key);
980 	} else {
981 		getnanouptime(&ts);
982 		timespecadd(&ts, timeout);
983 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
984 		umtxq_lock(&uq->uq_key);
985 		for (;;) {
986 			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
987 			if (!(uq->uq_flags & UQF_UMTXQ))
988 				break;
989 			if (error != ETIMEDOUT)
990 				break;
991 			umtxq_unlock(&uq->uq_key);
992 			getnanouptime(&ts2);
993 			if (timespeccmp(&ts2, &ts, >=)) {
994 				error = ETIMEDOUT;
995 				umtxq_lock(&uq->uq_key);
996 				break;
997 			}
998 			ts3 = ts;
999 			timespecsub(&ts3, &ts2);
1000 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
1001 			umtxq_lock(&uq->uq_key);
1002 		}
1003 		umtxq_remove(uq);
1004 		umtxq_unlock(&uq->uq_key);
1005 	}
1006 	umtx_key_release(&uq->uq_key);
1007 	if (error == ERESTART)
1008 		error = EINTR;
1009 	return (error);
1010 }
1011 
1012 /*
1013  * Wake up threads sleeping on the specified address.
1014  */
1015 int
1016 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
1017 {
1018 	struct umtx_key key;
1019 	int ret;
1020 
1021 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
1022 	   &key)) != 0)
1023 		return (ret);
1024 	umtxq_lock(&key);
1025 	ret = umtxq_signal(&key, n_wake);
1026 	umtxq_unlock(&key);
1027 	umtx_key_release(&key);
1028 	return (0);
1029 }
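/*
 * A sketch of how the WAIT/WAKE pair backs a futex-style one-shot event
 * from userland (the helper names are hypothetical; UMTX_OP_WAIT and
 * UMTX_OP_WAKE are the real operations):
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>
#include <limits.h>

static u_long event;

static void
event_wait(void)
{
	/*
	 * do_wait() re-checks the value under the queue lock, so a stale
	 * read here costs at most a wasted syscall, never a lost wakeup.
	 */
	while (atomic_load_acq_long(&event) == 0)
		_umtx_op(&event, UMTX_OP_WAIT, 0, NULL, NULL);
}

static void
event_post(void)
{
	atomic_store_rel_long(&event, 1);
	_umtx_op(&event, UMTX_OP_WAKE, INT_MAX, NULL, NULL);
}
#endif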
1030 
1031 /*
1032  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1033  */
1034 static int
1035 _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1036 	int try)
1037 {
1038 	struct umtx_q *uq;
1039 	uint32_t owner, old, id;
1040 	int error = 0;
1041 
1042 	id = td->td_tid;
1043 	uq = td->td_umtxq;
1044 
1045 	/*
1046 	 * Care must be exercised when dealing with the umtx structure. It
1047 	 * can fault on any access.
1048 	 */
1049 	for (;;) {
1050 		/*
1051 		 * Try the uncontested case.  This should be done in userland.
1052 		 */
1053 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1054 
1055 		/* The acquire succeeded. */
1056 		if (owner == UMUTEX_UNOWNED)
1057 			return (0);
1058 
1059 		/* The address was invalid. */
1060 		if (owner == -1)
1061 			return (EFAULT);
1062 
1063 		/* If no one owns it but it is contested try to acquire it. */
1064 		if (owner == UMUTEX_CONTESTED) {
1065 			owner = casuword32(&m->m_owner,
1066 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1067 
1068 			if (owner == UMUTEX_CONTESTED)
1069 				return (0);
1070 
1071 			/* The address was invalid. */
1072 			if (owner == -1)
1073 				return (EFAULT);
1074 
1075 			/* If this failed the lock has changed, restart. */
1076 			continue;
1077 		}
1078 
1079 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1080 		    (owner & ~UMUTEX_CONTESTED) == id)
1081 			return (EDEADLK);
1082 
1083 		if (try != 0)
1084 			return (EBUSY);
1085 
1086 		/*
1087 		 * If we caught a signal, we have already retried and now
1088 		 * exit immediately.
1089 		 */
1090 		if (error != 0)
1091 			return (error);
1092 
1093 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1094 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1095 			return (error);
1096 
1097 		umtxq_lock(&uq->uq_key);
1098 		umtxq_busy(&uq->uq_key);
1099 		umtxq_insert(uq);
1100 		umtxq_unbusy(&uq->uq_key);
1101 		umtxq_unlock(&uq->uq_key);
1102 
1103 		/*
1104 		 * Set the contested bit so that a release in user space
1105 		 * knows to use the system call for unlock.  If this fails
1106 		 * either someone else has acquired the lock or it has been
1107 		 * released.
1108 		 */
1109 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1110 
1111 		/* The address was invalid. */
1112 		if (old == -1) {
1113 			umtxq_lock(&uq->uq_key);
1114 			umtxq_remove(uq);
1115 			umtxq_unlock(&uq->uq_key);
1116 			umtx_key_release(&uq->uq_key);
1117 			return (EFAULT);
1118 		}
1119 
1120 		/*
1121 		 * If we set the contested bit, sleep; otherwise the lock
1122 		 * changed and we need to retry, or we lost a race to the
1123 		 * thread unlocking the umtx.
1124 		 */
1125 		umtxq_lock(&uq->uq_key);
1126 		if (old == owner)
1127 			error = umtxq_sleep(uq, "umtxn", timo);
1128 		umtxq_remove(uq);
1129 		umtxq_unlock(&uq->uq_key);
1130 		umtx_key_release(&uq->uq_key);
1131 	}
1132 
1133 	return (0);
1134 }
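/*
 * As with struct umtx, userland (e.g. a libthr-style mutex) is expected
 * to try the fast path first.  A sketch of the m_owner protocol above,
 * using the thread id as the owner word:
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>
#include <errno.h>

static int
umutex_trylock_sketch(struct umutex *m, uint32_t tid)
{
	/* The cast strips volatile for the atomic op, as the kernel does. */
	if (atomic_cmpset_acq_32(__DEVOLATILE(uint32_t *, &m->m_owner),
	    UMUTEX_UNOWNED, tid))
		return (0);
	return (EBUSY);		/* contended: fall back to _umtx_op() */
}
#endif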
1135 
1139 /*
1140  * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1141  */
1142 static int
1143 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1144 {
1145 	struct umtx_key key;
1146 	uint32_t owner, old, id;
1147 	int error;
1148 	int count;
1149 
1150 	id = td->td_tid;
1151 	/*
1152 	 * Make sure we own this mtx.
1153 	 */
1154 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1155 	if (owner == -1)
1156 		return (EFAULT);
1157 
1158 	if ((owner & ~UMUTEX_CONTESTED) != id)
1159 		return (EPERM);
1160 
1161 	/* This should be done in userland */
1162 	if ((owner & UMUTEX_CONTESTED) == 0) {
1163 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1164 		if (old == -1)
1165 			return (EFAULT);
1166 		if (old == owner)
1167 			return (0);
1168 		owner = old;
1169 	}
1170 
1171 	/* We should only ever be in here for contested locks */
1172 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1173 	    &key)) != 0)
1174 		return (error);
1175 
1176 	umtxq_lock(&key);
1177 	umtxq_busy(&key);
1178 	count = umtxq_count(&key);
1179 	umtxq_unlock(&key);
1180 
1181 	/*
1182 	 * When unlocking the umtx, it must be marked as unowned if
1183 	 * there is at most one thread waiting for it.
1184 	 * Otherwise, it must be marked as contested.
1185 	 */
1186 	old = casuword32(&m->m_owner, owner,
1187 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1188 	umtxq_lock(&key);
1189 	umtxq_signal(&key,1);
1190 	umtxq_unbusy(&key);
1191 	umtxq_unlock(&key);
1192 	umtx_key_release(&key);
1193 	if (old == -1)
1194 		return (EFAULT);
1195 	if (old != owner)
1196 		return (EINVAL);
1197 	return (0);
1198 }
1199 
1200 static inline struct umtx_pi *
1201 umtx_pi_alloc(int flags)
1202 {
1203 	struct umtx_pi *pi;
1204 
1205 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1206 	TAILQ_INIT(&pi->pi_blocked);
1207 	atomic_add_int(&umtx_pi_allocated, 1);
1208 	return (pi);
1209 }
1210 
1211 static inline void
1212 umtx_pi_free(struct umtx_pi *pi)
1213 {
1214 	uma_zfree(umtx_pi_zone, pi);
1215 	atomic_add_int(&umtx_pi_allocated, -1);
1216 }
1217 
1218 /*
1219  * Adjust the thread's position on the PI mutex's blocked list after
1220  * its priority has been changed.
1221  */
1222 static int
1223 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1224 {
1225 	struct umtx_q *uq, *uq1, *uq2;
1226 	struct thread *td1;
1227 
1228 	mtx_assert(&umtx_lock, MA_OWNED);
1229 	if (pi == NULL)
1230 		return (0);
1231 
1232 	uq = td->td_umtxq;
1233 
1234 	/*
1235 	 * Check if the thread needs to be moved on the blocked chain.
1236 	 * It needs to be moved if either its priority is lower than
1237 	 * the previous thread's or higher than the next thread's.
1238 	 */
1239 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1240 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1241 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1242 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1243 		/*
1244 		 * Remove thread from blocked chain and determine where
1245 		 * it should be moved to.
1246 		 */
1247 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1248 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1249 			td1 = uq1->uq_thread;
1250 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1251 			if (UPRI(td1) > UPRI(td))
1252 				break;
1253 		}
1254 
1255 		if (uq1 == NULL)
1256 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1257 		else
1258 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1259 	}
1260 	return (1);
1261 }
1262 
1263 /*
1264  * Propagate priority when a thread is blocked on a POSIX
1265  * PI mutex.
1266  */
1267 static void
1268 umtx_propagate_priority(struct thread *td)
1269 {
1270 	struct umtx_q *uq;
1271 	struct umtx_pi *pi;
1272 	int pri;
1273 
1274 	mtx_assert(&umtx_lock, MA_OWNED);
1275 	pri = UPRI(td);
1276 	uq = td->td_umtxq;
1277 	pi = uq->uq_pi_blocked;
1278 	if (pi == NULL)
1279 		return;
1280 
1281 	for (;;) {
1282 		td = pi->pi_owner;
1283 		if (td == NULL)
1284 			return;
1285 
1286 		MPASS(td->td_proc != NULL);
1287 		MPASS(td->td_proc->p_magic == P_MAGIC);
1288 
1289 		if (UPRI(td) <= pri)
1290 			return;
1291 
1292 		thread_lock(td);
1293 		sched_lend_user_prio(td, pri);
1294 		thread_unlock(td);
1295 
1296 		/*
1297 		 * Pick up the lock that td is blocked on.
1298 		 */
1299 		uq = td->td_umtxq;
1300 		pi = uq->uq_pi_blocked;
1301 		/* Resort td on the list if needed. */
1302 		if (!umtx_pi_adjust_thread(pi, td))
1303 			break;
1304 	}
1305 }
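/*
 * Worked example: T3 (UPRI 150) blocks on M1, owned by T1 (UPRI 200),
 * and T1 is itself blocked on M2, owned by T2 (UPRI 220).  The loop
 * above lends 150 to T1, follows T1's uq_pi_blocked to M2, re-sorts T1
 * on M2's blocked list, then lends 150 to T2 and stops because T2 is
 * not blocked on any PI mutex.
 */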
1306 
1307 /*
1308  * Unpropagate priority for a PI mutex when a thread blocked on
1309  * it is interrupted by a signal or resumed by another thread.
1310  */
1311 static void
1312 umtx_unpropagate_priority(struct umtx_pi *pi)
1313 {
1314 	struct umtx_q *uq, *uq_owner;
1315 	struct umtx_pi *pi2;
1316 	int pri, oldpri;
1317 
1318 	mtx_assert(&umtx_lock, MA_OWNED);
1319 
1320 	while (pi != NULL && pi->pi_owner != NULL) {
1321 		pri = PRI_MAX;
1322 		uq_owner = pi->pi_owner->td_umtxq;
1323 
1324 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1325 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1326 			if (uq != NULL) {
1327 				if (pri > UPRI(uq->uq_thread))
1328 					pri = UPRI(uq->uq_thread);
1329 			}
1330 		}
1331 
1332 		if (pri > uq_owner->uq_inherited_pri)
1333 			pri = uq_owner->uq_inherited_pri;
1334 		thread_lock(pi->pi_owner);
1335 		oldpri = pi->pi_owner->td_user_pri;
1336 		sched_unlend_user_prio(pi->pi_owner, pri);
1337 		thread_unlock(pi->pi_owner);
1338 		umtx_pi_adjust_locked(pi->pi_owner, oldpri);
1339 		pi = uq_owner->uq_pi_blocked;
1340 	}
1341 }
1342 
1343 /*
1344  * Insert a PI mutex into the owning thread's contested list.
1345  */
1346 static void
1347 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1348 {
1349 	struct umtx_q *uq_owner;
1350 
1351 	uq_owner = owner->td_umtxq;
1352 	mtx_assert(&umtx_lock, MA_OWNED);
1353 	if (pi->pi_owner != NULL)
1354 		panic("pi_owner != NULL");
1355 	pi->pi_owner = owner;
1356 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1357 }
1358 
1359 /*
1360  * Claim ownership of a PI mutex.
1361  */
1362 static int
1363 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1364 {
1365 	struct umtx_q *uq, *uq_owner;
1366 
1367 	uq_owner = owner->td_umtxq;
1368 	mtx_lock_spin(&umtx_lock);
1369 	if (pi->pi_owner == owner) {
1370 		mtx_unlock_spin(&umtx_lock);
1371 		return (0);
1372 	}
1373 
1374 	if (pi->pi_owner != NULL) {
1375 		/*
1376 		 * userland may have already messed up the mutex, sigh.
1377 		 */
1378 		mtx_unlock_spin(&umtx_lock);
1379 		return (EPERM);
1380 	}
1381 	umtx_pi_setowner(pi, owner);
1382 	uq = TAILQ_FIRST(&pi->pi_blocked);
1383 	if (uq != NULL) {
1384 		int pri;
1385 
1386 		pri = UPRI(uq->uq_thread);
1387 		thread_lock(owner);
1388 		if (pri < UPRI(owner))
1389 			sched_lend_user_prio(owner, pri);
1390 		thread_unlock(owner);
1391 	}
1392 	mtx_unlock_spin(&umtx_lock);
1393 	return (0);
1394 }
1395 
1396 static void
1397 umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
1398 {
1399 	struct umtx_q *uq;
1400 	struct umtx_pi *pi;
1401 
1402 	uq = td->td_umtxq;
1403 	/*
1404 	 * Pick up the lock that td is blocked on.
1405 	 */
1406 	pi = uq->uq_pi_blocked;
1407 	MPASS(pi != NULL);
1408 
1409 	/* Resort the turnstile on the list. */
1410 	if (!umtx_pi_adjust_thread(pi, td))
1411 		return;
1412 
1413 	/*
1414 	 * If our priority was lowered and we are at the head of the
1415 	 * turnstile, then propagate our new priority up the chain.
1416 	 */
1417 	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
1418 		umtx_propagate_priority(td);
1419 }
1420 
1421 /*
1422  * Adjust a thread's position in the queue of the PI mutex it is
1423  * blocked on; this may trigger a new round of priority propagation.
1424  */
1425 void
1426 umtx_pi_adjust(struct thread *td, u_char oldpri)
1427 {
1428 	struct umtx_q *uq;
1429 	struct umtx_pi *pi;
1430 
1431 	uq = td->td_umtxq;
1432 	mtx_lock_spin(&umtx_lock);
1433 	/*
1434 	 * Pick up the lock that td is blocked on.
1435 	 */
1436 	pi = uq->uq_pi_blocked;
1437 	if (pi != NULL)
1438 		umtx_pi_adjust_locked(td, oldpri);
1439 	mtx_unlock_spin(&umtx_lock);
1440 }
1441 
1442 /*
1443  * Sleep on a PI mutex.
1444  */
1445 static int
1446 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1447 	uint32_t owner, const char *wmesg, int timo)
1448 {
1449 	struct umtxq_chain *uc;
1450 	struct thread *td, *td1;
1451 	struct umtx_q *uq1;
1452 	int pri;
1453 	int error = 0;
1454 
1455 	td = uq->uq_thread;
1456 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1457 	uc = umtxq_getchain(&uq->uq_key);
1458 	UMTXQ_LOCKED_ASSERT(uc);
1459 	umtxq_insert(uq);
1460 	if (pi->pi_owner == NULL) {
1461 		/* XXX
1462 		 * Currently, we only support process-private PI mutexes;
1463 		 * non-contended PI mutexes are locked in userland.
1464 		 * Process-shared PI mutexes should always be initialized
1465 		 * and registered by the kernel, and locking should always
1466 		 * be done by the kernel to avoid security problems.
1467 		 * For a process-private PI mutex, we can find the owner
1468 		 * thread and boost its priority safely.
1469 		 */
1470 		PROC_LOCK(curproc);
1471 		td1 = thread_find(curproc, owner);
1472 		mtx_lock_spin(&umtx_lock);
1473 		if (td1 != NULL && pi->pi_owner == NULL) {
1474 			uq1 = td1->td_umtxq;
1475 			umtx_pi_setowner(pi, td1);
1476 		}
1477 		PROC_UNLOCK(curproc);
1478 	} else {
1479 		mtx_lock_spin(&umtx_lock);
1480 	}
1481 
1482 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1483 		pri = UPRI(uq1->uq_thread);
1484 		if (pri > UPRI(td))
1485 			break;
1486 	}
1487 
1488 	if (uq1 != NULL)
1489 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1490 	else
1491 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1492 
1493 	uq->uq_pi_blocked = pi;
1494 	thread_lock(td);
1495 	td->td_flags |= TDF_UPIBLOCKED;
1496 	thread_unlock(td);
1497 	mtx_unlock_spin(&umtx_lock);
1498 	umtxq_unlock(&uq->uq_key);
1499 
1500 	mtx_lock_spin(&umtx_lock);
1501 	umtx_propagate_priority(td);
1502 	mtx_unlock_spin(&umtx_lock);
1503 
1504 	umtxq_lock(&uq->uq_key);
1505 	if (uq->uq_flags & UQF_UMTXQ) {
1506 		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
1507 		if (error == EWOULDBLOCK)
1508 			error = ETIMEDOUT;
1509 		if (uq->uq_flags & UQF_UMTXQ) {
1510 			umtxq_busy(&uq->uq_key);
1511 			umtxq_remove(uq);
1512 			umtxq_unbusy(&uq->uq_key);
1513 		}
1514 	}
1515 	umtxq_unlock(&uq->uq_key);
1516 
1517 	mtx_lock_spin(&umtx_lock);
1518 	uq->uq_pi_blocked = NULL;
1519 	thread_lock(td);
1520 	td->td_flags &= ~TDF_UPIBLOCKED;
1521 	thread_unlock(td);
1522 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1523 	umtx_unpropagate_priority(pi);
1524 	mtx_unlock_spin(&umtx_lock);
1525 
1526 	umtxq_lock(&uq->uq_key);
1527 
1528 	return (error);
1529 }
1530 
1531 /*
1532  * Increment the reference count of a PI mutex.
1533  */
1534 static void
1535 umtx_pi_ref(struct umtx_pi *pi)
1536 {
1537 	struct umtxq_chain *uc;
1538 
1539 	uc = umtxq_getchain(&pi->pi_key);
1540 	UMTXQ_LOCKED_ASSERT(uc);
1541 	pi->pi_refcount++;
1542 }
1543 
1544 /*
1545  * Decrement the reference count of a PI mutex; when the count
1546  * drops to zero, its memory is freed.
1547  */
1548 static void
1549 umtx_pi_unref(struct umtx_pi *pi)
1550 {
1551 	struct umtxq_chain *uc;
1552 	int free = 0;
1553 
1554 	uc = umtxq_getchain(&pi->pi_key);
1555 	UMTXQ_LOCKED_ASSERT(uc);
1556 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1557 	if (--pi->pi_refcount == 0) {
1558 		mtx_lock_spin(&umtx_lock);
1559 		if (pi->pi_owner != NULL) {
1560 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1561 				pi, pi_link);
1562 			pi->pi_owner = NULL;
1563 		}
1564 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1565 			("blocked queue not empty"));
1566 		mtx_unlock_spin(&umtx_lock);
1567 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1568 		free = 1;
1569 	}
1570 	if (free)
1571 		umtx_pi_free(pi);
1572 }
1573 
1574 /*
1575  * Find a PI mutex in the hash table.
1576  */
1577 static struct umtx_pi *
1578 umtx_pi_lookup(struct umtx_key *key)
1579 {
1580 	struct umtxq_chain *uc;
1581 	struct umtx_pi *pi;
1582 
1583 	uc = umtxq_getchain(key);
1584 	UMTXQ_LOCKED_ASSERT(uc);
1585 
1586 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1587 		if (umtx_key_match(&pi->pi_key, key)) {
1588 			return (pi);
1589 		}
1590 	}
1591 	return (NULL);
1592 }
1593 
1594 /*
1595  * Insert a PI mutex into the hash table.
1596  */
1597 static inline void
1598 umtx_pi_insert(struct umtx_pi *pi)
1599 {
1600 	struct umtxq_chain *uc;
1601 
1602 	uc = umtxq_getchain(&pi->pi_key);
1603 	UMTXQ_LOCKED_ASSERT(uc);
1604 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1605 }
1606 
1607 /*
1608  * Lock a PI mutex.
1609  */
1610 static int
1611 _do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1612 	int try)
1613 {
1614 	struct umtx_q *uq;
1615 	struct umtx_pi *pi, *new_pi;
1616 	uint32_t id, owner, old;
1617 	int error;
1618 
1619 	id = td->td_tid;
1620 	uq = td->td_umtxq;
1621 
1622 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1623 	    &uq->uq_key)) != 0)
1624 		return (error);
1625 	umtxq_lock(&uq->uq_key);
1626 	pi = umtx_pi_lookup(&uq->uq_key);
1627 	if (pi == NULL) {
1628 		new_pi = umtx_pi_alloc(M_NOWAIT);
1629 		if (new_pi == NULL) {
1630 			umtxq_unlock(&uq->uq_key);
1631 			new_pi = umtx_pi_alloc(M_WAITOK);
1632 			new_pi->pi_key = uq->uq_key;
1633 			umtxq_lock(&uq->uq_key);
1634 			pi = umtx_pi_lookup(&uq->uq_key);
1635 			if (pi != NULL) {
1636 				umtx_pi_free(new_pi);
1637 				new_pi = NULL;
1638 			}
1639 		}
1640 		if (new_pi != NULL) {
1641 			new_pi->pi_key = uq->uq_key;
1642 			umtx_pi_insert(new_pi);
1643 			pi = new_pi;
1644 		}
1645 	}
1646 	umtx_pi_ref(pi);
1647 	umtxq_unlock(&uq->uq_key);
1648 
1649 	/*
1650 	 * Care must be exercised when dealing with the umtx structure.  It
1651 	 * can fault on any access.
1652 	 */
1653 	for (;;) {
1654 		/*
1655 		 * Try the uncontested case.  This should be done in userland.
1656 		 */
1657 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1658 
1659 		/* The acquire succeeded. */
1660 		if (owner == UMUTEX_UNOWNED) {
1661 			error = 0;
1662 			break;
1663 		}
1664 
1665 		/* The address was invalid. */
1666 		if (owner == -1) {
1667 			error = EFAULT;
1668 			break;
1669 		}
1670 
1671 		/* If no one owns it but it is contested try to acquire it. */
1672 		if (owner == UMUTEX_CONTESTED) {
1673 			owner = casuword32(&m->m_owner,
1674 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1675 
1676 			if (owner == UMUTEX_CONTESTED) {
1677 				umtxq_lock(&uq->uq_key);
1678 				error = umtx_pi_claim(pi, td);
1679 				umtxq_unlock(&uq->uq_key);
1680 				break;
1681 			}
1682 
1683 			/* The address was invalid. */
1684 			if (owner == -1) {
1685 				error = EFAULT;
1686 				break;
1687 			}
1688 
1689 			/* If this failed the lock has changed, restart. */
1690 			continue;
1691 		}
1692 
1693 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1694 		    (owner & ~UMUTEX_CONTESTED) == id) {
1695 			error = EDEADLK;
1696 			break;
1697 		}
1698 
1699 		if (try != 0) {
1700 			error = EBUSY;
1701 			break;
1702 		}
1703 
1704 		/*
1705 		 * If we caught a signal, we have already retried and now
1706 		 * exit immediately.
1707 		 */
1708 		if (error != 0)
1709 			break;
1710 
1711 		umtxq_lock(&uq->uq_key);
1712 		umtxq_busy(&uq->uq_key);
1713 		umtxq_unlock(&uq->uq_key);
1714 
1715 		/*
1716 		 * Set the contested bit so that a release in user space
1717 		 * knows to use the system call for unlock.  If this fails
1718 		 * either someone else has acquired the lock or it has been
1719 		 * released.
1720 		 */
1721 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1722 
1723 		/* The address was invalid. */
1724 		if (old == -1) {
1725 			umtxq_lock(&uq->uq_key);
1726 			umtxq_unbusy(&uq->uq_key);
1727 			umtxq_unlock(&uq->uq_key);
1728 			error = EFAULT;
1729 			break;
1730 		}
1731 
1732 		umtxq_lock(&uq->uq_key);
1733 		umtxq_unbusy(&uq->uq_key);
1734 		/*
1735 		 * If we set the contested bit, sleep; otherwise the lock
1736 		 * changed and we need to retry, or we lost a race to the
1737 		 * thread unlocking the umtx.
1738 		 */
1739 		if (old == owner)
1740 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1741 				 "umtxpi", timo);
1742 		umtxq_unlock(&uq->uq_key);
1743 	}
1744 
1745 	umtxq_lock(&uq->uq_key);
1746 	umtx_pi_unref(pi);
1747 	umtxq_unlock(&uq->uq_key);
1748 
1749 	umtx_key_release(&uq->uq_key);
1750 	return (error);
1751 }
1752 
1753 /*
1754  * Unlock a PI mutex.
1755  */
1756 static int
1757 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1758 {
1759 	struct umtx_key key;
1760 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1761 	struct umtx_pi *pi, *pi2;
1762 	uint32_t owner, old, id;
1763 	int error;
1764 	int count;
1765 	int pri;
1766 
1767 	id = td->td_tid;
1768 	/*
1769 	 * Make sure we own this mtx.
1770 	 */
1771 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1772 	if (owner == -1)
1773 		return (EFAULT);
1774 
1775 	if ((owner & ~UMUTEX_CONTESTED) != id)
1776 		return (EPERM);
1777 
1778 	/* This should be done in userland */
1779 	if ((owner & UMUTEX_CONTESTED) == 0) {
1780 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1781 		if (old == -1)
1782 			return (EFAULT);
1783 		if (old == owner)
1784 			return (0);
1785 		owner = old;
1786 	}
1787 
1788 	/* We should only ever be in here for contested locks */
1789 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1790 	    &key)) != 0)
1791 		return (error);
1792 
1793 	umtxq_lock(&key);
1794 	umtxq_busy(&key);
1795 	count = umtxq_count_pi(&key, &uq_first);
1796 	if (uq_first != NULL) {
1797 		pi = uq_first->uq_pi_blocked;
1798 		if (pi->pi_owner != curthread) {
1799 			umtxq_unbusy(&key);
1800 			umtxq_unlock(&key);
1801 			/* userland messed up the mutex */
1802 			return (EPERM);
1803 		}
1804 		uq_me = curthread->td_umtxq;
1805 		mtx_lock_spin(&umtx_lock);
1806 		pi->pi_owner = NULL;
1807 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1808 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1809 		pri = PRI_MAX;
1810 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1811 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1812 			if (uq_first2 != NULL) {
1813 				if (pri > UPRI(uq_first2->uq_thread))
1814 					pri = UPRI(uq_first2->uq_thread);
1815 			}
1816 		}
1817 		thread_lock(curthread);
1818 		sched_unlend_user_prio(curthread, pri);
1819 		thread_unlock(curthread);
1820 		mtx_unlock_spin(&umtx_lock);
1821 	}
1822 	umtxq_unlock(&key);
1823 
1824 	/*
1825 	 * When unlocking the umtx, it must be marked as unowned if
1826 	 * there is at most one thread waiting for it.
1827 	 * Otherwise, it must be marked as contested.
1828 	 */
1829 	old = casuword32(&m->m_owner, owner,
1830 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1831 
1832 	umtxq_lock(&key);
1833 	if (uq_first != NULL)
1834 		umtxq_signal_thread(uq_first);
1835 	umtxq_unbusy(&key);
1836 	umtxq_unlock(&key);
1837 	umtx_key_release(&key);
1838 	if (old == -1)
1839 		return (EFAULT);
1840 	if (old != owner)
1841 		return (EINVAL);
1842 	return (0);
1843 }
1844 
1845 /*
1846  * Lock a PP mutex.
1847  */
1848 static int
1849 _do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1850 	int try)
1851 {
1852 	struct umtx_q *uq, *uq2;
1853 	struct umtx_pi *pi;
1854 	uint32_t ceiling;
1855 	uint32_t owner, id;
1856 	int error, pri, old_inherited_pri, su;
1857 
1858 	id = td->td_tid;
1859 	uq = td->td_umtxq;
1860 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1861 	    &uq->uq_key)) != 0)
1862 		return (error);
1863 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1864 	for (;;) {
1865 		old_inherited_pri = uq->uq_inherited_pri;
1866 		umtxq_lock(&uq->uq_key);
1867 		umtxq_busy(&uq->uq_key);
1868 		umtxq_unlock(&uq->uq_key);
1869 
1870 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1871 		if (ceiling > RTP_PRIO_MAX) {
1872 			error = EINVAL;
1873 			goto out;
1874 		}
1875 
1876 		mtx_lock_spin(&umtx_lock);
1877 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1878 			mtx_unlock_spin(&umtx_lock);
1879 			error = EINVAL;
1880 			goto out;
1881 		}
1882 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1883 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1884 			thread_lock(td);
1885 			if (uq->uq_inherited_pri < UPRI(td))
1886 				sched_lend_user_prio(td, uq->uq_inherited_pri);
1887 			thread_unlock(td);
1888 		}
1889 		mtx_unlock_spin(&umtx_lock);
1890 
1891 		owner = casuword32(&m->m_owner,
1892 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1893 
1894 		if (owner == UMUTEX_CONTESTED) {
1895 			error = 0;
1896 			break;
1897 		}
1898 
1899 		/* The address was invalid. */
1900 		if (owner == -1) {
1901 			error = EFAULT;
1902 			break;
1903 		}
1904 
1905 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1906 		    (owner & ~UMUTEX_CONTESTED) == id) {
1907 			error = EDEADLK;
1908 			break;
1909 		}
1910 
1911 		if (try != 0) {
1912 			error = EBUSY;
1913 			break;
1914 		}
1915 
1916 		/*
1917 		 * If we caught a signal, we have already retried and now
1918 		 * exit immediately.
1919 		 */
1920 		if (error != 0)
1921 			break;
1922 
1923 		umtxq_lock(&uq->uq_key);
1924 		umtxq_insert(uq);
1925 		umtxq_unbusy(&uq->uq_key);
1926 		error = umtxq_sleep(uq, "umtxpp", timo);
1927 		umtxq_remove(uq);
1928 		umtxq_unlock(&uq->uq_key);
1929 
1930 		mtx_lock_spin(&umtx_lock);
1931 		uq->uq_inherited_pri = old_inherited_pri;
1932 		pri = PRI_MAX;
1933 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1934 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1935 			if (uq2 != NULL) {
1936 				if (pri > UPRI(uq2->uq_thread))
1937 					pri = UPRI(uq2->uq_thread);
1938 			}
1939 		}
1940 		if (pri > uq->uq_inherited_pri)
1941 			pri = uq->uq_inherited_pri;
1942 		thread_lock(td);
1943 		sched_unlend_user_prio(td, pri);
1944 		thread_unlock(td);
1945 		mtx_unlock_spin(&umtx_lock);
1946 	}
1947 
1948 	if (error != 0) {
1949 		mtx_lock_spin(&umtx_lock);
1950 		uq->uq_inherited_pri = old_inherited_pri;
1951 		pri = PRI_MAX;
1952 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1953 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1954 			if (uq2 != NULL) {
1955 				if (pri > UPRI(uq2->uq_thread))
1956 					pri = UPRI(uq2->uq_thread);
1957 			}
1958 		}
1959 		if (pri > uq->uq_inherited_pri)
1960 			pri = uq->uq_inherited_pri;
1961 		thread_lock(td);
1962 		sched_unlend_user_prio(td, pri);
1963 		thread_unlock(td);
1964 		mtx_unlock_spin(&umtx_lock);
1965 	}
1966 
1967 out:
1968 	umtxq_lock(&uq->uq_key);
1969 	umtxq_unbusy(&uq->uq_key);
1970 	umtxq_unlock(&uq->uq_key);
1971 	umtx_key_release(&uq->uq_key);
1972 	return (error);
1973 }
1974 
1975 /*
1976  * Unlock a PP mutex.
1977  */
1978 static int
1979 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
1980 {
1981 	struct umtx_key key;
1982 	struct umtx_q *uq, *uq2;
1983 	struct umtx_pi *pi;
1984 	uint32_t owner, id;
1985 	uint32_t rceiling;
1986 	int error, pri, new_inherited_pri, su;
1987 
1988 	id = td->td_tid;
1989 	uq = td->td_umtxq;
1990 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1991 
1992 	/*
1993 	 * Make sure we own this mtx.
1994 	 */
1995 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1996 	if (owner == -1)
1997 		return (EFAULT);
1998 
1999 	if ((owner & ~UMUTEX_CONTESTED) != id)
2000 		return (EPERM);
2001 
2002 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2003 	if (error != 0)
2004 		return (error);
2005 
2006 	if (rceiling == -1)
2007 		new_inherited_pri = PRI_MAX;
2008 	else {
2009 		rceiling = RTP_PRIO_MAX - rceiling;
2010 		if (rceiling > RTP_PRIO_MAX)
2011 			return (EINVAL);
2012 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2013 	}
2014 
2015 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2016 	    &key)) != 0)
2017 		return (error);
2018 	umtxq_lock(&key);
2019 	umtxq_busy(&key);
2020 	umtxq_unlock(&key);
2021 	/*
2022 	 * For a priority-protected mutex, always set the unlocked state
2023 	 * to UMUTEX_CONTESTED, so that userland always enters the kernel
2024 	 * to lock the mutex; this is necessary because thread priority
2025 	 * has to be adjusted for such a mutex.
2026 	 */
2027 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2028 		UMUTEX_CONTESTED);
2029 
2030 	umtxq_lock(&key);
2031 	if (error == 0)
2032 		umtxq_signal(&key, 1);
2033 	umtxq_unbusy(&key);
2034 	umtxq_unlock(&key);
2035 
2036 	if (error == -1)
2037 		error = EFAULT;
2038 	else {
2039 		mtx_lock_spin(&umtx_lock);
2040 		if (su != 0)
2041 			uq->uq_inherited_pri = new_inherited_pri;
2042 		pri = PRI_MAX;
2043 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2044 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2045 			if (uq2 != NULL) {
2046 				if (pri > UPRI(uq2->uq_thread))
2047 					pri = UPRI(uq2->uq_thread);
2048 			}
2049 		}
2050 		if (pri > uq->uq_inherited_pri)
2051 			pri = uq->uq_inherited_pri;
2052 		thread_lock(td);
2053 		sched_unlend_user_prio(td, pri);
2054 		thread_unlock(td);
2055 		mtx_unlock_spin(&umtx_lock);
2056 	}
2057 	umtx_key_release(&key);
2058 	return (error);
2059 }
2060 
2061 static int
2062 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2063 	uint32_t *old_ceiling)
2064 {
2065 	struct umtx_q *uq;
2066 	uint32_t save_ceiling;
2067 	uint32_t owner, id;
2068 	uint32_t flags;
2069 	int error;
2070 
2071 	flags = fuword32(&m->m_flags);
2072 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2073 		return (EINVAL);
2074 	if (ceiling > RTP_PRIO_MAX)
2075 		return (EINVAL);
2076 	id = td->td_tid;
2077 	uq = td->td_umtxq;
2078 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2079 	   &uq->uq_key)) != 0)
2080 		return (error);
2081 	for (;;) {
2082 		umtxq_lock(&uq->uq_key);
2083 		umtxq_busy(&uq->uq_key);
2084 		umtxq_unlock(&uq->uq_key);
2085 
2086 		save_ceiling = fuword32(&m->m_ceilings[0]);
2087 
2088 		owner = casuword32(&m->m_owner,
2089 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2090 
2091 		if (owner == UMUTEX_CONTESTED) {
2092 			suword32(&m->m_ceilings[0], ceiling);
2093 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2094 				UMUTEX_CONTESTED);
2095 			error = 0;
2096 			break;
2097 		}
2098 
2099 		/* The address was invalid. */
2100 		if (owner == -1) {
2101 			error = EFAULT;
2102 			break;
2103 		}
2104 
2105 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2106 			suword32(&m->m_ceilings[0], ceiling);
2107 			error = 0;
2108 			break;
2109 		}
2110 
2111 		/*
2112 		 * If we caught a signal, we have already retried
2113 		 * the acquire once, so exit immediately now.
2114 		 */
2115 		if (error != 0)
2116 			break;
2117 
2118 		/*
2119 		 * We set the contested bit, so sleep.  Otherwise the lock
2120 		 * changed and we need to retry, or we lost a race with the
2121 		 * thread unlocking the umtx.
2122 		 */
2123 		umtxq_lock(&uq->uq_key);
2124 		umtxq_insert(uq);
2125 		umtxq_unbusy(&uq->uq_key);
2126 		error = umtxq_sleep(uq, "umtxpp", 0);
2127 		umtxq_remove(uq);
2128 		umtxq_unlock(&uq->uq_key);
2129 	}
2130 	umtxq_lock(&uq->uq_key);
2131 	if (error == 0)
2132 		umtxq_signal(&uq->uq_key, INT_MAX);
2133 	umtxq_unbusy(&uq->uq_key);
2134 	umtxq_unlock(&uq->uq_key);
2135 	umtx_key_release(&uq->uq_key);
2136 	if (error == 0 && old_ceiling != NULL)
2137 		suword32(old_ceiling, save_ceiling);
2138 	return (error);
2139 }
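
/*
 * Illustrative sketch (not part of the kernel): how userland might reach
 * do_set_ceiling() through the _umtx_op(2) entry point.  As the
 * __umtx_op_set_ceiling() wrapper below shows, "val" carries the new
 * ceiling and "uaddr1" optionally receives the previous one:
 *
 *	#include <sys/umtx.h>
 *
 *	uint32_t old_ceiling;
 *	if (_umtx_op(m, UMTX_OP_SET_CEILING, new_ceiling,
 *	    &old_ceiling, NULL) == -1)
 *		err(1, "UMTX_OP_SET_CEILING");
 *
 * On success the kernel wakes all waiters (umtxq_signal with INT_MAX)
 * so they re-evaluate the mutex under the new ceiling.
 */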
2140 
2141 static int
2142 _do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2143 	int try)
2144 {
2145 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2146 	case 0:
2147 		return (_do_lock_normal(td, m, flags, timo, try));
2148 	case UMUTEX_PRIO_INHERIT:
2149 		return (_do_lock_pi(td, m, flags, timo, try));
2150 	case UMUTEX_PRIO_PROTECT:
2151 		return (_do_lock_pp(td, m, flags, timo, try));
2152 	}
2153 	return (EINVAL);
2154 }
2155 
2156 /*
2157  * Lock a userland POSIX mutex.
2158  */
2159 static int
2160 do_lock_umutex(struct thread *td, struct umutex *m,
2161 	struct timespec *timeout, int try)
2162 {
2163 	struct timespec ts, ts2, ts3;
2164 	struct timeval tv;
2165 	uint32_t flags;
2166 	int error;
2167 
2168 	flags = fuword32(&m->m_flags);
2169 	if (flags == -1)
2170 		return (EFAULT);
2171 
2172 	if (timeout == NULL) {
2173 		error = _do_lock_umutex(td, m, flags, 0, try);
2174 		/* Mutex locking is restarted if it is interrupted. */
2175 		if (error == EINTR)
2176 			error = ERESTART;
2177 	} else {
2178 		getnanouptime(&ts);
2179 		timespecadd(&ts, timeout);
2180 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2181 		for (;;) {
2182 			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
2183 			if (error != ETIMEDOUT)
2184 				break;
2185 			getnanouptime(&ts2);
2186 			if (timespeccmp(&ts2, &ts, >=)) {
2187 				error = ETIMEDOUT;
2188 				break;
2189 			}
2190 			ts3 = ts;
2191 			timespecsub(&ts3, &ts2);
2192 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2193 		}
2194 		/* Timed-locking is not restarted. */
2195 		if (error == ERESTART)
2196 			error = EINTR;
2197 	}
2198 	return (error);
2199 }
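
/*
 * Illustrative sketch (not part of the kernel): the timeout a caller
 * passes in is relative; the loop above converts it to an absolute
 * uptime deadline and re-arms the sleep after each premature wakeup:
 *
 *	struct timespec timo = { 1, 0 };	(one second, relative)
 *	error = _umtx_op(m, UMTX_OP_MUTEX_LOCK, 0, NULL, &timo);
 *
 * Note the asymmetry: an untimed lock restarts transparently after a
 * signal (EINTR is mapped to ERESTART above), while a timed lock
 * reports EINTR, since restarting the syscall would re-arm the full
 * timeout.
 */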
2200 
2201 /*
2202  * Unlock a userland POSIX mutex.
2203  */
2204 static int
2205 do_unlock_umutex(struct thread *td, struct umutex *m)
2206 {
2207 	uint32_t flags;
2208 
2209 	flags = fuword32(&m->m_flags);
2210 	if (flags == -1)
2211 		return (EFAULT);
2212 
2213 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2214 	case 0:
2215 		return (do_unlock_normal(td, m, flags));
2216 	case UMUTEX_PRIO_INHERIT:
2217 		return (do_unlock_pi(td, m, flags));
2218 	case UMUTEX_PRIO_PROTECT:
2219 		return (do_unlock_pp(td, m, flags));
2220 	}
2221 
2222 	return (EINVAL);
2223 }
2224 
2225 static int
2226 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2227 	struct timespec *timeout, u_long wflags)
2228 {
2229 	struct umtx_q *uq;
2230 	struct timeval tv;
2231 	struct timespec cts, ets, tts;
2232 	uint32_t flags;
2233 	int error;
2234 
2235 	uq = td->td_umtxq;
2236 	flags = fuword32(&cv->c_flags);
2237 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2238 	if (error != 0)
2239 		return (error);
2240 	umtxq_lock(&uq->uq_key);
2241 	umtxq_busy(&uq->uq_key);
2242 	umtxq_insert(uq);
2243 	umtxq_unlock(&uq->uq_key);
2244 
2245 	/*
2246 	 * The crucial point: c_has_waiters must be set to 1 before the
2247 	 * user mutex is released.  (See the sketch after this function.)
2248 	 */
2249 	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2250 
2251 	umtxq_lock(&uq->uq_key);
2252 	umtxq_unbusy(&uq->uq_key);
2253 	umtxq_unlock(&uq->uq_key);
2254 
2255 	error = do_unlock_umutex(td, m);
2256 
2257 	umtxq_lock(&uq->uq_key);
2258 	if (error == 0) {
2259 		if ((wflags & UMTX_CHECK_UNPARKING) &&
2260 		    (td->td_pflags & TDP_WAKEUP)) {
2261 			td->td_pflags &= ~TDP_WAKEUP;
2262 			error = EINTR;
2263 		} else if (timeout == NULL) {
2264 			error = umtxq_sleep(uq, "ucond", 0);
2265 		} else {
2266 			getnanouptime(&ets);
2267 			timespecadd(&ets, timeout);
2268 			TIMESPEC_TO_TIMEVAL(&tv, timeout);
2269 			for (;;) {
2270 				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2271 				if (error != ETIMEDOUT)
2272 					break;
2273 				getnanouptime(&cts);
2274 				if (timespeccmp(&cts, &ets, >=)) {
2275 					error = ETIMEDOUT;
2276 					break;
2277 				}
2278 				tts = ets;
2279 				timespecsub(&tts, &cts);
2280 				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2281 			}
2282 		}
2283 	}
2284 
2285 	if (error != 0) {
2286 		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2287 			/*
2288 			 * If we were concurrently woken by do_cv_signal()
2289 			 * but also hit an error, a UNIX signal, or a
2290 			 * timeout, perform another umtxq_signal to avoid
2291 			 * consuming the wakeup.  This may cause a spurious
2292 			 * wakeup of another thread which was just queued,
2293 			 * but SUSv3 explicitly allows spurious wakeups to
2294 			 * occur, and indeed a kernel-based implementation
2295 			 * cannot avoid them.
2296 			 */
2297 			if (!umtxq_signal(&uq->uq_key, 1))
2298 				error = 0;
2299 		}
2300 		if (error == ERESTART)
2301 			error = EINTR;
2302 	}
2303 	umtxq_remove(uq);
2304 	umtxq_unlock(&uq->uq_key);
2305 	umtx_key_release(&uq->uq_key);
2306 	return (error);
2307 }
2308 
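/*
 * Illustrative sketch (not part of the kernel): a userland condition
 * variable built on the operations above.  Because do_cv_wait() sets
 * c_has_waiters before the mutex is dropped, a signalling thread may
 * skip the syscall entirely when nobody waits:
 *
 *	cv_wait:   _umtx_op(cv, UMTX_OP_CV_WAIT, wflags, m, tsp);
 *	cv_signal: if (cv->c_has_waiters)
 *	                _umtx_op(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL);
 *
 * The ordering matters: if c_has_waiters were only set after the mutex
 * was released, a signaller running in that window could read 0, skip
 * the wakeup, and the waiter would block forever.
 */
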
2309 /*
2310  * Signal a userland condition variable.
2311  */
2312 static int
2313 do_cv_signal(struct thread *td, struct ucond *cv)
2314 {
2315 	struct umtx_key key;
2316 	int error, cnt, nwake;
2317 	uint32_t flags;
2318 
2319 	flags = fuword32(&cv->c_flags);
2320 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2321 		return (error);
2322 	umtxq_lock(&key);
2323 	umtxq_busy(&key);
2324 	cnt = umtxq_count(&key);
2325 	nwake = umtxq_signal(&key, 1);
2326 	if (cnt <= nwake) {
2327 		umtxq_unlock(&key);
2328 		error = suword32(
2329 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2330 		umtxq_lock(&key);
2331 	}
2332 	umtxq_unbusy(&key);
2333 	umtxq_unlock(&key);
2334 	umtx_key_release(&key);
2335 	return (error);
2336 }
2337 
2338 static int
2339 do_cv_broadcast(struct thread *td, struct ucond *cv)
2340 {
2341 	struct umtx_key key;
2342 	int error;
2343 	uint32_t flags;
2344 
2345 	flags = fuword32(&cv->c_flags);
2346 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2347 		return (error);
2348 
2349 	umtxq_lock(&key);
2350 	umtxq_busy(&key);
2351 	umtxq_signal(&key, INT_MAX);
2352 	umtxq_unlock(&key);
2353 
2354 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2355 
2356 	umtxq_lock(&key);
2357 	umtxq_unbusy(&key);
2358 	umtxq_unlock(&key);
2359 
2360 	umtx_key_release(&key);
2361 	return (error);
2362 }
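
/*
 * Note on the ordering above: c_has_waiters is cleared while the chain
 * is still marked busy, so no new waiter can be queued between the
 * broadcast and the store.  do_cv_signal() is more conservative and
 * clears the flag only when the queue count shows that every waiter
 * has been woken.
 */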
2363 
2364 static int
2365 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2366 {
2367 	struct umtx_q *uq;
2368 	uint32_t flags, wrflags;
2369 	int32_t state, oldstate;
2370 	int32_t blocked_readers;
2371 	int error;
2372 
2373 	uq = td->td_umtxq;
2374 	flags = fuword32(&rwlock->rw_flags);
2375 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2376 	if (error != 0)
2377 		return (error);
2378 
2379 	wrflags = URWLOCK_WRITE_OWNER;
2380 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2381 		wrflags |= URWLOCK_WRITE_WAITERS;
2382 
2383 	for (;;) {
2384 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2385 		/* try to lock it */
2386 		while (!(state & wrflags)) {
2387 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2388 				umtx_key_release(&uq->uq_key);
2389 				return (EAGAIN);
2390 			}
2391 			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2392 			if (oldstate == state) {
2393 				umtx_key_release(&uq->uq_key);
2394 				return (0);
2395 			}
2396 			state = oldstate;
2397 		}
2398 
2399 		if (error)
2400 			break;
2401 
2402 		/* grab monitor lock */
2403 		umtxq_lock(&uq->uq_key);
2404 		umtxq_busy(&uq->uq_key);
2405 		umtxq_unlock(&uq->uq_key);
2406 
2407 		/* set read contention bit */
2408 		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2409 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2410 			if (oldstate == state)
2411 				goto sleep;
2412 			state = oldstate;
2413 		}
2414 
2415 		/* The state changed while we were setting the flags; restart. */
2416 		if (!(state & wrflags)) {
2417 			umtxq_lock(&uq->uq_key);
2418 			umtxq_unbusy(&uq->uq_key);
2419 			umtxq_unlock(&uq->uq_key);
2420 			continue;
2421 		}
2422 
2423 sleep:
2424 		/* Contention bit is set; increase the read waiter count before sleeping. */
2425 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2426 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2427 
2428 		while (state & wrflags) {
2429 			umtxq_lock(&uq->uq_key);
2430 			umtxq_insert(uq);
2431 			umtxq_unbusy(&uq->uq_key);
2432 
2433 			error = umtxq_sleep(uq, "urdlck", timo);
2434 
2435 			umtxq_busy(&uq->uq_key);
2436 			umtxq_remove(uq);
2437 			umtxq_unlock(&uq->uq_key);
2438 			if (error)
2439 				break;
2440 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2441 		}
2442 
2443 		/* Decrease the read waiter count; the last waiter clears the contention bit. */
2444 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2445 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2446 		if (blocked_readers == 1) {
2447 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2448 			for (;;) {
2449 				oldstate = casuword32(&rwlock->rw_state, state,
2450 					 state & ~URWLOCK_READ_WAITERS);
2451 				if (oldstate == state)
2452 					break;
2453 				state = oldstate;
2454 			}
2455 		}
2456 
2457 		umtxq_lock(&uq->uq_key);
2458 		umtxq_unbusy(&uq->uq_key);
2459 		umtxq_unlock(&uq->uq_key);
2460 	}
2461 	umtx_key_release(&uq->uq_key);
2462 	return (error);
2463 }
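
/*
 * Illustrative sketch (not part of the kernel): rw_state packs the
 * reader count into its low bits next to the URWLOCK_WRITE_OWNER,
 * URWLOCK_WRITE_WAITERS and URWLOCK_READ_WAITERS flags, so userland
 * can take a read lock with a single compare-and-set and fall back to
 * UMTX_OP_RW_RDLOCK only on contention:
 *
 *	int32_t state = rwlock->rw_state;
 *	if (!(state & URWLOCK_WRITE_OWNER) &&
 *	    URWLOCK_READER_COUNT(state) < URWLOCK_MAX_READERS &&
 *	    atomic_cmpset_acq_32((volatile uint32_t *)&rwlock->rw_state,
 *	    state, state + 1))
 *		return (0);		(fast path, no syscall)
 *	return (_umtx_op(rwlock, UMTX_OP_RW_RDLOCK, fflag, NULL, tsp));
 *
 * "state + 1" is the reader-count increment, the same trick the kernel
 * loop above uses.
 */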
2464 
2465 static int
2466 do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2467 {
2468 	struct timespec ts, ts2, ts3;
2469 	struct timeval tv;
2470 	int error;
2471 
2472 	getnanouptime(&ts);
2473 	timespecadd(&ts, timeout);
2474 	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2475 	for (;;) {
2476 		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2477 		if (error != ETIMEDOUT)
2478 			break;
2479 		getnanouptime(&ts2);
2480 		if (timespeccmp(&ts2, &ts, >=)) {
2481 			error = ETIMEDOUT;
2482 			break;
2483 		}
2484 		ts3 = ts;
2485 		timespecsub(&ts3, &ts2);
2486 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2487 	}
2488 	if (error == ERESTART)
2489 		error = EINTR;
2490 	return (error);
2491 }
2492 
2493 static int
2494 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2495 {
2496 	struct umtx_q *uq;
2497 	uint32_t flags;
2498 	int32_t state, oldstate;
2499 	int32_t blocked_writers;
2500 	int error;
2501 
2502 	uq = td->td_umtxq;
2503 	flags = fuword32(&rwlock->rw_flags);
2504 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2505 	if (error != 0)
2506 		return (error);
2507 
2508 	for (;;) {
2509 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2510 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2511 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2512 			if (oldstate == state) {
2513 				umtx_key_release(&uq->uq_key);
2514 				return (0);
2515 			}
2516 			state = oldstate;
2517 		}
2518 
2519 		if (error)
2520 			break;
2521 
2522 		/* grab monitor lock */
2523 		umtxq_lock(&uq->uq_key);
2524 		umtxq_busy(&uq->uq_key);
2525 		umtxq_unlock(&uq->uq_key);
2526 
2527 		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2528 		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2529 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2530 			if (oldstate == state)
2531 				goto sleep;
2532 			state = oldstate;
2533 		}
2534 
2535 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2536 			umtxq_lock(&uq->uq_key);
2537 			umtxq_unbusy(&uq->uq_key);
2538 			umtxq_unlock(&uq->uq_key);
2539 			continue;
2540 		}
2541 sleep:
2542 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2543 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2544 
2545 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2546 			umtxq_lock(&uq->uq_key);
2547 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2548 			umtxq_unbusy(&uq->uq_key);
2549 
2550 			error = umtxq_sleep(uq, "uwrlck", timo);
2551 
2552 			umtxq_busy(&uq->uq_key);
2553 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2554 			umtxq_unlock(&uq->uq_key);
2555 			if (error)
2556 				break;
2557 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2558 		}
2559 
2560 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2561 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2562 		if (blocked_writers == 1) {
2563 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2564 			for (;;) {
2565 				oldstate = casuword32(&rwlock->rw_state, state,
2566 					 state & ~URWLOCK_WRITE_WAITERS);
2567 				if (oldstate == state)
2568 					break;
2569 				state = oldstate;
2570 			}
2571 		}
2572 
2573 		umtxq_lock(&uq->uq_key);
2574 		umtxq_unbusy(&uq->uq_key);
2575 		umtxq_unlock(&uq->uq_key);
2576 	}
2577 
2578 	umtx_key_release(&uq->uq_key);
2579 	return (error);
2580 }
2581 
2582 static int
2583 do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2584 {
2585 	struct timespec ts, ts2, ts3;
2586 	struct timeval tv;
2587 	int error;
2588 
2589 	getnanouptime(&ts);
2590 	timespecadd(&ts, timeout);
2591 	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2592 	for (;;) {
2593 		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2594 		if (error != ETIMEDOUT)
2595 			break;
2596 		getnanouptime(&ts2);
2597 		if (timespeccmp(&ts2, &ts, >=)) {
2598 			error = ETIMEDOUT;
2599 			break;
2600 		}
2601 		ts3 = ts;
2602 		timespecsub(&ts3, &ts2);
2603 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2604 	}
2605 	if (error == ERESTART)
2606 		error = EINTR;
2607 	return (error);
2608 }
2609 
2610 static int
2611 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2612 {
2613 	struct umtx_q *uq;
2614 	uint32_t flags;
2615 	int32_t state, oldstate;
2616 	int error, q, count;
2617 
2618 	uq = td->td_umtxq;
2619 	flags = fuword32(&rwlock->rw_flags);
2620 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2621 	if (error != 0)
2622 		return (error);
2623 
2624 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2625 	if (state & URWLOCK_WRITE_OWNER) {
2626 		for (;;) {
2627 			oldstate = casuword32(&rwlock->rw_state, state,
2628 				state & ~URWLOCK_WRITE_OWNER);
2629 			if (oldstate != state) {
2630 				state = oldstate;
2631 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2632 					error = EPERM;
2633 					goto out;
2634 				}
2635 			} else
2636 				break;
2637 		}
2638 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2639 		for (;;) {
2640 			oldstate = casuword32(&rwlock->rw_state, state,
2641 				state - 1);
2642 			if (oldstate != state) {
2643 				state = oldstate;
2644 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2645 					error = EPERM;
2646 					goto out;
2647 				}
2648 			}
2649 			else
2650 				break;
2651 		}
2652 	} else {
2653 		error = EPERM;
2654 		goto out;
2655 	}
2656 
2657 	count = 0;
2658 
2659 	if (!(flags & URWLOCK_PREFER_READER)) {
2660 		if (state & URWLOCK_WRITE_WAITERS) {
2661 			count = 1;
2662 			q = UMTX_EXCLUSIVE_QUEUE;
2663 		} else if (state & URWLOCK_READ_WAITERS) {
2664 			count = INT_MAX;
2665 			q = UMTX_SHARED_QUEUE;
2666 		}
2667 	} else {
2668 		if (state & URWLOCK_READ_WAITERS) {
2669 			count = INT_MAX;
2670 			q = UMTX_SHARED_QUEUE;
2671 		} else if (state & URWLOCK_WRITE_WAITERS) {
2672 			count = 1;
2673 			q = UMTX_EXCLUSIVE_QUEUE;
2674 		}
2675 	}
2676 
2677 	if (count) {
2678 		umtxq_lock(&uq->uq_key);
2679 		umtxq_busy(&uq->uq_key);
2680 		umtxq_signal_queue(&uq->uq_key, count, q);
2681 		umtxq_unbusy(&uq->uq_key);
2682 		umtxq_unlock(&uq->uq_key);
2683 	}
2684 out:
2685 	umtx_key_release(&uq->uq_key);
2686 	return (error);
2687 }
2688 
2689 int
2690 _umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2691     /* struct umtx *umtx */
2692 {
2693 	return (_do_lock_umtx(td, uap->umtx, td->td_tid, 0));
2694 }
2695 
2696 int
2697 _umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2698     /* struct umtx *umtx */
2699 {
2700 	return (do_unlock_umtx(td, uap->umtx, td->td_tid));
2701 }
2702 
2703 static int
2704 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2705 {
2706 	struct timespec *ts, timeout;
2707 	int error;
2708 
2709 	/* Allow a null timespec (wait forever). */
2710 	if (uap->uaddr2 == NULL)
2711 		ts = NULL;
2712 	else {
2713 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2714 		if (error != 0)
2715 			return (error);
2716 		if (timeout.tv_nsec >= 1000000000 ||
2717 		    timeout.tv_nsec < 0) {
2718 			return (EINVAL);
2719 		}
2720 		ts = &timeout;
2721 	}
2722 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2723 }
2724 
2725 static int
2726 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2727 {
2728 	return (do_unlock_umtx(td, uap->obj, uap->val));
2729 }
2730 
2731 static int
2732 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2733 {
2734 	struct timespec *ts, timeout;
2735 	int error;
2736 
2737 	if (uap->uaddr2 == NULL)
2738 		ts = NULL;
2739 	else {
2740 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2741 		if (error != 0)
2742 			return (error);
2743 		if (timeout.tv_nsec >= 1000000000 ||
2744 		    timeout.tv_nsec < 0)
2745 			return (EINVAL);
2746 		ts = &timeout;
2747 	}
2748 	return (do_wait(td, uap->obj, uap->val, ts, 0));
2749 }
2750 
2751 static int
2752 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2753 {
2754 	struct timespec *ts, timeout;
2755 	int error;
2756 
2757 	if (uap->uaddr2 == NULL)
2758 		ts = NULL;
2759 	else {
2760 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2761 		if (error != 0)
2762 			return (error);
2763 		if (timeout.tv_nsec >= 1000000000 ||
2764 		    timeout.tv_nsec < 0)
2765 			return (EINVAL);
2766 		ts = &timeout;
2767 	}
2768 	return (do_wait(td, uap->obj, uap->val, ts, 1));
2769 }
2770 
2771 static int
2772 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2773 {
2774 	return (kern_umtx_wake(td, uap->obj, uap->val));
2775 }
2776 
2777 static int
2778 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2779 {
2780 	struct timespec *ts, timeout;
2781 	int error;
2782 
2783 	/* Allow a null timespec (wait forever). */
2784 	if (uap->uaddr2 == NULL)
2785 		ts = NULL;
2786 	else {
2787 		error = copyin(uap->uaddr2, &timeout,
2788 		    sizeof(timeout));
2789 		if (error != 0)
2790 			return (error);
2791 		if (timeout.tv_nsec >= 1000000000 ||
2792 		    timeout.tv_nsec < 0) {
2793 			return (EINVAL);
2794 		}
2795 		ts = &timeout;
2796 	}
2797 	return (do_lock_umutex(td, uap->obj, ts, 0));
2798 }
2799 
2800 static int
2801 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2802 {
2803 	return (do_lock_umutex(td, uap->obj, NULL, 1));
2804 }
2805 
2806 static int
2807 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2808 {
2809 	return (do_unlock_umutex(td, uap->obj));
2810 }
2811 
2812 static int
2813 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2814 {
2815 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
2816 }
2817 
2818 static int
2819 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
2820 {
2821 	struct timespec *ts, timeout;
2822 	int error;
2823 
2824 	/* Allow a null timespec (wait forever). */
2825 	if (uap->uaddr2 == NULL)
2826 		ts = NULL;
2827 	else {
2828 		error = copyin(uap->uaddr2, &timeout,
2829 		    sizeof(timeout));
2830 		if (error != 0)
2831 			return (error);
2832 		if (timeout.tv_nsec >= 1000000000 ||
2833 		    timeout.tv_nsec < 0) {
2834 			return (EINVAL);
2835 		}
2836 		ts = &timeout;
2837 	}
2838 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
2839 }
2840 
2841 static int
2842 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
2843 {
2844 	return (do_cv_signal(td, uap->obj));
2845 }
2846 
2847 static int
2848 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
2849 {
2850 	return (do_cv_broadcast(td, uap->obj));
2851 }
2852 
2853 static int
2854 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
2855 {
2856 	struct timespec timeout;
2857 	int error;
2858 
2859 	/* Allow a null timespec (wait forever). */
2860 	if (uap->uaddr2 == NULL) {
2861 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
2862 	} else {
2863 		error = copyin(uap->uaddr2, &timeout,
2864 		    sizeof(timeout));
2865 		if (error != 0)
2866 			return (error);
2867 		if (timeout.tv_nsec >= 1000000000 ||
2868 		    timeout.tv_nsec < 0) {
2869 			return (EINVAL);
2870 		}
2871 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
2872 	}
2873 	return (error);
2874 }
2875 
2876 static int
2877 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
2878 {
2879 	struct timespec timeout;
2880 	int error;
2881 
2882 	/* Allow a null timespec (wait forever). */
2883 	if (uap->uaddr2 == NULL) {
2884 		error = do_rw_wrlock(td, uap->obj, 0);
2885 	} else {
2886 		error = copyin(uap->uaddr2, &timeout,
2887 		    sizeof(timeout));
2888 		if (error != 0)
2889 			return (error);
2890 		if (timeout.tv_nsec >= 1000000000 ||
2891 		    timeout.tv_nsec < 0) {
2892 			return (EINVAL);
2893 		}
2894 
2895 		error = do_rw_wrlock2(td, uap->obj, &timeout);
2896 	}
2897 	return (error);
2898 }
2899 
2900 static int
2901 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
2902 {
2903 	return (do_rw_unlock(td, uap->obj));
2904 }
2905 
2906 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
2907 
2908 static _umtx_op_func op_table[] = {
2909 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
2910 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
2911 	__umtx_op_wait,			/* UMTX_OP_WAIT */
2912 	__umtx_op_wake,			/* UMTX_OP_WAKE */
2913 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2914 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
2915 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
2916 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
2917 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
2918 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
2919 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
2920 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
2921 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
2922 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
2923 	__umtx_op_rw_unlock		/* UMTX_OP_RW_UNLOCK */
2924 };
2925 
2926 int
2927 _umtx_op(struct thread *td, struct _umtx_op_args *uap)
2928 {
2929 	if ((unsigned)uap->op < UMTX_OP_MAX)
2930 		return (*op_table[uap->op])(td, uap);
2931 	return (EINVAL);
2932 }
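
/*
 * Illustrative sketch (not part of the kernel): userland reaches the
 * table above through the _umtx_op(2) system call, e.g. a try-lock of
 * a process-private mutex (assuming the usual zero-initialized,
 * unowned umutex):
 *
 *	#include <sys/types.h>
 *	#include <sys/umtx.h>
 *
 *	struct umutex m = { .m_owner = UMUTEX_UNOWNED };
 *	if (_umtx_op(&m, UMTX_OP_MUTEX_TRYLOCK, 0, NULL, NULL) == -1)
 *		warn("mutex is busy");
 *
 * "obj" is always the userland synchronization object; the meaning of
 * val, uaddr1 and uaddr2 depends on the op, as the wrappers above show.
 */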
2933 
2934 #ifdef COMPAT_IA32
2935 int
2936 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
2937     /* struct umtx *umtx */
2938 {
2939 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
2940 }
2941 
2942 int
2943 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
2944     /* struct umtx *umtx */
2945 {
2946 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
2947 }
2948 
2949 struct timespec32 {
2950 	u_int32_t tv_sec;
2951 	u_int32_t tv_nsec;
2952 };
2953 
2954 static inline int
2955 copyin_timeout32(void *addr, struct timespec *tsp)
2956 {
2957 	struct timespec32 ts32;
2958 	int error;
2959 
2960 	error = copyin(addr, &ts32, sizeof(struct timespec32));
2961 	if (error == 0) {
2962 		tsp->tv_sec = ts32.tv_sec;
2963 		tsp->tv_nsec = ts32.tv_nsec;
2964 	}
2965 	return (error);
2966 }
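
/*
 * Note: both fields of timespec32 are unsigned, so a "negative" tv_nsec
 * from a 32-bit caller arrives here as a large positive value and is
 * still rejected by the tv_nsec >= 1000000000 checks in the wrappers
 * below; the widening to the native timespec itself is lossless.
 */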
2967 
2968 static int
2969 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2970 {
2971 	struct timespec *ts, timeout;
2972 	int error;
2973 
2974 	/* Allow a null timespec (wait forever). */
2975 	if (uap->uaddr2 == NULL)
2976 		ts = NULL;
2977 	else {
2978 		error = copyin_timeout32(uap->uaddr2, &timeout);
2979 		if (error != 0)
2980 			return (error);
2981 		if (timeout.tv_nsec >= 1000000000 ||
2982 		    timeout.tv_nsec < 0) {
2983 			return (EINVAL);
2984 		}
2985 		ts = &timeout;
2986 	}
2987 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
2988 }
2989 
2990 static int
2991 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2992 {
2993 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
2994 }
2995 
2996 static int
2997 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2998 {
2999 	struct timespec *ts, timeout;
3000 	int error;
3001 
3002 	if (uap->uaddr2 == NULL)
3003 		ts = NULL;
3004 	else {
3005 		error = copyin_timeout32(uap->uaddr2, &timeout);
3006 		if (error != 0)
3007 			return (error);
3008 		if (timeout.tv_nsec >= 1000000000 ||
3009 		    timeout.tv_nsec < 0)
3010 			return (EINVAL);
3011 		ts = &timeout;
3012 	}
3013 	return (do_wait(td, uap->obj, uap->val, ts, 1));
3014 }
3015 
3016 static int
3017 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3018 {
3019 	struct timespec *ts, timeout;
3020 	int error;
3021 
3022 	/* Allow a null timespec (wait forever). */
3023 	if (uap->uaddr2 == NULL)
3024 		ts = NULL;
3025 	else {
3026 		error = copyin_timeout32(uap->uaddr2, &timeout);
3027 		if (error != 0)
3028 			return (error);
3029 		if (timeout.tv_nsec >= 1000000000 ||
3030 		    timeout.tv_nsec < 0)
3031 			return (EINVAL);
3032 		ts = &timeout;
3033 	}
3034 	return (do_lock_umutex(td, uap->obj, ts, 0));
3035 }
3036 
3037 static int
3038 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3039 {
3040 	struct timespec *ts, timeout;
3041 	int error;
3042 
3043 	/* Allow a null timespec (wait forever). */
3044 	if (uap->uaddr2 == NULL)
3045 		ts = NULL;
3046 	else {
3047 		error = copyin_timeout32(uap->uaddr2, &timeout);
3048 		if (error != 0)
3049 			return (error);
3050 		if (timeout.tv_nsec >= 1000000000 ||
3051 		    timeout.tv_nsec < 0)
3052 			return (EINVAL);
3053 		ts = &timeout;
3054 	}
3055 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3056 }
3057 
3058 static int
3059 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3060 {
3061 	struct timespec timeout;
3062 	int error;
3063 
3064 	/* Allow a null timespec (wait forever). */
3065 	if (uap->uaddr2 == NULL) {
3066 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3067 	} else {
3068 		error = copyin_timeout32(uap->uaddr2, &timeout);
3070 		if (error != 0)
3071 			return (error);
3072 		if (timeout.tv_nsec >= 1000000000 ||
3073 		    timeout.tv_nsec < 0) {
3074 			return (EINVAL);
3075 		}
3076 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3077 	}
3078 	return (error);
3079 }
3080 
3081 static int
3082 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3083 {
3084 	struct timespec timeout;
3085 	int error;
3086 
3087 	/* Allow a null timespec (wait forever). */
3088 	if (uap->uaddr2 == NULL) {
3089 		error = do_rw_wrlock(td, uap->obj, 0);
3090 	} else {
3091 		error = copyin_timeout32(uap->uaddr2, &timeout);
3092 		if (error != 0)
3093 			return (error);
3094 		if (timeout.tv_nsec >= 1000000000 ||
3095 		    timeout.tv_nsec < 0) {
3096 			return (EINVAL);
3097 		}
3098 
3099 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3100 	}
3101 	return (error);
3102 }
3103 
3104 static _umtx_op_func op_table_compat32[] = {
3105 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3106 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3107 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3108 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3109 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3110 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3111 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3112 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3113 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3114 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3115 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3116 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3117 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3118 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3119 	__umtx_op_rw_unlock		/* UMTX_OP_RW_UNLOCK */
3120 };
3121 
3122 int
3123 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3124 {
3125 	if ((unsigned)uap->op < UMTX_OP_MAX)
3126 		return (*op_table_compat32[uap->op])(td,
3127 			(struct _umtx_op_args *)uap);
3128 	return (EINVAL);
3129 }
3130 #endif
3131 
3132 void
3133 umtx_thread_init(struct thread *td)
3134 {
3135 	td->td_umtxq = umtxq_alloc();
3136 	td->td_umtxq->uq_thread = td;
3137 }
3138 
3139 void
3140 umtx_thread_fini(struct thread *td)
3141 {
3142 	umtxq_free(td->td_umtxq);
3143 }
3144 
3145 /*
3146  * Called when a new thread is created, e.g. by fork().
3147  */
3148 void
3149 umtx_thread_alloc(struct thread *td)
3150 {
3151 	struct umtx_q *uq;
3152 
3153 	uq = td->td_umtxq;
3154 	uq->uq_inherited_pri = PRI_MAX;
3155 
3156 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3157 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3158 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3159 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3160 }
3161 
3162 /*
3163  * exec() hook.
3164  */
3165 static void
3166 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3167 	struct image_params *imgp __unused)
3168 {
3169 	umtx_thread_cleanup(curthread);
3170 }
3171 
3172 /*
3173  * thread_exit() hook.
3174  */
3175 void
3176 umtx_thread_exit(struct thread *td)
3177 {
3178 	umtx_thread_cleanup(td);
3179 }
3180 
3181 /*
3182  * Clean up umtx data.
3183  */
3184 static void
3185 umtx_thread_cleanup(struct thread *td)
3186 {
3187 	struct umtx_q *uq;
3188 	struct umtx_pi *pi;
3189 
3190 	if ((uq = td->td_umtxq) == NULL)
3191 		return;
3192 
3193 	mtx_lock_spin(&umtx_lock);
3194 	uq->uq_inherited_pri = PRI_MAX;
3195 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3196 		pi->pi_owner = NULL;
3197 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3198 	}
3199 	thread_lock(td);
3200 	td->td_flags &= ~TDF_UBORROWING;
3201 	thread_unlock(td);
3202 	mtx_unlock_spin(&umtx_lock);
3203 }
3204