xref: /freebsd/sys/kern/kern_umtx.c (revision 70ed590b393173d4ea697be2a27054ed171f0c1a)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/mutex.h>
38 #include <sys/priv.h>
39 #include <sys/proc.h>
40 #include <sys/sched.h>
41 #include <sys/smp.h>
42 #include <sys/sysctl.h>
43 #include <sys/sysent.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/eventhandler.h>
47 #include <sys/umtx.h>
48 
49 #include <vm/vm.h>
50 #include <vm/vm_param.h>
51 #include <vm/pmap.h>
52 #include <vm/vm_map.h>
53 #include <vm/vm_object.h>
54 
55 #include <machine/cpu.h>
56 
57 #ifdef COMPAT_FREEBSD32
58 #include <compat/freebsd32/freebsd32_proto.h>
59 #endif
60 
61 enum {
62 	TYPE_SIMPLE_WAIT,
63 	TYPE_CV,
64 	TYPE_SEM,
65 	TYPE_SIMPLE_LOCK,
66 	TYPE_NORMAL_UMUTEX,
67 	TYPE_PI_UMUTEX,
68 	TYPE_PP_UMUTEX,
69 	TYPE_RWLOCK
70 };
71 
72 #define _UMUTEX_TRY		1
73 #define _UMUTEX_WAIT		2
74 
75 /* Key to represent a unique userland synchronization object */
76 struct umtx_key {
77 	int	hash;
78 	int	type;
79 	int	shared;
80 	union {
81 		struct {
82 			vm_object_t	object;
83 			uintptr_t	offset;
84 		} shared;
85 		struct {
86 			struct vmspace	*vs;
87 			uintptr_t	addr;
88 		} private;
89 		struct {
90 			void		*a;
91 			uintptr_t	b;
92 		} both;
93 	} info;
94 };
95 
96 /* Priority inheritance mutex info. */
97 struct umtx_pi {
98 	/* Owner thread */
99 	struct thread		*pi_owner;
100 
101 	/* Reference count */
102 	int			pi_refcount;
103 
104 	/* List entry to link this PI mutex into the owning thread's list */
105 	TAILQ_ENTRY(umtx_pi)	pi_link;
106 
107 	/* List entry in hash */
108 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
109 
110 	/* List for waiters */
111 	TAILQ_HEAD(,umtx_q)	pi_blocked;
112 
113 	/* Identify a userland lock object */
114 	struct umtx_key		pi_key;
115 };
116 
117 /* A userland synchronization object waiter. */
118 struct umtx_q {
119 	/* Linked list for the hash. */
120 	TAILQ_ENTRY(umtx_q)	uq_link;
121 
122 	/* Umtx key. */
123 	struct umtx_key		uq_key;
124 
125 	/* Umtx flags. */
126 	int			uq_flags;
127 #define UQF_UMTXQ	0x0001
128 
129 	/* The thread this entry belongs to. */
130 	struct thread		*uq_thread;
131 
132 	/*
133 	 * The PI mutex this thread is blocked on.  Reads may hold the
134 	 * chain lock or umtx_lock; writes must hold both the chain
135 	 * lock and umtx_lock.
136 	 */
137 	struct umtx_pi		*uq_pi_blocked;
138 
139 	/* On blocked list */
140 	TAILQ_ENTRY(umtx_q)	uq_lockq;
141 
142 	/* PI mutexes owned by this thread that other threads contend for */
143 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
144 
145 	/* Inherited priority from PP mutex */
146 	u_char			uq_inherited_pri;
147 
148 	/* Spare queue ready to be reused */
149 	struct umtxq_queue	*uq_spare_queue;
150 
151 	/* The queue we are on */
152 	struct umtxq_queue	*uq_cur_queue;
153 };
154 
155 TAILQ_HEAD(umtxq_head, umtx_q);
156 
157 /* Per-key wait-queue */
158 struct umtxq_queue {
159 	struct umtxq_head	head;
160 	struct umtx_key		key;
161 	LIST_ENTRY(umtxq_queue)	link;
162 	int			length;
163 };
164 
165 LIST_HEAD(umtxq_list, umtxq_queue);
166 
167 /* Userland lock object's wait-queue chain */
168 struct umtxq_chain {
169 	/* Lock for this chain. */
170 	struct mtx		uc_lock;
171 
172 	/* List of sleep queues. */
173 	struct umtxq_list	uc_queue[2];
174 #define UMTX_SHARED_QUEUE	0
175 #define UMTX_EXCLUSIVE_QUEUE	1
176 
177 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
178 
179 	/* Busy flag */
180 	char			uc_busy;
181 
182 	/* Chain lock waiters */
183 	int			uc_waiters;
184 
185 	/* All PI mutexes hashed to this chain */
186 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
187 
188 };
189 
190 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
191 #define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
192 
193 /*
194  * Don't propagate time-sharing priority; there is a security reason.
195  * A user could simply create a PI mutex, let thread A lock it, and
196  * let another thread B block on it.  Because B is sleeping, B's
197  * priority would be boosted, which would boost A's priority via
198  * propagation as well, and A's priority would then never be lowered
199  * even if A were using 100% CPU; this is unfair to other processes.
200  */
201 
202 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
203 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
204 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
205 
206 #define	GOLDEN_RATIO_PRIME	2654404609U
207 #define	UMTX_CHAINS		128
208 #define	UMTX_SHIFTS		(__WORD_BIT - 7)
209 
210 #define THREAD_SHARE		0
211 #define PROCESS_SHARE		1
212 #define AUTO_SHARE		2
213 
214 #define	GET_SHARE(flags)	\
215     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
216 
217 #define BUSY_SPINS		200
218 
219 static uma_zone_t		umtx_pi_zone;
220 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
221 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
222 static int			umtx_pi_allocated;
223 
224 SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
225 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
226     &umtx_pi_allocated, 0, "Allocated umtx_pi");
227 
228 static void umtxq_sysinit(void *);
229 static void umtxq_hash(struct umtx_key *key);
230 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
231 static void umtxq_lock(struct umtx_key *key);
232 static void umtxq_unlock(struct umtx_key *key);
233 static void umtxq_busy(struct umtx_key *key);
234 static void umtxq_unbusy(struct umtx_key *key);
235 static void umtxq_insert_queue(struct umtx_q *uq, int q);
236 static void umtxq_remove_queue(struct umtx_q *uq, int q);
237 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
238 static int umtxq_count(struct umtx_key *key);
239 static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
240 static int umtx_key_get(void *addr, int type, int share,
241 	struct umtx_key *key);
242 static void umtx_key_release(struct umtx_key *key);
243 static struct umtx_pi *umtx_pi_alloc(int);
244 static void umtx_pi_free(struct umtx_pi *pi);
245 static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
246 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
247 static void umtx_thread_cleanup(struct thread *td);
248 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
249 	struct image_params *imgp __unused);
250 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
251 
252 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
253 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
254 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
255 
256 static struct mtx umtx_lock;
257 
258 static void
259 umtxq_sysinit(void *arg __unused)
260 {
261 	int i, j;
262 
263 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
264 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
265 	for (i = 0; i < 2; ++i) {
266 		for (j = 0; j < UMTX_CHAINS; ++j) {
267 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
268 				 MTX_DEF | MTX_DUPOK);
269 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
270 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
271 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
272 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
273 			umtxq_chains[i][j].uc_busy = 0;
274 			umtxq_chains[i][j].uc_waiters = 0;
275 		}
276 	}
277 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
278 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
279 	    EVENTHANDLER_PRI_ANY);
280 }
281 
282 struct umtx_q *
283 umtxq_alloc(void)
284 {
285 	struct umtx_q *uq;
286 
287 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
288 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
289 	TAILQ_INIT(&uq->uq_spare_queue->head);
290 	TAILQ_INIT(&uq->uq_pi_contested);
291 	uq->uq_inherited_pri = PRI_MAX;
292 	return (uq);
293 }
294 
295 void
296 umtxq_free(struct umtx_q *uq)
297 {
298 	MPASS(uq->uq_spare_queue != NULL);
299 	free(uq->uq_spare_queue, M_UMTX);
300 	free(uq, M_UMTX);
301 }
302 
303 static inline void
304 umtxq_hash(struct umtx_key *key)
305 {
306 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
307 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
308 }
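
/*
 * Illustrative note (not new mechanism): the key is folded to a single
 * word and spread with a multiplicative (Fibonacci-style) hash,
 *
 *	hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
 *
 * so clustered userland addresses still distribute evenly over the
 * UMTX_CHAINS chains.
 */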
309 
310 static inline int
311 umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
312 {
313 	return (k1->type == k2->type &&
314 		k1->info.both.a == k2->info.both.a &&
315 	        k1->info.both.b == k2->info.both.b);
316 }
317 
318 static inline struct umtxq_chain *
319 umtxq_getchain(struct umtx_key *key)
320 {
321 	if (key->type <= TYPE_SEM)
322 		return (&umtxq_chains[1][key->hash]);
323 	return (&umtxq_chains[0][key->hash]);
324 }
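
/*
 * Note: types up to TYPE_SEM (waits, condvars, semaphores) hash into
 * the second chain array, keeping sleep-queue traffic apart from the
 * chains used by mutexes and rwlocks; the split is presumably meant to
 * reduce lock contention between the two classes of objects.
 */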
325 
326 /*
327  * Lock a chain.
328  */
329 static inline void
330 umtxq_lock(struct umtx_key *key)
331 {
332 	struct umtxq_chain *uc;
333 
334 	uc = umtxq_getchain(key);
335 	mtx_lock(&uc->uc_lock);
336 }
337 
338 /*
339  * Unlock a chain.
340  */
341 static inline void
342 umtxq_unlock(struct umtx_key *key)
343 {
344 	struct umtxq_chain *uc;
345 
346 	uc = umtxq_getchain(key);
347 	mtx_unlock(&uc->uc_lock);
348 }
349 
350 /*
351  * Set the chain to the busy state when the following operation
352  * may block (a kernel mutex cannot be held across it).
353  */
354 static inline void
355 umtxq_busy(struct umtx_key *key)
356 {
357 	struct umtxq_chain *uc;
358 
359 	uc = umtxq_getchain(key);
360 	mtx_assert(&uc->uc_lock, MA_OWNED);
361 	if (uc->uc_busy) {
362 #ifdef SMP
363 		if (smp_cpus > 1) {
364 			int count = BUSY_SPINS;
365 			if (count > 0) {
366 				umtxq_unlock(key);
367 				while (uc->uc_busy && --count > 0)
368 					cpu_spinwait();
369 				umtxq_lock(key);
370 			}
371 		}
372 #endif
373 		while (uc->uc_busy) {
374 			uc->uc_waiters++;
375 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
376 			uc->uc_waiters--;
377 		}
378 	}
379 	uc->uc_busy = 1;
380 }
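
/*
 * The spin in umtxq_busy() is adaptive: on an SMP machine the thread
 * holding the busy flag is likely running on another CPU, so spinning
 * up to BUSY_SPINS times (with the chain lock dropped) can avoid a
 * full sleep/wakeup round trip through msleep().
 */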
381 
382 /*
383  * Unbusy a chain.
384  */
385 static inline void
386 umtxq_unbusy(struct umtx_key *key)
387 {
388 	struct umtxq_chain *uc;
389 
390 	uc = umtxq_getchain(key);
391 	mtx_assert(&uc->uc_lock, MA_OWNED);
392 	KASSERT(uc->uc_busy != 0, ("not busy"));
393 	uc->uc_busy = 0;
394 	if (uc->uc_waiters)
395 		wakeup_one(uc);
396 }
397 
398 static struct umtxq_queue *
399 umtxq_queue_lookup(struct umtx_key *key, int q)
400 {
401 	struct umtxq_queue *uh;
402 	struct umtxq_chain *uc;
403 
404 	uc = umtxq_getchain(key);
405 	UMTXQ_LOCKED_ASSERT(uc);
406 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
407 		if (umtx_key_match(&uh->key, key))
408 			return (uh);
409 	}
410 
411 	return (NULL);
412 }
413 
414 static inline void
415 umtxq_insert_queue(struct umtx_q *uq, int q)
416 {
417 	struct umtxq_queue *uh;
418 	struct umtxq_chain *uc;
419 
420 	uc = umtxq_getchain(&uq->uq_key);
421 	UMTXQ_LOCKED_ASSERT(uc);
422 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
423 	uh = umtxq_queue_lookup(&uq->uq_key, q);
424 	if (uh != NULL) {
425 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
426 	} else {
427 		uh = uq->uq_spare_queue;
428 		uh->key = uq->uq_key;
429 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
430 	}
431 	uq->uq_spare_queue = NULL;
432 
433 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
434 	uh->length++;
435 	uq->uq_flags |= UQF_UMTXQ;
436 	uq->uq_cur_queue = uh;
437 	return;
438 }
439 
440 static inline void
441 umtxq_remove_queue(struct umtx_q *uq, int q)
442 {
443 	struct umtxq_chain *uc;
444 	struct umtxq_queue *uh;
445 
446 	uc = umtxq_getchain(&uq->uq_key);
447 	UMTXQ_LOCKED_ASSERT(uc);
448 	if (uq->uq_flags & UQF_UMTXQ) {
449 		uh = uq->uq_cur_queue;
450 		TAILQ_REMOVE(&uh->head, uq, uq_link);
451 		uh->length--;
452 		uq->uq_flags &= ~UQF_UMTXQ;
453 		if (TAILQ_EMPTY(&uh->head)) {
454 			KASSERT(uh->length == 0,
455 			    ("inconsistent umtxq_queue length"));
456 			LIST_REMOVE(uh, link);
457 		} else {
458 			uh = LIST_FIRST(&uc->uc_spare_queue);
459 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
460 			LIST_REMOVE(uh, link);
461 		}
462 		uq->uq_spare_queue = uh;
463 		uq->uq_cur_queue = NULL;
464 	}
465 }
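
/*
 * Queue-head recycling invariant (a reading note, not new mechanism):
 * every umtx_q donates its spare umtxq_queue when it is inserted and
 * reclaims one when it is removed, so a queue head is always available
 * without allocating memory while the chain lock is held.
 */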
466 
467 /*
468  * Return the number of waiters on the shared wait-queue for a key.
469  */
470 static int
471 umtxq_count(struct umtx_key *key)
472 {
473 	struct umtxq_chain *uc;
474 	struct umtxq_queue *uh;
475 
476 	uc = umtxq_getchain(key);
477 	UMTXQ_LOCKED_ASSERT(uc);
478 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
479 	if (uh != NULL)
480 		return (uh->length);
481 	return (0);
482 }
483 
484 /*
485  * Return the number of PI waiters and a pointer to the first
486  * waiter.
487  */
488 static int
489 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
490 {
491 	struct umtxq_chain *uc;
492 	struct umtxq_queue *uh;
493 
494 	*first = NULL;
495 	uc = umtxq_getchain(key);
496 	UMTXQ_LOCKED_ASSERT(uc);
497 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
498 	if (uh != NULL) {
499 		*first = TAILQ_FIRST(&uh->head);
500 		return (uh->length);
501 	}
502 	return (0);
503 }
504 
505 /*
506  * Wake up threads waiting on a userland object.
507  */
508 
509 static int
510 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
511 {
512 	struct umtxq_chain *uc;
513 	struct umtxq_queue *uh;
514 	struct umtx_q *uq;
515 	int ret;
516 
517 	ret = 0;
518 	uc = umtxq_getchain(key);
519 	UMTXQ_LOCKED_ASSERT(uc);
520 	uh = umtxq_queue_lookup(key, q);
521 	if (uh != NULL) {
522 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
523 			umtxq_remove_queue(uq, q);
524 			wakeup(uq);
525 			if (++ret >= n_wake)
526 				return (ret);
527 		}
528 	}
529 	return (ret);
530 }
531 
533 /*
534  * Wake up the specified thread.
535  */
536 static inline void
537 umtxq_signal_thread(struct umtx_q *uq)
538 {
539 	struct umtxq_chain *uc;
540 
541 	uc = umtxq_getchain(&uq->uq_key);
542 	UMTXQ_LOCKED_ASSERT(uc);
543 	umtxq_remove(uq);
544 	wakeup(uq);
545 }
546 
547 /*
548  * Put the thread into a sleep state; before sleeping, check if the
549  * thread was already removed from the umtx queue.
550  */
551 static inline int
552 umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
553 {
554 	struct umtxq_chain *uc;
555 	int error;
556 
557 	uc = umtxq_getchain(&uq->uq_key);
558 	UMTXQ_LOCKED_ASSERT(uc);
559 	if (!(uq->uq_flags & UQF_UMTXQ))
560 		return (0);
561 	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
562 	if (error == EWOULDBLOCK)
563 		error = ETIMEDOUT;
564 	return (error);
565 }
566 
567 /*
568  * Convert a userspace address into a unique logical address.
569  */
570 static int
571 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
572 {
573 	struct thread *td = curthread;
574 	vm_map_t map;
575 	vm_map_entry_t entry;
576 	vm_pindex_t pindex;
577 	vm_prot_t prot;
578 	boolean_t wired;
579 
580 	key->type = type;
581 	if (share == THREAD_SHARE) {
582 		key->shared = 0;
583 		key->info.private.vs = td->td_proc->p_vmspace;
584 		key->info.private.addr = (uintptr_t)addr;
585 	} else {
586 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
587 		map = &td->td_proc->p_vmspace->vm_map;
588 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
589 		    &entry, &key->info.shared.object, &pindex, &prot,
590 		    &wired) != KERN_SUCCESS) {
591 			return (EFAULT);
592 		}
593 
594 		if ((share == PROCESS_SHARE) ||
595 		    (share == AUTO_SHARE &&
596 		     VM_INHERIT_SHARE == entry->inheritance)) {
597 			key->shared = 1;
598 			key->info.shared.offset = entry->offset + entry->start -
599 				(vm_offset_t)addr;
600 			vm_object_reference(key->info.shared.object);
601 		} else {
602 			key->shared = 0;
603 			key->info.private.vs = td->td_proc->p_vmspace;
604 			key->info.private.addr = (uintptr_t)addr;
605 		}
606 		vm_map_lookup_done(map, entry);
607 	}
608 
609 	umtxq_hash(key);
610 	return (0);
611 }
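
/*
 * Example (illustrative): if two processes mmap() the same file or
 * shared memory object and sleep on the same lock word, both resolve
 * to an identical shared key, because the key is derived from the
 * backing (vm_object, offset) pair rather than from the per-process
 * virtual address of the word.
 */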
612 
613 /*
614  * Release key.
615  */
616 static inline void
617 umtx_key_release(struct umtx_key *key)
618 {
619 	if (key->shared)
620 		vm_object_deallocate(key->info.shared.object);
621 }
622 
623 /*
624  * Lock a umtx object.
625  */
626 static int
627 _do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
628 {
629 	struct umtx_q *uq;
630 	u_long owner;
631 	u_long old;
632 	int error = 0;
633 
634 	uq = td->td_umtxq;
635 
636 	/*
637 	 * Care must be exercised when dealing with the umtx structure. It
638 	 * can fault on any access.
639 	 */
640 	for (;;) {
641 		/*
642 		 * Try the uncontested case.  This should be done in userland.
643 		 */
644 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
645 
646 		/* The acquire succeeded. */
647 		if (owner == UMTX_UNOWNED)
648 			return (0);
649 
650 		/* The address was invalid. */
651 		if (owner == -1)
652 			return (EFAULT);
653 
654 		/* If no one owns it but it is contested try to acquire it. */
655 		if (owner == UMTX_CONTESTED) {
656 			owner = casuword(&umtx->u_owner,
657 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
658 
659 			if (owner == UMTX_CONTESTED)
660 				return (0);
661 
662 			/* The address was invalid. */
663 			if (owner == -1)
664 				return (EFAULT);
665 
666 			/* If this failed the lock has changed, restart. */
667 			continue;
668 		}
669 
670 		/*
671 		 * If we caught a signal, we have already retried and now
672 		 * exit immediately.
673 		 */
674 		if (error != 0)
675 			return (error);
676 
677 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
678 			AUTO_SHARE, &uq->uq_key)) != 0)
679 			return (error);
680 
681 		umtxq_lock(&uq->uq_key);
682 		umtxq_busy(&uq->uq_key);
683 		umtxq_insert(uq);
684 		umtxq_unbusy(&uq->uq_key);
685 		umtxq_unlock(&uq->uq_key);
686 
687 		/*
688 		 * Set the contested bit so that a release in user space
689 		 * knows to use the system call for unlock.  If this fails
690 		 * either someone else has acquired the lock or it has been
691 		 * released.
692 		 */
693 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
694 
695 		/* The address was invalid. */
696 		if (old == -1) {
697 			umtxq_lock(&uq->uq_key);
698 			umtxq_remove(uq);
699 			umtxq_unlock(&uq->uq_key);
700 			umtx_key_release(&uq->uq_key);
701 			return (EFAULT);
702 		}
703 
704 		/*
705 		 * If we set the contested bit, sleep.  Otherwise the lock
706 		 * changed and we need to retry, or we lost a race to the
707 		 * thread unlocking the umtx.
708 		 */
709 		umtxq_lock(&uq->uq_key);
710 		if (old == owner)
711 			error = umtxq_sleep(uq, "umtx", timo);
712 		umtxq_remove(uq);
713 		umtxq_unlock(&uq->uq_key);
714 		umtx_key_release(&uq->uq_key);
715 	}
716 
717 	return (0);
718 }
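
/*
 * A minimal sketch of the userland fast path this function expects
 * (hypothetical code; the real logic lives in libthr):
 *
 *	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, tid))
 *		return (0);
 *	return (_umtx_lock(umtx));
 *
 * The CAS acquires an uncontested lock without a syscall; only on
 * failure does the thread fall back to the kernel, which is where this
 * function takes over.
 */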
719 
720 /*
721  * Lock a umtx object.
722  */
723 static int
724 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
725 	struct timespec *timeout)
726 {
727 	struct timespec ts, ts2, ts3;
728 	struct timeval tv;
729 	int error;
730 
731 	if (timeout == NULL) {
732 		error = _do_lock_umtx(td, umtx, id, 0);
733 		/* Mutex locking is restarted if it is interrupted. */
734 		if (error == EINTR)
735 			error = ERESTART;
736 	} else {
737 		getnanouptime(&ts);
738 		timespecadd(&ts, timeout);
739 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
740 		for (;;) {
741 			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
742 			if (error != ETIMEDOUT)
743 				break;
744 			getnanouptime(&ts2);
745 			if (timespeccmp(&ts2, &ts, >=)) {
746 				error = ETIMEDOUT;
747 				break;
748 			}
749 			ts3 = ts;
750 			timespecsub(&ts3, &ts2);
751 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
752 		}
753 		/* Timed-locking is not restarted. */
754 		if (error == ERESTART)
755 			error = EINTR;
756 	}
757 	return (error);
758 }
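
/*
 * Timeout handling (worked example): the deadline is converted once to
 * an absolute time, ts = now + timeout.  If the sleep wakes early, the
 * remaining time ts - now is recomputed and converted back to ticks;
 * e.g. a 500 ms timeout that wakes after 300 ms retries with roughly
 * the remaining 200 ms, rather than restarting with a fresh 500 ms.
 */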
759 
760 /*
761  * Unlock a umtx object.
762  */
763 static int
764 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
765 {
766 	struct umtx_key key;
767 	u_long owner;
768 	u_long old;
769 	int error;
770 	int count;
771 
772 	/*
773 	 * Make sure we own this mtx.
774 	 */
775 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
776 	if (owner == -1)
777 		return (EFAULT);
778 
779 	if ((owner & ~UMTX_CONTESTED) != id)
780 		return (EPERM);
781 
782 	/* This should be done in userland */
783 	if ((owner & UMTX_CONTESTED) == 0) {
784 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
785 		if (old == -1)
786 			return (EFAULT);
787 		if (old == owner)
788 			return (0);
789 		owner = old;
790 	}
791 
792 	/* We should only ever be in here for contested locks */
793 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
794 		&key)) != 0)
795 		return (error);
796 
797 	umtxq_lock(&key);
798 	umtxq_busy(&key);
799 	count = umtxq_count(&key);
800 	umtxq_unlock(&key);
801 
802 	/*
803 	 * When unlocking the umtx, it must be marked as unowned if
804 	 * at most one thread is waiting for it.
805 	 * Otherwise, it must be marked as contested.
806 	 */
807 	old = casuword(&umtx->u_owner, owner,
808 		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
809 	umtxq_lock(&key);
810 	umtxq_signal(&key, 1);
811 	umtxq_unbusy(&key);
812 	umtxq_unlock(&key);
813 	umtx_key_release(&key);
814 	if (old == -1)
815 		return (EFAULT);
816 	if (old != owner)
817 		return (EINVAL);
818 	return (0);
819 }
820 
821 #ifdef COMPAT_FREEBSD32
822 
823 /*
824  * Lock a umtx object.
825  */
826 static int
827 _do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
828 {
829 	struct umtx_q *uq;
830 	uint32_t owner;
831 	uint32_t old;
832 	int error = 0;
833 
834 	uq = td->td_umtxq;
835 
836 	/*
837 	 * Care must be exercised when dealing with the umtx structure. It
838 	 * can fault on any access.
839 	 */
840 	for (;;) {
841 		/*
842 		 * Try the uncontested case.  This should be done in userland.
843 		 */
844 		owner = casuword32(m, UMUTEX_UNOWNED, id);
845 
846 		/* The acquire succeeded. */
847 		if (owner == UMUTEX_UNOWNED)
848 			return (0);
849 
850 		/* The address was invalid. */
851 		if (owner == -1)
852 			return (EFAULT);
853 
854 		/* If no one owns it but it is contested try to acquire it. */
855 		if (owner == UMUTEX_CONTESTED) {
856 			owner = casuword32(m,
857 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
858 			if (owner == UMUTEX_CONTESTED)
859 				return (0);
860 
861 			/* The address was invalid. */
862 			if (owner == -1)
863 				return (EFAULT);
864 
865 			/* If this failed the lock has changed, restart. */
866 			continue;
867 		}
868 
869 		/*
870 		 * If we caught a signal, we have already retried and now
871 		 * exit immediately.
872 		 */
873 		if (error != 0)
874 			return (error);
875 
876 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
877 			AUTO_SHARE, &uq->uq_key)) != 0)
878 			return (error);
879 
880 		umtxq_lock(&uq->uq_key);
881 		umtxq_busy(&uq->uq_key);
882 		umtxq_insert(uq);
883 		umtxq_unbusy(&uq->uq_key);
884 		umtxq_unlock(&uq->uq_key);
885 
886 		/*
887 		 * Set the contested bit so that a release in user space
888 		 * knows to use the system call for unlock.  If this fails
889 		 * either someone else has acquired the lock or it has been
890 		 * released.
891 		 */
892 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
893 
894 		/* The address was invalid. */
895 		if (old == -1) {
896 			umtxq_lock(&uq->uq_key);
897 			umtxq_remove(uq);
898 			umtxq_unlock(&uq->uq_key);
899 			umtx_key_release(&uq->uq_key);
900 			return (EFAULT);
901 		}
902 
903 		/*
904 		 * If we set the contested bit, sleep.  Otherwise the lock
905 		 * changed and we need to retry, or we lost a race to the
906 		 * thread unlocking the umtx.
907 		 */
908 		umtxq_lock(&uq->uq_key);
909 		if (old == owner)
910 			error = umtxq_sleep(uq, "umtx", timo);
911 		umtxq_remove(uq);
912 		umtxq_unlock(&uq->uq_key);
913 		umtx_key_release(&uq->uq_key);
914 	}
915 
916 	return (0);
917 }
918 
919 /*
920  * Lock a umtx object.
921  */
922 static int
923 do_lock_umtx32(struct thread *td, void *m, uint32_t id,
924 	struct timespec *timeout)
925 {
926 	struct timespec ts, ts2, ts3;
927 	struct timeval tv;
928 	int error;
929 
930 	if (timeout == NULL) {
931 		error = _do_lock_umtx32(td, m, id, 0);
932 		/* Mutex locking is restarted if it is interrupted. */
933 		if (error == EINTR)
934 			error = ERESTART;
935 	} else {
936 		getnanouptime(&ts);
937 		timespecadd(&ts, timeout);
938 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
939 		for (;;) {
940 			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
941 			if (error != ETIMEDOUT)
942 				break;
943 			getnanouptime(&ts2);
944 			if (timespeccmp(&ts2, &ts, >=)) {
945 				error = ETIMEDOUT;
946 				break;
947 			}
948 			ts3 = ts;
949 			timespecsub(&ts3, &ts2);
950 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
951 		}
952 		/* Timed-locking is not restarted. */
953 		if (error == ERESTART)
954 			error = EINTR;
955 	}
956 	return (error);
957 }
958 
959 /*
960  * Unlock a umtx object.
961  */
962 static int
963 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
964 {
965 	struct umtx_key key;
966 	uint32_t owner;
967 	uint32_t old;
968 	int error;
969 	int count;
970 
971 	/*
972 	 * Make sure we own this mtx.
973 	 */
974 	owner = fuword32(m);
975 	if (owner == -1)
976 		return (EFAULT);
977 
978 	if ((owner & ~UMUTEX_CONTESTED) != id)
979 		return (EPERM);
980 
981 	/* This should be done in userland */
982 	if ((owner & UMUTEX_CONTESTED) == 0) {
983 		old = casuword32(m, owner, UMUTEX_UNOWNED);
984 		if (old == -1)
985 			return (EFAULT);
986 		if (old == owner)
987 			return (0);
988 		owner = old;
989 	}
990 
991 	/* We should only ever be in here for contested locks */
992 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
993 		&key)) != 0)
994 		return (error);
995 
996 	umtxq_lock(&key);
997 	umtxq_busy(&key);
998 	count = umtxq_count(&key);
999 	umtxq_unlock(&key);
1000 
1001 	/*
1002 	 * When unlocking the umtx, it must be marked as unowned if
1003 	 * at most one thread is waiting for it.
1004 	 * Otherwise, it must be marked as contested.
1005 	 */
1006 	old = casuword32(m, owner,
1007 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1008 	umtxq_lock(&key);
1009 	umtxq_signal(&key, 1);
1010 	umtxq_unbusy(&key);
1011 	umtxq_unlock(&key);
1012 	umtx_key_release(&key);
1013 	if (old == -1)
1014 		return (EFAULT);
1015 	if (old != owner)
1016 		return (EINVAL);
1017 	return (0);
1018 }
1019 #endif
1020 
1021 /*
1022  * Fetch and compare the value; sleep on the address if the value is unchanged.
1023  */
1024 static int
1025 do_wait(struct thread *td, void *addr, u_long id,
1026 	struct timespec *timeout, int compat32, int is_private)
1027 {
1028 	struct umtx_q *uq;
1029 	struct timespec ts, ts2, ts3;
1030 	struct timeval tv;
1031 	u_long tmp;
1032 	int error = 0;
1033 
1034 	uq = td->td_umtxq;
1035 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
1036 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
1037 		return (error);
1038 
1039 	umtxq_lock(&uq->uq_key);
1040 	umtxq_insert(uq);
1041 	umtxq_unlock(&uq->uq_key);
1042 	if (compat32 == 0)
1043 		tmp = fuword(addr);
1044 	else
1045 		tmp = (unsigned int)fuword32(addr);
1046 	if (tmp != id) {
1047 		umtxq_lock(&uq->uq_key);
1048 		umtxq_remove(uq);
1049 		umtxq_unlock(&uq->uq_key);
1050 	} else if (timeout == NULL) {
1051 		umtxq_lock(&uq->uq_key);
1052 		error = umtxq_sleep(uq, "uwait", 0);
1053 		umtxq_remove(uq);
1054 		umtxq_unlock(&uq->uq_key);
1055 	} else {
1056 		getnanouptime(&ts);
1057 		timespecadd(&ts, timeout);
1058 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
1059 		umtxq_lock(&uq->uq_key);
1060 		for (;;) {
1061 			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
1062 			if (!(uq->uq_flags & UQF_UMTXQ))
1063 				break;
1064 			if (error != ETIMEDOUT)
1065 				break;
1066 			umtxq_unlock(&uq->uq_key);
1067 			getnanouptime(&ts2);
1068 			if (timespeccmp(&ts2, &ts, >=)) {
1069 				error = ETIMEDOUT;
1070 				umtxq_lock(&uq->uq_key);
1071 				break;
1072 			}
1073 			ts3 = ts;
1074 			timespecsub(&ts3, &ts2);
1075 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
1076 			umtxq_lock(&uq->uq_key);
1077 		}
1078 		umtxq_remove(uq);
1079 		umtxq_unlock(&uq->uq_key);
1080 	}
1081 	umtx_key_release(&uq->uq_key);
1082 	if (error == ERESTART)
1083 		error = EINTR;
1084 	return (error);
1085 }
1086 
1087 /*
1088  * Wake up threads sleeping on the specified address.
1089  */
1090 int
1091 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1092 {
1093 	struct umtx_key key;
1094 	int ret;
1095 
1096 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1097 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1098 		return (ret);
1099 	umtxq_lock(&key);
1100 	ret = umtxq_signal(&key, n_wake);
1101 	umtxq_unlock(&key);
1102 	umtx_key_release(&key);
1103 	return (0);
1104 }
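
/*
 * Hedged usage sketch (illustrative; assumes the _umtx_op(2) syscall
 * and a plain unsigned int flag):
 *
 *	while (atomic_load_acq_int(&flag) == 0)
 *		_umtx_op(&flag, UMTX_OP_WAIT_UINT, 0, NULL, NULL);
 *	...
 *	atomic_store_rel_int(&flag, 1);
 *	_umtx_op(&flag, UMTX_OP_WAKE, 1, NULL, NULL);
 *
 * do_wait() inserts the thread on the wait queue before re-reading the
 * word, so a wake that lands between the userland check and the sleep
 * is not lost.
 */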
1105 
1106 /*
1107  * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1108  */
1109 static int
1110 _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1111 	int mode)
1112 {
1113 	struct umtx_q *uq;
1114 	uint32_t owner, old, id;
1115 	int error = 0;
1116 
1117 	id = td->td_tid;
1118 	uq = td->td_umtxq;
1119 
1120 	/*
1121 	 * Care must be exercised when dealing with the umtx structure. It
1122 	 * can fault on any access.
1123 	 */
1124 	for (;;) {
1125 		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
1126 		if (mode == _UMUTEX_WAIT) {
1127 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1128 				return (0);
1129 		} else {
1130 			/*
1131 			 * Try the uncontested case.  This should be done in userland.
1132 			 */
1133 			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1134 
1135 			/* The acquire succeeded. */
1136 			if (owner == UMUTEX_UNOWNED)
1137 				return (0);
1138 
1139 			/* The address was invalid. */
1140 			if (owner == -1)
1141 				return (EFAULT);
1142 
1143 			/* If no one owns it but it is contested try to acquire it. */
1144 			if (owner == UMUTEX_CONTESTED) {
1145 				owner = casuword32(&m->m_owner,
1146 				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1147 
1148 				if (owner == UMUTEX_CONTESTED)
1149 					return (0);
1150 
1151 				/* The address was invalid. */
1152 				if (owner == -1)
1153 					return (EFAULT);
1154 
1155 				/* If this failed the lock has changed, restart. */
1156 				continue;
1157 			}
1158 		}
1159 
1160 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1161 		    (owner & ~UMUTEX_CONTESTED) == id)
1162 			return (EDEADLK);
1163 
1164 		if (mode == _UMUTEX_TRY)
1165 			return (EBUSY);
1166 
1167 		/*
1168 		 * If we caught a signal, we have already retried and now
1169 		 * exit immediately.
1170 		 */
1171 		if (error != 0)
1172 			return (error);
1173 
1174 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1175 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1176 			return (error);
1177 
1178 		umtxq_lock(&uq->uq_key);
1179 		umtxq_busy(&uq->uq_key);
1180 		umtxq_insert(uq);
1181 		umtxq_unlock(&uq->uq_key);
1182 
1183 		/*
1184 		 * Set the contested bit so that a release in user space
1185 		 * knows to use the system call for unlock.  If this fails
1186 		 * either someone else has acquired the lock or it has been
1187 		 * released.
1188 		 */
1189 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1190 
1191 		/* The address was invalid. */
1192 		if (old == -1) {
1193 			umtxq_lock(&uq->uq_key);
1194 			umtxq_remove(uq);
1195 			umtxq_unbusy(&uq->uq_key);
1196 			umtxq_unlock(&uq->uq_key);
1197 			umtx_key_release(&uq->uq_key);
1198 			return (EFAULT);
1199 		}
1200 
1201 		/*
1202 		 * If we set the contested bit, sleep.  Otherwise the lock
1203 		 * changed and we need to retry, or we lost a race to the
1204 		 * thread unlocking the umtx.
1205 		 */
1206 		umtxq_lock(&uq->uq_key);
1207 		umtxq_unbusy(&uq->uq_key);
1208 		if (old == owner)
1209 			error = umtxq_sleep(uq, "umtxn", timo);
1210 		umtxq_remove(uq);
1211 		umtxq_unlock(&uq->uq_key);
1212 		umtx_key_release(&uq->uq_key);
1213 	}
1214 
1215 	return (0);
1216 }
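
/*
 * Note the three locking modes above: 0 acquires the mutex,
 * _UMUTEX_TRY returns EBUSY instead of sleeping, and _UMUTEX_WAIT
 * only sleeps until the mutex looks unowned, leaving the actual
 * acquire to a userland retry.
 */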
1217 
1221 /*
1222  * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1223  */
1224 static int
1225 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1226 {
1227 	struct umtx_key key;
1228 	uint32_t owner, old, id;
1229 	int error;
1230 	int count;
1231 
1232 	id = td->td_tid;
1233 	/*
1234 	 * Make sure we own this mtx.
1235 	 */
1236 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1237 	if (owner == -1)
1238 		return (EFAULT);
1239 
1240 	if ((owner & ~UMUTEX_CONTESTED) != id)
1241 		return (EPERM);
1242 
1243 	if ((owner & UMUTEX_CONTESTED) == 0) {
1244 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1245 		if (old == -1)
1246 			return (EFAULT);
1247 		if (old == owner)
1248 			return (0);
1249 		owner = old;
1250 	}
1251 
1252 	/* We should only ever be in here for contested locks */
1253 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1254 	    &key)) != 0)
1255 		return (error);
1256 
1257 	umtxq_lock(&key);
1258 	umtxq_busy(&key);
1259 	count = umtxq_count(&key);
1260 	umtxq_unlock(&key);
1261 
1262 	/*
1263 	 * When unlocking the umtx, it must be marked as unowned if
1264 	 * at most one thread is waiting for it.
1265 	 * Otherwise, it must be marked as contested.
1266 	 */
1267 	old = casuword32(&m->m_owner, owner,
1268 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1269 	umtxq_lock(&key);
1270 	umtxq_signal(&key, 1);
1271 	umtxq_unbusy(&key);
1272 	umtxq_unlock(&key);
1273 	umtx_key_release(&key);
1274 	if (old == -1)
1275 		return (EFAULT);
1276 	if (old != owner)
1277 		return (EINVAL);
1278 	return (0);
1279 }
1280 
1281 /*
1282  * Check if the mutex is available and wake up a waiter;
1283  * used only for a simple mutex.
1284  */
1285 static int
1286 do_wake_umutex(struct thread *td, struct umutex *m)
1287 {
1288 	struct umtx_key key;
1289 	uint32_t owner;
1290 	uint32_t flags;
1291 	int error;
1292 	int count;
1293 
1294 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1295 	if (owner == -1)
1296 		return (EFAULT);
1297 
1298 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1299 		return (0);
1300 
1301 	flags = fuword32(&m->m_flags);
1302 
1303 	/* We should only ever be in here for contested locks */
1304 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1305 	    &key)) != 0)
1306 		return (error);
1307 
1308 	umtxq_lock(&key);
1309 	umtxq_busy(&key);
1310 	count = umtxq_count(&key);
1311 	umtxq_unlock(&key);
1312 
1313 	if (count <= 1)
1314 		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1315 
1316 	umtxq_lock(&key);
1317 	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1318 		umtxq_signal(&key, 1);
1319 	umtxq_unbusy(&key);
1320 	umtxq_unlock(&key);
1321 	umtx_key_release(&key);
1322 	return (0);
1323 }
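
/*
 * This is the wakeup half of the _UMUTEX_WAIT protocol: the unlocking
 * thread (typically after storing the unlocked state from userland)
 * calls here; if at most one waiter remains, the word is flipped from
 * UMUTEX_CONTESTED back to UMUTEX_UNOWNED before a sleeper is woken to
 * retry the acquire in userland.
 */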
1324 
1325 static inline struct umtx_pi *
1326 umtx_pi_alloc(int flags)
1327 {
1328 	struct umtx_pi *pi;
1329 
1330 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1331 	TAILQ_INIT(&pi->pi_blocked);
1332 	atomic_add_int(&umtx_pi_allocated, 1);
1333 	return (pi);
1334 }
1335 
1336 static inline void
1337 umtx_pi_free(struct umtx_pi *pi)
1338 {
1339 	uma_zfree(umtx_pi_zone, pi);
1340 	atomic_add_int(&umtx_pi_allocated, -1);
1341 }
1342 
1343 /*
1344  * Adjust the thread's position on the PI mutex's blocked list after
1345  * its priority has been changed.
1346  */
1347 static int
1348 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1349 {
1350 	struct umtx_q *uq, *uq1, *uq2;
1351 	struct thread *td1;
1352 
1353 	mtx_assert(&umtx_lock, MA_OWNED);
1354 	if (pi == NULL)
1355 		return (0);
1356 
1357 	uq = td->td_umtxq;
1358 
1359 	/*
1360 	 * Check if the thread needs to be moved in the blocked chain.
1361 	 * It needs to be moved if either its priority is lower than
1362 	 * the previous thread or higher than the next thread.
1363 	 */
1364 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1365 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1366 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1367 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1368 		/*
1369 		 * Remove thread from blocked chain and determine where
1370 		 * it should be moved to.
1371 		 */
1372 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1373 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1374 			td1 = uq1->uq_thread;
1375 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1376 			if (UPRI(td1) > UPRI(td))
1377 				break;
1378 		}
1379 
1380 		if (uq1 == NULL)
1381 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1382 		else
1383 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1384 	}
1385 	return (1);
1386 }
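
/*
 * The pi_blocked list is kept sorted by effective user priority, so
 * TAILQ_FIRST() is always the highest-priority waiter; e.g. a thread
 * whose priority is lowered while blocked is unlinked and re-inserted
 * in front of the first waiter with a numerically larger (weaker)
 * UPRI value.
 */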
1387 
1388 /*
1389  * Propagate priority when a thread is blocked on a POSIX
1390  * PI mutex.
1391  */
1392 static void
1393 umtx_propagate_priority(struct thread *td)
1394 {
1395 	struct umtx_q *uq;
1396 	struct umtx_pi *pi;
1397 	int pri;
1398 
1399 	mtx_assert(&umtx_lock, MA_OWNED);
1400 	pri = UPRI(td);
1401 	uq = td->td_umtxq;
1402 	pi = uq->uq_pi_blocked;
1403 	if (pi == NULL)
1404 		return;
1405 
1406 	for (;;) {
1407 		td = pi->pi_owner;
1408 		if (td == NULL)
1409 			return;
1410 
1411 		MPASS(td->td_proc != NULL);
1412 		MPASS(td->td_proc->p_magic == P_MAGIC);
1413 
1414 		if (UPRI(td) <= pri)
1415 			return;
1416 
1417 		thread_lock(td);
1418 		sched_lend_user_prio(td, pri);
1419 		thread_unlock(td);
1420 
1421 		/*
1422 		 * Pick up the lock that td is blocked on.
1423 		 */
1424 		uq = td->td_umtxq;
1425 		pi = uq->uq_pi_blocked;
1426 		/* Resort td on the list if needed. */
1427 		if (!umtx_pi_adjust_thread(pi, td))
1428 			break;
1429 	}
1430 }
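
/*
 * Illustrative chain: if thread A (UPRI 100) blocks on mutex M1 owned
 * by B (UPRI 120), and B is itself blocked on M2 owned by C, the loop
 * above lends priority 100 to B, follows B's uq_pi_blocked to M2, and
 * lends 100 to C as well, stopping once an owner already runs at that
 * priority or better.
 */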
1431 
1432 /*
1433  * Unpropagate priority for a PI mutex when a thread blocked on
1434  * it is interrupted by a signal or resumed by others.
1435  */
1436 static void
1437 umtx_unpropagate_priority(struct umtx_pi *pi)
1438 {
1439 	struct umtx_q *uq, *uq_owner;
1440 	struct umtx_pi *pi2;
1441 	int pri, oldpri;
1442 
1443 	mtx_assert(&umtx_lock, MA_OWNED);
1444 
1445 	while (pi != NULL && pi->pi_owner != NULL) {
1446 		pri = PRI_MAX;
1447 		uq_owner = pi->pi_owner->td_umtxq;
1448 
1449 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1450 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1451 			if (uq != NULL) {
1452 				if (pri > UPRI(uq->uq_thread))
1453 					pri = UPRI(uq->uq_thread);
1454 			}
1455 		}
1456 
1457 		if (pri > uq_owner->uq_inherited_pri)
1458 			pri = uq_owner->uq_inherited_pri;
1459 		thread_lock(pi->pi_owner);
1460 		oldpri = pi->pi_owner->td_user_pri;
1461 		sched_unlend_user_prio(pi->pi_owner, pri);
1462 		thread_unlock(pi->pi_owner);
1463 		if (uq_owner->uq_pi_blocked != NULL)
1464 			umtx_pi_adjust_locked(pi->pi_owner, oldpri);
1465 		pi = uq_owner->uq_pi_blocked;
1466 	}
1467 }
1468 
1469 /*
1470  * Insert a PI mutex into owned list.
1471  */
1472 static void
1473 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1474 {
1475 	struct umtx_q *uq_owner;
1476 
1477 	uq_owner = owner->td_umtxq;
1478 	mtx_assert(&umtx_lock, MA_OWNED);
1479 	if (pi->pi_owner != NULL)
1480 		panic("pi_owner != NULL");
1481 	pi->pi_owner = owner;
1482 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1483 }
1484 
1485 /*
1486  * Claim ownership of a PI mutex.
1487  */
1488 static int
1489 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1490 {
1491 	struct umtx_q *uq, *uq_owner;
1492 
1493 	uq_owner = owner->td_umtxq;
1494 	mtx_lock_spin(&umtx_lock);
1495 	if (pi->pi_owner == owner) {
1496 		mtx_unlock_spin(&umtx_lock);
1497 		return (0);
1498 	}
1499 
1500 	if (pi->pi_owner != NULL) {
1501 		/*
1502 		 * Userland may have already messed up the mutex, sigh.
1503 		 */
1504 		mtx_unlock_spin(&umtx_lock);
1505 		return (EPERM);
1506 	}
1507 	umtx_pi_setowner(pi, owner);
1508 	uq = TAILQ_FIRST(&pi->pi_blocked);
1509 	if (uq != NULL) {
1510 		int pri;
1511 
1512 		pri = UPRI(uq->uq_thread);
1513 		thread_lock(owner);
1514 		if (pri < UPRI(owner))
1515 			sched_lend_user_prio(owner, pri);
1516 		thread_unlock(owner);
1517 	}
1518 	mtx_unlock_spin(&umtx_lock);
1519 	return (0);
1520 }
1521 
1522 static void
1523 umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
1524 {
1525 	struct umtx_q *uq;
1526 	struct umtx_pi *pi;
1527 
1528 	uq = td->td_umtxq;
1529 	/*
1530 	 * Pick up the lock that td is blocked on.
1531 	 */
1532 	pi = uq->uq_pi_blocked;
1533 	MPASS(pi != NULL);
1534 
1535 	/* Resort the thread on the blocked list. */
1536 	if (!umtx_pi_adjust_thread(pi, td))
1537 		return;
1538 
1539 	/*
1540 	 * If our priority was lowered and we are at the head of the
1541 	 * turnstile, then propagate our new priority up the chain.
1542 	 */
1543 	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
1544 		umtx_propagate_priority(td);
1545 }
1546 
1547 /*
1548  * Adjust a thread's position in the blocked list of its PI mutex;
1549  * this may trigger a new round of priority propagation.
1550  */
1551 void
1552 umtx_pi_adjust(struct thread *td, u_char oldpri)
1553 {
1554 	struct umtx_q *uq;
1555 	struct umtx_pi *pi;
1556 
1557 	uq = td->td_umtxq;
1558 	mtx_lock_spin(&umtx_lock);
1559 	/*
1560 	 * Pick up the lock that td is blocked on.
1561 	 */
1562 	pi = uq->uq_pi_blocked;
1563 	if (pi != NULL)
1564 		umtx_pi_adjust_locked(td, oldpri);
1565 	mtx_unlock_spin(&umtx_lock);
1566 }
1567 
1568 /*
1569  * Sleep on a PI mutex.
1570  */
1571 static int
1572 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1573 	uint32_t owner, const char *wmesg, int timo)
1574 {
1575 	struct umtxq_chain *uc;
1576 	struct thread *td, *td1;
1577 	struct umtx_q *uq1;
1578 	int pri;
1579 	int error = 0;
1580 
1581 	td = uq->uq_thread;
1582 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1583 	uc = umtxq_getchain(&uq->uq_key);
1584 	UMTXQ_LOCKED_ASSERT(uc);
1585 	UMTXQ_BUSY_ASSERT(uc);
1586 	umtxq_insert(uq);
1587 	mtx_lock_spin(&umtx_lock);
1588 	if (pi->pi_owner == NULL) {
1589 		/* XXX
1590 		 * Currently, we only support process-private PI mutexes;
1591 		 * we need a faster way to find the owner thread of a
1592 		 * process-shared mutex (not available yet).
1593 		 */
1594 		mtx_unlock_spin(&umtx_lock);
1595 		PROC_LOCK(curproc);
1596 		td1 = thread_find(curproc, owner);
1597 		mtx_lock_spin(&umtx_lock);
1598 		if (td1 != NULL && pi->pi_owner == NULL) {
1599 			uq1 = td1->td_umtxq;
1600 			umtx_pi_setowner(pi, td1);
1601 		}
1602 		PROC_UNLOCK(curproc);
1603 	}
1604 
1605 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1606 		pri = UPRI(uq1->uq_thread);
1607 		if (pri > UPRI(td))
1608 			break;
1609 	}
1610 
1611 	if (uq1 != NULL)
1612 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1613 	else
1614 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1615 
1616 	uq->uq_pi_blocked = pi;
1617 	thread_lock(td);
1618 	td->td_flags |= TDF_UPIBLOCKED;
1619 	thread_unlock(td);
1620 	umtx_propagate_priority(td);
1621 	mtx_unlock_spin(&umtx_lock);
1622 	umtxq_unbusy(&uq->uq_key);
1623 
1624 	if (uq->uq_flags & UQF_UMTXQ) {
1625 		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
1626 		if (error == EWOULDBLOCK)
1627 			error = ETIMEDOUT;
1628 		if (uq->uq_flags & UQF_UMTXQ) {
1629 			umtxq_remove(uq);
1630 		}
1631 	}
1632 	mtx_lock_spin(&umtx_lock);
1633 	uq->uq_pi_blocked = NULL;
1634 	thread_lock(td);
1635 	td->td_flags &= ~TDF_UPIBLOCKED;
1636 	thread_unlock(td);
1637 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1638 	umtx_unpropagate_priority(pi);
1639 	mtx_unlock_spin(&umtx_lock);
1640 	umtxq_unlock(&uq->uq_key);
1641 
1642 	return (error);
1643 }
1644 
1645 /*
1646  * Add reference count for a PI mutex.
1647  */
1648 static void
1649 umtx_pi_ref(struct umtx_pi *pi)
1650 {
1651 	struct umtxq_chain *uc;
1652 
1653 	uc = umtxq_getchain(&pi->pi_key);
1654 	UMTXQ_LOCKED_ASSERT(uc);
1655 	pi->pi_refcount++;
1656 }
1657 
1658 /*
1659  * Decrease reference count for a PI mutex, if the counter
1660  * is decreased to zero, its memory space is freed.
1661  */
1662 static void
1663 umtx_pi_unref(struct umtx_pi *pi)
1664 {
1665 	struct umtxq_chain *uc;
1666 
1667 	uc = umtxq_getchain(&pi->pi_key);
1668 	UMTXQ_LOCKED_ASSERT(uc);
1669 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1670 	if (--pi->pi_refcount == 0) {
1671 		mtx_lock_spin(&umtx_lock);
1672 		if (pi->pi_owner != NULL) {
1673 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1674 				pi, pi_link);
1675 			pi->pi_owner = NULL;
1676 		}
1677 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1678 			("blocked queue not empty"));
1679 		mtx_unlock_spin(&umtx_lock);
1680 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1681 		umtx_pi_free(pi);
1682 	}
1683 }
1684 
1685 /*
1686  * Find a PI mutex in hash table.
1687  */
1688 static struct umtx_pi *
1689 umtx_pi_lookup(struct umtx_key *key)
1690 {
1691 	struct umtxq_chain *uc;
1692 	struct umtx_pi *pi;
1693 
1694 	uc = umtxq_getchain(key);
1695 	UMTXQ_LOCKED_ASSERT(uc);
1696 
1697 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1698 		if (umtx_key_match(&pi->pi_key, key)) {
1699 			return (pi);
1700 		}
1701 	}
1702 	return (NULL);
1703 }
1704 
1705 /*
1706  * Insert a PI mutex into hash table.
1707  */
1708 static inline void
1709 umtx_pi_insert(struct umtx_pi *pi)
1710 {
1711 	struct umtxq_chain *uc;
1712 
1713 	uc = umtxq_getchain(&pi->pi_key);
1714 	UMTXQ_LOCKED_ASSERT(uc);
1715 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1716 }
1717 
1718 /*
1719  * Lock a PI mutex.
1720  */
1721 static int
1722 _do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1723 	int try)
1724 {
1725 	struct umtx_q *uq;
1726 	struct umtx_pi *pi, *new_pi;
1727 	uint32_t id, owner, old;
1728 	int error;
1729 
1730 	id = td->td_tid;
1731 	uq = td->td_umtxq;
1732 
1733 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1734 	    &uq->uq_key)) != 0)
1735 		return (error);
1736 	umtxq_lock(&uq->uq_key);
1737 	pi = umtx_pi_lookup(&uq->uq_key);
1738 	if (pi == NULL) {
1739 		new_pi = umtx_pi_alloc(M_NOWAIT);
1740 		if (new_pi == NULL) {
1741 			umtxq_unlock(&uq->uq_key);
1742 			new_pi = umtx_pi_alloc(M_WAITOK);
1743 			umtxq_lock(&uq->uq_key);
1744 			pi = umtx_pi_lookup(&uq->uq_key);
1745 			if (pi != NULL) {
1746 				umtx_pi_free(new_pi);
1747 				new_pi = NULL;
1748 			}
1749 		}
1750 		if (new_pi != NULL) {
1751 			new_pi->pi_key = uq->uq_key;
1752 			umtx_pi_insert(new_pi);
1753 			pi = new_pi;
1754 		}
1755 	}
1756 	umtx_pi_ref(pi);
1757 	umtxq_unlock(&uq->uq_key);
1758 
1759 	/*
1760 	 * Care must be exercised when dealing with the umtx structure.  It
1761 	 * can fault on any access.
1762 	 */
1763 	for (;;) {
1764 		/*
1765 		 * Try the uncontested case.  This should be done in userland.
1766 		 */
1767 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1768 
1769 		/* The acquire succeeded. */
1770 		if (owner == UMUTEX_UNOWNED) {
1771 			error = 0;
1772 			break;
1773 		}
1774 
1775 		/* The address was invalid. */
1776 		if (owner == -1) {
1777 			error = EFAULT;
1778 			break;
1779 		}
1780 
1781 		/* If no one owns it but it is contested try to acquire it. */
1782 		if (owner == UMUTEX_CONTESTED) {
1783 			owner = casuword32(&m->m_owner,
1784 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1785 
1786 			if (owner == UMUTEX_CONTESTED) {
1787 				umtxq_lock(&uq->uq_key);
1788 				umtxq_busy(&uq->uq_key);
1789 				error = umtx_pi_claim(pi, td);
1790 				umtxq_unbusy(&uq->uq_key);
1791 				umtxq_unlock(&uq->uq_key);
1792 				break;
1793 			}
1794 
1795 			/* The address was invalid. */
1796 			if (owner == -1) {
1797 				error = EFAULT;
1798 				break;
1799 			}
1800 
1801 			/* If this failed the lock has changed, restart. */
1802 			continue;
1803 		}
1804 
1805 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1806 		    (owner & ~UMUTEX_CONTESTED) == id) {
1807 			error = EDEADLK;
1808 			break;
1809 		}
1810 
1811 		if (try != 0) {
1812 			error = EBUSY;
1813 			break;
1814 		}
1815 
1816 		/*
1817 		 * If we caught a signal, we have already retried and now
1818 		 * exit immediately.
1819 		 */
1820 		if (error != 0)
1821 			break;
1822 
1823 		umtxq_lock(&uq->uq_key);
1824 		umtxq_busy(&uq->uq_key);
1825 		umtxq_unlock(&uq->uq_key);
1826 
1827 		/*
1828 		 * Set the contested bit so that a release in user space
1829 		 * knows to use the system call for unlock.  If this fails
1830 		 * either someone else has acquired the lock or it has been
1831 		 * released.
1832 		 */
1833 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1834 
1835 		/* The address was invalid. */
1836 		if (old == -1) {
1837 			umtxq_lock(&uq->uq_key);
1838 			umtxq_unbusy(&uq->uq_key);
1839 			umtxq_unlock(&uq->uq_key);
1840 			error = EFAULT;
1841 			break;
1842 		}
1843 
1844 		umtxq_lock(&uq->uq_key);
1845 		/*
1846 		 * If we set the contested bit, sleep.  Otherwise the lock
1847 		 * changed and we need to retry, or we lost a race to the
1848 		 * thread unlocking the umtx.
1849 		 */
1850 		if (old == owner)
1851 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1852 				 "umtxpi", timo);
1853 		else {
1854 			umtxq_unbusy(&uq->uq_key);
1855 			umtxq_unlock(&uq->uq_key);
1856 		}
1857 	}
1858 
1859 	umtxq_lock(&uq->uq_key);
1860 	umtx_pi_unref(pi);
1861 	umtxq_unlock(&uq->uq_key);
1862 
1863 	umtx_key_release(&uq->uq_key);
1864 	return (error);
1865 }
1866 
1867 /*
1868  * Unlock a PI mutex.
1869  */
1870 static int
1871 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1872 {
1873 	struct umtx_key key;
1874 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1875 	struct umtx_pi *pi, *pi2;
1876 	uint32_t owner, old, id;
1877 	int error;
1878 	int count;
1879 	int pri;
1880 
1881 	id = td->td_tid;
1882 	/*
1883 	 * Make sure we own this mtx.
1884 	 */
1885 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1886 	if (owner == -1)
1887 		return (EFAULT);
1888 
1889 	if ((owner & ~UMUTEX_CONTESTED) != id)
1890 		return (EPERM);
1891 
1892 	/* This should be done in userland */
1893 	if ((owner & UMUTEX_CONTESTED) == 0) {
1894 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1895 		if (old == -1)
1896 			return (EFAULT);
1897 		if (old == owner)
1898 			return (0);
1899 		owner = old;
1900 	}
1901 
1902 	/* We should only ever be in here for contested locks */
1903 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1904 	    &key)) != 0)
1905 		return (error);
1906 
1907 	umtxq_lock(&key);
1908 	umtxq_busy(&key);
1909 	count = umtxq_count_pi(&key, &uq_first);
1910 	if (uq_first != NULL) {
1911 		mtx_lock_spin(&umtx_lock);
1912 		pi = uq_first->uq_pi_blocked;
1913 		KASSERT(pi != NULL, ("pi == NULL?"));
1914 		if (pi->pi_owner != curthread) {
1915 			mtx_unlock_spin(&umtx_lock);
1916 			umtxq_unbusy(&key);
1917 			umtxq_unlock(&key);
1918 			umtx_key_release(&key);
1919 			/* userland messed the mutex */
1920 			return (EPERM);
1921 		}
1922 		uq_me = curthread->td_umtxq;
1923 		pi->pi_owner = NULL;
1924 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1925 		/* Get the highest-priority thread that is still sleeping. */
1926 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1927 		while (uq_first != NULL &&
1928 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1929 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1930 		}
1931 		pri = PRI_MAX;
1932 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1933 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1934 			if (uq_first2 != NULL) {
1935 				if (pri > UPRI(uq_first2->uq_thread))
1936 					pri = UPRI(uq_first2->uq_thread);
1937 			}
1938 		}
1939 		thread_lock(curthread);
1940 		sched_unlend_user_prio(curthread, pri);
1941 		thread_unlock(curthread);
1942 		mtx_unlock_spin(&umtx_lock);
1943 		if (uq_first)
1944 			umtxq_signal_thread(uq_first);
1945 	}
1946 	umtxq_unlock(&key);
1947 
1948 	/*
1949 	 * When unlocking the umtx, it must be marked as unowned if
1950 	 * at most one thread is waiting for it.
1951 	 * Otherwise, it must be marked as contested.
1952 	 */
1953 	old = casuword32(&m->m_owner, owner,
1954 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1955 
1956 	umtxq_lock(&key);
1957 	umtxq_unbusy(&key);
1958 	umtxq_unlock(&key);
1959 	umtx_key_release(&key);
1960 	if (old == -1)
1961 		return (EFAULT);
1962 	if (old != owner)
1963 		return (EINVAL);
1964 	return (0);
1965 }
1966 
1967 /*
1968  * Lock a PP mutex.
1969  */
1970 static int
1971 _do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1972 	int try)
1973 {
1974 	struct umtx_q *uq, *uq2;
1975 	struct umtx_pi *pi;
1976 	uint32_t ceiling;
1977 	uint32_t owner, id;
1978 	int error, pri, old_inherited_pri, su;
1979 
1980 	id = td->td_tid;
1981 	uq = td->td_umtxq;
1982 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1983 	    &uq->uq_key)) != 0)
1984 		return (error);
1985 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1986 	for (;;) {
1987 		old_inherited_pri = uq->uq_inherited_pri;
1988 		umtxq_lock(&uq->uq_key);
1989 		umtxq_busy(&uq->uq_key);
1990 		umtxq_unlock(&uq->uq_key);
1991 
1992 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1993 		if (ceiling > RTP_PRIO_MAX) {
1994 			error = EINVAL;
1995 			goto out;
1996 		}
1997 
1998 		mtx_lock_spin(&umtx_lock);
1999 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2000 			mtx_unlock_spin(&umtx_lock);
2001 			error = EINVAL;
2002 			goto out;
2003 		}
2004 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2005 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2006 			thread_lock(td);
2007 			if (uq->uq_inherited_pri < UPRI(td))
2008 				sched_lend_user_prio(td, uq->uq_inherited_pri);
2009 			thread_unlock(td);
2010 		}
2011 		mtx_unlock_spin(&umtx_lock);
2012 
2013 		owner = casuword32(&m->m_owner,
2014 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2015 
2016 		if (owner == UMUTEX_CONTESTED) {
2017 			error = 0;
2018 			break;
2019 		}
2020 
2021 		/* The address was invalid. */
2022 		if (owner == -1) {
2023 			error = EFAULT;
2024 			break;
2025 		}
2026 
2027 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2028 		    (owner & ~UMUTEX_CONTESTED) == id) {
2029 			error = EDEADLK;
2030 			break;
2031 		}
2032 
2033 		if (try != 0) {
2034 			error = EBUSY;
2035 			break;
2036 		}
2037 
2038 		/*
2039 		 * If we caught a signal, we have already retried and now
2040 		 * exit immediately.
2041 		 */
2042 		if (error != 0)
2043 			break;
2044 
2045 		umtxq_lock(&uq->uq_key);
2046 		umtxq_insert(uq);
2047 		umtxq_unbusy(&uq->uq_key);
2048 		error = umtxq_sleep(uq, "umtxpp", timo);
2049 		umtxq_remove(uq);
2050 		umtxq_unlock(&uq->uq_key);
2051 
2052 		mtx_lock_spin(&umtx_lock);
2053 		uq->uq_inherited_pri = old_inherited_pri;
2054 		pri = PRI_MAX;
2055 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2056 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2057 			if (uq2 != NULL) {
2058 				if (pri > UPRI(uq2->uq_thread))
2059 					pri = UPRI(uq2->uq_thread);
2060 			}
2061 		}
2062 		if (pri > uq->uq_inherited_pri)
2063 			pri = uq->uq_inherited_pri;
2064 		thread_lock(td);
2065 		sched_unlend_user_prio(td, pri);
2066 		thread_unlock(td);
2067 		mtx_unlock_spin(&umtx_lock);
2068 	}
2069 
2070 	if (error != 0) {
2071 		mtx_lock_spin(&umtx_lock);
2072 		uq->uq_inherited_pri = old_inherited_pri;
2073 		pri = PRI_MAX;
2074 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2075 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2076 			if (uq2 != NULL) {
2077 				if (pri > UPRI(uq2->uq_thread))
2078 					pri = UPRI(uq2->uq_thread);
2079 			}
2080 		}
2081 		if (pri > uq->uq_inherited_pri)
2082 			pri = uq->uq_inherited_pri;
2083 		thread_lock(td);
2084 		sched_unlend_user_prio(td, pri);
2085 		thread_unlock(td);
2086 		mtx_unlock_spin(&umtx_lock);
2087 	}
2088 
2089 out:
2090 	umtxq_lock(&uq->uq_key);
2091 	umtxq_unbusy(&uq->uq_key);
2092 	umtxq_unlock(&uq->uq_key);
2093 	umtx_key_release(&uq->uq_key);
2094 	return (error);
2095 }
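
/*
 * Editor's sketch (not compiled; guarded by #if 0): the ceiling
 * arithmetic used by _do_lock_pp() above.  A user-supplied ceiling is
 * an rtprio value in [0, RTP_PRIO_MAX] where larger means more
 * important; the kernel inverts it and offsets it into the realtime
 * priority range, where a smaller number means a higher priority.  The
 * EX_* constants below are placeholders, not the real definitions from
 * <sys/rtprio.h> and <sys/priority.h>.
 */
#if 0
#include <stdint.h>

#define EX_RTP_PRIO_MAX		31	/* placeholder for RTP_PRIO_MAX */
#define EX_PRI_MIN_REALTIME	128	/* placeholder for PRI_MIN_REALTIME */

static int
pp_ceiling_to_pri(uint32_t user_ceiling)
{
	uint32_t inverted;

	inverted = EX_RTP_PRIO_MAX - user_ceiling;
	/* Unsigned wrap-around catches user_ceiling > RTP_PRIO_MAX. */
	if (inverted > EX_RTP_PRIO_MAX)
		return (-1);		/* the caller maps this to EINVAL */
	return (EX_PRI_MIN_REALTIME + inverted);
}
#endif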
2096 
2097 /*
2098  * Unlock a PP mutex.
2099  */
2100 static int
2101 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2102 {
2103 	struct umtx_key key;
2104 	struct umtx_q *uq, *uq2;
2105 	struct umtx_pi *pi;
2106 	uint32_t owner, id;
2107 	uint32_t rceiling;
2108 	int error, pri, new_inherited_pri, su;
2109 
2110 	id = td->td_tid;
2111 	uq = td->td_umtxq;
2112 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2113 
2114 	/*
2115 	 * Make sure we own this mtx.
2116 	 */
2117 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2118 	if (owner == -1)
2119 		return (EFAULT);
2120 
2121 	if ((owner & ~UMUTEX_CONTESTED) != id)
2122 		return (EPERM);
2123 
2124 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2125 	if (error != 0)
2126 		return (error);
2127 
2128 	if (rceiling == -1)
2129 		new_inherited_pri = PRI_MAX;
2130 	else {
2131 		rceiling = RTP_PRIO_MAX - rceiling;
2132 		if (rceiling > RTP_PRIO_MAX)
2133 			return (EINVAL);
2134 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2135 	}
2136 
2137 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2138 	    &key)) != 0)
2139 		return (error);
2140 	umtxq_lock(&key);
2141 	umtxq_busy(&key);
2142 	umtxq_unlock(&key);
2143 	/*
2144 	 * For a priority-protected mutex, always set the unlocked
2145 	 * state to UMUTEX_CONTESTED so that userland always enters
2146 	 * the kernel to lock the mutex; this is necessary because
2147 	 * the thread's priority has to be adjusted for such a mutex.
2148 	 */
2149 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2150 		UMUTEX_CONTESTED);
2151 
2152 	umtxq_lock(&key);
2153 	if (error == 0)
2154 		umtxq_signal(&key, 1);
2155 	umtxq_unbusy(&key);
2156 	umtxq_unlock(&key);
2157 
2158 	if (error == -1)
2159 		error = EFAULT;
2160 	else {
2161 		mtx_lock_spin(&umtx_lock);
2162 		if (su != 0)
2163 			uq->uq_inherited_pri = new_inherited_pri;
2164 		pri = PRI_MAX;
2165 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2166 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2167 			if (uq2 != NULL) {
2168 				if (pri > UPRI(uq2->uq_thread))
2169 					pri = UPRI(uq2->uq_thread);
2170 			}
2171 		}
2172 		if (pri > uq->uq_inherited_pri)
2173 			pri = uq->uq_inherited_pri;
2174 		thread_lock(td);
2175 		sched_unlend_user_prio(td, pri);
2176 		thread_unlock(td);
2177 		mtx_unlock_spin(&umtx_lock);
2178 	}
2179 	umtx_key_release(&key);
2180 	return (error);
2181 }
2182 
2183 static int
2184 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2185 	uint32_t *old_ceiling)
2186 {
2187 	struct umtx_q *uq;
2188 	uint32_t save_ceiling;
2189 	uint32_t owner, id;
2190 	uint32_t flags;
2191 	int error;
2192 
2193 	flags = fuword32(&m->m_flags);
2194 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2195 		return (EINVAL);
2196 	if (ceiling > RTP_PRIO_MAX)
2197 		return (EINVAL);
2198 	id = td->td_tid;
2199 	uq = td->td_umtxq;
2200 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2201 	   &uq->uq_key)) != 0)
2202 		return (error);
2203 	for (;;) {
2204 		umtxq_lock(&uq->uq_key);
2205 		umtxq_busy(&uq->uq_key);
2206 		umtxq_unlock(&uq->uq_key);
2207 
2208 		save_ceiling = fuword32(&m->m_ceilings[0]);
2209 
2210 		owner = casuword32(&m->m_owner,
2211 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2212 
2213 		if (owner == UMUTEX_CONTESTED) {
2214 			suword32(&m->m_ceilings[0], ceiling);
2215 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2216 				UMUTEX_CONTESTED);
2217 			error = 0;
2218 			break;
2219 		}
2220 
2221 		/* The address was invalid. */
2222 		if (owner == -1) {
2223 			error = EFAULT;
2224 			break;
2225 		}
2226 
2227 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2228 			suword32(&m->m_ceilings[0], ceiling);
2229 			error = 0;
2230 			break;
2231 		}
2232 
2233 		/*
2234 		 * A previous sleep was interrupted by a signal or timed
2235 		 * out; we have already retried the lock, so exit now.
2236 		 */
2237 		if (error != 0)
2238 			break;
2239 
2240 		/*
2241 		 * The mutex is held by another thread, or we lost a race
2242 		 * with the unlocking thread; sleep and retry after we
2243 		 * are woken up.
2244 		 */
2245 		umtxq_lock(&uq->uq_key);
2246 		umtxq_insert(uq);
2247 		umtxq_unbusy(&uq->uq_key);
2248 		error = umtxq_sleep(uq, "umtxpp", 0);
2249 		umtxq_remove(uq);
2250 		umtxq_unlock(&uq->uq_key);
2251 	}
2252 	umtxq_lock(&uq->uq_key);
2253 	if (error == 0)
2254 		umtxq_signal(&uq->uq_key, INT_MAX);
2255 	umtxq_unbusy(&uq->uq_key);
2256 	umtxq_unlock(&uq->uq_key);
2257 	umtx_key_release(&uq->uq_key);
2258 	if (error == 0 && old_ceiling != NULL)
2259 		suword32(old_ceiling, save_ceiling);
2260 	return (error);
2261 }
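
/*
 * Usage sketch (not compiled): do_set_ceiling() is the kernel side of
 * pthread_mutex_setprioceiling() on a PTHREAD_PRIO_PROTECT mutex,
 * reached through the UMTX_OP_SET_CEILING entry in op_table below; the
 * old ceiling is written back through the last argument, mirroring the
 * suword32(old_ceiling, save_ceiling) above.
 */
#if 0
#include <pthread.h>

static int
bump_ceiling(pthread_mutex_t *m, int new_ceiling, int *old_ceiling)
{
	/* Locks the mutex, swaps the ceiling, and wakes any waiters. */
	return (pthread_mutex_setprioceiling(m, new_ceiling, old_ceiling));
}
#endif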
2262 
2263 static int
2264 _do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2265 	int mode)
2266 {
2267 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2268 	case 0:
2269 		return (_do_lock_normal(td, m, flags, timo, mode));
2270 	case UMUTEX_PRIO_INHERIT:
2271 		return (_do_lock_pi(td, m, flags, timo, mode));
2272 	case UMUTEX_PRIO_PROTECT:
2273 		return (_do_lock_pp(td, m, flags, timo, mode));
2274 	}
2275 	return (EINVAL);
2276 }
2277 
2278 /*
2279  * Lock a userland POSIX mutex.
2280  */
2281 static int
2282 do_lock_umutex(struct thread *td, struct umutex *m,
2283 	struct timespec *timeout, int mode)
2284 {
2285 	struct timespec ts, ts2, ts3;
2286 	struct timeval tv;
2287 	uint32_t flags;
2288 	int error;
2289 
2290 	flags = fuword32(&m->m_flags);
2291 	if (flags == -1)
2292 		return (EFAULT);
2293 
2294 	if (timeout == NULL) {
2295 		error = _do_lock_umutex(td, m, flags, 0, mode);
2296 		/* Mutex locking is restarted if it is interrupted. */
2297 		if (error == EINTR && mode != _UMUTEX_WAIT)
2298 			error = ERESTART;
2299 	} else {
2300 		getnanouptime(&ts);
2301 		timespecadd(&ts, timeout);
2302 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2303 		for (;;) {
2304 			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
2305 			if (error != ETIMEDOUT)
2306 				break;
2307 			getnanouptime(&ts2);
2308 			if (timespeccmp(&ts2, &ts, >=)) {
2309 				error = ETIMEDOUT;
2310 				break;
2311 			}
2312 			ts3 = ts;
2313 			timespecsub(&ts3, &ts2);
2314 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2315 		}
2316 		/* Timed-locking is not restarted. */
2317 		if (error == ERESTART)
2318 			error = EINTR;
2319 	}
2320 	return (error);
2321 }
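
/*
 * Editor's sketch (not compiled): the deadline-retry pattern used by
 * do_lock_umutex() above, restated in userland terms.  A relative
 * timeout is converted into an absolute deadline on a monotonic clock,
 * and the remaining interval is recomputed after every ETIMEDOUT
 * wakeup.  wait_once() is a hypothetical stand-in for a single bounded
 * sleep such as _do_lock_umutex()/umtxq_sleep().
 */
#if 0
#include <errno.h>
#include <time.h>

extern int wait_once(const struct timespec *rel);	/* hypothetical */

static int
wait_with_deadline(const struct timespec *timeout)
{
	struct timespec deadline, now, rel;
	int error;

	clock_gettime(CLOCK_MONOTONIC, &deadline);
	deadline.tv_sec += timeout->tv_sec;
	deadline.tv_nsec += timeout->tv_nsec;
	if (deadline.tv_nsec >= 1000000000) {
		deadline.tv_sec++;
		deadline.tv_nsec -= 1000000000;
	}
	rel = *timeout;
	for (;;) {
		error = wait_once(&rel);
		if (error != ETIMEDOUT)
			return (error);
		clock_gettime(CLOCK_MONOTONIC, &now);
		/* Deadline reached: report the timeout for real. */
		if (now.tv_sec > deadline.tv_sec ||
		    (now.tv_sec == deadline.tv_sec &&
		    now.tv_nsec >= deadline.tv_nsec))
			return (ETIMEDOUT);
		/* Otherwise sleep again for whatever time is left. */
		rel.tv_sec = deadline.tv_sec - now.tv_sec;
		rel.tv_nsec = deadline.tv_nsec - now.tv_nsec;
		if (rel.tv_nsec < 0) {
			rel.tv_sec--;
			rel.tv_nsec += 1000000000;
		}
	}
}
#endif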
2322 
2323 /*
2324  * Unlock a userland POSIX mutex.
2325  */
2326 static int
2327 do_unlock_umutex(struct thread *td, struct umutex *m)
2328 {
2329 	uint32_t flags;
2330 
2331 	flags = fuword32(&m->m_flags);
2332 	if (flags == -1)
2333 		return (EFAULT);
2334 
2335 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2336 	case 0:
2337 		return (do_unlock_normal(td, m, flags));
2338 	case UMUTEX_PRIO_INHERIT:
2339 		return (do_unlock_pi(td, m, flags));
2340 	case UMUTEX_PRIO_PROTECT:
2341 		return (do_unlock_pp(td, m, flags));
2342 	}
2343 
2344 	return (EINVAL);
2345 }
2346 
2347 static int
2348 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2349 	struct timespec *timeout, u_long wflags)
2350 {
2351 	struct umtx_q *uq;
2352 	struct timeval tv;
2353 	struct timespec cts, ets, tts;
2354 	uint32_t flags;
2355 	int error;
2356 
2357 	uq = td->td_umtxq;
2358 	flags = fuword32(&cv->c_flags);
2359 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2360 	if (error != 0)
2361 		return (error);
2362 	umtxq_lock(&uq->uq_key);
2363 	umtxq_busy(&uq->uq_key);
2364 	umtxq_insert(uq);
2365 	umtxq_unlock(&uq->uq_key);
2366 
2367 	/*
2368 	 * Crucial ordering: c_has_waiters must be set to 1 before the
2369 	 * user mutex is released, so a signaling thread sees the waiter.
2370 	 */
2371 	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2372 
2373 	umtxq_lock(&uq->uq_key);
2374 	umtxq_unbusy(&uq->uq_key);
2375 	umtxq_unlock(&uq->uq_key);
2376 
2377 	error = do_unlock_umutex(td, m);
2378 
2379 	umtxq_lock(&uq->uq_key);
2380 	if (error == 0) {
2381 		if ((wflags & UMTX_CHECK_UNPARKING) &&
2382 		    (td->td_pflags & TDP_WAKEUP)) {
2383 			td->td_pflags &= ~TDP_WAKEUP;
2384 			error = EINTR;
2385 		} else if (timeout == NULL) {
2386 			error = umtxq_sleep(uq, "ucond", 0);
2387 		} else {
2388 			getnanouptime(&ets);
2389 			timespecadd(&ets, timeout);
2390 			TIMESPEC_TO_TIMEVAL(&tv, timeout);
2391 			for (;;) {
2392 				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2393 				if (error != ETIMEDOUT)
2394 					break;
2395 				getnanouptime(&cts);
2396 				if (timespeccmp(&cts, &ets, >=)) {
2397 					error = ETIMEDOUT;
2398 					break;
2399 				}
2400 				tts = ets;
2401 				timespecsub(&tts, &cts);
2402 				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2403 			}
2404 		}
2405 	}
2406 
2407 	if (error != 0) {
2408 		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2409 			/*
2410 			 * If do_cv_signal() woke us concurrently with an
2411 			 * error, a UNIX signal, or a timeout, perform
2412 			 * another umtxq_signal() so the wakeup is passed
2413 			 * on rather than consumed.  This may cause a
2414 			 * spurious wakeup of another thread which was
2415 			 * just queued, but SUSv3 explicitly allows
2416 			 * spurious wakeups, and a kernel-based
2417 			 * implementation cannot avoid them anyway.
2418 			 */
2419 			if (!umtxq_signal(&uq->uq_key, 1))
2420 				error = 0;
2421 		}
2422 		if (error == ERESTART)
2423 			error = EINTR;
2424 	}
2425 	umtxq_remove(uq);
2426 	umtxq_unlock(&uq->uq_key);
2427 	umtx_key_release(&uq->uq_key);
2428 	return (error);
2429 }
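
/*
 * Editor's sketch (not compiled): why do_cv_wait() sets c_has_waiters
 * before dropping the mutex.  A signaler typically checks the flag and
 * only enters the kernel when a waiter is visible; if the flag were set
 * after the mutex was released, a signal sent in that window would be
 * lost.  The structure and kernel_cv_signal() are hypothetical
 * stand-ins, not the libthr implementation.
 */
#if 0
#include <stdatomic.h>

struct cv_sketch {
	_Atomic unsigned int has_waiters;
};

extern int kernel_cv_signal(struct cv_sketch *cv);	/* hypothetical */

static void
cv_signal_fastpath(struct cv_sketch *cv)
{
	/* Pay for the kernel call only when a waiter is visible. */
	if (atomic_load(&cv->has_waiters) != 0)
		(void)kernel_cv_signal(cv);
}
#endif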
2430 
2431 /*
2432  * Signal a userland condition variable.
2433  */
2434 static int
2435 do_cv_signal(struct thread *td, struct ucond *cv)
2436 {
2437 	struct umtx_key key;
2438 	int error, cnt, nwake;
2439 	uint32_t flags;
2440 
2441 	flags = fuword32(&cv->c_flags);
2442 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2443 		return (error);
2444 	umtxq_lock(&key);
2445 	umtxq_busy(&key);
2446 	cnt = umtxq_count(&key);
2447 	nwake = umtxq_signal(&key, 1);
2448 	if (cnt <= nwake) {
2449 		umtxq_unlock(&key);
2450 		error = suword32(
2451 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2452 		umtxq_lock(&key);
2453 	}
2454 	umtxq_unbusy(&key);
2455 	umtxq_unlock(&key);
2456 	umtx_key_release(&key);
2457 	return (error);
2458 }
2459 
2460 static int
2461 do_cv_broadcast(struct thread *td, struct ucond *cv)
2462 {
2463 	struct umtx_key key;
2464 	int error;
2465 	uint32_t flags;
2466 
2467 	flags = fuword32(&cv->c_flags);
2468 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2469 		return (error);
2470 
2471 	umtxq_lock(&key);
2472 	umtxq_busy(&key);
2473 	umtxq_signal(&key, INT_MAX);
2474 	umtxq_unlock(&key);
2475 
2476 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2477 
2478 	umtxq_lock(&key);
2479 	umtxq_unbusy(&key);
2480 	umtxq_unlock(&key);
2481 
2482 	umtx_key_release(&key);
2483 	return (error);
2484 }
2485 
2486 static int
2487 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2488 {
2489 	struct umtx_q *uq;
2490 	uint32_t flags, wrflags;
2491 	int32_t state, oldstate;
2492 	int32_t blocked_readers;
2493 	int error;
2494 
2495 	uq = td->td_umtxq;
2496 	flags = fuword32(&rwlock->rw_flags);
2497 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2498 	if (error != 0)
2499 		return (error);
2500 
2501 	wrflags = URWLOCK_WRITE_OWNER;
2502 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2503 		wrflags |= URWLOCK_WRITE_WAITERS;
2504 
2505 	for (;;) {
2506 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2507 		/* try to lock it */
2508 		while (!(state & wrflags)) {
2509 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2510 				umtx_key_release(&uq->uq_key);
2511 				return (EAGAIN);
2512 			}
2513 			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2514 			if (oldstate == state) {
2515 				umtx_key_release(&uq->uq_key);
2516 				return (0);
2517 			}
2518 			state = oldstate;
2519 		}
2520 
2521 		if (error)
2522 			break;
2523 
2524 		/* grab monitor lock */
2525 		umtxq_lock(&uq->uq_key);
2526 		umtxq_busy(&uq->uq_key);
2527 		umtxq_unlock(&uq->uq_key);
2528 
2529 		/*
2530 		 * re-read the state, in case it changed between the try-lock above
2531 		 * and the check below
2532 		 */
2533 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2534 
2535 		/* set read contention bit */
2536 		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2537 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2538 			if (oldstate == state)
2539 				goto sleep;
2540 			state = oldstate;
2541 		}
2542 
2543 		/* The state changed while we were setting the flag; restart. */
2544 		if (!(state & wrflags)) {
2545 			umtxq_lock(&uq->uq_key);
2546 			umtxq_unbusy(&uq->uq_key);
2547 			umtxq_unlock(&uq->uq_key);
2548 			continue;
2549 		}
2550 
2551 sleep:
2552 		/* The contention bit is set; bump the read waiter count before sleeping. */
2553 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2554 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2555 
2556 		while (state & wrflags) {
2557 			umtxq_lock(&uq->uq_key);
2558 			umtxq_insert(uq);
2559 			umtxq_unbusy(&uq->uq_key);
2560 
2561 			error = umtxq_sleep(uq, "urdlck", timo);
2562 
2563 			umtxq_busy(&uq->uq_key);
2564 			umtxq_remove(uq);
2565 			umtxq_unlock(&uq->uq_key);
2566 			if (error)
2567 				break;
2568 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2569 		}
2570 
2571 		/* Decrease the read waiter count; the last waiter clears the contention bit. */
2572 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2573 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2574 		if (blocked_readers == 1) {
2575 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2576 			for (;;) {
2577 				oldstate = casuword32(&rwlock->rw_state, state,
2578 					 state & ~URWLOCK_READ_WAITERS);
2579 				if (oldstate == state)
2580 					break;
2581 				state = oldstate;
2582 			}
2583 		}
2584 
2585 		umtxq_lock(&uq->uq_key);
2586 		umtxq_unbusy(&uq->uq_key);
2587 		umtxq_unlock(&uq->uq_key);
2588 	}
2589 	umtx_key_release(&uq->uq_key);
2590 	return (error);
2591 }
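
/*
 * Editor's sketch (not compiled): the read-acquisition CAS loop that
 * do_rw_rdlock() runs against userland memory with casuword32().
 * Readers increment the reader count in the low bits of rw_state as
 * long as no writer bit blocks them.  The EX_* bit values are
 * placeholders standing in for the URWLOCK_* definitions in
 * <sys/umtx.h>.
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define EX_WRITE_OWNER		0x80000000u	/* placeholder */
#define EX_WRITE_WAITERS	0x40000000u	/* placeholder */
#define EX_READER_MASK		0x1fffffffu	/* placeholder */

static bool
try_rdlock(_Atomic uint32_t *state)
{
	uint32_t s = atomic_load(state);

	while ((s & (EX_WRITE_OWNER | EX_WRITE_WAITERS)) == 0) {
		if ((s & EX_READER_MASK) == EX_READER_MASK)
			return (false);	/* reader count saturated: EAGAIN */
		/* On CAS failure, s is reloaded and the loop retries. */
		if (atomic_compare_exchange_weak(state, &s, s + 1))
			return (true);
	}
	return (false);			/* writer active: sleep in kernel */
}
#endif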
2592 
2593 static int
2594 do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2595 {
2596 	struct timespec ts, ts2, ts3;
2597 	struct timeval tv;
2598 	int error;
2599 
2600 	getnanouptime(&ts);
2601 	timespecadd(&ts, timeout);
2602 	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2603 	for (;;) {
2604 		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2605 		if (error != ETIMEDOUT)
2606 			break;
2607 		getnanouptime(&ts2);
2608 		if (timespeccmp(&ts2, &ts, >=)) {
2609 			error = ETIMEDOUT;
2610 			break;
2611 		}
2612 		ts3 = ts;
2613 		timespecsub(&ts3, &ts2);
2614 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2615 	}
2616 	if (error == ERESTART)
2617 		error = EINTR;
2618 	return (error);
2619 }
2620 
2621 static int
2622 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2623 {
2624 	struct umtx_q *uq;
2625 	uint32_t flags;
2626 	int32_t state, oldstate;
2627 	int32_t blocked_writers;
2628 	int32_t blocked_readers;
2629 	int error;
2630 
2631 	uq = td->td_umtxq;
2632 	flags = fuword32(&rwlock->rw_flags);
2633 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2634 	if (error != 0)
2635 		return (error);
2636 
2637 	blocked_readers = 0;
2638 	for (;;) {
2639 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2640 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2641 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2642 			if (oldstate == state) {
2643 				umtx_key_release(&uq->uq_key);
2644 				return (0);
2645 			}
2646 			state = oldstate;
2647 		}
2648 
2649 		if (error) {
2650 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2651 			    blocked_readers != 0) {
2652 				umtxq_lock(&uq->uq_key);
2653 				umtxq_busy(&uq->uq_key);
2654 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2655 				umtxq_unbusy(&uq->uq_key);
2656 				umtxq_unlock(&uq->uq_key);
2657 			}
2658 
2659 			break;
2660 		}
2661 
2662 		/* grab monitor lock */
2663 		umtxq_lock(&uq->uq_key);
2664 		umtxq_busy(&uq->uq_key);
2665 		umtxq_unlock(&uq->uq_key);
2666 
2667 		/*
2668 		 * re-read the state, in case it changed between the try-lock above
2669 		 * and the check below
2670 		 */
2671 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2672 
2673 		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2674 		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2675 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2676 			if (oldstate == state)
2677 				goto sleep;
2678 			state = oldstate;
2679 		}
2680 
2681 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2682 			umtxq_lock(&uq->uq_key);
2683 			umtxq_unbusy(&uq->uq_key);
2684 			umtxq_unlock(&uq->uq_key);
2685 			continue;
2686 		}
2687 sleep:
2688 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2689 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2690 
2691 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2692 			umtxq_lock(&uq->uq_key);
2693 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2694 			umtxq_unbusy(&uq->uq_key);
2695 
2696 			error = umtxq_sleep(uq, "uwrlck", timo);
2697 
2698 			umtxq_busy(&uq->uq_key);
2699 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2700 			umtxq_unlock(&uq->uq_key);
2701 			if (error)
2702 				break;
2703 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2704 		}
2705 
2706 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2707 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2708 		if (blocked_writers == 1) {
2709 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2710 			for (;;) {
2711 				oldstate = casuword32(&rwlock->rw_state, state,
2712 					 state & ~URWLOCK_WRITE_WAITERS);
2713 				if (oldstate == state)
2714 					break;
2715 				state = oldstate;
2716 			}
2717 			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2718 		} else
2719 			blocked_readers = 0;
2720 
2721 		umtxq_lock(&uq->uq_key);
2722 		umtxq_unbusy(&uq->uq_key);
2723 		umtxq_unlock(&uq->uq_key);
2724 	}
2725 
2726 	umtx_key_release(&uq->uq_key);
2727 	return (error);
2728 }
2729 
2730 static int
2731 do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2732 {
2733 	struct timespec ts, ts2, ts3;
2734 	struct timeval tv;
2735 	int error;
2736 
2737 	getnanouptime(&ts);
2738 	timespecadd(&ts, timeout);
2739 	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2740 	for (;;) {
2741 		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2742 		if (error != ETIMEDOUT)
2743 			break;
2744 		getnanouptime(&ts2);
2745 		if (timespeccmp(&ts2, &ts, >=)) {
2746 			error = ETIMEDOUT;
2747 			break;
2748 		}
2749 		ts3 = ts;
2750 		timespecsub(&ts3, &ts2);
2751 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2752 	}
2753 	if (error == ERESTART)
2754 		error = EINTR;
2755 	return (error);
2756 }
2757 
2758 static int
2759 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2760 {
2761 	struct umtx_q *uq;
2762 	uint32_t flags;
2763 	int32_t state, oldstate;
2764 	int error, q, count;
2765 
2766 	uq = td->td_umtxq;
2767 	flags = fuword32(&rwlock->rw_flags);
2768 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2769 	if (error != 0)
2770 		return (error);
2771 
2772 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2773 	if (state & URWLOCK_WRITE_OWNER) {
2774 		for (;;) {
2775 			oldstate = casuword32(&rwlock->rw_state, state,
2776 				state & ~URWLOCK_WRITE_OWNER);
2777 			if (oldstate != state) {
2778 				state = oldstate;
2779 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2780 					error = EPERM;
2781 					goto out;
2782 				}
2783 			} else
2784 				break;
2785 		}
2786 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2787 		for (;;) {
2788 			oldstate = casuword32(&rwlock->rw_state, state,
2789 				state - 1);
2790 			if (oldstate != state) {
2791 				state = oldstate;
2792 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2793 					error = EPERM;
2794 					goto out;
2795 				}
2796 			}
2797 			else
2798 				break;
2799 		}
2800 	} else {
2801 		error = EPERM;
2802 		goto out;
2803 	}
2804 
2805 	count = 0;
2806 
2807 	if (!(flags & URWLOCK_PREFER_READER)) {
2808 		if (state & URWLOCK_WRITE_WAITERS) {
2809 			count = 1;
2810 			q = UMTX_EXCLUSIVE_QUEUE;
2811 		} else if (state & URWLOCK_READ_WAITERS) {
2812 			count = INT_MAX;
2813 			q = UMTX_SHARED_QUEUE;
2814 		}
2815 	} else {
2816 		if (state & URWLOCK_READ_WAITERS) {
2817 			count = INT_MAX;
2818 			q = UMTX_SHARED_QUEUE;
2819 		} else if (state & URWLOCK_WRITE_WAITERS) {
2820 			count = 1;
2821 			q = UMTX_EXCLUSIVE_QUEUE;
2822 		}
2823 	}
2824 
2825 	if (count) {
2826 		umtxq_lock(&uq->uq_key);
2827 		umtxq_busy(&uq->uq_key);
2828 		umtxq_signal_queue(&uq->uq_key, count, q);
2829 		umtxq_unbusy(&uq->uq_key);
2830 		umtxq_unlock(&uq->uq_key);
2831 	}
2832 out:
2833 	umtx_key_release(&uq->uq_key);
2834 	return (error);
2835 }
2836 
2837 static int
2838 do_sem_wait(struct thread *td, struct _usem *sem, struct timespec *timeout)
2839 {
2840 	struct umtx_q *uq;
2841 	struct timeval tv;
2842 	struct timespec cts, ets, tts;
2843 	uint32_t flags, count;
2844 	int error;
2845 
2846 	uq = td->td_umtxq;
2847 	flags = fuword32(&sem->_flags);
2848 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2849 	if (error != 0)
2850 		return (error);
2851 	umtxq_lock(&uq->uq_key);
2852 	umtxq_busy(&uq->uq_key);
2853 	umtxq_insert(uq);
2854 	umtxq_unlock(&uq->uq_key);
2855 
2856 	suword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 1);
2857 
2858 	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2859 	if (count != 0) {
2860 		umtxq_lock(&uq->uq_key);
2861 		umtxq_unbusy(&uq->uq_key);
2862 		umtxq_remove(uq);
2863 		umtxq_unlock(&uq->uq_key);
2864 		umtx_key_release(&uq->uq_key);
2865 		return (0);
2866 	}
2867 
2868 	umtxq_lock(&uq->uq_key);
2869 	umtxq_unbusy(&uq->uq_key);
2870 	umtxq_unlock(&uq->uq_key);
2871 
2872 	umtxq_lock(&uq->uq_key);
2873 	if (timeout == NULL) {
2874 		error = umtxq_sleep(uq, "usem", 0);
2875 	} else {
2876 		getnanouptime(&ets);
2877 		timespecadd(&ets, timeout);
2878 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2879 		for (;;) {
2880 			error = umtxq_sleep(uq, "usem", tvtohz(&tv));
2881 			if (error != ETIMEDOUT)
2882 				break;
2883 			getnanouptime(&cts);
2884 			if (timespeccmp(&cts, &ets, >=)) {
2885 				error = ETIMEDOUT;
2886 				break;
2887 			}
2888 			tts = ets;
2889 			timespecsub(&tts, &cts);
2890 			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2891 		}
2892 	}
2893 
2894 	if (error != 0) {
2895 		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2896 			if (!umtxq_signal(&uq->uq_key, 1))
2897 				error = 0;
2898 		}
2899 		if (error == ERESTART)
2900 			error = EINTR;
2901 	}
2902 	umtxq_remove(uq);
2903 	umtxq_unlock(&uq->uq_key);
2904 	umtx_key_release(&uq->uq_key);
2905 	return (error);
2906 }
2907 
2908 /*
2909  * Wake up threads waiting on a userland semaphore.
2910  */
2911 static int
2912 do_sem_wake(struct thread *td, struct _usem *sem)
2913 {
2914 	struct umtx_key key;
2915 	int error, cnt, nwake;
2916 	uint32_t flags;
2917 
2918 	flags = fuword32(&sem->_flags);
2919 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2920 		return (error);
2921 	umtxq_lock(&key);
2922 	umtxq_busy(&key);
2923 	cnt = umtxq_count(&key);
2924 	nwake = umtxq_signal(&key, 1);
2925 	if (cnt <= nwake) {
2926 		umtxq_unlock(&key);
2927 		error = suword32(
2928 		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2929 		umtxq_lock(&key);
2930 	}
2931 	umtxq_unbusy(&key);
2932 	umtxq_unlock(&key);
2933 	umtx_key_release(&key);
2934 	return (error);
2935 }
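
/*
 * Editor's sketch (not compiled): the userland protocol served by
 * do_sem_wait()/do_sem_wake().  A post increments _count and only
 * enters the kernel while _has_waiters is set; the structure and
 * kernel_sem_wake() are hypothetical stand-ins, not libthr code.
 */
#if 0
#include <stdatomic.h>

struct sem_sketch {
	_Atomic unsigned int count;
	_Atomic unsigned int has_waiters;
};

extern int kernel_sem_wake(struct sem_sketch *sem);	/* hypothetical */

static void
sem_post_fastpath(struct sem_sketch *sem)
{
	atomic_fetch_add(&sem->count, 1);
	/* Matches the wake + _has_waiters clearing in do_sem_wake(). */
	if (atomic_load(&sem->has_waiters) != 0)
		(void)kernel_sem_wake(sem);
}
#endif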
2936 
2937 int
2938 _umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2939     /* struct umtx *umtx */
2940 {
2941 	return (_do_lock_umtx(td, uap->umtx, td->td_tid, 0));
2942 }
2943 
2944 int
2945 _umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2946     /* struct umtx *umtx */
2947 {
2948 	return (do_unlock_umtx(td, uap->umtx, td->td_tid));
2949 }
2950 
2951 static int
2952 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2953 {
2954 	struct timespec *ts, timeout;
2955 	int error;
2956 
2957 	/* Allow a null timespec (wait forever). */
2958 	if (uap->uaddr2 == NULL)
2959 		ts = NULL;
2960 	else {
2961 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2962 		if (error != 0)
2963 			return (error);
2964 		if (timeout.tv_nsec >= 1000000000 ||
2965 		    timeout.tv_nsec < 0) {
2966 			return (EINVAL);
2967 		}
2968 		ts = &timeout;
2969 	}
2970 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2971 }
2972 
2973 static int
2974 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2975 {
2976 	return (do_unlock_umtx(td, uap->obj, uap->val));
2977 }
2978 
2979 static int
2980 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2981 {
2982 	struct timespec *ts, timeout;
2983 	int error;
2984 
2985 	if (uap->uaddr2 == NULL)
2986 		ts = NULL;
2987 	else {
2988 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2989 		if (error != 0)
2990 			return (error);
2991 		if (timeout.tv_nsec >= 1000000000 ||
2992 		    timeout.tv_nsec < 0)
2993 			return (EINVAL);
2994 		ts = &timeout;
2995 	}
2996 	return (do_wait(td, uap->obj, uap->val, ts, 0, 0));
2997 }
2998 
2999 static int
3000 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3001 {
3002 	struct timespec *ts, timeout;
3003 	int error;
3004 
3005 	if (uap->uaddr2 == NULL)
3006 		ts = NULL;
3007 	else {
3008 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
3009 		if (error != 0)
3010 			return (error);
3011 		if (timeout.tv_nsec >= 1000000000 ||
3012 		    timeout.tv_nsec < 0)
3013 			return (EINVAL);
3014 		ts = &timeout;
3015 	}
3016 	return (do_wait(td, uap->obj, uap->val, ts, 1, 0));
3017 }
3018 
3019 static int
3020 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3021 {
3022 	struct timespec *ts, timeout;
3023 	int error;
3024 
3025 	if (uap->uaddr2 == NULL)
3026 		ts = NULL;
3027 	else {
3028 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
3029 		if (error != 0)
3030 			return (error);
3031 		if (timeout.tv_nsec >= 1000000000 ||
3032 		    timeout.tv_nsec < 0)
3033 			return (EINVAL);
3034 		ts = &timeout;
3035 	}
3036 	return (do_wait(td, uap->obj, uap->val, ts, 1, 1));
3037 }
3038 
3039 static int
3040 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3041 {
3042 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3043 }
3044 
3045 static int
3046 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3047 {
3048 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3049 }
3050 
3051 static int
3052 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3053 {
3054 	struct timespec *ts, timeout;
3055 	int error;
3056 
3057 	/* Allow a null timespec (wait forever). */
3058 	if (uap->uaddr2 == NULL)
3059 		ts = NULL;
3060 	else {
3061 		error = copyin(uap->uaddr2, &timeout,
3062 		    sizeof(timeout));
3063 		if (error != 0)
3064 			return (error);
3065 		if (timeout.tv_nsec >= 1000000000 ||
3066 		    timeout.tv_nsec < 0) {
3067 			return (EINVAL);
3068 		}
3069 		ts = &timeout;
3070 	}
3071 	return (do_lock_umutex(td, uap->obj, ts, 0));
3072 }
3073 
3074 static int
3075 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3076 {
3077 	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3078 }
3079 
3080 static int
3081 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3082 {
3083 	struct timespec *ts, timeout;
3084 	int error;
3085 
3086 	/* Allow a null timespec (wait forever). */
3087 	if (uap->uaddr2 == NULL)
3088 		ts = NULL;
3089 	else {
3090 		error = copyin(uap->uaddr2, &timeout,
3091 		    sizeof(timeout));
3092 		if (error != 0)
3093 			return (error);
3094 		if (timeout.tv_nsec >= 1000000000 ||
3095 		    timeout.tv_nsec < 0) {
3096 			return (EINVAL);
3097 		}
3098 		ts = &timeout;
3099 	}
3100 	return (do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT));
3101 }
3102 
3103 static int
3104 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3105 {
3106 	return (do_wake_umutex(td, uap->obj));
3107 }
3108 
3109 static int
3110 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3111 {
3112 	return (do_unlock_umutex(td, uap->obj));
3113 }
3114 
3115 static int
3116 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3117 {
3118 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3119 }
3120 
3121 static int
3122 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3123 {
3124 	struct timespec *ts, timeout;
3125 	int error;
3126 
3127 	/* Allow a null timespec (wait forever). */
3128 	if (uap->uaddr2 == NULL)
3129 		ts = NULL;
3130 	else {
3131 		error = copyin(uap->uaddr2, &timeout,
3132 		    sizeof(timeout));
3133 		if (error != 0)
3134 			return (error);
3135 		if (timeout.tv_nsec >= 1000000000 ||
3136 		    timeout.tv_nsec < 0) {
3137 			return (EINVAL);
3138 		}
3139 		ts = &timeout;
3140 	}
3141 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3142 }
3143 
3144 static int
3145 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3146 {
3147 	return (do_cv_signal(td, uap->obj));
3148 }
3149 
3150 static int
3151 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3152 {
3153 	return (do_cv_broadcast(td, uap->obj));
3154 }
3155 
3156 static int
3157 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3158 {
3159 	struct timespec timeout;
3160 	int error;
3161 
3162 	/* Allow a null timespec (wait forever). */
3163 	if (uap->uaddr2 == NULL) {
3164 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3165 	} else {
3166 		error = copyin(uap->uaddr2, &timeout,
3167 		    sizeof(timeout));
3168 		if (error != 0)
3169 			return (error);
3170 		if (timeout.tv_nsec >= 1000000000 ||
3171 		    timeout.tv_nsec < 0) {
3172 			return (EINVAL);
3173 		}
3174 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3175 	}
3176 	return (error);
3177 }
3178 
3179 static int
3180 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3181 {
3182 	struct timespec timeout;
3183 	int error;
3184 
3185 	/* Allow a null timespec (wait forever). */
3186 	if (uap->uaddr2 == NULL) {
3187 		error = do_rw_wrlock(td, uap->obj, 0);
3188 	} else {
3189 		error = copyin(uap->uaddr2, &timeout,
3190 		    sizeof(timeout));
3191 		if (error != 0)
3192 			return (error);
3193 		if (timeout.tv_nsec >= 1000000000 ||
3194 		    timeout.tv_nsec < 0) {
3195 			return (EINVAL);
3196 		}
3197 
3198 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3199 	}
3200 	return (error);
3201 }
3202 
3203 static int
3204 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3205 {
3206 	return (do_rw_unlock(td, uap->obj));
3207 }
3208 
3209 static int
3210 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3211 {
3212 	struct timespec *ts, timeout;
3213 	int error;
3214 
3215 	/* Allow a null timespec (wait forever). */
3216 	if (uap->uaddr2 == NULL)
3217 		ts = NULL;
3218 	else {
3219 		error = copyin(uap->uaddr2, &timeout,
3220 		    sizeof(timeout));
3221 		if (error != 0)
3222 			return (error);
3223 		if (timeout.tv_nsec >= 1000000000 ||
3224 		    timeout.tv_nsec < 0) {
3225 			return (EINVAL);
3226 		}
3227 		ts = &timeout;
3228 	}
3229 	return (do_sem_wait(td, uap->obj, ts));
3230 }
3231 
3232 static int
3233 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3234 {
3235 	return (do_sem_wake(td, uap->obj));
3236 }
3237 
3238 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3239 
3240 static _umtx_op_func op_table[] = {
3241 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3242 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3243 	__umtx_op_wait,			/* UMTX_OP_WAIT */
3244 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3245 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3246 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3247 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3248 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3249 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT*/
3250 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3251 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3252 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3253 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3254 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3255 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3256 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3257 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3258 	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3259 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3260 	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3261 	__umtx_op_sem_wake		/* UMTX_OP_SEM_WAKE */
3262 };
3263 
3264 int
3265 _umtx_op(struct thread *td, struct _umtx_op_args *uap)
3266 {
3267 	if ((unsigned)uap->op < UMTX_OP_MAX)
3268 		return (*op_table[uap->op])(td, uap);
3269 	return (EINVAL);
3270 }
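
/*
 * Usage sketch (not compiled): driving the op_table dispatch above
 * from userland.  UMTX_OP_WAIT sleeps only while the word still holds
 * the value passed in val, and UMTX_OP_WAKE wakes up to val waiters,
 * so together they give a futex-style wait/wake on a plain word.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>

static void
wait_until_nonzero(u_long *word)
{
	while (*word == 0)
		(void)_umtx_op(word, UMTX_OP_WAIT, 0, NULL, NULL);
}

static void
set_and_wake_one(u_long *word)
{
	*word = 1;
	(void)_umtx_op(word, UMTX_OP_WAKE, 1, NULL, NULL);
}
#endif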
3271 
3272 #ifdef COMPAT_FREEBSD32
3273 int
3274 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3275     /* struct umtx *umtx */
3276 {
3277 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3278 }
3279 
3280 int
3281 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3282     /* struct umtx *umtx */
3283 {
3284 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3285 }
3286 
3287 struct timespec32 {
3288 	u_int32_t tv_sec;
3289 	u_int32_t tv_nsec;
3290 };
3291 
3292 static inline int
3293 copyin_timeout32(void *addr, struct timespec *tsp)
3294 {
3295 	struct timespec32 ts32;
3296 	int error;
3297 
3298 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3299 	if (error == 0) {
3300 		tsp->tv_sec = ts32.tv_sec;
3301 		tsp->tv_nsec = ts32.tv_nsec;
3302 	}
3303 	return (error);
3304 }
3305 
3306 static int
3307 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3308 {
3309 	struct timespec *ts, timeout;
3310 	int error;
3311 
3312 	/* Allow a null timespec (wait forever). */
3313 	if (uap->uaddr2 == NULL)
3314 		ts = NULL;
3315 	else {
3316 		error = copyin_timeout32(uap->uaddr2, &timeout);
3317 		if (error != 0)
3318 			return (error);
3319 		if (timeout.tv_nsec >= 1000000000 ||
3320 		    timeout.tv_nsec < 0) {
3321 			return (EINVAL);
3322 		}
3323 		ts = &timeout;
3324 	}
3325 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3326 }
3327 
3328 static int
3329 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3330 {
3331 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3332 }
3333 
3334 static int
3335 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3336 {
3337 	struct timespec *ts, timeout;
3338 	int error;
3339 
3340 	if (uap->uaddr2 == NULL)
3341 		ts = NULL;
3342 	else {
3343 		error = copyin_timeout32(uap->uaddr2, &timeout);
3344 		if (error != 0)
3345 			return (error);
3346 		if (timeout.tv_nsec >= 1000000000 ||
3347 		    timeout.tv_nsec < 0)
3348 			return (EINVAL);
3349 		ts = &timeout;
3350 	}
3351 	return (do_wait(td, uap->obj, uap->val, ts, 1, 0));
3352 }
3353 
3354 static int
3355 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3356 {
3357 	struct timespec *ts, timeout;
3358 	int error;
3359 
3360 	/* Allow a null timespec (wait forever). */
3361 	if (uap->uaddr2 == NULL)
3362 		ts = NULL;
3363 	else {
3364 		error = copyin_timeout32(uap->uaddr2, &timeout);
3365 		if (error != 0)
3366 			return (error);
3367 		if (timeout.tv_nsec >= 1000000000 ||
3368 		    timeout.tv_nsec < 0)
3369 			return (EINVAL);
3370 		ts = &timeout;
3371 	}
3372 	return (do_lock_umutex(td, uap->obj, ts, 0));
3373 }
3374 
3375 static int
3376 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3377 {
3378 	struct timespec *ts, timeout;
3379 	int error;
3380 
3381 	/* Allow a null timespec (wait forever). */
3382 	if (uap->uaddr2 == NULL)
3383 		ts = NULL;
3384 	else {
3385 		error = copyin_timeout32(uap->uaddr2, &timeout);
3386 		if (error != 0)
3387 			return (error);
3388 		if (timeout.tv_nsec >= 1000000000 ||
3389 		    timeout.tv_nsec < 0)
3390 			return (EINVAL);
3391 		ts = &timeout;
3392 	}
3393 	return (do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT));
3394 }
3395 
3396 static int
3397 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3398 {
3399 	struct timespec *ts, timeout;
3400 	int error;
3401 
3402 	/* Allow a null timespec (wait forever). */
3403 	if (uap->uaddr2 == NULL)
3404 		ts = NULL;
3405 	else {
3406 		error = copyin_timeout32(uap->uaddr2, &timeout);
3407 		if (error != 0)
3408 			return (error);
3409 		if (timeout.tv_nsec >= 1000000000 ||
3410 		    timeout.tv_nsec < 0)
3411 			return (EINVAL);
3412 		ts = &timeout;
3413 	}
3414 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3415 }
3416 
3417 static int
3418 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3419 {
3420 	struct timespec timeout;
3421 	int error;
3422 
3423 	/* Allow a null timespec (wait forever). */
3424 	if (uap->uaddr2 == NULL) {
3425 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3426 	} else {
3427 		error = copyin_timeout32(uap->uaddr2, &timeout);
3429 		if (error != 0)
3430 			return (error);
3431 		if (timeout.tv_nsec >= 1000000000 ||
3432 		    timeout.tv_nsec < 0) {
3433 			return (EINVAL);
3434 		}
3435 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3436 	}
3437 	return (error);
3438 }
3439 
3440 static int
3441 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3442 {
3443 	struct timespec timeout;
3444 	int error;
3445 
3446 	/* Allow a null timespec (wait forever). */
3447 	if (uap->uaddr2 == NULL) {
3448 		error = do_rw_wrlock(td, uap->obj, 0);
3449 	} else {
3450 		error = copyin_timeout32(uap->uaddr2, &timeout);
3451 		if (error != 0)
3452 			return (error);
3453 		if (timeout.tv_nsec >= 1000000000 ||
3454 		    timeout.tv_nsec < 0) {
3455 			return (EINVAL);
3456 		}
3457 
3458 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3459 	}
3460 	return (error);
3461 }
3462 
3463 static int
3464 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3465 {
3466 	struct timespec *ts, timeout;
3467 	int error;
3468 
3469 	if (uap->uaddr2 == NULL)
3470 		ts = NULL;
3471 	else {
3472 		error = copyin_timeout32(uap->uaddr2, &timeout);
3473 		if (error != 0)
3474 			return (error);
3475 		if (timeout.tv_nsec >= 1000000000 ||
3476 		    timeout.tv_nsec < 0)
3477 			return (EINVAL);
3478 		ts = &timeout;
3479 	}
3480 	return (do_wait(td, uap->obj, uap->val, ts, 1, 1));
3481 }
3482 
3483 static int
3484 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3485 {
3486 	struct timespec *ts, timeout;
3487 	int error;
3488 
3489 	/* Allow a null timespec (wait forever). */
3490 	if (uap->uaddr2 == NULL)
3491 		ts = NULL;
3492 	else {
3493 		error = copyin_timeout32(uap->uaddr2, &timeout);
3494 		if (error != 0)
3495 			return (error);
3496 		if (timeout.tv_nsec >= 1000000000 ||
3497 		    timeout.tv_nsec < 0)
3498 			return (EINVAL);
3499 		ts = &timeout;
3500 	}
3501 	return (do_sem_wait(td, uap->obj, ts));
3502 }
3503 
3504 static _umtx_op_func op_table_compat32[] = {
3505 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3506 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3507 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3508 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3509 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3510 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3511 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
3512 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3513 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT*/
3514 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3515 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3516 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3517 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3518 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3519 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3520 	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3521 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3522 	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3523 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3524 	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3525 	__umtx_op_sem_wake		/* UMTX_OP_SEM_WAKE */
3526 };
3527 
3528 int
3529 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3530 {
3531 	if ((unsigned)uap->op < UMTX_OP_MAX)
3532 		return (*op_table_compat32[uap->op])(td,
3533 			(struct _umtx_op_args *)uap);
3534 	return (EINVAL);
3535 }
3536 #endif
3537 
3538 void
3539 umtx_thread_init(struct thread *td)
3540 {
3541 	td->td_umtxq = umtxq_alloc();
3542 	td->td_umtxq->uq_thread = td;
3543 }
3544 
3545 void
3546 umtx_thread_fini(struct thread *td)
3547 {
3548 	umtxq_free(td->td_umtxq);
3549 }
3550 
3551 /*
3552  * Called when a new thread is created, e.g. by fork().
3553  */
3554 void
3555 umtx_thread_alloc(struct thread *td)
3556 {
3557 	struct umtx_q *uq;
3558 
3559 	uq = td->td_umtxq;
3560 	uq->uq_inherited_pri = PRI_MAX;
3561 
3562 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3563 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3564 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3565 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3566 }
3567 
3568 /*
3569  * exec() hook.
3570  */
3571 static void
3572 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3573 	struct image_params *imgp __unused)
3574 {
3575 	umtx_thread_cleanup(curthread);
3576 }
3577 
3578 /*
3579  * thread_exit() hook.
3580  */
3581 void
3582 umtx_thread_exit(struct thread *td)
3583 {
3584 	umtx_thread_cleanup(td);
3585 }
3586 
3587 /*
3588  * Clean up the thread's umtx data.
3589  */
3590 static void
3591 umtx_thread_cleanup(struct thread *td)
3592 {
3593 	struct umtx_q *uq;
3594 	struct umtx_pi *pi;
3595 
3596 	if ((uq = td->td_umtxq) == NULL)
3597 		return;
3598 
3599 	mtx_lock_spin(&umtx_lock);
3600 	uq->uq_inherited_pri = PRI_MAX;
3601 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3602 		pi->pi_owner = NULL;
3603 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3604 	}
3605 	thread_lock(td);
3606 	td->td_flags &= ~TDF_UBORROWING;
3607 	thread_unlock(td);
3608 	mtx_unlock_spin(&umtx_lock);
3609 }
3610