xref: /freebsd/sys/kern/kern_umtx.c (revision 830940567b49bb0c08dfaed40418999e76616909)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/mutex.h>
38 #include <sys/priv.h>
39 #include <sys/proc.h>
40 #include <sys/sched.h>
41 #include <sys/smp.h>
42 #include <sys/sysctl.h>
43 #include <sys/sysent.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/eventhandler.h>
47 #include <sys/umtx.h>
48 
49 #include <vm/vm.h>
50 #include <vm/vm_param.h>
51 #include <vm/pmap.h>
52 #include <vm/vm_map.h>
53 #include <vm/vm_object.h>
54 
55 #include <machine/cpu.h>
56 
57 #ifdef COMPAT_IA32
58 #include <compat/freebsd32/freebsd32_proto.h>
59 #endif
60 
61 #define TYPE_SIMPLE_WAIT	0
62 #define TYPE_CV			1
63 #define TYPE_SIMPLE_LOCK	2
64 #define TYPE_NORMAL_UMUTEX	3
65 #define TYPE_PI_UMUTEX		4
66 #define TYPE_PP_UMUTEX		5
67 #define TYPE_RWLOCK		6
68 
69 #define _UMUTEX_TRY		1
70 #define _UMUTEX_WAIT		2
71 
72 /* Key to represent a unique userland synchronization object */
73 struct umtx_key {
74 	int	hash;
75 	int	type;
76 	int	shared;
77 	union {
78 		struct {
79 			vm_object_t	object;
80 			uintptr_t	offset;
81 		} shared;
82 		struct {
83 			struct vmspace	*vs;
84 			uintptr_t	addr;
85 		} private;
86 		struct {
87 			void		*a;
88 			uintptr_t	b;
89 		} both;
90 	} info;
91 };
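/*
 * Note: info.both overlays the leading pointer/uintptr_t pair of both
 * union arms, so umtxq_hash() and umtx_key_match() can hash and compare
 * keys without caring whether a key is shared or private.
 */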
92 
93 /* Priority inheritance mutex info. */
94 struct umtx_pi {
95 	/* Owner thread */
96 	struct thread		*pi_owner;
97 
98 	/* Reference count */
99 	int			pi_refcount;
100 
101 	/* List entry on the owning thread's list of held PI mutexes */
102 	TAILQ_ENTRY(umtx_pi)	pi_link;
103 
104 	/* List entry in hash */
105 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
106 
107 	/* List for waiters */
108 	TAILQ_HEAD(,umtx_q)	pi_blocked;
109 
110 	/* Identify a userland lock object */
111 	struct umtx_key		pi_key;
112 };
113 
114 /* A userland synchronization object user. */
115 struct umtx_q {
116 	/* Linked list for the hash. */
117 	TAILQ_ENTRY(umtx_q)	uq_link;
118 
119 	/* Umtx key. */
120 	struct umtx_key		uq_key;
121 
122 	/* Umtx flags. */
123 	int			uq_flags;
124 #define UQF_UMTXQ	0x0001
125 
126 	/* The waiting thread. */
127 	struct thread		*uq_thread;
128 
129 	/*
130 	 * The PI mutex this thread is blocked on.  Reads may be done
131 	 * while holding either the chain lock or umtx_lock; writes
132 	 * require both locks to be held.
133 	 */
134 	struct umtx_pi		*uq_pi_blocked;
135 
136 	/* On blocked list */
137 	TAILQ_ENTRY(umtx_q)	uq_lockq;
138 
139 	/* PI mutexes owned by us that other threads are contending for */
140 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
141 
142 	/* Inherited priority from PP mutex */
143 	u_char			uq_inherited_pri;
144 };
145 
146 TAILQ_HEAD(umtxq_head, umtx_q);
147 
148 /* Userland lock object's wait-queue chain */
149 struct umtxq_chain {
150 	/* Lock for this chain. */
151 	struct mtx		uc_lock;
152 
153 	/* List of sleep queues. */
154 	struct umtxq_head	uc_queue[2];
155 #define UMTX_SHARED_QUEUE	0
156 #define UMTX_EXCLUSIVE_QUEUE	1
157 
158 	/* Busy flag */
159 	char			uc_busy;
160 
161 	/* Chain lock waiters */
162 	int			uc_waiters;
163 
164 	/* All PI in the list */
165 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
166 };
167 
168 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
169 #define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
170 
171 /*
172  * Don't propagate time-sharing priority.  There is a security reason:
173  * a user can simply create a PI mutex, let thread A lock it, and let
174  * another thread B block on it.  Because B is sleeping, its priority
175  * will be boosted; this boosts A's priority via priority propagation
176  * as well, and A's priority will then never be lowered even if it is
177  * using 100% CPU, which is unfair to other processes.
178  */
179 
180 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
181 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
182 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
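/*
 * For example, any time-sharing thread evaluates to PRI_MAX_TIMESHARE
 * here, so lending its priority to a lock owner can never lift the
 * owner above the time-share range.
 */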
183 
184 #define	GOLDEN_RATIO_PRIME	2654404609U
185 #define	UMTX_CHAINS		128
186 #define	UMTX_SHIFTS		(__WORD_BIT - 7)
187 
188 #define THREAD_SHARE		0
189 #define PROCESS_SHARE		1
190 #define AUTO_SHARE		2
191 
192 #define	GET_SHARE(flags)	\
193     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
194 
195 #define BUSY_SPINS		200
196 
197 static uma_zone_t		umtx_pi_zone;
198 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
199 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
200 static int			umtx_pi_allocated;
201 
202 SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
203 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
204     &umtx_pi_allocated, 0, "Allocated umtx_pi");
205 
206 static void umtxq_sysinit(void *);
207 static void umtxq_hash(struct umtx_key *key);
208 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
209 static void umtxq_lock(struct umtx_key *key);
210 static void umtxq_unlock(struct umtx_key *key);
211 static void umtxq_busy(struct umtx_key *key);
212 static void umtxq_unbusy(struct umtx_key *key);
213 static void umtxq_insert_queue(struct umtx_q *uq, int q);
214 static void umtxq_remove_queue(struct umtx_q *uq, int q);
215 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
216 static int umtxq_count(struct umtx_key *key);
217 static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
218 static int umtx_key_get(void *addr, int type, int share,
219 	struct umtx_key *key);
220 static void umtx_key_release(struct umtx_key *key);
221 static struct umtx_pi *umtx_pi_alloc(int);
222 static void umtx_pi_free(struct umtx_pi *pi);
223 static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
224 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
225 static void umtx_thread_cleanup(struct thread *td);
226 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
227 	struct image_params *imgp __unused);
228 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
229 
230 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
231 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
232 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
233 
234 static struct mtx umtx_lock;
235 
236 static void
237 umtxq_sysinit(void *arg __unused)
238 {
239 	int i, j;
240 
241 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
242 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
243 	for (i = 0; i < 2; ++i) {
244 		for (j = 0; j < UMTX_CHAINS; ++j) {
245 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
246 				 MTX_DEF | MTX_DUPOK);
247 			TAILQ_INIT(&umtxq_chains[i][j].uc_queue[0]);
248 			TAILQ_INIT(&umtxq_chains[i][j].uc_queue[1]);
249 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
250 			umtxq_chains[i][j].uc_busy = 0;
251 			umtxq_chains[i][j].uc_waiters = 0;
252 		}
253 	}
254 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
255 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
256 	    EVENTHANDLER_PRI_ANY);
257 }
258 
259 struct umtx_q *
260 umtxq_alloc(void)
261 {
262 	struct umtx_q *uq;
263 
264 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
265 	TAILQ_INIT(&uq->uq_pi_contested);
266 	uq->uq_inherited_pri = PRI_MAX;
267 	return (uq);
268 }
269 
270 void
271 umtxq_free(struct umtx_q *uq)
272 {
273 	free(uq, M_UMTX);
274 }
275 
276 static inline void
277 umtxq_hash(struct umtx_key *key)
278 {
279 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
280 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
281 }
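/*
 * The hash above is multiplicative (Fibonacci) hashing: the golden-ratio
 * constant scrambles the key bits, and the shift keeps the top seven
 * bits of the product, which are the best mixed, before folding into
 * the UMTX_CHAINS (128) buckets.
 */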
282 
283 static inline int
284 umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
285 {
286 	return (k1->type == k2->type &&
287 		k1->info.both.a == k2->info.both.a &&
288 	        k1->info.both.b == k2->info.both.b);
289 }
290 
291 static inline struct umtxq_chain *
292 umtxq_getchain(struct umtx_key *key)
293 {
294 	if (key->type <= TYPE_CV)
295 		return (&umtxq_chains[1][key->hash]);
296 	return (&umtxq_chains[0][key->hash]);
297 }
298 
299 /*
300  * Lock a chain.
301  */
302 static inline void
303 umtxq_lock(struct umtx_key *key)
304 {
305 	struct umtxq_chain *uc;
306 
307 	uc = umtxq_getchain(key);
308 	mtx_lock(&uc->uc_lock);
309 }
310 
311 /*
312  * Unlock a chain.
313  */
314 static inline void
315 umtxq_unlock(struct umtx_key *key)
316 {
317 	struct umtxq_chain *uc;
318 
319 	uc = umtxq_getchain(key);
320 	mtx_unlock(&uc->uc_lock);
321 }
322 
323 /*
324  * Set the chain to the busy state when the following operation
325  * may block (a kernel mutex cannot be held across it).
326  */
327 static inline void
328 umtxq_busy(struct umtx_key *key)
329 {
330 	struct umtxq_chain *uc;
331 
332 	uc = umtxq_getchain(key);
333 	mtx_assert(&uc->uc_lock, MA_OWNED);
334 	if (uc->uc_busy) {
335 #ifdef SMP
336 		if (smp_cpus > 1) {
337 			int count = BUSY_SPINS;
338 			if (count > 0) {
339 				umtxq_unlock(key);
340 				while (uc->uc_busy && --count > 0)
341 					cpu_spinwait();
342 				umtxq_lock(key);
343 			}
344 		}
345 #endif
346 		while (uc->uc_busy) {
347 			uc->uc_waiters++;
348 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
349 			uc->uc_waiters--;
350 		}
351 	}
352 	uc->uc_busy = 1;
353 }
354 
355 /*
356  * Unbusy a chain.
357  */
358 static inline void
359 umtxq_unbusy(struct umtx_key *key)
360 {
361 	struct umtxq_chain *uc;
362 
363 	uc = umtxq_getchain(key);
364 	mtx_assert(&uc->uc_lock, MA_OWNED);
365 	KASSERT(uc->uc_busy != 0, ("not busy"));
366 	uc->uc_busy = 0;
367 	if (uc->uc_waiters)
368 		wakeup_one(uc);
369 }
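/*
 * A sketch of the usual busy/unbusy pattern, as used by the lock and
 * unlock paths below: the chain mutex cannot be held across a page
 * fault, so the busy flag carries the exclusion instead.
 *
 *	umtxq_lock(key);
 *	umtxq_busy(key);	(serialize with other lockers and wakers)
 *	umtxq_unlock(key);
 *	... access userland memory, which may fault and sleep ...
 *	umtxq_lock(key);
 *	umtxq_unbusy(key);
 *	umtxq_unlock(key);
 */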
370 
371 static inline void
372 umtxq_insert_queue(struct umtx_q *uq, int q)
373 {
374 	struct umtxq_chain *uc;
375 
376 	uc = umtxq_getchain(&uq->uq_key);
377 	UMTXQ_LOCKED_ASSERT(uc);
378 	TAILQ_INSERT_TAIL(&uc->uc_queue[q], uq, uq_link);
379 	uq->uq_flags |= UQF_UMTXQ;
380 }
381 
382 static inline void
383 umtxq_remove_queue(struct umtx_q *uq, int q)
384 {
385 	struct umtxq_chain *uc;
386 
387 	uc = umtxq_getchain(&uq->uq_key);
388 	UMTXQ_LOCKED_ASSERT(uc);
389 	if (uq->uq_flags & UQF_UMTXQ) {
390 		TAILQ_REMOVE(&uc->uc_queue[q], uq, uq_link);
391 		uq->uq_flags &= ~UQF_UMTXQ;
392 	}
393 }
394 
395 /*
396  * Count the waiters matching the key; stop at two, enough to tell one from many.
397  */
398 static int
399 umtxq_count(struct umtx_key *key)
400 {
401 	struct umtxq_chain *uc;
402 	struct umtx_q *uq;
403 	int count = 0;
404 
405 	uc = umtxq_getchain(key);
406 	UMTXQ_LOCKED_ASSERT(uc);
407 	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
408 		if (umtx_key_match(&uq->uq_key, key)) {
409 			if (++count > 1)
410 				break;
411 		}
412 	}
413 	return (count);
414 }
415 
416 /*
417  * Count the PI waiters matching the key (stopping at two) and
418  * return the first waiter.
419  */
420 static int
421 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
422 {
423 	struct umtxq_chain *uc;
424 	struct umtx_q *uq;
425 	int count = 0;
426 
427 	*first = NULL;
428 	uc = umtxq_getchain(key);
429 	UMTXQ_LOCKED_ASSERT(uc);
430 	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
431 		if (umtx_key_match(&uq->uq_key, key)) {
432 			if (++count > 1)
433 				break;
434 			*first = uq;
435 		}
436 	}
437 	return (count);
438 }
439 
440 /*
441  * Wake up threads waiting on a userland object.
442  */
443 
444 static int
445 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
446 {
447 	struct umtxq_chain *uc;
448 	struct umtx_q *uq, *next;
449 	int ret;
450 
451 	ret = 0;
452 	uc = umtxq_getchain(key);
453 	UMTXQ_LOCKED_ASSERT(uc);
454 	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue[q], uq_link, next) {
455 		if (umtx_key_match(&uq->uq_key, key)) {
456 			umtxq_remove_queue(uq, q);
457 			wakeup(uq);
458 			if (++ret >= n_wake)
459 				break;
460 		}
461 	}
462 	return (ret);
463 }
464 
465 
466 /*
467  * Wake up the specified thread.
468  */
469 static inline void
470 umtxq_signal_thread(struct umtx_q *uq)
471 {
472 	struct umtxq_chain *uc;
473 
474 	uc = umtxq_getchain(&uq->uq_key);
475 	UMTXQ_LOCKED_ASSERT(uc);
476 	umtxq_remove(uq);
477 	wakeup(uq);
478 }
479 
480 /*
481  * Put the thread into a sleep state; before sleeping, check whether
482  * the thread was already removed from the umtx queue.
483  */
484 static inline int
485 umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
486 {
487 	struct umtxq_chain *uc;
488 	int error;
489 
490 	uc = umtxq_getchain(&uq->uq_key);
491 	UMTXQ_LOCKED_ASSERT(uc);
492 	if (!(uq->uq_flags & UQF_UMTXQ))
493 		return (0);
494 	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
495 	if (error == EWOULDBLOCK)
496 		error = ETIMEDOUT;
497 	return (error);
498 }
499 
500 /*
501  * Convert a userspace address into a unique logical address.
502  */
503 static int
504 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
505 {
506 	struct thread *td = curthread;
507 	vm_map_t map;
508 	vm_map_entry_t entry;
509 	vm_pindex_t pindex;
510 	vm_prot_t prot;
511 	boolean_t wired;
512 
513 	key->type = type;
514 	if (share == THREAD_SHARE) {
515 		key->shared = 0;
516 		key->info.private.vs = td->td_proc->p_vmspace;
517 		key->info.private.addr = (uintptr_t)addr;
518 	} else {
519 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
520 		map = &td->td_proc->p_vmspace->vm_map;
521 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
522 		    &entry, &key->info.shared.object, &pindex, &prot,
523 		    &wired) != KERN_SUCCESS) {
524 			return (EFAULT);
525 		}
526 
527 		if ((share == PROCESS_SHARE) ||
528 		    (share == AUTO_SHARE &&
529 		     VM_INHERIT_SHARE == entry->inheritance)) {
530 			key->shared = 1;
531 			key->info.shared.offset = entry->offset + entry->start -
532 				(vm_offset_t)addr;
533 			vm_object_reference(key->info.shared.object);
534 		} else {
535 			key->shared = 0;
536 			key->info.private.vs = td->td_proc->p_vmspace;
537 			key->info.private.addr = (uintptr_t)addr;
538 		}
539 		vm_map_lookup_done(map, entry);
540 	}
541 
542 	umtxq_hash(key);
543 	return (0);
544 }
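/*
 * The effect is that a process-shared object is identified by
 * (vm_object, offset within the object), so two processes mapping the
 * same object at different addresses derive the same key, while a
 * private object is identified by (vmspace, virtual address).
 */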
545 
546 /*
547  * Release key.
548  */
549 static inline void
550 umtx_key_release(struct umtx_key *key)
551 {
552 	if (key->shared)
553 		vm_object_deallocate(key->info.shared.object);
554 }
555 
556 /*
557  * Lock a umtx object.
558  */
559 static int
560 _do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
561 {
562 	struct umtx_q *uq;
563 	u_long owner;
564 	u_long old;
565 	int error = 0;
566 
567 	uq = td->td_umtxq;
568 
569 	/*
570 	 * Care must be exercised when dealing with the umtx structure. It
571 	 * can fault on any access.
572 	 */
573 	for (;;) {
574 		/*
575 		 * Try the uncontested case.  This should be done in userland.
576 		 */
577 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
578 
579 		/* The acquire succeeded. */
580 		if (owner == UMTX_UNOWNED)
581 			return (0);
582 
583 		/* The address was invalid. */
584 		if (owner == -1)
585 			return (EFAULT);
586 
587 		/* If no one owns it but it is contested try to acquire it. */
588 		if (owner == UMTX_CONTESTED) {
589 			owner = casuword(&umtx->u_owner,
590 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
591 
592 			if (owner == UMTX_CONTESTED)
593 				return (0);
594 
595 			/* The address was invalid. */
596 			if (owner == -1)
597 				return (EFAULT);
598 
599 			/* If this failed the lock has changed, restart. */
600 			continue;
601 		}
602 
603 		/*
604 		 * If we caught a signal during an earlier sleep, we have
605 		 * already retried once; exit immediately now.
606 		 */
607 		if (error != 0)
608 			return (error);
609 
610 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
611 			AUTO_SHARE, &uq->uq_key)) != 0)
612 			return (error);
613 
614 		umtxq_lock(&uq->uq_key);
615 		umtxq_busy(&uq->uq_key);
616 		umtxq_insert(uq);
617 		umtxq_unbusy(&uq->uq_key);
618 		umtxq_unlock(&uq->uq_key);
619 
620 		/*
621 		 * Set the contested bit so that a release in user space
622 		 * knows to use the system call for unlock.  If this fails
623 		 * either someone else has acquired the lock or it has been
624 		 * released.
625 		 */
626 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
627 
628 		/* The address was invalid. */
629 		if (old == -1) {
630 			umtxq_lock(&uq->uq_key);
631 			umtxq_remove(uq);
632 			umtxq_unlock(&uq->uq_key);
633 			umtx_key_release(&uq->uq_key);
634 			return (EFAULT);
635 		}
636 
637 		/*
638 		 * If we set the contested bit, sleep.  Otherwise the lock has
639 		 * changed and we need to retry, or we lost a race to the
640 		 * thread unlocking the umtx.
641 		 */
642 		umtxq_lock(&uq->uq_key);
643 		if (old == owner)
644 			error = umtxq_sleep(uq, "umtx", timo);
645 		umtxq_remove(uq);
646 		umtxq_unlock(&uq->uq_key);
647 		umtx_key_release(&uq->uq_key);
648 	}
649 
650 	return (0);
651 }
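/*
 * For reference, the u_owner word protocol driven by the loop above:
 *
 *	UMTX_UNOWNED		unlocked, no waiters
 *	<tid>			locked by thread <tid>, uncontested
 *	UMTX_CONTESTED		unlocked, but waiters may exist
 *	<tid> | UMTX_CONTESTED	locked; unlock must enter the kernel
 *
 * Every transition is made with casuword(), so a concurrent change by
 * userland simply restarts the loop.
 */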
652 
653 /*
654  * Lock a umtx object.
655  */
656 static int
657 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
658 	struct timespec *timeout)
659 {
660 	struct timespec ts, ts2, ts3;
661 	struct timeval tv;
662 	int error;
663 
664 	if (timeout == NULL) {
665 		error = _do_lock_umtx(td, umtx, id, 0);
666 		/* Mutex locking is restarted if it is interrupted. */
667 		if (error == EINTR)
668 			error = ERESTART;
669 	} else {
670 		getnanouptime(&ts);
671 		timespecadd(&ts, timeout);
672 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
673 		for (;;) {
674 			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
675 			if (error != ETIMEDOUT)
676 				break;
677 			getnanouptime(&ts2);
678 			if (timespeccmp(&ts2, &ts, >=)) {
679 				error = ETIMEDOUT;
680 				break;
681 			}
682 			ts3 = ts;
683 			timespecsub(&ts3, &ts2);
684 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
685 		}
686 		/* Timed-locking is not restarted. */
687 		if (error == ERESTART)
688 			error = EINTR;
689 	}
690 	return (error);
691 }
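/*
 * The timeout handling above converts the relative timeout into an
 * absolute uptime deadline once, then recomputes the remaining time
 * (deadline minus now) before each retry, so repeated early wakeups
 * cannot stretch the total wait beyond the caller's timeout.
 */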
692 
693 /*
694  * Unlock a umtx object.
695  */
696 static int
697 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
698 {
699 	struct umtx_key key;
700 	u_long owner;
701 	u_long old;
702 	int error;
703 	int count;
704 
705 	/*
706 	 * Make sure we own this mtx.
707 	 */
708 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
709 	if (owner == -1)
710 		return (EFAULT);
711 
712 	if ((owner & ~UMTX_CONTESTED) != id)
713 		return (EPERM);
714 
715 	/* This should be done in userland */
716 	if ((owner & UMTX_CONTESTED) == 0) {
717 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
718 		if (old == -1)
719 			return (EFAULT);
720 		if (old == owner)
721 			return (0);
722 		owner = old;
723 	}
724 
725 	/* We should only ever be in here for contested locks */
726 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
727 		&key)) != 0)
728 		return (error);
729 
730 	umtxq_lock(&key);
731 	umtxq_busy(&key);
732 	count = umtxq_count(&key);
733 	umtxq_unlock(&key);
734 
735 	/*
736 	 * When unlocking the umtx, it must be marked as unowned if
737 	 * there is at most one thread waiting for it.
738 	 * Otherwise, it must be marked as contested.
739 	 */
740 	old = casuword(&umtx->u_owner, owner,
741 		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
742 	umtxq_lock(&key);
743 	umtxq_signal(&key, 1);
744 	umtxq_unbusy(&key);
745 	umtxq_unlock(&key);
746 	umtx_key_release(&key);
747 	if (old == -1)
748 		return (EFAULT);
749 	if (old != owner)
750 		return (EINVAL);
751 	return (0);
752 }
753 
754 #ifdef COMPAT_IA32
755 
756 /*
757  * Lock a umtx object.
758  */
759 static int
760 _do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
761 {
762 	struct umtx_q *uq;
763 	uint32_t owner;
764 	uint32_t old;
765 	int error = 0;
766 
767 	uq = td->td_umtxq;
768 
769 	/*
770 	 * Care must be exercised when dealing with the umtx structure. It
771 	 * can fault on any access.
772 	 */
773 	for (;;) {
774 		/*
775 		 * Try the uncontested case.  This should be done in userland.
776 		 */
777 		owner = casuword32(m, UMUTEX_UNOWNED, id);
778 
779 		/* The acquire succeeded. */
780 		if (owner == UMUTEX_UNOWNED)
781 			return (0);
782 
783 		/* The address was invalid. */
784 		if (owner == -1)
785 			return (EFAULT);
786 
787 		/* If no one owns it but it is contested try to acquire it. */
788 		if (owner == UMUTEX_CONTESTED) {
789 			owner = casuword32(m,
790 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
791 			if (owner == UMUTEX_CONTESTED)
792 				return (0);
793 
794 			/* The address was invalid. */
795 			if (owner == -1)
796 				return (EFAULT);
797 
798 			/* If this failed the lock has changed, restart. */
799 			continue;
800 		}
801 
802 		/*
803 		 * If we caught a signal during an earlier sleep, we have
804 		 * already retried once; exit immediately now.
805 		 */
806 		if (error != 0)
807 			return (error);
808 
809 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
810 			AUTO_SHARE, &uq->uq_key)) != 0)
811 			return (error);
812 
813 		umtxq_lock(&uq->uq_key);
814 		umtxq_busy(&uq->uq_key);
815 		umtxq_insert(uq);
816 		umtxq_unbusy(&uq->uq_key);
817 		umtxq_unlock(&uq->uq_key);
818 
819 		/*
820 		 * Set the contested bit so that a release in user space
821 		 * knows to use the system call for unlock.  If this fails
822 		 * either someone else has acquired the lock or it has been
823 		 * released.
824 		 */
825 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
826 
827 		/* The address was invalid. */
828 		if (old == -1) {
829 			umtxq_lock(&uq->uq_key);
830 			umtxq_remove(uq);
831 			umtxq_unlock(&uq->uq_key);
832 			umtx_key_release(&uq->uq_key);
833 			return (EFAULT);
834 		}
835 
836 		/*
837 		 * If we set the contested bit, sleep.  Otherwise the lock has
838 		 * changed and we need to retry, or we lost a race to the
839 		 * thread unlocking the umtx.
840 		 */
841 		umtxq_lock(&uq->uq_key);
842 		if (old == owner)
843 			error = umtxq_sleep(uq, "umtx", timo);
844 		umtxq_remove(uq);
845 		umtxq_unlock(&uq->uq_key);
846 		umtx_key_release(&uq->uq_key);
847 	}
848 
849 	return (0);
850 }
851 
852 /*
853  * Lock a umtx object.
854  */
855 static int
856 do_lock_umtx32(struct thread *td, void *m, uint32_t id,
857 	struct timespec *timeout)
858 {
859 	struct timespec ts, ts2, ts3;
860 	struct timeval tv;
861 	int error;
862 
863 	if (timeout == NULL) {
864 		error = _do_lock_umtx32(td, m, id, 0);
865 		/* Mutex locking is restarted if it is interrupted. */
866 		if (error == EINTR)
867 			error = ERESTART;
868 	} else {
869 		getnanouptime(&ts);
870 		timespecadd(&ts, timeout);
871 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
872 		for (;;) {
873 			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
874 			if (error != ETIMEDOUT)
875 				break;
876 			getnanouptime(&ts2);
877 			if (timespeccmp(&ts2, &ts, >=)) {
878 				error = ETIMEDOUT;
879 				break;
880 			}
881 			ts3 = ts;
882 			timespecsub(&ts3, &ts2);
883 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
884 		}
885 		/* Timed-locking is not restarted. */
886 		if (error == ERESTART)
887 			error = EINTR;
888 	}
889 	return (error);
890 }
891 
892 /*
893  * Unlock a umtx object.
894  */
895 static int
896 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
897 {
898 	struct umtx_key key;
899 	uint32_t owner;
900 	uint32_t old;
901 	int error;
902 	int count;
903 
904 	/*
905 	 * Make sure we own this mtx.
906 	 */
907 	owner = fuword32(m);
908 	if (owner == -1)
909 		return (EFAULT);
910 
911 	if ((owner & ~UMUTEX_CONTESTED) != id)
912 		return (EPERM);
913 
914 	/* This should be done in userland */
915 	if ((owner & UMUTEX_CONTESTED) == 0) {
916 		old = casuword32(m, owner, UMUTEX_UNOWNED);
917 		if (old == -1)
918 			return (EFAULT);
919 		if (old == owner)
920 			return (0);
921 		owner = old;
922 	}
923 
924 	/* We should only ever be in here for contested locks */
925 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
926 		&key)) != 0)
927 		return (error);
928 
929 	umtxq_lock(&key);
930 	umtxq_busy(&key);
931 	count = umtxq_count(&key);
932 	umtxq_unlock(&key);
933 
934 	/*
935 	 * When unlocking the umtx, it must be marked as unowned if
936 	 * there is at most one thread waiting for it.
937 	 * Otherwise, it must be marked as contested.
938 	 */
939 	old = casuword32(m, owner,
940 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
941 	umtxq_lock(&key);
942 	umtxq_signal(&key, 1);
943 	umtxq_unbusy(&key);
944 	umtxq_unlock(&key);
945 	umtx_key_release(&key);
946 	if (old == -1)
947 		return (EFAULT);
948 	if (old != owner)
949 		return (EINVAL);
950 	return (0);
951 }
952 #endif
953 
954 /*
955  * Fetch and compare the value; sleep on the address if it has not changed.
956  */
957 static int
958 do_wait(struct thread *td, void *addr, u_long id,
959 	struct timespec *timeout, int compat32, int is_private)
960 {
961 	struct umtx_q *uq;
962 	struct timespec ts, ts2, ts3;
963 	struct timeval tv;
964 	u_long tmp;
965 	int error = 0;
966 
967 	uq = td->td_umtxq;
968 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
969 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
970 		return (error);
971 
972 	umtxq_lock(&uq->uq_key);
973 	umtxq_insert(uq);
974 	umtxq_unlock(&uq->uq_key);
975 	if (compat32 == 0)
976 		tmp = fuword(addr);
977 	else
978 		tmp = (unsigned int)fuword32(addr);
979 	if (tmp != id) {
980 		umtxq_lock(&uq->uq_key);
981 		umtxq_remove(uq);
982 		umtxq_unlock(&uq->uq_key);
983 	} else if (timeout == NULL) {
984 		umtxq_lock(&uq->uq_key);
985 		error = umtxq_sleep(uq, "uwait", 0);
986 		umtxq_remove(uq);
987 		umtxq_unlock(&uq->uq_key);
988 	} else {
989 		getnanouptime(&ts);
990 		timespecadd(&ts, timeout);
991 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
992 		umtxq_lock(&uq->uq_key);
993 		for (;;) {
994 			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
995 			if (!(uq->uq_flags & UQF_UMTXQ))
996 				break;
997 			if (error != ETIMEDOUT)
998 				break;
999 			umtxq_unlock(&uq->uq_key);
1000 			getnanouptime(&ts2);
1001 			if (timespeccmp(&ts2, &ts, >=)) {
1002 				error = ETIMEDOUT;
1003 				umtxq_lock(&uq->uq_key);
1004 				break;
1005 			}
1006 			ts3 = ts;
1007 			timespecsub(&ts3, &ts2);
1008 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
1009 			umtxq_lock(&uq->uq_key);
1010 		}
1011 		umtxq_remove(uq);
1012 		umtxq_unlock(&uq->uq_key);
1013 	}
1014 	umtx_key_release(&uq->uq_key);
1015 	if (error == ERESTART)
1016 		error = EINTR;
1017 	return (error);
1018 }
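/*
 * Note the ordering above: the thread is put on the wait queue before
 * the userland word is read, so a kern_umtx_wake() issued after the
 * word changes will find and dequeue this thread, and umtxq_sleep()
 * will then return without blocking.  This closes the lost-wakeup
 * window between "check value" and "sleep".
 */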
1019 
1020 /*
1021  * Wake up threads sleeping on the specified address.
1022  */
1023 int
1024 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1025 {
1026 	struct umtx_key key;
1027 	int ret;
1028 
1029 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1030 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1031 		return (ret);
1032 	umtxq_lock(&key);
1033 	ret = umtxq_signal(&key, n_wake);
1034 	umtxq_unlock(&key);
1035 	umtx_key_release(&key);
1036 	return (0);
1037 }
1038 
1039 /*
1040  * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1041  */
1042 static int
1043 _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1044 	int mode)
1045 {
1046 	struct umtx_q *uq;
1047 	uint32_t owner, old, id;
1048 	int error = 0;
1049 
1050 	id = td->td_tid;
1051 	uq = td->td_umtxq;
1052 
1053 	/*
1054 	 * Care must be exercised when dealing with the umtx structure. It
1055 	 * can fault on any access.
1056 	 */
1057 	for (;;) {
1058 		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
1059 		if (mode == _UMUTEX_WAIT) {
1060 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1061 				return (0);
1062 		} else {
1063 			/*
1064 			 * Try the uncontested case.  This should be done in userland.
1065 			 */
1066 			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1067 
1068 			/* The acquire succeeded. */
1069 			if (owner == UMUTEX_UNOWNED)
1070 				return (0);
1071 
1072 			/* The address was invalid. */
1073 			if (owner == -1)
1074 				return (EFAULT);
1075 
1076 			/* If no one owns it but it is contested try to acquire it. */
1077 			if (owner == UMUTEX_CONTESTED) {
1078 				owner = casuword32(&m->m_owner,
1079 				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1080 
1081 				if (owner == UMUTEX_CONTESTED)
1082 					return (0);
1083 
1084 				/* The address was invalid. */
1085 				if (owner == -1)
1086 					return (EFAULT);
1087 
1088 				/* If this failed the lock has changed, restart. */
1089 				continue;
1090 			}
1091 		}
1092 
1093 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1094 		    (owner & ~UMUTEX_CONTESTED) == id)
1095 			return (EDEADLK);
1096 
1097 		if (mode == _UMUTEX_TRY)
1098 			return (EBUSY);
1099 
1100 		/*
1101 		 * If we caught a signal during an earlier sleep, we have
1102 		 * already retried once; exit immediately now.
1103 		 */
1104 		if (error != 0)
1105 			return (error);
1106 
1107 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1108 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1109 			return (error);
1110 
1111 		umtxq_lock(&uq->uq_key);
1112 		umtxq_busy(&uq->uq_key);
1113 		umtxq_insert(uq);
1114 		umtxq_unlock(&uq->uq_key);
1115 
1116 		/*
1117 		 * Set the contested bit so that a release in user space
1118 		 * knows to use the system call for unlock.  If this fails
1119 		 * either someone else has acquired the lock or it has been
1120 		 * released.
1121 		 */
1122 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1123 
1124 		/* The address was invalid. */
1125 		if (old == -1) {
1126 			umtxq_lock(&uq->uq_key);
1127 			umtxq_remove(uq);
1128 			umtxq_unbusy(&uq->uq_key);
1129 			umtxq_unlock(&uq->uq_key);
1130 			umtx_key_release(&uq->uq_key);
1131 			return (EFAULT);
1132 		}
1133 
1134 		/*
1135 		 * If we set the contested bit, sleep.  Otherwise the lock has
1136 		 * changed and we need to retry, or we lost a race to the
1137 		 * thread unlocking the umtx.
1138 		 */
1139 		umtxq_lock(&uq->uq_key);
1140 		umtxq_unbusy(&uq->uq_key);
1141 		if (old == owner)
1142 			error = umtxq_sleep(uq, "umtxn", timo);
1143 		umtxq_remove(uq);
1144 		umtxq_unlock(&uq->uq_key);
1145 		umtx_key_release(&uq->uq_key);
1146 	}
1147 
1148 	return (0);
1149 }
1150 
1154 /*
1155  * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1156  */
1157 static int
1158 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1159 {
1160 	struct umtx_key key;
1161 	uint32_t owner, old, id;
1162 	int error;
1163 	int count;
1164 
1165 	id = td->td_tid;
1166 	/*
1167 	 * Make sure we own this mtx.
1168 	 */
1169 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1170 	if (owner == -1)
1171 		return (EFAULT);
1172 
1173 	if ((owner & ~UMUTEX_CONTESTED) != id)
1174 		return (EPERM);
1175 
1176 	if ((owner & UMUTEX_CONTESTED) == 0) {
1177 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1178 		if (old == -1)
1179 			return (EFAULT);
1180 		if (old == owner)
1181 			return (0);
1182 		owner = old;
1183 	}
1184 
1185 	/* We should only ever be in here for contested locks */
1186 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1187 	    &key)) != 0)
1188 		return (error);
1189 
1190 	umtxq_lock(&key);
1191 	umtxq_busy(&key);
1192 	count = umtxq_count(&key);
1193 	umtxq_unlock(&key);
1194 
1195 	/*
1196 	 * When unlocking the umtx, it must be marked as unowned if
1197 	 * there is at most one thread waiting for it.
1198 	 * Otherwise, it must be marked as contested.
1199 	 */
1200 	old = casuword32(&m->m_owner, owner,
1201 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1202 	umtxq_lock(&key);
1203 	umtxq_signal(&key, 1);
1204 	umtxq_unbusy(&key);
1205 	umtxq_unlock(&key);
1206 	umtx_key_release(&key);
1207 	if (old == -1)
1208 		return (EFAULT);
1209 	if (old != owner)
1210 		return (EINVAL);
1211 	return (0);
1212 }
1213 
1214 /*
1215  * Check whether the mutex is available and wake up a waiter;
1216  * this is only for simple (PTHREAD_PRIO_NONE) mutexes.
1217  */
1218 static int
1219 do_wake_umutex(struct thread *td, struct umutex *m)
1220 {
1221 	struct umtx_key key;
1222 	uint32_t owner;
1223 	uint32_t flags;
1224 	int error;
1225 	int count;
1226 
1227 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1228 	if (owner == -1)
1229 		return (EFAULT);
1230 
1231 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1232 		return (0);
1233 
1234 	flags = fuword32(&m->m_flags);
1235 
1236 	/* We should only ever be in here for contested locks */
1237 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1238 	    &key)) != 0)
1239 		return (error);
1240 
1241 	umtxq_lock(&key);
1242 	umtxq_busy(&key);
1243 	count = umtxq_count(&key);
1244 	umtxq_unlock(&key);
1245 
1246 	if (count <= 1)
1247 		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1248 
1249 	umtxq_lock(&key);
1250 	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1251 		umtxq_signal(&key, 1);
1252 	umtxq_unbusy(&key);
1253 	umtxq_unlock(&key);
1254 	umtx_key_release(&key);
1255 	return (0);
1256 }
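/*
 * The function above implements a conditional wakeup: when at most one
 * waiter remains it first tries to flip the word from UMUTEX_CONTESTED
 * back to UMUTEX_UNOWNED, and a waiter is only signalled while the
 * owner bits are clear, so threads are not woken just to find the
 * mutex still held.
 */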
1257 
1258 static inline struct umtx_pi *
1259 umtx_pi_alloc(int flags)
1260 {
1261 	struct umtx_pi *pi;
1262 
1263 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1264 	TAILQ_INIT(&pi->pi_blocked);
1265 	atomic_add_int(&umtx_pi_allocated, 1);
1266 	return (pi);
1267 }
1268 
1269 static inline void
1270 umtx_pi_free(struct umtx_pi *pi)
1271 {
1272 	uma_zfree(umtx_pi_zone, pi);
1273 	atomic_add_int(&umtx_pi_allocated, -1);
1274 }
1275 
1276 /*
1277  * Adjust the thread's position on the PI mutex's blocked list after
1278  * its priority has been changed.
1279  */
1280 static int
1281 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1282 {
1283 	struct umtx_q *uq, *uq1, *uq2;
1284 	struct thread *td1;
1285 
1286 	mtx_assert(&umtx_lock, MA_OWNED);
1287 	if (pi == NULL)
1288 		return (0);
1289 
1290 	uq = td->td_umtxq;
1291 
1292 	/*
1293 	 * Check if the thread needs to be moved on the blocked chain.
1294 	 * It needs to be moved if its priority is either lower than that
1295 	 * of the previous thread or higher than that of the next thread.
1296 	 */
1297 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1298 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1299 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1300 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1301 		/*
1302 		 * Remove thread from blocked chain and determine where
1303 		 * it should be moved to.
1304 		 */
1305 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1306 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1307 			td1 = uq1->uq_thread;
1308 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1309 			if (UPRI(td1) > UPRI(td))
1310 				break;
1311 		}
1312 
1313 		if (uq1 == NULL)
1314 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1315 		else
1316 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1317 	}
1318 	return (1);
1319 }
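/*
 * pi_blocked is kept sorted by UPRI() with the best (numerically
 * lowest) priority first, so TAILQ_FIRST(&pi->pi_blocked) is always
 * the waiter whose priority should be lent to the owner.
 */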
1320 
1321 /*
1322  * Propagate priority when a thread is blocked on a POSIX
1323  * PI mutex.
1324  */
1325 static void
1326 umtx_propagate_priority(struct thread *td)
1327 {
1328 	struct umtx_q *uq;
1329 	struct umtx_pi *pi;
1330 	int pri;
1331 
1332 	mtx_assert(&umtx_lock, MA_OWNED);
1333 	pri = UPRI(td);
1334 	uq = td->td_umtxq;
1335 	pi = uq->uq_pi_blocked;
1336 	if (pi == NULL)
1337 		return;
1338 
1339 	for (;;) {
1340 		td = pi->pi_owner;
1341 		if (td == NULL)
1342 			return;
1343 
1344 		MPASS(td->td_proc != NULL);
1345 		MPASS(td->td_proc->p_magic == P_MAGIC);
1346 
1347 		if (UPRI(td) <= pri)
1348 			return;
1349 
1350 		thread_lock(td);
1351 		sched_lend_user_prio(td, pri);
1352 		thread_unlock(td);
1353 
1354 		/*
1355 		 * Pick up the lock that td is blocked on.
1356 		 */
1357 		uq = td->td_umtxq;
1358 		pi = uq->uq_pi_blocked;
1359 		/* Resort td on the list if needed. */
1360 		if (!umtx_pi_adjust_thread(pi, td))
1361 			break;
1362 	}
1363 }
1364 
1365 /*
1366  * Unpropagate priority for a PI mutex when a thread blocked on
1367  * it is interrupted by a signal or resumed by another thread.
1368  */
1369 static void
1370 umtx_unpropagate_priority(struct umtx_pi *pi)
1371 {
1372 	struct umtx_q *uq, *uq_owner;
1373 	struct umtx_pi *pi2;
1374 	int pri, oldpri;
1375 
1376 	mtx_assert(&umtx_lock, MA_OWNED);
1377 
1378 	while (pi != NULL && pi->pi_owner != NULL) {
1379 		pri = PRI_MAX;
1380 		uq_owner = pi->pi_owner->td_umtxq;
1381 
1382 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1383 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1384 			if (uq != NULL) {
1385 				if (pri > UPRI(uq->uq_thread))
1386 					pri = UPRI(uq->uq_thread);
1387 			}
1388 		}
1389 
1390 		if (pri > uq_owner->uq_inherited_pri)
1391 			pri = uq_owner->uq_inherited_pri;
1392 		thread_lock(pi->pi_owner);
1393 		oldpri = pi->pi_owner->td_user_pri;
1394 		sched_unlend_user_prio(pi->pi_owner, pri);
1395 		thread_unlock(pi->pi_owner);
1396 		if (uq_owner->uq_pi_blocked != NULL)
1397 			umtx_pi_adjust_locked(pi->pi_owner, oldpri);
1398 		pi = uq_owner->uq_pi_blocked;
1399 	}
1400 }
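/*
 * The loop above rebuilds the owner's lent priority from scratch: the
 * minimum UPRI() over the top waiter of every contested PI mutex the
 * owner still holds, further bounded by uq_inherited_pri (the ceiling
 * inherited from PP mutexes), and then walks up the chain if the
 * owner is itself blocked on another PI mutex.
 */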
1401 
1402 /*
1403  * Insert a PI mutex into the owner's list.
1404  */
1405 static void
1406 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1407 {
1408 	struct umtx_q *uq_owner;
1409 
1410 	uq_owner = owner->td_umtxq;
1411 	mtx_assert(&umtx_lock, MA_OWNED);
1412 	if (pi->pi_owner != NULL)
1413 		panic("pi_owner != NULL");
1414 	pi->pi_owner = owner;
1415 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1416 }
1417 
1418 /*
1419  * Claim ownership of a PI mutex.
1420  */
1421 static int
1422 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1423 {
1424 	struct umtx_q *uq, *uq_owner;
1425 
1426 	uq_owner = owner->td_umtxq;
1427 	mtx_lock_spin(&umtx_lock);
1428 	if (pi->pi_owner == owner) {
1429 		mtx_unlock_spin(&umtx_lock);
1430 		return (0);
1431 	}
1432 
1433 	if (pi->pi_owner != NULL) {
1434 		/*
1435 		 * Userland may have already messed up the mutex, sigh.
1436 		 */
1437 		mtx_unlock_spin(&umtx_lock);
1438 		return (EPERM);
1439 	}
1440 	umtx_pi_setowner(pi, owner);
1441 	uq = TAILQ_FIRST(&pi->pi_blocked);
1442 	if (uq != NULL) {
1443 		int pri;
1444 
1445 		pri = UPRI(uq->uq_thread);
1446 		thread_lock(owner);
1447 		if (pri < UPRI(owner))
1448 			sched_lend_user_prio(owner, pri);
1449 		thread_unlock(owner);
1450 	}
1451 	mtx_unlock_spin(&umtx_lock);
1452 	return (0);
1453 }
1454 
1455 static void
1456 umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
1457 {
1458 	struct umtx_q *uq;
1459 	struct umtx_pi *pi;
1460 
1461 	uq = td->td_umtxq;
1462 	/*
1463 	 * Pick up the lock that td is blocked on.
1464 	 */
1465 	pi = uq->uq_pi_blocked;
1466 	MPASS(pi != NULL);
1467 
1468 	/* Resort the turnstile on the list. */
1469 	if (!umtx_pi_adjust_thread(pi, td))
1470 		return;
1471 
1472 	/*
1473 	 * If our priority was lowered and we are at the head of the
1474 	 * turnstile, then propagate our new priority up the chain.
1475 	 */
1476 	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
1477 		umtx_propagate_priority(td);
1478 }
1479 
1480 /*
1481  * Adjust a thread's position in the blocked list of the PI mutex it
1482  * is blocked on; this may trigger a new round of priority propagation.
1483  */
1484 void
1485 umtx_pi_adjust(struct thread *td, u_char oldpri)
1486 {
1487 	struct umtx_q *uq;
1488 	struct umtx_pi *pi;
1489 
1490 	uq = td->td_umtxq;
1491 	mtx_lock_spin(&umtx_lock);
1492 	/*
1493 	 * Pick up the lock that td is blocked on.
1494 	 */
1495 	pi = uq->uq_pi_blocked;
1496 	if (pi != NULL)
1497 		umtx_pi_adjust_locked(td, oldpri);
1498 	mtx_unlock_spin(&umtx_lock);
1499 }
1500 
1501 /*
1502  * Sleep on a PI mutex.
1503  */
1504 static int
1505 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1506 	uint32_t owner, const char *wmesg, int timo)
1507 {
1508 	struct umtxq_chain *uc;
1509 	struct thread *td, *td1;
1510 	struct umtx_q *uq1;
1511 	int pri;
1512 	int error = 0;
1513 
1514 	td = uq->uq_thread;
1515 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1516 	uc = umtxq_getchain(&uq->uq_key);
1517 	UMTXQ_LOCKED_ASSERT(uc);
1518 	UMTXQ_BUSY_ASSERT(uc);
1519 	umtxq_insert(uq);
1520 	mtx_lock_spin(&umtx_lock);
1521 	if (pi->pi_owner == NULL) {
1522 		/* XXX
1523 		 * Currently we only support process-private PI mutexes;
1524 		 * non-contended PI mutexes are locked in userland.
1525 		 * A process-shared PI mutex should always be initialized
1526 		 * and registered in the kernel, and locking should always
1527 		 * be done by the kernel, to avoid security problems.
1528 		 * For a process-private PI mutex we can find the owner
1529 		 * thread and boost its priority safely.
1530 		 */
1531 		mtx_unlock_spin(&umtx_lock);
1532 		PROC_LOCK(curproc);
1533 		td1 = thread_find(curproc, owner);
1534 		mtx_lock_spin(&umtx_lock);
1535 		if (td1 != NULL && pi->pi_owner == NULL) {
1536 			uq1 = td1->td_umtxq;
1537 			umtx_pi_setowner(pi, td1);
1538 		}
1539 		PROC_UNLOCK(curproc);
1540 	}
1541 
1542 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1543 		pri = UPRI(uq1->uq_thread);
1544 		if (pri > UPRI(td))
1545 			break;
1546 	}
1547 
1548 	if (uq1 != NULL)
1549 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1550 	else
1551 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1552 
1553 	uq->uq_pi_blocked = pi;
1554 	thread_lock(td);
1555 	td->td_flags |= TDF_UPIBLOCKED;
1556 	thread_unlock(td);
1557 	umtx_propagate_priority(td);
1558 	mtx_unlock_spin(&umtx_lock);
1559 	umtxq_unbusy(&uq->uq_key);
1560 
1561 	if (uq->uq_flags & UQF_UMTXQ) {
1562 		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
1563 		if (error == EWOULDBLOCK)
1564 			error = ETIMEDOUT;
1565 		if (uq->uq_flags & UQF_UMTXQ) {
1566 			umtxq_remove(uq);
1567 		}
1568 	}
1569 	mtx_lock_spin(&umtx_lock);
1570 	uq->uq_pi_blocked = NULL;
1571 	thread_lock(td);
1572 	td->td_flags &= ~TDF_UPIBLOCKED;
1573 	thread_unlock(td);
1574 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1575 	umtx_unpropagate_priority(pi);
1576 	mtx_unlock_spin(&umtx_lock);
1577 	umtxq_unlock(&uq->uq_key);
1578 
1579 	return (error);
1580 }
1581 
1582 /*
1583  * Add a reference to a PI mutex.
1584  */
1585 static void
1586 umtx_pi_ref(struct umtx_pi *pi)
1587 {
1588 	struct umtxq_chain *uc;
1589 
1590 	uc = umtxq_getchain(&pi->pi_key);
1591 	UMTXQ_LOCKED_ASSERT(uc);
1592 	pi->pi_refcount++;
1593 }
1594 
1595 /*
1596  * Drop a reference to a PI mutex; when the reference count
1597  * reaches zero, its memory is freed.
1598  */
1599 static void
1600 umtx_pi_unref(struct umtx_pi *pi)
1601 {
1602 	struct umtxq_chain *uc;
1603 
1604 	uc = umtxq_getchain(&pi->pi_key);
1605 	UMTXQ_LOCKED_ASSERT(uc);
1606 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1607 	if (--pi->pi_refcount == 0) {
1608 		mtx_lock_spin(&umtx_lock);
1609 		if (pi->pi_owner != NULL) {
1610 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1611 				pi, pi_link);
1612 			pi->pi_owner = NULL;
1613 		}
1614 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1615 			("blocked queue not empty"));
1616 		mtx_unlock_spin(&umtx_lock);
1617 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1618 		umtx_pi_free(pi);
1619 	}
1620 }
1621 
1622 /*
1623  * Find a PI mutex in the hash table.
1624  */
1625 static struct umtx_pi *
1626 umtx_pi_lookup(struct umtx_key *key)
1627 {
1628 	struct umtxq_chain *uc;
1629 	struct umtx_pi *pi;
1630 
1631 	uc = umtxq_getchain(key);
1632 	UMTXQ_LOCKED_ASSERT(uc);
1633 
1634 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1635 		if (umtx_key_match(&pi->pi_key, key)) {
1636 			return (pi);
1637 		}
1638 	}
1639 	return (NULL);
1640 }
1641 
1642 /*
1643  * Insert a PI mutex into the hash table.
1644  */
1645 static inline void
1646 umtx_pi_insert(struct umtx_pi *pi)
1647 {
1648 	struct umtxq_chain *uc;
1649 
1650 	uc = umtxq_getchain(&pi->pi_key);
1651 	UMTXQ_LOCKED_ASSERT(uc);
1652 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1653 }
1654 
1655 /*
1656  * Lock a PI mutex.
1657  */
1658 static int
1659 _do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1660 	int try)
1661 {
1662 	struct umtx_q *uq;
1663 	struct umtx_pi *pi, *new_pi;
1664 	uint32_t id, owner, old;
1665 	int error;
1666 
1667 	id = td->td_tid;
1668 	uq = td->td_umtxq;
1669 
1670 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1671 	    &uq->uq_key)) != 0)
1672 		return (error);
1673 	umtxq_lock(&uq->uq_key);
1674 	pi = umtx_pi_lookup(&uq->uq_key);
1675 	if (pi == NULL) {
1676 		new_pi = umtx_pi_alloc(M_NOWAIT);
1677 		if (new_pi == NULL) {
1678 			umtxq_unlock(&uq->uq_key);
1679 			new_pi = umtx_pi_alloc(M_WAITOK);
1680 			umtxq_lock(&uq->uq_key);
1681 			pi = umtx_pi_lookup(&uq->uq_key);
1682 			if (pi != NULL) {
1683 				umtx_pi_free(new_pi);
1684 				new_pi = NULL;
1685 			}
1686 		}
1687 		if (new_pi != NULL) {
1688 			new_pi->pi_key = uq->uq_key;
1689 			umtx_pi_insert(new_pi);
1690 			pi = new_pi;
1691 		}
1692 	}
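	/*
	 * The allocation dance above: first a M_NOWAIT attempt under the
	 * chain lock; on failure, drop the lock for a M_WAITOK allocation
	 * and redo the lookup, since another thread may have installed a
	 * pi for this key in the meantime.
	 */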
1693 	umtx_pi_ref(pi);
1694 	umtxq_unlock(&uq->uq_key);
1695 
1696 	/*
1697 	 * Care must be exercised when dealing with the umtx structure.  It
1698 	 * can fault on any access.
1699 	 */
1700 	for (;;) {
1701 		/*
1702 		 * Try the uncontested case.  This should be done in userland.
1703 		 */
1704 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1705 
1706 		/* The acquire succeeded. */
1707 		if (owner == UMUTEX_UNOWNED) {
1708 			error = 0;
1709 			break;
1710 		}
1711 
1712 		/* The address was invalid. */
1713 		if (owner == -1) {
1714 			error = EFAULT;
1715 			break;
1716 		}
1717 
1718 		/* If no one owns it but it is contested try to acquire it. */
1719 		if (owner == UMUTEX_CONTESTED) {
1720 			owner = casuword32(&m->m_owner,
1721 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1722 
1723 			if (owner == UMUTEX_CONTESTED) {
1724 				umtxq_lock(&uq->uq_key);
1725 				umtxq_busy(&uq->uq_key);
1726 				error = umtx_pi_claim(pi, td);
1727 				umtxq_unbusy(&uq->uq_key);
1728 				umtxq_unlock(&uq->uq_key);
1729 				break;
1730 			}
1731 
1732 			/* The address was invalid. */
1733 			if (owner == -1) {
1734 				error = EFAULT;
1735 				break;
1736 			}
1737 
1738 			/* If this failed the lock has changed, restart. */
1739 			continue;
1740 		}
1741 
1742 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1743 		    (owner & ~UMUTEX_CONTESTED) == id) {
1744 			error = EDEADLK;
1745 			break;
1746 		}
1747 
1748 		if (try != 0) {
1749 			error = EBUSY;
1750 			break;
1751 		}
1752 
1753 		/*
1754 		 * If we caught a signal during an earlier sleep, we have
1755 		 * already retried once; exit immediately now.
1756 		 */
1757 		if (error != 0)
1758 			break;
1759 
1760 		umtxq_lock(&uq->uq_key);
1761 		umtxq_busy(&uq->uq_key);
1762 		umtxq_unlock(&uq->uq_key);
1763 
1764 		/*
1765 		 * Set the contested bit so that a release in user space
1766 		 * knows to use the system call for unlock.  If this fails
1767 		 * either someone else has acquired the lock or it has been
1768 		 * released.
1769 		 */
1770 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1771 
1772 		/* The address was invalid. */
1773 		if (old == -1) {
1774 			umtxq_lock(&uq->uq_key);
1775 			umtxq_unbusy(&uq->uq_key);
1776 			umtxq_unlock(&uq->uq_key);
1777 			error = EFAULT;
1778 			break;
1779 		}
1780 
1781 		umtxq_lock(&uq->uq_key);
1782 		/*
1783 		 * If we set the contested bit, sleep.  Otherwise the lock has
1784 		 * changed and we need to retry, or we lost a race to the
1785 		 * thread unlocking the umtx.
1786 		 */
1787 		if (old == owner)
1788 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1789 				 "umtxpi", timo);
1790 		else {
1791 			umtxq_unbusy(&uq->uq_key);
1792 			umtxq_unlock(&uq->uq_key);
1793 		}
1794 	}
1795 
1796 	umtxq_lock(&uq->uq_key);
1797 	umtx_pi_unref(pi);
1798 	umtxq_unlock(&uq->uq_key);
1799 
1800 	umtx_key_release(&uq->uq_key);
1801 	return (error);
1802 }
1803 
1804 /*
1805  * Unlock a PI mutex.
1806  */
1807 static int
1808 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1809 {
1810 	struct umtx_key key;
1811 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1812 	struct umtx_pi *pi, *pi2;
1813 	uint32_t owner, old, id;
1814 	int error;
1815 	int count;
1816 	int pri;
1817 
1818 	id = td->td_tid;
1819 	/*
1820 	 * Make sure we own this mtx.
1821 	 */
1822 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1823 	if (owner == -1)
1824 		return (EFAULT);
1825 
1826 	if ((owner & ~UMUTEX_CONTESTED) != id)
1827 		return (EPERM);
1828 
1829 	/* This should be done in userland */
1830 	if ((owner & UMUTEX_CONTESTED) == 0) {
1831 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1832 		if (old == -1)
1833 			return (EFAULT);
1834 		if (old == owner)
1835 			return (0);
1836 		owner = old;
1837 	}
1838 
1839 	/* We should only ever be in here for contested locks */
1840 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1841 	    &key)) != 0)
1842 		return (error);
1843 
1844 	umtxq_lock(&key);
1845 	umtxq_busy(&key);
1846 	count = umtxq_count_pi(&key, &uq_first);
1847 	if (uq_first != NULL) {
1848 		mtx_lock_spin(&umtx_lock);
1849 		pi = uq_first->uq_pi_blocked;
1850 		KASSERT(pi != NULL, ("pi == NULL?"));
1851 		if (pi->pi_owner != curthread) {
1852 			mtx_unlock_spin(&umtx_lock);
1853 			umtxq_unbusy(&key);
1854 			umtxq_unlock(&key);
1855 			umtx_key_release(&key);
1856 			/* userland messed the mutex */
1857 			return (EPERM);
1858 		}
1859 		uq_me = curthread->td_umtxq;
1860 		pi->pi_owner = NULL;
1861 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1862 		/* Get the highest-priority thread that is still sleeping. */
1863 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1864 		while (uq_first != NULL &&
1865 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1866 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1867 		}
1868 		pri = PRI_MAX;
1869 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1870 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1871 			if (uq_first2 != NULL) {
1872 				if (pri > UPRI(uq_first2->uq_thread))
1873 					pri = UPRI(uq_first2->uq_thread);
1874 			}
1875 		}
1876 		thread_lock(curthread);
1877 		sched_unlend_user_prio(curthread, pri);
1878 		thread_unlock(curthread);
1879 		mtx_unlock_spin(&umtx_lock);
1880 		if (uq_first)
1881 			umtxq_signal_thread(uq_first);
1882 	}
1883 	umtxq_unlock(&key);
1884 
1885 	/*
1886 	 * When unlocking the umtx, it must be marked as unowned if
1887 	 * there is at most one thread waiting for it.
1888 	 * Otherwise, it must be marked as contested.
1889 	 */
1890 	old = casuword32(&m->m_owner, owner,
1891 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1892 
1893 	umtxq_lock(&key);
1894 	umtxq_unbusy(&key);
1895 	umtxq_unlock(&key);
1896 	umtx_key_release(&key);
1897 	if (old == -1)
1898 		return (EFAULT);
1899 	if (old != owner)
1900 		return (EINVAL);
1901 	return (0);
1902 }
1903 
1904 /*
1905  * Lock a PP mutex.
1906  */
1907 static int
1908 _do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1909 	int try)
1910 {
1911 	struct umtx_q *uq, *uq2;
1912 	struct umtx_pi *pi;
1913 	uint32_t ceiling;
1914 	uint32_t owner, id;
1915 	int error, pri, old_inherited_pri, su;
1916 
1917 	id = td->td_tid;
1918 	uq = td->td_umtxq;
1919 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1920 	    &uq->uq_key)) != 0)
1921 		return (error);
1922 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1923 	for (;;) {
1924 		old_inherited_pri = uq->uq_inherited_pri;
1925 		umtxq_lock(&uq->uq_key);
1926 		umtxq_busy(&uq->uq_key);
1927 		umtxq_unlock(&uq->uq_key);
1928 
1929 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1930 		if (ceiling > RTP_PRIO_MAX) {
1931 			error = EINVAL;
1932 			goto out;
1933 		}
1934 
1935 		mtx_lock_spin(&umtx_lock);
1936 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1937 			mtx_unlock_spin(&umtx_lock);
1938 			error = EINVAL;
1939 			goto out;
1940 		}
1941 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1942 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1943 			thread_lock(td);
1944 			if (uq->uq_inherited_pri < UPRI(td))
1945 				sched_lend_user_prio(td, uq->uq_inherited_pri);
1946 			thread_unlock(td);
1947 		}
1948 		mtx_unlock_spin(&umtx_lock);
1949 
1950 		owner = casuword32(&m->m_owner,
1951 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1952 
1953 		if (owner == UMUTEX_CONTESTED) {
1954 			error = 0;
1955 			break;
1956 		}
1957 
1958 		/* The address was invalid. */
1959 		if (owner == -1) {
1960 			error = EFAULT;
1961 			break;
1962 		}
1963 
1964 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1965 		    (owner & ~UMUTEX_CONTESTED) == id) {
1966 			error = EDEADLK;
1967 			break;
1968 		}
1969 
1970 		if (try != 0) {
1971 			error = EBUSY;
1972 			break;
1973 		}
1974 
1975 		/*
1976 		 * If we caught a signal during an earlier sleep, we have
1977 		 * already retried once; exit immediately now.
1978 		 */
1979 		if (error != 0)
1980 			break;
1981 
1982 		umtxq_lock(&uq->uq_key);
1983 		umtxq_insert(uq);
1984 		umtxq_unbusy(&uq->uq_key);
1985 		error = umtxq_sleep(uq, "umtxpp", timo);
1986 		umtxq_remove(uq);
1987 		umtxq_unlock(&uq->uq_key);
1988 
1989 		mtx_lock_spin(&umtx_lock);
1990 		uq->uq_inherited_pri = old_inherited_pri;
1991 		pri = PRI_MAX;
1992 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1993 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1994 			if (uq2 != NULL) {
1995 				if (pri > UPRI(uq2->uq_thread))
1996 					pri = UPRI(uq2->uq_thread);
1997 			}
1998 		}
1999 		if (pri > uq->uq_inherited_pri)
2000 			pri = uq->uq_inherited_pri;
2001 		thread_lock(td);
2002 		sched_unlend_user_prio(td, pri);
2003 		thread_unlock(td);
2004 		mtx_unlock_spin(&umtx_lock);
2005 	}
2006 
2007 	if (error != 0) {
2008 		mtx_lock_spin(&umtx_lock);
2009 		uq->uq_inherited_pri = old_inherited_pri;
2010 		pri = PRI_MAX;
2011 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2012 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2013 			if (uq2 != NULL) {
2014 				if (pri > UPRI(uq2->uq_thread))
2015 					pri = UPRI(uq2->uq_thread);
2016 			}
2017 		}
2018 		if (pri > uq->uq_inherited_pri)
2019 			pri = uq->uq_inherited_pri;
2020 		thread_lock(td);
2021 		sched_unlend_user_prio(td, pri);
2022 		thread_unlock(td);
2023 		mtx_unlock_spin(&umtx_lock);
2024 	}
2025 
2026 out:
2027 	umtxq_lock(&uq->uq_key);
2028 	umtxq_unbusy(&uq->uq_key);
2029 	umtxq_unlock(&uq->uq_key);
2030 	umtx_key_release(&uq->uq_key);
2031 	return (error);
2032 }
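/*
 * A note on the ceiling arithmetic above: m_ceilings[0] appears to be
 * on the POSIX scale, where larger values mean higher priority, while
 * kernel priorities are inverted (smaller is better), so a userland
 * ceiling c maps to the kernel priority PRI_MIN_REALTIME +
 * (RTP_PRIO_MAX - c), and the unsigned comparison against RTP_PRIO_MAX
 * rejects out-of-range ceilings.
 */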
2033 
2034 /*
2035  * Unlock a PP mutex.
2036  */
2037 static int
2038 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2039 {
2040 	struct umtx_key key;
2041 	struct umtx_q *uq, *uq2;
2042 	struct umtx_pi *pi;
2043 	uint32_t owner, id;
2044 	uint32_t rceiling;
2045 	int error, pri, new_inherited_pri, su;
2046 
2047 	id = td->td_tid;
2048 	uq = td->td_umtxq;
2049 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2050 
2051 	/*
2052 	 * Make sure we own this mtx.
2053 	 */
2054 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2055 	if (owner == -1)
2056 		return (EFAULT);
2057 
2058 	if ((owner & ~UMUTEX_CONTESTED) != id)
2059 		return (EPERM);
2060 
2061 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2062 	if (error != 0)
2063 		return (error);
2064 
2065 	if (rceiling == -1)
2066 		new_inherited_pri = PRI_MAX;
2067 	else {
2068 		rceiling = RTP_PRIO_MAX - rceiling;
2069 		if (rceiling > RTP_PRIO_MAX)
2070 			return (EINVAL);
2071 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2072 	}
2073 
2074 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2075 	    &key)) != 0)
2076 		return (error);
2077 	umtxq_lock(&key);
2078 	umtxq_busy(&key);
2079 	umtxq_unlock(&key);
2080 	/*
2081 	 * For a priority-protected mutex, always set the unlocked state
2082 	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2083 	 * to lock the mutex.  This is necessary because the thread
2084 	 * priority has to be adjusted for such a mutex.
2085 	 */
2086 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2087 		UMUTEX_CONTESTED);
2088 
2089 	umtxq_lock(&key);
2090 	if (error == 0)
2091 		umtxq_signal(&key, 1);
2092 	umtxq_unbusy(&key);
2093 	umtxq_unlock(&key);
2094 
2095 	if (error == -1)
2096 		error = EFAULT;
2097 	else {
2098 		mtx_lock_spin(&umtx_lock);
2099 		if (su != 0)
2100 			uq->uq_inherited_pri = new_inherited_pri;
2101 		pri = PRI_MAX;
2102 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2103 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2104 			if (uq2 != NULL) {
2105 				if (pri > UPRI(uq2->uq_thread))
2106 					pri = UPRI(uq2->uq_thread);
2107 			}
2108 		}
2109 		if (pri > uq->uq_inherited_pri)
2110 			pri = uq->uq_inherited_pri;
2111 		thread_lock(td);
2112 		sched_unlend_user_prio(td, pri);
2113 		thread_unlock(td);
2114 		mtx_unlock_spin(&umtx_lock);
2115 	}
2116 	umtx_key_release(&key);
2117 	return (error);
2118 }
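/*
 * Editorial sketch (an assumed userland fast path, not code from this
 * file): because the unlock above stores UMUTEX_CONTESTED rather than
 * UMUTEX_UNOWNED, a conventional lock fast path such as
 *
 *	if (!atomic_cmpset_acq_32(&m->m_owner, UMUTEX_UNOWNED, id))
 *		(void)_umtx_op(m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);
 *
 * can never take the userland shortcut for a PP mutex; every lock
 * enters the kernel, where the ceiling priority is applied.
 */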
2119 
2120 static int
2121 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2122 	uint32_t *old_ceiling)
2123 {
2124 	struct umtx_q *uq;
2125 	uint32_t save_ceiling;
2126 	uint32_t owner, id;
2127 	uint32_t flags;
2128 	int error;
2129 
2130 	flags = fuword32(&m->m_flags);
2131 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2132 		return (EINVAL);
2133 	if (ceiling > RTP_PRIO_MAX)
2134 		return (EINVAL);
2135 	id = td->td_tid;
2136 	uq = td->td_umtxq;
2137 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2138 	   &uq->uq_key)) != 0)
2139 		return (error);
2140 	for (;;) {
2141 		umtxq_lock(&uq->uq_key);
2142 		umtxq_busy(&uq->uq_key);
2143 		umtxq_unlock(&uq->uq_key);
2144 
2145 		save_ceiling = fuword32(&m->m_ceilings[0]);
2146 
2147 		owner = casuword32(&m->m_owner,
2148 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2149 
2150 		if (owner == UMUTEX_CONTESTED) {
2151 			suword32(&m->m_ceilings[0], ceiling);
2152 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2153 				UMUTEX_CONTESTED);
2154 			error = 0;
2155 			break;
2156 		}
2157 
2158 		/* The address was invalid. */
2159 		if (owner == -1) {
2160 			error = EFAULT;
2161 			break;
2162 		}
2163 
2164 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2165 			suword32(&m->m_ceilings[0], ceiling);
2166 			error = 0;
2167 			break;
2168 		}
2169 
2170 		/*
2171 		 * If we caught a signal, we have already retried the
2172 		 * operation and now exit immediately.
2173 		 */
2174 		if (error != 0)
2175 			break;
2176 
2177 		/*
2178 		 * If we set the contested bit, sleep.  Otherwise the
2179 		 * state changed and we need to retry, or we lost a
2180 		 * race with the thread unlocking the umtx.
2181 		 */
2182 		umtxq_lock(&uq->uq_key);
2183 		umtxq_insert(uq);
2184 		umtxq_unbusy(&uq->uq_key);
2185 		error = umtxq_sleep(uq, "umtxpp", 0);
2186 		umtxq_remove(uq);
2187 		umtxq_unlock(&uq->uq_key);
2188 	}
2189 	umtxq_lock(&uq->uq_key);
2190 	if (error == 0)
2191 		umtxq_signal(&uq->uq_key, INT_MAX);
2192 	umtxq_unbusy(&uq->uq_key);
2193 	umtxq_unlock(&uq->uq_key);
2194 	umtx_key_release(&uq->uq_key);
2195 	if (error == 0 && old_ceiling != NULL)
2196 		suword32(old_ceiling, save_ceiling);
2197 	return (error);
2198 }
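/*
 * Editorial sketch of the matching userland call (assumes the
 * _umtx_op(2) prototype declared in <sys/umtx.h>; not code from this
 * file):
 *
 *	uint32_t old_ceiling;
 *	error = _umtx_op(&m, UMTX_OP_SET_CEILING, new_ceiling,
 *	    &old_ceiling, NULL);
 *
 * uap->val carries the new ceiling and uap->uaddr1 the address that
 * receives the previous one (see __umtx_op_set_ceiling() below).
 */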
2199 
2200 static int
2201 _do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2202 	int mode)
2203 {
2204 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2205 	case 0:
2206 		return (_do_lock_normal(td, m, flags, timo, mode));
2207 	case UMUTEX_PRIO_INHERIT:
2208 		return (_do_lock_pi(td, m, flags, timo, mode));
2209 	case UMUTEX_PRIO_PROTECT:
2210 		return (_do_lock_pp(td, m, flags, timo, mode));
2211 	}
2212 	return (EINVAL);
2213 }
2214 
2215 /*
2216  * Lock a userland POSIX mutex.
2217  */
2218 static int
2219 do_lock_umutex(struct thread *td, struct umutex *m,
2220 	struct timespec *timeout, int mode)
2221 {
2222 	struct timespec ts, ts2, ts3;
2223 	struct timeval tv;
2224 	uint32_t flags;
2225 	int error;
2226 
2227 	flags = fuword32(&m->m_flags);
2228 	if (flags == -1)
2229 		return (EFAULT);
2230 
2231 	if (timeout == NULL) {
2232 		error = _do_lock_umutex(td, m, flags, 0, mode);
2233 		/* Mutex locking is restarted if it is interrupted. */
2234 		if (error == EINTR && mode != _UMUTEX_WAIT)
2235 			error = ERESTART;
2236 	} else {
2237 		getnanouptime(&ts);
2238 		timespecadd(&ts, timeout);
2239 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2240 		for (;;) {
2241 			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
2242 			if (error != ETIMEDOUT)
2243 				break;
2244 			getnanouptime(&ts2);
2245 			if (timespeccmp(&ts2, &ts, >=)) {
2246 				error = ETIMEDOUT;
2247 				break;
2248 			}
2249 			ts3 = ts;
2250 			timespecsub(&ts3, &ts2);
2251 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2252 		}
2253 		/* Timed-locking is not restarted. */
2254 		if (error == ERESTART)
2255 			error = EINTR;
2256 	}
2257 	return (error);
2258 }
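/*
 * Editorial sketch of a timed lock from userland (assumes the
 * _umtx_op(2) prototype declared in <sys/umtx.h>; not code from this
 * file):
 *
 *	struct timespec ts = { 1, 0 };		// relative, one second
 *	error = _umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, &ts);
 *
 * do_lock_umutex() converts the relative timeout into an absolute
 * uptime deadline, so each ETIMEDOUT retry sleeps only for the time
 * remaining, and it maps ERESTART to EINTR because a timed lock must
 * not be transparently restarted with the full original timeout.
 */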
2259 
2260 /*
2261  * Unlock a userland POSIX mutex.
2262  */
2263 static int
2264 do_unlock_umutex(struct thread *td, struct umutex *m)
2265 {
2266 	uint32_t flags;
2267 
2268 	flags = fuword32(&m->m_flags);
2269 	if (flags == -1)
2270 		return (EFAULT);
2271 
2272 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2273 	case 0:
2274 		return (do_unlock_normal(td, m, flags));
2275 	case UMUTEX_PRIO_INHERIT:
2276 		return (do_unlock_pi(td, m, flags));
2277 	case UMUTEX_PRIO_PROTECT:
2278 		return (do_unlock_pp(td, m, flags));
2279 	}
2280 
2281 	return (EINVAL);
2282 }
2283 
2284 static int
2285 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2286 	struct timespec *timeout, u_long wflags)
2287 {
2288 	struct umtx_q *uq;
2289 	struct timeval tv;
2290 	struct timespec cts, ets, tts;
2291 	uint32_t flags;
2292 	int error;
2293 
2294 	uq = td->td_umtxq;
2295 	flags = fuword32(&cv->c_flags);
2296 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2297 	if (error != 0)
2298 		return (error);
2299 	umtxq_lock(&uq->uq_key);
2300 	umtxq_busy(&uq->uq_key);
2301 	umtxq_insert(uq);
2302 	umtxq_unlock(&uq->uq_key);
2303 
2304 	/*
2305 	 * We must set c_has_waiters to 1 before releasing the user
2306 	 * mutex, so that a concurrent signaller cannot miss us.
2307 	 */
2308 	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2309 
2310 	umtxq_lock(&uq->uq_key);
2311 	umtxq_unbusy(&uq->uq_key);
2312 	umtxq_unlock(&uq->uq_key);
2313 
2314 	error = do_unlock_umutex(td, m);
2315 
2316 	umtxq_lock(&uq->uq_key);
2317 	if (error == 0) {
2318 		if ((wflags & UMTX_CHECK_UNPARKING) &&
2319 		    (td->td_pflags & TDP_WAKEUP)) {
2320 			td->td_pflags &= ~TDP_WAKEUP;
2321 			error = EINTR;
2322 		} else if (timeout == NULL) {
2323 			error = umtxq_sleep(uq, "ucond", 0);
2324 		} else {
2325 			getnanouptime(&ets);
2326 			timespecadd(&ets, timeout);
2327 			TIMESPEC_TO_TIMEVAL(&tv, timeout);
2328 			for (;;) {
2329 				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2330 				if (error != ETIMEDOUT)
2331 					break;
2332 				getnanouptime(&cts);
2333 				if (timespeccmp(&cts, &ets, >=)) {
2334 					error = ETIMEDOUT;
2335 					break;
2336 				}
2337 				tts = ets;
2338 				timespecsub(&tts, &cts);
2339 				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2340 			}
2341 		}
2342 	}
2343 
2344 	if (error != 0) {
2345 		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2346 			/*
2347 			 * If we were concurrently do_cv_signal()ed but
2348 			 * got an error, a UNIX signal or a timeout,
2349 			 * perform another umtxq_signal() to avoid
2350 			 * consuming the wakeup.  This may cause a
2351 			 * spurious wakeup for another thread which was
2352 			 * just queued, but SUSv3 explicitly allows
2353 			 * spurious wakeups to occur, and indeed a
2354 			 * kernel-based implementation cannot avoid them.
2355 			 */
2356 			if (!umtxq_signal(&uq->uq_key, 1))
2357 				error = 0;
2358 		}
2359 		if (error == ERESTART)
2360 			error = EINTR;
2361 	}
2362 	umtxq_remove(uq);
2363 	umtxq_unlock(&uq->uq_key);
2364 	umtx_key_release(&uq->uq_key);
2365 	return (error);
2366 }
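/*
 * Editorial sketch of the intended userland pairing (hedged; the
 * actual libthr code is not part of this file).  Here relock() stands
 * for the caller's own mutex re-acquisition:
 *
 *	while (!predicate) {
 *		_umtx_op(&cv, UMTX_OP_CV_WAIT, 0, &m, NULL);
 *		relock(&m);	// kernel unlocked m above, never relocks
 *	}
 *
 * uap->obj is the ucond, uap->uaddr1 the umutex and uap->uaddr2 an
 * optional timespec (see __umtx_op_cv_wait() below).
 */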
2367 
2368 /*
2369  * Signal a userland condition variable.
2370  */
2371 static int
2372 do_cv_signal(struct thread *td, struct ucond *cv)
2373 {
2374 	struct umtx_key key;
2375 	int error, cnt, nwake;
2376 	uint32_t flags;
2377 
2378 	flags = fuword32(&cv->c_flags);
2379 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2380 		return (error);
2381 	umtxq_lock(&key);
2382 	umtxq_busy(&key);
2383 	cnt = umtxq_count(&key);
2384 	nwake = umtxq_signal(&key, 1);
2385 	if (cnt <= nwake) {
2386 		umtxq_unlock(&key);
2387 		error = suword32(
2388 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2389 		umtxq_lock(&key);
2390 	}
2391 	umtxq_unbusy(&key);
2392 	umtxq_unlock(&key);
2393 	umtx_key_release(&key);
2394 	return (error);
2395 }
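/*
 * Editorial note: when the signal drained the queue (cnt <= nwake),
 * c_has_waiters is cleared, presumably so that userland can skip the
 * wakeup syscall entirely while no waiter is queued.
 */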
2396 
2397 static int
2398 do_cv_broadcast(struct thread *td, struct ucond *cv)
2399 {
2400 	struct umtx_key key;
2401 	int error;
2402 	uint32_t flags;
2403 
2404 	flags = fuword32(&cv->c_flags);
2405 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2406 		return (error);
2407 
2408 	umtxq_lock(&key);
2409 	umtxq_busy(&key);
2410 	umtxq_signal(&key, INT_MAX);
2411 	umtxq_unlock(&key);
2412 
2413 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2414 
2415 	umtxq_lock(&key);
2416 	umtxq_unbusy(&key);
2417 	umtxq_unlock(&key);
2418 
2419 	umtx_key_release(&key);
2420 	return (error);
2421 }
2422 
2423 static int
2424 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2425 {
2426 	struct umtx_q *uq;
2427 	uint32_t flags, wrflags;
2428 	int32_t state, oldstate;
2429 	int32_t blocked_readers;
2430 	int error;
2431 
2432 	uq = td->td_umtxq;
2433 	flags = fuword32(&rwlock->rw_flags);
2434 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2435 	if (error != 0)
2436 		return (error);
2437 
2438 	wrflags = URWLOCK_WRITE_OWNER;
2439 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2440 		wrflags |= URWLOCK_WRITE_WAITERS;
2441 
2442 	for (;;) {
2443 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2444 		/* try to lock it */
2445 		while (!(state & wrflags)) {
2446 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2447 				umtx_key_release(&uq->uq_key);
2448 				return (EAGAIN);
2449 			}
2450 			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2451 			if (oldstate == state) {
2452 				umtx_key_release(&uq->uq_key);
2453 				return (0);
2454 			}
2455 			state = oldstate;
2456 		}
2457 
2458 		if (error)
2459 			break;
2460 
2461 		/* grab monitor lock */
2462 		umtxq_lock(&uq->uq_key);
2463 		umtxq_busy(&uq->uq_key);
2464 		umtxq_unlock(&uq->uq_key);
2465 
2466 		/* set read contention bit */
2467 		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2468 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2469 			if (oldstate == state)
2470 				goto sleep;
2471 			state = oldstate;
2472 		}
2473 
2474 		/* state changed while setting flags, restart */
2475 		if (!(state & wrflags)) {
2476 			umtxq_lock(&uq->uq_key);
2477 			umtxq_unbusy(&uq->uq_key);
2478 			umtxq_unlock(&uq->uq_key);
2479 			continue;
2480 		}
2481 
2482 sleep:
2483 		/* contention bit is set; increase read waiter count before sleeping */
2484 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2485 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2486 
2487 		while (state & wrflags) {
2488 			umtxq_lock(&uq->uq_key);
2489 			umtxq_insert(uq);
2490 			umtxq_unbusy(&uq->uq_key);
2491 
2492 			error = umtxq_sleep(uq, "urdlck", timo);
2493 
2494 			umtxq_busy(&uq->uq_key);
2495 			umtxq_remove(uq);
2496 			umtxq_unlock(&uq->uq_key);
2497 			if (error)
2498 				break;
2499 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2500 		}
2501 
2502 		/* decrease read waiter count, and possibly clear read contention bit */
2503 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2504 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2505 		if (blocked_readers == 1) {
2506 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2507 			for (;;) {
2508 				oldstate = casuword32(&rwlock->rw_state, state,
2509 					 state & ~URWLOCK_READ_WAITERS);
2510 				if (oldstate == state)
2511 					break;
2512 				state = oldstate;
2513 			}
2514 		}
2515 
2516 		umtxq_lock(&uq->uq_key);
2517 		umtxq_unbusy(&uq->uq_key);
2518 		umtxq_unlock(&uq->uq_key);
2519 	}
2520 	umtx_key_release(&uq->uq_key);
2521 	return (error);
2522 }
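/*
 * Editorial sketch of the rw_state word as used above (the bit layout
 * is defined in <sys/umtx.h>; listed here only as implied by this
 * file):
 *
 *	URWLOCK_READER_COUNT(state)	active readers (low bits)
 *	URWLOCK_WRITE_OWNER		a writer holds the lock
 *	URWLOCK_READ_WAITERS		readers sleep in the kernel
 *	URWLOCK_WRITE_WAITERS		writers sleep in the kernel
 *
 * A successful read lock is a single casuword32() bumping the reader
 * count while no blocking write bits are set; the contention bits and
 * rw_blocked_readers only exist to park and unpark sleepers.
 */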
2523 
2524 static int
2525 do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2526 {
2527 	struct timespec ts, ts2, ts3;
2528 	struct timeval tv;
2529 	int error;
2530 
2531 	getnanouptime(&ts);
2532 	timespecadd(&ts, timeout);
2533 	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2534 	for (;;) {
2535 		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2536 		if (error != ETIMEDOUT)
2537 			break;
2538 		getnanouptime(&ts2);
2539 		if (timespeccmp(&ts2, &ts, >=)) {
2540 			error = ETIMEDOUT;
2541 			break;
2542 		}
2543 		ts3 = ts;
2544 		timespecsub(&ts3, &ts2);
2545 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2546 	}
2547 	if (error == ERESTART)
2548 		error = EINTR;
2549 	return (error);
2550 }
2551 
2552 static int
2553 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2554 {
2555 	struct umtx_q *uq;
2556 	uint32_t flags;
2557 	int32_t state, oldstate;
2558 	int32_t blocked_writers;
2559 	int error;
2560 
2561 	uq = td->td_umtxq;
2562 	flags = fuword32(&rwlock->rw_flags);
2563 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2564 	if (error != 0)
2565 		return (error);
2566 
2567 	for (;;) {
2568 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2569 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2570 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2571 			if (oldstate == state) {
2572 				umtx_key_release(&uq->uq_key);
2573 				return (0);
2574 			}
2575 			state = oldstate;
2576 		}
2577 
2578 		if (error)
2579 			break;
2580 
2581 		/* grab monitor lock */
2582 		umtxq_lock(&uq->uq_key);
2583 		umtxq_busy(&uq->uq_key);
2584 		umtxq_unlock(&uq->uq_key);
2585 
2586 		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2587 		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2588 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2589 			if (oldstate == state)
2590 				goto sleep;
2591 			state = oldstate;
2592 		}
2593 
2594 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2595 			umtxq_lock(&uq->uq_key);
2596 			umtxq_unbusy(&uq->uq_key);
2597 			umtxq_unlock(&uq->uq_key);
2598 			continue;
2599 		}
2600 sleep:
2601 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2602 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2603 
2604 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2605 			umtxq_lock(&uq->uq_key);
2606 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2607 			umtxq_unbusy(&uq->uq_key);
2608 
2609 			error = umtxq_sleep(uq, "uwrlck", timo);
2610 
2611 			umtxq_busy(&uq->uq_key);
2612 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2613 			umtxq_unlock(&uq->uq_key);
2614 			if (error)
2615 				break;
2616 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2617 		}
2618 
2619 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2620 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2621 		if (blocked_writers == 1) {
2622 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2623 			for (;;) {
2624 				oldstate = casuword32(&rwlock->rw_state, state,
2625 					 state & ~URWLOCK_WRITE_WAITERS);
2626 				if (oldstate == state)
2627 					break;
2628 				state = oldstate;
2629 			}
2630 		}
2631 
2632 		umtxq_lock(&uq->uq_key);
2633 		umtxq_unbusy(&uq->uq_key);
2634 		umtxq_unlock(&uq->uq_key);
2635 	}
2636 
2637 	umtx_key_release(&uq->uq_key);
2638 	return (error);
2639 }
2640 
2641 static int
2642 do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2643 {
2644 	struct timespec ts, ts2, ts3;
2645 	struct timeval tv;
2646 	int error;
2647 
2648 	getnanouptime(&ts);
2649 	timespecadd(&ts, timeout);
2650 	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2651 	for (;;) {
2652 		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2653 		if (error != ETIMEDOUT)
2654 			break;
2655 		getnanouptime(&ts2);
2656 		if (timespeccmp(&ts2, &ts, >=)) {
2657 			error = ETIMEDOUT;
2658 			break;
2659 		}
2660 		ts3 = ts;
2661 		timespecsub(&ts3, &ts2);
2662 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2663 	}
2664 	if (error == ERESTART)
2665 		error = EINTR;
2666 	return (error);
2667 }
2668 
2669 static int
2670 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2671 {
2672 	struct umtx_q *uq;
2673 	uint32_t flags;
2674 	int32_t state, oldstate;
2675 	int error, q, count;
2676 
2677 	uq = td->td_umtxq;
2678 	flags = fuword32(&rwlock->rw_flags);
2679 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2680 	if (error != 0)
2681 		return (error);
2682 
2683 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2684 	if (state & URWLOCK_WRITE_OWNER) {
2685 		for (;;) {
2686 			oldstate = casuword32(&rwlock->rw_state, state,
2687 				state & ~URWLOCK_WRITE_OWNER);
2688 			if (oldstate != state) {
2689 				state = oldstate;
2690 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2691 					error = EPERM;
2692 					goto out;
2693 				}
2694 			} else
2695 				break;
2696 		}
2697 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2698 		for (;;) {
2699 			oldstate = casuword32(&rwlock->rw_state, state,
2700 				state - 1);
2701 			if (oldstate != state) {
2702 				state = oldstate;
2703 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2704 					error = EPERM;
2705 					goto out;
2706 				}
2707 			} else
2708 				break;
2710 		}
2711 	} else {
2712 		error = EPERM;
2713 		goto out;
2714 	}
2715 
2716 	count = 0;
2717 
2718 	if (!(flags & URWLOCK_PREFER_READER)) {
2719 		if (state & URWLOCK_WRITE_WAITERS) {
2720 			count = 1;
2721 			q = UMTX_EXCLUSIVE_QUEUE;
2722 		} else if (state & URWLOCK_READ_WAITERS) {
2723 			count = INT_MAX;
2724 			q = UMTX_SHARED_QUEUE;
2725 		}
2726 	} else {
2727 		if (state & URWLOCK_READ_WAITERS) {
2728 			count = INT_MAX;
2729 			q = UMTX_SHARED_QUEUE;
2730 		} else if (state & URWLOCK_WRITE_WAITERS) {
2731 			count = 1;
2732 			q = UMTX_EXCLUSIVE_QUEUE;
2733 		}
2734 	}
2735 
2736 	if (count) {
2737 		umtxq_lock(&uq->uq_key);
2738 		umtxq_busy(&uq->uq_key);
2739 		umtxq_signal_queue(&uq->uq_key, count, q);
2740 		umtxq_unbusy(&uq->uq_key);
2741 		umtxq_unlock(&uq->uq_key);
2742 	}
2743 out:
2744 	umtx_key_release(&uq->uq_key);
2745 	return (error);
2746 }
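/*
 * Editorial note: the wakeup selection above is writer-preferring by
 * default (wake one queued writer if any, otherwise all queued
 * readers); URWLOCK_PREFER_READER reverses that order.  Waking
 * INT_MAX threads on UMTX_SHARED_QUEUE simply means "wake every
 * blocked reader".
 */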
2747 
2748 int
2749 _umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2750     /* struct umtx *umtx */
2751 {
2752 	return (_do_lock_umtx(td, uap->umtx, td->td_tid, 0));
2753 }
2754 
2755 int
2756 _umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2757     /* struct umtx *umtx */
2758 {
2759 	return (do_unlock_umtx(td, uap->umtx, td->td_tid));
2760 }
2761 
2762 static int
2763 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2764 {
2765 	struct timespec *ts, timeout;
2766 	int error;
2767 
2768 	/* Allow a null timespec (wait forever). */
2769 	if (uap->uaddr2 == NULL)
2770 		ts = NULL;
2771 	else {
2772 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2773 		if (error != 0)
2774 			return (error);
2775 		if (timeout.tv_nsec >= 1000000000 ||
2776 		    timeout.tv_nsec < 0) {
2777 			return (EINVAL);
2778 		}
2779 		ts = &timeout;
2780 	}
2781 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2782 }
2783 
2784 static int
2785 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2786 {
2787 	return (do_unlock_umtx(td, uap->obj, uap->val));
2788 }
2789 
2790 static int
2791 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2792 {
2793 	struct timespec *ts, timeout;
2794 	int error;
2795 
2796 	if (uap->uaddr2 == NULL)
2797 		ts = NULL;
2798 	else {
2799 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2800 		if (error != 0)
2801 			return (error);
2802 		if (timeout.tv_nsec >= 1000000000 ||
2803 		    timeout.tv_nsec < 0)
2804 			return (EINVAL);
2805 		ts = &timeout;
2806 	}
2807 	return (do_wait(td, uap->obj, uap->val, ts, 0, 0));
2808 }
2809 
2810 static int
2811 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2812 {
2813 	struct timespec *ts, timeout;
2814 	int error;
2815 
2816 	if (uap->uaddr2 == NULL)
2817 		ts = NULL;
2818 	else {
2819 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2820 		if (error != 0)
2821 			return (error);
2822 		if (timeout.tv_nsec >= 1000000000 ||
2823 		    timeout.tv_nsec < 0)
2824 			return (EINVAL);
2825 		ts = &timeout;
2826 	}
2827 	return (do_wait(td, uap->obj, uap->val, ts, 1, 0));
2828 }
2829 
2830 static int
2831 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
2832 {
2833 	struct timespec *ts, timeout;
2834 	int error;
2835 
2836 	if (uap->uaddr2 == NULL)
2837 		ts = NULL;
2838 	else {
2839 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2840 		if (error != 0)
2841 			return (error);
2842 		if (timeout.tv_nsec >= 1000000000 ||
2843 		    timeout.tv_nsec < 0)
2844 			return (EINVAL);
2845 		ts = &timeout;
2846 	}
2847 	return (do_wait(td, uap->obj, uap->val, ts, 1, 1));
2848 }
2849 
2850 static int
2851 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2852 {
2853 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
2854 }
2855 
2856 static int
2857 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
2858 {
2859 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
2860 }
2861 
2862 static int
2863 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2864 {
2865 	struct timespec *ts, timeout;
2866 	int error;
2867 
2868 	/* Allow a null timespec (wait forever). */
2869 	if (uap->uaddr2 == NULL)
2870 		ts = NULL;
2871 	else {
2872 		error = copyin(uap->uaddr2, &timeout,
2873 		    sizeof(timeout));
2874 		if (error != 0)
2875 			return (error);
2876 		if (timeout.tv_nsec >= 1000000000 ||
2877 		    timeout.tv_nsec < 0) {
2878 			return (EINVAL);
2879 		}
2880 		ts = &timeout;
2881 	}
2882 	return (do_lock_umutex(td, uap->obj, ts, 0));
2883 }
2884 
2885 static int
2886 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2887 {
2888 	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
2889 }
2890 
2891 static int
2892 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
2893 {
2894 	struct timespec *ts, timeout;
2895 	int error;
2896 
2897 	/* Allow a null timespec (wait forever). */
2898 	if (uap->uaddr2 == NULL)
2899 		ts = NULL;
2900 	else {
2901 		error = copyin(uap->uaddr2, &timeout,
2902 		    sizeof(timeout));
2903 		if (error != 0)
2904 			return (error);
2905 		if (timeout.tv_nsec >= 1000000000 ||
2906 		    timeout.tv_nsec < 0) {
2907 			return (EINVAL);
2908 		}
2909 		ts = &timeout;
2910 	}
2911 	return (do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT));
2912 }
2913 
2914 static int
2915 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
2916 {
2917 	return (do_wake_umutex(td, uap->obj));
2918 }
2919 
2920 static int
2921 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2922 {
2923 	return (do_unlock_umutex(td, uap->obj));
2924 }
2925 
2926 static int
2927 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2928 {
2929 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
2930 }
2931 
2932 static int
2933 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
2934 {
2935 	struct timespec *ts, timeout;
2936 	int error;
2937 
2938 	/* Allow a null timespec (wait forever). */
2939 	if (uap->uaddr2 == NULL)
2940 		ts = NULL;
2941 	else {
2942 		error = copyin(uap->uaddr2, &timeout,
2943 		    sizeof(timeout));
2944 		if (error != 0)
2945 			return (error);
2946 		if (timeout.tv_nsec >= 1000000000 ||
2947 		    timeout.tv_nsec < 0) {
2948 			return (EINVAL);
2949 		}
2950 		ts = &timeout;
2951 	}
2952 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
2953 }
2954 
2955 static int
2956 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
2957 {
2958 	return (do_cv_signal(td, uap->obj));
2959 }
2960 
2961 static int
2962 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
2963 {
2964 	return (do_cv_broadcast(td, uap->obj));
2965 }
2966 
2967 static int
2968 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
2969 {
2970 	struct timespec timeout;
2971 	int error;
2972 
2973 	/* Allow a null timespec (wait forever). */
2974 	if (uap->uaddr2 == NULL) {
2975 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
2976 	} else {
2977 		error = copyin(uap->uaddr2, &timeout,
2978 		    sizeof(timeout));
2979 		if (error != 0)
2980 			return (error);
2981 		if (timeout.tv_nsec >= 1000000000 ||
2982 		    timeout.tv_nsec < 0) {
2983 			return (EINVAL);
2984 		}
2985 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
2986 	}
2987 	return (error);
2988 }
2989 
2990 static int
2991 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
2992 {
2993 	struct timespec timeout;
2994 	int error;
2995 
2996 	/* Allow a null timespec (wait forever). */
2997 	if (uap->uaddr2 == NULL) {
2998 		error = do_rw_wrlock(td, uap->obj, 0);
2999 	} else {
3000 		error = copyin(uap->uaddr2, &timeout,
3001 		    sizeof(timeout));
3002 		if (error != 0)
3003 			return (error);
3004 		if (timeout.tv_nsec >= 1000000000 ||
3005 		    timeout.tv_nsec < 0) {
3006 			return (EINVAL);
3007 		}
3008 
3009 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3010 	}
3011 	return (error);
3012 }
3013 
3014 static int
3015 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3016 {
3017 	return (do_rw_unlock(td, uap->obj));
3018 }
3019 
3020 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3021 
3022 static _umtx_op_func op_table[] = {
3023 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3024 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3025 	__umtx_op_wait,			/* UMTX_OP_WAIT */
3026 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3027 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3028 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3029 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3030 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3031 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3032 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3033 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3034 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3035 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3036 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3037 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3038 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3039 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3040 	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3041 	__umtx_op_wake_umutex		/* UMTX_OP_UMUTEX_WAKE */
3042 };
3043 
3044 int
3045 _umtx_op(struct thread *td, struct _umtx_op_args *uap)
3046 {
3047 	if ((unsigned)uap->op < UMTX_OP_MAX)
3048 		return (*op_table[uap->op])(td, uap);
3049 	return (EINVAL);
3050 }
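/*
 * Editorial sketch: UMTX_OP_WAIT_UINT_PRIVATE and UMTX_OP_WAKE_PRIVATE
 * give a futex-style primitive.  A hedged userland example ("word" is
 * a name invented here; assumes the _umtx_op(2) prototype declared in
 * <sys/umtx.h>):
 *
 *	unsigned int word = 0;
 *
 *	// Waiter: sleeps only while word still equals the expected 0.
 *	_umtx_op(&word, UMTX_OP_WAIT_UINT_PRIVATE, 0, NULL, NULL);
 *
 *	// Waker: change the word, then wake up to one sleeper.
 *	word = 1;
 *	_umtx_op(&word, UMTX_OP_WAKE_PRIVATE, 1, NULL, NULL);
 */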
3051 
3052 #ifdef COMPAT_IA32
3053 int
3054 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3055     /* struct umtx *umtx */
3056 {
3057 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3058 }
3059 
3060 int
3061 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3062     /* struct umtx *umtx */
3063 {
3064 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3065 }
3066 
3067 struct timespec32 {
3068 	u_int32_t tv_sec;
3069 	u_int32_t tv_nsec;
3070 };
3071 
3072 static inline int
3073 copyin_timeout32(void *addr, struct timespec *tsp)
3074 {
3075 	struct timespec32 ts32;
3076 	int error;
3077 
3078 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3079 	if (error == 0) {
3080 		tsp->tv_sec = ts32.tv_sec;
3081 		tsp->tv_nsec = ts32.tv_nsec;
3082 	}
3083 	return (error);
3084 }
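/*
 * Editorial note: since struct timespec32 uses u_int32_t fields, a
 * negative tv_nsec from a 32-bit process arrives here as a large
 * positive value and is rejected by the callers' check of
 * tv_nsec >= 1000000000 rather than by tv_nsec < 0.
 */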
3085 
3086 static int
3087 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3088 {
3089 	struct timespec *ts, timeout;
3090 	int error;
3091 
3092 	/* Allow a null timespec (wait forever). */
3093 	if (uap->uaddr2 == NULL)
3094 		ts = NULL;
3095 	else {
3096 		error = copyin_timeout32(uap->uaddr2, &timeout);
3097 		if (error != 0)
3098 			return (error);
3099 		if (timeout.tv_nsec >= 1000000000 ||
3100 		    timeout.tv_nsec < 0) {
3101 			return (EINVAL);
3102 		}
3103 		ts = &timeout;
3104 	}
3105 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3106 }
3107 
3108 static int
3109 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3110 {
3111 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3112 }
3113 
3114 static int
3115 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3116 {
3117 	struct timespec *ts, timeout;
3118 	int error;
3119 
3120 	if (uap->uaddr2 == NULL)
3121 		ts = NULL;
3122 	else {
3123 		error = copyin_timeout32(uap->uaddr2, &timeout);
3124 		if (error != 0)
3125 			return (error);
3126 		if (timeout.tv_nsec >= 1000000000 ||
3127 		    timeout.tv_nsec < 0)
3128 			return (EINVAL);
3129 		ts = &timeout;
3130 	}
3131 	return (do_wait(td, uap->obj, uap->val, ts, 1, 0));
3132 }
3133 
3134 static int
3135 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3136 {
3137 	struct timespec *ts, timeout;
3138 	int error;
3139 
3140 	/* Allow a null timespec (wait forever). */
3141 	if (uap->uaddr2 == NULL)
3142 		ts = NULL;
3143 	else {
3144 		error = copyin_timeout32(uap->uaddr2, &timeout);
3145 		if (error != 0)
3146 			return (error);
3147 		if (timeout.tv_nsec >= 1000000000 ||
3148 		    timeout.tv_nsec < 0)
3149 			return (EINVAL);
3150 		ts = &timeout;
3151 	}
3152 	return (do_lock_umutex(td, uap->obj, ts, 0));
3153 }
3154 
3155 static int
3156 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3157 {
3158 	struct timespec *ts, timeout;
3159 	int error;
3160 
3161 	/* Allow a null timespec (wait forever). */
3162 	if (uap->uaddr2 == NULL)
3163 		ts = NULL;
3164 	else {
3165 		error = copyin_timeout32(uap->uaddr2, &timeout);
3166 		if (error != 0)
3167 			return (error);
3168 		if (timeout.tv_nsec >= 1000000000 ||
3169 		    timeout.tv_nsec < 0)
3170 			return (EINVAL);
3171 		ts = &timeout;
3172 	}
3173 	return (do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT));
3174 }
3175 
3176 static int
3177 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3178 {
3179 	struct timespec *ts, timeout;
3180 	int error;
3181 
3182 	/* Allow a null timespec (wait forever). */
3183 	if (uap->uaddr2 == NULL)
3184 		ts = NULL;
3185 	else {
3186 		error = copyin_timeout32(uap->uaddr2, &timeout);
3187 		if (error != 0)
3188 			return (error);
3189 		if (timeout.tv_nsec >= 1000000000 ||
3190 		    timeout.tv_nsec < 0)
3191 			return (EINVAL);
3192 		ts = &timeout;
3193 	}
3194 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3195 }
3196 
3197 static int
3198 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3199 {
3200 	struct timespec timeout;
3201 	int error;
3202 
3203 	/* Allow a null timespec (wait forever). */
3204 	if (uap->uaddr2 == NULL) {
3205 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3206 	} else {
3207 		error = copyin_timeout32(uap->uaddr2, &timeout);
3209 		if (error != 0)
3210 			return (error);
3211 		if (timeout.tv_nsec >= 1000000000 ||
3212 		    timeout.tv_nsec < 0) {
3213 			return (EINVAL);
3214 		}
3215 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3216 	}
3217 	return (error);
3218 }
3219 
3220 static int
3221 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3222 {
3223 	struct timespec timeout;
3224 	int error;
3225 
3226 	/* Allow a null timespec (wait forever). */
3227 	if (uap->uaddr2 == NULL) {
3228 		error = do_rw_wrlock(td, uap->obj, 0);
3229 	} else {
3230 		error = copyin_timeout32(uap->uaddr2, &timeout);
3231 		if (error != 0)
3232 			return (error);
3233 		if (timeout.tv_nsec >= 1000000000 ||
3234 		    timeout.tv_nsec < 0) {
3235 			return (EINVAL);
3236 		}
3237 
3238 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3239 	}
3240 	return (error);
3241 }
3242 
3243 static int
3244 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3245 {
3246 	struct timespec *ts, timeout;
3247 	int error;
3248 
3249 	if (uap->uaddr2 == NULL)
3250 		ts = NULL;
3251 	else {
3252 		error = copyin_timeout32(uap->uaddr2, &timeout);
3253 		if (error != 0)
3254 			return (error);
3255 		if (timeout.tv_nsec >= 1000000000 ||
3256 		    timeout.tv_nsec < 0)
3257 			return (EINVAL);
3258 		ts = &timeout;
3259 	}
3260 	return (do_wait(td, uap->obj, uap->val, ts, 1, 1));
3261 }
3262 
3263 static _umtx_op_func op_table_compat32[] = {
3264 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3265 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3266 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3267 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3268 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3269 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3270 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3271 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3272 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3273 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3274 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3275 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3276 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3277 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3278 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3279 	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3280 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3281 	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3282 	__umtx_op_wake_umutex		/* UMTX_OP_UMUTEX_WAKE */
3283 };
3284 
3285 int
3286 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3287 {
3288 	if ((unsigned)uap->op < UMTX_OP_MAX)
3289 		return (*op_table_compat32[uap->op])(td,
3290 			(struct _umtx_op_args *)uap);
3291 	return (EINVAL);
3292 }
3293 #endif
3294 
3295 void
3296 umtx_thread_init(struct thread *td)
3297 {
3298 	td->td_umtxq = umtxq_alloc();
3299 	td->td_umtxq->uq_thread = td;
3300 }
3301 
3302 void
3303 umtx_thread_fini(struct thread *td)
3304 {
3305 	umtxq_free(td->td_umtxq);
3306 }
3307 
3308 /*
3309  * Called when a new thread is created, e.g. by fork().
3310  */
3311 void
3312 umtx_thread_alloc(struct thread *td)
3313 {
3314 	struct umtx_q *uq;
3315 
3316 	uq = td->td_umtxq;
3317 	uq->uq_inherited_pri = PRI_MAX;
3318 
3319 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3320 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3321 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3322 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3323 }
3324 
3325 /*
3326  * exec() hook.
3327  */
3328 static void
3329 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3330 	struct image_params *imgp __unused)
3331 {
3332 	umtx_thread_cleanup(curthread);
3333 }
3334 
3335 /*
3336  * thread_exit() hook.
3337  */
3338 void
3339 umtx_thread_exit(struct thread *td)
3340 {
3341 	umtx_thread_cleanup(td);
3342 }
3343 
3344 /*
3345  * Clean up umtx data.
3346  */
3347 static void
3348 umtx_thread_cleanup(struct thread *td)
3349 {
3350 	struct umtx_q *uq;
3351 	struct umtx_pi *pi;
3352 
3353 	if ((uq = td->td_umtxq) == NULL)
3354 		return;
3355 
3356 	mtx_lock_spin(&umtx_lock);
3357 	uq->uq_inherited_pri = PRI_MAX;
3358 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3359 		pi->pi_owner = NULL;
3360 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3361 	}
3362 	thread_lock(td);
3363 	td->td_flags &= ~TDF_UBORROWING;
3364 	thread_unlock(td);
3365 	mtx_unlock_spin(&umtx_lock);
3366 }
3367