xref: /freebsd/sys/kern/kern_umtx.c (revision 2227a3e9e1a0bcba8481a8067ee8c4b9a96fdda3)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/mutex.h>
38 #include <sys/priv.h>
39 #include <sys/proc.h>
40 #include <sys/sched.h>
41 #include <sys/smp.h>
42 #include <sys/sysctl.h>
43 #include <sys/sysent.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/eventhandler.h>
47 #include <sys/umtx.h>
48 
49 #include <vm/vm.h>
50 #include <vm/vm_param.h>
51 #include <vm/pmap.h>
52 #include <vm/vm_map.h>
53 #include <vm/vm_object.h>
54 
55 #include <machine/cpu.h>
56 
57 #ifdef COMPAT_IA32
58 #include <compat/freebsd32/freebsd32_proto.h>
59 #endif
60 
61 #define TYPE_SIMPLE_WAIT	0
62 #define TYPE_CV			1
63 #define TYPE_SIMPLE_LOCK	2
64 #define TYPE_NORMAL_UMUTEX	3
65 #define TYPE_PI_UMUTEX		4
66 #define TYPE_PP_UMUTEX		5
67 #define TYPE_RWLOCK		6
68 
69 /* Key to represent a unique userland synchronization object */
70 struct umtx_key {
71 	int	hash;
72 	int	type;
73 	int	shared;
74 	union {
75 		struct {
76 			vm_object_t	object;
77 			uintptr_t	offset;
78 		} shared;
79 		struct {
80 			struct vmspace	*vs;
81 			uintptr_t	addr;
82 		} private;
83 		struct {
84 			void		*a;
85 			uintptr_t	b;
86 		} both;
87 	} info;
88 };
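
/*
 * Note (descriptive): umtx_key_get() below fills these fields in.  A
 * process-private object is identified by (vmspace, user address),
 * while a process-shared object is identified by (backing VM object,
 * offset), so two processes mapping the same object at different
 * addresses still resolve to the same key.
 */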
89 
90 /* Priority inheritance mutex info. */
91 struct umtx_pi {
92 	/* Owner thread */
93 	struct thread		*pi_owner;
94 
95 	/* Reference count */
96 	int			pi_refcount;
97 
98 	/* List entry to link umtx being held by a thread */
99 	TAILQ_ENTRY(umtx_pi)	pi_link;
100 
101 	/* List entry in hash */
102 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
103 
104 	/* List for waiters */
105 	TAILQ_HEAD(,umtx_q)	pi_blocked;
106 
107 	/* Identify a userland lock object */
108 	struct umtx_key		pi_key;
109 };
110 
111 /* A waiter on a userland synchronization object. */
112 struct umtx_q {
113 	/* Linked list for the hash. */
114 	TAILQ_ENTRY(umtx_q)	uq_link;
115 
116 	/* Umtx key. */
117 	struct umtx_key		uq_key;
118 
119 	/* Umtx flags. */
120 	int			uq_flags;
121 #define UQF_UMTXQ	0x0001
122 
123 	/* The thread that is waiting. */
124 	struct thread		*uq_thread;
125 
126 	/*
127 	 * The PI mutex this thread is blocked on.  Reads may use
128 	 * either the chain lock or umtx_lock; writes must hold
129 	 * both the chain lock and umtx_lock.
130 	 */
131 	struct umtx_pi		*uq_pi_blocked;
132 
133 	/* On blocked list */
134 	TAILQ_ENTRY(umtx_q)	uq_lockq;
135 
136 	/* PI mutexes owned by us that other threads contend for */
137 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
138 
139 	/* Inherited priority from PP mutex */
140 	u_char			uq_inherited_pri;
141 };
142 
143 TAILQ_HEAD(umtxq_head, umtx_q);
144 
145 /* Userland lock object's wait-queue chain */
146 struct umtxq_chain {
147 	/* Lock for this chain. */
148 	struct mtx		uc_lock;
149 
150 	/* List of sleep queues. */
151 	struct umtxq_head	uc_queue[2];
152 #define UMTX_SHARED_QUEUE	0
153 #define UMTX_EXCLUSIVE_QUEUE	1
154 
155 	/* Busy flag */
156 	char			uc_busy;
157 
158 	/* Chain lock waiters */
159 	int			uc_waiters;
160 
161 	/* All PI mutexes hashed to this chain */
162 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
163 };
164 
165 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
166 
167 /*
168  * Don't propagate time-sharing priority; there is a security reason.
169  * A user can simply create a PI mutex, let thread A lock it, and let
170  * another thread B block on it.  Because B is sleeping, its priority
171  * will be boosted, which boosts A's priority via priority propagation
172  * as well; A's priority would then never be lowered, even if A were
173  * using 100% of the CPU, which is unfair to other processes.
174  */
175 
176 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
177 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
178 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
179 
180 #define	GOLDEN_RATIO_PRIME	2654404609U
181 #define	UMTX_CHAINS		128
182 #define	UMTX_SHIFTS		(__WORD_BIT - 7)
183 
184 #define THREAD_SHARE		0
185 #define PROCESS_SHARE		1
186 #define AUTO_SHARE		2
187 
188 #define	GET_SHARE(flags)	\
189     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
190 
191 #define BUSY_SPINS		200
192 
193 static uma_zone_t		umtx_pi_zone;
194 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
195 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
196 static int			umtx_pi_allocated;
197 
198 SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
199 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
200     &umtx_pi_allocated, 0, "Allocated umtx_pi");
201 
202 static void umtxq_sysinit(void *);
203 static void umtxq_hash(struct umtx_key *key);
204 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
205 static void umtxq_lock(struct umtx_key *key);
206 static void umtxq_unlock(struct umtx_key *key);
207 static void umtxq_busy(struct umtx_key *key);
208 static void umtxq_unbusy(struct umtx_key *key);
209 static void umtxq_insert_queue(struct umtx_q *uq, int q);
210 static void umtxq_remove_queue(struct umtx_q *uq, int q);
211 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
212 static int umtxq_count(struct umtx_key *key);
213 static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
214 static int umtx_key_get(void *addr, int type, int share,
215 	struct umtx_key *key);
216 static void umtx_key_release(struct umtx_key *key);
217 static struct umtx_pi *umtx_pi_alloc(int);
218 static void umtx_pi_free(struct umtx_pi *pi);
219 static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
220 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
221 static void umtx_thread_cleanup(struct thread *td);
222 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
223 	struct image_params *imgp __unused);
224 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
225 
226 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
227 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
228 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
229 
230 static struct mtx umtx_lock;
231 
232 static void
233 umtxq_sysinit(void *arg __unused)
234 {
235 	int i, j;
236 
237 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
238 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
239 	for (i = 0; i < 2; ++i) {
240 		for (j = 0; j < UMTX_CHAINS; ++j) {
241 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
242 				 MTX_DEF | MTX_DUPOK);
243 			TAILQ_INIT(&umtxq_chains[i][j].uc_queue[0]);
244 			TAILQ_INIT(&umtxq_chains[i][j].uc_queue[1]);
245 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
246 			umtxq_chains[i][j].uc_busy = 0;
247 			umtxq_chains[i][j].uc_waiters = 0;
248 		}
249 	}
250 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
251 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
252 	    EVENTHANDLER_PRI_ANY);
253 }
254 
255 struct umtx_q *
256 umtxq_alloc(void)
257 {
258 	struct umtx_q *uq;
259 
260 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
261 	TAILQ_INIT(&uq->uq_pi_contested);
262 	uq->uq_inherited_pri = PRI_MAX;
263 	return (uq);
264 }
265 
266 void
267 umtxq_free(struct umtx_q *uq)
268 {
269 	free(uq, M_UMTX);
270 }
271 
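/*
 * Note (descriptive): this is a multiplicative hash in the spirit of
 * golden-ratio (Fibonacci) hashing.  Multiplying by the constant
 * scatters the key bits into the high part of the product, from which
 * UMTX_SHIFTS keeps the top seven bits (matching UMTX_CHAINS == 128 ==
 * 2^7); the final modulo is then only a safety net.
 */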
272 static inline void
273 umtxq_hash(struct umtx_key *key)
274 {
275 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
276 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
277 }
278 
279 static inline int
280 umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
281 {
282 	return (k1->type == k2->type &&
283 		k1->info.both.a == k2->info.both.a &&
284 	        k1->info.both.b == k2->info.both.b);
285 }
286 
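/*
 * Note (descriptive): wait and condvar objects (type <= TYPE_CV) are
 * kept in the second chain array, so a condition variable and its
 * associated mutex never share a chain lock even when both addresses
 * hash to the same bucket.
 */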
287 static inline struct umtxq_chain *
288 umtxq_getchain(struct umtx_key *key)
289 {
290 	if (key->type <= TYPE_CV)
291 		return (&umtxq_chains[1][key->hash]);
292 	return (&umtxq_chains[0][key->hash]);
293 }
294 
295 /*
296  * Lock a chain.
297  */
298 static inline void
299 umtxq_lock(struct umtx_key *key)
300 {
301 	struct umtxq_chain *uc;
302 
303 	uc = umtxq_getchain(key);
304 	mtx_lock(&uc->uc_lock);
305 }
306 
307 /*
308  * Unlock a chain.
309  */
310 static inline void
311 umtxq_unlock(struct umtx_key *key)
312 {
313 	struct umtxq_chain *uc;
314 
315 	uc = umtxq_getchain(key);
316 	mtx_unlock(&uc->uc_lock);
317 }
318 
319 /*
320  * Set the chain to the busy state when the following operation
321  * may block (a kernel mutex cannot be held across it).
322  */
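/*
 * Typical usage in this file (illustrative):
 *
 *	umtxq_lock(key); umtxq_busy(key); umtxq_unlock(key);
 *	... access the userland word (e.g. casuword32()), which may
 *	    fault and sleep ...
 *	umtxq_lock(key); umtxq_unbusy(key); umtxq_unlock(key);
 */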
323 static inline void
324 umtxq_busy(struct umtx_key *key)
325 {
326 	struct umtxq_chain *uc;
327 
328 	uc = umtxq_getchain(key);
329 	mtx_assert(&uc->uc_lock, MA_OWNED);
330 	if (uc->uc_busy) {
331 #ifdef SMP
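		/*
		 * On an MP system, spin for a while first in the hope
		 * that the holder clears the busy flag before we sleep.
		 */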
332 		if (smp_cpus > 1) {
333 			int count = BUSY_SPINS;
334 			if (count > 0) {
335 				umtxq_unlock(key);
336 				while (uc->uc_busy && --count > 0)
337 					cpu_spinwait();
338 				umtxq_lock(key);
339 			}
340 		}
341 #endif
342 		while (uc->uc_busy) {
343 			uc->uc_waiters++;
344 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
345 			uc->uc_waiters--;
346 		}
347 	}
348 	uc->uc_busy = 1;
349 }
350 
351 /*
352  * Unbusy a chain.
353  */
354 static inline void
355 umtxq_unbusy(struct umtx_key *key)
356 {
357 	struct umtxq_chain *uc;
358 
359 	uc = umtxq_getchain(key);
360 	mtx_assert(&uc->uc_lock, MA_OWNED);
361 	KASSERT(uc->uc_busy != 0, ("not busy"));
362 	uc->uc_busy = 0;
363 	if (uc->uc_waiters)
364 		wakeup_one(uc);
365 }
366 
367 static inline void
368 umtxq_insert_queue(struct umtx_q *uq, int q)
369 {
370 	struct umtxq_chain *uc;
371 
372 	uc = umtxq_getchain(&uq->uq_key);
373 	UMTXQ_LOCKED_ASSERT(uc);
374 	TAILQ_INSERT_TAIL(&uc->uc_queue[q], uq, uq_link);
375 	uq->uq_flags |= UQF_UMTXQ;
376 }
377 
378 static inline void
379 umtxq_remove_queue(struct umtx_q *uq, int q)
380 {
381 	struct umtxq_chain *uc;
382 
383 	uc = umtxq_getchain(&uq->uq_key);
384 	UMTXQ_LOCKED_ASSERT(uc);
385 	if (uq->uq_flags & UQF_UMTXQ) {
386 		TAILQ_REMOVE(&uc->uc_queue[q], uq, uq_link);
387 		uq->uq_flags &= ~UQF_UMTXQ;
388 	}
389 }
390 
391 /*
392  * Check if there are multiple waiters
393  */
394 static int
395 umtxq_count(struct umtx_key *key)
396 {
397 	struct umtxq_chain *uc;
398 	struct umtx_q *uq;
399 	int count = 0;
400 
401 	uc = umtxq_getchain(key);
402 	UMTXQ_LOCKED_ASSERT(uc);
403 	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
404 		if (umtx_key_match(&uq->uq_key, key)) {
405 			if (++count > 1)
406 				break;
407 		}
408 	}
409 	return (count);
410 }
411 
412 /*
413  * Check if there are multiple PI waiters and return the first
414  * waiter.
415  */
416 static int
417 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
418 {
419 	struct umtxq_chain *uc;
420 	struct umtx_q *uq;
421 	int count = 0;
422 
423 	*first = NULL;
424 	uc = umtxq_getchain(key);
425 	UMTXQ_LOCKED_ASSERT(uc);
426 	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
427 		if (umtx_key_match(&uq->uq_key, key)) {
428 			if (++count > 1)
429 				break;
430 			*first = uq;
431 		}
432 	}
433 	return (count);
434 }
435 
436 /*
437  * Wake up threads waiting on a userland object.
438  */
439 
440 static int
441 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
442 {
443 	struct umtxq_chain *uc;
444 	struct umtx_q *uq, *next;
445 	int ret;
446 
447 	ret = 0;
448 	uc = umtxq_getchain(key);
449 	UMTXQ_LOCKED_ASSERT(uc);
450 	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue[q], uq_link, next) {
451 		if (umtx_key_match(&uq->uq_key, key)) {
452 			umtxq_remove_queue(uq, q);
453 			wakeup(uq);
454 			if (++ret >= n_wake)
455 				break;
456 		}
457 	}
458 	return (ret);
459 }
460 
461 
462 /*
463  * Wake up specified thread.
464  */
465 static inline void
466 umtxq_signal_thread(struct umtx_q *uq)
467 {
468 	struct umtxq_chain *uc;
469 
470 	uc = umtxq_getchain(&uq->uq_key);
471 	UMTXQ_LOCKED_ASSERT(uc);
472 	umtxq_remove(uq);
473 	wakeup(uq);
474 }
475 
476 /*
477  * Put the thread into a sleep state; before sleeping, check whether
478  * the thread has already been removed from the umtx queue.
479  */
480 static inline int
481 umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
482 {
483 	struct umtxq_chain *uc;
484 	int error;
485 
486 	uc = umtxq_getchain(&uq->uq_key);
487 	UMTXQ_LOCKED_ASSERT(uc);
488 	if (!(uq->uq_flags & UQF_UMTXQ))
489 		return (0);
490 	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
491 	if (error == EWOULDBLOCK)
492 		error = ETIMEDOUT;
493 	return (error);
494 }
495 
496 /*
497  * Convert a userspace address into a unique logical address.
498  */
499 static int
500 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
501 {
502 	struct thread *td = curthread;
503 	vm_map_t map;
504 	vm_map_entry_t entry;
505 	vm_pindex_t pindex;
506 	vm_prot_t prot;
507 	boolean_t wired;
508 
509 	key->type = type;
510 	if (share == THREAD_SHARE) {
511 		key->shared = 0;
512 		key->info.private.vs = td->td_proc->p_vmspace;
513 		key->info.private.addr = (uintptr_t)addr;
514 	} else {
515 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
516 		map = &td->td_proc->p_vmspace->vm_map;
517 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
518 		    &entry, &key->info.shared.object, &pindex, &prot,
519 		    &wired) != KERN_SUCCESS) {
520 			return EFAULT;
521 		}
522 
523 		if ((share == PROCESS_SHARE) ||
524 		    (share == AUTO_SHARE &&
525 		     VM_INHERIT_SHARE == entry->inheritance)) {
526 			key->shared = 1;
527 			key->info.shared.offset = entry->offset + entry->start -
528 				(vm_offset_t)addr;
529 			vm_object_reference(key->info.shared.object);
530 		} else {
531 			key->shared = 0;
532 			key->info.private.vs = td->td_proc->p_vmspace;
533 			key->info.private.addr = (uintptr_t)addr;
534 		}
535 		vm_map_lookup_done(map, entry);
536 	}
537 
538 	umtxq_hash(key);
539 	return (0);
540 }
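
/*
 * Illustrative pairing (as used throughout this file): every
 * successful umtx_key_get() is matched by umtx_key_release(), which
 * drops the VM object reference taken for shared keys:
 *
 *	error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
 *	    &uq->uq_key);
 *	if (error != 0)
 *		return (error);
 *	...
 *	umtx_key_release(&uq->uq_key);
 */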
541 
542 /*
543  * Release key.
544  */
545 static inline void
546 umtx_key_release(struct umtx_key *key)
547 {
548 	if (key->shared)
549 		vm_object_deallocate(key->info.shared.object);
550 }
551 
552 /*
553  * Lock a umtx object.
554  */
555 static int
556 _do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
557 {
558 	struct umtx_q *uq;
559 	u_long owner;
560 	u_long old;
561 	int error = 0;
562 
563 	uq = td->td_umtxq;
564 
565 	/*
566 	 * Care must be exercised when dealing with umtx structure. It
567 	 * can fault on any access.
568 	 */
569 	for (;;) {
570 		/*
571 		 * Try the uncontested case.  This should be done in userland.
572 		 */
573 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
574 
575 		/* The acquire succeeded. */
576 		if (owner == UMTX_UNOWNED)
577 			return (0);
578 
579 		/* The address was invalid. */
580 		if (owner == -1)
581 			return (EFAULT);
582 
583 		/* If no one owns it but it is contested try to acquire it. */
584 		if (owner == UMTX_CONTESTED) {
585 			owner = casuword(&umtx->u_owner,
586 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
587 
588 			if (owner == UMTX_CONTESTED)
589 				return (0);
590 
591 			/* The address was invalid. */
592 			if (owner == -1)
593 				return (EFAULT);
594 
595 			/* If this failed the lock has changed, restart. */
596 			continue;
597 		}
598 
599 		/*
600 		 * If we caught a signal, we have retried and now
601 		 * exit immediately.
602 		 */
603 		if (error != 0)
604 			return (error);
605 
606 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
607 			AUTO_SHARE, &uq->uq_key)) != 0)
608 			return (error);
609 
610 		umtxq_lock(&uq->uq_key);
611 		umtxq_busy(&uq->uq_key);
612 		umtxq_insert(uq);
613 		umtxq_unbusy(&uq->uq_key);
614 		umtxq_unlock(&uq->uq_key);
615 
616 		/*
617 		 * Set the contested bit so that a release in user space
618 		 * knows to use the system call for unlock.  If this fails
619 		 * either someone else has acquired the lock or it has been
620 		 * released.
621 		 */
622 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
623 
624 		/* The address was invalid. */
625 		if (old == -1) {
626 			umtxq_lock(&uq->uq_key);
627 			umtxq_remove(uq);
628 			umtxq_unlock(&uq->uq_key);
629 			umtx_key_release(&uq->uq_key);
630 			return (EFAULT);
631 		}
632 
633 		/*
634 		 * If we managed to set the contested bit, sleep.  Otherwise
635 		 * the lock changed and we need to retry, or we lost a race
636 		 * to the thread unlocking the umtx.
637 		 */
638 		umtxq_lock(&uq->uq_key);
639 		if (old == owner)
640 			error = umtxq_sleep(uq, "umtx", timo);
641 		umtxq_remove(uq);
642 		umtxq_unlock(&uq->uq_key);
643 		umtx_key_release(&uq->uq_key);
644 	}
645 
646 	return (0);
647 }
648 
649 /*
650  * Lock a umtx object.
651  */
652 static int
653 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
654 	struct timespec *timeout)
655 {
656 	struct timespec ts, ts2, ts3;
657 	struct timeval tv;
658 	int error;
659 
660 	if (timeout == NULL) {
661 		error = _do_lock_umtx(td, umtx, id, 0);
662 		/* Mutex locking is restarted if it is interrupted. */
663 		if (error == EINTR)
664 			error = ERESTART;
665 	} else {
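		/*
		 * Convert the relative timeout into an absolute deadline
		 * (ts); after each ETIMEDOUT wakeup the remaining time
		 * (ts3 = ts - now) is recomputed for the next sleep.
		 */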
666 		getnanouptime(&ts);
667 		timespecadd(&ts, timeout);
668 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
669 		for (;;) {
670 			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
671 			if (error != ETIMEDOUT)
672 				break;
673 			getnanouptime(&ts2);
674 			if (timespeccmp(&ts2, &ts, >=)) {
675 				error = ETIMEDOUT;
676 				break;
677 			}
678 			ts3 = ts;
679 			timespecsub(&ts3, &ts2);
680 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
681 		}
682 		/* Timed-locking is not restarted. */
683 		if (error == ERESTART)
684 			error = EINTR;
685 	}
686 	return (error);
687 }
688 
689 /*
690  * Unlock a umtx object.
691  */
692 static int
693 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
694 {
695 	struct umtx_key key;
696 	u_long owner;
697 	u_long old;
698 	int error;
699 	int count;
700 
701 	/*
702 	 * Make sure we own this mtx.
703 	 */
704 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
705 	if (owner == -1)
706 		return (EFAULT);
707 
708 	if ((owner & ~UMTX_CONTESTED) != id)
709 		return (EPERM);
710 
711 	/* This should be done in userland */
712 	if ((owner & UMTX_CONTESTED) == 0) {
713 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
714 		if (old == -1)
715 			return (EFAULT);
716 		if (old == owner)
717 			return (0);
718 		owner = old;
719 	}
720 
721 	/* We should only ever be in here for contested locks */
722 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
723 		&key)) != 0)
724 		return (error);
725 
726 	umtxq_lock(&key);
727 	umtxq_busy(&key);
728 	count = umtxq_count(&key);
729 	umtxq_unlock(&key);
730 
731 	/*
732 	 * When unlocking the umtx, it must be marked as unowned if
733 	 * at most one thread is waiting for it.
734 	 * Otherwise, it must be marked as contested.
735 	 */
736 	old = casuword(&umtx->u_owner, owner,
737 		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
738 	umtxq_lock(&key);
739 	umtxq_signal(&key, 1);
740 	umtxq_unbusy(&key);
741 	umtxq_unlock(&key);
742 	umtx_key_release(&key);
743 	if (old == -1)
744 		return (EFAULT);
745 	if (old != owner)
746 		return (EINVAL);
747 	return (0);
748 }
749 
750 #ifdef COMPAT_IA32
751 
752 /*
753  * Lock a umtx object.
754  */
755 static int
756 _do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
757 {
758 	struct umtx_q *uq;
759 	uint32_t owner;
760 	uint32_t old;
761 	int error = 0;
762 
763 	uq = td->td_umtxq;
764 
765 	/*
766 	 * Care must be exercised when dealing with umtx structure. It
767 	 * can fault on any access.
768 	 */
769 	for (;;) {
770 		/*
771 		 * Try the uncontested case.  This should be done in userland.
772 		 */
773 		owner = casuword32(m, UMUTEX_UNOWNED, id);
774 
775 		/* The acquire succeeded. */
776 		if (owner == UMUTEX_UNOWNED)
777 			return (0);
778 
779 		/* The address was invalid. */
780 		if (owner == -1)
781 			return (EFAULT);
782 
783 		/* If no one owns it but it is contested try to acquire it. */
784 		if (owner == UMUTEX_CONTESTED) {
785 			owner = casuword32(m,
786 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
787 			if (owner == UMUTEX_CONTESTED)
788 				return (0);
789 
790 			/* The address was invalid. */
791 			if (owner == -1)
792 				return (EFAULT);
793 
794 			/* If this failed the lock has changed, restart. */
795 			continue;
796 		}
797 
798 		/*
799 		 * If we caught a signal, we have retried and now
800 		 * exit immediately.
801 		 */
802 		if (error != 0)
803 			return (error);
804 
805 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
806 			AUTO_SHARE, &uq->uq_key)) != 0)
807 			return (error);
808 
809 		umtxq_lock(&uq->uq_key);
810 		umtxq_busy(&uq->uq_key);
811 		umtxq_insert(uq);
812 		umtxq_unbusy(&uq->uq_key);
813 		umtxq_unlock(&uq->uq_key);
814 
815 		/*
816 		 * Set the contested bit so that a release in user space
817 		 * knows to use the system call for unlock.  If this fails
818 		 * either someone else has acquired the lock or it has been
819 		 * released.
820 		 */
821 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
822 
823 		/* The address was invalid. */
824 		if (old == -1) {
825 			umtxq_lock(&uq->uq_key);
826 			umtxq_remove(uq);
827 			umtxq_unlock(&uq->uq_key);
828 			umtx_key_release(&uq->uq_key);
829 			return (EFAULT);
830 		}
831 
832 		/*
833 		 * If we managed to set the contested bit, sleep.  Otherwise
834 		 * the lock changed and we need to retry, or we lost a race
835 		 * to the thread unlocking the umtx.
836 		 */
837 		umtxq_lock(&uq->uq_key);
838 		if (old == owner)
839 			error = umtxq_sleep(uq, "umtx", timo);
840 		umtxq_remove(uq);
841 		umtxq_unlock(&uq->uq_key);
842 		umtx_key_release(&uq->uq_key);
843 	}
844 
845 	return (0);
846 }
847 
848 /*
849  * Lock a umtx object.
850  */
851 static int
852 do_lock_umtx32(struct thread *td, void *m, uint32_t id,
853 	struct timespec *timeout)
854 {
855 	struct timespec ts, ts2, ts3;
856 	struct timeval tv;
857 	int error;
858 
859 	if (timeout == NULL) {
860 		error = _do_lock_umtx32(td, m, id, 0);
861 		/* Mutex locking is restarted if it is interrupted. */
862 		if (error == EINTR)
863 			error = ERESTART;
864 	} else {
865 		getnanouptime(&ts);
866 		timespecadd(&ts, timeout);
867 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
868 		for (;;) {
869 			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
870 			if (error != ETIMEDOUT)
871 				break;
872 			getnanouptime(&ts2);
873 			if (timespeccmp(&ts2, &ts, >=)) {
874 				error = ETIMEDOUT;
875 				break;
876 			}
877 			ts3 = ts;
878 			timespecsub(&ts3, &ts2);
879 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
880 		}
881 		/* Timed-locking is not restarted. */
882 		if (error == ERESTART)
883 			error = EINTR;
884 	}
885 	return (error);
886 }
887 
888 /*
889  * Unlock a umtx object.
890  */
891 static int
892 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
893 {
894 	struct umtx_key key;
895 	uint32_t owner;
896 	uint32_t old;
897 	int error;
898 	int count;
899 
900 	/*
901 	 * Make sure we own this mtx.
902 	 */
903 	owner = fuword32(m);
904 	if (owner == -1)
905 		return (EFAULT);
906 
907 	if ((owner & ~UMUTEX_CONTESTED) != id)
908 		return (EPERM);
909 
910 	/* This should be done in userland */
911 	if ((owner & UMUTEX_CONTESTED) == 0) {
912 		old = casuword32(m, owner, UMUTEX_UNOWNED);
913 		if (old == -1)
914 			return (EFAULT);
915 		if (old == owner)
916 			return (0);
917 		owner = old;
918 	}
919 
920 	/* We should only ever be in here for contested locks */
921 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
922 		&key)) != 0)
923 		return (error);
924 
925 	umtxq_lock(&key);
926 	umtxq_busy(&key);
927 	count = umtxq_count(&key);
928 	umtxq_unlock(&key);
929 
930 	/*
931 	 * When unlocking the umtx, it must be marked as unowned if
932 	 * there is zero or one thread only waiting for it.
933 	 * at most one thread is waiting for it.
934 	 */
935 	old = casuword32(m, owner,
936 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
937 	umtxq_lock(&key);
938 	umtxq_signal(&key, 1);
939 	umtxq_unbusy(&key);
940 	umtxq_unlock(&key);
941 	umtx_key_release(&key);
942 	if (old == -1)
943 		return (EFAULT);
944 	if (old != owner)
945 		return (EINVAL);
946 	return (0);
947 }
948 #endif
949 
950 /*
951  * Fetch and compare a value; sleep on the address if the value is unchanged.
952  */
953 static int
954 do_wait(struct thread *td, void *addr, u_long id,
955 	struct timespec *timeout, int compat32, int is_private)
956 {
957 	struct umtx_q *uq;
958 	struct timespec ts, ts2, ts3;
959 	struct timeval tv;
960 	u_long tmp;
961 	int error = 0;
962 
963 	uq = td->td_umtxq;
964 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
965 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
966 		return (error);
967 
968 	umtxq_lock(&uq->uq_key);
969 	umtxq_insert(uq);
970 	umtxq_unlock(&uq->uq_key);
971 	if (compat32 == 0)
972 		tmp = fuword(addr);
973 	else
974 		tmp = fuword32(addr);
975 	if (tmp != id) {
976 		umtxq_lock(&uq->uq_key);
977 		umtxq_remove(uq);
978 		umtxq_unlock(&uq->uq_key);
979 	} else if (timeout == NULL) {
980 		umtxq_lock(&uq->uq_key);
981 		error = umtxq_sleep(uq, "uwait", 0);
982 		umtxq_remove(uq);
983 		umtxq_unlock(&uq->uq_key);
984 	} else {
985 		getnanouptime(&ts);
986 		timespecadd(&ts, timeout);
987 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
988 		umtxq_lock(&uq->uq_key);
989 		for (;;) {
990 			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
991 			if (!(uq->uq_flags & UQF_UMTXQ))
992 				break;
993 			if (error != ETIMEDOUT)
994 				break;
995 			umtxq_unlock(&uq->uq_key);
996 			getnanouptime(&ts2);
997 			if (timespeccmp(&ts2, &ts, >=)) {
998 				error = ETIMEDOUT;
999 				umtxq_lock(&uq->uq_key);
1000 				break;
1001 			}
1002 			ts3 = ts;
1003 			timespecsub(&ts3, &ts2);
1004 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
1005 			umtxq_lock(&uq->uq_key);
1006 		}
1007 		umtxq_remove(uq);
1008 		umtxq_unlock(&uq->uq_key);
1009 	}
1010 	umtx_key_release(&uq->uq_key);
1011 	if (error == ERESTART)
1012 		error = EINTR;
1013 	return (error);
1014 }
1015 
1016 /*
1017  * Wake up threads sleeping on the specified address.
1018  */
1019 int
1020 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1021 {
1022 	struct umtx_key key;
1023 	int ret;
1024 
1025 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1026 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1027 		return (ret);
1028 	umtxq_lock(&key);
1029 	ret = umtxq_signal(&key, n_wake);
1030 	umtxq_unlock(&key);
1031 	umtx_key_release(&key);
1032 	return (0);
1033 }
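
/*
 * Note (descriptive): the number of threads actually woken by
 * umtxq_signal() is computed but not passed back; the function
 * reports only whether the key lookup succeeded.
 */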
1034 
1035 /*
1036  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1037  */
1038 static int
1039 _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1040 	int try)
1041 {
1042 	struct umtx_q *uq;
1043 	uint32_t owner, old, id;
1044 	int error = 0;
1045 
1046 	id = td->td_tid;
1047 	uq = td->td_umtxq;
1048 
1049 	/*
1050 	 * Care must be exercised when dealing with umtx structure. It
1051 	 * can fault on any access.
1052 	 */
1053 	for (;;) {
1054 		/*
1055 		 * Try the uncontested case.  This should be done in userland.
1056 		 */
1057 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1058 
1059 		/* The acquire succeeded. */
1060 		if (owner == UMUTEX_UNOWNED)
1061 			return (0);
1062 
1063 		/* The address was invalid. */
1064 		if (owner == -1)
1065 			return (EFAULT);
1066 
1067 		/* If no one owns it but it is contested try to acquire it. */
1068 		if (owner == UMUTEX_CONTESTED) {
1069 			owner = casuword32(&m->m_owner,
1070 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1071 
1072 			if (owner == UMUTEX_CONTESTED)
1073 				return (0);
1074 
1075 			/* The address was invalid. */
1076 			if (owner == -1)
1077 				return (EFAULT);
1078 
1079 			/* If this failed the lock has changed, restart. */
1080 			continue;
1081 		}
1082 
1083 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1084 		    (owner & ~UMUTEX_CONTESTED) == id)
1085 			return (EDEADLK);
1086 
1087 		if (try != 0)
1088 			return (EBUSY);
1089 
1090 		/*
1091 		 * If we caught a signal, we have retried and now
1092 		 * exit immediately.
1093 		 */
1094 		if (error != 0)
1095 			return (error);
1096 
1097 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1098 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1099 			return (error);
1100 
1101 		umtxq_lock(&uq->uq_key);
1102 		umtxq_busy(&uq->uq_key);
1103 		umtxq_insert(uq);
1104 		umtxq_unbusy(&uq->uq_key);
1105 		umtxq_unlock(&uq->uq_key);
1106 
1107 		/*
1108 		 * Set the contested bit so that a release in user space
1109 		 * knows to use the system call for unlock.  If this fails
1110 		 * either someone else has acquired the lock or it has been
1111 		 * released.
1112 		 */
1113 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1114 
1115 		/* The address was invalid. */
1116 		if (old == -1) {
1117 			umtxq_lock(&uq->uq_key);
1118 			umtxq_remove(uq);
1119 			umtxq_unlock(&uq->uq_key);
1120 			umtx_key_release(&uq->uq_key);
1121 			return (EFAULT);
1122 		}
1123 
1124 		/*
1125 		 * If we managed to set the contested bit, sleep.  Otherwise
1126 		 * the lock changed and we need to retry, or we lost a race
1127 		 * to the thread unlocking the umtx.
1128 		 */
1129 		umtxq_lock(&uq->uq_key);
1130 		if (old == owner)
1131 			error = umtxq_sleep(uq, "umtxn", timo);
1132 		umtxq_remove(uq);
1133 		umtxq_unlock(&uq->uq_key);
1134 		umtx_key_release(&uq->uq_key);
1135 	}
1136 
1137 	return (0);
1138 }
1139 
1143 /*
1144  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1145  */
1146 static int
1147 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1148 {
1149 	struct umtx_key key;
1150 	uint32_t owner, old, id;
1151 	int error;
1152 	int count;
1153 
1154 	id = td->td_tid;
1155 	/*
1156 	 * Make sure we own this mtx.
1157 	 */
1158 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1159 	if (owner == -1)
1160 		return (EFAULT);
1161 
1162 	if ((owner & ~UMUTEX_CONTESTED) != id)
1163 		return (EPERM);
1164 
1165 	/* This should be done in userland */
1166 	if ((owner & UMUTEX_CONTESTED) == 0) {
1167 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1168 		if (old == -1)
1169 			return (EFAULT);
1170 		if (old == owner)
1171 			return (0);
1172 		owner = old;
1173 	}
1174 
1175 	/* We should only ever be in here for contested locks */
1176 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1177 	    &key)) != 0)
1178 		return (error);
1179 
1180 	umtxq_lock(&key);
1181 	umtxq_busy(&key);
1182 	count = umtxq_count(&key);
1183 	umtxq_unlock(&key);
1184 
1185 	/*
1186 	 * When unlocking the umtx, it must be marked as unowned if
1187 	 * there is zero or one thread only waiting for it.
1188 	 * at most one thread is waiting for it.
1189 	 */
1190 	old = casuword32(&m->m_owner, owner,
1191 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1192 	umtxq_lock(&key);
1193 	umtxq_signal(&key, 1);
1194 	umtxq_unbusy(&key);
1195 	umtxq_unlock(&key);
1196 	umtx_key_release(&key);
1197 	if (old == -1)
1198 		return (EFAULT);
1199 	if (old != owner)
1200 		return (EINVAL);
1201 	return (0);
1202 }
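
/*
 * Note (descriptive): on the contested path above exactly one waiter
 * is signalled, and when more than one waiter remains the lock word
 * is rewritten as UMUTEX_CONTESTED rather than UMUTEX_UNOWNED, so the
 * woken thread reacquires via the contested branch and the remaining
 * waiters keep the contested bit set.
 */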
1203 
1204 static inline struct umtx_pi *
1205 umtx_pi_alloc(int flags)
1206 {
1207 	struct umtx_pi *pi;
1208 
1209 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1210 	TAILQ_INIT(&pi->pi_blocked);
1211 	atomic_add_int(&umtx_pi_allocated, 1);
1212 	return (pi);
1213 }
1214 
1215 static inline void
1216 umtx_pi_free(struct umtx_pi *pi)
1217 {
1218 	uma_zfree(umtx_pi_zone, pi);
1219 	atomic_add_int(&umtx_pi_allocated, -1);
1220 }
1221 
1222 /*
1223  * Adjust the thread's position on a PI mutex's blocked list after
1224  * its priority has been changed.
1225  */
1226 static int
1227 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1228 {
1229 	struct umtx_q *uq, *uq1, *uq2;
1230 	struct thread *td1;
1231 
1232 	mtx_assert(&umtx_lock, MA_OWNED);
1233 	if (pi == NULL)
1234 		return (0);
1235 
1236 	uq = td->td_umtxq;
1237 
1238 	/*
1239 	 * Check if the thread needs to be moved on the blocked chain.
1240 	 * It needs to be moved if either its priority is lower than
1241 	 * the previous thread or higher than the next thread.
1242 	 */
1243 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1244 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1245 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1246 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1247 		/*
1248 		 * Remove thread from blocked chain and determine where
1249 		 * it should be moved to.
1250 		 */
1251 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1252 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1253 			td1 = uq1->uq_thread;
1254 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1255 			if (UPRI(td1) > UPRI(td))
1256 				break;
1257 		}
1258 
1259 		if (uq1 == NULL)
1260 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1261 		else
1262 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1263 	}
1264 	return (1);
1265 }
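
/*
 * Note (descriptive): the pi_blocked queue is kept sorted by effective
 * user priority, so the head of the queue is always the
 * highest-priority waiter; both umtx_pi_claim() and
 * umtx_unpropagate_priority() rely on looking at only TAILQ_FIRST()
 * of this queue.
 */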
1266 
1267 /*
1268  * Propagate priority when a thread is blocked on a POSIX
1269  * PI mutex.
1270  */
1271 static void
1272 umtx_propagate_priority(struct thread *td)
1273 {
1274 	struct umtx_q *uq;
1275 	struct umtx_pi *pi;
1276 	int pri;
1277 
1278 	mtx_assert(&umtx_lock, MA_OWNED);
1279 	pri = UPRI(td);
1280 	uq = td->td_umtxq;
1281 	pi = uq->uq_pi_blocked;
1282 	if (pi == NULL)
1283 		return;
1284 
1285 	for (;;) {
1286 		td = pi->pi_owner;
1287 		if (td == NULL)
1288 			return;
1289 
1290 		MPASS(td->td_proc != NULL);
1291 		MPASS(td->td_proc->p_magic == P_MAGIC);
1292 
1293 		if (UPRI(td) <= pri)
1294 			return;
1295 
1296 		thread_lock(td);
1297 		sched_lend_user_prio(td, pri);
1298 		thread_unlock(td);
1299 
1300 		/*
1301 		 * Pick up the lock that td is blocked on.
1302 		 */
1303 		uq = td->td_umtxq;
1304 		pi = uq->uq_pi_blocked;
1305 		/* Resort td on the list if needed. */
1306 		if (!umtx_pi_adjust_thread(pi, td))
1307 			break;
1308 	}
1309 }
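
/*
 * Note (descriptive): the loop above follows the chain of owners; if
 * the owner of this PI mutex is itself blocked on another PI mutex,
 * the lent priority is pushed further up the chain until an owner
 * that is not blocked (or already runs at least that high) is
 * reached.
 */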
1310 
1311 /*
1312  * Unpropagate priority for a PI mutex when a thread blocked on
1313  * it is interrupted by a signal or resumed by others.
1314  */
1315 static void
1316 umtx_unpropagate_priority(struct umtx_pi *pi)
1317 {
1318 	struct umtx_q *uq, *uq_owner;
1319 	struct umtx_pi *pi2;
1320 	int pri, oldpri;
1321 
1322 	mtx_assert(&umtx_lock, MA_OWNED);
1323 
1324 	while (pi != NULL && pi->pi_owner != NULL) {
1325 		pri = PRI_MAX;
1326 		uq_owner = pi->pi_owner->td_umtxq;
1327 
1328 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1329 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1330 			if (uq != NULL) {
1331 				if (pri > UPRI(uq->uq_thread))
1332 					pri = UPRI(uq->uq_thread);
1333 			}
1334 		}
1335 
1336 		if (pri > uq_owner->uq_inherited_pri)
1337 			pri = uq_owner->uq_inherited_pri;
1338 		thread_lock(pi->pi_owner);
1339 		oldpri = pi->pi_owner->td_user_pri;
1340 		sched_unlend_user_prio(pi->pi_owner, pri);
1341 		thread_unlock(pi->pi_owner);
1342 		umtx_pi_adjust_locked(pi->pi_owner, oldpri);
1343 		pi = uq_owner->uq_pi_blocked;
1344 	}
1345 }
1346 
1347 /*
1348  * Insert a PI mutex into the owner thread's contested list.
1349  */
1350 static void
1351 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1352 {
1353 	struct umtx_q *uq_owner;
1354 
1355 	uq_owner = owner->td_umtxq;
1356 	mtx_assert(&umtx_lock, MA_OWNED);
1357 	if (pi->pi_owner != NULL)
1358 		panic("pi_owner != NULL");
1359 	pi->pi_owner = owner;
1360 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1361 }
1362 
1363 /*
1364  * Claim ownership of a PI mutex.
1365  */
1366 static int
1367 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1368 {
1369 	struct umtx_q *uq, *uq_owner;
1370 
1371 	uq_owner = owner->td_umtxq;
1372 	mtx_lock_spin(&umtx_lock);
1373 	if (pi->pi_owner == owner) {
1374 		mtx_unlock_spin(&umtx_lock);
1375 		return (0);
1376 	}
1377 
1378 	if (pi->pi_owner != NULL) {
1379 		/*
1380 		 * userland may have already messed up the mutex, sigh.
1381 		 */
1382 		mtx_unlock_spin(&umtx_lock);
1383 		return (EPERM);
1384 	}
1385 	umtx_pi_setowner(pi, owner);
1386 	uq = TAILQ_FIRST(&pi->pi_blocked);
1387 	if (uq != NULL) {
1388 		int pri;
1389 
1390 		pri = UPRI(uq->uq_thread);
1391 		thread_lock(owner);
1392 		if (pri < UPRI(owner))
1393 			sched_lend_user_prio(owner, pri);
1394 		thread_unlock(owner);
1395 	}
1396 	mtx_unlock_spin(&umtx_lock);
1397 	return (0);
1398 }
1399 
1400 static void
1401 umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
1402 {
1403 	struct umtx_q *uq;
1404 	struct umtx_pi *pi;
1405 
1406 	uq = td->td_umtxq;
1407 	/*
1408 	 * Pick up the lock that td is blocked on.
1409 	 */
1410 	pi = uq->uq_pi_blocked;
1411 	MPASS(pi != NULL);
1412 
1413 	/* Resort the turnstile on the list. */
1414 	if (!umtx_pi_adjust_thread(pi, td))
1415 		return;
1416 
1417 	/*
1418 	 * If our priority was lowered and we are at the head of the
1419 	 * turnstile, then propagate our new priority up the chain.
1420 	 */
1421 	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
1422 		umtx_propagate_priority(td);
1423 }
1424 
1425 /*
1426  * Adjust a thread's position in the queue of the PI mutex it is
1427  * blocked on; this may trigger a new round of priority propagation.
1428  */
1429 void
1430 umtx_pi_adjust(struct thread *td, u_char oldpri)
1431 {
1432 	struct umtx_q *uq;
1433 	struct umtx_pi *pi;
1434 
1435 	uq = td->td_umtxq;
1436 	mtx_lock_spin(&umtx_lock);
1437 	/*
1438 	 * Pick up the lock that td is blocked on.
1439 	 */
1440 	pi = uq->uq_pi_blocked;
1441 	if (pi != NULL)
1442 		umtx_pi_adjust_locked(td, oldpri);
1443 	mtx_unlock_spin(&umtx_lock);
1444 }
1445 
1446 /*
1447  * Sleep on a PI mutex.
1448  */
1449 static int
1450 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1451 	uint32_t owner, const char *wmesg, int timo)
1452 {
1453 	struct umtxq_chain *uc;
1454 	struct thread *td, *td1;
1455 	struct umtx_q *uq1;
1456 	int pri;
1457 	int error = 0;
1458 
1459 	td = uq->uq_thread;
1460 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1461 	uc = umtxq_getchain(&uq->uq_key);
1462 	UMTXQ_LOCKED_ASSERT(uc);
1463 	umtxq_insert(uq);
1464 	if (pi->pi_owner == NULL) {
1465 		/* XXX
1466 		 * Currently, we only support process-private PI mutexes;
1467 		 * non-contended PI mutexes are locked in userland.
1468 		 * A process-shared PI mutex should always be initialized
1469 		 * and registered by the kernel, and locking should always
1470 		 * be done by the kernel to avoid security problems.
1471 		 * For a process-private PI mutex, we can find the owner
1472 		 * thread and boost its priority safely.
1473 		 */
1474 		PROC_LOCK(curproc);
1475 		td1 = thread_find(curproc, owner);
1476 		mtx_lock_spin(&umtx_lock);
1477 		if (td1 != NULL && pi->pi_owner == NULL) {
1478 			uq1 = td1->td_umtxq;
1479 			umtx_pi_setowner(pi, td1);
1480 		}
1481 		PROC_UNLOCK(curproc);
1482 	} else {
1483 		mtx_lock_spin(&umtx_lock);
1484 	}
1485 
1486 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1487 		pri = UPRI(uq1->uq_thread);
1488 		if (pri > UPRI(td))
1489 			break;
1490 	}
1491 
1492 	if (uq1 != NULL)
1493 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1494 	else
1495 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1496 
1497 	uq->uq_pi_blocked = pi;
1498 	thread_lock(td);
1499 	td->td_flags |= TDF_UPIBLOCKED;
1500 	thread_unlock(td);
1501 	mtx_unlock_spin(&umtx_lock);
1502 	umtxq_unlock(&uq->uq_key);
1503 
1504 	mtx_lock_spin(&umtx_lock);
1505 	umtx_propagate_priority(td);
1506 	mtx_unlock_spin(&umtx_lock);
1507 
1508 	umtxq_lock(&uq->uq_key);
1509 	if (uq->uq_flags & UQF_UMTXQ) {
1510 		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
1511 		if (error == EWOULDBLOCK)
1512 			error = ETIMEDOUT;
1513 		if (uq->uq_flags & UQF_UMTXQ) {
1514 			umtxq_busy(&uq->uq_key);
1515 			umtxq_remove(uq);
1516 			umtxq_unbusy(&uq->uq_key);
1517 		}
1518 	}
1519 	umtxq_unlock(&uq->uq_key);
1520 
1521 	mtx_lock_spin(&umtx_lock);
1522 	uq->uq_pi_blocked = NULL;
1523 	thread_lock(td);
1524 	td->td_flags &= ~TDF_UPIBLOCKED;
1525 	thread_unlock(td);
1526 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1527 	umtx_unpropagate_priority(pi);
1528 	mtx_unlock_spin(&umtx_lock);
1529 
1530 	umtxq_lock(&uq->uq_key);
1531 
1532 	return (error);
1533 }
1534 
1535 /*
1536  * Add a reference to a PI mutex.
1537  */
1538 static void
1539 umtx_pi_ref(struct umtx_pi *pi)
1540 {
1541 	struct umtxq_chain *uc;
1542 
1543 	uc = umtxq_getchain(&pi->pi_key);
1544 	UMTXQ_LOCKED_ASSERT(uc);
1545 	pi->pi_refcount++;
1546 }
1547 
1548 /*
1549  * Drop a reference to a PI mutex; when the reference count
1550  * reaches zero, its memory is freed.
1551  */
1552 static void
1553 umtx_pi_unref(struct umtx_pi *pi)
1554 {
1555 	struct umtxq_chain *uc;
1556 	int free = 0;
1557 
1558 	uc = umtxq_getchain(&pi->pi_key);
1559 	UMTXQ_LOCKED_ASSERT(uc);
1560 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1561 	if (--pi->pi_refcount == 0) {
1562 		mtx_lock_spin(&umtx_lock);
1563 		if (pi->pi_owner != NULL) {
1564 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1565 				pi, pi_link);
1566 			pi->pi_owner = NULL;
1567 		}
1568 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1569 			("blocked queue not empty"));
1570 		mtx_unlock_spin(&umtx_lock);
1571 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1572 		free = 1;
1573 	}
1574 	if (free)
1575 		umtx_pi_free(pi);
1576 }
1577 
1578 /*
1579  * Find a PI mutex in hash table.
1580  */
1581 static struct umtx_pi *
1582 umtx_pi_lookup(struct umtx_key *key)
1583 {
1584 	struct umtxq_chain *uc;
1585 	struct umtx_pi *pi;
1586 
1587 	uc = umtxq_getchain(key);
1588 	UMTXQ_LOCKED_ASSERT(uc);
1589 
1590 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1591 		if (umtx_key_match(&pi->pi_key, key)) {
1592 			return (pi);
1593 		}
1594 	}
1595 	return (NULL);
1596 }
1597 
1598 /*
1599  * Insert a PI mutex into hash table.
1600  */
1601 static inline void
1602 umtx_pi_insert(struct umtx_pi *pi)
1603 {
1604 	struct umtxq_chain *uc;
1605 
1606 	uc = umtxq_getchain(&pi->pi_key);
1607 	UMTXQ_LOCKED_ASSERT(uc);
1608 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1609 }
1610 
1611 /*
1612  * Lock a PI mutex.
1613  */
1614 static int
1615 _do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1616 	int try)
1617 {
1618 	struct umtx_q *uq;
1619 	struct umtx_pi *pi, *new_pi;
1620 	uint32_t id, owner, old;
1621 	int error;
1622 
1623 	id = td->td_tid;
1624 	uq = td->td_umtxq;
1625 
1626 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1627 	    &uq->uq_key)) != 0)
1628 		return (error);
1629 	umtxq_lock(&uq->uq_key);
1630 	pi = umtx_pi_lookup(&uq->uq_key);
1631 	if (pi == NULL) {
1632 		new_pi = umtx_pi_alloc(M_NOWAIT);
1633 		if (new_pi == NULL) {
1634 			umtxq_unlock(&uq->uq_key);
1635 			new_pi = umtx_pi_alloc(M_WAITOK);
1636 			new_pi->pi_key = uq->uq_key;
1637 			umtxq_lock(&uq->uq_key);
1638 			pi = umtx_pi_lookup(&uq->uq_key);
1639 			if (pi != NULL) {
1640 				umtx_pi_free(new_pi);
1641 				new_pi = NULL;
1642 			}
1643 		}
1644 		if (new_pi != NULL) {
1645 			new_pi->pi_key = uq->uq_key;
1646 			umtx_pi_insert(new_pi);
1647 			pi = new_pi;
1648 		}
1649 	}
1650 	umtx_pi_ref(pi);
1651 	umtxq_unlock(&uq->uq_key);
1652 
1653 	/*
1654 	 * Care must be exercised when dealing with umtx structure.  It
1655 	 * can fault on any access.
1656 	 */
1657 	for (;;) {
1658 		/*
1659 		 * Try the uncontested case.  This should be done in userland.
1660 		 */
1661 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1662 
1663 		/* The acquire succeeded. */
1664 		if (owner == UMUTEX_UNOWNED) {
1665 			error = 0;
1666 			break;
1667 		}
1668 
1669 		/* The address was invalid. */
1670 		if (owner == -1) {
1671 			error = EFAULT;
1672 			break;
1673 		}
1674 
1675 		/* If no one owns it but it is contested try to acquire it. */
1676 		if (owner == UMUTEX_CONTESTED) {
1677 			owner = casuword32(&m->m_owner,
1678 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1679 
1680 			if (owner == UMUTEX_CONTESTED) {
1681 				umtxq_lock(&uq->uq_key);
1682 				error = umtx_pi_claim(pi, td);
1683 				umtxq_unlock(&uq->uq_key);
1684 				break;
1685 			}
1686 
1687 			/* The address was invalid. */
1688 			if (owner == -1) {
1689 				error = EFAULT;
1690 				break;
1691 			}
1692 
1693 			/* If this failed the lock has changed, restart. */
1694 			continue;
1695 		}
1696 
1697 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1698 		    (owner & ~UMUTEX_CONTESTED) == id) {
1699 			error = EDEADLK;
1700 			break;
1701 		}
1702 
1703 		if (try != 0) {
1704 			error = EBUSY;
1705 			break;
1706 		}
1707 
1708 		/*
1709 		 * If we caught a signal, we have retried and now
1710 		 * exit immediately.
1711 		 */
1712 		if (error != 0)
1713 			break;
1714 
1715 		umtxq_lock(&uq->uq_key);
1716 		umtxq_busy(&uq->uq_key);
1717 		umtxq_unlock(&uq->uq_key);
1718 
1719 		/*
1720 		 * Set the contested bit so that a release in user space
1721 		 * knows to use the system call for unlock.  If this fails
1722 		 * either someone else has acquired the lock or it has been
1723 		 * released.
1724 		 */
1725 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1726 
1727 		/* The address was invalid. */
1728 		if (old == -1) {
1729 			umtxq_lock(&uq->uq_key);
1730 			umtxq_unbusy(&uq->uq_key);
1731 			umtxq_unlock(&uq->uq_key);
1732 			error = EFAULT;
1733 			break;
1734 		}
1735 
1736 		umtxq_lock(&uq->uq_key);
1737 		umtxq_unbusy(&uq->uq_key);
1738 		/*
1739 		 * If we managed to set the contested bit, sleep.  Otherwise
1740 		 * the lock changed and we need to retry, or we lost a race
1741 		 * to the thread unlocking the umtx.
1742 		 */
1743 		if (old == owner)
1744 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1745 				 "umtxpi", timo);
1746 		umtxq_unlock(&uq->uq_key);
1747 	}
1748 
1749 	umtxq_lock(&uq->uq_key);
1750 	umtx_pi_unref(pi);
1751 	umtxq_unlock(&uq->uq_key);
1752 
1753 	umtx_key_release(&uq->uq_key);
1754 	return (error);
1755 }
1756 
1757 /*
1758  * Unlock a PI mutex.
1759  */
1760 static int
1761 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1762 {
1763 	struct umtx_key key;
1764 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1765 	struct umtx_pi *pi, *pi2;
1766 	uint32_t owner, old, id;
1767 	int error;
1768 	int count;
1769 	int pri;
1770 
1771 	id = td->td_tid;
1772 	/*
1773 	 * Make sure we own this mtx.
1774 	 */
1775 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1776 	if (owner == -1)
1777 		return (EFAULT);
1778 
1779 	if ((owner & ~UMUTEX_CONTESTED) != id)
1780 		return (EPERM);
1781 
1782 	/* This should be done in userland */
1783 	if ((owner & UMUTEX_CONTESTED) == 0) {
1784 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1785 		if (old == -1)
1786 			return (EFAULT);
1787 		if (old == owner)
1788 			return (0);
1789 		owner = old;
1790 	}
1791 
1792 	/* We should only ever be in here for contested locks */
1793 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1794 	    &key)) != 0)
1795 		return (error);
1796 
1797 	umtxq_lock(&key);
1798 	umtxq_busy(&key);
1799 	count = umtxq_count_pi(&key, &uq_first);
1800 	if (uq_first != NULL) {
1801 		pi = uq_first->uq_pi_blocked;
1802 		if (pi->pi_owner != curthread) {
1803 			umtxq_unbusy(&key);
1804 			umtxq_unlock(&key);
1805 			/* userland messed up the mutex */
1806 			return (EPERM);
1807 		}
1808 		uq_me = curthread->td_umtxq;
1809 		mtx_lock_spin(&umtx_lock);
1810 		pi->pi_owner = NULL;
1811 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1812 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1813 		pri = PRI_MAX;
1814 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1815 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1816 			if (uq_first2 != NULL) {
1817 				if (pri > UPRI(uq_first2->uq_thread))
1818 					pri = UPRI(uq_first2->uq_thread);
1819 			}
1820 		}
1821 		thread_lock(curthread);
1822 		sched_unlend_user_prio(curthread, pri);
1823 		thread_unlock(curthread);
1824 		mtx_unlock_spin(&umtx_lock);
1825 	}
1826 	umtxq_unlock(&key);
1827 
1828 	/*
1829 	 * When unlocking the umtx, it must be marked as unowned if
1830 	 * at most one thread is waiting for it.
1831 	 * Otherwise, it must be marked as contested.
1832 	 */
1833 	old = casuword32(&m->m_owner, owner,
1834 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1835 
1836 	umtxq_lock(&key);
1837 	if (uq_first != NULL)
1838 		umtxq_signal_thread(uq_first);
1839 	umtxq_unbusy(&key);
1840 	umtxq_unlock(&key);
1841 	umtx_key_release(&key);
1842 	if (old == -1)
1843 		return (EFAULT);
1844 	if (old != owner)
1845 		return (EINVAL);
1846 	return (0);
1847 }
1848 
1849 /*
1850  * Lock a PP mutex.
1851  */
1852 static int
1853 _do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1854 	int try)
1855 {
1856 	struct umtx_q *uq, *uq2;
1857 	struct umtx_pi *pi;
1858 	uint32_t ceiling;
1859 	uint32_t owner, id;
1860 	int error, pri, old_inherited_pri, su;
1861 
1862 	id = td->td_tid;
1863 	uq = td->td_umtxq;
1864 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1865 	    &uq->uq_key)) != 0)
1866 		return (error);
1867 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1868 	for (;;) {
1869 		old_inherited_pri = uq->uq_inherited_pri;
1870 		umtxq_lock(&uq->uq_key);
1871 		umtxq_busy(&uq->uq_key);
1872 		umtxq_unlock(&uq->uq_key);
1873 
1874 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1875 		if (ceiling > RTP_PRIO_MAX) {
1876 			error = EINVAL;
1877 			goto out;
1878 		}
1879 
1880 		mtx_lock_spin(&umtx_lock);
1881 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1882 			mtx_unlock_spin(&umtx_lock);
1883 			error = EINVAL;
1884 			goto out;
1885 		}
1886 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1887 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1888 			thread_lock(td);
1889 			if (uq->uq_inherited_pri < UPRI(td))
1890 				sched_lend_user_prio(td, uq->uq_inherited_pri);
1891 			thread_unlock(td);
1892 		}
1893 		mtx_unlock_spin(&umtx_lock);
1894 
1895 		owner = casuword32(&m->m_owner,
1896 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1897 
1898 		if (owner == UMUTEX_CONTESTED) {
1899 			error = 0;
1900 			break;
1901 		}
1902 
1903 		/* The address was invalid. */
1904 		if (owner == -1) {
1905 			error = EFAULT;
1906 			break;
1907 		}
1908 
1909 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1910 		    (owner & ~UMUTEX_CONTESTED) == id) {
1911 			error = EDEADLK;
1912 			break;
1913 		}
1914 
1915 		if (try != 0) {
1916 			error = EBUSY;
1917 			break;
1918 		}
1919 
1920 		/*
1921 		 * If we caught a signal, we have retried and now
1922 		 * exit immediately.
1923 		 */
1924 		if (error != 0)
1925 			break;
1926 
1927 		umtxq_lock(&uq->uq_key);
1928 		umtxq_insert(uq);
1929 		umtxq_unbusy(&uq->uq_key);
1930 		error = umtxq_sleep(uq, "umtxpp", timo);
1931 		umtxq_remove(uq);
1932 		umtxq_unlock(&uq->uq_key);
1933 
1934 		mtx_lock_spin(&umtx_lock);
1935 		uq->uq_inherited_pri = old_inherited_pri;
1936 		pri = PRI_MAX;
1937 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1938 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1939 			if (uq2 != NULL) {
1940 				if (pri > UPRI(uq2->uq_thread))
1941 					pri = UPRI(uq2->uq_thread);
1942 			}
1943 		}
1944 		if (pri > uq->uq_inherited_pri)
1945 			pri = uq->uq_inherited_pri;
1946 		thread_lock(td);
1947 		sched_unlend_user_prio(td, pri);
1948 		thread_unlock(td);
1949 		mtx_unlock_spin(&umtx_lock);
1950 	}
1951 
1952 	if (error != 0) {
1953 		mtx_lock_spin(&umtx_lock);
1954 		uq->uq_inherited_pri = old_inherited_pri;
1955 		pri = PRI_MAX;
1956 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1957 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1958 			if (uq2 != NULL) {
1959 				if (pri > UPRI(uq2->uq_thread))
1960 					pri = UPRI(uq2->uq_thread);
1961 			}
1962 		}
1963 		if (pri > uq->uq_inherited_pri)
1964 			pri = uq->uq_inherited_pri;
1965 		thread_lock(td);
1966 		sched_unlend_user_prio(td, pri);
1967 		thread_unlock(td);
1968 		mtx_unlock_spin(&umtx_lock);
1969 	}
1970 
1971 out:
1972 	umtxq_lock(&uq->uq_key);
1973 	umtxq_unbusy(&uq->uq_key);
1974 	umtxq_unlock(&uq->uq_key);
1975 	umtx_key_release(&uq->uq_key);
1976 	return (error);
1977 }
1978 
1979 /*
1980  * Unlock a PP mutex.
1981  */
1982 static int
1983 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
1984 {
1985 	struct umtx_key key;
1986 	struct umtx_q *uq, *uq2;
1987 	struct umtx_pi *pi;
1988 	uint32_t owner, id;
1989 	uint32_t rceiling;
1990 	int error, pri, new_inherited_pri, su;
1991 
1992 	id = td->td_tid;
1993 	uq = td->td_umtxq;
1994 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1995 
1996 	/*
1997 	 * Make sure we own this mtx.
1998 	 */
1999 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2000 	if (owner == -1)
2001 		return (EFAULT);
2002 
2003 	if ((owner & ~UMUTEX_CONTESTED) != id)
2004 		return (EPERM);
2005 
2006 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2007 	if (error != 0)
2008 		return (error);
2009 
2010 	if (rceiling == -1)
2011 		new_inherited_pri = PRI_MAX;
2012 	else {
2013 		rceiling = RTP_PRIO_MAX - rceiling;
2014 		if (rceiling > RTP_PRIO_MAX)
2015 			return (EINVAL);
2016 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2017 	}
2018 
2019 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2020 	    &key)) != 0)
2021 		return (error);
2022 	umtxq_lock(&key);
2023 	umtxq_busy(&key);
2024 	umtxq_unlock(&key);
2025 	/*
2026 	 * For a priority-protected mutex, always set the unlocked state
2027 	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2028 	 * to lock the mutex.  This is necessary because the thread
2029 	 * priority has to be adjusted for such a mutex.
2030 	 */
2031 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2032 		UMUTEX_CONTESTED);
2033 
2034 	umtxq_lock(&key);
2035 	if (error == 0)
2036 		umtxq_signal(&key, 1);
2037 	umtxq_unbusy(&key);
2038 	umtxq_unlock(&key);
2039 
2040 	if (error == -1)
2041 		error = EFAULT;
2042 	else {
2043 		mtx_lock_spin(&umtx_lock);
2044 		if (su != 0)
2045 			uq->uq_inherited_pri = new_inherited_pri;
2046 		pri = PRI_MAX;
2047 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2048 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2049 			if (uq2 != NULL) {
2050 				if (pri > UPRI(uq2->uq_thread))
2051 					pri = UPRI(uq2->uq_thread);
2052 			}
2053 		}
2054 		if (pri > uq->uq_inherited_pri)
2055 			pri = uq->uq_inherited_pri;
2056 		thread_lock(td);
2057 		sched_unlend_user_prio(td, pri);
2058 		thread_unlock(td);
2059 		mtx_unlock_spin(&umtx_lock);
2060 	}
2061 	umtx_key_release(&key);
2062 	return (error);
2063 }
2064 
2065 static int
2066 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2067 	uint32_t *old_ceiling)
2068 {
2069 	struct umtx_q *uq;
2070 	uint32_t save_ceiling;
2071 	uint32_t owner, id;
2072 	uint32_t flags;
2073 	int error;
2074 
2075 	flags = fuword32(&m->m_flags);
2076 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2077 		return (EINVAL);
2078 	if (ceiling > RTP_PRIO_MAX)
2079 		return (EINVAL);
2080 	id = td->td_tid;
2081 	uq = td->td_umtxq;
2082 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2083 	   &uq->uq_key)) != 0)
2084 		return (error);
2085 	for (;;) {
2086 		umtxq_lock(&uq->uq_key);
2087 		umtxq_busy(&uq->uq_key);
2088 		umtxq_unlock(&uq->uq_key);
2089 
2090 		save_ceiling = fuword32(&m->m_ceilings[0]);
2091 
2092 		owner = casuword32(&m->m_owner,
2093 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2094 
2095 		if (owner == UMUTEX_CONTESTED) {
2096 			suword32(&m->m_ceilings[0], ceiling);
2097 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2098 				UMUTEX_CONTESTED);
2099 			error = 0;
2100 			break;
2101 		}
2102 
2103 		/* The address was invalid. */
2104 		if (owner == -1) {
2105 			error = EFAULT;
2106 			break;
2107 		}
2108 
2109 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2110 			suword32(&m->m_ceilings[0], ceiling);
2111 			error = 0;
2112 			break;
2113 		}
2114 
2115 		/*
2116 		 * If we caught a signal, we have already retried;
2117 		 * exit immediately now.
2118 		 */
2119 		if (error != 0)
2120 			break;
2121 
2122 		/*
2123 		 * If we set the contested bit, sleep.  Otherwise the lock
2124 		 * state changed and we must retry, or we lost a race to the
2125 		 * thread unlocking the umtx.
2126 		 */
2127 		umtxq_lock(&uq->uq_key);
2128 		umtxq_insert(uq);
2129 		umtxq_unbusy(&uq->uq_key);
2130 		error = umtxq_sleep(uq, "umtxpp", 0);
2131 		umtxq_remove(uq);
2132 		umtxq_unlock(&uq->uq_key);
2133 	}
2134 	umtxq_lock(&uq->uq_key);
2135 	if (error == 0)
2136 		umtxq_signal(&uq->uq_key, INT_MAX);
2137 	umtxq_unbusy(&uq->uq_key);
2138 	umtxq_unlock(&uq->uq_key);
2139 	umtx_key_release(&uq->uq_key);
2140 	if (error == 0 && old_ceiling != NULL)
2141 		suword32(old_ceiling, save_ceiling);
2142 	return (error);
2143 }
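
/*
 * Illustrative userland use of the ceiling protocol (a sketch only; it
 * assumes the _umtx_op(2) entry point declared in <sys/umtx.h> and a
 * umutex 'm' initialized with UMUTEX_PRIO_PROTECT):
 *
 *	uint32_t old_ceiling;
 *
 *	// Raise the ceiling to 10; the previous ceiling is written
 *	// back through uaddr1 on success.
 *	if (_umtx_op(&m, UMTX_OP_SET_CEILING, 10, &old_ceiling, NULL) == -1)
 *		err(1, "UMTX_OP_SET_CEILING");
 */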
2144 
2145 static int
2146 _do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2147 	int try)
2148 {
2149 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2150 	case 0:
2151 		return (_do_lock_normal(td, m, flags, timo, try));
2152 	case UMUTEX_PRIO_INHERIT:
2153 		return (_do_lock_pi(td, m, flags, timo, try));
2154 	case UMUTEX_PRIO_PROTECT:
2155 		return (_do_lock_pp(td, m, flags, timo, try));
2156 	}
2157 	return (EINVAL);
2158 }
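
/*
 * The protocol bits in m_flags select the strategy above.  A sketch of
 * how userland might initialize each flavor (field names are those of
 * struct umutex; UMUTEX_UNOWNED is assumed to be the unlocked owner
 * value):
 *
 *	struct umutex m;
 *
 *	memset(&m, 0, sizeof(m));
 *	m.m_owner = UMUTEX_UNOWNED;
 *	m.m_flags = 0;			  // plain mutex
 *	m.m_flags = UMUTEX_PRIO_INHERIT;  // priority inheritance (PI)
 *	m.m_flags = UMUTEX_PRIO_PROTECT;  // priority protection (PP)
 *	m.m_ceilings[0] = 10;		  // PP only: initial ceiling
 */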
2159 
2160 /*
2161  * Lock a userland POSIX mutex.
2162  */
2163 static int
2164 do_lock_umutex(struct thread *td, struct umutex *m,
2165 	struct timespec *timeout, int try)
2166 {
2167 	struct timespec ts, ts2, ts3;
2168 	struct timeval tv;
2169 	uint32_t flags;
2170 	int error;
2171 
2172 	flags = fuword32(&m->m_flags);
2173 	if (flags == -1)
2174 		return (EFAULT);
2175 
2176 	if (timeout == NULL) {
2177 		error = _do_lock_umutex(td, m, flags, 0, try);
2178 		/* Mutex locking is restarted if it is interrupted. */
2179 		if (error == EINTR)
2180 			error = ERESTART;
2181 	} else {
2182 		getnanouptime(&ts);
2183 		timespecadd(&ts, timeout);
2184 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2185 		for (;;) {
2186 			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
2187 			if (error != ETIMEDOUT)
2188 				break;
2189 			getnanouptime(&ts2);
2190 			if (timespeccmp(&ts2, &ts, >=)) {
2191 				error = ETIMEDOUT;
2192 				break;
2193 			}
2194 			ts3 = ts;
2195 			timespecsub(&ts3, &ts2);
2196 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2197 		}
2198 		/* Timed-locking is not restarted. */
2199 		if (error == ERESTART)
2200 			error = EINTR;
2201 	}
2202 	return (error);
2203 }
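
/*
 * Worked example of the retry arithmetic above: for a 500ms timeout,
 * ts is set once to the absolute deadline "now + 500ms".  tvtohz()
 * converts each sleep to scheduler ticks, so a sleep may expire before
 * the absolute deadline; in that case the current time ts2 is checked
 * against the deadline and, while ts2 < ts, the remainder
 * ts3 = ts - ts2 is converted back to a timeval and the lock attempt
 * repeats, so the total wait converges on the requested timeout
 * instead of accumulating per-iteration rounding error.
 */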
2204 
2205 /*
2206  * Unlock a userland POSIX mutex.
2207  */
2208 static int
2209 do_unlock_umutex(struct thread *td, struct umutex *m)
2210 {
2211 	uint32_t flags;
2212 
2213 	flags = fuword32(&m->m_flags);
2214 	if (flags == -1)
2215 		return (EFAULT);
2216 
2217 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2218 	case 0:
2219 		return (do_unlock_normal(td, m, flags));
2220 	case UMUTEX_PRIO_INHERIT:
2221 		return (do_unlock_pi(td, m, flags));
2222 	case UMUTEX_PRIO_PROTECT:
2223 		return (do_unlock_pp(td, m, flags));
2224 	}
2225 
2226 	return (EINVAL);
2227 }
2228 
2229 static int
2230 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2231 	struct timespec *timeout, u_long wflags)
2232 {
2233 	struct umtx_q *uq;
2234 	struct timeval tv;
2235 	struct timespec cts, ets, tts;
2236 	uint32_t flags;
2237 	int error;
2238 
2239 	uq = td->td_umtxq;
2240 	flags = fuword32(&cv->c_flags);
2241 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2242 	if (error != 0)
2243 		return (error);
2244 	umtxq_lock(&uq->uq_key);
2245 	umtxq_busy(&uq->uq_key);
2246 	umtxq_insert(uq);
2247 	umtxq_unlock(&uq->uq_key);
2248 
2249 	/*
2250 	 * The ordering is critical: c_has_waiters must be set to 1 before
2251 	 * the user mutex is released, or a waking thread could miss us.
2252 	 */
2253 	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2254 
2255 	umtxq_lock(&uq->uq_key);
2256 	umtxq_unbusy(&uq->uq_key);
2257 	umtxq_unlock(&uq->uq_key);
2258 
2259 	error = do_unlock_umutex(td, m);
2260 
2261 	umtxq_lock(&uq->uq_key);
2262 	if (error == 0) {
2263 		if ((wflags & UMTX_CHECK_UNPARKING) &&
2264 		    (td->td_pflags & TDP_WAKEUP)) {
2265 			td->td_pflags &= ~TDP_WAKEUP;
2266 			error = EINTR;
2267 		} else if (timeout == NULL) {
2268 			error = umtxq_sleep(uq, "ucond", 0);
2269 		} else {
2270 			getnanouptime(&ets);
2271 			timespecadd(&ets, timeout);
2272 			TIMESPEC_TO_TIMEVAL(&tv, timeout);
2273 			for (;;) {
2274 				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2275 				if (error != ETIMEDOUT)
2276 					break;
2277 				getnanouptime(&cts);
2278 				if (timespeccmp(&cts, &ets, >=)) {
2279 					error = ETIMEDOUT;
2280 					break;
2281 				}
2282 				tts = ets;
2283 				timespecsub(&tts, &cts);
2284 				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2285 			}
2286 		}
2287 	}
2288 
2289 	if (error != 0) {
2290 		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2291 			/*
2292 			 * If we were concurrently woken by do_cv_signal()
2293 			 * but are returning due to an error, a UNIX signal
2294 			 * or a timeout, perform another umtxq_signal() to
2295 			 * avoid consuming the wakeup.  This may cause a
2296 			 * spurious wakeup of another thread which was just
2297 			 * queued, but SUSv3 explicitly allows spurious
2298 			 * wakeups to occur, and indeed a kernel-based
2299 			 * implementation cannot avoid them.
2300 			 */
2301 			if (!umtxq_signal(&uq->uq_key, 1))
2302 				error = 0;
2303 		}
2304 		if (error == ERESTART)
2305 			error = EINTR;
2306 	}
2307 	umtxq_remove(uq);
2308 	umtxq_unlock(&uq->uq_key);
2309 	umtx_key_release(&uq->uq_key);
2310 	return (error);
2311 }
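
/*
 * Illustrative userland wait protocol served by do_cv_wait() (a sketch;
 * the threading library is the real consumer and its wrappers may
 * differ).  The caller holds the umutex 'm' while testing its
 * predicate:
 *
 *	while (!predicate) {
 *		// The kernel sets c_has_waiters, unlocks 'm' and sleeps;
 *		// uaddr1 carries the mutex, val carries the wflags.
 *		_umtx_op(&cv, UMTX_OP_CV_WAIT, 0, &m, NULL);
 *		// The kernel does not re-lock 'm'; take it again before
 *		// rechecking.  Spurious wakeups are allowed, hence the loop.
 *		_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);
 *	}
 */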
2312 
2313 /*
2314  * Signal a userland condition variable.
2315  */
2316 static int
2317 do_cv_signal(struct thread *td, struct ucond *cv)
2318 {
2319 	struct umtx_key key;
2320 	int error, cnt, nwake;
2321 	uint32_t flags;
2322 
2323 	flags = fuword32(&cv->c_flags);
2324 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2325 		return (error);
2326 	umtxq_lock(&key);
2327 	umtxq_busy(&key);
2328 	cnt = umtxq_count(&key);
2329 	nwake = umtxq_signal(&key, 1);
2330 	if (cnt <= nwake) {
2331 		umtxq_unlock(&key);
2332 		error = suword32(
2333 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2334 		umtxq_lock(&key);
2335 	}
2336 	umtxq_unbusy(&key);
2337 	umtxq_unlock(&key);
2338 	umtx_key_release(&key);
2339 	return (error);
2340 }
2341 
2342 static int
2343 do_cv_broadcast(struct thread *td, struct ucond *cv)
2344 {
2345 	struct umtx_key key;
2346 	int error;
2347 	uint32_t flags;
2348 
2349 	flags = fuword32(&cv->c_flags);
2350 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2351 		return (error);
2352 
2353 	umtxq_lock(&key);
2354 	umtxq_busy(&key);
2355 	umtxq_signal(&key, INT_MAX);
2356 	umtxq_unlock(&key);
2357 
2358 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2359 
2360 	umtxq_lock(&key);
2361 	umtxq_unbusy(&key);
2362 	umtxq_unlock(&key);
2363 
2364 	umtx_key_release(&key);
2365 	return (error);
2366 }
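
/*
 * The two wakeup paths differ only in count: do_cv_signal() wakes at
 * most one waiter and clears c_has_waiters only when that emptied the
 * queue, while do_cv_broadcast() wakes all waiters (INT_MAX) and
 * clears the flag unconditionally.  Illustrative calls, assuming the
 * _umtx_op(2) entry point:
 *
 *	_umtx_op(&cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL);
 *	_umtx_op(&cv, UMTX_OP_CV_BROADCAST, 0, NULL, NULL);
 */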
2367 
2368 static int
2369 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2370 {
2371 	struct umtx_q *uq;
2372 	uint32_t flags, wrflags;
2373 	int32_t state, oldstate;
2374 	int32_t blocked_readers;
2375 	int error;
2376 
2377 	uq = td->td_umtxq;
2378 	flags = fuword32(&rwlock->rw_flags);
2379 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2380 	if (error != 0)
2381 		return (error);
2382 
2383 	wrflags = URWLOCK_WRITE_OWNER;
2384 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2385 		wrflags |= URWLOCK_WRITE_WAITERS;
2386 
2387 	for (;;) {
2388 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2389 		/* try to lock it */
2390 		while (!(state & wrflags)) {
2391 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2392 				umtx_key_release(&uq->uq_key);
2393 				return (EAGAIN);
2394 			}
2395 			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2396 			if (oldstate == state) {
2397 				umtx_key_release(&uq->uq_key);
2398 				return (0);
2399 			}
2400 			state = oldstate;
2401 		}
2402 
2403 		if (error)
2404 			break;
2405 
2406 		/* grab monitor lock */
2407 		umtxq_lock(&uq->uq_key);
2408 		umtxq_busy(&uq->uq_key);
2409 		umtxq_unlock(&uq->uq_key);
2410 
2411 		/* set read contention bit */
2412 		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2413 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2414 			if (oldstate == state)
2415 				goto sleep;
2416 			state = oldstate;
2417 		}
2418 
2419 		/* The state changed while we were setting the flags; restart. */
2420 		if (!(state & wrflags)) {
2421 			umtxq_lock(&uq->uq_key);
2422 			umtxq_unbusy(&uq->uq_key);
2423 			umtxq_unlock(&uq->uq_key);
2424 			continue;
2425 		}
2426 
2427 sleep:
2428 		/* The contention bit is set; increase the read-waiter count before sleeping. */
2429 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2430 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2431 
2432 		while (state & wrflags) {
2433 			umtxq_lock(&uq->uq_key);
2434 			umtxq_insert(uq);
2435 			umtxq_unbusy(&uq->uq_key);
2436 
2437 			error = umtxq_sleep(uq, "urdlck", timo);
2438 
2439 			umtxq_busy(&uq->uq_key);
2440 			umtxq_remove(uq);
2441 			umtxq_unlock(&uq->uq_key);
2442 			if (error)
2443 				break;
2444 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2445 		}
2446 
2447 		/* Decrease the read-waiter count and possibly clear the read contention bit. */
2448 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2449 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2450 		if (blocked_readers == 1) {
2451 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2452 			for (;;) {
2453 				oldstate = casuword32(&rwlock->rw_state, state,
2454 					 state & ~URWLOCK_READ_WAITERS);
2455 				if (oldstate == state)
2456 					break;
2457 				state = oldstate;
2458 			}
2459 		}
2460 
2461 		umtxq_lock(&uq->uq_key);
2462 		umtxq_unbusy(&uq->uq_key);
2463 		umtxq_unlock(&uq->uq_key);
2464 	}
2465 	umtx_key_release(&uq->uq_key);
2466 	return (error);
2467 }
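
/*
 * As the arithmetic above implies, rw_state packs the whole lock state
 * into a single word: the active-reader count lives in the low-order
 * bits (hence the bare "state + 1" and "state - 1" updates around
 * URWLOCK_READER_COUNT()), with URWLOCK_WRITE_OWNER,
 * URWLOCK_READ_WAITERS and URWLOCK_WRITE_WAITERS as flag bits above
 * the count.  An uncontended reader therefore needs only a single
 * casuword32() and never takes the queue path.
 */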
2468 
2469 static int
2470 do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2471 {
2472 	struct timespec ts, ts2, ts3;
2473 	struct timeval tv;
2474 	int error;
2475 
2476 	getnanouptime(&ts);
2477 	timespecadd(&ts, timeout);
2478 	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2479 	for (;;) {
2480 		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2481 		if (error != ETIMEDOUT)
2482 			break;
2483 		getnanouptime(&ts2);
2484 		if (timespeccmp(&ts2, &ts, >=)) {
2485 			error = ETIMEDOUT;
2486 			break;
2487 		}
2488 		ts3 = ts;
2489 		timespecsub(&ts3, &ts2);
2490 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2491 	}
2492 	if (error == ERESTART)
2493 		error = EINTR;
2494 	return (error);
2495 }
2496 
2497 static int
2498 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2499 {
2500 	struct umtx_q *uq;
2501 	uint32_t flags;
2502 	int32_t state, oldstate;
2503 	int32_t blocked_writers;
2504 	int error;
2505 
2506 	uq = td->td_umtxq;
2507 	flags = fuword32(&rwlock->rw_flags);
2508 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2509 	if (error != 0)
2510 		return (error);
2511 
2512 	for (;;) {
2513 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2514 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2515 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2516 			if (oldstate == state) {
2517 				umtx_key_release(&uq->uq_key);
2518 				return (0);
2519 			}
2520 			state = oldstate;
2521 		}
2522 
2523 		if (error)
2524 			break;
2525 
2526 		/* grab monitor lock */
2527 		umtxq_lock(&uq->uq_key);
2528 		umtxq_busy(&uq->uq_key);
2529 		umtxq_unlock(&uq->uq_key);
2530 
2531 		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2532 		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2533 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2534 			if (oldstate == state)
2535 				goto sleep;
2536 			state = oldstate;
2537 		}
2538 
2539 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2540 			umtxq_lock(&uq->uq_key);
2541 			umtxq_unbusy(&uq->uq_key);
2542 			umtxq_unlock(&uq->uq_key);
2543 			continue;
2544 		}
2545 sleep:
2546 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2547 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2548 
2549 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2550 			umtxq_lock(&uq->uq_key);
2551 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2552 			umtxq_unbusy(&uq->uq_key);
2553 
2554 			error = umtxq_sleep(uq, "uwrlck", timo);
2555 
2556 			umtxq_busy(&uq->uq_key);
2557 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2558 			umtxq_unlock(&uq->uq_key);
2559 			if (error)
2560 				break;
2561 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2562 		}
2563 
2564 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2565 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2566 		if (blocked_writers == 1) {
2567 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2568 			for (;;) {
2569 				oldstate = casuword32(&rwlock->rw_state, state,
2570 					 state & ~URWLOCK_WRITE_WAITERS);
2571 				if (oldstate == state)
2572 					break;
2573 				state = oldstate;
2574 			}
2575 		}
2576 
2577 		umtxq_lock(&uq->uq_key);
2578 		umtxq_unbusy(&uq->uq_key);
2579 		umtxq_unlock(&uq->uq_key);
2580 	}
2581 
2582 	umtx_key_release(&uq->uq_key);
2583 	return (error);
2584 }
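
/*
 * Readers and writers sleep on different queues under the same key:
 * writers use UMTX_EXCLUSIVE_QUEUE (umtxq_insert_queue() above), while
 * readers in do_rw_rdlock() use the default shared queue.  This lets
 * do_rw_unlock() wake exactly one writer or the whole reader side
 * without the two classes consuming each other's wakeups.
 */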
2585 
2586 static int
2587 do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2588 {
2589 	struct timespec ts, ts2, ts3;
2590 	struct timeval tv;
2591 	int error;
2592 
2593 	getnanouptime(&ts);
2594 	timespecadd(&ts, timeout);
2595 	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2596 	for (;;) {
2597 		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2598 		if (error != ETIMEDOUT)
2599 			break;
2600 		getnanouptime(&ts2);
2601 		if (timespeccmp(&ts2, &ts, >=)) {
2602 			error = ETIMEDOUT;
2603 			break;
2604 		}
2605 		ts3 = ts;
2606 		timespecsub(&ts3, &ts2);
2607 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2608 	}
2609 	if (error == ERESTART)
2610 		error = EINTR;
2611 	return (error);
2612 }
2613 
2614 static int
2615 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2616 {
2617 	struct umtx_q *uq;
2618 	uint32_t flags;
2619 	int32_t state, oldstate;
2620 	int error, q, count;
2621 
2622 	uq = td->td_umtxq;
2623 	flags = fuword32(&rwlock->rw_flags);
2624 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2625 	if (error != 0)
2626 		return (error);
2627 
2628 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2629 	if (state & URWLOCK_WRITE_OWNER) {
2630 		for (;;) {
2631 			oldstate = casuword32(&rwlock->rw_state, state,
2632 				state & ~URWLOCK_WRITE_OWNER);
2633 			if (oldstate != state) {
2634 				state = oldstate;
2635 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2636 					error = EPERM;
2637 					goto out;
2638 				}
2639 			} else
2640 				break;
2641 		}
2642 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2643 		for (;;) {
2644 			oldstate = casuword32(&rwlock->rw_state, state,
2645 				state - 1);
2646 			if (oldstate != state) {
2647 				state = oldstate;
2648 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2649 					error = EPERM;
2650 					goto out;
2651 				}
2652 			}
2653 			else
2654 				break;
2655 		}
2656 	} else {
2657 		error = EPERM;
2658 		goto out;
2659 	}
2660 
2661 	count = 0;
2662 
2663 	if (!(flags & URWLOCK_PREFER_READER)) {
2664 		if (state & URWLOCK_WRITE_WAITERS) {
2665 			count = 1;
2666 			q = UMTX_EXCLUSIVE_QUEUE;
2667 		} else if (state & URWLOCK_READ_WAITERS) {
2668 			count = INT_MAX;
2669 			q = UMTX_SHARED_QUEUE;
2670 		}
2671 	} else {
2672 		if (state & URWLOCK_READ_WAITERS) {
2673 			count = INT_MAX;
2674 			q = UMTX_SHARED_QUEUE;
2675 		} else if (state & URWLOCK_WRITE_WAITERS) {
2676 			count = 1;
2677 			q = UMTX_EXCLUSIVE_QUEUE;
2678 		}
2679 	}
2680 
2681 	if (count) {
2682 		umtxq_lock(&uq->uq_key);
2683 		umtxq_busy(&uq->uq_key);
2684 		umtxq_signal_queue(&uq->uq_key, count, q);
2685 		umtxq_unbusy(&uq->uq_key);
2686 		umtxq_unlock(&uq->uq_key);
2687 	}
2688 out:
2689 	umtx_key_release(&uq->uq_key);
2690 	return (error);
2691 }
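
/*
 * The wakeup preference above mirrors the lock paths: by default a
 * pending writer is woken first (one thread from UMTX_EXCLUSIVE_QUEUE),
 * and only if none is waiting are all readers released; with
 * URWLOCK_PREFER_READER set in rw_flags the order is reversed.  Waking
 * INT_MAX readers but a single writer matches what each class can
 * actually admit once the lock is free.
 */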
2692 
2693 int
2694 _umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2695     /* struct umtx *umtx */
2696 {
2697 	return (_do_lock_umtx(td, uap->umtx, td->td_tid, 0));
2698 }
2699 
2700 int
2701 _umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2702     /* struct umtx *umtx */
2703 {
2704 	return (do_unlock_umtx(td, uap->umtx, td->td_tid));
2705 }
2706 
2707 static int
2708 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2709 {
2710 	struct timespec *ts, timeout;
2711 	int error;
2712 
2713 	/* Allow a null timespec (wait forever). */
2714 	if (uap->uaddr2 == NULL)
2715 		ts = NULL;
2716 	else {
2717 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2718 		if (error != 0)
2719 			return (error);
2720 		if (timeout.tv_nsec >= 1000000000 ||
2721 		    timeout.tv_nsec < 0) {
2722 			return (EINVAL);
2723 		}
2724 		ts = &timeout;
2725 	}
2726 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2727 }
2728 
2729 static int
2730 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2731 {
2732 	return (do_unlock_umtx(td, uap->obj, uap->val));
2733 }
2734 
2735 static int
2736 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2737 {
2738 	struct timespec *ts, timeout;
2739 	int error;
2740 
2741 	if (uap->uaddr2 == NULL)
2742 		ts = NULL;
2743 	else {
2744 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2745 		if (error != 0)
2746 			return (error);
2747 		if (timeout.tv_nsec >= 1000000000 ||
2748 		    timeout.tv_nsec < 0)
2749 			return (EINVAL);
2750 		ts = &timeout;
2751 	}
2752 	return (do_wait(td, uap->obj, uap->val, ts, 0, 0));
2753 }
2754 
2755 static int
2756 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2757 {
2758 	struct timespec *ts, timeout;
2759 	int error;
2760 
2761 	if (uap->uaddr2 == NULL)
2762 		ts = NULL;
2763 	else {
2764 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2765 		if (error != 0)
2766 			return (error);
2767 		if (timeout.tv_nsec >= 1000000000 ||
2768 		    timeout.tv_nsec < 0)
2769 			return (EINVAL);
2770 		ts = &timeout;
2771 	}
2772 	return (do_wait(td, uap->obj, uap->val, ts, 1, 0));
2773 }
2774 
2775 static int
2776 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
2777 {
2778 	struct timespec *ts, timeout;
2779 	int error;
2780 
2781 	if (uap->uaddr2 == NULL)
2782 		ts = NULL;
2783 	else {
2784 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2785 		if (error != 0)
2786 			return (error);
2787 		if (timeout.tv_nsec >= 1000000000 ||
2788 		    timeout.tv_nsec < 0)
2789 			return (EINVAL);
2790 		ts = &timeout;
2791 	}
2792 	return (do_wait(td, uap->obj, uap->val, ts, 1, 1));
2793 }
2794 
2795 static int
2796 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2797 {
2798 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
2799 }
2800 
2801 static int
2802 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
2803 {
2804 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
2805 }
2806 
2807 static int
2808 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2809 {
2810 	struct timespec *ts, timeout;
2811 	int error;
2812 
2813 	/* Allow a null timespec (wait forever). */
2814 	if (uap->uaddr2 == NULL)
2815 		ts = NULL;
2816 	else {
2817 		error = copyin(uap->uaddr2, &timeout,
2818 		    sizeof(timeout));
2819 		if (error != 0)
2820 			return (error);
2821 		if (timeout.tv_nsec >= 1000000000 ||
2822 		    timeout.tv_nsec < 0) {
2823 			return (EINVAL);
2824 		}
2825 		ts = &timeout;
2826 	}
2827 	return (do_lock_umutex(td, uap->obj, ts, 0));
2828 }
2829 
2830 static int
2831 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2832 {
2833 	return (do_lock_umutex(td, uap->obj, NULL, 1));
2834 }
2835 
2836 static int
2837 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2838 {
2839 	return (do_unlock_umutex(td, uap->obj));
2840 }
2841 
2842 static int
2843 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2844 {
2845 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
2846 }
2847 
2848 static int
2849 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
2850 {
2851 	struct timespec *ts, timeout;
2852 	int error;
2853 
2854 	/* Allow a null timespec (wait forever). */
2855 	if (uap->uaddr2 == NULL)
2856 		ts = NULL;
2857 	else {
2858 		error = copyin(uap->uaddr2, &timeout,
2859 		    sizeof(timeout));
2860 		if (error != 0)
2861 			return (error);
2862 		if (timeout.tv_nsec >= 1000000000 ||
2863 		    timeout.tv_nsec < 0) {
2864 			return (EINVAL);
2865 		}
2866 		ts = &timeout;
2867 	}
2868 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
2869 }
2870 
2871 static int
2872 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
2873 {
2874 	return (do_cv_signal(td, uap->obj));
2875 }
2876 
2877 static int
2878 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
2879 {
2880 	return (do_cv_broadcast(td, uap->obj));
2881 }
2882 
2883 static int
2884 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
2885 {
2886 	struct timespec timeout;
2887 	int error;
2888 
2889 	/* Allow a null timespec (wait forever). */
2890 	if (uap->uaddr2 == NULL) {
2891 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
2892 	} else {
2893 		error = copyin(uap->uaddr2, &timeout,
2894 		    sizeof(timeout));
2895 		if (error != 0)
2896 			return (error);
2897 		if (timeout.tv_nsec >= 1000000000 ||
2898 		    timeout.tv_nsec < 0) {
2899 			return (EINVAL);
2900 		}
2901 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
2902 	}
2903 	return (error);
2904 }
2905 
2906 static int
2907 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
2908 {
2909 	struct timespec timeout;
2910 	int error;
2911 
2912 	/* Allow a null timespec (wait forever). */
2913 	if (uap->uaddr2 == NULL) {
2914 		error = do_rw_wrlock(td, uap->obj, 0);
2915 	} else {
2916 		error = copyin(uap->uaddr2, &timeout,
2917 		    sizeof(timeout));
2918 		if (error != 0)
2919 			return (error);
2920 		if (timeout.tv_nsec >= 1000000000 ||
2921 		    timeout.tv_nsec < 0) {
2922 			return (EINVAL);
2923 		}
2924 
2925 		error = do_rw_wrlock2(td, uap->obj, &timeout);
2926 	}
2927 	return (error);
2928 }
2929 
2930 static int
2931 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
2932 {
2933 	return (do_rw_unlock(td, uap->obj));
2934 }
2935 
2936 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
2937 
2938 static _umtx_op_func op_table[] = {
2939 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
2940 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
2941 	__umtx_op_wait,			/* UMTX_OP_WAIT */
2942 	__umtx_op_wake,			/* UMTX_OP_WAKE */
2943 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2944 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
2945 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
2946 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
2947 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
2948 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
2949 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
2950 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
2951 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
2952 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
2953 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
2954 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
2955 	__umtx_op_wake_private		/* UMTX_OP_WAKE_PRIVATE */
2956 };
2957 
2958 int
2959 _umtx_op(struct thread *td, struct _umtx_op_args *uap)
2960 {
2961 	if ((unsigned)uap->op < UMTX_OP_MAX)
2962 		return (*op_table[uap->op])(td, uap);
2963 	return (EINVAL);
2964 }
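
/*
 * Illustrative userland entry (a sketch; it assumes the _umtx_op(2)
 * prototype declared in <sys/umtx.h>):
 *
 *	u_int word = 0;
 *	struct timespec ts = { 0, 100000000 };	// 100ms
 *
 *	// Sleep while 'word' still holds the expected value 0; the
 *	// relative timeout is passed through uaddr2.
 *	_umtx_op(&word, UMTX_OP_WAIT_UINT, 0, NULL, &ts);
 *
 *	// Wake at most one thread sleeping on 'word'.
 *	_umtx_op(&word, UMTX_OP_WAKE, 1, NULL, NULL);
 */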
2965 
2966 #ifdef COMPAT_IA32
2967 int
2968 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
2969     /* struct umtx *umtx */
2970 {
2971 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
2972 }
2973 
2974 int
2975 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
2976     /* struct umtx *umtx */
2977 {
2978 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
2979 }
2980 
2981 struct timespec32 {
2982 	u_int32_t tv_sec;
2983 	u_int32_t tv_nsec;
2984 };
2985 
2986 static inline int
2987 copyin_timeout32(void *addr, struct timespec *tsp)
2988 {
2989 	struct timespec32 ts32;
2990 	int error;
2991 
2992 	error = copyin(addr, &ts32, sizeof(struct timespec32));
2993 	if (error == 0) {
2994 		tsp->tv_sec = ts32.tv_sec;
2995 		tsp->tv_nsec = ts32.tv_nsec;
2996 	}
2997 	return (error);
2998 }
2999 
3000 static int
3001 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3002 {
3003 	struct timespec *ts, timeout;
3004 	int error;
3005 
3006 	/* Allow a null timespec (wait forever). */
3007 	if (uap->uaddr2 == NULL)
3008 		ts = NULL;
3009 	else {
3010 		error = copyin_timeout32(uap->uaddr2, &timeout);
3011 		if (error != 0)
3012 			return (error);
3013 		if (timeout.tv_nsec >= 1000000000 ||
3014 		    timeout.tv_nsec < 0) {
3015 			return (EINVAL);
3016 		}
3017 		ts = &timeout;
3018 	}
3019 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3020 }
3021 
3022 static int
3023 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3024 {
3025 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3026 }
3027 
3028 static int
3029 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3030 {
3031 	struct timespec *ts, timeout;
3032 	int error;
3033 
3034 	if (uap->uaddr2 == NULL)
3035 		ts = NULL;
3036 	else {
3037 		error = copyin_timeout32(uap->uaddr2, &timeout);
3038 		if (error != 0)
3039 			return (error);
3040 		if (timeout.tv_nsec >= 1000000000 ||
3041 		    timeout.tv_nsec < 0)
3042 			return (EINVAL);
3043 		ts = &timeout;
3044 	}
3045 	return (do_wait(td, uap->obj, uap->val, ts, 1, 0));
3046 }
3047 
3048 static int
3049 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3050 {
3051 	struct timespec *ts, timeout;
3052 	int error;
3053 
3054 	/* Allow a null timespec (wait forever). */
3055 	if (uap->uaddr2 == NULL)
3056 		ts = NULL;
3057 	else {
3058 		error = copyin_timeout32(uap->uaddr2, &timeout);
3059 		if (error != 0)
3060 			return (error);
3061 		if (timeout.tv_nsec >= 1000000000 ||
3062 		    timeout.tv_nsec < 0)
3063 			return (EINVAL);
3064 		ts = &timeout;
3065 	}
3066 	return (do_lock_umutex(td, uap->obj, ts, 0));
3067 }
3068 
3069 static int
3070 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3071 {
3072 	struct timespec *ts, timeout;
3073 	int error;
3074 
3075 	/* Allow a null timespec (wait forever). */
3076 	if (uap->uaddr2 == NULL)
3077 		ts = NULL;
3078 	else {
3079 		error = copyin_timeout32(uap->uaddr2, &timeout);
3080 		if (error != 0)
3081 			return (error);
3082 		if (timeout.tv_nsec >= 1000000000 ||
3083 		    timeout.tv_nsec < 0)
3084 			return (EINVAL);
3085 		ts = &timeout;
3086 	}
3087 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3088 }
3089 
3090 static int
3091 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3092 {
3093 	struct timespec timeout;
3094 	int error;
3095 
3096 	/* Allow a null timespec (wait forever). */
3097 	if (uap->uaddr2 == NULL) {
3098 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3099 	} else {
3100 		error = copyin_timeout32(uap->uaddr2, &timeout);
3102 		if (error != 0)
3103 			return (error);
3104 		if (timeout.tv_nsec >= 1000000000 ||
3105 		    timeout.tv_nsec < 0) {
3106 			return (EINVAL);
3107 		}
3108 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3109 	}
3110 	return (error);
3111 }
3112 
3113 static int
3114 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3115 {
3116 	struct timespec timeout;
3117 	int error;
3118 
3119 	/* Allow a null timespec (wait forever). */
3120 	if (uap->uaddr2 == NULL) {
3121 		error = do_rw_wrlock(td, uap->obj, 0);
3122 	} else {
3123 		error = copyin_timeout32(uap->uaddr2, &timeout);
3124 		if (error != 0)
3125 			return (error);
3126 		if (timeout.tv_nsec >= 1000000000 ||
3127 		    timeout.tv_nsec < 0) {
3128 			return (EINVAL);
3129 		}
3130 
3131 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3132 	}
3133 	return (error);
3134 }
3135 
3136 static int
3137 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3138 {
3139 	struct timespec *ts, timeout;
3140 	int error;
3141 
3142 	if (uap->uaddr2 == NULL)
3143 		ts = NULL;
3144 	else {
3145 		error = copyin_timeout32(uap->uaddr2, &timeout);
3146 		if (error != 0)
3147 			return (error);
3148 		if (timeout.tv_nsec >= 1000000000 ||
3149 		    timeout.tv_nsec < 0)
3150 			return (EINVAL);
3151 		ts = &timeout;
3152 	}
3153 	return (do_wait(td, uap->obj, uap->val, ts, 1, 1));
3154 }
3155 
3156 static _umtx_op_func op_table_compat32[] = {
3157 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3158 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3159 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3160 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3161 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3162 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3163 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3164 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3165 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3166 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3167 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3168 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3169 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3170 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3171 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3172 	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3173 	__umtx_op_wake_private		/* UMTX_OP_WAKE_PRIVATE */
3174 };
3175 
3176 int
3177 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3178 {
3179 	if ((unsigned)uap->op < UMTX_OP_MAX)
3180 		return (*op_table_compat32[uap->op])(td,
3181 			(struct _umtx_op_args *)uap);
3182 	return (EINVAL);
3183 }
3184 #endif
3185 
3186 void
3187 umtx_thread_init(struct thread *td)
3188 {
3189 	td->td_umtxq = umtxq_alloc();
3190 	td->td_umtxq->uq_thread = td;
3191 }
3192 
3193 void
3194 umtx_thread_fini(struct thread *td)
3195 {
3196 	umtxq_free(td->td_umtxq);
3197 }
3198 
3199 /*
3200  * Called when a new thread is created, e.g. by fork().
3201  */
3202 void
3203 umtx_thread_alloc(struct thread *td)
3204 {
3205 	struct umtx_q *uq;
3206 
3207 	uq = td->td_umtxq;
3208 	uq->uq_inherited_pri = PRI_MAX;
3209 
3210 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3211 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3212 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3213 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3214 }
3215 
3216 /*
3217  * exec() hook.
3218  */
3219 static void
3220 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3221 	struct image_params *imgp __unused)
3222 {
3223 	umtx_thread_cleanup(curthread);
3224 }
3225 
3226 /*
3227  * thread_exit() hook.
3228  */
3229 void
3230 umtx_thread_exit(struct thread *td)
3231 {
3232 	umtx_thread_cleanup(td);
3233 }
3234 
3235 /*
3236  * Clean up the umtx data attached to a thread.
3237  */
3238 static void
3239 umtx_thread_cleanup(struct thread *td)
3240 {
3241 	struct umtx_q *uq;
3242 	struct umtx_pi *pi;
3243 
3244 	if ((uq = td->td_umtxq) == NULL)
3245 		return;
3246 
3247 	mtx_lock_spin(&umtx_lock);
3248 	uq->uq_inherited_pri = PRI_MAX;
3249 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3250 		pi->pi_owner = NULL;
3251 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3252 	}
3253 	thread_lock(td);
3254 	td->td_flags &= ~TDF_UBORROWING;
3255 	thread_unlock(td);
3256 	mtx_unlock_spin(&umtx_lock);
3257 }
3258