xref: /freebsd/sys/kern/kern_umtx.c (revision acd3428b7d3e94cef0e1881c868cb4b131d4ff41)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/mutex.h>
38 #include <sys/priv.h>
39 #include <sys/proc.h>
40 #include <sys/sched.h>
41 #include <sys/sysctl.h>
42 #include <sys/sysent.h>
43 #include <sys/systm.h>
44 #include <sys/sysproto.h>
45 #include <sys/eventhandler.h>
46 #include <sys/umtx.h>
47 
48 #include <vm/vm.h>
49 #include <vm/vm_param.h>
50 #include <vm/pmap.h>
51 #include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>
53 
54 #ifdef COMPAT_IA32
55 #include <compat/freebsd32/freebsd32_proto.h>
56 #endif
57 
58 #define TYPE_SIMPLE_LOCK	0
59 #define TYPE_SIMPLE_WAIT	1
60 #define TYPE_NORMAL_UMUTEX	2
61 #define TYPE_PI_UMUTEX		3
62 #define TYPE_PP_UMUTEX		4
63 #define TYPE_CV			5
64 
/* Key to uniquely identify a userland synchronization object */
66 struct umtx_key {
67 	int	hash;
68 	int	type;
69 	int	shared;
70 	union {
71 		struct {
72 			vm_object_t	object;
73 			uintptr_t	offset;
74 		} shared;
75 		struct {
76 			struct vmspace	*vs;
77 			uintptr_t	addr;
78 		} private;
79 		struct {
80 			void		*a;
81 			uintptr_t	b;
82 		} both;
83 	} info;
84 };
85 
86 /* Priority inheritance mutex info. */
87 struct umtx_pi {
88 	/* Owner thread */
89 	struct thread		*pi_owner;
90 
91 	/* Reference count */
92 	int			pi_refcount;
93 
 	/* List entry to link this PI mutex onto the holding thread's list */
95 	TAILQ_ENTRY(umtx_pi)	pi_link;
96 
97 	/* List entry in hash */
98 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
99 
100 	/* List for waiters */
101 	TAILQ_HEAD(,umtx_q)	pi_blocked;
102 
103 	/* Identify a userland lock object */
104 	struct umtx_key		pi_key;
105 };
106 
/* A waiter on a userland synchronization object. */
108 struct umtx_q {
109 	/* Linked list for the hash. */
110 	TAILQ_ENTRY(umtx_q)	uq_link;
111 
112 	/* Umtx key. */
113 	struct umtx_key		uq_key;
114 
115 	/* Umtx flags. */
116 	int			uq_flags;
117 #define UQF_UMTXQ	0x0001
118 
	/* The thread that is waiting on this entry. */
120 	struct thread		*uq_thread;
121 
	/*
	 * Blocked on a PI mutex.  Reads may hold either the chain lock
	 * or sched_lock; writes must hold both the chain lock and
	 * sched_lock.
	 */
127 	struct umtx_pi		*uq_pi_blocked;
128 
129 	/* On blocked list */
130 	TAILQ_ENTRY(umtx_q)	uq_lockq;
131 
132 	/* Thread contending with us */
133 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
134 
135 	/* Inherited priority from PP mutex */
136 	u_char			uq_inherited_pri;
137 };
138 
139 TAILQ_HEAD(umtxq_head, umtx_q);
140 
141 /* Userland lock object's wait-queue chain */
142 struct umtxq_chain {
143 	/* Lock for this chain. */
144 	struct mtx		uc_lock;
145 
146 	/* List of sleep queues. */
147 	struct umtxq_head	uc_queue;
148 
149 	/* Busy flag */
150 	char			uc_busy;
151 
152 	/* Chain lock waiters */
153 	int			uc_waiters;
154 
155 	/* All PI in the list */
156 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
157 };
158 
159 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
160 
/*
 * Don't propagate time-sharing priority.  There is a security reason:
 * a user could simply create a PI-mutex, let thread A lock it, and let
 * another thread B block on it.  Because B is sleeping, its priority
 * would be boosted, which would boost A's priority via priority
 * propagation as well, and A's priority would then never be lowered
 * even if it used 100% CPU; this would be unfair to other processes.
 */
169 
170 #ifdef KSE
171 #define UPRI(td)	(((td)->td_ksegrp->kg_user_pri >= PRI_MIN_TIMESHARE &&\
172 			  (td)->td_ksegrp->kg_user_pri <= PRI_MAX_TIMESHARE) ?\
173 			 PRI_MAX_TIMESHARE : (td)->td_ksegrp->kg_user_pri)
174 #else
175 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
176 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
177 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
178 #endif
179 
180 #define	GOLDEN_RATIO_PRIME	2654404609U
181 #define	UMTX_CHAINS		128
182 #define	UMTX_SHIFTS		(__WORD_BIT - 7)
183 
184 #define THREAD_SHARE		0
185 #define PROCESS_SHARE		1
186 #define AUTO_SHARE		2
187 
188 #define	GET_SHARE(flags)	\
189     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
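
/*
 * Note: GET_SHARE() maps an explicit USYNC_PROCESS_SHARED request to a
 * key scope, while AUTO_SHARE defers the decision to umtx_key_get(),
 * which inspects the VM map entry backing the address.
 */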
190 
191 static uma_zone_t		umtx_pi_zone;
192 static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
193 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
194 static int			umtx_pi_allocated;
195 
196 SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
197 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
198     &umtx_pi_allocated, 0, "Allocated umtx_pi");
199 
200 static void umtxq_sysinit(void *);
201 static void umtxq_hash(struct umtx_key *key);
202 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
203 static void umtxq_lock(struct umtx_key *key);
204 static void umtxq_unlock(struct umtx_key *key);
205 static void umtxq_busy(struct umtx_key *key);
206 static void umtxq_unbusy(struct umtx_key *key);
207 static void umtxq_insert(struct umtx_q *uq);
208 static void umtxq_remove(struct umtx_q *uq);
209 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
210 static int umtxq_count(struct umtx_key *key);
211 static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
212 static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
213 static int umtx_key_get(void *addr, int type, int share,
214 	struct umtx_key *key);
215 static void umtx_key_release(struct umtx_key *key);
216 static struct umtx_pi *umtx_pi_alloc(int);
217 static void umtx_pi_free(struct umtx_pi *pi);
218 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
219 static void umtx_thread_cleanup(struct thread *td);
220 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
221 	struct image_params *imgp __unused);
222 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
223 
224 static void
225 umtxq_sysinit(void *arg __unused)
226 {
227 	int i;
228 
229 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
230 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
231 	for (i = 0; i < UMTX_CHAINS; ++i) {
232 		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
233 			 MTX_DEF | MTX_DUPOK);
234 		TAILQ_INIT(&umtxq_chains[i].uc_queue);
235 		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
236 		umtxq_chains[i].uc_busy = 0;
237 		umtxq_chains[i].uc_waiters = 0;
238 	}
239 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
240 	    EVENTHANDLER_PRI_ANY);
241 }
242 
243 struct umtx_q *
244 umtxq_alloc(void)
245 {
246 	struct umtx_q *uq;
247 
248 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
249 	TAILQ_INIT(&uq->uq_pi_contested);
250 	uq->uq_inherited_pri = PRI_MAX;
251 	return (uq);
252 }
253 
254 void
255 umtxq_free(struct umtx_q *uq)
256 {
257 	free(uq, M_UMTX);
258 }
259 
260 static inline void
261 umtxq_hash(struct umtx_key *key)
262 {
263 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
264 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
265 }
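
/*
 * A minimal sketch (illustration only, not compiled) of how the
 * multiplicative hash above spreads keys across the chain array:
 * for a private key, info.both.a is the vmspace pointer and
 * info.both.b is the user address, so all threads of one process
 * waiting on the same address land on the same umtxq_chain.
 */
#if 0
static unsigned
example_umtx_hash(void *a, uintptr_t b)
{
	unsigned n = (uintptr_t)a + b;

	return (((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS);
}
#endif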
266 
267 static inline int
268 umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
269 {
270 	return (k1->type == k2->type &&
271 		k1->info.both.a == k2->info.both.a &&
272 	        k1->info.both.b == k2->info.both.b);
273 }
274 
275 static inline struct umtxq_chain *
276 umtxq_getchain(struct umtx_key *key)
277 {
278 	return (&umtxq_chains[key->hash]);
279 }
280 
/*
 * Mark the chain busy when the following operation may block
 * (the kernel chain mutex cannot be held across it).
 */
285 static inline void
286 umtxq_busy(struct umtx_key *key)
287 {
288 	struct umtxq_chain *uc;
289 
290 	uc = umtxq_getchain(key);
291 	mtx_assert(&uc->uc_lock, MA_OWNED);
292 	while (uc->uc_busy != 0) {
293 		uc->uc_waiters++;
294 		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
295 		uc->uc_waiters--;
296 	}
297 	uc->uc_busy = 1;
298 }
299 
300 /*
301  * Unbusy a chain.
302  */
303 static inline void
304 umtxq_unbusy(struct umtx_key *key)
305 {
306 	struct umtxq_chain *uc;
307 
308 	uc = umtxq_getchain(key);
309 	mtx_assert(&uc->uc_lock, MA_OWNED);
310 	KASSERT(uc->uc_busy != 0, ("not busy"));
311 	uc->uc_busy = 0;
312 	if (uc->uc_waiters)
313 		wakeup_one(uc);
314 }
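
/*
 * Sketch of the busy/unbusy pattern used throughout this file
 * (illustration only, not compiled).  The chain mutex cannot be held
 * across an operation that may fault or sleep (e.g. casuword()), so
 * the chain is marked busy instead to serialize such operations:
 */
#if 0
	umtxq_lock(&key);
	umtxq_busy(&key);	/* serialize with other busy sections */
	umtxq_unlock(&key);	/* drop the mutex before faulting */
	/* ... access userland memory, e.g. casuword() ... */
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
#endif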
315 
316 /*
317  * Lock a chain.
318  */
319 static inline void
320 umtxq_lock(struct umtx_key *key)
321 {
322 	struct umtxq_chain *uc;
323 
324 	uc = umtxq_getchain(key);
325 	mtx_lock(&uc->uc_lock);
326 }
327 
328 /*
329  * Unlock a chain.
330  */
331 static inline void
332 umtxq_unlock(struct umtx_key *key)
333 {
334 	struct umtxq_chain *uc;
335 
336 	uc = umtxq_getchain(key);
337 	mtx_unlock(&uc->uc_lock);
338 }
339 
340 /*
341  * Insert a thread onto the umtx queue.
342  */
343 static inline void
344 umtxq_insert(struct umtx_q *uq)
345 {
346 	struct umtxq_chain *uc;
347 
348 	uc = umtxq_getchain(&uq->uq_key);
349 	UMTXQ_LOCKED_ASSERT(uc);
350 	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
351 	uq->uq_flags |= UQF_UMTXQ;
352 }
353 
354 /*
355  * Remove thread from the umtx queue.
356  */
357 static inline void
358 umtxq_remove(struct umtx_q *uq)
359 {
360 	struct umtxq_chain *uc;
361 
362 	uc = umtxq_getchain(&uq->uq_key);
363 	UMTXQ_LOCKED_ASSERT(uc);
364 	if (uq->uq_flags & UQF_UMTXQ) {
365 		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
366 		uq->uq_flags &= ~UQF_UMTXQ;
367 	}
368 }
369 
370 /*
371  * Check if there are multiple waiters
372  */
373 static int
374 umtxq_count(struct umtx_key *key)
375 {
376 	struct umtxq_chain *uc;
377 	struct umtx_q *uq;
378 	int count = 0;
379 
380 	uc = umtxq_getchain(key);
381 	UMTXQ_LOCKED_ASSERT(uc);
382 	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
383 		if (umtx_key_match(&uq->uq_key, key)) {
384 			if (++count > 1)
385 				break;
386 		}
387 	}
388 	return (count);
389 }
390 
/*
 * Check if there are multiple PI waiters and return the first
 * waiter.
 */
395 static int
396 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
397 {
398 	struct umtxq_chain *uc;
399 	struct umtx_q *uq;
400 	int count = 0;
401 
402 	*first = NULL;
403 	uc = umtxq_getchain(key);
404 	UMTXQ_LOCKED_ASSERT(uc);
405 	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
406 		if (umtx_key_match(&uq->uq_key, key)) {
407 			if (++count > 1)
408 				break;
409 			*first = uq;
410 		}
411 	}
412 	return (count);
413 }
414 
415 /*
 * Wake up threads waiting on a userland object.
417  */
418 static int
419 umtxq_signal(struct umtx_key *key, int n_wake)
420 {
421 	struct umtxq_chain *uc;
422 	struct umtx_q *uq, *next;
423 	int ret;
424 
425 	ret = 0;
426 	uc = umtxq_getchain(key);
427 	UMTXQ_LOCKED_ASSERT(uc);
428 	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
429 		if (umtx_key_match(&uq->uq_key, key)) {
430 			umtxq_remove(uq);
431 			wakeup(uq);
432 			if (++ret >= n_wake)
433 				break;
434 		}
435 	}
436 	return (ret);
437 }
438 
439 /*
440  * Wake up specified thread.
441  */
442 static inline void
443 umtxq_signal_thread(struct umtx_q *uq)
444 {
445 	struct umtxq_chain *uc;
446 
447 	uc = umtxq_getchain(&uq->uq_key);
448 	UMTXQ_LOCKED_ASSERT(uc);
449 	umtxq_remove(uq);
450 	wakeup(uq);
451 }
452 
/*
 * Put the thread to sleep; before sleeping, check whether the
 * thread was already removed from the umtx queue.
 */
457 static inline int
458 umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
459 {
460 	struct umtxq_chain *uc;
461 	int error;
462 
463 	uc = umtxq_getchain(&uq->uq_key);
464 	UMTXQ_LOCKED_ASSERT(uc);
465 	if (!(uq->uq_flags & UQF_UMTXQ))
466 		return (0);
467 	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
468 	if (error == EWOULDBLOCK)
469 		error = ETIMEDOUT;
470 	return (error);
471 }
472 
473 /*
 * Convert a userspace address into a unique logical key.
475  */
476 static int
477 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
478 {
479 	struct thread *td = curthread;
480 	vm_map_t map;
481 	vm_map_entry_t entry;
482 	vm_pindex_t pindex;
483 	vm_prot_t prot;
484 	boolean_t wired;
485 
486 	key->type = type;
487 	if (share == THREAD_SHARE) {
488 		key->shared = 0;
489 		key->info.private.vs = td->td_proc->p_vmspace;
490 		key->info.private.addr = (uintptr_t)addr;
491 	} else {
492 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
493 		map = &td->td_proc->p_vmspace->vm_map;
494 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
495 		    &entry, &key->info.shared.object, &pindex, &prot,
496 		    &wired) != KERN_SUCCESS) {
497 			return EFAULT;
498 		}
499 
500 		if ((share == PROCESS_SHARE) ||
501 		    (share == AUTO_SHARE &&
502 		     VM_INHERIT_SHARE == entry->inheritance)) {
503 			key->shared = 1;
504 			key->info.shared.offset = entry->offset + entry->start -
505 				(vm_offset_t)addr;
506 			vm_object_reference(key->info.shared.object);
507 		} else {
508 			key->shared = 0;
509 			key->info.private.vs = td->td_proc->p_vmspace;
510 			key->info.private.addr = (uintptr_t)addr;
511 		}
512 		vm_map_lookup_done(map, entry);
513 	}
514 
515 	umtxq_hash(key);
516 	return (0);
517 }
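
/*
 * Usage sketch (illustration only, not compiled).  With AUTO_SHARE
 * the scope is inferred from the mapping: an ordinary private mapping
 * yields a private key (vmspace + address), while a mapping inherited
 * with VM_INHERIT_SHARE yields a shared key (object + offset), so
 * related processes mapping the same object contend on one queue.
 */
#if 0
	struct umtx_key key;
	int error;

	error = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE, &key);
	if (error == 0) {
		/* ... look up or queue on the chain ... */
		umtx_key_release(&key);
	}
#endif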
518 
519 /*
520  * Release key.
521  */
522 static inline void
523 umtx_key_release(struct umtx_key *key)
524 {
525 	if (key->shared)
526 		vm_object_deallocate(key->info.shared.object);
527 }
528 
529 /*
530  * Lock a umtx object.
531  */
532 static int
533 _do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
534 {
535 	struct umtx_q *uq;
536 	u_long owner;
537 	u_long old;
538 	int error = 0;
539 
540 	uq = td->td_umtxq;
541 
	/*
	 * Care must be exercised when dealing with the umtx structure;
	 * any access can fault.
	 */
546 	for (;;) {
547 		/*
548 		 * Try the uncontested case.  This should be done in userland.
549 		 */
550 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
551 
552 		/* The acquire succeeded. */
553 		if (owner == UMTX_UNOWNED)
554 			return (0);
555 
556 		/* The address was invalid. */
557 		if (owner == -1)
558 			return (EFAULT);
559 
560 		/* If no one owns it but it is contested try to acquire it. */
561 		if (owner == UMTX_CONTESTED) {
562 			owner = casuword(&umtx->u_owner,
563 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
564 
565 			if (owner == UMTX_CONTESTED)
566 				return (0);
567 
568 			/* The address was invalid. */
569 			if (owner == -1)
570 				return (EFAULT);
571 
572 			/* If this failed the lock has changed, restart. */
573 			continue;
574 		}
575 
576 		/*
577 		 * If we caught a signal, we have retried and now
578 		 * exit immediately.
579 		 */
580 		if (error != 0)
581 			return (error);
582 
583 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
584 			AUTO_SHARE, &uq->uq_key)) != 0)
585 			return (error);
586 
587 		umtxq_lock(&uq->uq_key);
588 		umtxq_busy(&uq->uq_key);
589 		umtxq_insert(uq);
590 		umtxq_unbusy(&uq->uq_key);
591 		umtxq_unlock(&uq->uq_key);
592 
		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
599 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
600 
601 		/* The address was invalid. */
602 		if (old == -1) {
603 			umtxq_lock(&uq->uq_key);
604 			umtxq_remove(uq);
605 			umtxq_unlock(&uq->uq_key);
606 			umtx_key_release(&uq->uq_key);
607 			return (EFAULT);
608 		}
609 
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
615 		umtxq_lock(&uq->uq_key);
616 		if (old == owner)
617 			error = umtxq_sleep(uq, "umtx", timo);
618 		umtxq_remove(uq);
619 		umtxq_unlock(&uq->uq_key);
620 		umtx_key_release(&uq->uq_key);
621 	}
622 
623 	return (0);
624 }
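
/*
 * The "should be done in userland" fast path above looks roughly like
 * this hypothetical userland sketch (illustration only, not compiled;
 * thread_id is an assumed variable holding the caller's thread id):
 */
#if 0
	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED,
	    (u_long)thread_id))
		return (0);		/* uncontested, no system call */
	return (_umtx_lock(umtx));	/* kernel resolves contention */
#endif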
625 
626 /*
627  * Lock a umtx object.
628  */
629 static int
630 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
631 	struct timespec *timeout)
632 {
633 	struct timespec ts, ts2, ts3;
634 	struct timeval tv;
635 	int error;
636 
637 	if (timeout == NULL) {
638 		error = _do_lock_umtx(td, umtx, id, 0);
639 		/* Mutex locking is restarted if it is interrupted. */
640 		if (error == EINTR)
641 			error = ERESTART;
642 	} else {
643 		getnanouptime(&ts);
644 		timespecadd(&ts, timeout);
645 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
646 		for (;;) {
647 			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
648 			if (error != ETIMEDOUT)
649 				break;
650 			getnanouptime(&ts2);
651 			if (timespeccmp(&ts2, &ts, >=)) {
652 				error = ETIMEDOUT;
653 				break;
654 			}
655 			ts3 = ts;
656 			timespecsub(&ts3, &ts2);
657 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
658 		}
659 		/* Timed-locking is not restarted. */
660 		if (error == ERESTART)
661 			error = EINTR;
662 	}
663 	return (error);
664 }
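
/*
 * Note: the timeout is tracked against an absolute deadline on the
 * uptime clock; every ETIMEDOUT wakeup recomputes the remainder, so
 * repeated retries cannot extend the total wait beyond the caller's
 * timeout.
 */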
665 
666 /*
667  * Unlock a umtx object.
668  */
669 static int
670 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
671 {
672 	struct umtx_key key;
673 	u_long owner;
674 	u_long old;
675 	int error;
676 	int count;
677 
678 	/*
679 	 * Make sure we own this mtx.
680 	 */
681 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
682 	if (owner == -1)
683 		return (EFAULT);
684 
685 	if ((owner & ~UMTX_CONTESTED) != id)
686 		return (EPERM);
687 
688 	/* This should be done in userland */
689 	if ((owner & UMTX_CONTESTED) == 0) {
690 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
691 		if (old == -1)
692 			return (EFAULT);
693 		if (old == owner)
694 			return (0);
695 		owner = old;
696 	}
697 
698 	/* We should only ever be in here for contested locks */
699 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
700 		&key)) != 0)
701 		return (error);
702 
703 	umtxq_lock(&key);
704 	umtxq_busy(&key);
705 	count = umtxq_count(&key);
706 	umtxq_unlock(&key);
707 
	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * no more than one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
717 	umtxq_unbusy(&key);
718 	umtxq_unlock(&key);
719 	umtx_key_release(&key);
720 	if (old == -1)
721 		return (EFAULT);
722 	if (old != owner)
723 		return (EINVAL);
724 	return (0);
725 }
726 
727 #ifdef COMPAT_IA32
728 
729 /*
730  * Lock a umtx object.
731  */
732 static int
733 _do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
734 {
735 	struct umtx_q *uq;
736 	uint32_t owner;
737 	uint32_t old;
738 	int error = 0;
739 
740 	uq = td->td_umtxq;
741 
	/*
	 * Care must be exercised when dealing with the umtx structure;
	 * any access can fault.
	 */
746 	for (;;) {
747 		/*
748 		 * Try the uncontested case.  This should be done in userland.
749 		 */
750 		owner = casuword32(m, UMUTEX_UNOWNED, id);
751 
752 		/* The acquire succeeded. */
753 		if (owner == UMUTEX_UNOWNED)
754 			return (0);
755 
756 		/* The address was invalid. */
757 		if (owner == -1)
758 			return (EFAULT);
759 
760 		/* If no one owns it but it is contested try to acquire it. */
761 		if (owner == UMUTEX_CONTESTED) {
762 			owner = casuword32(m,
763 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
764 			if (owner == UMUTEX_CONTESTED)
765 				return (0);
766 
767 			/* The address was invalid. */
768 			if (owner == -1)
769 				return (EFAULT);
770 
771 			/* If this failed the lock has changed, restart. */
772 			continue;
773 		}
774 
775 		/*
776 		 * If we caught a signal, we have retried and now
777 		 * exit immediately.
778 		 */
779 		if (error != 0)
780 			return (error);
781 
782 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
783 			AUTO_SHARE, &uq->uq_key)) != 0)
784 			return (error);
785 
786 		umtxq_lock(&uq->uq_key);
787 		umtxq_busy(&uq->uq_key);
788 		umtxq_insert(uq);
789 		umtxq_unbusy(&uq->uq_key);
790 		umtxq_unlock(&uq->uq_key);
791 
		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
798 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
799 
800 		/* The address was invalid. */
801 		if (old == -1) {
802 			umtxq_lock(&uq->uq_key);
803 			umtxq_remove(uq);
804 			umtxq_unlock(&uq->uq_key);
805 			umtx_key_release(&uq->uq_key);
806 			return (EFAULT);
807 		}
808 
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
814 		umtxq_lock(&uq->uq_key);
815 		if (old == owner)
816 			error = umtxq_sleep(uq, "umtx", timo);
817 		umtxq_remove(uq);
818 		umtxq_unlock(&uq->uq_key);
819 		umtx_key_release(&uq->uq_key);
820 	}
821 
822 	return (0);
823 }
824 
825 /*
826  * Lock a umtx object.
827  */
828 static int
829 do_lock_umtx32(struct thread *td, void *m, uint32_t id,
830 	struct timespec *timeout)
831 {
832 	struct timespec ts, ts2, ts3;
833 	struct timeval tv;
834 	int error;
835 
836 	if (timeout == NULL) {
837 		error = _do_lock_umtx32(td, m, id, 0);
838 		/* Mutex locking is restarted if it is interrupted. */
839 		if (error == EINTR)
840 			error = ERESTART;
841 	} else {
842 		getnanouptime(&ts);
843 		timespecadd(&ts, timeout);
844 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
845 		for (;;) {
846 			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
847 			if (error != ETIMEDOUT)
848 				break;
849 			getnanouptime(&ts2);
850 			if (timespeccmp(&ts2, &ts, >=)) {
851 				error = ETIMEDOUT;
852 				break;
853 			}
854 			ts3 = ts;
855 			timespecsub(&ts3, &ts2);
856 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
857 		}
858 		/* Timed-locking is not restarted. */
859 		if (error == ERESTART)
860 			error = EINTR;
861 	}
862 	return (error);
863 }
864 
865 /*
866  * Unlock a umtx object.
867  */
868 static int
869 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
870 {
871 	struct umtx_key key;
872 	uint32_t owner;
873 	uint32_t old;
874 	int error;
875 	int count;
876 
877 	/*
878 	 * Make sure we own this mtx.
879 	 */
880 	owner = fuword32(m);
881 	if (owner == -1)
882 		return (EFAULT);
883 
884 	if ((owner & ~UMUTEX_CONTESTED) != id)
885 		return (EPERM);
886 
887 	/* This should be done in userland */
888 	if ((owner & UMUTEX_CONTESTED) == 0) {
889 		old = casuword32(m, owner, UMUTEX_UNOWNED);
890 		if (old == -1)
891 			return (EFAULT);
892 		if (old == owner)
893 			return (0);
894 		owner = old;
895 	}
896 
897 	/* We should only ever be in here for contested locks */
898 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
899 		&key)) != 0)
900 		return (error);
901 
902 	umtxq_lock(&key);
903 	umtxq_busy(&key);
904 	count = umtxq_count(&key);
905 	umtxq_unlock(&key);
906 
	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * no more than one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
916 	umtxq_unbusy(&key);
917 	umtxq_unlock(&key);
918 	umtx_key_release(&key);
919 	if (old == -1)
920 		return (EFAULT);
921 	if (old != owner)
922 		return (EINVAL);
923 	return (0);
924 }
925 #endif
926 
927 /*
 * Fetch and compare the value; sleep on the address if the value has not changed.
929  */
930 static int
931 do_wait(struct thread *td, void *addr, u_long id,
932 	struct timespec *timeout, int compat32)
933 {
934 	struct umtx_q *uq;
935 	struct timespec ts, ts2, ts3;
936 	struct timeval tv;
937 	u_long tmp;
938 	int error = 0;
939 
940 	uq = td->td_umtxq;
941 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
942 	    &uq->uq_key)) != 0)
943 		return (error);
944 
945 	umtxq_lock(&uq->uq_key);
946 	umtxq_insert(uq);
947 	umtxq_unlock(&uq->uq_key);
948 	if (compat32 == 0)
949 		tmp = fuword(addr);
	else
951 		tmp = fuword32(addr);
952 	if (tmp != id) {
953 		umtxq_lock(&uq->uq_key);
954 		umtxq_remove(uq);
955 		umtxq_unlock(&uq->uq_key);
956 	} else if (timeout == NULL) {
957 		umtxq_lock(&uq->uq_key);
958 		error = umtxq_sleep(uq, "ucond", 0);
959 		umtxq_remove(uq);
960 		umtxq_unlock(&uq->uq_key);
961 	} else {
962 		getnanouptime(&ts);
963 		timespecadd(&ts, timeout);
964 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
965 		umtxq_lock(&uq->uq_key);
966 		for (;;) {
967 			error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
968 			if (!(uq->uq_flags & UQF_UMTXQ))
969 				break;
970 			if (error != ETIMEDOUT)
971 				break;
972 			umtxq_unlock(&uq->uq_key);
973 			getnanouptime(&ts2);
974 			if (timespeccmp(&ts2, &ts, >=)) {
975 				error = ETIMEDOUT;
976 				umtxq_lock(&uq->uq_key);
977 				break;
978 			}
979 			ts3 = ts;
980 			timespecsub(&ts3, &ts2);
981 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
982 			umtxq_lock(&uq->uq_key);
983 		}
984 		umtxq_remove(uq);
985 		umtxq_unlock(&uq->uq_key);
986 	}
987 	umtx_key_release(&uq->uq_key);
988 	if (error == ERESTART)
989 		error = EINTR;
990 	return (error);
991 }
992 
993 /*
994  * Wake up threads sleeping on the specified address.
995  */
996 int
997 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
998 {
999 	struct umtx_key key;
1000 	int ret;
1001 
1002 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
1003 	   &key)) != 0)
1004 		return (ret);
1005 	umtxq_lock(&key);
1006 	ret = umtxq_signal(&key, n_wake);
1007 	umtxq_unlock(&key);
1008 	umtx_key_release(&key);
1009 	return (0);
1010 }
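
/*
 * Hypothetical userland pairing for do_wait()/kern_umtx_wake()
 * (illustration only, not compiled, using the _umtx_op() system
 * call): the waiter sleeps only while *addr still holds the expected
 * value, so a wake issued after the value changes cannot be lost.
 */
#if 0
	/* Waiter: */
	while (atomic_load_acq_long(addr) == expected)
		_umtx_op(addr, UMTX_OP_WAIT, expected, NULL, NULL);
	/* Waker: */
	atomic_store_rel_long(addr, newval);
	_umtx_op(addr, UMTX_OP_WAKE, 1, NULL, NULL);
#endif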
1011 
1012 /*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1014  */
1015 static int
1016 _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1017 	int try)
1018 {
1019 	struct umtx_q *uq;
1020 	uint32_t owner, old, id;
1021 	int error = 0;
1022 
1023 	id = td->td_tid;
1024 	uq = td->td_umtxq;
1025 
	/*
	 * Care must be exercised when dealing with the umtx structure;
	 * any access can fault.
	 */
1030 	for (;;) {
1031 		/*
1032 		 * Try the uncontested case.  This should be done in userland.
1033 		 */
1034 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1035 
1036 		/* The acquire succeeded. */
1037 		if (owner == UMUTEX_UNOWNED)
1038 			return (0);
1039 
1040 		/* The address was invalid. */
1041 		if (owner == -1)
1042 			return (EFAULT);
1043 
1044 		/* If no one owns it but it is contested try to acquire it. */
1045 		if (owner == UMUTEX_CONTESTED) {
1046 			owner = casuword32(&m->m_owner,
1047 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1048 
1049 			if (owner == UMUTEX_CONTESTED)
1050 				return (0);
1051 
1052 			/* The address was invalid. */
1053 			if (owner == -1)
1054 				return (EFAULT);
1055 
1056 			/* If this failed the lock has changed, restart. */
1057 			continue;
1058 		}
1059 
1060 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1061 		    (owner & ~UMUTEX_CONTESTED) == id)
1062 			return (EDEADLK);
1063 
1064 		if (try != 0)
1065 			return (EBUSY);
1066 
1067 		/*
1068 		 * If we caught a signal, we have retried and now
1069 		 * exit immediately.
1070 		 */
1071 		if (error != 0)
1072 			return (error);
1073 
1074 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1075 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1076 			return (error);
1077 
1078 		umtxq_lock(&uq->uq_key);
1079 		umtxq_busy(&uq->uq_key);
1080 		umtxq_insert(uq);
1081 		umtxq_unbusy(&uq->uq_key);
1082 		umtxq_unlock(&uq->uq_key);
1083 
		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
1090 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1091 
1092 		/* The address was invalid. */
1093 		if (old == -1) {
1094 			umtxq_lock(&uq->uq_key);
1095 			umtxq_remove(uq);
1096 			umtxq_unlock(&uq->uq_key);
1097 			umtx_key_release(&uq->uq_key);
1098 			return (EFAULT);
1099 		}
1100 
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
1106 		umtxq_lock(&uq->uq_key);
1107 		if (old == owner)
1108 			error = umtxq_sleep(uq, "umtxn", timo);
1109 		umtxq_remove(uq);
1110 		umtxq_unlock(&uq->uq_key);
1111 		umtx_key_release(&uq->uq_key);
1112 	}
1113 
1114 	return (0);
1115 }
1116 
1120 /*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1122  */
1123 static int
1124 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1125 {
1126 	struct umtx_key key;
1127 	uint32_t owner, old, id;
1128 	int error;
1129 	int count;
1130 
1131 	id = td->td_tid;
1132 	/*
1133 	 * Make sure we own this mtx.
1134 	 */
1135 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1136 	if (owner == -1)
1137 		return (EFAULT);
1138 
1139 	if ((owner & ~UMUTEX_CONTESTED) != id)
1140 		return (EPERM);
1141 
1142 	/* This should be done in userland */
1143 	if ((owner & UMUTEX_CONTESTED) == 0) {
1144 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1145 		if (old == -1)
1146 			return (EFAULT);
1147 		if (old == owner)
1148 			return (0);
1149 		owner = old;
1150 	}
1151 
1152 	/* We should only ever be in here for contested locks */
1153 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1154 	    &key)) != 0)
1155 		return (error);
1156 
1157 	umtxq_lock(&key);
1158 	umtxq_busy(&key);
1159 	count = umtxq_count(&key);
1160 	umtxq_unlock(&key);
1161 
	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * no more than one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
1171 	umtxq_unbusy(&key);
1172 	umtxq_unlock(&key);
1173 	umtx_key_release(&key);
1174 	if (old == -1)
1175 		return (EFAULT);
1176 	if (old != owner)
1177 		return (EINVAL);
1178 	return (0);
1179 }
1180 
1181 static inline struct umtx_pi *
1182 umtx_pi_alloc(int flags)
1183 {
1184 	struct umtx_pi *pi;
1185 
1186 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1187 	TAILQ_INIT(&pi->pi_blocked);
1188 	atomic_add_int(&umtx_pi_allocated, 1);
1189 	return (pi);
1190 }
1191 
1192 static inline void
1193 umtx_pi_free(struct umtx_pi *pi)
1194 {
1195 	uma_zfree(umtx_pi_zone, pi);
1196 	atomic_add_int(&umtx_pi_allocated, -1);
1197 }
1198 
1199 /*
 * Adjust the thread's position on the PI mutex's blocked list after
 * its priority has been changed.
1202  */
1203 static int
1204 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1205 {
1206 	struct umtx_q *uq, *uq1, *uq2;
1207 	struct thread *td1;
1208 
1209 	mtx_assert(&sched_lock, MA_OWNED);
1210 	if (pi == NULL)
1211 		return (0);
1212 
1213 	uq = td->td_umtxq;
1214 
	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread's or higher than the next thread's.
	 */
1220 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1221 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1222 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1223 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1224 		/*
1225 		 * Remove thread from blocked chain and determine where
1226 		 * it should be moved to.
1227 		 */
1228 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1229 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1230 			td1 = uq1->uq_thread;
1231 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1232 			if (UPRI(td1) > UPRI(td))
1233 				break;
1234 		}
1235 
1236 		if (uq1 == NULL)
1237 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1238 		else
1239 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1240 	}
1241 	return (1);
1242 }
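
/*
 * Note: pi_blocked is kept sorted by UPRI() in ascending numeric
 * order (best priority first), so TAILQ_FIRST() always yields the
 * waiter whose priority should be lent to the owner.
 */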
1243 
1244 /*
1245  * Propagate priority when a thread is blocked on POSIX
1246  * PI mutex.
1247  */
1248 static void
1249 umtx_propagate_priority(struct thread *td)
1250 {
1251 	struct umtx_q *uq;
1252 	struct umtx_pi *pi;
1253 	int pri;
1254 
1255 	mtx_assert(&sched_lock, MA_OWNED);
1256 	pri = UPRI(td);
1257 	uq = td->td_umtxq;
1258 	pi = uq->uq_pi_blocked;
1259 	if (pi == NULL)
1260 		return;
1261 
1262 	for (;;) {
1263 		td = pi->pi_owner;
1264 		if (td == NULL)
1265 			return;
1266 
1267 		MPASS(td->td_proc != NULL);
1268 		MPASS(td->td_proc->p_magic == P_MAGIC);
1269 
1270 		if (UPRI(td) <= pri)
1271 			return;
1272 
1273 		sched_lend_user_prio(td, pri);
1274 
1275 		/*
1276 		 * Pick up the lock that td is blocked on.
1277 		 */
1278 		uq = td->td_umtxq;
1279 		pi = uq->uq_pi_blocked;
1280 		/* Resort td on the list if needed. */
1281 		if (!umtx_pi_adjust_thread(pi, td))
1282 			break;
1283 	}
1284 }
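
/*
 * Note: the loop above walks the ownership chain.  If the owner of
 * this PI mutex is itself blocked on another PI mutex, the lent
 * priority is pushed one link further, until it reaches an owner
 * that is not blocked or that already runs at the lent priority or
 * better.
 */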
1285 
1286 /*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by another thread.
1289  */
1290 static void
1291 umtx_unpropagate_priority(struct umtx_pi *pi)
1292 {
1293 	struct umtx_q *uq, *uq_owner;
1294 	struct umtx_pi *pi2;
1295 	int pri;
1296 
1297 	mtx_assert(&sched_lock, MA_OWNED);
1298 
1299 	while (pi != NULL && pi->pi_owner != NULL) {
1300 		pri = PRI_MAX;
1301 		uq_owner = pi->pi_owner->td_umtxq;
1302 
1303 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1304 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1305 			if (uq != NULL) {
1306 				if (pri > UPRI(uq->uq_thread))
1307 					pri = UPRI(uq->uq_thread);
1308 			}
1309 		}
1310 
1311 		if (pri > uq_owner->uq_inherited_pri)
1312 			pri = uq_owner->uq_inherited_pri;
1313 		sched_unlend_user_prio(pi->pi_owner, pri);
1314 		pi = uq_owner->uq_pi_blocked;
1315 	}
1316 }
1317 
1318 /*
1319  * Insert a PI mutex into owned list.
1320  */
1321 static void
1322 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1323 {
1324 	struct umtx_q *uq_owner;
1325 
1326 	uq_owner = owner->td_umtxq;
1327 	mtx_assert(&sched_lock, MA_OWNED);
1328 	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
1330 	pi->pi_owner = owner;
1331 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1332 }
1333 
1334 /*
1335  * Claim ownership of a PI mutex.
1336  */
1337 static int
1338 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1339 {
1340 	struct umtx_q *uq, *uq_owner;
1341 
1342 	uq_owner = owner->td_umtxq;
1343 	mtx_lock_spin(&sched_lock);
1344 	if (pi->pi_owner == owner) {
1345 		mtx_unlock_spin(&sched_lock);
1346 		return (0);
1347 	}
1348 
1349 	if (pi->pi_owner != NULL) {
1350 		/*
		 * Userland may have already messed up the mutex, sigh.
1352 		 */
1353 		mtx_unlock_spin(&sched_lock);
1354 		return (EPERM);
1355 	}
1356 	umtx_pi_setowner(pi, owner);
1357 	uq = TAILQ_FIRST(&pi->pi_blocked);
1358 	if (uq != NULL) {
1359 		int pri;
1360 
1361 		pri = UPRI(uq->uq_thread);
1362 		if (pri < UPRI(owner))
1363 			sched_lend_user_prio(owner, pri);
1364 	}
1365 	mtx_unlock_spin(&sched_lock);
1366 	return (0);
1367 }
1368 
1369 /*
 * Adjust a thread's position on the blocked list of the PI mutex it
 * is blocked on; this may start a new round of priority propagation.
1372  */
1373 void
1374 umtx_pi_adjust(struct thread *td, u_char oldpri)
1375 {
1376 	struct umtx_q *uq;
1377 	struct umtx_pi *pi;
1378 
1379 	uq = td->td_umtxq;
1380 
1381 	mtx_assert(&sched_lock, MA_OWNED);
1382 	MPASS(TD_ON_UPILOCK(td));
1383 
1384 	/*
1385 	 * Pick up the lock that td is blocked on.
1386 	 */
1387 	pi = uq->uq_pi_blocked;
1388 	MPASS(pi != NULL);
1389 
	/* Re-sort the thread on the blocked list. */
1391 	if (!umtx_pi_adjust_thread(pi, td))
1392 		return;
1393 
1394 	/*
1395 	 * If our priority was lowered and we are at the head of the
	 * blocked list, then propagate our new priority up the chain.
1397 	 */
1398 	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
1399 		umtx_propagate_priority(td);
1400 }
1401 
1402 /*
1403  * Sleep on a PI mutex.
1404  */
1405 static int
1406 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1407 	uint32_t owner, const char *wmesg, int timo)
1408 {
1409 	struct umtxq_chain *uc;
1410 	struct thread *td, *td1;
1411 	struct umtx_q *uq1;
1412 	int pri;
1413 	int error = 0;
1414 
1415 	td = uq->uq_thread;
1416 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1417 	uc = umtxq_getchain(&uq->uq_key);
1418 	UMTXQ_LOCKED_ASSERT(uc);
1419 	umtxq_insert(uq);
1420 	if (pi->pi_owner == NULL) {
		/* XXX
		 * Currently, we only support process-private PI-mutexes;
		 * non-contended PI-mutexes are locked in userland.
		 * A process-shared PI-mutex should always be initialized
		 * and registered by the kernel, and locking should always
		 * be done by the kernel to avoid security problems.
		 * For a process-private PI-mutex, we can find the owner
		 * thread and boost its priority safely.
		 */
1430 		PROC_LOCK(curproc);
1431 		td1 = thread_find(curproc, owner);
1432 		mtx_lock_spin(&sched_lock);
1433 		if (td1 != NULL && pi->pi_owner == NULL) {
1434 			uq1 = td1->td_umtxq;
1435 			umtx_pi_setowner(pi, td1);
1436 		}
1437 		PROC_UNLOCK(curproc);
1438 	} else {
1439 		mtx_lock_spin(&sched_lock);
1440 	}
1441 
1442 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1443 		pri = UPRI(uq1->uq_thread);
1444 		if (pri > UPRI(td))
1445 			break;
1446 	}
1447 
1448 	if (uq1 != NULL)
1449 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1450 	else
1451 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1452 
1453 	uq->uq_pi_blocked = pi;
1454 	td->td_flags |= TDF_UPIBLOCKED;
1455 	mtx_unlock_spin(&sched_lock);
1456 	umtxq_unlock(&uq->uq_key);
1457 
1458 	mtx_lock_spin(&sched_lock);
1459 	umtx_propagate_priority(td);
1460 	mtx_unlock_spin(&sched_lock);
1461 
1462 	umtxq_lock(&uq->uq_key);
1463 	if (uq->uq_flags & UQF_UMTXQ) {
1464 		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
1465 		if (error == EWOULDBLOCK)
1466 			error = ETIMEDOUT;
1467 		if (uq->uq_flags & UQF_UMTXQ) {
1468 			umtxq_busy(&uq->uq_key);
1469 			umtxq_remove(uq);
1470 			umtxq_unbusy(&uq->uq_key);
1471 		}
1472 	}
1473 	umtxq_unlock(&uq->uq_key);
1474 
1475 	mtx_lock_spin(&sched_lock);
1476 	uq->uq_pi_blocked = NULL;
1477 	td->td_flags &= ~TDF_UPIBLOCKED;
1478 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1479 	umtx_unpropagate_priority(pi);
1480 	mtx_unlock_spin(&sched_lock);
1481 
1482 	umtxq_lock(&uq->uq_key);
1483 
1484 	return (error);
1485 }
1486 
1487 /*
1488  * Add reference count for a PI mutex.
1489  */
1490 static void
1491 umtx_pi_ref(struct umtx_pi *pi)
1492 {
1493 	struct umtxq_chain *uc;
1494 
1495 	uc = umtxq_getchain(&pi->pi_key);
1496 	UMTXQ_LOCKED_ASSERT(uc);
1497 	pi->pi_refcount++;
1498 }
1499 
1500 /*
 * Decrease the reference count of a PI mutex; when the count
 * drops to zero, its memory is freed.
1503  */
1504 static void
1505 umtx_pi_unref(struct umtx_pi *pi)
1506 {
1507 	struct umtxq_chain *uc;
1508 	int free = 0;
1509 
1510 	uc = umtxq_getchain(&pi->pi_key);
1511 	UMTXQ_LOCKED_ASSERT(uc);
1512 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1513 	if (--pi->pi_refcount == 0) {
1514 		mtx_lock_spin(&sched_lock);
1515 		if (pi->pi_owner != NULL) {
1516 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1517 				pi, pi_link);
1518 			pi->pi_owner = NULL;
1519 		}
1520 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1521 			("blocked queue not empty"));
1522 		mtx_unlock_spin(&sched_lock);
1523 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1524 		free = 1;
1525 	}
1526 	if (free)
1527 		umtx_pi_free(pi);
1528 }
1529 
1530 /*
1531  * Find a PI mutex in hash table.
1532  */
1533 static struct umtx_pi *
1534 umtx_pi_lookup(struct umtx_key *key)
1535 {
1536 	struct umtxq_chain *uc;
1537 	struct umtx_pi *pi;
1538 
1539 	uc = umtxq_getchain(key);
1540 	UMTXQ_LOCKED_ASSERT(uc);
1541 
1542 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1543 		if (umtx_key_match(&pi->pi_key, key)) {
1544 			return (pi);
1545 		}
1546 	}
1547 	return (NULL);
1548 }
1549 
1550 /*
1551  * Insert a PI mutex into hash table.
1552  */
1553 static inline void
1554 umtx_pi_insert(struct umtx_pi *pi)
1555 {
1556 	struct umtxq_chain *uc;
1557 
1558 	uc = umtxq_getchain(&pi->pi_key);
1559 	UMTXQ_LOCKED_ASSERT(uc);
1560 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1561 }
1562 
1563 /*
1564  * Lock a PI mutex.
1565  */
1566 static int
1567 _do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1568 	int try)
1569 {
1570 	struct umtx_q *uq;
1571 	struct umtx_pi *pi, *new_pi;
1572 	uint32_t id, owner, old;
1573 	int error;
1574 
1575 	id = td->td_tid;
1576 	uq = td->td_umtxq;
1577 
1578 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1579 	    &uq->uq_key)) != 0)
1580 		return (error);
1581 	umtxq_lock(&uq->uq_key);
1582 	pi = umtx_pi_lookup(&uq->uq_key);
1583 	if (pi == NULL) {
1584 		new_pi = umtx_pi_alloc(M_NOWAIT);
1585 		if (new_pi == NULL) {
1586 			umtxq_unlock(&uq->uq_key);
1587 			new_pi = umtx_pi_alloc(M_WAITOK);
1588 			new_pi->pi_key = uq->uq_key;
1589 			umtxq_lock(&uq->uq_key);
1590 			pi = umtx_pi_lookup(&uq->uq_key);
1591 			if (pi != NULL) {
1592 				umtx_pi_free(new_pi);
1593 				new_pi = NULL;
1594 			}
1595 		}
1596 		if (new_pi != NULL) {
1597 			new_pi->pi_key = uq->uq_key;
1598 			umtx_pi_insert(new_pi);
1599 			pi = new_pi;
1600 		}
1601 	}
1602 	umtx_pi_ref(pi);
1603 	umtxq_unlock(&uq->uq_key);
1604 
	/*
	 * Care must be exercised when dealing with the umtx structure;
	 * any access can fault.
	 */
1609 	for (;;) {
1610 		/*
1611 		 * Try the uncontested case.  This should be done in userland.
1612 		 */
1613 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1614 
1615 		/* The acquire succeeded. */
1616 		if (owner == UMUTEX_UNOWNED) {
1617 			error = 0;
1618 			break;
1619 		}
1620 
1621 		/* The address was invalid. */
1622 		if (owner == -1) {
1623 			error = EFAULT;
1624 			break;
1625 		}
1626 
1627 		/* If no one owns it but it is contested try to acquire it. */
1628 		if (owner == UMUTEX_CONTESTED) {
1629 			owner = casuword32(&m->m_owner,
1630 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1631 
1632 			if (owner == UMUTEX_CONTESTED) {
1633 				umtxq_lock(&uq->uq_key);
1634 				error = umtx_pi_claim(pi, td);
1635 				umtxq_unlock(&uq->uq_key);
1636 				break;
1637 			}
1638 
1639 			/* The address was invalid. */
1640 			if (owner == -1) {
1641 				error = EFAULT;
1642 				break;
1643 			}
1644 
1645 			/* If this failed the lock has changed, restart. */
1646 			continue;
1647 		}
1648 
1649 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1650 		    (owner & ~UMUTEX_CONTESTED) == id) {
1651 			error = EDEADLK;
1652 			break;
1653 		}
1654 
1655 		if (try != 0) {
1656 			error = EBUSY;
1657 			break;
1658 		}
1659 
1660 		/*
1661 		 * If we caught a signal, we have retried and now
1662 		 * exit immediately.
1663 		 */
1664 		if (error != 0)
1665 			break;
1666 
1667 		umtxq_lock(&uq->uq_key);
1668 		umtxq_busy(&uq->uq_key);
1669 		umtxq_unlock(&uq->uq_key);
1670 
		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
1677 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1678 
1679 		/* The address was invalid. */
1680 		if (old == -1) {
1681 			umtxq_lock(&uq->uq_key);
1682 			umtxq_unbusy(&uq->uq_key);
1683 			umtxq_unlock(&uq->uq_key);
1684 			error = EFAULT;
1685 			break;
1686 		}
1687 
1688 		umtxq_lock(&uq->uq_key);
1689 		umtxq_unbusy(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
1695 		if (old == owner)
1696 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1697 				 "umtxpi", timo);
1698 		umtxq_unlock(&uq->uq_key);
1699 	}
1700 
1701 	umtxq_lock(&uq->uq_key);
1702 	umtx_pi_unref(pi);
1703 	umtxq_unlock(&uq->uq_key);
1704 
1705 	umtx_key_release(&uq->uq_key);
1706 	return (error);
1707 }
1708 
1709 /*
1710  * Unlock a PI mutex.
1711  */
1712 static int
1713 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1714 {
1715 	struct umtx_key key;
1716 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1717 	struct umtx_pi *pi, *pi2;
1718 	uint32_t owner, old, id;
1719 	int error;
1720 	int count;
1721 	int pri;
1722 
1723 	id = td->td_tid;
1724 	/*
1725 	 * Make sure we own this mtx.
1726 	 */
1727 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1728 	if (owner == -1)
1729 		return (EFAULT);
1730 
1731 	if ((owner & ~UMUTEX_CONTESTED) != id)
1732 		return (EPERM);
1733 
1734 	/* This should be done in userland */
1735 	if ((owner & UMUTEX_CONTESTED) == 0) {
1736 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1737 		if (old == -1)
1738 			return (EFAULT);
1739 		if (old == owner)
1740 			return (0);
1741 		owner = old;
1742 	}
1743 
1744 	/* We should only ever be in here for contested locks */
1745 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1746 	    &key)) != 0)
1747 		return (error);
1748 
1749 	umtxq_lock(&key);
1750 	umtxq_busy(&key);
1751 	count = umtxq_count_pi(&key, &uq_first);
1752 	if (uq_first != NULL) {
1753 		pi = uq_first->uq_pi_blocked;
1754 		if (pi->pi_owner != curthread) {
1755 			umtxq_unbusy(&key);
1756 			umtxq_unlock(&key);
			/* Userland messed up the mutex. */
1758 			return (EPERM);
1759 		}
1760 		uq_me = curthread->td_umtxq;
1761 		mtx_lock_spin(&sched_lock);
1762 		pi->pi_owner = NULL;
1763 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1764 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1765 		pri = PRI_MAX;
1766 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1767 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1768 			if (uq_first2 != NULL) {
1769 				if (pri > UPRI(uq_first2->uq_thread))
1770 					pri = UPRI(uq_first2->uq_thread);
1771 			}
1772 		}
1773 		sched_unlend_user_prio(curthread, pri);
1774 		mtx_unlock_spin(&sched_lock);
1775 	}
1776 	umtxq_unlock(&key);
1777 
	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * no more than one thread is waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
1783 	old = casuword32(&m->m_owner, owner,
1784 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1785 
1786 	umtxq_lock(&key);
1787 	if (uq_first != NULL)
1788 		umtxq_signal_thread(uq_first);
1789 	umtxq_unbusy(&key);
1790 	umtxq_unlock(&key);
1791 	umtx_key_release(&key);
1792 	if (old == -1)
1793 		return (EFAULT);
1794 	if (old != owner)
1795 		return (EINVAL);
1796 	return (0);
1797 }
1798 
1799 /*
1800  * Lock a PP mutex.
1801  */
1802 static int
1803 _do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1804 	int try)
1805 {
1806 	struct umtx_q *uq, *uq2;
1807 	struct umtx_pi *pi;
1808 	uint32_t ceiling;
1809 	uint32_t owner, id;
1810 	int error, pri, old_inherited_pri, su;
1811 
1812 	id = td->td_tid;
1813 	uq = td->td_umtxq;
1814 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1815 	    &uq->uq_key)) != 0)
1816 		return (error);
1817 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1818 	for (;;) {
1819 		old_inherited_pri = uq->uq_inherited_pri;
1820 		umtxq_lock(&uq->uq_key);
1821 		umtxq_busy(&uq->uq_key);
1822 		umtxq_unlock(&uq->uq_key);
1823 
1824 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1825 		if (ceiling > RTP_PRIO_MAX) {
1826 			error = EINVAL;
1827 			goto out;
1828 		}
1829 
1830 		mtx_lock_spin(&sched_lock);
1831 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1832 			mtx_unlock_spin(&sched_lock);
1833 			error = EINVAL;
1834 			goto out;
1835 		}
1836 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1837 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1838 			if (uq->uq_inherited_pri < UPRI(td))
1839 				sched_lend_user_prio(td, uq->uq_inherited_pri);
1840 		}
1841 		mtx_unlock_spin(&sched_lock);
1842 
1843 		owner = casuword32(&m->m_owner,
1844 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1845 
1846 		if (owner == UMUTEX_CONTESTED) {
1847 			error = 0;
1848 			break;
1849 		}
1850 
1851 		/* The address was invalid. */
1852 		if (owner == -1) {
1853 			error = EFAULT;
1854 			break;
1855 		}
1856 
1857 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1858 		    (owner & ~UMUTEX_CONTESTED) == id) {
1859 			error = EDEADLK;
1860 			break;
1861 		}
1862 
1863 		if (try != 0) {
1864 			error = EBUSY;
1865 			break;
1866 		}
1867 
1868 		/*
1869 		 * If we caught a signal, we have retried and now
1870 		 * exit immediately.
1871 		 */
1872 		if (error != 0)
1873 			break;
1874 
1875 		umtxq_lock(&uq->uq_key);
1876 		umtxq_insert(uq);
1877 		umtxq_unbusy(&uq->uq_key);
1878 		error = umtxq_sleep(uq, "umtxpp", timo);
1879 		umtxq_remove(uq);
1880 		umtxq_unlock(&uq->uq_key);
1881 
1882 		mtx_lock_spin(&sched_lock);
1883 		uq->uq_inherited_pri = old_inherited_pri;
1884 		pri = PRI_MAX;
1885 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1886 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1887 			if (uq2 != NULL) {
1888 				if (pri > UPRI(uq2->uq_thread))
1889 					pri = UPRI(uq2->uq_thread);
1890 			}
1891 		}
1892 		if (pri > uq->uq_inherited_pri)
1893 			pri = uq->uq_inherited_pri;
1894 		sched_unlend_user_prio(td, pri);
1895 		mtx_unlock_spin(&sched_lock);
1896 	}
1897 
1898 	if (error != 0) {
1899 		mtx_lock_spin(&sched_lock);
1900 		uq->uq_inherited_pri = old_inherited_pri;
1901 		pri = PRI_MAX;
1902 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1903 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1904 			if (uq2 != NULL) {
1905 				if (pri > UPRI(uq2->uq_thread))
1906 					pri = UPRI(uq2->uq_thread);
1907 			}
1908 		}
1909 		if (pri > uq->uq_inherited_pri)
1910 			pri = uq->uq_inherited_pri;
1911 		sched_unlend_user_prio(td, pri);
1912 		mtx_unlock_spin(&sched_lock);
1913 	}
1914 
1915 out:
1916 	umtxq_lock(&uq->uq_key);
1917 	umtxq_unbusy(&uq->uq_key);
1918 	umtxq_unlock(&uq->uq_key);
1919 	umtx_key_release(&uq->uq_key);
1920 	return (error);
1921 }
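
/*
 * Note on the ceiling encoding above (an inference from the code, not
 * a documented contract): m_ceilings[] holds POSIX-style values where
 * a larger number means a more important priority; RTP_PRIO_MAX -
 * ceiling converts that into the kernel's scale, where a numerically
 * lower priority value is more important, before adding
 * PRI_MIN_REALTIME.
 */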
1922 
1923 /*
1924  * Unlock a PP mutex.
1925  */
1926 static int
1927 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
1928 {
1929 	struct umtx_key key;
1930 	struct umtx_q *uq, *uq2;
1931 	struct umtx_pi *pi;
1932 	uint32_t owner, id;
1933 	uint32_t rceiling;
1934 	int error, pri, new_inherited_pri, su;
1935 
1936 	id = td->td_tid;
1937 	uq = td->td_umtxq;
1938 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1939 
1940 	/*
1941 	 * Make sure we own this mtx.
1942 	 */
1943 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1944 	if (owner == -1)
1945 		return (EFAULT);
1946 
1947 	if ((owner & ~UMUTEX_CONTESTED) != id)
1948 		return (EPERM);
1949 
1950 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
1951 	if (error != 0)
1952 		return (error);
1953 
1954 	if (rceiling == -1)
1955 		new_inherited_pri = PRI_MAX;
1956 	else {
1957 		rceiling = RTP_PRIO_MAX - rceiling;
1958 		if (rceiling > RTP_PRIO_MAX)
1959 			return (EINVAL);
1960 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
1961 	}
1962 
1963 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1964 	    &key)) != 0)
1965 		return (error);
1966 	umtxq_lock(&key);
1967 	umtxq_busy(&key);
1968 	umtxq_unlock(&key);
	/*
	 * For a priority-protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED, so that userland always enters the kernel
	 * to lock the mutex; this is necessary because the thread's
	 * priority has to be adjusted for such a mutex.
	 */
1975 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
1976 		UMUTEX_CONTESTED);
1977 
1978 	umtxq_lock(&key);
1979 	if (error == 0)
1980 		umtxq_signal(&key, 1);
1981 	umtxq_unbusy(&key);
1982 	umtxq_unlock(&key);
1983 
1984 	if (error == -1)
1985 		error = EFAULT;
1986 	else {
1987 		mtx_lock_spin(&sched_lock);
1988 		if (su != 0)
1989 			uq->uq_inherited_pri = new_inherited_pri;
1990 		pri = PRI_MAX;
1991 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1992 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1993 			if (uq2 != NULL) {
1994 				if (pri > UPRI(uq2->uq_thread))
1995 					pri = UPRI(uq2->uq_thread);
1996 			}
1997 		}
1998 		if (pri > uq->uq_inherited_pri)
1999 			pri = uq->uq_inherited_pri;
2000 		sched_unlend_user_prio(td, pri);
2001 		mtx_unlock_spin(&sched_lock);
2002 	}
2003 	umtx_key_release(&key);
2004 	return (error);
2005 }
2006 
2007 static int
2008 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2009 	uint32_t *old_ceiling)
2010 {
2011 	struct umtx_q *uq;
2012 	uint32_t save_ceiling;
2013 	uint32_t owner, id;
2014 	uint32_t flags;
2015 	int error;
2016 
2017 	flags = fuword32(&m->m_flags);
2018 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2019 		return (EINVAL);
2020 	if (ceiling > RTP_PRIO_MAX)
2021 		return (EINVAL);
2022 	id = td->td_tid;
2023 	uq = td->td_umtxq;
2024 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2025 	   &uq->uq_key)) != 0)
2026 		return (error);
2027 	for (;;) {
2028 		umtxq_lock(&uq->uq_key);
2029 		umtxq_busy(&uq->uq_key);
2030 		umtxq_unlock(&uq->uq_key);
2031 
2032 		save_ceiling = fuword32(&m->m_ceilings[0]);
2033 
2034 		owner = casuword32(&m->m_owner,
2035 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2036 
2037 		if (owner == UMUTEX_CONTESTED) {
2038 			suword32(&m->m_ceilings[0], ceiling);
2039 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2040 				UMUTEX_CONTESTED);
2041 			error = 0;
2042 			break;
2043 		}
2044 
2045 		/* The address was invalid. */
2046 		if (owner == -1) {
2047 			error = EFAULT;
2048 			break;
2049 		}
2050 
2051 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2052 			suword32(&m->m_ceilings[0], ceiling);
2053 			error = 0;
2054 			break;
2055 		}
2056 
2057 		/*
2058 		 * If we caught a signal, we have retried and now
2059 		 * exit immediately.
2060 		 */
2061 		if (error != 0)
2062 			break;
2063 
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
2069 		umtxq_lock(&uq->uq_key);
2070 		umtxq_insert(uq);
2071 		umtxq_unbusy(&uq->uq_key);
2072 		error = umtxq_sleep(uq, "umtxpp", 0);
2073 		umtxq_remove(uq);
2074 		umtxq_unlock(&uq->uq_key);
2075 	}
2076 	umtxq_lock(&uq->uq_key);
2077 	if (error == 0)
2078 		umtxq_signal(&uq->uq_key, INT_MAX);
2079 	umtxq_unbusy(&uq->uq_key);
2080 	umtxq_unlock(&uq->uq_key);
2081 	umtx_key_release(&uq->uq_key);
2082 	if (error == 0 && old_ceiling != NULL)
2083 		suword32(old_ceiling, save_ceiling);
2084 	return (error);
2085 }
2086 
2087 static int
2088 _do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2089 	int try)
2090 {
2091 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2092 	case 0:
2093 		return (_do_lock_normal(td, m, flags, timo, try));
2094 	case UMUTEX_PRIO_INHERIT:
2095 		return (_do_lock_pi(td, m, flags, timo, try));
2096 	case UMUTEX_PRIO_PROTECT:
2097 		return (_do_lock_pp(td, m, flags, timo, try));
2098 	}
2099 	return (EINVAL);
2100 }
2101 
2102 /*
2103  * Lock a userland POSIX mutex.
2104  */
2105 static int
2106 do_lock_umutex(struct thread *td, struct umutex *m,
2107 	struct timespec *timeout, int try)
2108 {
2109 	struct timespec ts, ts2, ts3;
2110 	struct timeval tv;
2111 	uint32_t flags;
2112 	int error;
2113 
2114 	flags = fuword32(&m->m_flags);
2115 	if (flags == -1)
2116 		return (EFAULT);
2117 
2118 	if (timeout == NULL) {
2119 		error = _do_lock_umutex(td, m, flags, 0, try);
2120 		/* Mutex locking is restarted if it is interrupted. */
2121 		if (error == EINTR)
2122 			error = ERESTART;
2123 	} else {
2124 		getnanouptime(&ts);
2125 		timespecadd(&ts, timeout);
2126 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2127 		for (;;) {
2128 			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
2129 			if (error != ETIMEDOUT)
2130 				break;
2131 			getnanouptime(&ts2);
2132 			if (timespeccmp(&ts2, &ts, >=)) {
2133 				error = ETIMEDOUT;
2134 				break;
2135 			}
2136 			ts3 = ts;
2137 			timespecsub(&ts3, &ts2);
2138 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2139 		}
2140 		/* Timed-locking is not restarted. */
2141 		if (error == ERESTART)
2142 			error = EINTR;
2143 	}
2144 	return (error);
2145 }
2146 
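/*
 * Worked example for the timed-lock loop above (numbers are
 * illustrative): with a 2.5s relative timeout taken at uptime 100.0s,
 * the deadline "ts" is 102.5s.  If the tvtohz()-bounded sleep returns
 * ETIMEDOUT at uptime 101.0s, timespeccmp(&ts2, &ts, >=) is false, so
 * the remaining 1.5s is recomputed with timespecsub() and the lock is
 * retried; only once the uptime reaches 102.5s does the loop report
 * ETIMEDOUT to the caller.
 */
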
2147 /*
2148  * Unlock a userland POSIX mutex.
2149  */
2150 static int
2151 do_unlock_umutex(struct thread *td, struct umutex *m)
2152 {
2153 	uint32_t flags;
2154 
2155 	flags = fuword32(&m->m_flags);
2156 	if (flags == -1)
2157 		return (EFAULT);
2158 
2159 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2160 	case 0:
2161 		return (do_unlock_normal(td, m, flags));
2162 	case UMUTEX_PRIO_INHERIT:
2163 		return (do_unlock_pi(td, m, flags));
2164 	case UMUTEX_PRIO_PROTECT:
2165 		return (do_unlock_pp(td, m, flags));
2166 	}
2167 
2168 	return (EINVAL);
2169 }
2170 
2171 int
2172 _umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2173     /* struct umtx *umtx */
2174 {
2175 	return (_do_lock_umtx(td, uap->umtx, td->td_tid, 0));
2176 }
2177 
2178 int
2179 _umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2180     /* struct umtx *umtx */
2181 {
2182 	return (do_unlock_umtx(td, uap->umtx, td->td_tid));
2183 }
2184 
2185 static int
2186 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2187 {
2188 	struct timespec *ts, timeout;
2189 	int error;
2190 
2191 	/* Allow a null timespec (wait forever). */
2192 	if (uap->uaddr2 == NULL)
2193 		ts = NULL;
2194 	else {
2195 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2196 		if (error != 0)
2197 			return (error);
2198 		if (timeout.tv_nsec >= 1000000000 ||
2199 		    timeout.tv_nsec < 0) {
2200 			return (EINVAL);
2201 		}
2202 		ts = &timeout;
2203 	}
2204 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2205 }
2206 
2207 static int
2208 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2209 {
2210 	return (do_unlock_umtx(td, uap->obj, uap->val));
2211 }
2212 
2213 static int
2214 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2215 {
2216 	struct timespec *ts, timeout;
2217 	int error;
2218 
2219 	if (uap->uaddr2 == NULL)
2220 		ts = NULL;
2221 	else {
2222 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2223 		if (error != 0)
2224 			return (error);
2225 		if (timeout.tv_nsec >= 1000000000 ||
2226 		    timeout.tv_nsec < 0)
2227 			return (EINVAL);
2228 		ts = &timeout;
2229 	}
2230 	return (do_wait(td, uap->obj, uap->val, ts, 0));
2231 }
2232 
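/*
 * Illustrative userland sketch, not part of this file (names are
 * assumptions): a bounded wait on a umtx word through the handler
 * above.  The kernel sleeps only while the word still equals "val".
 */
#if 0
	struct timespec to = { .tv_sec = 1, .tv_nsec = 0 };
	u_long u = 0;

	/* Sleep while u == 0, for at most one second. */
	if (_umtx_op(&u, UMTX_OP_WAIT, 0, NULL, &to) == -1 &&
	    errno == ETIMEDOUT)
		warnx("no wakeup within one second");
#endif
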
2233 static int
2234 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2235 {
2236 	return (kern_umtx_wake(td, uap->obj, uap->val));
2237 }
2238 
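/*
 * Illustrative counterpart to the UMTX_OP_WAIT sketch above (again an
 * assumption, not library code): publish the state change, then wake a
 * bounded number of threads sleeping on the word's address.
 */
#if 0
	u = 1;						/* store new value first */
	_umtx_op(&u, UMTX_OP_WAKE, 1, NULL, NULL);	/* wake one waiter */
#endif
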
2239 static int
2240 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2241 {
2242 	struct timespec *ts, timeout;
2243 	int error;
2244 
2245 	/* Allow a null timespec (wait forever). */
2246 	if (uap->uaddr2 == NULL)
2247 		ts = NULL;
2248 	else {
2249 		error = copyin(uap->uaddr2, &timeout,
2250 		    sizeof(timeout));
2251 		if (error != 0)
2252 			return (error);
2253 		if (timeout.tv_nsec >= 1000000000 ||
2254 		    timeout.tv_nsec < 0) {
2255 			return (EINVAL);
2256 		}
2257 		ts = &timeout;
2258 	}
2259 	return (do_lock_umutex(td, uap->obj, ts, 0));
2260 }
2261 
2262 static int
2263 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2264 {
2265 	return (do_lock_umutex(td, uap->obj, NULL, 1));
2266 }
2267 
2268 static int
2269 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2270 {
2271 	return (do_unlock_umutex(td, uap->obj));
2272 }
2273 
2274 static int
2275 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2276 {
2277 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
2278 }
2279 
2280 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
2281 
2282 static _umtx_op_func op_table[] = {
2283 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
2284 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
2285 	__umtx_op_wait,			/* UMTX_OP_WAIT */
2286 	__umtx_op_wake,			/* UMTX_OP_WAKE */
2287 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2288 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
2289 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
2290 	__umtx_op_set_ceiling		/* UMTX_OP_SET_CEILING */
2291 };
2292 
2293 int
2294 _umtx_op(struct thread *td, struct _umtx_op_args *uap)
2295 {
2296 	if ((unsigned)uap->op < UMTX_OP_MAX)
2297 		return (*op_table[uap->op])(td, uap);
2298 	return (EINVAL);
2299 }
2300 
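/*
 * Illustrative: op codes are range-checked before dispatch, so e.g.
 * (assumed call) _umtx_op(NULL, UMTX_OP_MAX, 0, NULL, NULL) fails with
 * EINVAL without reaching any handler in op_table.
 */
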
2301 #ifdef COMPAT_IA32
2302 
2303 int
2304 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
2305     /* struct umtx *umtx */
2306 {
2307 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
2308 }
2309 
2310 int
2311 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
2312     /* struct umtx *umtx */
2313 {
2314 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
2315 }
2316 
2317 struct timespec32 {
2318 	int32_t tv_sec;
2319 	int32_t tv_nsec;	/* signed, so the tv_nsec < 0 checks work */
2320 };
2321 
2322 static inline int
2323 copyin_timeout32(void *addr, struct timespec *tsp)
2324 {
2325 	struct timespec32 ts32;
2326 	int error;
2327 
2328 	error = copyin(addr, &ts32, sizeof(struct timespec32));
2329 	if (error == 0) {
2330 		tsp->tv_sec = ts32.tv_sec;
2331 		tsp->tv_nsec = ts32.tv_nsec;
2332 	}
2333 	return (error);
2334 }
2335 
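/*
 * Illustrative: a 32-bit process passing { tv_sec = 5, tv_nsec =
 * 250000000 } gets both fields widened into the native struct timespec
 * here, so the callers below validate and convert (tvtohz()) a single
 * representation for both ABIs.
 */
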
2336 static int
2337 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2338 {
2339 	struct timespec *ts, timeout;
2340 	int error;
2341 
2342 	/* Allow a null timespec (wait forever). */
2343 	if (uap->uaddr2 == NULL)
2344 		ts = NULL;
2345 	else {
2346 		error = copyin_timeout32(uap->uaddr2, &timeout);
2347 		if (error != 0)
2348 			return (error);
2349 		if (timeout.tv_nsec >= 1000000000 ||
2350 		    timeout.tv_nsec < 0) {
2351 			return (EINVAL);
2352 		}
2353 		ts = &timeout;
2354 	}
2355 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
2356 }
2357 
2358 static int
2359 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2360 {
2361 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
2362 }
2363 
2364 static int
2365 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2366 {
2367 	struct timespec *ts, timeout;
2368 	int error;
2369 
2370 	if (uap->uaddr2 == NULL)
2371 		ts = NULL;
2372 	else {
2373 		error = copyin_timeout32(uap->uaddr2, &timeout);
2374 		if (error != 0)
2375 			return (error);
2376 		if (timeout.tv_nsec >= 1000000000 ||
2377 		    timeout.tv_nsec < 0)
2378 			return (EINVAL);
2379 		ts = &timeout;
2380 	}
2381 	return (do_wait(td, uap->obj, uap->val, ts, 1));
2382 }
2383 
2384 static int
2385 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
2386 {
2387 	struct timespec *ts, timeout;
2388 	int error;
2389 
2390 	/* Allow a null timespec (wait forever). */
2391 	if (uap->uaddr2 == NULL)
2392 		ts = NULL;
2393 	else {
2394 		error = copyin_timeout32(uap->uaddr2, &timeout);
2395 		if (error != 0)
2396 			return (error);
2397 		if (timeout.tv_nsec >= 1000000000 ||
2398 		    timeout.tv_nsec < 0)
2399 			return (EINVAL);
2400 		ts = &timeout;
2401 	}
2402 	return (do_lock_umutex(td, uap->obj, ts, 0));
2403 }
2404 
2405 static _umtx_op_func op_table_compat32[] = {
2406 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
2407 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
2408 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
2409 	__umtx_op_wake,			/* UMTX_OP_WAKE */
2410 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2411 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
2412 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
2413 	__umtx_op_set_ceiling		/* UMTX_OP_SET_CEILING */
2414 };
2415 
2416 int
2417 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
2418 {
2419 	if ((unsigned)uap->op < UMTX_OP_MAX)
2420 		return (*op_table_compat32[uap->op])(td,
2421 			(struct _umtx_op_args *)uap);
2422 	return (EINVAL);
2423 }
2424 #endif
2425 
2426 void
2427 umtx_thread_init(struct thread *td)
2428 {
2429 	td->td_umtxq = umtxq_alloc();
2430 	td->td_umtxq->uq_thread = td;
2431 }
2432 
2433 void
2434 umtx_thread_fini(struct thread *td)
2435 {
2436 	umtxq_free(td->td_umtxq);
2437 }
2438 
2439 /*
2440  * Called when a new thread is created, e.g. during fork().
2441  */
2442 void
2443 umtx_thread_alloc(struct thread *td)
2444 {
2445 	struct umtx_q *uq;
2446 
2447 	uq = td->td_umtxq;
2448 	uq->uq_inherited_pri = PRI_MAX;
2449 
2450 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
2451 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
2452 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
2453 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
2454 }
2455 
2456 /*
2457  * exec() hook.
2458  */
2459 static void
2460 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
2461 	struct image_params *imgp __unused)
2462 {
2463 	umtx_thread_cleanup(curthread);
2464 }
2465 
2466 /*
2467  * thread_exit() hook.
2468  */
2469 void
2470 umtx_thread_exit(struct thread *td)
2471 {
2472 	umtx_thread_cleanup(td);
2473 }
2474 
2475 /*
2476  * Clean up a thread's umtx state on exit or exec.
2477  */
2478 static void
2479 umtx_thread_cleanup(struct thread *td)
2480 {
2481 	struct umtx_q *uq;
2482 	struct umtx_pi *pi;
2483 
2484 	if ((uq = td->td_umtxq) == NULL)
2485 		return;
2486 
2487 	mtx_lock_spin(&sched_lock);
2488 	uq->uq_inherited_pri = PRI_MAX;
2489 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
2490 		pi->pi_owner = NULL;
2491 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
2492 	}
2493 	td->td_flags &= ~TDF_UBORROWING;
2494 	mtx_unlock_spin(&sched_lock);
2495 }
2496