/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>	/* uma_zcreate() etc. for umtx_pi_zone */

#ifdef COMPAT_IA32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

#define TYPE_SIMPLE_LOCK	0
#define TYPE_SIMPLE_WAIT	1
#define TYPE_NORMAL_UMUTEX	2
#define TYPE_PI_UMUTEX		3
#define TYPE_PP_UMUTEX		4
#define TYPE_CV			5

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link PI mutexes held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A waiter on a userland synchronization object. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The thread that is waiting. */
	struct thread		*uq_thread;

	/*
	 * The PI mutex this thread is blocked on.  Readers may hold
	 * either the chain lock or sched_lock; writers must hold both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes owned by this thread that other threads contend for */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_head	uc_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* List of all PI mutexes hashed to this chain */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Don't propagate time-sharing priority; there is a security reason:
 * a user could simply create a PI mutex, have thread A lock it, and
 * have another thread B block on it.  Because B is sleeping, its
 * priority would be boosted, which would boost A's priority through
 * priority propagation as well, and A's priority would then never be
 * lowered even while it used 100% of the CPU.  This would be unfair
 * to other processes.
 */

#define UPRI(td)	(((td)->td_ksegrp->kg_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_ksegrp->kg_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_ksegrp->kg_user_pri)

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert(struct umtx_q *uq);
static void umtxq_remove(struct umtx_q *uq);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(void);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

static void
umtxq_sysinit(void *arg __unused)
{
	int i;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < UMTX_CHAINS; ++i) {
		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
			 MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&umtxq_chains[i].uc_queue);
		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
		umtxq_chains[i].uc_busy = 0;
		umtxq_chains[i].uc_waiters = 0;
	}
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}

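/*
 * Hash a key into a chain index using multiplicative (Fibonacci-style)
 * hashing: GOLDEN_RATIO_PRIME is a prime near 2^32 divided by the
 * golden ratio, so the multiplication scatters nearby addresses across
 * the word.  With a 32-bit word the shift keeps the top seven bits,
 * selecting one of the UMTX_CHAINS (128) chains; the final modulo is a
 * safety net for other word sizes.
 */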
static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	return (&umtxq_chains[key->hash]);
}

/*
 * Set the chain to the busy state when a following operation
 * may block (a kernel mutex cannot be held across it).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	while (uc->uc_busy != 0) {
		uc->uc_waiters++;
		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
		uc->uc_waiters--;
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}
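
/*
 * Note on the two levels of exclusion here: the chain mutex protects
 * the queue and PI lists themselves, while the busy flag serializes
 * whole operations that must drop the mutex to touch pageable user
 * memory.  The pattern used by the lock/unlock paths below is roughly:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);	(may sleep waiting for the chain)
 *	umtxq_unlock(&key);
 *	casuword32(...);	(user access, may fault or sleep)
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);
 *	umtxq_unlock(&key);
 */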

/*
 * Insert a thread onto the umtx queue.
 */
static inline void
umtxq_insert(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
	uq->uq_flags |= UQF_UMTXQ;
}

/*
 * Remove a thread from the umtx queue.
 */
static inline void
umtxq_remove(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
		uq->uq_flags &= ~UQF_UMTXQ;
	}
}

/*
 * Check if there are multiple waiters.
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
		}
	}
	return (count);
}

/*
 * Check if there are multiple PI waiters and return the first
 * waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
			*first = uq;
		}
	}
	return (count);
}

/*
 * Wake up threads waiting on a userland object.
 */
static int
umtxq_signal(struct umtx_key *key, int n_wake)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq, *next;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
		if (umtx_key_match(&uq->uq_key, key)) {
			umtxq_remove(uq);
			wakeup(uq);
			if (++ret >= n_wake)
				break;
		}
	}
	return (ret);
}

/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleep state.  Before sleeping, check if
 * the thread was removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}
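
/*
 * Two details above are worth spelling out.  msleep() reports an
 * expired timeout as EWOULDBLOCK, which is translated to ETIMEDOUT
 * for the callers here.  The UQF_UMTXQ check under the chain lock is
 * what prevents lost wakeups: a waker removes the queue entry before
 * calling wakeup(), so a thread that was signalled between queueing
 * and sleeping returns immediately instead of sleeping forever.
 */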

/*
 * Convert a userspace address into a unique logical address.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else if (share == PROCESS_SHARE || share == AUTO_SHARE) {
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
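
/*
 * Keying shared objects by (vm_object, offset) rather than by virtual
 * address is what makes cross-process wakeups work: two processes that
 * map the same object at different addresses still compute the same
 * key and therefore use the same wait-queue chain.  Private objects
 * are keyed by (vmspace, address) instead, which is cheaper and can
 * never collide with another process.
 */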

/*
 * Release key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * it can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried once and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails,
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
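
/*
 * A sketch of the userland fast path that pairs with the loop above
 * (illustrative only; the real libthr code differs in detail):
 *
 *	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id))
 *		return (0);		(uncontested acquire)
 *	return (_umtx_lock(umtx));	(fall back to the kernel)
 *
 * Only contested locks reach the kernel, which is why the slow path
 * above starts by retrying the compare-and-set before queueing.
 */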

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
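
/*
 * The timed variant above turns the relative timeout into an absolute
 * deadline on the uptime clock and re-sleeps whenever the sleep times
 * out early.  tvtohz() only works at clock-tick granularity, so a
 * single msleep() timeout cannot be trusted by itself; rechecking
 * against the absolute deadline keeps the total wait accurate.
 */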

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

#ifdef COMPAT_IA32

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * it can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried once and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails,
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare value; sleep on the address if the value
 * has not changed.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "ucond", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	   &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}
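
/*
 * Illustrative use of the wait/wake pair (a sketch, not from this
 * file): a userland one-shot event could be built roughly as
 *
 *	while (atomic_load_acq_long(&ev->flag) == 0)
 *		_umtx_op(&ev->flag, UMTX_OP_WAIT, 0, NULL, NULL);
 * and on the posting side:
 *	atomic_store_rel_long(&ev->flag, 1);
 *	_umtx_op(&ev->flag, UMTX_OP_WAKE, INT_MAX, NULL, NULL);
 *
 * do_wait() rechecks the value after queueing the waiter, so a wakeup
 * delivered between the load and the sleep is not lost.
 */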

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure:
	 * it can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (try != 0)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried once and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails,
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(void)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | M_WAITOK);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on the PI mutex's blocked queue after
 * its priority has been changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&sched_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		if (UPRI(td) <= pri)
			return;

		sched_lend_user_prio(td, pri);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}
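
/*
 * A worked example of the loop above (priority values are FreeBSD
 * style, lower is stronger): if thread C at priority 100 blocks on a
 * PI mutex owned by B at 120, B is lent priority 100.  If B is itself
 * blocked on another PI mutex owned by A at 140, the next iteration
 * lends 100 to A as well.  The walk stops at an owner that already
 * runs at least that strongly, or at a thread that is not blocked on
 * a PI mutex itself.
 */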

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by another thread.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		pi = uq_owner->uq_pi_blocked;
	}
}

/*
 * Insert a PI mutex into the owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&sched_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&sched_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&sched_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&sched_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
	}
	mtx_unlock_spin(&sched_lock);
	return (0);
}

/*
 * Adjust a thread's position in the queue of the PI mutex it is
 * blocked on; this may trigger a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;

	mtx_assert(&sched_lock, MA_OWNED);
	MPASS(TD_ON_UPILOCK(td));

	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_insert(uq);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Currently, we only support process-private PI mutexes;
		 * non-contended PI mutexes are locked in userland.  A
		 * process-shared PI mutex should always be initialized
		 * and registered in the kernel, and locking should always
		 * be done by the kernel to avoid security problems.  For
		 * a process-private PI mutex, we can find the owner
		 * thread and boost its priority safely.
		 */
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&sched_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	} else {
		mtx_lock_spin(&sched_lock);
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	td->td_flags |= TDF_UPIBLOCKED;
	mtx_unlock_spin(&sched_lock);
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&sched_lock);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);
	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
		}
	}
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&sched_lock);
	uq->uq_pi_blocked = NULL;
	td->td_flags &= ~TDF_UPIBLOCKED;
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);

	return (error);
}

/*
 * Increment the reference count of a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrease the reference count of a PI mutex; when the count drops
 * to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;
	int free = 0;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&sched_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&sched_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		free = 1;
	}
	if (free)
		umtx_pi_free(pi);
}

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		pi = NULL;
		umtxq_lock(&uq->uq_key);
		pi = umtx_pi_lookup(&uq->uq_key);
		if (pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc();
			new_pi->pi_key = uq->uq_key;
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL)
				umtx_pi_free(new_pi);
			else {
				umtx_pi_insert(new_pi);
				pi = new_pi;
			}
		}

		umtx_pi_ref(pi);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Care must be exercised when dealing with the umtx
		 * structure: it can fault on any access.
		 */

		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			umtxq_lock(&uq->uq_key);
			umtx_pi_unref(pi);
			umtxq_unlock(&uq->uq_key);
			pi = NULL;
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried once and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails,
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		umtx_pi_unref(pi);
		umtxq_unlock(&uq->uq_key);
		pi = NULL;
	}

	if (pi != NULL) {
		umtxq_lock(&uq->uq_key);
		umtx_pi_unref(pi);
		umtxq_unlock(&uq->uq_key);
	}

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		pi = uq_first->uq_pi_blocked;
		if (pi->pi_owner != curthread) {
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			/* userland messed up the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		mtx_lock_spin(&sched_lock);
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		sched_unlend_user_prio(curthread, pri);
		mtx_unlock_spin(&sched_lock);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (uq_first != NULL)
		umtxq_signal_thread(uq_first);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (suser(td) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&sched_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&sched_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
		}
		mtx_unlock_spin(&sched_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried once and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&sched_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&sched_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
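
/*
 * A worked example of the ceiling arithmetic above, assuming
 * RTP_PRIO_MAX is 31: a userland ceiling of 31 (the strongest) yields
 * uq_inherited_pri = PRI_MIN_REALTIME + (31 - 31) = PRI_MIN_REALTIME,
 * while a ceiling of 0 yields PRI_MIN_REALTIME + 31.  Larger userland
 * ceilings thus map to numerically smaller, i.e. stronger, kernel
 * priorities, and the unsigned subtraction makes any out-of-range
 * ceiling wrap above RTP_PRIO_MAX and fail the range check.
 */
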
1919 
1920 /*
1921  * Unlock a PP mutex.
1922  */
1923 static int
1924 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
1925 {
1926 	struct umtx_key key;
1927 	struct umtx_q *uq, *uq2;
1928 	struct umtx_pi *pi;
1929 	uint32_t owner, id;
1930 	uint32_t rceiling;
1931 	int error, pri, new_inherited_pri, su;
1932 
1933 	id = td->td_tid;
1934 	uq = td->td_umtxq;
1935 	su = (suser(td) == 0);
1936 
1937 	/*
1938 	 * Make sure we own this mtx.
1939 	 */
1940 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1941 	if (owner == -1)
1942 		return (EFAULT);
1943 
1944 	if ((owner & ~UMUTEX_CONTESTED) != id)
1945 		return (EPERM);
1946 
1947 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
1948 	if (error != 0)
1949 		return (error);
1950 
1951 	if (rceiling == -1)
1952 		new_inherited_pri = PRI_MAX;
1953 	else {
1954 		rceiling = RTP_PRIO_MAX - rceiling;
1955 		if (rceiling > RTP_PRIO_MAX)
1956 			return (EINVAL);
1957 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
1958 	}
1959 
1960 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1961 	    &key)) != 0)
1962 		return (error);
1963 	umtxq_lock(&key);
1964 	umtxq_busy(&key);
1965 	umtxq_unlock(&key);
1966 	/*
1967 	 * For priority protected mutex, always set unlocked state
1968 	 * to UMUTEX_CONTESTED, so that userland always enters kernel
1969 	 * to lock the mutex, it is necessary because thread priority
1970 	 * has to be adjusted for such mutex.
1971 	 */
1972 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
1973 		UMUTEX_CONTESTED);
1974 
1975 	umtxq_lock(&key);
1976 	if (error == 0)
1977 		umtxq_signal(&key, 1);
1978 	umtxq_unbusy(&key);
1979 	umtxq_unlock(&key);
1980 
1981 	if (error == -1)
1982 		error = EFAULT;
1983 	else {
1984 		mtx_lock_spin(&sched_lock);
1985 		if (su != 0)
1986 			uq->uq_inherited_pri = new_inherited_pri;
1987 		pri = PRI_MAX;
1988 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1989 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1990 			if (uq2 != NULL) {
1991 				if (pri > UPRI(uq2->uq_thread))
1992 					pri = UPRI(uq2->uq_thread);
1993 			}
1994 		}
1995 		if (pri > uq->uq_inherited_pri)
1996 			pri = uq->uq_inherited_pri;
1997 		sched_unlend_user_prio(td, pri);
1998 		mtx_unlock_spin(&sched_lock);
1999 	}
2000 	umtx_key_release(&key);
2001 	return (error);
2002 }
2003 
static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	   &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		save_ceiling = fuword32(&m->m_ceilings[0]);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			suword32(&m->m_ceilings[0], ceiling);
			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
				UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal, we have retried once and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}

static int
_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
	int try)
{
	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
	case 0:
		return (_do_lock_normal(td, m, flags, timo, try));
	case UMUTEX_PRIO_INHERIT:
		return (_do_lock_pi(td, m, flags, timo, try));
	case UMUTEX_PRIO_PROTECT:
		return (_do_lock_pp(td, m, flags, timo, try));
	}
	return (EINVAL);
}

2099 /*
2100  * Lock a userland POSIX mutex.
2101  */
2102 static int
2103 do_lock_umutex(struct thread *td, struct umutex *m,
2104 	struct timespec *timeout, int try)
2105 {
2106 	struct timespec ts, ts2, ts3;
2107 	struct timeval tv;
2108 	uint32_t flags;
2109 	int error;
2110 
2111 	flags = fuword32(&m->m_flags);
2112 	if (flags == -1)
2113 		return (EFAULT);
2114 
2115 	if (timeout == NULL) {
2116 		error = _do_lock_umutex(td, m, flags, 0, try);
2117 		/* Mutex locking is restarted if it is interrupted. */
2118 		if (error == EINTR)
2119 			error = ERESTART;
2120 	} else {
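		/*
		 * Convert the relative timeout into an absolute
		 * deadline; if an attempt times out before the
		 * deadline is reached, retry with the remaining time.
		 */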
2121 		getnanouptime(&ts);
2122 		timespecadd(&ts, timeout);
2123 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2124 		for (;;) {
2125 			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
2126 			if (error != ETIMEDOUT)
2127 				break;
2128 			getnanouptime(&ts2);
2129 			if (timespeccmp(&ts2, &ts, >=)) {
2130 				error = ETIMEDOUT;
2131 				break;
2132 			}
2133 			ts3 = ts;
2134 			timespecsub(&ts3, &ts2);
2135 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2136 		}
2137 		/* Timed-locking is not restarted. */
2138 		if (error == ERESTART)
2139 			error = EINTR;
2140 	}
2141 	return (error);
2142 }
2143 
2144 /*
2145  * Unlock a userland POSIX mutex.
2146  */
2147 static int
2148 do_unlock_umutex(struct thread *td, struct umutex *m)
2149 {
2150 	uint32_t flags;
2151 
2152 	flags = fuword32(&m->m_flags);
2153 	if (flags == -1)
2154 		return (EFAULT);
2155 
2156 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2157 	case 0:
2158 		return (do_unlock_normal(td, m, flags));
2159 	case UMUTEX_PRIO_INHERIT:
2160 		return (do_unlock_pi(td, m, flags));
2161 	case UMUTEX_PRIO_PROTECT:
2162 		return (do_unlock_pp(td, m, flags));
2163 	}
2164 
2165 	return (EINVAL);
2166 }
2167 
2168 int
2169 _umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2170     /* struct umtx *umtx */
2171 {
2172 	return (_do_lock_umtx(td, uap->umtx, td->td_tid, 0));
2173 }
2174 
2175 int
2176 _umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2177     /* struct umtx *umtx */
2178 {
2179 	return (do_unlock_umtx(td, uap->umtx, td->td_tid));
2180 }
2181 
2182 static int
2183 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2184 {
2185 	struct timespec *ts, timeout;
2186 	int error;
2187 
2188 	/* Allow a null timespec (wait forever). */
2189 	if (uap->uaddr2 == NULL)
2190 		ts = NULL;
2191 	else {
2192 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2193 		if (error != 0)
2194 			return (error);
2195 		if (timeout.tv_nsec >= 1000000000 ||
2196 		    timeout.tv_nsec < 0) {
2197 			return (EINVAL);
2198 		}
2199 		ts = &timeout;
2200 	}
2201 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2202 }
2203 
2204 static int
2205 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2206 {
2207 	return (do_unlock_umtx(td, uap->obj, uap->val));
2208 }
2209 
2210 static int
2211 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2212 {
2213 	struct timespec *ts, timeout;
2214 	int error;
2215 
2216 	if (uap->uaddr2 == NULL)
2217 		ts = NULL;
2218 	else {
2219 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2220 		if (error != 0)
2221 			return (error);
2222 		if (timeout.tv_nsec >= 1000000000 ||
2223 		    timeout.tv_nsec < 0)
2224 			return (EINVAL);
2225 		ts = &timeout;
2226 	}
2227 	return (do_wait(td, uap->obj, uap->val, ts, 0));
2228 }
2229 
2230 static int
2231 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2232 {
2233 	return (kern_umtx_wake(td, uap->obj, uap->val));
2234 }
2235 
2236 static int
2237 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2238 {
2239 	struct timespec *ts, timeout;
2240 	int error;
2241 
2242 	/* Allow a null timespec (wait forever). */
2243 	if (uap->uaddr2 == NULL)
2244 		ts = NULL;
2245 	else {
2246 		error = copyin(uap->uaddr2, &timeout,
2247 		    sizeof(timeout));
2248 		if (error != 0)
2249 			return (error);
2250 		if (timeout.tv_nsec >= 1000000000 ||
2251 		    timeout.tv_nsec < 0) {
2252 			return (EINVAL);
2253 		}
2254 		ts = &timeout;
2255 	}
2256 	return (do_lock_umutex(td, uap->obj, ts, 0));
2257 }
2258 
2259 static int
2260 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2261 {
2262 	return (do_lock_umutex(td, uap->obj, NULL, 1));
2263 }
2264 
2265 static int
2266 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2267 {
2268 	return (do_unlock_umutex(td, uap->obj));
2269 }
2270 
2271 static int
2272 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2273 {
2274 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
2275 }
2276 
2277 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
2278 
2279 static _umtx_op_func op_table[] = {
2280 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
2281 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
2282 	__umtx_op_wait,			/* UMTX_OP_WAIT */
2283 	__umtx_op_wake,			/* UMTX_OP_WAKE */
2284 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2285 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
2286 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
2287 	__umtx_op_set_ceiling		/* UMTX_OP_SET_CEILING */
2288 };
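/*
 * Userland reaches the handlers above through the _umtx_op(2) system
 * call.  A minimal sketch of a timed lock/unlock pair on a normal
 * umutex (illustrative only; error handling omitted):
 *
 *	struct umutex m = { .m_owner = UMUTEX_UNOWNED };
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
 *
 *	_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, &ts);
 *	_umtx_op(&m, UMTX_OP_MUTEX_UNLOCK, 0, NULL, NULL);
 */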
2289 
2290 int
2291 _umtx_op(struct thread *td, struct _umtx_op_args *uap)
2292 {
2293 	if (uap->op >= 0 && uap->op < UMTX_OP_MAX)
2294 		return (*op_table[uap->op])(td, uap);
2295 	return (EINVAL);
2296 }
2297 
2298 #ifdef COMPAT_IA32
2299 
2300 int
2301 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
2302     /* struct umtx *umtx */
2303 {
2304 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
2305 }
2306 
2307 int
2308 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
2309     /* struct umtx *umtx */
2310 {
2311 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
2312 }
2313 
2314 struct timespec32 {
2315 	u_int32_t tv_sec;
2316 	u_int32_t tv_nsec;
2317 };
2318 
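/*
 * Fetch a 32-bit timespec from a compat32 process and widen it to the
 * native struct timespec.
 */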
2319 static inline int
2320 copyin_timeout32(void *addr, struct timespec *tsp)
2321 {
2322 	struct timespec32 ts32;
2323 	int error;
2324 
2325 	error = copyin(addr, &ts32, sizeof(struct timespec32));
2326 	if (error == 0) {
2327 		tsp->tv_sec = ts32.tv_sec;
2328 		tsp->tv_nsec = ts32.tv_nsec;
2329 	}
2330 	return (error);
2331 }
2332 
2333 static int
2334 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2335 {
2336 	struct timespec *ts, timeout;
2337 	int error;
2338 
2339 	/* Allow a null timespec (wait forever). */
2340 	if (uap->uaddr2 == NULL)
2341 		ts = NULL;
2342 	else {
2343 		error = copyin_timeout32(uap->uaddr2, &timeout);
2344 		if (error != 0)
2345 			return (error);
2346 		if (timeout.tv_nsec >= 1000000000 ||
2347 		    timeout.tv_nsec < 0) {
2348 			return (EINVAL);
2349 		}
2350 		ts = &timeout;
2351 	}
2352 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
2353 }
2354 
2355 static int
2356 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2357 {
2358 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
2359 }
2360 
2361 static int
2362 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2363 {
2364 	struct timespec *ts, timeout;
2365 	int error;
2366 
2367 	if (uap->uaddr2 == NULL)
2368 		ts = NULL;
2369 	else {
2370 		error = copyin_timeout32(uap->uaddr2, &timeout);
2371 		if (error != 0)
2372 			return (error);
2373 		if (timeout.tv_nsec >= 1000000000 ||
2374 		    timeout.tv_nsec < 0)
2375 			return (EINVAL);
2376 		ts = &timeout;
2377 	}
2378 	return (do_wait(td, uap->obj, uap->val, ts, 1));
2379 }
2380 
2381 static int
2382 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
2383 {
2384 	struct timespec *ts, timeout;
2385 	int error;
2386 
2387 	/* Allow a null timespec (wait forever). */
2388 	if (uap->uaddr2 == NULL)
2389 		ts = NULL;
2390 	else {
2391 		error = copyin_timeout32(uap->uaddr2, &timeout);
2392 		if (error != 0)
2393 			return (error);
2394 		if (timeout.tv_nsec >= 1000000000 ||
2395 		    timeout.tv_nsec < 0)
2396 			return (EINVAL);
2397 		ts = &timeout;
2398 	}
2399 	return (do_lock_umutex(td, uap->obj, ts, 0));
2400 }
2401 
2402 static _umtx_op_func op_table_compat32[] = {
2403 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
2404 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
2405 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
2406 	__umtx_op_wake,			/* UMTX_OP_WAKE */
2407 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2408 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
2409 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
2410 	__umtx_op_set_ceiling		/* UMTX_OP_SET_CEILING */
2411 };
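/*
 * This table must stay index-for-index parallel with op_table above;
 * only operations whose userland arguments differ in layout (the
 * 32-bit umtx word and struct timespec32) need dedicated wrappers.
 */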
2412 
2413 int
2414 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
2415 {
2416 	if (uap->op >= 0 && uap->op < UMTX_OP_MAX)
2417 		return (*op_table_compat32[uap->op])(td,
2418 			(struct _umtx_op_args *)uap);
2419 	return (EINVAL);
2420 }
2421 #endif
2422 
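/*
 * Allocate a umtx queue and attach it to a newly initialized thread.
 */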
2423 void
2424 umtx_thread_init(struct thread *td)
2425 {
2426 	td->td_umtxq = umtxq_alloc();
2427 	td->td_umtxq->uq_thread = td;
2428 }
2429 
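/*
 * Release the umtx queue when the thread is torn down.
 */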
2430 void
2431 umtx_thread_fini(struct thread *td)
2432 {
2433 	umtxq_free(td->td_umtxq);
2434 }
2435 
2436 /*
2437  * Called when a new thread is created, e.g. by fork().
2438  */
2439 void
2440 umtx_thread_alloc(struct thread *td)
2441 {
2442 	struct umtx_q *uq;
2443 
2444 	uq = td->td_umtxq;
2445 	uq->uq_inherited_pri = PRI_MAX;
2446 
2447 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
2448 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
2449 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
2450 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
2451 }
2452 
2453 /*
2454  * exec() hook.
2455  */
2456 static void
2457 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
2458 	struct image_params *imgp __unused)
2459 {
2460 	umtx_thread_cleanup(curthread);
2461 }
2462 
2463 /*
2464  * thread_exit() hook.
2465  */
2466 void
2467 umtx_thread_exit(struct thread *td)
2468 {
2469 	umtx_thread_cleanup(td);
2470 }
2471 
2472 /*
2473  * Clean up the thread's umtx state.
2474  */
2475 static void
2476 umtx_thread_cleanup(struct thread *td)
2477 {
2478 	struct umtx_q *uq;
2479 	struct umtx_pi *pi;
2480 
2481 	if ((uq = td->td_umtxq) == NULL)
2482 		return;
2483 
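	/*
	 * Disown any PI mutexes this thread still holds contested and
	 * give back the priority borrowed from their waiters.
	 */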
2484 	mtx_lock_spin(&sched_lock);
2485 	uq->uq_inherited_pri = PRI_MAX;
2486 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
2487 		pi->pi_owner = NULL;
2488 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
2489 	}
2490 	td->td_flags &= ~TDF_UBORROWING;
2491 	mtx_unlock_spin(&sched_lock);
2492 }
2493