xref: /freebsd/sys/kern/kern_umtx.c (revision 050570efa79efcc9cf5adeb545f1a679c8dc377b)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/mutex.h>
38 #include <sys/priv.h>
39 #include <sys/proc.h>
40 #include <sys/sched.h>
41 #include <sys/smp.h>
42 #include <sys/sysctl.h>
43 #include <sys/sysent.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/syscallsubr.h>
47 #include <sys/eventhandler.h>
48 #include <sys/umtx.h>
49 
50 #include <vm/vm.h>
51 #include <vm/vm_param.h>
52 #include <vm/pmap.h>
53 #include <vm/vm_map.h>
54 #include <vm/vm_object.h>
55 
56 #include <machine/cpu.h>
57 
58 #ifdef COMPAT_FREEBSD32
59 #include <compat/freebsd32/freebsd32_proto.h>
60 #endif
61 
62 enum {
63 	TYPE_SIMPLE_WAIT,
64 	TYPE_CV,
65 	TYPE_SEM,
66 	TYPE_SIMPLE_LOCK,
67 	TYPE_NORMAL_UMUTEX,
68 	TYPE_PI_UMUTEX,
69 	TYPE_PP_UMUTEX,
70 	TYPE_RWLOCK
71 };
72 
73 #define _UMUTEX_TRY		1
74 #define _UMUTEX_WAIT		2
75 
76 /* Key to represent a unique userland synchronization object */
77 struct umtx_key {
78 	int	hash;
79 	int	type;
80 	int	shared;
81 	union {
82 		struct {
83 			vm_object_t	object;
84 			uintptr_t	offset;
85 		} shared;
86 		struct {
87 			struct vmspace	*vs;
88 			uintptr_t	addr;
89 		} private;
90 		struct {
91 			void		*a;
92 			uintptr_t	b;
93 		} both;
94 	} info;
95 };
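
/*
 * Note on key identity: umtx_key_match() and umtxq_hash() below work
 * through the "both" arm of the union, so the two identity schemes
 * collapse to one (pointer, integer) pair:
 *
 *	process-shared:  a = backing vm_object, b = an offset derived
 *	                 from the map entry, independent of where the
 *	                 object is mapped
 *	process-private: a = owning vmspace, b = user virtual address
 *
 * Distinct processes mapping the same shared page therefore build equal
 * keys regardless of mapping address, while private keys can only match
 * within one address space.
 */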
96 
97 /* Priority inheritance mutex info. */
98 struct umtx_pi {
99 	/* Owner thread */
100 	struct thread		*pi_owner;
101 
102 	/* Reference count */
103 	int			pi_refcount;
104 
105 	/* List entry to link umtx held by thread */
106 	TAILQ_ENTRY(umtx_pi)	pi_link;
107 
108 	/* List entry in hash */
109 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
110 
111 	/* List for waiters */
112 	TAILQ_HEAD(,umtx_q)	pi_blocked;
113 
114 	/* Identify a userland lock object */
115 	struct umtx_key		pi_key;
116 };
117 
118 /* A userland synchronization object user. */
119 struct umtx_q {
120 	/* Linked list for the hash. */
121 	TAILQ_ENTRY(umtx_q)	uq_link;
122 
123 	/* Umtx key. */
124 	struct umtx_key		uq_key;
125 
126 	/* Umtx flags. */
127 	int			uq_flags;
128 #define UQF_UMTXQ	0x0001
129 
130 	/* The thread which waits on this object. */
131 	struct thread		*uq_thread;
132 
133 	/*
134 	 * The PI mutex this thread is blocked on.  Reads may be done
135 	 * holding either the chain lock or umtx_lock; writes must hold
136 	 * both the chain lock and umtx_lock.
137 	 */
138 	struct umtx_pi		*uq_pi_blocked;
139 
140 	/* On blocked list */
141 	TAILQ_ENTRY(umtx_q)	uq_lockq;
142 
143 	/* Contested PI mutexes owned by this thread */
144 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
145 
146 	/* Inherited priority from PP mutex */
147 	u_char			uq_inherited_pri;
148 
149 	/* Spare queue ready to be reused */
150 	struct umtxq_queue	*uq_spare_queue;
151 
152 	/* The queue we are on */
153 	struct umtxq_queue	*uq_cur_queue;
154 };
155 
156 TAILQ_HEAD(umtxq_head, umtx_q);
157 
158 /* Per-key wait-queue */
159 struct umtxq_queue {
160 	struct umtxq_head	head;
161 	struct umtx_key		key;
162 	LIST_ENTRY(umtxq_queue)	link;
163 	int			length;
164 };
165 
166 LIST_HEAD(umtxq_list, umtxq_queue);
167 
168 /* Userland lock object's wait-queue chain */
169 struct umtxq_chain {
170 	/* Lock for this chain. */
171 	struct mtx		uc_lock;
172 
173 	/* List of sleep queues. */
174 	struct umtxq_list	uc_queue[2];
175 #define UMTX_SHARED_QUEUE	0
176 #define UMTX_EXCLUSIVE_QUEUE	1
177 
178 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
179 
180 	/* Busy flag */
181 	char			uc_busy;
182 
183 	/* Chain lock waiters */
184 	int			uc_waiters;
185 
186 	/* All PI in the list */
187 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
188 
189 };
190 
191 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
192 #define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
193 
194 /*
195  * Don't propagate time-sharing priority; there is a security reason.
196  * A user can simply create a PI mutex, let thread A lock it, and let
197  * another thread B block on it.  Because B is sleeping, its priority
198  * would be boosted; this would boost A's priority via priority
199  * propagation too, and A's priority would never be lowered even if A
200  * were using 100% CPU, which is unfair to other processes.
201  */
202 
203 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
204 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
205 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
206 
207 #define	GOLDEN_RATIO_PRIME	2654404609U
208 #define	UMTX_CHAINS		512
209 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
210 
211 #define THREAD_SHARE		0
212 #define PROCESS_SHARE		1
213 #define AUTO_SHARE		2
214 
215 #define	GET_SHARE(flags)	\
216     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
217 
218 #define BUSY_SPINS		200
219 
220 static uma_zone_t		umtx_pi_zone;
221 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
222 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
223 static int			umtx_pi_allocated;
224 
225 SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
226 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
227     &umtx_pi_allocated, 0, "Allocated umtx_pi");
228 
229 static void umtxq_sysinit(void *);
230 static void umtxq_hash(struct umtx_key *key);
231 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
232 static void umtxq_lock(struct umtx_key *key);
233 static void umtxq_unlock(struct umtx_key *key);
234 static void umtxq_busy(struct umtx_key *key);
235 static void umtxq_unbusy(struct umtx_key *key);
236 static void umtxq_insert_queue(struct umtx_q *uq, int q);
237 static void umtxq_remove_queue(struct umtx_q *uq, int q);
238 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
239 static int umtxq_count(struct umtx_key *key);
240 static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
241 static int umtx_key_get(void *addr, int type, int share,
242 	struct umtx_key *key);
243 static void umtx_key_release(struct umtx_key *key);
244 static struct umtx_pi *umtx_pi_alloc(int);
245 static void umtx_pi_free(struct umtx_pi *pi);
246 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
247 static void umtx_thread_cleanup(struct thread *td);
248 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
249 	struct image_params *imgp __unused);
250 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
251 
252 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
253 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
254 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
255 
256 static struct mtx umtx_lock;
257 
258 static void
259 umtxq_sysinit(void *arg __unused)
260 {
261 	int i, j;
262 
263 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
264 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
265 	for (i = 0; i < 2; ++i) {
266 		for (j = 0; j < UMTX_CHAINS; ++j) {
267 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
268 				 MTX_DEF | MTX_DUPOK);
269 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
270 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
271 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
272 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
273 			umtxq_chains[i][j].uc_busy = 0;
274 			umtxq_chains[i][j].uc_waiters = 0;
275 		}
276 	}
277 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
278 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
279 	    EVENTHANDLER_PRI_ANY);
280 }
281 
282 struct umtx_q *
283 umtxq_alloc(void)
284 {
285 	struct umtx_q *uq;
286 
287 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
288 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
289 	TAILQ_INIT(&uq->uq_spare_queue->head);
290 	TAILQ_INIT(&uq->uq_pi_contested);
291 	uq->uq_inherited_pri = PRI_MAX;
292 	return (uq);
293 }
294 
295 void
296 umtxq_free(struct umtx_q *uq)
297 {
298 	MPASS(uq->uq_spare_queue != NULL);
299 	free(uq->uq_spare_queue, M_UMTX);
300 	free(uq, M_UMTX);
301 }
302 
303 static inline void
304 umtxq_hash(struct umtx_key *key)
305 {
306 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
307 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
308 }
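
/*
 * A worked example of the multiplicative (Fibonacci) hash above,
 * assuming __WORD_BIT == 32 so that UMTX_SHIFTS == 23: the 32-bit
 * product keeps only its top 9 bits after the shift, yielding a chain
 * index in [0, 511], so the final % UMTX_CHAINS merely guards the
 * array bound.  E.g. for n = 0x1000 (GOLDEN_RATIO_PRIME 2654404609
 * is 0x9e370001):
 *
 *	0x1000 * 0x9e370001 == 0x70001000	(mod 2^32)
 *	0x70001000 >> 23    == 224		-> chain 224
 */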
309 
310 static inline int
311 umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
312 {
313 	return (k1->type == k2->type &&
314 		k1->info.both.a == k2->info.both.a &&
315 	        k1->info.both.b == k2->info.both.b);
316 }
317 
318 static inline struct umtxq_chain *
319 umtxq_getchain(struct umtx_key *key)
320 {
321 	if (key->type <= TYPE_SEM)
322 		return (&umtxq_chains[1][key->hash]);
323 	return (&umtxq_chains[0][key->hash]);
324 }
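
/*
 * The sleep-channel key types (TYPE_SIMPLE_WAIT through TYPE_SEM) live
 * on the second chain array and the mutex-like types on the first, so
 * wait/wake traffic and mutex traffic on the same user address never
 * contend for the same chain lock; the hash value itself is computed
 * the same way for both arrays.
 */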
325 
326 /*
327  * Lock a chain.
328  */
329 static inline void
330 umtxq_lock(struct umtx_key *key)
331 {
332 	struct umtxq_chain *uc;
333 
334 	uc = umtxq_getchain(key);
335 	mtx_lock(&uc->uc_lock);
336 }
337 
338 /*
339  * Unlock a chain.
340  */
341 static inline void
342 umtxq_unlock(struct umtx_key *key)
343 {
344 	struct umtxq_chain *uc;
345 
346 	uc = umtxq_getchain(key);
347 	mtx_unlock(&uc->uc_lock);
348 }
349 
350 /*
351  * Set the chain to busy state when the following operation
352  * may block (a kernel mutex cannot be used).
353  */
354 static inline void
355 umtxq_busy(struct umtx_key *key)
356 {
357 	struct umtxq_chain *uc;
358 
359 	uc = umtxq_getchain(key);
360 	mtx_assert(&uc->uc_lock, MA_OWNED);
361 	if (uc->uc_busy) {
362 #ifdef SMP
363 		if (smp_cpus > 1) {
364 			int count = BUSY_SPINS;
365 			if (count > 0) {
366 				umtxq_unlock(key);
367 				while (uc->uc_busy && --count > 0)
368 					cpu_spinwait();
369 				umtxq_lock(key);
370 			}
371 		}
372 #endif
373 		while (uc->uc_busy) {
374 			uc->uc_waiters++;
375 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
376 			uc->uc_waiters--;
377 		}
378 	}
379 	uc->uc_busy = 1;
380 }
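
/*
 * Note that the spin loop above reads uc_busy with the chain lock
 * dropped; this is only a heuristic to avoid sleeping across a short
 * busy period, and the flag is always re-checked with the chain lock
 * held before the msleep() decision, so the unlocked read is harmless.
 */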
381 
382 /*
383  * Unbusy a chain.
384  */
385 static inline void
386 umtxq_unbusy(struct umtx_key *key)
387 {
388 	struct umtxq_chain *uc;
389 
390 	uc = umtxq_getchain(key);
391 	mtx_assert(&uc->uc_lock, MA_OWNED);
392 	KASSERT(uc->uc_busy != 0, ("not busy"));
393 	uc->uc_busy = 0;
394 	if (uc->uc_waiters)
395 		wakeup_one(uc);
396 }
397 
398 static struct umtxq_queue *
399 umtxq_queue_lookup(struct umtx_key *key, int q)
400 {
401 	struct umtxq_queue *uh;
402 	struct umtxq_chain *uc;
403 
404 	uc = umtxq_getchain(key);
405 	UMTXQ_LOCKED_ASSERT(uc);
406 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
407 		if (umtx_key_match(&uh->key, key))
408 			return (uh);
409 	}
410 
411 	return (NULL);
412 }
413 
414 static inline void
415 umtxq_insert_queue(struct umtx_q *uq, int q)
416 {
417 	struct umtxq_queue *uh;
418 	struct umtxq_chain *uc;
419 
420 	uc = umtxq_getchain(&uq->uq_key);
421 	UMTXQ_LOCKED_ASSERT(uc);
422 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
423 	uh = umtxq_queue_lookup(&uq->uq_key, q);
424 	if (uh != NULL) {
425 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
426 	} else {
427 		uh = uq->uq_spare_queue;
428 		uh->key = uq->uq_key;
429 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
430 	}
431 	uq->uq_spare_queue = NULL;
432 
433 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
434 	uh->length++;
435 	uq->uq_flags |= UQF_UMTXQ;
436 	uq->uq_cur_queue = uh;
437 	return;
438 }
439 
440 static inline void
441 umtxq_remove_queue(struct umtx_q *uq, int q)
442 {
443 	struct umtxq_chain *uc;
444 	struct umtxq_queue *uh;
445 
446 	uc = umtxq_getchain(&uq->uq_key);
447 	UMTXQ_LOCKED_ASSERT(uc);
448 	if (uq->uq_flags & UQF_UMTXQ) {
449 		uh = uq->uq_cur_queue;
450 		TAILQ_REMOVE(&uh->head, uq, uq_link);
451 		uh->length--;
452 		uq->uq_flags &= ~UQF_UMTXQ;
453 		if (TAILQ_EMPTY(&uh->head)) {
454 			KASSERT(uh->length == 0,
455 			    ("inconsistent umtxq_queue length"));
456 			LIST_REMOVE(uh, link);
457 		} else {
458 			uh = LIST_FIRST(&uc->uc_spare_queue);
459 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
460 			LIST_REMOVE(uh, link);
461 		}
462 		uq->uq_spare_queue = uh;
463 		uq->uq_cur_queue = NULL;
464 	}
465 }
466 
467 /*
468  * Count the number of waiters on the shared wait queue.
469  */
470 static int
471 umtxq_count(struct umtx_key *key)
472 {
473 	struct umtxq_chain *uc;
474 	struct umtxq_queue *uh;
475 
476 	uc = umtxq_getchain(key);
477 	UMTXQ_LOCKED_ASSERT(uc);
478 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
479 	if (uh != NULL)
480 		return (uh->length);
481 	return (0);
482 }
483 
484 /*
485  * Count the number of PI waiters and return the
486  * first waiter.
487  */
488 static int
489 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
490 {
491 	struct umtxq_chain *uc;
492 	struct umtxq_queue *uh;
493 
494 	*first = NULL;
495 	uc = umtxq_getchain(key);
496 	UMTXQ_LOCKED_ASSERT(uc);
497 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
498 	if (uh != NULL) {
499 		*first = TAILQ_FIRST(&uh->head);
500 		return (uh->length);
501 	}
502 	return (0);
503 }
504 
505 /*
506  * Wake up threads waiting on a userland object.
507  */
508 
509 static int
510 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
511 {
512 	struct umtxq_chain *uc;
513 	struct umtxq_queue *uh;
514 	struct umtx_q *uq;
515 	int ret;
516 
517 	ret = 0;
518 	uc = umtxq_getchain(key);
519 	UMTXQ_LOCKED_ASSERT(uc);
520 	uh = umtxq_queue_lookup(key, q);
521 	if (uh != NULL) {
522 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
523 			umtxq_remove_queue(uq, q);
524 			wakeup(uq);
525 			if (++ret >= n_wake)
526 				return (ret);
527 		}
528 	}
529 	return (ret);
530 }
531 
532 
533 /*
534  * Wake up specified thread.
535  */
536 static inline void
537 umtxq_signal_thread(struct umtx_q *uq)
538 {
539 	struct umtxq_chain *uc;
540 
541 	uc = umtxq_getchain(&uq->uq_key);
542 	UMTXQ_LOCKED_ASSERT(uc);
543 	umtxq_remove(uq);
544 	wakeup(uq);
545 }
546 
547 /*
548  * Put the thread into a sleep state; before sleeping, check if
549  * the thread was removed from the umtx queue.
550  */
551 static inline int
552 umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
553 {
554 	struct umtxq_chain *uc;
555 	int error;
556 
557 	uc = umtxq_getchain(&uq->uq_key);
558 	UMTXQ_LOCKED_ASSERT(uc);
559 	if (!(uq->uq_flags & UQF_UMTXQ))
560 		return (0);
561 	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
562 	if (error == EWOULDBLOCK)
563 		error = ETIMEDOUT;
564 	return (error);
565 }
566 
567 /*
568  * Convert a userspace address into a unique logical key.
569  */
570 static int
571 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
572 {
573 	struct thread *td = curthread;
574 	vm_map_t map;
575 	vm_map_entry_t entry;
576 	vm_pindex_t pindex;
577 	vm_prot_t prot;
578 	boolean_t wired;
579 
580 	key->type = type;
581 	if (share == THREAD_SHARE) {
582 		key->shared = 0;
583 		key->info.private.vs = td->td_proc->p_vmspace;
584 		key->info.private.addr = (uintptr_t)addr;
585 	} else {
586 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
587 		map = &td->td_proc->p_vmspace->vm_map;
588 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
589 		    &entry, &key->info.shared.object, &pindex, &prot,
590 		    &wired) != KERN_SUCCESS) {
591 			return (EFAULT);
592 		}
593 
594 		if ((share == PROCESS_SHARE) ||
595 		    (share == AUTO_SHARE &&
596 		     VM_INHERIT_SHARE == entry->inheritance)) {
597 			key->shared = 1;
598 			key->info.shared.offset = entry->offset + entry->start -
599 				(vm_offset_t)addr;
600 			vm_object_reference(key->info.shared.object);
601 		} else {
602 			key->shared = 0;
603 			key->info.private.vs = td->td_proc->p_vmspace;
604 			key->info.private.addr = (uintptr_t)addr;
605 		}
606 		vm_map_lookup_done(map, entry);
607 	}
608 
609 	umtxq_hash(key);
610 	return (0);
611 }
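
/*
 * An example of the sharing modes (addresses purely illustrative): with
 * AUTO_SHARE, an address backed by a map entry inherited
 * VM_INHERIT_SHARE gets a shared (object, offset) key, so two processes
 * mapping the same page at different addresses, e.g.
 *
 *	proc A: addr 0x20001000 in an entry starting at 0x20000000
 *	proc B: addr 0x30001000 in an entry starting at 0x30000000
 *
 * with the same backing object and entry offset resolve to the same key
 * and hence the same wait queue.  Everything else falls back to a
 * private (vmspace, address) key, which can never match a key built in
 * another process.
 */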
612 
613 /*
614  * Release key.
615  */
616 static inline void
617 umtx_key_release(struct umtx_key *key)
618 {
619 	if (key->shared)
620 		vm_object_deallocate(key->info.shared.object);
621 }
622 
623 /*
624  * Lock a umtx object.
625  */
626 static int
627 _do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
628 {
629 	struct umtx_q *uq;
630 	u_long owner;
631 	u_long old;
632 	int error = 0;
633 
634 	uq = td->td_umtxq;
635 
636 	/*
637 	 * Care must be exercised when dealing with the umtx structure. It
638 	 * can fault on any access.
639 	 */
640 	for (;;) {
641 		/*
642 		 * Try the uncontested case.  This should be done in userland.
643 		 */
644 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
645 
646 		/* The acquire succeeded. */
647 		if (owner == UMTX_UNOWNED)
648 			return (0);
649 
650 		/* The address was invalid. */
651 		if (owner == -1)
652 			return (EFAULT);
653 
654 		/* If no one owns it but it is contested try to acquire it. */
655 		if (owner == UMTX_CONTESTED) {
656 			owner = casuword(&umtx->u_owner,
657 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
658 
659 			if (owner == UMTX_CONTESTED)
660 				return (0);
661 
662 			/* The address was invalid. */
663 			if (owner == -1)
664 				return (EFAULT);
665 
666 			/* If this failed the lock has changed, restart. */
667 			continue;
668 		}
669 
670 		/*
671 		 * If we caught a signal, we have retried and now
672 		 * exit immediately.
673 		 */
674 		if (error != 0)
675 			return (error);
676 
677 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
678 			AUTO_SHARE, &uq->uq_key)) != 0)
679 			return (error);
680 
681 		umtxq_lock(&uq->uq_key);
682 		umtxq_busy(&uq->uq_key);
683 		umtxq_insert(uq);
684 		umtxq_unbusy(&uq->uq_key);
685 		umtxq_unlock(&uq->uq_key);
686 
687 		/*
688 		 * Set the contested bit so that a release in user space
689 		 * knows to use the system call for unlock.  If this fails
690 		 * either someone else has acquired the lock or it has been
691 		 * released.
692 		 */
693 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
694 
695 		/* The address was invalid. */
696 		if (old == -1) {
697 			umtxq_lock(&uq->uq_key);
698 			umtxq_remove(uq);
699 			umtxq_unlock(&uq->uq_key);
700 			umtx_key_release(&uq->uq_key);
701 			return (EFAULT);
702 		}
703 
704 		/*
705 		 * If we set the contested bit, sleep. Otherwise the lock changed
706 		 * and we need to retry, or we lost a race to the thread
707 		 * unlocking the umtx.
708 		 */
709 		umtxq_lock(&uq->uq_key);
710 		if (old == owner)
711 			error = umtxq_sleep(uq, "umtx", timo);
712 		umtxq_remove(uq);
713 		umtxq_unlock(&uq->uq_key);
714 		umtx_key_release(&uq->uq_key);
715 	}
716 
717 	return (0);
718 }
719 
720 /*
721  * Lock a umtx object.
722  */
723 static int
724 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
725 	struct timespec *timeout)
726 {
727 	struct timespec ts, ts2, ts3;
728 	struct timeval tv;
729 	int error;
730 
731 	if (timeout == NULL) {
732 		error = _do_lock_umtx(td, umtx, id, 0);
733 		/* Mutex locking is restarted if it is interrupted. */
734 		if (error == EINTR)
735 			error = ERESTART;
736 	} else {
737 		getnanouptime(&ts);
738 		timespecadd(&ts, timeout);
739 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
740 		for (;;) {
741 			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
742 			if (error != ETIMEDOUT)
743 				break;
744 			getnanouptime(&ts2);
745 			if (timespeccmp(&ts2, &ts, >=)) {
746 				error = ETIMEDOUT;
747 				break;
748 			}
749 			ts3 = ts;
750 			timespecsub(&ts3, &ts2);
751 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
752 		}
753 		/* Timed-locking is not restarted. */
754 		if (error == ERESTART)
755 			error = EINTR;
756 	}
757 	return (error);
758 }
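
/*
 * For reference, a hedged sketch of the userland fast path that this
 * system call backs (the wrapper name is illustrative; _umtx_op(2) and
 * UMTX_OP_LOCK come from <sys/umtx.h>): one acquire-semantics
 * compare-and-set, entering the kernel only on contention.
 *
 *	static void
 *	my_umtx_lock(struct umtx *mtx, u_long id)
 *	{
 *		if (!atomic_cmpset_acq_long(&mtx->u_owner, UMTX_UNOWNED, id))
 *			(void)_umtx_op(mtx, UMTX_OP_LOCK, id, NULL, NULL);
 *	}
 */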
759 
760 /*
761  * Unlock a umtx object.
762  */
763 static int
764 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
765 {
766 	struct umtx_key key;
767 	u_long owner;
768 	u_long old;
769 	int error;
770 	int count;
771 
772 	/*
773 	 * Make sure we own this mtx.
774 	 */
775 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
776 	if (owner == -1)
777 		return (EFAULT);
778 
779 	if ((owner & ~UMTX_CONTESTED) != id)
780 		return (EPERM);
781 
782 	/* This should be done in userland */
783 	if ((owner & UMTX_CONTESTED) == 0) {
784 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
785 		if (old == -1)
786 			return (EFAULT);
787 		if (old == owner)
788 			return (0);
789 		owner = old;
790 	}
791 
792 	/* We should only ever be in here for contested locks */
793 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
794 		&key)) != 0)
795 		return (error);
796 
797 	umtxq_lock(&key);
798 	umtxq_busy(&key);
799 	count = umtxq_count(&key);
800 	umtxq_unlock(&key);
801 
802 	/*
803 	 * When unlocking the umtx, it must be marked as unowned if
804 	 * there is at most one thread waiting for it.
805 	 * Otherwise, it must be marked as contested.
806 	 */
807 	old = casuword(&umtx->u_owner, owner,
808 		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
809 	umtxq_lock(&key);
810 	umtxq_signal(&key, 1);
811 	umtxq_unbusy(&key);
812 	umtxq_unlock(&key);
813 	umtx_key_release(&key);
814 	if (old == -1)
815 		return (EFAULT);
816 	if (old != owner)
817 		return (EINVAL);
818 	return (0);
819 }
820 
821 #ifdef COMPAT_FREEBSD32
822 
823 /*
824  * Lock a umtx object.
825  */
826 static int
827 _do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
828 {
829 	struct umtx_q *uq;
830 	uint32_t owner;
831 	uint32_t old;
832 	int error = 0;
833 
834 	uq = td->td_umtxq;
835 
836 	/*
837 	 * Care must be exercised when dealing with the umtx structure. It
838 	 * can fault on any access.
839 	 */
840 	for (;;) {
841 		/*
842 		 * Try the uncontested case.  This should be done in userland.
843 		 */
844 		owner = casuword32(m, UMUTEX_UNOWNED, id);
845 
846 		/* The acquire succeeded. */
847 		if (owner == UMUTEX_UNOWNED)
848 			return (0);
849 
850 		/* The address was invalid. */
851 		if (owner == -1)
852 			return (EFAULT);
853 
854 		/* If no one owns it but it is contested try to acquire it. */
855 		if (owner == UMUTEX_CONTESTED) {
856 			owner = casuword32(m,
857 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
858 			if (owner == UMUTEX_CONTESTED)
859 				return (0);
860 
861 			/* The address was invalid. */
862 			if (owner == -1)
863 				return (EFAULT);
864 
865 			/* If this failed the lock has changed, restart. */
866 			continue;
867 		}
868 
869 		/*
870 		 * If we caught a signal, we have retried and now
871 		 * exit immediately.
872 		 */
873 		if (error != 0)
874 			return (error);
875 
876 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
877 			AUTO_SHARE, &uq->uq_key)) != 0)
878 			return (error);
879 
880 		umtxq_lock(&uq->uq_key);
881 		umtxq_busy(&uq->uq_key);
882 		umtxq_insert(uq);
883 		umtxq_unbusy(&uq->uq_key);
884 		umtxq_unlock(&uq->uq_key);
885 
886 		/*
887 		 * Set the contested bit so that a release in user space
888 		 * knows to use the system call for unlock.  If this fails
889 		 * either someone else has acquired the lock or it has been
890 		 * released.
891 		 */
892 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
893 
894 		/* The address was invalid. */
895 		if (old == -1) {
896 			umtxq_lock(&uq->uq_key);
897 			umtxq_remove(uq);
898 			umtxq_unlock(&uq->uq_key);
899 			umtx_key_release(&uq->uq_key);
900 			return (EFAULT);
901 		}
902 
903 		/*
904 		 * If we set the contested bit, sleep. Otherwise the lock changed
905 		 * and we need to retry, or we lost a race to the thread
906 		 * unlocking the umtx.
907 		 */
908 		umtxq_lock(&uq->uq_key);
909 		if (old == owner)
910 			error = umtxq_sleep(uq, "umtx", timo);
911 		umtxq_remove(uq);
912 		umtxq_unlock(&uq->uq_key);
913 		umtx_key_release(&uq->uq_key);
914 	}
915 
916 	return (0);
917 }
918 
919 /*
920  * Lock a umtx object.
921  */
922 static int
923 do_lock_umtx32(struct thread *td, void *m, uint32_t id,
924 	struct timespec *timeout)
925 {
926 	struct timespec ts, ts2, ts3;
927 	struct timeval tv;
928 	int error;
929 
930 	if (timeout == NULL) {
931 		error = _do_lock_umtx32(td, m, id, 0);
932 		/* Mutex locking is restarted if it is interrupted. */
933 		if (error == EINTR)
934 			error = ERESTART;
935 	} else {
936 		getnanouptime(&ts);
937 		timespecadd(&ts, timeout);
938 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
939 		for (;;) {
940 			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
941 			if (error != ETIMEDOUT)
942 				break;
943 			getnanouptime(&ts2);
944 			if (timespeccmp(&ts2, &ts, >=)) {
945 				error = ETIMEDOUT;
946 				break;
947 			}
948 			ts3 = ts;
949 			timespecsub(&ts3, &ts2);
950 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
951 		}
952 		/* Timed-locking is not restarted. */
953 		if (error == ERESTART)
954 			error = EINTR;
955 	}
956 	return (error);
957 }
958 
959 /*
960  * Unlock a umtx object.
961  */
962 static int
963 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
964 {
965 	struct umtx_key key;
966 	uint32_t owner;
967 	uint32_t old;
968 	int error;
969 	int count;
970 
971 	/*
972 	 * Make sure we own this mtx.
973 	 */
974 	owner = fuword32(m);
975 	if (owner == -1)
976 		return (EFAULT);
977 
978 	if ((owner & ~UMUTEX_CONTESTED) != id)
979 		return (EPERM);
980 
981 	/* This should be done in userland */
982 	if ((owner & UMUTEX_CONTESTED) == 0) {
983 		old = casuword32(m, owner, UMUTEX_UNOWNED);
984 		if (old == -1)
985 			return (EFAULT);
986 		if (old == owner)
987 			return (0);
988 		owner = old;
989 	}
990 
991 	/* We should only ever be in here for contested locks */
992 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
993 		&key)) != 0)
994 		return (error);
995 
996 	umtxq_lock(&key);
997 	umtxq_busy(&key);
998 	count = umtxq_count(&key);
999 	umtxq_unlock(&key);
1000 
1001 	/*
1002 	 * When unlocking the umtx, it must be marked as unowned if
1003 	 * there is at most one thread waiting for it.
1004 	 * Otherwise, it must be marked as contested.
1005 	 */
1006 	old = casuword32(m, owner,
1007 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1008 	umtxq_lock(&key);
1009 	umtxq_signal(&key, 1);
1010 	umtxq_unbusy(&key);
1011 	umtxq_unlock(&key);
1012 	umtx_key_release(&key);
1013 	if (old == -1)
1014 		return (EFAULT);
1015 	if (old != owner)
1016 		return (EINVAL);
1017 	return (0);
1018 }
1019 #endif
1020 
1021 /*
1022  * Fetch and compare value, sleep on the address if value is not changed.
1023  */
1024 static int
1025 do_wait(struct thread *td, void *addr, u_long id,
1026 	struct timespec *timeout, int compat32, int is_private)
1027 {
1028 	struct umtx_q *uq;
1029 	struct timespec ts, ts2, ts3;
1030 	struct timeval tv;
1031 	u_long tmp;
1032 	int error = 0;
1033 
1034 	uq = td->td_umtxq;
1035 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
1036 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
1037 		return (error);
1038 
1039 	umtxq_lock(&uq->uq_key);
1040 	umtxq_insert(uq);
1041 	umtxq_unlock(&uq->uq_key);
1042 	if (compat32 == 0)
1043 		tmp = fuword(addr);
1044 	else
1045 		tmp = (unsigned int)fuword32(addr);
1046 	if (tmp != id) {
1047 		umtxq_lock(&uq->uq_key);
1048 		umtxq_remove(uq);
1049 		umtxq_unlock(&uq->uq_key);
1050 	} else if (timeout == NULL) {
1051 		umtxq_lock(&uq->uq_key);
1052 		error = umtxq_sleep(uq, "uwait", 0);
1053 		umtxq_remove(uq);
1054 		umtxq_unlock(&uq->uq_key);
1055 	} else {
1056 		getnanouptime(&ts);
1057 		timespecadd(&ts, timeout);
1058 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
1059 		umtxq_lock(&uq->uq_key);
1060 		for (;;) {
1061 			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
1062 			if (!(uq->uq_flags & UQF_UMTXQ)) {
1063 				error = 0;
1064 				break;
1065 			}
1066 			if (error != ETIMEDOUT)
1067 				break;
1068 			umtxq_unlock(&uq->uq_key);
1069 			getnanouptime(&ts2);
1070 			if (timespeccmp(&ts2, &ts, >=)) {
1071 				error = ETIMEDOUT;
1072 				umtxq_lock(&uq->uq_key);
1073 				break;
1074 			}
1075 			ts3 = ts;
1076 			timespecsub(&ts3, &ts2);
1077 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
1078 			umtxq_lock(&uq->uq_key);
1079 		}
1080 		umtxq_remove(uq);
1081 		umtxq_unlock(&uq->uq_key);
1082 	}
1083 	umtx_key_release(&uq->uq_key);
1084 	if (error == ERESTART)
1085 		error = EINTR;
1086 	return (error);
1087 }
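
/*
 * The insert-queue, re-read-word, then sleep order above is what makes
 * the wait/wake protocol race-free: the thread is already on the queue
 * when the word is re-read, so a waker that changes the word afterwards
 * still finds it there.  A hedged userland sketch (the state variable
 * and BUSY value are illustrative; UMTX_OP_WAIT_UINT and UMTX_OP_WAKE
 * come from <sys/umtx.h>):
 *
 *	while (atomic_load_acq_int(&state) == BUSY)
 *		(void)_umtx_op(&state, UMTX_OP_WAIT_UINT, BUSY, NULL, NULL);
 *
 * with the other side storing a new value and then calling
 * _umtx_op(&state, UMTX_OP_WAKE, 1, NULL, NULL).
 */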
1088 
1089 /*
1090  * Wake up threads sleeping on the specified address.
1091  */
1092 int
1093 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1094 {
1095 	struct umtx_key key;
1096 	int ret;
1097 
1098 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1099 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1100 		return (ret);
1101 	umtxq_lock(&key);
1102 	ret = umtxq_signal(&key, n_wake);
1103 	umtxq_unlock(&key);
1104 	umtx_key_release(&key);
1105 	return (0);
1106 }
1107 
1108 /*
1109  * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1110  */
1111 static int
1112 _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1113 	int mode)
1114 {
1115 	struct umtx_q *uq;
1116 	uint32_t owner, old, id;
1117 	int error = 0;
1118 
1119 	id = td->td_tid;
1120 	uq = td->td_umtxq;
1121 
1122 	/*
1123 	 * Care must be exercised when dealing with the umtx structure. It
1124 	 * can fault on any access.
1125 	 */
1126 	for (;;) {
1127 		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
1128 		if (mode == _UMUTEX_WAIT) {
1129 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1130 				return (0);
1131 		} else {
1132 			/*
1133 			 * Try the uncontested case.  This should be done in userland.
1134 			 */
1135 			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1136 
1137 			/* The acquire succeeded. */
1138 			if (owner == UMUTEX_UNOWNED)
1139 				return (0);
1140 
1141 			/* The address was invalid. */
1142 			if (owner == -1)
1143 				return (EFAULT);
1144 
1145 			/* If no one owns it but it is contested try to acquire it. */
1146 			if (owner == UMUTEX_CONTESTED) {
1147 				owner = casuword32(&m->m_owner,
1148 				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1149 
1150 				if (owner == UMUTEX_CONTESTED)
1151 					return (0);
1152 
1153 				/* The address was invalid. */
1154 				if (owner == -1)
1155 					return (EFAULT);
1156 
1157 				/* If this failed the lock has changed, restart. */
1158 				continue;
1159 			}
1160 		}
1161 
1162 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1163 		    (owner & ~UMUTEX_CONTESTED) == id)
1164 			return (EDEADLK);
1165 
1166 		if (mode == _UMUTEX_TRY)
1167 			return (EBUSY);
1168 
1169 		/*
1170 		 * If we caught a signal, we have retried and now
1171 		 * exit immediately.
1172 		 */
1173 		if (error != 0)
1174 			return (error);
1175 
1176 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1177 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1178 			return (error);
1179 
1180 		umtxq_lock(&uq->uq_key);
1181 		umtxq_busy(&uq->uq_key);
1182 		umtxq_insert(uq);
1183 		umtxq_unlock(&uq->uq_key);
1184 
1185 		/*
1186 		 * Set the contested bit so that a release in user space
1187 		 * knows to use the system call for unlock.  If this fails
1188 		 * either someone else has acquired the lock or it has been
1189 		 * released.
1190 		 */
1191 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1192 
1193 		/* The address was invalid. */
1194 		if (old == -1) {
1195 			umtxq_lock(&uq->uq_key);
1196 			umtxq_remove(uq);
1197 			umtxq_unbusy(&uq->uq_key);
1198 			umtxq_unlock(&uq->uq_key);
1199 			umtx_key_release(&uq->uq_key);
1200 			return (EFAULT);
1201 		}
1202 
1203 		/*
1204 		 * If we set the contested bit, sleep. Otherwise the lock changed
1205 		 * and we need to retry, or we lost a race to the thread
1206 		 * unlocking the umtx.
1207 		 */
1208 		umtxq_lock(&uq->uq_key);
1209 		umtxq_unbusy(&uq->uq_key);
1210 		if (old == owner)
1211 			error = umtxq_sleep(uq, "umtxn", timo);
1212 		umtxq_remove(uq);
1213 		umtxq_unlock(&uq->uq_key);
1214 		umtx_key_release(&uq->uq_key);
1215 	}
1216 
1217 	return (0);
1218 }
1219 
1223 /*
1224  * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1225  */
1226 static int
1227 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1228 {
1229 	struct umtx_key key;
1230 	uint32_t owner, old, id;
1231 	int error;
1232 	int count;
1233 
1234 	id = td->td_tid;
1235 	/*
1236 	 * Make sure we own this mtx.
1237 	 */
1238 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1239 	if (owner == -1)
1240 		return (EFAULT);
1241 
1242 	if ((owner & ~UMUTEX_CONTESTED) != id)
1243 		return (EPERM);
1244 
1245 	if ((owner & UMUTEX_CONTESTED) == 0) {
1246 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1247 		if (old == -1)
1248 			return (EFAULT);
1249 		if (old == owner)
1250 			return (0);
1251 		owner = old;
1252 	}
1253 
1254 	/* We should only ever be in here for contested locks */
1255 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1256 	    &key)) != 0)
1257 		return (error);
1258 
1259 	umtxq_lock(&key);
1260 	umtxq_busy(&key);
1261 	count = umtxq_count(&key);
1262 	umtxq_unlock(&key);
1263 
1264 	/*
1265 	 * When unlocking the umtx, it must be marked as unowned if
1266 	 * there is at most one thread waiting for it.
1267 	 * Otherwise, it must be marked as contested.
1268 	 */
1269 	old = casuword32(&m->m_owner, owner,
1270 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1271 	umtxq_lock(&key);
1272 	umtxq_signal(&key, 1);
1273 	umtxq_unbusy(&key);
1274 	umtxq_unlock(&key);
1275 	umtx_key_release(&key);
1276 	if (old == -1)
1277 		return (EFAULT);
1278 	if (old != owner)
1279 		return (EINVAL);
1280 	return (0);
1281 }
1282 
1283 /*
1284  * Check if the mutex is available and wake up a waiter;
1285  * this is only for a simple (non-PI, non-PP) mutex.
1286  */
1287 static int
1288 do_wake_umutex(struct thread *td, struct umutex *m)
1289 {
1290 	struct umtx_key key;
1291 	uint32_t owner;
1292 	uint32_t flags;
1293 	int error;
1294 	int count;
1295 
1296 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1297 	if (owner == -1)
1298 		return (EFAULT);
1299 
1300 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1301 		return (0);
1302 
1303 	flags = fuword32(&m->m_flags);
1304 
1305 	/* We should only ever be in here for contested locks */
1306 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1307 	    &key)) != 0)
1308 		return (error);
1309 
1310 	umtxq_lock(&key);
1311 	umtxq_busy(&key);
1312 	count = umtxq_count(&key);
1313 	umtxq_unlock(&key);
1314 
1315 	if (count <= 1)
1316 		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1317 
1318 	umtxq_lock(&key);
1319 	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1320 		umtxq_signal(&key, 1);
1321 	umtxq_unbusy(&key);
1322 	umtxq_unlock(&key);
1323 	umtx_key_release(&key);
1324 	return (0);
1325 }
1326 
1327 static inline struct umtx_pi *
1328 umtx_pi_alloc(int flags)
1329 {
1330 	struct umtx_pi *pi;
1331 
1332 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1333 	TAILQ_INIT(&pi->pi_blocked);
1334 	atomic_add_int(&umtx_pi_allocated, 1);
1335 	return (pi);
1336 }
1337 
1338 static inline void
1339 umtx_pi_free(struct umtx_pi *pi)
1340 {
1341 	uma_zfree(umtx_pi_zone, pi);
1342 	atomic_add_int(&umtx_pi_allocated, -1);
1343 }
1344 
1345 /*
1346  * Adjust the thread's position on the PI mutex's blocked list after
1347  * its priority has been changed.
1348  */
1349 static int
1350 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1351 {
1352 	struct umtx_q *uq, *uq1, *uq2;
1353 	struct thread *td1;
1354 
1355 	mtx_assert(&umtx_lock, MA_OWNED);
1356 	if (pi == NULL)
1357 		return (0);
1358 
1359 	uq = td->td_umtxq;
1360 
1361 	/*
1362 	 * Check if the thread needs to be moved on the blocked chain.
1363 	 * It needs to be moved if either its priority value is less than
1364 	 * the previous thread's or greater than the next thread's.
1365 	 */
1366 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1367 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1368 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1369 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1370 		/*
1371 		 * Remove thread from blocked chain and determine where
1372 		 * it should be moved to.
1373 		 */
1374 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1375 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1376 			td1 = uq1->uq_thread;
1377 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1378 			if (UPRI(td1) > UPRI(td))
1379 				break;
1380 		}
1381 
1382 		if (uq1 == NULL)
1383 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1384 		else
1385 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1386 	}
1387 	return (1);
1388 }
1389 
1390 /*
1391  * Propagate priority when a thread is blocked on a POSIX
1392  * PI mutex.
1393  */
1394 static void
1395 umtx_propagate_priority(struct thread *td)
1396 {
1397 	struct umtx_q *uq;
1398 	struct umtx_pi *pi;
1399 	int pri;
1400 
1401 	mtx_assert(&umtx_lock, MA_OWNED);
1402 	pri = UPRI(td);
1403 	uq = td->td_umtxq;
1404 	pi = uq->uq_pi_blocked;
1405 	if (pi == NULL)
1406 		return;
1407 
1408 	for (;;) {
1409 		td = pi->pi_owner;
1410 		if (td == NULL || td == curthread)
1411 			return;
1412 
1413 		MPASS(td->td_proc != NULL);
1414 		MPASS(td->td_proc->p_magic == P_MAGIC);
1415 
1416 		thread_lock(td);
1417 		if (td->td_lend_user_pri > pri)
1418 			sched_lend_user_prio(td, pri);
1419 		else {
1420 			thread_unlock(td);
1421 			break;
1422 		}
1423 		thread_unlock(td);
1424 
1425 		/*
1426 		 * Pick up the lock that td is blocked on.
1427 		 */
1428 		uq = td->td_umtxq;
1429 		pi = uq->uq_pi_blocked;
1430 		if (pi == NULL)
1431 			break;
1432 		/* Resort td on the list if needed. */
1433 		umtx_pi_adjust_thread(pi, td);
1434 	}
1435 }
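
/*
 * A concrete walk of the loop above, with hypothetical priority values
 * (numerically lower is more important): thread C (pri 100) blocks on a
 * PI mutex owned by B (pri 140), while B is itself blocked on a PI
 * mutex owned by A (pri 160).  The first iteration lends pri 100 to B;
 * B's uq_pi_blocked then leads to the mutex owned by A, so the next
 * iteration lends pri 100 to A as well.  The walk stops once an owner
 * already runs at an equal or more important priority, or is not itself
 * blocked on a PI mutex.
 */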
1436 
1437 /*
1438  * Unpropagate priority for a PI mutex when a thread blocked on
1439  * it is interrupted by a signal or resumed by others.
1440  */
1441 static void
1442 umtx_repropagate_priority(struct umtx_pi *pi)
1443 {
1444 	struct umtx_q *uq, *uq_owner;
1445 	struct umtx_pi *pi2;
1446 	int pri;
1447 
1448 	mtx_assert(&umtx_lock, MA_OWNED);
1449 
1450 	while (pi != NULL && pi->pi_owner != NULL) {
1451 		pri = PRI_MAX;
1452 		uq_owner = pi->pi_owner->td_umtxq;
1453 
1454 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1455 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1456 			if (uq != NULL) {
1457 				if (pri > UPRI(uq->uq_thread))
1458 					pri = UPRI(uq->uq_thread);
1459 			}
1460 		}
1461 
1462 		if (pri > uq_owner->uq_inherited_pri)
1463 			pri = uq_owner->uq_inherited_pri;
1464 		thread_lock(pi->pi_owner);
1465 		sched_lend_user_prio(pi->pi_owner, pri);
1466 		thread_unlock(pi->pi_owner);
1467 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1468 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1469 	}
1470 }
1471 
1472 /*
1473  * Insert a PI mutex into the owner's contested list.
1474  */
1475 static void
1476 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1477 {
1478 	struct umtx_q *uq_owner;
1479 
1480 	uq_owner = owner->td_umtxq;
1481 	mtx_assert(&umtx_lock, MA_OWNED);
1482 	if (pi->pi_owner != NULL)
1483 		panic("pi_owner != NULL");
1484 	pi->pi_owner = owner;
1485 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1486 }
1487 
1488 /*
1489  * Claim ownership of a PI mutex.
1490  */
1491 static int
1492 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1493 {
1494 	struct umtx_q *uq, *uq_owner;
1495 
1496 	uq_owner = owner->td_umtxq;
1497 	mtx_lock_spin(&umtx_lock);
1498 	if (pi->pi_owner == owner) {
1499 		mtx_unlock_spin(&umtx_lock);
1500 		return (0);
1501 	}
1502 
1503 	if (pi->pi_owner != NULL) {
1504 		/*
1505 		 * Userland may have already messed up the mutex, sigh.
1506 		 */
1507 		mtx_unlock_spin(&umtx_lock);
1508 		return (EPERM);
1509 	}
1510 	umtx_pi_setowner(pi, owner);
1511 	uq = TAILQ_FIRST(&pi->pi_blocked);
1512 	if (uq != NULL) {
1513 		int pri;
1514 
1515 		pri = UPRI(uq->uq_thread);
1516 		thread_lock(owner);
1517 		if (pri < UPRI(owner))
1518 			sched_lend_user_prio(owner, pri);
1519 		thread_unlock(owner);
1520 	}
1521 	mtx_unlock_spin(&umtx_lock);
1522 	return (0);
1523 }
1524 
1525 /*
1526  * Adjust a thread's order position in the PI mutex it is blocked on;
1527  * this may trigger a new round of priority propagation.
1528  */
1529 void
1530 umtx_pi_adjust(struct thread *td, u_char oldpri)
1531 {
1532 	struct umtx_q *uq;
1533 	struct umtx_pi *pi;
1534 
1535 	uq = td->td_umtxq;
1536 	mtx_lock_spin(&umtx_lock);
1537 	/*
1538 	 * Pick up the lock that td is blocked on.
1539 	 */
1540 	pi = uq->uq_pi_blocked;
1541 	if (pi != NULL) {
1542 		umtx_pi_adjust_thread(pi, td);
1543 		umtx_repropagate_priority(pi);
1544 	}
1545 	mtx_unlock_spin(&umtx_lock);
1546 }
1547 
1548 /*
1549  * Sleep on a PI mutex.
1550  */
1551 static int
1552 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1553 	uint32_t owner, const char *wmesg, int timo)
1554 {
1555 	struct umtxq_chain *uc;
1556 	struct thread *td, *td1;
1557 	struct umtx_q *uq1;
1558 	int pri;
1559 	int error = 0;
1560 
1561 	td = uq->uq_thread;
1562 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1563 	uc = umtxq_getchain(&uq->uq_key);
1564 	UMTXQ_LOCKED_ASSERT(uc);
1565 	UMTXQ_BUSY_ASSERT(uc);
1566 	umtxq_insert(uq);
1567 	mtx_lock_spin(&umtx_lock);
1568 	if (pi->pi_owner == NULL) {
1569 		mtx_unlock_spin(&umtx_lock);
1570 		/* XXX Only look up thread in current process. */
1571 		td1 = tdfind(owner, curproc->p_pid);
1572 		mtx_lock_spin(&umtx_lock);
1573 		if (td1 != NULL) {
1574 			if (pi->pi_owner == NULL)
1575 				umtx_pi_setowner(pi, td1);
1576 			PROC_UNLOCK(td1->td_proc);
1577 		}
1578 	}
1579 
1580 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1581 		pri = UPRI(uq1->uq_thread);
1582 		if (pri > UPRI(td))
1583 			break;
1584 	}
1585 
1586 	if (uq1 != NULL)
1587 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1588 	else
1589 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1590 
1591 	uq->uq_pi_blocked = pi;
1592 	thread_lock(td);
1593 	td->td_flags |= TDF_UPIBLOCKED;
1594 	thread_unlock(td);
1595 	umtx_propagate_priority(td);
1596 	mtx_unlock_spin(&umtx_lock);
1597 	umtxq_unbusy(&uq->uq_key);
1598 
1599 	if (uq->uq_flags & UQF_UMTXQ) {
1600 		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
1601 		if (error == EWOULDBLOCK)
1602 			error = ETIMEDOUT;
1603 		if (uq->uq_flags & UQF_UMTXQ) {
1604 			umtxq_remove(uq);
1605 		}
1606 	}
1607 	mtx_lock_spin(&umtx_lock);
1608 	uq->uq_pi_blocked = NULL;
1609 	thread_lock(td);
1610 	td->td_flags &= ~TDF_UPIBLOCKED;
1611 	thread_unlock(td);
1612 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1613 	umtx_repropagate_priority(pi);
1614 	mtx_unlock_spin(&umtx_lock);
1615 	umtxq_unlock(&uq->uq_key);
1616 
1617 	return (error);
1618 }
1619 
1620 /*
1621  * Increase the reference count of a PI mutex.
1622  */
1623 static void
1624 umtx_pi_ref(struct umtx_pi *pi)
1625 {
1626 	struct umtxq_chain *uc;
1627 
1628 	uc = umtxq_getchain(&pi->pi_key);
1629 	UMTXQ_LOCKED_ASSERT(uc);
1630 	pi->pi_refcount++;
1631 }
1632 
1633 /*
1634  * Decrease the reference count of a PI mutex; if the counter
1635  * drops to zero, its memory is freed.
1636  */
1637 static void
1638 umtx_pi_unref(struct umtx_pi *pi)
1639 {
1640 	struct umtxq_chain *uc;
1641 
1642 	uc = umtxq_getchain(&pi->pi_key);
1643 	UMTXQ_LOCKED_ASSERT(uc);
1644 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1645 	if (--pi->pi_refcount == 0) {
1646 		mtx_lock_spin(&umtx_lock);
1647 		if (pi->pi_owner != NULL) {
1648 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1649 				pi, pi_link);
1650 			pi->pi_owner = NULL;
1651 		}
1652 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1653 			("blocked queue not empty"));
1654 		mtx_unlock_spin(&umtx_lock);
1655 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1656 		umtx_pi_free(pi);
1657 	}
1658 }
1659 
1660 /*
1661  * Find a PI mutex in the hash table.
1662  */
1663 static struct umtx_pi *
1664 umtx_pi_lookup(struct umtx_key *key)
1665 {
1666 	struct umtxq_chain *uc;
1667 	struct umtx_pi *pi;
1668 
1669 	uc = umtxq_getchain(key);
1670 	UMTXQ_LOCKED_ASSERT(uc);
1671 
1672 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1673 		if (umtx_key_match(&pi->pi_key, key)) {
1674 			return (pi);
1675 		}
1676 	}
1677 	return (NULL);
1678 }
1679 
1680 /*
1681  * Insert a PI mutex into the hash table.
1682  */
1683 static inline void
1684 umtx_pi_insert(struct umtx_pi *pi)
1685 {
1686 	struct umtxq_chain *uc;
1687 
1688 	uc = umtxq_getchain(&pi->pi_key);
1689 	UMTXQ_LOCKED_ASSERT(uc);
1690 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1691 }
1692 
1693 /*
1694  * Lock a PI mutex.
1695  */
1696 static int
1697 _do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1698 	int try)
1699 {
1700 	struct umtx_q *uq;
1701 	struct umtx_pi *pi, *new_pi;
1702 	uint32_t id, owner, old;
1703 	int error;
1704 
1705 	id = td->td_tid;
1706 	uq = td->td_umtxq;
1707 
1708 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1709 	    &uq->uq_key)) != 0)
1710 		return (error);
1711 	umtxq_lock(&uq->uq_key);
1712 	pi = umtx_pi_lookup(&uq->uq_key);
1713 	if (pi == NULL) {
1714 		new_pi = umtx_pi_alloc(M_NOWAIT);
1715 		if (new_pi == NULL) {
1716 			umtxq_unlock(&uq->uq_key);
1717 			new_pi = umtx_pi_alloc(M_WAITOK);
1718 			umtxq_lock(&uq->uq_key);
1719 			pi = umtx_pi_lookup(&uq->uq_key);
1720 			if (pi != NULL) {
1721 				umtx_pi_free(new_pi);
1722 				new_pi = NULL;
1723 			}
1724 		}
1725 		if (new_pi != NULL) {
1726 			new_pi->pi_key = uq->uq_key;
1727 			umtx_pi_insert(new_pi);
1728 			pi = new_pi;
1729 		}
1730 	}
1731 	umtx_pi_ref(pi);
1732 	umtxq_unlock(&uq->uq_key);
1733 
1734 	/*
1735 	 * Care must be exercised when dealing with the umtx structure.  It
1736 	 * can fault on any access.
1737 	 */
1738 	for (;;) {
1739 		/*
1740 		 * Try the uncontested case.  This should be done in userland.
1741 		 */
1742 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1743 
1744 		/* The acquire succeeded. */
1745 		if (owner == UMUTEX_UNOWNED) {
1746 			error = 0;
1747 			break;
1748 		}
1749 
1750 		/* The address was invalid. */
1751 		if (owner == -1) {
1752 			error = EFAULT;
1753 			break;
1754 		}
1755 
1756 		/* If no one owns it but it is contested try to acquire it. */
1757 		if (owner == UMUTEX_CONTESTED) {
1758 			owner = casuword32(&m->m_owner,
1759 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1760 
1761 			if (owner == UMUTEX_CONTESTED) {
1762 				umtxq_lock(&uq->uq_key);
1763 				umtxq_busy(&uq->uq_key);
1764 				error = umtx_pi_claim(pi, td);
1765 				umtxq_unbusy(&uq->uq_key);
1766 				umtxq_unlock(&uq->uq_key);
1767 				break;
1768 			}
1769 
1770 			/* The address was invalid. */
1771 			if (owner == -1) {
1772 				error = EFAULT;
1773 				break;
1774 			}
1775 
1776 			/* If this failed the lock has changed, restart. */
1777 			continue;
1778 		}
1779 
1780 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1781 		    (owner & ~UMUTEX_CONTESTED) == id) {
1782 			error = EDEADLK;
1783 			break;
1784 		}
1785 
1786 		if (try != 0) {
1787 			error = EBUSY;
1788 			break;
1789 		}
1790 
1791 		/*
1792 		 * If we caught a signal, we have retried and now
1793 		 * exit immediately.
1794 		 */
1795 		if (error != 0)
1796 			break;
1797 
1798 		umtxq_lock(&uq->uq_key);
1799 		umtxq_busy(&uq->uq_key);
1800 		umtxq_unlock(&uq->uq_key);
1801 
1802 		/*
1803 		 * Set the contested bit so that a release in user space
1804 		 * knows to use the system call for unlock.  If this fails
1805 		 * either someone else has acquired the lock or it has been
1806 		 * released.
1807 		 */
1808 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1809 
1810 		/* The address was invalid. */
1811 		if (old == -1) {
1812 			umtxq_lock(&uq->uq_key);
1813 			umtxq_unbusy(&uq->uq_key);
1814 			umtxq_unlock(&uq->uq_key);
1815 			error = EFAULT;
1816 			break;
1817 		}
1818 
1819 		umtxq_lock(&uq->uq_key);
1820 		/*
1821 		 * If we set the contested bit, sleep. Otherwise the lock changed
1822 		 * and we need to retry, or we lost a race to the thread
1823 		 * unlocking the umtx.
1824 		 */
1825 		if (old == owner)
1826 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1827 				 "umtxpi", timo);
1828 		else {
1829 			umtxq_unbusy(&uq->uq_key);
1830 			umtxq_unlock(&uq->uq_key);
1831 		}
1832 	}
1833 
1834 	umtxq_lock(&uq->uq_key);
1835 	umtx_pi_unref(pi);
1836 	umtxq_unlock(&uq->uq_key);
1837 
1838 	umtx_key_release(&uq->uq_key);
1839 	return (error);
1840 }
1841 
1842 /*
1843  * Unlock a PI mutex.
1844  */
1845 static int
1846 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1847 {
1848 	struct umtx_key key;
1849 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1850 	struct umtx_pi *pi, *pi2;
1851 	uint32_t owner, old, id;
1852 	int error;
1853 	int count;
1854 	int pri;
1855 
1856 	id = td->td_tid;
1857 	/*
1858 	 * Make sure we own this mtx.
1859 	 */
1860 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1861 	if (owner == -1)
1862 		return (EFAULT);
1863 
1864 	if ((owner & ~UMUTEX_CONTESTED) != id)
1865 		return (EPERM);
1866 
1867 	/* This should be done in userland */
1868 	if ((owner & UMUTEX_CONTESTED) == 0) {
1869 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1870 		if (old == -1)
1871 			return (EFAULT);
1872 		if (old == owner)
1873 			return (0);
1874 		owner = old;
1875 	}
1876 
1877 	/* We should only ever be in here for contested locks */
1878 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1879 	    &key)) != 0)
1880 		return (error);
1881 
1882 	umtxq_lock(&key);
1883 	umtxq_busy(&key);
1884 	count = umtxq_count_pi(&key, &uq_first);
1885 	if (uq_first != NULL) {
1886 		mtx_lock_spin(&umtx_lock);
1887 		pi = uq_first->uq_pi_blocked;
1888 		KASSERT(pi != NULL, ("pi == NULL?"));
1889 		if (pi->pi_owner != curthread) {
1890 			mtx_unlock_spin(&umtx_lock);
1891 			umtxq_unbusy(&key);
1892 			umtxq_unlock(&key);
1893 			umtx_key_release(&key);
1894 			/* userland messed the mutex */
1895 			return (EPERM);
1896 		}
1897 		uq_me = curthread->td_umtxq;
1898 		pi->pi_owner = NULL;
1899 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1900 		/* Get the highest-priority thread which is still sleeping. */
1901 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1902 		while (uq_first != NULL &&
1903 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1904 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1905 		}
1906 		pri = PRI_MAX;
1907 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1908 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1909 			if (uq_first2 != NULL) {
1910 				if (pri > UPRI(uq_first2->uq_thread))
1911 					pri = UPRI(uq_first2->uq_thread);
1912 			}
1913 		}
1914 		thread_lock(curthread);
1915 		sched_lend_user_prio(curthread, pri);
1916 		thread_unlock(curthread);
1917 		mtx_unlock_spin(&umtx_lock);
1918 		if (uq_first)
1919 			umtxq_signal_thread(uq_first);
1920 	}
1921 	umtxq_unlock(&key);
1922 
1923 	/*
1924 	 * When unlocking the umtx, it must be marked as unowned if
1925 	 * there is at most one thread waiting for it.
1926 	 * Otherwise, it must be marked as contested.
1927 	 */
1928 	old = casuword32(&m->m_owner, owner,
1929 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1930 
1931 	umtxq_lock(&key);
1932 	umtxq_unbusy(&key);
1933 	umtxq_unlock(&key);
1934 	umtx_key_release(&key);
1935 	if (old == -1)
1936 		return (EFAULT);
1937 	if (old != owner)
1938 		return (EINVAL);
1939 	return (0);
1940 }
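
/*
 * The unlock handoff above, with hypothetical values: if the owner
 * (base pri 160, currently lent 100) releases a PI mutex with waiters
 * at pri 100 and 120, the mutex leaves the owner's uq_pi_contested list
 * and the lent priority is recomputed from the contested mutexes it
 * still holds; with none left the computed pri is PRI_MAX and the
 * scheduler falls back to the base priority.  The best waiter that is
 * still actually sleeping (here pri 100) is then woken and claims the
 * mutex via umtx_pi_claim() on its side of the retry loop.
 */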
1941 
1942 /*
1943  * Lock a PP mutex.
1944  */
1945 static int
1946 _do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1947 	int try)
1948 {
1949 	struct umtx_q *uq, *uq2;
1950 	struct umtx_pi *pi;
1951 	uint32_t ceiling;
1952 	uint32_t owner, id;
1953 	int error, pri, old_inherited_pri, su;
1954 
1955 	id = td->td_tid;
1956 	uq = td->td_umtxq;
1957 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1958 	    &uq->uq_key)) != 0)
1959 		return (error);
1960 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1961 	for (;;) {
1962 		old_inherited_pri = uq->uq_inherited_pri;
1963 		umtxq_lock(&uq->uq_key);
1964 		umtxq_busy(&uq->uq_key);
1965 		umtxq_unlock(&uq->uq_key);
1966 
1967 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1968 		if (ceiling > RTP_PRIO_MAX) {
1969 			error = EINVAL;
1970 			goto out;
1971 		}
1972 
1973 		mtx_lock_spin(&umtx_lock);
1974 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1975 			mtx_unlock_spin(&umtx_lock);
1976 			error = EINVAL;
1977 			goto out;
1978 		}
1979 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1980 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1981 			thread_lock(td);
1982 			if (uq->uq_inherited_pri < UPRI(td))
1983 				sched_lend_user_prio(td, uq->uq_inherited_pri);
1984 			thread_unlock(td);
1985 		}
1986 		mtx_unlock_spin(&umtx_lock);
1987 
1988 		owner = casuword32(&m->m_owner,
1989 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1990 
1991 		if (owner == UMUTEX_CONTESTED) {
1992 			error = 0;
1993 			break;
1994 		}
1995 
1996 		/* The address was invalid. */
1997 		if (owner == -1) {
1998 			error = EFAULT;
1999 			break;
2000 		}
2001 
2002 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2003 		    (owner & ~UMUTEX_CONTESTED) == id) {
2004 			error = EDEADLK;
2005 			break;
2006 		}
2007 
2008 		if (try != 0) {
2009 			error = EBUSY;
2010 			break;
2011 		}
2012 
2013 		/*
2014 		 * If we caught a signal, we have retried and now
2015 		 * exit immediately.
2016 		 */
2017 		if (error != 0)
2018 			break;
2019 
2020 		umtxq_lock(&uq->uq_key);
2021 		umtxq_insert(uq);
2022 		umtxq_unbusy(&uq->uq_key);
2023 		error = umtxq_sleep(uq, "umtxpp", timo);
2024 		umtxq_remove(uq);
2025 		umtxq_unlock(&uq->uq_key);
2026 
2027 		mtx_lock_spin(&umtx_lock);
2028 		uq->uq_inherited_pri = old_inherited_pri;
2029 		pri = PRI_MAX;
2030 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2031 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2032 			if (uq2 != NULL) {
2033 				if (pri > UPRI(uq2->uq_thread))
2034 					pri = UPRI(uq2->uq_thread);
2035 			}
2036 		}
2037 		if (pri > uq->uq_inherited_pri)
2038 			pri = uq->uq_inherited_pri;
2039 		thread_lock(td);
2040 		sched_lend_user_prio(td, pri);
2041 		thread_unlock(td);
2042 		mtx_unlock_spin(&umtx_lock);
2043 	}
2044 
2045 	if (error != 0) {
2046 		mtx_lock_spin(&umtx_lock);
2047 		uq->uq_inherited_pri = old_inherited_pri;
2048 		pri = PRI_MAX;
2049 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2050 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2051 			if (uq2 != NULL) {
2052 				if (pri > UPRI(uq2->uq_thread))
2053 					pri = UPRI(uq2->uq_thread);
2054 			}
2055 		}
2056 		if (pri > uq->uq_inherited_pri)
2057 			pri = uq->uq_inherited_pri;
2058 		thread_lock(td);
2059 		sched_lend_user_prio(td, pri);
2060 		thread_unlock(td);
2061 		mtx_unlock_spin(&umtx_lock);
2062 	}
2063 
2064 out:
2065 	umtxq_lock(&uq->uq_key);
2066 	umtxq_unbusy(&uq->uq_key);
2067 	umtxq_unlock(&uq->uq_key);
2068 	umtx_key_release(&uq->uq_key);
2069 	return (error);
2070 }
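
/*
 * A worked example of the ceiling arithmetic above, with an
 * illustrative value: m_ceilings[0] holds a POSIX-style ceiling c in
 * which a larger number is more important, so RTP_PRIO_MAX - c converts
 * it to the kernel's realtime range, where smaller is more important.
 * For c == RTP_PRIO_MAX the inherited priority becomes
 * PRI_MIN_REALTIME, the strongest realtime priority.  A locker whose
 * UPRI() is already numerically below PRI_MIN_REALTIME + ceiling (more
 * important than the ceiling permits) is refused with EINVAL.
 */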
2071 
2072 /*
2073  * Unlock a PP mutex.
2074  */
2075 static int
2076 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2077 {
2078 	struct umtx_key key;
2079 	struct umtx_q *uq, *uq2;
2080 	struct umtx_pi *pi;
2081 	uint32_t owner, id;
2082 	uint32_t rceiling;
2083 	int error, pri, new_inherited_pri, su;
2084 
2085 	id = td->td_tid;
2086 	uq = td->td_umtxq;
2087 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2088 
2089 	/*
2090 	 * Make sure we own this mtx.
2091 	 */
2092 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2093 	if (owner == -1)
2094 		return (EFAULT);
2095 
2096 	if ((owner & ~UMUTEX_CONTESTED) != id)
2097 		return (EPERM);
2098 
2099 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2100 	if (error != 0)
2101 		return (error);
2102 
2103 	if (rceiling == -1)
2104 		new_inherited_pri = PRI_MAX;
2105 	else {
2106 		rceiling = RTP_PRIO_MAX - rceiling;
2107 		if (rceiling > RTP_PRIO_MAX)
2108 			return (EINVAL);
2109 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2110 	}
2111 
2112 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2113 	    &key)) != 0)
2114 		return (error);
2115 	umtxq_lock(&key);
2116 	umtxq_busy(&key);
2117 	umtxq_unlock(&key);
2118 	/*
2119 	 * For a priority-protected mutex, always set the unlocked state
2120 	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2121 	 * to lock the mutex; this is necessary because the thread
2122 	 * priority has to be adjusted for such a mutex.
2123 	 */
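	/*
	 * Illustrative (hypothetical) userland fast path, shown only to
	 * motivate the store above: because the lock word is never left
	 * as UMUTEX_UNOWNED, a try-lock such as
	 *
	 *	if (atomic_cmpset_acq_32(&m->m_owner, UMUTEX_UNOWNED, id))
	 *		return (0);
	 *
	 * never succeeds for a PP mutex and always falls through to the
	 * kernel, which can then apply the priority ceiling.
	 */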
2124 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2125 		UMUTEX_CONTESTED);
2126 
2127 	umtxq_lock(&key);
2128 	if (error == 0)
2129 		umtxq_signal(&key, 1);
2130 	umtxq_unbusy(&key);
2131 	umtxq_unlock(&key);
2132 
2133 	if (error == -1)
2134 		error = EFAULT;
2135 	else {
2136 		mtx_lock_spin(&umtx_lock);
2137 		if (su != 0)
2138 			uq->uq_inherited_pri = new_inherited_pri;
2139 		pri = PRI_MAX;
2140 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2141 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2142 			if (uq2 != NULL) {
2143 				if (pri > UPRI(uq2->uq_thread))
2144 					pri = UPRI(uq2->uq_thread);
2145 			}
2146 		}
2147 		if (pri > uq->uq_inherited_pri)
2148 			pri = uq->uq_inherited_pri;
2149 		thread_lock(td);
2150 		sched_lend_user_prio(td, pri);
2151 		thread_unlock(td);
2152 		mtx_unlock_spin(&umtx_lock);
2153 	}
2154 	umtx_key_release(&key);
2155 	return (error);
2156 }
2157 
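/*
 * Change the priority ceiling of a PP mutex, returning the previous
 * ceiling through *old_ceiling.  The mutex is transiently acquired (or
 * found already owned by the caller) so that the update cannot race
 * with a lock holder.
 */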
2158 static int
2159 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2160 	uint32_t *old_ceiling)
2161 {
2162 	struct umtx_q *uq;
2163 	uint32_t save_ceiling;
2164 	uint32_t owner, id;
2165 	uint32_t flags;
2166 	int error;
2167 
2168 	flags = fuword32(&m->m_flags);
2169 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2170 		return (EINVAL);
2171 	if (ceiling > RTP_PRIO_MAX)
2172 		return (EINVAL);
2173 	id = td->td_tid;
2174 	uq = td->td_umtxq;
2175 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2176 	   &uq->uq_key)) != 0)
2177 		return (error);
2178 	for (;;) {
2179 		umtxq_lock(&uq->uq_key);
2180 		umtxq_busy(&uq->uq_key);
2181 		umtxq_unlock(&uq->uq_key);
2182 
2183 		save_ceiling = fuword32(&m->m_ceilings[0]);
2184 
2185 		owner = casuword32(&m->m_owner,
2186 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2187 
2188 		if (owner == UMUTEX_CONTESTED) {
2189 			suword32(&m->m_ceilings[0], ceiling);
2190 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2191 				UMUTEX_CONTESTED);
2192 			error = 0;
2193 			break;
2194 		}
2195 
2196 		/* The address was invalid. */
2197 		if (owner == -1) {
2198 			error = EFAULT;
2199 			break;
2200 		}
2201 
2202 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2203 			suword32(&m->m_ceilings[0], ceiling);
2204 			error = 0;
2205 			break;
2206 		}
2207 
2208 		/*
2209 		 * If a previous sleep was interrupted by a signal, we have
2210 		 * already retried once; exit immediately.
2211 		 */
2212 		if (error != 0)
2213 			break;
2214 
2215 		/*
2216 		 * The mutex is owned by another thread: sleep until the
2217 		 * holder releases it and wakes us, then retry the ceiling
2218 		 * update on the next loop iteration.
2219 		 */
2220 		umtxq_lock(&uq->uq_key);
2221 		umtxq_insert(uq);
2222 		umtxq_unbusy(&uq->uq_key);
2223 		error = umtxq_sleep(uq, "umtxpp", 0);
2224 		umtxq_remove(uq);
2225 		umtxq_unlock(&uq->uq_key);
2226 	}
2227 	umtxq_lock(&uq->uq_key);
2228 	if (error == 0)
2229 		umtxq_signal(&uq->uq_key, INT_MAX);
2230 	umtxq_unbusy(&uq->uq_key);
2231 	umtxq_unlock(&uq->uq_key);
2232 	umtx_key_release(&uq->uq_key);
2233 	if (error == 0 && old_ceiling != NULL)
2234 		suword32(old_ceiling, save_ceiling);
2235 	return (error);
2236 }
2237 
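/*
 * Dispatch a lock request to the normal, priority-inheritance or
 * priority-protect implementation according to the mutex flags.
 */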
2238 static int
2239 _do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2240 	int mode)
2241 {
2242 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2243 	case 0:
2244 		return (_do_lock_normal(td, m, flags, timo, mode));
2245 	case UMUTEX_PRIO_INHERIT:
2246 		return (_do_lock_pi(td, m, flags, timo, mode));
2247 	case UMUTEX_PRIO_PROTECT:
2248 		return (_do_lock_pp(td, m, flags, timo, mode));
2249 	}
2250 	return (EINVAL);
2251 }
2252 
2253 /*
2254  * Lock a userland POSIX mutex.
2255  */
2256 static int
2257 do_lock_umutex(struct thread *td, struct umutex *m,
2258 	struct timespec *timeout, int mode)
2259 {
2260 	struct timespec ts, ts2, ts3;
2261 	struct timeval tv;
2262 	uint32_t flags;
2263 	int error;
2264 
2265 	flags = fuword32(&m->m_flags);
2266 	if (flags == -1)
2267 		return (EFAULT);
2268 
2269 	if (timeout == NULL) {
2270 		error = _do_lock_umutex(td, m, flags, 0, mode);
2271 		/* Mutex locking is restarted if it is interrupted. */
2272 		if (error == EINTR && mode != _UMUTEX_WAIT)
2273 			error = ERESTART;
2274 	} else {
2275 		getnanouptime(&ts);
2276 		timespecadd(&ts, timeout);
2277 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2278 		for (;;) {
2279 			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
2280 			if (error != ETIMEDOUT)
2281 				break;
2282 			getnanouptime(&ts2);
2283 			if (timespeccmp(&ts2, &ts, >=)) {
2284 				error = ETIMEDOUT;
2285 				break;
2286 			}
2287 			ts3 = ts;
2288 			timespecsub(&ts3, &ts2);
2289 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2290 		}
2291 		/* Timed-locking is not restarted. */
2292 		if (error == ERESTART)
2293 			error = EINTR;
2294 	}
2295 	return (error);
2296 }
2297 
2298 /*
2299  * Unlock a userland POSIX mutex.
2300  */
2301 static int
2302 do_unlock_umutex(struct thread *td, struct umutex *m)
2303 {
2304 	uint32_t flags;
2305 
2306 	flags = fuword32(&m->m_flags);
2307 	if (flags == -1)
2308 		return (EFAULT);
2309 
2310 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2311 	case 0:
2312 		return (do_unlock_normal(td, m, flags));
2313 	case UMUTEX_PRIO_INHERIT:
2314 		return (do_unlock_pi(td, m, flags));
2315 	case UMUTEX_PRIO_PROTECT:
2316 		return (do_unlock_pp(td, m, flags));
2317 	}
2318 
2319 	return (EINVAL);
2320 }
2321 
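/*
 * Wait on a userland condition variable: register as a waiter while the
 * queue is busied, release the associated umutex, then sleep until
 * signalled, interrupted or timed out.
 */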
2322 static int
2323 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2324 	struct timespec *timeout, u_long wflags)
2325 {
2326 	struct umtx_q *uq;
2327 	struct timeval tv;
2328 	struct timespec cts, ets, tts;
2329 	uint32_t flags;
2330 	uint32_t clockid;
2331 	int error;
2332 
2333 	uq = td->td_umtxq;
2334 	flags = fuword32(&cv->c_flags);
2335 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2336 	if (error != 0)
2337 		return (error);
2338 
2339 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2340 		clockid = fuword32(&cv->c_clockid);
2341 		if (clockid < CLOCK_REALTIME ||
2342 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2343 			/* Only the predefined clock ids are usable here. */
2344 			return (EINVAL);
2345 		}
2346 	} else {
2347 		clockid = CLOCK_REALTIME;
2348 	}
2349 
2350 	umtxq_lock(&uq->uq_key);
2351 	umtxq_busy(&uq->uq_key);
2352 	umtxq_insert(uq);
2353 	umtxq_unlock(&uq->uq_key);
2354 
2355 	/*
2356 	 * Set c_has_waiters to 1 before releasing the user mutex, but
2357 	 * avoid dirtying the cache line when the flag is already set.
2358 	 */
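	/*
	 * Hypothetical sketch of the matching userland signal path, which
	 * relies on this flag to elide the syscall when nobody is waiting:
	 *
	 *	if (cv->c_has_waiters)
	 *		_umtx_op(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL);
	 */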
2359 	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2360 		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2361 
2362 	umtxq_lock(&uq->uq_key);
2363 	umtxq_unbusy(&uq->uq_key);
2364 	umtxq_unlock(&uq->uq_key);
2365 
2366 	error = do_unlock_umutex(td, m);
2367 
2368 	umtxq_lock(&uq->uq_key);
2369 	if (error == 0) {
2370 		if (timeout == NULL) {
2371 			error = umtxq_sleep(uq, "ucond", 0);
2372 		} else {
2373 			if ((wflags & CVWAIT_ABSTIME) == 0) {
2374 				kern_clock_gettime(td, clockid, &ets);
2375 				timespecadd(&ets, timeout);
2376 				tts = *timeout;
2377 			} else { /* absolute time */
2378 				ets = *timeout;
2379 				tts = *timeout;
2380 				kern_clock_gettime(td, clockid, &cts);
2381 				timespecsub(&tts, &cts);
2382 			}
2383 			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2384 			for (;;) {
2385 				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2386 				if (error != ETIMEDOUT)
2387 					break;
2388 				kern_clock_gettime(td, clockid, &cts);
2389 				if (timespeccmp(&cts, &ets, >=)) {
2390 					error = ETIMEDOUT;
2391 					break;
2392 				}
2393 				tts = ets;
2394 				timespecsub(&tts, &cts);
2395 				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2396 			}
2397 		}
2398 	}
2399 
2400 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2401 		error = 0;
2402 	else {
2403 		/*
2404 		 * This must be a timeout, an interruption by a signal, or
2405 		 * a spurious wakeup; clear the c_has_waiters flag when
2406 		 * necessary.
2407 		 */
2408 		umtxq_busy(&uq->uq_key);
2409 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2410 			int oldlen = uq->uq_cur_queue->length;
2411 			umtxq_remove(uq);
2412 			if (oldlen == 1) {
2413 				umtxq_unlock(&uq->uq_key);
2414 				suword32(
2415 				    __DEVOLATILE(uint32_t *,
2416 					 &cv->c_has_waiters), 0);
2417 				umtxq_lock(&uq->uq_key);
2418 			}
2419 		}
2420 		umtxq_unbusy(&uq->uq_key);
2421 		if (error == ERESTART)
2422 			error = EINTR;
2423 	}
2424 
2425 	umtxq_unlock(&uq->uq_key);
2426 	umtx_key_release(&uq->uq_key);
2427 	return (error);
2428 }
2429 
2430 /*
2431  * Signal a userland condition variable.
2432  */
2433 static int
2434 do_cv_signal(struct thread *td, struct ucond *cv)
2435 {
2436 	struct umtx_key key;
2437 	int error, cnt, nwake;
2438 	uint32_t flags;
2439 
2440 	flags = fuword32(&cv->c_flags);
2441 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2442 		return (error);
2443 	umtxq_lock(&key);
2444 	umtxq_busy(&key);
2445 	cnt = umtxq_count(&key);
2446 	nwake = umtxq_signal(&key, 1);
2447 	if (cnt <= nwake) {
2448 		umtxq_unlock(&key);
2449 		error = suword32(
2450 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2451 		umtxq_lock(&key);
2452 	}
2453 	umtxq_unbusy(&key);
2454 	umtxq_unlock(&key);
2455 	umtx_key_release(&key);
2456 	return (error);
2457 }
2458 
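/*
 * Broadcast a userland condition variable, waking all of its waiters.
 */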
2459 static int
2460 do_cv_broadcast(struct thread *td, struct ucond *cv)
2461 {
2462 	struct umtx_key key;
2463 	int error;
2464 	uint32_t flags;
2465 
2466 	flags = fuword32(&cv->c_flags);
2467 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2468 		return (error);
2469 
2470 	umtxq_lock(&key);
2471 	umtxq_busy(&key);
2472 	umtxq_signal(&key, INT_MAX);
2473 	umtxq_unlock(&key);
2474 
2475 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2476 
2477 	umtxq_lock(&key);
2478 	umtxq_unbusy(&key);
2479 	umtxq_unlock(&key);
2480 
2481 	umtx_key_release(&key);
2482 	return (error);
2483 }
2484 
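/*
 * The rw_state word packs the reader count into its low bits
 * (URWLOCK_READER_COUNT()) together with the URWLOCK_WRITE_OWNER,
 * URWLOCK_WRITE_WAITERS and URWLOCK_READ_WAITERS flag bits; every state
 * transition below is performed with casuword32() on that single word.
 */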
2485 static int
2486 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2487 {
2488 	struct umtx_q *uq;
2489 	uint32_t flags, wrflags;
2490 	int32_t state, oldstate;
2491 	int32_t blocked_readers;
2492 	int error;
2493 
2494 	uq = td->td_umtxq;
2495 	flags = fuword32(&rwlock->rw_flags);
2496 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2497 	if (error != 0)
2498 		return (error);
2499 
2500 	wrflags = URWLOCK_WRITE_OWNER;
2501 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2502 		wrflags |= URWLOCK_WRITE_WAITERS;
2503 
2504 	for (;;) {
2505 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2506 		/* try to lock it */
2507 		while (!(state & wrflags)) {
2508 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2509 				umtx_key_release(&uq->uq_key);
2510 				return (EAGAIN);
2511 			}
2512 			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2513 			if (oldstate == state) {
2514 				umtx_key_release(&uq->uq_key);
2515 				return (0);
2516 			}
2517 			state = oldstate;
2518 		}
2519 
2520 		if (error)
2521 			break;
2522 
2523 		/* grab monitor lock */
2524 		umtxq_lock(&uq->uq_key);
2525 		umtxq_busy(&uq->uq_key);
2526 		umtxq_unlock(&uq->uq_key);
2527 
2528 		/*
2529 		 * re-read the state, in case it changed between the try-lock above
2530 		 * and the check below
2531 		 */
2532 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2533 
2534 		/* set read contention bit */
2535 		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2536 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2537 			if (oldstate == state)
2538 				goto sleep;
2539 			state = oldstate;
2540 		}
2541 
2542 		/* The state changed while we were setting the flags; restart. */
2543 		if (!(state & wrflags)) {
2544 			umtxq_lock(&uq->uq_key);
2545 			umtxq_unbusy(&uq->uq_key);
2546 			umtxq_unlock(&uq->uq_key);
2547 			continue;
2548 		}
2549 
2550 sleep:
2551 		/* Contention bit is set; bump the read waiter count before sleeping. */
2552 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2553 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2554 
2555 		while (state & wrflags) {
2556 			umtxq_lock(&uq->uq_key);
2557 			umtxq_insert(uq);
2558 			umtxq_unbusy(&uq->uq_key);
2559 
2560 			error = umtxq_sleep(uq, "urdlck", timo);
2561 
2562 			umtxq_busy(&uq->uq_key);
2563 			umtxq_remove(uq);
2564 			umtxq_unlock(&uq->uq_key);
2565 			if (error)
2566 				break;
2567 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2568 		}
2569 
2570 		/* Decrease the read waiter count; the last waiter clears the contention bit. */
2571 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2572 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2573 		if (blocked_readers == 1) {
2574 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2575 			for (;;) {
2576 				oldstate = casuword32(&rwlock->rw_state, state,
2577 					 state & ~URWLOCK_READ_WAITERS);
2578 				if (oldstate == state)
2579 					break;
2580 				state = oldstate;
2581 			}
2582 		}
2583 
2584 		umtxq_lock(&uq->uq_key);
2585 		umtxq_unbusy(&uq->uq_key);
2586 		umtxq_unlock(&uq->uq_key);
2587 	}
2588 	umtx_key_release(&uq->uq_key);
2589 	return (error);
2590 }
2591 
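/*
 * Timed variant of do_rw_rdlock(): re-issue the sleep until either the
 * lock is acquired or the deadline computed from the uptime clock passes.
 */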
2592 static int
2593 do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2594 {
2595 	struct timespec ts, ts2, ts3;
2596 	struct timeval tv;
2597 	int error;
2598 
2599 	getnanouptime(&ts);
2600 	timespecadd(&ts, timeout);
2601 	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2602 	for (;;) {
2603 		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2604 		if (error != ETIMEDOUT)
2605 			break;
2606 		getnanouptime(&ts2);
2607 		if (timespeccmp(&ts2, &ts, >=)) {
2608 			error = ETIMEDOUT;
2609 			break;
2610 		}
2611 		ts3 = ts;
2612 		timespecsub(&ts3, &ts2);
2613 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2614 	}
2615 	if (error == ERESTART)
2616 		error = EINTR;
2617 	return (error);
2618 }
2619 
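/*
 * Acquire the write lock: take it directly while it is free, otherwise
 * advertise URWLOCK_WRITE_WAITERS and sleep on the exclusive queue.  On
 * failure, wake any blocked readers that no remaining writer would wake.
 */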
2620 static int
2621 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2622 {
2623 	struct umtx_q *uq;
2624 	uint32_t flags;
2625 	int32_t state, oldstate;
2626 	int32_t blocked_writers;
2627 	int32_t blocked_readers;
2628 	int error;
2629 
2630 	uq = td->td_umtxq;
2631 	flags = fuword32(&rwlock->rw_flags);
2632 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2633 	if (error != 0)
2634 		return (error);
2635 
2636 	blocked_readers = 0;
2637 	for (;;) {
2638 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2639 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2640 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2641 			if (oldstate == state) {
2642 				umtx_key_release(&uq->uq_key);
2643 				return (0);
2644 			}
2645 			state = oldstate;
2646 		}
2647 
2648 		if (error) {
2649 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2650 			    blocked_readers != 0) {
2651 				umtxq_lock(&uq->uq_key);
2652 				umtxq_busy(&uq->uq_key);
2653 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2654 				umtxq_unbusy(&uq->uq_key);
2655 				umtxq_unlock(&uq->uq_key);
2656 			}
2657 
2658 			break;
2659 		}
2660 
2661 		/* grab monitor lock */
2662 		umtxq_lock(&uq->uq_key);
2663 		umtxq_busy(&uq->uq_key);
2664 		umtxq_unlock(&uq->uq_key);
2665 
2666 		/*
2667 		 * re-read the state, in case it changed between the try-lock above
2668 		 * and the check below
2669 		 */
2670 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2671 
2672 		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2673 		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2674 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2675 			if (oldstate == state)
2676 				goto sleep;
2677 			state = oldstate;
2678 		}
2679 
2680 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2681 			umtxq_lock(&uq->uq_key);
2682 			umtxq_unbusy(&uq->uq_key);
2683 			umtxq_unlock(&uq->uq_key);
2684 			continue;
2685 		}
2686 sleep:
2687 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2688 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2689 
2690 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2691 			umtxq_lock(&uq->uq_key);
2692 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2693 			umtxq_unbusy(&uq->uq_key);
2694 
2695 			error = umtxq_sleep(uq, "uwrlck", timo);
2696 
2697 			umtxq_busy(&uq->uq_key);
2698 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2699 			umtxq_unlock(&uq->uq_key);
2700 			if (error)
2701 				break;
2702 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2703 		}
2704 
2705 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2706 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2707 		if (blocked_writers == 1) {
2708 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2709 			for (;;) {
2710 				oldstate = casuword32(&rwlock->rw_state, state,
2711 					 state & ~URWLOCK_WRITE_WAITERS);
2712 				if (oldstate == state)
2713 					break;
2714 				state = oldstate;
2715 			}
2716 			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2717 		} else
2718 			blocked_readers = 0;
2719 
2720 		umtxq_lock(&uq->uq_key);
2721 		umtxq_unbusy(&uq->uq_key);
2722 		umtxq_unlock(&uq->uq_key);
2723 	}
2724 
2725 	umtx_key_release(&uq->uq_key);
2726 	return (error);
2727 }
2728 
2729 static int
2730 do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2731 {
2732 	struct timespec ts, ts2, ts3;
2733 	struct timeval tv;
2734 	int error;
2735 
2736 	getnanouptime(&ts);
2737 	timespecadd(&ts, timeout);
2738 	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2739 	for (;;) {
2740 		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2741 		if (error != ETIMEDOUT)
2742 			break;
2743 		getnanouptime(&ts2);
2744 		if (timespeccmp(&ts2, &ts, >=)) {
2745 			error = ETIMEDOUT;
2746 			break;
2747 		}
2748 		ts3 = ts;
2749 		timespecsub(&ts3, &ts2);
2750 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2751 	}
2752 	if (error == ERESTART)
2753 		error = EINTR;
2754 	return (error);
2755 }
2756 
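/*
 * Release a userland rwlock held by the caller, either as the write
 * owner or as one of the readers, and wake the appropriate waiter
 * queue, honoring URWLOCK_PREFER_READER when choosing which side to wake.
 */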
2757 static int
2758 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2759 {
2760 	struct umtx_q *uq;
2761 	uint32_t flags;
2762 	int32_t state, oldstate;
2763 	int error, q, count;
2764 
2765 	uq = td->td_umtxq;
2766 	flags = fuword32(&rwlock->rw_flags);
2767 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2768 	if (error != 0)
2769 		return (error);
2770 
2771 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2772 	if (state & URWLOCK_WRITE_OWNER) {
2773 		for (;;) {
2774 			oldstate = casuword32(&rwlock->rw_state, state,
2775 				state & ~URWLOCK_WRITE_OWNER);
2776 			if (oldstate != state) {
2777 				state = oldstate;
2778 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2779 					error = EPERM;
2780 					goto out;
2781 				}
2782 			} else
2783 				break;
2784 		}
2785 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2786 		for (;;) {
2787 			oldstate = casuword32(&rwlock->rw_state, state,
2788 				state - 1);
2789 			if (oldstate != state) {
2790 				state = oldstate;
2791 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2792 					error = EPERM;
2793 					goto out;
2794 				}
2795 			}
2796 			else
2797 				break;
2798 		}
2799 	} else {
2800 		error = EPERM;
2801 		goto out;
2802 	}
2803 
2804 	count = 0;
2805 
2806 	if (!(flags & URWLOCK_PREFER_READER)) {
2807 		if (state & URWLOCK_WRITE_WAITERS) {
2808 			count = 1;
2809 			q = UMTX_EXCLUSIVE_QUEUE;
2810 		} else if (state & URWLOCK_READ_WAITERS) {
2811 			count = INT_MAX;
2812 			q = UMTX_SHARED_QUEUE;
2813 		}
2814 	} else {
2815 		if (state & URWLOCK_READ_WAITERS) {
2816 			count = INT_MAX;
2817 			q = UMTX_SHARED_QUEUE;
2818 		} else if (state & URWLOCK_WRITE_WAITERS) {
2819 			count = 1;
2820 			q = UMTX_EXCLUSIVE_QUEUE;
2821 		}
2822 	}
2823 
2824 	if (count) {
2825 		umtxq_lock(&uq->uq_key);
2826 		umtxq_busy(&uq->uq_key);
2827 		umtxq_signal_queue(&uq->uq_key, count, q);
2828 		umtxq_unbusy(&uq->uq_key);
2829 		umtxq_unlock(&uq->uq_key);
2830 	}
2831 out:
2832 	umtx_key_release(&uq->uq_key);
2833 	return (error);
2834 }
2835 
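/*
 * Wait for the userland semaphore's _count to become non-zero.  The
 * waiter enqueues itself and sets _has_waiters before re-checking
 * _count, mirroring the wakeup-side check in do_sem_wake().
 */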
2836 static int
2837 do_sem_wait(struct thread *td, struct _usem *sem, struct timespec *timeout)
2838 {
2839 	struct umtx_q *uq;
2840 	struct timeval tv;
2841 	struct timespec cts, ets, tts;
2842 	uint32_t flags, count;
2843 	int error;
2844 
2845 	uq = td->td_umtxq;
2846 	flags = fuword32(&sem->_flags);
2847 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2848 	if (error != 0)
2849 		return (error);
2850 	umtxq_lock(&uq->uq_key);
2851 	umtxq_busy(&uq->uq_key);
2852 	umtxq_insert(uq);
2853 	umtxq_unlock(&uq->uq_key);
2854 
2855 	if (fuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters)) == 0)
2856 		casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2857 
2858 	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2859 	if (count != 0) {
2860 		umtxq_lock(&uq->uq_key);
2861 		umtxq_unbusy(&uq->uq_key);
2862 		umtxq_remove(uq);
2863 		umtxq_unlock(&uq->uq_key);
2864 		umtx_key_release(&uq->uq_key);
2865 		return (0);
2866 	}
2867 
2868 	umtxq_lock(&uq->uq_key);
2869 	umtxq_unbusy(&uq->uq_key);
2870 	umtxq_unlock(&uq->uq_key);
2871 
2872 	umtxq_lock(&uq->uq_key);
2873 	if (timeout == NULL) {
2874 		error = umtxq_sleep(uq, "usem", 0);
2875 	} else {
2876 		getnanouptime(&ets);
2877 		timespecadd(&ets, timeout);
2878 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2879 		for (;;) {
2880 			error = umtxq_sleep(uq, "usem", tvtohz(&tv));
2881 			if (error != ETIMEDOUT)
2882 				break;
2883 			getnanouptime(&cts);
2884 			if (timespeccmp(&cts, &ets, >=)) {
2885 				error = ETIMEDOUT;
2886 				break;
2887 			}
2888 			tts = ets;
2889 			timespecsub(&tts, &cts);
2890 			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2891 		}
2892 	}
2893 
2894 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2895 		error = 0;
2896 	else {
2897 		umtxq_remove(uq);
2898 		if (error == ERESTART)
2899 			error = EINTR;
2900 	}
2901 	umtxq_unlock(&uq->uq_key);
2902 	umtx_key_release(&uq->uq_key);
2903 	return (error);
2904 }
2905 
2906 /*
2907  * Wake up a waiter on a userland semaphore.
2908  */
2909 static int
2910 do_sem_wake(struct thread *td, struct _usem *sem)
2911 {
2912 	struct umtx_key key;
2913 	int error, cnt, nwake;
2914 	uint32_t flags;
2915 
2916 	flags = fuword32(&sem->_flags);
2917 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2918 		return (error);
2919 	umtxq_lock(&key);
2920 	umtxq_busy(&key);
2921 	cnt = umtxq_count(&key);
2922 	nwake = umtxq_signal(&key, 1);
2923 	if (cnt <= nwake) {
2924 		umtxq_unlock(&key);
2925 		error = suword32(
2926 		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2927 		umtxq_lock(&key);
2928 	}
2929 	umtxq_unbusy(&key);
2930 	umtxq_unlock(&key);
2931 	umtx_key_release(&key);
2932 	return (error);
2933 }
2934 
2935 int
2936 _umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2937     /* struct umtx *umtx */
2938 {
2939 	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2940 }
2941 
2942 int
2943 _umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2944     /* struct umtx *umtx */
2945 {
2946 	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2947 }
2948 
2949 static int
2950 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2951 {
2952 	struct timespec *ts, timeout;
2953 	int error;
2954 
2955 	/* Allow a null timespec (wait forever). */
2956 	if (uap->uaddr2 == NULL)
2957 		ts = NULL;
2958 	else {
2959 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2960 		if (error != 0)
2961 			return (error);
2962 		if (timeout.tv_nsec >= 1000000000 ||
2963 		    timeout.tv_nsec < 0) {
2964 			return (EINVAL);
2965 		}
2966 		ts = &timeout;
2967 	}
2968 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2969 }
2970 
2971 static int
2972 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2973 {
2974 	return (do_unlock_umtx(td, uap->obj, uap->val));
2975 }
2976 
2977 static int
2978 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2979 {
2980 	struct timespec *ts, timeout;
2981 	int error;
2982 
2983 	if (uap->uaddr2 == NULL)
2984 		ts = NULL;
2985 	else {
2986 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2987 		if (error != 0)
2988 			return (error);
2989 		if (timeout.tv_nsec >= 1000000000 ||
2990 		    timeout.tv_nsec < 0)
2991 			return (EINVAL);
2992 		ts = &timeout;
2993 	}
2994 	return do_wait(td, uap->obj, uap->val, ts, 0, 0);
2995 }
2996 
2997 static int
2998 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2999 {
3000 	struct timespec *ts, timeout;
3001 	int error;
3002 
3003 	if (uap->uaddr2 == NULL)
3004 		ts = NULL;
3005 	else {
3006 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
3007 		if (error != 0)
3008 			return (error);
3009 		if (timeout.tv_nsec >= 1000000000 ||
3010 		    timeout.tv_nsec < 0)
3011 			return (EINVAL);
3012 		ts = &timeout;
3013 	}
3014 	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
3015 }
3016 
3017 static int
3018 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3019 {
3020 	struct timespec *ts, timeout;
3021 	int error;
3022 
3023 	if (uap->uaddr2 == NULL)
3024 		ts = NULL;
3025 	else {
3026 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
3027 		if (error != 0)
3028 			return (error);
3029 		if (timeout.tv_nsec >= 1000000000 ||
3030 		    timeout.tv_nsec < 0)
3031 			return (EINVAL);
3032 		ts = &timeout;
3033 	}
3034 	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
3035 }
3036 
3037 static int
3038 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3039 {
3040 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3041 }
3042 
3043 #define BATCH_SIZE	128
3044 static int
3045 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3046 {
3047 	int count = uap->val;
3048 	void *uaddrs[BATCH_SIZE];
3049 	char **upp = (char **)uap->obj;
3050 	int tocopy;
3051 	int error = 0;
3052 	int i, pos = 0;
3053 
3054 	while (count > 0) {
3055 		tocopy = count;
3056 		if (tocopy > BATCH_SIZE)
3057 			tocopy = BATCH_SIZE;
3058 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3059 		if (error != 0)
3060 			break;
3061 		for (i = 0; i < tocopy; ++i)
3062 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3063 		count -= tocopy;
3064 		pos += tocopy;
3065 	}
3066 	return (error);
3067 }
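
/*
 * Hypothetical userland use of the batched wakeup above: a thread
 * library can queue deferred wakeups and flush them with one syscall,
 * e.g.
 *
 *	static void *pending[128];
 *	...
 *	_umtx_op(pending, UMTX_OP_NWAKE_PRIVATE, npending, NULL, NULL);
 */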
3068 
3069 static int
3070 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3071 {
3072 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3073 }
3074 
3075 static int
3076 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3077 {
3078 	struct timespec *ts, timeout;
3079 	int error;
3080 
3081 	/* Allow a null timespec (wait forever). */
3082 	if (uap->uaddr2 == NULL)
3083 		ts = NULL;
3084 	else {
3085 		error = copyin(uap->uaddr2, &timeout,
3086 		    sizeof(timeout));
3087 		if (error != 0)
3088 			return (error);
3089 		if (timeout.tv_nsec >= 1000000000 ||
3090 		    timeout.tv_nsec < 0) {
3091 			return (EINVAL);
3092 		}
3093 		ts = &timeout;
3094 	}
3095 	return do_lock_umutex(td, uap->obj, ts, 0);
3096 }
3097 
3098 static int
3099 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3100 {
3101 	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3102 }
3103 
3104 static int
3105 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3106 {
3107 	struct timespec *ts, timeout;
3108 	int error;
3109 
3110 	/* Allow a null timespec (wait forever). */
3111 	if (uap->uaddr2 == NULL)
3112 		ts = NULL;
3113 	else {
3114 		error = copyin(uap->uaddr2, &timeout,
3115 		    sizeof(timeout));
3116 		if (error != 0)
3117 			return (error);
3118 		if (timeout.tv_nsec >= 1000000000 ||
3119 		    timeout.tv_nsec < 0) {
3120 			return (EINVAL);
3121 		}
3122 		ts = &timeout;
3123 	}
3124 	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
3125 }
3126 
3127 static int
3128 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3129 {
3130 	return do_wake_umutex(td, uap->obj);
3131 }
3132 
3133 static int
3134 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3135 {
3136 	return do_unlock_umutex(td, uap->obj);
3137 }
3138 
3139 static int
3140 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3141 {
3142 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3143 }
3144 
3145 static int
3146 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3147 {
3148 	struct timespec *ts, timeout;
3149 	int error;
3150 
3151 	/* Allow a null timespec (wait forever). */
3152 	if (uap->uaddr2 == NULL)
3153 		ts = NULL;
3154 	else {
3155 		error = copyin(uap->uaddr2, &timeout,
3156 		    sizeof(timeout));
3157 		if (error != 0)
3158 			return (error);
3159 		if (timeout.tv_nsec >= 1000000000 ||
3160 		    timeout.tv_nsec < 0) {
3161 			return (EINVAL);
3162 		}
3163 		ts = &timeout;
3164 	}
3165 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3166 }
3167 
3168 static int
3169 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3170 {
3171 	return do_cv_signal(td, uap->obj);
3172 }
3173 
3174 static int
3175 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3176 {
3177 	return do_cv_broadcast(td, uap->obj);
3178 }
3179 
3180 static int
3181 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3182 {
3183 	struct timespec timeout;
3184 	int error;
3185 
3186 	/* Allow a null timespec (wait forever). */
3187 	if (uap->uaddr2 == NULL) {
3188 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3189 	} else {
3190 		error = copyin(uap->uaddr2, &timeout,
3191 		    sizeof(timeout));
3192 		if (error != 0)
3193 			return (error);
3194 		if (timeout.tv_nsec >= 1000000000 ||
3195 		    timeout.tv_nsec < 0) {
3196 			return (EINVAL);
3197 		}
3198 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3199 	}
3200 	return (error);
3201 }
3202 
3203 static int
3204 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3205 {
3206 	struct timespec timeout;
3207 	int error;
3208 
3209 	/* Allow a null timespec (wait forever). */
3210 	if (uap->uaddr2 == NULL) {
3211 		error = do_rw_wrlock(td, uap->obj, 0);
3212 	} else {
3213 		error = copyin(uap->uaddr2, &timeout,
3214 		    sizeof(timeout));
3215 		if (error != 0)
3216 			return (error);
3217 		if (timeout.tv_nsec >= 1000000000 ||
3218 		    timeout.tv_nsec < 0) {
3219 			return (EINVAL);
3220 		}
3221 
3222 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3223 	}
3224 	return (error);
3225 }
3226 
3227 static int
3228 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3229 {
3230 	return do_rw_unlock(td, uap->obj);
3231 }
3232 
3233 static int
3234 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3235 {
3236 	struct timespec *ts, timeout;
3237 	int error;
3238 
3239 	/* Allow a null timespec (wait forever). */
3240 	if (uap->uaddr2 == NULL)
3241 		ts = NULL;
3242 	else {
3243 		error = copyin(uap->uaddr2, &timeout,
3244 		    sizeof(timeout));
3245 		if (error != 0)
3246 			return (error);
3247 		if (timeout.tv_nsec >= 1000000000 ||
3248 		    timeout.tv_nsec < 0) {
3249 			return (EINVAL);
3250 		}
3251 		ts = &timeout;
3252 	}
3253 	return (do_sem_wait(td, uap->obj, ts));
3254 }
3255 
3256 static int
3257 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3258 {
3259 	return do_sem_wake(td, uap->obj);
3260 }
3261 
3262 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3263 
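/*
 * Operation dispatch table, indexed by the UMTX_OP_* constant and
 * range-checked in _umtx_op() below.
 */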
3264 static _umtx_op_func op_table[] = {
3265 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3266 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3267 	__umtx_op_wait,			/* UMTX_OP_WAIT */
3268 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3269 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3270 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3271 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3272 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3273 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3274 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3275 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3276 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3277 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3278 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3279 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3280 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3281 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3282 	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3283 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3284 	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3285 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3286 	__umtx_op_nwake_private		/* UMTX_OP_NWAKE_PRIVATE */
3287 };
3288 
3289 int
3290 _umtx_op(struct thread *td, struct _umtx_op_args *uap)
3291 {
3292 	if ((unsigned)uap->op < UMTX_OP_MAX)
3293 		return (*op_table[uap->op])(td, uap);
3294 	return (EINVAL);
3295 }
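
/*
 * Illustrative (hypothetical) userland invocation: locking a umutex
 * with a one-second relative timeout passes the timespec via uaddr2:
 *
 *	struct timespec ts = { 1, 0 };
 *	_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, &ts);
 */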
3296 
3297 #ifdef COMPAT_FREEBSD32
3298 int
3299 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3300     /* struct umtx *umtx */
3301 {
3302 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3303 }
3304 
3305 int
3306 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3307     /* struct umtx *umtx */
3308 {
3309 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3310 }
3311 
3312 struct timespec32 {
3313 	uint32_t tv_sec;
3314 	uint32_t tv_nsec;
3315 };
3316 
3317 static inline int
3318 copyin_timeout32(void *addr, struct timespec *tsp)
3319 {
3320 	struct timespec32 ts32;
3321 	int error;
3322 
3323 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3324 	if (error == 0) {
3325 		tsp->tv_sec = ts32.tv_sec;
3326 		tsp->tv_nsec = ts32.tv_nsec;
3327 	}
3328 	return (error);
3329 }
3330 
3331 static int
3332 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3333 {
3334 	struct timespec *ts, timeout;
3335 	int error;
3336 
3337 	/* Allow a null timespec (wait forever). */
3338 	if (uap->uaddr2 == NULL)
3339 		ts = NULL;
3340 	else {
3341 		error = copyin_timeout32(uap->uaddr2, &timeout);
3342 		if (error != 0)
3343 			return (error);
3344 		if (timeout.tv_nsec >= 1000000000 ||
3345 		    timeout.tv_nsec < 0) {
3346 			return (EINVAL);
3347 		}
3348 		ts = &timeout;
3349 	}
3350 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3351 }
3352 
3353 static int
3354 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3355 {
3356 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3357 }
3358 
3359 static int
3360 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3361 {
3362 	struct timespec *ts, timeout;
3363 	int error;
3364 
3365 	if (uap->uaddr2 == NULL)
3366 		ts = NULL;
3367 	else {
3368 		error = copyin_timeout32(uap->uaddr2, &timeout);
3369 		if (error != 0)
3370 			return (error);
3371 		if (timeout.tv_nsec >= 1000000000 ||
3372 		    timeout.tv_nsec < 0)
3373 			return (EINVAL);
3374 		ts = &timeout;
3375 	}
3376 	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
3377 }
3378 
3379 static int
3380 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3381 {
3382 	struct timespec *ts, timeout;
3383 	int error;
3384 
3385 	/* Allow a null timespec (wait forever). */
3386 	if (uap->uaddr2 == NULL)
3387 		ts = NULL;
3388 	else {
3389 		error = copyin_timeout32(uap->uaddr2, &timeout);
3390 		if (error != 0)
3391 			return (error);
3392 		if (timeout.tv_nsec >= 1000000000 ||
3393 		    timeout.tv_nsec < 0)
3394 			return (EINVAL);
3395 		ts = &timeout;
3396 	}
3397 	return do_lock_umutex(td, uap->obj, ts, 0);
3398 }
3399 
3400 static int
3401 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3402 {
3403 	struct timespec *ts, timeout;
3404 	int error;
3405 
3406 	/* Allow a null timespec (wait forever). */
3407 	if (uap->uaddr2 == NULL)
3408 		ts = NULL;
3409 	else {
3410 		error = copyin_timeout32(uap->uaddr2, &timeout);
3411 		if (error != 0)
3412 			return (error);
3413 		if (timeout.tv_nsec >= 1000000000 ||
3414 		    timeout.tv_nsec < 0)
3415 			return (EINVAL);
3416 		ts = &timeout;
3417 	}
3418 	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
3419 }
3420 
3421 static int
3422 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3423 {
3424 	struct timespec *ts, timeout;
3425 	int error;
3426 
3427 	/* Allow a null timespec (wait forever). */
3428 	if (uap->uaddr2 == NULL)
3429 		ts = NULL;
3430 	else {
3431 		error = copyin_timeout32(uap->uaddr2, &timeout);
3432 		if (error != 0)
3433 			return (error);
3434 		if (timeout.tv_nsec >= 1000000000 ||
3435 		    timeout.tv_nsec < 0)
3436 			return (EINVAL);
3437 		ts = &timeout;
3438 	}
3439 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3440 }
3441 
3442 static int
3443 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3444 {
3445 	struct timespec timeout;
3446 	int error;
3447 
3448 	/* Allow a null timespec (wait forever). */
3449 	if (uap->uaddr2 == NULL) {
3450 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3451 	} else {
3452 		error = copyin_timeout32(uap->uaddr2, &timeout);
3453 		if (error != 0)
3454 			return (error);
3455 		if (timeout.tv_nsec >= 1000000000 ||
3456 		    timeout.tv_nsec < 0) {
3457 			return (EINVAL);
3458 		}
3459 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3460 	}
3461 	return (error);
3462 }
3463 
3464 static int
3465 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3466 {
3467 	struct timespec timeout;
3468 	int error;
3469 
3470 	/* Allow a null timespec (wait forever). */
3471 	if (uap->uaddr2 == NULL) {
3472 		error = do_rw_wrlock(td, uap->obj, 0);
3473 	} else {
3474 		error = copyin_timeout32(uap->uaddr2, &timeout);
3475 		if (error != 0)
3476 			return (error);
3477 		if (timeout.tv_nsec >= 1000000000 ||
3478 		    timeout.tv_nsec < 0) {
3479 			return (EINVAL);
3480 		}
3481 
3482 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3483 	}
3484 	return (error);
3485 }
3486 
3487 static int
3488 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3489 {
3490 	struct timespec *ts, timeout;
3491 	int error;
3492 
3493 	if (uap->uaddr2 == NULL)
3494 		ts = NULL;
3495 	else {
3496 		error = copyin_timeout32(uap->uaddr2, &timeout);
3497 		if (error != 0)
3498 			return (error);
3499 		if (timeout.tv_nsec >= 1000000000 ||
3500 		    timeout.tv_nsec < 0)
3501 			return (EINVAL);
3502 		ts = &timeout;
3503 	}
3504 	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
3505 }
3506 
3507 static int
3508 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3509 {
3510 	struct timespec *ts, timeout;
3511 	int error;
3512 
3513 	/* Allow a null timespec (wait forever). */
3514 	if (uap->uaddr2 == NULL)
3515 		ts = NULL;
3516 	else {
3517 		error = copyin_timeout32(uap->uaddr2, &timeout);
3518 		if (error != 0)
3519 			return (error);
3520 		if (timeout.tv_nsec >= 1000000000 ||
3521 		    timeout.tv_nsec < 0)
3522 			return (EINVAL);
3523 		ts = &timeout;
3524 	}
3525 	return (do_sem_wait(td, uap->obj, ts));
3526 }
3527 
3528 static int
3529 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3530 {
3531 	int count = uap->val;
3532 	uint32_t uaddrs[BATCH_SIZE];
3533 	uint32_t *upp = (uint32_t *)uap->obj;	/* array of 32-bit user addresses */
3534 	int tocopy;
3535 	int error = 0;
3536 	int i, pos = 0;
3537 
3538 	while (count > 0) {
3539 		tocopy = count;
3540 		if (tocopy > BATCH_SIZE)
3541 			tocopy = BATCH_SIZE;
3542 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3543 		if (error != 0)
3544 			break;
3545 		for (i = 0; i < tocopy; ++i)
3546 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3547 				INT_MAX, 1);
3548 		count -= tocopy;
3549 		pos += tocopy;
3550 	}
3551 	return (error);
3552 }
3553 
3554 static _umtx_op_func op_table_compat32[] = {
3555 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3556 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3557 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3558 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3559 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3560 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3561 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3562 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3563 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3564 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3565 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3566 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3567 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3568 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3569 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3570 	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3571 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3572 	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3573 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3574 	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3575 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3576 	__umtx_op_nwake_private32	/* UMTX_OP_NWAKE_PRIVATE */
3577 };
3578 
3579 int
3580 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3581 {
3582 	if ((unsigned)uap->op < UMTX_OP_MAX)
3583 		return (*op_table_compat32[uap->op])(td,
3584 			(struct _umtx_op_args *)uap);
3585 	return (EINVAL);
3586 }
3587 #endif
3588 
3589 void
3590 umtx_thread_init(struct thread *td)
3591 {
3592 	td->td_umtxq = umtxq_alloc();
3593 	td->td_umtxq->uq_thread = td;
3594 }
3595 
3596 void
3597 umtx_thread_fini(struct thread *td)
3598 {
3599 	umtxq_free(td->td_umtxq);
3600 }
3601 
3602 /*
3603  * Called when a new thread is created, e.g. by fork().
3604  */
3605 void
3606 umtx_thread_alloc(struct thread *td)
3607 {
3608 	struct umtx_q *uq;
3609 
3610 	uq = td->td_umtxq;
3611 	uq->uq_inherited_pri = PRI_MAX;
3612 
3613 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3614 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3615 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3616 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3617 }
3618 
3619 /*
3620  * exec() hook.
3621  */
3622 static void
3623 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3624 	struct image_params *imgp __unused)
3625 {
3626 	umtx_thread_cleanup(curthread);
3627 }
3628 
3629 /*
3630  * thread_exit() hook.
3631  */
3632 void
3633 umtx_thread_exit(struct thread *td)
3634 {
3635 	umtx_thread_cleanup(td);
3636 }
3637 
3638 /*
3639  * Clean up umtx data: disown contested PI mutexes and drop lent priority.
3640  */
3641 static void
3642 umtx_thread_cleanup(struct thread *td)
3643 {
3644 	struct umtx_q *uq;
3645 	struct umtx_pi *pi;
3646 
3647 	if ((uq = td->td_umtxq) == NULL)
3648 		return;
3649 
3650 	mtx_lock_spin(&umtx_lock);
3651 	uq->uq_inherited_pri = PRI_MAX;
3652 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3653 		pi->pi_owner = NULL;
3654 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3655 	}
3656 	mtx_unlock_spin(&umtx_lock);
3657 	thread_lock(td);
3658 	sched_lend_user_prio(td, PRI_MAX);
3659 	thread_unlock(td);
3660 }
3661