xref: /freebsd/sys/kern/kern_umtx.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/mutex.h>
38 #include <sys/priv.h>
39 #include <sys/proc.h>
40 #include <sys/sched.h>
41 #include <sys/smp.h>
42 #include <sys/sysctl.h>
43 #include <sys/sysent.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/syscallsubr.h>
47 #include <sys/eventhandler.h>
48 #include <sys/umtx.h>
49 
50 #include <vm/vm.h>
51 #include <vm/vm_param.h>
52 #include <vm/pmap.h>
53 #include <vm/vm_map.h>
54 #include <vm/vm_object.h>
55 
56 #include <machine/cpu.h>
57 
58 #ifdef COMPAT_FREEBSD32
59 #include <compat/freebsd32/freebsd32_proto.h>
60 #endif
61 
62 #define _UMUTEX_TRY		1
63 #define _UMUTEX_WAIT		2
64 
65 /* Priority inheritance mutex info. */
66 struct umtx_pi {
67 	/* Owner thread */
68 	struct thread		*pi_owner;
69 
70 	/* Reference count */
71 	int			pi_refcount;
72 
73  	/* List entry to link umtx holding by thread */
74 	TAILQ_ENTRY(umtx_pi)	pi_link;
75 
76 	/* List entry in hash */
77 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
78 
79 	/* List for waiters */
80 	TAILQ_HEAD(,umtx_q)	pi_blocked;
81 
82 	/* Identify a userland lock object */
83 	struct umtx_key		pi_key;
84 };
85 
86 /* A userland synchronous object user. */
87 struct umtx_q {
88 	/* Linked list for the hash. */
89 	TAILQ_ENTRY(umtx_q)	uq_link;
90 
91 	/* Umtx key. */
92 	struct umtx_key		uq_key;
93 
94 	/* Umtx flags. */
95 	int			uq_flags;
96 #define UQF_UMTXQ	0x0001
97 
98 	/* The thread waits on. */
99 	struct thread		*uq_thread;
100 
101 	/*
102 	 * Blocked on PI mutex. read can use chain lock
103 	 * or umtx_lock, write must have both chain lock and
104 	 * umtx_lock being hold.
105 	 */
106 	struct umtx_pi		*uq_pi_blocked;
107 
108 	/* On blocked list */
109 	TAILQ_ENTRY(umtx_q)	uq_lockq;
110 
111 	/* Thread contending with us */
112 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
113 
114 	/* Inherited priority from PP mutex */
115 	u_char			uq_inherited_pri;
116 
117 	/* Spare queue ready to be reused */
118 	struct umtxq_queue	*uq_spare_queue;
119 
120 	/* The queue we on */
121 	struct umtxq_queue	*uq_cur_queue;
122 };
123 
124 TAILQ_HEAD(umtxq_head, umtx_q);
125 
126 /* Per-key wait-queue */
127 struct umtxq_queue {
128 	struct umtxq_head	head;
129 	struct umtx_key		key;
130 	LIST_ENTRY(umtxq_queue)	link;
131 	int			length;
132 };
133 
134 LIST_HEAD(umtxq_list, umtxq_queue);
135 
136 /* Userland lock object's wait-queue chain */
137 struct umtxq_chain {
138 	/* Lock for this chain. */
139 	struct mtx		uc_lock;
140 
141 	/* List of sleep queues. */
142 	struct umtxq_list	uc_queue[2];
143 #define UMTX_SHARED_QUEUE	0
144 #define UMTX_EXCLUSIVE_QUEUE	1
145 
146 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
147 
148 	/* Busy flag */
149 	char			uc_busy;
150 
151 	/* Chain lock waiters */
152 	int			uc_waiters;
153 
154 	/* All PI in the list */
155 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
156 
157 };
158 
159 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
160 #define	UMTXQ_BUSY_ASSERT(uc)	KASSERT(&(uc)->uc_busy, ("umtx chain is not busy"))
161 
162 /*
163  * Don't propagate time-sharing priority, there is a security reason,
164  * a user can simply introduce PI-mutex, let thread A lock the mutex,
165  * and let another thread B block on the mutex, because B is
166  * sleeping, its priority will be boosted, this causes A's priority to
167  * be boosted via priority propagating too and will never be lowered even
168  * if it is using 100%CPU, this is unfair to other processes.
169  */
170 
171 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
172 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
173 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
174 
175 #define	GOLDEN_RATIO_PRIME	2654404609U
176 #define	UMTX_CHAINS		512
177 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
178 
179 #define	GET_SHARE(flags)	\
180     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
181 
182 #define BUSY_SPINS		200
183 
184 static uma_zone_t		umtx_pi_zone;
185 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
186 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
187 static int			umtx_pi_allocated;
188 
189 SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
190 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
191     &umtx_pi_allocated, 0, "Allocated umtx_pi");
192 
193 static void umtxq_sysinit(void *);
194 static void umtxq_hash(struct umtx_key *key);
195 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
196 static void umtxq_lock(struct umtx_key *key);
197 static void umtxq_unlock(struct umtx_key *key);
198 static void umtxq_busy(struct umtx_key *key);
199 static void umtxq_unbusy(struct umtx_key *key);
200 static void umtxq_insert_queue(struct umtx_q *uq, int q);
201 static void umtxq_remove_queue(struct umtx_q *uq, int q);
202 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
203 static int umtxq_count(struct umtx_key *key);
204 static struct umtx_pi *umtx_pi_alloc(int);
205 static void umtx_pi_free(struct umtx_pi *pi);
206 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
207 static void umtx_thread_cleanup(struct thread *td);
208 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
209 	struct image_params *imgp __unused);
210 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
211 
212 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
213 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
214 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
215 
216 static struct mtx umtx_lock;
217 
218 static void
219 umtxq_sysinit(void *arg __unused)
220 {
221 	int i, j;
222 
223 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
224 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
225 	for (i = 0; i < 2; ++i) {
226 		for (j = 0; j < UMTX_CHAINS; ++j) {
227 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
228 				 MTX_DEF | MTX_DUPOK);
229 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
230 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
231 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
232 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
233 			umtxq_chains[i][j].uc_busy = 0;
234 			umtxq_chains[i][j].uc_waiters = 0;
235 		}
236 	}
237 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
238 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
239 	    EVENTHANDLER_PRI_ANY);
240 }
241 
242 struct umtx_q *
243 umtxq_alloc(void)
244 {
245 	struct umtx_q *uq;
246 
247 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
248 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
249 	TAILQ_INIT(&uq->uq_spare_queue->head);
250 	TAILQ_INIT(&uq->uq_pi_contested);
251 	uq->uq_inherited_pri = PRI_MAX;
252 	return (uq);
253 }
254 
255 void
256 umtxq_free(struct umtx_q *uq)
257 {
258 	MPASS(uq->uq_spare_queue != NULL);
259 	free(uq->uq_spare_queue, M_UMTX);
260 	free(uq, M_UMTX);
261 }
262 
263 static inline void
264 umtxq_hash(struct umtx_key *key)
265 {
266 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
267 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
268 }
269 
270 static inline struct umtxq_chain *
271 umtxq_getchain(struct umtx_key *key)
272 {
273 	if (key->type <= TYPE_SEM)
274 		return (&umtxq_chains[1][key->hash]);
275 	return (&umtxq_chains[0][key->hash]);
276 }
277 
278 /*
279  * Lock a chain.
280  */
281 static inline void
282 umtxq_lock(struct umtx_key *key)
283 {
284 	struct umtxq_chain *uc;
285 
286 	uc = umtxq_getchain(key);
287 	mtx_lock(&uc->uc_lock);
288 }
289 
290 /*
291  * Unlock a chain.
292  */
293 static inline void
294 umtxq_unlock(struct umtx_key *key)
295 {
296 	struct umtxq_chain *uc;
297 
298 	uc = umtxq_getchain(key);
299 	mtx_unlock(&uc->uc_lock);
300 }
301 
302 /*
303  * Set chain to busy state when following operation
304  * may be blocked (kernel mutex can not be used).
305  */
306 static inline void
307 umtxq_busy(struct umtx_key *key)
308 {
309 	struct umtxq_chain *uc;
310 
311 	uc = umtxq_getchain(key);
312 	mtx_assert(&uc->uc_lock, MA_OWNED);
313 	if (uc->uc_busy) {
314 #ifdef SMP
315 		if (smp_cpus > 1) {
316 			int count = BUSY_SPINS;
317 			if (count > 0) {
318 				umtxq_unlock(key);
319 				while (uc->uc_busy && --count > 0)
320 					cpu_spinwait();
321 				umtxq_lock(key);
322 			}
323 		}
324 #endif
325 		while (uc->uc_busy) {
326 			uc->uc_waiters++;
327 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
328 			uc->uc_waiters--;
329 		}
330 	}
331 	uc->uc_busy = 1;
332 }
333 
334 /*
335  * Unbusy a chain.
336  */
337 static inline void
338 umtxq_unbusy(struct umtx_key *key)
339 {
340 	struct umtxq_chain *uc;
341 
342 	uc = umtxq_getchain(key);
343 	mtx_assert(&uc->uc_lock, MA_OWNED);
344 	KASSERT(uc->uc_busy != 0, ("not busy"));
345 	uc->uc_busy = 0;
346 	if (uc->uc_waiters)
347 		wakeup_one(uc);
348 }
349 
350 static struct umtxq_queue *
351 umtxq_queue_lookup(struct umtx_key *key, int q)
352 {
353 	struct umtxq_queue *uh;
354 	struct umtxq_chain *uc;
355 
356 	uc = umtxq_getchain(key);
357 	UMTXQ_LOCKED_ASSERT(uc);
358 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
359 		if (umtx_key_match(&uh->key, key))
360 			return (uh);
361 	}
362 
363 	return (NULL);
364 }
365 
366 static inline void
367 umtxq_insert_queue(struct umtx_q *uq, int q)
368 {
369 	struct umtxq_queue *uh;
370 	struct umtxq_chain *uc;
371 
372 	uc = umtxq_getchain(&uq->uq_key);
373 	UMTXQ_LOCKED_ASSERT(uc);
374 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
375 	uh = umtxq_queue_lookup(&uq->uq_key, q);
376 	if (uh != NULL) {
377 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
378 	} else {
379 		uh = uq->uq_spare_queue;
380 		uh->key = uq->uq_key;
381 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
382 	}
383 	uq->uq_spare_queue = NULL;
384 
385 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
386 	uh->length++;
387 	uq->uq_flags |= UQF_UMTXQ;
388 	uq->uq_cur_queue = uh;
389 	return;
390 }
391 
392 static inline void
393 umtxq_remove_queue(struct umtx_q *uq, int q)
394 {
395 	struct umtxq_chain *uc;
396 	struct umtxq_queue *uh;
397 
398 	uc = umtxq_getchain(&uq->uq_key);
399 	UMTXQ_LOCKED_ASSERT(uc);
400 	if (uq->uq_flags & UQF_UMTXQ) {
401 		uh = uq->uq_cur_queue;
402 		TAILQ_REMOVE(&uh->head, uq, uq_link);
403 		uh->length--;
404 		uq->uq_flags &= ~UQF_UMTXQ;
405 		if (TAILQ_EMPTY(&uh->head)) {
406 			KASSERT(uh->length == 0,
407 			    ("inconsistent umtxq_queue length"));
408 			LIST_REMOVE(uh, link);
409 		} else {
410 			uh = LIST_FIRST(&uc->uc_spare_queue);
411 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
412 			LIST_REMOVE(uh, link);
413 		}
414 		uq->uq_spare_queue = uh;
415 		uq->uq_cur_queue = NULL;
416 	}
417 }
418 
419 /*
420  * Check if there are multiple waiters
421  */
422 static int
423 umtxq_count(struct umtx_key *key)
424 {
425 	struct umtxq_chain *uc;
426 	struct umtxq_queue *uh;
427 
428 	uc = umtxq_getchain(key);
429 	UMTXQ_LOCKED_ASSERT(uc);
430 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
431 	if (uh != NULL)
432 		return (uh->length);
433 	return (0);
434 }
435 
436 /*
437  * Check if there are multiple PI waiters and returns first
438  * waiter.
439  */
440 static int
441 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
442 {
443 	struct umtxq_chain *uc;
444 	struct umtxq_queue *uh;
445 
446 	*first = NULL;
447 	uc = umtxq_getchain(key);
448 	UMTXQ_LOCKED_ASSERT(uc);
449 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
450 	if (uh != NULL) {
451 		*first = TAILQ_FIRST(&uh->head);
452 		return (uh->length);
453 	}
454 	return (0);
455 }
456 
457 /*
458  * Wake up threads waiting on an userland object.
459  */
460 
461 static int
462 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
463 {
464 	struct umtxq_chain *uc;
465 	struct umtxq_queue *uh;
466 	struct umtx_q *uq;
467 	int ret;
468 
469 	ret = 0;
470 	uc = umtxq_getchain(key);
471 	UMTXQ_LOCKED_ASSERT(uc);
472 	uh = umtxq_queue_lookup(key, q);
473 	if (uh != NULL) {
474 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
475 			umtxq_remove_queue(uq, q);
476 			wakeup(uq);
477 			if (++ret >= n_wake)
478 				return (ret);
479 		}
480 	}
481 	return (ret);
482 }
483 
484 
485 /*
486  * Wake up specified thread.
487  */
488 static inline void
489 umtxq_signal_thread(struct umtx_q *uq)
490 {
491 	struct umtxq_chain *uc;
492 
493 	uc = umtxq_getchain(&uq->uq_key);
494 	UMTXQ_LOCKED_ASSERT(uc);
495 	umtxq_remove(uq);
496 	wakeup(uq);
497 }
498 
499 /*
500  * Put thread into sleep state, before sleeping, check if
501  * thread was removed from umtx queue.
502  */
503 static inline int
504 umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
505 {
506 	struct umtxq_chain *uc;
507 	int error;
508 
509 	uc = umtxq_getchain(&uq->uq_key);
510 	UMTXQ_LOCKED_ASSERT(uc);
511 	if (!(uq->uq_flags & UQF_UMTXQ))
512 		return (0);
513 	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
514 	if (error == EWOULDBLOCK)
515 		error = ETIMEDOUT;
516 	return (error);
517 }
518 
519 /*
520  * Convert userspace address into unique logical address.
521  */
522 int
523 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
524 {
525 	struct thread *td = curthread;
526 	vm_map_t map;
527 	vm_map_entry_t entry;
528 	vm_pindex_t pindex;
529 	vm_prot_t prot;
530 	boolean_t wired;
531 
532 	key->type = type;
533 	if (share == THREAD_SHARE) {
534 		key->shared = 0;
535 		key->info.private.vs = td->td_proc->p_vmspace;
536 		key->info.private.addr = (uintptr_t)addr;
537 	} else {
538 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
539 		map = &td->td_proc->p_vmspace->vm_map;
540 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
541 		    &entry, &key->info.shared.object, &pindex, &prot,
542 		    &wired) != KERN_SUCCESS) {
543 			return EFAULT;
544 		}
545 
546 		if ((share == PROCESS_SHARE) ||
547 		    (share == AUTO_SHARE &&
548 		     VM_INHERIT_SHARE == entry->inheritance)) {
549 			key->shared = 1;
550 			key->info.shared.offset = entry->offset + entry->start -
551 				(vm_offset_t)addr;
552 			vm_object_reference(key->info.shared.object);
553 		} else {
554 			key->shared = 0;
555 			key->info.private.vs = td->td_proc->p_vmspace;
556 			key->info.private.addr = (uintptr_t)addr;
557 		}
558 		vm_map_lookup_done(map, entry);
559 	}
560 
561 	umtxq_hash(key);
562 	return (0);
563 }
564 
565 /*
566  * Release key.
567  */
568 void
569 umtx_key_release(struct umtx_key *key)
570 {
571 	if (key->shared)
572 		vm_object_deallocate(key->info.shared.object);
573 }
574 
575 /*
576  * Lock a umtx object.
577  */
578 static int
579 _do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
580 {
581 	struct umtx_q *uq;
582 	u_long owner;
583 	u_long old;
584 	int error = 0;
585 
586 	uq = td->td_umtxq;
587 
588 	/*
589 	 * Care must be exercised when dealing with umtx structure. It
590 	 * can fault on any access.
591 	 */
592 	for (;;) {
593 		/*
594 		 * Try the uncontested case.  This should be done in userland.
595 		 */
596 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
597 
598 		/* The acquire succeeded. */
599 		if (owner == UMTX_UNOWNED)
600 			return (0);
601 
602 		/* The address was invalid. */
603 		if (owner == -1)
604 			return (EFAULT);
605 
606 		/* If no one owns it but it is contested try to acquire it. */
607 		if (owner == UMTX_CONTESTED) {
608 			owner = casuword(&umtx->u_owner,
609 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
610 
611 			if (owner == UMTX_CONTESTED)
612 				return (0);
613 
614 			/* The address was invalid. */
615 			if (owner == -1)
616 				return (EFAULT);
617 
618 			/* If this failed the lock has changed, restart. */
619 			continue;
620 		}
621 
622 		/*
623 		 * If we caught a signal, we have retried and now
624 		 * exit immediately.
625 		 */
626 		if (error != 0)
627 			return (error);
628 
629 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
630 			AUTO_SHARE, &uq->uq_key)) != 0)
631 			return (error);
632 
633 		umtxq_lock(&uq->uq_key);
634 		umtxq_busy(&uq->uq_key);
635 		umtxq_insert(uq);
636 		umtxq_unbusy(&uq->uq_key);
637 		umtxq_unlock(&uq->uq_key);
638 
639 		/*
640 		 * Set the contested bit so that a release in user space
641 		 * knows to use the system call for unlock.  If this fails
642 		 * either some one else has acquired the lock or it has been
643 		 * released.
644 		 */
645 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
646 
647 		/* The address was invalid. */
648 		if (old == -1) {
649 			umtxq_lock(&uq->uq_key);
650 			umtxq_remove(uq);
651 			umtxq_unlock(&uq->uq_key);
652 			umtx_key_release(&uq->uq_key);
653 			return (EFAULT);
654 		}
655 
656 		/*
657 		 * We set the contested bit, sleep. Otherwise the lock changed
658 		 * and we need to retry or we lost a race to the thread
659 		 * unlocking the umtx.
660 		 */
661 		umtxq_lock(&uq->uq_key);
662 		if (old == owner)
663 			error = umtxq_sleep(uq, "umtx", timo);
664 		umtxq_remove(uq);
665 		umtxq_unlock(&uq->uq_key);
666 		umtx_key_release(&uq->uq_key);
667 	}
668 
669 	return (0);
670 }
671 
672 /*
673  * Lock a umtx object.
674  */
675 static int
676 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
677 	struct timespec *timeout)
678 {
679 	struct timespec ts, ts2, ts3;
680 	struct timeval tv;
681 	int error;
682 
683 	if (timeout == NULL) {
684 		error = _do_lock_umtx(td, umtx, id, 0);
685 		/* Mutex locking is restarted if it is interrupted. */
686 		if (error == EINTR)
687 			error = ERESTART;
688 	} else {
689 		getnanouptime(&ts);
690 		timespecadd(&ts, timeout);
691 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
692 		for (;;) {
693 			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
694 			if (error != ETIMEDOUT)
695 				break;
696 			getnanouptime(&ts2);
697 			if (timespeccmp(&ts2, &ts, >=)) {
698 				error = ETIMEDOUT;
699 				break;
700 			}
701 			ts3 = ts;
702 			timespecsub(&ts3, &ts2);
703 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
704 		}
705 		/* Timed-locking is not restarted. */
706 		if (error == ERESTART)
707 			error = EINTR;
708 	}
709 	return (error);
710 }
711 
712 /*
713  * Unlock a umtx object.
714  */
715 static int
716 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
717 {
718 	struct umtx_key key;
719 	u_long owner;
720 	u_long old;
721 	int error;
722 	int count;
723 
724 	/*
725 	 * Make sure we own this mtx.
726 	 */
727 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
728 	if (owner == -1)
729 		return (EFAULT);
730 
731 	if ((owner & ~UMTX_CONTESTED) != id)
732 		return (EPERM);
733 
734 	/* This should be done in userland */
735 	if ((owner & UMTX_CONTESTED) == 0) {
736 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
737 		if (old == -1)
738 			return (EFAULT);
739 		if (old == owner)
740 			return (0);
741 		owner = old;
742 	}
743 
744 	/* We should only ever be in here for contested locks */
745 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
746 		&key)) != 0)
747 		return (error);
748 
749 	umtxq_lock(&key);
750 	umtxq_busy(&key);
751 	count = umtxq_count(&key);
752 	umtxq_unlock(&key);
753 
754 	/*
755 	 * When unlocking the umtx, it must be marked as unowned if
756 	 * there is zero or one thread only waiting for it.
757 	 * Otherwise, it must be marked as contested.
758 	 */
759 	old = casuword(&umtx->u_owner, owner,
760 		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
761 	umtxq_lock(&key);
762 	umtxq_signal(&key,1);
763 	umtxq_unbusy(&key);
764 	umtxq_unlock(&key);
765 	umtx_key_release(&key);
766 	if (old == -1)
767 		return (EFAULT);
768 	if (old != owner)
769 		return (EINVAL);
770 	return (0);
771 }
772 
773 #ifdef COMPAT_FREEBSD32
774 
775 /*
776  * Lock a umtx object.
777  */
778 static int
779 _do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
780 {
781 	struct umtx_q *uq;
782 	uint32_t owner;
783 	uint32_t old;
784 	int error = 0;
785 
786 	uq = td->td_umtxq;
787 
788 	/*
789 	 * Care must be exercised when dealing with umtx structure. It
790 	 * can fault on any access.
791 	 */
792 	for (;;) {
793 		/*
794 		 * Try the uncontested case.  This should be done in userland.
795 		 */
796 		owner = casuword32(m, UMUTEX_UNOWNED, id);
797 
798 		/* The acquire succeeded. */
799 		if (owner == UMUTEX_UNOWNED)
800 			return (0);
801 
802 		/* The address was invalid. */
803 		if (owner == -1)
804 			return (EFAULT);
805 
806 		/* If no one owns it but it is contested try to acquire it. */
807 		if (owner == UMUTEX_CONTESTED) {
808 			owner = casuword32(m,
809 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
810 			if (owner == UMUTEX_CONTESTED)
811 				return (0);
812 
813 			/* The address was invalid. */
814 			if (owner == -1)
815 				return (EFAULT);
816 
817 			/* If this failed the lock has changed, restart. */
818 			continue;
819 		}
820 
821 		/*
822 		 * If we caught a signal, we have retried and now
823 		 * exit immediately.
824 		 */
825 		if (error != 0)
826 			return (error);
827 
828 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
829 			AUTO_SHARE, &uq->uq_key)) != 0)
830 			return (error);
831 
832 		umtxq_lock(&uq->uq_key);
833 		umtxq_busy(&uq->uq_key);
834 		umtxq_insert(uq);
835 		umtxq_unbusy(&uq->uq_key);
836 		umtxq_unlock(&uq->uq_key);
837 
838 		/*
839 		 * Set the contested bit so that a release in user space
840 		 * knows to use the system call for unlock.  If this fails
841 		 * either some one else has acquired the lock or it has been
842 		 * released.
843 		 */
844 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
845 
846 		/* The address was invalid. */
847 		if (old == -1) {
848 			umtxq_lock(&uq->uq_key);
849 			umtxq_remove(uq);
850 			umtxq_unlock(&uq->uq_key);
851 			umtx_key_release(&uq->uq_key);
852 			return (EFAULT);
853 		}
854 
855 		/*
856 		 * We set the contested bit, sleep. Otherwise the lock changed
857 		 * and we need to retry or we lost a race to the thread
858 		 * unlocking the umtx.
859 		 */
860 		umtxq_lock(&uq->uq_key);
861 		if (old == owner)
862 			error = umtxq_sleep(uq, "umtx", timo);
863 		umtxq_remove(uq);
864 		umtxq_unlock(&uq->uq_key);
865 		umtx_key_release(&uq->uq_key);
866 	}
867 
868 	return (0);
869 }
870 
871 /*
872  * Lock a umtx object.
873  */
874 static int
875 do_lock_umtx32(struct thread *td, void *m, uint32_t id,
876 	struct timespec *timeout)
877 {
878 	struct timespec ts, ts2, ts3;
879 	struct timeval tv;
880 	int error;
881 
882 	if (timeout == NULL) {
883 		error = _do_lock_umtx32(td, m, id, 0);
884 		/* Mutex locking is restarted if it is interrupted. */
885 		if (error == EINTR)
886 			error = ERESTART;
887 	} else {
888 		getnanouptime(&ts);
889 		timespecadd(&ts, timeout);
890 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
891 		for (;;) {
892 			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
893 			if (error != ETIMEDOUT)
894 				break;
895 			getnanouptime(&ts2);
896 			if (timespeccmp(&ts2, &ts, >=)) {
897 				error = ETIMEDOUT;
898 				break;
899 			}
900 			ts3 = ts;
901 			timespecsub(&ts3, &ts2);
902 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
903 		}
904 		/* Timed-locking is not restarted. */
905 		if (error == ERESTART)
906 			error = EINTR;
907 	}
908 	return (error);
909 }
910 
911 /*
912  * Unlock a umtx object.
913  */
914 static int
915 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
916 {
917 	struct umtx_key key;
918 	uint32_t owner;
919 	uint32_t old;
920 	int error;
921 	int count;
922 
923 	/*
924 	 * Make sure we own this mtx.
925 	 */
926 	owner = fuword32(m);
927 	if (owner == -1)
928 		return (EFAULT);
929 
930 	if ((owner & ~UMUTEX_CONTESTED) != id)
931 		return (EPERM);
932 
933 	/* This should be done in userland */
934 	if ((owner & UMUTEX_CONTESTED) == 0) {
935 		old = casuword32(m, owner, UMUTEX_UNOWNED);
936 		if (old == -1)
937 			return (EFAULT);
938 		if (old == owner)
939 			return (0);
940 		owner = old;
941 	}
942 
943 	/* We should only ever be in here for contested locks */
944 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
945 		&key)) != 0)
946 		return (error);
947 
948 	umtxq_lock(&key);
949 	umtxq_busy(&key);
950 	count = umtxq_count(&key);
951 	umtxq_unlock(&key);
952 
953 	/*
954 	 * When unlocking the umtx, it must be marked as unowned if
955 	 * there is zero or one thread only waiting for it.
956 	 * Otherwise, it must be marked as contested.
957 	 */
958 	old = casuword32(m, owner,
959 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
960 	umtxq_lock(&key);
961 	umtxq_signal(&key,1);
962 	umtxq_unbusy(&key);
963 	umtxq_unlock(&key);
964 	umtx_key_release(&key);
965 	if (old == -1)
966 		return (EFAULT);
967 	if (old != owner)
968 		return (EINVAL);
969 	return (0);
970 }
971 #endif
972 
973 /*
974  * Fetch and compare value, sleep on the address if value is not changed.
975  */
976 static int
977 do_wait(struct thread *td, void *addr, u_long id,
978 	struct timespec *timeout, int compat32, int is_private)
979 {
980 	struct umtx_q *uq;
981 	struct timespec ts, ts2, ts3;
982 	struct timeval tv;
983 	u_long tmp;
984 	int error = 0;
985 
986 	uq = td->td_umtxq;
987 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
988 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
989 		return (error);
990 
991 	umtxq_lock(&uq->uq_key);
992 	umtxq_insert(uq);
993 	umtxq_unlock(&uq->uq_key);
994 	if (compat32 == 0)
995 		tmp = fuword(addr);
996         else
997 		tmp = (unsigned int)fuword32(addr);
998 	if (tmp != id) {
999 		umtxq_lock(&uq->uq_key);
1000 		umtxq_remove(uq);
1001 		umtxq_unlock(&uq->uq_key);
1002 	} else if (timeout == NULL) {
1003 		umtxq_lock(&uq->uq_key);
1004 		error = umtxq_sleep(uq, "uwait", 0);
1005 		umtxq_remove(uq);
1006 		umtxq_unlock(&uq->uq_key);
1007 	} else {
1008 		getnanouptime(&ts);
1009 		timespecadd(&ts, timeout);
1010 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
1011 		umtxq_lock(&uq->uq_key);
1012 		for (;;) {
1013 			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
1014 			if (!(uq->uq_flags & UQF_UMTXQ)) {
1015 				error = 0;
1016 				break;
1017 			}
1018 			if (error != ETIMEDOUT)
1019 				break;
1020 			umtxq_unlock(&uq->uq_key);
1021 			getnanouptime(&ts2);
1022 			if (timespeccmp(&ts2, &ts, >=)) {
1023 				error = ETIMEDOUT;
1024 				umtxq_lock(&uq->uq_key);
1025 				break;
1026 			}
1027 			ts3 = ts;
1028 			timespecsub(&ts3, &ts2);
1029 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
1030 			umtxq_lock(&uq->uq_key);
1031 		}
1032 		umtxq_remove(uq);
1033 		umtxq_unlock(&uq->uq_key);
1034 	}
1035 	umtx_key_release(&uq->uq_key);
1036 	if (error == ERESTART)
1037 		error = EINTR;
1038 	return (error);
1039 }
1040 
1041 /*
1042  * Wake up threads sleeping on the specified address.
1043  */
1044 int
1045 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1046 {
1047 	struct umtx_key key;
1048 	int ret;
1049 
1050 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1051 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1052 		return (ret);
1053 	umtxq_lock(&key);
1054 	ret = umtxq_signal(&key, n_wake);
1055 	umtxq_unlock(&key);
1056 	umtx_key_release(&key);
1057 	return (0);
1058 }
1059 
1060 /*
1061  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1062  */
1063 static int
1064 _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1065 	int mode)
1066 {
1067 	struct umtx_q *uq;
1068 	uint32_t owner, old, id;
1069 	int error = 0;
1070 
1071 	id = td->td_tid;
1072 	uq = td->td_umtxq;
1073 
1074 	/*
1075 	 * Care must be exercised when dealing with umtx structure. It
1076 	 * can fault on any access.
1077 	 */
1078 	for (;;) {
1079 		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
1080 		if (mode == _UMUTEX_WAIT) {
1081 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1082 				return (0);
1083 		} else {
1084 			/*
1085 			 * Try the uncontested case.  This should be done in userland.
1086 			 */
1087 			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1088 
1089 			/* The acquire succeeded. */
1090 			if (owner == UMUTEX_UNOWNED)
1091 				return (0);
1092 
1093 			/* The address was invalid. */
1094 			if (owner == -1)
1095 				return (EFAULT);
1096 
1097 			/* If no one owns it but it is contested try to acquire it. */
1098 			if (owner == UMUTEX_CONTESTED) {
1099 				owner = casuword32(&m->m_owner,
1100 				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1101 
1102 				if (owner == UMUTEX_CONTESTED)
1103 					return (0);
1104 
1105 				/* The address was invalid. */
1106 				if (owner == -1)
1107 					return (EFAULT);
1108 
1109 				/* If this failed the lock has changed, restart. */
1110 				continue;
1111 			}
1112 		}
1113 
1114 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1115 		    (owner & ~UMUTEX_CONTESTED) == id)
1116 			return (EDEADLK);
1117 
1118 		if (mode == _UMUTEX_TRY)
1119 			return (EBUSY);
1120 
1121 		/*
1122 		 * If we caught a signal, we have retried and now
1123 		 * exit immediately.
1124 		 */
1125 		if (error != 0)
1126 			return (error);
1127 
1128 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1129 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1130 			return (error);
1131 
1132 		umtxq_lock(&uq->uq_key);
1133 		umtxq_busy(&uq->uq_key);
1134 		umtxq_insert(uq);
1135 		umtxq_unlock(&uq->uq_key);
1136 
1137 		/*
1138 		 * Set the contested bit so that a release in user space
1139 		 * knows to use the system call for unlock.  If this fails
1140 		 * either some one else has acquired the lock or it has been
1141 		 * released.
1142 		 */
1143 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1144 
1145 		/* The address was invalid. */
1146 		if (old == -1) {
1147 			umtxq_lock(&uq->uq_key);
1148 			umtxq_remove(uq);
1149 			umtxq_unbusy(&uq->uq_key);
1150 			umtxq_unlock(&uq->uq_key);
1151 			umtx_key_release(&uq->uq_key);
1152 			return (EFAULT);
1153 		}
1154 
1155 		/*
1156 		 * We set the contested bit, sleep. Otherwise the lock changed
1157 		 * and we need to retry or we lost a race to the thread
1158 		 * unlocking the umtx.
1159 		 */
1160 		umtxq_lock(&uq->uq_key);
1161 		umtxq_unbusy(&uq->uq_key);
1162 		if (old == owner)
1163 			error = umtxq_sleep(uq, "umtxn", timo);
1164 		umtxq_remove(uq);
1165 		umtxq_unlock(&uq->uq_key);
1166 		umtx_key_release(&uq->uq_key);
1167 	}
1168 
1169 	return (0);
1170 }
1171 
1172 /*
1173  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1174  */
1175 /*
1176  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1177  */
1178 static int
1179 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1180 {
1181 	struct umtx_key key;
1182 	uint32_t owner, old, id;
1183 	int error;
1184 	int count;
1185 
1186 	id = td->td_tid;
1187 	/*
1188 	 * Make sure we own this mtx.
1189 	 */
1190 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1191 	if (owner == -1)
1192 		return (EFAULT);
1193 
1194 	if ((owner & ~UMUTEX_CONTESTED) != id)
1195 		return (EPERM);
1196 
1197 	if ((owner & UMUTEX_CONTESTED) == 0) {
1198 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1199 		if (old == -1)
1200 			return (EFAULT);
1201 		if (old == owner)
1202 			return (0);
1203 		owner = old;
1204 	}
1205 
1206 	/* We should only ever be in here for contested locks */
1207 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1208 	    &key)) != 0)
1209 		return (error);
1210 
1211 	umtxq_lock(&key);
1212 	umtxq_busy(&key);
1213 	count = umtxq_count(&key);
1214 	umtxq_unlock(&key);
1215 
1216 	/*
1217 	 * When unlocking the umtx, it must be marked as unowned if
1218 	 * there is zero or one thread only waiting for it.
1219 	 * Otherwise, it must be marked as contested.
1220 	 */
1221 	old = casuword32(&m->m_owner, owner,
1222 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1223 	umtxq_lock(&key);
1224 	umtxq_signal(&key,1);
1225 	umtxq_unbusy(&key);
1226 	umtxq_unlock(&key);
1227 	umtx_key_release(&key);
1228 	if (old == -1)
1229 		return (EFAULT);
1230 	if (old != owner)
1231 		return (EINVAL);
1232 	return (0);
1233 }
1234 
1235 /*
1236  * Check if the mutex is available and wake up a waiter,
1237  * only for simple mutex.
1238  */
1239 static int
1240 do_wake_umutex(struct thread *td, struct umutex *m)
1241 {
1242 	struct umtx_key key;
1243 	uint32_t owner;
1244 	uint32_t flags;
1245 	int error;
1246 	int count;
1247 
1248 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1249 	if (owner == -1)
1250 		return (EFAULT);
1251 
1252 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1253 		return (0);
1254 
1255 	flags = fuword32(&m->m_flags);
1256 
1257 	/* We should only ever be in here for contested locks */
1258 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1259 	    &key)) != 0)
1260 		return (error);
1261 
1262 	umtxq_lock(&key);
1263 	umtxq_busy(&key);
1264 	count = umtxq_count(&key);
1265 	umtxq_unlock(&key);
1266 
1267 	if (count <= 1)
1268 		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1269 
1270 	umtxq_lock(&key);
1271 	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1272 		umtxq_signal(&key, 1);
1273 	umtxq_unbusy(&key);
1274 	umtxq_unlock(&key);
1275 	umtx_key_release(&key);
1276 	return (0);
1277 }
1278 
1279 static inline struct umtx_pi *
1280 umtx_pi_alloc(int flags)
1281 {
1282 	struct umtx_pi *pi;
1283 
1284 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1285 	TAILQ_INIT(&pi->pi_blocked);
1286 	atomic_add_int(&umtx_pi_allocated, 1);
1287 	return (pi);
1288 }
1289 
1290 static inline void
1291 umtx_pi_free(struct umtx_pi *pi)
1292 {
1293 	uma_zfree(umtx_pi_zone, pi);
1294 	atomic_add_int(&umtx_pi_allocated, -1);
1295 }
1296 
1297 /*
1298  * Adjust the thread's position on a pi_state after its priority has been
1299  * changed.
1300  */
1301 static int
1302 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1303 {
1304 	struct umtx_q *uq, *uq1, *uq2;
1305 	struct thread *td1;
1306 
1307 	mtx_assert(&umtx_lock, MA_OWNED);
1308 	if (pi == NULL)
1309 		return (0);
1310 
1311 	uq = td->td_umtxq;
1312 
1313 	/*
1314 	 * Check if the thread needs to be moved on the blocked chain.
1315 	 * It needs to be moved if either its priority is lower than
1316 	 * the previous thread or higher than the next thread.
1317 	 */
1318 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1319 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1320 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1321 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1322 		/*
1323 		 * Remove thread from blocked chain and determine where
1324 		 * it should be moved to.
1325 		 */
1326 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1327 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1328 			td1 = uq1->uq_thread;
1329 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1330 			if (UPRI(td1) > UPRI(td))
1331 				break;
1332 		}
1333 
1334 		if (uq1 == NULL)
1335 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1336 		else
1337 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1338 	}
1339 	return (1);
1340 }
1341 
1342 /*
1343  * Propagate priority when a thread is blocked on POSIX
1344  * PI mutex.
1345  */
1346 static void
1347 umtx_propagate_priority(struct thread *td)
1348 {
1349 	struct umtx_q *uq;
1350 	struct umtx_pi *pi;
1351 	int pri;
1352 
1353 	mtx_assert(&umtx_lock, MA_OWNED);
1354 	pri = UPRI(td);
1355 	uq = td->td_umtxq;
1356 	pi = uq->uq_pi_blocked;
1357 	if (pi == NULL)
1358 		return;
1359 
1360 	for (;;) {
1361 		td = pi->pi_owner;
1362 		if (td == NULL || td == curthread)
1363 			return;
1364 
1365 		MPASS(td->td_proc != NULL);
1366 		MPASS(td->td_proc->p_magic == P_MAGIC);
1367 
1368 		thread_lock(td);
1369 		if (td->td_lend_user_pri > pri)
1370 			sched_lend_user_prio(td, pri);
1371 		else {
1372 			thread_unlock(td);
1373 			break;
1374 		}
1375 		thread_unlock(td);
1376 
1377 		/*
1378 		 * Pick up the lock that td is blocked on.
1379 		 */
1380 		uq = td->td_umtxq;
1381 		pi = uq->uq_pi_blocked;
1382 		if (pi == NULL)
1383 			break;
1384 		/* Resort td on the list if needed. */
1385 		umtx_pi_adjust_thread(pi, td);
1386 	}
1387 }
1388 
1389 /*
1390  * Unpropagate priority for a PI mutex when a thread blocked on
1391  * it is interrupted by signal or resumed by others.
1392  */
1393 static void
1394 umtx_repropagate_priority(struct umtx_pi *pi)
1395 {
1396 	struct umtx_q *uq, *uq_owner;
1397 	struct umtx_pi *pi2;
1398 	int pri;
1399 
1400 	mtx_assert(&umtx_lock, MA_OWNED);
1401 
1402 	while (pi != NULL && pi->pi_owner != NULL) {
1403 		pri = PRI_MAX;
1404 		uq_owner = pi->pi_owner->td_umtxq;
1405 
1406 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1407 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1408 			if (uq != NULL) {
1409 				if (pri > UPRI(uq->uq_thread))
1410 					pri = UPRI(uq->uq_thread);
1411 			}
1412 		}
1413 
1414 		if (pri > uq_owner->uq_inherited_pri)
1415 			pri = uq_owner->uq_inherited_pri;
1416 		thread_lock(pi->pi_owner);
1417 		sched_lend_user_prio(pi->pi_owner, pri);
1418 		thread_unlock(pi->pi_owner);
1419 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1420 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1421 	}
1422 }
1423 
1424 /*
1425  * Insert a PI mutex into owned list.
1426  */
1427 static void
1428 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1429 {
1430 	struct umtx_q *uq_owner;
1431 
1432 	uq_owner = owner->td_umtxq;
1433 	mtx_assert(&umtx_lock, MA_OWNED);
1434 	if (pi->pi_owner != NULL)
1435 		panic("pi_ower != NULL");
1436 	pi->pi_owner = owner;
1437 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1438 }
1439 
1440 /*
1441  * Claim ownership of a PI mutex.
1442  */
1443 static int
1444 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1445 {
1446 	struct umtx_q *uq, *uq_owner;
1447 
1448 	uq_owner = owner->td_umtxq;
1449 	mtx_lock_spin(&umtx_lock);
1450 	if (pi->pi_owner == owner) {
1451 		mtx_unlock_spin(&umtx_lock);
1452 		return (0);
1453 	}
1454 
1455 	if (pi->pi_owner != NULL) {
1456 		/*
1457 		 * userland may have already messed the mutex, sigh.
1458 		 */
1459 		mtx_unlock_spin(&umtx_lock);
1460 		return (EPERM);
1461 	}
1462 	umtx_pi_setowner(pi, owner);
1463 	uq = TAILQ_FIRST(&pi->pi_blocked);
1464 	if (uq != NULL) {
1465 		int pri;
1466 
1467 		pri = UPRI(uq->uq_thread);
1468 		thread_lock(owner);
1469 		if (pri < UPRI(owner))
1470 			sched_lend_user_prio(owner, pri);
1471 		thread_unlock(owner);
1472 	}
1473 	mtx_unlock_spin(&umtx_lock);
1474 	return (0);
1475 }
1476 
1477 /*
1478  * Adjust a thread's order position in its blocked PI mutex,
1479  * this may result new priority propagating process.
1480  */
1481 void
1482 umtx_pi_adjust(struct thread *td, u_char oldpri)
1483 {
1484 	struct umtx_q *uq;
1485 	struct umtx_pi *pi;
1486 
1487 	uq = td->td_umtxq;
1488 	mtx_lock_spin(&umtx_lock);
1489 	/*
1490 	 * Pick up the lock that td is blocked on.
1491 	 */
1492 	pi = uq->uq_pi_blocked;
1493 	if (pi != NULL) {
1494 		umtx_pi_adjust_thread(pi, td);
1495 		umtx_repropagate_priority(pi);
1496 	}
1497 	mtx_unlock_spin(&umtx_lock);
1498 }
1499 
1500 /*
1501  * Sleep on a PI mutex.
1502  */
1503 static int
1504 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1505 	uint32_t owner, const char *wmesg, int timo)
1506 {
1507 	struct umtxq_chain *uc;
1508 	struct thread *td, *td1;
1509 	struct umtx_q *uq1;
1510 	int pri;
1511 	int error = 0;
1512 
1513 	td = uq->uq_thread;
1514 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1515 	uc = umtxq_getchain(&uq->uq_key);
1516 	UMTXQ_LOCKED_ASSERT(uc);
1517 	UMTXQ_BUSY_ASSERT(uc);
1518 	umtxq_insert(uq);
1519 	mtx_lock_spin(&umtx_lock);
1520 	if (pi->pi_owner == NULL) {
1521 		mtx_unlock_spin(&umtx_lock);
1522 		/* XXX Only look up thread in current process. */
1523 		td1 = tdfind(owner, curproc->p_pid);
1524 		mtx_lock_spin(&umtx_lock);
1525 		if (td1 != NULL) {
1526 			if (pi->pi_owner == NULL)
1527 				umtx_pi_setowner(pi, td1);
1528 			PROC_UNLOCK(td1->td_proc);
1529 		}
1530 	}
1531 
1532 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1533 		pri = UPRI(uq1->uq_thread);
1534 		if (pri > UPRI(td))
1535 			break;
1536 	}
1537 
1538 	if (uq1 != NULL)
1539 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1540 	else
1541 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1542 
1543 	uq->uq_pi_blocked = pi;
1544 	thread_lock(td);
1545 	td->td_flags |= TDF_UPIBLOCKED;
1546 	thread_unlock(td);
1547 	umtx_propagate_priority(td);
1548 	mtx_unlock_spin(&umtx_lock);
1549 	umtxq_unbusy(&uq->uq_key);
1550 
1551 	if (uq->uq_flags & UQF_UMTXQ) {
1552 		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
1553 		if (error == EWOULDBLOCK)
1554 			error = ETIMEDOUT;
1555 		if (uq->uq_flags & UQF_UMTXQ) {
1556 			umtxq_remove(uq);
1557 		}
1558 	}
1559 	mtx_lock_spin(&umtx_lock);
1560 	uq->uq_pi_blocked = NULL;
1561 	thread_lock(td);
1562 	td->td_flags &= ~TDF_UPIBLOCKED;
1563 	thread_unlock(td);
1564 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1565 	umtx_repropagate_priority(pi);
1566 	mtx_unlock_spin(&umtx_lock);
1567 	umtxq_unlock(&uq->uq_key);
1568 
1569 	return (error);
1570 }
1571 
1572 /*
1573  * Add reference count for a PI mutex.
1574  */
1575 static void
1576 umtx_pi_ref(struct umtx_pi *pi)
1577 {
1578 	struct umtxq_chain *uc;
1579 
1580 	uc = umtxq_getchain(&pi->pi_key);
1581 	UMTXQ_LOCKED_ASSERT(uc);
1582 	pi->pi_refcount++;
1583 }
1584 
1585 /*
1586  * Decrease reference count for a PI mutex, if the counter
1587  * is decreased to zero, its memory space is freed.
1588  */
1589 static void
1590 umtx_pi_unref(struct umtx_pi *pi)
1591 {
1592 	struct umtxq_chain *uc;
1593 
1594 	uc = umtxq_getchain(&pi->pi_key);
1595 	UMTXQ_LOCKED_ASSERT(uc);
1596 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1597 	if (--pi->pi_refcount == 0) {
1598 		mtx_lock_spin(&umtx_lock);
1599 		if (pi->pi_owner != NULL) {
1600 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1601 				pi, pi_link);
1602 			pi->pi_owner = NULL;
1603 		}
1604 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1605 			("blocked queue not empty"));
1606 		mtx_unlock_spin(&umtx_lock);
1607 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1608 		umtx_pi_free(pi);
1609 	}
1610 }
1611 
1612 /*
1613  * Find a PI mutex in hash table.
1614  */
1615 static struct umtx_pi *
1616 umtx_pi_lookup(struct umtx_key *key)
1617 {
1618 	struct umtxq_chain *uc;
1619 	struct umtx_pi *pi;
1620 
1621 	uc = umtxq_getchain(key);
1622 	UMTXQ_LOCKED_ASSERT(uc);
1623 
1624 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1625 		if (umtx_key_match(&pi->pi_key, key)) {
1626 			return (pi);
1627 		}
1628 	}
1629 	return (NULL);
1630 }
1631 
1632 /*
1633  * Insert a PI mutex into hash table.
1634  */
1635 static inline void
1636 umtx_pi_insert(struct umtx_pi *pi)
1637 {
1638 	struct umtxq_chain *uc;
1639 
1640 	uc = umtxq_getchain(&pi->pi_key);
1641 	UMTXQ_LOCKED_ASSERT(uc);
1642 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1643 }
1644 
1645 /*
1646  * Lock a PI mutex.
1647  */
1648 static int
1649 _do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1650 	int try)
1651 {
1652 	struct umtx_q *uq;
1653 	struct umtx_pi *pi, *new_pi;
1654 	uint32_t id, owner, old;
1655 	int error;
1656 
1657 	id = td->td_tid;
1658 	uq = td->td_umtxq;
1659 
1660 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1661 	    &uq->uq_key)) != 0)
1662 		return (error);
1663 	umtxq_lock(&uq->uq_key);
1664 	pi = umtx_pi_lookup(&uq->uq_key);
1665 	if (pi == NULL) {
1666 		new_pi = umtx_pi_alloc(M_NOWAIT);
1667 		if (new_pi == NULL) {
1668 			umtxq_unlock(&uq->uq_key);
1669 			new_pi = umtx_pi_alloc(M_WAITOK);
1670 			umtxq_lock(&uq->uq_key);
1671 			pi = umtx_pi_lookup(&uq->uq_key);
1672 			if (pi != NULL) {
1673 				umtx_pi_free(new_pi);
1674 				new_pi = NULL;
1675 			}
1676 		}
1677 		if (new_pi != NULL) {
1678 			new_pi->pi_key = uq->uq_key;
1679 			umtx_pi_insert(new_pi);
1680 			pi = new_pi;
1681 		}
1682 	}
1683 	umtx_pi_ref(pi);
1684 	umtxq_unlock(&uq->uq_key);
1685 
1686 	/*
1687 	 * Care must be exercised when dealing with umtx structure.  It
1688 	 * can fault on any access.
1689 	 */
1690 	for (;;) {
1691 		/*
1692 		 * Try the uncontested case.  This should be done in userland.
1693 		 */
1694 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1695 
1696 		/* The acquire succeeded. */
1697 		if (owner == UMUTEX_UNOWNED) {
1698 			error = 0;
1699 			break;
1700 		}
1701 
1702 		/* The address was invalid. */
1703 		if (owner == -1) {
1704 			error = EFAULT;
1705 			break;
1706 		}
1707 
1708 		/* If no one owns it but it is contested try to acquire it. */
1709 		if (owner == UMUTEX_CONTESTED) {
1710 			owner = casuword32(&m->m_owner,
1711 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1712 
1713 			if (owner == UMUTEX_CONTESTED) {
1714 				umtxq_lock(&uq->uq_key);
1715 				umtxq_busy(&uq->uq_key);
1716 				error = umtx_pi_claim(pi, td);
1717 				umtxq_unbusy(&uq->uq_key);
1718 				umtxq_unlock(&uq->uq_key);
1719 				break;
1720 			}
1721 
1722 			/* The address was invalid. */
1723 			if (owner == -1) {
1724 				error = EFAULT;
1725 				break;
1726 			}
1727 
1728 			/* If this failed the lock has changed, restart. */
1729 			continue;
1730 		}
1731 
1732 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1733 		    (owner & ~UMUTEX_CONTESTED) == id) {
1734 			error = EDEADLK;
1735 			break;
1736 		}
1737 
1738 		if (try != 0) {
1739 			error = EBUSY;
1740 			break;
1741 		}
1742 
1743 		/*
1744 		 * If we caught a signal, we have retried and now
1745 		 * exit immediately.
1746 		 */
1747 		if (error != 0)
1748 			break;
1749 
1750 		umtxq_lock(&uq->uq_key);
1751 		umtxq_busy(&uq->uq_key);
1752 		umtxq_unlock(&uq->uq_key);
1753 
1754 		/*
1755 		 * Set the contested bit so that a release in user space
1756 		 * knows to use the system call for unlock.  If this fails
1757 		 * either some one else has acquired the lock or it has been
1758 		 * released.
1759 		 */
1760 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1761 
1762 		/* The address was invalid. */
1763 		if (old == -1) {
1764 			umtxq_lock(&uq->uq_key);
1765 			umtxq_unbusy(&uq->uq_key);
1766 			umtxq_unlock(&uq->uq_key);
1767 			error = EFAULT;
1768 			break;
1769 		}
1770 
1771 		umtxq_lock(&uq->uq_key);
1772 		/*
1773 		 * We set the contested bit, sleep. Otherwise the lock changed
1774 		 * and we need to retry or we lost a race to the thread
1775 		 * unlocking the umtx.
1776 		 */
1777 		if (old == owner)
1778 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1779 				 "umtxpi", timo);
1780 		else {
1781 			umtxq_unbusy(&uq->uq_key);
1782 			umtxq_unlock(&uq->uq_key);
1783 		}
1784 	}
1785 
1786 	umtxq_lock(&uq->uq_key);
1787 	umtx_pi_unref(pi);
1788 	umtxq_unlock(&uq->uq_key);
1789 
1790 	umtx_key_release(&uq->uq_key);
1791 	return (error);
1792 }
1793 
1794 /*
1795  * Unlock a PI mutex.
1796  */
1797 static int
1798 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1799 {
1800 	struct umtx_key key;
1801 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1802 	struct umtx_pi *pi, *pi2;
1803 	uint32_t owner, old, id;
1804 	int error;
1805 	int count;
1806 	int pri;
1807 
1808 	id = td->td_tid;
1809 	/*
1810 	 * Make sure we own this mtx.
1811 	 */
1812 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1813 	if (owner == -1)
1814 		return (EFAULT);
1815 
1816 	if ((owner & ~UMUTEX_CONTESTED) != id)
1817 		return (EPERM);
1818 
1819 	/* This should be done in userland */
1820 	if ((owner & UMUTEX_CONTESTED) == 0) {
1821 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1822 		if (old == -1)
1823 			return (EFAULT);
1824 		if (old == owner)
1825 			return (0);
1826 		owner = old;
1827 	}
1828 
1829 	/* We should only ever be in here for contested locks */
1830 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1831 	    &key)) != 0)
1832 		return (error);
1833 
1834 	umtxq_lock(&key);
1835 	umtxq_busy(&key);
1836 	count = umtxq_count_pi(&key, &uq_first);
1837 	if (uq_first != NULL) {
1838 		mtx_lock_spin(&umtx_lock);
1839 		pi = uq_first->uq_pi_blocked;
1840 		KASSERT(pi != NULL, ("pi == NULL?"));
1841 		if (pi->pi_owner != curthread) {
1842 			mtx_unlock_spin(&umtx_lock);
1843 			umtxq_unbusy(&key);
1844 			umtxq_unlock(&key);
1845 			umtx_key_release(&key);
1846 			/* userland messed the mutex */
1847 			return (EPERM);
1848 		}
1849 		uq_me = curthread->td_umtxq;
1850 		pi->pi_owner = NULL;
1851 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1852 		/* get highest priority thread which is still sleeping. */
1853 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1854 		while (uq_first != NULL &&
1855 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1856 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1857 		}
1858 		pri = PRI_MAX;
1859 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1860 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1861 			if (uq_first2 != NULL) {
1862 				if (pri > UPRI(uq_first2->uq_thread))
1863 					pri = UPRI(uq_first2->uq_thread);
1864 			}
1865 		}
1866 		thread_lock(curthread);
1867 		sched_lend_user_prio(curthread, pri);
1868 		thread_unlock(curthread);
1869 		mtx_unlock_spin(&umtx_lock);
1870 		if (uq_first)
1871 			umtxq_signal_thread(uq_first);
1872 	}
1873 	umtxq_unlock(&key);
1874 
1875 	/*
1876 	 * When unlocking the umtx, it must be marked as unowned if
1877 	 * there is zero or one thread only waiting for it.
1878 	 * Otherwise, it must be marked as contested.
1879 	 */
1880 	old = casuword32(&m->m_owner, owner,
1881 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1882 
1883 	umtxq_lock(&key);
1884 	umtxq_unbusy(&key);
1885 	umtxq_unlock(&key);
1886 	umtx_key_release(&key);
1887 	if (old == -1)
1888 		return (EFAULT);
1889 	if (old != owner)
1890 		return (EINVAL);
1891 	return (0);
1892 }
1893 
1894 /*
1895  * Lock a PP mutex.
1896  */
1897 static int
1898 _do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1899 	int try)
1900 {
1901 	struct umtx_q *uq, *uq2;
1902 	struct umtx_pi *pi;
1903 	uint32_t ceiling;
1904 	uint32_t owner, id;
1905 	int error, pri, old_inherited_pri, su;
1906 
1907 	id = td->td_tid;
1908 	uq = td->td_umtxq;
1909 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1910 	    &uq->uq_key)) != 0)
1911 		return (error);
1912 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1913 	for (;;) {
1914 		old_inherited_pri = uq->uq_inherited_pri;
1915 		umtxq_lock(&uq->uq_key);
1916 		umtxq_busy(&uq->uq_key);
1917 		umtxq_unlock(&uq->uq_key);
1918 
1919 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1920 		if (ceiling > RTP_PRIO_MAX) {
1921 			error = EINVAL;
1922 			goto out;
1923 		}
1924 
1925 		mtx_lock_spin(&umtx_lock);
1926 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1927 			mtx_unlock_spin(&umtx_lock);
1928 			error = EINVAL;
1929 			goto out;
1930 		}
1931 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1932 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1933 			thread_lock(td);
1934 			if (uq->uq_inherited_pri < UPRI(td))
1935 				sched_lend_user_prio(td, uq->uq_inherited_pri);
1936 			thread_unlock(td);
1937 		}
1938 		mtx_unlock_spin(&umtx_lock);
1939 
1940 		owner = casuword32(&m->m_owner,
1941 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1942 
1943 		if (owner == UMUTEX_CONTESTED) {
1944 			error = 0;
1945 			break;
1946 		}
1947 
1948 		/* The address was invalid. */
1949 		if (owner == -1) {
1950 			error = EFAULT;
1951 			break;
1952 		}
1953 
1954 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1955 		    (owner & ~UMUTEX_CONTESTED) == id) {
1956 			error = EDEADLK;
1957 			break;
1958 		}
1959 
1960 		if (try != 0) {
1961 			error = EBUSY;
1962 			break;
1963 		}
1964 
1965 		/*
1966 		 * If we caught a signal, we have retried and now
1967 		 * exit immediately.
1968 		 */
1969 		if (error != 0)
1970 			break;
1971 
1972 		umtxq_lock(&uq->uq_key);
1973 		umtxq_insert(uq);
1974 		umtxq_unbusy(&uq->uq_key);
1975 		error = umtxq_sleep(uq, "umtxpp", timo);
1976 		umtxq_remove(uq);
1977 		umtxq_unlock(&uq->uq_key);
1978 
1979 		mtx_lock_spin(&umtx_lock);
1980 		uq->uq_inherited_pri = old_inherited_pri;
1981 		pri = PRI_MAX;
1982 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1983 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1984 			if (uq2 != NULL) {
1985 				if (pri > UPRI(uq2->uq_thread))
1986 					pri = UPRI(uq2->uq_thread);
1987 			}
1988 		}
1989 		if (pri > uq->uq_inherited_pri)
1990 			pri = uq->uq_inherited_pri;
1991 		thread_lock(td);
1992 		sched_lend_user_prio(td, pri);
1993 		thread_unlock(td);
1994 		mtx_unlock_spin(&umtx_lock);
1995 	}
1996 
1997 	if (error != 0) {
1998 		mtx_lock_spin(&umtx_lock);
1999 		uq->uq_inherited_pri = old_inherited_pri;
2000 		pri = PRI_MAX;
2001 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2002 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2003 			if (uq2 != NULL) {
2004 				if (pri > UPRI(uq2->uq_thread))
2005 					pri = UPRI(uq2->uq_thread);
2006 			}
2007 		}
2008 		if (pri > uq->uq_inherited_pri)
2009 			pri = uq->uq_inherited_pri;
2010 		thread_lock(td);
2011 		sched_lend_user_prio(td, pri);
2012 		thread_unlock(td);
2013 		mtx_unlock_spin(&umtx_lock);
2014 	}
2015 
2016 out:
2017 	umtxq_lock(&uq->uq_key);
2018 	umtxq_unbusy(&uq->uq_key);
2019 	umtxq_unlock(&uq->uq_key);
2020 	umtx_key_release(&uq->uq_key);
2021 	return (error);
2022 }
2023 
2024 /*
2025  * Unlock a PP mutex.
2026  */
2027 static int
2028 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2029 {
2030 	struct umtx_key key;
2031 	struct umtx_q *uq, *uq2;
2032 	struct umtx_pi *pi;
2033 	uint32_t owner, id;
2034 	uint32_t rceiling;
2035 	int error, pri, new_inherited_pri, su;
2036 
2037 	id = td->td_tid;
2038 	uq = td->td_umtxq;
2039 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2040 
2041 	/*
2042 	 * Make sure we own this mtx.
2043 	 */
2044 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2045 	if (owner == -1)
2046 		return (EFAULT);
2047 
2048 	if ((owner & ~UMUTEX_CONTESTED) != id)
2049 		return (EPERM);
2050 
2051 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2052 	if (error != 0)
2053 		return (error);
2054 
2055 	if (rceiling == -1)
2056 		new_inherited_pri = PRI_MAX;
2057 	else {
2058 		rceiling = RTP_PRIO_MAX - rceiling;
2059 		if (rceiling > RTP_PRIO_MAX)
2060 			return (EINVAL);
2061 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2062 	}
2063 
2064 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2065 	    &key)) != 0)
2066 		return (error);
2067 	umtxq_lock(&key);
2068 	umtxq_busy(&key);
2069 	umtxq_unlock(&key);
2070 	/*
2071 	 * For priority protected mutex, always set unlocked state
2072 	 * to UMUTEX_CONTESTED, so that userland always enters kernel
2073 	 * to lock the mutex, it is necessary because thread priority
2074 	 * has to be adjusted for such mutex.
2075 	 */
2076 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2077 		UMUTEX_CONTESTED);
2078 
2079 	umtxq_lock(&key);
2080 	if (error == 0)
2081 		umtxq_signal(&key, 1);
2082 	umtxq_unbusy(&key);
2083 	umtxq_unlock(&key);
2084 
2085 	if (error == -1)
2086 		error = EFAULT;
2087 	else {
2088 		mtx_lock_spin(&umtx_lock);
2089 		if (su != 0)
2090 			uq->uq_inherited_pri = new_inherited_pri;
2091 		pri = PRI_MAX;
2092 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2093 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2094 			if (uq2 != NULL) {
2095 				if (pri > UPRI(uq2->uq_thread))
2096 					pri = UPRI(uq2->uq_thread);
2097 			}
2098 		}
2099 		if (pri > uq->uq_inherited_pri)
2100 			pri = uq->uq_inherited_pri;
2101 		thread_lock(td);
2102 		sched_lend_user_prio(td, pri);
2103 		thread_unlock(td);
2104 		mtx_unlock_spin(&umtx_lock);
2105 	}
2106 	umtx_key_release(&key);
2107 	return (error);
2108 }
2109 
2110 static int
2111 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2112 	uint32_t *old_ceiling)
2113 {
2114 	struct umtx_q *uq;
2115 	uint32_t save_ceiling;
2116 	uint32_t owner, id;
2117 	uint32_t flags;
2118 	int error;
2119 
2120 	flags = fuword32(&m->m_flags);
2121 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2122 		return (EINVAL);
2123 	if (ceiling > RTP_PRIO_MAX)
2124 		return (EINVAL);
2125 	id = td->td_tid;
2126 	uq = td->td_umtxq;
2127 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2128 	   &uq->uq_key)) != 0)
2129 		return (error);
2130 	for (;;) {
2131 		umtxq_lock(&uq->uq_key);
2132 		umtxq_busy(&uq->uq_key);
2133 		umtxq_unlock(&uq->uq_key);
2134 
2135 		save_ceiling = fuword32(&m->m_ceilings[0]);
2136 
2137 		owner = casuword32(&m->m_owner,
2138 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2139 
2140 		if (owner == UMUTEX_CONTESTED) {
2141 			suword32(&m->m_ceilings[0], ceiling);
2142 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2143 				UMUTEX_CONTESTED);
2144 			error = 0;
2145 			break;
2146 		}
2147 
2148 		/* The address was invalid. */
2149 		if (owner == -1) {
2150 			error = EFAULT;
2151 			break;
2152 		}
2153 
2154 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2155 			suword32(&m->m_ceilings[0], ceiling);
2156 			error = 0;
2157 			break;
2158 		}
2159 
2160 		/*
2161 		 * If we caught a signal, we have retried and now
2162 		 * exit immediately.
2163 		 */
2164 		if (error != 0)
2165 			break;
2166 
2167 		/*
2168 		 * We set the contested bit, sleep. Otherwise the lock changed
2169 		 * and we need to retry or we lost a race to the thread
2170 		 * unlocking the umtx.
2171 		 */
2172 		umtxq_lock(&uq->uq_key);
2173 		umtxq_insert(uq);
2174 		umtxq_unbusy(&uq->uq_key);
2175 		error = umtxq_sleep(uq, "umtxpp", 0);
2176 		umtxq_remove(uq);
2177 		umtxq_unlock(&uq->uq_key);
2178 	}
2179 	umtxq_lock(&uq->uq_key);
2180 	if (error == 0)
2181 		umtxq_signal(&uq->uq_key, INT_MAX);
2182 	umtxq_unbusy(&uq->uq_key);
2183 	umtxq_unlock(&uq->uq_key);
2184 	umtx_key_release(&uq->uq_key);
2185 	if (error == 0 && old_ceiling != NULL)
2186 		suword32(old_ceiling, save_ceiling);
2187 	return (error);
2188 }
2189 
2190 static int
2191 _do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2192 	int mode)
2193 {
2194 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2195 	case 0:
2196 		return (_do_lock_normal(td, m, flags, timo, mode));
2197 	case UMUTEX_PRIO_INHERIT:
2198 		return (_do_lock_pi(td, m, flags, timo, mode));
2199 	case UMUTEX_PRIO_PROTECT:
2200 		return (_do_lock_pp(td, m, flags, timo, mode));
2201 	}
2202 	return (EINVAL);
2203 }
2204 
2205 /*
2206  * Lock a userland POSIX mutex.
2207  */
2208 static int
2209 do_lock_umutex(struct thread *td, struct umutex *m,
2210 	struct timespec *timeout, int mode)
2211 {
2212 	struct timespec ts, ts2, ts3;
2213 	struct timeval tv;
2214 	uint32_t flags;
2215 	int error;
2216 
2217 	flags = fuword32(&m->m_flags);
2218 	if (flags == -1)
2219 		return (EFAULT);
2220 
2221 	if (timeout == NULL) {
2222 		error = _do_lock_umutex(td, m, flags, 0, mode);
2223 		/* Mutex locking is restarted if it is interrupted. */
2224 		if (error == EINTR && mode != _UMUTEX_WAIT)
2225 			error = ERESTART;
2226 	} else {
2227 		getnanouptime(&ts);
2228 		timespecadd(&ts, timeout);
2229 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2230 		for (;;) {
2231 			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
2232 			if (error != ETIMEDOUT)
2233 				break;
2234 			getnanouptime(&ts2);
2235 			if (timespeccmp(&ts2, &ts, >=)) {
2236 				error = ETIMEDOUT;
2237 				break;
2238 			}
2239 			ts3 = ts;
2240 			timespecsub(&ts3, &ts2);
2241 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2242 		}
2243 		/* Timed-locking is not restarted. */
2244 		if (error == ERESTART)
2245 			error = EINTR;
2246 	}
2247 	return (error);
2248 }
2249 
2250 /*
2251  * Unlock a userland POSIX mutex.
2252  */
2253 static int
2254 do_unlock_umutex(struct thread *td, struct umutex *m)
2255 {
2256 	uint32_t flags;
2257 
2258 	flags = fuword32(&m->m_flags);
2259 	if (flags == -1)
2260 		return (EFAULT);
2261 
2262 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2263 	case 0:
2264 		return (do_unlock_normal(td, m, flags));
2265 	case UMUTEX_PRIO_INHERIT:
2266 		return (do_unlock_pi(td, m, flags));
2267 	case UMUTEX_PRIO_PROTECT:
2268 		return (do_unlock_pp(td, m, flags));
2269 	}
2270 
2271 	return (EINVAL);
2272 }
2273 
2274 static int
2275 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2276 	struct timespec *timeout, u_long wflags)
2277 {
2278 	struct umtx_q *uq;
2279 	struct timeval tv;
2280 	struct timespec cts, ets, tts;
2281 	uint32_t flags;
2282 	uint32_t clockid;
2283 	int error;
2284 
2285 	uq = td->td_umtxq;
2286 	flags = fuword32(&cv->c_flags);
2287 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2288 	if (error != 0)
2289 		return (error);
2290 
2291 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2292 		clockid = fuword32(&cv->c_clockid);
2293 		if (clockid < CLOCK_REALTIME ||
2294 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2295 			/* hmm, only HW clock id will work. */
2296 			return (EINVAL);
2297 		}
2298 	} else {
2299 		clockid = CLOCK_REALTIME;
2300 	}
2301 
2302 	umtxq_lock(&uq->uq_key);
2303 	umtxq_busy(&uq->uq_key);
2304 	umtxq_insert(uq);
2305 	umtxq_unlock(&uq->uq_key);
2306 
2307 	/*
2308 	 * Set c_has_waiters to 1 before releasing user mutex, also
2309 	 * don't modify cache line when unnecessary.
2310 	 */
2311 	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2312 		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2313 
2314 	umtxq_lock(&uq->uq_key);
2315 	umtxq_unbusy(&uq->uq_key);
2316 	umtxq_unlock(&uq->uq_key);
2317 
2318 	error = do_unlock_umutex(td, m);
2319 
2320 	umtxq_lock(&uq->uq_key);
2321 	if (error == 0) {
2322 		if (timeout == NULL) {
2323 			error = umtxq_sleep(uq, "ucond", 0);
2324 		} else {
2325 			if ((wflags & CVWAIT_ABSTIME) == 0) {
2326 				kern_clock_gettime(td, clockid, &ets);
2327 				timespecadd(&ets, timeout);
2328 				tts = *timeout;
2329 			} else { /* absolute time */
2330 				ets = *timeout;
2331 				tts = *timeout;
2332 				kern_clock_gettime(td, clockid, &cts);
2333 				timespecsub(&tts, &cts);
2334 			}
2335 			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2336 			for (;;) {
2337 				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2338 				if (error != ETIMEDOUT)
2339 					break;
2340 				kern_clock_gettime(td, clockid, &cts);
2341 				if (timespeccmp(&cts, &ets, >=)) {
2342 					error = ETIMEDOUT;
2343 					break;
2344 				}
2345 				tts = ets;
2346 				timespecsub(&tts, &cts);
2347 				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2348 			}
2349 		}
2350 	}
2351 
2352 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2353 		error = 0;
2354 	else {
2355 		/*
2356 		 * This must be timeout,interrupted by signal or
2357 		 * surprious wakeup, clear c_has_waiter flag when
2358 		 * necessary.
2359 		 */
2360 		umtxq_busy(&uq->uq_key);
2361 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2362 			int oldlen = uq->uq_cur_queue->length;
2363 			umtxq_remove(uq);
2364 			if (oldlen == 1) {
2365 				umtxq_unlock(&uq->uq_key);
2366 				suword32(
2367 				    __DEVOLATILE(uint32_t *,
2368 					 &cv->c_has_waiters), 0);
2369 				umtxq_lock(&uq->uq_key);
2370 			}
2371 		}
2372 		umtxq_unbusy(&uq->uq_key);
2373 		if (error == ERESTART)
2374 			error = EINTR;
2375 	}
2376 
2377 	umtxq_unlock(&uq->uq_key);
2378 	umtx_key_release(&uq->uq_key);
2379 	return (error);
2380 }
2381 
2382 /*
2383  * Signal a userland condition variable.
2384  */
2385 static int
2386 do_cv_signal(struct thread *td, struct ucond *cv)
2387 {
2388 	struct umtx_key key;
2389 	int error, cnt, nwake;
2390 	uint32_t flags;
2391 
2392 	flags = fuword32(&cv->c_flags);
2393 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2394 		return (error);
2395 	umtxq_lock(&key);
2396 	umtxq_busy(&key);
2397 	cnt = umtxq_count(&key);
2398 	nwake = umtxq_signal(&key, 1);
2399 	if (cnt <= nwake) {
2400 		umtxq_unlock(&key);
2401 		error = suword32(
2402 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2403 		umtxq_lock(&key);
2404 	}
2405 	umtxq_unbusy(&key);
2406 	umtxq_unlock(&key);
2407 	umtx_key_release(&key);
2408 	return (error);
2409 }
2410 
2411 static int
2412 do_cv_broadcast(struct thread *td, struct ucond *cv)
2413 {
2414 	struct umtx_key key;
2415 	int error;
2416 	uint32_t flags;
2417 
2418 	flags = fuword32(&cv->c_flags);
2419 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2420 		return (error);
2421 
2422 	umtxq_lock(&key);
2423 	umtxq_busy(&key);
2424 	umtxq_signal(&key, INT_MAX);
2425 	umtxq_unlock(&key);
2426 
2427 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2428 
2429 	umtxq_lock(&key);
2430 	umtxq_unbusy(&key);
2431 	umtxq_unlock(&key);
2432 
2433 	umtx_key_release(&key);
2434 	return (error);
2435 }
2436 
2437 static int
2438 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2439 {
2440 	struct umtx_q *uq;
2441 	uint32_t flags, wrflags;
2442 	int32_t state, oldstate;
2443 	int32_t blocked_readers;
2444 	int error;
2445 
2446 	uq = td->td_umtxq;
2447 	flags = fuword32(&rwlock->rw_flags);
2448 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2449 	if (error != 0)
2450 		return (error);
2451 
2452 	wrflags = URWLOCK_WRITE_OWNER;
2453 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2454 		wrflags |= URWLOCK_WRITE_WAITERS;
2455 
2456 	for (;;) {
2457 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2458 		/* try to lock it */
2459 		while (!(state & wrflags)) {
2460 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2461 				umtx_key_release(&uq->uq_key);
2462 				return (EAGAIN);
2463 			}
2464 			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2465 			if (oldstate == state) {
2466 				umtx_key_release(&uq->uq_key);
2467 				return (0);
2468 			}
2469 			state = oldstate;
2470 		}
2471 
2472 		if (error)
2473 			break;
2474 
2475 		/* grab monitor lock */
2476 		umtxq_lock(&uq->uq_key);
2477 		umtxq_busy(&uq->uq_key);
2478 		umtxq_unlock(&uq->uq_key);
2479 
2480 		/*
2481 		 * re-read the state, in case it changed between the try-lock above
2482 		 * and the check below
2483 		 */
2484 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2485 
2486 		/* set read contention bit */
2487 		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2488 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2489 			if (oldstate == state)
2490 				goto sleep;
2491 			state = oldstate;
2492 		}
2493 
2494 		/* state is changed while setting flags, restart */
2495 		if (!(state & wrflags)) {
2496 			umtxq_lock(&uq->uq_key);
2497 			umtxq_unbusy(&uq->uq_key);
2498 			umtxq_unlock(&uq->uq_key);
2499 			continue;
2500 		}
2501 
2502 sleep:
2503 		/* contention bit is set, before sleeping, increase read waiter count */
2504 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2505 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2506 
2507 		while (state & wrflags) {
2508 			umtxq_lock(&uq->uq_key);
2509 			umtxq_insert(uq);
2510 			umtxq_unbusy(&uq->uq_key);
2511 
2512 			error = umtxq_sleep(uq, "urdlck", timo);
2513 
2514 			umtxq_busy(&uq->uq_key);
2515 			umtxq_remove(uq);
2516 			umtxq_unlock(&uq->uq_key);
2517 			if (error)
2518 				break;
2519 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2520 		}
2521 
2522 		/* decrease read waiter count, and may clear read contention bit */
2523 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2524 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2525 		if (blocked_readers == 1) {
2526 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2527 			for (;;) {
2528 				oldstate = casuword32(&rwlock->rw_state, state,
2529 					 state & ~URWLOCK_READ_WAITERS);
2530 				if (oldstate == state)
2531 					break;
2532 				state = oldstate;
2533 			}
2534 		}
2535 
2536 		umtxq_lock(&uq->uq_key);
2537 		umtxq_unbusy(&uq->uq_key);
2538 		umtxq_unlock(&uq->uq_key);
2539 	}
2540 	umtx_key_release(&uq->uq_key);
2541 	return (error);
2542 }
2543 
2544 static int
2545 do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2546 {
2547 	struct timespec ts, ts2, ts3;
2548 	struct timeval tv;
2549 	int error;
2550 
2551 	getnanouptime(&ts);
2552 	timespecadd(&ts, timeout);
2553 	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2554 	for (;;) {
2555 		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2556 		if (error != ETIMEDOUT)
2557 			break;
2558 		getnanouptime(&ts2);
2559 		if (timespeccmp(&ts2, &ts, >=)) {
2560 			error = ETIMEDOUT;
2561 			break;
2562 		}
2563 		ts3 = ts;
2564 		timespecsub(&ts3, &ts2);
2565 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2566 	}
2567 	if (error == ERESTART)
2568 		error = EINTR;
2569 	return (error);
2570 }
2571 
2572 static int
2573 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2574 {
2575 	struct umtx_q *uq;
2576 	uint32_t flags;
2577 	int32_t state, oldstate;
2578 	int32_t blocked_writers;
2579 	int32_t blocked_readers;
2580 	int error;
2581 
2582 	uq = td->td_umtxq;
2583 	flags = fuword32(&rwlock->rw_flags);
2584 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2585 	if (error != 0)
2586 		return (error);
2587 
2588 	blocked_readers = 0;
2589 	for (;;) {
2590 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2591 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2592 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2593 			if (oldstate == state) {
2594 				umtx_key_release(&uq->uq_key);
2595 				return (0);
2596 			}
2597 			state = oldstate;
2598 		}
2599 
2600 		if (error) {
2601 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2602 			    blocked_readers != 0) {
2603 				umtxq_lock(&uq->uq_key);
2604 				umtxq_busy(&uq->uq_key);
2605 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2606 				umtxq_unbusy(&uq->uq_key);
2607 				umtxq_unlock(&uq->uq_key);
2608 			}
2609 
2610 			break;
2611 		}
2612 
2613 		/* grab monitor lock */
2614 		umtxq_lock(&uq->uq_key);
2615 		umtxq_busy(&uq->uq_key);
2616 		umtxq_unlock(&uq->uq_key);
2617 
2618 		/*
2619 		 * re-read the state, in case it changed between the try-lock above
2620 		 * and the check below
2621 		 */
2622 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2623 
2624 		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2625 		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2626 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2627 			if (oldstate == state)
2628 				goto sleep;
2629 			state = oldstate;
2630 		}
2631 
2632 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2633 			umtxq_lock(&uq->uq_key);
2634 			umtxq_unbusy(&uq->uq_key);
2635 			umtxq_unlock(&uq->uq_key);
2636 			continue;
2637 		}
2638 sleep:
2639 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2640 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2641 
2642 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2643 			umtxq_lock(&uq->uq_key);
2644 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2645 			umtxq_unbusy(&uq->uq_key);
2646 
2647 			error = umtxq_sleep(uq, "uwrlck", timo);
2648 
2649 			umtxq_busy(&uq->uq_key);
2650 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2651 			umtxq_unlock(&uq->uq_key);
2652 			if (error)
2653 				break;
2654 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2655 		}
2656 
2657 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2658 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2659 		if (blocked_writers == 1) {
2660 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2661 			for (;;) {
2662 				oldstate = casuword32(&rwlock->rw_state, state,
2663 					 state & ~URWLOCK_WRITE_WAITERS);
2664 				if (oldstate == state)
2665 					break;
2666 				state = oldstate;
2667 			}
2668 			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2669 		} else
2670 			blocked_readers = 0;
2671 
2672 		umtxq_lock(&uq->uq_key);
2673 		umtxq_unbusy(&uq->uq_key);
2674 		umtxq_unlock(&uq->uq_key);
2675 	}
2676 
2677 	umtx_key_release(&uq->uq_key);
2678 	return (error);
2679 }
2680 
2681 static int
2682 do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2683 {
2684 	struct timespec ts, ts2, ts3;
2685 	struct timeval tv;
2686 	int error;
2687 
2688 	getnanouptime(&ts);
2689 	timespecadd(&ts, timeout);
2690 	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2691 	for (;;) {
2692 		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2693 		if (error != ETIMEDOUT)
2694 			break;
2695 		getnanouptime(&ts2);
2696 		if (timespeccmp(&ts2, &ts, >=)) {
2697 			error = ETIMEDOUT;
2698 			break;
2699 		}
2700 		ts3 = ts;
2701 		timespecsub(&ts3, &ts2);
2702 		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2703 	}
2704 	if (error == ERESTART)
2705 		error = EINTR;
2706 	return (error);
2707 }
2708 
2709 static int
2710 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2711 {
2712 	struct umtx_q *uq;
2713 	uint32_t flags;
2714 	int32_t state, oldstate;
2715 	int error, q, count;
2716 
2717 	uq = td->td_umtxq;
2718 	flags = fuword32(&rwlock->rw_flags);
2719 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2720 	if (error != 0)
2721 		return (error);
2722 
2723 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2724 	if (state & URWLOCK_WRITE_OWNER) {
2725 		for (;;) {
2726 			oldstate = casuword32(&rwlock->rw_state, state,
2727 				state & ~URWLOCK_WRITE_OWNER);
2728 			if (oldstate != state) {
2729 				state = oldstate;
2730 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2731 					error = EPERM;
2732 					goto out;
2733 				}
2734 			} else
2735 				break;
2736 		}
2737 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2738 		for (;;) {
2739 			oldstate = casuword32(&rwlock->rw_state, state,
2740 				state - 1);
2741 			if (oldstate != state) {
2742 				state = oldstate;
2743 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2744 					error = EPERM;
2745 					goto out;
2746 				}
2747 			}
2748 			else
2749 				break;
2750 		}
2751 	} else {
2752 		error = EPERM;
2753 		goto out;
2754 	}
2755 
2756 	count = 0;
2757 
2758 	if (!(flags & URWLOCK_PREFER_READER)) {
2759 		if (state & URWLOCK_WRITE_WAITERS) {
2760 			count = 1;
2761 			q = UMTX_EXCLUSIVE_QUEUE;
2762 		} else if (state & URWLOCK_READ_WAITERS) {
2763 			count = INT_MAX;
2764 			q = UMTX_SHARED_QUEUE;
2765 		}
2766 	} else {
2767 		if (state & URWLOCK_READ_WAITERS) {
2768 			count = INT_MAX;
2769 			q = UMTX_SHARED_QUEUE;
2770 		} else if (state & URWLOCK_WRITE_WAITERS) {
2771 			count = 1;
2772 			q = UMTX_EXCLUSIVE_QUEUE;
2773 		}
2774 	}
2775 
2776 	if (count) {
2777 		umtxq_lock(&uq->uq_key);
2778 		umtxq_busy(&uq->uq_key);
2779 		umtxq_signal_queue(&uq->uq_key, count, q);
2780 		umtxq_unbusy(&uq->uq_key);
2781 		umtxq_unlock(&uq->uq_key);
2782 	}
2783 out:
2784 	umtx_key_release(&uq->uq_key);
2785 	return (error);
2786 }
2787 
2788 static int
2789 do_sem_wait(struct thread *td, struct _usem *sem, struct timespec *timeout)
2790 {
2791 	struct umtx_q *uq;
2792 	struct timeval tv;
2793 	struct timespec cts, ets, tts;
2794 	uint32_t flags, count;
2795 	int error;
2796 
2797 	uq = td->td_umtxq;
2798 	flags = fuword32(&sem->_flags);
2799 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2800 	if (error != 0)
2801 		return (error);
2802 	umtxq_lock(&uq->uq_key);
2803 	umtxq_busy(&uq->uq_key);
2804 	umtxq_insert(uq);
2805 	umtxq_unlock(&uq->uq_key);
2806 
2807 	if (fuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters)) == 0)
2808 		casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2809 
2810 	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2811 	if (count != 0) {
2812 		umtxq_lock(&uq->uq_key);
2813 		umtxq_unbusy(&uq->uq_key);
2814 		umtxq_remove(uq);
2815 		umtxq_unlock(&uq->uq_key);
2816 		umtx_key_release(&uq->uq_key);
2817 		return (0);
2818 	}
2819 
2820 	umtxq_lock(&uq->uq_key);
2821 	umtxq_unbusy(&uq->uq_key);
2822 	umtxq_unlock(&uq->uq_key);
2823 
2824 	umtxq_lock(&uq->uq_key);
2825 	if (timeout == NULL) {
2826 		error = umtxq_sleep(uq, "usem", 0);
2827 	} else {
2828 		getnanouptime(&ets);
2829 		timespecadd(&ets, timeout);
2830 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2831 		for (;;) {
2832 			error = umtxq_sleep(uq, "usem", tvtohz(&tv));
2833 			if (error != ETIMEDOUT)
2834 				break;
2835 			getnanouptime(&cts);
2836 			if (timespeccmp(&cts, &ets, >=)) {
2837 				error = ETIMEDOUT;
2838 				break;
2839 			}
2840 			tts = ets;
2841 			timespecsub(&tts, &cts);
2842 			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2843 		}
2844 	}
2845 
2846 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2847 		error = 0;
2848 	else {
2849 		umtxq_remove(uq);
2850 		if (error == ERESTART)
2851 			error = EINTR;
2852 	}
2853 	umtxq_unlock(&uq->uq_key);
2854 	umtx_key_release(&uq->uq_key);
2855 	return (error);
2856 }
2857 
2858 /*
2859  * Signal a userland condition variable.
2860  */
2861 static int
2862 do_sem_wake(struct thread *td, struct _usem *sem)
2863 {
2864 	struct umtx_key key;
2865 	int error, cnt, nwake;
2866 	uint32_t flags;
2867 
2868 	flags = fuword32(&sem->_flags);
2869 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2870 		return (error);
2871 	umtxq_lock(&key);
2872 	umtxq_busy(&key);
2873 	cnt = umtxq_count(&key);
2874 	nwake = umtxq_signal(&key, 1);
2875 	if (cnt <= nwake) {
2876 		umtxq_unlock(&key);
2877 		error = suword32(
2878 		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2879 		umtxq_lock(&key);
2880 	}
2881 	umtxq_unbusy(&key);
2882 	umtxq_unlock(&key);
2883 	umtx_key_release(&key);
2884 	return (error);
2885 }
2886 
2887 int
2888 _umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2889     /* struct umtx *umtx */
2890 {
2891 	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2892 }
2893 
2894 int
2895 _umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2896     /* struct umtx *umtx */
2897 {
2898 	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2899 }
2900 
2901 static int
2902 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2903 {
2904 	struct timespec *ts, timeout;
2905 	int error;
2906 
2907 	/* Allow a null timespec (wait forever). */
2908 	if (uap->uaddr2 == NULL)
2909 		ts = NULL;
2910 	else {
2911 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2912 		if (error != 0)
2913 			return (error);
2914 		if (timeout.tv_nsec >= 1000000000 ||
2915 		    timeout.tv_nsec < 0) {
2916 			return (EINVAL);
2917 		}
2918 		ts = &timeout;
2919 	}
2920 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2921 }
2922 
2923 static int
2924 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2925 {
2926 	return (do_unlock_umtx(td, uap->obj, uap->val));
2927 }
2928 
2929 static int
2930 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2931 {
2932 	struct timespec *ts, timeout;
2933 	int error;
2934 
2935 	if (uap->uaddr2 == NULL)
2936 		ts = NULL;
2937 	else {
2938 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2939 		if (error != 0)
2940 			return (error);
2941 		if (timeout.tv_nsec >= 1000000000 ||
2942 		    timeout.tv_nsec < 0)
2943 			return (EINVAL);
2944 		ts = &timeout;
2945 	}
2946 	return do_wait(td, uap->obj, uap->val, ts, 0, 0);
2947 }
2948 
2949 static int
2950 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2951 {
2952 	struct timespec *ts, timeout;
2953 	int error;
2954 
2955 	if (uap->uaddr2 == NULL)
2956 		ts = NULL;
2957 	else {
2958 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2959 		if (error != 0)
2960 			return (error);
2961 		if (timeout.tv_nsec >= 1000000000 ||
2962 		    timeout.tv_nsec < 0)
2963 			return (EINVAL);
2964 		ts = &timeout;
2965 	}
2966 	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
2967 }
2968 
2969 static int
2970 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
2971 {
2972 	struct timespec *ts, timeout;
2973 	int error;
2974 
2975 	if (uap->uaddr2 == NULL)
2976 		ts = NULL;
2977 	else {
2978 		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2979 		if (error != 0)
2980 			return (error);
2981 		if (timeout.tv_nsec >= 1000000000 ||
2982 		    timeout.tv_nsec < 0)
2983 			return (EINVAL);
2984 		ts = &timeout;
2985 	}
2986 	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
2987 }
2988 
2989 static int
2990 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2991 {
2992 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
2993 }
2994 
2995 #define BATCH_SIZE	128
2996 static int
2997 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
2998 {
2999 	int count = uap->val;
3000 	void *uaddrs[BATCH_SIZE];
3001 	char **upp = (char **)uap->obj;
3002 	int tocopy;
3003 	int error = 0;
3004 	int i, pos = 0;
3005 
3006 	while (count > 0) {
3007 		tocopy = count;
3008 		if (tocopy > BATCH_SIZE)
3009 			tocopy = BATCH_SIZE;
3010 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3011 		if (error != 0)
3012 			break;
3013 		for (i = 0; i < tocopy; ++i)
3014 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3015 		count -= tocopy;
3016 		pos += tocopy;
3017 	}
3018 	return (error);
3019 }
3020 
3021 static int
3022 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3023 {
3024 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3025 }
3026 
3027 static int
3028 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3029 {
3030 	struct timespec *ts, timeout;
3031 	int error;
3032 
3033 	/* Allow a null timespec (wait forever). */
3034 	if (uap->uaddr2 == NULL)
3035 		ts = NULL;
3036 	else {
3037 		error = copyin(uap->uaddr2, &timeout,
3038 		    sizeof(timeout));
3039 		if (error != 0)
3040 			return (error);
3041 		if (timeout.tv_nsec >= 1000000000 ||
3042 		    timeout.tv_nsec < 0) {
3043 			return (EINVAL);
3044 		}
3045 		ts = &timeout;
3046 	}
3047 	return do_lock_umutex(td, uap->obj, ts, 0);
3048 }
3049 
3050 static int
3051 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3052 {
3053 	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3054 }
3055 
3056 static int
3057 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3058 {
3059 	struct timespec *ts, timeout;
3060 	int error;
3061 
3062 	/* Allow a null timespec (wait forever). */
3063 	if (uap->uaddr2 == NULL)
3064 		ts = NULL;
3065 	else {
3066 		error = copyin(uap->uaddr2, &timeout,
3067 		    sizeof(timeout));
3068 		if (error != 0)
3069 			return (error);
3070 		if (timeout.tv_nsec >= 1000000000 ||
3071 		    timeout.tv_nsec < 0) {
3072 			return (EINVAL);
3073 		}
3074 		ts = &timeout;
3075 	}
3076 	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
3077 }
3078 
3079 static int
3080 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3081 {
3082 	return do_wake_umutex(td, uap->obj);
3083 }
3084 
3085 static int
3086 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3087 {
3088 	return do_unlock_umutex(td, uap->obj);
3089 }
3090 
3091 static int
3092 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3093 {
3094 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3095 }
3096 
3097 static int
3098 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3099 {
3100 	struct timespec *ts, timeout;
3101 	int error;
3102 
3103 	/* Allow a null timespec (wait forever). */
3104 	if (uap->uaddr2 == NULL)
3105 		ts = NULL;
3106 	else {
3107 		error = copyin(uap->uaddr2, &timeout,
3108 		    sizeof(timeout));
3109 		if (error != 0)
3110 			return (error);
3111 		if (timeout.tv_nsec >= 1000000000 ||
3112 		    timeout.tv_nsec < 0) {
3113 			return (EINVAL);
3114 		}
3115 		ts = &timeout;
3116 	}
3117 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3118 }
3119 
3120 static int
3121 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3122 {
3123 	return do_cv_signal(td, uap->obj);
3124 }
3125 
3126 static int
3127 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3128 {
3129 	return do_cv_broadcast(td, uap->obj);
3130 }
3131 
3132 static int
3133 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3134 {
3135 	struct timespec timeout;
3136 	int error;
3137 
3138 	/* Allow a null timespec (wait forever). */
3139 	if (uap->uaddr2 == NULL) {
3140 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3141 	} else {
3142 		error = copyin(uap->uaddr2, &timeout,
3143 		    sizeof(timeout));
3144 		if (error != 0)
3145 			return (error);
3146 		if (timeout.tv_nsec >= 1000000000 ||
3147 		    timeout.tv_nsec < 0) {
3148 			return (EINVAL);
3149 		}
3150 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3151 	}
3152 	return (error);
3153 }
3154 
3155 static int
3156 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3157 {
3158 	struct timespec timeout;
3159 	int error;
3160 
3161 	/* Allow a null timespec (wait forever). */
3162 	if (uap->uaddr2 == NULL) {
3163 		error = do_rw_wrlock(td, uap->obj, 0);
3164 	} else {
3165 		error = copyin(uap->uaddr2, &timeout,
3166 		    sizeof(timeout));
3167 		if (error != 0)
3168 			return (error);
3169 		if (timeout.tv_nsec >= 1000000000 ||
3170 		    timeout.tv_nsec < 0) {
3171 			return (EINVAL);
3172 		}
3173 
3174 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3175 	}
3176 	return (error);
3177 }
3178 
3179 static int
3180 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3181 {
3182 	return do_rw_unlock(td, uap->obj);
3183 }
3184 
3185 static int
3186 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3187 {
3188 	struct timespec *ts, timeout;
3189 	int error;
3190 
3191 	/* Allow a null timespec (wait forever). */
3192 	if (uap->uaddr2 == NULL)
3193 		ts = NULL;
3194 	else {
3195 		error = copyin(uap->uaddr2, &timeout,
3196 		    sizeof(timeout));
3197 		if (error != 0)
3198 			return (error);
3199 		if (timeout.tv_nsec >= 1000000000 ||
3200 		    timeout.tv_nsec < 0) {
3201 			return (EINVAL);
3202 		}
3203 		ts = &timeout;
3204 	}
3205 	return (do_sem_wait(td, uap->obj, ts));
3206 }
3207 
3208 static int
3209 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3210 {
3211 	return do_sem_wake(td, uap->obj);
3212 }
3213 
3214 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3215 
3216 static _umtx_op_func op_table[] = {
3217 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3218 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3219 	__umtx_op_wait,			/* UMTX_OP_WAIT */
3220 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3221 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3222 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3223 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3224 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3225 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT*/
3226 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3227 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3228 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3229 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3230 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3231 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3232 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3233 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3234 	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3235 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3236 	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3237 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3238 	__umtx_op_nwake_private		/* UMTX_OP_NWAKE_PRIVATE */
3239 };
3240 
3241 int
3242 _umtx_op(struct thread *td, struct _umtx_op_args *uap)
3243 {
3244 	if ((unsigned)uap->op < UMTX_OP_MAX)
3245 		return (*op_table[uap->op])(td, uap);
3246 	return (EINVAL);
3247 }
3248 
3249 #ifdef COMPAT_FREEBSD32
3250 int
3251 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3252     /* struct umtx *umtx */
3253 {
3254 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3255 }
3256 
3257 int
3258 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3259     /* struct umtx *umtx */
3260 {
3261 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3262 }
3263 
3264 struct timespec32 {
3265 	uint32_t tv_sec;
3266 	uint32_t tv_nsec;
3267 };
3268 
3269 static inline int
3270 copyin_timeout32(void *addr, struct timespec *tsp)
3271 {
3272 	struct timespec32 ts32;
3273 	int error;
3274 
3275 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3276 	if (error == 0) {
3277 		tsp->tv_sec = ts32.tv_sec;
3278 		tsp->tv_nsec = ts32.tv_nsec;
3279 	}
3280 	return (error);
3281 }
3282 
3283 static int
3284 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3285 {
3286 	struct timespec *ts, timeout;
3287 	int error;
3288 
3289 	/* Allow a null timespec (wait forever). */
3290 	if (uap->uaddr2 == NULL)
3291 		ts = NULL;
3292 	else {
3293 		error = copyin_timeout32(uap->uaddr2, &timeout);
3294 		if (error != 0)
3295 			return (error);
3296 		if (timeout.tv_nsec >= 1000000000 ||
3297 		    timeout.tv_nsec < 0) {
3298 			return (EINVAL);
3299 		}
3300 		ts = &timeout;
3301 	}
3302 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3303 }
3304 
3305 static int
3306 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3307 {
3308 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3309 }
3310 
3311 static int
3312 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3313 {
3314 	struct timespec *ts, timeout;
3315 	int error;
3316 
3317 	if (uap->uaddr2 == NULL)
3318 		ts = NULL;
3319 	else {
3320 		error = copyin_timeout32(uap->uaddr2, &timeout);
3321 		if (error != 0)
3322 			return (error);
3323 		if (timeout.tv_nsec >= 1000000000 ||
3324 		    timeout.tv_nsec < 0)
3325 			return (EINVAL);
3326 		ts = &timeout;
3327 	}
3328 	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
3329 }
3330 
3331 static int
3332 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3333 {
3334 	struct timespec *ts, timeout;
3335 	int error;
3336 
3337 	/* Allow a null timespec (wait forever). */
3338 	if (uap->uaddr2 == NULL)
3339 		ts = NULL;
3340 	else {
3341 		error = copyin_timeout32(uap->uaddr2, &timeout);
3342 		if (error != 0)
3343 			return (error);
3344 		if (timeout.tv_nsec >= 1000000000 ||
3345 		    timeout.tv_nsec < 0)
3346 			return (EINVAL);
3347 		ts = &timeout;
3348 	}
3349 	return do_lock_umutex(td, uap->obj, ts, 0);
3350 }
3351 
3352 static int
3353 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3354 {
3355 	struct timespec *ts, timeout;
3356 	int error;
3357 
3358 	/* Allow a null timespec (wait forever). */
3359 	if (uap->uaddr2 == NULL)
3360 		ts = NULL;
3361 	else {
3362 		error = copyin_timeout32(uap->uaddr2, &timeout);
3363 		if (error != 0)
3364 			return (error);
3365 		if (timeout.tv_nsec >= 1000000000 ||
3366 		    timeout.tv_nsec < 0)
3367 			return (EINVAL);
3368 		ts = &timeout;
3369 	}
3370 	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
3371 }
3372 
3373 static int
3374 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3375 {
3376 	struct timespec *ts, timeout;
3377 	int error;
3378 
3379 	/* Allow a null timespec (wait forever). */
3380 	if (uap->uaddr2 == NULL)
3381 		ts = NULL;
3382 	else {
3383 		error = copyin_timeout32(uap->uaddr2, &timeout);
3384 		if (error != 0)
3385 			return (error);
3386 		if (timeout.tv_nsec >= 1000000000 ||
3387 		    timeout.tv_nsec < 0)
3388 			return (EINVAL);
3389 		ts = &timeout;
3390 	}
3391 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3392 }
3393 
3394 static int
3395 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3396 {
3397 	struct timespec timeout;
3398 	int error;
3399 
3400 	/* Allow a null timespec (wait forever). */
3401 	if (uap->uaddr2 == NULL) {
3402 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3403 	} else {
3404 		error = copyin_timeout32(uap->uaddr2, &timeout);
3405 		if (error != 0)
3406 			return (error);
3407 		if (timeout.tv_nsec >= 1000000000 ||
3408 		    timeout.tv_nsec < 0) {
3409 			return (EINVAL);
3410 		}
3411 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3412 	}
3413 	return (error);
3414 }
3415 
3416 static int
3417 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3418 {
3419 	struct timespec timeout;
3420 	int error;
3421 
3422 	/* Allow a null timespec (wait forever). */
3423 	if (uap->uaddr2 == NULL) {
3424 		error = do_rw_wrlock(td, uap->obj, 0);
3425 	} else {
3426 		error = copyin_timeout32(uap->uaddr2, &timeout);
3427 		if (error != 0)
3428 			return (error);
3429 		if (timeout.tv_nsec >= 1000000000 ||
3430 		    timeout.tv_nsec < 0) {
3431 			return (EINVAL);
3432 		}
3433 
3434 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3435 	}
3436 	return (error);
3437 }
3438 
3439 static int
3440 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3441 {
3442 	struct timespec *ts, timeout;
3443 	int error;
3444 
3445 	if (uap->uaddr2 == NULL)
3446 		ts = NULL;
3447 	else {
3448 		error = copyin_timeout32(uap->uaddr2, &timeout);
3449 		if (error != 0)
3450 			return (error);
3451 		if (timeout.tv_nsec >= 1000000000 ||
3452 		    timeout.tv_nsec < 0)
3453 			return (EINVAL);
3454 		ts = &timeout;
3455 	}
3456 	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
3457 }
3458 
3459 static int
3460 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3461 {
3462 	struct timespec *ts, timeout;
3463 	int error;
3464 
3465 	/* Allow a null timespec (wait forever). */
3466 	if (uap->uaddr2 == NULL)
3467 		ts = NULL;
3468 	else {
3469 		error = copyin_timeout32(uap->uaddr2, &timeout);
3470 		if (error != 0)
3471 			return (error);
3472 		if (timeout.tv_nsec >= 1000000000 ||
3473 		    timeout.tv_nsec < 0)
3474 			return (EINVAL);
3475 		ts = &timeout;
3476 	}
3477 	return (do_sem_wait(td, uap->obj, ts));
3478 }
3479 
3480 static int
3481 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3482 {
3483 	int count = uap->val;
3484 	uint32_t uaddrs[BATCH_SIZE];
3485 	uint32_t **upp = (uint32_t **)uap->obj;
3486 	int tocopy;
3487 	int error = 0;
3488 	int i, pos = 0;
3489 
3490 	while (count > 0) {
3491 		tocopy = count;
3492 		if (tocopy > BATCH_SIZE)
3493 			tocopy = BATCH_SIZE;
3494 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3495 		if (error != 0)
3496 			break;
3497 		for (i = 0; i < tocopy; ++i)
3498 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3499 				INT_MAX, 1);
3500 		count -= tocopy;
3501 		pos += tocopy;
3502 	}
3503 	return (error);
3504 }
3505 
3506 static _umtx_op_func op_table_compat32[] = {
3507 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3508 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3509 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3510 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3511 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_LOCK */
3512 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_TRYLOCK */
3513 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
3514 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3515 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT*/
3516 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3517 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3518 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3519 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3520 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3521 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3522 	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3523 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3524 	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3525 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3526 	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3527 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3528 	__umtx_op_nwake_private32	/* UMTX_OP_NWAKE_PRIVATE */
3529 };
3530 
3531 int
3532 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3533 {
3534 	if ((unsigned)uap->op < UMTX_OP_MAX)
3535 		return (*op_table_compat32[uap->op])(td,
3536 			(struct _umtx_op_args *)uap);
3537 	return (EINVAL);
3538 }
3539 #endif
3540 
3541 void
3542 umtx_thread_init(struct thread *td)
3543 {
3544 	td->td_umtxq = umtxq_alloc();
3545 	td->td_umtxq->uq_thread = td;
3546 }
3547 
3548 void
3549 umtx_thread_fini(struct thread *td)
3550 {
3551 	umtxq_free(td->td_umtxq);
3552 }
3553 
3554 /*
3555  * It will be called when new thread is created, e.g fork().
3556  */
3557 void
3558 umtx_thread_alloc(struct thread *td)
3559 {
3560 	struct umtx_q *uq;
3561 
3562 	uq = td->td_umtxq;
3563 	uq->uq_inherited_pri = PRI_MAX;
3564 
3565 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3566 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3567 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3568 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3569 }
3570 
3571 /*
3572  * exec() hook.
3573  */
3574 static void
3575 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3576 	struct image_params *imgp __unused)
3577 {
3578 	umtx_thread_cleanup(curthread);
3579 }
3580 
3581 /*
3582  * thread_exit() hook.
3583  */
3584 void
3585 umtx_thread_exit(struct thread *td)
3586 {
3587 	umtx_thread_cleanup(td);
3588 }
3589 
3590 /*
3591  * clean up umtx data.
3592  */
3593 static void
3594 umtx_thread_cleanup(struct thread *td)
3595 {
3596 	struct umtx_q *uq;
3597 	struct umtx_pi *pi;
3598 
3599 	if ((uq = td->td_umtxq) == NULL)
3600 		return;
3601 
3602 	mtx_lock_spin(&umtx_lock);
3603 	uq->uq_inherited_pri = PRI_MAX;
3604 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3605 		pi->pi_owner = NULL;
3606 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3607 	}
3608 	mtx_unlock_spin(&umtx_lock);
3609 	thread_lock(td);
3610 	sched_lend_user_prio(td, PRI_MAX);
3611 	thread_unlock(td);
3612 }
3613