xref: /freebsd/sys/kern/kern_umtx.c (revision 9124ddeb4a551977cf6b2218291e7c666ce25f47)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/mutex.h>
38 #include <sys/priv.h>
39 #include <sys/proc.h>
40 #include <sys/sched.h>
41 #include <sys/smp.h>
42 #include <sys/sysctl.h>
43 #include <sys/sysent.h>
44 #include <sys/systm.h>
45 #include <sys/sysproto.h>
46 #include <sys/syscallsubr.h>
47 #include <sys/eventhandler.h>
48 #include <sys/umtx.h>
49 
50 #include <vm/vm.h>
51 #include <vm/vm_param.h>
52 #include <vm/pmap.h>
53 #include <vm/vm_map.h>
54 #include <vm/vm_object.h>
55 
56 #include <machine/cpu.h>
57 
58 #ifdef COMPAT_FREEBSD32
59 #include <compat/freebsd32/freebsd32_proto.h>
60 #endif
61 
62 #define _UMUTEX_TRY		1
63 #define _UMUTEX_WAIT		2
64 
65 /* Priority inheritance mutex info. */
66 struct umtx_pi {
67 	/* Owner thread */
68 	struct thread		*pi_owner;
69 
70 	/* Reference count */
71 	int			pi_refcount;
72 
73 	/* List entry to link umtx objects held by a thread */
74 	TAILQ_ENTRY(umtx_pi)	pi_link;
75 
76 	/* List entry in hash */
77 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
78 
79 	/* List for waiters */
80 	TAILQ_HEAD(,umtx_q)	pi_blocked;
81 
82 	/* Identify a userland lock object */
83 	struct umtx_key		pi_key;
84 };
85 
86 /* A userland synchronization object user. */
87 struct umtx_q {
88 	/* Linked list for the hash. */
89 	TAILQ_ENTRY(umtx_q)	uq_link;
90 
91 	/* Umtx key. */
92 	struct umtx_key		uq_key;
93 
94 	/* Umtx flags. */
95 	int			uq_flags;
96 #define UQF_UMTXQ	0x0001
97 
98 	/* The thread that is waiting. */
99 	struct thread		*uq_thread;
100 
101 	/*
102 	 * The PI mutex this thread is blocked on.  Reads may hold either
103 	 * the chain lock or umtx_lock; writes must hold both.
104 	 */
106 	struct umtx_pi		*uq_pi_blocked;
107 
108 	/* On blocked list */
109 	TAILQ_ENTRY(umtx_q)	uq_lockq;
110 
111 	/* Contested PI mutexes owned by this thread */
112 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
113 
114 	/* Inherited priority from PP mutex */
115 	u_char			uq_inherited_pri;
116 
117 	/* Spare queue ready to be reused */
118 	struct umtxq_queue	*uq_spare_queue;
119 
120 	/* The queue we are on */
121 	struct umtxq_queue	*uq_cur_queue;
122 };
123 
124 TAILQ_HEAD(umtxq_head, umtx_q);
125 
126 /* Per-key wait-queue */
127 struct umtxq_queue {
128 	struct umtxq_head	head;
129 	struct umtx_key		key;
130 	LIST_ENTRY(umtxq_queue)	link;
131 	int			length;
132 };
133 
134 LIST_HEAD(umtxq_list, umtxq_queue);
135 
136 /* Userland lock object's wait-queue chain */
137 struct umtxq_chain {
138 	/* Lock for this chain. */
139 	struct mtx		uc_lock;
140 
141 	/* List of sleep queues. */
142 	struct umtxq_list	uc_queue[2];
143 #define UMTX_SHARED_QUEUE	0
144 #define UMTX_EXCLUSIVE_QUEUE	1
145 
146 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
147 
148 	/* Busy flag */
149 	char			uc_busy;
150 
151 	/* Chain lock waiters */
152 	int			uc_waiters;
153 
154 	/* All PI mutexes hashed to this chain */
155 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
156 
157 };
158 
159 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
160 #define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
161 
162 /*
163  * Don't propagate time-sharing priority; there is a security reason.
164  * A user could simply create a PI mutex, let thread A lock it, and
165  * let another thread B block on it.  Because B is sleeping, its
166  * priority would be boosted; priority propagation would then boost
167  * A's priority as well, and it would never be lowered even if A were
168  * using 100% CPU, which is unfair to other processes.
169  */
170 
171 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
172 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
173 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
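/*
 * Note: UPRI() therefore maps every time-sharing thread to
 * PRI_MAX_TIMESHARE (the weakest time-sharing user priority), so such
 * waiters cannot boost a lock owner, while a real-time thread's
 * td_user_pri is used as-is.
 */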
174 
175 #define	GOLDEN_RATIO_PRIME	2654404609U
176 #define	UMTX_CHAINS		512
177 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
178 
179 #define	GET_SHARE(flags)	\
180     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
181 
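/*
 * Upper bound on the optimistic spin performed by umtxq_busy() on SMP
 * systems before the caller gives up and sleeps on the chain.
 */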
182 #define BUSY_SPINS		200
183 
184 static uma_zone_t		umtx_pi_zone;
185 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
186 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
187 static int			umtx_pi_allocated;
188 
189 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
190 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
191     &umtx_pi_allocated, 0, "Allocated umtx_pi");
192 
193 static void umtxq_sysinit(void *);
194 static void umtxq_hash(struct umtx_key *key);
195 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
196 static void umtxq_lock(struct umtx_key *key);
197 static void umtxq_unlock(struct umtx_key *key);
198 static void umtxq_busy(struct umtx_key *key);
199 static void umtxq_unbusy(struct umtx_key *key);
200 static void umtxq_insert_queue(struct umtx_q *uq, int q);
201 static void umtxq_remove_queue(struct umtx_q *uq, int q);
202 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
203 static int umtxq_count(struct umtx_key *key);
204 static struct umtx_pi *umtx_pi_alloc(int);
205 static void umtx_pi_free(struct umtx_pi *pi);
206 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
207 static void umtx_thread_cleanup(struct thread *td);
208 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
209 	struct image_params *imgp __unused);
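/*
 * Initialization is ordered just after the event handler subsystem
 * (SI_SUB_EVENTHANDLER + 1) because umtxq_sysinit() registers a
 * process_exec event handler.
 */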
210 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
211 
212 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
213 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
214 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
215 
216 static struct mtx umtx_lock;
217 
218 static void
219 umtxq_sysinit(void *arg __unused)
220 {
221 	int i, j;
222 
223 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
224 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
225 	for (i = 0; i < 2; ++i) {
226 		for (j = 0; j < UMTX_CHAINS; ++j) {
227 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
228 				 MTX_DEF | MTX_DUPOK);
229 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
230 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
231 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
232 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
233 			umtxq_chains[i][j].uc_busy = 0;
234 			umtxq_chains[i][j].uc_waiters = 0;
235 		}
236 	}
237 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
238 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
239 	    EVENTHANDLER_PRI_ANY);
240 }
241 
242 struct umtx_q *
243 umtxq_alloc(void)
244 {
245 	struct umtx_q *uq;
246 
247 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
248 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
249 	TAILQ_INIT(&uq->uq_spare_queue->head);
250 	TAILQ_INIT(&uq->uq_pi_contested);
251 	uq->uq_inherited_pri = PRI_MAX;
252 	return (uq);
253 }
254 
255 void
256 umtxq_free(struct umtx_q *uq)
257 {
258 	MPASS(uq->uq_spare_queue != NULL);
259 	free(uq->uq_spare_queue, M_UMTX);
260 	free(uq, M_UMTX);
261 }
262 
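/*
 * Hash a key into one of the UMTX_CHAINS buckets.  This is
 * multiplicative (Fibonacci-style) hashing: GOLDEN_RATIO_PRIME
 * scatters nearby addresses, and the high bits selected by
 * UMTX_SHIFTS are folded into the table size.
 */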
263 static inline void
264 umtxq_hash(struct umtx_key *key)
265 {
266 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
267 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
268 }
269 
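/*
 * Select the wait-queue chain for a key.  Two chain tables are kept;
 * key types up to TYPE_SEM hash into the second table, presumably so
 * that those objects do not contend for chains (and chain locks) with
 * the remaining lock types.
 */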
270 static inline struct umtxq_chain *
271 umtxq_getchain(struct umtx_key *key)
272 {
273 	if (key->type <= TYPE_SEM)
274 		return (&umtxq_chains[1][key->hash]);
275 	return (&umtxq_chains[0][key->hash]);
276 }
277 
278 /*
279  * Lock a chain.
280  */
281 static inline void
282 umtxq_lock(struct umtx_key *key)
283 {
284 	struct umtxq_chain *uc;
285 
286 	uc = umtxq_getchain(key);
287 	mtx_lock(&uc->uc_lock);
288 }
289 
290 /*
291  * Unlock a chain.
292  */
293 static inline void
294 umtxq_unlock(struct umtx_key *key)
295 {
296 	struct umtxq_chain *uc;
297 
298 	uc = umtxq_getchain(key);
299 	mtx_unlock(&uc->uc_lock);
300 }
301 
302 /*
303  * Mark the chain busy when the following operation may block
304  * (a kernel mutex cannot be held across a sleep).
305  */
306 static inline void
307 umtxq_busy(struct umtx_key *key)
308 {
309 	struct umtxq_chain *uc;
310 
311 	uc = umtxq_getchain(key);
312 	mtx_assert(&uc->uc_lock, MA_OWNED);
313 	if (uc->uc_busy) {
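		/*
		 * On SMP, spin briefly in the hope that the chain's busy
		 * holder is running on another CPU and will clear uc_busy
		 * soon; only then fall back to sleeping on the chain.
		 */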
314 #ifdef SMP
315 		if (smp_cpus > 1) {
316 			int count = BUSY_SPINS;
317 			if (count > 0) {
318 				umtxq_unlock(key);
319 				while (uc->uc_busy && --count > 0)
320 					cpu_spinwait();
321 				umtxq_lock(key);
322 			}
323 		}
324 #endif
325 		while (uc->uc_busy) {
326 			uc->uc_waiters++;
327 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
328 			uc->uc_waiters--;
329 		}
330 	}
331 	uc->uc_busy = 1;
332 }
333 
334 /*
335  * Unbusy a chain.
336  */
337 static inline void
338 umtxq_unbusy(struct umtx_key *key)
339 {
340 	struct umtxq_chain *uc;
341 
342 	uc = umtxq_getchain(key);
343 	mtx_assert(&uc->uc_lock, MA_OWNED);
344 	KASSERT(uc->uc_busy != 0, ("not busy"));
345 	uc->uc_busy = 0;
346 	if (uc->uc_waiters)
347 		wakeup_one(uc);
348 }
349 
350 static struct umtxq_queue *
351 umtxq_queue_lookup(struct umtx_key *key, int q)
352 {
353 	struct umtxq_queue *uh;
354 	struct umtxq_chain *uc;
355 
356 	uc = umtxq_getchain(key);
357 	UMTXQ_LOCKED_ASSERT(uc);
358 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
359 		if (umtx_key_match(&uh->key, key))
360 			return (uh);
361 	}
362 
363 	return (NULL);
364 }
365 
366 static inline void
367 umtxq_insert_queue(struct umtx_q *uq, int q)
368 {
369 	struct umtxq_queue *uh;
370 	struct umtxq_chain *uc;
371 
372 	uc = umtxq_getchain(&uq->uq_key);
373 	UMTXQ_LOCKED_ASSERT(uc);
374 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
375 	uh = umtxq_queue_lookup(&uq->uq_key, q);
376 	if (uh != NULL) {
377 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
378 	} else {
379 		uh = uq->uq_spare_queue;
380 		uh->key = uq->uq_key;
381 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
382 	}
383 	uq->uq_spare_queue = NULL;
384 
385 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
386 	uh->length++;
387 	uq->uq_flags |= UQF_UMTXQ;
388 	uq->uq_cur_queue = uh;
389 	return;
390 }
391 
392 static inline void
393 umtxq_remove_queue(struct umtx_q *uq, int q)
394 {
395 	struct umtxq_chain *uc;
396 	struct umtxq_queue *uh;
397 
398 	uc = umtxq_getchain(&uq->uq_key);
399 	UMTXQ_LOCKED_ASSERT(uc);
400 	if (uq->uq_flags & UQF_UMTXQ) {
401 		uh = uq->uq_cur_queue;
402 		TAILQ_REMOVE(&uh->head, uq, uq_link);
403 		uh->length--;
404 		uq->uq_flags &= ~UQF_UMTXQ;
405 		if (TAILQ_EMPTY(&uh->head)) {
406 			KASSERT(uh->length == 0,
407 			    ("inconsistent umtxq_queue length"));
408 			LIST_REMOVE(uh, link);
409 		} else {
410 			uh = LIST_FIRST(&uc->uc_spare_queue);
411 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
412 			LIST_REMOVE(uh, link);
413 		}
414 		uq->uq_spare_queue = uh;
415 		uq->uq_cur_queue = NULL;
416 	}
417 }
418 
419 /*
420  * Return the number of waiters on the shared queue.
421  */
422 static int
423 umtxq_count(struct umtx_key *key)
424 {
425 	struct umtxq_chain *uc;
426 	struct umtxq_queue *uh;
427 
428 	uc = umtxq_getchain(key);
429 	UMTXQ_LOCKED_ASSERT(uc);
430 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
431 	if (uh != NULL)
432 		return (uh->length);
433 	return (0);
434 }
435 
436 /*
437  * Return the number of PI waiters and, through *first, the first
438  * waiter on the queue.
439  */
440 static int
441 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
442 {
443 	struct umtxq_chain *uc;
444 	struct umtxq_queue *uh;
445 
446 	*first = NULL;
447 	uc = umtxq_getchain(key);
448 	UMTXQ_LOCKED_ASSERT(uc);
449 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
450 	if (uh != NULL) {
451 		*first = TAILQ_FIRST(&uh->head);
452 		return (uh->length);
453 	}
454 	return (0);
455 }
456 
457 /*
458  * Wake up threads waiting on a userland object.
459  */
461 static int
462 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
463 {
464 	struct umtxq_chain *uc;
465 	struct umtxq_queue *uh;
466 	struct umtx_q *uq;
467 	int ret;
468 
469 	ret = 0;
470 	uc = umtxq_getchain(key);
471 	UMTXQ_LOCKED_ASSERT(uc);
472 	uh = umtxq_queue_lookup(key, q);
473 	if (uh != NULL) {
474 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
475 			umtxq_remove_queue(uq, q);
476 			wakeup(uq);
477 			if (++ret >= n_wake)
478 				return (ret);
479 		}
480 	}
481 	return (ret);
482 }
483 
485 /*
486  * Wake up the specified thread.
487  */
488 static inline void
489 umtxq_signal_thread(struct umtx_q *uq)
490 {
491 	struct umtxq_chain *uc;
492 
493 	uc = umtxq_getchain(&uq->uq_key);
494 	UMTXQ_LOCKED_ASSERT(uc);
495 	umtxq_remove(uq);
496 	wakeup(uq);
497 }
498 
499 /*
500  * Put the thread into a sleep state.  Before sleeping, check whether
501  * the thread was already removed from the umtx queue.
502  */
503 static inline int
504 umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
505 {
506 	struct umtxq_chain *uc;
507 	int error;
508 
509 	uc = umtxq_getchain(&uq->uq_key);
510 	UMTXQ_LOCKED_ASSERT(uc);
511 	if (!(uq->uq_flags & UQF_UMTXQ))
512 		return (0);
513 	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
514 	if (error == EWOULDBLOCK)
515 		error = ETIMEDOUT;
516 	return (error);
517 }
518 
519 /*
520  * Convert a userspace address into a unique logical address (the key).
521  */
522 int
523 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
524 {
525 	struct thread *td = curthread;
526 	vm_map_t map;
527 	vm_map_entry_t entry;
528 	vm_pindex_t pindex;
529 	vm_prot_t prot;
530 	boolean_t wired;
531 
532 	key->type = type;
533 	if (share == THREAD_SHARE) {
534 		key->shared = 0;
535 		key->info.private.vs = td->td_proc->p_vmspace;
536 		key->info.private.addr = (uintptr_t)addr;
537 	} else {
538 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
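		/*
		 * For AUTO_SHARE the decision is made below from the VM map
		 * entry: a VM_INHERIT_SHARE mapping is treated as process
		 * shared, anything else as private to this address space.
		 */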
539 		map = &td->td_proc->p_vmspace->vm_map;
540 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
541 		    &entry, &key->info.shared.object, &pindex, &prot,
542 		    &wired) != KERN_SUCCESS) {
543 			return (EFAULT);
544 		}
545 
546 		if ((share == PROCESS_SHARE) ||
547 		    (share == AUTO_SHARE &&
548 		     VM_INHERIT_SHARE == entry->inheritance)) {
549 			key->shared = 1;
550 			key->info.shared.offset = entry->offset + entry->start -
551 				(vm_offset_t)addr;
552 			vm_object_reference(key->info.shared.object);
553 		} else {
554 			key->shared = 0;
555 			key->info.private.vs = td->td_proc->p_vmspace;
556 			key->info.private.addr = (uintptr_t)addr;
557 		}
558 		vm_map_lookup_done(map, entry);
559 	}
560 
561 	umtxq_hash(key);
562 	return (0);
563 }
564 
565 /*
566  * Release key.
567  */
568 void
569 umtx_key_release(struct umtx_key *key)
570 {
571 	if (key->shared)
572 		vm_object_deallocate(key->info.shared.object);
573 }
574 
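/*
 * Typical usage of the key API (a sketch, mirroring kern_umtx_wake()
 * below): every successful umtx_key_get() must be paired with an
 * umtx_key_release() once the queue operation is complete.
 *
 *	struct umtx_key key;
 *	int error;
 *
 *	if ((error = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
 *	    &key)) != 0)
 *		return (error);
 *	umtxq_lock(&key);
 *	...				(queue work under the chain lock)
 *	umtxq_unlock(&key);
 *	umtx_key_release(&key);
 */
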
575 /*
576  * Lock a umtx object.
577  */
578 static int
579 _do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
580 {
581 	struct umtx_q *uq;
582 	u_long owner;
583 	u_long old;
584 	int error = 0;
585 
586 	uq = td->td_umtxq;
587 
588 	/*
589 	 * Care must be exercised when dealing with the umtx structure.  It
590 	 * can fault on any access.
591 	 */
592 	for (;;) {
593 		/*
594 		 * Try the uncontested case.  This should be done in userland.
595 		 */
596 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
597 
598 		/* The acquire succeeded. */
599 		if (owner == UMTX_UNOWNED)
600 			return (0);
601 
602 		/* The address was invalid. */
603 		if (owner == -1)
604 			return (EFAULT);
605 
606 		/* If no one owns it but it is contested try to acquire it. */
607 		if (owner == UMTX_CONTESTED) {
608 			owner = casuword(&umtx->u_owner,
609 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
610 
611 			if (owner == UMTX_CONTESTED)
612 				return (0);
613 
614 			/* The address was invalid. */
615 			if (owner == -1)
616 				return (EFAULT);
617 
618 			/* If this failed the lock has changed, restart. */
619 			continue;
620 		}
621 
622 		/*
623 		 * If we caught a signal while sleeping, we have already
624 		 * retried once; exit immediately now.
625 		 */
626 		if (error != 0)
627 			return (error);
628 
629 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
630 			AUTO_SHARE, &uq->uq_key)) != 0)
631 			return (error);
632 
633 		umtxq_lock(&uq->uq_key);
634 		umtxq_busy(&uq->uq_key);
635 		umtxq_insert(uq);
636 		umtxq_unbusy(&uq->uq_key);
637 		umtxq_unlock(&uq->uq_key);
638 
639 		/*
640 		 * Set the contested bit so that a release in user space
641 		 * knows to use the system call for unlock.  If this fails
642 		 * either some one else has acquired the lock or it has been
643 		 * released.
644 		 */
645 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
646 
647 		/* The address was invalid. */
648 		if (old == -1) {
649 			umtxq_lock(&uq->uq_key);
650 			umtxq_remove(uq);
651 			umtxq_unlock(&uq->uq_key);
652 			umtx_key_release(&uq->uq_key);
653 			return (EFAULT);
654 		}
655 
656 		/*
657 		 * If we successfully set the contested bit, sleep.  Otherwise
658 		 * the lock changed and we need to retry, or we lost a race
659 		 * with the thread unlocking the umtx.
660 		 */
661 		umtxq_lock(&uq->uq_key);
662 		if (old == owner)
663 			error = umtxq_sleep(uq, "umtx", timo);
664 		umtxq_remove(uq);
665 		umtxq_unlock(&uq->uq_key);
666 		umtx_key_release(&uq->uq_key);
667 	}
668 
669 	return (0);
670 }
671 
672 /*
673  * Lock a umtx object, handling an optional timeout.
674  */
675 static int
676 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
677 	struct timespec *timeout)
678 {
679 	struct timespec ts, ts2, ts3;
680 	struct timeval tv;
681 	int error;
682 
683 	if (timeout == NULL) {
684 		error = _do_lock_umtx(td, umtx, id, 0);
685 		/* Mutex locking is restarted if it is interrupted. */
686 		if (error == EINTR)
687 			error = ERESTART;
688 	} else {
689 		getnanouptime(&ts);
690 		timespecadd(&ts, timeout);
691 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
692 		for (;;) {
693 			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
694 			if (error != ETIMEDOUT)
695 				break;
696 			getnanouptime(&ts2);
697 			if (timespeccmp(&ts2, &ts, >=)) {
698 				error = ETIMEDOUT;
699 				break;
700 			}
701 			ts3 = ts;
702 			timespecsub(&ts3, &ts2);
703 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
704 		}
705 		/* Timed-locking is not restarted. */
706 		if (error == ERESTART)
707 			error = EINTR;
708 	}
709 	return (error);
710 }
711 
712 /*
713  * Unlock a umtx object.
714  */
715 static int
716 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
717 {
718 	struct umtx_key key;
719 	u_long owner;
720 	u_long old;
721 	int error;
722 	int count;
723 
724 	/*
725 	 * Make sure we own this mtx.
726 	 */
727 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
728 	if (owner == -1)
729 		return (EFAULT);
730 
731 	if ((owner & ~UMTX_CONTESTED) != id)
732 		return (EPERM);
733 
734 	/* This should be done in userland */
735 	if ((owner & UMTX_CONTESTED) == 0) {
736 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
737 		if (old == -1)
738 			return (EFAULT);
739 		if (old == owner)
740 			return (0);
741 		owner = old;
742 	}
743 
744 	/* We should only ever be in here for contested locks */
745 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
746 		&key)) != 0)
747 		return (error);
748 
749 	umtxq_lock(&key);
750 	umtxq_busy(&key);
751 	count = umtxq_count(&key);
752 	umtxq_unlock(&key);
753 
754 	/*
755 	 * When unlocking the umtx, it must be marked as unowned if at
756 	 * most one thread is waiting for it.
757 	 * Otherwise, it must be marked as contested.
758 	 */
759 	old = casuword(&umtx->u_owner, owner,
760 		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
761 	umtxq_lock(&key);
762 	umtxq_signal(&key, 1);
763 	umtxq_unbusy(&key);
764 	umtxq_unlock(&key);
765 	umtx_key_release(&key);
766 	if (old == -1)
767 		return (EFAULT);
768 	if (old != owner)
769 		return (EINVAL);
770 	return (0);
771 }
772 
773 #ifdef COMPAT_FREEBSD32
774 
775 /*
776  * Lock a umtx object.
777  */
778 static int
779 _do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
780 {
781 	struct umtx_q *uq;
782 	uint32_t owner;
783 	uint32_t old;
784 	int error = 0;
785 
786 	uq = td->td_umtxq;
787 
788 	/*
789 	 * Care must be exercised when dealing with the umtx structure.  It
790 	 * can fault on any access.
791 	 */
792 	for (;;) {
793 		/*
794 		 * Try the uncontested case.  This should be done in userland.
795 		 */
796 		owner = casuword32(m, UMUTEX_UNOWNED, id);
797 
798 		/* The acquire succeeded. */
799 		if (owner == UMUTEX_UNOWNED)
800 			return (0);
801 
802 		/* The address was invalid. */
803 		if (owner == -1)
804 			return (EFAULT);
805 
806 		/* If no one owns it but it is contested try to acquire it. */
807 		if (owner == UMUTEX_CONTESTED) {
808 			owner = casuword32(m,
809 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
810 			if (owner == UMUTEX_CONTESTED)
811 				return (0);
812 
813 			/* The address was invalid. */
814 			if (owner == -1)
815 				return (EFAULT);
816 
817 			/* If this failed the lock has changed, restart. */
818 			continue;
819 		}
820 
821 		/*
822 		 * If we caught a signal while sleeping, we have already
823 		 * retried once; exit immediately now.
824 		 */
825 		if (error != 0)
826 			return (error);
827 
828 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
829 			AUTO_SHARE, &uq->uq_key)) != 0)
830 			return (error);
831 
832 		umtxq_lock(&uq->uq_key);
833 		umtxq_busy(&uq->uq_key);
834 		umtxq_insert(uq);
835 		umtxq_unbusy(&uq->uq_key);
836 		umtxq_unlock(&uq->uq_key);
837 
838 		/*
839 		 * Set the contested bit so that a release in user space
840 		 * knows to use the system call for unlock.  If this fails
841 		 * either some one else has acquired the lock or it has been
842 		 * released.
843 		 */
844 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
845 
846 		/* The address was invalid. */
847 		if (old == -1) {
848 			umtxq_lock(&uq->uq_key);
849 			umtxq_remove(uq);
850 			umtxq_unlock(&uq->uq_key);
851 			umtx_key_release(&uq->uq_key);
852 			return (EFAULT);
853 		}
854 
855 		/*
856 		 * If we successfully set the contested bit, sleep.  Otherwise
857 		 * the lock changed and we need to retry, or we lost a race
858 		 * with the thread unlocking the umtx.
859 		 */
860 		umtxq_lock(&uq->uq_key);
861 		if (old == owner)
862 			error = umtxq_sleep(uq, "umtx", timo);
863 		umtxq_remove(uq);
864 		umtxq_unlock(&uq->uq_key);
865 		umtx_key_release(&uq->uq_key);
866 	}
867 
868 	return (0);
869 }
870 
871 /*
872  * Lock a umtx object (32-bit variant), handling an optional timeout.
873  */
874 static int
875 do_lock_umtx32(struct thread *td, void *m, uint32_t id,
876 	struct timespec *timeout)
877 {
878 	struct timespec ts, ts2, ts3;
879 	struct timeval tv;
880 	int error;
881 
882 	if (timeout == NULL) {
883 		error = _do_lock_umtx32(td, m, id, 0);
884 		/* Mutex locking is restarted if it is interrupted. */
885 		if (error == EINTR)
886 			error = ERESTART;
887 	} else {
888 		getnanouptime(&ts);
889 		timespecadd(&ts, timeout);
890 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
891 		for (;;) {
892 			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
893 			if (error != ETIMEDOUT)
894 				break;
895 			getnanouptime(&ts2);
896 			if (timespeccmp(&ts2, &ts, >=)) {
897 				error = ETIMEDOUT;
898 				break;
899 			}
900 			ts3 = ts;
901 			timespecsub(&ts3, &ts2);
902 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
903 		}
904 		/* Timed-locking is not restarted. */
905 		if (error == ERESTART)
906 			error = EINTR;
907 	}
908 	return (error);
909 }
910 
911 /*
912  * Unlock a umtx object.
913  */
914 static int
915 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
916 {
917 	struct umtx_key key;
918 	uint32_t owner;
919 	uint32_t old;
920 	int error;
921 	int count;
922 
923 	/*
924 	 * Make sure we own this mtx.
925 	 */
926 	owner = fuword32(m);
927 	if (owner == -1)
928 		return (EFAULT);
929 
930 	if ((owner & ~UMUTEX_CONTESTED) != id)
931 		return (EPERM);
932 
933 	/* This should be done in userland */
934 	if ((owner & UMUTEX_CONTESTED) == 0) {
935 		old = casuword32(m, owner, UMUTEX_UNOWNED);
936 		if (old == -1)
937 			return (EFAULT);
938 		if (old == owner)
939 			return (0);
940 		owner = old;
941 	}
942 
943 	/* We should only ever be in here for contested locks */
944 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
945 		&key)) != 0)
946 		return (error);
947 
948 	umtxq_lock(&key);
949 	umtxq_busy(&key);
950 	count = umtxq_count(&key);
951 	umtxq_unlock(&key);
952 
953 	/*
954 	 * When unlocking the umtx, it must be marked as unowned if at
955 	 * most one thread is waiting for it.
956 	 * Otherwise, it must be marked as contested.
957 	 */
958 	old = casuword32(m, owner,
959 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
960 	umtxq_lock(&key);
961 	umtxq_signal(&key, 1);
962 	umtxq_unbusy(&key);
963 	umtxq_unlock(&key);
964 	umtx_key_release(&key);
965 	if (old == -1)
966 		return (EFAULT);
967 	if (old != owner)
968 		return (EINVAL);
969 	return (0);
970 }
971 #endif
972 
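/*
 * Convert a timespec into a tick count suitable for the timeout
 * arguments used here; a thin wrapper around tvtohz().
 */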
973 static inline int
974 tstohz(const struct timespec *tsp)
975 {
976 	struct timeval tv;
977 
978 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
979 	return tvtohz(&tv);
980 }
981 
982 /*
983  * Fetch and compare value, sleep on the address if value is not changed.
984  */
985 static int
986 do_wait(struct thread *td, void *addr, u_long id,
987 	struct _umtx_time *timeout, int compat32, int is_private)
988 {
989 	struct umtx_q *uq;
990 	struct timespec ets, cts, tts;
991 	u_long tmp;
992 	int error = 0;
993 
994 	uq = td->td_umtxq;
995 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
996 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
997 		return (error);
998 
999 	umtxq_lock(&uq->uq_key);
1000 	umtxq_insert(uq);
1001 	umtxq_unlock(&uq->uq_key);
1002 	if (compat32 == 0)
1003 		tmp = fuword(addr);
1004 	else
1005 		tmp = (unsigned int)fuword32(addr);
1006 	if (tmp != id) {
1007 		umtxq_lock(&uq->uq_key);
1008 		umtxq_remove(uq);
1009 		umtxq_unlock(&uq->uq_key);
1010 	} else if (timeout == NULL) {
1011 		umtxq_lock(&uq->uq_key);
1012 		error = umtxq_sleep(uq, "uwait", 0);
1013 		umtxq_remove(uq);
1014 		umtxq_unlock(&uq->uq_key);
1015 	} else {
1016 		kern_clock_gettime(td, timeout->_clockid, &cts);
1017 		if ((timeout->_flags & UMTX_ABSTIME) == 0) {
1018 			ets = cts;
1019 			timespecadd(&ets, &timeout->_timeout);
1020 		} else {
1021 			ets = timeout->_timeout;
1022 		}
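		/*
		 * At this point ets holds the absolute wake-up time on
		 * timeout->_clockid, whether the caller passed a relative
		 * timeout or an UMTX_ABSTIME one.
		 */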
1023 		umtxq_lock(&uq->uq_key);
1024 		for (;;) {
1025 			if (timespeccmp(&cts, &ets, >=)) {
1026 				error = ETIMEDOUT;
1027 				break;
1028 			}
1029 			tts = ets;
1030 			timespecsub(&tts, &cts);
1031 			error = umtxq_sleep(uq, "uwait", tstohz(&tts));
1032 			if (!(uq->uq_flags & UQF_UMTXQ)) {
1033 				error = 0;
1034 				break;
1035 			}
1036 			if (error != ETIMEDOUT)
1037 				break;
1038 			umtxq_unlock(&uq->uq_key);
1039 			kern_clock_gettime(td, timeout->_clockid, &cts);
1040 			umtxq_lock(&uq->uq_key);
1041 		}
1042 		umtxq_remove(uq);
1043 		umtxq_unlock(&uq->uq_key);
1044 	}
1045 	umtx_key_release(&uq->uq_key);
1046 	if (error == ERESTART)
1047 		error = EINTR;
1048 	return (error);
1049 }
1050 
1051 /*
1052  * Wake up threads sleeping on the specified address.
1053  */
1054 int
1055 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1056 {
1057 	struct umtx_key key;
1058 	int ret;
1059 
1060 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1061 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1062 		return (ret);
1063 	umtxq_lock(&key);
1064 	ret = umtxq_signal(&key, n_wake);
1065 	umtxq_unlock(&key);
1066 	umtx_key_release(&key);
1067 	return (0);
1068 }
1069 
1070 /*
1071  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1072  */
1073 static int
1074 _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1075 	int mode)
1076 {
1077 	struct umtx_q *uq;
1078 	uint32_t owner, old, id;
1079 	int error = 0;
1080 
1081 	id = td->td_tid;
1082 	uq = td->td_umtxq;
1083 
1084 	/*
1085 	 * Care must be exercised when dealing with the umtx structure.  It
1086 	 * can fault on any access.
1087 	 */
1088 	for (;;) {
1089 		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
1090 		if (mode == _UMUTEX_WAIT) {
1091 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1092 				return (0);
1093 		} else {
1094 			/*
1095 			 * Try the uncontested case.  This should be done in userland.
1096 			 */
1097 			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1098 
1099 			/* The acquire succeeded. */
1100 			if (owner == UMUTEX_UNOWNED)
1101 				return (0);
1102 
1103 			/* The address was invalid. */
1104 			if (owner == -1)
1105 				return (EFAULT);
1106 
1107 			/* If no one owns it but it is contested try to acquire it. */
1108 			if (owner == UMUTEX_CONTESTED) {
1109 				owner = casuword32(&m->m_owner,
1110 				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1111 
1112 				if (owner == UMUTEX_CONTESTED)
1113 					return (0);
1114 
1115 				/* The address was invalid. */
1116 				if (owner == -1)
1117 					return (EFAULT);
1118 
1119 				/* If this failed the lock has changed, restart. */
1120 				continue;
1121 			}
1122 		}
1123 
1124 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1125 		    (owner & ~UMUTEX_CONTESTED) == id)
1126 			return (EDEADLK);
1127 
1128 		if (mode == _UMUTEX_TRY)
1129 			return (EBUSY);
1130 
1131 		/*
1132 		 * If we caught a signal while sleeping, we have already
1133 		 * retried once; exit immediately now.
1134 		 */
1135 		if (error != 0)
1136 			return (error);
1137 
1138 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1139 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1140 			return (error);
1141 
1142 		umtxq_lock(&uq->uq_key);
1143 		umtxq_busy(&uq->uq_key);
1144 		umtxq_insert(uq);
1145 		umtxq_unlock(&uq->uq_key);
1146 
1147 		/*
1148 		 * Set the contested bit so that a release in user space
1149 		 * knows to use the system call for unlock.  If this fails
1150 		 * either some one else has acquired the lock or it has been
1151 		 * released.
1152 		 */
1153 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1154 
1155 		/* The address was invalid. */
1156 		if (old == -1) {
1157 			umtxq_lock(&uq->uq_key);
1158 			umtxq_remove(uq);
1159 			umtxq_unbusy(&uq->uq_key);
1160 			umtxq_unlock(&uq->uq_key);
1161 			umtx_key_release(&uq->uq_key);
1162 			return (EFAULT);
1163 		}
1164 
1165 		/*
1166 		 * If we successfully set the contested bit, sleep.  Otherwise
1167 		 * the lock changed and we need to retry, or we lost a race
1168 		 * with the thread unlocking the umtx.
1169 		 */
1170 		umtxq_lock(&uq->uq_key);
1171 		umtxq_unbusy(&uq->uq_key);
1172 		if (old == owner)
1173 			error = umtxq_sleep(uq, "umtxn", timo);
1174 		umtxq_remove(uq);
1175 		umtxq_unlock(&uq->uq_key);
1176 		umtx_key_release(&uq->uq_key);
1177 	}
1178 
1179 	return (0);
1180 }
1181 
1185 /*
1186  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1187  */
1188 static int
1189 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1190 {
1191 	struct umtx_key key;
1192 	uint32_t owner, old, id;
1193 	int error;
1194 	int count;
1195 
1196 	id = td->td_tid;
1197 	/*
1198 	 * Make sure we own this mtx.
1199 	 */
1200 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1201 	if (owner == -1)
1202 		return (EFAULT);
1203 
1204 	if ((owner & ~UMUTEX_CONTESTED) != id)
1205 		return (EPERM);
1206 
1207 	if ((owner & UMUTEX_CONTESTED) == 0) {
1208 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1209 		if (old == -1)
1210 			return (EFAULT);
1211 		if (old == owner)
1212 			return (0);
1213 		owner = old;
1214 	}
1215 
1216 	/* We should only ever be in here for contested locks */
1217 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1218 	    &key)) != 0)
1219 		return (error);
1220 
1221 	umtxq_lock(&key);
1222 	umtxq_busy(&key);
1223 	count = umtxq_count(&key);
1224 	umtxq_unlock(&key);
1225 
1226 	/*
1227 	 * When unlocking the umtx, it must be marked as unowned if at
1228 	 * most one thread is waiting for it.
1229 	 * Otherwise, it must be marked as contested.
1230 	 */
1231 	old = casuword32(&m->m_owner, owner,
1232 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1233 	umtxq_lock(&key);
1234 	umtxq_signal(&key, 1);
1235 	umtxq_unbusy(&key);
1236 	umtxq_unlock(&key);
1237 	umtx_key_release(&key);
1238 	if (old == -1)
1239 		return (EFAULT);
1240 	if (old != owner)
1241 		return (EINVAL);
1242 	return (0);
1243 }
1244 
1245 /*
1246  * Check if the mutex is available and wake up a waiter;
1247  * this applies only to simple mutexes.
1248  */
1249 static int
1250 do_wake_umutex(struct thread *td, struct umutex *m)
1251 {
1252 	struct umtx_key key;
1253 	uint32_t owner;
1254 	uint32_t flags;
1255 	int error;
1256 	int count;
1257 
1258 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1259 	if (owner == -1)
1260 		return (EFAULT);
1261 
1262 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1263 		return (0);
1264 
1265 	flags = fuword32(&m->m_flags);
1266 
1267 	/* We should only ever be in here for contested locks */
1268 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1269 	    &key)) != 0)
1270 		return (error);
1271 
1272 	umtxq_lock(&key);
1273 	umtxq_busy(&key);
1274 	count = umtxq_count(&key);
1275 	umtxq_unlock(&key);
1276 
1277 	if (count <= 1)
1278 		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1279 
1280 	umtxq_lock(&key);
1281 	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1282 		umtxq_signal(&key, 1);
1283 	umtxq_unbusy(&key);
1284 	umtxq_unlock(&key);
1285 	umtx_key_release(&key);
1286 	return (0);
1287 }
1288 
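/*
 * Allocate a PI mutex record from the UMA zone; the live count is
 * exported read-only as the debug.umtx.umtx_pi_allocated sysctl
 * defined above.
 */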
1289 static inline struct umtx_pi *
1290 umtx_pi_alloc(int flags)
1291 {
1292 	struct umtx_pi *pi;
1293 
1294 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1295 	TAILQ_INIT(&pi->pi_blocked);
1296 	atomic_add_int(&umtx_pi_allocated, 1);
1297 	return (pi);
1298 }
1299 
1300 static inline void
1301 umtx_pi_free(struct umtx_pi *pi)
1302 {
1303 	uma_zfree(umtx_pi_zone, pi);
1304 	atomic_add_int(&umtx_pi_allocated, -1);
1305 }
1306 
1307 /*
1308  * Adjust the thread's position on the PI mutex's blocked list after
1309  * its priority has been changed.
1310  */
1311 static int
1312 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1313 {
1314 	struct umtx_q *uq, *uq1, *uq2;
1315 	struct thread *td1;
1316 
1317 	mtx_assert(&umtx_lock, MA_OWNED);
1318 	if (pi == NULL)
1319 		return (0);
1320 
1321 	uq = td->td_umtxq;
1322 
1323 	/*
1324 	 * Check if the thread needs to be moved on the blocked chain.
1325 	 * It needs to be moved if its priority value is lower than the
1326 	 * previous thread's or higher than the next thread's (a lower
1327 	 * value means a higher priority).
1327 	 */
1328 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1329 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1330 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1331 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1332 		/*
1333 		 * Remove thread from blocked chain and determine where
1334 		 * it should be moved to.
1335 		 */
1336 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1337 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1338 			td1 = uq1->uq_thread;
1339 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1340 			if (UPRI(td1) > UPRI(td))
1341 				break;
1342 		}
1343 
1344 		if (uq1 == NULL)
1345 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1346 		else
1347 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1348 	}
1349 	return (1);
1350 }
1351 
1352 /*
1353  * Propagate priority when a thread is blocked on POSIX
1354  * PI mutex.
1355  */
1356 static void
1357 umtx_propagate_priority(struct thread *td)
1358 {
1359 	struct umtx_q *uq;
1360 	struct umtx_pi *pi;
1361 	int pri;
1362 
1363 	mtx_assert(&umtx_lock, MA_OWNED);
1364 	pri = UPRI(td);
1365 	uq = td->td_umtxq;
1366 	pi = uq->uq_pi_blocked;
1367 	if (pi == NULL)
1368 		return;
1369 
1370 	for (;;) {
1371 		td = pi->pi_owner;
1372 		if (td == NULL || td == curthread)
1373 			return;
1374 
1375 		MPASS(td->td_proc != NULL);
1376 		MPASS(td->td_proc->p_magic == P_MAGIC);
1377 
1378 		thread_lock(td);
1379 		if (td->td_lend_user_pri > pri)
1380 			sched_lend_user_prio(td, pri);
1381 		else {
1382 			thread_unlock(td);
1383 			break;
1384 		}
1385 		thread_unlock(td);
1386 
1387 		/*
1388 		 * Pick up the lock that td is blocked on.
1389 		 */
1390 		uq = td->td_umtxq;
1391 		pi = uq->uq_pi_blocked;
1392 		if (pi == NULL)
1393 			break;
1394 		/* Resort td on the list if needed. */
1395 		umtx_pi_adjust_thread(pi, td);
1396 	}
1397 }
1398 
1399 /*
1400  * Unpropagate priority for a PI mutex when a thread blocked on
1401  * it is interrupted by a signal or resumed by another thread.
1402  */
1403 static void
1404 umtx_repropagate_priority(struct umtx_pi *pi)
1405 {
1406 	struct umtx_q *uq, *uq_owner;
1407 	struct umtx_pi *pi2;
1408 	int pri;
1409 
1410 	mtx_assert(&umtx_lock, MA_OWNED);
1411 
1412 	while (pi != NULL && pi->pi_owner != NULL) {
1413 		pri = PRI_MAX;
1414 		uq_owner = pi->pi_owner->td_umtxq;
1415 
1416 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1417 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1418 			if (uq != NULL) {
1419 				if (pri > UPRI(uq->uq_thread))
1420 					pri = UPRI(uq->uq_thread);
1421 			}
1422 		}
1423 
1424 		if (pri > uq_owner->uq_inherited_pri)
1425 			pri = uq_owner->uq_inherited_pri;
1426 		thread_lock(pi->pi_owner);
1427 		sched_lend_user_prio(pi->pi_owner, pri);
1428 		thread_unlock(pi->pi_owner);
1429 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1430 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1431 	}
1432 }
1433 
1434 /*
1435  * Insert a PI mutex into its owner's contested list.
1436  */
1437 static void
1438 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1439 {
1440 	struct umtx_q *uq_owner;
1441 
1442 	uq_owner = owner->td_umtxq;
1443 	mtx_assert(&umtx_lock, MA_OWNED);
1444 	if (pi->pi_owner != NULL)
1445 		panic("pi_owner != NULL");
1446 	pi->pi_owner = owner;
1447 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1448 }
1449 
1450 /*
1451  * Claim ownership of a PI mutex.
1452  */
1453 static int
1454 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1455 {
1456 	struct umtx_q *uq, *uq_owner;
1457 
1458 	uq_owner = owner->td_umtxq;
1459 	mtx_lock_spin(&umtx_lock);
1460 	if (pi->pi_owner == owner) {
1461 		mtx_unlock_spin(&umtx_lock);
1462 		return (0);
1463 	}
1464 
1465 	if (pi->pi_owner != NULL) {
1466 		/*
1467 		 * userland may have already messed up the mutex, sigh.
1468 		 */
1469 		mtx_unlock_spin(&umtx_lock);
1470 		return (EPERM);
1471 	}
1472 	umtx_pi_setowner(pi, owner);
1473 	uq = TAILQ_FIRST(&pi->pi_blocked);
1474 	if (uq != NULL) {
1475 		int pri;
1476 
1477 		pri = UPRI(uq->uq_thread);
1478 		thread_lock(owner);
1479 		if (pri < UPRI(owner))
1480 			sched_lend_user_prio(owner, pri);
1481 		thread_unlock(owner);
1482 	}
1483 	mtx_unlock_spin(&umtx_lock);
1484 	return (0);
1485 }
1486 
1487 /*
1488  * Adjust a thread's position on the blocked list of its PI mutex;
1489  * this may trigger a new round of priority propagation.
1490  */
1491 void
1492 umtx_pi_adjust(struct thread *td, u_char oldpri)
1493 {
1494 	struct umtx_q *uq;
1495 	struct umtx_pi *pi;
1496 
1497 	uq = td->td_umtxq;
1498 	mtx_lock_spin(&umtx_lock);
1499 	/*
1500 	 * Pick up the lock that td is blocked on.
1501 	 */
1502 	pi = uq->uq_pi_blocked;
1503 	if (pi != NULL) {
1504 		umtx_pi_adjust_thread(pi, td);
1505 		umtx_repropagate_priority(pi);
1506 	}
1507 	mtx_unlock_spin(&umtx_lock);
1508 }
1509 
1510 /*
1511  * Sleep on a PI mutex.
1512  */
1513 static int
1514 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1515 	uint32_t owner, const char *wmesg, int timo)
1516 {
1517 	struct umtxq_chain *uc;
1518 	struct thread *td, *td1;
1519 	struct umtx_q *uq1;
1520 	int pri;
1521 	int error = 0;
1522 
1523 	td = uq->uq_thread;
1524 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1525 	uc = umtxq_getchain(&uq->uq_key);
1526 	UMTXQ_LOCKED_ASSERT(uc);
1527 	UMTXQ_BUSY_ASSERT(uc);
1528 	umtxq_insert(uq);
1529 	mtx_lock_spin(&umtx_lock);
1530 	if (pi->pi_owner == NULL) {
1531 		mtx_unlock_spin(&umtx_lock);
1532 		/* XXX Only look up thread in current process. */
1533 		td1 = tdfind(owner, curproc->p_pid);
1534 		mtx_lock_spin(&umtx_lock);
1535 		if (td1 != NULL) {
1536 			if (pi->pi_owner == NULL)
1537 				umtx_pi_setowner(pi, td1);
1538 			PROC_UNLOCK(td1->td_proc);
1539 		}
1540 	}
1541 
1542 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1543 		pri = UPRI(uq1->uq_thread);
1544 		if (pri > UPRI(td))
1545 			break;
1546 	}
1547 
1548 	if (uq1 != NULL)
1549 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1550 	else
1551 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1552 
1553 	uq->uq_pi_blocked = pi;
1554 	thread_lock(td);
1555 	td->td_flags |= TDF_UPIBLOCKED;
1556 	thread_unlock(td);
1557 	umtx_propagate_priority(td);
1558 	mtx_unlock_spin(&umtx_lock);
1559 	umtxq_unbusy(&uq->uq_key);
1560 
1561 	if (uq->uq_flags & UQF_UMTXQ) {
1562 		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
1563 		if (error == EWOULDBLOCK)
1564 			error = ETIMEDOUT;
1565 		if (uq->uq_flags & UQF_UMTXQ) {
1566 			umtxq_remove(uq);
1567 		}
1568 	}
1569 	mtx_lock_spin(&umtx_lock);
1570 	uq->uq_pi_blocked = NULL;
1571 	thread_lock(td);
1572 	td->td_flags &= ~TDF_UPIBLOCKED;
1573 	thread_unlock(td);
1574 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1575 	umtx_repropagate_priority(pi);
1576 	mtx_unlock_spin(&umtx_lock);
1577 	umtxq_unlock(&uq->uq_key);
1578 
1579 	return (error);
1580 }
1581 
1582 /*
1583  * Add reference count for a PI mutex.
1584  */
1585 static void
1586 umtx_pi_ref(struct umtx_pi *pi)
1587 {
1588 	struct umtxq_chain *uc;
1589 
1590 	uc = umtxq_getchain(&pi->pi_key);
1591 	UMTXQ_LOCKED_ASSERT(uc);
1592 	pi->pi_refcount++;
1593 }
1594 
1595 /*
1596  * Decrease the reference count of a PI mutex; when the count drops
1597  * to zero, its memory is freed.
1598  */
1599 static void
1600 umtx_pi_unref(struct umtx_pi *pi)
1601 {
1602 	struct umtxq_chain *uc;
1603 
1604 	uc = umtxq_getchain(&pi->pi_key);
1605 	UMTXQ_LOCKED_ASSERT(uc);
1606 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1607 	if (--pi->pi_refcount == 0) {
1608 		mtx_lock_spin(&umtx_lock);
1609 		if (pi->pi_owner != NULL) {
1610 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1611 				pi, pi_link);
1612 			pi->pi_owner = NULL;
1613 		}
1614 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1615 			("blocked queue not empty"));
1616 		mtx_unlock_spin(&umtx_lock);
1617 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1618 		umtx_pi_free(pi);
1619 	}
1620 }
1621 
1622 /*
1623  * Find a PI mutex in hash table.
1624  */
1625 static struct umtx_pi *
1626 umtx_pi_lookup(struct umtx_key *key)
1627 {
1628 	struct umtxq_chain *uc;
1629 	struct umtx_pi *pi;
1630 
1631 	uc = umtxq_getchain(key);
1632 	UMTXQ_LOCKED_ASSERT(uc);
1633 
1634 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1635 		if (umtx_key_match(&pi->pi_key, key)) {
1636 			return (pi);
1637 		}
1638 	}
1639 	return (NULL);
1640 }
1641 
1642 /*
1643  * Insert a PI mutex into hash table.
1644  */
1645 static inline void
1646 umtx_pi_insert(struct umtx_pi *pi)
1647 {
1648 	struct umtxq_chain *uc;
1649 
1650 	uc = umtxq_getchain(&pi->pi_key);
1651 	UMTXQ_LOCKED_ASSERT(uc);
1652 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1653 }
1654 
1655 /*
1656  * Lock a PI mutex.
1657  */
1658 static int
1659 _do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1660 	int try)
1661 {
1662 	struct umtx_q *uq;
1663 	struct umtx_pi *pi, *new_pi;
1664 	uint32_t id, owner, old;
1665 	int error;
1666 
1667 	id = td->td_tid;
1668 	uq = td->td_umtxq;
1669 
1670 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1671 	    &uq->uq_key)) != 0)
1672 		return (error);
1673 	umtxq_lock(&uq->uq_key);
1674 	pi = umtx_pi_lookup(&uq->uq_key);
1675 	if (pi == NULL) {
1676 		new_pi = umtx_pi_alloc(M_NOWAIT);
1677 		if (new_pi == NULL) {
1678 			umtxq_unlock(&uq->uq_key);
1679 			new_pi = umtx_pi_alloc(M_WAITOK);
1680 			umtxq_lock(&uq->uq_key);
1681 			pi = umtx_pi_lookup(&uq->uq_key);
1682 			if (pi != NULL) {
1683 				umtx_pi_free(new_pi);
1684 				new_pi = NULL;
1685 			}
1686 		}
1687 		if (new_pi != NULL) {
1688 			new_pi->pi_key = uq->uq_key;
1689 			umtx_pi_insert(new_pi);
1690 			pi = new_pi;
1691 		}
1692 	}
1693 	umtx_pi_ref(pi);
1694 	umtxq_unlock(&uq->uq_key);
1695 
1696 	/*
1697 	 * Care must be exercised when dealing with the umtx structure.  It
1698 	 * can fault on any access.
1699 	 */
1700 	for (;;) {
1701 		/*
1702 		 * Try the uncontested case.  This should be done in userland.
1703 		 */
1704 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1705 
1706 		/* The acquire succeeded. */
1707 		if (owner == UMUTEX_UNOWNED) {
1708 			error = 0;
1709 			break;
1710 		}
1711 
1712 		/* The address was invalid. */
1713 		if (owner == -1) {
1714 			error = EFAULT;
1715 			break;
1716 		}
1717 
1718 		/* If no one owns it but it is contested try to acquire it. */
1719 		if (owner == UMUTEX_CONTESTED) {
1720 			owner = casuword32(&m->m_owner,
1721 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1722 
1723 			if (owner == UMUTEX_CONTESTED) {
1724 				umtxq_lock(&uq->uq_key);
1725 				umtxq_busy(&uq->uq_key);
1726 				error = umtx_pi_claim(pi, td);
1727 				umtxq_unbusy(&uq->uq_key);
1728 				umtxq_unlock(&uq->uq_key);
1729 				break;
1730 			}
1731 
1732 			/* The address was invalid. */
1733 			if (owner == -1) {
1734 				error = EFAULT;
1735 				break;
1736 			}
1737 
1738 			/* If this failed the lock has changed, restart. */
1739 			continue;
1740 		}
1741 
1742 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1743 		    (owner & ~UMUTEX_CONTESTED) == id) {
1744 			error = EDEADLK;
1745 			break;
1746 		}
1747 
1748 		if (try != 0) {
1749 			error = EBUSY;
1750 			break;
1751 		}
1752 
1753 		/*
1754 		 * If we caught a signal while sleeping, we have already
1755 		 * retried once; exit immediately now.
1756 		 */
1757 		if (error != 0)
1758 			break;
1759 
1760 		umtxq_lock(&uq->uq_key);
1761 		umtxq_busy(&uq->uq_key);
1762 		umtxq_unlock(&uq->uq_key);
1763 
1764 		/*
1765 		 * Set the contested bit so that a release in user space
1766 		 * knows to use the system call for unlock.  If this fails
1767 		 * either some one else has acquired the lock or it has been
1768 		 * released.
1769 		 */
1770 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1771 
1772 		/* The address was invalid. */
1773 		if (old == -1) {
1774 			umtxq_lock(&uq->uq_key);
1775 			umtxq_unbusy(&uq->uq_key);
1776 			umtxq_unlock(&uq->uq_key);
1777 			error = EFAULT;
1778 			break;
1779 		}
1780 
1781 		umtxq_lock(&uq->uq_key);
1782 		/*
1783 		 * If we successfully set the contested bit, sleep.  Otherwise
1784 		 * the lock changed and we need to retry, or we lost a race
1785 		 * with the thread unlocking the umtx.
1786 		 */
1787 		if (old == owner)
1788 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1789 				 "umtxpi", timo);
1790 		else {
1791 			umtxq_unbusy(&uq->uq_key);
1792 			umtxq_unlock(&uq->uq_key);
1793 		}
1794 	}
1795 
1796 	umtxq_lock(&uq->uq_key);
1797 	umtx_pi_unref(pi);
1798 	umtxq_unlock(&uq->uq_key);
1799 
1800 	umtx_key_release(&uq->uq_key);
1801 	return (error);
1802 }
1803 
1804 /*
1805  * Unlock a PI mutex.
1806  */
1807 static int
1808 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1809 {
1810 	struct umtx_key key;
1811 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1812 	struct umtx_pi *pi, *pi2;
1813 	uint32_t owner, old, id;
1814 	int error;
1815 	int count;
1816 	int pri;
1817 
1818 	id = td->td_tid;
1819 	/*
1820 	 * Make sure we own this mtx.
1821 	 */
1822 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1823 	if (owner == -1)
1824 		return (EFAULT);
1825 
1826 	if ((owner & ~UMUTEX_CONTESTED) != id)
1827 		return (EPERM);
1828 
1829 	/* This should be done in userland */
1830 	if ((owner & UMUTEX_CONTESTED) == 0) {
1831 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1832 		if (old == -1)
1833 			return (EFAULT);
1834 		if (old == owner)
1835 			return (0);
1836 		owner = old;
1837 	}
1838 
1839 	/* We should only ever be in here for contested locks */
1840 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1841 	    &key)) != 0)
1842 		return (error);
1843 
1844 	umtxq_lock(&key);
1845 	umtxq_busy(&key);
1846 	count = umtxq_count_pi(&key, &uq_first);
1847 	if (uq_first != NULL) {
1848 		mtx_lock_spin(&umtx_lock);
1849 		pi = uq_first->uq_pi_blocked;
1850 		KASSERT(pi != NULL, ("pi == NULL?"));
1851 		if (pi->pi_owner != curthread) {
1852 			mtx_unlock_spin(&umtx_lock);
1853 			umtxq_unbusy(&key);
1854 			umtxq_unlock(&key);
1855 			umtx_key_release(&key);
1856 			/* userland messed the mutex */
1857 			return (EPERM);
1858 		}
1859 		uq_me = curthread->td_umtxq;
1860 		pi->pi_owner = NULL;
1861 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1862 		/* Get the highest-priority thread that is still sleeping. */
1863 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1864 		while (uq_first != NULL &&
1865 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1866 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1867 		}
1868 		pri = PRI_MAX;
1869 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1870 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1871 			if (uq_first2 != NULL) {
1872 				if (pri > UPRI(uq_first2->uq_thread))
1873 					pri = UPRI(uq_first2->uq_thread);
1874 			}
1875 		}
1876 		thread_lock(curthread);
1877 		sched_lend_user_prio(curthread, pri);
1878 		thread_unlock(curthread);
1879 		mtx_unlock_spin(&umtx_lock);
1880 		if (uq_first)
1881 			umtxq_signal_thread(uq_first);
1882 	}
1883 	umtxq_unlock(&key);
1884 
1885 	/*
1886 	 * When unlocking the umtx, it must be marked as unowned if at
1887 	 * most one thread is waiting for it.
1888 	 * Otherwise, it must be marked as contested.
1889 	 */
1890 	old = casuword32(&m->m_owner, owner,
1891 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1892 
1893 	umtxq_lock(&key);
1894 	umtxq_unbusy(&key);
1895 	umtxq_unlock(&key);
1896 	umtx_key_release(&key);
1897 	if (old == -1)
1898 		return (EFAULT);
1899 	if (old != owner)
1900 		return (EINVAL);
1901 	return (0);
1902 }
1903 
1904 /*
1905  * Lock a PP mutex.
1906  */
1907 static int
1908 _do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1909 	int try)
1910 {
1911 	struct umtx_q *uq, *uq2;
1912 	struct umtx_pi *pi;
1913 	uint32_t ceiling;
1914 	uint32_t owner, id;
1915 	int error, pri, old_inherited_pri, su;
1916 
1917 	id = td->td_tid;
1918 	uq = td->td_umtxq;
1919 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1920 	    &uq->uq_key)) != 0)
1921 		return (error);
1922 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1923 	for (;;) {
1924 		old_inherited_pri = uq->uq_inherited_pri;
1925 		umtxq_lock(&uq->uq_key);
1926 		umtxq_busy(&uq->uq_key);
1927 		umtxq_unlock(&uq->uq_key);
1928 
1929 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
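		/*
		 * The unsigned comparison below rejects both an out-of-range
		 * ceiling and a faulting fuword32() (its -1 return value
		 * wraps the subtraction above past RTP_PRIO_MAX).
		 */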
1930 		if (ceiling > RTP_PRIO_MAX) {
1931 			error = EINVAL;
1932 			goto out;
1933 		}
1934 
1935 		mtx_lock_spin(&umtx_lock);
1936 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1937 			mtx_unlock_spin(&umtx_lock);
1938 			error = EINVAL;
1939 			goto out;
1940 		}
1941 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1942 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1943 			thread_lock(td);
1944 			if (uq->uq_inherited_pri < UPRI(td))
1945 				sched_lend_user_prio(td, uq->uq_inherited_pri);
1946 			thread_unlock(td);
1947 		}
1948 		mtx_unlock_spin(&umtx_lock);
1949 
1950 		owner = casuword32(&m->m_owner,
1951 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1952 
1953 		if (owner == UMUTEX_CONTESTED) {
1954 			error = 0;
1955 			break;
1956 		}
1957 
1958 		/* The address was invalid. */
1959 		if (owner == -1) {
1960 			error = EFAULT;
1961 			break;
1962 		}
1963 
1964 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1965 		    (owner & ~UMUTEX_CONTESTED) == id) {
1966 			error = EDEADLK;
1967 			break;
1968 		}
1969 
1970 		if (try != 0) {
1971 			error = EBUSY;
1972 			break;
1973 		}
1974 
1975 		/*
1976 		 * If we caught a signal while sleeping, we have already
1977 		 * retried once; exit immediately now.
1978 		 */
1979 		if (error != 0)
1980 			break;
1981 
1982 		umtxq_lock(&uq->uq_key);
1983 		umtxq_insert(uq);
1984 		umtxq_unbusy(&uq->uq_key);
1985 		error = umtxq_sleep(uq, "umtxpp", timo);
1986 		umtxq_remove(uq);
1987 		umtxq_unlock(&uq->uq_key);
1988 
1989 		mtx_lock_spin(&umtx_lock);
1990 		uq->uq_inherited_pri = old_inherited_pri;
1991 		pri = PRI_MAX;
1992 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1993 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1994 			if (uq2 != NULL) {
1995 				if (pri > UPRI(uq2->uq_thread))
1996 					pri = UPRI(uq2->uq_thread);
1997 			}
1998 		}
1999 		if (pri > uq->uq_inherited_pri)
2000 			pri = uq->uq_inherited_pri;
2001 		thread_lock(td);
2002 		sched_lend_user_prio(td, pri);
2003 		thread_unlock(td);
2004 		mtx_unlock_spin(&umtx_lock);
2005 	}
2006 
2007 	if (error != 0) {
2008 		mtx_lock_spin(&umtx_lock);
2009 		uq->uq_inherited_pri = old_inherited_pri;
2010 		pri = PRI_MAX;
2011 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2012 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2013 			if (uq2 != NULL) {
2014 				if (pri > UPRI(uq2->uq_thread))
2015 					pri = UPRI(uq2->uq_thread);
2016 			}
2017 		}
2018 		if (pri > uq->uq_inherited_pri)
2019 			pri = uq->uq_inherited_pri;
2020 		thread_lock(td);
2021 		sched_lend_user_prio(td, pri);
2022 		thread_unlock(td);
2023 		mtx_unlock_spin(&umtx_lock);
2024 	}
2025 
2026 out:
2027 	umtxq_lock(&uq->uq_key);
2028 	umtxq_unbusy(&uq->uq_key);
2029 	umtxq_unlock(&uq->uq_key);
2030 	umtx_key_release(&uq->uq_key);
2031 	return (error);
2032 }
2033 
2034 /*
2035  * Unlock a PP mutex.
2036  */
2037 static int
2038 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2039 {
2040 	struct umtx_key key;
2041 	struct umtx_q *uq, *uq2;
2042 	struct umtx_pi *pi;
2043 	uint32_t owner, id;
2044 	uint32_t rceiling;
2045 	int error, pri, new_inherited_pri, su;
2046 
2047 	id = td->td_tid;
2048 	uq = td->td_umtxq;
2049 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2050 
2051 	/*
2052 	 * Make sure we own this mtx.
2053 	 */
2054 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2055 	if (owner == -1)
2056 		return (EFAULT);
2057 
2058 	if ((owner & ~UMUTEX_CONTESTED) != id)
2059 		return (EPERM);
2060 
2061 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2062 	if (error != 0)
2063 		return (error);
2064 
2065 	if (rceiling == -1)
2066 		new_inherited_pri = PRI_MAX;
2067 	else {
2068 		rceiling = RTP_PRIO_MAX - rceiling;
2069 		if (rceiling > RTP_PRIO_MAX)
2070 			return (EINVAL);
2071 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2072 	}
2073 
2074 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2075 	    &key)) != 0)
2076 		return (error);
2077 	umtxq_lock(&key);
2078 	umtxq_busy(&key);
2079 	umtxq_unlock(&key);
2080 	/*
2081 	 * For a priority-protected mutex, always set the unlocked state
2082 	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2083 	 * to lock the mutex.  This is necessary because thread priority
2084 	 * has to be adjusted for such a mutex.
2085 	 */
2086 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2087 		UMUTEX_CONTESTED);
2088 
2089 	umtxq_lock(&key);
2090 	if (error == 0)
2091 		umtxq_signal(&key, 1);
2092 	umtxq_unbusy(&key);
2093 	umtxq_unlock(&key);
2094 
2095 	if (error == -1)
2096 		error = EFAULT;
2097 	else {
2098 		mtx_lock_spin(&umtx_lock);
2099 		if (su != 0)
2100 			uq->uq_inherited_pri = new_inherited_pri;
2101 		pri = PRI_MAX;
2102 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2103 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2104 			if (uq2 != NULL) {
2105 				if (pri > UPRI(uq2->uq_thread))
2106 					pri = UPRI(uq2->uq_thread);
2107 			}
2108 		}
2109 		if (pri > uq->uq_inherited_pri)
2110 			pri = uq->uq_inherited_pri;
2111 		thread_lock(td);
2112 		sched_lend_user_prio(td, pri);
2113 		thread_unlock(td);
2114 		mtx_unlock_spin(&umtx_lock);
2115 	}
2116 	umtx_key_release(&key);
2117 	return (error);
2118 }
2119 
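/*
 * Change the priority ceiling of a PP mutex; the previous ceiling is
 * handed back through old_ceiling.  The caller either takes the mutex
 * briefly or, if it already owns it, updates the ceiling in place.
 */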
2120 static int
2121 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2122 	uint32_t *old_ceiling)
2123 {
2124 	struct umtx_q *uq;
2125 	uint32_t save_ceiling;
2126 	uint32_t owner, id;
2127 	uint32_t flags;
2128 	int error;
2129 
2130 	flags = fuword32(&m->m_flags);
2131 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2132 		return (EINVAL);
2133 	if (ceiling > RTP_PRIO_MAX)
2134 		return (EINVAL);
2135 	id = td->td_tid;
2136 	uq = td->td_umtxq;
2137 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2138 	   &uq->uq_key)) != 0)
2139 		return (error);
2140 	for (;;) {
2141 		umtxq_lock(&uq->uq_key);
2142 		umtxq_busy(&uq->uq_key);
2143 		umtxq_unlock(&uq->uq_key);
2144 
2145 		save_ceiling = fuword32(&m->m_ceilings[0]);
2146 
2147 		owner = casuword32(&m->m_owner,
2148 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2149 
2150 		if (owner == UMUTEX_CONTESTED) {
2151 			suword32(&m->m_ceilings[0], ceiling);
2152 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2153 				UMUTEX_CONTESTED);
2154 			error = 0;
2155 			break;
2156 		}
2157 
2158 		/* The address was invalid. */
2159 		if (owner == -1) {
2160 			error = EFAULT;
2161 			break;
2162 		}
2163 
2164 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2165 			suword32(&m->m_ceilings[0], ceiling);
2166 			error = 0;
2167 			break;
2168 		}
2169 
2170 		/*
2171 		 * If we caught a signal, we have already retried and
2172 		 * now exit immediately.
2173 		 */
2174 		if (error != 0)
2175 			break;
2176 
2177 		/*
2178 		 * If we set the contested bit, sleep.  Otherwise the lock
2179 		 * changed and we need to retry, or we lost a race with the
2180 		 * thread unlocking the umtx.
2181 		 */
2182 		umtxq_lock(&uq->uq_key);
2183 		umtxq_insert(uq);
2184 		umtxq_unbusy(&uq->uq_key);
2185 		error = umtxq_sleep(uq, "umtxpp", 0);
2186 		umtxq_remove(uq);
2187 		umtxq_unlock(&uq->uq_key);
2188 	}
2189 	umtxq_lock(&uq->uq_key);
2190 	if (error == 0)
2191 		umtxq_signal(&uq->uq_key, INT_MAX);
2192 	umtxq_unbusy(&uq->uq_key);
2193 	umtxq_unlock(&uq->uq_key);
2194 	umtx_key_release(&uq->uq_key);
2195 	if (error == 0 && old_ceiling != NULL)
2196 		suword32(old_ceiling, save_ceiling);
2197 	return (error);
2198 }
2199 
2200 static int
2201 _do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2202 	int mode)
2203 {
2204 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2205 	case 0:
2206 		return (_do_lock_normal(td, m, flags, timo, mode));
2207 	case UMUTEX_PRIO_INHERIT:
2208 		return (_do_lock_pi(td, m, flags, timo, mode));
2209 	case UMUTEX_PRIO_PROTECT:
2210 		return (_do_lock_pp(td, m, flags, timo, mode));
2211 	}
2212 	return (EINVAL);
2213 }
2214 
2215 /*
2216  * Lock a userland POSIX mutex.
2217  */
2218 static int
2219 do_lock_umutex(struct thread *td, struct umutex *m,
2220 	struct _umtx_time *timeout, int mode)
2221 {
2222 	struct timespec cts, ets, tts;
2223 	uint32_t flags;
2224 	int error;
2225 
2226 	flags = fuword32(&m->m_flags);
2227 	if (flags == -1)
2228 		return (EFAULT);
2229 
2230 	if (timeout == NULL) {
2231 		error = _do_lock_umutex(td, m, flags, 0, mode);
2232 		/* Mutex locking is restarted if it is interrupted. */
2233 		if (error == EINTR && mode != _UMUTEX_WAIT)
2234 			error = ERESTART;
2235 	} else {
2236 		kern_clock_gettime(td, timeout->_clockid, &cts);
2237 		if ((timeout->_flags & UMTX_ABSTIME) == 0) {
2238 			ets = cts;
2239 			timespecadd(&ets, &timeout->_timeout);
2240 			tts = timeout->_timeout;
2241 		} else {
2242 			ets = timeout->_timeout;
2243 			tts = timeout->_timeout;
2244 			timespecsub(&tts, &cts);
2245 		}
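		/*
		 * ets is the absolute expiry time, tts the time still
		 * remaining; re-arm the sleep after every premature
		 * timeout until the lock is taken or ets passes.
		 */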
2246 		for (;;) {
2247 			error = _do_lock_umutex(td, m, flags, tstohz(&tts), mode);
2248 			if (error != ETIMEDOUT)
2249 				break;
2250 			kern_clock_gettime(td, timeout->_clockid, &cts);
2251 			if (timespeccmp(&cts, &ets, >=))
2252 				break;
2253 			tts = ets;
2254 			timespecsub(&tts, &cts);
2255 		}
2256 		/* Timed-locking is not restarted. */
2257 		if (error == ERESTART)
2258 			error = EINTR;
2259 	}
2260 	return (error);
2261 }
2262 
2263 /*
2264  * Unlock a userland POSIX mutex.
2265  */
2266 static int
2267 do_unlock_umutex(struct thread *td, struct umutex *m)
2268 {
2269 	uint32_t flags;
2270 
2271 	flags = fuword32(&m->m_flags);
2272 	if (flags == -1)
2273 		return (EFAULT);
2274 
2275 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2276 	case 0:
2277 		return (do_unlock_normal(td, m, flags));
2278 	case UMUTEX_PRIO_INHERIT:
2279 		return (do_unlock_pi(td, m, flags));
2280 	case UMUTEX_PRIO_PROTECT:
2281 		return (do_unlock_pp(td, m, flags));
2282 	}
2283 
2284 	return (EINVAL);
2285 }
2286 
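/*
 * Wait on a userland condition variable: queue this thread, publish
 * c_has_waiters, drop the associated user mutex and sleep.  The mutex
 * is not re-acquired here; userland relocks it after the wait returns.
 */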
2287 static int
2288 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2289 	struct timespec *timeout, u_long wflags)
2290 {
2291 	struct umtx_q *uq;
2292 	struct timespec cts, ets, tts;
2293 	uint32_t flags;
2294 	uint32_t clockid;
2295 	int error;
2296 
2297 	uq = td->td_umtxq;
2298 	flags = fuword32(&cv->c_flags);
2299 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2300 	if (error != 0)
2301 		return (error);
2302 
2303 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2304 		clockid = fuword32(&cv->c_clockid);
2305 		if (clockid < CLOCK_REALTIME ||
2306 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2307 			/* Only predefined hardware clock ids work here. */
			umtx_key_release(&uq->uq_key);
2308 			return (EINVAL);
2309 		}
2310 	} else {
2311 		clockid = CLOCK_REALTIME;
2312 	}
2313 
2314 	umtxq_lock(&uq->uq_key);
2315 	umtxq_busy(&uq->uq_key);
2316 	umtxq_insert(uq);
2317 	umtxq_unlock(&uq->uq_key);
2318 
2319 	/*
2320 	 * Set c_has_waiters to 1 before releasing the user mutex, and
2321 	 * avoid dirtying the cache line when that is unnecessary.
2322 	 */
2323 	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2324 		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2325 
2326 	umtxq_lock(&uq->uq_key);
2327 	umtxq_unbusy(&uq->uq_key);
2328 	umtxq_unlock(&uq->uq_key);
2329 
2330 	error = do_unlock_umutex(td, m);
2331 
2332 	umtxq_lock(&uq->uq_key);
2333 	if (error == 0) {
2334 		if (timeout == NULL) {
2335 			error = umtxq_sleep(uq, "ucond", 0);
2336 		} else {
2337 			if ((wflags & CVWAIT_ABSTIME) == 0) {
2338 				kern_clock_gettime(td, clockid, &ets);
2339 				timespecadd(&ets, timeout);
2340 				tts = *timeout;
2341 			} else { /* absolute time */
2342 				ets = *timeout;
2343 				tts = *timeout;
2344 				kern_clock_gettime(td, clockid, &cts);
2345 				timespecsub(&tts, &cts);
2346 			}
2347 			for (;;) {
2348 				error = umtxq_sleep(uq, "ucond", tstohz(&tts));
2349 				if (error != ETIMEDOUT)
2350 					break;
2351 				kern_clock_gettime(td, clockid, &cts);
2352 				if (timespeccmp(&cts, &ets, >=)) {
2353 					error = ETIMEDOUT;
2354 					break;
2355 				}
2356 				tts = ets;
2357 				timespecsub(&tts, &cts);
2358 			}
2359 		}
2360 	}
2361 
2362 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2363 		error = 0;
2364 	else {
2365 		/*
2366 		 * This must be a timeout, an interruption by a signal,
2367 		 * or a spurious wakeup; clear the c_has_waiters flag
2368 		 * when necessary.
2369 		 */
2370 		umtxq_busy(&uq->uq_key);
2371 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2372 			int oldlen = uq->uq_cur_queue->length;
2373 			umtxq_remove(uq);
2374 			if (oldlen == 1) {
2375 				umtxq_unlock(&uq->uq_key);
2376 				suword32(
2377 				    __DEVOLATILE(uint32_t *,
2378 					 &cv->c_has_waiters), 0);
2379 				umtxq_lock(&uq->uq_key);
2380 			}
2381 		}
2382 		umtxq_unbusy(&uq->uq_key);
2383 		if (error == ERESTART)
2384 			error = EINTR;
2385 	}
2386 
2387 	umtxq_unlock(&uq->uq_key);
2388 	umtx_key_release(&uq->uq_key);
2389 	return (error);
2390 }
2391 
2392 /*
2393  * Signal a userland condition variable.
2394  */
2395 static int
2396 do_cv_signal(struct thread *td, struct ucond *cv)
2397 {
2398 	struct umtx_key key;
2399 	int error, cnt, nwake;
2400 	uint32_t flags;
2401 
2402 	flags = fuword32(&cv->c_flags);
2403 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2404 		return (error);
2405 	umtxq_lock(&key);
2406 	umtxq_busy(&key);
2407 	cnt = umtxq_count(&key);
2408 	nwake = umtxq_signal(&key, 1);
2409 	if (cnt <= nwake) {
2410 		umtxq_unlock(&key);
2411 		error = suword32(
2412 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2413 		umtxq_lock(&key);
2414 	}
2415 	umtxq_unbusy(&key);
2416 	umtxq_unlock(&key);
2417 	umtx_key_release(&key);
2418 	return (error);
2419 }
2420 
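/*
 * Broadcast a userland condition variable: wake all waiters and clear
 * the c_has_waiters flag.
 */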
2421 static int
2422 do_cv_broadcast(struct thread *td, struct ucond *cv)
2423 {
2424 	struct umtx_key key;
2425 	int error;
2426 	uint32_t flags;
2427 
2428 	flags = fuword32(&cv->c_flags);
2429 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2430 		return (error);
2431 
2432 	umtxq_lock(&key);
2433 	umtxq_busy(&key);
2434 	umtxq_signal(&key, INT_MAX);
2435 	umtxq_unlock(&key);
2436 
2437 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2438 
2439 	umtxq_lock(&key);
2440 	umtxq_unbusy(&key);
2441 	umtxq_unlock(&key);
2442 
2443 	umtx_key_release(&key);
2444 	return (error);
2445 }
2446 
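/*
 * Lock a userland rwlock for reading.  Unless reader preference is
 * requested, pending writers (URWLOCK_WRITE_WAITERS) block new readers
 * too, which keeps writers from starving.
 */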
2447 static int
2448 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2449 {
2450 	struct umtx_q *uq;
2451 	uint32_t flags, wrflags;
2452 	int32_t state, oldstate;
2453 	int32_t blocked_readers;
2454 	int error;
2455 
2456 	uq = td->td_umtxq;
2457 	flags = fuword32(&rwlock->rw_flags);
2458 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2459 	if (error != 0)
2460 		return (error);
2461 
2462 	wrflags = URWLOCK_WRITE_OWNER;
2463 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2464 		wrflags |= URWLOCK_WRITE_WAITERS;
2465 
2466 	for (;;) {
2467 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2468 		/* try to lock it */
2469 		while (!(state & wrflags)) {
2470 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2471 				umtx_key_release(&uq->uq_key);
2472 				return (EAGAIN);
2473 			}
2474 			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2475 			if (oldstate == state) {
2476 				umtx_key_release(&uq->uq_key);
2477 				return (0);
2478 			}
2479 			state = oldstate;
2480 		}
2481 
2482 		if (error)
2483 			break;
2484 
2485 		/* grab monitor lock */
2486 		umtxq_lock(&uq->uq_key);
2487 		umtxq_busy(&uq->uq_key);
2488 		umtxq_unlock(&uq->uq_key);
2489 
2490 		/*
2491 		 * re-read the state, in case it changed between the try-lock above
2492 		 * and the check below
2493 		 */
2494 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2495 
2496 		/* set read contention bit */
2497 		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2498 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2499 			if (oldstate == state)
2500 				goto sleep;
2501 			state = oldstate;
2502 		}
2503 
2504 		/* The state changed while we were setting the flags; restart. */
2505 		if (!(state & wrflags)) {
2506 			umtxq_lock(&uq->uq_key);
2507 			umtxq_unbusy(&uq->uq_key);
2508 			umtxq_unlock(&uq->uq_key);
2509 			continue;
2510 		}
2511 
2512 sleep:
2513 		/* The contention bit is set; bump the read-waiter count before sleeping. */
2514 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2515 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2516 
2517 		while (state & wrflags) {
2518 			umtxq_lock(&uq->uq_key);
2519 			umtxq_insert(uq);
2520 			umtxq_unbusy(&uq->uq_key);
2521 
2522 			error = umtxq_sleep(uq, "urdlck", timo);
2523 
2524 			umtxq_busy(&uq->uq_key);
2525 			umtxq_remove(uq);
2526 			umtxq_unlock(&uq->uq_key);
2527 			if (error)
2528 				break;
2529 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2530 		}
2531 
2532 		/* Decrease the read-waiter count, possibly clearing the read contention bit. */
2533 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2534 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2535 		if (blocked_readers == 1) {
2536 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2537 			for (;;) {
2538 				oldstate = casuword32(&rwlock->rw_state, state,
2539 					 state & ~URWLOCK_READ_WAITERS);
2540 				if (oldstate == state)
2541 					break;
2542 				state = oldstate;
2543 			}
2544 		}
2545 
2546 		umtxq_lock(&uq->uq_key);
2547 		umtxq_unbusy(&uq->uq_key);
2548 		umtxq_unlock(&uq->uq_key);
2549 	}
2550 	umtx_key_release(&uq->uq_key);
2551 	return (error);
2552 }
2553 
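/*
 * Timed variant of do_rw_rdlock(): convert the (possibly absolute)
 * timeout into an expiry time and retry until the lock is acquired or
 * the deadline passes.
 */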
2554 static int
2555 do_rw_rdlock2(struct thread *td, void *obj, long val, struct _umtx_time *timeout)
2556 {
2557 	struct timespec cts, ets, tts;
2558 	int error;
2559 
2560 	kern_clock_gettime(td, timeout->_clockid, &cts);
2561 	if ((timeout->_flags & UMTX_ABSTIME) == 0) {
2562 		ets = cts;
2563 		timespecadd(&ets, &timeout->_timeout);
2564 		tts = timeout->_timeout;
2565 	} else {
2566 		ets = timeout->_timeout;
2567 		tts = timeout->_timeout;
2568 		timespecsub(&tts, &cts);
2569 	}
2570 	for (;;) {
2571 		error = do_rw_rdlock(td, obj, val, tstohz(&tts));
2572 		if (error != ETIMEDOUT)
2573 			break;
2574 		kern_clock_gettime(td, timeout->_clockid, &cts);
2575 		if (timespeccmp(&cts, &ets, >=))
2576 			break;
2577 		tts = ets;
2578 		timespecsub(&tts, &cts);
2579 	}
2580 	if (error == ERESTART)
2581 		error = EINTR;
2582 	return (error);
2583 }
2584 
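/*
 * Lock a userland rwlock for writing.  On an error exit, wake any
 * blocked readers if no writer remains to take the lock.
 */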
2585 static int
2586 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2587 {
2588 	struct umtx_q *uq;
2589 	uint32_t flags;
2590 	int32_t state, oldstate;
2591 	int32_t blocked_writers;
2592 	int32_t blocked_readers;
2593 	int error;
2594 
2595 	uq = td->td_umtxq;
2596 	flags = fuword32(&rwlock->rw_flags);
2597 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2598 	if (error != 0)
2599 		return (error);
2600 
2601 	blocked_readers = 0;
2602 	for (;;) {
2603 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2604 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2605 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2606 			if (oldstate == state) {
2607 				umtx_key_release(&uq->uq_key);
2608 				return (0);
2609 			}
2610 			state = oldstate;
2611 		}
2612 
2613 		if (error) {
2614 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2615 			    blocked_readers != 0) {
2616 				umtxq_lock(&uq->uq_key);
2617 				umtxq_busy(&uq->uq_key);
2618 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2619 				umtxq_unbusy(&uq->uq_key);
2620 				umtxq_unlock(&uq->uq_key);
2621 			}
2622 
2623 			break;
2624 		}
2625 
2626 		/* grab monitor lock */
2627 		umtxq_lock(&uq->uq_key);
2628 		umtxq_busy(&uq->uq_key);
2629 		umtxq_unlock(&uq->uq_key);
2630 
2631 		/*
2632 		 * re-read the state, in case it changed between the try-lock above
2633 		 * and the check below
2634 		 */
2635 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2636 
2637 		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2638 		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2639 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2640 			if (oldstate == state)
2641 				goto sleep;
2642 			state = oldstate;
2643 		}
2644 
2645 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2646 			umtxq_lock(&uq->uq_key);
2647 			umtxq_unbusy(&uq->uq_key);
2648 			umtxq_unlock(&uq->uq_key);
2649 			continue;
2650 		}
2651 sleep:
2652 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2653 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2654 
2655 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2656 			umtxq_lock(&uq->uq_key);
2657 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2658 			umtxq_unbusy(&uq->uq_key);
2659 
2660 			error = umtxq_sleep(uq, "uwrlck", timo);
2661 
2662 			umtxq_busy(&uq->uq_key);
2663 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2664 			umtxq_unlock(&uq->uq_key);
2665 			if (error)
2666 				break;
2667 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2668 		}
2669 
2670 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2671 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2672 		if (blocked_writers == 1) {
2673 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2674 			for (;;) {
2675 				oldstate = casuword32(&rwlock->rw_state, state,
2676 					 state & ~URWLOCK_WRITE_WAITERS);
2677 				if (oldstate == state)
2678 					break;
2679 				state = oldstate;
2680 			}
2681 			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2682 		} else
2683 			blocked_readers = 0;
2684 
2685 		umtxq_lock(&uq->uq_key);
2686 		umtxq_unbusy(&uq->uq_key);
2687 		umtxq_unlock(&uq->uq_key);
2688 	}
2689 
2690 	umtx_key_release(&uq->uq_key);
2691 	return (error);
2692 }
2693 
2694 static int
2695 do_rw_wrlock2(struct thread *td, void *obj, struct _umtx_time *timeout)
2696 {
2697 	struct timespec cts, ets, tts;
2698 	int error;
2699 
2700 	kern_clock_gettime(td, timeout->_clockid, &cts);
2701 	if ((timeout->_flags & UMTX_ABSTIME) == 0) {
2702 		ets = cts;
2703 		timespecadd(&ets, &timeout->_timeout);
2704 		tts = timeout->_timeout;
2705 	} else {
2706 		ets = timeout->_timeout;
2707 		tts = timeout->_timeout;
2708 		timespecsub(&tts, &cts);
2709 	}
2710 	for (;;) {
2711 		error = do_rw_wrlock(td, obj, tstohz(&tts));
2712 		if (error != ETIMEDOUT)
2713 			break;
2714 		kern_clock_gettime(td, timeout->_clockid, &cts);
2715 		if (timespeccmp(&cts, &ets, >=))
2716 			break;
2717 		tts = ets;
2718 		timespecsub(&tts, &cts);
2719 	}
2720 	if (error == ERESTART)
2721 		error = EINTR;
2722 	return (error);
2723 }
2724 
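/*
 * Unlock a userland rwlock held by the caller, then wake either one
 * blocked writer or all blocked readers, depending on the lock's
 * preference mode.
 */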
2725 static int
2726 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2727 {
2728 	struct umtx_q *uq;
2729 	uint32_t flags;
2730 	int32_t state, oldstate;
2731 	int error, q, count;
2732 
2733 	uq = td->td_umtxq;
2734 	flags = fuword32(&rwlock->rw_flags);
2735 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2736 	if (error != 0)
2737 		return (error);
2738 
2739 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2740 	if (state & URWLOCK_WRITE_OWNER) {
2741 		for (;;) {
2742 			oldstate = casuword32(&rwlock->rw_state, state,
2743 				state & ~URWLOCK_WRITE_OWNER);
2744 			if (oldstate != state) {
2745 				state = oldstate;
2746 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2747 					error = EPERM;
2748 					goto out;
2749 				}
2750 			} else
2751 				break;
2752 		}
2753 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2754 		for (;;) {
2755 			oldstate = casuword32(&rwlock->rw_state, state,
2756 				state - 1);
2757 			if (oldstate != state) {
2758 				state = oldstate;
2759 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2760 					error = EPERM;
2761 					goto out;
2762 				}
2763 			}
2764 			else
2765 				break;
2766 		}
2767 	} else {
2768 		error = EPERM;
2769 		goto out;
2770 	}
2771 
2772 	count = 0;
2773 
2774 	if (!(flags & URWLOCK_PREFER_READER)) {
2775 		if (state & URWLOCK_WRITE_WAITERS) {
2776 			count = 1;
2777 			q = UMTX_EXCLUSIVE_QUEUE;
2778 		} else if (state & URWLOCK_READ_WAITERS) {
2779 			count = INT_MAX;
2780 			q = UMTX_SHARED_QUEUE;
2781 		}
2782 	} else {
2783 		if (state & URWLOCK_READ_WAITERS) {
2784 			count = INT_MAX;
2785 			q = UMTX_SHARED_QUEUE;
2786 		} else if (state & URWLOCK_WRITE_WAITERS) {
2787 			count = 1;
2788 			q = UMTX_EXCLUSIVE_QUEUE;
2789 		}
2790 	}
2791 
2792 	if (count) {
2793 		umtxq_lock(&uq->uq_key);
2794 		umtxq_busy(&uq->uq_key);
2795 		umtxq_signal_queue(&uq->uq_key, count, q);
2796 		umtxq_unbusy(&uq->uq_key);
2797 		umtxq_unlock(&uq->uq_key);
2798 	}
2799 out:
2800 	umtx_key_release(&uq->uq_key);
2801 	return (error);
2802 }
2803 
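/*
 * Wait on a userland semaphore: queue this thread, publish
 * _has_waiters, and sleep unless a concurrent post has already made
 * _count nonzero.
 */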
2804 static int
2805 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2806 {
2807 	struct umtx_q *uq;
2808 	struct timespec cts, ets, tts;
2809 	uint32_t flags, count;
2810 	int error;
2811 
2812 	uq = td->td_umtxq;
2813 	flags = fuword32(&sem->_flags);
2814 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2815 	if (error != 0)
2816 		return (error);
2817 	umtxq_lock(&uq->uq_key);
2818 	umtxq_busy(&uq->uq_key);
2819 	umtxq_insert(uq);
2820 	umtxq_unlock(&uq->uq_key);
2821 
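	/*
	 * Publish _has_waiters before re-reading _count; the read
	 * barrier keeps the count load from passing the update, so a
	 * racing post either sees our flag or we see its count.
	 */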
2822 	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2823 	rmb();
2824 	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2825 	if (count != 0) {
2826 		umtxq_lock(&uq->uq_key);
2827 		umtxq_unbusy(&uq->uq_key);
2828 		umtxq_remove(uq);
2829 		umtxq_unlock(&uq->uq_key);
2830 		umtx_key_release(&uq->uq_key);
2831 		return (0);
2832 	}
2833 
2834 	umtxq_lock(&uq->uq_key);
2835 	umtxq_unbusy(&uq->uq_key);
2836 
2837 	if (timeout == NULL) {
2838 		error = umtxq_sleep(uq, "usem", 0);
2839 	} else {
2840 		umtxq_unlock(&uq->uq_key);
2841 		kern_clock_gettime(td, timeout->_clockid, &cts);
2842 		if ((timeout->_flags & UMTX_ABSTIME) == 0) {
2843 			ets = cts;
2844 			timespecadd(&ets, &timeout->_timeout);
2845 		} else {
2846 			ets = timeout->_timeout;
2847 		}
2848 		umtxq_lock(&uq->uq_key);
2849 		for (;;) {
2850 			if (timespeccmp(&cts, &ets, >=)) {
2851 				error = ETIMEDOUT;
2852 				break;
2853 			}
2854 			tts = ets;
2855 			timespecsub(&tts, &cts);
2856 			error = umtxq_sleep(uq, "usem", tstohz(&tts));
2857 			if (error != ETIMEDOUT)
2858 				break;
2859 			umtxq_unlock(&uq->uq_key);
2860 			kern_clock_gettime(td, timeout->_clockid, &cts);
2861 			umtxq_lock(&uq->uq_key);
2862 		}
2863 	}
2864 
2865 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2866 		error = 0;
2867 	else {
2868 		umtxq_remove(uq);
2869 		if (error == ERESTART)
2870 			error = EINTR;
2871 	}
2872 	umtxq_unlock(&uq->uq_key);
2873 	umtx_key_release(&uq->uq_key);
2874 	return (error);
2875 }
2876 
2877 /*
2878  * Wake up a waiter on a userland semaphore.
2879  */
2880 static int
2881 do_sem_wake(struct thread *td, struct _usem *sem)
2882 {
2883 	struct umtx_key key;
2884 	int error, cnt, nwake;
2885 	uint32_t flags;
2886 
2887 	flags = fuword32(&sem->_flags);
2888 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2889 		return (error);
2890 	umtxq_lock(&key);
2891 	umtxq_busy(&key);
2892 	cnt = umtxq_count(&key);
2893 	nwake = umtxq_signal(&key, 1);
2894 	if (cnt <= nwake) {
2895 		umtxq_unlock(&key);
2896 		error = suword32(
2897 		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2898 		umtxq_lock(&key);
2899 	}
2900 	umtxq_unbusy(&key);
2901 	umtxq_unlock(&key);
2902 	umtx_key_release(&key);
2903 	return (error);
2904 }
2905 
2906 int
2907 sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2908     /* struct umtx *umtx */
2909 {
2910 	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2911 }
2912 
2913 int
2914 sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2915     /* struct umtx *umtx */
2916 {
2917 	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2918 }
2919 
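/*
 * Copy in and validate a userland timespec.
 */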
2920 inline int
2921 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
2922 {
2923 	int error;
2924 
2925 	error = copyin(addr, tsp, sizeof(struct timespec));
2926 	if (error == 0) {
2927 		if (tsp->tv_sec < 0 ||
2928 		    tsp->tv_nsec >= 1000000000 ||
2929 		    tsp->tv_nsec < 0)
2930 			error = EINVAL;
2931 	}
2932 	return (error);
2933 }
2934 
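/*
 * Copy in a timeout whose layout is chosen by its size: a bare
 * timespec (relative, CLOCK_REALTIME) from older callers, or a full
 * _umtx_time carrying flags and a clock id.  Callers pass the size
 * through the uaddr1 argument of _umtx_op(2).
 */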
2935 static inline int
2936 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
2937 {
2938 	int error;
2939 
2940 	if (size <= sizeof(struct timespec)) {
2941 		tp->_clockid = CLOCK_REALTIME;
2942 		tp->_flags = 0;
2943 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
2944 	} else
2945 		error = copyin(addr, tp, sizeof(struct _umtx_time));
2946 	if (error != 0)
2947 		return (error);
2948 	if (tp->_timeout.tv_sec < 0 ||
2949 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
2950 		return (EINVAL);
2951 	return (0);
2952 }
2953 
2954 static int
2955 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2956 {
2957 	struct timespec *ts, timeout;
2958 	int error;
2959 
2960 	/* Allow a null timespec (wait forever). */
2961 	if (uap->uaddr2 == NULL)
2962 		ts = NULL;
2963 	else {
2964 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
2965 		if (error != 0)
2966 			return (error);
2967 		ts = &timeout;
2968 	}
2969 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2970 }
2971 
2972 static int
2973 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2974 {
2975 	return (do_unlock_umtx(td, uap->obj, uap->val));
2976 }
2977 
2978 static int
2979 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2980 {
2981 	struct _umtx_time timeout, *tm_p;
2982 	int error;
2983 
2984 	if (uap->uaddr2 == NULL)
2985 		tm_p = NULL;
2986 	else {
2987 		error = umtx_copyin_umtx_time(
2988 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2989 		if (error != 0)
2990 			return (error);
2991 		tm_p = &timeout;
2992 	}
2993 	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
2994 }
2995 
2996 static int
2997 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2998 {
2999 	struct _umtx_time timeout, *tm_p;
3000 	int error;
3001 
3002 	if (uap->uaddr2 == NULL)
3003 		tm_p = NULL;
3004 	else {
3005 		error = umtx_copyin_umtx_time(
3006 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3007 		if (error != 0)
3008 			return (error);
3009 		tm_p = &timeout;
3010 	}
3011 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3012 }
3013 
3014 static int
3015 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3016 {
3017 	struct _umtx_time *tm_p, timeout;
3018 	int error;
3019 
3020 	if (uap->uaddr2 == NULL)
3021 		tm_p = NULL;
3022 	else {
3023 		error = umtx_copyin_umtx_time(
3024 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3025 		if (error != 0)
3026 			return (error);
3027 		tm_p = &timeout;
3028 	}
3029 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3030 }
3031 
3032 static int
3033 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3034 {
3035 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3036 }
3037 
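/*
 * Wake waiters at an array of userland addresses, copying the pointers
 * in at most BATCH_SIZE at a time to bound kernel stack usage.
 */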
3038 #define BATCH_SIZE	128
3039 static int
3040 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3041 {
3042 	int count = uap->val;
3043 	void *uaddrs[BATCH_SIZE];
3044 	char **upp = (char **)uap->obj;
3045 	int tocopy;
3046 	int error = 0;
3047 	int i, pos = 0;
3048 
3049 	while (count > 0) {
3050 		tocopy = count;
3051 		if (tocopy > BATCH_SIZE)
3052 			tocopy = BATCH_SIZE;
3053 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3054 		if (error != 0)
3055 			break;
3056 		for (i = 0; i < tocopy; ++i)
3057 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3058 		count -= tocopy;
3059 		pos += tocopy;
3060 	}
3061 	return (error);
3062 }
3063 
3064 static int
3065 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3066 {
3067 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3068 }
3069 
3070 static int
3071 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3072 {
3073 	struct _umtx_time *tm_p, timeout;
3074 	int error;
3075 
3076 	/* Allow a null timespec (wait forever). */
3077 	if (uap->uaddr2 == NULL)
3078 		tm_p = NULL;
3079 	else {
3080 		error = umtx_copyin_umtx_time(
3081 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3082 		if (error != 0)
3083 			return (error);
3084 		tm_p = &timeout;
3085 	}
3086 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3087 }
3088 
3089 static int
3090 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3091 {
3092 	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3093 }
3094 
3095 static int
3096 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3097 {
3098 	struct _umtx_time *tm_p, timeout;
3099 	int error;
3100 
3101 	/* Allow a null timespec (wait forever). */
3102 	if (uap->uaddr2 == NULL)
3103 		tm_p = NULL;
3104 	else {
3105 		error = umtx_copyin_umtx_time(
3106 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3107 		if (error != 0)
3108 			return (error);
3109 		tm_p = &timeout;
3110 	}
3111 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3112 }
3113 
3114 static int
3115 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3116 {
3117 	return do_wake_umutex(td, uap->obj);
3118 }
3119 
3120 static int
3121 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3122 {
3123 	return do_unlock_umutex(td, uap->obj);
3124 }
3125 
3126 static int
3127 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3128 {
3129 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3130 }
3131 
3132 static int
3133 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3134 {
3135 	struct timespec *ts, timeout;
3136 	int error;
3137 
3138 	/* Allow a null timespec (wait forever). */
3139 	if (uap->uaddr2 == NULL)
3140 		ts = NULL;
3141 	else {
3142 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3143 		if (error != 0)
3144 			return (error);
3145 		ts = &timeout;
3146 	}
3147 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3148 }
3149 
3150 static int
3151 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3152 {
3153 	return do_cv_signal(td, uap->obj);
3154 }
3155 
3156 static int
3157 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3158 {
3159 	return do_cv_broadcast(td, uap->obj);
3160 }
3161 
3162 static int
3163 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3164 {
3165 	struct _umtx_time timeout;
3166 	int error;
3167 
3168 	/* Allow a null timespec (wait forever). */
3169 	if (uap->uaddr2 == NULL) {
3170 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3171 	} else {
3172 		error = umtx_copyin_umtx_time(uap->uaddr2,
3173 		   (size_t)uap->uaddr1, &timeout);
3174 		if (error != 0)
3175 			return (error);
3176 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3177 	}
3178 	return (error);
3179 }
3180 
3181 static int
3182 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3183 {
3184 	struct _umtx_time timeout;
3185 	int error;
3186 
3187 	/* Allow a null timespec (wait forever). */
3188 	if (uap->uaddr2 == NULL) {
3189 		error = do_rw_wrlock(td, uap->obj, 0);
3190 	} else {
3191 		error = umtx_copyin_umtx_time(uap->uaddr2,
3192 		   (size_t)uap->uaddr1, &timeout);
3193 		if (error != 0)
3194 			return (error);
3195 
3196 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3197 	}
3198 	return (error);
3199 }
3200 
3201 static int
3202 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3203 {
3204 	return do_rw_unlock(td, uap->obj);
3205 }
3206 
3207 static int
3208 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3209 {
3210 	struct _umtx_time *tm_p, timeout;
3211 	int error;
3212 
3213 	/* Allow a null timespec (wait forever). */
3214 	if (uap->uaddr2 == NULL)
3215 		tm_p = NULL;
3216 	else {
3217 		error = umtx_copyin_umtx_time(
3218 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3219 		if (error != 0)
3220 			return (error);
3221 		tm_p = &timeout;
3222 	}
3223 	return (do_sem_wait(td, uap->obj, tm_p));
3224 }
3225 
3226 static int
3227 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3228 {
3229 	return do_sem_wake(td, uap->obj);
3230 }
3231 
3232 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3233 
3234 static _umtx_op_func op_table[] = {
3235 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3236 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3237 	__umtx_op_wait,			/* UMTX_OP_WAIT */
3238 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3239 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3240 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3241 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3242 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3243 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3244 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3245 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3246 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3247 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3248 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3249 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3250 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3251 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3252 	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3253 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3254 	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3255 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3256 	__umtx_op_nwake_private		/* UMTX_OP_NWAKE_PRIVATE */
3257 };
3258 
3259 int
3260 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3261 {
3262 	if ((unsigned)uap->op < UMTX_OP_MAX)
3263 		return (*op_table[uap->op])(td, uap);
3264 	return (EINVAL);
3265 }
3266 
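/*
 * Example (a sketch of the userland side, not the libthr code): a
 * minimal futex-style wait/wake pair built on these ops, with p a
 * shared unsigned int pointer and the standard prototype
 * int _umtx_op(void *obj, int op, u_long val, void *a1, void *a2):
 *
 *	while (*p == expected)		// sleep while value is unchanged
 *		_umtx_op(p, UMTX_OP_WAIT_UINT_PRIVATE, expected,
 *		    NULL, NULL);		// NULL timeout: wait forever
 *
 *	*p = newval;			// publish, then wake all sleepers
 *	_umtx_op(p, UMTX_OP_WAKE_PRIVATE, INT_MAX, NULL, NULL);
 */
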
3267 #ifdef COMPAT_FREEBSD32
3268 int
3269 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3270     /* struct umtx *umtx */
3271 {
3272 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3273 }
3274 
3275 int
3276 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3277     /* struct umtx *umtx */
3278 {
3279 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3280 }
3281 
3282 struct timespec32 {
3283 	int32_t tv_sec;		/* signed, so the < 0 checks below work */
3284 	int32_t tv_nsec;
3285 };
3286 
3287 struct umtx_time32 {
3288 	struct	timespec32	timeout;
3289 	uint32_t		flags;
3290 	uint32_t		clockid;
3291 };
3292 
3293 static inline int
3294 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3295 {
3296 	struct timespec32 ts32;
3297 	int error;
3298 
3299 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3300 	if (error == 0) {
3301 		if (ts32.tv_sec < 0 ||
3302 		    ts32.tv_nsec >= 1000000000 ||
3303 		    ts32.tv_nsec < 0)
3304 			error = EINVAL;
3305 		else {
3306 			tsp->tv_sec = ts32.tv_sec;
3307 			tsp->tv_nsec = ts32.tv_nsec;
3308 		}
3309 	}
3310 	return (error);
3311 }
3312 
3313 static inline int
3314 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3315 {
3316 	struct umtx_time32 t32;
3317 	int error;
3318 
3319 	t32.clockid = CLOCK_REALTIME;
3320 	t32.flags   = 0;
3321 	if (size <= sizeof(struct timespec32))
3322 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3323 	else
3324 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3325 	if (error != 0)
3326 		return (error);
3327 	if (t32.timeout.tv_sec < 0 ||
3328 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3329 		return (EINVAL);
3330 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3331 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3332 	tp->_flags = t32.flags;
3333 	tp->_clockid = t32.clockid;
3334 	return (0);
3335 }
3336 
3337 static int
3338 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3339 {
3340 	struct timespec *ts, timeout;
3341 	int error;
3342 
3343 	/* Allow a null timespec (wait forever). */
3344 	if (uap->uaddr2 == NULL)
3345 		ts = NULL;
3346 	else {
3347 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3348 		if (error != 0)
3349 			return (error);
3350 		ts = &timeout;
3351 	}
3352 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3353 }
3354 
3355 static int
3356 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3357 {
3358 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3359 }
3360 
3361 static int
3362 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3363 {
3364 	struct _umtx_time *tm_p, timeout;
3365 	int error;
3366 
3367 	if (uap->uaddr2 == NULL)
3368 		tm_p = NULL;
3369 	else {
3370 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3371 			(size_t)uap->uaddr1, &timeout);
3372 		if (error != 0)
3373 			return (error);
3374 		tm_p = &timeout;
3375 	}
3376 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3377 }
3378 
3379 static int
3380 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3381 {
3382 	struct _umtx_time *tm_p, timeout;
3383 	int error;
3384 
3385 	/* Allow a null timespec (wait forever). */
3386 	if (uap->uaddr2 == NULL)
3387 		tm_p = NULL;
3388 	else {
3389 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3390 		    (size_t)uap->uaddr1, &timeout);
3391 		if (error != 0)
3392 			return (error);
3393 		tm_p = &timeout;
3394 	}
3395 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3396 }
3397 
3398 static int
3399 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3400 {
3401 	struct _umtx_time *tm_p, timeout;
3402 	int error;
3403 
3404 	/* Allow a null timespec (wait forever). */
3405 	if (uap->uaddr2 == NULL)
3406 		tm_p = NULL;
3407 	else {
3408 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3409 		    (size_t)uap->uaddr1, &timeout);
3410 		if (error != 0)
3411 			return (error);
3412 		tm_p = &timeout;
3413 	}
3414 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3415 }
3416 
3417 static int
3418 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3419 {
3420 	struct timespec *ts, timeout;
3421 	int error;
3422 
3423 	/* Allow a null timespec (wait forever). */
3424 	if (uap->uaddr2 == NULL)
3425 		ts = NULL;
3426 	else {
3427 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3428 		if (error != 0)
3429 			return (error);
3430 		ts = &timeout;
3431 	}
3432 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3433 }
3434 
3435 static int
3436 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3437 {
3438 	struct _umtx_time timeout;
3439 	int error;
3440 
3441 	/* Allow a null timespec (wait forever). */
3442 	if (uap->uaddr2 == NULL) {
3443 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3444 	} else {
3445 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3446 		    (size_t)uap->uaddr1, &timeout);
3447 		if (error != 0)
3448 			return (error);
3449 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3450 	}
3451 	return (error);
3452 }
3453 
3454 static int
3455 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3456 {
3457 	struct _umtx_time timeout;
3458 	int error;
3459 
3460 	/* Allow a null timespec (wait forever). */
3461 	if (uap->uaddr2 == NULL) {
3462 		error = do_rw_wrlock(td, uap->obj, 0);
3463 	} else {
3464 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3465 		    (size_t)uap->uaddr1, &timeout);
3466 		if (error != 0)
3467 			return (error);
3468 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3469 	}
3470 	return (error);
3471 }
3472 
3473 static int
3474 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3475 {
3476 	struct _umtx_time *tm_p, timeout;
3477 	int error;
3478 
3479 	if (uap->uaddr2 == NULL)
3480 		tm_p = NULL;
3481 	else {
3482 		error = umtx_copyin_umtx_time32(
3483 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3484 		if (error != 0)
3485 			return (error);
3486 		tm_p = &timeout;
3487 	}
3488 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3489 }
3490 
3491 static int
3492 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3493 {
3494 	struct _umtx_time *tm_p, timeout;
3495 	int error;
3496 
3497 	/* Allow a null timespec (wait forever). */
3498 	if (uap->uaddr2 == NULL)
3499 		tm_p = NULL;
3500 	else {
3501 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3502 		    (size_t)uap->uaddr1, &timeout);
3503 		if (error != 0)
3504 			return (error);
3505 		tm_p = &timeout;
3506 	}
3507 	return (do_sem_wait(td, uap->obj, tm_p));
3508 }
3509 
3510 static int
3511 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3512 {
3513 	int count = uap->val;
3514 	uint32_t uaddrs[BATCH_SIZE];
3515 	uint32_t *upp = (uint32_t *)uap->obj;
3516 	int tocopy;
3517 	int error = 0;
3518 	int i, pos = 0;
3519 
3520 	while (count > 0) {
3521 		tocopy = count;
3522 		if (tocopy > BATCH_SIZE)
3523 			tocopy = BATCH_SIZE;
3524 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3525 		if (error != 0)
3526 			break;
3527 		for (i = 0; i < tocopy; ++i)
3528 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3529 				INT_MAX, 1);
3530 		count -= tocopy;
3531 		pos += tocopy;
3532 	}
3533 	return (error);
3534 }
3535 
3536 static _umtx_op_func op_table_compat32[] = {
3537 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3538 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3539 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3540 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3541 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3542 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3543 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3544 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3545 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3546 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3547 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3548 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3549 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3550 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3551 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3552 	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3553 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3554 	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3555 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3556 	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3557 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3558 	__umtx_op_nwake_private32	/* UMTX_OP_NWAKE_PRIVATE */
3559 };
3560 
3561 int
3562 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3563 {
3564 	if ((unsigned)uap->op < UMTX_OP_MAX)
3565 		return (*op_table_compat32[uap->op])(td,
3566 			(struct _umtx_op_args *)uap);
3567 	return (EINVAL);
3568 }
3569 #endif
3570 
3571 void
3572 umtx_thread_init(struct thread *td)
3573 {
3574 	td->td_umtxq = umtxq_alloc();
3575 	td->td_umtxq->uq_thread = td;
3576 }
3577 
3578 void
3579 umtx_thread_fini(struct thread *td)
3580 {
3581 	umtxq_free(td->td_umtxq);
3582 }
3583 
3584 /*
3585  * Called when a new thread is created, e.g. by fork().
3586  */
3587 void
3588 umtx_thread_alloc(struct thread *td)
3589 {
3590 	struct umtx_q *uq;
3591 
3592 	uq = td->td_umtxq;
3593 	uq->uq_inherited_pri = PRI_MAX;
3594 
3595 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3596 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3597 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3598 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3599 }
3600 
3601 /*
3602  * exec() hook.
3603  */
3604 static void
3605 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3606 	struct image_params *imgp __unused)
3607 {
3608 	umtx_thread_cleanup(curthread);
3609 }
3610 
3611 /*
3612  * thread_exit() hook.
3613  */
3614 void
3615 umtx_thread_exit(struct thread *td)
3616 {
3617 	umtx_thread_cleanup(td);
3618 }
3619 
3620 /*
3621  * Clean up umtx data: disown any contested PI mutexes and drop the lent priority.
3622  */
3623 static void
3624 umtx_thread_cleanup(struct thread *td)
3625 {
3626 	struct umtx_q *uq;
3627 	struct umtx_pi *pi;
3628 
3629 	if ((uq = td->td_umtxq) == NULL)
3630 		return;
3631 
3632 	mtx_lock_spin(&umtx_lock);
3633 	uq->uq_inherited_pri = PRI_MAX;
3634 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3635 		pi->pi_owner = NULL;
3636 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3637 	}
3638 	mtx_unlock_spin(&umtx_lock);
3639 	thread_lock(td);
3640 	sched_lend_user_prio(td, PRI_MAX);
3641 	thread_unlock(td);
3642 }
3643