xref: /freebsd/sys/kern/kern_umtx.c (revision 7aa65846327fe5bc7e5961c2f7fd0c61f2ec0b01)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include "opt_umtx_profiling.h"
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/priv.h>
41 #include <sys/proc.h>
42 #include <sys/sched.h>
43 #include <sys/smp.h>
44 #include <sys/sysctl.h>
45 #include <sys/sysent.h>
46 #include <sys/systm.h>
47 #include <sys/sysproto.h>
48 #include <sys/syscallsubr.h>
49 #include <sys/eventhandler.h>
50 #include <sys/umtx.h>
51 
52 #include <vm/vm.h>
53 #include <vm/vm_param.h>
54 #include <vm/pmap.h>
55 #include <vm/vm_map.h>
56 #include <vm/vm_object.h>
57 
58 #include <machine/cpu.h>
59 
60 #ifdef COMPAT_FREEBSD32
61 #include <compat/freebsd32/freebsd32_proto.h>
62 #endif
63 
/* Lock modes accepted by the umutex lock routines such as do_lock_normal(). */
#define _UMUTEX_TRY		1	/* never sleep; report EBUSY if the lock is owned */
#define _UMUTEX_WAIT		2	/* wait until the lock is free without acquiring it */
66 
/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry linking the PI mutexes held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List of waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identifies a userland lock object */
	struct umtx_key		pi_key;
};
87 
/* A user (waiter) of a userland synchronization object. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
	/* Set while this entry is linked on a wait queue. */
#define UQF_UMTXQ	0x0001

	/* The thread that is waiting. */
	struct thread		*uq_thread;

	/*
	 * The PI mutex this waiter is blocked on.  Readers may hold
	 * either the chain lock or umtx_lock; writers must hold both
	 * the chain lock and umtx_lock.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* Entry on the blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* Thread contending with us */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Priority inherited from a PP mutex */
	u_char			uq_inherited_pri;

	/* Spare queue header, ready to be reused */
	struct umtxq_queue	*uq_spare_queue;

	/* The queue we are currently on */
	struct umtxq_queue	*uq_cur_queue;
};
125 
/* Tail queue of waiters (struct umtx_q linked through uq_link). */
TAILQ_HEAD(umtxq_head, umtx_q);

/* Per-key wait-queue */
struct umtxq_queue {
	struct umtxq_head	head;	/* waiters sharing this key */
	struct umtx_key		key;	/* key all waiters on 'head' wait for */
	LIST_ENTRY(umtxq_queue)	link;	/* entry on a chain's queue or spare list */
	int			length;	/* number of waiters on 'head' */
};

/* List of per-key wait-queues. */
LIST_HEAD(umtxq_list, umtxq_queue);
137 
/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* Lists of sleep queues, indexed by the queue selectors below. */
	struct umtxq_list	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	/* Queue headers parked here while their waiter shares another header. */
	LIST_HEAD(, umtxq_queue) uc_spare_queue;

	/* Busy flag, set by umtxq_busy() and cleared by umtxq_unbusy() */
	char			uc_busy;

	/* Number of threads sleeping until the chain stops being busy */
	int			uc_waiters;

	/* All PI in the list */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;

#ifdef UMTX_PROFILING
	/* Current and peak number of waiters queued on this chain. */
	int 			length;
	int			max_length;
#endif
};
164 
/* Assert that the chain's mutex is owned by the current thread. */
#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
/*
 * Assert that the chain is marked busy.  The flag's value is tested;
 * the flag's address is never NULL, so asserting on it could never fire.
 */
#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
167 
/*
 * Don't propagate time-sharing priority.  There is a security reason:
 * a user can simply introduce a PI mutex, let thread A lock the mutex,
 * and let another thread B block on the mutex.  Because B is sleeping,
 * its priority will be boosted; this causes A's priority to be boosted
 * via priority propagation too, and it will never be lowered even if A
 * is using 100% CPU, which is unfair to other processes.
 *
 * UPRI() therefore yields PRI_MAX_TIMESHARE for any thread whose user
 * priority lies in the time-sharing range, and the thread's own user
 * priority otherwise.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
180 
/* Multiplier used by umtxq_hash(). */
#define	GOLDEN_RATIO_PRIME	2654404609U
/* Number of chains in each chain table; umtxq_hash() reduces modulo this. */
#define	UMTX_CHAINS		512
/* Right shift applied to the hash product before the modulo. */
#define	UMTX_SHIFTS		(__WORD_BIT - 9)

/* Map the USYNC_PROCESS_SHARED flag bit to a sharing mode for umtx_key_get(). */
#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

/* Upper bound on spin iterations in umtxq_busy() before it sleeps. */
#define BUSY_SPINS		200
189 
/* Deadline bookkeeping for timed sleeps; see abs_timeout_init(). */
struct abs_timeout {
	int clockid;		/* clock the deadline is measured against */
	struct timespec cur;	/* most recent reading of that clock */
	struct timespec end;	/* absolute time at which the wait expires */
};
195 
/* UMA zone sized for struct umtx_pi; created in umtxq_sysinit(). */
static uma_zone_t		umtx_pi_zone;
/* Two tables of wait-queue chains; umtxq_getchain() picks one by key type. */
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
/* Exported read-only through the sysctl declared below. */
static int			umtx_pi_allocated;

static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

#ifdef UMTX_PROFILING
/* Largest per-chain waiter count seen so far; updated in umtxq_insert_queue(). */
static long max_length;
SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
#endif
210 
/* Forward declarations of file-local routines. */
static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
static int umtxq_count(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
/* Run umtxq_sysinit() during system initialization. */
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
229 
/* Shorthands that operate on the shared wait queue. */
#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

/* Global spin mutex; initialized with MTX_SPIN in umtxq_sysinit(). */
static struct mtx umtx_lock;
235 
236 #ifdef UMTX_PROFILING
237 static void
238 umtx_init_profiling(void)
239 {
240 	struct sysctl_oid *chain_oid;
241 	char chain_name[10];
242 	int i;
243 
244 	for (i = 0; i < UMTX_CHAINS; ++i) {
245 		snprintf(chain_name, sizeof(chain_name), "%d", i);
246 		chain_oid = SYSCTL_ADD_NODE(NULL,
247 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
248 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
249 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
250 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
251 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
252 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
253 	}
254 }
255 #endif
256 
257 static void
258 umtxq_sysinit(void *arg __unused)
259 {
260 	int i, j;
261 
262 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
263 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
264 	for (i = 0; i < 2; ++i) {
265 		for (j = 0; j < UMTX_CHAINS; ++j) {
266 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
267 				 MTX_DEF | MTX_DUPOK);
268 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
269 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
270 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
271 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
272 			umtxq_chains[i][j].uc_busy = 0;
273 			umtxq_chains[i][j].uc_waiters = 0;
274 #ifdef UMTX_PROFILING
275 			umtxq_chains[i][j].length = 0;
276 			umtxq_chains[i][j].max_length = 0;
277 #endif
278 		}
279 	}
280 #ifdef UMTX_PROFILING
281 	umtx_init_profiling();
282 #endif
283 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
284 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
285 	    EVENTHANDLER_PRI_ANY);
286 }
287 
288 struct umtx_q *
289 umtxq_alloc(void)
290 {
291 	struct umtx_q *uq;
292 
293 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
294 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
295 	TAILQ_INIT(&uq->uq_spare_queue->head);
296 	TAILQ_INIT(&uq->uq_pi_contested);
297 	uq->uq_inherited_pri = PRI_MAX;
298 	return (uq);
299 }
300 
301 void
302 umtxq_free(struct umtx_q *uq)
303 {
304 	MPASS(uq->uq_spare_queue != NULL);
305 	free(uq->uq_spare_queue, M_UMTX);
306 	free(uq, M_UMTX);
307 }
308 
309 static inline void
310 umtxq_hash(struct umtx_key *key)
311 {
312 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
313 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
314 }
315 
316 static inline struct umtxq_chain *
317 umtxq_getchain(struct umtx_key *key)
318 {
319 	if (key->type <= TYPE_SEM)
320 		return (&umtxq_chains[1][key->hash]);
321 	return (&umtxq_chains[0][key->hash]);
322 }
323 
324 /*
325  * Lock a chain.
326  */
327 static inline void
328 umtxq_lock(struct umtx_key *key)
329 {
330 	struct umtxq_chain *uc;
331 
332 	uc = umtxq_getchain(key);
333 	mtx_lock(&uc->uc_lock);
334 }
335 
336 /*
337  * Unlock a chain.
338  */
339 static inline void
340 umtxq_unlock(struct umtx_key *key)
341 {
342 	struct umtxq_chain *uc;
343 
344 	uc = umtxq_getchain(key);
345 	mtx_unlock(&uc->uc_lock);
346 }
347 
348 /*
349  * Set chain to busy state when following operation
350  * may be blocked (kernel mutex can not be used).
351  */
352 static inline void
353 umtxq_busy(struct umtx_key *key)
354 {
355 	struct umtxq_chain *uc;
356 
357 	uc = umtxq_getchain(key);
358 	mtx_assert(&uc->uc_lock, MA_OWNED);
359 	if (uc->uc_busy) {
360 #ifdef SMP
361 		if (smp_cpus > 1) {
362 			int count = BUSY_SPINS;
363 			if (count > 0) {
364 				umtxq_unlock(key);
365 				while (uc->uc_busy && --count > 0)
366 					cpu_spinwait();
367 				umtxq_lock(key);
368 			}
369 		}
370 #endif
371 		while (uc->uc_busy) {
372 			uc->uc_waiters++;
373 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
374 			uc->uc_waiters--;
375 		}
376 	}
377 	uc->uc_busy = 1;
378 }
379 
380 /*
381  * Unbusy a chain.
382  */
383 static inline void
384 umtxq_unbusy(struct umtx_key *key)
385 {
386 	struct umtxq_chain *uc;
387 
388 	uc = umtxq_getchain(key);
389 	mtx_assert(&uc->uc_lock, MA_OWNED);
390 	KASSERT(uc->uc_busy != 0, ("not busy"));
391 	uc->uc_busy = 0;
392 	if (uc->uc_waiters)
393 		wakeup_one(uc);
394 }
395 
396 static struct umtxq_queue *
397 umtxq_queue_lookup(struct umtx_key *key, int q)
398 {
399 	struct umtxq_queue *uh;
400 	struct umtxq_chain *uc;
401 
402 	uc = umtxq_getchain(key);
403 	UMTXQ_LOCKED_ASSERT(uc);
404 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
405 		if (umtx_key_match(&uh->key, key))
406 			return (uh);
407 	}
408 
409 	return (NULL);
410 }
411 
412 static inline void
413 umtxq_insert_queue(struct umtx_q *uq, int q)
414 {
415 	struct umtxq_queue *uh;
416 	struct umtxq_chain *uc;
417 
418 	uc = umtxq_getchain(&uq->uq_key);
419 	UMTXQ_LOCKED_ASSERT(uc);
420 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
421 	uh = umtxq_queue_lookup(&uq->uq_key, q);
422 	if (uh != NULL) {
423 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
424 	} else {
425 		uh = uq->uq_spare_queue;
426 		uh->key = uq->uq_key;
427 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
428 	}
429 	uq->uq_spare_queue = NULL;
430 
431 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
432 	uh->length++;
433 #ifdef UMTX_PROFILING
434 	uc->length++;
435 	if (uc->length > uc->max_length) {
436 		uc->max_length = uc->length;
437 		if (uc->max_length > max_length)
438 			max_length = uc->max_length;
439 	}
440 #endif
441 	uq->uq_flags |= UQF_UMTXQ;
442 	uq->uq_cur_queue = uh;
443 	return;
444 }
445 
446 static inline void
447 umtxq_remove_queue(struct umtx_q *uq, int q)
448 {
449 	struct umtxq_chain *uc;
450 	struct umtxq_queue *uh;
451 
452 	uc = umtxq_getchain(&uq->uq_key);
453 	UMTXQ_LOCKED_ASSERT(uc);
454 	if (uq->uq_flags & UQF_UMTXQ) {
455 		uh = uq->uq_cur_queue;
456 		TAILQ_REMOVE(&uh->head, uq, uq_link);
457 		uh->length--;
458 #ifdef UMTX_PROFILING
459 		uc->length--;
460 #endif
461 		uq->uq_flags &= ~UQF_UMTXQ;
462 		if (TAILQ_EMPTY(&uh->head)) {
463 			KASSERT(uh->length == 0,
464 			    ("inconsistent umtxq_queue length"));
465 			LIST_REMOVE(uh, link);
466 		} else {
467 			uh = LIST_FIRST(&uc->uc_spare_queue);
468 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
469 			LIST_REMOVE(uh, link);
470 		}
471 		uq->uq_spare_queue = uh;
472 		uq->uq_cur_queue = NULL;
473 	}
474 }
475 
476 /*
477  * Check if there are multiple waiters
478  */
479 static int
480 umtxq_count(struct umtx_key *key)
481 {
482 	struct umtxq_chain *uc;
483 	struct umtxq_queue *uh;
484 
485 	uc = umtxq_getchain(key);
486 	UMTXQ_LOCKED_ASSERT(uc);
487 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
488 	if (uh != NULL)
489 		return (uh->length);
490 	return (0);
491 }
492 
493 /*
494  * Check if there are multiple PI waiters and returns first
495  * waiter.
496  */
497 static int
498 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
499 {
500 	struct umtxq_chain *uc;
501 	struct umtxq_queue *uh;
502 
503 	*first = NULL;
504 	uc = umtxq_getchain(key);
505 	UMTXQ_LOCKED_ASSERT(uc);
506 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
507 	if (uh != NULL) {
508 		*first = TAILQ_FIRST(&uh->head);
509 		return (uh->length);
510 	}
511 	return (0);
512 }
513 
514 /*
515  * Wake up threads waiting on an userland object.
516  */
517 
518 static int
519 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
520 {
521 	struct umtxq_chain *uc;
522 	struct umtxq_queue *uh;
523 	struct umtx_q *uq;
524 	int ret;
525 
526 	ret = 0;
527 	uc = umtxq_getchain(key);
528 	UMTXQ_LOCKED_ASSERT(uc);
529 	uh = umtxq_queue_lookup(key, q);
530 	if (uh != NULL) {
531 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
532 			umtxq_remove_queue(uq, q);
533 			wakeup(uq);
534 			if (++ret >= n_wake)
535 				return (ret);
536 		}
537 	}
538 	return (ret);
539 }
540 
541 
542 /*
543  * Wake up specified thread.
544  */
545 static inline void
546 umtxq_signal_thread(struct umtx_q *uq)
547 {
548 	struct umtxq_chain *uc;
549 
550 	uc = umtxq_getchain(&uq->uq_key);
551 	UMTXQ_LOCKED_ASSERT(uc);
552 	umtxq_remove(uq);
553 	wakeup(uq);
554 }
555 
556 static inline int
557 tstohz(const struct timespec *tsp)
558 {
559 	struct timeval tv;
560 
561 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
562 	return tvtohz(&tv);
563 }
564 
565 static void
566 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
567 	const struct timespec *timeout)
568 {
569 
570 	timo->clockid = clockid;
571 	if (!absolute) {
572 		kern_clock_gettime(curthread, clockid, &timo->end);
573 		timo->cur = timo->end;
574 		timespecadd(&timo->end, timeout);
575 	} else {
576 		timo->end = *timeout;
577 		kern_clock_gettime(curthread, clockid, &timo->cur);
578 	}
579 }
580 
581 static void
582 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
583 {
584 
585 	abs_timeout_init(timo, umtxtime->_clockid,
586 		(umtxtime->_flags & UMTX_ABSTIME) != 0,
587 		&umtxtime->_timeout);
588 }
589 
590 static int
591 abs_timeout_update(struct abs_timeout *timo)
592 {
593 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
594 	return (timespeccmp(&timo->cur, &timo->end, >=));
595 }
596 
597 static int
598 abs_timeout_gethz(struct abs_timeout *timo)
599 {
600 	struct timespec tts;
601 
602 	tts = timo->end;
603 	timespecsub(&tts, &timo->cur);
604 	return (tstohz(&tts));
605 }
606 
607 /*
608  * Put thread into sleep state, before sleeping, check if
609  * thread was removed from umtx queue.
610  */
611 static inline int
612 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *timo)
613 {
614 	struct umtxq_chain *uc;
615 	int error;
616 
617 	uc = umtxq_getchain(&uq->uq_key);
618 	UMTXQ_LOCKED_ASSERT(uc);
619 	for (;;) {
620 		if (!(uq->uq_flags & UQF_UMTXQ))
621 			return (0);
622 		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg,
623 		    timo == NULL ? 0 : abs_timeout_gethz(timo));
624 		if (error != EWOULDBLOCK)
625 			break;
626 		umtxq_unlock(&uq->uq_key);
627 		if (abs_timeout_update(timo)) {
628 			error = ETIMEDOUT;
629 			umtxq_lock(&uq->uq_key);
630 			break;
631 		}
632 		umtxq_lock(&uq->uq_key);
633 	}
634 	return (error);
635 }
636 
637 /*
638  * Convert userspace address into unique logical address.
639  */
640 int
641 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
642 {
643 	struct thread *td = curthread;
644 	vm_map_t map;
645 	vm_map_entry_t entry;
646 	vm_pindex_t pindex;
647 	vm_prot_t prot;
648 	boolean_t wired;
649 
650 	key->type = type;
651 	if (share == THREAD_SHARE) {
652 		key->shared = 0;
653 		key->info.private.vs = td->td_proc->p_vmspace;
654 		key->info.private.addr = (uintptr_t)addr;
655 	} else {
656 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
657 		map = &td->td_proc->p_vmspace->vm_map;
658 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
659 		    &entry, &key->info.shared.object, &pindex, &prot,
660 		    &wired) != KERN_SUCCESS) {
661 			return EFAULT;
662 		}
663 
664 		if ((share == PROCESS_SHARE) ||
665 		    (share == AUTO_SHARE &&
666 		     VM_INHERIT_SHARE == entry->inheritance)) {
667 			key->shared = 1;
668 			key->info.shared.offset = entry->offset + entry->start -
669 				(vm_offset_t)addr;
670 			vm_object_reference(key->info.shared.object);
671 		} else {
672 			key->shared = 0;
673 			key->info.private.vs = td->td_proc->p_vmspace;
674 			key->info.private.addr = (uintptr_t)addr;
675 		}
676 		vm_map_lookup_done(map, entry);
677 	}
678 
679 	umtxq_hash(key);
680 	return (0);
681 }
682 
683 /*
684  * Release key.
685  */
686 void
687 umtx_key_release(struct umtx_key *key)
688 {
689 	if (key->shared)
690 		vm_object_deallocate(key->info.shared.object);
691 }
692 
693 /*
694  * Lock a umtx object.
695  */
696 static int
697 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
698 	const struct timespec *timeout)
699 {
700 	struct abs_timeout timo;
701 	struct umtx_q *uq;
702 	u_long owner;
703 	u_long old;
704 	int error = 0;
705 
706 	uq = td->td_umtxq;
707 	if (timeout != NULL)
708 		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
709 
710 	/*
711 	 * Care must be exercised when dealing with umtx structure. It
712 	 * can fault on any access.
713 	 */
714 	for (;;) {
715 		/*
716 		 * Try the uncontested case.  This should be done in userland.
717 		 */
718 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
719 
720 		/* The acquire succeeded. */
721 		if (owner == UMTX_UNOWNED)
722 			return (0);
723 
724 		/* The address was invalid. */
725 		if (owner == -1)
726 			return (EFAULT);
727 
728 		/* If no one owns it but it is contested try to acquire it. */
729 		if (owner == UMTX_CONTESTED) {
730 			owner = casuword(&umtx->u_owner,
731 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
732 
733 			if (owner == UMTX_CONTESTED)
734 				return (0);
735 
736 			/* The address was invalid. */
737 			if (owner == -1)
738 				return (EFAULT);
739 
740 			/* If this failed the lock has changed, restart. */
741 			continue;
742 		}
743 
744 		/*
745 		 * If we caught a signal, we have retried and now
746 		 * exit immediately.
747 		 */
748 		if (error != 0)
749 			break;
750 
751 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
752 			AUTO_SHARE, &uq->uq_key)) != 0)
753 			return (error);
754 
755 		umtxq_lock(&uq->uq_key);
756 		umtxq_busy(&uq->uq_key);
757 		umtxq_insert(uq);
758 		umtxq_unbusy(&uq->uq_key);
759 		umtxq_unlock(&uq->uq_key);
760 
761 		/*
762 		 * Set the contested bit so that a release in user space
763 		 * knows to use the system call for unlock.  If this fails
764 		 * either some one else has acquired the lock or it has been
765 		 * released.
766 		 */
767 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
768 
769 		/* The address was invalid. */
770 		if (old == -1) {
771 			umtxq_lock(&uq->uq_key);
772 			umtxq_remove(uq);
773 			umtxq_unlock(&uq->uq_key);
774 			umtx_key_release(&uq->uq_key);
775 			return (EFAULT);
776 		}
777 
778 		/*
779 		 * We set the contested bit, sleep. Otherwise the lock changed
780 		 * and we need to retry or we lost a race to the thread
781 		 * unlocking the umtx.
782 		 */
783 		umtxq_lock(&uq->uq_key);
784 		if (old == owner)
785 			error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
786 			    &timo);
787 		umtxq_remove(uq);
788 		umtxq_unlock(&uq->uq_key);
789 		umtx_key_release(&uq->uq_key);
790 	}
791 
792 	if (timeout == NULL) {
793 		/* Mutex locking is restarted if it is interrupted. */
794 		if (error == EINTR)
795 			error = ERESTART;
796 	} else {
797 		/* Timed-locking is not restarted. */
798 		if (error == ERESTART)
799 			error = EINTR;
800 	}
801 	return (error);
802 }
803 
804 /*
805  * Unlock a umtx object.
806  */
807 static int
808 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
809 {
810 	struct umtx_key key;
811 	u_long owner;
812 	u_long old;
813 	int error;
814 	int count;
815 
816 	/*
817 	 * Make sure we own this mtx.
818 	 */
819 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
820 	if (owner == -1)
821 		return (EFAULT);
822 
823 	if ((owner & ~UMTX_CONTESTED) != id)
824 		return (EPERM);
825 
826 	/* This should be done in userland */
827 	if ((owner & UMTX_CONTESTED) == 0) {
828 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
829 		if (old == -1)
830 			return (EFAULT);
831 		if (old == owner)
832 			return (0);
833 		owner = old;
834 	}
835 
836 	/* We should only ever be in here for contested locks */
837 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
838 		&key)) != 0)
839 		return (error);
840 
841 	umtxq_lock(&key);
842 	umtxq_busy(&key);
843 	count = umtxq_count(&key);
844 	umtxq_unlock(&key);
845 
846 	/*
847 	 * When unlocking the umtx, it must be marked as unowned if
848 	 * there is zero or one thread only waiting for it.
849 	 * Otherwise, it must be marked as contested.
850 	 */
851 	old = casuword(&umtx->u_owner, owner,
852 		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
853 	umtxq_lock(&key);
854 	umtxq_signal(&key,1);
855 	umtxq_unbusy(&key);
856 	umtxq_unlock(&key);
857 	umtx_key_release(&key);
858 	if (old == -1)
859 		return (EFAULT);
860 	if (old != owner)
861 		return (EINVAL);
862 	return (0);
863 }
864 
865 #ifdef COMPAT_FREEBSD32
866 
867 /*
868  * Lock a umtx object.
869  */
870 static int
871 do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
872 	const struct timespec *timeout)
873 {
874 	struct abs_timeout timo;
875 	struct umtx_q *uq;
876 	uint32_t owner;
877 	uint32_t old;
878 	int error = 0;
879 
880 	uq = td->td_umtxq;
881 
882 	if (timeout != NULL)
883 		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
884 
885 	/*
886 	 * Care must be exercised when dealing with umtx structure. It
887 	 * can fault on any access.
888 	 */
889 	for (;;) {
890 		/*
891 		 * Try the uncontested case.  This should be done in userland.
892 		 */
893 		owner = casuword32(m, UMUTEX_UNOWNED, id);
894 
895 		/* The acquire succeeded. */
896 		if (owner == UMUTEX_UNOWNED)
897 			return (0);
898 
899 		/* The address was invalid. */
900 		if (owner == -1)
901 			return (EFAULT);
902 
903 		/* If no one owns it but it is contested try to acquire it. */
904 		if (owner == UMUTEX_CONTESTED) {
905 			owner = casuword32(m,
906 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
907 			if (owner == UMUTEX_CONTESTED)
908 				return (0);
909 
910 			/* The address was invalid. */
911 			if (owner == -1)
912 				return (EFAULT);
913 
914 			/* If this failed the lock has changed, restart. */
915 			continue;
916 		}
917 
918 		/*
919 		 * If we caught a signal, we have retried and now
920 		 * exit immediately.
921 		 */
922 		if (error != 0)
923 			return (error);
924 
925 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
926 			AUTO_SHARE, &uq->uq_key)) != 0)
927 			return (error);
928 
929 		umtxq_lock(&uq->uq_key);
930 		umtxq_busy(&uq->uq_key);
931 		umtxq_insert(uq);
932 		umtxq_unbusy(&uq->uq_key);
933 		umtxq_unlock(&uq->uq_key);
934 
935 		/*
936 		 * Set the contested bit so that a release in user space
937 		 * knows to use the system call for unlock.  If this fails
938 		 * either some one else has acquired the lock or it has been
939 		 * released.
940 		 */
941 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
942 
943 		/* The address was invalid. */
944 		if (old == -1) {
945 			umtxq_lock(&uq->uq_key);
946 			umtxq_remove(uq);
947 			umtxq_unlock(&uq->uq_key);
948 			umtx_key_release(&uq->uq_key);
949 			return (EFAULT);
950 		}
951 
952 		/*
953 		 * We set the contested bit, sleep. Otherwise the lock changed
954 		 * and we need to retry or we lost a race to the thread
955 		 * unlocking the umtx.
956 		 */
957 		umtxq_lock(&uq->uq_key);
958 		if (old == owner)
959 			error = umtxq_sleep(uq, "umtx", timeout == NULL ?
960 			    NULL : &timo);
961 		umtxq_remove(uq);
962 		umtxq_unlock(&uq->uq_key);
963 		umtx_key_release(&uq->uq_key);
964 	}
965 
966 	if (timeout == NULL) {
967 		/* Mutex locking is restarted if it is interrupted. */
968 		if (error == EINTR)
969 			error = ERESTART;
970 	} else {
971 		/* Timed-locking is not restarted. */
972 		if (error == ERESTART)
973 			error = EINTR;
974 	}
975 	return (error);
976 }
977 
978 /*
979  * Unlock a umtx object.
980  */
981 static int
982 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
983 {
984 	struct umtx_key key;
985 	uint32_t owner;
986 	uint32_t old;
987 	int error;
988 	int count;
989 
990 	/*
991 	 * Make sure we own this mtx.
992 	 */
993 	owner = fuword32(m);
994 	if (owner == -1)
995 		return (EFAULT);
996 
997 	if ((owner & ~UMUTEX_CONTESTED) != id)
998 		return (EPERM);
999 
1000 	/* This should be done in userland */
1001 	if ((owner & UMUTEX_CONTESTED) == 0) {
1002 		old = casuword32(m, owner, UMUTEX_UNOWNED);
1003 		if (old == -1)
1004 			return (EFAULT);
1005 		if (old == owner)
1006 			return (0);
1007 		owner = old;
1008 	}
1009 
1010 	/* We should only ever be in here for contested locks */
1011 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
1012 		&key)) != 0)
1013 		return (error);
1014 
1015 	umtxq_lock(&key);
1016 	umtxq_busy(&key);
1017 	count = umtxq_count(&key);
1018 	umtxq_unlock(&key);
1019 
1020 	/*
1021 	 * When unlocking the umtx, it must be marked as unowned if
1022 	 * there is zero or one thread only waiting for it.
1023 	 * Otherwise, it must be marked as contested.
1024 	 */
1025 	old = casuword32(m, owner,
1026 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1027 	umtxq_lock(&key);
1028 	umtxq_signal(&key,1);
1029 	umtxq_unbusy(&key);
1030 	umtxq_unlock(&key);
1031 	umtx_key_release(&key);
1032 	if (old == -1)
1033 		return (EFAULT);
1034 	if (old != owner)
1035 		return (EINVAL);
1036 	return (0);
1037 }
1038 #endif
1039 
1040 /*
1041  * Fetch and compare value, sleep on the address if value is not changed.
1042  */
1043 static int
1044 do_wait(struct thread *td, void *addr, u_long id,
1045 	struct _umtx_time *timeout, int compat32, int is_private)
1046 {
1047 	struct abs_timeout timo;
1048 	struct umtx_q *uq;
1049 	u_long tmp;
1050 	int error = 0;
1051 
1052 	uq = td->td_umtxq;
1053 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
1054 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
1055 		return (error);
1056 
1057 	if (timeout != NULL)
1058 		abs_timeout_init2(&timo, timeout);
1059 
1060 	umtxq_lock(&uq->uq_key);
1061 	umtxq_insert(uq);
1062 	umtxq_unlock(&uq->uq_key);
1063 	if (compat32 == 0)
1064 		tmp = fuword(addr);
1065         else
1066 		tmp = (unsigned int)fuword32(addr);
1067 	umtxq_lock(&uq->uq_key);
1068 	if (tmp == id)
1069 		error = umtxq_sleep(uq, "uwait", timeout == NULL ?
1070 		    NULL : &timo);
1071 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
1072 		error = 0;
1073 	else
1074 		umtxq_remove(uq);
1075 	umtxq_unlock(&uq->uq_key);
1076 	umtx_key_release(&uq->uq_key);
1077 	if (error == ERESTART)
1078 		error = EINTR;
1079 	return (error);
1080 }
1081 
1082 /*
1083  * Wake up threads sleeping on the specified address.
1084  */
1085 int
1086 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1087 {
1088 	struct umtx_key key;
1089 	int ret;
1090 
1091 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1092 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1093 		return (ret);
1094 	umtxq_lock(&key);
1095 	ret = umtxq_signal(&key, n_wake);
1096 	umtxq_unlock(&key);
1097 	umtx_key_release(&key);
1098 	return (0);
1099 }
1100 
/*
 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
 *
 * td      - calling thread; its tid is the value stored in m_owner.
 * m       - userland mutex; every access goes through fuword32()/
 *           casuword32() because the memory may fault.
 * flags   - mutex flags; UMUTEX_ERROR_CHECK enables the EDEADLK check and
 *           GET_SHARE(flags) selects the key sharing mode.
 * timeout - optional timeout, NULL to wait without one.
 * mode    - _UMUTEX_WAIT only waits until the mutex is seen unowned,
 *           _UMUTEX_TRY never sleeps; any other value acquires the mutex,
 *           sleeping as needed.
 *
 * Returns 0 on success, EFAULT when the owner word cannot be accessed,
 * EDEADLK for an error-checking mutex already owned by the caller, EBUSY
 * in try mode, or the error from umtx_key_get()/umtxq_sleep().
 */
static int
do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
	struct _umtx_time *timeout, int mode)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
		if (mode == _UMUTEX_WAIT) {
			/* Wait mode: done as soon as nobody owns the mutex. */
			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
				return (0);
		} else {
			/*
			 * Try the uncontested case.  This should be done in userland.
			 */
			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

			/* The acquire succeeded. */
			if (owner == UMUTEX_UNOWNED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If no one owns it but it is contested try to acquire it. */
			if (owner == UMUTEX_CONTESTED) {
				owner = casuword32(&m->m_owner,
				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

				if (owner == UMUTEX_CONTESTED)
					return (0);

				/* The address was invalid. */
				if (owner == -1)
					return (EFAULT);

				/* If this failed the lock has changed, restart. */
				continue;
			}
		}

		/* Error-checking mutex that the caller already owns. */
		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.  (Any non-zero result of the previous
		 * sleep is returned here.)
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		/*
		 * Queue ourselves and mark the chain busy before touching
		 * the owner word; the busy mark is held across the
		 * userland access below.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			/* Undo the queue insertion and busy mark before failing. */
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
			    NULL : &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
1217 
/*
 * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
 *
 * td    - calling thread; it must be the owner recorded in m_owner.
 * m     - userland mutex, accessed only through fuword32()/casuword32().
 * flags - mutex flags; GET_SHARE(flags) selects the key sharing mode.
 *
 * Returns 0 on success, EFAULT when the owner word cannot be accessed,
 * EPERM when the caller is not the owner, EINVAL when the owner word
 * changed underneath the final compare-and-set, or the error from
 * umtx_key_get().
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/*
	 * Uncontested fast path: release directly.  If the compare-and-set
	 * loses a race, continue on the contested path with the value
	 * that was actually found.
	 */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	/* Mark the chain busy and sample the number of queued waiters. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	/*
	 * One waiter is signalled and the busy mark dropped regardless of
	 * the compare-and-set result; the result is only examined after
	 * the key has been released.
	 */
	umtxq_lock(&key);
	umtxq_signal(&key,1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
1277 
1278 /*
1279  * Check if the mutex is available and wake up a waiter,
1280  * only for simple mutex.
1281  */
1282 static int
1283 do_wake_umutex(struct thread *td, struct umutex *m)
1284 {
1285 	struct umtx_key key;
1286 	uint32_t owner;
1287 	uint32_t flags;
1288 	int error;
1289 	int count;
1290 
1291 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1292 	if (owner == -1)
1293 		return (EFAULT);
1294 
1295 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1296 		return (0);
1297 
1298 	flags = fuword32(&m->m_flags);
1299 
1300 	/* We should only ever be in here for contested locks */
1301 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1302 	    &key)) != 0)
1303 		return (error);
1304 
1305 	umtxq_lock(&key);
1306 	umtxq_busy(&key);
1307 	count = umtxq_count(&key);
1308 	umtxq_unlock(&key);
1309 
1310 	if (count <= 1)
1311 		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1312 
1313 	umtxq_lock(&key);
1314 	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1315 		umtxq_signal(&key, 1);
1316 	umtxq_unbusy(&key);
1317 	umtxq_unlock(&key);
1318 	umtx_key_release(&key);
1319 	return (0);
1320 }
1321 
/*
 * Check if the mutex has waiters and tries to fix contention bit.
 *
 * td    - calling thread (not otherwise used here).
 * m     - userland mutex, accessed only through fuword32()/casuword32().
 * flags - mutex flags supplied by the caller; the protocol bits select
 *         the key type and GET_SHARE(flags) the sharing mode.
 *
 * Returns 0 on success, EINVAL when both protocol bits are set, EFAULT
 * when the owner word cannot be accessed (all waiters are woken in that
 * case), or the error from umtx_key_get().
 */
static int
do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old;
	int type;
	int error;
	int count;

	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
	case 0:
		type = TYPE_NORMAL_UMUTEX;
		break;
	case UMUTEX_PRIO_INHERIT:
		type = TYPE_PI_UMUTEX;
		break;
	case UMUTEX_PRIO_PROTECT:
		type = TYPE_PP_UMUTEX;
		break;
	default:
		return (EINVAL);
	}
	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	/* With no waiters the owner word is never read; 0 means "unowned". */
	owner = 0;
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);
	/*
	 * Only repair contention bit if there is a waiter, this means the mutex
	 * is still being referenced by userland code, otherwise don't update
	 * any memory.
	 */
	if (count > 1) {
		/*
		 * Several waiters: the contested bit must be set whatever
		 * the owner is.  The loop retries with the value found
		 * until the bit is seen set or the compare-and-set wins.
		 * NOTE(review): a -1 fault value ends the loop only if
		 * UMUTEX_CONTESTED is a bit of that value -- confirm.
		 */
		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
		while ((owner & UMUTEX_CONTESTED) ==0) {
			old = casuword32(&m->m_owner, owner,
			    owner|UMUTEX_CONTESTED);
			if (old == owner)
				break;
			owner = old;
		}
	} else if (count == 1) {
		/*
		 * Single waiter: set the contested bit only while the
		 * mutex is owned; an unowned mutex is left untouched.
		 */
		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
		while ((owner & ~UMUTEX_CONTESTED) != 0 &&
		       (owner & UMUTEX_CONTESTED) == 0) {
			old = casuword32(&m->m_owner, owner,
			    owner|UMUTEX_CONTESTED);
			if (old == owner)
				break;
			owner = old;
		}
	}
	umtxq_lock(&key);
	if (owner == -1) {
		/* The owner word is inaccessible: wake every waiter. */
		error = EFAULT;
		umtxq_signal(&key, INT_MAX);
	}
	else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (error);
}
1393 
1394 static inline struct umtx_pi *
1395 umtx_pi_alloc(int flags)
1396 {
1397 	struct umtx_pi *pi;
1398 
1399 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1400 	TAILQ_INIT(&pi->pi_blocked);
1401 	atomic_add_int(&umtx_pi_allocated, 1);
1402 	return (pi);
1403 }
1404 
/*
 * Return a PI mutex record to umtx_pi_zone and decrement the
 * umtx_pi_allocated counter.
 */
static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}
1411 
1412 /*
1413  * Adjust the thread's position on a pi_state after its priority has been
1414  * changed.
1415  */
1416 static int
1417 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1418 {
1419 	struct umtx_q *uq, *uq1, *uq2;
1420 	struct thread *td1;
1421 
1422 	mtx_assert(&umtx_lock, MA_OWNED);
1423 	if (pi == NULL)
1424 		return (0);
1425 
1426 	uq = td->td_umtxq;
1427 
1428 	/*
1429 	 * Check if the thread needs to be moved on the blocked chain.
1430 	 * It needs to be moved if either its priority is lower than
1431 	 * the previous thread or higher than the next thread.
1432 	 */
1433 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1434 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1435 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1436 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1437 		/*
1438 		 * Remove thread from blocked chain and determine where
1439 		 * it should be moved to.
1440 		 */
1441 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1442 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1443 			td1 = uq1->uq_thread;
1444 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1445 			if (UPRI(td1) > UPRI(td))
1446 				break;
1447 		}
1448 
1449 		if (uq1 == NULL)
1450 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1451 		else
1452 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1453 	}
1454 	return (1);
1455 }
1456 
1457 /*
1458  * Propagate priority when a thread is blocked on POSIX
1459  * PI mutex.
1460  */
1461 static void
1462 umtx_propagate_priority(struct thread *td)
1463 {
1464 	struct umtx_q *uq;
1465 	struct umtx_pi *pi;
1466 	int pri;
1467 
1468 	mtx_assert(&umtx_lock, MA_OWNED);
1469 	pri = UPRI(td);
1470 	uq = td->td_umtxq;
1471 	pi = uq->uq_pi_blocked;
1472 	if (pi == NULL)
1473 		return;
1474 
1475 	for (;;) {
1476 		td = pi->pi_owner;
1477 		if (td == NULL || td == curthread)
1478 			return;
1479 
1480 		MPASS(td->td_proc != NULL);
1481 		MPASS(td->td_proc->p_magic == P_MAGIC);
1482 
1483 		thread_lock(td);
1484 		if (td->td_lend_user_pri > pri)
1485 			sched_lend_user_prio(td, pri);
1486 		else {
1487 			thread_unlock(td);
1488 			break;
1489 		}
1490 		thread_unlock(td);
1491 
1492 		/*
1493 		 * Pick up the lock that td is blocked on.
1494 		 */
1495 		uq = td->td_umtxq;
1496 		pi = uq->uq_pi_blocked;
1497 		if (pi == NULL)
1498 			break;
1499 		/* Resort td on the list if needed. */
1500 		umtx_pi_adjust_thread(pi, td);
1501 	}
1502 }
1503 
1504 /*
1505  * Unpropagate priority for a PI mutex when a thread blocked on
1506  * it is interrupted by signal or resumed by others.
1507  */
1508 static void
1509 umtx_repropagate_priority(struct umtx_pi *pi)
1510 {
1511 	struct umtx_q *uq, *uq_owner;
1512 	struct umtx_pi *pi2;
1513 	int pri;
1514 
1515 	mtx_assert(&umtx_lock, MA_OWNED);
1516 
1517 	while (pi != NULL && pi->pi_owner != NULL) {
1518 		pri = PRI_MAX;
1519 		uq_owner = pi->pi_owner->td_umtxq;
1520 
1521 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1522 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1523 			if (uq != NULL) {
1524 				if (pri > UPRI(uq->uq_thread))
1525 					pri = UPRI(uq->uq_thread);
1526 			}
1527 		}
1528 
1529 		if (pri > uq_owner->uq_inherited_pri)
1530 			pri = uq_owner->uq_inherited_pri;
1531 		thread_lock(pi->pi_owner);
1532 		sched_lend_user_prio(pi->pi_owner, pri);
1533 		thread_unlock(pi->pi_owner);
1534 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1535 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1536 	}
1537 }
1538 
1539 /*
1540  * Insert a PI mutex into owned list.
1541  */
1542 static void
1543 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1544 {
1545 	struct umtx_q *uq_owner;
1546 
1547 	uq_owner = owner->td_umtxq;
1548 	mtx_assert(&umtx_lock, MA_OWNED);
1549 	if (pi->pi_owner != NULL)
1550 		panic("pi_ower != NULL");
1551 	pi->pi_owner = owner;
1552 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1553 }
1554 
1555 /*
1556  * Claim ownership of a PI mutex.
1557  */
1558 static int
1559 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1560 {
1561 	struct umtx_q *uq, *uq_owner;
1562 
1563 	uq_owner = owner->td_umtxq;
1564 	mtx_lock_spin(&umtx_lock);
1565 	if (pi->pi_owner == owner) {
1566 		mtx_unlock_spin(&umtx_lock);
1567 		return (0);
1568 	}
1569 
1570 	if (pi->pi_owner != NULL) {
1571 		/*
1572 		 * userland may have already messed the mutex, sigh.
1573 		 */
1574 		mtx_unlock_spin(&umtx_lock);
1575 		return (EPERM);
1576 	}
1577 	umtx_pi_setowner(pi, owner);
1578 	uq = TAILQ_FIRST(&pi->pi_blocked);
1579 	if (uq != NULL) {
1580 		int pri;
1581 
1582 		pri = UPRI(uq->uq_thread);
1583 		thread_lock(owner);
1584 		if (pri < UPRI(owner))
1585 			sched_lend_user_prio(owner, pri);
1586 		thread_unlock(owner);
1587 	}
1588 	mtx_unlock_spin(&umtx_lock);
1589 	return (0);
1590 }
1591 
1592 /*
1593  * Adjust a thread's order position in its blocked PI mutex,
1594  * this may result new priority propagating process.
1595  */
1596 void
1597 umtx_pi_adjust(struct thread *td, u_char oldpri)
1598 {
1599 	struct umtx_q *uq;
1600 	struct umtx_pi *pi;
1601 
1602 	uq = td->td_umtxq;
1603 	mtx_lock_spin(&umtx_lock);
1604 	/*
1605 	 * Pick up the lock that td is blocked on.
1606 	 */
1607 	pi = uq->uq_pi_blocked;
1608 	if (pi != NULL) {
1609 		umtx_pi_adjust_thread(pi, td);
1610 		umtx_repropagate_priority(pi);
1611 	}
1612 	mtx_unlock_spin(&umtx_lock);
1613 }
1614 
/*
 * Sleep on a PI mutex.
 *
 * uq    - wait-queue entry of the calling thread (asserted to be
 *         curthread).
 * pi    - PI mutex record being waited on.
 * owner - thread id read from the userland owner word, used to look up
 *         the owning thread when the record has no owner yet.
 * wmesg - wait message passed to umtxq_sleep().
 * timo  - optional timeout passed to umtxq_sleep(), may be NULL.
 *
 * Entered with the umtx chain locked and busy (both asserted).  The busy
 * mark is dropped before sleeping and the chain lock is released before
 * returning.  Returns the result of umtxq_sleep().
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	UMTXQ_BUSY_ASSERT(uc);
	umtxq_insert(uq);
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == NULL) {
		/*
		 * No owner recorded yet: look the owner thread up by id.
		 * The spin lock is dropped around tdfind(), so pi_owner is
		 * checked again once the lock has been retaken.
		 */
		mtx_unlock_spin(&umtx_lock);
		/* XXX Only look up thread in current process. */
		td1 = tdfind(owner, curproc->p_pid);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL) {
			if (pi->pi_owner == NULL)
				umtx_pi_setowner(pi, td1);
			/* NOTE(review): tdfind() appears to return with the process locked -- confirm. */
			PROC_UNLOCK(td1->td_proc);
		}
	}

	/*
	 * Insert ourselves on the blocked queue in front of the first
	 * entry whose UPRI() value is larger than ours, or at the tail.
	 */
	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	/* Mark the thread as blocked on pi and lend its priority onwards. */
	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unbusy(&uq->uq_key);

	error = umtxq_sleep(uq, wmesg, timo);
	umtxq_remove(uq);

	/*
	 * Awake again: undo the blocked state and recompute the lent
	 * priorities now that this thread no longer waits on pi.
	 */
	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_repropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	return (error);
}
1681 
1682 /*
1683  * Add reference count for a PI mutex.
1684  */
1685 static void
1686 umtx_pi_ref(struct umtx_pi *pi)
1687 {
1688 	struct umtxq_chain *uc;
1689 
1690 	uc = umtxq_getchain(&pi->pi_key);
1691 	UMTXQ_LOCKED_ASSERT(uc);
1692 	pi->pi_refcount++;
1693 }
1694 
1695 /*
1696  * Decrease reference count for a PI mutex, if the counter
1697  * is decreased to zero, its memory space is freed.
1698  */
1699 static void
1700 umtx_pi_unref(struct umtx_pi *pi)
1701 {
1702 	struct umtxq_chain *uc;
1703 
1704 	uc = umtxq_getchain(&pi->pi_key);
1705 	UMTXQ_LOCKED_ASSERT(uc);
1706 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1707 	if (--pi->pi_refcount == 0) {
1708 		mtx_lock_spin(&umtx_lock);
1709 		if (pi->pi_owner != NULL) {
1710 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1711 				pi, pi_link);
1712 			pi->pi_owner = NULL;
1713 		}
1714 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1715 			("blocked queue not empty"));
1716 		mtx_unlock_spin(&umtx_lock);
1717 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1718 		umtx_pi_free(pi);
1719 	}
1720 }
1721 
1722 /*
1723  * Find a PI mutex in hash table.
1724  */
1725 static struct umtx_pi *
1726 umtx_pi_lookup(struct umtx_key *key)
1727 {
1728 	struct umtxq_chain *uc;
1729 	struct umtx_pi *pi;
1730 
1731 	uc = umtxq_getchain(key);
1732 	UMTXQ_LOCKED_ASSERT(uc);
1733 
1734 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1735 		if (umtx_key_match(&pi->pi_key, key)) {
1736 			return (pi);
1737 		}
1738 	}
1739 	return (NULL);
1740 }
1741 
1742 /*
1743  * Insert a PI mutex into hash table.
1744  */
1745 static inline void
1746 umtx_pi_insert(struct umtx_pi *pi)
1747 {
1748 	struct umtxq_chain *uc;
1749 
1750 	uc = umtxq_getchain(&pi->pi_key);
1751 	UMTXQ_LOCKED_ASSERT(uc);
1752 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1753 }
1754 
/*
 * Lock a PI mutex.
 *
 * td      - calling thread; its tid is the value stored in m_owner.
 * m       - userland mutex, accessed only through casuword32().
 * flags   - mutex flags; UMUTEX_ERROR_CHECK enables the EDEADLK check and
 *           GET_SHARE(flags) selects the key sharing mode.
 * timeout - optional timeout, NULL to wait without one.
 * try     - non-zero to fail with EBUSY instead of sleeping.
 *
 * Returns 0 on success, EFAULT when the owner word cannot be accessed,
 * EDEADLK, EBUSY, the error from umtx_pi_claim(), or the error from
 * umtx_key_get()/umtxq_sleep_pi().
 */
static int
do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
    struct _umtx_time *timeout, int try)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * Find the PI record for this key or create one.  The first
	 * allocation attempt is made with M_NOWAIT while the chain is
	 * locked; if it fails the lock is dropped for an M_WAITOK
	 * allocation and the lookup is repeated, since another thread may
	 * have inserted a record in the meantime.
	 */
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	/* Hold a reference for the duration of this call. */
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with umtx structure.  It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				/*
				 * Acquired a contested mutex: record
				 * ourselves as owner of the PI record.
				 */
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/* Error-checking mutex that the caller already owns. */
		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.  (Any non-zero result of the previous
		 * sleep ends the loop here.)
		 */
		if (error != 0)
			break;

		/* The busy mark is held across the userland access below. */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 *
		 * umtxq_sleep_pi() drops both the busy mark and the chain
		 * lock itself, so only the retry branch releases them here.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
			    "umtxpi", timeout == NULL ? NULL : &timo);
		else {
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
		}
	}

	/* Drop the reference taken above; this may free the PI record. */
	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}
1908 
/*
 * Unlock a PI mutex.
 *
 * td    - calling thread; it must be the owner recorded in m_owner.
 * m     - userland mutex, accessed only through fuword32()/casuword32().
 * flags - mutex flags; GET_SHARE(flags) selects the key sharing mode.
 *
 * Returns 0 on success, EFAULT when the owner word cannot be accessed,
 * EPERM when the caller does not own the mutex (according to the owner
 * word or to the PI record), EINVAL when the owner word changed
 * underneath the final compare-and-set, or the error from umtx_key_get().
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		/* Lost a race: continue on the contested path. */
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		mtx_lock_spin(&umtx_lock);
		pi = uq_first->uq_pi_blocked;
		KASSERT(pi != NULL, ("pi == NULL?"));
		if (pi->pi_owner != curthread) {
			mtx_unlock_spin(&umtx_lock);
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			umtx_key_release(&key);
			/* userland messed the mutex */
			return (EPERM);
		}
		/* Give up ownership of the PI record. */
		uq_me = curthread->td_umtxq;
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		/* get highest priority thread which is still sleeping. */
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		while (uq_first != NULL &&
		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
		}
		/*
		 * Recompute our lent priority from the first waiters of
		 * the PI mutexes we still hold, starting from PRI_MAX.
		 */
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_lend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
		if (uq_first)
			umtxq_signal_thread(uq_first);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	/* The busy mark was held across the userland access above. */
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
2008 
/*
 * Lock a PP mutex.
 *
 * td      - calling thread; its tid is the value stored in m_owner.
 * m       - userland mutex; m_ceilings[0] supplies the priority ceiling.
 * flags   - mutex flags; UMUTEX_ERROR_CHECK enables the EDEADLK check and
 *           GET_SHARE(flags) selects the key sharing mode.
 * timeout - optional timeout, NULL to wait without one.
 * try     - non-zero to fail with EBUSY instead of sleeping.
 *
 * Returns 0 on success, EINVAL for an out-of-range ceiling or when the
 * caller's priority value is below the ceiling's, EFAULT when the owner
 * word cannot be accessed, EDEADLK, EBUSY, or the error from
 * umtx_key_get()/umtxq_sleep().
 */
static int
do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
    struct _umtx_time *timeout, int try)
{
	struct abs_timeout timo;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/* The priority is only raised when this privilege check succeeds. */
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		/* Remembered so the boost can be undone on failure or retry. */
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * ceiling is unsigned, so a stored value above
		 * RTP_PRIO_MAX wraps around and is rejected below.
		 * NOTE(review): a failed fuword32() (-1) appears to be
		 * rejected by the same test -- confirm.
		 */
		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		/* An unlocked PP mutex is expected to read UMUTEX_CONTESTED. */
		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.  (Any non-zero result of the previous
		 * sleep ends the loop here.)
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
		    NULL : &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Undo the boost before retrying: restore the saved
		 * inherited priority and recompute the lent priority from
		 * the first waiters of the PI mutexes we hold.
		 */
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	/* The lock was not taken: undo the boost made in the last pass. */
	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	/* Every path that reaches this point still holds the busy mark. */
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
2144 
/*
 * Unlock a PP mutex.
 *
 * td    - calling thread; it must be the owner recorded in m_owner.
 * m     - userland mutex; m_ceilings[1] supplies the ceiling from which
 *         the new inherited priority is derived (-1 selects PRI_MAX).
 * flags - mutex flags; GET_SHARE(flags) selects the key sharing mode.
 *
 * Returns 0 on success, EFAULT when the owner word cannot be read or
 * written, EPERM when the caller is not the owner, EINVAL for an
 * out-of-range ceiling, or the error from copyin()/umtx_key_get().
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		/* Unsigned: a stored value above RTP_PRIO_MAX wraps and is rejected. */
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For priority protected mutex, always set unlocked state
	 * to UMUTEX_CONTESTED, so that userland always enters kernel
	 * to lock the mutex, it is necessary because thread priority
	 * has to be adjusted for such mutex.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	/* Wake one waiter only if the store succeeded. */
	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		/*
		 * Install the new inherited priority (only when the
		 * privilege check succeeded) and recompute the lent
		 * priority from the first waiters of the PI mutexes held.
		 */
		mtx_lock_spin(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}
2230 
/*
 * Change the priority ceiling of a PP mutex.
 *
 * td          - calling thread.
 * m           - userland mutex; must have UMUTEX_PRIO_PROTECT set.
 * ceiling     - new ceiling, at most RTP_PRIO_MAX.
 * old_ceiling - optional userland address that receives the previous
 *               ceiling on success, may be NULL.
 *
 * The ceiling is written either while the mutex is briefly locked on the
 * caller's behalf or, if the caller already owns it, directly; otherwise
 * the caller sleeps until the mutex can be taken.
 *
 * Returns 0 on success, EINVAL for a non-PP mutex or an out-of-range
 * ceiling, EFAULT when the owner word cannot be accessed, or the error
 * from umtx_key_get()/umtxq_sleep().
 */
static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	   &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/* Previous ceiling, reported through old_ceiling on success. */
		save_ceiling = fuword32(&m->m_ceilings[0]);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			/*
			 * The mutex was free and is now ours: store the
			 * new ceiling, then put the owner word back to the
			 * unlocked UMUTEX_CONTESTED state.
			 */
			suword32(&m->m_ceilings[0], ceiling);
			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
				UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* The caller already owns the mutex: just store the ceiling. */
		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.  (Any non-zero result of the previous
		 * sleep ends the loop here.)
		 */
		if (error != 0)
			break;

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", NULL);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	/* Every exit from the loop still holds the busy mark. */
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}
2310 
2311 /*
2312  * Lock a userland POSIX mutex.
2313  */
2314 static int
2315 do_lock_umutex(struct thread *td, struct umutex *m,
2316     struct _umtx_time *timeout, int mode)
2317 {
2318 	uint32_t flags;
2319 	int error;
2320 
2321 	flags = fuword32(&m->m_flags);
2322 	if (flags == -1)
2323 		return (EFAULT);
2324 
2325 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2326 	case 0:
2327 		error = do_lock_normal(td, m, flags, timeout, mode);
2328 		break;
2329 	case UMUTEX_PRIO_INHERIT:
2330 		error = do_lock_pi(td, m, flags, timeout, mode);
2331 		break;
2332 	case UMUTEX_PRIO_PROTECT:
2333 		error = do_lock_pp(td, m, flags, timeout, mode);
2334 		break;
2335 	default:
2336 		return (EINVAL);
2337 	}
2338 	if (timeout == NULL) {
2339 		if (error == EINTR && mode != _UMUTEX_WAIT)
2340 			error = ERESTART;
2341 	} else {
2342 		/* Timed-locking is not restarted. */
2343 		if (error == ERESTART)
2344 			error = EINTR;
2345 	}
2346 	return (error);
2347 }
2348 
2349 /*
2350  * Unlock a userland POSIX mutex.
2351  */
2352 static int
2353 do_unlock_umutex(struct thread *td, struct umutex *m)
2354 {
2355 	uint32_t flags;
2356 
2357 	flags = fuword32(&m->m_flags);
2358 	if (flags == -1)
2359 		return (EFAULT);
2360 
2361 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2362 	case 0:
2363 		return (do_unlock_normal(td, m, flags));
2364 	case UMUTEX_PRIO_INHERIT:
2365 		return (do_unlock_pi(td, m, flags));
2366 	case UMUTEX_PRIO_PROTECT:
2367 		return (do_unlock_pp(td, m, flags));
2368 	}
2369 
2370 	return (EINVAL);
2371 }
2372 
2373 static int
2374 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2375 	struct timespec *timeout, u_long wflags)
2376 {
2377 	struct abs_timeout timo;
2378 	struct umtx_q *uq;
2379 	uint32_t flags;
2380 	uint32_t clockid;
2381 	int error;
2382 
2383 	uq = td->td_umtxq;
2384 	flags = fuword32(&cv->c_flags);
2385 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2386 	if (error != 0)
2387 		return (error);
2388 
2389 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2390 		clockid = fuword32(&cv->c_clockid);
2391 		if (clockid < CLOCK_REALTIME ||
2392 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2393 			/* hmm, only HW clock id will work. */
2394 			return (EINVAL);
2395 		}
2396 	} else {
2397 		clockid = CLOCK_REALTIME;
2398 	}
2399 
2400 	umtxq_lock(&uq->uq_key);
2401 	umtxq_busy(&uq->uq_key);
2402 	umtxq_insert(uq);
2403 	umtxq_unlock(&uq->uq_key);
2404 
2405 	/*
2406 	 * Set c_has_waiters to 1 before releasing user mutex, also
2407 	 * don't modify cache line when unnecessary.
2408 	 */
2409 	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2410 		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2411 
2412 	umtxq_lock(&uq->uq_key);
2413 	umtxq_unbusy(&uq->uq_key);
2414 	umtxq_unlock(&uq->uq_key);
2415 
2416 	error = do_unlock_umutex(td, m);
2417 
2418 	if (timeout != NULL)
2419 		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2420 			timeout);
2421 
2422 	umtxq_lock(&uq->uq_key);
2423 	if (error == 0) {
2424 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2425 		    NULL : &timo);
2426 	}
2427 
2428 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2429 		error = 0;
2430 	else {
2431 		/*
2432 		 * This must be timeout,interrupted by signal or
2433 		 * surprious wakeup, clear c_has_waiter flag when
2434 		 * necessary.
2435 		 */
2436 		umtxq_busy(&uq->uq_key);
2437 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2438 			int oldlen = uq->uq_cur_queue->length;
2439 			umtxq_remove(uq);
2440 			if (oldlen == 1) {
2441 				umtxq_unlock(&uq->uq_key);
2442 				suword32(
2443 				    __DEVOLATILE(uint32_t *,
2444 					 &cv->c_has_waiters), 0);
2445 				umtxq_lock(&uq->uq_key);
2446 			}
2447 		}
2448 		umtxq_unbusy(&uq->uq_key);
2449 		if (error == ERESTART)
2450 			error = EINTR;
2451 	}
2452 
2453 	umtxq_unlock(&uq->uq_key);
2454 	umtx_key_release(&uq->uq_key);
2455 	return (error);
2456 }
2457 
2458 /*
2459  * Signal a userland condition variable.
2460  */
2461 static int
2462 do_cv_signal(struct thread *td, struct ucond *cv)
2463 {
2464 	struct umtx_key key;
2465 	int error, cnt, nwake;
2466 	uint32_t flags;
2467 
2468 	flags = fuword32(&cv->c_flags);
2469 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2470 		return (error);
2471 	umtxq_lock(&key);
2472 	umtxq_busy(&key);
2473 	cnt = umtxq_count(&key);
2474 	nwake = umtxq_signal(&key, 1);
2475 	if (cnt <= nwake) {
2476 		umtxq_unlock(&key);
2477 		error = suword32(
2478 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2479 		umtxq_lock(&key);
2480 	}
2481 	umtxq_unbusy(&key);
2482 	umtxq_unlock(&key);
2483 	umtx_key_release(&key);
2484 	return (error);
2485 }
2486 
2487 static int
2488 do_cv_broadcast(struct thread *td, struct ucond *cv)
2489 {
2490 	struct umtx_key key;
2491 	int error;
2492 	uint32_t flags;
2493 
2494 	flags = fuword32(&cv->c_flags);
2495 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2496 		return (error);
2497 
2498 	umtxq_lock(&key);
2499 	umtxq_busy(&key);
2500 	umtxq_signal(&key, INT_MAX);
2501 	umtxq_unlock(&key);
2502 
2503 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2504 
2505 	umtxq_lock(&key);
2506 	umtxq_unbusy(&key);
2507 	umtxq_unlock(&key);
2508 
2509 	umtx_key_release(&key);
2510 	return (error);
2511 }
2512 
/*
 * Acquire a userland reader/writer lock for reading.
 *
 * td:      calling thread; its td_umtxq is used as the wait queue entry.
 * rwlock:  userland rwlock.
 * fflag:   per-call flags; URWLOCK_PREFER_READER here (or in the lock's
 *          own rw_flags) lets readers ignore waiting writers.
 * timeout: optional timeout, NULL to wait forever.
 *
 * Returns 0 once the reader count has been incremented, EAGAIN when the
 * reader count is already URWLOCK_MAX_READERS, the error from
 * umtx_key_get(), or the sleep error with ERESTART mapped to EINTR.
 */
static int
do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags, wrflags;
	int32_t state, oldstate;
	int32_t blocked_readers;
	int error;

	uq = td->td_umtxq;
	flags = fuword32(&rwlock->rw_flags);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * wrflags is the set of state bits that blocks a reader: always a
	 * write owner, and also waiting writers unless readers are preferred.
	 */
	wrflags = URWLOCK_WRITE_OWNER;
	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
		wrflags |= URWLOCK_WRITE_WAITERS;

	for (;;) {
		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
		/* try to lock it */
		while (!(state & wrflags)) {
			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
				umtx_key_release(&uq->uq_key);
				return (EAGAIN);
			}
			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
			if (oldstate == state) {
				umtx_key_release(&uq->uq_key);
				return (0);
			}
			/* Lost the race; retry with the value the CAS observed. */
			state = oldstate;
		}

		/*
		 * A previous sleep failed; the try-lock above was the last
		 * attempt, so give up now.
		 */
		if (error)
			break;

		/* grab monitor lock */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * re-read the state, in case it changed between the try-lock above
		 * and the check below
		 */
		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));

		/* set read contention bit */
		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
			if (oldstate == state)
				goto sleep;
			state = oldstate;
		}

		/* state is changed while setting flags, restart */
		if (!(state & wrflags)) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			continue;
		}

sleep:
		/* contention bit is set, before sleeping, increase read waiter count */
		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);

		/* Sleep on the shared queue until no blocking bit remains. */
		while (state & wrflags) {
			umtxq_lock(&uq->uq_key);
			umtxq_insert(uq);
			umtxq_unbusy(&uq->uq_key);

			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
			    NULL : &timo);

			/* Re-take the busy mark before touching the queue again. */
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			if (error)
				break;
			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
		}

		/* decrease read waiter count, and may clear read contention bit */
		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
		if (blocked_readers == 1) {
			/* We were the last blocked reader: drop READ_WAITERS. */
			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
			for (;;) {
				oldstate = casuword32(&rwlock->rw_state, state,
					 state & ~URWLOCK_READ_WAITERS);
				if (oldstate == state)
					break;
				state = oldstate;
			}
		}

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	/* This operation is never restarted automatically. */
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
2626 
/*
 * Acquire a userland reader/writer lock for writing.
 *
 * td:      calling thread; its td_umtxq is used as the wait queue entry.
 * rwlock:  userland rwlock.
 * timeout: optional timeout, NULL to wait forever.
 *
 * Returns 0 once URWLOCK_WRITE_OWNER has been set, the error from
 * umtx_key_get(), or the sleep error with ERESTART mapped to EINTR.
 */
static int
do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags;
	int32_t state, oldstate;
	int32_t blocked_writers;
	int32_t blocked_readers;
	int error;

	uq = td->td_umtxq;
	flags = fuword32(&rwlock->rw_flags);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	blocked_readers = 0;
	for (;;) {
		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
		/* Try to take the lock: no write owner and no readers. */
		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
			if (oldstate == state) {
				umtx_key_release(&uq->uq_key);
				return (0);
			}
			state = oldstate;
		}

		if (error) {
			/*
			 * A previous sleep failed and we are giving up.  If no
			 * writer owns or waits for the lock and we saw blocked
			 * readers when we stopped being the last blocked
			 * writer, wake the whole shared queue.
			 */
			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
			    blocked_readers != 0) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
			}

			break;
		}

		/* grab monitor lock */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * re-read the state, in case it changed between the try-lock above
		 * and the check below
		 */
		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));

		/* set write contention bit */
		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
		       (state & URWLOCK_WRITE_WAITERS) == 0) {
			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
			if (oldstate == state)
				goto sleep;
			state = oldstate;
		}

		/* The lock became free while setting the bit, restart. */
		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			continue;
		}
sleep:
		/* Contention bit is set; count ourselves as a blocked writer. */
		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);

		/* Sleep on the exclusive queue until the lock looks free. */
		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
			umtxq_lock(&uq->uq_key);
			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
			umtxq_unbusy(&uq->uq_key);

			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
			    NULL : &timo);

			/* Re-take the busy mark before touching the queue again. */
			umtxq_busy(&uq->uq_key);
			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
			umtxq_unlock(&uq->uq_key);
			if (error)
				break;
			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
		}

		/* Drop the blocked-writer count, maybe clearing WRITE_WAITERS. */
		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
		if (blocked_writers == 1) {
			/* We were the last blocked writer. */
			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
			for (;;) {
				oldstate = casuword32(&rwlock->rw_state, state,
					 state & ~URWLOCK_WRITE_WAITERS);
				if (oldstate == state)
					break;
				state = oldstate;
			}
			/* Remembered for the error path at the top of the loop. */
			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
		} else
			blocked_readers = 0;

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);
	}

	umtx_key_release(&uq->uq_key);
	/* This operation is never restarted automatically. */
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
2742 
2743 static int
2744 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2745 {
2746 	struct umtx_q *uq;
2747 	uint32_t flags;
2748 	int32_t state, oldstate;
2749 	int error, q, count;
2750 
2751 	uq = td->td_umtxq;
2752 	flags = fuword32(&rwlock->rw_flags);
2753 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2754 	if (error != 0)
2755 		return (error);
2756 
2757 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2758 	if (state & URWLOCK_WRITE_OWNER) {
2759 		for (;;) {
2760 			oldstate = casuword32(&rwlock->rw_state, state,
2761 				state & ~URWLOCK_WRITE_OWNER);
2762 			if (oldstate != state) {
2763 				state = oldstate;
2764 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2765 					error = EPERM;
2766 					goto out;
2767 				}
2768 			} else
2769 				break;
2770 		}
2771 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2772 		for (;;) {
2773 			oldstate = casuword32(&rwlock->rw_state, state,
2774 				state - 1);
2775 			if (oldstate != state) {
2776 				state = oldstate;
2777 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2778 					error = EPERM;
2779 					goto out;
2780 				}
2781 			}
2782 			else
2783 				break;
2784 		}
2785 	} else {
2786 		error = EPERM;
2787 		goto out;
2788 	}
2789 
2790 	count = 0;
2791 
2792 	if (!(flags & URWLOCK_PREFER_READER)) {
2793 		if (state & URWLOCK_WRITE_WAITERS) {
2794 			count = 1;
2795 			q = UMTX_EXCLUSIVE_QUEUE;
2796 		} else if (state & URWLOCK_READ_WAITERS) {
2797 			count = INT_MAX;
2798 			q = UMTX_SHARED_QUEUE;
2799 		}
2800 	} else {
2801 		if (state & URWLOCK_READ_WAITERS) {
2802 			count = INT_MAX;
2803 			q = UMTX_SHARED_QUEUE;
2804 		} else if (state & URWLOCK_WRITE_WAITERS) {
2805 			count = 1;
2806 			q = UMTX_EXCLUSIVE_QUEUE;
2807 		}
2808 	}
2809 
2810 	if (count) {
2811 		umtxq_lock(&uq->uq_key);
2812 		umtxq_busy(&uq->uq_key);
2813 		umtxq_signal_queue(&uq->uq_key, count, q);
2814 		umtxq_unbusy(&uq->uq_key);
2815 		umtxq_unlock(&uq->uq_key);
2816 	}
2817 out:
2818 	umtx_key_release(&uq->uq_key);
2819 	return (error);
2820 }
2821 
2822 static int
2823 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2824 {
2825 	struct abs_timeout timo;
2826 	struct umtx_q *uq;
2827 	uint32_t flags, count;
2828 	int error;
2829 
2830 	uq = td->td_umtxq;
2831 	flags = fuword32(&sem->_flags);
2832 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2833 	if (error != 0)
2834 		return (error);
2835 
2836 	if (timeout != NULL)
2837 		abs_timeout_init2(&timo, timeout);
2838 
2839 	umtxq_lock(&uq->uq_key);
2840 	umtxq_busy(&uq->uq_key);
2841 	umtxq_insert(uq);
2842 	umtxq_unlock(&uq->uq_key);
2843 	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2844 	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2845 	if (count != 0) {
2846 		umtxq_lock(&uq->uq_key);
2847 		umtxq_unbusy(&uq->uq_key);
2848 		umtxq_remove(uq);
2849 		umtxq_unlock(&uq->uq_key);
2850 		umtx_key_release(&uq->uq_key);
2851 		return (0);
2852 	}
2853 	umtxq_lock(&uq->uq_key);
2854 	umtxq_unbusy(&uq->uq_key);
2855 
2856 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2857 
2858 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2859 		error = 0;
2860 	else {
2861 		umtxq_remove(uq);
2862 		if (error == ERESTART)
2863 			error = EINTR;
2864 	}
2865 	umtxq_unlock(&uq->uq_key);
2866 	umtx_key_release(&uq->uq_key);
2867 	return (error);
2868 }
2869 
2870 /*
2871  * Signal a userland condition variable.
2872  */
2873 static int
2874 do_sem_wake(struct thread *td, struct _usem *sem)
2875 {
2876 	struct umtx_key key;
2877 	int error, cnt;
2878 	uint32_t flags;
2879 
2880 	flags = fuword32(&sem->_flags);
2881 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2882 		return (error);
2883 	umtxq_lock(&key);
2884 	umtxq_busy(&key);
2885 	cnt = umtxq_count(&key);
2886 	if (cnt > 0) {
2887 		umtxq_signal(&key, 1);
2888 		/*
2889 		 * Check if count is greater than 0, this means the memory is
2890 		 * still being referenced by user code, so we can safely
2891 		 * update _has_waiters flag.
2892 		 */
2893 		if (cnt == 1) {
2894 			umtxq_unlock(&key);
2895 			error = suword32(
2896 			    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2897 			umtxq_lock(&key);
2898 		}
2899 	}
2900 	umtxq_unbusy(&key);
2901 	umtxq_unlock(&key);
2902 	umtx_key_release(&key);
2903 	return (error);
2904 }
2905 
/*
 * _umtx_lock system call: lock the umtx at uap->umtx on behalf of the
 * calling thread, using its thread id as the lock id and a zero (null)
 * final argument to do_lock_umtx().
 */
int
sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
    /* struct umtx *umtx */
{
	return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
}
2912 
/*
 * _umtx_unlock system call: unlock the umtx at uap->umtx, passing the
 * calling thread's id to do_unlock_umtx().
 */
int
sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
    /* struct umtx *umtx */
{
	return do_unlock_umtx(td, uap->umtx, td->td_tid);
}
2919 
2920 inline int
2921 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
2922 {
2923 	int error;
2924 
2925 	error = copyin(addr, tsp, sizeof(struct timespec));
2926 	if (error == 0) {
2927 		if (tsp->tv_sec < 0 ||
2928 		    tsp->tv_nsec >= 1000000000 ||
2929 		    tsp->tv_nsec < 0)
2930 			error = EINVAL;
2931 	}
2932 	return (error);
2933 }
2934 
2935 static inline int
2936 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
2937 {
2938 	int error;
2939 
2940 	if (size <= sizeof(struct timespec)) {
2941 		tp->_clockid = CLOCK_REALTIME;
2942 		tp->_flags = 0;
2943 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
2944 	} else
2945 		error = copyin(addr, tp, sizeof(struct _umtx_time));
2946 	if (error != 0)
2947 		return (error);
2948 	if (tp->_timeout.tv_sec < 0 ||
2949 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
2950 		return (EINVAL);
2951 	return (0);
2952 }
2953 
2954 static int
2955 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2956 {
2957 	struct timespec *ts, timeout;
2958 	int error;
2959 
2960 	/* Allow a null timespec (wait forever). */
2961 	if (uap->uaddr2 == NULL)
2962 		ts = NULL;
2963 	else {
2964 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
2965 		if (error != 0)
2966 			return (error);
2967 		ts = &timeout;
2968 	}
2969 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2970 }
2971 
/*
 * UMTX_OP_UNLOCK handler: forwards uap->obj and uap->val to
 * do_unlock_umtx().
 */
static int
__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
{
	return (do_unlock_umtx(td, uap->obj, uap->val));
}
2977 
2978 static int
2979 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2980 {
2981 	struct _umtx_time timeout, *tm_p;
2982 	int error;
2983 
2984 	if (uap->uaddr2 == NULL)
2985 		tm_p = NULL;
2986 	else {
2987 		error = umtx_copyin_umtx_time(
2988 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2989 		if (error != 0)
2990 			return (error);
2991 		tm_p = &timeout;
2992 	}
2993 	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
2994 }
2995 
2996 static int
2997 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2998 {
2999 	struct _umtx_time timeout, *tm_p;
3000 	int error;
3001 
3002 	if (uap->uaddr2 == NULL)
3003 		tm_p = NULL;
3004 	else {
3005 		error = umtx_copyin_umtx_time(
3006 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3007 		if (error != 0)
3008 			return (error);
3009 		tm_p = &timeout;
3010 	}
3011 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3012 }
3013 
3014 static int
3015 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3016 {
3017 	struct _umtx_time *tm_p, timeout;
3018 	int error;
3019 
3020 	if (uap->uaddr2 == NULL)
3021 		tm_p = NULL;
3022 	else {
3023 		error = umtx_copyin_umtx_time(
3024 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3025 		if (error != 0)
3026 			return (error);
3027 		tm_p = &timeout;
3028 	}
3029 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3030 }
3031 
/*
 * UMTX_OP_WAKE handler: forwards uap->obj and uap->val to
 * kern_umtx_wake() with a final argument of 0.
 */
static int
__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
{
	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
}
3037 
3038 #define BATCH_SIZE	128
3039 static int
3040 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3041 {
3042 	int count = uap->val;
3043 	void *uaddrs[BATCH_SIZE];
3044 	char **upp = (char **)uap->obj;
3045 	int tocopy;
3046 	int error = 0;
3047 	int i, pos = 0;
3048 
3049 	while (count > 0) {
3050 		tocopy = count;
3051 		if (tocopy > BATCH_SIZE)
3052 			tocopy = BATCH_SIZE;
3053 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3054 		if (error != 0)
3055 			break;
3056 		for (i = 0; i < tocopy; ++i)
3057 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3058 		count -= tocopy;
3059 		pos += tocopy;
3060 	}
3061 	return (error);
3062 }
3063 
/*
 * UMTX_OP_WAKE_PRIVATE handler: forwards uap->obj and uap->val to
 * kern_umtx_wake() with a final argument of 1.
 */
static int
__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
{
	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
}
3069 
3070 static int
3071 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3072 {
3073 	struct _umtx_time *tm_p, timeout;
3074 	int error;
3075 
3076 	/* Allow a null timespec (wait forever). */
3077 	if (uap->uaddr2 == NULL)
3078 		tm_p = NULL;
3079 	else {
3080 		error = umtx_copyin_umtx_time(
3081 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3082 		if (error != 0)
3083 			return (error);
3084 		tm_p = &timeout;
3085 	}
3086 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3087 }
3088 
/*
 * UMTX_OP_MUTEX_TRYLOCK handler: calls do_lock_umutex() on uap->obj with
 * no timeout and mode _UMUTEX_TRY.
 */
static int
__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
}
3094 
3095 static int
3096 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3097 {
3098 	struct _umtx_time *tm_p, timeout;
3099 	int error;
3100 
3101 	/* Allow a null timespec (wait forever). */
3102 	if (uap->uaddr2 == NULL)
3103 		tm_p = NULL;
3104 	else {
3105 		error = umtx_copyin_umtx_time(
3106 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3107 		if (error != 0)
3108 			return (error);
3109 		tm_p = &timeout;
3110 	}
3111 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3112 }
3113 
/*
 * UMTX_OP_UMUTEX_WAKE handler: forwards uap->obj to do_wake_umutex().
 */
static int
__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
{
	return do_wake_umutex(td, uap->obj);
}
3119 
/*
 * UMTX_OP_MUTEX_UNLOCK handler: forwards uap->obj to do_unlock_umutex().
 */
static int
__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
	return do_unlock_umutex(td, uap->obj);
}
3125 
/*
 * UMTX_OP_SET_CEILING handler: forwards uap->obj, uap->val and
 * uap->uaddr1 to do_set_ceiling().
 */
static int
__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
{
	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
}
3131 
3132 static int
3133 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3134 {
3135 	struct timespec *ts, timeout;
3136 	int error;
3137 
3138 	/* Allow a null timespec (wait forever). */
3139 	if (uap->uaddr2 == NULL)
3140 		ts = NULL;
3141 	else {
3142 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3143 		if (error != 0)
3144 			return (error);
3145 		ts = &timeout;
3146 	}
3147 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3148 }
3149 
/*
 * UMTX_OP_CV_SIGNAL handler: forwards uap->obj to do_cv_signal().
 */
static int
__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
{
	return do_cv_signal(td, uap->obj);
}
3155 
/*
 * UMTX_OP_CV_BROADCAST handler: forwards uap->obj to do_cv_broadcast().
 */
static int
__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
{
	return do_cv_broadcast(td, uap->obj);
}
3161 
3162 static int
3163 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3164 {
3165 	struct _umtx_time timeout;
3166 	int error;
3167 
3168 	/* Allow a null timespec (wait forever). */
3169 	if (uap->uaddr2 == NULL) {
3170 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3171 	} else {
3172 		error = umtx_copyin_umtx_time(uap->uaddr2,
3173 		   (size_t)uap->uaddr1, &timeout);
3174 		if (error != 0)
3175 			return (error);
3176 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3177 	}
3178 	return (error);
3179 }
3180 
3181 static int
3182 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3183 {
3184 	struct _umtx_time timeout;
3185 	int error;
3186 
3187 	/* Allow a null timespec (wait forever). */
3188 	if (uap->uaddr2 == NULL) {
3189 		error = do_rw_wrlock(td, uap->obj, 0);
3190 	} else {
3191 		error = umtx_copyin_umtx_time(uap->uaddr2,
3192 		   (size_t)uap->uaddr1, &timeout);
3193 		if (error != 0)
3194 			return (error);
3195 
3196 		error = do_rw_wrlock(td, uap->obj, &timeout);
3197 	}
3198 	return (error);
3199 }
3200 
/*
 * UMTX_OP_RW_UNLOCK handler: forwards uap->obj to do_rw_unlock().
 */
static int
__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
{
	return do_rw_unlock(td, uap->obj);
}
3206 
3207 static int
3208 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3209 {
3210 	struct _umtx_time *tm_p, timeout;
3211 	int error;
3212 
3213 	/* Allow a null timespec (wait forever). */
3214 	if (uap->uaddr2 == NULL)
3215 		tm_p = NULL;
3216 	else {
3217 		error = umtx_copyin_umtx_time(
3218 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3219 		if (error != 0)
3220 			return (error);
3221 		tm_p = &timeout;
3222 	}
3223 	return (do_sem_wait(td, uap->obj, tm_p));
3224 }
3225 
/*
 * UMTX_OP_SEM_WAKE handler: forwards uap->obj to do_sem_wake().
 */
static int
__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
{
	return do_sem_wake(td, uap->obj);
}
3231 
/*
 * UMTX_OP_UMUTEX_WAKE2 handler: forwards uap->obj and uap->val to
 * do_wake2_umutex().
 */
static int
__umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
{
	return do_wake2_umutex(td, uap->obj, uap->val);
}
3237 
/* Signature shared by every _umtx_op operation handler. */
typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);

/*
 * Native handler table, indexed directly by the operation code in
 * sys__umtx_op().  The position of each entry must therefore equal the
 * numeric value of the UMTX_OP_* constant named in its trailing comment
 * (NOTE(review): the constant values are defined outside this file view;
 * confirm against the header whenever an entry is added or moved).
 */
static _umtx_op_func op_table[] = {
	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
	__umtx_op_wait,			/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT*/
	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
};
3265 
3266 int
3267 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3268 {
3269 	if ((unsigned)uap->op < UMTX_OP_MAX)
3270 		return (*op_table[uap->op])(td, uap);
3271 	return (EINVAL);
3272 }
3273 
3274 #ifdef COMPAT_FREEBSD32
/*
 * 32-bit compat _umtx_lock system call: lock the 32-bit umtx word at
 * uap->umtx with the calling thread's id and no timeout.
 */
int
freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
    /* struct umtx *umtx */
{
	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
}
3281 
/*
 * 32-bit compat _umtx_unlock system call: unlock the 32-bit umtx word at
 * uap->umtx, passing the calling thread's id to do_unlock_umtx32().
 */
int
freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
    /* struct umtx *umtx */
{
	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
}
3288 
/*
 * Layout of a timespec as passed by a 32-bit process.
 *
 * The fields are signed so that the "< 0" range checks applied after
 * copyin can actually reject negative values; with unsigned fields a
 * negative tv_sec or tv_nsec would be seen as a huge positive number.
 */
struct timespec32 {
	int32_t tv_sec;
	int32_t tv_nsec;
};
3293 
/*
 * 32-bit layout of the timeout descriptor; umtx_copyin_umtx_time32()
 * copies these fields into the native struct _umtx_time.
 */
struct umtx_time32 {
	struct	timespec32	timeout;	/* copied to _timeout */
	uint32_t		flags;		/* copied to _flags */
	uint32_t		clockid;	/* copied to _clockid */
};
3299 
3300 static inline int
3301 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3302 {
3303 	struct timespec32 ts32;
3304 	int error;
3305 
3306 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3307 	if (error == 0) {
3308 		if (ts32.tv_sec < 0 ||
3309 		    ts32.tv_nsec >= 1000000000 ||
3310 		    ts32.tv_nsec < 0)
3311 			error = EINVAL;
3312 		else {
3313 			tsp->tv_sec = ts32.tv_sec;
3314 			tsp->tv_nsec = ts32.tv_nsec;
3315 		}
3316 	}
3317 	return (error);
3318 }
3319 
3320 static inline int
3321 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3322 {
3323 	struct umtx_time32 t32;
3324 	int error;
3325 
3326 	t32.clockid = CLOCK_REALTIME;
3327 	t32.flags   = 0;
3328 	if (size <= sizeof(struct timespec32))
3329 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3330 	else
3331 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3332 	if (error != 0)
3333 		return (error);
3334 	if (t32.timeout.tv_sec < 0 ||
3335 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3336 		return (EINVAL);
3337 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3338 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3339 	tp->_flags = t32.flags;
3340 	tp->_clockid = t32.clockid;
3341 	return (0);
3342 }
3343 
3344 static int
3345 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3346 {
3347 	struct timespec *ts, timeout;
3348 	int error;
3349 
3350 	/* Allow a null timespec (wait forever). */
3351 	if (uap->uaddr2 == NULL)
3352 		ts = NULL;
3353 	else {
3354 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3355 		if (error != 0)
3356 			return (error);
3357 		ts = &timeout;
3358 	}
3359 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3360 }
3361 
/*
 * 32-bit compat unlock-umtx handler: forwards uap->obj and uap->val
 * (truncated to 32 bits) to do_unlock_umtx32().
 */
static int
__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
{
	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
}
3367 
3368 static int
3369 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3370 {
3371 	struct _umtx_time *tm_p, timeout;
3372 	int error;
3373 
3374 	if (uap->uaddr2 == NULL)
3375 		tm_p = NULL;
3376 	else {
3377 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3378 			(size_t)uap->uaddr1, &timeout);
3379 		if (error != 0)
3380 			return (error);
3381 		tm_p = &timeout;
3382 	}
3383 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3384 }
3385 
3386 static int
3387 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3388 {
3389 	struct _umtx_time *tm_p, timeout;
3390 	int error;
3391 
3392 	/* Allow a null timespec (wait forever). */
3393 	if (uap->uaddr2 == NULL)
3394 		tm_p = NULL;
3395 	else {
3396 		error = umtx_copyin_umtx_time(uap->uaddr2,
3397 			    (size_t)uap->uaddr1, &timeout);
3398 		if (error != 0)
3399 			return (error);
3400 		tm_p = &timeout;
3401 	}
3402 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3403 }
3404 
3405 static int
3406 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3407 {
3408 	struct _umtx_time *tm_p, timeout;
3409 	int error;
3410 
3411 	/* Allow a null timespec (wait forever). */
3412 	if (uap->uaddr2 == NULL)
3413 		tm_p = NULL;
3414 	else {
3415 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3416 		    (size_t)uap->uaddr1, &timeout);
3417 		if (error != 0)
3418 			return (error);
3419 		tm_p = &timeout;
3420 	}
3421 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3422 }
3423 
3424 static int
3425 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3426 {
3427 	struct timespec *ts, timeout;
3428 	int error;
3429 
3430 	/* Allow a null timespec (wait forever). */
3431 	if (uap->uaddr2 == NULL)
3432 		ts = NULL;
3433 	else {
3434 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3435 		if (error != 0)
3436 			return (error);
3437 		ts = &timeout;
3438 	}
3439 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3440 }
3441 
3442 static int
3443 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3444 {
3445 	struct _umtx_time timeout;
3446 	int error;
3447 
3448 	/* Allow a null timespec (wait forever). */
3449 	if (uap->uaddr2 == NULL) {
3450 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3451 	} else {
3452 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3453 		    (size_t)uap->uaddr1, &timeout);
3454 		if (error != 0)
3455 			return (error);
3456 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3457 	}
3458 	return (error);
3459 }
3460 
3461 static int
3462 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3463 {
3464 	struct _umtx_time timeout;
3465 	int error;
3466 
3467 	/* Allow a null timespec (wait forever). */
3468 	if (uap->uaddr2 == NULL) {
3469 		error = do_rw_wrlock(td, uap->obj, 0);
3470 	} else {
3471 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3472 		    (size_t)uap->uaddr1, &timeout);
3473 		if (error != 0)
3474 			return (error);
3475 		error = do_rw_wrlock(td, uap->obj, &timeout);
3476 	}
3477 	return (error);
3478 }
3479 
3480 static int
3481 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3482 {
3483 	struct _umtx_time *tm_p, timeout;
3484 	int error;
3485 
3486 	if (uap->uaddr2 == NULL)
3487 		tm_p = NULL;
3488 	else {
3489 		error = umtx_copyin_umtx_time32(
3490 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3491 		if (error != 0)
3492 			return (error);
3493 		tm_p = &timeout;
3494 	}
3495 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3496 }
3497 
3498 static int
3499 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3500 {
3501 	struct _umtx_time *tm_p, timeout;
3502 	int error;
3503 
3504 	/* Allow a null timespec (wait forever). */
3505 	if (uap->uaddr2 == NULL)
3506 		tm_p = NULL;
3507 	else {
3508 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3509 		    (size_t)uap->uaddr1, &timeout);
3510 		if (error != 0)
3511 			return (error);
3512 		tm_p = &timeout;
3513 	}
3514 	return (do_sem_wait(td, uap->obj, tm_p));
3515 }
3516 
3517 static int
3518 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3519 {
3520 	int count = uap->val;
3521 	uint32_t uaddrs[BATCH_SIZE];
3522 	uint32_t **upp = (uint32_t **)uap->obj;
3523 	int tocopy;
3524 	int error = 0;
3525 	int i, pos = 0;
3526 
3527 	while (count > 0) {
3528 		tocopy = count;
3529 		if (tocopy > BATCH_SIZE)
3530 			tocopy = BATCH_SIZE;
3531 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3532 		if (error != 0)
3533 			break;
3534 		for (i = 0; i < tocopy; ++i)
3535 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3536 				INT_MAX, 1);
3537 		count -= tocopy;
3538 		pos += tocopy;
3539 	}
3540 	return (error);
3541 }
3542 
/*
 * Handler table for 32-bit processes.  freebsd32_umtx_op() indexes it
 * directly with the requested operation number, so each entry's position
 * must equal the numeric value of the UMTX_OP_* constant named in its
 * comment.  Entries without a _compat32/32 suffix reuse the native
 * handlers.
 */
static _umtx_op_func op_table_compat32[] = {
	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
};
3568 
3569 int
3570 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3571 {
3572 	if ((unsigned)uap->op < UMTX_OP_MAX)
3573 		return (*op_table_compat32[uap->op])(td,
3574 			(struct _umtx_op_args *)uap);
3575 	return (EINVAL);
3576 }
3577 #endif
3578 
3579 void
3580 umtx_thread_init(struct thread *td)
3581 {
3582 	td->td_umtxq = umtxq_alloc();
3583 	td->td_umtxq->uq_thread = td;
3584 }
3585 
3586 void
3587 umtx_thread_fini(struct thread *td)
3588 {
3589 	umtxq_free(td->td_umtxq);
3590 }
3591 
3592 /*
3593  * It will be called when new thread is created, e.g fork().
3594  */
3595 void
3596 umtx_thread_alloc(struct thread *td)
3597 {
3598 	struct umtx_q *uq;
3599 
3600 	uq = td->td_umtxq;
3601 	uq->uq_inherited_pri = PRI_MAX;
3602 
3603 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3604 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3605 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3606 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3607 }
3608 
3609 /*
3610  * exec() hook.
3611  */
3612 static void
3613 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3614 	struct image_params *imgp __unused)
3615 {
3616 	umtx_thread_cleanup(curthread);
3617 }
3618 
/*
 * thread_exit() hook: clean up the umtx state of the given thread.
 */
void
umtx_thread_exit(struct thread *td)
{

	umtx_thread_cleanup(td);
}
3627 
3628 /*
3629  * clean up umtx data.
3630  */
3631 static void
3632 umtx_thread_cleanup(struct thread *td)
3633 {
3634 	struct umtx_q *uq;
3635 	struct umtx_pi *pi;
3636 
3637 	if ((uq = td->td_umtxq) == NULL)
3638 		return;
3639 
3640 	mtx_lock_spin(&umtx_lock);
3641 	uq->uq_inherited_pri = PRI_MAX;
3642 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3643 		pi->pi_owner = NULL;
3644 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3645 	}
3646 	mtx_unlock_spin(&umtx_lock);
3647 	thread_lock(td);
3648 	sched_lend_user_prio(td, PRI_MAX);
3649 	thread_unlock(td);
3650 }
3651