xref: /freebsd/sys/kern/kern_umtx.c (revision aa77200569e397d6ff1fdb4d255d0fa254d0a128)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include "opt_umtx_profiling.h"
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/priv.h>
41 #include <sys/proc.h>
42 #include <sys/sched.h>
43 #include <sys/smp.h>
44 #include <sys/sysctl.h>
45 #include <sys/sysent.h>
46 #include <sys/systm.h>
47 #include <sys/sysproto.h>
48 #include <sys/syscallsubr.h>
49 #include <sys/eventhandler.h>
50 #include <sys/umtx.h>
51 
52 #include <vm/vm.h>
53 #include <vm/vm_param.h>
54 #include <vm/pmap.h>
55 #include <vm/vm_map.h>
56 #include <vm/vm_object.h>
57 
58 #include <machine/cpu.h>
59 
60 #ifdef COMPAT_FREEBSD32
61 #include <compat/freebsd32/freebsd32_proto.h>
62 #endif
63 
64 #define _UMUTEX_TRY		1
65 #define _UMUTEX_WAIT		2
66 
67 /* Priority inheritance mutex info. */
68 struct umtx_pi {
69 	/* Owner thread */
70 	struct thread		*pi_owner;
71 
72 	/* Reference count */
73 	int			pi_refcount;
74 
75 	/* List entry to link this PI mutex into its owner thread's list */
76 	TAILQ_ENTRY(umtx_pi)	pi_link;
77 
78 	/* List entry in hash */
79 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
80 
81 	/* List for waiters */
82 	TAILQ_HEAD(,umtx_q)	pi_blocked;
83 
84 	/* Identify a userland lock object */
85 	struct umtx_key		pi_key;
86 };
87 
88 /* A waiter on a userland synchronization object. */
89 struct umtx_q {
90 	/* Linked list for the hash. */
91 	TAILQ_ENTRY(umtx_q)	uq_link;
92 
93 	/* Umtx key. */
94 	struct umtx_key		uq_key;
95 
96 	/* Umtx flags. */
97 	int			uq_flags;
98 #define UQF_UMTXQ	0x0001
99 
100 	/* The waiting thread. */
101 	struct thread		*uq_thread;
102 
103 	/*
104 	 * Blocked on a PI mutex.  Readers may hold either the chain
105 	 * lock or umtx_lock; writers must hold both the chain lock
106 	 * and umtx_lock.
107 	 */
108 	struct umtx_pi		*uq_pi_blocked;
109 
110 	/* On blocked list */
111 	TAILQ_ENTRY(umtx_q)	uq_lockq;
112 
113 	/* PI mutexes owned by this thread that others contend for */
114 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
115 
116 	/* Inherited priority from PP mutex */
117 	u_char			uq_inherited_pri;
118 
119 	/* Spare queue ready to be reused */
120 	struct umtxq_queue	*uq_spare_queue;
121 
122 	/* The queue we are on */
123 	struct umtxq_queue	*uq_cur_queue;
124 };
125 
126 TAILQ_HEAD(umtxq_head, umtx_q);
127 
128 /* Per-key wait-queue */
129 struct umtxq_queue {
130 	struct umtxq_head	head;
131 	struct umtx_key		key;
132 	LIST_ENTRY(umtxq_queue)	link;
133 	int			length;
134 };
135 
136 LIST_HEAD(umtxq_list, umtxq_queue);
137 
138 /* Userland lock object's wait-queue chain */
139 struct umtxq_chain {
140 	/* Lock for this chain. */
141 	struct mtx		uc_lock;
142 
143 	/* List of sleep queues. */
144 	struct umtxq_list	uc_queue[2];
145 #define UMTX_SHARED_QUEUE	0
146 #define UMTX_EXCLUSIVE_QUEUE	1
147 
148 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
149 
150 	/* Busy flag */
151 	char			uc_busy;
152 
153 	/* Chain lock waiters */
154 	int			uc_waiters;
155 
156 	/* All PI mutexes hashed onto this chain */
157 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
158 
159 #ifdef UMTX_PROFILING
160 	int 			length;
161 	int			max_length;
162 #endif
163 };
164 
165 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
166 #define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
167 
168 /*
169  * Don't propagate time-sharing priority; there is a security reason
170  * for this: a user could simply create a PI mutex, let thread A lock
171  * it, and let another thread B block on it.  Because B is sleeping,
172  * its priority would be boosted.  This would boost A's priority via
173  * priority propagation too, and it would never be lowered even if A
174  * used 100% CPU, which is unfair to other processes.
175  */
176 
177 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
178 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
179 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
180 
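
/*
 * Illustrative sketch (not part of this file): what the UPRI() macro
 * above evaluates to:
 *
 *	td_user_pri inside [PRI_MIN_TIMESHARE, PRI_MAX_TIMESHARE]
 *		-> UPRI(td) == PRI_MAX_TIMESHARE	(clamped)
 *	td_user_pri outside that range (e.g. real-time)
 *		-> UPRI(td) == td_user_pri		(passed through)
 *
 * Lower numeric values are better priorities, so clamping to
 * PRI_MAX_TIMESHARE means a time-sharing waiter lends no boost at all.
 */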
181 #define	GOLDEN_RATIO_PRIME	2654404609U
182 #define	UMTX_CHAINS		512
183 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
184 
185 #define	GET_SHARE(flags)	\
186     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
187 
188 #define BUSY_SPINS		200
189 
190 struct abs_timeout {
191 	int clockid;
192 	struct timespec cur;
193 	struct timespec end;
194 };
195 
196 static uma_zone_t		umtx_pi_zone;
197 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
198 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
199 static int			umtx_pi_allocated;
200 
201 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
202 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
203     &umtx_pi_allocated, 0, "Allocated umtx_pi");
204 
205 #ifdef UMTX_PROFILING
206 static long max_length;
207 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
208 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
209 #endif
210 
211 static void umtxq_sysinit(void *);
212 static void umtxq_hash(struct umtx_key *key);
213 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
214 static void umtxq_lock(struct umtx_key *key);
215 static void umtxq_unlock(struct umtx_key *key);
216 static void umtxq_busy(struct umtx_key *key);
217 static void umtxq_unbusy(struct umtx_key *key);
218 static void umtxq_insert_queue(struct umtx_q *uq, int q);
219 static void umtxq_remove_queue(struct umtx_q *uq, int q);
220 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
221 static int umtxq_count(struct umtx_key *key);
222 static struct umtx_pi *umtx_pi_alloc(int);
223 static void umtx_pi_free(struct umtx_pi *pi);
224 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
225 static void umtx_thread_cleanup(struct thread *td);
226 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
227 	struct image_params *imgp __unused);
228 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
229 
230 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
231 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
232 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
233 
234 static struct mtx umtx_lock;
235 
236 #ifdef UMTX_PROFILING
237 static void
238 umtx_init_profiling(void)
239 {
240 	struct sysctl_oid *chain_oid;
241 	char chain_name[10];
242 	int i;
243 
244 	for (i = 0; i < UMTX_CHAINS; ++i) {
245 		snprintf(chain_name, sizeof(chain_name), "%d", i);
246 		chain_oid = SYSCTL_ADD_NODE(NULL,
247 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
248 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
249 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
250 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
251 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
252 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
253 	}
254 }
255 #endif
256 
257 static void
258 umtxq_sysinit(void *arg __unused)
259 {
260 	int i, j;
261 
262 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
263 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
264 	for (i = 0; i < 2; ++i) {
265 		for (j = 0; j < UMTX_CHAINS; ++j) {
266 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
267 				 MTX_DEF | MTX_DUPOK);
268 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
269 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
270 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
271 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
272 			umtxq_chains[i][j].uc_busy = 0;
273 			umtxq_chains[i][j].uc_waiters = 0;
274 #ifdef UMTX_PROFILING
275 			umtxq_chains[i][j].length = 0;
276 			umtxq_chains[i][j].max_length = 0;
277 #endif
278 		}
279 	}
280 #ifdef UMTX_PROFILING
281 	umtx_init_profiling();
282 #endif
283 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
284 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
285 	    EVENTHANDLER_PRI_ANY);
286 }
287 
288 struct umtx_q *
289 umtxq_alloc(void)
290 {
291 	struct umtx_q *uq;
292 
293 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
294 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
295 	TAILQ_INIT(&uq->uq_spare_queue->head);
296 	TAILQ_INIT(&uq->uq_pi_contested);
297 	uq->uq_inherited_pri = PRI_MAX;
298 	return (uq);
299 }
300 
301 void
302 umtxq_free(struct umtx_q *uq)
303 {
304 	MPASS(uq->uq_spare_queue != NULL);
305 	free(uq->uq_spare_queue, M_UMTX);
306 	free(uq, M_UMTX);
307 }
308 
309 static inline void
310 umtxq_hash(struct umtx_key *key)
311 {
312 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
313 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
314 }
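
/*
 * Illustrative sketch (not part of this file): umtxq_hash() above is
 * classic multiplicative (Fibonacci) hashing.  With a 32-bit word
 * (__WORD_BIT == 32, so UMTX_SHIFTS == 23), a private key hashes as:
 *
 *	n    = (uintptr_t)vmspace + address;
 *	hash = ((n * 2654404609U) >> 23) % 512;
 *
 * The multiplication scrambles the low-entropy pointer bits; the shift
 * keeps the well-mixed high bits before folding into one of the
 * UMTX_CHAINS buckets.
 */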
315 
316 static inline struct umtxq_chain *
317 umtxq_getchain(struct umtx_key *key)
318 {
319 	if (key->type <= TYPE_SEM)
320 		return (&umtxq_chains[1][key->hash]);
321 	return (&umtxq_chains[0][key->hash]);
322 }
323 
324 /*
325  * Lock a chain.
326  */
327 static inline void
328 umtxq_lock(struct umtx_key *key)
329 {
330 	struct umtxq_chain *uc;
331 
332 	uc = umtxq_getchain(key);
333 	mtx_lock(&uc->uc_lock);
334 }
335 
336 /*
337  * Unlock a chain.
338  */
339 static inline void
340 umtxq_unlock(struct umtx_key *key)
341 {
342 	struct umtxq_chain *uc;
343 
344 	uc = umtxq_getchain(key);
345 	mtx_unlock(&uc->uc_lock);
346 }
347 
348 /*
349  * Set the chain to the busy state when the following operation
350  * may block (a kernel mutex cannot be held across it).
351  */
352 static inline void
353 umtxq_busy(struct umtx_key *key)
354 {
355 	struct umtxq_chain *uc;
356 
357 	uc = umtxq_getchain(key);
358 	mtx_assert(&uc->uc_lock, MA_OWNED);
359 	if (uc->uc_busy) {
360 #ifdef SMP
361 		if (smp_cpus > 1) {
362 			int count = BUSY_SPINS;
363 			if (count > 0) {
364 				umtxq_unlock(key);
365 				while (uc->uc_busy && --count > 0)
366 					cpu_spinwait();
367 				umtxq_lock(key);
368 			}
369 		}
370 #endif
371 		while (uc->uc_busy) {
372 			uc->uc_waiters++;
373 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
374 			uc->uc_waiters--;
375 		}
376 	}
377 	uc->uc_busy = 1;
378 }
379 
380 /*
381  * Unbusy a chain.
382  */
383 static inline void
384 umtxq_unbusy(struct umtx_key *key)
385 {
386 	struct umtxq_chain *uc;
387 
388 	uc = umtxq_getchain(key);
389 	mtx_assert(&uc->uc_lock, MA_OWNED);
390 	KASSERT(uc->uc_busy != 0, ("not busy"));
391 	uc->uc_busy = 0;
392 	if (uc->uc_waiters)
393 		wakeup_one(uc);
394 }
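
/*
 * Illustrative sketch (not part of this file): the busy/unbusy pair is
 * used to serialize an operation that must drop the chain mutex, for
 * example to touch pageable userland memory:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);		// may sleep until the chain is free
 *	umtxq_unlock(&key);
 *	old = casuword32(...);		// may fault; chain is still busy
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);		// wakes one thread in umtxq_busy()
 *	umtxq_unlock(&key);
 */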
395 
396 static struct umtxq_queue *
397 umtxq_queue_lookup(struct umtx_key *key, int q)
398 {
399 	struct umtxq_queue *uh;
400 	struct umtxq_chain *uc;
401 
402 	uc = umtxq_getchain(key);
403 	UMTXQ_LOCKED_ASSERT(uc);
404 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
405 		if (umtx_key_match(&uh->key, key))
406 			return (uh);
407 	}
408 
409 	return (NULL);
410 }
411 
412 static inline void
413 umtxq_insert_queue(struct umtx_q *uq, int q)
414 {
415 	struct umtxq_queue *uh;
416 	struct umtxq_chain *uc;
417 
418 	uc = umtxq_getchain(&uq->uq_key);
419 	UMTXQ_LOCKED_ASSERT(uc);
420 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
421 	uh = umtxq_queue_lookup(&uq->uq_key, q);
422 	if (uh != NULL) {
423 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
424 	} else {
425 		uh = uq->uq_spare_queue;
426 		uh->key = uq->uq_key;
427 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
428 	}
429 	uq->uq_spare_queue = NULL;
430 
431 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
432 	uh->length++;
433 #ifdef UMTX_PROFILING
434 	uc->length++;
435 	if (uc->length > uc->max_length) {
436 		uc->max_length = uc->length;
437 		if (uc->max_length > max_length)
438 			max_length = uc->max_length;
439 	}
440 #endif
441 	uq->uq_flags |= UQF_UMTXQ;
442 	uq->uq_cur_queue = uh;
443 	return;
444 }
445 
446 static inline void
447 umtxq_remove_queue(struct umtx_q *uq, int q)
448 {
449 	struct umtxq_chain *uc;
450 	struct umtxq_queue *uh;
451 
452 	uc = umtxq_getchain(&uq->uq_key);
453 	UMTXQ_LOCKED_ASSERT(uc);
454 	if (uq->uq_flags & UQF_UMTXQ) {
455 		uh = uq->uq_cur_queue;
456 		TAILQ_REMOVE(&uh->head, uq, uq_link);
457 		uh->length--;
458 #ifdef UMTX_PROFILING
459 		uc->length--;
460 #endif
461 		uq->uq_flags &= ~UQF_UMTXQ;
462 		if (TAILQ_EMPTY(&uh->head)) {
463 			KASSERT(uh->length == 0,
464 			    ("inconsistent umtxq_queue length"));
465 			LIST_REMOVE(uh, link);
466 		} else {
467 			uh = LIST_FIRST(&uc->uc_spare_queue);
468 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
469 			LIST_REMOVE(uh, link);
470 		}
471 		uq->uq_spare_queue = uh;
472 		uq->uq_cur_queue = NULL;
473 	}
474 }
475 
476 /*
477  * Return the number of waiters sleeping on the key.
478  */
479 static int
480 umtxq_count(struct umtx_key *key)
481 {
482 	struct umtxq_chain *uc;
483 	struct umtxq_queue *uh;
484 
485 	uc = umtxq_getchain(key);
486 	UMTXQ_LOCKED_ASSERT(uc);
487 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
488 	if (uh != NULL)
489 		return (uh->length);
490 	return (0);
491 }
492 
493 /*
494  * Return the number of PI waiters and, via *first, the first
495  * waiter.
496  */
497 static int
498 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
499 {
500 	struct umtxq_chain *uc;
501 	struct umtxq_queue *uh;
502 
503 	*first = NULL;
504 	uc = umtxq_getchain(key);
505 	UMTXQ_LOCKED_ASSERT(uc);
506 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
507 	if (uh != NULL) {
508 		*first = TAILQ_FIRST(&uh->head);
509 		return (uh->length);
510 	}
511 	return (0);
512 }
513 
514 /*
515  * Wake up threads waiting on a userland object.
516  */
517 
518 static int
519 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
520 {
521 	struct umtxq_chain *uc;
522 	struct umtxq_queue *uh;
523 	struct umtx_q *uq;
524 	int ret;
525 
526 	ret = 0;
527 	uc = umtxq_getchain(key);
528 	UMTXQ_LOCKED_ASSERT(uc);
529 	uh = umtxq_queue_lookup(key, q);
530 	if (uh != NULL) {
531 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
532 			umtxq_remove_queue(uq, q);
533 			wakeup(uq);
534 			if (++ret >= n_wake)
535 				return (ret);
536 		}
537 	}
538 	return (ret);
539 }
540 
541 
542 /*
543  * Wake up specified thread.
544  */
545 static inline void
546 umtxq_signal_thread(struct umtx_q *uq)
547 {
548 	struct umtxq_chain *uc;
549 
550 	uc = umtxq_getchain(&uq->uq_key);
551 	UMTXQ_LOCKED_ASSERT(uc);
552 	umtxq_remove(uq);
553 	wakeup(uq);
554 }
555 
556 static inline int
557 tstohz(const struct timespec *tsp)
558 {
559 	struct timeval tv;
560 
561 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
562 	return (tvtohz(&tv));
563 }
564 
565 static void
566 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
567 	const struct timespec *timeout)
568 {
569 
570 	timo->clockid = clockid;
571 	if (!absolute) {
572 		kern_clock_gettime(curthread, clockid, &timo->end);
573 		timo->cur = timo->end;
574 		timespecadd(&timo->end, timeout);
575 	} else {
576 		timo->end = *timeout;
577 		kern_clock_gettime(curthread, clockid, &timo->cur);
578 	}
579 }
580 
581 static void
582 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
583 {
584 
585 	abs_timeout_init(timo, umtxtime->_clockid,
586 		(umtxtime->_flags & UMTX_ABSTIME) != 0,
587 		&umtxtime->_timeout);
588 }
589 
590 static inline void
591 abs_timeout_update(struct abs_timeout *timo)
592 {
593 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
594 }
595 
596 static int
597 abs_timeout_gethz(struct abs_timeout *timo)
598 {
599 	struct timespec tts;
600 
601 	if (timespeccmp(&timo->end, &timo->cur, <=))
602 		return (-1);
603 	tts = timo->end;
604 	timespecsub(&tts, &timo->cur);
605 	return (tstohz(&tts));
606 }
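
/*
 * Illustrative sketch (not part of this file): how the abs_timeout
 * helpers cooperate inside a sleep loop such as umtxq_sleep() below:
 *
 *	struct abs_timeout timo;
 *
 *	abs_timeout_init(&timo, CLOCK_REALTIME, 0, &ts); // relative ts
 *	for (;;) {
 *		timo_hz = abs_timeout_gethz(&timo);	// -1 once expired
 *		if (timo_hz < 0)
 *			return (ETIMEDOUT);
 *		error = msleep(chan, mtx, PCATCH, wmesg, timo_hz);
 *		if (error != EWOULDBLOCK)
 *			break;
 *		abs_timeout_update(&timo);	// re-read the clock
 *	}
 */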
607 
608 /*
609  * Put the thread into a sleep state; before sleeping, check if
610  * the thread was removed from the umtx queue.
611  */
612 static inline int
613 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
614 {
615 	struct umtxq_chain *uc;
616 	int error, timo;
617 
618 	uc = umtxq_getchain(&uq->uq_key);
619 	UMTXQ_LOCKED_ASSERT(uc);
620 	for (;;) {
621 		if (!(uq->uq_flags & UQF_UMTXQ))
622 			return (0);
623 		if (abstime != NULL) {
624 			timo = abs_timeout_gethz(abstime);
625 			if (timo < 0)
626 				return (ETIMEDOUT);
627 		} else
628 			timo = 0;
629 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
630 		if (error != EWOULDBLOCK) {
631 			umtxq_lock(&uq->uq_key);
632 			break;
633 		}
634 		if (abstime != NULL)
635 			abs_timeout_update(abstime);
636 		umtxq_lock(&uq->uq_key);
637 	}
638 	return (error);
639 }
640 
641 /*
642  * Convert a userspace address into a unique logical address.
643  */
644 int
645 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
646 {
647 	struct thread *td = curthread;
648 	vm_map_t map;
649 	vm_map_entry_t entry;
650 	vm_pindex_t pindex;
651 	vm_prot_t prot;
652 	boolean_t wired;
653 
654 	key->type = type;
655 	if (share == THREAD_SHARE) {
656 		key->shared = 0;
657 		key->info.private.vs = td->td_proc->p_vmspace;
658 		key->info.private.addr = (uintptr_t)addr;
659 	} else {
660 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
661 		map = &td->td_proc->p_vmspace->vm_map;
662 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
663 		    &entry, &key->info.shared.object, &pindex, &prot,
664 		    &wired) != KERN_SUCCESS) {
665 			return (EFAULT);
666 		}
667 
668 		if ((share == PROCESS_SHARE) ||
669 		    (share == AUTO_SHARE &&
670 		     VM_INHERIT_SHARE == entry->inheritance)) {
671 			key->shared = 1;
672 			key->info.shared.offset = entry->offset + entry->start -
673 				(vm_offset_t)addr;
674 			vm_object_reference(key->info.shared.object);
675 		} else {
676 			key->shared = 0;
677 			key->info.private.vs = td->td_proc->p_vmspace;
678 			key->info.private.addr = (uintptr_t)addr;
679 		}
680 		vm_map_lookup_done(map, entry);
681 	}
682 
683 	umtxq_hash(key);
684 	return (0);
685 }
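
/*
 * Illustrative sketch (not part of this file): the resulting key
 * identity rules.  Two userland addresses name the same lock iff
 * their keys match:
 *
 *	private (THREAD_SHARE):  { current vmspace, virtual address }
 *	shared (PROCESS_SHARE, or AUTO_SHARE on a VM_INHERIT_SHARE
 *	mapping):                { backing VM object, offset }
 *
 * so a process-shared mutex mapped at different addresses in two
 * processes still resolves to the same wait-queue chain.
 */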
686 
687 /*
688  * Release key.
689  */
690 void
691 umtx_key_release(struct umtx_key *key)
692 {
693 	if (key->shared)
694 		vm_object_deallocate(key->info.shared.object);
695 }
696 
697 /*
698  * Lock a umtx object.
699  */
700 static int
701 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
702 	const struct timespec *timeout)
703 {
704 	struct abs_timeout timo;
705 	struct umtx_q *uq;
706 	u_long owner;
707 	u_long old;
708 	int error = 0;
709 
710 	uq = td->td_umtxq;
711 	if (timeout != NULL)
712 		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
713 
714 	/*
715 	 * Care must be exercised when dealing with the umtx structure.
716 	 * Any access can fault.
717 	 */
718 	for (;;) {
719 		/*
720 		 * Try the uncontested case.  This should be done in userland.
721 		 */
722 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
723 
724 		/* The acquire succeeded. */
725 		if (owner == UMTX_UNOWNED)
726 			return (0);
727 
728 		/* The address was invalid. */
729 		if (owner == -1)
730 			return (EFAULT);
731 
732 		/* If no one owns it but it is contested try to acquire it. */
733 		if (owner == UMTX_CONTESTED) {
734 			owner = casuword(&umtx->u_owner,
735 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
736 
737 			if (owner == UMTX_CONTESTED)
738 				return (0);
739 
740 			/* The address was invalid. */
741 			if (owner == -1)
742 				return (EFAULT);
743 
744 			/* If this failed the lock has changed, restart. */
745 			continue;
746 		}
747 
748 		/*
749 		 * If we caught a signal, we have already retried and now
750 		 * exit immediately.
751 		 */
752 		if (error != 0)
753 			break;
754 
755 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
756 			AUTO_SHARE, &uq->uq_key)) != 0)
757 			return (error);
758 
759 		umtxq_lock(&uq->uq_key);
760 		umtxq_busy(&uq->uq_key);
761 		umtxq_insert(uq);
762 		umtxq_unbusy(&uq->uq_key);
763 		umtxq_unlock(&uq->uq_key);
764 
765 		/*
766 		 * Set the contested bit so that a release in user space
767 		 * knows to use the system call for unlock.  If this fails,
768 		 * either someone else has acquired the lock or it has been
769 		 * released.
770 		 */
771 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
772 
773 		/* The address was invalid. */
774 		if (old == -1) {
775 			umtxq_lock(&uq->uq_key);
776 			umtxq_remove(uq);
777 			umtxq_unlock(&uq->uq_key);
778 			umtx_key_release(&uq->uq_key);
779 			return (EFAULT);
780 		}
781 
782 		/*
783 		 * If we set the contested bit, sleep.  Otherwise the lock
784 		 * changed and we need to retry, or we lost a race to the
785 		 * thread unlocking the umtx.
786 		 */
787 		umtxq_lock(&uq->uq_key);
788 		if (old == owner)
789 			error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
790 			    &timo);
791 		umtxq_remove(uq);
792 		umtxq_unlock(&uq->uq_key);
793 		umtx_key_release(&uq->uq_key);
794 	}
795 
796 	if (timeout == NULL) {
797 		/* Mutex locking is restarted if it is interrupted. */
798 		if (error == EINTR)
799 			error = ERESTART;
800 	} else {
801 		/* Timed-locking is not restarted. */
802 		if (error == ERESTART)
803 			error = EINTR;
804 	}
805 	return (error);
806 }
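
/*
 * Illustrative sketch (not part of this file): the userland fast path
 * that the "should be done in userland" comments refer to.  A
 * hypothetical libthr-style lock would be:
 *
 *	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id))
 *		return (0);		// uncontested, no syscall
 *	return (_umtx_lock(umtx));	// contested, enters do_lock_umtx()
 */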
807 
808 /*
809  * Unlock a umtx object.
810  */
811 static int
812 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
813 {
814 	struct umtx_key key;
815 	u_long owner;
816 	u_long old;
817 	int error;
818 	int count;
819 
820 	/*
821 	 * Make sure we own this mtx.
822 	 */
823 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
824 	if (owner == -1)
825 		return (EFAULT);
826 
827 	if ((owner & ~UMTX_CONTESTED) != id)
828 		return (EPERM);
829 
830 	/* This should be done in userland */
831 	if ((owner & UMTX_CONTESTED) == 0) {
832 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
833 		if (old == -1)
834 			return (EFAULT);
835 		if (old == owner)
836 			return (0);
837 		owner = old;
838 	}
839 
840 	/* We should only ever be in here for contested locks */
841 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
842 		&key)) != 0)
843 		return (error);
844 
845 	umtxq_lock(&key);
846 	umtxq_busy(&key);
847 	count = umtxq_count(&key);
848 	umtxq_unlock(&key);
849 
850 	/*
851 	 * When unlocking the umtx, it must be marked as unowned if
852 	 * at most one thread is waiting for it.
853 	 * Otherwise, it must be marked as contested.
854 	 */
855 	old = casuword(&umtx->u_owner, owner,
856 		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
857 	umtxq_lock(&key);
858 	umtxq_signal(&key, 1);
859 	umtxq_unbusy(&key);
860 	umtxq_unlock(&key);
861 	umtx_key_release(&key);
862 	if (old == -1)
863 		return (EFAULT);
864 	if (old != owner)
865 		return (EINVAL);
866 	return (0);
867 }
868 
869 #ifdef COMPAT_FREEBSD32
870 
871 /*
872  * Lock a umtx object.
873  */
874 static int
875 do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
876 	const struct timespec *timeout)
877 {
878 	struct abs_timeout timo;
879 	struct umtx_q *uq;
880 	uint32_t owner;
881 	uint32_t old;
882 	int error = 0;
883 
884 	uq = td->td_umtxq;
885 
886 	if (timeout != NULL)
887 		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
888 
889 	/*
890 	 * Care must be exercised when dealing with the umtx structure.
891 	 * Any access can fault.
892 	 */
893 	for (;;) {
894 		/*
895 		 * Try the uncontested case.  This should be done in userland.
896 		 */
897 		owner = casuword32(m, UMUTEX_UNOWNED, id);
898 
899 		/* The acquire succeeded. */
900 		if (owner == UMUTEX_UNOWNED)
901 			return (0);
902 
903 		/* The address was invalid. */
904 		if (owner == -1)
905 			return (EFAULT);
906 
907 		/* If no one owns it but it is contested try to acquire it. */
908 		if (owner == UMUTEX_CONTESTED) {
909 			owner = casuword32(m,
910 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
911 			if (owner == UMUTEX_CONTESTED)
912 				return (0);
913 
914 			/* The address was invalid. */
915 			if (owner == -1)
916 				return (EFAULT);
917 
918 			/* If this failed the lock has changed, restart. */
919 			continue;
920 		}
921 
922 		/*
923 		 * If we caught a signal, we have already retried and now
924 		 * exit immediately.
925 		 */
926 		if (error != 0)
927 			return (error);
928 
929 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
930 			AUTO_SHARE, &uq->uq_key)) != 0)
931 			return (error);
932 
933 		umtxq_lock(&uq->uq_key);
934 		umtxq_busy(&uq->uq_key);
935 		umtxq_insert(uq);
936 		umtxq_unbusy(&uq->uq_key);
937 		umtxq_unlock(&uq->uq_key);
938 
939 		/*
940 		 * Set the contested bit so that a release in user space
941 		 * knows to use the system call for unlock.  If this fails,
942 		 * either someone else has acquired the lock or it has been
943 		 * released.
944 		 */
945 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
946 
947 		/* The address was invalid. */
948 		if (old == -1) {
949 			umtxq_lock(&uq->uq_key);
950 			umtxq_remove(uq);
951 			umtxq_unlock(&uq->uq_key);
952 			umtx_key_release(&uq->uq_key);
953 			return (EFAULT);
954 		}
955 
956 		/*
957 		 * If we set the contested bit, sleep.  Otherwise the lock
958 		 * changed and we need to retry, or we lost a race to the
959 		 * thread unlocking the umtx.
960 		 */
961 		umtxq_lock(&uq->uq_key);
962 		if (old == owner)
963 			error = umtxq_sleep(uq, "umtx", timeout == NULL ?
964 			    NULL : &timo);
965 		umtxq_remove(uq);
966 		umtxq_unlock(&uq->uq_key);
967 		umtx_key_release(&uq->uq_key);
968 	}
969 
970 	if (timeout == NULL) {
971 		/* Mutex locking is restarted if it is interrupted. */
972 		if (error == EINTR)
973 			error = ERESTART;
974 	} else {
975 		/* Timed-locking is not restarted. */
976 		if (error == ERESTART)
977 			error = EINTR;
978 	}
979 	return (error);
980 }
981 
982 /*
983  * Unlock a umtx object.
984  */
985 static int
986 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
987 {
988 	struct umtx_key key;
989 	uint32_t owner;
990 	uint32_t old;
991 	int error;
992 	int count;
993 
994 	/*
995 	 * Make sure we own this mtx.
996 	 */
997 	owner = fuword32(m);
998 	if (owner == -1)
999 		return (EFAULT);
1000 
1001 	if ((owner & ~UMUTEX_CONTESTED) != id)
1002 		return (EPERM);
1003 
1004 	/* This should be done in userland */
1005 	if ((owner & UMUTEX_CONTESTED) == 0) {
1006 		old = casuword32(m, owner, UMUTEX_UNOWNED);
1007 		if (old == -1)
1008 			return (EFAULT);
1009 		if (old == owner)
1010 			return (0);
1011 		owner = old;
1012 	}
1013 
1014 	/* We should only ever be in here for contested locks */
1015 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
1016 		&key)) != 0)
1017 		return (error);
1018 
1019 	umtxq_lock(&key);
1020 	umtxq_busy(&key);
1021 	count = umtxq_count(&key);
1022 	umtxq_unlock(&key);
1023 
1024 	/*
1025 	 * When unlocking the umtx, it must be marked as unowned if
1026 	 * at most one thread is waiting for it.
1027 	 * Otherwise, it must be marked as contested.
1028 	 */
1029 	old = casuword32(m, owner,
1030 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1031 	umtxq_lock(&key);
1032 	umtxq_signal(&key, 1);
1033 	umtxq_unbusy(&key);
1034 	umtxq_unlock(&key);
1035 	umtx_key_release(&key);
1036 	if (old == -1)
1037 		return (EFAULT);
1038 	if (old != owner)
1039 		return (EINVAL);
1040 	return (0);
1041 }
1042 #endif
1043 
1044 /*
1045  * Fetch and compare the value; sleep on the address if it has not changed.
1046  */
1047 static int
1048 do_wait(struct thread *td, void *addr, u_long id,
1049 	struct _umtx_time *timeout, int compat32, int is_private)
1050 {
1051 	struct abs_timeout timo;
1052 	struct umtx_q *uq;
1053 	u_long tmp;
1054 	int error = 0;
1055 
1056 	uq = td->td_umtxq;
1057 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
1058 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
1059 		return (error);
1060 
1061 	if (timeout != NULL)
1062 		abs_timeout_init2(&timo, timeout);
1063 
1064 	umtxq_lock(&uq->uq_key);
1065 	umtxq_insert(uq);
1066 	umtxq_unlock(&uq->uq_key);
1067 	if (compat32 == 0)
1068 		tmp = fuword(addr);
1069 	else
1070 		tmp = (unsigned int)fuword32(addr);
1071 	umtxq_lock(&uq->uq_key);
1072 	if (tmp == id)
1073 		error = umtxq_sleep(uq, "uwait", timeout == NULL ?
1074 		    NULL : &timo);
1075 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
1076 		error = 0;
1077 	else
1078 		umtxq_remove(uq);
1079 	umtxq_unlock(&uq->uq_key);
1080 	umtx_key_release(&uq->uq_key);
1081 	if (error == ERESTART)
1082 		error = EINTR;
1083 	return (error);
1084 }
1085 
1086 /*
1087  * Wake up threads sleeping on the specified address.
1088  */
1089 int
1090 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1091 {
1092 	struct umtx_key key;
1093 	int ret;
1094 
1095 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1096 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1097 		return (ret);
1098 	umtxq_lock(&key);
1099 	ret = umtxq_signal(&key, n_wake);
1100 	umtxq_unlock(&key);
1101 	umtx_key_release(&key);
1102 	return (0);
1103 }
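
/*
 * Illustrative sketch (not part of this file): do_wait() and
 * kern_umtx_wake() provide futex-style wait/wake.  A hypothetical
 * userland consumer/producer pair:
 *
 *	while (atomic_load_acq_int(&flag) == 0)			// consumer
 *		_umtx_op(&flag, UMTX_OP_WAIT_UINT, 0, NULL, NULL);
 *
 *	atomic_store_rel_int(&flag, 1);				// producer
 *	_umtx_op(&flag, UMTX_OP_WAKE, 1, NULL, NULL);
 *
 * The kernel enqueues the waiter before re-reading the word, so a wake
 * issued between the userland test and the sleep is not lost.
 */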
1104 
1105 /*
1106  * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1107  */
1108 static int
1109 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
1110 	struct _umtx_time *timeout, int mode)
1111 {
1112 	struct abs_timeout timo;
1113 	struct umtx_q *uq;
1114 	uint32_t owner, old, id;
1115 	int error = 0;
1116 
1117 	id = td->td_tid;
1118 	uq = td->td_umtxq;
1119 
1120 	if (timeout != NULL)
1121 		abs_timeout_init2(&timo, timeout);
1122 
1123 	/*
1124 	 * Care must be exercised when dealing with the umtx structure.
1125 	 * Any access can fault.
1126 	 */
1127 	for (;;) {
1128 		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
1129 		if (mode == _UMUTEX_WAIT) {
1130 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1131 				return (0);
1132 		} else {
1133 			/*
1134 			 * Try the uncontested case.  This should be done in userland.
1135 			 */
1136 			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1137 
1138 			/* The acquire succeeded. */
1139 			if (owner == UMUTEX_UNOWNED)
1140 				return (0);
1141 
1142 			/* The address was invalid. */
1143 			if (owner == -1)
1144 				return (EFAULT);
1145 
1146 			/* If no one owns it but it is contested try to acquire it. */
1147 			if (owner == UMUTEX_CONTESTED) {
1148 				owner = casuword32(&m->m_owner,
1149 				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1150 
1151 				if (owner == UMUTEX_CONTESTED)
1152 					return (0);
1153 
1154 				/* The address was invalid. */
1155 				if (owner == -1)
1156 					return (EFAULT);
1157 
1158 				/* If this failed the lock has changed, restart. */
1159 				continue;
1160 			}
1161 		}
1162 
1163 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1164 		    (owner & ~UMUTEX_CONTESTED) == id)
1165 			return (EDEADLK);
1166 
1167 		if (mode == _UMUTEX_TRY)
1168 			return (EBUSY);
1169 
1170 		/*
1171 		 * If we caught a signal, we have already retried and now
1172 		 * exit immediately.
1173 		 */
1174 		if (error != 0)
1175 			return (error);
1176 
1177 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1178 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1179 			return (error);
1180 
1181 		umtxq_lock(&uq->uq_key);
1182 		umtxq_busy(&uq->uq_key);
1183 		umtxq_insert(uq);
1184 		umtxq_unlock(&uq->uq_key);
1185 
1186 		/*
1187 		 * Set the contested bit so that a release in user space
1188 		 * knows to use the system call for unlock.  If this fails,
1189 		 * either someone else has acquired the lock or it has been
1190 		 * released.
1191 		 */
1192 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1193 
1194 		/* The address was invalid. */
1195 		if (old == -1) {
1196 			umtxq_lock(&uq->uq_key);
1197 			umtxq_remove(uq);
1198 			umtxq_unbusy(&uq->uq_key);
1199 			umtxq_unlock(&uq->uq_key);
1200 			umtx_key_release(&uq->uq_key);
1201 			return (EFAULT);
1202 		}
1203 
1204 		/*
1205 		 * If we set the contested bit, sleep.  Otherwise the lock
1206 		 * changed and we need to retry, or we lost a race to the
1207 		 * thread unlocking the umtx.
1208 		 */
1209 		umtxq_lock(&uq->uq_key);
1210 		umtxq_unbusy(&uq->uq_key);
1211 		if (old == owner)
1212 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1213 			    NULL : &timo);
1214 		umtxq_remove(uq);
1215 		umtxq_unlock(&uq->uq_key);
1216 		umtx_key_release(&uq->uq_key);
1217 	}
1218 
1219 	return (0);
1220 }
1221 
1222 /*
1223  * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1224  */
1225 static int
1226 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1227 {
1228 	struct umtx_key key;
1229 	uint32_t owner, old, id;
1230 	int error;
1231 	int count;
1232 
1233 	id = td->td_tid;
1234 	/*
1235 	 * Make sure we own this mtx.
1236 	 */
1237 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1238 	if (owner == -1)
1239 		return (EFAULT);
1240 
1241 	if ((owner & ~UMUTEX_CONTESTED) != id)
1242 		return (EPERM);
1243 
1244 	if ((owner & UMUTEX_CONTESTED) == 0) {
1245 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1246 		if (old == -1)
1247 			return (EFAULT);
1248 		if (old == owner)
1249 			return (0);
1250 		owner = old;
1251 	}
1252 
1253 	/* We should only ever be in here for contested locks */
1254 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1255 	    &key)) != 0)
1256 		return (error);
1257 
1258 	umtxq_lock(&key);
1259 	umtxq_busy(&key);
1260 	count = umtxq_count(&key);
1261 	umtxq_unlock(&key);
1262 
1263 	/*
1264 	 * When unlocking the umtx, it must be marked as unowned if
1265 	 * at most one thread is waiting for it.
1266 	 * Otherwise, it must be marked as contested.
1267 	 */
1268 	old = casuword32(&m->m_owner, owner,
1269 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1270 	umtxq_lock(&key);
1271 	umtxq_signal(&key, 1);
1272 	umtxq_unbusy(&key);
1273 	umtxq_unlock(&key);
1274 	umtx_key_release(&key);
1275 	if (old == -1)
1276 		return (EFAULT);
1277 	if (old != owner)
1278 		return (EINVAL);
1279 	return (0);
1280 }
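
/*
 * Illustrative sketch (not part of this file): the m_owner word of a
 * normal umutex as a state machine (id == owner's thread id):
 *
 *	UMUTEX_UNOWNED    --cas(id)-->              id
 *	id                --waiter cas-->           id | UMUTEX_CONTESTED
 *	id | CONTESTED    --unlock, >1 waiters-->   UMUTEX_CONTESTED
 *	id | CONTESTED    --unlock, <=1 waiter-->   UMUTEX_UNOWNED
 *
 * Only transitions out of the uncontested states can be done in
 * userland; everything involving the contested bit goes through the
 * kernel so that sleepers are woken.
 */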
1281 
1282 /*
1283  * Check if the mutex is available and wake up a waiter;
1284  * this applies to simple (PTHREAD_PRIO_NONE) mutexes only.
1285  */
1286 static int
1287 do_wake_umutex(struct thread *td, struct umutex *m)
1288 {
1289 	struct umtx_key key;
1290 	uint32_t owner;
1291 	uint32_t flags;
1292 	int error;
1293 	int count;
1294 
1295 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1296 	if (owner == -1)
1297 		return (EFAULT);
1298 
1299 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1300 		return (0);
1301 
1302 	flags = fuword32(&m->m_flags);
1303 
1304 	/* We should only ever be in here for contested locks */
1305 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1306 	    &key)) != 0)
1307 		return (error);
1308 
1309 	umtxq_lock(&key);
1310 	umtxq_busy(&key);
1311 	count = umtxq_count(&key);
1312 	umtxq_unlock(&key);
1313 
1314 	if (count <= 1)
1315 		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1316 
1317 	umtxq_lock(&key);
1318 	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1319 		umtxq_signal(&key, 1);
1320 	umtxq_unbusy(&key);
1321 	umtxq_unlock(&key);
1322 	umtx_key_release(&key);
1323 	return (0);
1324 }
1325 
1326 /*
1327  * Check if the mutex has waiters and try to fix the contention bit.
1328  */
1329 static int
1330 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1331 {
1332 	struct umtx_key key;
1333 	uint32_t owner, old;
1334 	int type;
1335 	int error;
1336 	int count;
1337 
1338 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1339 	case 0:
1340 		type = TYPE_NORMAL_UMUTEX;
1341 		break;
1342 	case UMUTEX_PRIO_INHERIT:
1343 		type = TYPE_PI_UMUTEX;
1344 		break;
1345 	case UMUTEX_PRIO_PROTECT:
1346 		type = TYPE_PP_UMUTEX;
1347 		break;
1348 	default:
1349 		return (EINVAL);
1350 	}
1351 	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
1352 	    &key)) != 0)
1353 		return (error);
1354 
1355 	owner = 0;
1356 	umtxq_lock(&key);
1357 	umtxq_busy(&key);
1358 	count = umtxq_count(&key);
1359 	umtxq_unlock(&key);
1360 	/*
1361 	 * Only repair the contention bit if there is a waiter; this means
1362 	 * the mutex is still being referenced by userland code.  Otherwise,
1363 	 * don't update any memory.
1364 	 */
1365 	if (count > 1) {
1366 		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1367 		while ((owner & UMUTEX_CONTESTED) == 0) {
1368 			old = casuword32(&m->m_owner, owner,
1369 			    owner | UMUTEX_CONTESTED);
1370 			if (old == owner)
1371 				break;
1372 			owner = old;
1373 		}
1374 	} else if (count == 1) {
1375 		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1376 		while ((owner & ~UMUTEX_CONTESTED) != 0 &&
1377 		       (owner & UMUTEX_CONTESTED) == 0) {
1378 			old = casuword32(&m->m_owner, owner,
1379 			    owner | UMUTEX_CONTESTED);
1380 			if (old == owner)
1381 				break;
1382 			owner = old;
1383 		}
1384 	}
1385 	umtxq_lock(&key);
1386 	if (owner == -1) {
1387 		error = EFAULT;
1388 		umtxq_signal(&key, INT_MAX);
1389 	} else if (count != 0 &&
1390 	    (owner & ~UMUTEX_CONTESTED) == 0)
1391 		umtxq_signal(&key, 1);
1392 	umtxq_unbusy(&key);
1393 	umtxq_unlock(&key);
1394 	umtx_key_release(&key);
1395 	return (error);
1396 }
1397 
1398 static inline struct umtx_pi *
1399 umtx_pi_alloc(int flags)
1400 {
1401 	struct umtx_pi *pi;
1402 
1403 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1404 	TAILQ_INIT(&pi->pi_blocked);
1405 	atomic_add_int(&umtx_pi_allocated, 1);
1406 	return (pi);
1407 }
1408 
1409 static inline void
1410 umtx_pi_free(struct umtx_pi *pi)
1411 {
1412 	uma_zfree(umtx_pi_zone, pi);
1413 	atomic_add_int(&umtx_pi_allocated, -1);
1414 }
1415 
1416 /*
1417  * Adjust the thread's position on a umtx_pi blocked queue after its
1418  * priority has been changed.
1419  */
1420 static int
1421 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1422 {
1423 	struct umtx_q *uq, *uq1, *uq2;
1424 	struct thread *td1;
1425 
1426 	mtx_assert(&umtx_lock, MA_OWNED);
1427 	if (pi == NULL)
1428 		return (0);
1429 
1430 	uq = td->td_umtxq;
1431 
1432 	/*
1433 	 * Check if the thread needs to be moved on the blocked chain.
1434 	 * It needs to be moved if either its priority is lower than
1435 	 * the previous thread or higher than the next thread.
1436 	 */
1437 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1438 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1439 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1440 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1441 		/*
1442 		 * Remove thread from blocked chain and determine where
1443 		 * it should be moved to.
1444 		 */
1445 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1446 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1447 			td1 = uq1->uq_thread;
1448 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1449 			if (UPRI(td1) > UPRI(td))
1450 				break;
1451 		}
1452 
1453 		if (uq1 == NULL)
1454 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1455 		else
1456 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1457 	}
1458 	return (1);
1459 }
1460 
1461 /*
1462  * Propagate priority when a thread is blocked on a POSIX
1463  * PI mutex.
1464  */
1465 static void
1466 umtx_propagate_priority(struct thread *td)
1467 {
1468 	struct umtx_q *uq;
1469 	struct umtx_pi *pi;
1470 	int pri;
1471 
1472 	mtx_assert(&umtx_lock, MA_OWNED);
1473 	pri = UPRI(td);
1474 	uq = td->td_umtxq;
1475 	pi = uq->uq_pi_blocked;
1476 	if (pi == NULL)
1477 		return;
1478 
1479 	for (;;) {
1480 		td = pi->pi_owner;
1481 		if (td == NULL || td == curthread)
1482 			return;
1483 
1484 		MPASS(td->td_proc != NULL);
1485 		MPASS(td->td_proc->p_magic == P_MAGIC);
1486 
1487 		thread_lock(td);
1488 		if (td->td_lend_user_pri > pri)
1489 			sched_lend_user_prio(td, pri);
1490 		else {
1491 			thread_unlock(td);
1492 			break;
1493 		}
1494 		thread_unlock(td);
1495 
1496 		/*
1497 		 * Pick up the lock that td is blocked on.
1498 		 */
1499 		uq = td->td_umtxq;
1500 		pi = uq->uq_pi_blocked;
1501 		if (pi == NULL)
1502 			break;
1503 		/* Resort td on the list if needed. */
1504 		umtx_pi_adjust_thread(pi, td);
1505 	}
1506 }
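
/*
 * Illustrative sketch (not part of this file): propagation walks the
 * blocking chain (lower value == better priority).  If C (pri 100)
 * blocks on pi1 owned by B (pri 130), and B is blocked on pi2 owned
 * by A (pri 160):
 *
 *	C(100) -> pi1 -> B(130) -> pi2 -> A(160)
 *
 * then B and A are both lent priority 100.  The loop stops at the
 * first owner that is not blocked, or whose lent priority is already
 * equal or better.
 */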
1507 
1508 /*
1509  * Unpropagate priority for a PI mutex when a thread blocked on
1510  * it is interrupted by a signal or resumed by others.
1511  */
1512 static void
1513 umtx_repropagate_priority(struct umtx_pi *pi)
1514 {
1515 	struct umtx_q *uq, *uq_owner;
1516 	struct umtx_pi *pi2;
1517 	int pri;
1518 
1519 	mtx_assert(&umtx_lock, MA_OWNED);
1520 
1521 	while (pi != NULL && pi->pi_owner != NULL) {
1522 		pri = PRI_MAX;
1523 		uq_owner = pi->pi_owner->td_umtxq;
1524 
1525 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1526 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1527 			if (uq != NULL) {
1528 				if (pri > UPRI(uq->uq_thread))
1529 					pri = UPRI(uq->uq_thread);
1530 			}
1531 		}
1532 
1533 		if (pri > uq_owner->uq_inherited_pri)
1534 			pri = uq_owner->uq_inherited_pri;
1535 		thread_lock(pi->pi_owner);
1536 		sched_lend_user_prio(pi->pi_owner, pri);
1537 		thread_unlock(pi->pi_owner);
1538 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1539 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1540 	}
1541 }
1542 
1543 /*
1544  * Insert a PI mutex into its owner's list.
1545  */
1546 static void
1547 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1548 {
1549 	struct umtx_q *uq_owner;
1550 
1551 	uq_owner = owner->td_umtxq;
1552 	mtx_assert(&umtx_lock, MA_OWNED);
1553 	if (pi->pi_owner != NULL)
1554 		panic("pi_owner != NULL");
1555 	pi->pi_owner = owner;
1556 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1557 }
1558 
1559 /*
1560  * Claim ownership of a PI mutex.
1561  */
1562 static int
1563 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1564 {
1565 	struct umtx_q *uq, *uq_owner;
1566 
1567 	uq_owner = owner->td_umtxq;
1568 	mtx_lock_spin(&umtx_lock);
1569 	if (pi->pi_owner == owner) {
1570 		mtx_unlock_spin(&umtx_lock);
1571 		return (0);
1572 	}
1573 
1574 	if (pi->pi_owner != NULL) {
1575 		/*
1576 		 * Userland may have already messed up the mutex, sigh.
1577 		 */
1578 		mtx_unlock_spin(&umtx_lock);
1579 		return (EPERM);
1580 	}
1581 	umtx_pi_setowner(pi, owner);
1582 	uq = TAILQ_FIRST(&pi->pi_blocked);
1583 	if (uq != NULL) {
1584 		int pri;
1585 
1586 		pri = UPRI(uq->uq_thread);
1587 		thread_lock(owner);
1588 		if (pri < UPRI(owner))
1589 			sched_lend_user_prio(owner, pri);
1590 		thread_unlock(owner);
1591 	}
1592 	mtx_unlock_spin(&umtx_lock);
1593 	return (0);
1594 }
1595 
1596 /*
1597  * Adjust a thread's position in the blocked queue of the PI mutex it
1598  * sleeps on; this may start a new round of priority propagation.
1599  */
1600 void
1601 umtx_pi_adjust(struct thread *td, u_char oldpri)
1602 {
1603 	struct umtx_q *uq;
1604 	struct umtx_pi *pi;
1605 
1606 	uq = td->td_umtxq;
1607 	mtx_lock_spin(&umtx_lock);
1608 	/*
1609 	 * Pick up the lock that td is blocked on.
1610 	 */
1611 	pi = uq->uq_pi_blocked;
1612 	if (pi != NULL) {
1613 		umtx_pi_adjust_thread(pi, td);
1614 		umtx_repropagate_priority(pi);
1615 	}
1616 	mtx_unlock_spin(&umtx_lock);
1617 }
1618 
1619 /*
1620  * Sleep on a PI mutex.
1621  */
1622 static int
1623 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1624 	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
1625 {
1626 	struct umtxq_chain *uc;
1627 	struct thread *td, *td1;
1628 	struct umtx_q *uq1;
1629 	int pri;
1630 	int error = 0;
1631 
1632 	td = uq->uq_thread;
1633 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1634 	uc = umtxq_getchain(&uq->uq_key);
1635 	UMTXQ_LOCKED_ASSERT(uc);
1636 	UMTXQ_BUSY_ASSERT(uc);
1637 	umtxq_insert(uq);
1638 	mtx_lock_spin(&umtx_lock);
1639 	if (pi->pi_owner == NULL) {
1640 		mtx_unlock_spin(&umtx_lock);
1641 		/* XXX Only look up thread in current process. */
1642 		td1 = tdfind(owner, curproc->p_pid);
1643 		mtx_lock_spin(&umtx_lock);
1644 		if (td1 != NULL) {
1645 			if (pi->pi_owner == NULL)
1646 				umtx_pi_setowner(pi, td1);
1647 			PROC_UNLOCK(td1->td_proc);
1648 		}
1649 	}
1650 
1651 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1652 		pri = UPRI(uq1->uq_thread);
1653 		if (pri > UPRI(td))
1654 			break;
1655 	}
1656 
1657 	if (uq1 != NULL)
1658 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1659 	else
1660 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1661 
1662 	uq->uq_pi_blocked = pi;
1663 	thread_lock(td);
1664 	td->td_flags |= TDF_UPIBLOCKED;
1665 	thread_unlock(td);
1666 	umtx_propagate_priority(td);
1667 	mtx_unlock_spin(&umtx_lock);
1668 	umtxq_unbusy(&uq->uq_key);
1669 
1670 	error = umtxq_sleep(uq, wmesg, timo);
1671 	umtxq_remove(uq);
1672 
1673 	mtx_lock_spin(&umtx_lock);
1674 	uq->uq_pi_blocked = NULL;
1675 	thread_lock(td);
1676 	td->td_flags &= ~TDF_UPIBLOCKED;
1677 	thread_unlock(td);
1678 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1679 	umtx_repropagate_priority(pi);
1680 	mtx_unlock_spin(&umtx_lock);
1681 	umtxq_unlock(&uq->uq_key);
1682 
1683 	return (error);
1684 }
1685 
1686 /*
1687  * Add a reference to a PI mutex.
1688  */
1689 static void
1690 umtx_pi_ref(struct umtx_pi *pi)
1691 {
1692 	struct umtxq_chain *uc;
1693 
1694 	uc = umtxq_getchain(&pi->pi_key);
1695 	UMTXQ_LOCKED_ASSERT(uc);
1696 	pi->pi_refcount++;
1697 }
1698 
1699 /*
1700  * Drop a reference to a PI mutex; if the reference count
1701  * reaches zero, its memory is freed.
1702  */
1703 static void
1704 umtx_pi_unref(struct umtx_pi *pi)
1705 {
1706 	struct umtxq_chain *uc;
1707 
1708 	uc = umtxq_getchain(&pi->pi_key);
1709 	UMTXQ_LOCKED_ASSERT(uc);
1710 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1711 	if (--pi->pi_refcount == 0) {
1712 		mtx_lock_spin(&umtx_lock);
1713 		if (pi->pi_owner != NULL) {
1714 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1715 				pi, pi_link);
1716 			pi->pi_owner = NULL;
1717 		}
1718 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1719 			("blocked queue not empty"));
1720 		mtx_unlock_spin(&umtx_lock);
1721 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1722 		umtx_pi_free(pi);
1723 	}
1724 }
1725 
1726 /*
1727  * Find a PI mutex in the hash table.
1728  */
1729 static struct umtx_pi *
1730 umtx_pi_lookup(struct umtx_key *key)
1731 {
1732 	struct umtxq_chain *uc;
1733 	struct umtx_pi *pi;
1734 
1735 	uc = umtxq_getchain(key);
1736 	UMTXQ_LOCKED_ASSERT(uc);
1737 
1738 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1739 		if (umtx_key_match(&pi->pi_key, key)) {
1740 			return (pi);
1741 		}
1742 	}
1743 	return (NULL);
1744 }
1745 
1746 /*
1747  * Insert a PI mutex into the hash table.
1748  */
1749 static inline void
1750 umtx_pi_insert(struct umtx_pi *pi)
1751 {
1752 	struct umtxq_chain *uc;
1753 
1754 	uc = umtxq_getchain(&pi->pi_key);
1755 	UMTXQ_LOCKED_ASSERT(uc);
1756 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1757 }
1758 
1759 /*
1760  * Lock a PI mutex.
1761  */
1762 static int
1763 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1764     struct _umtx_time *timeout, int try)
1765 {
1766 	struct abs_timeout timo;
1767 	struct umtx_q *uq;
1768 	struct umtx_pi *pi, *new_pi;
1769 	uint32_t id, owner, old;
1770 	int error;
1771 
1772 	id = td->td_tid;
1773 	uq = td->td_umtxq;
1774 
1775 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1776 	    &uq->uq_key)) != 0)
1777 		return (error);
1778 
1779 	if (timeout != NULL)
1780 		abs_timeout_init2(&timo, timeout);
1781 
1782 	umtxq_lock(&uq->uq_key);
1783 	pi = umtx_pi_lookup(&uq->uq_key);
1784 	if (pi == NULL) {
1785 		new_pi = umtx_pi_alloc(M_NOWAIT);
1786 		if (new_pi == NULL) {
1787 			umtxq_unlock(&uq->uq_key);
1788 			new_pi = umtx_pi_alloc(M_WAITOK);
1789 			umtxq_lock(&uq->uq_key);
1790 			pi = umtx_pi_lookup(&uq->uq_key);
1791 			if (pi != NULL) {
1792 				umtx_pi_free(new_pi);
1793 				new_pi = NULL;
1794 			}
1795 		}
1796 		if (new_pi != NULL) {
1797 			new_pi->pi_key = uq->uq_key;
1798 			umtx_pi_insert(new_pi);
1799 			pi = new_pi;
1800 		}
1801 	}
1802 	umtx_pi_ref(pi);
1803 	umtxq_unlock(&uq->uq_key);
1804 
1805 	/*
1806 	 * Care must be exercised when dealing with the umtx structure.
1807 	 * Any access can fault.
1808 	 */
1809 	for (;;) {
1810 		/*
1811 		 * Try the uncontested case.  This should be done in userland.
1812 		 */
1813 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1814 
1815 		/* The acquire succeeded. */
1816 		if (owner == UMUTEX_UNOWNED) {
1817 			error = 0;
1818 			break;
1819 		}
1820 
1821 		/* The address was invalid. */
1822 		if (owner == -1) {
1823 			error = EFAULT;
1824 			break;
1825 		}
1826 
1827 		/* If no one owns it but it is contested try to acquire it. */
1828 		if (owner == UMUTEX_CONTESTED) {
1829 			owner = casuword32(&m->m_owner,
1830 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1831 
1832 			if (owner == UMUTEX_CONTESTED) {
1833 				umtxq_lock(&uq->uq_key);
1834 				umtxq_busy(&uq->uq_key);
1835 				error = umtx_pi_claim(pi, td);
1836 				umtxq_unbusy(&uq->uq_key);
1837 				umtxq_unlock(&uq->uq_key);
1838 				break;
1839 			}
1840 
1841 			/* The address was invalid. */
1842 			if (owner == -1) {
1843 				error = EFAULT;
1844 				break;
1845 			}
1846 
1847 			/* If this failed the lock has changed, restart. */
1848 			continue;
1849 		}
1850 
1851 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1852 		    (owner & ~UMUTEX_CONTESTED) == id) {
1853 			error = EDEADLK;
1854 			break;
1855 		}
1856 
1857 		if (try != 0) {
1858 			error = EBUSY;
1859 			break;
1860 		}
1861 
1862 		/*
1863 		 * If we caught a signal, we have already retried and now
1864 		 * exit immediately.
1865 		 */
1866 		if (error != 0)
1867 			break;
1868 
1869 		umtxq_lock(&uq->uq_key);
1870 		umtxq_busy(&uq->uq_key);
1871 		umtxq_unlock(&uq->uq_key);
1872 
1873 		/*
1874 		 * Set the contested bit so that a release in user space
1875 		 * knows to use the system call for unlock.  If this fails,
1876 		 * either someone else has acquired the lock or it has been
1877 		 * released.
1878 		 */
1879 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1880 
1881 		/* The address was invalid. */
1882 		if (old == -1) {
1883 			umtxq_lock(&uq->uq_key);
1884 			umtxq_unbusy(&uq->uq_key);
1885 			umtxq_unlock(&uq->uq_key);
1886 			error = EFAULT;
1887 			break;
1888 		}
1889 
1890 		umtxq_lock(&uq->uq_key);
1891 		/*
1892 		 * If we set the contested bit, sleep.  Otherwise the lock
1893 		 * changed and we need to retry, or we lost a race to the
1894 		 * thread unlocking the umtx.
1895 		 */
1896 		if (old == owner)
1897 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1898 			    "umtxpi", timeout == NULL ? NULL : &timo);
1899 		else {
1900 			umtxq_unbusy(&uq->uq_key);
1901 			umtxq_unlock(&uq->uq_key);
1902 		}
1903 	}
1904 
1905 	umtxq_lock(&uq->uq_key);
1906 	umtx_pi_unref(pi);
1907 	umtxq_unlock(&uq->uq_key);
1908 
1909 	umtx_key_release(&uq->uq_key);
1910 	return (error);
1911 }
1912 
1913 /*
1914  * Unlock a PI mutex.
1915  */
1916 static int
1917 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1918 {
1919 	struct umtx_key key;
1920 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1921 	struct umtx_pi *pi, *pi2;
1922 	uint32_t owner, old, id;
1923 	int error;
1924 	int count;
1925 	int pri;
1926 
1927 	id = td->td_tid;
1928 	/*
1929 	 * Make sure we own this mtx.
1930 	 */
1931 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1932 	if (owner == -1)
1933 		return (EFAULT);
1934 
1935 	if ((owner & ~UMUTEX_CONTESTED) != id)
1936 		return (EPERM);
1937 
1938 	/* This should be done in userland */
1939 	if ((owner & UMUTEX_CONTESTED) == 0) {
1940 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1941 		if (old == -1)
1942 			return (EFAULT);
1943 		if (old == owner)
1944 			return (0);
1945 		owner = old;
1946 	}
1947 
1948 	/* We should only ever be in here for contested locks */
1949 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1950 	    &key)) != 0)
1951 		return (error);
1952 
1953 	umtxq_lock(&key);
1954 	umtxq_busy(&key);
1955 	count = umtxq_count_pi(&key, &uq_first);
1956 	if (uq_first != NULL) {
1957 		mtx_lock_spin(&umtx_lock);
1958 		pi = uq_first->uq_pi_blocked;
1959 		KASSERT(pi != NULL, ("pi == NULL?"));
1960 		if (pi->pi_owner != curthread) {
1961 			mtx_unlock_spin(&umtx_lock);
1962 			umtxq_unbusy(&key);
1963 			umtxq_unlock(&key);
1964 			umtx_key_release(&key);
1965 			/* Userland messed up the mutex. */
1966 			return (EPERM);
1967 		}
1968 		uq_me = curthread->td_umtxq;
1969 		pi->pi_owner = NULL;
1970 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1971 		/* Get the highest-priority thread which is still sleeping. */
1972 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1973 		while (uq_first != NULL &&
1974 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1975 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1976 		}
1977 		pri = PRI_MAX;
1978 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1979 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1980 			if (uq_first2 != NULL) {
1981 				if (pri > UPRI(uq_first2->uq_thread))
1982 					pri = UPRI(uq_first2->uq_thread);
1983 			}
1984 		}
1985 		thread_lock(curthread);
1986 		sched_lend_user_prio(curthread, pri);
1987 		thread_unlock(curthread);
1988 		mtx_unlock_spin(&umtx_lock);
1989 		if (uq_first)
1990 			umtxq_signal_thread(uq_first);
1991 	}
1992 	umtxq_unlock(&key);
1993 
1994 	/*
1995 	 * When unlocking the umtx, it must be marked as unowned if
1996 	 * at most one thread is waiting for it.
1997 	 * Otherwise, it must be marked as contested.
1998 	 */
1999 	old = casuword32(&m->m_owner, owner,
2000 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
2001 
2002 	umtxq_lock(&key);
2003 	umtxq_unbusy(&key);
2004 	umtxq_unlock(&key);
2005 	umtx_key_release(&key);
2006 	if (old == -1)
2007 		return (EFAULT);
2008 	if (old != owner)
2009 		return (EINVAL);
2010 	return (0);
2011 }
2012 
2013 /*
2014  * Lock a PP mutex.
2015  */
2016 static int
2017 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2018     struct _umtx_time *timeout, int try)
2019 {
2020 	struct abs_timeout timo;
2021 	struct umtx_q *uq, *uq2;
2022 	struct umtx_pi *pi;
2023 	uint32_t ceiling;
2024 	uint32_t owner, id;
2025 	int error, pri, old_inherited_pri, su;
2026 
2027 	id = td->td_tid;
2028 	uq = td->td_umtxq;
2029 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2030 	    &uq->uq_key)) != 0)
2031 		return (error);
2032 
2033 	if (timeout != NULL)
2034 		abs_timeout_init2(&timo, timeout);
2035 
2036 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2037 	for (;;) {
2038 		old_inherited_pri = uq->uq_inherited_pri;
2039 		umtxq_lock(&uq->uq_key);
2040 		umtxq_busy(&uq->uq_key);
2041 		umtxq_unlock(&uq->uq_key);
2042 
2043 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
2044 		if (ceiling > RTP_PRIO_MAX) {
2045 			error = EINVAL;
2046 			goto out;
2047 		}
2048 
2049 		mtx_lock_spin(&umtx_lock);
2050 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2051 			mtx_unlock_spin(&umtx_lock);
2052 			error = EINVAL;
2053 			goto out;
2054 		}
2055 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2056 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2057 			thread_lock(td);
2058 			if (uq->uq_inherited_pri < UPRI(td))
2059 				sched_lend_user_prio(td, uq->uq_inherited_pri);
2060 			thread_unlock(td);
2061 		}
2062 		mtx_unlock_spin(&umtx_lock);
2063 
2064 		owner = casuword32(&m->m_owner,
2065 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2066 
2067 		if (owner == UMUTEX_CONTESTED) {
2068 			error = 0;
2069 			break;
2070 		}
2071 
2072 		/* The address was invalid. */
2073 		if (owner == -1) {
2074 			error = EFAULT;
2075 			break;
2076 		}
2077 
2078 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2079 		    (owner & ~UMUTEX_CONTESTED) == id) {
2080 			error = EDEADLK;
2081 			break;
2082 		}
2083 
2084 		if (try != 0) {
2085 			error = EBUSY;
2086 			break;
2087 		}
2088 
2089 		/*
2090 		 * If we caught a signal, we have already retried
2091 		 * once; exit immediately now.
2092 		 */
2093 		if (error != 0)
2094 			break;
2095 
2096 		umtxq_lock(&uq->uq_key);
2097 		umtxq_insert(uq);
2098 		umtxq_unbusy(&uq->uq_key);
2099 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2100 		    NULL : &timo);
2101 		umtxq_remove(uq);
2102 		umtxq_unlock(&uq->uq_key);
2103 
2104 		mtx_lock_spin(&umtx_lock);
2105 		uq->uq_inherited_pri = old_inherited_pri;
2106 		pri = PRI_MAX;
2107 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2108 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2109 			if (uq2 != NULL) {
2110 				if (pri > UPRI(uq2->uq_thread))
2111 					pri = UPRI(uq2->uq_thread);
2112 			}
2113 		}
2114 		if (pri > uq->uq_inherited_pri)
2115 			pri = uq->uq_inherited_pri;
2116 		thread_lock(td);
2117 		sched_lend_user_prio(td, pri);
2118 		thread_unlock(td);
2119 		mtx_unlock_spin(&umtx_lock);
2120 	}
2121 
2122 	if (error != 0) {
2123 		mtx_lock_spin(&umtx_lock);
2124 		uq->uq_inherited_pri = old_inherited_pri;
2125 		pri = PRI_MAX;
2126 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2127 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2128 			if (uq2 != NULL) {
2129 				if (pri > UPRI(uq2->uq_thread))
2130 					pri = UPRI(uq2->uq_thread);
2131 			}
2132 		}
2133 		if (pri > uq->uq_inherited_pri)
2134 			pri = uq->uq_inherited_pri;
2135 		thread_lock(td);
2136 		sched_lend_user_prio(td, pri);
2137 		thread_unlock(td);
2138 		mtx_unlock_spin(&umtx_lock);
2139 	}
2140 
2141 out:
2142 	umtxq_lock(&uq->uq_key);
2143 	umtxq_unbusy(&uq->uq_key);
2144 	umtxq_unlock(&uq->uq_key);
2145 	umtx_key_release(&uq->uq_key);
2146 	return (error);
2147 }
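
/*
 * Editor's note, a worked example of the ceiling arithmetic above: a
 * userland protocol ceiling c in [0, RTP_PRIO_MAX] is mapped to the
 * kernel priority PRI_MIN_REALTIME + (RTP_PRIO_MAX - c), so larger
 * ceilings yield numerically smaller, i.e. stronger, real-time
 * priorities; c == RTP_PRIO_MAX maps to PRI_MIN_REALTIME itself.
 * Because "ceiling" is unsigned, a user value larger than RTP_PRIO_MAX
 * wraps around in the subtraction and is rejected by the
 * "ceiling > RTP_PRIO_MAX" check.
 */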
2148 
2149 /*
2150  * Unlock a PP mutex.
2151  */
2152 static int
2153 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2154 {
2155 	struct umtx_key key;
2156 	struct umtx_q *uq, *uq2;
2157 	struct umtx_pi *pi;
2158 	uint32_t owner, id;
2159 	uint32_t rceiling;
2160 	int error, pri, new_inherited_pri, su;
2161 
2162 	id = td->td_tid;
2163 	uq = td->td_umtxq;
2164 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2165 
2166 	/*
2167 	 * Make sure we own this mtx.
2168 	 */
2169 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2170 	if (owner == -1)
2171 		return (EFAULT);
2172 
2173 	if ((owner & ~UMUTEX_CONTESTED) != id)
2174 		return (EPERM);
2175 
2176 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2177 	if (error != 0)
2178 		return (error);
2179 
2180 	if (rceiling == -1)
2181 		new_inherited_pri = PRI_MAX;
2182 	else {
2183 		rceiling = RTP_PRIO_MAX - rceiling;
2184 		if (rceiling > RTP_PRIO_MAX)
2185 			return (EINVAL);
2186 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2187 	}
2188 
2189 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2190 	    &key)) != 0)
2191 		return (error);
2192 	umtxq_lock(&key);
2193 	umtxq_busy(&key);
2194 	umtxq_unlock(&key);
2195 	/*
2196 	 * For a priority-protected mutex, always set the unlocked
2197 	 * state to UMUTEX_CONTESTED so that userland always enters
2198 	 * the kernel to lock the mutex.  This is necessary because
2199 	 * the thread priority has to be adjusted for such a mutex.
2200 	 */
2201 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2202 		UMUTEX_CONTESTED);
2203 
2204 	umtxq_lock(&key);
2205 	if (error == 0)
2206 		umtxq_signal(&key, 1);
2207 	umtxq_unbusy(&key);
2208 	umtxq_unlock(&key);
2209 
2210 	if (error == -1)
2211 		error = EFAULT;
2212 	else {
2213 		mtx_lock_spin(&umtx_lock);
2214 		if (su != 0)
2215 			uq->uq_inherited_pri = new_inherited_pri;
2216 		pri = PRI_MAX;
2217 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2218 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2219 			if (uq2 != NULL) {
2220 				if (pri > UPRI(uq2->uq_thread))
2221 					pri = UPRI(uq2->uq_thread);
2222 			}
2223 		}
2224 		if (pri > uq->uq_inherited_pri)
2225 			pri = uq->uq_inherited_pri;
2226 		thread_lock(td);
2227 		sched_lend_user_prio(td, pri);
2228 		thread_unlock(td);
2229 		mtx_unlock_spin(&umtx_lock);
2230 	}
2231 	umtx_key_release(&key);
2232 	return (error);
2233 }
2234 
2235 static int
2236 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2237 	uint32_t *old_ceiling)
2238 {
2239 	struct umtx_q *uq;
2240 	uint32_t save_ceiling;
2241 	uint32_t owner, id;
2242 	uint32_t flags;
2243 	int error;
2244 
2245 	flags = fuword32(&m->m_flags);
2246 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2247 		return (EINVAL);
2248 	if (ceiling > RTP_PRIO_MAX)
2249 		return (EINVAL);
2250 	id = td->td_tid;
2251 	uq = td->td_umtxq;
2252 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2253 	   &uq->uq_key)) != 0)
2254 		return (error);
2255 	for (;;) {
2256 		umtxq_lock(&uq->uq_key);
2257 		umtxq_busy(&uq->uq_key);
2258 		umtxq_unlock(&uq->uq_key);
2259 
2260 		save_ceiling = fuword32(&m->m_ceilings[0]);
2261 
2262 		owner = casuword32(&m->m_owner,
2263 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2264 
2265 		if (owner == UMUTEX_CONTESTED) {
2266 			suword32(&m->m_ceilings[0], ceiling);
2267 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2268 				UMUTEX_CONTESTED);
2269 			error = 0;
2270 			break;
2271 		}
2272 
2273 		/* The address was invalid. */
2274 		if (owner == -1) {
2275 			error = EFAULT;
2276 			break;
2277 		}
2278 
2279 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2280 			suword32(&m->m_ceilings[0], ceiling);
2281 			error = 0;
2282 			break;
2283 		}
2284 
2285 		/*
2286 		 * If we caught a signal, we have already retried
2287 		 * once; exit immediately now.
2288 		 */
2289 		if (error != 0)
2290 			break;
2291 
2292 		/*
2293 		 * We set the contested bit and sleep.  Otherwise the lock
2294 		 * changed and we need to retry, or we lost a race to the
2295 		 * thread unlocking the umtx.
2296 		 */
2297 		umtxq_lock(&uq->uq_key);
2298 		umtxq_insert(uq);
2299 		umtxq_unbusy(&uq->uq_key);
2300 		error = umtxq_sleep(uq, "umtxpp", NULL);
2301 		umtxq_remove(uq);
2302 		umtxq_unlock(&uq->uq_key);
2303 	}
2304 	umtxq_lock(&uq->uq_key);
2305 	if (error == 0)
2306 		umtxq_signal(&uq->uq_key, INT_MAX);
2307 	umtxq_unbusy(&uq->uq_key);
2308 	umtxq_unlock(&uq->uq_key);
2309 	umtx_key_release(&uq->uq_key);
2310 	if (error == 0 && old_ceiling != NULL)
2311 		suword32(old_ceiling, save_ceiling);
2312 	return (error);
2313 }
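
/*
 * Editor's illustrative sketch, not kernel code: how userland is assumed
 * to reach do_set_ceiling(), e.g. from pthread_mutex_setprioceiling().
 * val carries the new ceiling and uaddr1 optionally receives the old one:
 *
 *	uint32_t old_ceiling;
 *	error = _umtx_op(m, UMTX_OP_SET_CEILING, new_ceiling,
 *	    &old_ceiling, NULL);
 */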
2314 
2315 /*
2316  * Lock a userland POSIX mutex.
2317  */
2318 static int
2319 do_lock_umutex(struct thread *td, struct umutex *m,
2320     struct _umtx_time *timeout, int mode)
2321 {
2322 	uint32_t flags;
2323 	int error;
2324 
2325 	flags = fuword32(&m->m_flags);
2326 	if (flags == -1)
2327 		return (EFAULT);
2328 
2329 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2330 	case 0:
2331 		error = do_lock_normal(td, m, flags, timeout, mode);
2332 		break;
2333 	case UMUTEX_PRIO_INHERIT:
2334 		error = do_lock_pi(td, m, flags, timeout, mode);
2335 		break;
2336 	case UMUTEX_PRIO_PROTECT:
2337 		error = do_lock_pp(td, m, flags, timeout, mode);
2338 		break;
2339 	default:
2340 		return (EINVAL);
2341 	}
2342 	if (timeout == NULL) {
2343 		if (error == EINTR && mode != _UMUTEX_WAIT)
2344 			error = ERESTART;
2345 	} else {
2346 		/* Timed-locking is not restarted. */
2347 		if (error == ERESTART)
2348 			error = EINTR;
2349 	}
2350 	return (error);
2351 }
2352 
2353 /*
2354  * Unlock a userland POSIX mutex.
2355  */
2356 static int
2357 do_unlock_umutex(struct thread *td, struct umutex *m)
2358 {
2359 	uint32_t flags;
2360 
2361 	flags = fuword32(&m->m_flags);
2362 	if (flags == -1)
2363 		return (EFAULT);
2364 
2365 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2366 	case 0:
2367 		return (do_unlock_normal(td, m, flags));
2368 	case UMUTEX_PRIO_INHERIT:
2369 		return (do_unlock_pi(td, m, flags));
2370 	case UMUTEX_PRIO_PROTECT:
2371 		return (do_unlock_pp(td, m, flags));
2372 	}
2373 
2374 	return (EINVAL);
2375 }
2376 
2377 static int
2378 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2379 	struct timespec *timeout, u_long wflags)
2380 {
2381 	struct abs_timeout timo;
2382 	struct umtx_q *uq;
2383 	uint32_t flags;
2384 	uint32_t clockid;
2385 	int error;
2386 
2387 	uq = td->td_umtxq;
2388 	flags = fuword32(&cv->c_flags);
2389 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2390 	if (error != 0)
2391 		return (error);
2392 
2393 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2394 		clockid = fuword32(&cv->c_clockid);
2395 		if (clockid < CLOCK_REALTIME ||
2396 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2397 			/* Only the predefined hardware clock ids work here. */
2398 			return (EINVAL);
2399 		}
2400 	} else {
2401 		clockid = CLOCK_REALTIME;
2402 	}
2403 
2404 	umtxq_lock(&uq->uq_key);
2405 	umtxq_busy(&uq->uq_key);
2406 	umtxq_insert(uq);
2407 	umtxq_unlock(&uq->uq_key);
2408 
2409 	/*
2410 	 * Set c_has_waiters to 1 before releasing the user mutex; also
2411 	 * avoid dirtying the cache line when it is unnecessary.
2412 	 */
2413 	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2414 		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2415 
2416 	umtxq_lock(&uq->uq_key);
2417 	umtxq_unbusy(&uq->uq_key);
2418 	umtxq_unlock(&uq->uq_key);
2419 
2420 	error = do_unlock_umutex(td, m);
2421 
2422 	if (timeout != NULL)
2423 		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2424 			timeout);
2425 
2426 	umtxq_lock(&uq->uq_key);
2427 	if (error == 0) {
2428 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2429 		    NULL : &timo);
2430 	}
2431 
2432 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2433 		error = 0;
2434 	else {
2435 		/*
2436 		 * This must be a timeout, an interruption by a signal, or
2437 		 * a spurious wakeup; clear the c_has_waiters flag when
2438 		 * necessary.
2439 		 */
2440 		umtxq_busy(&uq->uq_key);
2441 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2442 			int oldlen = uq->uq_cur_queue->length;
2443 			umtxq_remove(uq);
2444 			if (oldlen == 1) {
2445 				umtxq_unlock(&uq->uq_key);
2446 				suword32(
2447 				    __DEVOLATILE(uint32_t *,
2448 					 &cv->c_has_waiters), 0);
2449 				umtxq_lock(&uq->uq_key);
2450 			}
2451 		}
2452 		umtxq_unbusy(&uq->uq_key);
2453 		if (error == ERESTART)
2454 			error = EINTR;
2455 	}
2456 
2457 	umtxq_unlock(&uq->uq_key);
2458 	umtx_key_release(&uq->uq_key);
2459 	return (error);
2460 }
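
/*
 * Editor's illustrative sketch, not kernel code: the assumed userland
 * caller of do_cv_wait().  Because a wakeup may race with a timeout or
 * signal, the predicate must be re-checked in a loop, and the mutex,
 * which the kernel released before sleeping, must be re-acquired by the
 * caller on every return:
 *
 *	while (!predicate) {
 *		error = _umtx_op(cv, UMTX_OP_CV_WAIT, 0, m, NULL);
 *		(lock the mutex m again; handle EINTR here)
 *	}
 */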
2461 
2462 /*
2463  * Signal a userland condition variable.
2464  */
2465 static int
2466 do_cv_signal(struct thread *td, struct ucond *cv)
2467 {
2468 	struct umtx_key key;
2469 	int error, cnt, nwake;
2470 	uint32_t flags;
2471 
2472 	flags = fuword32(&cv->c_flags);
2473 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2474 		return (error);
2475 	umtxq_lock(&key);
2476 	umtxq_busy(&key);
2477 	cnt = umtxq_count(&key);
2478 	nwake = umtxq_signal(&key, 1);
2479 	if (cnt <= nwake) {
2480 		umtxq_unlock(&key);
2481 		error = suword32(
2482 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2483 		umtxq_lock(&key);
2484 	}
2485 	umtxq_unbusy(&key);
2486 	umtxq_unlock(&key);
2487 	umtx_key_release(&key);
2488 	return (error);
2489 }
2490 
2491 static int
2492 do_cv_broadcast(struct thread *td, struct ucond *cv)
2493 {
2494 	struct umtx_key key;
2495 	int error;
2496 	uint32_t flags;
2497 
2498 	flags = fuword32(&cv->c_flags);
2499 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2500 		return (error);
2501 
2502 	umtxq_lock(&key);
2503 	umtxq_busy(&key);
2504 	umtxq_signal(&key, INT_MAX);
2505 	umtxq_unlock(&key);
2506 
2507 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2508 
2509 	umtxq_lock(&key);
2510 	umtxq_unbusy(&key);
2511 	umtxq_unlock(&key);
2512 
2513 	umtx_key_release(&key);
2514 	return (error);
2515 }
2516 
2517 static int
2518 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2519 {
2520 	struct abs_timeout timo;
2521 	struct umtx_q *uq;
2522 	uint32_t flags, wrflags;
2523 	int32_t state, oldstate;
2524 	int32_t blocked_readers;
2525 	int error;
2526 
2527 	uq = td->td_umtxq;
2528 	flags = fuword32(&rwlock->rw_flags);
2529 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2530 	if (error != 0)
2531 		return (error);
2532 
2533 	if (timeout != NULL)
2534 		abs_timeout_init2(&timo, timeout);
2535 
2536 	wrflags = URWLOCK_WRITE_OWNER;
2537 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2538 		wrflags |= URWLOCK_WRITE_WAITERS;
2539 
2540 	for (;;) {
2541 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2542 		/* try to lock it */
2543 		while (!(state & wrflags)) {
2544 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2545 				umtx_key_release(&uq->uq_key);
2546 				return (EAGAIN);
2547 			}
2548 			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2549 			if (oldstate == state) {
2550 				umtx_key_release(&uq->uq_key);
2551 				return (0);
2552 			}
2553 			state = oldstate;
2554 		}
2555 
2556 		if (error)
2557 			break;
2558 
2559 		/* grab monitor lock */
2560 		umtxq_lock(&uq->uq_key);
2561 		umtxq_busy(&uq->uq_key);
2562 		umtxq_unlock(&uq->uq_key);
2563 
2564 		/*
2565 		 * re-read the state, in case it changed between the try-lock above
2566 		 * and the check below
2567 		 */
2568 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2569 
2570 		/* set read contention bit */
2571 		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2572 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2573 			if (oldstate == state)
2574 				goto sleep;
2575 			state = oldstate;
2576 		}
2577 
2578 		/* the state changed while we were setting flags; restart */
2579 		if (!(state & wrflags)) {
2580 			umtxq_lock(&uq->uq_key);
2581 			umtxq_unbusy(&uq->uq_key);
2582 			umtxq_unlock(&uq->uq_key);
2583 			continue;
2584 		}
2585 
2586 sleep:
2587 		/* The contention bit is set; increase the read-waiter count before sleeping. */
2588 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2589 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2590 
2591 		while (state & wrflags) {
2592 			umtxq_lock(&uq->uq_key);
2593 			umtxq_insert(uq);
2594 			umtxq_unbusy(&uq->uq_key);
2595 
2596 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2597 			    NULL : &timo);
2598 
2599 			umtxq_busy(&uq->uq_key);
2600 			umtxq_remove(uq);
2601 			umtxq_unlock(&uq->uq_key);
2602 			if (error)
2603 				break;
2604 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2605 		}
2606 
2607 		/* Decrease the read-waiter count, possibly clearing the read contention bit. */
2608 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2609 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2610 		if (blocked_readers == 1) {
2611 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2612 			for (;;) {
2613 				oldstate = casuword32(&rwlock->rw_state, state,
2614 					 state & ~URWLOCK_READ_WAITERS);
2615 				if (oldstate == state)
2616 					break;
2617 				state = oldstate;
2618 			}
2619 		}
2620 
2621 		umtxq_lock(&uq->uq_key);
2622 		umtxq_unbusy(&uq->uq_key);
2623 		umtxq_unlock(&uq->uq_key);
2624 	}
2625 	umtx_key_release(&uq->uq_key);
2626 	if (error == ERESTART)
2627 		error = EINTR;
2628 	return (error);
2629 }
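
/*
 * Editor's illustrative sketch, not kernel code: the userland read-lock
 * fast path mirrored by the try-lock loop above.  rw_state keeps the
 * reader count in its low bits, with URWLOCK_WRITE_OWNER and the waiter
 * flags as high bits, so a reader acquires by incrementing the word
 * while no write-related bit is set.  The __sync builtin is an
 * illustrative stand-in for the real atomic primitive:
 *
 *	int32_t state = rw->rw_state;
 *	while (!(state & wrflags) &&
 *	    URWLOCK_READER_COUNT(state) < URWLOCK_MAX_READERS) {
 *		int32_t old = __sync_val_compare_and_swap(&rw->rw_state,
 *		    state, state + 1);
 *		if (old == state)
 *			return (0);
 *		state = old;
 *	}
 *	return (_umtx_op(rw, UMTX_OP_RW_RDLOCK, fflag, NULL, NULL));
 */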
2630 
2631 static int
2632 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2633 {
2634 	struct abs_timeout timo;
2635 	struct umtx_q *uq;
2636 	uint32_t flags;
2637 	int32_t state, oldstate;
2638 	int32_t blocked_writers;
2639 	int32_t blocked_readers;
2640 	int error;
2641 
2642 	uq = td->td_umtxq;
2643 	flags = fuword32(&rwlock->rw_flags);
2644 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2645 	if (error != 0)
2646 		return (error);
2647 
2648 	if (timeout != NULL)
2649 		abs_timeout_init2(&timo, timeout);
2650 
2651 	blocked_readers = 0;
2652 	for (;;) {
2653 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2654 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2655 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2656 			if (oldstate == state) {
2657 				umtx_key_release(&uq->uq_key);
2658 				return (0);
2659 			}
2660 			state = oldstate;
2661 		}
2662 
2663 		if (error) {
2664 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2665 			    blocked_readers != 0) {
2666 				umtxq_lock(&uq->uq_key);
2667 				umtxq_busy(&uq->uq_key);
2668 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2669 				umtxq_unbusy(&uq->uq_key);
2670 				umtxq_unlock(&uq->uq_key);
2671 			}
2672 
2673 			break;
2674 		}
2675 
2676 		/* grab monitor lock */
2677 		umtxq_lock(&uq->uq_key);
2678 		umtxq_busy(&uq->uq_key);
2679 		umtxq_unlock(&uq->uq_key);
2680 
2681 		/*
2682 		 * re-read the state, in case it changed between the try-lock above
2683 		 * and the check below
2684 		 */
2685 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2686 
2687 		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2688 		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2689 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2690 			if (oldstate == state)
2691 				goto sleep;
2692 			state = oldstate;
2693 		}
2694 
2695 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2696 			umtxq_lock(&uq->uq_key);
2697 			umtxq_unbusy(&uq->uq_key);
2698 			umtxq_unlock(&uq->uq_key);
2699 			continue;
2700 		}
2701 sleep:
2702 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2703 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2704 
2705 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2706 			umtxq_lock(&uq->uq_key);
2707 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2708 			umtxq_unbusy(&uq->uq_key);
2709 
2710 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2711 			    NULL : &timo);
2712 
2713 			umtxq_busy(&uq->uq_key);
2714 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2715 			umtxq_unlock(&uq->uq_key);
2716 			if (error)
2717 				break;
2718 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2719 		}
2720 
2721 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2722 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2723 		if (blocked_writers == 1) {
2724 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2725 			for (;;) {
2726 				oldstate = casuword32(&rwlock->rw_state, state,
2727 					 state & ~URWLOCK_WRITE_WAITERS);
2728 				if (oldstate == state)
2729 					break;
2730 				state = oldstate;
2731 			}
2732 			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2733 		} else
2734 			blocked_readers = 0;
2735 
2736 		umtxq_lock(&uq->uq_key);
2737 		umtxq_unbusy(&uq->uq_key);
2738 		umtxq_unlock(&uq->uq_key);
2739 	}
2740 
2741 	umtx_key_release(&uq->uq_key);
2742 	if (error == ERESTART)
2743 		error = EINTR;
2744 	return (error);
2745 }
2746 
2747 static int
2748 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2749 {
2750 	struct umtx_q *uq;
2751 	uint32_t flags;
2752 	int32_t state, oldstate;
2753 	int error, q, count;
2754 
2755 	uq = td->td_umtxq;
2756 	flags = fuword32(&rwlock->rw_flags);
2757 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2758 	if (error != 0)
2759 		return (error);
2760 
2761 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2762 	if (state & URWLOCK_WRITE_OWNER) {
2763 		for (;;) {
2764 			oldstate = casuword32(&rwlock->rw_state, state,
2765 				state & ~URWLOCK_WRITE_OWNER);
2766 			if (oldstate != state) {
2767 				state = oldstate;
2768 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2769 					error = EPERM;
2770 					goto out;
2771 				}
2772 			} else
2773 				break;
2774 		}
2775 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2776 		for (;;) {
2777 			oldstate = casuword32(&rwlock->rw_state, state,
2778 				state - 1);
2779 			if (oldstate != state) {
2780 				state = oldstate;
2781 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2782 					error = EPERM;
2783 					goto out;
2784 				}
2785 			}
2786 			else
2787 				break;
2788 		}
2789 	} else {
2790 		error = EPERM;
2791 		goto out;
2792 	}
2793 
2794 	count = 0;
2795 
2796 	if (!(flags & URWLOCK_PREFER_READER)) {
2797 		if (state & URWLOCK_WRITE_WAITERS) {
2798 			count = 1;
2799 			q = UMTX_EXCLUSIVE_QUEUE;
2800 		} else if (state & URWLOCK_READ_WAITERS) {
2801 			count = INT_MAX;
2802 			q = UMTX_SHARED_QUEUE;
2803 		}
2804 	} else {
2805 		if (state & URWLOCK_READ_WAITERS) {
2806 			count = INT_MAX;
2807 			q = UMTX_SHARED_QUEUE;
2808 		} else if (state & URWLOCK_WRITE_WAITERS) {
2809 			count = 1;
2810 			q = UMTX_EXCLUSIVE_QUEUE;
2811 		}
2812 	}
2813 
2814 	if (count) {
2815 		umtxq_lock(&uq->uq_key);
2816 		umtxq_busy(&uq->uq_key);
2817 		umtxq_signal_queue(&uq->uq_key, count, q);
2818 		umtxq_unbusy(&uq->uq_key);
2819 		umtxq_unlock(&uq->uq_key);
2820 	}
2821 out:
2822 	umtx_key_release(&uq->uq_key);
2823 	return (error);
2824 }
2825 
2826 static int
2827 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2828 {
2829 	struct abs_timeout timo;
2830 	struct umtx_q *uq;
2831 	uint32_t flags, count;
2832 	int error;
2833 
2834 	uq = td->td_umtxq;
2835 	flags = fuword32(&sem->_flags);
2836 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2837 	if (error != 0)
2838 		return (error);
2839 
2840 	if (timeout != NULL)
2841 		abs_timeout_init2(&timo, timeout);
2842 
2843 	umtxq_lock(&uq->uq_key);
2844 	umtxq_busy(&uq->uq_key);
2845 	umtxq_insert(uq);
2846 	umtxq_unlock(&uq->uq_key);
2847 	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2848 	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2849 	if (count != 0) {
2850 		umtxq_lock(&uq->uq_key);
2851 		umtxq_unbusy(&uq->uq_key);
2852 		umtxq_remove(uq);
2853 		umtxq_unlock(&uq->uq_key);
2854 		umtx_key_release(&uq->uq_key);
2855 		return (0);
2856 	}
2857 	umtxq_lock(&uq->uq_key);
2858 	umtxq_unbusy(&uq->uq_key);
2859 
2860 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2861 
2862 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2863 		error = 0;
2864 	else {
2865 		umtxq_remove(uq);
2866 		if (error == ERESTART)
2867 			error = EINTR;
2868 	}
2869 	umtxq_unlock(&uq->uq_key);
2870 	umtx_key_release(&uq->uq_key);
2871 	return (error);
2872 }
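
/*
 * Editor's illustrative sketch, not kernel code: the assumed userland
 * fast path paired with do_sem_wait().  A waiter first tries to take a
 * token without a syscall; only on a zero count does it enter the
 * kernel, which re-checks _count after setting _has_waiters so that a
 * concurrent post cannot be lost:
 *
 *	for (;;) {
 *		uint32_t c = sem->_count;
 *		if (c > 0) {
 *			if (__sync_val_compare_and_swap(&sem->_count, c,
 *			    c - 1) == c)
 *				return (0);
 *			continue;
 *		}
 *		if (_umtx_op(sem, UMTX_OP_SEM_WAIT, 0, NULL, NULL) != 0)
 *			return (-1);
 *	}
 */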
2873 
2874 /*
2875  * Wake up a userland semaphore waiter.
2876  */
2877 static int
2878 do_sem_wake(struct thread *td, struct _usem *sem)
2879 {
2880 	struct umtx_key key;
2881 	int error, cnt;
2882 	uint32_t flags;
2883 
2884 	flags = fuword32(&sem->_flags);
2885 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2886 		return (error);
2887 	umtxq_lock(&key);
2888 	umtxq_busy(&key);
2889 	cnt = umtxq_count(&key);
2890 	if (cnt > 0) {
2891 		umtxq_signal(&key, 1);
2892 		/*
2893 		 * A count greater than zero means the memory is still
2894 		 * being referenced by user code, so the _has_waiters
2895 		 * flag can be updated safely.
2896 		 */
2897 		if (cnt == 1) {
2898 			umtxq_unlock(&key);
2899 			error = suword32(
2900 			    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2901 			umtxq_lock(&key);
2902 		}
2903 	}
2904 	umtxq_unbusy(&key);
2905 	umtxq_unlock(&key);
2906 	umtx_key_release(&key);
2907 	return (error);
2908 }
2909 
2910 int
2911 sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2912     /* struct umtx *umtx */
2913 {
2914 	return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2915 }
2916 
2917 int
2918 sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2919     /* struct umtx *umtx */
2920 {
2921 	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2922 }
2923 
2924 inline int
2925 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
2926 {
2927 	int error;
2928 
2929 	error = copyin(addr, tsp, sizeof(struct timespec));
2930 	if (error == 0) {
2931 		if (tsp->tv_sec < 0 ||
2932 		    tsp->tv_nsec >= 1000000000 ||
2933 		    tsp->tv_nsec < 0)
2934 			error = EINVAL;
2935 	}
2936 	return (error);
2937 }
2938 
2939 static inline int
2940 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
2941 {
2942 	int error;
2943 
2944 	if (size <= sizeof(struct timespec)) {
2945 		tp->_clockid = CLOCK_REALTIME;
2946 		tp->_flags = 0;
2947 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
2948 	} else
2949 		error = copyin(addr, tp, sizeof(struct _umtx_time));
2950 	if (error != 0)
2951 		return (error);
2952 	if (tp->_timeout.tv_sec < 0 ||
2953 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
2954 		return (EINVAL);
2955 	return (0);
2956 }
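
/*
 * Editor's illustrative sketch, not kernel code: how a caller is assumed
 * to pass the two timeout layouts that umtx_copyin_umtx_time() accepts.
 * The object size travels in uaddr1 and the object itself in uaddr2; a
 * bare timespec selects a relative CLOCK_REALTIME wait, while a full
 * struct _umtx_time can request an absolute deadline on another clock:
 *
 *	struct _umtx_time t;
 *	t._timeout = abs_deadline;
 *	t._flags = UMTX_ABSTIME;
 *	t._clockid = CLOCK_MONOTONIC;
 *	error = _umtx_op(m, UMTX_OP_MUTEX_LOCK, 0,
 *	    (void *)sizeof(t), &t);
 */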
2957 
2958 static int
2959 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2960 {
2961 	struct timespec *ts, timeout;
2962 	int error;
2963 
2964 	/* Allow a null timespec (wait forever). */
2965 	if (uap->uaddr2 == NULL)
2966 		ts = NULL;
2967 	else {
2968 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
2969 		if (error != 0)
2970 			return (error);
2971 		ts = &timeout;
2972 	}
2973 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2974 }
2975 
2976 static int
2977 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2978 {
2979 	return (do_unlock_umtx(td, uap->obj, uap->val));
2980 }
2981 
2982 static int
2983 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2984 {
2985 	struct _umtx_time timeout, *tm_p;
2986 	int error;
2987 
2988 	if (uap->uaddr2 == NULL)
2989 		tm_p = NULL;
2990 	else {
2991 		error = umtx_copyin_umtx_time(
2992 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2993 		if (error != 0)
2994 			return (error);
2995 		tm_p = &timeout;
2996 	}
2997 	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
2998 }
2999 
3000 static int
3001 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3002 {
3003 	struct _umtx_time timeout, *tm_p;
3004 	int error;
3005 
3006 	if (uap->uaddr2 == NULL)
3007 		tm_p = NULL;
3008 	else {
3009 		error = umtx_copyin_umtx_time(
3010 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3011 		if (error != 0)
3012 			return (error);
3013 		tm_p = &timeout;
3014 	}
3015 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3016 }
3017 
3018 static int
3019 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3020 {
3021 	struct _umtx_time *tm_p, timeout;
3022 	int error;
3023 
3024 	if (uap->uaddr2 == NULL)
3025 		tm_p = NULL;
3026 	else {
3027 		error = umtx_copyin_umtx_time(
3028 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3029 		if (error != 0)
3030 			return (error);
3031 		tm_p = &timeout;
3032 	}
3033 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3034 }
3035 
3036 static int
3037 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3038 {
3039 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3040 }
3041 
3042 #define BATCH_SIZE	128
3043 static int
3044 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3045 {
3046 	int count = uap->val;
3047 	void *uaddrs[BATCH_SIZE];
3048 	char **upp = (char **)uap->obj;
3049 	int tocopy;
3050 	int error = 0;
3051 	int i, pos = 0;
3052 
3053 	while (count > 0) {
3054 		tocopy = count;
3055 		if (tocopy > BATCH_SIZE)
3056 			tocopy = BATCH_SIZE;
3057 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3058 		if (error != 0)
3059 			break;
3060 		for (i = 0; i < tocopy; ++i)
3061 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3062 		count -= tocopy;
3063 		pos += tocopy;
3064 	}
3065 	return (error);
3066 }
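
/*
 * Editor's illustrative sketch, not kernel code: UMTX_OP_NWAKE_PRIVATE
 * takes an array of word addresses in obj and a count in val, letting a
 * thread library wake many private sleep queues with a single syscall;
 * the loop above copies the pointers in chunks of at most BATCH_SIZE:
 *
 *	void *addrs[3] = { &w0, &w1, &w2 };
 *	(void)_umtx_op(addrs, UMTX_OP_NWAKE_PRIVATE, 3, NULL, NULL);
 */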
3067 
3068 static int
3069 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3070 {
3071 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3072 }
3073 
3074 static int
3075 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3076 {
3077 	struct _umtx_time *tm_p, timeout;
3078 	int error;
3079 
3080 	/* Allow a null timespec (wait forever). */
3081 	if (uap->uaddr2 == NULL)
3082 		tm_p = NULL;
3083 	else {
3084 		error = umtx_copyin_umtx_time(
3085 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3086 		if (error != 0)
3087 			return (error);
3088 		tm_p = &timeout;
3089 	}
3090 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3091 }
3092 
3093 static int
3094 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3095 {
3096 	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3097 }
3098 
3099 static int
3100 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3101 {
3102 	struct _umtx_time *tm_p, timeout;
3103 	int error;
3104 
3105 	/* Allow a null timespec (wait forever). */
3106 	if (uap->uaddr2 == NULL)
3107 		tm_p = NULL;
3108 	else {
3109 		error = umtx_copyin_umtx_time(
3110 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3111 		if (error != 0)
3112 			return (error);
3113 		tm_p = &timeout;
3114 	}
3115 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3116 }
3117 
3118 static int
3119 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3120 {
3121 	return do_wake_umutex(td, uap->obj);
3122 }
3123 
3124 static int
3125 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3126 {
3127 	return do_unlock_umutex(td, uap->obj);
3128 }
3129 
3130 static int
3131 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3132 {
3133 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3134 }
3135 
3136 static int
3137 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3138 {
3139 	struct timespec *ts, timeout;
3140 	int error;
3141 
3142 	/* Allow a null timespec (wait forever). */
3143 	if (uap->uaddr2 == NULL)
3144 		ts = NULL;
3145 	else {
3146 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3147 		if (error != 0)
3148 			return (error);
3149 		ts = &timeout;
3150 	}
3151 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3152 }
3153 
3154 static int
3155 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3156 {
3157 	return do_cv_signal(td, uap->obj);
3158 }
3159 
3160 static int
3161 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3162 {
3163 	return do_cv_broadcast(td, uap->obj);
3164 }
3165 
3166 static int
3167 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3168 {
3169 	struct _umtx_time timeout;
3170 	int error;
3171 
3172 	/* Allow a null timespec (wait forever). */
3173 	if (uap->uaddr2 == NULL) {
3174 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3175 	} else {
3176 		error = umtx_copyin_umtx_time(uap->uaddr2,
3177 		   (size_t)uap->uaddr1, &timeout);
3178 		if (error != 0)
3179 			return (error);
3180 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3181 	}
3182 	return (error);
3183 }
3184 
3185 static int
3186 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3187 {
3188 	struct _umtx_time timeout;
3189 	int error;
3190 
3191 	/* Allow a null timespec (wait forever). */
3192 	if (uap->uaddr2 == NULL) {
3193 		error = do_rw_wrlock(td, uap->obj, 0);
3194 	} else {
3195 		error = umtx_copyin_umtx_time(uap->uaddr2,
3196 		   (size_t)uap->uaddr1, &timeout);
3197 		if (error != 0)
3198 			return (error);
3199 
3200 		error = do_rw_wrlock(td, uap->obj, &timeout);
3201 	}
3202 	return (error);
3203 }
3204 
3205 static int
3206 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3207 {
3208 	return do_rw_unlock(td, uap->obj);
3209 }
3210 
3211 static int
3212 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3213 {
3214 	struct _umtx_time *tm_p, timeout;
3215 	int error;
3216 
3217 	/* Allow a null timespec (wait forever). */
3218 	if (uap->uaddr2 == NULL)
3219 		tm_p = NULL;
3220 	else {
3221 		error = umtx_copyin_umtx_time(
3222 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3223 		if (error != 0)
3224 			return (error);
3225 		tm_p = &timeout;
3226 	}
3227 	return (do_sem_wait(td, uap->obj, tm_p));
3228 }
3229 
3230 static int
3231 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3232 {
3233 	return do_sem_wake(td, uap->obj);
3234 }
3235 
3236 static int
3237 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3238 {
3239 	return do_wake2_umutex(td, uap->obj, uap->val);
3240 }
3241 
3242 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3243 
3244 static _umtx_op_func op_table[] = {
3245 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3246 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3247 	__umtx_op_wait,			/* UMTX_OP_WAIT */
3248 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3249 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3250 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3251 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3252 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3253 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3254 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3255 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3256 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3257 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3258 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3259 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3260 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3261 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3262 	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3263 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3264 	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3265 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3266 	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
3267 	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3268 };
3269 
3270 int
3271 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3272 {
3273 	if ((unsigned)uap->op < UMTX_OP_MAX)
3274 		return (*op_table[uap->op])(td, uap);
3275 	return (EINVAL);
3276 }
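
/*
 * Editor's illustrative sketch, not kernel code: the simplest users of
 * the dispatch table above form a futex-style pair on a plain 32-bit
 * word.  The waiter sleeps only while the word still holds the expected
 * value, so a wakeup between the check and the sleep is never lost:
 *
 *	(waiter)
 *	_umtx_op(&word, UMTX_OP_WAIT_UINT_PRIVATE, expected, NULL, NULL);
 *
 *	(waker)
 *	word = newval;
 *	_umtx_op(&word, UMTX_OP_WAKE_PRIVATE, INT_MAX, NULL, NULL);
 */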
3277 
3278 #ifdef COMPAT_FREEBSD32
3279 int
3280 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3281     /* struct umtx *umtx */
3282 {
3283 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3284 }
3285 
3286 int
3287 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3288     /* struct umtx *umtx */
3289 {
3290 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3291 }
3292 
3293 struct timespec32 {
3294 	int32_t tv_sec;
3295 	int32_t tv_nsec;
3296 };
3297 
3298 struct umtx_time32 {
3299 	struct	timespec32	timeout;
3300 	uint32_t		flags;
3301 	uint32_t		clockid;
3302 };
3303 
3304 static inline int
3305 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3306 {
3307 	struct timespec32 ts32;
3308 	int error;
3309 
3310 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3311 	if (error == 0) {
3312 		if (ts32.tv_sec < 0 ||
3313 		    ts32.tv_nsec >= 1000000000 ||
3314 		    ts32.tv_nsec < 0)
3315 			error = EINVAL;
3316 		else {
3317 			tsp->tv_sec = ts32.tv_sec;
3318 			tsp->tv_nsec = ts32.tv_nsec;
3319 		}
3320 	}
3321 	return (error);
3322 }
3323 
3324 static inline int
3325 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3326 {
3327 	struct umtx_time32 t32;
3328 	int error;
3329 
3330 	t32.clockid = CLOCK_REALTIME;
3331 	t32.flags   = 0;
3332 	if (size <= sizeof(struct timespec32))
3333 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3334 	else
3335 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3336 	if (error != 0)
3337 		return (error);
3338 	if (t32.timeout.tv_sec < 0 ||
3339 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3340 		return (EINVAL);
3341 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3342 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3343 	tp->_flags = t32.flags;
3344 	tp->_clockid = t32.clockid;
3345 	return (0);
3346 }
3347 
3348 static int
3349 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3350 {
3351 	struct timespec *ts, timeout;
3352 	int error;
3353 
3354 	/* Allow a null timespec (wait forever). */
3355 	if (uap->uaddr2 == NULL)
3356 		ts = NULL;
3357 	else {
3358 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3359 		if (error != 0)
3360 			return (error);
3361 		ts = &timeout;
3362 	}
3363 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3364 }
3365 
3366 static int
3367 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3368 {
3369 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3370 }
3371 
3372 static int
3373 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3374 {
3375 	struct _umtx_time *tm_p, timeout;
3376 	int error;
3377 
3378 	if (uap->uaddr2 == NULL)
3379 		tm_p = NULL;
3380 	else {
3381 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3382 			(size_t)uap->uaddr1, &timeout);
3383 		if (error != 0)
3384 			return (error);
3385 		tm_p = &timeout;
3386 	}
3387 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3388 }
3389 
3390 static int
3391 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3392 {
3393 	struct _umtx_time *tm_p, timeout;
3394 	int error;
3395 
3396 	/* Allow a null timespec (wait forever). */
3397 	if (uap->uaddr2 == NULL)
3398 		tm_p = NULL;
3399 	else {
3400 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3401 		    (size_t)uap->uaddr1, &timeout);
3402 		if (error != 0)
3403 			return (error);
3404 		tm_p = &timeout;
3405 	}
3406 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3407 }
3408 
3409 static int
3410 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3411 {
3412 	struct _umtx_time *tm_p, timeout;
3413 	int error;
3414 
3415 	/* Allow a null timespec (wait forever). */
3416 	if (uap->uaddr2 == NULL)
3417 		tm_p = NULL;
3418 	else {
3419 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3420 		    (size_t)uap->uaddr1, &timeout);
3421 		if (error != 0)
3422 			return (error);
3423 		tm_p = &timeout;
3424 	}
3425 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3426 }
3427 
3428 static int
3429 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3430 {
3431 	struct timespec *ts, timeout;
3432 	int error;
3433 
3434 	/* Allow a null timespec (wait forever). */
3435 	if (uap->uaddr2 == NULL)
3436 		ts = NULL;
3437 	else {
3438 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3439 		if (error != 0)
3440 			return (error);
3441 		ts = &timeout;
3442 	}
3443 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3444 }
3445 
3446 static int
3447 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3448 {
3449 	struct _umtx_time timeout;
3450 	int error;
3451 
3452 	/* Allow a null timespec (wait forever). */
3453 	if (uap->uaddr2 == NULL) {
3454 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3455 	} else {
3456 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3457 		    (size_t)uap->uaddr1, &timeout);
3458 		if (error != 0)
3459 			return (error);
3460 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3461 	}
3462 	return (error);
3463 }
3464 
3465 static int
3466 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3467 {
3468 	struct _umtx_time timeout;
3469 	int error;
3470 
3471 	/* Allow a null timespec (wait forever). */
3472 	if (uap->uaddr2 == NULL) {
3473 		error = do_rw_wrlock(td, uap->obj, 0);
3474 	} else {
3475 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3476 		    (size_t)uap->uaddr1, &timeout);
3477 		if (error != 0)
3478 			return (error);
3479 		error = do_rw_wrlock(td, uap->obj, &timeout);
3480 	}
3481 	return (error);
3482 }
3483 
3484 static int
3485 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3486 {
3487 	struct _umtx_time *tm_p, timeout;
3488 	int error;
3489 
3490 	if (uap->uaddr2 == NULL)
3491 		tm_p = NULL;
3492 	else {
3493 		error = umtx_copyin_umtx_time32(
3494 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3495 		if (error != 0)
3496 			return (error);
3497 		tm_p = &timeout;
3498 	}
3499 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3500 }
3501 
3502 static int
3503 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3504 {
3505 	struct _umtx_time *tm_p, timeout;
3506 	int error;
3507 
3508 	/* Allow a null timespec (wait forever). */
3509 	if (uap->uaddr2 == NULL)
3510 		tm_p = NULL;
3511 	else {
3512 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3513 		    (size_t)uap->uaddr1, &timeout);
3514 		if (error != 0)
3515 			return (error);
3516 		tm_p = &timeout;
3517 	}
3518 	return (do_sem_wait(td, uap->obj, tm_p));
3519 }
3520 
3521 static int
3522 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3523 {
3524 	int count = uap->val;
3525 	uint32_t uaddrs[BATCH_SIZE];
3526 	uint32_t *upp = (uint32_t *)uap->obj;
3527 	int tocopy;
3528 	int error = 0;
3529 	int i, pos = 0;
3530 
3531 	while (count > 0) {
3532 		tocopy = count;
3533 		if (tocopy > BATCH_SIZE)
3534 			tocopy = BATCH_SIZE;
3535 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3536 		if (error != 0)
3537 			break;
3538 		for (i = 0; i < tocopy; ++i)
3539 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3540 				INT_MAX, 1);
3541 		count -= tocopy;
3542 		pos += tocopy;
3543 	}
3544 	return (error);
3545 }
3546 
3547 static _umtx_op_func op_table_compat32[] = {
3548 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3549 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3550 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3551 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3552 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3553 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3554 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
3555 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3556 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3557 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3558 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3559 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3560 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3561 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3562 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3563 	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3564 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3565 	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3566 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3567 	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3568 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3569 	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
3570 	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3571 };
3572 
3573 int
3574 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3575 {
3576 	if ((unsigned)uap->op < UMTX_OP_MAX)
3577 		return (*op_table_compat32[uap->op])(td,
3578 			(struct _umtx_op_args *)uap);
3579 	return (EINVAL);
3580 }
3581 #endif
3582 
3583 void
3584 umtx_thread_init(struct thread *td)
3585 {
3586 	td->td_umtxq = umtxq_alloc();
3587 	td->td_umtxq->uq_thread = td;
3588 }
3589 
3590 void
3591 umtx_thread_fini(struct thread *td)
3592 {
3593 	umtxq_free(td->td_umtxq);
3594 }
3595 
3596 /*
3597  * Called when a new thread is created, e.g. by fork().
3598  */
3599 void
3600 umtx_thread_alloc(struct thread *td)
3601 {
3602 	struct umtx_q *uq;
3603 
3604 	uq = td->td_umtxq;
3605 	uq->uq_inherited_pri = PRI_MAX;
3606 
3607 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3608 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3609 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3610 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3611 }
3612 
3613 /*
3614  * exec() hook.
3615  */
3616 static void
3617 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3618 	struct image_params *imgp __unused)
3619 {
3620 	umtx_thread_cleanup(curthread);
3621 }
3622 
3623 /*
3624  * thread_exit() hook.
3625  */
3626 void
3627 umtx_thread_exit(struct thread *td)
3628 {
3629 	umtx_thread_cleanup(td);
3630 }
3631 
3632 /*
3633  * Clean up umtx data.
3634  */
3635 static void
3636 umtx_thread_cleanup(struct thread *td)
3637 {
3638 	struct umtx_q *uq;
3639 	struct umtx_pi *pi;
3640 
3641 	if ((uq = td->td_umtxq) == NULL)
3642 		return;
3643 
3644 	mtx_lock_spin(&umtx_lock);
3645 	uq->uq_inherited_pri = PRI_MAX;
3646 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3647 		pi->pi_owner = NULL;
3648 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3649 	}
3650 	mtx_unlock_spin(&umtx_lock);
3651 	thread_lock(td);
3652 	sched_lend_user_prio(td, PRI_MAX);
3653 	thread_unlock(td);
3654 }
3655