xref: /freebsd/sys/kern/kern_umtx.c (revision 64de80195bba295c961a4cdf96dbe0e4979bdf2a)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include "opt_umtx_profiling.h"
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/priv.h>
41 #include <sys/proc.h>
42 #include <sys/sbuf.h>
43 #include <sys/sched.h>
44 #include <sys/smp.h>
45 #include <sys/sysctl.h>
46 #include <sys/sysent.h>
47 #include <sys/systm.h>
48 #include <sys/sysproto.h>
49 #include <sys/syscallsubr.h>
50 #include <sys/eventhandler.h>
51 #include <sys/umtx.h>
52 
53 #include <vm/vm.h>
54 #include <vm/vm_param.h>
55 #include <vm/pmap.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_object.h>
58 
59 #include <machine/cpu.h>
60 
61 #ifdef COMPAT_FREEBSD32
62 #include <compat/freebsd32/freebsd32_proto.h>
63 #endif
64 
65 #define _UMUTEX_TRY		1
66 #define _UMUTEX_WAIT		2
67 
68 #ifdef UMTX_PROFILING
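/*
 * Compare two percentages kept as (whole, fraction) pairs: (w, f) is
 * "bigger" than (sw, sf) when its whole part is larger, or the whole
 * parts are equal and its fraction is larger.  Used below to rank the
 * most heavily used hash chains.
 */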
69 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
70 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
71 #endif
72 
73 /* Priority inheritance mutex info. */
74 struct umtx_pi {
75 	/* Owner thread */
76 	struct thread		*pi_owner;
77 
78 	/* Reference count */
79 	int			pi_refcount;
80 
81 	/* List entry to link umtx held by a thread */
82 	TAILQ_ENTRY(umtx_pi)	pi_link;
83 
84 	/* List entry in hash */
85 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
86 
87 	/* List for waiters */
88 	TAILQ_HEAD(,umtx_q)	pi_blocked;
89 
90 	/* Identify a userland lock object */
91 	struct umtx_key		pi_key;
92 };
93 
94 /* A userland synchronization object user. */
95 struct umtx_q {
96 	/* Linked list for the hash. */
97 	TAILQ_ENTRY(umtx_q)	uq_link;
98 
99 	/* Umtx key. */
100 	struct umtx_key		uq_key;
101 
102 	/* Umtx flags. */
103 	int			uq_flags;
104 #define UQF_UMTXQ	0x0001
105 
106 	/* The waiting thread. */
107 	struct thread		*uq_thread;
108 
109 	/*
110 	 * Blocked on PI mutex. Reads can use either the chain
111 	 * lock or umtx_lock; writes must hold both the chain
112 	 * lock and umtx_lock.
113 	 */
114 	struct umtx_pi		*uq_pi_blocked;
115 
116 	/* On blocked list */
117 	TAILQ_ENTRY(umtx_q)	uq_lockq;
118 
119 	/* PI mutexes owned by us that other threads contend for */
120 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
121 
122 	/* Inherited priority from PP mutex */
123 	u_char			uq_inherited_pri;
124 
125 	/* Spare queue ready to be reused */
126 	struct umtxq_queue	*uq_spare_queue;
127 
128 	/* The queue we are on */
129 	struct umtxq_queue	*uq_cur_queue;
130 };
131 
132 TAILQ_HEAD(umtxq_head, umtx_q);
133 
134 /* Per-key wait-queue */
135 struct umtxq_queue {
136 	struct umtxq_head	head;
137 	struct umtx_key		key;
138 	LIST_ENTRY(umtxq_queue)	link;
139 	int			length;
140 };
141 
142 LIST_HEAD(umtxq_list, umtxq_queue);
143 
144 /* Userland lock object's wait-queue chain */
145 struct umtxq_chain {
146 	/* Lock for this chain. */
147 	struct mtx		uc_lock;
148 
149 	/* List of sleep queues. */
150 	struct umtxq_list	uc_queue[2];
151 #define UMTX_SHARED_QUEUE	0
152 #define UMTX_EXCLUSIVE_QUEUE	1
153 
154 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
155 
156 	/* Busy flag */
157 	char			uc_busy;
158 
159 	/* Chain lock waiters */
160 	int			uc_waiters;
161 
162 	/* All PI mutexes hashed to this chain */
163 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
164 
165 #ifdef UMTX_PROFILING
166 	u_int 			length;
167 	u_int			max_length;
168 #endif
169 };
170 
171 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
172 
173 /*
174  * Don't propagate time-sharing priority; there is a security reason.
175  * A user can simply create a PI mutex, let thread A lock the mutex,
176  * and let another thread B block on the mutex.  Because B is
177  * sleeping, its priority will be boosted, and this boosts A's priority
178  * via priority propagation too; A's priority would never be lowered
179  * even if it were using 100% CPU, which is unfair to other processes.
180  */
181 
182 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
183 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
184 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
185 
186 #define	GOLDEN_RATIO_PRIME	2654404609U
187 #define	UMTX_CHAINS		512
188 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
189 
190 #define	GET_SHARE(flags)	\
191     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
192 
193 #define BUSY_SPINS		200
194 
195 struct abs_timeout {
196 	int clockid;
197 	struct timespec cur;
198 	struct timespec end;
199 };
200 
201 static uma_zone_t		umtx_pi_zone;
202 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
203 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
204 static int			umtx_pi_allocated;
205 
206 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
207 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
208     &umtx_pi_allocated, 0, "Allocated umtx_pi");
209 
210 #ifdef UMTX_PROFILING
211 static long max_length;
212 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
213 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
214 #endif
215 
216 static void umtxq_sysinit(void *);
217 static void umtxq_hash(struct umtx_key *key);
218 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
219 static void umtxq_lock(struct umtx_key *key);
220 static void umtxq_unlock(struct umtx_key *key);
221 static void umtxq_busy(struct umtx_key *key);
222 static void umtxq_unbusy(struct umtx_key *key);
223 static void umtxq_insert_queue(struct umtx_q *uq, int q);
224 static void umtxq_remove_queue(struct umtx_q *uq, int q);
225 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
226 static int umtxq_count(struct umtx_key *key);
227 static struct umtx_pi *umtx_pi_alloc(int);
228 static void umtx_pi_free(struct umtx_pi *pi);
229 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
230 static void umtx_thread_cleanup(struct thread *td);
231 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
232 	struct image_params *imgp __unused);
233 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
234 
235 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
236 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
237 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
238 
239 static struct mtx umtx_lock;
240 
241 #ifdef UMTX_PROFILING
242 static void
243 umtx_init_profiling(void)
244 {
245 	struct sysctl_oid *chain_oid;
246 	char chain_name[10];
247 	int i;
248 
249 	for (i = 0; i < UMTX_CHAINS; ++i) {
250 		snprintf(chain_name, sizeof(chain_name), "%d", i);
251 		chain_oid = SYSCTL_ADD_NODE(NULL,
252 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
253 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
254 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
255 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
256 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
257 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
258 	}
259 }
260 
261 static int
262 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
263 {
264 	char buf[512];
265 	struct sbuf sb;
266 	struct umtxq_chain *uc;
267 	u_int fract, i, j, tot, whole;
268 	u_int sf0, sf1, sf2, sf3, sf4;
269 	u_int si0, si1, si2, si3, si4;
270 	u_int sw0, sw1, sw2, sw3, sw4;
271 
272 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
273 	for (i = 0; i < 2; i++) {
274 		tot = 0;
275 		for (j = 0; j < UMTX_CHAINS; ++j) {
276 			uc = &umtxq_chains[i][j];
277 			mtx_lock(&uc->uc_lock);
278 			tot += uc->max_length;
279 			mtx_unlock(&uc->uc_lock);
280 		}
281 		if (tot == 0)
282 			sbuf_printf(&sb, "%u) Empty ", i);
283 		else {
284 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
285 			si0 = si1 = si2 = si3 = si4 = 0;
286 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
287 			for (j = 0; j < UMTX_CHAINS; j++) {
288 				uc = &umtxq_chains[i][j];
289 				mtx_lock(&uc->uc_lock);
290 				whole = uc->max_length * 100;
291 				mtx_unlock(&uc->uc_lock);
292 				fract = (whole % tot) * 100;
293 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
294 					sf0 = fract;
295 					si0 = j;
296 					sw0 = whole;
297 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
298 				    sf1)) {
299 					sf1 = fract;
300 					si1 = j;
301 					sw1 = whole;
302 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
303 				    sf2)) {
304 					sf2 = fract;
305 					si2 = j;
306 					sw2 = whole;
307 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
308 				    sf3)) {
309 					sf3 = fract;
310 					si3 = j;
311 					sw3 = whole;
312 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
313 				    sf4)) {
314 					sf4 = fract;
315 					si4 = j;
316 					sw4 = whole;
317 				}
318 			}
319 			sbuf_printf(&sb, "queue %u:\n", i);
320 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
321 			    sf0 / tot, si0);
322 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
323 			    sf1 / tot, si1);
324 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
325 			    sf2 / tot, si2);
326 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
327 			    sf3 / tot, si3);
328 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
329 			    sf4 / tot, si4);
330 		}
331 	}
332 	sbuf_trim(&sb);
333 	sbuf_finish(&sb);
334 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
335 	sbuf_delete(&sb);
336 	return (0);
337 }
338 
339 static int
340 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
341 {
342 	struct umtxq_chain *uc;
343 	u_int i, j;
344 	int clear, error;
345 
346 	clear = 0;
347 	error = sysctl_handle_int(oidp, &clear, 0, req);
348 	if (error != 0 || req->newptr == NULL)
349 		return (error);
350 
351 	if (clear != 0) {
352 		for (i = 0; i < 2; ++i) {
353 			for (j = 0; j < UMTX_CHAINS; ++j) {
354 				uc = &umtxq_chains[i][j];
355 				mtx_lock(&uc->uc_lock);
356 				uc->length = 0;
357 				uc->max_length = 0;
358 				mtx_unlock(&uc->uc_lock);
359 			}
360 		}
361 	}
362 	return (0);
363 }
364 
365 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
366     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
367     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
368 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
369     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
370     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
371 #endif
372 
373 static void
374 umtxq_sysinit(void *arg __unused)
375 {
376 	int i, j;
377 
378 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
379 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
380 	for (i = 0; i < 2; ++i) {
381 		for (j = 0; j < UMTX_CHAINS; ++j) {
382 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
383 				 MTX_DEF | MTX_DUPOK);
384 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
385 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
386 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
387 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
388 			umtxq_chains[i][j].uc_busy = 0;
389 			umtxq_chains[i][j].uc_waiters = 0;
390 #ifdef UMTX_PROFILING
391 			umtxq_chains[i][j].length = 0;
392 			umtxq_chains[i][j].max_length = 0;
393 #endif
394 		}
395 	}
396 #ifdef UMTX_PROFILING
397 	umtx_init_profiling();
398 #endif
399 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
400 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
401 	    EVENTHANDLER_PRI_ANY);
402 }
403 
404 struct umtx_q *
405 umtxq_alloc(void)
406 {
407 	struct umtx_q *uq;
408 
409 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
410 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
411 	TAILQ_INIT(&uq->uq_spare_queue->head);
412 	TAILQ_INIT(&uq->uq_pi_contested);
413 	uq->uq_inherited_pri = PRI_MAX;
414 	return (uq);
415 }
416 
417 void
418 umtxq_free(struct umtx_q *uq)
419 {
420 	MPASS(uq->uq_spare_queue != NULL);
421 	free(uq->uq_spare_queue, M_UMTX);
422 	free(uq, M_UMTX);
423 }
424 
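/*
 * Hash a umtx key into one of the UMTX_CHAINS buckets: sum the two
 * identifying words, multiply by GOLDEN_RATIO_PRIME (multiplicative
 * hashing) and keep the top bits; e.g. with __WORD_BIT == 32 the
 * shift is 23, leaving a 9-bit value that indexes the 512 chains.
 */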
425 static inline void
426 umtxq_hash(struct umtx_key *key)
427 {
428 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
429 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
430 }
431 
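/*
 * Keys with type <= TYPE_SEM hash into the second chain table and all
 * other types into the first, so the two classes of objects use
 * separate sets of wait-queue chains.
 */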
432 static inline struct umtxq_chain *
433 umtxq_getchain(struct umtx_key *key)
434 {
435 	if (key->type <= TYPE_SEM)
436 		return (&umtxq_chains[1][key->hash]);
437 	return (&umtxq_chains[0][key->hash]);
438 }
439 
440 /*
441  * Lock a chain.
442  */
443 static inline void
444 umtxq_lock(struct umtx_key *key)
445 {
446 	struct umtxq_chain *uc;
447 
448 	uc = umtxq_getchain(key);
449 	mtx_lock(&uc->uc_lock);
450 }
451 
452 /*
453  * Unlock a chain.
454  */
455 static inline void
456 umtxq_unlock(struct umtx_key *key)
457 {
458 	struct umtxq_chain *uc;
459 
460 	uc = umtxq_getchain(key);
461 	mtx_unlock(&uc->uc_lock);
462 }
463 
464 /*
465  * Set the chain to the busy state when the following operation
466  * may block (a kernel mutex cannot be held across such an operation).
467  */
468 static inline void
469 umtxq_busy(struct umtx_key *key)
470 {
471 	struct umtxq_chain *uc;
472 
473 	uc = umtxq_getchain(key);
474 	mtx_assert(&uc->uc_lock, MA_OWNED);
475 	if (uc->uc_busy) {
476 #ifdef SMP
477 		if (smp_cpus > 1) {
478 			int count = BUSY_SPINS;
479 			if (count > 0) {
480 				umtxq_unlock(key);
481 				while (uc->uc_busy && --count > 0)
482 					cpu_spinwait();
483 				umtxq_lock(key);
484 			}
485 		}
486 #endif
487 		while (uc->uc_busy) {
488 			uc->uc_waiters++;
489 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
490 			uc->uc_waiters--;
491 		}
492 	}
493 	uc->uc_busy = 1;
494 }
495 
496 /*
497  * Unbusy a chain.
498  */
499 static inline void
500 umtxq_unbusy(struct umtx_key *key)
501 {
502 	struct umtxq_chain *uc;
503 
504 	uc = umtxq_getchain(key);
505 	mtx_assert(&uc->uc_lock, MA_OWNED);
506 	KASSERT(uc->uc_busy != 0, ("not busy"));
507 	uc->uc_busy = 0;
508 	if (uc->uc_waiters)
509 		wakeup_one(uc);
510 }
511 
512 static inline void
513 umtxq_unbusy_unlocked(struct umtx_key *key)
514 {
515 
516 	umtxq_lock(key);
517 	umtxq_unbusy(key);
518 	umtxq_unlock(key);
519 }
520 
521 static struct umtxq_queue *
522 umtxq_queue_lookup(struct umtx_key *key, int q)
523 {
524 	struct umtxq_queue *uh;
525 	struct umtxq_chain *uc;
526 
527 	uc = umtxq_getchain(key);
528 	UMTXQ_LOCKED_ASSERT(uc);
529 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
530 		if (umtx_key_match(&uh->key, key))
531 			return (uh);
532 	}
533 
534 	return (NULL);
535 }
536 
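/*
 * Insert a waiter on the per-key queue.  If no queue exists for this
 * key yet, the waiter's pre-allocated spare umtxq_queue becomes the
 * queue; otherwise the spare is donated to the chain's uc_spare_queue
 * list.  umtxq_remove_queue() later hands a spare back to the departing
 * waiter, so insertion never has to allocate while the chain is locked.
 */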
537 static inline void
538 umtxq_insert_queue(struct umtx_q *uq, int q)
539 {
540 	struct umtxq_queue *uh;
541 	struct umtxq_chain *uc;
542 
543 	uc = umtxq_getchain(&uq->uq_key);
544 	UMTXQ_LOCKED_ASSERT(uc);
545 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
546 	uh = umtxq_queue_lookup(&uq->uq_key, q);
547 	if (uh != NULL) {
548 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
549 	} else {
550 		uh = uq->uq_spare_queue;
551 		uh->key = uq->uq_key;
552 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
553 #ifdef UMTX_PROFILING
554 		uc->length++;
555 		if (uc->length > uc->max_length) {
556 			uc->max_length = uc->length;
557 			if (uc->max_length > max_length)
558 				max_length = uc->max_length;
559 		}
560 #endif
561 	}
562 	uq->uq_spare_queue = NULL;
563 
564 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
565 	uh->length++;
566 	uq->uq_flags |= UQF_UMTXQ;
567 	uq->uq_cur_queue = uh;
568 	return;
569 }
570 
571 static inline void
572 umtxq_remove_queue(struct umtx_q *uq, int q)
573 {
574 	struct umtxq_chain *uc;
575 	struct umtxq_queue *uh;
576 
577 	uc = umtxq_getchain(&uq->uq_key);
578 	UMTXQ_LOCKED_ASSERT(uc);
579 	if (uq->uq_flags & UQF_UMTXQ) {
580 		uh = uq->uq_cur_queue;
581 		TAILQ_REMOVE(&uh->head, uq, uq_link);
582 		uh->length--;
583 		uq->uq_flags &= ~UQF_UMTXQ;
584 		if (TAILQ_EMPTY(&uh->head)) {
585 			KASSERT(uh->length == 0,
586 			    ("inconsistent umtxq_queue length"));
587 #ifdef UMTX_PROFILING
588 			uc->length--;
589 #endif
590 			LIST_REMOVE(uh, link);
591 		} else {
592 			uh = LIST_FIRST(&uc->uc_spare_queue);
593 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
594 			LIST_REMOVE(uh, link);
595 		}
596 		uq->uq_spare_queue = uh;
597 		uq->uq_cur_queue = NULL;
598 	}
599 }
600 
601 /*
602  * Return the number of waiters on the shared queue.
603  */
604 static int
605 umtxq_count(struct umtx_key *key)
606 {
607 	struct umtxq_chain *uc;
608 	struct umtxq_queue *uh;
609 
610 	uc = umtxq_getchain(key);
611 	UMTXQ_LOCKED_ASSERT(uc);
612 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
613 	if (uh != NULL)
614 		return (uh->length);
615 	return (0);
616 }
617 
618 /*
619  * Return the number of PI waiters and hand back the first
620  * waiter.
621  */
622 static int
623 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
624 {
625 	struct umtxq_chain *uc;
626 	struct umtxq_queue *uh;
627 
628 	*first = NULL;
629 	uc = umtxq_getchain(key);
630 	UMTXQ_LOCKED_ASSERT(uc);
631 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
632 	if (uh != NULL) {
633 		*first = TAILQ_FIRST(&uh->head);
634 		return (uh->length);
635 	}
636 	return (0);
637 }
638 
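/*
 * Check whether the thread is being asked to suspend or single-thread.
 * Returning EINTR or ERESTART lets the caller break out of its retry
 * loop instead of spinning while another thread waits for it to stop.
 */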
639 static int
640 umtxq_check_susp(struct thread *td)
641 {
642 	struct proc *p;
643 	int error;
644 
645 	/*
646 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
647 	 * eventually break the lockstep loop.
648 	 */
649 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
650 		return (0);
651 	error = 0;
652 	p = td->td_proc;
653 	PROC_LOCK(p);
654 	if (P_SHOULDSTOP(p) ||
655 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
656 		if (p->p_flag & P_SINGLE_EXIT)
657 			error = EINTR;
658 		else
659 			error = ERESTART;
660 	}
661 	PROC_UNLOCK(p);
662 	return (error);
663 }
664 
665 /*
666  * Wake up threads waiting on a userland object.
667  */
668 
669 static int
670 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
671 {
672 	struct umtxq_chain *uc;
673 	struct umtxq_queue *uh;
674 	struct umtx_q *uq;
675 	int ret;
676 
677 	ret = 0;
678 	uc = umtxq_getchain(key);
679 	UMTXQ_LOCKED_ASSERT(uc);
680 	uh = umtxq_queue_lookup(key, q);
681 	if (uh != NULL) {
682 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
683 			umtxq_remove_queue(uq, q);
684 			wakeup(uq);
685 			if (++ret >= n_wake)
686 				return (ret);
687 		}
688 	}
689 	return (ret);
690 }
691 
692 
693 /*
694  * Wake up specified thread.
695  */
696 static inline void
697 umtxq_signal_thread(struct umtx_q *uq)
698 {
699 	struct umtxq_chain *uc;
700 
701 	uc = umtxq_getchain(&uq->uq_key);
702 	UMTXQ_LOCKED_ASSERT(uc);
703 	umtxq_remove(uq);
704 	wakeup(uq);
705 }
706 
707 static inline int
708 tstohz(const struct timespec *tsp)
709 {
710 	struct timeval tv;
711 
712 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
713 	return tvtohz(&tv);
714 }
715 
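/*
 * Absolute-timeout helpers: abs_timeout_init() records the expiration
 * time (converting a relative timeout by adding it to the current
 * time), abs_timeout_update() re-samples the clock, and
 * abs_timeout_gethz() converts the remaining time into sleep ticks,
 * returning -1 once the deadline has passed.
 */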
716 static void
717 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
718 	const struct timespec *timeout)
719 {
720 
721 	timo->clockid = clockid;
722 	if (!absolute) {
723 		kern_clock_gettime(curthread, clockid, &timo->end);
724 		timo->cur = timo->end;
725 		timespecadd(&timo->end, timeout);
726 	} else {
727 		timo->end = *timeout;
728 		kern_clock_gettime(curthread, clockid, &timo->cur);
729 	}
730 }
731 
732 static void
733 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
734 {
735 
736 	abs_timeout_init(timo, umtxtime->_clockid,
737 		(umtxtime->_flags & UMTX_ABSTIME) != 0,
738 		&umtxtime->_timeout);
739 }
740 
741 static inline void
742 abs_timeout_update(struct abs_timeout *timo)
743 {
744 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
745 }
746 
747 static int
748 abs_timeout_gethz(struct abs_timeout *timo)
749 {
750 	struct timespec tts;
751 
752 	if (timespeccmp(&timo->end, &timo->cur, <=))
753 		return (-1);
754 	tts = timo->end;
755 	timespecsub(&tts, &timo->cur);
756 	return (tstohz(&tts));
757 }
758 
759 /*
760  * Put the thread into a sleep state.  Before sleeping, check whether
761  * the thread has been removed from the umtx queue.
762  */
763 static inline int
764 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
765 {
766 	struct umtxq_chain *uc;
767 	int error, timo;
768 
769 	uc = umtxq_getchain(&uq->uq_key);
770 	UMTXQ_LOCKED_ASSERT(uc);
771 	for (;;) {
772 		if (!(uq->uq_flags & UQF_UMTXQ))
773 			return (0);
774 		if (abstime != NULL) {
775 			timo = abs_timeout_gethz(abstime);
776 			if (timo < 0)
777 				return (ETIMEDOUT);
778 		} else
779 			timo = 0;
780 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
781 		if (error != EWOULDBLOCK) {
782 			umtxq_lock(&uq->uq_key);
783 			break;
784 		}
785 		if (abstime != NULL)
786 			abs_timeout_update(abstime);
787 		umtxq_lock(&uq->uq_key);
788 	}
789 	return (error);
790 }
791 
792 /*
793  * Convert userspace address into unique logical address.
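 * Private (per-process) keys are identified by the owning vmspace and
 * virtual address; process-shared keys are identified by the backing
 * VM object and offset, so different mappings of the same object
 * resolve to the same key.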
794  */
795 int
796 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
797 {
798 	struct thread *td = curthread;
799 	vm_map_t map;
800 	vm_map_entry_t entry;
801 	vm_pindex_t pindex;
802 	vm_prot_t prot;
803 	boolean_t wired;
804 
805 	key->type = type;
806 	if (share == THREAD_SHARE) {
807 		key->shared = 0;
808 		key->info.private.vs = td->td_proc->p_vmspace;
809 		key->info.private.addr = (uintptr_t)addr;
810 	} else {
811 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
812 		map = &td->td_proc->p_vmspace->vm_map;
813 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
814 		    &entry, &key->info.shared.object, &pindex, &prot,
815 		    &wired) != KERN_SUCCESS) {
816 			return EFAULT;
817 		}
818 
819 		if ((share == PROCESS_SHARE) ||
820 		    (share == AUTO_SHARE &&
821 		     VM_INHERIT_SHARE == entry->inheritance)) {
822 			key->shared = 1;
823 			key->info.shared.offset = entry->offset + entry->start -
824 				(vm_offset_t)addr;
825 			vm_object_reference(key->info.shared.object);
826 		} else {
827 			key->shared = 0;
828 			key->info.private.vs = td->td_proc->p_vmspace;
829 			key->info.private.addr = (uintptr_t)addr;
830 		}
831 		vm_map_lookup_done(map, entry);
832 	}
833 
834 	umtxq_hash(key);
835 	return (0);
836 }
837 
838 /*
839  * Release key.
840  */
841 void
842 umtx_key_release(struct umtx_key *key)
843 {
844 	if (key->shared)
845 		vm_object_deallocate(key->info.shared.object);
846 }
847 
848 /*
849  * Fetch and compare value, sleep on the address if value is not changed.
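 * The waiter is inserted on the sleep queue before the value is read,
 * so a wakeup that races with the comparison is not lost.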
850  */
851 static int
852 do_wait(struct thread *td, void *addr, u_long id,
853 	struct _umtx_time *timeout, int compat32, int is_private)
854 {
855 	struct abs_timeout timo;
856 	struct umtx_q *uq;
857 	u_long tmp;
858 	uint32_t tmp32;
859 	int error = 0;
860 
861 	uq = td->td_umtxq;
862 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
863 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
864 		return (error);
865 
866 	if (timeout != NULL)
867 		abs_timeout_init2(&timo, timeout);
868 
869 	umtxq_lock(&uq->uq_key);
870 	umtxq_insert(uq);
871 	umtxq_unlock(&uq->uq_key);
872 	if (compat32 == 0) {
873 		error = fueword(addr, &tmp);
874 		if (error != 0)
875 			error = EFAULT;
876 	} else {
877 		error = fueword32(addr, &tmp32);
878 		if (error == 0)
879 			tmp = tmp32;
880 		else
881 			error = EFAULT;
882 	}
883 	umtxq_lock(&uq->uq_key);
884 	if (error == 0) {
885 		if (tmp == id)
886 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
887 			    NULL : &timo);
888 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
889 			error = 0;
890 		else
891 			umtxq_remove(uq);
892 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
893 		umtxq_remove(uq);
894 	}
895 	umtxq_unlock(&uq->uq_key);
896 	umtx_key_release(&uq->uq_key);
897 	if (error == ERESTART)
898 		error = EINTR;
899 	return (error);
900 }
901 
902 /*
903  * Wake up threads sleeping on the specified address.
904  */
905 int
906 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
907 {
908 	struct umtx_key key;
909 	int ret;
910 
911 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
912 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
913 		return (ret);
914 	umtxq_lock(&key);
915 	ret = umtxq_signal(&key, n_wake);
916 	umtxq_unlock(&key);
917 	umtx_key_release(&key);
918 	return (0);
919 }
920 
921 /*
922  * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
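 * The owner word holds the owning thread's id: UMUTEX_UNOWNED (0) means
 * free, and the UMUTEX_CONTESTED bit marks that waiters may exist so a
 * userland unlock must enter the kernel.  Locking is attempted with
 * compare-and-swap; on contention the contested bit is set and the
 * thread sleeps on the key's wait queue.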
923  */
924 static int
925 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
926 	struct _umtx_time *timeout, int mode)
927 {
928 	struct abs_timeout timo;
929 	struct umtx_q *uq;
930 	uint32_t owner, old, id;
931 	int error, rv;
932 
933 	id = td->td_tid;
934 	uq = td->td_umtxq;
935 	error = 0;
936 	if (timeout != NULL)
937 		abs_timeout_init2(&timo, timeout);
938 
939 	/*
940 	 * Care must be exercised when dealing with the umtx structure. It
941 	 * can fault on any access.
942 	 */
943 	for (;;) {
944 		rv = fueword32(&m->m_owner, &owner);
945 		if (rv == -1)
946 			return (EFAULT);
947 		if (mode == _UMUTEX_WAIT) {
948 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
949 				return (0);
950 		} else {
951 			/*
952 			 * Try the uncontested case.  This should be done in userland.
953 			 */
954 			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
955 			    &owner, id);
956 			/* The address was invalid. */
957 			if (rv == -1)
958 				return (EFAULT);
959 
960 			/* The acquire succeeded. */
961 			if (owner == UMUTEX_UNOWNED)
962 				return (0);
963 
964 			/* If no one owns it but it is contested try to acquire it. */
965 			if (owner == UMUTEX_CONTESTED) {
966 				rv = casueword32(&m->m_owner,
967 				    UMUTEX_CONTESTED, &owner,
968 				    id | UMUTEX_CONTESTED);
969 				/* The address was invalid. */
970 				if (rv == -1)
971 					return (EFAULT);
972 
973 				if (owner == UMUTEX_CONTESTED)
974 					return (0);
975 
976 				rv = umtxq_check_susp(td);
977 				if (rv != 0)
978 					return (rv);
979 
980 				/* If this failed the lock has changed, restart. */
981 				continue;
982 			}
983 		}
984 
985 		if (mode == _UMUTEX_TRY)
986 			return (EBUSY);
987 
988 		/*
989 		 * If we caught a signal during a previous sleep, we have
990 		 * already retried and now exit immediately.
991 		 */
992 		if (error != 0)
993 			return (error);
994 
995 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
996 		    GET_SHARE(flags), &uq->uq_key)) != 0)
997 			return (error);
998 
999 		umtxq_lock(&uq->uq_key);
1000 		umtxq_busy(&uq->uq_key);
1001 		umtxq_insert(uq);
1002 		umtxq_unlock(&uq->uq_key);
1003 
1004 		/*
1005 		 * Set the contested bit so that a release in user space
1006 		 * knows to use the system call for unlock.  If this fails
1007 		 * either someone else has acquired the lock or it has been
1008 		 * released.
1009 		 */
1010 		rv = casueword32(&m->m_owner, owner, &old,
1011 		    owner | UMUTEX_CONTESTED);
1012 
1013 		/* The address was invalid. */
1014 		if (rv == -1) {
1015 			umtxq_lock(&uq->uq_key);
1016 			umtxq_remove(uq);
1017 			umtxq_unbusy(&uq->uq_key);
1018 			umtxq_unlock(&uq->uq_key);
1019 			umtx_key_release(&uq->uq_key);
1020 			return (EFAULT);
1021 		}
1022 
1023 		/*
1024 		 * If we set the contested bit, sleep.  Otherwise the lock changed
1025 		 * and we need to retry, or we lost a race to the thread
1026 		 * unlocking the umtx.
1027 		 */
1028 		umtxq_lock(&uq->uq_key);
1029 		umtxq_unbusy(&uq->uq_key);
1030 		if (old == owner)
1031 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1032 			    NULL : &timo);
1033 		umtxq_remove(uq);
1034 		umtxq_unlock(&uq->uq_key);
1035 		umtx_key_release(&uq->uq_key);
1036 
1037 		if (error == 0)
1038 			error = umtxq_check_susp(td);
1039 	}
1040 
1041 	return (0);
1042 }
1043 
1044 /*
1045  * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1046  */
1047 static int
1048 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1049 {
1050 	struct umtx_key key;
1051 	uint32_t owner, old, id;
1052 	int error;
1053 	int count;
1054 
1055 	id = td->td_tid;
1056 	/*
1057 	 * Make sure we own this mtx.
1058 	 */
1059 	error = fueword32(&m->m_owner, &owner);
1060 	if (error == -1)
1061 		return (EFAULT);
1062 
1063 	if ((owner & ~UMUTEX_CONTESTED) != id)
1064 		return (EPERM);
1065 
1066 	if ((owner & UMUTEX_CONTESTED) == 0) {
1067 		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
1068 		if (error == -1)
1069 			return (EFAULT);
1070 		if (old == owner)
1071 			return (0);
1072 		owner = old;
1073 	}
1074 
1075 	/* We should only ever be in here for contested locks */
1076 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1077 	    &key)) != 0)
1078 		return (error);
1079 
1080 	umtxq_lock(&key);
1081 	umtxq_busy(&key);
1082 	count = umtxq_count(&key);
1083 	umtxq_unlock(&key);
1084 
1085 	/*
1086 	 * When unlocking the umtx, it must be marked as unowned if
1087 	 * no more than one thread is waiting for it.
1088 	 * Otherwise, it must be marked as contested.
1089 	 */
1090 	error = casueword32(&m->m_owner, owner, &old,
1091 	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1092 	umtxq_lock(&key);
1093 	umtxq_signal(&key,1);
1094 	umtxq_unbusy(&key);
1095 	umtxq_unlock(&key);
1096 	umtx_key_release(&key);
1097 	if (error == -1)
1098 		return (EFAULT);
1099 	if (old != owner)
1100 		return (EINVAL);
1101 	return (0);
1102 }
1103 
1104 /*
1105  * Check if the mutex is available and wake up a waiter;
1106  * used only for simple (normal) mutexes.
1107  */
1108 static int
1109 do_wake_umutex(struct thread *td, struct umutex *m)
1110 {
1111 	struct umtx_key key;
1112 	uint32_t owner;
1113 	uint32_t flags;
1114 	int error;
1115 	int count;
1116 
1117 	error = fueword32(&m->m_owner, &owner);
1118 	if (error == -1)
1119 		return (EFAULT);
1120 
1121 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1122 		return (0);
1123 
1124 	error = fueword32(&m->m_flags, &flags);
1125 	if (error == -1)
1126 		return (EFAULT);
1127 
1128 	/* We should only ever be in here for contested locks */
1129 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1130 	    &key)) != 0)
1131 		return (error);
1132 
1133 	umtxq_lock(&key);
1134 	umtxq_busy(&key);
1135 	count = umtxq_count(&key);
1136 	umtxq_unlock(&key);
1137 
1138 	if (count <= 1) {
1139 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1140 		    UMUTEX_UNOWNED);
1141 		if (error == -1)
1142 			error = EFAULT;
1143 	}
1144 
1145 	umtxq_lock(&key);
1146 	if (error == 0 && count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1147 		umtxq_signal(&key, 1);
1148 	umtxq_unbusy(&key);
1149 	umtxq_unlock(&key);
1150 	umtx_key_release(&key);
1151 	return (error);
1152 }
1153 
1154 /*
1155  * Check if the mutex has waiters and try to fix the contention bit.
1156  */
1157 static int
1158 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1159 {
1160 	struct umtx_key key;
1161 	uint32_t owner, old;
1162 	int type;
1163 	int error;
1164 	int count;
1165 
1166 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1167 	case 0:
1168 		type = TYPE_NORMAL_UMUTEX;
1169 		break;
1170 	case UMUTEX_PRIO_INHERIT:
1171 		type = TYPE_PI_UMUTEX;
1172 		break;
1173 	case UMUTEX_PRIO_PROTECT:
1174 		type = TYPE_PP_UMUTEX;
1175 		break;
1176 	default:
1177 		return (EINVAL);
1178 	}
1179 	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
1180 	    &key)) != 0)
1181 		return (error);
1182 
1183 	owner = 0;
1184 	umtxq_lock(&key);
1185 	umtxq_busy(&key);
1186 	count = umtxq_count(&key);
1187 	umtxq_unlock(&key);
1188 	/*
1189 	 * Only repair the contention bit if there is a waiter; that means the
1190 	 * mutex is still being referenced by userland code.  Otherwise, don't
1191 	 * update any memory.
1192 	 */
1193 	if (count > 1) {
1194 		error = fueword32(&m->m_owner, &owner);
1195 		if (error == -1)
1196 			error = EFAULT;
1197 		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
1198 			error = casueword32(&m->m_owner, owner, &old,
1199 			    owner | UMUTEX_CONTESTED);
1200 			if (error == -1) {
1201 				error = EFAULT;
1202 				break;
1203 			}
1204 			if (old == owner)
1205 				break;
1206 			owner = old;
1207 			error = umtxq_check_susp(td);
1208 			if (error != 0)
1209 				break;
1210 		}
1211 	} else if (count == 1) {
1212 		error = fueword32(&m->m_owner, &owner);
1213 		if (error == -1)
1214 			error = EFAULT;
1215 		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
1216 		       (owner & UMUTEX_CONTESTED) == 0) {
1217 			error = casueword32(&m->m_owner, owner, &old,
1218 			    owner | UMUTEX_CONTESTED);
1219 			if (error == -1) {
1220 				error = EFAULT;
1221 				break;
1222 			}
1223 			if (old == owner)
1224 				break;
1225 			owner = old;
1226 			error = umtxq_check_susp(td);
1227 			if (error != 0)
1228 				break;
1229 		}
1230 	}
1231 	umtxq_lock(&key);
1232 	if (error == EFAULT) {
1233 		umtxq_signal(&key, INT_MAX);
1234 	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1235 		umtxq_signal(&key, 1);
1236 	umtxq_unbusy(&key);
1237 	umtxq_unlock(&key);
1238 	umtx_key_release(&key);
1239 	return (error);
1240 }
1241 
1242 static inline struct umtx_pi *
1243 umtx_pi_alloc(int flags)
1244 {
1245 	struct umtx_pi *pi;
1246 
1247 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1248 	TAILQ_INIT(&pi->pi_blocked);
1249 	atomic_add_int(&umtx_pi_allocated, 1);
1250 	return (pi);
1251 }
1252 
1253 static inline void
1254 umtx_pi_free(struct umtx_pi *pi)
1255 {
1256 	uma_zfree(umtx_pi_zone, pi);
1257 	atomic_add_int(&umtx_pi_allocated, -1);
1258 }
1259 
1260 /*
1261  * Adjust the thread's position on the PI mutex's blocked-thread list
1262  * after its priority has been changed.
1263  */
1264 static int
1265 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1266 {
1267 	struct umtx_q *uq, *uq1, *uq2;
1268 	struct thread *td1;
1269 
1270 	mtx_assert(&umtx_lock, MA_OWNED);
1271 	if (pi == NULL)
1272 		return (0);
1273 
1274 	uq = td->td_umtxq;
1275 
1276 	/*
1277 	 * Check if the thread needs to be moved on the blocked chain.
1278 	 * It needs to be moved if either its priority is lower than
1279 	 * the previous thread or higher than the next thread.
1280 	 */
1281 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1282 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1283 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1284 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1285 		/*
1286 		 * Remove thread from blocked chain and determine where
1287 		 * it should be moved to.
1288 		 */
1289 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1290 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1291 			td1 = uq1->uq_thread;
1292 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1293 			if (UPRI(td1) > UPRI(td))
1294 				break;
1295 		}
1296 
1297 		if (uq1 == NULL)
1298 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1299 		else
1300 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1301 	}
1302 	return (1);
1303 }
1304 
1305 static struct umtx_pi *
1306 umtx_pi_next(struct umtx_pi *pi)
1307 {
1308 	struct umtx_q *uq_owner;
1309 
1310 	if (pi->pi_owner == NULL)
1311 		return (NULL);
1312 	uq_owner = pi->pi_owner->td_umtxq;
1313 	if (uq_owner == NULL)
1314 		return (NULL);
1315 	return (uq_owner->uq_pi_blocked);
1316 }
1317 
1318 /*
1319  * Floyd's Cycle-Finding Algorithm.
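 * The slow pointer (pi) advances one owner/blocked link per iteration
 * while the fast pointer (pi1) advances two; if they ever meet, the
 * chain of PI mutex ownership contains a cycle and priority
 * propagation must stop to avoid looping forever.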
1320  */
1321 static bool
1322 umtx_pi_check_loop(struct umtx_pi *pi)
1323 {
1324 	struct umtx_pi *pi1;	/* fast iterator */
1325 
1326 	mtx_assert(&umtx_lock, MA_OWNED);
1327 	if (pi == NULL)
1328 		return (false);
1329 	pi1 = pi;
1330 	for (;;) {
1331 		pi = umtx_pi_next(pi);
1332 		if (pi == NULL)
1333 			break;
1334 		pi1 = umtx_pi_next(pi1);
1335 		if (pi1 == NULL)
1336 			break;
1337 		pi1 = umtx_pi_next(pi1);
1338 		if (pi1 == NULL)
1339 			break;
1340 		if (pi == pi1)
1341 			return (true);
1342 	}
1343 	return (false);
1344 }
1345 
1346 /*
1347  * Propagate priority when a thread is blocked on a POSIX
1348  * PI mutex.
1349  */
1350 static void
1351 umtx_propagate_priority(struct thread *td)
1352 {
1353 	struct umtx_q *uq;
1354 	struct umtx_pi *pi;
1355 	int pri;
1356 
1357 	mtx_assert(&umtx_lock, MA_OWNED);
1358 	pri = UPRI(td);
1359 	uq = td->td_umtxq;
1360 	pi = uq->uq_pi_blocked;
1361 	if (pi == NULL)
1362 		return;
1363 	if (umtx_pi_check_loop(pi))
1364 		return;
1365 
1366 	for (;;) {
1367 		td = pi->pi_owner;
1368 		if (td == NULL || td == curthread)
1369 			return;
1370 
1371 		MPASS(td->td_proc != NULL);
1372 		MPASS(td->td_proc->p_magic == P_MAGIC);
1373 
1374 		thread_lock(td);
1375 		if (td->td_lend_user_pri > pri)
1376 			sched_lend_user_prio(td, pri);
1377 		else {
1378 			thread_unlock(td);
1379 			break;
1380 		}
1381 		thread_unlock(td);
1382 
1383 		/*
1384 		 * Pick up the lock that td is blocked on.
1385 		 */
1386 		uq = td->td_umtxq;
1387 		pi = uq->uq_pi_blocked;
1388 		if (pi == NULL)
1389 			break;
1390 		/* Resort td on the list if needed. */
1391 		umtx_pi_adjust_thread(pi, td);
1392 	}
1393 }
1394 
1395 /*
1396  * Unpropagate priority for a PI mutex when a thread blocked on
1397  * it is interrupted by a signal or resumed by others.
1398  */
1399 static void
1400 umtx_repropagate_priority(struct umtx_pi *pi)
1401 {
1402 	struct umtx_q *uq, *uq_owner;
1403 	struct umtx_pi *pi2;
1404 	int pri;
1405 
1406 	mtx_assert(&umtx_lock, MA_OWNED);
1407 
1408 	if (umtx_pi_check_loop(pi))
1409 		return;
1410 	while (pi != NULL && pi->pi_owner != NULL) {
1411 		pri = PRI_MAX;
1412 		uq_owner = pi->pi_owner->td_umtxq;
1413 
1414 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1415 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1416 			if (uq != NULL) {
1417 				if (pri > UPRI(uq->uq_thread))
1418 					pri = UPRI(uq->uq_thread);
1419 			}
1420 		}
1421 
1422 		if (pri > uq_owner->uq_inherited_pri)
1423 			pri = uq_owner->uq_inherited_pri;
1424 		thread_lock(pi->pi_owner);
1425 		sched_lend_user_prio(pi->pi_owner, pri);
1426 		thread_unlock(pi->pi_owner);
1427 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1428 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1429 	}
1430 }
1431 
1432 /*
1433  * Insert a PI mutex into the owner's contested list.
1434  */
1435 static void
1436 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1437 {
1438 	struct umtx_q *uq_owner;
1439 
1440 	uq_owner = owner->td_umtxq;
1441 	mtx_assert(&umtx_lock, MA_OWNED);
1442 	if (pi->pi_owner != NULL)
1443 		panic("pi_owner != NULL");
1444 	pi->pi_owner = owner;
1445 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1446 }
1447 
1448 /*
1449  * Claim ownership of a PI mutex.
1450  */
1451 static int
1452 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1453 {
1454 	struct umtx_q *uq, *uq_owner;
1455 
1456 	uq_owner = owner->td_umtxq;
1457 	mtx_lock_spin(&umtx_lock);
1458 	if (pi->pi_owner == owner) {
1459 		mtx_unlock_spin(&umtx_lock);
1460 		return (0);
1461 	}
1462 
1463 	if (pi->pi_owner != NULL) {
1464 		/*
1465 		 * userland may have already messed the mutex, sigh.
1466 		 */
1467 		mtx_unlock_spin(&umtx_lock);
1468 		return (EPERM);
1469 	}
1470 	umtx_pi_setowner(pi, owner);
1471 	uq = TAILQ_FIRST(&pi->pi_blocked);
1472 	if (uq != NULL) {
1473 		int pri;
1474 
1475 		pri = UPRI(uq->uq_thread);
1476 		thread_lock(owner);
1477 		if (pri < UPRI(owner))
1478 			sched_lend_user_prio(owner, pri);
1479 		thread_unlock(owner);
1480 	}
1481 	mtx_unlock_spin(&umtx_lock);
1482 	return (0);
1483 }
1484 
1485 /*
1486  * Adjust a thread's position on the blocked list of the PI mutex it is
1487  * blocked on; this may trigger a new round of priority propagation.
1488  */
1489 void
1490 umtx_pi_adjust(struct thread *td, u_char oldpri)
1491 {
1492 	struct umtx_q *uq;
1493 	struct umtx_pi *pi;
1494 
1495 	uq = td->td_umtxq;
1496 	mtx_lock_spin(&umtx_lock);
1497 	/*
1498 	 * Pick up the lock that td is blocked on.
1499 	 */
1500 	pi = uq->uq_pi_blocked;
1501 	if (pi != NULL) {
1502 		umtx_pi_adjust_thread(pi, td);
1503 		umtx_repropagate_priority(pi);
1504 	}
1505 	mtx_unlock_spin(&umtx_lock);
1506 }
1507 
1508 /*
1509  * Sleep on a PI mutex.
1510  */
1511 static int
1512 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1513 	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
1514 {
1515 	struct umtxq_chain *uc;
1516 	struct thread *td, *td1;
1517 	struct umtx_q *uq1;
1518 	int pri;
1519 	int error = 0;
1520 
1521 	td = uq->uq_thread;
1522 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1523 	uc = umtxq_getchain(&uq->uq_key);
1524 	UMTXQ_LOCKED_ASSERT(uc);
1525 	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
1526 	umtxq_insert(uq);
1527 	mtx_lock_spin(&umtx_lock);
1528 	if (pi->pi_owner == NULL) {
1529 		mtx_unlock_spin(&umtx_lock);
1530 		/* XXX Only look up thread in current process. */
1531 		td1 = tdfind(owner, curproc->p_pid);
1532 		mtx_lock_spin(&umtx_lock);
1533 		if (td1 != NULL) {
1534 			if (pi->pi_owner == NULL)
1535 				umtx_pi_setowner(pi, td1);
1536 			PROC_UNLOCK(td1->td_proc);
1537 		}
1538 	}
1539 
1540 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1541 		pri = UPRI(uq1->uq_thread);
1542 		if (pri > UPRI(td))
1543 			break;
1544 	}
1545 
1546 	if (uq1 != NULL)
1547 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1548 	else
1549 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1550 
1551 	uq->uq_pi_blocked = pi;
1552 	thread_lock(td);
1553 	td->td_flags |= TDF_UPIBLOCKED;
1554 	thread_unlock(td);
1555 	umtx_propagate_priority(td);
1556 	mtx_unlock_spin(&umtx_lock);
1557 	umtxq_unbusy(&uq->uq_key);
1558 
1559 	error = umtxq_sleep(uq, wmesg, timo);
1560 	umtxq_remove(uq);
1561 
1562 	mtx_lock_spin(&umtx_lock);
1563 	uq->uq_pi_blocked = NULL;
1564 	thread_lock(td);
1565 	td->td_flags &= ~TDF_UPIBLOCKED;
1566 	thread_unlock(td);
1567 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1568 	umtx_repropagate_priority(pi);
1569 	mtx_unlock_spin(&umtx_lock);
1570 	umtxq_unlock(&uq->uq_key);
1571 
1572 	return (error);
1573 }
1574 
1575 /*
1576  * Add a reference to a PI mutex.
1577  */
1578 static void
1579 umtx_pi_ref(struct umtx_pi *pi)
1580 {
1581 	struct umtxq_chain *uc;
1582 
1583 	uc = umtxq_getchain(&pi->pi_key);
1584 	UMTXQ_LOCKED_ASSERT(uc);
1585 	pi->pi_refcount++;
1586 }
1587 
1588 /*
1589  * Decrease the reference count of a PI mutex; if the count
1590  * drops to zero, its memory is freed.
1591  */
1592 static void
1593 umtx_pi_unref(struct umtx_pi *pi)
1594 {
1595 	struct umtxq_chain *uc;
1596 
1597 	uc = umtxq_getchain(&pi->pi_key);
1598 	UMTXQ_LOCKED_ASSERT(uc);
1599 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1600 	if (--pi->pi_refcount == 0) {
1601 		mtx_lock_spin(&umtx_lock);
1602 		if (pi->pi_owner != NULL) {
1603 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1604 				pi, pi_link);
1605 			pi->pi_owner = NULL;
1606 		}
1607 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1608 			("blocked queue not empty"));
1609 		mtx_unlock_spin(&umtx_lock);
1610 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1611 		umtx_pi_free(pi);
1612 	}
1613 }
1614 
1615 /*
1616  * Find a PI mutex in the hash table.
1617  */
1618 static struct umtx_pi *
1619 umtx_pi_lookup(struct umtx_key *key)
1620 {
1621 	struct umtxq_chain *uc;
1622 	struct umtx_pi *pi;
1623 
1624 	uc = umtxq_getchain(key);
1625 	UMTXQ_LOCKED_ASSERT(uc);
1626 
1627 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1628 		if (umtx_key_match(&pi->pi_key, key)) {
1629 			return (pi);
1630 		}
1631 	}
1632 	return (NULL);
1633 }
1634 
1635 /*
1636  * Insert a PI mutex into hash table.
1637  */
1638 static inline void
1639 umtx_pi_insert(struct umtx_pi *pi)
1640 {
1641 	struct umtxq_chain *uc;
1642 
1643 	uc = umtxq_getchain(&pi->pi_key);
1644 	UMTXQ_LOCKED_ASSERT(uc);
1645 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1646 }
1647 
1648 /*
1649  * Lock a PI mutex.
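 * A kernel umtx_pi object is looked up (or allocated) for the key so
 * that ownership and the list of blocked threads can be tracked for
 * priority inheritance; the allocation is first tried with M_NOWAIT
 * under the chain lock and retried with M_WAITOK after dropping the
 * lock if that fails.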
1650  */
1651 static int
1652 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1653     struct _umtx_time *timeout, int try)
1654 {
1655 	struct abs_timeout timo;
1656 	struct umtx_q *uq;
1657 	struct umtx_pi *pi, *new_pi;
1658 	uint32_t id, owner, old;
1659 	int error, rv;
1660 
1661 	id = td->td_tid;
1662 	uq = td->td_umtxq;
1663 
1664 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1665 	    &uq->uq_key)) != 0)
1666 		return (error);
1667 
1668 	if (timeout != NULL)
1669 		abs_timeout_init2(&timo, timeout);
1670 
1671 	umtxq_lock(&uq->uq_key);
1672 	pi = umtx_pi_lookup(&uq->uq_key);
1673 	if (pi == NULL) {
1674 		new_pi = umtx_pi_alloc(M_NOWAIT);
1675 		if (new_pi == NULL) {
1676 			umtxq_unlock(&uq->uq_key);
1677 			new_pi = umtx_pi_alloc(M_WAITOK);
1678 			umtxq_lock(&uq->uq_key);
1679 			pi = umtx_pi_lookup(&uq->uq_key);
1680 			if (pi != NULL) {
1681 				umtx_pi_free(new_pi);
1682 				new_pi = NULL;
1683 			}
1684 		}
1685 		if (new_pi != NULL) {
1686 			new_pi->pi_key = uq->uq_key;
1687 			umtx_pi_insert(new_pi);
1688 			pi = new_pi;
1689 		}
1690 	}
1691 	umtx_pi_ref(pi);
1692 	umtxq_unlock(&uq->uq_key);
1693 
1694 	/*
1695 	 * Care must be exercised when dealing with the umtx structure.  It
1696 	 * can fault on any access.
1697 	 */
1698 	for (;;) {
1699 		/*
1700 		 * Try the uncontested case.  This should be done in userland.
1701 		 */
1702 		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
1703 		/* The address was invalid. */
1704 		if (rv == -1) {
1705 			error = EFAULT;
1706 			break;
1707 		}
1708 
1709 		/* The acquire succeeded. */
1710 		if (owner == UMUTEX_UNOWNED) {
1711 			error = 0;
1712 			break;
1713 		}
1714 
1715 		/* If no one owns it but it is contested try to acquire it. */
1716 		if (owner == UMUTEX_CONTESTED) {
1717 			rv = casueword32(&m->m_owner,
1718 			    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
1719 			/* The address was invalid. */
1720 			if (rv == -1) {
1721 				error = EFAULT;
1722 				break;
1723 			}
1724 
1725 			if (owner == UMUTEX_CONTESTED) {
1726 				umtxq_lock(&uq->uq_key);
1727 				umtxq_busy(&uq->uq_key);
1728 				error = umtx_pi_claim(pi, td);
1729 				umtxq_unbusy(&uq->uq_key);
1730 				umtxq_unlock(&uq->uq_key);
1731 				break;
1732 			}
1733 
1734 			error = umtxq_check_susp(td);
1735 			if (error != 0)
1736 				break;
1737 
1738 			/* If this failed the lock has changed, restart. */
1739 			continue;
1740 		}
1741 
1742 		if ((owner & ~UMUTEX_CONTESTED) == id) {
1743 			error = EDEADLK;
1744 			break;
1745 		}
1746 
1747 		if (try != 0) {
1748 			error = EBUSY;
1749 			break;
1750 		}
1751 
1752 		/*
1753 		 * If we caught a signal during a previous sleep, we have
1754 		 * already retried and now exit immediately.
1755 		 */
1756 		if (error != 0)
1757 			break;
1758 
1759 		umtxq_lock(&uq->uq_key);
1760 		umtxq_busy(&uq->uq_key);
1761 		umtxq_unlock(&uq->uq_key);
1762 
1763 		/*
1764 		 * Set the contested bit so that a release in user space
1765 		 * knows to use the system call for unlock.  If this fails
1766 		 * either someone else has acquired the lock or it has been
1767 		 * released.
1768 		 */
1769 		rv = casueword32(&m->m_owner, owner, &old,
1770 		    owner | UMUTEX_CONTESTED);
1771 
1772 		/* The address was invalid. */
1773 		if (rv == -1) {
1774 			umtxq_unbusy_unlocked(&uq->uq_key);
1775 			error = EFAULT;
1776 			break;
1777 		}
1778 
1779 		umtxq_lock(&uq->uq_key);
1780 		/*
1781 		 * If we set the contested bit, sleep.  Otherwise the lock changed
1782 		 * and we need to retry, or we lost a race to the thread
1783 		 * unlocking the umtx.
1784 		 */
1785 		if (old == owner) {
1786 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1787 			    "umtxpi", timeout == NULL ? NULL : &timo);
1788 			if (error != 0)
1789 				continue;
1790 		} else {
1791 			umtxq_unbusy(&uq->uq_key);
1792 			umtxq_unlock(&uq->uq_key);
1793 		}
1794 
1795 		error = umtxq_check_susp(td);
1796 		if (error != 0)
1797 			break;
1798 	}
1799 
1800 	umtxq_lock(&uq->uq_key);
1801 	umtx_pi_unref(pi);
1802 	umtxq_unlock(&uq->uq_key);
1803 
1804 	umtx_key_release(&uq->uq_key);
1805 	return (error);
1806 }
1807 
1808 /*
1809  * Unlock a PI mutex.
1810  */
1811 static int
1812 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1813 {
1814 	struct umtx_key key;
1815 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1816 	struct umtx_pi *pi, *pi2;
1817 	uint32_t owner, old, id;
1818 	int error;
1819 	int count;
1820 	int pri;
1821 
1822 	id = td->td_tid;
1823 	/*
1824 	 * Make sure we own this mtx.
1825 	 */
1826 	error = fueword32(&m->m_owner, &owner);
1827 	if (error == -1)
1828 		return (EFAULT);
1829 
1830 	if ((owner & ~UMUTEX_CONTESTED) != id)
1831 		return (EPERM);
1832 
1833 	/* This should be done in userland */
1834 	if ((owner & UMUTEX_CONTESTED) == 0) {
1835 		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
1836 		if (error == -1)
1837 			return (EFAULT);
1838 		if (old == owner)
1839 			return (0);
1840 		owner = old;
1841 	}
1842 
1843 	/* We should only ever be in here for contested locks */
1844 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1845 	    &key)) != 0)
1846 		return (error);
1847 
1848 	umtxq_lock(&key);
1849 	umtxq_busy(&key);
1850 	count = umtxq_count_pi(&key, &uq_first);
1851 	if (uq_first != NULL) {
1852 		mtx_lock_spin(&umtx_lock);
1853 		pi = uq_first->uq_pi_blocked;
1854 		KASSERT(pi != NULL, ("pi == NULL?"));
1855 		if (pi->pi_owner != curthread) {
1856 			mtx_unlock_spin(&umtx_lock);
1857 			umtxq_unbusy(&key);
1858 			umtxq_unlock(&key);
1859 			umtx_key_release(&key);
1860 			/* userland messed the mutex */
1861 			return (EPERM);
1862 		}
1863 		uq_me = curthread->td_umtxq;
1864 		pi->pi_owner = NULL;
1865 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1866 		/* Get the highest-priority thread that is still sleeping. */
1867 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1868 		while (uq_first != NULL &&
1869 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1870 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1871 		}
1872 		pri = PRI_MAX;
1873 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1874 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1875 			if (uq_first2 != NULL) {
1876 				if (pri > UPRI(uq_first2->uq_thread))
1877 					pri = UPRI(uq_first2->uq_thread);
1878 			}
1879 		}
1880 		thread_lock(curthread);
1881 		sched_lend_user_prio(curthread, pri);
1882 		thread_unlock(curthread);
1883 		mtx_unlock_spin(&umtx_lock);
1884 		if (uq_first)
1885 			umtxq_signal_thread(uq_first);
1886 	}
1887 	umtxq_unlock(&key);
1888 
1889 	/*
1890 	 * When unlocking the umtx, it must be marked as unowned if
1891 	 * no more than one thread is waiting for it.
1892 	 * Otherwise, it must be marked as contested.
1893 	 */
1894 	error = casueword32(&m->m_owner, owner, &old,
1895 	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1896 
1897 	umtxq_unbusy_unlocked(&key);
1898 	umtx_key_release(&key);
1899 	if (error == -1)
1900 		return (EFAULT);
1901 	if (old != owner)
1902 		return (EINVAL);
1903 	return (0);
1904 }
1905 
1906 /*
1907  * Lock a PP mutex.
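 * The priority ceiling is stored in m_ceilings[0] as an RTP priority;
 * the kernel converts it to PRI_MIN_REALTIME + (RTP_PRIO_MAX - ceiling)
 * and, for threads with the realtime-scheduling privilege, lends itself
 * that priority while holding the mutex.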
1908  */
1909 static int
1910 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
1911     struct _umtx_time *timeout, int try)
1912 {
1913 	struct abs_timeout timo;
1914 	struct umtx_q *uq, *uq2;
1915 	struct umtx_pi *pi;
1916 	uint32_t ceiling;
1917 	uint32_t owner, id;
1918 	int error, pri, old_inherited_pri, su, rv;
1919 
1920 	id = td->td_tid;
1921 	uq = td->td_umtxq;
1922 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1923 	    &uq->uq_key)) != 0)
1924 		return (error);
1925 
1926 	if (timeout != NULL)
1927 		abs_timeout_init2(&timo, timeout);
1928 
1929 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1930 	for (;;) {
1931 		old_inherited_pri = uq->uq_inherited_pri;
1932 		umtxq_lock(&uq->uq_key);
1933 		umtxq_busy(&uq->uq_key);
1934 		umtxq_unlock(&uq->uq_key);
1935 
1936 		rv = fueword32(&m->m_ceilings[0], &ceiling);
1937 		if (rv == -1) {
1938 			error = EFAULT;
1939 			goto out;
1940 		}
1941 		ceiling = RTP_PRIO_MAX - ceiling;
1942 		if (ceiling > RTP_PRIO_MAX) {
1943 			error = EINVAL;
1944 			goto out;
1945 		}
1946 
1947 		mtx_lock_spin(&umtx_lock);
1948 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1949 			mtx_unlock_spin(&umtx_lock);
1950 			error = EINVAL;
1951 			goto out;
1952 		}
1953 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1954 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1955 			thread_lock(td);
1956 			if (uq->uq_inherited_pri < UPRI(td))
1957 				sched_lend_user_prio(td, uq->uq_inherited_pri);
1958 			thread_unlock(td);
1959 		}
1960 		mtx_unlock_spin(&umtx_lock);
1961 
1962 		rv = casueword32(&m->m_owner,
1963 		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
1964 		/* The address was invalid. */
1965 		if (rv == -1) {
1966 			error = EFAULT;
1967 			break;
1968 		}
1969 
1970 		if (owner == UMUTEX_CONTESTED) {
1971 			error = 0;
1972 			break;
1973 		}
1974 
1975 		if (try != 0) {
1976 			error = EBUSY;
1977 			break;
1978 		}
1979 
1980 		/*
1981 		 * If we caught a signal during a previous sleep, we have
1982 		 * already retried and now exit immediately.
1983 		 */
1984 		if (error != 0)
1985 			break;
1986 
1987 		umtxq_lock(&uq->uq_key);
1988 		umtxq_insert(uq);
1989 		umtxq_unbusy(&uq->uq_key);
1990 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
1991 		    NULL : &timo);
1992 		umtxq_remove(uq);
1993 		umtxq_unlock(&uq->uq_key);
1994 
1995 		mtx_lock_spin(&umtx_lock);
1996 		uq->uq_inherited_pri = old_inherited_pri;
1997 		pri = PRI_MAX;
1998 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1999 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2000 			if (uq2 != NULL) {
2001 				if (pri > UPRI(uq2->uq_thread))
2002 					pri = UPRI(uq2->uq_thread);
2003 			}
2004 		}
2005 		if (pri > uq->uq_inherited_pri)
2006 			pri = uq->uq_inherited_pri;
2007 		thread_lock(td);
2008 		sched_lend_user_prio(td, pri);
2009 		thread_unlock(td);
2010 		mtx_unlock_spin(&umtx_lock);
2011 	}
2012 
2013 	if (error != 0) {
2014 		mtx_lock_spin(&umtx_lock);
2015 		uq->uq_inherited_pri = old_inherited_pri;
2016 		pri = PRI_MAX;
2017 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2018 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2019 			if (uq2 != NULL) {
2020 				if (pri > UPRI(uq2->uq_thread))
2021 					pri = UPRI(uq2->uq_thread);
2022 			}
2023 		}
2024 		if (pri > uq->uq_inherited_pri)
2025 			pri = uq->uq_inherited_pri;
2026 		thread_lock(td);
2027 		sched_lend_user_prio(td, pri);
2028 		thread_unlock(td);
2029 		mtx_unlock_spin(&umtx_lock);
2030 	}
2031 
2032 out:
2033 	umtxq_unbusy_unlocked(&uq->uq_key);
2034 	umtx_key_release(&uq->uq_key);
2035 	return (error);
2036 }
2037 
2038 /*
2039  * Unlock a PP mutex.
2040  */
2041 static int
2042 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2043 {
2044 	struct umtx_key key;
2045 	struct umtx_q *uq, *uq2;
2046 	struct umtx_pi *pi;
2047 	uint32_t owner, id;
2048 	uint32_t rceiling;
2049 	int error, pri, new_inherited_pri, su;
2050 
2051 	id = td->td_tid;
2052 	uq = td->td_umtxq;
2053 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2054 
2055 	/*
2056 	 * Make sure we own this mtx.
2057 	 */
2058 	error = fueword32(&m->m_owner, &owner);
2059 	if (error == -1)
2060 		return (EFAULT);
2061 
2062 	if ((owner & ~UMUTEX_CONTESTED) != id)
2063 		return (EPERM);
2064 
2065 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2066 	if (error != 0)
2067 		return (error);
2068 
2069 	if (rceiling == -1)
2070 		new_inherited_pri = PRI_MAX;
2071 	else {
2072 		rceiling = RTP_PRIO_MAX - rceiling;
2073 		if (rceiling > RTP_PRIO_MAX)
2074 			return (EINVAL);
2075 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2076 	}
2077 
2078 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2079 	    &key)) != 0)
2080 		return (error);
2081 	umtxq_lock(&key);
2082 	umtxq_busy(&key);
2083 	umtxq_unlock(&key);
2084 	/*
2085 	 * For a priority-protected mutex, always set the unlocked state
2086 	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2087 	 * to lock the mutex; this is necessary because the thread
2088 	 * priority has to be adjusted for such a mutex.
2089 	 */
2090 	error = suword32(&m->m_owner, UMUTEX_CONTESTED);
2091 
2092 	umtxq_lock(&key);
2093 	if (error == 0)
2094 		umtxq_signal(&key, 1);
2095 	umtxq_unbusy(&key);
2096 	umtxq_unlock(&key);
2097 
2098 	if (error == -1)
2099 		error = EFAULT;
2100 	else {
2101 		mtx_lock_spin(&umtx_lock);
2102 		if (su != 0)
2103 			uq->uq_inherited_pri = new_inherited_pri;
2104 		pri = PRI_MAX;
2105 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2106 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2107 			if (uq2 != NULL) {
2108 				if (pri > UPRI(uq2->uq_thread))
2109 					pri = UPRI(uq2->uq_thread);
2110 			}
2111 		}
2112 		if (pri > uq->uq_inherited_pri)
2113 			pri = uq->uq_inherited_pri;
2114 		thread_lock(td);
2115 		sched_lend_user_prio(td, pri);
2116 		thread_unlock(td);
2117 		mtx_unlock_spin(&umtx_lock);
2118 	}
2119 	umtx_key_release(&key);
2120 	return (error);
2121 }
2122 
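/*
 * Change the priority ceiling of a PP mutex and, on success, report the
 * previous ceiling through *old_ceiling.
 */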
2123 static int
2124 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2125 	uint32_t *old_ceiling)
2126 {
2127 	struct umtx_q *uq;
2128 	uint32_t save_ceiling;
2129 	uint32_t owner, id;
2130 	uint32_t flags;
2131 	int error, rv;
2132 
2133 	error = fueword32(&m->m_flags, &flags);
2134 	if (error == -1)
2135 		return (EFAULT);
2136 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2137 		return (EINVAL);
2138 	if (ceiling > RTP_PRIO_MAX)
2139 		return (EINVAL);
2140 	id = td->td_tid;
2141 	uq = td->td_umtxq;
2142 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2143 	   &uq->uq_key)) != 0)
2144 		return (error);
2145 	for (;;) {
2146 		umtxq_lock(&uq->uq_key);
2147 		umtxq_busy(&uq->uq_key);
2148 		umtxq_unlock(&uq->uq_key);
2149 
2150 		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2151 		if (rv == -1) {
2152 			error = EFAULT;
2153 			break;
2154 		}
2155 
2156 		rv = casueword32(&m->m_owner,
2157 		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2158 		if (rv == -1) {
2159 			error = EFAULT;
2160 			break;
2161 		}
2162 
2163 		if (owner == UMUTEX_CONTESTED) {
2164 			suword32(&m->m_ceilings[0], ceiling);
2165 			suword32(&m->m_owner, UMUTEX_CONTESTED);
2166 			error = 0;
2167 			break;
2168 		}
2169 
2170 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2171 			suword32(&m->m_ceilings[0], ceiling);
2172 			error = 0;
2173 			break;
2174 		}
2175 
2176 		/*
2177 		 * If we caught a signal, we have already retried, so
2178 		 * exit immediately now.
2179 		 */
2180 		if (error != 0)
2181 			break;
2182 
2183 		/*
2184 		 * We have set the contested bit, so sleep. Otherwise the
2185 		 * lock changed and we need to retry, or we lost a race with
2186 		 * the thread unlocking the umtx.
2187 		 */
2188 		umtxq_lock(&uq->uq_key);
2189 		umtxq_insert(uq);
2190 		umtxq_unbusy(&uq->uq_key);
2191 		error = umtxq_sleep(uq, "umtxpp", NULL);
2192 		umtxq_remove(uq);
2193 		umtxq_unlock(&uq->uq_key);
2194 	}
2195 	umtxq_lock(&uq->uq_key);
2196 	if (error == 0)
2197 		umtxq_signal(&uq->uq_key, INT_MAX);
2198 	umtxq_unbusy(&uq->uq_key);
2199 	umtxq_unlock(&uq->uq_key);
2200 	umtx_key_release(&uq->uq_key);
2201 	if (error == 0 && old_ceiling != NULL)
2202 		suword32(old_ceiling, save_ceiling);
2203 	return (error);
2204 }
2205 
2206 /*
2207  * Lock a userland POSIX mutex.
2208  */
2209 static int
2210 do_lock_umutex(struct thread *td, struct umutex *m,
2211     struct _umtx_time *timeout, int mode)
2212 {
2213 	uint32_t flags;
2214 	int error;
2215 
2216 	error = fueword32(&m->m_flags, &flags);
2217 	if (error == -1)
2218 		return (EFAULT);
2219 
2220 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2221 	case 0:
2222 		error = do_lock_normal(td, m, flags, timeout, mode);
2223 		break;
2224 	case UMUTEX_PRIO_INHERIT:
2225 		error = do_lock_pi(td, m, flags, timeout, mode);
2226 		break;
2227 	case UMUTEX_PRIO_PROTECT:
2228 		error = do_lock_pp(td, m, flags, timeout, mode);
2229 		break;
2230 	default:
2231 		return (EINVAL);
2232 	}
2233 	if (timeout == NULL) {
2234 		if (error == EINTR && mode != _UMUTEX_WAIT)
2235 			error = ERESTART;
2236 	} else {
2237 		/* Timed-locking is not restarted. */
2238 		if (error == ERESTART)
2239 			error = EINTR;
2240 	}
2241 	return (error);
2242 }
2243 
2244 /*
2245  * Unlock a userland POSIX mutex.
2246  */
2247 static int
2248 do_unlock_umutex(struct thread *td, struct umutex *m)
2249 {
2250 	uint32_t flags;
2251 	int error;
2252 
2253 	error = fueword32(&m->m_flags, &flags);
2254 	if (error == -1)
2255 		return (EFAULT);
2256 
2257 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2258 	case 0:
2259 		return (do_unlock_normal(td, m, flags));
2260 	case UMUTEX_PRIO_INHERIT:
2261 		return (do_unlock_pi(td, m, flags));
2262 	case UMUTEX_PRIO_PROTECT:
2263 		return (do_unlock_pp(td, m, flags));
2264 	}
2265 
2266 	return (EINVAL);
2267 }
2268 
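/*
 * Wait on a userland condition variable; the given mutex is unlocked
 * before the thread sleeps.
 */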
2269 static int
2270 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2271 	struct timespec *timeout, u_long wflags)
2272 {
2273 	struct abs_timeout timo;
2274 	struct umtx_q *uq;
2275 	uint32_t flags, clockid, hasw;
2276 	int error;
2277 
2278 	uq = td->td_umtxq;
2279 	error = fueword32(&cv->c_flags, &flags);
2280 	if (error == -1)
2281 		return (EFAULT);
2282 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2283 	if (error != 0)
2284 		return (error);
2285 
2286 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2287 		error = fueword32(&cv->c_clockid, &clockid);
2288 		if (error == -1) {
2289 			umtx_key_release(&uq->uq_key);
2290 			return (EFAULT);
2291 		}
2292 		if (clockid < CLOCK_REALTIME ||
2293 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2294 			/* Only hardware clock ids will work. */
2295 			umtx_key_release(&uq->uq_key);
2296 			return (EINVAL);
2297 		}
2298 	} else {
2299 		clockid = CLOCK_REALTIME;
2300 	}
2301 
2302 	umtxq_lock(&uq->uq_key);
2303 	umtxq_busy(&uq->uq_key);
2304 	umtxq_insert(uq);
2305 	umtxq_unlock(&uq->uq_key);
2306 
2307 	/*
2308 	 * Set c_has_waiters to 1 before releasing the user mutex, but
2309 	 * avoid dirtying the cache line when it is unnecessary.
2310 	 */
2311 	error = fueword32(&cv->c_has_waiters, &hasw);
2312 	if (error == 0 && hasw == 0)
2313 		suword32(&cv->c_has_waiters, 1);
2314 
2315 	umtxq_unbusy_unlocked(&uq->uq_key);
2316 
2317 	error = do_unlock_umutex(td, m);
2318 
2319 	if (timeout != NULL)
2320 		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2321 			timeout);
2322 
2323 	umtxq_lock(&uq->uq_key);
2324 	if (error == 0) {
2325 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2326 		    NULL : &timo);
2327 	}
2328 
2329 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2330 		error = 0;
2331 	else {
2332 		/*
2333 		 * This must be a timeout, an interruption by a signal, or a
2334 		 * spurious wakeup; clear the c_has_waiters flag when
2335 		 * necessary.
2336 		 */
2337 		umtxq_busy(&uq->uq_key);
2338 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2339 			int oldlen = uq->uq_cur_queue->length;
2340 			umtxq_remove(uq);
2341 			if (oldlen == 1) {
2342 				umtxq_unlock(&uq->uq_key);
2343 				suword32(&cv->c_has_waiters, 0);
2344 				umtxq_lock(&uq->uq_key);
2345 			}
2346 		}
2347 		umtxq_unbusy(&uq->uq_key);
2348 		if (error == ERESTART)
2349 			error = EINTR;
2350 	}
2351 
2352 	umtxq_unlock(&uq->uq_key);
2353 	umtx_key_release(&uq->uq_key);
2354 	return (error);
2355 }
2356 
2357 /*
2358  * Signal a userland condition variable.
2359  */
2360 static int
2361 do_cv_signal(struct thread *td, struct ucond *cv)
2362 {
2363 	struct umtx_key key;
2364 	int error, cnt, nwake;
2365 	uint32_t flags;
2366 
2367 	error = fueword32(&cv->c_flags, &flags);
2368 	if (error == -1)
2369 		return (EFAULT);
2370 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2371 		return (error);
2372 	umtxq_lock(&key);
2373 	umtxq_busy(&key);
2374 	cnt = umtxq_count(&key);
2375 	nwake = umtxq_signal(&key, 1);
2376 	if (cnt <= nwake) {
2377 		umtxq_unlock(&key);
2378 		error = suword32(&cv->c_has_waiters, 0);
2379 		if (error == -1)
2380 			error = EFAULT;
2381 		umtxq_lock(&key);
2382 	}
2383 	umtxq_unbusy(&key);
2384 	umtxq_unlock(&key);
2385 	umtx_key_release(&key);
2386 	return (error);
2387 }
2388 
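/*
 * Broadcast a userland condition variable, waking all waiters.
 */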
2389 static int
2390 do_cv_broadcast(struct thread *td, struct ucond *cv)
2391 {
2392 	struct umtx_key key;
2393 	int error;
2394 	uint32_t flags;
2395 
2396 	error = fueword32(&cv->c_flags, &flags);
2397 	if (error == -1)
2398 		return (EFAULT);
2399 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2400 		return (error);
2401 
2402 	umtxq_lock(&key);
2403 	umtxq_busy(&key);
2404 	umtxq_signal(&key, INT_MAX);
2405 	umtxq_unlock(&key);
2406 
2407 	error = suword32(&cv->c_has_waiters, 0);
2408 	if (error == -1)
2409 		error = EFAULT;
2410 
2411 	umtxq_unbusy_unlocked(&key);
2412 
2413 	umtx_key_release(&key);
2414 	return (error);
2415 }
2416 
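/*
 * Lock a userland reader/writer lock for reading.
 */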
2417 static int
2418 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2419 {
2420 	struct abs_timeout timo;
2421 	struct umtx_q *uq;
2422 	uint32_t flags, wrflags;
2423 	int32_t state, oldstate;
2424 	int32_t blocked_readers;
2425 	int error, rv;
2426 
2427 	uq = td->td_umtxq;
2428 	error = fueword32(&rwlock->rw_flags, &flags);
2429 	if (error == -1)
2430 		return (EFAULT);
2431 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2432 	if (error != 0)
2433 		return (error);
2434 
2435 	if (timeout != NULL)
2436 		abs_timeout_init2(&timo, timeout);
2437 
2438 	wrflags = URWLOCK_WRITE_OWNER;
2439 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2440 		wrflags |= URWLOCK_WRITE_WAITERS;
2441 
2442 	for (;;) {
2443 		rv = fueword32(&rwlock->rw_state, &state);
2444 		if (rv == -1) {
2445 			umtx_key_release(&uq->uq_key);
2446 			return (EFAULT);
2447 		}
2448 
2449 		/* try to lock it */
2450 		while (!(state & wrflags)) {
2451 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2452 				umtx_key_release(&uq->uq_key);
2453 				return (EAGAIN);
2454 			}
2455 			rv = casueword32(&rwlock->rw_state, state,
2456 			    &oldstate, state + 1);
2457 			if (rv == -1) {
2458 				umtx_key_release(&uq->uq_key);
2459 				return (EFAULT);
2460 			}
2461 			if (oldstate == state) {
2462 				umtx_key_release(&uq->uq_key);
2463 				return (0);
2464 			}
2465 			error = umtxq_check_susp(td);
2466 			if (error != 0)
2467 				break;
2468 			state = oldstate;
2469 		}
2470 
2471 		if (error)
2472 			break;
2473 
2474 		/* grab monitor lock */
2475 		umtxq_lock(&uq->uq_key);
2476 		umtxq_busy(&uq->uq_key);
2477 		umtxq_unlock(&uq->uq_key);
2478 
2479 		/*
2480 		 * re-read the state, in case it changed between the try-lock above
2481 		 * and the check below
2482 		 */
2483 		rv = fueword32(&rwlock->rw_state, &state);
2484 		if (rv == -1)
2485 			error = EFAULT;
2486 
2487 		/* set read contention bit */
2488 		while (error == 0 && (state & wrflags) &&
2489 		    !(state & URWLOCK_READ_WAITERS)) {
2490 			rv = casueword32(&rwlock->rw_state, state,
2491 			    &oldstate, state | URWLOCK_READ_WAITERS);
2492 			if (rv == -1) {
2493 				error = EFAULT;
2494 				break;
2495 			}
2496 			if (oldstate == state)
2497 				goto sleep;
2498 			state = oldstate;
2499 			error = umtxq_check_susp(td);
2500 			if (error != 0)
2501 				break;
2502 		}
2503 		if (error != 0) {
2504 			umtxq_unbusy_unlocked(&uq->uq_key);
2505 			break;
2506 		}
2507 
2508 		/* The state changed while we were setting the flags; restart. */
2509 		if (!(state & wrflags)) {
2510 			umtxq_unbusy_unlocked(&uq->uq_key);
2511 			error = umtxq_check_susp(td);
2512 			if (error != 0)
2513 				break;
2514 			continue;
2515 		}
2516 
2517 sleep:
2518 		/* The contention bit is set; increase the read-waiter count before sleeping. */
2519 		rv = fueword32(&rwlock->rw_blocked_readers,
2520 		    &blocked_readers);
2521 		if (rv == -1) {
2522 			umtxq_unbusy_unlocked(&uq->uq_key);
2523 			error = EFAULT;
2524 			break;
2525 		}
2526 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2527 
2528 		while (state & wrflags) {
2529 			umtxq_lock(&uq->uq_key);
2530 			umtxq_insert(uq);
2531 			umtxq_unbusy(&uq->uq_key);
2532 
2533 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2534 			    NULL : &timo);
2535 
2536 			umtxq_busy(&uq->uq_key);
2537 			umtxq_remove(uq);
2538 			umtxq_unlock(&uq->uq_key);
2539 			if (error)
2540 				break;
2541 			rv = fueword32(&rwlock->rw_state, &state);
2542 			if (rv == -1) {
2543 				error = EFAULT;
2544 				break;
2545 			}
2546 		}
2547 
2548 		/* Decrease the read-waiter count and possibly clear the read contention bit. */
2549 		rv = fueword32(&rwlock->rw_blocked_readers,
2550 		    &blocked_readers);
2551 		if (rv == -1) {
2552 			umtxq_unbusy_unlocked(&uq->uq_key);
2553 			error = EFAULT;
2554 			break;
2555 		}
2556 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2557 		if (blocked_readers == 1) {
2558 			rv = fueword32(&rwlock->rw_state, &state);
2559 			if (rv == -1)
2560 				error = EFAULT;
2561 			while (error == 0) {
2562 				rv = casueword32(&rwlock->rw_state, state,
2563 				    &oldstate, state & ~URWLOCK_READ_WAITERS);
2564 				if (rv == -1) {
2565 					error = EFAULT;
2566 					break;
2567 				}
2568 				if (oldstate == state)
2569 					break;
2570 				state = oldstate;
2571 				error = umtxq_check_susp(td);
2572 			}
2573 		}
2574 
2575 		umtxq_unbusy_unlocked(&uq->uq_key);
2576 		if (error != 0)
2577 			break;
2578 	}
2579 	umtx_key_release(&uq->uq_key);
2580 	if (error == ERESTART)
2581 		error = EINTR;
2582 	return (error);
2583 }
2584 
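/*
 * Lock a userland reader/writer lock for writing.
 */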
2585 static int
2586 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2587 {
2588 	struct abs_timeout timo;
2589 	struct umtx_q *uq;
2590 	uint32_t flags;
2591 	int32_t state, oldstate;
2592 	int32_t blocked_writers;
2593 	int32_t blocked_readers;
2594 	int error, rv;
2595 
2596 	uq = td->td_umtxq;
2597 	error = fueword32(&rwlock->rw_flags, &flags);
2598 	if (error == -1)
2599 		return (EFAULT);
2600 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2601 	if (error != 0)
2602 		return (error);
2603 
2604 	if (timeout != NULL)
2605 		abs_timeout_init2(&timo, timeout);
2606 
2607 	blocked_readers = 0;
2608 	for (;;) {
2609 		rv = fueword32(&rwlock->rw_state, &state);
2610 		if (rv == -1) {
2611 			umtx_key_release(&uq->uq_key);
2612 			return (EFAULT);
2613 		}
2614 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2615 			rv = casueword32(&rwlock->rw_state, state,
2616 			    &oldstate, state | URWLOCK_WRITE_OWNER);
2617 			if (rv == -1) {
2618 				umtx_key_release(&uq->uq_key);
2619 				return (EFAULT);
2620 			}
2621 			if (oldstate == state) {
2622 				umtx_key_release(&uq->uq_key);
2623 				return (0);
2624 			}
2625 			state = oldstate;
2626 			error = umtxq_check_susp(td);
2627 			if (error != 0)
2628 				break;
2629 		}
2630 
2631 		if (error) {
2632 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2633 			    blocked_readers != 0) {
2634 				umtxq_lock(&uq->uq_key);
2635 				umtxq_busy(&uq->uq_key);
2636 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2637 				umtxq_unbusy(&uq->uq_key);
2638 				umtxq_unlock(&uq->uq_key);
2639 			}
2640 
2641 			break;
2642 		}
2643 
2644 		/* grab monitor lock */
2645 		umtxq_lock(&uq->uq_key);
2646 		umtxq_busy(&uq->uq_key);
2647 		umtxq_unlock(&uq->uq_key);
2648 
2649 		/*
2650 		 * re-read the state, in case it changed between the try-lock above
2651 		 * and the check below
2652 		 */
2653 		rv = fueword32(&rwlock->rw_state, &state);
2654 		if (rv == -1)
2655 			error = EFAULT;
2656 
2657 		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
2658 		    URWLOCK_READER_COUNT(state) != 0) &&
2659 		    (state & URWLOCK_WRITE_WAITERS) == 0) {
2660 			rv = casueword32(&rwlock->rw_state, state,
2661 			    &oldstate, state | URWLOCK_WRITE_WAITERS);
2662 			if (rv == -1) {
2663 				error = EFAULT;
2664 				break;
2665 			}
2666 			if (oldstate == state)
2667 				goto sleep;
2668 			state = oldstate;
2669 			error = umtxq_check_susp(td);
2670 			if (error != 0)
2671 				break;
2672 		}
2673 		if (error != 0) {
2674 			umtxq_unbusy_unlocked(&uq->uq_key);
2675 			break;
2676 		}
2677 
2678 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2679 			umtxq_unbusy_unlocked(&uq->uq_key);
2680 			error = umtxq_check_susp(td);
2681 			if (error != 0)
2682 				break;
2683 			continue;
2684 		}
2685 sleep:
2686 		rv = fueword32(&rwlock->rw_blocked_writers,
2687 		    &blocked_writers);
2688 		if (rv == -1) {
2689 			umtxq_unbusy_unlocked(&uq->uq_key);
2690 			error = EFAULT;
2691 			break;
2692 		}
2693 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2694 
2695 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2696 			umtxq_lock(&uq->uq_key);
2697 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2698 			umtxq_unbusy(&uq->uq_key);
2699 
2700 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2701 			    NULL : &timo);
2702 
2703 			umtxq_busy(&uq->uq_key);
2704 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2705 			umtxq_unlock(&uq->uq_key);
2706 			if (error)
2707 				break;
2708 			rv = fueword32(&rwlock->rw_state, &state);
2709 			if (rv == -1) {
2710 				error = EFAULT;
2711 				break;
2712 			}
2713 		}
2714 
2715 		rv = fueword32(&rwlock->rw_blocked_writers,
2716 		    &blocked_writers);
2717 		if (rv == -1) {
2718 			umtxq_unbusy_unlocked(&uq->uq_key);
2719 			error = EFAULT;
2720 			break;
2721 		}
2722 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2723 		if (blocked_writers == 1) {
2724 			rv = fueword32(&rwlock->rw_state, &state);
2725 			if (rv == -1) {
2726 				umtxq_unbusy_unlocked(&uq->uq_key);
2727 				error = EFAULT;
2728 				break;
2729 			}
2730 			for (;;) {
2731 				rv = casueword32(&rwlock->rw_state, state,
2732 				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
2733 				if (rv == -1) {
2734 					error = EFAULT;
2735 					break;
2736 				}
2737 				if (oldstate == state)
2738 					break;
2739 				state = oldstate;
2740 				error = umtxq_check_susp(td);
2741 				/*
2742 				 * We may be leaving the URWLOCK_WRITE_WAITERS
2743 				 * flag set, but this should not harm
2744 				 * correctness.
2745 				 */
2746 				if (error != 0)
2747 					break;
2748 			}
2749 			rv = fueword32(&rwlock->rw_blocked_readers,
2750 			    &blocked_readers);
2751 			if (rv == -1) {
2752 				umtxq_unbusy_unlocked(&uq->uq_key);
2753 				error = EFAULT;
2754 				break;
2755 			}
2756 		} else
2757 			blocked_readers = 0;
2758 
2759 		umtxq_unbusy_unlocked(&uq->uq_key);
2760 	}
2761 
2762 	umtx_key_release(&uq->uq_key);
2763 	if (error == ERESTART)
2764 		error = EINTR;
2765 	return (error);
2766 }
2767 
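/*
 * Unlock a userland reader/writer lock held by the calling thread.
 */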
2768 static int
2769 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2770 {
2771 	struct umtx_q *uq;
2772 	uint32_t flags;
2773 	int32_t state, oldstate;
2774 	int error, rv, q, count;
2775 
2776 	uq = td->td_umtxq;
2777 	error = fueword32(&rwlock->rw_flags, &flags);
2778 	if (error == -1)
2779 		return (EFAULT);
2780 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2781 	if (error != 0)
2782 		return (error);
2783 
2784 	error = fueword32(&rwlock->rw_state, &state);
2785 	if (error == -1) {
2786 		error = EFAULT;
2787 		goto out;
2788 	}
2789 	if (state & URWLOCK_WRITE_OWNER) {
2790 		for (;;) {
2791 			rv = casueword32(&rwlock->rw_state, state,
2792 			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
2793 			if (rv == -1) {
2794 				error = EFAULT;
2795 				goto out;
2796 			}
2797 			if (oldstate != state) {
2798 				state = oldstate;
2799 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2800 					error = EPERM;
2801 					goto out;
2802 				}
2803 				error = umtxq_check_susp(td);
2804 				if (error != 0)
2805 					goto out;
2806 			} else
2807 				break;
2808 		}
2809 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2810 		for (;;) {
2811 			rv = casueword32(&rwlock->rw_state, state,
2812 			    &oldstate, state - 1);
2813 			if (rv == -1) {
2814 				error = EFAULT;
2815 				goto out;
2816 			}
2817 			if (oldstate != state) {
2818 				state = oldstate;
2819 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2820 					error = EPERM;
2821 					goto out;
2822 				}
2823 				error = umtxq_check_susp(td);
2824 				if (error != 0)
2825 					goto out;
2826 			} else
2827 				break;
2828 		}
2829 	} else {
2830 		error = EPERM;
2831 		goto out;
2832 	}
2833 
2834 	count = 0;
2835 
2836 	if (!(flags & URWLOCK_PREFER_READER)) {
2837 		if (state & URWLOCK_WRITE_WAITERS) {
2838 			count = 1;
2839 			q = UMTX_EXCLUSIVE_QUEUE;
2840 		} else if (state & URWLOCK_READ_WAITERS) {
2841 			count = INT_MAX;
2842 			q = UMTX_SHARED_QUEUE;
2843 		}
2844 	} else {
2845 		if (state & URWLOCK_READ_WAITERS) {
2846 			count = INT_MAX;
2847 			q = UMTX_SHARED_QUEUE;
2848 		} else if (state & URWLOCK_WRITE_WAITERS) {
2849 			count = 1;
2850 			q = UMTX_EXCLUSIVE_QUEUE;
2851 		}
2852 	}
2853 
2854 	if (count) {
2855 		umtxq_lock(&uq->uq_key);
2856 		umtxq_busy(&uq->uq_key);
2857 		umtxq_signal_queue(&uq->uq_key, count, q);
2858 		umtxq_unbusy(&uq->uq_key);
2859 		umtxq_unlock(&uq->uq_key);
2860 	}
2861 out:
2862 	umtx_key_release(&uq->uq_key);
2863 	return (error);
2864 }
2865 
2866 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
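/*
 * Wait on an old-style userland semaphore (struct _usem).
 */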
2867 static int
2868 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2869 {
2870 	struct abs_timeout timo;
2871 	struct umtx_q *uq;
2872 	uint32_t flags, count, count1;
2873 	int error, rv;
2874 
2875 	uq = td->td_umtxq;
2876 	error = fueword32(&sem->_flags, &flags);
2877 	if (error == -1)
2878 		return (EFAULT);
2879 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2880 	if (error != 0)
2881 		return (error);
2882 
2883 	if (timeout != NULL)
2884 		abs_timeout_init2(&timo, timeout);
2885 
2886 	umtxq_lock(&uq->uq_key);
2887 	umtxq_busy(&uq->uq_key);
2888 	umtxq_insert(uq);
2889 	umtxq_unlock(&uq->uq_key);
2890 	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
2891 	if (rv == 0)
2892 		rv = fueword32(&sem->_count, &count);
2893 	if (rv == -1 || count != 0) {
2894 		umtxq_lock(&uq->uq_key);
2895 		umtxq_unbusy(&uq->uq_key);
2896 		umtxq_remove(uq);
2897 		umtxq_unlock(&uq->uq_key);
2898 		umtx_key_release(&uq->uq_key);
2899 		return (rv == -1 ? EFAULT : 0);
2900 	}
2901 	umtxq_lock(&uq->uq_key);
2902 	umtxq_unbusy(&uq->uq_key);
2903 
2904 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2905 
2906 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2907 		error = 0;
2908 	else {
2909 		umtxq_remove(uq);
2910 		/* A relative timeout cannot be restarted. */
2911 		if (error == ERESTART && timeout != NULL &&
2912 		    (timeout->_flags & UMTX_ABSTIME) == 0)
2913 			error = EINTR;
2914 	}
2915 	umtxq_unlock(&uq->uq_key);
2916 	umtx_key_release(&uq->uq_key);
2917 	return (error);
2918 }
2919 
2920 /*
2921  * Signal a userland semaphore.
2922  */
2923 static int
2924 do_sem_wake(struct thread *td, struct _usem *sem)
2925 {
2926 	struct umtx_key key;
2927 	int error, cnt;
2928 	uint32_t flags;
2929 
2930 	error = fueword32(&sem->_flags, &flags);
2931 	if (error == -1)
2932 		return (EFAULT);
2933 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2934 		return (error);
2935 	umtxq_lock(&key);
2936 	umtxq_busy(&key);
2937 	cnt = umtxq_count(&key);
2938 	if (cnt > 0) {
2939 		umtxq_signal(&key, 1);
2940 		/*
2941 		 * A count greater than 0 means the memory is still being
2942 		 * referenced by user code, so we can safely update the
2943 		 * _has_waiters flag.
2944 		 */
2945 		if (cnt == 1) {
2946 			umtxq_unlock(&key);
2947 			error = suword32(&sem->_has_waiters, 0);
2948 			umtxq_lock(&key);
2949 			if (error == -1)
2950 				error = EFAULT;
2951 		}
2952 	}
2953 	umtxq_unbusy(&key);
2954 	umtxq_unlock(&key);
2955 	umtx_key_release(&key);
2956 	return (error);
2957 }
2958 #endif
2959 
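/*
 * Wait on a userland semaphore (struct _usem2).
 */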
2960 static int
2961 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
2962 {
2963 	struct abs_timeout timo;
2964 	struct umtx_q *uq;
2965 	uint32_t count, flags;
2966 	int error, rv;
2967 
2968 	uq = td->td_umtxq;
2969 	flags = fuword32(&sem->_flags);
2970 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2971 	if (error != 0)
2972 		return (error);
2973 
2974 	if (timeout != NULL)
2975 		abs_timeout_init2(&timo, timeout);
2976 
2977 	umtxq_lock(&uq->uq_key);
2978 	umtxq_busy(&uq->uq_key);
2979 	umtxq_insert(uq);
2980 	umtxq_unlock(&uq->uq_key);
2981 	rv = fueword32(&sem->_count, &count);
2982 	if (rv == -1) {
2983 		umtxq_lock(&uq->uq_key);
2984 		umtxq_unbusy(&uq->uq_key);
2985 		umtxq_remove(uq);
2986 		umtxq_unlock(&uq->uq_key);
2987 		umtx_key_release(&uq->uq_key);
2988 		return (EFAULT);
2989 	}
2990 	for (;;) {
2991 		if (USEM_COUNT(count) != 0) {
2992 			umtxq_lock(&uq->uq_key);
2993 			umtxq_unbusy(&uq->uq_key);
2994 			umtxq_remove(uq);
2995 			umtxq_unlock(&uq->uq_key);
2996 			umtx_key_release(&uq->uq_key);
2997 			return (0);
2998 		}
2999 		if (count == USEM_HAS_WAITERS)
3000 			break;
3001 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
3002 		if (rv == -1) {
3003 			umtxq_lock(&uq->uq_key);
3004 			umtxq_unbusy(&uq->uq_key);
3005 			umtxq_remove(uq);
3006 			umtxq_unlock(&uq->uq_key);
3007 			umtx_key_release(&uq->uq_key);
3008 			return (EFAULT);
3009 		}
3010 		if (count == 0)
3011 			break;
3012 	}
3013 	umtxq_lock(&uq->uq_key);
3014 	umtxq_unbusy(&uq->uq_key);
3015 
3016 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3017 
3018 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3019 		error = 0;
3020 	else {
3021 		umtxq_remove(uq);
3022 		/* A relative timeout cannot be restarted. */
3023 		if (error == ERESTART && timeout != NULL &&
3024 		    (timeout->_flags & UMTX_ABSTIME) == 0)
3025 			error = EINTR;
3026 	}
3027 	umtxq_unlock(&uq->uq_key);
3028 	umtx_key_release(&uq->uq_key);
3029 	return (error);
3030 }
3031 
3032 /*
3033  * Signal a userland semaphore.
3034  */
3035 static int
3036 do_sem2_wake(struct thread *td, struct _usem2 *sem)
3037 {
3038 	struct umtx_key key;
3039 	int error, cnt, rv;
3040 	uint32_t count, flags;
3041 
3042 	rv = fueword32(&sem->_flags, &flags);
3043 	if (rv == -1)
3044 		return (EFAULT);
3045 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3046 		return (error);
3047 	umtxq_lock(&key);
3048 	umtxq_busy(&key);
3049 	cnt = umtxq_count(&key);
3050 	if (cnt > 0) {
3051 		umtxq_signal(&key, 1);
3052 
3053 		/*
3054 		 * If this was the last sleeping thread, clear the waiters
3055 		 * flag in _count.
3056 		 */
3057 		if (cnt == 1) {
3058 			umtxq_unlock(&key);
3059 			rv = fueword32(&sem->_count, &count);
3060 			while (rv != -1 && count & USEM_HAS_WAITERS)
3061 				rv = casueword32(&sem->_count, count, &count,
3062 				    count & ~USEM_HAS_WAITERS);
3063 			if (rv == -1)
3064 				error = EFAULT;
3065 			umtxq_lock(&key);
3066 		}
3067 	}
3068 	umtxq_unbusy(&key);
3069 	umtxq_unlock(&key);
3070 	umtx_key_release(&key);
3071 	return (error);
3072 }
3073 
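/*
 * Copy a timespec in from userland and check it for validity.
 */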
3074 inline int
3075 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3076 {
3077 	int error;
3078 
3079 	error = copyin(addr, tsp, sizeof(struct timespec));
3080 	if (error == 0) {
3081 		if (tsp->tv_sec < 0 ||
3082 		    tsp->tv_nsec >= 1000000000 ||
3083 		    tsp->tv_nsec < 0)
3084 			error = EINVAL;
3085 	}
3086 	return (error);
3087 }
3088 
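/*
 * Copy a struct _umtx_time in from userland; a bare timespec is also
 * accepted and treated as a relative CLOCK_REALTIME timeout.
 */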
3089 static inline int
3090 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3091 {
3092 	int error;
3093 
3094 	if (size <= sizeof(struct timespec)) {
3095 		tp->_clockid = CLOCK_REALTIME;
3096 		tp->_flags = 0;
3097 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3098 	} else
3099 		error = copyin(addr, tp, sizeof(struct _umtx_time));
3100 	if (error != 0)
3101 		return (error);
3102 	if (tp->_timeout.tv_sec < 0 ||
3103 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3104 		return (EINVAL);
3105 	return (0);
3106 }
3107 
3108 static int
3109 __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
3110 {
3111 
3112 	return (EOPNOTSUPP);
3113 }
3114 
3115 static int
3116 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3117 {
3118 	struct _umtx_time timeout, *tm_p;
3119 	int error;
3120 
3121 	if (uap->uaddr2 == NULL)
3122 		tm_p = NULL;
3123 	else {
3124 		error = umtx_copyin_umtx_time(
3125 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3126 		if (error != 0)
3127 			return (error);
3128 		tm_p = &timeout;
3129 	}
3130 	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
3131 }
3132 
3133 static int
3134 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3135 {
3136 	struct _umtx_time timeout, *tm_p;
3137 	int error;
3138 
3139 	if (uap->uaddr2 == NULL)
3140 		tm_p = NULL;
3141 	else {
3142 		error = umtx_copyin_umtx_time(
3143 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3144 		if (error != 0)
3145 			return (error);
3146 		tm_p = &timeout;
3147 	}
3148 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3149 }
3150 
3151 static int
3152 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3153 {
3154 	struct _umtx_time *tm_p, timeout;
3155 	int error;
3156 
3157 	if (uap->uaddr2 == NULL)
3158 		tm_p = NULL;
3159 	else {
3160 		error = umtx_copyin_umtx_time(
3161 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3162 		if (error != 0)
3163 			return (error);
3164 		tm_p = &timeout;
3165 	}
3166 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3167 }
3168 
3169 static int
3170 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3171 {
3172 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3173 }
3174 
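/* Number of userland addresses copied in per iteration by NWAKE_PRIVATE. */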
3175 #define BATCH_SIZE	128
3176 static int
3177 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3178 {
3179 	int count = uap->val;
3180 	void *uaddrs[BATCH_SIZE];
3181 	char **upp = (char **)uap->obj;
3182 	int tocopy;
3183 	int error = 0;
3184 	int i, pos = 0;
3185 
3186 	while (count > 0) {
3187 		tocopy = count;
3188 		if (tocopy > BATCH_SIZE)
3189 			tocopy = BATCH_SIZE;
3190 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3191 		if (error != 0)
3192 			break;
3193 		for (i = 0; i < tocopy; ++i)
3194 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3195 		count -= tocopy;
3196 		pos += tocopy;
3197 	}
3198 	return (error);
3199 }
3200 
3201 static int
3202 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3203 {
3204 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3205 }
3206 
3207 static int
3208 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3209 {
3210 	struct _umtx_time *tm_p, timeout;
3211 	int error;
3212 
3213 	/* Allow a null timespec (wait forever). */
3214 	if (uap->uaddr2 == NULL)
3215 		tm_p = NULL;
3216 	else {
3217 		error = umtx_copyin_umtx_time(
3218 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3219 		if (error != 0)
3220 			return (error);
3221 		tm_p = &timeout;
3222 	}
3223 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3224 }
3225 
3226 static int
3227 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3228 {
3229 	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3230 }
3231 
3232 static int
3233 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3234 {
3235 	struct _umtx_time *tm_p, timeout;
3236 	int error;
3237 
3238 	/* Allow a null timespec (wait forever). */
3239 	if (uap->uaddr2 == NULL)
3240 		tm_p = NULL;
3241 	else {
3242 		error = umtx_copyin_umtx_time(
3243 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3244 		if (error != 0)
3245 			return (error);
3246 		tm_p = &timeout;
3247 	}
3248 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3249 }
3250 
3251 static int
3252 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3253 {
3254 	return do_wake_umutex(td, uap->obj);
3255 }
3256 
3257 static int
3258 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3259 {
3260 	return do_unlock_umutex(td, uap->obj);
3261 }
3262 
3263 static int
3264 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3265 {
3266 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3267 }
3268 
3269 static int
3270 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3271 {
3272 	struct timespec *ts, timeout;
3273 	int error;
3274 
3275 	/* Allow a null timespec (wait forever). */
3276 	if (uap->uaddr2 == NULL)
3277 		ts = NULL;
3278 	else {
3279 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3280 		if (error != 0)
3281 			return (error);
3282 		ts = &timeout;
3283 	}
3284 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3285 }
3286 
3287 static int
3288 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3289 {
3290 	return do_cv_signal(td, uap->obj);
3291 }
3292 
3293 static int
3294 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3295 {
3296 	return do_cv_broadcast(td, uap->obj);
3297 }
3298 
3299 static int
3300 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3301 {
3302 	struct _umtx_time timeout;
3303 	int error;
3304 
3305 	/* Allow a null timespec (wait forever). */
3306 	if (uap->uaddr2 == NULL) {
3307 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3308 	} else {
3309 		error = umtx_copyin_umtx_time(uap->uaddr2,
3310 		   (size_t)uap->uaddr1, &timeout);
3311 		if (error != 0)
3312 			return (error);
3313 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3314 	}
3315 	return (error);
3316 }
3317 
3318 static int
3319 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3320 {
3321 	struct _umtx_time timeout;
3322 	int error;
3323 
3324 	/* Allow a null timespec (wait forever). */
3325 	if (uap->uaddr2 == NULL) {
3326 		error = do_rw_wrlock(td, uap->obj, 0);
3327 	} else {
3328 		error = umtx_copyin_umtx_time(uap->uaddr2,
3329 		   (size_t)uap->uaddr1, &timeout);
3330 		if (error != 0)
3331 			return (error);
3332 
3333 		error = do_rw_wrlock(td, uap->obj, &timeout);
3334 	}
3335 	return (error);
3336 }
3337 
3338 static int
3339 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3340 {
3341 	return do_rw_unlock(td, uap->obj);
3342 }
3343 
3344 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3345 static int
3346 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3347 {
3348 	struct _umtx_time *tm_p, timeout;
3349 	int error;
3350 
3351 	/* Allow a null timespec (wait forever). */
3352 	if (uap->uaddr2 == NULL)
3353 		tm_p = NULL;
3354 	else {
3355 		error = umtx_copyin_umtx_time(
3356 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3357 		if (error != 0)
3358 			return (error);
3359 		tm_p = &timeout;
3360 	}
3361 	return (do_sem_wait(td, uap->obj, tm_p));
3362 }
3363 
3364 static int
3365 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3366 {
3367 	return do_sem_wake(td, uap->obj);
3368 }
3369 #endif
3370 
3371 static int
3372 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3373 {
3374 	return do_wake2_umutex(td, uap->obj, uap->val);
3375 }
3376 
3377 static int
3378 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
3379 {
3380 	struct _umtx_time *tm_p, timeout;
3381 	int error;
3382 
3383 	/* Allow a null timespec (wait forever). */
3384 	if (uap->uaddr2 == NULL)
3385 		tm_p = NULL;
3386 	else {
3387 		error = umtx_copyin_umtx_time(
3388 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3389 		if (error != 0)
3390 			return (error);
3391 		tm_p = &timeout;
3392 	}
3393 	return (do_sem2_wait(td, uap->obj, tm_p));
3394 }
3395 
3396 static int
3397 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
3398 {
3399 	return do_sem2_wake(td, uap->obj);
3400 }
3401 
3402 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3403 
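/*
 * Dispatch table for _umtx_op(2), indexed by the UMTX_OP_* operation code.
 */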
3404 static _umtx_op_func op_table[] = {
3405 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED0 */
3406 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED1 */
3407 	__umtx_op_wait,			/* UMTX_OP_WAIT */
3408 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3409 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3410 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3411 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3412 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3413 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3414 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3415 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3416 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3417 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3418 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3419 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3420 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3421 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3422 	__umtx_op_wait_umutex,		/* UMTX_OP_MUTEX_WAIT */
3423 	__umtx_op_wake_umutex,		/* UMTX_OP_MUTEX_WAKE */
3424 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3425 	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3426 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3427 #else
3428 	__umtx_op_unimpl,		/* UMTX_OP_SEM_WAIT */
3429 	__umtx_op_unimpl,		/* UMTX_OP_SEM_WAKE */
3430 #endif
3431 	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
3432 	__umtx_op_wake2_umutex,		/* UMTX_OP_MUTEX_WAKE2 */
3433 	__umtx_op_sem2_wait,		/* UMTX_OP_SEM2_WAIT */
3434 	__umtx_op_sem2_wake,		/* UMTX_OP_SEM2_WAKE */
3435 };
3436 
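/*
 * The _umtx_op(2) system call entry point: validate the operation code
 * and dispatch to the handler in op_table.
 *
 * Illustrative userland sketch (not part of this file): a thread can block
 * until a shared 32-bit word changes from an expected value, and a peer can
 * wake it, with
 *
 *	_umtx_op(&word, UMTX_OP_WAIT_UINT_PRIVATE, expected, NULL, NULL);
 *	_umtx_op(&word, UMTX_OP_WAKE_PRIVATE, 1, NULL, NULL);
 *
 * where "word" and "expected" are placeholder names for the shared value
 * and the value the waiter last observed.
 */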
3437 int
3438 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3439 {
3440 	if ((unsigned)uap->op < UMTX_OP_MAX)
3441 		return (*op_table[uap->op])(td, uap);
3442 	return (EINVAL);
3443 }
3444 
3445 #ifdef COMPAT_FREEBSD32
3446 
3447 struct timespec32 {
3448 	int32_t tv_sec;
3449 	int32_t tv_nsec;
3450 };
3451 
3452 struct umtx_time32 {
3453 	struct	timespec32	timeout;
3454 	uint32_t		flags;
3455 	uint32_t		clockid;
3456 };
3457 
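/*
 * 32-bit compat version of umtx_copyin_timeout().
 */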
3458 static inline int
3459 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3460 {
3461 	struct timespec32 ts32;
3462 	int error;
3463 
3464 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3465 	if (error == 0) {
3466 		if (ts32.tv_sec < 0 ||
3467 		    ts32.tv_nsec >= 1000000000 ||
3468 		    ts32.tv_nsec < 0)
3469 			error = EINVAL;
3470 		else {
3471 			tsp->tv_sec = ts32.tv_sec;
3472 			tsp->tv_nsec = ts32.tv_nsec;
3473 		}
3474 	}
3475 	return (error);
3476 }
3477 
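/*
 * 32-bit compat version of umtx_copyin_umtx_time().
 */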
3478 static inline int
3479 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3480 {
3481 	struct umtx_time32 t32;
3482 	int error;
3483 
3484 	t32.clockid = CLOCK_REALTIME;
3485 	t32.flags   = 0;
3486 	if (size <= sizeof(struct timespec32))
3487 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3488 	else
3489 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3490 	if (error != 0)
3491 		return (error);
3492 	if (t32.timeout.tv_sec < 0 ||
3493 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3494 		return (EINVAL);
3495 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3496 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3497 	tp->_flags = t32.flags;
3498 	tp->_clockid = t32.clockid;
3499 	return (0);
3500 }
3501 
3502 static int
3503 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3504 {
3505 	struct _umtx_time *tm_p, timeout;
3506 	int error;
3507 
3508 	if (uap->uaddr2 == NULL)
3509 		tm_p = NULL;
3510 	else {
3511 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3512 			(size_t)uap->uaddr1, &timeout);
3513 		if (error != 0)
3514 			return (error);
3515 		tm_p = &timeout;
3516 	}
3517 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3518 }
3519 
3520 static int
3521 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3522 {
3523 	struct _umtx_time *tm_p, timeout;
3524 	int error;
3525 
3526 	/* Allow a null timespec (wait forever). */
3527 	if (uap->uaddr2 == NULL)
3528 		tm_p = NULL;
3529 	else {
3530 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3531 			    (size_t)uap->uaddr1, &timeout);
3532 		if (error != 0)
3533 			return (error);
3534 		tm_p = &timeout;
3535 	}
3536 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3537 }
3538 
3539 static int
3540 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3541 {
3542 	struct _umtx_time *tm_p, timeout;
3543 	int error;
3544 
3545 	/* Allow a null timespec (wait forever). */
3546 	if (uap->uaddr2 == NULL)
3547 		tm_p = NULL;
3548 	else {
3549 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3550 		    (size_t)uap->uaddr1, &timeout);
3551 		if (error != 0)
3552 			return (error);
3553 		tm_p = &timeout;
3554 	}
3555 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3556 }
3557 
3558 static int
3559 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3560 {
3561 	struct timespec *ts, timeout;
3562 	int error;
3563 
3564 	/* Allow a null timespec (wait forever). */
3565 	if (uap->uaddr2 == NULL)
3566 		ts = NULL;
3567 	else {
3568 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3569 		if (error != 0)
3570 			return (error);
3571 		ts = &timeout;
3572 	}
3573 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3574 }
3575 
3576 static int
3577 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3578 {
3579 	struct _umtx_time timeout;
3580 	int error;
3581 
3582 	/* Allow a null timespec (wait forever). */
3583 	if (uap->uaddr2 == NULL) {
3584 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3585 	} else {
3586 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3587 		    (size_t)uap->uaddr1, &timeout);
3588 		if (error != 0)
3589 			return (error);
3590 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3591 	}
3592 	return (error);
3593 }
3594 
3595 static int
3596 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3597 {
3598 	struct _umtx_time timeout;
3599 	int error;
3600 
3601 	/* Allow a null timespec (wait forever). */
3602 	if (uap->uaddr2 == NULL) {
3603 		error = do_rw_wrlock(td, uap->obj, 0);
3604 	} else {
3605 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3606 		    (size_t)uap->uaddr1, &timeout);
3607 		if (error != 0)
3608 			return (error);
3609 		error = do_rw_wrlock(td, uap->obj, &timeout);
3610 	}
3611 	return (error);
3612 }
3613 
3614 static int
3615 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3616 {
3617 	struct _umtx_time *tm_p, timeout;
3618 	int error;
3619 
3620 	if (uap->uaddr2 == NULL)
3621 		tm_p = NULL;
3622 	else {
3623 		error = umtx_copyin_umtx_time32(
3624 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3625 		if (error != 0)
3626 			return (error);
3627 		tm_p = &timeout;
3628 	}
3629 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3630 }
3631 
3632 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3633 static int
3634 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3635 {
3636 	struct _umtx_time *tm_p, timeout;
3637 	int error;
3638 
3639 	/* Allow a null timespec (wait forever). */
3640 	if (uap->uaddr2 == NULL)
3641 		tm_p = NULL;
3642 	else {
3643 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3644 		    (size_t)uap->uaddr1, &timeout);
3645 		if (error != 0)
3646 			return (error);
3647 		tm_p = &timeout;
3648 	}
3649 	return (do_sem_wait(td, uap->obj, tm_p));
3650 }
3651 #endif
3652 
3653 static int
3654 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3655 {
3656 	struct _umtx_time *tm_p, timeout;
3657 	int error;
3658 
3659 	/* Allow a null timespec (wait forever). */
3660 	if (uap->uaddr2 == NULL)
3661 		tm_p = NULL;
3662 	else {
3663 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3664 		    (size_t)uap->uaddr1, &timeout);
3665 		if (error != 0)
3666 			return (error);
3667 		tm_p = &timeout;
3668 	}
3669 	return (do_sem2_wait(td, uap->obj, tm_p));
3670 }
3671 
3672 static int
3673 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3674 {
3675 	int count = uap->val;
3676 	uint32_t uaddrs[BATCH_SIZE];
3677 	uint32_t **upp = (uint32_t **)uap->obj;
3678 	int tocopy;
3679 	int error = 0;
3680 	int i, pos = 0;
3681 
3682 	while (count > 0) {
3683 		tocopy = count;
3684 		if (tocopy > BATCH_SIZE)
3685 			tocopy = BATCH_SIZE;
3686 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3687 		if (error != 0)
3688 			break;
3689 		for (i = 0; i < tocopy; ++i)
3690 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3691 				INT_MAX, 1);
3692 		count -= tocopy;
3693 		pos += tocopy;
3694 	}
3695 	return (error);
3696 }
3697 
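/*
 * Dispatch table for _umtx_op(2) calls made by 32-bit compat processes.
 */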
3698 static _umtx_op_func op_table_compat32[] = {
3699 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED0 */
3700 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED1 */
3701 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3702 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3703 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3704 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3705 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3706 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3707 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3708 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3709 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3710 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3711 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3712 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3713 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3714 	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3715 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3716 	__umtx_op_wait_umutex_compat32, /* UMTX_OP_MUTEX_WAIT */
3717 	__umtx_op_wake_umutex,		/* UMTX_OP_MUTEX_WAKE */
3718 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3719 	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3720 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3721 #else
3722 	__umtx_op_unimpl,		/* UMTX_OP_SEM_WAIT */
3723 	__umtx_op_unimpl,		/* UMTX_OP_SEM_WAKE */
3724 #endif
3725 	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
3726 	__umtx_op_wake2_umutex,		/* UMTX_OP_MUTEX_WAKE2 */
3727 	__umtx_op_sem2_wait_compat32,	/* UMTX_OP_SEM2_WAIT */
3728 	__umtx_op_sem2_wake,		/* UMTX_OP_SEM2_WAKE */
3729 };
3730 
3731 int
3732 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3733 {
3734 	if ((unsigned)uap->op < UMTX_OP_MAX)
3735 		return (*op_table_compat32[uap->op])(td,
3736 			(struct _umtx_op_args *)uap);
3737 	return (EINVAL);
3738 }
3739 #endif
3740 
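/*
 * Allocate and attach the per-thread umtx queue structure.
 */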
3741 void
3742 umtx_thread_init(struct thread *td)
3743 {
3744 	td->td_umtxq = umtxq_alloc();
3745 	td->td_umtxq->uq_thread = td;
3746 }
3747 
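/*
 * Free the per-thread umtx queue structure.
 */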
3748 void
3749 umtx_thread_fini(struct thread *td)
3750 {
3751 	umtxq_free(td->td_umtxq);
3752 }
3753 
3754 /*
3755  * Called when a new thread is created, e.g. by fork().
3756  */
3757 void
3758 umtx_thread_alloc(struct thread *td)
3759 {
3760 	struct umtx_q *uq;
3761 
3762 	uq = td->td_umtxq;
3763 	uq->uq_inherited_pri = PRI_MAX;
3764 
3765 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3766 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3767 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3768 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3769 }
3770 
3771 /*
3772  * exec() hook.
3773  */
3774 static void
3775 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3776 	struct image_params *imgp __unused)
3777 {
3778 	umtx_thread_cleanup(curthread);
3779 }
3780 
3781 /*
3782  * thread_exit() hook.
3783  */
3784 void
3785 umtx_thread_exit(struct thread *td)
3786 {
3787 	umtx_thread_cleanup(td);
3788 }
3789 
3790 /*
3791  * Clean up the thread's umtx data.
3792  */
3793 static void
3794 umtx_thread_cleanup(struct thread *td)
3795 {
3796 	struct umtx_q *uq;
3797 	struct umtx_pi *pi;
3798 
3799 	if ((uq = td->td_umtxq) == NULL)
3800 		return;
3801 
3802 	mtx_lock_spin(&umtx_lock);
3803 	uq->uq_inherited_pri = PRI_MAX;
3804 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3805 		pi->pi_owner = NULL;
3806 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3807 	}
3808 	mtx_unlock_spin(&umtx_lock);
3809 	thread_lock(td);
3810 	sched_lend_user_prio(td, PRI_MAX);
3811 	thread_unlock(td);
3812 }
3813