xref: /freebsd/sys/kern/kern_umtx.c (revision 8ef24a0d4b28fe230e20637f56869cc4148cd2ca)
1 /*-
2  * Copyright (c) 2015 The FreeBSD Foundation
3  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
4  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
5  * All rights reserved.
6  *
7  * Portions of this software were developed by Konstantin Belousov
8  * under sponsorship from the FreeBSD Foundation.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice unmodified, this list of conditions, and the following
15  *    disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include "opt_compat.h"
36 #include "opt_umtx_profiling.h"
37 
38 #include <sys/param.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/file.h>
42 #include <sys/filedesc.h>
43 #include <sys/limits.h>
44 #include <sys/lock.h>
45 #include <sys/malloc.h>
46 #include <sys/mman.h>
47 #include <sys/mutex.h>
48 #include <sys/priv.h>
49 #include <sys/proc.h>
50 #include <sys/resource.h>
51 #include <sys/resourcevar.h>
52 #include <sys/rwlock.h>
53 #include <sys/sbuf.h>
54 #include <sys/sched.h>
55 #include <sys/smp.h>
56 #include <sys/sysctl.h>
57 #include <sys/sysent.h>
58 #include <sys/systm.h>
59 #include <sys/sysproto.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/taskqueue.h>
62 #include <sys/eventhandler.h>
63 #include <sys/umtx.h>
64 
65 #include <security/mac/mac_framework.h>
66 
67 #include <vm/vm.h>
68 #include <vm/vm_param.h>
69 #include <vm/pmap.h>
70 #include <vm/vm_map.h>
71 #include <vm/vm_object.h>
72 
73 #include <machine/cpu.h>
74 
75 #ifdef COMPAT_FREEBSD32
76 #include <compat/freebsd32/freebsd32_proto.h>
77 #endif
78 
79 #define _UMUTEX_TRY		1
80 #define _UMUTEX_WAIT		2
81 
82 #ifdef UMTX_PROFILING
83 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
84 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
85 #endif
86 
87 /* Priority inheritance mutex info. */
88 struct umtx_pi {
89 	/* Owner thread */
90 	struct thread		*pi_owner;
91 
92 	/* Reference count */
93 	int			pi_refcount;
94 
95 	/* Link entry for the list of PI mutexes held by a thread */
96 	TAILQ_ENTRY(umtx_pi)	pi_link;
97 
98 	/* List entry in hash */
99 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
100 
101 	/* List for waiters */
102 	TAILQ_HEAD(,umtx_q)	pi_blocked;
103 
104 	/* Identify a userland lock object */
105 	struct umtx_key		pi_key;
106 };
107 
108 /* A waiter on a userland synchronization object. */
109 struct umtx_q {
110 	/* Linked list for the hash. */
111 	TAILQ_ENTRY(umtx_q)	uq_link;
112 
113 	/* Umtx key. */
114 	struct umtx_key		uq_key;
115 
116 	/* Umtx flags. */
117 	int			uq_flags;
118 #define UQF_UMTXQ	0x0001
119 
120 	/* The waiting thread. */
121 	struct thread		*uq_thread;
122 
123 	/*
124 	 * The PI mutex this thread is blocked on.  Readers may use
125 	 * either the chain lock or umtx_lock; writers must hold both
126 	 * the chain lock and umtx_lock.
127 	 */
128 	struct umtx_pi		*uq_pi_blocked;
129 
130 	/* On blocked list */
131 	TAILQ_ENTRY(umtx_q)	uq_lockq;
132 
133 	/* PI mutexes owned by us that other threads contend for */
134 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
135 
136 	/* Inherited priority from PP mutex */
137 	u_char			uq_inherited_pri;
138 
139 	/* Spare queue ready to be reused */
140 	struct umtxq_queue	*uq_spare_queue;
141 
142 	/* The queue we are on */
143 	struct umtxq_queue	*uq_cur_queue;
144 };
145 
146 TAILQ_HEAD(umtxq_head, umtx_q);
147 
148 /* Per-key wait-queue */
149 struct umtxq_queue {
150 	struct umtxq_head	head;
151 	struct umtx_key		key;
152 	LIST_ENTRY(umtxq_queue)	link;
153 	int			length;
154 };
155 
156 LIST_HEAD(umtxq_list, umtxq_queue);
157 
158 /* Userland lock object's wait-queue chain */
159 struct umtxq_chain {
160 	/* Lock for this chain. */
161 	struct mtx		uc_lock;
162 
163 	/* List of sleep queues. */
164 	struct umtxq_list	uc_queue[2];
165 #define UMTX_SHARED_QUEUE	0
166 #define UMTX_EXCLUSIVE_QUEUE	1
167 
168 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
169 
170 	/* Busy flag */
171 	char			uc_busy;
172 
173 	/* Chain lock waiters */
174 	int			uc_waiters;
175 
176 	/* All PI mutexes hashed to this chain */
177 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
178 
179 #ifdef UMTX_PROFILING
180 	u_int 			length;
181 	u_int			max_length;
182 #endif
183 };
184 
185 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
186 
187 /*
188  * Don't propagate time-sharing priority; there is a security reason:
189  * a user could simply create a PI mutex, have thread A lock it, and
190  * have another thread B block on it.  Because B is sleeping, its
191  * priority would be boosted, so priority propagation would boost A's
192  * priority as well, and A's priority would never be lowered even if
193  * A were using 100% CPU.  This would be unfair to other processes.
194  */
195 
196 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
197 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
198 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
199 
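/*
 * Editorial sketch (not part of the original source): UPRI() clamps
 * any time-sharing user priority to PRI_MAX_TIMESHARE, implementing
 * the policy described above, e.g.:
 *
 *	user_pri == PRI_MIN_TIMESHARE + 4  =>  UPRI(td) == PRI_MAX_TIMESHARE
 *	user_pri == PRI_MIN_REALTIME + 2   =>  UPRI(td) == PRI_MIN_REALTIME + 2
 */
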
200 #define	GOLDEN_RATIO_PRIME	2654404609U
201 #define	UMTX_CHAINS		512
202 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
203 
204 #define	GET_SHARE(flags)	\
205     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
206 
207 #define BUSY_SPINS		200
208 
209 struct abs_timeout {
210 	int clockid;
211 	struct timespec cur;
212 	struct timespec end;
213 };
214 
215 static uma_zone_t		umtx_pi_zone;
216 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
217 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
218 static int			umtx_pi_allocated;
219 
220 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
221 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
222     &umtx_pi_allocated, 0, "Allocated umtx_pi");
223 
224 #ifdef UMTX_PROFILING
225 static long max_length;
226 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
227 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
228 #endif
229 
230 static void umtx_shm_init(void);
231 static void umtxq_sysinit(void *);
232 static void umtxq_hash(struct umtx_key *key);
233 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
234 static void umtxq_lock(struct umtx_key *key);
235 static void umtxq_unlock(struct umtx_key *key);
236 static void umtxq_busy(struct umtx_key *key);
237 static void umtxq_unbusy(struct umtx_key *key);
238 static void umtxq_insert_queue(struct umtx_q *uq, int q);
239 static void umtxq_remove_queue(struct umtx_q *uq, int q);
240 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
241 static int umtxq_count(struct umtx_key *key);
242 static struct umtx_pi *umtx_pi_alloc(int);
243 static void umtx_pi_free(struct umtx_pi *pi);
244 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
245 static void umtx_thread_cleanup(struct thread *td);
246 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
247 	struct image_params *imgp __unused);
248 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
249 
250 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
251 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
252 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
253 
254 static struct mtx umtx_lock;
255 
256 #ifdef UMTX_PROFILING
257 static void
258 umtx_init_profiling(void)
259 {
260 	struct sysctl_oid *chain_oid;
261 	char chain_name[10];
262 	int i;
263 
264 	for (i = 0; i < UMTX_CHAINS; ++i) {
265 		snprintf(chain_name, sizeof(chain_name), "%d", i);
266 		chain_oid = SYSCTL_ADD_NODE(NULL,
267 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
268 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
269 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
270 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
271 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
272 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
273 	}
274 }
275 
276 static int
277 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
278 {
279 	char buf[512];
280 	struct sbuf sb;
281 	struct umtxq_chain *uc;
282 	u_int fract, i, j, tot, whole;
283 	u_int sf0, sf1, sf2, sf3, sf4;
284 	u_int si0, si1, si2, si3, si4;
285 	u_int sw0, sw1, sw2, sw3, sw4;
286 
287 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
288 	for (i = 0; i < 2; i++) {
289 		tot = 0;
290 		for (j = 0; j < UMTX_CHAINS; ++j) {
291 			uc = &umtxq_chains[i][j];
292 			mtx_lock(&uc->uc_lock);
293 			tot += uc->max_length;
294 			mtx_unlock(&uc->uc_lock);
295 		}
296 		if (tot == 0)
297 			sbuf_printf(&sb, "%u) Empty ", i);
298 		else {
299 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
300 			si0 = si1 = si2 = si3 = si4 = 0;
301 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
302 			for (j = 0; j < UMTX_CHAINS; j++) {
303 				uc = &umtxq_chains[i][j];
304 				mtx_lock(&uc->uc_lock);
305 				whole = uc->max_length * 100;
306 				mtx_unlock(&uc->uc_lock);
307 				fract = (whole % tot) * 100;
308 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
309 					sf0 = fract;
310 					si0 = j;
311 					sw0 = whole;
312 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
313 				    sf1)) {
314 					sf1 = fract;
315 					si1 = j;
316 					sw1 = whole;
317 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
318 				    sf2)) {
319 					sf2 = fract;
320 					si2 = j;
321 					sw2 = whole;
322 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
323 				    sf3)) {
324 					sf3 = fract;
325 					si3 = j;
326 					sw3 = whole;
327 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
328 				    sf4)) {
329 					sf4 = fract;
330 					si4 = j;
331 					sw4 = whole;
332 				}
333 			}
334 			sbuf_printf(&sb, "queue %u:\n", i);
335 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
336 			    sf0 / tot, si0);
337 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
338 			    sf1 / tot, si1);
339 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
340 			    sf2 / tot, si2);
341 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
342 			    sf3 / tot, si3);
343 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
344 			    sf4 / tot, si4);
345 		}
346 	}
347 	sbuf_trim(&sb);
348 	sbuf_finish(&sb);
349 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
350 	sbuf_delete(&sb);
351 	return (0);
352 }
353 
354 static int
355 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
356 {
357 	struct umtxq_chain *uc;
358 	u_int i, j;
359 	int clear, error;
360 
361 	clear = 0;
362 	error = sysctl_handle_int(oidp, &clear, 0, req);
363 	if (error != 0 || req->newptr == NULL)
364 		return (error);
365 
366 	if (clear != 0) {
367 		for (i = 0; i < 2; ++i) {
368 			for (j = 0; j < UMTX_CHAINS; ++j) {
369 				uc = &umtxq_chains[i][j];
370 				mtx_lock(&uc->uc_lock);
371 				uc->length = 0;
372 				uc->max_length = 0;
373 				mtx_unlock(&uc->uc_lock);
374 			}
375 		}
376 	}
377 	return (0);
378 }
379 
380 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
381     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
382     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
383 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
384     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
385     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
386 #endif
387 
388 static void
389 umtxq_sysinit(void *arg __unused)
390 {
391 	int i, j;
392 
393 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
394 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
395 	for (i = 0; i < 2; ++i) {
396 		for (j = 0; j < UMTX_CHAINS; ++j) {
397 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
398 				 MTX_DEF | MTX_DUPOK);
399 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
400 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
401 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
402 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
403 			umtxq_chains[i][j].uc_busy = 0;
404 			umtxq_chains[i][j].uc_waiters = 0;
405 #ifdef UMTX_PROFILING
406 			umtxq_chains[i][j].length = 0;
407 			umtxq_chains[i][j].max_length = 0;
408 #endif
409 		}
410 	}
411 #ifdef UMTX_PROFILING
412 	umtx_init_profiling();
413 #endif
414 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
415 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
416 	    EVENTHANDLER_PRI_ANY);
417 	umtx_shm_init();
418 }
419 
420 struct umtx_q *
421 umtxq_alloc(void)
422 {
423 	struct umtx_q *uq;
424 
425 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
426 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
427 	TAILQ_INIT(&uq->uq_spare_queue->head);
428 	TAILQ_INIT(&uq->uq_pi_contested);
429 	uq->uq_inherited_pri = PRI_MAX;
430 	return (uq);
431 }
432 
433 void
434 umtxq_free(struct umtx_q *uq)
435 {
436 	MPASS(uq->uq_spare_queue != NULL);
437 	free(uq->uq_spare_queue, M_UMTX);
438 	free(uq, M_UMTX);
439 }
440 
441 static inline void
442 umtxq_hash(struct umtx_key *key)
443 {
444 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
445 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
446 }
447 
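/*
 * Editorial sketch (not part of the original source): the hash above
 * is multiplicative (Fibonacci) hashing.  Assuming __WORD_BIT == 32,
 * the golden-ratio multiplier scatters the key bits and the shift
 * keeps the top 9 bits of the product, which already lie in
 * [0, UMTX_CHAINS); the final modulo only matters for other word
 * sizes:
 *
 *	n = a + b;			(fold the key halves together)
 *	h = (n * 2654404609U) >> 23;	(23 == 32 - 9, so h < 512)
 */
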
448 static inline struct umtxq_chain *
449 umtxq_getchain(struct umtx_key *key)
450 {
451 	if (key->type <= TYPE_SEM)
452 		return (&umtxq_chains[1][key->hash]);
453 	return (&umtxq_chains[0][key->hash]);
454 }
455 
456 /*
457  * Lock a chain.
458  */
459 static inline void
460 umtxq_lock(struct umtx_key *key)
461 {
462 	struct umtxq_chain *uc;
463 
464 	uc = umtxq_getchain(key);
465 	mtx_lock(&uc->uc_lock);
466 }
467 
468 /*
469  * Unlock a chain.
470  */
471 static inline void
472 umtxq_unlock(struct umtx_key *key)
473 {
474 	struct umtxq_chain *uc;
475 
476 	uc = umtxq_getchain(key);
477 	mtx_unlock(&uc->uc_lock);
478 }
479 
480 /*
481  * Mark a chain busy when the following operations may block
482  * (a kernel mutex cannot be held across them).
483  */
484 static inline void
485 umtxq_busy(struct umtx_key *key)
486 {
487 	struct umtxq_chain *uc;
488 
489 	uc = umtxq_getchain(key);
490 	mtx_assert(&uc->uc_lock, MA_OWNED);
491 	if (uc->uc_busy) {
492 #ifdef SMP
493 		if (smp_cpus > 1) {
494 			int count = BUSY_SPINS;
495 			if (count > 0) {
496 				umtxq_unlock(key);
497 				while (uc->uc_busy && --count > 0)
498 					cpu_spinwait();
499 				umtxq_lock(key);
500 			}
501 		}
502 #endif
503 		while (uc->uc_busy) {
504 			uc->uc_waiters++;
505 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
506 			uc->uc_waiters--;
507 		}
508 	}
509 	uc->uc_busy = 1;
510 }
511 
512 /*
513  * Unbusy a chain.
514  */
515 static inline void
516 umtxq_unbusy(struct umtx_key *key)
517 {
518 	struct umtxq_chain *uc;
519 
520 	uc = umtxq_getchain(key);
521 	mtx_assert(&uc->uc_lock, MA_OWNED);
522 	KASSERT(uc->uc_busy != 0, ("not busy"));
523 	uc->uc_busy = 0;
524 	if (uc->uc_waiters)
525 		wakeup_one(uc);
526 }
527 
528 static inline void
529 umtxq_unbusy_unlocked(struct umtx_key *key)
530 {
531 
532 	umtxq_lock(key);
533 	umtxq_unbusy(key);
534 	umtxq_unlock(key);
535 }
536 
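/*
 * Editorial sketch drawn from the callers below: the busy flag, not
 * the chain mutex, is what is held across userspace accesses, since
 * those may fault and sleep:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);		(serialize with other busy sections)
 *	umtxq_unlock(&key);		(drop the mutex before faulting)
 *	error = casueword32(...);	(may fault and sleep)
 *	umtxq_unbusy_unlocked(&key);
 */
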
537 static struct umtxq_queue *
538 umtxq_queue_lookup(struct umtx_key *key, int q)
539 {
540 	struct umtxq_queue *uh;
541 	struct umtxq_chain *uc;
542 
543 	uc = umtxq_getchain(key);
544 	UMTXQ_LOCKED_ASSERT(uc);
545 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
546 		if (umtx_key_match(&uh->key, key))
547 			return (uh);
548 	}
549 
550 	return (NULL);
551 }
552 
553 static inline void
554 umtxq_insert_queue(struct umtx_q *uq, int q)
555 {
556 	struct umtxq_queue *uh;
557 	struct umtxq_chain *uc;
558 
559 	uc = umtxq_getchain(&uq->uq_key);
560 	UMTXQ_LOCKED_ASSERT(uc);
561 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
562 	uh = umtxq_queue_lookup(&uq->uq_key, q);
563 	if (uh != NULL) {
564 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
565 	} else {
566 		uh = uq->uq_spare_queue;
567 		uh->key = uq->uq_key;
568 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
569 #ifdef UMTX_PROFILING
570 		uc->length++;
571 		if (uc->length > uc->max_length) {
572 			uc->max_length = uc->length;
573 			if (uc->max_length > max_length)
574 				max_length = uc->max_length;
575 		}
576 #endif
577 	}
578 	uq->uq_spare_queue = NULL;
579 
580 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
581 	uh->length++;
582 	uq->uq_flags |= UQF_UMTXQ;
583 	uq->uq_cur_queue = uh;
584 	return;
585 }
586 
587 static inline void
588 umtxq_remove_queue(struct umtx_q *uq, int q)
589 {
590 	struct umtxq_chain *uc;
591 	struct umtxq_queue *uh;
592 
593 	uc = umtxq_getchain(&uq->uq_key);
594 	UMTXQ_LOCKED_ASSERT(uc);
595 	if (uq->uq_flags & UQF_UMTXQ) {
596 		uh = uq->uq_cur_queue;
597 		TAILQ_REMOVE(&uh->head, uq, uq_link);
598 		uh->length--;
599 		uq->uq_flags &= ~UQF_UMTXQ;
600 		if (TAILQ_EMPTY(&uh->head)) {
601 			KASSERT(uh->length == 0,
602 			    ("inconsistent umtxq_queue length"));
603 #ifdef UMTX_PROFILING
604 			uc->length--;
605 #endif
606 			LIST_REMOVE(uh, link);
607 		} else {
608 			uh = LIST_FIRST(&uc->uc_spare_queue);
609 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
610 			LIST_REMOVE(uh, link);
611 		}
612 		uq->uq_spare_queue = uh;
613 		uq->uq_cur_queue = NULL;
614 	}
615 }
616 
617 /*
618  * Return the number of threads waiting on a key.
619  */
620 static int
621 umtxq_count(struct umtx_key *key)
622 {
623 	struct umtxq_chain *uc;
624 	struct umtxq_queue *uh;
625 
626 	uc = umtxq_getchain(key);
627 	UMTXQ_LOCKED_ASSERT(uc);
628 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
629 	if (uh != NULL)
630 		return (uh->length);
631 	return (0);
632 }
633 
634 /*
635  * Return the number of PI waiters and pass the first one back
636  * via *first.
637  */
638 static int
639 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
640 {
641 	struct umtxq_chain *uc;
642 	struct umtxq_queue *uh;
643 
644 	*first = NULL;
645 	uc = umtxq_getchain(key);
646 	UMTXQ_LOCKED_ASSERT(uc);
647 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
648 	if (uh != NULL) {
649 		*first = TAILQ_FIRST(&uh->head);
650 		return (uh->length);
651 	}
652 	return (0);
653 }
654 
655 static int
656 umtxq_check_susp(struct thread *td)
657 {
658 	struct proc *p;
659 	int error;
660 
661 	/*
662 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
663 	 * eventually break the lockstep loop.
664 	 */
665 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
666 		return (0);
667 	error = 0;
668 	p = td->td_proc;
669 	PROC_LOCK(p);
670 	if (P_SHOULDSTOP(p) ||
671 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
672 		if (p->p_flag & P_SINGLE_EXIT)
673 			error = EINTR;
674 		else
675 			error = ERESTART;
676 	}
677 	PROC_UNLOCK(p);
678 	return (error);
679 }
680 
681 /*
682  * Wake up threads waiting on a userland object.
683  */
684 
685 static int
686 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
687 {
688 	struct umtxq_chain *uc;
689 	struct umtxq_queue *uh;
690 	struct umtx_q *uq;
691 	int ret;
692 
693 	ret = 0;
694 	uc = umtxq_getchain(key);
695 	UMTXQ_LOCKED_ASSERT(uc);
696 	uh = umtxq_queue_lookup(key, q);
697 	if (uh != NULL) {
698 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
699 			umtxq_remove_queue(uq, q);
700 			wakeup(uq);
701 			if (++ret >= n_wake)
702 				return (ret);
703 		}
704 	}
705 	return (ret);
706 }
707 
708 
709 /*
710  * Wake up the specified thread.
711  */
712 static inline void
713 umtxq_signal_thread(struct umtx_q *uq)
714 {
715 	struct umtxq_chain *uc;
716 
717 	uc = umtxq_getchain(&uq->uq_key);
718 	UMTXQ_LOCKED_ASSERT(uc);
719 	umtxq_remove(uq);
720 	wakeup(uq);
721 }
722 
723 static inline int
724 tstohz(const struct timespec *tsp)
725 {
726 	struct timeval tv;
727 
728 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
729 	return tvtohz(&tv);
730 }
731 
732 static void
733 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
734 	const struct timespec *timeout)
735 {
736 
737 	timo->clockid = clockid;
738 	if (!absolute) {
739 		kern_clock_gettime(curthread, clockid, &timo->end);
740 		timo->cur = timo->end;
741 		timespecadd(&timo->end, timeout);
742 	} else {
743 		timo->end = *timeout;
744 		kern_clock_gettime(curthread, clockid, &timo->cur);
745 	}
746 }
747 
748 static void
749 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
750 {
751 
752 	abs_timeout_init(timo, umtxtime->_clockid,
753 		(umtxtime->_flags & UMTX_ABSTIME) != 0,
754 		&umtxtime->_timeout);
755 }
756 
757 static inline void
758 abs_timeout_update(struct abs_timeout *timo)
759 {
760 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
761 }
762 
763 static int
764 abs_timeout_gethz(struct abs_timeout *timo)
765 {
766 	struct timespec tts;
767 
768 	if (timespeccmp(&timo->end, &timo->cur, <=))
769 		return (-1);
770 	tts = timo->end;
771 	timespecsub(&tts, &timo->cur);
772 	return (tstohz(&tts));
773 }
774 
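/*
 * Editorial sketch (hypothetical values): a caller builds the absolute
 * deadline once and converts the remaining time to ticks before each
 * sleep, so interrupted sleeps do not stretch the total timeout:
 *
 *	struct abs_timeout timo;
 *	struct timespec ts = { .tv_sec = 2, .tv_nsec = 0 };
 *
 *	abs_timeout_init(&timo, CLOCK_MONOTONIC, 0, &ts);
 *	...
 *	abs_timeout_update(&timo);	     (refresh timo.cur after wakeup)
 *	timo_hz = abs_timeout_gethz(&timo);  (-1 once the deadline passed)
 */
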
775 /*
776  * Put the thread to sleep; before each sleep, check whether it
777  * has already been removed from the umtx queue.
778  */
779 static inline int
780 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
781 {
782 	struct umtxq_chain *uc;
783 	int error, timo;
784 
785 	uc = umtxq_getchain(&uq->uq_key);
786 	UMTXQ_LOCKED_ASSERT(uc);
787 	for (;;) {
788 		if (!(uq->uq_flags & UQF_UMTXQ))
789 			return (0);
790 		if (abstime != NULL) {
791 			timo = abs_timeout_gethz(abstime);
792 			if (timo < 0)
793 				return (ETIMEDOUT);
794 		} else
795 			timo = 0;
796 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
797 		if (error != EWOULDBLOCK) {
798 			umtxq_lock(&uq->uq_key);
799 			break;
800 		}
801 		if (abstime != NULL)
802 			abs_timeout_update(abstime);
803 		umtxq_lock(&uq->uq_key);
804 	}
805 	return (error);
806 }
807 
808 /*
809  * Convert a userspace address into a unique logical address.
810  */
811 int
812 umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
813 {
814 	struct thread *td = curthread;
815 	vm_map_t map;
816 	vm_map_entry_t entry;
817 	vm_pindex_t pindex;
818 	vm_prot_t prot;
819 	boolean_t wired;
820 
821 	key->type = type;
822 	if (share == THREAD_SHARE) {
823 		key->shared = 0;
824 		key->info.private.vs = td->td_proc->p_vmspace;
825 		key->info.private.addr = (uintptr_t)addr;
826 	} else {
827 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
828 		map = &td->td_proc->p_vmspace->vm_map;
829 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
830 		    &entry, &key->info.shared.object, &pindex, &prot,
831 		    &wired) != KERN_SUCCESS) {
832 			return (EFAULT);
833 		}
834 
835 		if ((share == PROCESS_SHARE) ||
836 		    (share == AUTO_SHARE &&
837 		     VM_INHERIT_SHARE == entry->inheritance)) {
838 			key->shared = 1;
839 			key->info.shared.offset = (vm_offset_t)addr -
840 			    entry->start + entry->offset;
841 			vm_object_reference(key->info.shared.object);
842 		} else {
843 			key->shared = 0;
844 			key->info.private.vs = td->td_proc->p_vmspace;
845 			key->info.private.addr = (uintptr_t)addr;
846 		}
847 		vm_map_lookup_done(map, entry);
848 	}
849 
850 	umtxq_hash(key);
851 	return (0);
852 }
853 
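/*
 * Editorial sketch (not part of the original source): a private key is
 * the pair (vmspace, virtual address), while a shared key is the pair
 * (backing vm_object, offset), so processes mapping the same object at
 * different addresses still hash to the same chain.  A hypothetical
 * caller:
 *
 *	struct umtx_key key;
 *
 *	if (umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE, &key) != 0)
 *		return (EFAULT);
 *	... use key ...
 *	umtx_key_release(&key);	(drops the vm_object ref, if shared)
 */
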
854 /*
855  * Release key.
856  */
857 void
858 umtx_key_release(struct umtx_key *key)
859 {
860 	if (key->shared)
861 		vm_object_deallocate(key->info.shared.object);
862 }
863 
864 /*
865  * Fetch and compare the value; sleep on the address if it is unchanged.
866  */
867 static int
868 do_wait(struct thread *td, void *addr, u_long id,
869 	struct _umtx_time *timeout, int compat32, int is_private)
870 {
871 	struct abs_timeout timo;
872 	struct umtx_q *uq;
873 	u_long tmp;
874 	uint32_t tmp32;
875 	int error = 0;
876 
877 	uq = td->td_umtxq;
878 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
879 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
880 		return (error);
881 
882 	if (timeout != NULL)
883 		abs_timeout_init2(&timo, timeout);
884 
885 	umtxq_lock(&uq->uq_key);
886 	umtxq_insert(uq);
887 	umtxq_unlock(&uq->uq_key);
888 	if (compat32 == 0) {
889 		error = fueword(addr, &tmp);
890 		if (error != 0)
891 			error = EFAULT;
892 	} else {
893 		error = fueword32(addr, &tmp32);
894 		if (error == 0)
895 			tmp = tmp32;
896 		else
897 			error = EFAULT;
898 	}
899 	umtxq_lock(&uq->uq_key);
900 	if (error == 0) {
901 		if (tmp == id)
902 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
903 			    NULL : &timo);
904 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
905 			error = 0;
906 		else
907 			umtxq_remove(uq);
908 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
909 		umtxq_remove(uq);
910 	}
911 	umtxq_unlock(&uq->uq_key);
912 	umtx_key_release(&uq->uq_key);
913 	if (error == ERESTART)
914 		error = EINTR;
915 	return (error);
916 }
917 
918 /*
919  * Wake up threads sleeping on the specified address.
920  */
921 int
922 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
923 {
924 	struct umtx_key key;
925 	int ret;
926 
927 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
928 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
929 		return (ret);
930 	umtxq_lock(&key);
931 	umtxq_signal(&key, n_wake);
932 	umtxq_unlock(&key);
933 	umtx_key_release(&key);
934 	return (0);
935 }
936 
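/*
 * Editorial sketch (userland side): do_wait() and kern_umtx_wake()
 * back _umtx_op(2) operations such as UMTX_OP_WAIT and UMTX_OP_WAKE;
 * a hypothetical user pair might look like:
 *
 *	u_long word = 0;
 *
 *	_umtx_op(&word, UMTX_OP_WAIT, 0, NULL, NULL);	(sleeps if word still 0)
 *	_umtx_op(&word, UMTX_OP_WAKE, 1, NULL, NULL);	(wakes one waiter)
 */
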
937 /*
938  * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
939  */
940 static int
941 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
942 	struct _umtx_time *timeout, int mode)
943 {
944 	struct abs_timeout timo;
945 	struct umtx_q *uq;
946 	uint32_t owner, old, id;
947 	int error, rv;
948 
949 	id = td->td_tid;
950 	uq = td->td_umtxq;
951 	error = 0;
952 	if (timeout != NULL)
953 		abs_timeout_init2(&timo, timeout);
954 
955 	/*
956 	 * Care must be exercised when dealing with the umtx
957 	 * structure; any access can fault.
958 	 */
959 	for (;;) {
960 		rv = fueword32(&m->m_owner, &owner);
961 		if (rv == -1)
962 			return (EFAULT);
963 		if (mode == _UMUTEX_WAIT) {
964 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
965 				return (0);
966 		} else {
967 			/*
968 			 * Try the uncontested case.  This should be done in userland.
969 			 */
970 			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
971 			    &owner, id);
972 			/* The address was invalid. */
973 			if (rv == -1)
974 				return (EFAULT);
975 
976 			/* The acquire succeeded. */
977 			if (owner == UMUTEX_UNOWNED)
978 				return (0);
979 
980 			/* If no one owns it but it is contested try to acquire it. */
981 			if (owner == UMUTEX_CONTESTED) {
982 				rv = casueword32(&m->m_owner,
983 				    UMUTEX_CONTESTED, &owner,
984 				    id | UMUTEX_CONTESTED);
985 				/* The address was invalid. */
986 				if (rv == -1)
987 					return (EFAULT);
988 
989 				if (owner == UMUTEX_CONTESTED)
990 					return (0);
991 
992 				rv = umtxq_check_susp(td);
993 				if (rv != 0)
994 					return (rv);
995 
996 				/* If this failed the lock has changed, restart. */
997 				continue;
998 			}
999 		}
1000 
1001 		if (mode == _UMUTEX_TRY)
1002 			return (EBUSY);
1003 
1004 		/*
1005 		 * If we caught a signal, we have already retried;
1006 		 * now exit immediately.
1007 		 */
1008 		if (error != 0)
1009 			return (error);
1010 
1011 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1012 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1013 			return (error);
1014 
1015 		umtxq_lock(&uq->uq_key);
1016 		umtxq_busy(&uq->uq_key);
1017 		umtxq_insert(uq);
1018 		umtxq_unlock(&uq->uq_key);
1019 
1020 		/*
1021 		 * Set the contested bit so that a release in user space
1022 		 * knows to use the system call for unlock.  If this fails,
1023 		 * either someone else has acquired the lock or it has been
1024 		 * released.
1025 		 */
1026 		rv = casueword32(&m->m_owner, owner, &old,
1027 		    owner | UMUTEX_CONTESTED);
1028 
1029 		/* The address was invalid. */
1030 		if (rv == -1) {
1031 			umtxq_lock(&uq->uq_key);
1032 			umtxq_remove(uq);
1033 			umtxq_unbusy(&uq->uq_key);
1034 			umtxq_unlock(&uq->uq_key);
1035 			umtx_key_release(&uq->uq_key);
1036 			return (EFAULT);
1037 		}
1038 
1039 		/*
1040 		 * If we set the contested bit, sleep.  Otherwise the lock
1041 		 * changed and we need to retry, or we lost a race with the
1042 		 * thread unlocking the umtx.
1043 		 */
1044 		umtxq_lock(&uq->uq_key);
1045 		umtxq_unbusy(&uq->uq_key);
1046 		if (old == owner)
1047 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1048 			    NULL : &timo);
1049 		umtxq_remove(uq);
1050 		umtxq_unlock(&uq->uq_key);
1051 		umtx_key_release(&uq->uq_key);
1052 
1053 		if (error == 0)
1054 			error = umtxq_check_susp(td);
1055 	}
1056 
1057 	return (0);
1058 }
1059 
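/*
 * Editorial sketch (hypothetical userland pseudo-code): the
 * "uncontested case" mentioned above is normally handled in userland;
 * the kernel path is entered only on contention:
 *
 *	if (atomic_cmpset_acq_32(&m->m_owner, UMUTEX_UNOWNED, tid))
 *		return (0);		(fast path, no syscall)
 *	return (_umtx_op(m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL));
 */
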
1060 /*
1061  * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1062  */
1063 static int
1064 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1065 {
1066 	struct umtx_key key;
1067 	uint32_t owner, old, id;
1068 	int error;
1069 	int count;
1070 
1071 	id = td->td_tid;
1072 	/*
1073 	 * Make sure we own this mtx.
1074 	 */
1075 	error = fueword32(&m->m_owner, &owner);
1076 	if (error == -1)
1077 		return (EFAULT);
1078 
1079 	if ((owner & ~UMUTEX_CONTESTED) != id)
1080 		return (EPERM);
1081 
1082 	if ((owner & UMUTEX_CONTESTED) == 0) {
1083 		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
1084 		if (error == -1)
1085 			return (EFAULT);
1086 		if (old == owner)
1087 			return (0);
1088 		owner = old;
1089 	}
1090 
1091 	/* We should only ever be in here for contested locks */
1092 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1093 	    &key)) != 0)
1094 		return (error);
1095 
1096 	umtxq_lock(&key);
1097 	umtxq_busy(&key);
1098 	count = umtxq_count(&key);
1099 	umtxq_unlock(&key);
1100 
1101 	/*
1102 	 * When unlocking the umtx, it must be marked as unowned if
1103 	 * at most one thread is waiting on it; otherwise, it must
1104 	 * be marked as contested.
1105 	 */
1106 	error = casueword32(&m->m_owner, owner, &old,
1107 	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1108 	umtxq_lock(&key);
1109 	umtxq_signal(&key, 1);
1110 	umtxq_unbusy(&key);
1111 	umtxq_unlock(&key);
1112 	umtx_key_release(&key);
1113 	if (error == -1)
1114 		return (EFAULT);
1115 	if (old != owner)
1116 		return (EINVAL);
1117 	return (0);
1118 }
1119 
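/*
 * Editorial sketch (hypothetical userland pseudo-code): the matching
 * unlock fast path clears an uncontested owner word directly and only
 * enters the kernel when the contested bit is set:
 *
 *	if (atomic_cmpset_rel_32(&m->m_owner, tid, UMUTEX_UNOWNED))
 *		return (0);		(no waiters to wake)
 *	return (_umtx_op(m, UMTX_OP_MUTEX_UNLOCK, 0, NULL, NULL));
 */
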
1120 /*
1121  * Check whether the mutex is available and wake up a waiter;
1122  * this applies only to simple (PTHREAD_PRIO_NONE) mutexes.
1123  */
1124 static int
1125 do_wake_umutex(struct thread *td, struct umutex *m)
1126 {
1127 	struct umtx_key key;
1128 	uint32_t owner;
1129 	uint32_t flags;
1130 	int error;
1131 	int count;
1132 
1133 	error = fueword32(&m->m_owner, &owner);
1134 	if (error == -1)
1135 		return (EFAULT);
1136 
1137 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1138 		return (0);
1139 
1140 	error = fueword32(&m->m_flags, &flags);
1141 	if (error == -1)
1142 		return (EFAULT);
1143 
1144 	/* We should only ever be in here for contested locks */
1145 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1146 	    &key)) != 0)
1147 		return (error);
1148 
1149 	umtxq_lock(&key);
1150 	umtxq_busy(&key);
1151 	count = umtxq_count(&key);
1152 	umtxq_unlock(&key);
1153 
1154 	if (count <= 1) {
1155 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1156 		    UMUTEX_UNOWNED);
1157 		if (error == -1)
1158 			error = EFAULT;
1159 	}
1160 
1161 	umtxq_lock(&key);
1162 	if (error == 0 && count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1163 		umtxq_signal(&key, 1);
1164 	umtxq_unbusy(&key);
1165 	umtxq_unlock(&key);
1166 	umtx_key_release(&key);
1167 	return (error);
1168 }
1169 
1170 /*
1171  * Check whether the mutex has waiters and try to repair the contention bit.
1172  */
1173 static int
1174 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1175 {
1176 	struct umtx_key key;
1177 	uint32_t owner, old;
1178 	int type;
1179 	int error;
1180 	int count;
1181 
1182 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1183 	case 0:
1184 		type = TYPE_NORMAL_UMUTEX;
1185 		break;
1186 	case UMUTEX_PRIO_INHERIT:
1187 		type = TYPE_PI_UMUTEX;
1188 		break;
1189 	case UMUTEX_PRIO_PROTECT:
1190 		type = TYPE_PP_UMUTEX;
1191 		break;
1192 	default:
1193 		return (EINVAL);
1194 	}
1195 	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
1196 	    &key)) != 0)
1197 		return (error);
1198 
1199 	owner = 0;
1200 	umtxq_lock(&key);
1201 	umtxq_busy(&key);
1202 	count = umtxq_count(&key);
1203 	umtxq_unlock(&key);
1204 	/*
1205 	 * Only repair the contention bit if there is a waiter; that means
1206 	 * the mutex is still referenced by userland code.  Otherwise, do
1207 	 * not update any memory.
1208 	 */
1209 	if (count > 1) {
1210 		error = fueword32(&m->m_owner, &owner);
1211 		if (error == -1)
1212 			error = EFAULT;
1213 		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
1214 			error = casueword32(&m->m_owner, owner, &old,
1215 			    owner | UMUTEX_CONTESTED);
1216 			if (error == -1) {
1217 				error = EFAULT;
1218 				break;
1219 			}
1220 			if (old == owner)
1221 				break;
1222 			owner = old;
1223 			error = umtxq_check_susp(td);
1224 			if (error != 0)
1225 				break;
1226 		}
1227 	} else if (count == 1) {
1228 		error = fueword32(&m->m_owner, &owner);
1229 		if (error == -1)
1230 			error = EFAULT;
1231 		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
1232 		       (owner & UMUTEX_CONTESTED) == 0) {
1233 			error = casueword32(&m->m_owner, owner, &old,
1234 			    owner | UMUTEX_CONTESTED);
1235 			if (error == -1) {
1236 				error = EFAULT;
1237 				break;
1238 			}
1239 			if (old == owner)
1240 				break;
1241 			owner = old;
1242 			error = umtxq_check_susp(td);
1243 			if (error != 0)
1244 				break;
1245 		}
1246 	}
1247 	umtxq_lock(&key);
1248 	if (error == EFAULT) {
1249 		umtxq_signal(&key, INT_MAX);
1250 	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1251 		umtxq_signal(&key, 1);
1252 	umtxq_unbusy(&key);
1253 	umtxq_unlock(&key);
1254 	umtx_key_release(&key);
1255 	return (error);
1256 }
1257 
1258 static inline struct umtx_pi *
1259 umtx_pi_alloc(int flags)
1260 {
1261 	struct umtx_pi *pi;
1262 
1263 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1264 	TAILQ_INIT(&pi->pi_blocked);
1265 	atomic_add_int(&umtx_pi_allocated, 1);
1266 	return (pi);
1267 }
1268 
1269 static inline void
1270 umtx_pi_free(struct umtx_pi *pi)
1271 {
1272 	uma_zfree(umtx_pi_zone, pi);
1273 	atomic_add_int(&umtx_pi_allocated, -1);
1274 }
1275 
1276 /*
1277  * Adjust the thread's position on a PI mutex's blocked list after its
1278  * priority has been changed.
1279  */
1280 static int
1281 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1282 {
1283 	struct umtx_q *uq, *uq1, *uq2;
1284 	struct thread *td1;
1285 
1286 	mtx_assert(&umtx_lock, MA_OWNED);
1287 	if (pi == NULL)
1288 		return (0);
1289 
1290 	uq = td->td_umtxq;
1291 
1292 	/*
1293 	 * Check if the thread needs to be moved on the blocked chain.
1294 	 * It needs to be moved if its priority is either lower than
1295 	 * the previous thread's or higher than the next thread's.
1296 	 */
1297 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1298 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1299 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1300 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1301 		/*
1302 		 * Remove thread from blocked chain and determine where
1303 		 * it should be moved to.
1304 		 */
1305 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1306 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1307 			td1 = uq1->uq_thread;
1308 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1309 			if (UPRI(td1) > UPRI(td))
1310 				break;
1311 		}
1312 
1313 		if (uq1 == NULL)
1314 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1315 		else
1316 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1317 	}
1318 	return (1);
1319 }
1320 
1321 static struct umtx_pi *
1322 umtx_pi_next(struct umtx_pi *pi)
1323 {
1324 	struct umtx_q *uq_owner;
1325 
1326 	if (pi->pi_owner == NULL)
1327 		return (NULL);
1328 	uq_owner = pi->pi_owner->td_umtxq;
1329 	if (uq_owner == NULL)
1330 		return (NULL);
1331 	return (uq_owner->uq_pi_blocked);
1332 }
1333 
1334 /*
1335  * Floyd's Cycle-Finding Algorithm.
1336  */
1337 static bool
1338 umtx_pi_check_loop(struct umtx_pi *pi)
1339 {
1340 	struct umtx_pi *pi1;	/* fast iterator */
1341 
1342 	mtx_assert(&umtx_lock, MA_OWNED);
1343 	if (pi == NULL)
1344 		return (false);
1345 	pi1 = pi;
1346 	for (;;) {
1347 		pi = umtx_pi_next(pi);
1348 		if (pi == NULL)
1349 			break;
1350 		pi1 = umtx_pi_next(pi1);
1351 		if (pi1 == NULL)
1352 			break;
1353 		pi1 = umtx_pi_next(pi1);
1354 		if (pi1 == NULL)
1355 			break;
1356 		if (pi == pi1)
1357 			return (true);
1358 	}
1359 	return (false);
1360 }
1361 
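/*
 * Editorial sketch (not part of the original source): the function
 * above applies the same slow/fast pointer idea to the PI owner
 * chain; the generic linked-list form of Floyd's algorithm is:
 */
#if 0
struct node { struct node *next; };

static bool
has_cycle(struct node *head)
{
	struct node *slow, *fast;

	for (slow = fast = head; fast != NULL && fast->next != NULL;) {
		slow = slow->next;		/* advance one step */
		fast = fast->next->next;	/* advance two steps */
		if (slow == fast)
			return (true);		/* pointers met: cycle */
	}
	return (false);
}
#endif
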
1362 /*
1363  * Propagate priority when a thread is blocked on a POSIX
1364  * PI mutex.
1365  */
1366 static void
1367 umtx_propagate_priority(struct thread *td)
1368 {
1369 	struct umtx_q *uq;
1370 	struct umtx_pi *pi;
1371 	int pri;
1372 
1373 	mtx_assert(&umtx_lock, MA_OWNED);
1374 	pri = UPRI(td);
1375 	uq = td->td_umtxq;
1376 	pi = uq->uq_pi_blocked;
1377 	if (pi == NULL)
1378 		return;
1379 	if (umtx_pi_check_loop(pi))
1380 		return;
1381 
1382 	for (;;) {
1383 		td = pi->pi_owner;
1384 		if (td == NULL || td == curthread)
1385 			return;
1386 
1387 		MPASS(td->td_proc != NULL);
1388 		MPASS(td->td_proc->p_magic == P_MAGIC);
1389 
1390 		thread_lock(td);
1391 		if (td->td_lend_user_pri > pri)
1392 			sched_lend_user_prio(td, pri);
1393 		else {
1394 			thread_unlock(td);
1395 			break;
1396 		}
1397 		thread_unlock(td);
1398 
1399 		/*
1400 		 * Pick up the lock that td is blocked on.
1401 		 */
1402 		uq = td->td_umtxq;
1403 		pi = uq->uq_pi_blocked;
1404 		if (pi == NULL)
1405 			break;
1406 		/* Resort td on the list if needed. */
1407 		umtx_pi_adjust_thread(pi, td);
1408 	}
1409 }
1410 
1411 /*
1412  * Re-propagate priority for a PI mutex when a thread blocked on
1413  * it is interrupted by a signal or resumed by another thread.
1414  */
1415 static void
1416 umtx_repropagate_priority(struct umtx_pi *pi)
1417 {
1418 	struct umtx_q *uq, *uq_owner;
1419 	struct umtx_pi *pi2;
1420 	int pri;
1421 
1422 	mtx_assert(&umtx_lock, MA_OWNED);
1423 
1424 	if (umtx_pi_check_loop(pi))
1425 		return;
1426 	while (pi != NULL && pi->pi_owner != NULL) {
1427 		pri = PRI_MAX;
1428 		uq_owner = pi->pi_owner->td_umtxq;
1429 
1430 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1431 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1432 			if (uq != NULL) {
1433 				if (pri > UPRI(uq->uq_thread))
1434 					pri = UPRI(uq->uq_thread);
1435 			}
1436 		}
1437 
1438 		if (pri > uq_owner->uq_inherited_pri)
1439 			pri = uq_owner->uq_inherited_pri;
1440 		thread_lock(pi->pi_owner);
1441 		sched_lend_user_prio(pi->pi_owner, pri);
1442 		thread_unlock(pi->pi_owner);
1443 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1444 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1445 	}
1446 }
1447 
1448 /*
1449  * Set the owner of a PI mutex and insert it into the owner's list.
1450  */
1451 static void
1452 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1453 {
1454 	struct umtx_q *uq_owner;
1455 
1456 	uq_owner = owner->td_umtxq;
1457 	mtx_assert(&umtx_lock, MA_OWNED);
1458 	if (pi->pi_owner != NULL)
1459 		panic("pi_owner != NULL");
1460 	pi->pi_owner = owner;
1461 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1462 }
1463 
1464 
1465 /*
1466  * Disown a PI mutex, and remove it from the owned list.
1467  */
1468 static void
1469 umtx_pi_disown(struct umtx_pi *pi)
1470 {
1471 
1472 	mtx_assert(&umtx_lock, MA_OWNED);
1473 	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
1474 	pi->pi_owner = NULL;
1475 }
1476 
1477 /*
1478  * Claim ownership of a PI mutex.
1479  */
1480 static int
1481 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1482 {
1483 	struct umtx_q *uq;
1484 
1485 	mtx_lock(&umtx_lock);
1486 	if (pi->pi_owner == owner) {
1487 		mtx_unlock(&umtx_lock);
1488 		return (0);
1489 	}
1490 
1491 	if (pi->pi_owner != NULL) {
1492 		/*
1493 		 * Userland may have already messed up the mutex, sigh.
1494 		 */
1495 		mtx_unlock(&umtx_lock);
1496 		return (EPERM);
1497 	}
1498 	umtx_pi_setowner(pi, owner);
1499 	uq = TAILQ_FIRST(&pi->pi_blocked);
1500 	if (uq != NULL) {
1501 		int pri;
1502 
1503 		pri = UPRI(uq->uq_thread);
1504 		thread_lock(owner);
1505 		if (pri < UPRI(owner))
1506 			sched_lend_user_prio(owner, pri);
1507 		thread_unlock(owner);
1508 	}
1509 	mtx_unlock(&umtx_lock);
1510 	return (0);
1511 }
1512 
1513 /*
1514  * Adjust a thread's position in the blocked list of its PI mutex;
1515  * this may trigger a new round of priority propagation.
1516  */
1517 void
1518 umtx_pi_adjust(struct thread *td, u_char oldpri)
1519 {
1520 	struct umtx_q *uq;
1521 	struct umtx_pi *pi;
1522 
1523 	uq = td->td_umtxq;
1524 	mtx_lock(&umtx_lock);
1525 	/*
1526 	 * Pick up the lock that td is blocked on.
1527 	 */
1528 	pi = uq->uq_pi_blocked;
1529 	if (pi != NULL) {
1530 		umtx_pi_adjust_thread(pi, td);
1531 		umtx_repropagate_priority(pi);
1532 	}
1533 	mtx_unlock(&umtx_lock);
1534 }
1535 
1536 /*
1537  * Sleep on a PI mutex.
1538  */
1539 static int
1540 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1541 	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
1542 {
1543 	struct umtxq_chain *uc;
1544 	struct thread *td, *td1;
1545 	struct umtx_q *uq1;
1546 	int pri;
1547 	int error = 0;
1548 
1549 	td = uq->uq_thread;
1550 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1551 	uc = umtxq_getchain(&uq->uq_key);
1552 	UMTXQ_LOCKED_ASSERT(uc);
1553 	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
1554 	umtxq_insert(uq);
1555 	mtx_lock(&umtx_lock);
1556 	if (pi->pi_owner == NULL) {
1557 		mtx_unlock(&umtx_lock);
1558 		/* XXX Only look up thread in current process. */
1559 		td1 = tdfind(owner, curproc->p_pid);
1560 		mtx_lock(&umtx_lock);
1561 		if (td1 != NULL) {
1562 			if (pi->pi_owner == NULL)
1563 				umtx_pi_setowner(pi, td1);
1564 			PROC_UNLOCK(td1->td_proc);
1565 		}
1566 	}
1567 
1568 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1569 		pri = UPRI(uq1->uq_thread);
1570 		if (pri > UPRI(td))
1571 			break;
1572 	}
1573 
1574 	if (uq1 != NULL)
1575 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1576 	else
1577 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1578 
1579 	uq->uq_pi_blocked = pi;
1580 	thread_lock(td);
1581 	td->td_flags |= TDF_UPIBLOCKED;
1582 	thread_unlock(td);
1583 	umtx_propagate_priority(td);
1584 	mtx_unlock(&umtx_lock);
1585 	umtxq_unbusy(&uq->uq_key);
1586 
1587 	error = umtxq_sleep(uq, wmesg, timo);
1588 	umtxq_remove(uq);
1589 
1590 	mtx_lock(&umtx_lock);
1591 	uq->uq_pi_blocked = NULL;
1592 	thread_lock(td);
1593 	td->td_flags &= ~TDF_UPIBLOCKED;
1594 	thread_unlock(td);
1595 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1596 	umtx_repropagate_priority(pi);
1597 	mtx_unlock(&umtx_lock);
1598 	umtxq_unlock(&uq->uq_key);
1599 
1600 	return (error);
1601 }
1602 
1603 /*
1604  * Increment the reference count of a PI mutex.
1605  */
1606 static void
1607 umtx_pi_ref(struct umtx_pi *pi)
1608 {
1609 	struct umtxq_chain *uc;
1610 
1611 	uc = umtxq_getchain(&pi->pi_key);
1612 	UMTXQ_LOCKED_ASSERT(uc);
1613 	pi->pi_refcount++;
1614 }
1615 
1616 /*
1617  * Decrement the reference count of a PI mutex; when the count
1618  * drops to zero, its memory is freed.
1619  */
1620 static void
1621 umtx_pi_unref(struct umtx_pi *pi)
1622 {
1623 	struct umtxq_chain *uc;
1624 
1625 	uc = umtxq_getchain(&pi->pi_key);
1626 	UMTXQ_LOCKED_ASSERT(uc);
1627 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1628 	if (--pi->pi_refcount == 0) {
1629 		mtx_lock(&umtx_lock);
1630 		if (pi->pi_owner != NULL)
1631 			umtx_pi_disown(pi);
1632 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1633 			("blocked queue not empty"));
1634 		mtx_unlock(&umtx_lock);
1635 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1636 		umtx_pi_free(pi);
1637 	}
1638 }
1639 
1640 /*
1641  * Find a PI mutex in the hash table.
1642  */
1643 static struct umtx_pi *
1644 umtx_pi_lookup(struct umtx_key *key)
1645 {
1646 	struct umtxq_chain *uc;
1647 	struct umtx_pi *pi;
1648 
1649 	uc = umtxq_getchain(key);
1650 	UMTXQ_LOCKED_ASSERT(uc);
1651 
1652 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1653 		if (umtx_key_match(&pi->pi_key, key)) {
1654 			return (pi);
1655 		}
1656 	}
1657 	return (NULL);
1658 }
1659 
1660 /*
1661  * Insert a PI mutex into the hash table.
1662  */
1663 static inline void
1664 umtx_pi_insert(struct umtx_pi *pi)
1665 {
1666 	struct umtxq_chain *uc;
1667 
1668 	uc = umtxq_getchain(&pi->pi_key);
1669 	UMTXQ_LOCKED_ASSERT(uc);
1670 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1671 }
1672 
1673 /*
1674  * Lock a PI mutex.
1675  */
1676 static int
1677 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1678     struct _umtx_time *timeout, int try)
1679 {
1680 	struct abs_timeout timo;
1681 	struct umtx_q *uq;
1682 	struct umtx_pi *pi, *new_pi;
1683 	uint32_t id, owner, old;
1684 	int error, rv;
1685 
1686 	id = td->td_tid;
1687 	uq = td->td_umtxq;
1688 
1689 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1690 	    &uq->uq_key)) != 0)
1691 		return (error);
1692 
1693 	if (timeout != NULL)
1694 		abs_timeout_init2(&timo, timeout);
1695 
1696 	umtxq_lock(&uq->uq_key);
1697 	pi = umtx_pi_lookup(&uq->uq_key);
1698 	if (pi == NULL) {
1699 		new_pi = umtx_pi_alloc(M_NOWAIT);
1700 		if (new_pi == NULL) {
1701 			umtxq_unlock(&uq->uq_key);
1702 			new_pi = umtx_pi_alloc(M_WAITOK);
1703 			umtxq_lock(&uq->uq_key);
1704 			pi = umtx_pi_lookup(&uq->uq_key);
1705 			if (pi != NULL) {
1706 				umtx_pi_free(new_pi);
1707 				new_pi = NULL;
1708 			}
1709 		}
1710 		if (new_pi != NULL) {
1711 			new_pi->pi_key = uq->uq_key;
1712 			umtx_pi_insert(new_pi);
1713 			pi = new_pi;
1714 		}
1715 	}
1716 	umtx_pi_ref(pi);
1717 	umtxq_unlock(&uq->uq_key);
1718 
1719 	/*
1720 	 * Care must be exercised when dealing with the umtx
1721 	 * structure; any access can fault.
1722 	 */
1723 	for (;;) {
1724 		/*
1725 		 * Try the uncontested case.  This should be done in userland.
1726 		 */
1727 		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
1728 		/* The address was invalid. */
1729 		if (rv == -1) {
1730 			error = EFAULT;
1731 			break;
1732 		}
1733 
1734 		/* The acquire succeeded. */
1735 		if (owner == UMUTEX_UNOWNED) {
1736 			error = 0;
1737 			break;
1738 		}
1739 
1740 		/* If no one owns it but it is contested try to acquire it. */
1741 		if (owner == UMUTEX_CONTESTED) {
1742 			rv = casueword32(&m->m_owner,
1743 			    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
1744 			/* The address was invalid. */
1745 			if (rv == -1) {
1746 				error = EFAULT;
1747 				break;
1748 			}
1749 
1750 			if (owner == UMUTEX_CONTESTED) {
1751 				umtxq_lock(&uq->uq_key);
1752 				umtxq_busy(&uq->uq_key);
1753 				error = umtx_pi_claim(pi, td);
1754 				umtxq_unbusy(&uq->uq_key);
1755 				umtxq_unlock(&uq->uq_key);
1756 				if (error != 0) {
1757 					/*
1758 					 * Since we're going to return an
1759 					 * error, restore the m_owner to its
1760 					 * previous, unowned state to avoid
1761 					 * compounding the problem.
1762 					 */
1763 					(void)casuword32(&m->m_owner,
1764 					    id | UMUTEX_CONTESTED,
1765 					    UMUTEX_CONTESTED);
1766 				}
1767 				break;
1768 			}
1769 
1770 			error = umtxq_check_susp(td);
1771 			if (error != 0)
1772 				break;
1773 
1774 			/* If this failed the lock has changed, restart. */
1775 			continue;
1776 		}
1777 
1778 		if ((owner & ~UMUTEX_CONTESTED) == id) {
1779 			error = EDEADLK;
1780 			break;
1781 		}
1782 
1783 		if (try != 0) {
1784 			error = EBUSY;
1785 			break;
1786 		}
1787 
1788 		/*
1789 		 * If we caught a signal, we have already retried;
1790 		 * now exit immediately.
1791 		 */
1792 		if (error != 0)
1793 			break;
1794 
1795 		umtxq_lock(&uq->uq_key);
1796 		umtxq_busy(&uq->uq_key);
1797 		umtxq_unlock(&uq->uq_key);
1798 
1799 		/*
1800 		 * Set the contested bit so that a release in user space
1801 		 * knows to use the system call for unlock.  If this fails,
1802 		 * either someone else has acquired the lock or it has been
1803 		 * released.
1804 		 */
1805 		rv = casueword32(&m->m_owner, owner, &old,
1806 		    owner | UMUTEX_CONTESTED);
1807 
1808 		/* The address was invalid. */
1809 		if (rv == -1) {
1810 			umtxq_unbusy_unlocked(&uq->uq_key);
1811 			error = EFAULT;
1812 			break;
1813 		}
1814 
1815 		umtxq_lock(&uq->uq_key);
1816 		/*
1817 		 * If we set the contested bit, sleep.  Otherwise the lock
1818 		 * changed and we need to retry, or we lost a race with the
1819 		 * thread unlocking the umtx.
1820 		 */
1821 		if (old == owner) {
1822 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1823 			    "umtxpi", timeout == NULL ? NULL : &timo);
1824 			if (error != 0)
1825 				continue;
1826 		} else {
1827 			umtxq_unbusy(&uq->uq_key);
1828 			umtxq_unlock(&uq->uq_key);
1829 		}
1830 
1831 		error = umtxq_check_susp(td);
1832 		if (error != 0)
1833 			break;
1834 	}
1835 
1836 	umtxq_lock(&uq->uq_key);
1837 	umtx_pi_unref(pi);
1838 	umtxq_unlock(&uq->uq_key);
1839 
1840 	umtx_key_release(&uq->uq_key);
1841 	return (error);
1842 }
1843 
1844 /*
1845  * Unlock a PI mutex.
1846  */
1847 static int
1848 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1849 {
1850 	struct umtx_key key;
1851 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1852 	struct umtx_pi *pi, *pi2;
1853 	uint32_t owner, old, id;
1854 	int error;
1855 	int count;
1856 	int pri;
1857 
1858 	id = td->td_tid;
1859 	/*
1860 	 * Make sure we own this mtx.
1861 	 */
1862 	error = fueword32(&m->m_owner, &owner);
1863 	if (error == -1)
1864 		return (EFAULT);
1865 
1866 	if ((owner & ~UMUTEX_CONTESTED) != id)
1867 		return (EPERM);
1868 
1869 	/* This should be done in userland */
1870 	if ((owner & UMUTEX_CONTESTED) == 0) {
1871 		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
1872 		if (error == -1)
1873 			return (EFAULT);
1874 		if (old == owner)
1875 			return (0);
1876 		owner = old;
1877 	}
1878 
1879 	/* We should only ever be in here for contested locks */
1880 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1881 	    &key)) != 0)
1882 		return (error);
1883 
1884 	umtxq_lock(&key);
1885 	umtxq_busy(&key);
1886 	count = umtxq_count_pi(&key, &uq_first);
1887 	if (uq_first != NULL) {
1888 		mtx_lock(&umtx_lock);
1889 		pi = uq_first->uq_pi_blocked;
1890 		KASSERT(pi != NULL, ("pi == NULL?"));
1891 		if (pi->pi_owner != td) {
1892 			mtx_unlock(&umtx_lock);
1893 			umtxq_unbusy(&key);
1894 			umtxq_unlock(&key);
1895 			umtx_key_release(&key);
1896 			/* userland messed up the mutex */
1897 			return (EPERM);
1898 		}
1899 		uq_me = td->td_umtxq;
1900 		umtx_pi_disown(pi);
1901 		/* Get the highest-priority thread that is still sleeping. */
1902 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1903 		while (uq_first != NULL &&
1904 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1905 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1906 		}
1907 		pri = PRI_MAX;
1908 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1909 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1910 			if (uq_first2 != NULL) {
1911 				if (pri > UPRI(uq_first2->uq_thread))
1912 					pri = UPRI(uq_first2->uq_thread);
1913 			}
1914 		}
1915 		thread_lock(td);
1916 		sched_lend_user_prio(td, pri);
1917 		thread_unlock(td);
1918 		mtx_unlock(&umtx_lock);
1919 		if (uq_first)
1920 			umtxq_signal_thread(uq_first);
1921 	} else {
1922 		pi = umtx_pi_lookup(&key);
1923 		/*
1924 		 * A umtx_pi can exist if a signal or timeout removed the
1925 		 * last waiter from the umtxq, but there is still
1926 		 * a thread in do_lock_pi() holding the umtx_pi.
1927 		 */
1928 		if (pi != NULL) {
1929 			/*
1930 			 * The umtx_pi can be unowned, such as when a thread
1931 			 * has just entered do_lock_pi(), allocated the
1932 			 * umtx_pi, and unlocked the umtxq.
1933 			 * If the current thread owns it, it must disown it.
1934 			 */
1935 			mtx_lock(&umtx_lock);
1936 			if (pi->pi_owner == td)
1937 				umtx_pi_disown(pi);
1938 			mtx_unlock(&umtx_lock);
1939 		}
1940 	}
1941 	umtxq_unlock(&key);
1942 
1943 	/*
1944 	 * When unlocking the umtx, it must be marked as unowned if
1945 	 * at most one thread is waiting on it; otherwise, it must
1946 	 * be marked as contested.
1947 	 */
1948 	error = casueword32(&m->m_owner, owner, &old,
1949 	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1950 
1951 	umtxq_unbusy_unlocked(&key);
1952 	umtx_key_release(&key);
1953 	if (error == -1)
1954 		return (EFAULT);
1955 	if (old != owner)
1956 		return (EINVAL);
1957 	return (0);
1958 }
1959 
1960 /*
1961  * Lock a PP mutex.
1962  */
1963 static int
1964 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
1965     struct _umtx_time *timeout, int try)
1966 {
1967 	struct abs_timeout timo;
1968 	struct umtx_q *uq, *uq2;
1969 	struct umtx_pi *pi;
1970 	uint32_t ceiling;
1971 	uint32_t owner, id;
1972 	int error, pri, old_inherited_pri, su, rv;
1973 
1974 	id = td->td_tid;
1975 	uq = td->td_umtxq;
1976 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1977 	    &uq->uq_key)) != 0)
1978 		return (error);
1979 
1980 	if (timeout != NULL)
1981 		abs_timeout_init2(&timo, timeout);
1982 
1983 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1984 	for (;;) {
1985 		old_inherited_pri = uq->uq_inherited_pri;
1986 		umtxq_lock(&uq->uq_key);
1987 		umtxq_busy(&uq->uq_key);
1988 		umtxq_unlock(&uq->uq_key);
1989 
1990 		rv = fueword32(&m->m_ceilings[0], &ceiling);
1991 		if (rv == -1) {
1992 			error = EFAULT;
1993 			goto out;
1994 		}
1995 		ceiling = RTP_PRIO_MAX - ceiling;
1996 		if (ceiling > RTP_PRIO_MAX) {
1997 			error = EINVAL;
1998 			goto out;
1999 		}
2000 
2001 		mtx_lock(&umtx_lock);
2002 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2003 			mtx_unlock(&umtx_lock);
2004 			error = EINVAL;
2005 			goto out;
2006 		}
2007 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2008 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2009 			thread_lock(td);
2010 			if (uq->uq_inherited_pri < UPRI(td))
2011 				sched_lend_user_prio(td, uq->uq_inherited_pri);
2012 			thread_unlock(td);
2013 		}
2014 		mtx_unlock(&umtx_lock);
2015 
2016 		rv = casueword32(&m->m_owner,
2017 		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2018 		/* The address was invalid. */
2019 		if (rv == -1) {
2020 			error = EFAULT;
2021 			break;
2022 		}
2023 
2024 		if (owner == UMUTEX_CONTESTED) {
2025 			error = 0;
2026 			break;
2027 		}
2028 
2029 		if (try != 0) {
2030 			error = EBUSY;
2031 			break;
2032 		}
2033 
2034 		/*
2035 		 * If we caught a signal, we have already retried;
2036 		 * now exit immediately.
2037 		 */
2038 		if (error != 0)
2039 			break;
2040 
2041 		umtxq_lock(&uq->uq_key);
2042 		umtxq_insert(uq);
2043 		umtxq_unbusy(&uq->uq_key);
2044 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2045 		    NULL : &timo);
2046 		umtxq_remove(uq);
2047 		umtxq_unlock(&uq->uq_key);
2048 
2049 		mtx_lock(&umtx_lock);
2050 		uq->uq_inherited_pri = old_inherited_pri;
2051 		pri = PRI_MAX;
2052 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2053 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2054 			if (uq2 != NULL) {
2055 				if (pri > UPRI(uq2->uq_thread))
2056 					pri = UPRI(uq2->uq_thread);
2057 			}
2058 		}
2059 		if (pri > uq->uq_inherited_pri)
2060 			pri = uq->uq_inherited_pri;
2061 		thread_lock(td);
2062 		sched_lend_user_prio(td, pri);
2063 		thread_unlock(td);
2064 		mtx_unlock(&umtx_lock);
2065 	}
2066 
2067 	if (error != 0) {
2068 		mtx_lock(&umtx_lock);
2069 		uq->uq_inherited_pri = old_inherited_pri;
2070 		pri = PRI_MAX;
2071 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2072 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2073 			if (uq2 != NULL) {
2074 				if (pri > UPRI(uq2->uq_thread))
2075 					pri = UPRI(uq2->uq_thread);
2076 			}
2077 		}
2078 		if (pri > uq->uq_inherited_pri)
2079 			pri = uq->uq_inherited_pri;
2080 		thread_lock(td);
2081 		sched_lend_user_prio(td, pri);
2082 		thread_unlock(td);
2083 		mtx_unlock(&umtx_lock);
2084 	}
2085 
2086 out:
2087 	umtxq_unbusy_unlocked(&uq->uq_key);
2088 	umtx_key_release(&uq->uq_key);
2089 	return (error);
2090 }
2091 
2092 /*
2093  * Unlock a PP mutex.
2094  */
2095 static int
2096 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2097 {
2098 	struct umtx_key key;
2099 	struct umtx_q *uq, *uq2;
2100 	struct umtx_pi *pi;
2101 	uint32_t owner, id;
2102 	uint32_t rceiling;
2103 	int error, pri, new_inherited_pri, su;
2104 
2105 	id = td->td_tid;
2106 	uq = td->td_umtxq;
2107 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2108 
2109 	/*
2110 	 * Make sure we own this mtx.
2111 	 */
2112 	error = fueword32(&m->m_owner, &owner);
2113 	if (error == -1)
2114 		return (EFAULT);
2115 
2116 	if ((owner & ~UMUTEX_CONTESTED) != id)
2117 		return (EPERM);
2118 
2119 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2120 	if (error != 0)
2121 		return (error);
2122 
2123 	if (rceiling == -1)
2124 		new_inherited_pri = PRI_MAX;
2125 	else {
2126 		rceiling = RTP_PRIO_MAX - rceiling;
2127 		if (rceiling > RTP_PRIO_MAX)
2128 			return (EINVAL);
2129 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2130 	}
2131 
2132 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2133 	    &key)) != 0)
2134 		return (error);
2135 	umtxq_lock(&key);
2136 	umtxq_busy(&key);
2137 	umtxq_unlock(&key);
2138 	/*
	 * For a priority-protected mutex, always set the unlocked
	 * state to UMUTEX_CONTESTED so that userland always enters
	 * the kernel to lock the mutex; this is necessary because
	 * the thread priority has to be adjusted for such a mutex.
2143 	 */
2144 	error = suword32(&m->m_owner, UMUTEX_CONTESTED);
2145 
2146 	umtxq_lock(&key);
2147 	if (error == 0)
2148 		umtxq_signal(&key, 1);
2149 	umtxq_unbusy(&key);
2150 	umtxq_unlock(&key);
2151 
2152 	if (error == -1)
2153 		error = EFAULT;
2154 	else {
2155 		mtx_lock(&umtx_lock);
2156 		if (su != 0)
2157 			uq->uq_inherited_pri = new_inherited_pri;
2158 		pri = PRI_MAX;
2159 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2160 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2161 			if (uq2 != NULL) {
2162 				if (pri > UPRI(uq2->uq_thread))
2163 					pri = UPRI(uq2->uq_thread);
2164 			}
2165 		}
2166 		if (pri > uq->uq_inherited_pri)
2167 			pri = uq->uq_inherited_pri;
2168 		thread_lock(td);
2169 		sched_lend_user_prio(td, pri);
2170 		thread_unlock(td);
2171 		mtx_unlock(&umtx_lock);
2172 	}
2173 	umtx_key_release(&key);
2174 	return (error);
2175 }
2176 
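/*
 * Set the priority-protection ceiling of a PP mutex, returning the
 * previous ceiling to userland through old_ceiling.
 */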
2177 static int
2178 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2179 	uint32_t *old_ceiling)
2180 {
2181 	struct umtx_q *uq;
2182 	uint32_t save_ceiling;
2183 	uint32_t owner, id;
2184 	uint32_t flags;
2185 	int error, rv;
2186 
2187 	error = fueword32(&m->m_flags, &flags);
2188 	if (error == -1)
2189 		return (EFAULT);
2190 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2191 		return (EINVAL);
2192 	if (ceiling > RTP_PRIO_MAX)
2193 		return (EINVAL);
2194 	id = td->td_tid;
2195 	uq = td->td_umtxq;
2196 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2197 	   &uq->uq_key)) != 0)
2198 		return (error);
2199 	for (;;) {
2200 		umtxq_lock(&uq->uq_key);
2201 		umtxq_busy(&uq->uq_key);
2202 		umtxq_unlock(&uq->uq_key);
2203 
2204 		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2205 		if (rv == -1) {
2206 			error = EFAULT;
2207 			break;
2208 		}
2209 
2210 		rv = casueword32(&m->m_owner,
2211 		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2212 		if (rv == -1) {
2213 			error = EFAULT;
2214 			break;
2215 		}
2216 
2217 		if (owner == UMUTEX_CONTESTED) {
2218 			suword32(&m->m_ceilings[0], ceiling);
2219 			suword32(&m->m_owner, UMUTEX_CONTESTED);
2220 			error = 0;
2221 			break;
2222 		}
2223 
2224 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2225 			suword32(&m->m_ceilings[0], ceiling);
2226 			error = 0;
2227 			break;
2228 		}
2229 
2230 		/*
		 * If we caught a signal, we have already retried
		 * and now exit immediately.
2233 		 */
2234 		if (error != 0)
2235 			break;
2236 
2237 		/*
		 * We set the contested bit, so sleep.  Otherwise the
		 * lock changed and we need to retry, or we lost a race
		 * to the thread unlocking the umtx.
2241 		 */
2242 		umtxq_lock(&uq->uq_key);
2243 		umtxq_insert(uq);
2244 		umtxq_unbusy(&uq->uq_key);
2245 		error = umtxq_sleep(uq, "umtxpp", NULL);
2246 		umtxq_remove(uq);
2247 		umtxq_unlock(&uq->uq_key);
2248 	}
2249 	umtxq_lock(&uq->uq_key);
2250 	if (error == 0)
2251 		umtxq_signal(&uq->uq_key, INT_MAX);
2252 	umtxq_unbusy(&uq->uq_key);
2253 	umtxq_unlock(&uq->uq_key);
2254 	umtx_key_release(&uq->uq_key);
2255 	if (error == 0 && old_ceiling != NULL)
2256 		suword32(old_ceiling, save_ceiling);
2257 	return (error);
2258 }
2259 
2260 /*
2261  * Lock a userland POSIX mutex.
2262  */
2263 static int
2264 do_lock_umutex(struct thread *td, struct umutex *m,
2265     struct _umtx_time *timeout, int mode)
2266 {
2267 	uint32_t flags;
2268 	int error;
2269 
2270 	error = fueword32(&m->m_flags, &flags);
2271 	if (error == -1)
2272 		return (EFAULT);
2273 
	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2275 	case 0:
2276 		error = do_lock_normal(td, m, flags, timeout, mode);
2277 		break;
2278 	case UMUTEX_PRIO_INHERIT:
2279 		error = do_lock_pi(td, m, flags, timeout, mode);
2280 		break;
2281 	case UMUTEX_PRIO_PROTECT:
2282 		error = do_lock_pp(td, m, flags, timeout, mode);
2283 		break;
2284 	default:
2285 		return (EINVAL);
2286 	}
2287 	if (timeout == NULL) {
2288 		if (error == EINTR && mode != _UMUTEX_WAIT)
2289 			error = ERESTART;
2290 	} else {
2291 		/* Timed-locking is not restarted. */
2292 		if (error == ERESTART)
2293 			error = EINTR;
2294 	}
2295 	return (error);
2296 }
2297 
2298 /*
2299  * Unlock a userland POSIX mutex.
2300  */
2301 static int
2302 do_unlock_umutex(struct thread *td, struct umutex *m)
2303 {
2304 	uint32_t flags;
2305 	int error;
2306 
2307 	error = fueword32(&m->m_flags, &flags);
2308 	if (error == -1)
2309 		return (EFAULT);
2310 
	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2312 	case 0:
2313 		return (do_unlock_normal(td, m, flags));
2314 	case UMUTEX_PRIO_INHERIT:
2315 		return (do_unlock_pi(td, m, flags));
2316 	case UMUTEX_PRIO_PROTECT:
2317 		return (do_unlock_pp(td, m, flags));
2318 	}
2319 
2320 	return (EINVAL);
2321 }
2322 
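/*
 * Wait on a userland condition variable, atomically dropping the
 * associated userland mutex before sleeping.
 */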
2323 static int
2324 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2325 	struct timespec *timeout, u_long wflags)
2326 {
2327 	struct abs_timeout timo;
2328 	struct umtx_q *uq;
2329 	uint32_t flags, clockid, hasw;
2330 	int error;
2331 
2332 	uq = td->td_umtxq;
2333 	error = fueword32(&cv->c_flags, &flags);
2334 	if (error == -1)
2335 		return (EFAULT);
2336 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2337 	if (error != 0)
2338 		return (error);
2339 
2340 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2341 		error = fueword32(&cv->c_clockid, &clockid);
2342 		if (error == -1) {
2343 			umtx_key_release(&uq->uq_key);
2344 			return (EFAULT);
2345 		}
2346 		if (clockid < CLOCK_REALTIME ||
2347 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
			/* Only the predefined hardware clock ids are valid. */
2349 			umtx_key_release(&uq->uq_key);
2350 			return (EINVAL);
2351 		}
2352 	} else {
2353 		clockid = CLOCK_REALTIME;
2354 	}
2355 
2356 	umtxq_lock(&uq->uq_key);
2357 	umtxq_busy(&uq->uq_key);
2358 	umtxq_insert(uq);
2359 	umtxq_unlock(&uq->uq_key);
2360 
2361 	/*
	 * Set c_has_waiters to 1 before releasing the user mutex, but
	 * avoid dirtying the cache line when it is unnecessary.
2364 	 */
2365 	error = fueword32(&cv->c_has_waiters, &hasw);
2366 	if (error == 0 && hasw == 0)
2367 		suword32(&cv->c_has_waiters, 1);
2368 
2369 	umtxq_unbusy_unlocked(&uq->uq_key);
2370 
2371 	error = do_unlock_umutex(td, m);
2372 
2373 	if (timeout != NULL)
2374 		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2375 			timeout);
2376 
2377 	umtxq_lock(&uq->uq_key);
2378 	if (error == 0) {
2379 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2380 		    NULL : &timo);
2381 	}
2382 
2383 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2384 		error = 0;
2385 	else {
2386 		/*
		 * This must be a timeout, an interruption by a signal,
		 * or a spurious wakeup; clear the c_has_waiters flag
		 * when necessary.
2390 		 */
2391 		umtxq_busy(&uq->uq_key);
2392 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2393 			int oldlen = uq->uq_cur_queue->length;
2394 			umtxq_remove(uq);
2395 			if (oldlen == 1) {
2396 				umtxq_unlock(&uq->uq_key);
2397 				suword32(&cv->c_has_waiters, 0);
2398 				umtxq_lock(&uq->uq_key);
2399 			}
2400 		}
2401 		umtxq_unbusy(&uq->uq_key);
2402 		if (error == ERESTART)
2403 			error = EINTR;
2404 	}
2405 
2406 	umtxq_unlock(&uq->uq_key);
2407 	umtx_key_release(&uq->uq_key);
2408 	return (error);
2409 }
2410 
2411 /*
2412  * Signal a userland condition variable.
2413  */
2414 static int
2415 do_cv_signal(struct thread *td, struct ucond *cv)
2416 {
2417 	struct umtx_key key;
2418 	int error, cnt, nwake;
2419 	uint32_t flags;
2420 
2421 	error = fueword32(&cv->c_flags, &flags);
2422 	if (error == -1)
2423 		return (EFAULT);
2424 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2425 		return (error);
2426 	umtxq_lock(&key);
2427 	umtxq_busy(&key);
2428 	cnt = umtxq_count(&key);
2429 	nwake = umtxq_signal(&key, 1);
2430 	if (cnt <= nwake) {
2431 		umtxq_unlock(&key);
2432 		error = suword32(&cv->c_has_waiters, 0);
2433 		if (error == -1)
2434 			error = EFAULT;
2435 		umtxq_lock(&key);
2436 	}
2437 	umtxq_unbusy(&key);
2438 	umtxq_unlock(&key);
2439 	umtx_key_release(&key);
2440 	return (error);
2441 }
2442 
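/*
 * Broadcast a userland condition variable, waking all waiters.
 */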
2443 static int
2444 do_cv_broadcast(struct thread *td, struct ucond *cv)
2445 {
2446 	struct umtx_key key;
2447 	int error;
2448 	uint32_t flags;
2449 
2450 	error = fueword32(&cv->c_flags, &flags);
2451 	if (error == -1)
2452 		return (EFAULT);
2453 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2454 		return (error);
2455 
2456 	umtxq_lock(&key);
2457 	umtxq_busy(&key);
2458 	umtxq_signal(&key, INT_MAX);
2459 	umtxq_unlock(&key);
2460 
2461 	error = suword32(&cv->c_has_waiters, 0);
2462 	if (error == -1)
2463 		error = EFAULT;
2464 
2465 	umtxq_unbusy_unlocked(&key);
2466 
2467 	umtx_key_release(&key);
2468 	return (error);
2469 }
2470 
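/*
 * Lock a userland reader/writer lock for reading.
 */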
2471 static int
2472 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2473 {
2474 	struct abs_timeout timo;
2475 	struct umtx_q *uq;
2476 	uint32_t flags, wrflags;
2477 	int32_t state, oldstate;
2478 	int32_t blocked_readers;
2479 	int error, rv;
2480 
2481 	uq = td->td_umtxq;
2482 	error = fueword32(&rwlock->rw_flags, &flags);
2483 	if (error == -1)
2484 		return (EFAULT);
2485 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2486 	if (error != 0)
2487 		return (error);
2488 
2489 	if (timeout != NULL)
2490 		abs_timeout_init2(&timo, timeout);
2491 
2492 	wrflags = URWLOCK_WRITE_OWNER;
2493 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2494 		wrflags |= URWLOCK_WRITE_WAITERS;
2495 
2496 	for (;;) {
2497 		rv = fueword32(&rwlock->rw_state, &state);
2498 		if (rv == -1) {
2499 			umtx_key_release(&uq->uq_key);
2500 			return (EFAULT);
2501 		}
2502 
2503 		/* try to lock it */
2504 		while (!(state & wrflags)) {
2505 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2506 				umtx_key_release(&uq->uq_key);
2507 				return (EAGAIN);
2508 			}
2509 			rv = casueword32(&rwlock->rw_state, state,
2510 			    &oldstate, state + 1);
2511 			if (rv == -1) {
2512 				umtx_key_release(&uq->uq_key);
2513 				return (EFAULT);
2514 			}
2515 			if (oldstate == state) {
2516 				umtx_key_release(&uq->uq_key);
2517 				return (0);
2518 			}
2519 			error = umtxq_check_susp(td);
2520 			if (error != 0)
2521 				break;
2522 			state = oldstate;
2523 		}
2524 
2525 		if (error)
2526 			break;
2527 
2528 		/* grab monitor lock */
2529 		umtxq_lock(&uq->uq_key);
2530 		umtxq_busy(&uq->uq_key);
2531 		umtxq_unlock(&uq->uq_key);
2532 
2533 		/*
		 * Re-read the state, in case it changed between the
		 * try-lock above and the check below.
2536 		 */
2537 		rv = fueword32(&rwlock->rw_state, &state);
2538 		if (rv == -1)
2539 			error = EFAULT;
2540 
2541 		/* set read contention bit */
2542 		while (error == 0 && (state & wrflags) &&
2543 		    !(state & URWLOCK_READ_WAITERS)) {
2544 			rv = casueword32(&rwlock->rw_state, state,
2545 			    &oldstate, state | URWLOCK_READ_WAITERS);
2546 			if (rv == -1) {
2547 				error = EFAULT;
2548 				break;
2549 			}
2550 			if (oldstate == state)
2551 				goto sleep;
2552 			state = oldstate;
2553 			error = umtxq_check_susp(td);
2554 			if (error != 0)
2555 				break;
2556 		}
2557 		if (error != 0) {
2558 			umtxq_unbusy_unlocked(&uq->uq_key);
2559 			break;
2560 		}
2561 
2562 		/* state is changed while setting flags, restart */
2563 		if (!(state & wrflags)) {
2564 			umtxq_unbusy_unlocked(&uq->uq_key);
2565 			error = umtxq_check_susp(td);
2566 			if (error != 0)
2567 				break;
2568 			continue;
2569 		}
2570 
2571 sleep:
		/* The contention bit is set; bump the read-waiter count before sleeping. */
2573 		rv = fueword32(&rwlock->rw_blocked_readers,
2574 		    &blocked_readers);
2575 		if (rv == -1) {
2576 			umtxq_unbusy_unlocked(&uq->uq_key);
2577 			error = EFAULT;
2578 			break;
2579 		}
		suword32(&rwlock->rw_blocked_readers, blocked_readers + 1);
2581 
2582 		while (state & wrflags) {
2583 			umtxq_lock(&uq->uq_key);
2584 			umtxq_insert(uq);
2585 			umtxq_unbusy(&uq->uq_key);
2586 
2587 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2588 			    NULL : &timo);
2589 
2590 			umtxq_busy(&uq->uq_key);
2591 			umtxq_remove(uq);
2592 			umtxq_unlock(&uq->uq_key);
2593 			if (error)
2594 				break;
2595 			rv = fueword32(&rwlock->rw_state, &state);
2596 			if (rv == -1) {
2597 				error = EFAULT;
2598 				break;
2599 			}
2600 		}
2601 
		/* Decrease the read-waiter count; possibly clear the read contention bit. */
2603 		rv = fueword32(&rwlock->rw_blocked_readers,
2604 		    &blocked_readers);
2605 		if (rv == -1) {
2606 			umtxq_unbusy_unlocked(&uq->uq_key);
2607 			error = EFAULT;
2608 			break;
2609 		}
		suword32(&rwlock->rw_blocked_readers, blocked_readers - 1);
2611 		if (blocked_readers == 1) {
2612 			rv = fueword32(&rwlock->rw_state, &state);
2613 			if (rv == -1)
2614 				error = EFAULT;
2615 			while (error == 0) {
2616 				rv = casueword32(&rwlock->rw_state, state,
2617 				    &oldstate, state & ~URWLOCK_READ_WAITERS);
2618 				if (rv == -1) {
2619 					error = EFAULT;
2620 					break;
2621 				}
2622 				if (oldstate == state)
2623 					break;
2624 				state = oldstate;
2625 				error = umtxq_check_susp(td);
2626 			}
2627 		}
2628 
2629 		umtxq_unbusy_unlocked(&uq->uq_key);
2630 		if (error != 0)
2631 			break;
2632 	}
2633 	umtx_key_release(&uq->uq_key);
2634 	if (error == ERESTART)
2635 		error = EINTR;
2636 	return (error);
2637 }
2638 
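/*
 * Lock a userland reader/writer lock for writing.
 */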
2639 static int
2640 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2641 {
2642 	struct abs_timeout timo;
2643 	struct umtx_q *uq;
2644 	uint32_t flags;
2645 	int32_t state, oldstate;
2646 	int32_t blocked_writers;
2647 	int32_t blocked_readers;
2648 	int error, rv;
2649 
2650 	uq = td->td_umtxq;
2651 	error = fueword32(&rwlock->rw_flags, &flags);
2652 	if (error == -1)
2653 		return (EFAULT);
2654 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2655 	if (error != 0)
2656 		return (error);
2657 
2658 	if (timeout != NULL)
2659 		abs_timeout_init2(&timo, timeout);
2660 
2661 	blocked_readers = 0;
2662 	for (;;) {
2663 		rv = fueword32(&rwlock->rw_state, &state);
2664 		if (rv == -1) {
2665 			umtx_key_release(&uq->uq_key);
2666 			return (EFAULT);
2667 		}
2668 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2669 			rv = casueword32(&rwlock->rw_state, state,
2670 			    &oldstate, state | URWLOCK_WRITE_OWNER);
2671 			if (rv == -1) {
2672 				umtx_key_release(&uq->uq_key);
2673 				return (EFAULT);
2674 			}
2675 			if (oldstate == state) {
2676 				umtx_key_release(&uq->uq_key);
2677 				return (0);
2678 			}
2679 			state = oldstate;
2680 			error = umtxq_check_susp(td);
2681 			if (error != 0)
2682 				break;
2683 		}
2684 
2685 		if (error) {
2686 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2687 			    blocked_readers != 0) {
2688 				umtxq_lock(&uq->uq_key);
2689 				umtxq_busy(&uq->uq_key);
2690 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2691 				umtxq_unbusy(&uq->uq_key);
2692 				umtxq_unlock(&uq->uq_key);
2693 			}
2694 
2695 			break;
2696 		}
2697 
2698 		/* grab monitor lock */
2699 		umtxq_lock(&uq->uq_key);
2700 		umtxq_busy(&uq->uq_key);
2701 		umtxq_unlock(&uq->uq_key);
2702 
2703 		/*
		 * Re-read the state, in case it changed between the
		 * try-lock above and the check below.
2706 		 */
2707 		rv = fueword32(&rwlock->rw_state, &state);
2708 		if (rv == -1)
2709 			error = EFAULT;
2710 
2711 		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
2712 		    URWLOCK_READER_COUNT(state) != 0) &&
2713 		    (state & URWLOCK_WRITE_WAITERS) == 0) {
2714 			rv = casueword32(&rwlock->rw_state, state,
2715 			    &oldstate, state | URWLOCK_WRITE_WAITERS);
2716 			if (rv == -1) {
2717 				error = EFAULT;
2718 				break;
2719 			}
2720 			if (oldstate == state)
2721 				goto sleep;
2722 			state = oldstate;
2723 			error = umtxq_check_susp(td);
2724 			if (error != 0)
2725 				break;
2726 		}
2727 		if (error != 0) {
2728 			umtxq_unbusy_unlocked(&uq->uq_key);
2729 			break;
2730 		}
2731 
2732 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2733 			umtxq_unbusy_unlocked(&uq->uq_key);
2734 			error = umtxq_check_susp(td);
2735 			if (error != 0)
2736 				break;
2737 			continue;
2738 		}
2739 sleep:
2740 		rv = fueword32(&rwlock->rw_blocked_writers,
2741 		    &blocked_writers);
2742 		if (rv == -1) {
2743 			umtxq_unbusy_unlocked(&uq->uq_key);
2744 			error = EFAULT;
2745 			break;
2746 		}
		suword32(&rwlock->rw_blocked_writers, blocked_writers + 1);
2748 
2749 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2750 			umtxq_lock(&uq->uq_key);
2751 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2752 			umtxq_unbusy(&uq->uq_key);
2753 
2754 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2755 			    NULL : &timo);
2756 
2757 			umtxq_busy(&uq->uq_key);
2758 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2759 			umtxq_unlock(&uq->uq_key);
2760 			if (error)
2761 				break;
2762 			rv = fueword32(&rwlock->rw_state, &state);
2763 			if (rv == -1) {
2764 				error = EFAULT;
2765 				break;
2766 			}
2767 		}
2768 
2769 		rv = fueword32(&rwlock->rw_blocked_writers,
2770 		    &blocked_writers);
2771 		if (rv == -1) {
2772 			umtxq_unbusy_unlocked(&uq->uq_key);
2773 			error = EFAULT;
2774 			break;
2775 		}
		suword32(&rwlock->rw_blocked_writers, blocked_writers - 1);
2777 		if (blocked_writers == 1) {
2778 			rv = fueword32(&rwlock->rw_state, &state);
2779 			if (rv == -1) {
2780 				umtxq_unbusy_unlocked(&uq->uq_key);
2781 				error = EFAULT;
2782 				break;
2783 			}
2784 			for (;;) {
2785 				rv = casueword32(&rwlock->rw_state, state,
2786 				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
2787 				if (rv == -1) {
2788 					error = EFAULT;
2789 					break;
2790 				}
2791 				if (oldstate == state)
2792 					break;
2793 				state = oldstate;
2794 				error = umtxq_check_susp(td);
2795 				/*
				 * We may be leaving URWLOCK_WRITE_WAITERS
				 * set behind, but this does not harm
				 * correctness.
2799 				 */
2800 				if (error != 0)
2801 					break;
2802 			}
2803 			rv = fueword32(&rwlock->rw_blocked_readers,
2804 			    &blocked_readers);
2805 			if (rv == -1) {
2806 				umtxq_unbusy_unlocked(&uq->uq_key);
2807 				error = EFAULT;
2808 				break;
2809 			}
2810 		} else
2811 			blocked_readers = 0;
2812 
2813 		umtxq_unbusy_unlocked(&uq->uq_key);
2814 	}
2815 
2816 	umtx_key_release(&uq->uq_key);
2817 	if (error == ERESTART)
2818 		error = EINTR;
2819 	return (error);
2820 }
2821 
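/*
 * Unlock a userland reader/writer lock, waking queued writers or
 * readers according to the lock's preference flags.
 */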
2822 static int
2823 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2824 {
2825 	struct umtx_q *uq;
2826 	uint32_t flags;
2827 	int32_t state, oldstate;
2828 	int error, rv, q, count;
2829 
2830 	uq = td->td_umtxq;
2831 	error = fueword32(&rwlock->rw_flags, &flags);
2832 	if (error == -1)
2833 		return (EFAULT);
2834 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2835 	if (error != 0)
2836 		return (error);
2837 
2838 	error = fueword32(&rwlock->rw_state, &state);
2839 	if (error == -1) {
2840 		error = EFAULT;
2841 		goto out;
2842 	}
2843 	if (state & URWLOCK_WRITE_OWNER) {
2844 		for (;;) {
2845 			rv = casueword32(&rwlock->rw_state, state,
2846 			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
2847 			if (rv == -1) {
2848 				error = EFAULT;
2849 				goto out;
2850 			}
2851 			if (oldstate != state) {
2852 				state = oldstate;
2853 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2854 					error = EPERM;
2855 					goto out;
2856 				}
2857 				error = umtxq_check_susp(td);
2858 				if (error != 0)
2859 					goto out;
2860 			} else
2861 				break;
2862 		}
2863 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2864 		for (;;) {
2865 			rv = casueword32(&rwlock->rw_state, state,
2866 			    &oldstate, state - 1);
2867 			if (rv == -1) {
2868 				error = EFAULT;
2869 				goto out;
2870 			}
2871 			if (oldstate != state) {
2872 				state = oldstate;
2873 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2874 					error = EPERM;
2875 					goto out;
2876 				}
2877 				error = umtxq_check_susp(td);
2878 				if (error != 0)
2879 					goto out;
2880 			} else
2881 				break;
2882 		}
2883 	} else {
2884 		error = EPERM;
2885 		goto out;
2886 	}
2887 
2888 	count = 0;
2889 
2890 	if (!(flags & URWLOCK_PREFER_READER)) {
2891 		if (state & URWLOCK_WRITE_WAITERS) {
2892 			count = 1;
2893 			q = UMTX_EXCLUSIVE_QUEUE;
2894 		} else if (state & URWLOCK_READ_WAITERS) {
2895 			count = INT_MAX;
2896 			q = UMTX_SHARED_QUEUE;
2897 		}
2898 	} else {
2899 		if (state & URWLOCK_READ_WAITERS) {
2900 			count = INT_MAX;
2901 			q = UMTX_SHARED_QUEUE;
2902 		} else if (state & URWLOCK_WRITE_WAITERS) {
2903 			count = 1;
2904 			q = UMTX_EXCLUSIVE_QUEUE;
2905 		}
2906 	}
2907 
2908 	if (count) {
2909 		umtxq_lock(&uq->uq_key);
2910 		umtxq_busy(&uq->uq_key);
2911 		umtxq_signal_queue(&uq->uq_key, count, q);
2912 		umtxq_unbusy(&uq->uq_key);
2913 		umtxq_unlock(&uq->uq_key);
2914 	}
2915 out:
2916 	umtx_key_release(&uq->uq_key);
2917 	return (error);
2918 }
2919 
2920 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
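/*
 * Wait on a userland semaphore (the old _usem interface).
 */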
2921 static int
2922 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2923 {
2924 	struct abs_timeout timo;
2925 	struct umtx_q *uq;
2926 	uint32_t flags, count, count1;
2927 	int error, rv;
2928 
2929 	uq = td->td_umtxq;
2930 	error = fueword32(&sem->_flags, &flags);
2931 	if (error == -1)
2932 		return (EFAULT);
2933 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2934 	if (error != 0)
2935 		return (error);
2936 
2937 	if (timeout != NULL)
2938 		abs_timeout_init2(&timo, timeout);
2939 
2940 	umtxq_lock(&uq->uq_key);
2941 	umtxq_busy(&uq->uq_key);
2942 	umtxq_insert(uq);
2943 	umtxq_unlock(&uq->uq_key);
2944 	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
2945 	if (rv == 0)
2946 		rv = fueword32(&sem->_count, &count);
2947 	if (rv == -1 || count != 0) {
2948 		umtxq_lock(&uq->uq_key);
2949 		umtxq_unbusy(&uq->uq_key);
2950 		umtxq_remove(uq);
2951 		umtxq_unlock(&uq->uq_key);
2952 		umtx_key_release(&uq->uq_key);
2953 		return (rv == -1 ? EFAULT : 0);
2954 	}
2955 	umtxq_lock(&uq->uq_key);
2956 	umtxq_unbusy(&uq->uq_key);
2957 
2958 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2959 
2960 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2961 		error = 0;
2962 	else {
2963 		umtxq_remove(uq);
2964 		/* A relative timeout cannot be restarted. */
2965 		if (error == ERESTART && timeout != NULL &&
2966 		    (timeout->_flags & UMTX_ABSTIME) == 0)
2967 			error = EINTR;
2968 	}
2969 	umtxq_unlock(&uq->uq_key);
2970 	umtx_key_release(&uq->uq_key);
2971 	return (error);
2972 }
2973 
2974 /*
2975  * Signal a userland semaphore.
2976  */
2977 static int
2978 do_sem_wake(struct thread *td, struct _usem *sem)
2979 {
2980 	struct umtx_key key;
2981 	int error, cnt;
2982 	uint32_t flags;
2983 
2984 	error = fueword32(&sem->_flags, &flags);
2985 	if (error == -1)
2986 		return (EFAULT);
2987 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2988 		return (error);
2989 	umtxq_lock(&key);
2990 	umtxq_busy(&key);
2991 	cnt = umtxq_count(&key);
2992 	if (cnt > 0) {
2993 		umtxq_signal(&key, 1);
2994 		/*
		 * A count greater than 0 means the memory is still being
		 * referenced by user code, so we can safely update the
		 * _has_waiters flag.
2998 		 */
2999 		if (cnt == 1) {
3000 			umtxq_unlock(&key);
3001 			error = suword32(&sem->_has_waiters, 0);
3002 			umtxq_lock(&key);
3003 			if (error == -1)
3004 				error = EFAULT;
3005 		}
3006 	}
3007 	umtxq_unbusy(&key);
3008 	umtxq_unlock(&key);
3009 	umtx_key_release(&key);
3010 	return (error);
3011 }
3012 #endif
3013 
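/*
 * Wait on a userland semaphore (the _usem2 interface, which keeps the
 * waiters flag in the count word itself).
 */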
3014 static int
3015 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
3016 {
3017 	struct abs_timeout timo;
3018 	struct umtx_q *uq;
3019 	uint32_t count, flags;
3020 	int error, rv;
3021 
3022 	uq = td->td_umtxq;
	rv = fueword32(&sem->_flags, &flags);
	if (rv == -1)
		return (EFAULT);
3024 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3025 	if (error != 0)
3026 		return (error);
3027 
3028 	if (timeout != NULL)
3029 		abs_timeout_init2(&timo, timeout);
3030 
3031 	umtxq_lock(&uq->uq_key);
3032 	umtxq_busy(&uq->uq_key);
3033 	umtxq_insert(uq);
3034 	umtxq_unlock(&uq->uq_key);
3035 	rv = fueword32(&sem->_count, &count);
3036 	if (rv == -1) {
3037 		umtxq_lock(&uq->uq_key);
3038 		umtxq_unbusy(&uq->uq_key);
3039 		umtxq_remove(uq);
3040 		umtxq_unlock(&uq->uq_key);
3041 		umtx_key_release(&uq->uq_key);
3042 		return (EFAULT);
3043 	}
3044 	for (;;) {
3045 		if (USEM_COUNT(count) != 0) {
3046 			umtxq_lock(&uq->uq_key);
3047 			umtxq_unbusy(&uq->uq_key);
3048 			umtxq_remove(uq);
3049 			umtxq_unlock(&uq->uq_key);
3050 			umtx_key_release(&uq->uq_key);
3051 			return (0);
3052 		}
3053 		if (count == USEM_HAS_WAITERS)
3054 			break;
3055 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
3056 		if (rv == -1) {
3057 			umtxq_lock(&uq->uq_key);
3058 			umtxq_unbusy(&uq->uq_key);
3059 			umtxq_remove(uq);
3060 			umtxq_unlock(&uq->uq_key);
3061 			umtx_key_release(&uq->uq_key);
3062 			return (EFAULT);
3063 		}
3064 		if (count == 0)
3065 			break;
3066 	}
3067 	umtxq_lock(&uq->uq_key);
3068 	umtxq_unbusy(&uq->uq_key);
3069 
3070 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3071 
3072 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3073 		error = 0;
3074 	else {
3075 		umtxq_remove(uq);
3076 		/* A relative timeout cannot be restarted. */
3077 		if (error == ERESTART && timeout != NULL &&
3078 		    (timeout->_flags & UMTX_ABSTIME) == 0)
3079 			error = EINTR;
3080 	}
3081 	umtxq_unlock(&uq->uq_key);
3082 	umtx_key_release(&uq->uq_key);
3083 	return (error);
3084 }
3085 
3086 /*
3087  * Signal a userland semaphore.
3088  */
3089 static int
3090 do_sem2_wake(struct thread *td, struct _usem2 *sem)
3091 {
3092 	struct umtx_key key;
3093 	int error, cnt, rv;
3094 	uint32_t count, flags;
3095 
3096 	rv = fueword32(&sem->_flags, &flags);
3097 	if (rv == -1)
3098 		return (EFAULT);
3099 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3100 		return (error);
3101 	umtxq_lock(&key);
3102 	umtxq_busy(&key);
3103 	cnt = umtxq_count(&key);
3104 	if (cnt > 0) {
3105 		umtxq_signal(&key, 1);
3106 
3107 		/*
3108 		 * If this was the last sleeping thread, clear the waiters
3109 		 * flag in _count.
3110 		 */
3111 		if (cnt == 1) {
3112 			umtxq_unlock(&key);
3113 			rv = fueword32(&sem->_count, &count);
3114 			while (rv != -1 && count & USEM_HAS_WAITERS)
3115 				rv = casueword32(&sem->_count, count, &count,
3116 				    count & ~USEM_HAS_WAITERS);
3117 			if (rv == -1)
3118 				error = EFAULT;
3119 			umtxq_lock(&key);
3120 		}
3121 	}
3122 	umtxq_unbusy(&key);
3123 	umtxq_unlock(&key);
3124 	umtx_key_release(&key);
3125 	return (error);
3126 }
3127 
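/*
 * Copy in and validate a userland struct timespec.
 */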
3128 inline int
3129 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3130 {
3131 	int error;
3132 
3133 	error = copyin(addr, tsp, sizeof(struct timespec));
3134 	if (error == 0) {
3135 		if (tsp->tv_sec < 0 ||
3136 		    tsp->tv_nsec >= 1000000000 ||
3137 		    tsp->tv_nsec < 0)
3138 			error = EINVAL;
3139 	}
3140 	return (error);
3141 }
3142 
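/*
 * Copy in a userland timeout argument.  The size selects between a
 * bare struct timespec (relative, CLOCK_REALTIME) and a full struct
 * _umtx_time carrying flags and a clock id.
 */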
3143 static inline int
3144 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3145 {
3146 	int error;
3147 
3148 	if (size <= sizeof(struct timespec)) {
3149 		tp->_clockid = CLOCK_REALTIME;
3150 		tp->_flags = 0;
3151 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3152 	} else
3153 		error = copyin(addr, tp, sizeof(struct _umtx_time));
3154 	if (error != 0)
3155 		return (error);
3156 	if (tp->_timeout.tv_sec < 0 ||
3157 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3158 		return (EINVAL);
3159 	return (0);
3160 }
3161 
3162 static int
3163 __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
3164 {
3165 
3166 	return (EOPNOTSUPP);
3167 }
3168 
3169 static int
3170 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3171 {
3172 	struct _umtx_time timeout, *tm_p;
3173 	int error;
3174 
3175 	if (uap->uaddr2 == NULL)
3176 		tm_p = NULL;
3177 	else {
3178 		error = umtx_copyin_umtx_time(
3179 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3180 		if (error != 0)
3181 			return (error);
3182 		tm_p = &timeout;
3183 	}
	return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0));
3185 }
3186 
3187 static int
3188 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3189 {
3190 	struct _umtx_time timeout, *tm_p;
3191 	int error;
3192 
3193 	if (uap->uaddr2 == NULL)
3194 		tm_p = NULL;
3195 	else {
3196 		error = umtx_copyin_umtx_time(
3197 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3198 		if (error != 0)
3199 			return (error);
3200 		tm_p = &timeout;
3201 	}
	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
3203 }
3204 
3205 static int
3206 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3207 {
3208 	struct _umtx_time *tm_p, timeout;
3209 	int error;
3210 
3211 	if (uap->uaddr2 == NULL)
3212 		tm_p = NULL;
3213 	else {
3214 		error = umtx_copyin_umtx_time(
3215 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3216 		if (error != 0)
3217 			return (error);
3218 		tm_p = &timeout;
3219 	}
	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
3221 }
3222 
3223 static int
3224 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3225 {
3226 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3227 }
3228 
3229 #define BATCH_SIZE	128
3230 static int
3231 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3232 {
3233 	int count = uap->val;
3234 	void *uaddrs[BATCH_SIZE];
3235 	char **upp = (char **)uap->obj;
3236 	int tocopy;
3237 	int error = 0;
3238 	int i, pos = 0;
3239 
3240 	while (count > 0) {
3241 		tocopy = count;
3242 		if (tocopy > BATCH_SIZE)
3243 			tocopy = BATCH_SIZE;
		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
3245 		if (error != 0)
3246 			break;
3247 		for (i = 0; i < tocopy; ++i)
3248 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3249 		count -= tocopy;
3250 		pos += tocopy;
3251 	}
3252 	return (error);
3253 }
3254 
3255 static int
3256 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3257 {
3258 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3259 }
3260 
3261 static int
3262 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3263 {
3264 	struct _umtx_time *tm_p, timeout;
3265 	int error;
3266 
3267 	/* Allow a null timespec (wait forever). */
3268 	if (uap->uaddr2 == NULL)
3269 		tm_p = NULL;
3270 	else {
3271 		error = umtx_copyin_umtx_time(
3272 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3273 		if (error != 0)
3274 			return (error);
3275 		tm_p = &timeout;
3276 	}
	return (do_lock_umutex(td, uap->obj, tm_p, 0));
3278 }
3279 
3280 static int
3281 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3282 {
	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3284 }
3285 
3286 static int
3287 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3288 {
3289 	struct _umtx_time *tm_p, timeout;
3290 	int error;
3291 
3292 	/* Allow a null timespec (wait forever). */
3293 	if (uap->uaddr2 == NULL)
3294 		tm_p = NULL;
3295 	else {
3296 		error = umtx_copyin_umtx_time(
3297 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3298 		if (error != 0)
3299 			return (error);
3300 		tm_p = &timeout;
3301 	}
	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
3303 }
3304 
3305 static int
3306 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3307 {
	return (do_wake_umutex(td, uap->obj));
3309 }
3310 
3311 static int
3312 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3313 {
	return (do_unlock_umutex(td, uap->obj));
3315 }
3316 
3317 static int
3318 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3319 {
	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3321 }
3322 
3323 static int
3324 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3325 {
3326 	struct timespec *ts, timeout;
3327 	int error;
3328 
3329 	/* Allow a null timespec (wait forever). */
3330 	if (uap->uaddr2 == NULL)
3331 		ts = NULL;
3332 	else {
3333 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3334 		if (error != 0)
3335 			return (error);
3336 		ts = &timeout;
3337 	}
3338 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3339 }
3340 
3341 static int
3342 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3343 {
	return (do_cv_signal(td, uap->obj));
3345 }
3346 
3347 static int
3348 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3349 {
	return (do_cv_broadcast(td, uap->obj));
3351 }
3352 
3353 static int
3354 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3355 {
3356 	struct _umtx_time timeout;
3357 	int error;
3358 
3359 	/* Allow a null timespec (wait forever). */
3360 	if (uap->uaddr2 == NULL) {
3361 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3362 	} else {
3363 		error = umtx_copyin_umtx_time(uap->uaddr2,
3364 		   (size_t)uap->uaddr1, &timeout);
3365 		if (error != 0)
3366 			return (error);
3367 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3368 	}
3369 	return (error);
3370 }
3371 
3372 static int
3373 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3374 {
3375 	struct _umtx_time timeout;
3376 	int error;
3377 
3378 	/* Allow a null timespec (wait forever). */
3379 	if (uap->uaddr2 == NULL) {
3380 		error = do_rw_wrlock(td, uap->obj, 0);
3381 	} else {
3382 		error = umtx_copyin_umtx_time(uap->uaddr2,
3383 		   (size_t)uap->uaddr1, &timeout);
3384 		if (error != 0)
3385 			return (error);
3386 
3387 		error = do_rw_wrlock(td, uap->obj, &timeout);
3388 	}
3389 	return (error);
3390 }
3391 
3392 static int
3393 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3394 {
	return (do_rw_unlock(td, uap->obj));
3396 }
3397 
3398 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3399 static int
3400 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3401 {
3402 	struct _umtx_time *tm_p, timeout;
3403 	int error;
3404 
3405 	/* Allow a null timespec (wait forever). */
3406 	if (uap->uaddr2 == NULL)
3407 		tm_p = NULL;
3408 	else {
3409 		error = umtx_copyin_umtx_time(
3410 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3411 		if (error != 0)
3412 			return (error);
3413 		tm_p = &timeout;
3414 	}
3415 	return (do_sem_wait(td, uap->obj, tm_p));
3416 }
3417 
3418 static int
3419 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3420 {
3421 
3422 	return (do_sem_wake(td, uap->obj));
3423 }
3424 #endif
3425 
3426 static int
3427 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3428 {
3429 
3430 	return (do_wake2_umutex(td, uap->obj, uap->val));
3431 }
3432 
3433 static int
3434 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
3435 {
3436 	struct _umtx_time *tm_p, timeout;
3437 	int error;
3438 
3439 	/* Allow a null timespec (wait forever). */
3440 	if (uap->uaddr2 == NULL)
3441 		tm_p = NULL;
3442 	else {
3443 		error = umtx_copyin_umtx_time(
3444 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3445 		if (error != 0)
3446 			return (error);
3447 		tm_p = &timeout;
3448 	}
3449 	return (do_sem2_wait(td, uap->obj, tm_p));
3450 }
3451 
3452 static int
3453 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
3454 {
3455 
3456 	return (do_sem2_wake(td, uap->obj));
3457 }
3458 
3459 #define	USHM_OBJ_UMTX(o)						\
3460     ((struct umtx_shm_obj_list *)(&(o)->umtx_data))
3461 
3462 #define	USHMF_REG_LINKED	0x0001
3463 #define	USHMF_OBJ_LINKED	0x0002
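/*
 * Registry entry tying a shared umtx key to an anonymous POSIX shared
 * memory object; the registry backs the UMTX_OP_SHM operation.
 */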
3464 struct umtx_shm_reg {
3465 	TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link;
3466 	LIST_ENTRY(umtx_shm_reg) ushm_obj_link;
3467 	struct umtx_key		ushm_key;
3468 	struct ucred		*ushm_cred;
3469 	struct shmfd		*ushm_obj;
3470 	u_int			ushm_refcnt;
3471 	u_int			ushm_flags;
3472 };
3473 
3474 LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg);
3475 TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg);
3476 
3477 static uma_zone_t umtx_shm_reg_zone;
3478 static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS];
3479 static struct mtx umtx_shm_lock;
3480 static struct umtx_shm_reg_head umtx_shm_reg_delfree =
3481     TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree);
3482 
3483 static void umtx_shm_free_reg(struct umtx_shm_reg *reg);
3484 
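/*
 * Taskqueue callback that frees registry entries deferred for
 * destruction by umtx_shm_object_terminated().
 */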
3485 static void
3486 umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused)
3487 {
3488 	struct umtx_shm_reg_head d;
3489 	struct umtx_shm_reg *reg, *reg1;
3490 
3491 	TAILQ_INIT(&d);
3492 	mtx_lock(&umtx_shm_lock);
3493 	TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link);
3494 	mtx_unlock(&umtx_shm_lock);
3495 	TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) {
3496 		TAILQ_REMOVE(&d, reg, ushm_reg_link);
3497 		umtx_shm_free_reg(reg);
3498 	}
3499 }
3500 
3501 static struct task umtx_shm_reg_delfree_task =
3502     TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL);
3503 
3504 static struct umtx_shm_reg *
3505 umtx_shm_find_reg_locked(const struct umtx_key *key)
3506 {
3507 	struct umtx_shm_reg *reg;
3508 	struct umtx_shm_reg_head *reg_head;
3509 
	KASSERT(key->shared, ("umtx_shm_find_reg_locked: private key"));
3511 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3512 	reg_head = &umtx_shm_registry[key->hash];
3513 	TAILQ_FOREACH(reg, reg_head, ushm_reg_link) {
3514 		KASSERT(reg->ushm_key.shared,
3515 		    ("non-shared key on reg %p %d", reg, reg->ushm_key.shared));
3516 		if (reg->ushm_key.info.shared.object ==
3517 		    key->info.shared.object &&
3518 		    reg->ushm_key.info.shared.offset ==
3519 		    key->info.shared.offset) {
3520 			KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM"));
3521 			KASSERT(reg->ushm_refcnt > 0,
3522 			    ("reg %p refcnt 0 onlist", reg));
3523 			KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0,
3524 			    ("reg %p not linked", reg));
3525 			reg->ushm_refcnt++;
3526 			return (reg);
3527 		}
3528 	}
3529 	return (NULL);
3530 }
3531 
3532 static struct umtx_shm_reg *
3533 umtx_shm_find_reg(const struct umtx_key *key)
3534 {
3535 	struct umtx_shm_reg *reg;
3536 
3537 	mtx_lock(&umtx_shm_lock);
3538 	reg = umtx_shm_find_reg_locked(key);
3539 	mtx_unlock(&umtx_shm_lock);
3540 	return (reg);
3541 }
3542 
3543 static void
3544 umtx_shm_free_reg(struct umtx_shm_reg *reg)
3545 {
3546 
3547 	chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0);
3548 	crfree(reg->ushm_cred);
3549 	shm_drop(reg->ushm_obj);
3550 	uma_zfree(umtx_shm_reg_zone, reg);
3551 }
3552 
3553 static bool
3554 umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force)
3555 {
3556 	bool res;
3557 
3558 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3559 	KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg));
3560 	reg->ushm_refcnt--;
3561 	res = reg->ushm_refcnt == 0;
3562 	if (res || force) {
3563 		if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) {
3564 			TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash],
3565 			    reg, ushm_reg_link);
3566 			reg->ushm_flags &= ~USHMF_REG_LINKED;
3567 		}
3568 		if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) {
3569 			LIST_REMOVE(reg, ushm_obj_link);
3570 			reg->ushm_flags &= ~USHMF_OBJ_LINKED;
3571 		}
3572 	}
3573 	return (res);
3574 }
3575 
3576 static void
3577 umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force)
3578 {
3579 	vm_object_t object;
3580 	bool dofree;
3581 
3582 	if (force) {
3583 		object = reg->ushm_obj->shm_object;
3584 		VM_OBJECT_WLOCK(object);
3585 		object->flags |= OBJ_UMTXDEAD;
3586 		VM_OBJECT_WUNLOCK(object);
3587 	}
3588 	mtx_lock(&umtx_shm_lock);
3589 	dofree = umtx_shm_unref_reg_locked(reg, force);
3590 	mtx_unlock(&umtx_shm_lock);
3591 	if (dofree)
3592 		umtx_shm_free_reg(reg);
3593 }
3594 
3595 void
3596 umtx_shm_object_init(vm_object_t object)
3597 {
3598 
3599 	LIST_INIT(USHM_OBJ_UMTX(object));
3600 }
3601 
3602 void
3603 umtx_shm_object_terminated(vm_object_t object)
3604 {
3605 	struct umtx_shm_reg *reg, *reg1;
3606 	bool dofree;
3607 
3608 	dofree = false;
3609 	mtx_lock(&umtx_shm_lock);
3610 	LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) {
3611 		if (umtx_shm_unref_reg_locked(reg, true)) {
3612 			TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg,
3613 			    ushm_reg_link);
3614 			dofree = true;
3615 		}
3616 	}
3617 	mtx_unlock(&umtx_shm_lock);
3618 	if (dofree)
3619 		taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task);
3620 }
3621 
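/*
 * Find or create the registry entry for the given shared key,
 * charging the allocation against the caller's RLIMIT_UMTXP.
 */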
3622 static int
3623 umtx_shm_create_reg(struct thread *td, const struct umtx_key *key,
3624     struct umtx_shm_reg **res)
3625 {
3626 	struct umtx_shm_reg *reg, *reg1;
3627 	struct ucred *cred;
3628 	int error;
3629 
3630 	reg = umtx_shm_find_reg(key);
3631 	if (reg != NULL) {
3632 		*res = reg;
3633 		return (0);
3634 	}
3635 	cred = td->td_ucred;
3636 	if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP)))
3637 		return (ENOMEM);
3638 	reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
3639 	reg->ushm_refcnt = 1;
3640 	bcopy(key, &reg->ushm_key, sizeof(*key));
3641 	reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
3642 	reg->ushm_cred = crhold(cred);
3643 	error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
3644 	if (error != 0) {
3645 		umtx_shm_free_reg(reg);
3646 		return (error);
3647 	}
3648 	mtx_lock(&umtx_shm_lock);
3649 	reg1 = umtx_shm_find_reg_locked(key);
3650 	if (reg1 != NULL) {
3651 		mtx_unlock(&umtx_shm_lock);
3652 		umtx_shm_free_reg(reg);
3653 		*res = reg1;
3654 		return (0);
3655 	}
3656 	reg->ushm_refcnt++;
3657 	TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link);
3658 	LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg,
3659 	    ushm_obj_link);
3660 	reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED;
3661 	mtx_unlock(&umtx_shm_lock);
3662 	*res = reg;
3663 	return (0);
3664 }
3665 
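/*
 * Check whether the address still maps an object usable for a shared
 * umtx; ENOTTY indicates that the backing object is marked dead.
 */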
3666 static int
3667 umtx_shm_alive(struct thread *td, void *addr)
3668 {
3669 	vm_map_t map;
3670 	vm_map_entry_t entry;
3671 	vm_object_t object;
3672 	vm_pindex_t pindex;
3673 	vm_prot_t prot;
3674 	int res, ret;
3675 	boolean_t wired;
3676 
3677 	map = &td->td_proc->p_vmspace->vm_map;
3678 	res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry,
3679 	    &object, &pindex, &prot, &wired);
3680 	if (res != KERN_SUCCESS)
3681 		return (EFAULT);
3682 	if (object == NULL)
3683 		ret = EINVAL;
3684 	else
3685 		ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0;
3686 	vm_map_lookup_done(map, entry);
3687 	return (ret);
3688 }
3689 
3690 static void
3691 umtx_shm_init(void)
3692 {
3693 	int i;
3694 
3695 	umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg),
3696 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
3697 	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
3698 	for (i = 0; i < nitems(umtx_shm_registry); i++)
3699 		TAILQ_INIT(&umtx_shm_registry[i]);
3700 }
3701 
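/*
 * Implement UMTX_OP_SHM: create, look up, destroy or liveness-check
 * the shared object registered for the umtx word at addr.  On create
 * or lookup, a file descriptor for the object is returned in
 * td_retval[0].
 */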
3702 static int
3703 umtx_shm(struct thread *td, void *addr, u_int flags)
3704 {
3705 	struct umtx_key key;
3706 	struct umtx_shm_reg *reg;
3707 	struct file *fp;
3708 	int error, fd;
3709 
3710 	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
	    UMTX_SHM_DESTROY | UMTX_SHM_ALIVE)) != 1)
3712 		return (EINVAL);
3713 	if ((flags & UMTX_SHM_ALIVE) != 0)
3714 		return (umtx_shm_alive(td, addr));
3715 	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
3716 	if (error != 0)
3717 		return (error);
3718 	KASSERT(key.shared == 1, ("non-shared key"));
3719 	if ((flags & UMTX_SHM_CREAT) != 0) {
3720 		error = umtx_shm_create_reg(td, &key, &reg);
3721 	} else {
3722 		reg = umtx_shm_find_reg(&key);
3723 		if (reg == NULL)
3724 			error = ESRCH;
3725 	}
3726 	umtx_key_release(&key);
3727 	if (error != 0)
3728 		return (error);
3729 	KASSERT(reg != NULL, ("no reg"));
3730 	if ((flags & UMTX_SHM_DESTROY) != 0) {
3731 		umtx_shm_unref_reg(reg, true);
3732 	} else {
3733 #if 0
3734 #ifdef MAC
3735 		error = mac_posixshm_check_open(td->td_ucred,
3736 		    reg->ushm_obj, FFLAGS(O_RDWR));
3737 		if (error == 0)
3738 #endif
3739 			error = shm_access(reg->ushm_obj, td->td_ucred,
3740 			    FFLAGS(O_RDWR));
3741 		if (error == 0)
3742 #endif
3743 			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
3744 		if (error == 0) {
3745 			shm_hold(reg->ushm_obj);
3746 			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
3747 			    &shm_ops);
3748 			td->td_retval[0] = fd;
3749 			fdrop(fp, td);
3750 		}
3751 	}
3752 	umtx_shm_unref_reg(reg, false);
3753 	return (error);
3754 }
3755 
3756 static int
3757 __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap)
3758 {
3759 
3760 	return (umtx_shm(td, uap->uaddr1, uap->val));
3761 }
3762 
3763 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3764 
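/*
 * Dispatch table for _umtx_op(2); the op argument indexes this table.
 * A minimal userland sketch, assuming the definitions from
 * <sys/umtx.h>:
 *
 *	struct umutex mtx = { .m_owner = UMUTEX_UNOWNED };
 *	_umtx_op(&mtx, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);
 *	_umtx_op(&mtx, UMTX_OP_MUTEX_UNLOCK, 0, NULL, NULL);
 */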
3765 static const _umtx_op_func op_table[] = {
3766 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
3767 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
3768 	[UMTX_OP_WAIT]		= __umtx_op_wait,
3769 	[UMTX_OP_WAKE]		= __umtx_op_wake,
3770 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
3771 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
3772 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
3773 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
3774 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
3775 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
3776 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
3777 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
3778 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
3779 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
3780 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
3781 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
3782 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
3783 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
3784 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
3785 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3786 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
3787 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
3788 #else
3789 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
3790 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
3791 #endif
3792 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
3793 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
3794 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
3795 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
3796 	[UMTX_OP_SHM]		= __umtx_op_shm,
3797 };
3798 
3799 int
3800 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3801 {
3802 
3803 	if ((unsigned)uap->op < nitems(op_table))
3804 		return (*op_table[uap->op])(td, uap);
3805 	return (EINVAL);
3806 }
3807 
3808 #ifdef COMPAT_FREEBSD32
3809 
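/*
 * 32-bit layouts of the timeout structures passed by COMPAT_FREEBSD32
 * processes; they are converted to the native forms before use.
 */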
3810 struct timespec32 {
3811 	int32_t tv_sec;
3812 	int32_t tv_nsec;
3813 };
3814 
3815 struct umtx_time32 {
3816 	struct	timespec32	timeout;
3817 	uint32_t		flags;
3818 	uint32_t		clockid;
3819 };
3820 
3821 static inline int
3822 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3823 {
3824 	struct timespec32 ts32;
3825 	int error;
3826 
3827 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3828 	if (error == 0) {
3829 		if (ts32.tv_sec < 0 ||
3830 		    ts32.tv_nsec >= 1000000000 ||
3831 		    ts32.tv_nsec < 0)
3832 			error = EINVAL;
3833 		else {
3834 			tsp->tv_sec = ts32.tv_sec;
3835 			tsp->tv_nsec = ts32.tv_nsec;
3836 		}
3837 	}
3838 	return (error);
3839 }
3840 
3841 static inline int
3842 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3843 {
3844 	struct umtx_time32 t32;
3845 	int error;
3846 
3847 	t32.clockid = CLOCK_REALTIME;
3848 	t32.flags   = 0;
3849 	if (size <= sizeof(struct timespec32))
3850 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3851 	else
3852 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3853 	if (error != 0)
3854 		return (error);
3855 	if (t32.timeout.tv_sec < 0 ||
3856 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3857 		return (EINVAL);
3858 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3859 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3860 	tp->_flags = t32.flags;
3861 	tp->_clockid = t32.clockid;
3862 	return (0);
3863 }
3864 
3865 static int
3866 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3867 {
3868 	struct _umtx_time *tm_p, timeout;
3869 	int error;
3870 
3871 	if (uap->uaddr2 == NULL)
3872 		tm_p = NULL;
3873 	else {
3874 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3875 			(size_t)uap->uaddr1, &timeout);
3876 		if (error != 0)
3877 			return (error);
3878 		tm_p = &timeout;
3879 	}
	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
3881 }
3882 
3883 static int
3884 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3885 {
3886 	struct _umtx_time *tm_p, timeout;
3887 	int error;
3888 
3889 	/* Allow a null timespec (wait forever). */
3890 	if (uap->uaddr2 == NULL)
3891 		tm_p = NULL;
3892 	else {
		error = umtx_copyin_umtx_time32(uap->uaddr2,
		    (size_t)uap->uaddr1, &timeout);
3895 		if (error != 0)
3896 			return (error);
3897 		tm_p = &timeout;
3898 	}
	return (do_lock_umutex(td, uap->obj, tm_p, 0));
3900 }
3901 
3902 static int
3903 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3904 {
3905 	struct _umtx_time *tm_p, timeout;
3906 	int error;
3907 
3908 	/* Allow a null timespec (wait forever). */
3909 	if (uap->uaddr2 == NULL)
3910 		tm_p = NULL;
3911 	else {
3912 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3913 		    (size_t)uap->uaddr1, &timeout);
3914 		if (error != 0)
3915 			return (error);
3916 		tm_p = &timeout;
3917 	}
	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
3919 }
3920 
3921 static int
3922 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3923 {
3924 	struct timespec *ts, timeout;
3925 	int error;
3926 
3927 	/* Allow a null timespec (wait forever). */
3928 	if (uap->uaddr2 == NULL)
3929 		ts = NULL;
3930 	else {
3931 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3932 		if (error != 0)
3933 			return (error);
3934 		ts = &timeout;
3935 	}
3936 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3937 }
3938 
3939 static int
3940 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3941 {
3942 	struct _umtx_time timeout;
3943 	int error;
3944 
3945 	/* Allow a null timespec (wait forever). */
3946 	if (uap->uaddr2 == NULL) {
3947 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3948 	} else {
3949 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3950 		    (size_t)uap->uaddr1, &timeout);
3951 		if (error != 0)
3952 			return (error);
3953 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3954 	}
3955 	return (error);
3956 }
3957 
3958 static int
3959 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3960 {
3961 	struct _umtx_time timeout;
3962 	int error;
3963 
3964 	/* Allow a null timespec (wait forever). */
3965 	if (uap->uaddr2 == NULL) {
3966 		error = do_rw_wrlock(td, uap->obj, 0);
3967 	} else {
3968 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3969 		    (size_t)uap->uaddr1, &timeout);
3970 		if (error != 0)
3971 			return (error);
3972 		error = do_rw_wrlock(td, uap->obj, &timeout);
3973 	}
3974 	return (error);
3975 }
3976 
3977 static int
3978 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3979 {
3980 	struct _umtx_time *tm_p, timeout;
3981 	int error;
3982 
3983 	if (uap->uaddr2 == NULL)
3984 		tm_p = NULL;
3985 	else {
3986 		error = umtx_copyin_umtx_time32(
3987 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3988 		if (error != 0)
3989 			return (error);
3990 		tm_p = &timeout;
3991 	}
	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
3993 }
3994 
3995 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3996 static int
3997 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3998 {
3999 	struct _umtx_time *tm_p, timeout;
4000 	int error;
4001 
4002 	/* Allow a null timespec (wait forever). */
4003 	if (uap->uaddr2 == NULL)
4004 		tm_p = NULL;
4005 	else {
4006 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4007 		    (size_t)uap->uaddr1, &timeout);
4008 		if (error != 0)
4009 			return (error);
4010 		tm_p = &timeout;
4011 	}
4012 	return (do_sem_wait(td, uap->obj, tm_p));
4013 }
4014 #endif
4015 
4016 static int
4017 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4018 {
4019 	struct _umtx_time *tm_p, timeout;
4020 	int error;
4021 
4022 	/* Allow a null timespec (wait forever). */
4023 	if (uap->uaddr2 == NULL)
4024 		tm_p = NULL;
4025 	else {
4026 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4027 		    (size_t)uap->uaddr1, &timeout);
4028 		if (error != 0)
4029 			return (error);
4030 		tm_p = &timeout;
4031 	}
4032 	return (do_sem2_wait(td, uap->obj, tm_p));
4033 }
4034 
4035 static int
4036 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
4037 {
4038 	int count = uap->val;
4039 	uint32_t uaddrs[BATCH_SIZE];
4040 	uint32_t **upp = (uint32_t **)uap->obj;
4041 	int tocopy;
4042 	int error = 0;
4043 	int i, pos = 0;
4044 
4045 	while (count > 0) {
4046 		tocopy = count;
4047 		if (tocopy > BATCH_SIZE)
4048 			tocopy = BATCH_SIZE;
		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
4050 		if (error != 0)
4051 			break;
4052 		for (i = 0; i < tocopy; ++i)
4053 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
4054 				INT_MAX, 1);
4055 		count -= tocopy;
4056 		pos += tocopy;
4057 	}
4058 	return (error);
4059 }
4060 
4061 static const _umtx_op_func op_table_compat32[] = {
4062 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
4063 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
	[UMTX_OP_WAIT]		= __umtx_op_wait_compat32,
	[UMTX_OP_WAKE]		= __umtx_op_wake,
4066 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
4067 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex_compat32,
4068 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
4069 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
4070 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait_compat32,
4071 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
4072 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
4073 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_compat32,
4074 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock_compat32,
4075 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock_compat32,
4076 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4077 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
4078 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4079 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex_compat32,
4080 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4081 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4082 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait_compat32,
4083 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4084 #else
4085 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4086 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4087 #endif
4088 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private32,
4089 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4090 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait_compat32,
4091 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4092 	[UMTX_OP_SHM]		= __umtx_op_shm,
4093 };
4094 
4095 int
4096 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
4097 {
4098 
4099 	if ((unsigned)uap->op < nitems(op_table_compat32)) {
4100 		return (*op_table_compat32[uap->op])(td,
4101 		    (struct _umtx_op_args *)uap);
4102 	}
4103 	return (EINVAL);
4104 }
4105 #endif
4106 
4107 void
4108 umtx_thread_init(struct thread *td)
4109 {
4110 	td->td_umtxq = umtxq_alloc();
4111 	td->td_umtxq->uq_thread = td;
4112 }
4113 
4114 void
4115 umtx_thread_fini(struct thread *td)
4116 {
4117 	umtxq_free(td->td_umtxq);
4118 }
4119 
4120 /*
 * Called when a new thread is created, e.g. by fork().
4122  */
4123 void
4124 umtx_thread_alloc(struct thread *td)
4125 {
4126 	struct umtx_q *uq;
4127 
4128 	uq = td->td_umtxq;
4129 	uq->uq_inherited_pri = PRI_MAX;
4130 
4131 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
4132 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
4133 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
4134 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
4135 }
4136 
4137 /*
4138  * exec() hook.
4139  */
4140 static void
4141 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
4142 	struct image_params *imgp __unused)
4143 {
4144 	umtx_thread_cleanup(curthread);
4145 }
4146 
4147 /*
4148  * thread_exit() hook.
4149  */
4150 void
4151 umtx_thread_exit(struct thread *td)
4152 {
4153 	umtx_thread_cleanup(td);
4154 }
4155 
4156 /*
 * Clean up umtx data: disown any contested PI mutexes and drop the
 * lent user priority.
4158  */
4159 static void
4160 umtx_thread_cleanup(struct thread *td)
4161 {
4162 	struct umtx_q *uq;
4163 	struct umtx_pi *pi;
4164 
4165 	if ((uq = td->td_umtxq) == NULL)
4166 		return;
4167 
4168 	mtx_lock(&umtx_lock);
4169 	uq->uq_inherited_pri = PRI_MAX;
4170 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
4171 		pi->pi_owner = NULL;
4172 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
4173 	}
4174 	mtx_unlock(&umtx_lock);
4175 	thread_lock(td);
4176 	sched_lend_user_prio(td, PRI_MAX);
4177 	thread_unlock(td);
4178 }
4179