xref: /freebsd/sys/kern/kern_umtx.c (revision 6683132d54bd6d589889e43dabdc53d35e38a028)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2015, 2016 The FreeBSD Foundation
5  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
6  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
7  * All rights reserved.
8  *
9  * Portions of this software were developed by Konstantin Belousov
10  * under sponsorship from the FreeBSD Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice unmodified, this list of conditions, and the following
17  *    disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include "opt_umtx_profiling.h"
38 
39 #include <sys/param.h>
40 #include <sys/kernel.h>
41 #include <sys/fcntl.h>
42 #include <sys/file.h>
43 #include <sys/filedesc.h>
44 #include <sys/limits.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mman.h>
48 #include <sys/mutex.h>
49 #include <sys/priv.h>
50 #include <sys/proc.h>
51 #include <sys/resource.h>
52 #include <sys/resourcevar.h>
53 #include <sys/rwlock.h>
54 #include <sys/sbuf.h>
55 #include <sys/sched.h>
56 #include <sys/smp.h>
57 #include <sys/sysctl.h>
58 #include <sys/sysent.h>
59 #include <sys/systm.h>
60 #include <sys/sysproto.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/taskqueue.h>
63 #include <sys/time.h>
64 #include <sys/eventhandler.h>
65 #include <sys/umtx.h>
66 
67 #include <security/mac/mac_framework.h>
68 
69 #include <vm/vm.h>
70 #include <vm/vm_param.h>
71 #include <vm/pmap.h>
72 #include <vm/vm_map.h>
73 #include <vm/vm_object.h>
74 
75 #include <machine/atomic.h>
76 #include <machine/cpu.h>
77 
78 #ifdef COMPAT_FREEBSD32
79 #include <compat/freebsd32/freebsd32_proto.h>
80 #endif
81 
82 #define _UMUTEX_TRY		1
83 #define _UMUTEX_WAIT		2
84 
85 #ifdef UMTX_PROFILING
86 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
87 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
88 #endif
89 
90 /* Priority inheritance mutex info. */
91 struct umtx_pi {
92 	/* Owner thread */
93 	struct thread		*pi_owner;
94 
95 	/* Reference count */
96 	int			pi_refcount;
97 
98 	/* List entry to link PI mutexes held by a thread */
99 	TAILQ_ENTRY(umtx_pi)	pi_link;
100 
101 	/* List entry in hash */
102 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
103 
104 	/* List for waiters */
105 	TAILQ_HEAD(,umtx_q)	pi_blocked;
106 
107 	/* Identify a userland lock object */
108 	struct umtx_key		pi_key;
109 };
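
/*
 * Locking note (descriptive, inferred from the assertions in the pi
 * helpers below): pi_owner, pi_link and pi_blocked are protected by the
 * global umtx_lock, while pi_refcount and pi_hashlink are protected by
 * the lock of the umtxq chain that pi_key hashes to.
 */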
110 
111 /* A userland synchronization object user. */
112 struct umtx_q {
113 	/* Linked list for the hash. */
114 	TAILQ_ENTRY(umtx_q)	uq_link;
115 
116 	/* Umtx key. */
117 	struct umtx_key		uq_key;
118 
119 	/* Umtx flags. */
120 	int			uq_flags;
121 #define UQF_UMTXQ	0x0001
122 
123 	/* The waiting thread. */
124 	struct thread		*uq_thread;
125 
126 	/*
127 	 * The PI mutex this thread is blocked on.  Reads may use either
128 	 * the chain lock or umtx_lock; writes must hold both the chain
129 	 * lock and umtx_lock.
130 	 */
131 	struct umtx_pi		*uq_pi_blocked;
132 
133 	/* On blocked list */
134 	TAILQ_ENTRY(umtx_q)	uq_lockq;
135 
136 	/* Contested PI mutexes owned by this thread */
137 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
138 
139 	/* Inherited priority from PP mutex */
140 	u_char			uq_inherited_pri;
141 
142 	/* Spare queue ready to be reused */
143 	struct umtxq_queue	*uq_spare_queue;
144 
145 	/* The queue we are on */
146 	struct umtxq_queue	*uq_cur_queue;
147 };
148 
149 TAILQ_HEAD(umtxq_head, umtx_q);
150 
151 /* Per-key wait-queue */
152 struct umtxq_queue {
153 	struct umtxq_head	head;
154 	struct umtx_key		key;
155 	LIST_ENTRY(umtxq_queue)	link;
156 	int			length;
157 };
158 
159 LIST_HEAD(umtxq_list, umtxq_queue);
160 
161 /* Userland lock object's wait-queue chain */
162 struct umtxq_chain {
163 	/* Lock for this chain. */
164 	struct mtx		uc_lock;
165 
166 	/* List of sleep queues. */
167 	struct umtxq_list	uc_queue[2];
168 #define UMTX_SHARED_QUEUE	0
169 #define UMTX_EXCLUSIVE_QUEUE	1
170 
171 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
172 
173 	/* Busy flag */
174 	char			uc_busy;
175 
176 	/* Chain lock waiters */
177 	int			uc_waiters;
178 
179 	/* All PI mutexes hashed to this chain */
180 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
181 
182 #ifdef UMTX_PROFILING
183 	u_int 			length;
184 	u_int			max_length;
185 #endif
186 };
187 
188 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
189 
190 /*
191  * Don't propagate time-sharing priority; there is a security reason.
192  * A user could simply create a PI mutex, let thread A lock it, and let
193  * another thread B block on it.  Because B is sleeping, its priority
194  * would be boosted; priority propagation would then boost A's priority
195  * as well, and A's priority would never be lowered even if it were
196  * consuming 100% CPU, which is unfair to other processes.
197  */
198 
199 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
200 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
201 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
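
/*
 * Note: lower numeric priority values mean higher scheduling priority.
 * UPRI() clamps time-sharing threads to PRI_MAX_TIMESHARE, the weakest
 * time-sharing priority, so a boosted time-sharing sleeper never
 * propagates more than that through a PI mutex.
 */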
202 
203 #define	GOLDEN_RATIO_PRIME	2654404609U
204 #ifndef	UMTX_CHAINS
205 #define	UMTX_CHAINS		512
206 #endif
207 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
208 
209 #define	GET_SHARE(flags)	\
210     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
211 
212 #define BUSY_SPINS		200
213 
214 struct abs_timeout {
215 	int clockid;
216 	bool is_abs_real;	/* TIMER_ABSTIME && CLOCK_REALTIME* */
217 	struct timespec cur;
218 	struct timespec end;
219 };
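
/*
 * Descriptive note: "end" is the absolute deadline expressed on
 * "clockid", and "cur" caches the most recent clock reading (refreshed
 * by abs_timeout_update()); abs_timeout_gethz() converts the remaining
 * time into sleep ticks for msleep().
 */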
220 
221 #ifdef COMPAT_FREEBSD32
222 struct umutex32 {
223 	volatile __lwpid_t	m_owner;	/* Owner of the mutex */
224 	__uint32_t		m_flags;	/* Flags of the mutex */
225 	__uint32_t		m_ceilings[2];	/* Priority protect ceiling */
226 	__uint32_t		m_rb_lnk;	/* Robust linkage */
227 	__uint32_t		m_pad;
228 	__uint32_t		m_spare[2];
229 };
230 
231 _Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32");
232 _Static_assert(__offsetof(struct umutex, m_spare[0]) ==
233     __offsetof(struct umutex32, m_spare[0]), "m_spare32");
234 #endif
235 
236 int umtx_shm_vnobj_persistent = 0;
237 SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN,
238     &umtx_shm_vnobj_persistent, 0,
239     "False forces destruction of umtx attached to file, on last close");
240 static int umtx_max_rb = 1000;
241 SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN,
242     &umtx_max_rb, 0,
243     "");
244 
245 static uma_zone_t		umtx_pi_zone;
246 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
247 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
248 static int			umtx_pi_allocated;
249 
250 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
251 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
252     &umtx_pi_allocated, 0, "Allocated umtx_pi");
253 static int umtx_verbose_rb = 1;
254 SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN,
255     &umtx_verbose_rb, 0,
256     "");
257 
258 #ifdef UMTX_PROFILING
259 static long max_length;
260 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
261 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
262 #endif
263 
264 static void abs_timeout_update(struct abs_timeout *timo);
265 
266 static void umtx_shm_init(void);
267 static void umtxq_sysinit(void *);
268 static void umtxq_hash(struct umtx_key *key);
269 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
270 static void umtxq_lock(struct umtx_key *key);
271 static void umtxq_unlock(struct umtx_key *key);
272 static void umtxq_busy(struct umtx_key *key);
273 static void umtxq_unbusy(struct umtx_key *key);
274 static void umtxq_insert_queue(struct umtx_q *uq, int q);
275 static void umtxq_remove_queue(struct umtx_q *uq, int q);
276 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
277 static int umtxq_count(struct umtx_key *key);
278 static struct umtx_pi *umtx_pi_alloc(int);
279 static void umtx_pi_free(struct umtx_pi *pi);
280 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags,
281     bool rb);
282 static void umtx_thread_cleanup(struct thread *td);
283 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
284     struct image_params *imgp __unused);
285 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
286 
287 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
288 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
289 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
290 
291 static struct mtx umtx_lock;
292 
293 #ifdef UMTX_PROFILING
294 static void
295 umtx_init_profiling(void)
296 {
297 	struct sysctl_oid *chain_oid;
298 	char chain_name[10];
299 	int i;
300 
301 	for (i = 0; i < UMTX_CHAINS; ++i) {
302 		snprintf(chain_name, sizeof(chain_name), "%d", i);
303 		chain_oid = SYSCTL_ADD_NODE(NULL,
304 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
305 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
306 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
307 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
308 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
309 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
310 	}
311 }
312 
313 static int
314 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
315 {
316 	char buf[512];
317 	struct sbuf sb;
318 	struct umtxq_chain *uc;
319 	u_int fract, i, j, tot, whole;
320 	u_int sf0, sf1, sf2, sf3, sf4;
321 	u_int si0, si1, si2, si3, si4;
322 	u_int sw0, sw1, sw2, sw3, sw4;
323 
324 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
325 	for (i = 0; i < 2; i++) {
326 		tot = 0;
327 		for (j = 0; j < UMTX_CHAINS; ++j) {
328 			uc = &umtxq_chains[i][j];
329 			mtx_lock(&uc->uc_lock);
330 			tot += uc->max_length;
331 			mtx_unlock(&uc->uc_lock);
332 		}
333 		if (tot == 0)
334 			sbuf_printf(&sb, "%u) Empty ", i);
335 		else {
336 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
337 			si0 = si1 = si2 = si3 = si4 = 0;
338 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
339 			for (j = 0; j < UMTX_CHAINS; j++) {
340 				uc = &umtxq_chains[i][j];
341 				mtx_lock(&uc->uc_lock);
342 				whole = uc->max_length * 100;
343 				mtx_unlock(&uc->uc_lock);
344 				fract = (whole % tot) * 100;
345 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
346 					sf0 = fract;
347 					si0 = j;
348 					sw0 = whole;
349 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
350 				    sf1)) {
351 					sf1 = fract;
352 					si1 = j;
353 					sw1 = whole;
354 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
355 				    sf2)) {
356 					sf2 = fract;
357 					si2 = j;
358 					sw2 = whole;
359 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
360 				    sf3)) {
361 					sf3 = fract;
362 					si3 = j;
363 					sw3 = whole;
364 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
365 				    sf4)) {
366 					sf4 = fract;
367 					si4 = j;
368 					sw4 = whole;
369 				}
370 			}
371 			sbuf_printf(&sb, "queue %u:\n", i);
372 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
373 			    sf0 / tot, si0);
374 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
375 			    sf1 / tot, si1);
376 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
377 			    sf2 / tot, si2);
378 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
379 			    sf3 / tot, si3);
380 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
381 			    sf4 / tot, si4);
382 		}
383 	}
384 	sbuf_trim(&sb);
385 	sbuf_finish(&sb);
386 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
387 	sbuf_delete(&sb);
388 	return (0);
389 }
390 
391 static int
392 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
393 {
394 	struct umtxq_chain *uc;
395 	u_int i, j;
396 	int clear, error;
397 
398 	clear = 0;
399 	error = sysctl_handle_int(oidp, &clear, 0, req);
400 	if (error != 0 || req->newptr == NULL)
401 		return (error);
402 
403 	if (clear != 0) {
404 		for (i = 0; i < 2; ++i) {
405 			for (j = 0; j < UMTX_CHAINS; ++j) {
406 				uc = &umtxq_chains[i][j];
407 				mtx_lock(&uc->uc_lock);
408 				uc->length = 0;
409 				uc->max_length = 0;
410 				mtx_unlock(&uc->uc_lock);
411 			}
412 		}
413 	}
414 	return (0);
415 }
416 
417 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
418     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
419     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
420 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
421     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
422     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
423 #endif
424 
425 static void
426 umtxq_sysinit(void *arg __unused)
427 {
428 	int i, j;
429 
430 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
431 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
432 	for (i = 0; i < 2; ++i) {
433 		for (j = 0; j < UMTX_CHAINS; ++j) {
434 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
435 				 MTX_DEF | MTX_DUPOK);
436 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
437 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
438 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
439 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
440 			umtxq_chains[i][j].uc_busy = 0;
441 			umtxq_chains[i][j].uc_waiters = 0;
442 #ifdef UMTX_PROFILING
443 			umtxq_chains[i][j].length = 0;
444 			umtxq_chains[i][j].max_length = 0;
445 #endif
446 		}
447 	}
448 #ifdef UMTX_PROFILING
449 	umtx_init_profiling();
450 #endif
451 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
452 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
453 	    EVENTHANDLER_PRI_ANY);
454 	umtx_shm_init();
455 }
456 
457 struct umtx_q *
458 umtxq_alloc(void)
459 {
460 	struct umtx_q *uq;
461 
462 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
463 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX,
464 	    M_WAITOK | M_ZERO);
465 	TAILQ_INIT(&uq->uq_spare_queue->head);
466 	TAILQ_INIT(&uq->uq_pi_contested);
467 	uq->uq_inherited_pri = PRI_MAX;
468 	return (uq);
469 }
470 
471 void
472 umtxq_free(struct umtx_q *uq)
473 {
474 
475 	MPASS(uq->uq_spare_queue != NULL);
476 	free(uq->uq_spare_queue, M_UMTX);
477 	free(uq, M_UMTX);
478 }
479 
480 static inline void
481 umtxq_hash(struct umtx_key *key)
482 {
483 	unsigned n;
484 
485 	n = (uintptr_t)key->info.both.a + key->info.both.b;
486 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
487 }
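
/*
 * The above is a multiplicative hash: the key cookie is scaled by a
 * large prime, the high-order bits are selected by the shift, and the
 * result is reduced modulo the chain count.  An equivalent sketch, for
 * illustration only:
 *
 *	n = (uintptr_t)key->info.both.a + key->info.both.b;
 *	hash = ((n * 2654404609U) >> (__WORD_BIT - 9)) % 512;
 */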
488 
489 static inline struct umtxq_chain *
490 umtxq_getchain(struct umtx_key *key)
491 {
492 
493 	if (key->type <= TYPE_SEM)
494 		return (&umtxq_chains[1][key->hash]);
495 	return (&umtxq_chains[0][key->hash]);
496 }
497 
498 /*
499  * Lock a chain.
500  */
501 static inline void
502 umtxq_lock(struct umtx_key *key)
503 {
504 	struct umtxq_chain *uc;
505 
506 	uc = umtxq_getchain(key);
507 	mtx_lock(&uc->uc_lock);
508 }
509 
510 /*
511  * Unlock a chain.
512  */
513 static inline void
514 umtxq_unlock(struct umtx_key *key)
515 {
516 	struct umtxq_chain *uc;
517 
518 	uc = umtxq_getchain(key);
519 	mtx_unlock(&uc->uc_lock);
520 }
521 
522 /*
523  * Set the chain to the busy state when a following operation
524  * may block (a kernel mutex cannot be used).
525  */
526 static inline void
527 umtxq_busy(struct umtx_key *key)
528 {
529 	struct umtxq_chain *uc;
530 
531 	uc = umtxq_getchain(key);
532 	mtx_assert(&uc->uc_lock, MA_OWNED);
533 	if (uc->uc_busy) {
534 #ifdef SMP
535 		if (smp_cpus > 1) {
536 			int count = BUSY_SPINS;
537 			if (count > 0) {
538 				umtxq_unlock(key);
539 				while (uc->uc_busy && --count > 0)
540 					cpu_spinwait();
541 				umtxq_lock(key);
542 			}
543 		}
544 #endif
545 		while (uc->uc_busy) {
546 			uc->uc_waiters++;
547 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
548 			uc->uc_waiters--;
549 		}
550 	}
551 	uc->uc_busy = 1;
552 }
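
/*
 * A sketch of the usage pattern (see e.g. do_lock_pp() below): mark the
 * chain busy around operations that may fault or sleep, since the chain
 * mutex itself cannot be held across them.
 *
 *	umtxq_lock(key);
 *	umtxq_busy(key);	(may drop and retake the chain lock)
 *	umtxq_unlock(key);
 *	... access userland memory, possibly faulting ...
 *	umtxq_unbusy_unlocked(key);
 */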
553 
554 /*
555  * Unbusy a chain.
556  */
557 static inline void
558 umtxq_unbusy(struct umtx_key *key)
559 {
560 	struct umtxq_chain *uc;
561 
562 	uc = umtxq_getchain(key);
563 	mtx_assert(&uc->uc_lock, MA_OWNED);
564 	KASSERT(uc->uc_busy != 0, ("not busy"));
565 	uc->uc_busy = 0;
566 	if (uc->uc_waiters)
567 		wakeup_one(uc);
568 }
569 
570 static inline void
571 umtxq_unbusy_unlocked(struct umtx_key *key)
572 {
573 
574 	umtxq_lock(key);
575 	umtxq_unbusy(key);
576 	umtxq_unlock(key);
577 }
578 
579 static struct umtxq_queue *
580 umtxq_queue_lookup(struct umtx_key *key, int q)
581 {
582 	struct umtxq_queue *uh;
583 	struct umtxq_chain *uc;
584 
585 	uc = umtxq_getchain(key);
586 	UMTXQ_LOCKED_ASSERT(uc);
587 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
588 		if (umtx_key_match(&uh->key, key))
589 			return (uh);
590 	}
591 
592 	return (NULL);
593 }
594 
595 static inline void
596 umtxq_insert_queue(struct umtx_q *uq, int q)
597 {
598 	struct umtxq_queue *uh;
599 	struct umtxq_chain *uc;
600 
601 	uc = umtxq_getchain(&uq->uq_key);
602 	UMTXQ_LOCKED_ASSERT(uc);
603 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
604 	uh = umtxq_queue_lookup(&uq->uq_key, q);
605 	if (uh != NULL) {
606 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
607 	} else {
608 		uh = uq->uq_spare_queue;
609 		uh->key = uq->uq_key;
610 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
611 #ifdef UMTX_PROFILING
612 		uc->length++;
613 		if (uc->length > uc->max_length) {
614 			uc->max_length = uc->length;
615 			if (uc->max_length > max_length)
616 				max_length = uc->max_length;
617 		}
618 #endif
619 	}
620 	uq->uq_spare_queue = NULL;
621 
622 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
623 	uh->length++;
624 	uq->uq_flags |= UQF_UMTXQ;
625 	uq->uq_cur_queue = uh;
626 	return;
627 }
628 
629 static inline void
630 umtxq_remove_queue(struct umtx_q *uq, int q)
631 {
632 	struct umtxq_chain *uc;
633 	struct umtxq_queue *uh;
634 
635 	uc = umtxq_getchain(&uq->uq_key);
636 	UMTXQ_LOCKED_ASSERT(uc);
637 	if (uq->uq_flags & UQF_UMTXQ) {
638 		uh = uq->uq_cur_queue;
639 		TAILQ_REMOVE(&uh->head, uq, uq_link);
640 		uh->length--;
641 		uq->uq_flags &= ~UQF_UMTXQ;
642 		if (TAILQ_EMPTY(&uh->head)) {
643 			KASSERT(uh->length == 0,
644 			    ("inconsistent umtxq_queue length"));
645 #ifdef UMTX_PROFILING
646 			uc->length--;
647 #endif
648 			LIST_REMOVE(uh, link);
649 		} else {
650 			uh = LIST_FIRST(&uc->uc_spare_queue);
651 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
652 			LIST_REMOVE(uh, link);
653 		}
654 		uq->uq_spare_queue = uh;
655 		uq->uq_cur_queue = NULL;
656 	}
657 }
658 
659 /*
660  * Return the number of waiters sleeping on the key.
661  */
662 static int
663 umtxq_count(struct umtx_key *key)
664 {
665 	struct umtxq_queue *uh;
666 
667 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
668 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
669 	if (uh != NULL)
670 		return (uh->length);
671 	return (0);
672 }
673 
674 /*
675  * Return the number of PI waiters and store the first waiter
676  * in *first.
677  */
678 static int
679 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
680 {
681 	struct umtxq_queue *uh;
682 
683 	*first = NULL;
684 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
685 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
686 	if (uh != NULL) {
687 		*first = TAILQ_FIRST(&uh->head);
688 		return (uh->length);
689 	}
690 	return (0);
691 }
692 
693 static int
694 umtxq_check_susp(struct thread *td)
695 {
696 	struct proc *p;
697 	int error;
698 
699 	/*
700 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
701 	 * eventually break the lockstep loop.
702 	 */
703 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
704 		return (0);
705 	error = 0;
706 	p = td->td_proc;
707 	PROC_LOCK(p);
708 	if (P_SHOULDSTOP(p) ||
709 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
710 		if (p->p_flag & P_SINGLE_EXIT)
711 			error = EINTR;
712 		else
713 			error = ERESTART;
714 	}
715 	PROC_UNLOCK(p);
716 	return (error);
717 }
718 
719 /*
720  * Wake up threads waiting on a userland object.
721  */
722 
723 static int
724 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
725 {
726 	struct umtxq_queue *uh;
727 	struct umtx_q *uq;
728 	int ret;
729 
730 	ret = 0;
731 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
732 	uh = umtxq_queue_lookup(key, q);
733 	if (uh != NULL) {
734 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
735 			umtxq_remove_queue(uq, q);
736 			wakeup(uq);
737 			if (++ret >= n_wake)
738 				return (ret);
739 		}
740 	}
741 	return (ret);
742 }
743 
744 
745 /*
746  * Wake up the specified thread.
747  */
748 static inline void
749 umtxq_signal_thread(struct umtx_q *uq)
750 {
751 
752 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
753 	umtxq_remove(uq);
754 	wakeup(uq);
755 }
756 
757 static inline int
758 tstohz(const struct timespec *tsp)
759 {
760 	struct timeval tv;
761 
762 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
763 	return tvtohz(&tv);
764 }
765 
766 static void
767 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
768 	const struct timespec *timeout)
769 {
770 
771 	timo->clockid = clockid;
772 	if (!absolute) {
773 		timo->is_abs_real = false;
774 		abs_timeout_update(timo);
775 		timespecadd(&timo->cur, timeout, &timo->end);
776 	} else {
777 		timo->end = *timeout;
778 		timo->is_abs_real = clockid == CLOCK_REALTIME ||
779 		    clockid == CLOCK_REALTIME_FAST ||
780 		    clockid == CLOCK_REALTIME_PRECISE;
781 		/*
782 		 * If is_abs_real, umtxq_sleep will read the clock
783 		 * after setting td_rtcgen; otherwise, read it here.
784 		 */
785 		if (!timo->is_abs_real) {
786 			abs_timeout_update(timo);
787 		}
788 	}
789 }
790 
791 static void
792 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
793 {
794 
795 	abs_timeout_init(timo, umtxtime->_clockid,
796 	    (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout);
797 }
798 
799 static inline void
800 abs_timeout_update(struct abs_timeout *timo)
801 {
802 
803 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
804 }
805 
806 static int
807 abs_timeout_gethz(struct abs_timeout *timo)
808 {
809 	struct timespec tts;
810 
811 	if (timespeccmp(&timo->end, &timo->cur, <=))
812 		return (-1);
813 	timespecsub(&timo->end, &timo->cur, &tts);
814 	return (tstohz(&tts));
815 }
816 
817 static uint32_t
818 umtx_unlock_val(uint32_t flags, bool rb)
819 {
820 
821 	if (rb)
822 		return (UMUTEX_RB_OWNERDEAD);
823 	else if ((flags & UMUTEX_NONCONSISTENT) != 0)
824 		return (UMUTEX_RB_NOTRECOV);
825 	else
826 		return (UMUTEX_UNOWNED);
827 
828 }
829 
830 /*
831  * Put the thread into a sleep state; before sleeping, check if the
832  * thread was removed from the umtx queue.
833  */
834 static inline int
835 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
836 {
837 	struct umtxq_chain *uc;
838 	int error, timo;
839 
840 	if (abstime != NULL && abstime->is_abs_real) {
841 		curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation);
842 		abs_timeout_update(abstime);
843 	}
844 
845 	uc = umtxq_getchain(&uq->uq_key);
846 	UMTXQ_LOCKED_ASSERT(uc);
847 	for (;;) {
848 		if (!(uq->uq_flags & UQF_UMTXQ)) {
849 			error = 0;
850 			break;
851 		}
852 		if (abstime != NULL) {
853 			timo = abs_timeout_gethz(abstime);
854 			if (timo < 0) {
855 				error = ETIMEDOUT;
856 				break;
857 			}
858 		} else
859 			timo = 0;
860 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
861 		if (error == EINTR || error == ERESTART) {
862 			umtxq_lock(&uq->uq_key);
863 			break;
864 		}
865 		if (abstime != NULL) {
866 			if (abstime->is_abs_real)
867 				curthread->td_rtcgen =
868 				    atomic_load_acq_int(&rtc_generation);
869 			abs_timeout_update(abstime);
870 		}
871 		umtxq_lock(&uq->uq_key);
872 	}
873 
874 	curthread->td_rtcgen = 0;
875 	return (error);
876 }
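
/*
 * Descriptive note on td_rtcgen above: for absolute CLOCK_REALTIME*
 * timeouts the thread subscribes to rtc_generation, so a step of the
 * real-time clock (e.g. by settimeofday(2)) wakes the sleeper early;
 * the deadline is then re-evaluated against the new clock reading.
 * Setting td_rtcgen to zero unsubscribes.
 */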
877 
878 /*
879  * Convert a userspace address into a unique logical address.
880  */
881 int
882 umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
883 {
884 	struct thread *td = curthread;
885 	vm_map_t map;
886 	vm_map_entry_t entry;
887 	vm_pindex_t pindex;
888 	vm_prot_t prot;
889 	boolean_t wired;
890 
891 	key->type = type;
892 	if (share == THREAD_SHARE) {
893 		key->shared = 0;
894 		key->info.private.vs = td->td_proc->p_vmspace;
895 		key->info.private.addr = (uintptr_t)addr;
896 	} else {
897 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
898 		map = &td->td_proc->p_vmspace->vm_map;
899 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
900 		    &entry, &key->info.shared.object, &pindex, &prot,
901 		    &wired) != KERN_SUCCESS) {
902 			return (EFAULT);
903 		}
904 
905 		if ((share == PROCESS_SHARE) ||
906 		    (share == AUTO_SHARE &&
907 		     VM_INHERIT_SHARE == entry->inheritance)) {
908 			key->shared = 1;
909 			key->info.shared.offset = (vm_offset_t)addr -
910 			    entry->start + entry->offset;
911 			vm_object_reference(key->info.shared.object);
912 		} else {
913 			key->shared = 0;
914 			key->info.private.vs = td->td_proc->p_vmspace;
915 			key->info.private.addr = (uintptr_t)addr;
916 		}
917 		vm_map_lookup_done(map, entry);
918 	}
919 
920 	umtxq_hash(key);
921 	return (0);
922 }
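
/*
 * In other words, a private object is keyed by (vmspace, address),
 * while a shared object is keyed by (vm_object, offset), so different
 * mappings of the same backing object in different processes resolve
 * to the same key.
 */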
923 
924 /*
925  * Release key.
926  */
927 void
928 umtx_key_release(struct umtx_key *key)
929 {
930 	if (key->shared)
931 		vm_object_deallocate(key->info.shared.object);
932 }
933 
934 /*
935  * Fetch and compare the value; sleep on the address if the value has not changed.
936  */
937 static int
938 do_wait(struct thread *td, void *addr, u_long id,
939     struct _umtx_time *timeout, int compat32, int is_private)
940 {
941 	struct abs_timeout timo;
942 	struct umtx_q *uq;
943 	u_long tmp;
944 	uint32_t tmp32;
945 	int error = 0;
946 
947 	uq = td->td_umtxq;
948 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
949 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
950 		return (error);
951 
952 	if (timeout != NULL)
953 		abs_timeout_init2(&timo, timeout);
954 
955 	umtxq_lock(&uq->uq_key);
956 	umtxq_insert(uq);
957 	umtxq_unlock(&uq->uq_key);
958 	if (compat32 == 0) {
959 		error = fueword(addr, &tmp);
960 		if (error != 0)
961 			error = EFAULT;
962 	} else {
963 		error = fueword32(addr, &tmp32);
964 		if (error == 0)
965 			tmp = tmp32;
966 		else
967 			error = EFAULT;
968 	}
969 	umtxq_lock(&uq->uq_key);
970 	if (error == 0) {
971 		if (tmp == id)
972 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
973 			    NULL : &timo);
974 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
975 			error = 0;
976 		else
977 			umtxq_remove(uq);
978 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
979 		umtxq_remove(uq);
980 	}
981 	umtxq_unlock(&uq->uq_key);
982 	umtx_key_release(&uq->uq_key);
983 	if (error == ERESTART)
984 		error = EINTR;
985 	return (error);
986 }
987 
988 /*
989  * Wake up threads sleeping on the specified address.
990  */
991 int
992 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
993 {
994 	struct umtx_key key;
995 	int ret;
996 
997 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
998 	    is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
999 		return (ret);
1000 	umtxq_lock(&key);
1001 	umtxq_signal(&key, n_wake);
1002 	umtxq_unlock(&key);
1003 	umtx_key_release(&key);
1004 	return (0);
1005 }
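
/*
 * Userland-side sketch of the wait/wake protocol implemented by
 * do_wait() and kern_umtx_wake() (illustrative only; the real code
 * lives in libthr, and "word", FREE and BUSY are hypothetical):
 *
 *	while (atomic_cmpset_acq_int(&word, FREE, BUSY) == 0)
 *		_umtx_op(&word, UMTX_OP_WAIT_UINT, BUSY, NULL, NULL);
 *	... critical section ...
 *	atomic_store_rel_int(&word, FREE);
 *	_umtx_op(&word, UMTX_OP_WAKE, 1, NULL, NULL);
 *
 * The kernel re-reads the word after queuing the waiter, so a waiter
 * only sleeps if the word still holds the expected value.
 */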
1006 
1007 /*
1008  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1009  */
1010 static int
1011 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
1012     struct _umtx_time *timeout, int mode)
1013 {
1014 	struct abs_timeout timo;
1015 	struct umtx_q *uq;
1016 	uint32_t owner, old, id;
1017 	int error, rv;
1018 
1019 	id = td->td_tid;
1020 	uq = td->td_umtxq;
1021 	error = 0;
1022 	if (timeout != NULL)
1023 		abs_timeout_init2(&timo, timeout);
1024 
1025 	/*
1026 	 * Care must be exercised when dealing with umtx structure. It
1027 	 * can fault on any access.
1028 	 */
1029 	for (;;) {
1030 		rv = fueword32(&m->m_owner, &owner);
1031 		if (rv == -1)
1032 			return (EFAULT);
1033 		if (mode == _UMUTEX_WAIT) {
1034 			if (owner == UMUTEX_UNOWNED ||
1035 			    owner == UMUTEX_CONTESTED ||
1036 			    owner == UMUTEX_RB_OWNERDEAD ||
1037 			    owner == UMUTEX_RB_NOTRECOV)
1038 				return (0);
1039 		} else {
1040 			/*
1041 			 * Robust mutex terminated.  Kernel duty is to
1042 			 * return EOWNERDEAD to the userspace.  The
1043 			 * umutex.m_flags UMUTEX_NONCONSISTENT is set
1044 			 * by the common userspace code.
1045 			 */
1046 			if (owner == UMUTEX_RB_OWNERDEAD) {
1047 				rv = casueword32(&m->m_owner,
1048 				    UMUTEX_RB_OWNERDEAD, &owner,
1049 				    id | UMUTEX_CONTESTED);
1050 				if (rv == -1)
1051 					return (EFAULT);
1052 				if (owner == UMUTEX_RB_OWNERDEAD)
1053 					return (EOWNERDEAD); /* success */
1054 				rv = umtxq_check_susp(td);
1055 				if (rv != 0)
1056 					return (rv);
1057 				continue;
1058 			}
1059 			if (owner == UMUTEX_RB_NOTRECOV)
1060 				return (ENOTRECOVERABLE);
1061 
1062 
1063 			/*
1064 			 * Try the uncontested case.  This should be
1065 			 * done in userland.
1066 			 */
1067 			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
1068 			    &owner, id);
1069 			/* The address was invalid. */
1070 			if (rv == -1)
1071 				return (EFAULT);
1072 
1073 			/* The acquire succeeded. */
1074 			if (owner == UMUTEX_UNOWNED)
1075 				return (0);
1076 
1077 			/*
1078 			 * If no one owns it but it is contested, try
1079 			 * to acquire it.
1080 			 */
1081 			if (owner == UMUTEX_CONTESTED) {
1082 				rv = casueword32(&m->m_owner,
1083 				    UMUTEX_CONTESTED, &owner,
1084 				    id | UMUTEX_CONTESTED);
1085 				/* The address was invalid. */
1086 				if (rv == -1)
1087 					return (EFAULT);
1088 
1089 				if (owner == UMUTEX_CONTESTED)
1090 					return (0);
1091 
1092 				rv = umtxq_check_susp(td);
1093 				if (rv != 0)
1094 					return (rv);
1095 
1096 				/*
1097 				 * If this failed the lock has
1098 				 * changed, restart.
1099 				 */
1100 				continue;
1101 			}
1102 		}
1103 
1104 		if (mode == _UMUTEX_TRY)
1105 			return (EBUSY);
1106 
1107 		/*
1108 		 * If we caught a signal, we have retried and now
1109 		 * exit immediately.
1110 		 */
1111 		if (error != 0)
1112 			return (error);
1113 
1114 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1115 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1116 			return (error);
1117 
1118 		umtxq_lock(&uq->uq_key);
1119 		umtxq_busy(&uq->uq_key);
1120 		umtxq_insert(uq);
1121 		umtxq_unlock(&uq->uq_key);
1122 
1123 		/*
1124 		 * Set the contested bit so that a release in user space
1125 		 * knows to use the system call for unlock.  If this fails
1126 		 * either someone else has acquired the lock or it has been
1127 		 * released.
1128 		 */
1129 		rv = casueword32(&m->m_owner, owner, &old,
1130 		    owner | UMUTEX_CONTESTED);
1131 
1132 		/* The address was invalid. */
1133 		if (rv == -1) {
1134 			umtxq_lock(&uq->uq_key);
1135 			umtxq_remove(uq);
1136 			umtxq_unbusy(&uq->uq_key);
1137 			umtxq_unlock(&uq->uq_key);
1138 			umtx_key_release(&uq->uq_key);
1139 			return (EFAULT);
1140 		}
1141 
1142 		/*
1143 		 * If we set the contested bit, sleep.  Otherwise the lock
1144 		 * changed and we need to retry, or we lost a race to the
1145 		 * thread unlocking the umtx.
1146 		 */
1147 		umtxq_lock(&uq->uq_key);
1148 		umtxq_unbusy(&uq->uq_key);
1149 		if (old == owner)
1150 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1151 			    NULL : &timo);
1152 		umtxq_remove(uq);
1153 		umtxq_unlock(&uq->uq_key);
1154 		umtx_key_release(&uq->uq_key);
1155 
1156 		if (error == 0)
1157 			error = umtxq_check_susp(td);
1158 	}
1159 
1160 	return (0);
1161 }
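
/*
 * Summary of the m_owner transitions handled above, where "id" is the
 * caller's thread id:
 *
 *	UMUTEX_UNOWNED      -> id			uncontested acquire
 *	UMUTEX_CONTESTED    -> id | UMUTEX_CONTESTED	contested acquire
 *	owner               -> owner | UMUTEX_CONTESTED	mark before sleep
 *	UMUTEX_RB_OWNERDEAD -> id | UMUTEX_CONTESTED	returns EOWNERDEAD
 */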
1162 
1163 /*
1164  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1165  */
1166 static int
1167 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1168 {
1169 	struct umtx_key key;
1170 	uint32_t owner, old, id, newlock;
1171 	int error, count;
1172 
1173 	id = td->td_tid;
1174 	/*
1175 	 * Make sure we own this mtx.
1176 	 */
1177 	error = fueword32(&m->m_owner, &owner);
1178 	if (error == -1)
1179 		return (EFAULT);
1180 
1181 	if ((owner & ~UMUTEX_CONTESTED) != id)
1182 		return (EPERM);
1183 
1184 	newlock = umtx_unlock_val(flags, rb);
1185 	if ((owner & UMUTEX_CONTESTED) == 0) {
1186 		error = casueword32(&m->m_owner, owner, &old, newlock);
1187 		if (error == -1)
1188 			return (EFAULT);
1189 		if (old == owner)
1190 			return (0);
1191 		owner = old;
1192 	}
1193 
1194 	/* We should only ever be in here for contested locks */
1195 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1196 	    &key)) != 0)
1197 		return (error);
1198 
1199 	umtxq_lock(&key);
1200 	umtxq_busy(&key);
1201 	count = umtxq_count(&key);
1202 	umtxq_unlock(&key);
1203 
1204 	/*
1205 	 * When unlocking the umtx, it must be marked as unowned if
1206 	 * there is at most one thread waiting for it.
1207 	 * Otherwise, it must be marked as contested.
1208 	 */
1209 	if (count > 1)
1210 		newlock |= UMUTEX_CONTESTED;
1211 	error = casueword32(&m->m_owner, owner, &old, newlock);
1212 	umtxq_lock(&key);
1213 	umtxq_signal(&key, 1);
1214 	umtxq_unbusy(&key);
1215 	umtxq_unlock(&key);
1216 	umtx_key_release(&key);
1217 	if (error == -1)
1218 		return (EFAULT);
1219 	if (old != owner)
1220 		return (EINVAL);
1221 	return (0);
1222 }
1223 
1224 /*
1225  * Check if the mutex is available and wake up a waiter;
1226  * this applies only to a simple mutex.
1227  */
1228 static int
1229 do_wake_umutex(struct thread *td, struct umutex *m)
1230 {
1231 	struct umtx_key key;
1232 	uint32_t owner;
1233 	uint32_t flags;
1234 	int error;
1235 	int count;
1236 
1237 	error = fueword32(&m->m_owner, &owner);
1238 	if (error == -1)
1239 		return (EFAULT);
1240 
1241 	if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD &&
1242 	    owner != UMUTEX_RB_NOTRECOV)
1243 		return (0);
1244 
1245 	error = fueword32(&m->m_flags, &flags);
1246 	if (error == -1)
1247 		return (EFAULT);
1248 
1249 	/* We should only ever be in here for contested locks */
1250 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1251 	    &key)) != 0)
1252 		return (error);
1253 
1254 	umtxq_lock(&key);
1255 	umtxq_busy(&key);
1256 	count = umtxq_count(&key);
1257 	umtxq_unlock(&key);
1258 
1259 	if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD &&
1260 	    owner != UMUTEX_RB_NOTRECOV) {
1261 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1262 		    UMUTEX_UNOWNED);
1263 		if (error == -1)
1264 			error = EFAULT;
1265 	}
1266 
1267 	umtxq_lock(&key);
1268 	if (error == 0 && count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
1269 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
1270 		umtxq_signal(&key, 1);
1271 	umtxq_unbusy(&key);
1272 	umtxq_unlock(&key);
1273 	umtx_key_release(&key);
1274 	return (error);
1275 }
1276 
1277 /*
1278  * Check if the mutex has waiters and try to fix the contention bit.
1279  */
1280 static int
1281 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1282 {
1283 	struct umtx_key key;
1284 	uint32_t owner, old;
1285 	int type;
1286 	int error;
1287 	int count;
1288 
1289 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT |
1290 	    UMUTEX_ROBUST)) {
1291 	case 0:
1292 	case UMUTEX_ROBUST:
1293 		type = TYPE_NORMAL_UMUTEX;
1294 		break;
1295 	case UMUTEX_PRIO_INHERIT:
1296 		type = TYPE_PI_UMUTEX;
1297 		break;
1298 	case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST):
1299 		type = TYPE_PI_ROBUST_UMUTEX;
1300 		break;
1301 	case UMUTEX_PRIO_PROTECT:
1302 		type = TYPE_PP_UMUTEX;
1303 		break;
1304 	case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST):
1305 		type = TYPE_PP_ROBUST_UMUTEX;
1306 		break;
1307 	default:
1308 		return (EINVAL);
1309 	}
1310 	if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0)
1311 		return (error);
1312 
1313 	owner = 0;
1314 	umtxq_lock(&key);
1315 	umtxq_busy(&key);
1316 	count = umtxq_count(&key);
1317 	umtxq_unlock(&key);
1318 	/*
1319 	 * Only repair the contention bit if there is a waiter; this means
1320 	 * the mutex is still being referenced by userland code.  Otherwise,
1321 	 * don't update any memory.
1322 	 */
1323 	if (count > 1) {
1324 		error = fueword32(&m->m_owner, &owner);
1325 		if (error == -1)
1326 			error = EFAULT;
1327 		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
1328 			error = casueword32(&m->m_owner, owner, &old,
1329 			    owner | UMUTEX_CONTESTED);
1330 			if (error == -1) {
1331 				error = EFAULT;
1332 				break;
1333 			}
1334 			if (old == owner)
1335 				break;
1336 			owner = old;
1337 			error = umtxq_check_susp(td);
1338 			if (error != 0)
1339 				break;
1340 		}
1341 	} else if (count == 1) {
1342 		error = fueword32(&m->m_owner, &owner);
1343 		if (error == -1)
1344 			error = EFAULT;
1345 		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
1346 		    (owner & UMUTEX_CONTESTED) == 0) {
1347 			error = casueword32(&m->m_owner, owner, &old,
1348 			    owner | UMUTEX_CONTESTED);
1349 			if (error == -1) {
1350 				error = EFAULT;
1351 				break;
1352 			}
1353 			if (old == owner)
1354 				break;
1355 			owner = old;
1356 			error = umtxq_check_susp(td);
1357 			if (error != 0)
1358 				break;
1359 		}
1360 	}
1361 	umtxq_lock(&key);
1362 	if (error == EFAULT) {
1363 		umtxq_signal(&key, INT_MAX);
1364 	} else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
1365 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
1366 		umtxq_signal(&key, 1);
1367 	umtxq_unbusy(&key);
1368 	umtxq_unlock(&key);
1369 	umtx_key_release(&key);
1370 	return (error);
1371 }
1372 
1373 static inline struct umtx_pi *
1374 umtx_pi_alloc(int flags)
1375 {
1376 	struct umtx_pi *pi;
1377 
1378 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1379 	TAILQ_INIT(&pi->pi_blocked);
1380 	atomic_add_int(&umtx_pi_allocated, 1);
1381 	return (pi);
1382 }
1383 
1384 static inline void
1385 umtx_pi_free(struct umtx_pi *pi)
1386 {
1387 	uma_zfree(umtx_pi_zone, pi);
1388 	atomic_add_int(&umtx_pi_allocated, -1);
1389 }
1390 
1391 /*
1392  * Adjust the thread's position on the PI mutex's blocked list after
1393  * its priority has been changed.
1394  */
1395 static int
1396 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1397 {
1398 	struct umtx_q *uq, *uq1, *uq2;
1399 	struct thread *td1;
1400 
1401 	mtx_assert(&umtx_lock, MA_OWNED);
1402 	if (pi == NULL)
1403 		return (0);
1404 
1405 	uq = td->td_umtxq;
1406 
1407 	/*
1408 	 * Check if the thread needs to be moved on the blocked chain.
1409 	 * It needs to be moved if either its priority value is lower than
1410 	 * the previous thread's or higher than the next thread's.
1411 	 */
1412 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1413 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1414 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1415 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1416 		/*
1417 		 * Remove thread from blocked chain and determine where
1418 		 * it should be moved to.
1419 		 */
1420 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1421 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1422 			td1 = uq1->uq_thread;
1423 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1424 			if (UPRI(td1) > UPRI(td))
1425 				break;
1426 		}
1427 
1428 		if (uq1 == NULL)
1429 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1430 		else
1431 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1432 	}
1433 	return (1);
1434 }
1435 
1436 static struct umtx_pi *
1437 umtx_pi_next(struct umtx_pi *pi)
1438 {
1439 	struct umtx_q *uq_owner;
1440 
1441 	if (pi->pi_owner == NULL)
1442 		return (NULL);
1443 	uq_owner = pi->pi_owner->td_umtxq;
1444 	if (uq_owner == NULL)
1445 		return (NULL);
1446 	return (uq_owner->uq_pi_blocked);
1447 }
1448 
1449 /*
1450  * Floyd's Cycle-Finding Algorithm.
1451  */
1452 static bool
1453 umtx_pi_check_loop(struct umtx_pi *pi)
1454 {
1455 	struct umtx_pi *pi1;	/* fast iterator */
1456 
1457 	mtx_assert(&umtx_lock, MA_OWNED);
1458 	if (pi == NULL)
1459 		return (false);
1460 	pi1 = pi;
1461 	for (;;) {
1462 		pi = umtx_pi_next(pi);
1463 		if (pi == NULL)
1464 			break;
1465 		pi1 = umtx_pi_next(pi1);
1466 		if (pi1 == NULL)
1467 			break;
1468 		pi1 = umtx_pi_next(pi1);
1469 		if (pi1 == NULL)
1470 			break;
1471 		if (pi == pi1)
1472 			return (true);
1473 	}
1474 	return (false);
1475 }
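
/*
 * A cycle is possible because userland controls the lock words: it can
 * construct a state where thread A blocks on a PI mutex owned by B
 * while B blocks on one owned by A.  Propagation walking such a chain
 * would never terminate, so it simply stops when a loop is detected.
 */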
1476 
1477 /*
1478  * Propagate priority when a thread is blocked on a POSIX
1479  * PI mutex.
1480  */
1481 static void
1482 umtx_propagate_priority(struct thread *td)
1483 {
1484 	struct umtx_q *uq;
1485 	struct umtx_pi *pi;
1486 	int pri;
1487 
1488 	mtx_assert(&umtx_lock, MA_OWNED);
1489 	pri = UPRI(td);
1490 	uq = td->td_umtxq;
1491 	pi = uq->uq_pi_blocked;
1492 	if (pi == NULL)
1493 		return;
1494 	if (umtx_pi_check_loop(pi))
1495 		return;
1496 
1497 	for (;;) {
1498 		td = pi->pi_owner;
1499 		if (td == NULL || td == curthread)
1500 			return;
1501 
1502 		MPASS(td->td_proc != NULL);
1503 		MPASS(td->td_proc->p_magic == P_MAGIC);
1504 
1505 		thread_lock(td);
1506 		if (td->td_lend_user_pri > pri)
1507 			sched_lend_user_prio(td, pri);
1508 		else {
1509 			thread_unlock(td);
1510 			break;
1511 		}
1512 		thread_unlock(td);
1513 
1514 		/*
1515 		 * Pick up the lock that td is blocked on.
1516 		 */
1517 		uq = td->td_umtxq;
1518 		pi = uq->uq_pi_blocked;
1519 		if (pi == NULL)
1520 			break;
1521 		/* Resort td on the list if needed. */
1522 		umtx_pi_adjust_thread(pi, td);
1523 	}
1524 }
1525 
1526 /*
1527  * Unpropagate priority for a PI mutex when a thread blocked on
1528  * it is interrupted by a signal or resumed by others.
1529  */
1530 static void
1531 umtx_repropagate_priority(struct umtx_pi *pi)
1532 {
1533 	struct umtx_q *uq, *uq_owner;
1534 	struct umtx_pi *pi2;
1535 	int pri;
1536 
1537 	mtx_assert(&umtx_lock, MA_OWNED);
1538 
1539 	if (umtx_pi_check_loop(pi))
1540 		return;
1541 	while (pi != NULL && pi->pi_owner != NULL) {
1542 		pri = PRI_MAX;
1543 		uq_owner = pi->pi_owner->td_umtxq;
1544 
1545 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1546 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1547 			if (uq != NULL) {
1548 				if (pri > UPRI(uq->uq_thread))
1549 					pri = UPRI(uq->uq_thread);
1550 			}
1551 		}
1552 
1553 		if (pri > uq_owner->uq_inherited_pri)
1554 			pri = uq_owner->uq_inherited_pri;
1555 		thread_lock(pi->pi_owner);
1556 		sched_lend_user_prio(pi->pi_owner, pri);
1557 		thread_unlock(pi->pi_owner);
1558 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1559 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1560 	}
1561 }
1562 
1563 /*
1564  * Insert a PI mutex into the owner thread's owned list.
1565  */
1566 static void
1567 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1568 {
1569 	struct umtx_q *uq_owner;
1570 
1571 	uq_owner = owner->td_umtxq;
1572 	mtx_assert(&umtx_lock, MA_OWNED);
1573 	MPASS(pi->pi_owner == NULL);
1574 	pi->pi_owner = owner;
1575 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1576 }
1577 
1578 
1579 /*
1580  * Disown a PI mutex, and remove it from the owned list.
1581  */
1582 static void
1583 umtx_pi_disown(struct umtx_pi *pi)
1584 {
1585 
1586 	mtx_assert(&umtx_lock, MA_OWNED);
1587 	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
1588 	pi->pi_owner = NULL;
1589 }
1590 
1591 /*
1592  * Claim ownership of a PI mutex.
1593  */
1594 static int
1595 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1596 {
1597 	struct umtx_q *uq;
1598 	int pri;
1599 
1600 	mtx_lock(&umtx_lock);
1601 	if (pi->pi_owner == owner) {
1602 		mtx_unlock(&umtx_lock);
1603 		return (0);
1604 	}
1605 
1606 	if (pi->pi_owner != NULL) {
1607 		/*
1608 		 * userland may have already messed up the mutex, sigh.
1609 		 */
1610 		mtx_unlock(&umtx_lock);
1611 		return (EPERM);
1612 	}
1613 	umtx_pi_setowner(pi, owner);
1614 	uq = TAILQ_FIRST(&pi->pi_blocked);
1615 	if (uq != NULL) {
1616 		pri = UPRI(uq->uq_thread);
1617 		thread_lock(owner);
1618 		if (pri < UPRI(owner))
1619 			sched_lend_user_prio(owner, pri);
1620 		thread_unlock(owner);
1621 	}
1622 	mtx_unlock(&umtx_lock);
1623 	return (0);
1624 }
1625 
1626 /*
1627  * Adjust a thread's position in the blocked list of its PI mutex;
1628  * this may trigger a new round of priority propagation.
1629  */
1630 void
1631 umtx_pi_adjust(struct thread *td, u_char oldpri)
1632 {
1633 	struct umtx_q *uq;
1634 	struct umtx_pi *pi;
1635 
1636 	uq = td->td_umtxq;
1637 	mtx_lock(&umtx_lock);
1638 	/*
1639 	 * Pick up the lock that td is blocked on.
1640 	 */
1641 	pi = uq->uq_pi_blocked;
1642 	if (pi != NULL) {
1643 		umtx_pi_adjust_thread(pi, td);
1644 		umtx_repropagate_priority(pi);
1645 	}
1646 	mtx_unlock(&umtx_lock);
1647 }
1648 
1649 /*
1650  * Sleep on a PI mutex.
1651  */
1652 static int
1653 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
1654     const char *wmesg, struct abs_timeout *timo, bool shared)
1655 {
1656 	struct thread *td, *td1;
1657 	struct umtx_q *uq1;
1658 	int error, pri;
1659 #ifdef INVARIANTS
1660 	struct umtxq_chain *uc;
1661 
1662 	uc = umtxq_getchain(&pi->pi_key);
1663 #endif
1664 	error = 0;
1665 	td = uq->uq_thread;
1666 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1667 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
1668 	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
1669 	umtxq_insert(uq);
1670 	mtx_lock(&umtx_lock);
1671 	if (pi->pi_owner == NULL) {
1672 		mtx_unlock(&umtx_lock);
1673 		td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid);
1674 		mtx_lock(&umtx_lock);
1675 		if (td1 != NULL) {
1676 			if (pi->pi_owner == NULL)
1677 				umtx_pi_setowner(pi, td1);
1678 			PROC_UNLOCK(td1->td_proc);
1679 		}
1680 	}
1681 
1682 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1683 		pri = UPRI(uq1->uq_thread);
1684 		if (pri > UPRI(td))
1685 			break;
1686 	}
1687 
1688 	if (uq1 != NULL)
1689 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1690 	else
1691 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1692 
1693 	uq->uq_pi_blocked = pi;
1694 	thread_lock(td);
1695 	td->td_flags |= TDF_UPIBLOCKED;
1696 	thread_unlock(td);
1697 	umtx_propagate_priority(td);
1698 	mtx_unlock(&umtx_lock);
1699 	umtxq_unbusy(&uq->uq_key);
1700 
1701 	error = umtxq_sleep(uq, wmesg, timo);
1702 	umtxq_remove(uq);
1703 
1704 	mtx_lock(&umtx_lock);
1705 	uq->uq_pi_blocked = NULL;
1706 	thread_lock(td);
1707 	td->td_flags &= ~TDF_UPIBLOCKED;
1708 	thread_unlock(td);
1709 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1710 	umtx_repropagate_priority(pi);
1711 	mtx_unlock(&umtx_lock);
1712 	umtxq_unlock(&uq->uq_key);
1713 
1714 	return (error);
1715 }
1716 
1717 /*
1718  * Add a reference to a PI mutex.
1719  */
1720 static void
1721 umtx_pi_ref(struct umtx_pi *pi)
1722 {
1723 
1724 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&pi->pi_key));
1725 	pi->pi_refcount++;
1726 }
1727 
1728 /*
1729  * Decrease the reference count of a PI mutex; if the counter
1730  * drops to zero, its memory is freed.
1731  */
1732 static void
1733 umtx_pi_unref(struct umtx_pi *pi)
1734 {
1735 	struct umtxq_chain *uc;
1736 
1737 	uc = umtxq_getchain(&pi->pi_key);
1738 	UMTXQ_LOCKED_ASSERT(uc);
1739 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1740 	if (--pi->pi_refcount == 0) {
1741 		mtx_lock(&umtx_lock);
1742 		if (pi->pi_owner != NULL)
1743 			umtx_pi_disown(pi);
1744 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1745 			("blocked queue not empty"));
1746 		mtx_unlock(&umtx_lock);
1747 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1748 		umtx_pi_free(pi);
1749 	}
1750 }
1751 
1752 /*
1753  * Find a PI mutex in the hash table.
1754  */
1755 static struct umtx_pi *
1756 umtx_pi_lookup(struct umtx_key *key)
1757 {
1758 	struct umtxq_chain *uc;
1759 	struct umtx_pi *pi;
1760 
1761 	uc = umtxq_getchain(key);
1762 	UMTXQ_LOCKED_ASSERT(uc);
1763 
1764 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1765 		if (umtx_key_match(&pi->pi_key, key)) {
1766 			return (pi);
1767 		}
1768 	}
1769 	return (NULL);
1770 }
1771 
1772 /*
1773  * Insert a PI mutex into the hash table.
1774  */
1775 static inline void
1776 umtx_pi_insert(struct umtx_pi *pi)
1777 {
1778 	struct umtxq_chain *uc;
1779 
1780 	uc = umtxq_getchain(&pi->pi_key);
1781 	UMTXQ_LOCKED_ASSERT(uc);
1782 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1783 }
1784 
1785 /*
1786  * Lock a PI mutex.
1787  */
1788 static int
1789 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1790     struct _umtx_time *timeout, int try)
1791 {
1792 	struct abs_timeout timo;
1793 	struct umtx_q *uq;
1794 	struct umtx_pi *pi, *new_pi;
1795 	uint32_t id, old_owner, owner, old;
1796 	int error, rv;
1797 
1798 	id = td->td_tid;
1799 	uq = td->td_umtxq;
1800 
1801 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
1802 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
1803 	    &uq->uq_key)) != 0)
1804 		return (error);
1805 
1806 	if (timeout != NULL)
1807 		abs_timeout_init2(&timo, timeout);
1808 
1809 	umtxq_lock(&uq->uq_key);
1810 	pi = umtx_pi_lookup(&uq->uq_key);
1811 	if (pi == NULL) {
1812 		new_pi = umtx_pi_alloc(M_NOWAIT);
1813 		if (new_pi == NULL) {
1814 			umtxq_unlock(&uq->uq_key);
1815 			new_pi = umtx_pi_alloc(M_WAITOK);
1816 			umtxq_lock(&uq->uq_key);
1817 			pi = umtx_pi_lookup(&uq->uq_key);
1818 			if (pi != NULL) {
1819 				umtx_pi_free(new_pi);
1820 				new_pi = NULL;
1821 			}
1822 		}
1823 		if (new_pi != NULL) {
1824 			new_pi->pi_key = uq->uq_key;
1825 			umtx_pi_insert(new_pi);
1826 			pi = new_pi;
1827 		}
1828 	}
1829 	umtx_pi_ref(pi);
1830 	umtxq_unlock(&uq->uq_key);
1831 
1832 	/*
1833 	 * Care must be exercised when dealing with umtx structure.  It
1834 	 * can fault on any access.
1835 	 */
1836 	for (;;) {
1837 		/*
1838 		 * Try the uncontested case.  This should be done in userland.
1839 		 */
1840 		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
1841 		/* The address was invalid. */
1842 		if (rv == -1) {
1843 			error = EFAULT;
1844 			break;
1845 		}
1846 
1847 		/* The acquire succeeded. */
1848 		if (owner == UMUTEX_UNOWNED) {
1849 			error = 0;
1850 			break;
1851 		}
1852 
1853 		if (owner == UMUTEX_RB_NOTRECOV) {
1854 			error = ENOTRECOVERABLE;
1855 			break;
1856 		}
1857 
1858 		/* If no one owns it but it is contested, try to acquire it. */
1859 		if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) {
1860 			old_owner = owner;
1861 			rv = casueword32(&m->m_owner, owner, &owner,
1862 			    id | UMUTEX_CONTESTED);
1863 			/* The address was invalid. */
1864 			if (rv == -1) {
1865 				error = EFAULT;
1866 				break;
1867 			}
1868 
1869 			if (owner == old_owner) {
1870 				umtxq_lock(&uq->uq_key);
1871 				umtxq_busy(&uq->uq_key);
1872 				error = umtx_pi_claim(pi, td);
1873 				umtxq_unbusy(&uq->uq_key);
1874 				umtxq_unlock(&uq->uq_key);
1875 				if (error != 0) {
1876 					/*
1877 					 * Since we're going to return an
1878 					 * error, restore the m_owner to its
1879 					 * previous, unowned state to avoid
1880 					 * compounding the problem.
1881 					 */
1882 					(void)casuword32(&m->m_owner,
1883 					    id | UMUTEX_CONTESTED,
1884 					    old_owner);
1885 				}
1886 				if (error == 0 &&
1887 				    old_owner == UMUTEX_RB_OWNERDEAD)
1888 					error = EOWNERDEAD;
1889 				break;
1890 			}
1891 
1892 			error = umtxq_check_susp(td);
1893 			if (error != 0)
1894 				break;
1895 
1896 			/* If this failed the lock has changed, restart. */
1897 			continue;
1898 		}
1899 
1900 		if ((owner & ~UMUTEX_CONTESTED) == id) {
1901 			error = EDEADLK;
1902 			break;
1903 		}
1904 
1905 		if (try != 0) {
1906 			error = EBUSY;
1907 			break;
1908 		}
1909 
1910 		/*
1911 		 * If we caught a signal, we have retried and now
1912 		 * exit immediately.
1913 		 */
1914 		if (error != 0)
1915 			break;
1916 
1917 		umtxq_lock(&uq->uq_key);
1918 		umtxq_busy(&uq->uq_key);
1919 		umtxq_unlock(&uq->uq_key);
1920 
1921 		/*
1922 		 * Set the contested bit so that a release in user space
1923 		 * knows to use the system call for unlock.  If this fails
1924 		 * either someone else has acquired the lock or it has been
1925 		 * released.
1926 		 */
1927 		rv = casueword32(&m->m_owner, owner, &old, owner |
1928 		    UMUTEX_CONTESTED);
1929 
1930 		/* The address was invalid. */
1931 		if (rv == -1) {
1932 			umtxq_unbusy_unlocked(&uq->uq_key);
1933 			error = EFAULT;
1934 			break;
1935 		}
1936 
1937 		umtxq_lock(&uq->uq_key);
1938 		/*
1939 		 * If we set the contested bit, sleep.  Otherwise the lock
1940 		 * changed and we need to retry, or we lost a race to the
1941 		 * thread unlocking the umtx.  Note that the UMUTEX_RB_OWNERDEAD
1942 		 * value for owner is impossible here.
1943 		 */
1944 		if (old == owner) {
1945 			error = umtxq_sleep_pi(uq, pi,
1946 			    owner & ~UMUTEX_CONTESTED,
1947 			    "umtxpi", timeout == NULL ? NULL : &timo,
1948 			    (flags & USYNC_PROCESS_SHARED) != 0);
1949 			if (error != 0)
1950 				continue;
1951 		} else {
1952 			umtxq_unbusy(&uq->uq_key);
1953 			umtxq_unlock(&uq->uq_key);
1954 		}
1955 
1956 		error = umtxq_check_susp(td);
1957 		if (error != 0)
1958 			break;
1959 	}
1960 
1961 	umtxq_lock(&uq->uq_key);
1962 	umtx_pi_unref(pi);
1963 	umtxq_unlock(&uq->uq_key);
1964 
1965 	umtx_key_release(&uq->uq_key);
1966 	return (error);
1967 }
1968 
1969 /*
1970  * Unlock a PI mutex.
1971  */
1972 static int
1973 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1974 {
1975 	struct umtx_key key;
1976 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1977 	struct umtx_pi *pi, *pi2;
1978 	uint32_t id, new_owner, old, owner;
1979 	int count, error, pri;
1980 
1981 	id = td->td_tid;
1982 	/*
1983 	 * Make sure we own this mtx.
1984 	 */
1985 	error = fueword32(&m->m_owner, &owner);
1986 	if (error == -1)
1987 		return (EFAULT);
1988 
1989 	if ((owner & ~UMUTEX_CONTESTED) != id)
1990 		return (EPERM);
1991 
1992 	new_owner = umtx_unlock_val(flags, rb);
1993 
1994 	/* This should be done in userland */
1995 	if ((owner & UMUTEX_CONTESTED) == 0) {
1996 		error = casueword32(&m->m_owner, owner, &old, new_owner);
1997 		if (error == -1)
1998 			return (EFAULT);
1999 		if (old == owner)
2000 			return (0);
2001 		owner = old;
2002 	}
2003 
2004 	/* We should only ever be in here for contested locks */
2005 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2006 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
2007 	    &key)) != 0)
2008 		return (error);
2009 
2010 	umtxq_lock(&key);
2011 	umtxq_busy(&key);
2012 	count = umtxq_count_pi(&key, &uq_first);
2013 	if (uq_first != NULL) {
2014 		mtx_lock(&umtx_lock);
2015 		pi = uq_first->uq_pi_blocked;
2016 		KASSERT(pi != NULL, ("pi == NULL?"));
2017 		if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) {
2018 			mtx_unlock(&umtx_lock);
2019 			umtxq_unbusy(&key);
2020 			umtxq_unlock(&key);
2021 			umtx_key_release(&key);
2022 			/* userland messed up the mutex */
2023 			return (EPERM);
2024 		}
2025 		uq_me = td->td_umtxq;
2026 		if (pi->pi_owner == td)
2027 			umtx_pi_disown(pi);
2028 		/* Get the highest-priority thread that is still sleeping. */
2029 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
2030 		while (uq_first != NULL &&
2031 		    (uq_first->uq_flags & UQF_UMTXQ) == 0) {
2032 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
2033 		}
2034 		pri = PRI_MAX;
2035 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
2036 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
2037 			if (uq_first2 != NULL) {
2038 				if (pri > UPRI(uq_first2->uq_thread))
2039 					pri = UPRI(uq_first2->uq_thread);
2040 			}
2041 		}
2042 		thread_lock(td);
2043 		sched_lend_user_prio(td, pri);
2044 		thread_unlock(td);
2045 		mtx_unlock(&umtx_lock);
2046 		if (uq_first)
2047 			umtxq_signal_thread(uq_first);
2048 	} else {
2049 		pi = umtx_pi_lookup(&key);
2050 		/*
2051 		 * A umtx_pi can exist if a signal or timeout removed the
2052 		 * last waiter from the umtxq, but there is still
2053 		 * a thread in do_lock_pi() holding the umtx_pi.
2054 		 */
2055 		if (pi != NULL) {
2056 			/*
2057 			 * The umtx_pi can be unowned, such as when a thread
2058 			 * has just entered do_lock_pi(), allocated the
2059 			 * umtx_pi, and unlocked the umtxq.
2060 			 * If the current thread owns it, it must disown it.
2061 			 */
2062 			mtx_lock(&umtx_lock);
2063 			if (pi->pi_owner == td)
2064 				umtx_pi_disown(pi);
2065 			mtx_unlock(&umtx_lock);
2066 		}
2067 	}
2068 	umtxq_unlock(&key);
2069 
2070 	/*
2071 	 * When unlocking the umtx, it must be marked as unowned if
2072 	 * at most one thread is waiting for it; otherwise, it must be
2073 	 * marked as contested.
2074 	 */
2075 
2076 	if (count > 1)
2077 		new_owner |= UMUTEX_CONTESTED;
2078 	error = casueword32(&m->m_owner, owner, &old, new_owner);
2079 
2080 	umtxq_unbusy_unlocked(&key);
2081 	umtx_key_release(&key);
2082 	if (error == -1)
2083 		return (EFAULT);
2084 	if (old != owner)
2085 		return (EINVAL);
2086 	return (0);
2087 }
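
/*
 * Illustrative sketch, not part of the original file: the userland
 * fast path that pairs with do_unlock_pi().  An uncontested unlock is
 * a single CAS from our tid to UMUTEX_UNOWNED; only a set
 * UMUTEX_CONTESTED bit forces the syscall.  The function name is
 * hypothetical; libthr's real code differs in detail.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>
#include <pthread_np.h>

static int
example_unlock_pi(struct umutex *m)
{
	uint32_t id;

	id = (uint32_t)pthread_getthreadid_np();
	/* Fast path: we own it and nobody is waiting. */
	if (atomic_cmpset_rel_32((volatile uint32_t *)&m->m_owner, id,
	    UMUTEX_UNOWNED))
		return (0);
	/* Contested: let the kernel run do_unlock_pi(); -1 means errno. */
	return (_umtx_op(m, UMTX_OP_MUTEX_UNLOCK, 0, NULL, NULL));
}
#endif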
2088 
2089 /*
2090  * Lock a PP mutex.
2091  */
2092 static int
2093 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2094     struct _umtx_time *timeout, int try)
2095 {
2096 	struct abs_timeout timo;
2097 	struct umtx_q *uq, *uq2;
2098 	struct umtx_pi *pi;
2099 	uint32_t ceiling;
2100 	uint32_t owner, id;
2101 	int error, pri, old_inherited_pri, su, rv;
2102 
2103 	id = td->td_tid;
2104 	uq = td->td_umtxq;
2105 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2106 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2107 	    &uq->uq_key)) != 0)
2108 		return (error);
2109 
2110 	if (timeout != NULL)
2111 		abs_timeout_init2(&timo, timeout);
2112 
2113 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2114 	for (;;) {
2115 		old_inherited_pri = uq->uq_inherited_pri;
2116 		umtxq_lock(&uq->uq_key);
2117 		umtxq_busy(&uq->uq_key);
2118 		umtxq_unlock(&uq->uq_key);
2119 
2120 		rv = fueword32(&m->m_ceilings[0], &ceiling);
2121 		if (rv == -1) {
2122 			error = EFAULT;
2123 			goto out;
2124 		}
2125 		ceiling = RTP_PRIO_MAX - ceiling;
2126 		if (ceiling > RTP_PRIO_MAX) {
2127 			error = EINVAL;
2128 			goto out;
2129 		}
2130 
2131 		mtx_lock(&umtx_lock);
2132 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2133 			mtx_unlock(&umtx_lock);
2134 			error = EINVAL;
2135 			goto out;
2136 		}
2137 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2138 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2139 			thread_lock(td);
2140 			if (uq->uq_inherited_pri < UPRI(td))
2141 				sched_lend_user_prio(td, uq->uq_inherited_pri);
2142 			thread_unlock(td);
2143 		}
2144 		mtx_unlock(&umtx_lock);
2145 
2146 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2147 		    id | UMUTEX_CONTESTED);
2148 		/* The address was invalid. */
2149 		if (rv == -1) {
2150 			error = EFAULT;
2151 			break;
2152 		}
2153 
2154 		if (owner == UMUTEX_CONTESTED) {
2155 			error = 0;
2156 			break;
2157 		} else if (owner == UMUTEX_RB_OWNERDEAD) {
2158 			rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD,
2159 			    &owner, id | UMUTEX_CONTESTED);
2160 			if (rv == -1) {
2161 				error = EFAULT;
2162 				break;
2163 			}
2164 			if (owner == UMUTEX_RB_OWNERDEAD) {
2165 				error = EOWNERDEAD; /* success */
2166 				break;
2167 			}
2168 			error = 0;
2169 		} else if (owner == UMUTEX_RB_NOTRECOV) {
2170 			error = ENOTRECOVERABLE;
2171 			break;
2172 		}
2173 
2174 		if (try != 0) {
2175 			error = EBUSY;
2176 			break;
2177 		}
2178 
2179 		/*
2180 		 * If we caught a signal, we have already retried and now
2181 		 * exit immediately.
2182 		 */
2183 		if (error != 0)
2184 			break;
2185 
2186 		umtxq_lock(&uq->uq_key);
2187 		umtxq_insert(uq);
2188 		umtxq_unbusy(&uq->uq_key);
2189 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2190 		    NULL : &timo);
2191 		umtxq_remove(uq);
2192 		umtxq_unlock(&uq->uq_key);
2193 
2194 		mtx_lock(&umtx_lock);
2195 		uq->uq_inherited_pri = old_inherited_pri;
2196 		pri = PRI_MAX;
2197 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2198 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2199 			if (uq2 != NULL) {
2200 				if (pri > UPRI(uq2->uq_thread))
2201 					pri = UPRI(uq2->uq_thread);
2202 			}
2203 		}
2204 		if (pri > uq->uq_inherited_pri)
2205 			pri = uq->uq_inherited_pri;
2206 		thread_lock(td);
2207 		sched_lend_user_prio(td, pri);
2208 		thread_unlock(td);
2209 		mtx_unlock(&umtx_lock);
2210 	}
2211 
2212 	if (error != 0 && error != EOWNERDEAD) {
2213 		mtx_lock(&umtx_lock);
2214 		uq->uq_inherited_pri = old_inherited_pri;
2215 		pri = PRI_MAX;
2216 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2217 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2218 			if (uq2 != NULL) {
2219 				if (pri > UPRI(uq2->uq_thread))
2220 					pri = UPRI(uq2->uq_thread);
2221 			}
2222 		}
2223 		if (pri > uq->uq_inherited_pri)
2224 			pri = uq->uq_inherited_pri;
2225 		thread_lock(td);
2226 		sched_lend_user_prio(td, pri);
2227 		thread_unlock(td);
2228 		mtx_unlock(&umtx_lock);
2229 	}
2230 
2231 out:
2232 	umtxq_unbusy_unlocked(&uq->uq_key);
2233 	umtx_key_release(&uq->uq_key);
2234 	return (error);
2235 }
2236 
2237 /*
2238  * Unlock a PP mutex.
2239  */
2240 static int
2241 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
2242 {
2243 	struct umtx_key key;
2244 	struct umtx_q *uq, *uq2;
2245 	struct umtx_pi *pi;
2246 	uint32_t id, owner, rceiling;
2247 	int error, pri, new_inherited_pri, su;
2248 
2249 	id = td->td_tid;
2250 	uq = td->td_umtxq;
2251 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2252 
2253 	/*
2254 	 * Make sure we own this mtx.
2255 	 */
2256 	error = fueword32(&m->m_owner, &owner);
2257 	if (error == -1)
2258 		return (EFAULT);
2259 
2260 	if ((owner & ~UMUTEX_CONTESTED) != id)
2261 		return (EPERM);
2262 
2263 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2264 	if (error != 0)
2265 		return (error);
2266 
2267 	if (rceiling == -1)
2268 		new_inherited_pri = PRI_MAX;
2269 	else {
2270 		rceiling = RTP_PRIO_MAX - rceiling;
2271 		if (rceiling > RTP_PRIO_MAX)
2272 			return (EINVAL);
2273 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2274 	}
2275 
2276 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2277 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2278 	    &key)) != 0)
2279 		return (error);
2280 	umtxq_lock(&key);
2281 	umtxq_busy(&key);
2282 	umtxq_unlock(&key);
2283 	/*
2284 	 * For a priority-protected mutex, always set the unlocked state
2285 	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2286 	 * to lock the mutex.  This is necessary because thread priority
2287 	 * has to be adjusted for such mutexes.
2288 	 */
2289 	error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) |
2290 	    UMUTEX_CONTESTED);
2291 
2292 	umtxq_lock(&key);
2293 	if (error == 0)
2294 		umtxq_signal(&key, 1);
2295 	umtxq_unbusy(&key);
2296 	umtxq_unlock(&key);
2297 
2298 	if (error == -1)
2299 		error = EFAULT;
2300 	else {
2301 		mtx_lock(&umtx_lock);
2302 		if (su != 0)
2303 			uq->uq_inherited_pri = new_inherited_pri;
2304 		pri = PRI_MAX;
2305 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2306 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2307 			if (uq2 != NULL) {
2308 				if (pri > UPRI(uq2->uq_thread))
2309 					pri = UPRI(uq2->uq_thread);
2310 			}
2311 		}
2312 		if (pri > uq->uq_inherited_pri)
2313 			pri = uq->uq_inherited_pri;
2314 		thread_lock(td);
2315 		sched_lend_user_prio(td, pri);
2316 		thread_unlock(td);
2317 		mtx_unlock(&umtx_lock);
2318 	}
2319 	umtx_key_release(&key);
2320 	return (error);
2321 }
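
/*
 * Illustrative sketch, not from the original source: the ceiling
 * arithmetic shared by do_lock_pp() and do_unlock_pp().  A ceiling c
 * in [0, RTP_PRIO_MAX] maps to the kernel priority
 * PRI_MIN_REALTIME + (RTP_PRIO_MAX - c); out-of-range values are
 * caught by the unsigned underflow check, as in the code above.
 */
#if 0
#include <sys/priority.h>
#include <sys/rtprio.h>
#include <errno.h>
#include <stdint.h>

static int
example_pp_ceiling_to_pri(uint32_t ceiling, int *prip)
{
	uint32_t inv;

	inv = RTP_PRIO_MAX - ceiling;	/* Underflows to a huge value. */
	if (inv > RTP_PRIO_MAX)
		return (EINVAL);	/* ceiling was > RTP_PRIO_MAX. */
	*prip = PRI_MIN_REALTIME + inv;
	return (0);
}
#endif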
2322 
2323 static int
2324 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2325     uint32_t *old_ceiling)
2326 {
2327 	struct umtx_q *uq;
2328 	uint32_t flags, id, owner, save_ceiling;
2329 	int error, rv, rv1;
2330 
2331 	error = fueword32(&m->m_flags, &flags);
2332 	if (error == -1)
2333 		return (EFAULT);
2334 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2335 		return (EINVAL);
2336 	if (ceiling > RTP_PRIO_MAX)
2337 		return (EINVAL);
2338 	id = td->td_tid;
2339 	uq = td->td_umtxq;
2340 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2341 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2342 	    &uq->uq_key)) != 0)
2343 		return (error);
2344 	for (;;) {
2345 		umtxq_lock(&uq->uq_key);
2346 		umtxq_busy(&uq->uq_key);
2347 		umtxq_unlock(&uq->uq_key);
2348 
2349 		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2350 		if (rv == -1) {
2351 			error = EFAULT;
2352 			break;
2353 		}
2354 
2355 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2356 		    id | UMUTEX_CONTESTED);
2357 		if (rv == -1) {
2358 			error = EFAULT;
2359 			break;
2360 		}
2361 
2362 		if (owner == UMUTEX_CONTESTED) {
2363 			rv = suword32(&m->m_ceilings[0], ceiling);
2364 			rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED);
2365 			error = (rv == 0 && rv1 == 0) ? 0 : EFAULT;
2366 			break;
2367 		}
2368 
2369 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2370 			rv = suword32(&m->m_ceilings[0], ceiling);
2371 			error = rv == 0 ? 0 : EFAULT;
2372 			break;
2373 		}
2374 
2375 		if (owner == UMUTEX_RB_OWNERDEAD) {
2376 			error = EOWNERDEAD;
2377 			break;
2378 		} else if (owner == UMUTEX_RB_NOTRECOV) {
2379 			error = ENOTRECOVERABLE;
2380 			break;
2381 		}
2382 
2383 		/*
2384 		 * If we caught a signal, we have already retried and now
2385 		 * exit immediately.
2386 		 */
2387 		if (error != 0)
2388 			break;
2389 
2390 		/*
2391 		 * If we successfully set the contested bit, sleep; otherwise
2392 		 * the lock changed and we need to retry, or we lost a race
2393 		 * to the thread unlocking the umtx.
2394 		 */
2395 		umtxq_lock(&uq->uq_key);
2396 		umtxq_insert(uq);
2397 		umtxq_unbusy(&uq->uq_key);
2398 		error = umtxq_sleep(uq, "umtxpp", NULL);
2399 		umtxq_remove(uq);
2400 		umtxq_unlock(&uq->uq_key);
2401 	}
2402 	umtxq_lock(&uq->uq_key);
2403 	if (error == 0)
2404 		umtxq_signal(&uq->uq_key, INT_MAX);
2405 	umtxq_unbusy(&uq->uq_key);
2406 	umtxq_unlock(&uq->uq_key);
2407 	umtx_key_release(&uq->uq_key);
2408 	if (error == 0 && old_ceiling != NULL) {
2409 		rv = suword32(old_ceiling, save_ceiling);
2410 		error = rv == 0 ? 0 : EFAULT;
2411 	}
2412 	return (error);
2413 }
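
/*
 * Hedged usage sketch, not part of the original file: do_set_ceiling()
 * is reached through UMTX_OP_SET_CEILING, with the new ceiling in
 * "val" and an optional pointer for the previous ceiling in "uaddr1"
 * (see __umtx_op_set_ceiling() below).  The wrapper name is made up.
 */
#if 0
#include <sys/umtx.h>

static int
example_set_ceiling(struct umutex *m, uint32_t new_ceiling,
    uint32_t *old_ceiling)
{
	/* Returns -1 with errno set on failure. */
	return (_umtx_op(m, UMTX_OP_SET_CEILING, new_ceiling,
	    old_ceiling, NULL));
}
#endif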
2414 
2415 /*
2416  * Lock a userland POSIX mutex.
2417  */
2418 static int
2419 do_lock_umutex(struct thread *td, struct umutex *m,
2420     struct _umtx_time *timeout, int mode)
2421 {
2422 	uint32_t flags;
2423 	int error;
2424 
2425 	error = fueword32(&m->m_flags, &flags);
2426 	if (error == -1)
2427 		return (EFAULT);
2428 
2429 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2430 	case 0:
2431 		error = do_lock_normal(td, m, flags, timeout, mode);
2432 		break;
2433 	case UMUTEX_PRIO_INHERIT:
2434 		error = do_lock_pi(td, m, flags, timeout, mode);
2435 		break;
2436 	case UMUTEX_PRIO_PROTECT:
2437 		error = do_lock_pp(td, m, flags, timeout, mode);
2438 		break;
2439 	default:
2440 		return (EINVAL);
2441 	}
2442 	if (timeout == NULL) {
2443 		if (error == EINTR && mode != _UMUTEX_WAIT)
2444 			error = ERESTART;
2445 	} else {
2446 		/* Timed-locking is not restarted. */
2447 		if (error == ERESTART)
2448 			error = EINTR;
2449 	}
2450 	return (error);
2451 }
2452 
2453 /*
2454  * Unlock a userland POSIX mutex.
2455  */
2456 static int
2457 do_unlock_umutex(struct thread *td, struct umutex *m, bool rb)
2458 {
2459 	uint32_t flags;
2460 	int error;
2461 
2462 	error = fueword32(&m->m_flags, &flags);
2463 	if (error == -1)
2464 		return (EFAULT);
2465 
2466 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2467 	case 0:
2468 		return (do_unlock_normal(td, m, flags, rb));
2469 	case UMUTEX_PRIO_INHERIT:
2470 		return (do_unlock_pi(td, m, flags, rb));
2471 	case UMUTEX_PRIO_PROTECT:
2472 		return (do_unlock_pp(td, m, flags, rb));
2473 	}
2474 
2475 	return (EINVAL);
2476 }
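
/*
 * Illustrative sketch, not original: the lock and unlock paths above
 * dispatch purely on the UMUTEX_PRIO_INHERIT / UMUTEX_PRIO_PROTECT
 * bits of m_flags, so the protocol is fixed when the mutex is
 * initialized.  Hypothetical initializers; field layout per
 * <sys/umtx.h>, and the PP ceiling choice is only an example.
 */
#if 0
#include <sys/umtx.h>
#include <sys/rtprio.h>

static struct umutex m_plain = { .m_owner = UMUTEX_UNOWNED };
static struct umutex m_pi = { .m_owner = UMUTEX_UNOWNED,
	.m_flags = UMUTEX_PRIO_INHERIT };
static struct umutex m_pp = { .m_owner = UMUTEX_UNOWNED,
	.m_flags = UMUTEX_PRIO_PROTECT,
	.m_ceilings = { RTP_PRIO_MAX, 0 } };
#endif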
2477 
2478 static int
2479 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2480     struct timespec *timeout, u_long wflags)
2481 {
2482 	struct abs_timeout timo;
2483 	struct umtx_q *uq;
2484 	uint32_t flags, clockid, hasw;
2485 	int error;
2486 
2487 	uq = td->td_umtxq;
2488 	error = fueword32(&cv->c_flags, &flags);
2489 	if (error == -1)
2490 		return (EFAULT);
2491 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2492 	if (error != 0)
2493 		return (error);
2494 
2495 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2496 		error = fueword32(&cv->c_clockid, &clockid);
2497 		if (error == -1) {
2498 			umtx_key_release(&uq->uq_key);
2499 			return (EFAULT);
2500 		}
2501 		if (clockid < CLOCK_REALTIME ||
2502 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2503 			/* Only a hardware clock id will work here. */
2504 			umtx_key_release(&uq->uq_key);
2505 			return (EINVAL);
2506 		}
2507 	} else {
2508 		clockid = CLOCK_REALTIME;
2509 	}
2510 
2511 	umtxq_lock(&uq->uq_key);
2512 	umtxq_busy(&uq->uq_key);
2513 	umtxq_insert(uq);
2514 	umtxq_unlock(&uq->uq_key);
2515 
2516 	/*
2517 	 * Set c_has_waiters to 1 before releasing the user mutex, but
2518 	 * avoid dirtying the cache line when it is unnecessary.
2519 	 */
2520 	error = fueword32(&cv->c_has_waiters, &hasw);
2521 	if (error == 0 && hasw == 0)
2522 		suword32(&cv->c_has_waiters, 1);
2523 
2524 	umtxq_unbusy_unlocked(&uq->uq_key);
2525 
2526 	error = do_unlock_umutex(td, m, false);
2527 
2528 	if (timeout != NULL)
2529 		abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0,
2530 		    timeout);
2531 
2532 	umtxq_lock(&uq->uq_key);
2533 	if (error == 0) {
2534 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2535 		    NULL : &timo);
2536 	}
2537 
2538 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2539 		error = 0;
2540 	else {
2541 		/*
2542 		 * This must be a timeout, an interruption by a signal, or a
2543 		 * spurious wakeup; clear the c_has_waiters flag when
2544 		 * necessary.
2545 		 */
2546 		umtxq_busy(&uq->uq_key);
2547 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2548 			int oldlen = uq->uq_cur_queue->length;
2549 			umtxq_remove(uq);
2550 			if (oldlen == 1) {
2551 				umtxq_unlock(&uq->uq_key);
2552 				suword32(&cv->c_has_waiters, 0);
2553 				umtxq_lock(&uq->uq_key);
2554 			}
2555 		}
2556 		umtxq_unbusy(&uq->uq_key);
2557 		if (error == ERESTART)
2558 			error = EINTR;
2559 	}
2560 
2561 	umtxq_unlock(&uq->uq_key);
2562 	umtx_key_release(&uq->uq_key);
2563 	return (error);
2564 }
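
/*
 * Hedged usage sketch, not part of the original file: do_cv_wait()
 * queues the caller and only then unlocks "m" through
 * do_unlock_umutex(), so a signal between unlock and sleep cannot be
 * lost.  The timespec travels in uaddr2 and the CVWAIT_* flags in
 * "val"; with CVWAIT_CLOCKID the kernel reads cv->c_clockid.  The
 * wrapper name is made up.
 */
#if 0
#include <sys/umtx.h>
#include <time.h>

static int
example_cv_timedwait(struct ucond *cv, struct umutex *m,
    const struct timespec *abstime)
{
	return (_umtx_op(cv, UMTX_OP_CV_WAIT,
	    CVWAIT_ABSTIME | CVWAIT_CLOCKID, m, __DECONST(void *, abstime)));
}
#endif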
2565 
2566 /*
2567  * Signal a userland condition variable.
2568  */
2569 static int
2570 do_cv_signal(struct thread *td, struct ucond *cv)
2571 {
2572 	struct umtx_key key;
2573 	int error, cnt, nwake;
2574 	uint32_t flags;
2575 
2576 	error = fueword32(&cv->c_flags, &flags);
2577 	if (error == -1)
2578 		return (EFAULT);
2579 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2580 		return (error);
2581 	umtxq_lock(&key);
2582 	umtxq_busy(&key);
2583 	cnt = umtxq_count(&key);
2584 	nwake = umtxq_signal(&key, 1);
2585 	if (cnt <= nwake) {
2586 		umtxq_unlock(&key);
2587 		error = suword32(&cv->c_has_waiters, 0);
2588 		if (error == -1)
2589 			error = EFAULT;
2590 		umtxq_lock(&key);
2591 	}
2592 	umtxq_unbusy(&key);
2593 	umtxq_unlock(&key);
2594 	umtx_key_release(&key);
2595 	return (error);
2596 }
2597 
2598 static int
2599 do_cv_broadcast(struct thread *td, struct ucond *cv)
2600 {
2601 	struct umtx_key key;
2602 	int error;
2603 	uint32_t flags;
2604 
2605 	error = fueword32(&cv->c_flags, &flags);
2606 	if (error == -1)
2607 		return (EFAULT);
2608 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2609 		return (error);
2610 
2611 	umtxq_lock(&key);
2612 	umtxq_busy(&key);
2613 	umtxq_signal(&key, INT_MAX);
2614 	umtxq_unlock(&key);
2615 
2616 	error = suword32(&cv->c_has_waiters, 0);
2617 	if (error == -1)
2618 		error = EFAULT;
2619 
2620 	umtxq_unbusy_unlocked(&key);
2621 
2622 	umtx_key_release(&key);
2623 	return (error);
2624 }
2625 
2626 static int
2627 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag,
2628     struct _umtx_time *timeout)
2629 {
2630 	struct abs_timeout timo;
2631 	struct umtx_q *uq;
2632 	uint32_t flags, wrflags;
2633 	int32_t state, oldstate;
2634 	int32_t blocked_readers;
2635 	int error, error1, rv;
2636 
2637 	uq = td->td_umtxq;
2638 	error = fueword32(&rwlock->rw_flags, &flags);
2639 	if (error == -1)
2640 		return (EFAULT);
2641 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2642 	if (error != 0)
2643 		return (error);
2644 
2645 	if (timeout != NULL)
2646 		abs_timeout_init2(&timo, timeout);
2647 
2648 	wrflags = URWLOCK_WRITE_OWNER;
2649 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2650 		wrflags |= URWLOCK_WRITE_WAITERS;
2651 
2652 	for (;;) {
2653 		rv = fueword32(&rwlock->rw_state, &state);
2654 		if (rv == -1) {
2655 			umtx_key_release(&uq->uq_key);
2656 			return (EFAULT);
2657 		}
2658 
2659 		/* try to lock it */
2660 		while (!(state & wrflags)) {
2661 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2662 				umtx_key_release(&uq->uq_key);
2663 				return (EAGAIN);
2664 			}
2665 			rv = casueword32(&rwlock->rw_state, state,
2666 			    &oldstate, state + 1);
2667 			if (rv == -1) {
2668 				umtx_key_release(&uq->uq_key);
2669 				return (EFAULT);
2670 			}
2671 			if (oldstate == state) {
2672 				umtx_key_release(&uq->uq_key);
2673 				return (0);
2674 			}
2675 			error = umtxq_check_susp(td);
2676 			if (error != 0)
2677 				break;
2678 			state = oldstate;
2679 		}
2680 
2681 		if (error)
2682 			break;
2683 
2684 		/* grab monitor lock */
2685 		umtxq_lock(&uq->uq_key);
2686 		umtxq_busy(&uq->uq_key);
2687 		umtxq_unlock(&uq->uq_key);
2688 
2689 		/*
2690 		 * Re-read the state, in case it changed between the
2691 		 * try-lock above and the check below.
2692 		 */
2693 		rv = fueword32(&rwlock->rw_state, &state);
2694 		if (rv == -1)
2695 			error = EFAULT;
2696 
2697 		/* set read contention bit */
2698 		while (error == 0 && (state & wrflags) &&
2699 		    !(state & URWLOCK_READ_WAITERS)) {
2700 			rv = casueword32(&rwlock->rw_state, state,
2701 			    &oldstate, state | URWLOCK_READ_WAITERS);
2702 			if (rv == -1) {
2703 				error = EFAULT;
2704 				break;
2705 			}
2706 			if (oldstate == state)
2707 				goto sleep;
2708 			state = oldstate;
2709 			error = umtxq_check_susp(td);
2710 			if (error != 0)
2711 				break;
2712 		}
2713 		if (error != 0) {
2714 			umtxq_unbusy_unlocked(&uq->uq_key);
2715 			break;
2716 		}
2717 
2718 		/* The state changed while setting the flags; restart. */
2719 		if (!(state & wrflags)) {
2720 			umtxq_unbusy_unlocked(&uq->uq_key);
2721 			error = umtxq_check_susp(td);
2722 			if (error != 0)
2723 				break;
2724 			continue;
2725 		}
2726 
2727 sleep:
2728 		/* The contention bit is set; increase the read-waiter count before sleeping. */
2729 		rv = fueword32(&rwlock->rw_blocked_readers,
2730 		    &blocked_readers);
2731 		if (rv == -1) {
2732 			umtxq_unbusy_unlocked(&uq->uq_key);
2733 			error = EFAULT;
2734 			break;
2735 		}
2736 		suword32(&rwlock->rw_blocked_readers, blocked_readers + 1);
2737 
2738 		while (state & wrflags) {
2739 			umtxq_lock(&uq->uq_key);
2740 			umtxq_insert(uq);
2741 			umtxq_unbusy(&uq->uq_key);
2742 
2743 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2744 			    NULL : &timo);
2745 
2746 			umtxq_busy(&uq->uq_key);
2747 			umtxq_remove(uq);
2748 			umtxq_unlock(&uq->uq_key);
2749 			if (error)
2750 				break;
2751 			rv = fueword32(&rwlock->rw_state, &state);
2752 			if (rv == -1) {
2753 				error = EFAULT;
2754 				break;
2755 			}
2756 		}
2757 
2758 		/* Decrease the read-waiter count and possibly clear the read contention bit. */
2759 		rv = fueword32(&rwlock->rw_blocked_readers,
2760 		    &blocked_readers);
2761 		if (rv == -1) {
2762 			umtxq_unbusy_unlocked(&uq->uq_key);
2763 			error = EFAULT;
2764 			break;
2765 		}
2766 		suword32(&rwlock->rw_blocked_readers, blocked_readers - 1);
2767 		if (blocked_readers == 1) {
2768 			rv = fueword32(&rwlock->rw_state, &state);
2769 			if (rv == -1) {
2770 				umtxq_unbusy_unlocked(&uq->uq_key);
2771 				error = EFAULT;
2772 				break;
2773 			}
2774 			for (;;) {
2775 				rv = casueword32(&rwlock->rw_state, state,
2776 				    &oldstate, state & ~URWLOCK_READ_WAITERS);
2777 				if (rv == -1) {
2778 					error = EFAULT;
2779 					break;
2780 				}
2781 				if (oldstate == state)
2782 					break;
2783 				state = oldstate;
2784 				error1 = umtxq_check_susp(td);
2785 				if (error1 != 0) {
2786 					if (error == 0)
2787 						error = error1;
2788 					break;
2789 				}
2790 			}
2791 		}
2792 
2793 		umtxq_unbusy_unlocked(&uq->uq_key);
2794 		if (error != 0)
2795 			break;
2796 	}
2797 	umtx_key_release(&uq->uq_key);
2798 	if (error == ERESTART)
2799 		error = EINTR;
2800 	return (error);
2801 }
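
/*
 * Illustrative sketch, not original: rw_state packs the reader count
 * in the low bits (URWLOCK_READER_COUNT()) under the
 * URWLOCK_WRITE_OWNER / URWLOCK_WRITE_WAITERS / URWLOCK_READ_WAITERS
 * flags, so a userland read-lock fast path is one CAS; on failure it
 * would fall back to UMTX_OP_RW_RDLOCK.  Name and shape hypothetical.
 */
#if 0
#include <sys/umtx.h>
#include <machine/atomic.h>
#include <errno.h>

static int
example_rdlock_fast(struct urwlock *rw)
{
	uint32_t state;

	for (;;) {
		state = (uint32_t)rw->rw_state;
		if ((state & (URWLOCK_WRITE_OWNER |
		    URWLOCK_WRITE_WAITERS)) != 0 ||
		    URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)
			return (EBUSY);	/* Take the slow syscall path. */
		if (atomic_cmpset_acq_32((volatile uint32_t *)&rw->rw_state,
		    state, state + 1))
			return (0);	/* Read lock acquired. */
	}
}
#endif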
2802 
2803 static int
2804 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2805 {
2806 	struct abs_timeout timo;
2807 	struct umtx_q *uq;
2808 	uint32_t flags;
2809 	int32_t state, oldstate;
2810 	int32_t blocked_writers;
2811 	int32_t blocked_readers;
2812 	int error, error1, rv;
2813 
2814 	uq = td->td_umtxq;
2815 	error = fueword32(&rwlock->rw_flags, &flags);
2816 	if (error == -1)
2817 		return (EFAULT);
2818 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2819 	if (error != 0)
2820 		return (error);
2821 
2822 	if (timeout != NULL)
2823 		abs_timeout_init2(&timo, timeout);
2824 
2825 	blocked_readers = 0;
2826 	for (;;) {
2827 		rv = fueword32(&rwlock->rw_state, &state);
2828 		if (rv == -1) {
2829 			umtx_key_release(&uq->uq_key);
2830 			return (EFAULT);
2831 		}
2832 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2833 			rv = casueword32(&rwlock->rw_state, state,
2834 			    &oldstate, state | URWLOCK_WRITE_OWNER);
2835 			if (rv == -1) {
2836 				umtx_key_release(&uq->uq_key);
2837 				return (EFAULT);
2838 			}
2839 			if (oldstate == state) {
2840 				umtx_key_release(&uq->uq_key);
2841 				return (0);
2842 			}
2843 			state = oldstate;
2844 			error = umtxq_check_susp(td);
2845 			if (error != 0)
2846 				break;
2847 		}
2848 
2849 		if (error) {
2850 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2851 			    blocked_readers != 0) {
2852 				umtxq_lock(&uq->uq_key);
2853 				umtxq_busy(&uq->uq_key);
2854 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2855 				umtxq_unbusy(&uq->uq_key);
2856 				umtxq_unlock(&uq->uq_key);
2857 			}
2858 
2859 			break;
2860 		}
2861 
2862 		/* grab monitor lock */
2863 		umtxq_lock(&uq->uq_key);
2864 		umtxq_busy(&uq->uq_key);
2865 		umtxq_unlock(&uq->uq_key);
2866 
2867 		/*
2868 		 * Re-read the state, in case it changed between the
2869 		 * try-lock above and the check below.
2870 		 */
2871 		rv = fueword32(&rwlock->rw_state, &state);
2872 		if (rv == -1)
2873 			error = EFAULT;
2874 
2875 		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
2876 		    URWLOCK_READER_COUNT(state) != 0) &&
2877 		    (state & URWLOCK_WRITE_WAITERS) == 0) {
2878 			rv = casueword32(&rwlock->rw_state, state,
2879 			    &oldstate, state | URWLOCK_WRITE_WAITERS);
2880 			if (rv == -1) {
2881 				error = EFAULT;
2882 				break;
2883 			}
2884 			if (oldstate == state)
2885 				goto sleep;
2886 			state = oldstate;
2887 			error = umtxq_check_susp(td);
2888 			if (error != 0)
2889 				break;
2890 		}
2891 		if (error != 0) {
2892 			umtxq_unbusy_unlocked(&uq->uq_key);
2893 			break;
2894 		}
2895 
2896 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2897 			umtxq_unbusy_unlocked(&uq->uq_key);
2898 			error = umtxq_check_susp(td);
2899 			if (error != 0)
2900 				break;
2901 			continue;
2902 		}
2903 sleep:
2904 		rv = fueword32(&rwlock->rw_blocked_writers,
2905 		    &blocked_writers);
2906 		if (rv == -1) {
2907 			umtxq_unbusy_unlocked(&uq->uq_key);
2908 			error = EFAULT;
2909 			break;
2910 		}
2911 		suword32(&rwlock->rw_blocked_writers, blocked_writers + 1);
2912 
2913 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2914 			umtxq_lock(&uq->uq_key);
2915 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2916 			umtxq_unbusy(&uq->uq_key);
2917 
2918 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2919 			    NULL : &timo);
2920 
2921 			umtxq_busy(&uq->uq_key);
2922 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2923 			umtxq_unlock(&uq->uq_key);
2924 			if (error)
2925 				break;
2926 			rv = fueword32(&rwlock->rw_state, &state);
2927 			if (rv == -1) {
2928 				error = EFAULT;
2929 				break;
2930 			}
2931 		}
2932 
2933 		rv = fueword32(&rwlock->rw_blocked_writers,
2934 		    &blocked_writers);
2935 		if (rv == -1) {
2936 			umtxq_unbusy_unlocked(&uq->uq_key);
2937 			error = EFAULT;
2938 			break;
2939 		}
2940 		suword32(&rwlock->rw_blocked_writers, blocked_writers - 1);
2941 		if (blocked_writers == 1) {
2942 			rv = fueword32(&rwlock->rw_state, &state);
2943 			if (rv == -1) {
2944 				umtxq_unbusy_unlocked(&uq->uq_key);
2945 				error = EFAULT;
2946 				break;
2947 			}
2948 			for (;;) {
2949 				rv = casueword32(&rwlock->rw_state, state,
2950 				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
2951 				if (rv == -1) {
2952 					error = EFAULT;
2953 					break;
2954 				}
2955 				if (oldstate == state)
2956 					break;
2957 				state = oldstate;
2958 				error1 = umtxq_check_susp(td);
2959 				/*
2960 				 * We are leaving the URWLOCK_WRITE_WAITERS
2961 				 * flag set behind, but this should not harm
2962 				 * correctness.
2963 				 */
2964 				if (error1 != 0) {
2965 					if (error == 0)
2966 						error = error1;
2967 					break;
2968 				}
2969 			}
2970 			rv = fueword32(&rwlock->rw_blocked_readers,
2971 			    &blocked_readers);
2972 			if (rv == -1) {
2973 				umtxq_unbusy_unlocked(&uq->uq_key);
2974 				error = EFAULT;
2975 				break;
2976 			}
2977 		} else
2978 			blocked_readers = 0;
2979 
2980 		umtxq_unbusy_unlocked(&uq->uq_key);
2981 	}
2982 
2983 	umtx_key_release(&uq->uq_key);
2984 	if (error == ERESTART)
2985 		error = EINTR;
2986 	return (error);
2987 }
2988 
2989 static int
2990 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2991 {
2992 	struct umtx_q *uq;
2993 	uint32_t flags;
2994 	int32_t state, oldstate;
2995 	int error, rv, q, count;
2996 
2997 	uq = td->td_umtxq;
2998 	error = fueword32(&rwlock->rw_flags, &flags);
2999 	if (error == -1)
3000 		return (EFAULT);
3001 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
3002 	if (error != 0)
3003 		return (error);
3004 
3005 	error = fueword32(&rwlock->rw_state, &state);
3006 	if (error == -1) {
3007 		error = EFAULT;
3008 		goto out;
3009 	}
3010 	if (state & URWLOCK_WRITE_OWNER) {
3011 		for (;;) {
3012 			rv = casueword32(&rwlock->rw_state, state,
3013 			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
3014 			if (rv == -1) {
3015 				error = EFAULT;
3016 				goto out;
3017 			}
3018 			if (oldstate != state) {
3019 				state = oldstate;
3020 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
3021 					error = EPERM;
3022 					goto out;
3023 				}
3024 				error = umtxq_check_susp(td);
3025 				if (error != 0)
3026 					goto out;
3027 			} else
3028 				break;
3029 		}
3030 	} else if (URWLOCK_READER_COUNT(state) != 0) {
3031 		for (;;) {
3032 			rv = casueword32(&rwlock->rw_state, state,
3033 			    &oldstate, state - 1);
3034 			if (rv == -1) {
3035 				error = EFAULT;
3036 				goto out;
3037 			}
3038 			if (oldstate != state) {
3039 				state = oldstate;
3040 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
3041 					error = EPERM;
3042 					goto out;
3043 				}
3044 				error = umtxq_check_susp(td);
3045 				if (error != 0)
3046 					goto out;
3047 			} else
3048 				break;
3049 		}
3050 	} else {
3051 		error = EPERM;
3052 		goto out;
3053 	}
3054 
3055 	count = 0;
3056 
3057 	if (!(flags & URWLOCK_PREFER_READER)) {
3058 		if (state & URWLOCK_WRITE_WAITERS) {
3059 			count = 1;
3060 			q = UMTX_EXCLUSIVE_QUEUE;
3061 		} else if (state & URWLOCK_READ_WAITERS) {
3062 			count = INT_MAX;
3063 			q = UMTX_SHARED_QUEUE;
3064 		}
3065 	} else {
3066 		if (state & URWLOCK_READ_WAITERS) {
3067 			count = INT_MAX;
3068 			q = UMTX_SHARED_QUEUE;
3069 		} else if (state & URWLOCK_WRITE_WAITERS) {
3070 			count = 1;
3071 			q = UMTX_EXCLUSIVE_QUEUE;
3072 		}
3073 	}
3074 
3075 	if (count) {
3076 		umtxq_lock(&uq->uq_key);
3077 		umtxq_busy(&uq->uq_key);
3078 		umtxq_signal_queue(&uq->uq_key, count, q);
3079 		umtxq_unbusy(&uq->uq_key);
3080 		umtxq_unlock(&uq->uq_key);
3081 	}
3082 out:
3083 	umtx_key_release(&uq->uq_key);
3084 	return (error);
3085 }
3086 
3087 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3088 static int
3089 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
3090 {
3091 	struct abs_timeout timo;
3092 	struct umtx_q *uq;
3093 	uint32_t flags, count, count1;
3094 	int error, rv;
3095 
3096 	uq = td->td_umtxq;
3097 	error = fueword32(&sem->_flags, &flags);
3098 	if (error == -1)
3099 		return (EFAULT);
3100 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3101 	if (error != 0)
3102 		return (error);
3103 
3104 	if (timeout != NULL)
3105 		abs_timeout_init2(&timo, timeout);
3106 
3107 	umtxq_lock(&uq->uq_key);
3108 	umtxq_busy(&uq->uq_key);
3109 	umtxq_insert(uq);
3110 	umtxq_unlock(&uq->uq_key);
3111 	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
3112 	if (rv == 0)
3113 		rv = fueword32(&sem->_count, &count);
3114 	if (rv == -1 || count != 0) {
3115 		umtxq_lock(&uq->uq_key);
3116 		umtxq_unbusy(&uq->uq_key);
3117 		umtxq_remove(uq);
3118 		umtxq_unlock(&uq->uq_key);
3119 		umtx_key_release(&uq->uq_key);
3120 		return (rv == -1 ? EFAULT : 0);
3121 	}
3122 	umtxq_lock(&uq->uq_key);
3123 	umtxq_unbusy(&uq->uq_key);
3124 
3125 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3126 
3127 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3128 		error = 0;
3129 	else {
3130 		umtxq_remove(uq);
3131 		/* A relative timeout cannot be restarted. */
3132 		if (error == ERESTART && timeout != NULL &&
3133 		    (timeout->_flags & UMTX_ABSTIME) == 0)
3134 			error = EINTR;
3135 	}
3136 	umtxq_unlock(&uq->uq_key);
3137 	umtx_key_release(&uq->uq_key);
3138 	return (error);
3139 }
3140 
3141 /*
3142  * Signal a userland semaphore.
3143  */
3144 static int
3145 do_sem_wake(struct thread *td, struct _usem *sem)
3146 {
3147 	struct umtx_key key;
3148 	int error, cnt;
3149 	uint32_t flags;
3150 
3151 	error = fueword32(&sem->_flags, &flags);
3152 	if (error == -1)
3153 		return (EFAULT);
3154 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3155 		return (error);
3156 	umtxq_lock(&key);
3157 	umtxq_busy(&key);
3158 	cnt = umtxq_count(&key);
3159 	if (cnt > 0) {
3160 		/*
3161 		 * A count greater than 0 means the memory is still being
3162 		 * referenced by user code, so we can safely update the
3163 		 * _has_waiters flag.
3164 		 */
3165 		if (cnt == 1) {
3166 			umtxq_unlock(&key);
3167 			error = suword32(&sem->_has_waiters, 0);
3168 			umtxq_lock(&key);
3169 			if (error == -1)
3170 				error = EFAULT;
3171 		}
3172 		umtxq_signal(&key, 1);
3173 	}
3174 	umtxq_unbusy(&key);
3175 	umtxq_unlock(&key);
3176 	umtx_key_release(&key);
3177 	return (error);
3178 }
3179 #endif
3180 
3181 static int
3182 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
3183 {
3184 	struct abs_timeout timo;
3185 	struct umtx_q *uq;
3186 	uint32_t count, flags;
3187 	int error, rv;
3188 
3189 	uq = td->td_umtxq;
3190 	rv = fueword32(&sem->_flags, &flags);
	if (rv == -1)
		return (EFAULT);
3191 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3192 	if (error != 0)
3193 		return (error);
3194 
3195 	if (timeout != NULL)
3196 		abs_timeout_init2(&timo, timeout);
3197 
3198 	umtxq_lock(&uq->uq_key);
3199 	umtxq_busy(&uq->uq_key);
3200 	umtxq_insert(uq);
3201 	umtxq_unlock(&uq->uq_key);
3202 	rv = fueword32(&sem->_count, &count);
3203 	if (rv == -1) {
3204 		umtxq_lock(&uq->uq_key);
3205 		umtxq_unbusy(&uq->uq_key);
3206 		umtxq_remove(uq);
3207 		umtxq_unlock(&uq->uq_key);
3208 		umtx_key_release(&uq->uq_key);
3209 		return (EFAULT);
3210 	}
3211 	for (;;) {
3212 		if (USEM_COUNT(count) != 0) {
3213 			umtxq_lock(&uq->uq_key);
3214 			umtxq_unbusy(&uq->uq_key);
3215 			umtxq_remove(uq);
3216 			umtxq_unlock(&uq->uq_key);
3217 			umtx_key_release(&uq->uq_key);
3218 			return (0);
3219 		}
3220 		if (count == USEM_HAS_WAITERS)
3221 			break;
3222 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
3223 		if (rv == -1) {
3224 			umtxq_lock(&uq->uq_key);
3225 			umtxq_unbusy(&uq->uq_key);
3226 			umtxq_remove(uq);
3227 			umtxq_unlock(&uq->uq_key);
3228 			umtx_key_release(&uq->uq_key);
3229 			return (EFAULT);
3230 		}
3231 		if (count == 0)
3232 			break;
3233 	}
3234 	umtxq_lock(&uq->uq_key);
3235 	umtxq_unbusy(&uq->uq_key);
3236 
3237 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3238 
3239 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3240 		error = 0;
3241 	else {
3242 		umtxq_remove(uq);
3243 		if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) {
3244 			/* A relative timeout cannot be restarted. */
3245 			if (error == ERESTART)
3246 				error = EINTR;
3247 			if (error == EINTR) {
3248 				abs_timeout_update(&timo);
3249 				timespecsub(&timo.end, &timo.cur,
3250 				    &timeout->_timeout);
3251 			}
3252 		}
3253 	}
3254 	umtxq_unlock(&uq->uq_key);
3255 	umtx_key_release(&uq->uq_key);
3256 	return (error);
3257 }
3258 
3259 /*
3260  * Signal a userland semaphore.
3261  */
3262 static int
3263 do_sem2_wake(struct thread *td, struct _usem2 *sem)
3264 {
3265 	struct umtx_key key;
3266 	int error, cnt, rv;
3267 	uint32_t count, flags;
3268 
3269 	rv = fueword32(&sem->_flags, &flags);
3270 	if (rv == -1)
3271 		return (EFAULT);
3272 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3273 		return (error);
3274 	umtxq_lock(&key);
3275 	umtxq_busy(&key);
3276 	cnt = umtxq_count(&key);
3277 	if (cnt > 0) {
3278 		/*
3279 		 * If this was the last sleeping thread, clear the waiters
3280 		 * flag in _count.
3281 		 */
3282 		if (cnt == 1) {
3283 			umtxq_unlock(&key);
3284 			rv = fueword32(&sem->_count, &count);
3285 			while (rv != -1 && count & USEM_HAS_WAITERS)
3286 				rv = casueword32(&sem->_count, count, &count,
3287 				    count & ~USEM_HAS_WAITERS);
3288 			if (rv == -1)
3289 				error = EFAULT;
3290 			umtxq_lock(&key);
3291 		}
3292 
3293 		umtxq_signal(&key, 1);
3294 	}
3295 	umtxq_unbusy(&key);
3296 	umtxq_unlock(&key);
3297 	umtx_key_release(&key);
3298 	return (error);
3299 }
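
/*
 * Illustrative sketch, not original: _count packs the semaphore value
 * (USEM_COUNT()) with the USEM_HAS_WAITERS bit that do_sem2_wait()
 * sets above, so a userland post is an atomic increment that enters
 * the kernel only when the old value shows sleepers.  Overflow
 * checking is omitted; the wrapper name is made up.
 */
#if 0
#include <sys/umtx.h>
#include <machine/atomic.h>

static int
example_sem2_post(struct _usem2 *sem)
{
	uint32_t old;

	old = atomic_fetchadd_32(&sem->_count, 1);
	if ((old & USEM_HAS_WAITERS) != 0)
		return (_umtx_op(sem, UMTX_OP_SEM2_WAKE, 0, NULL, NULL));
	return (0);
}
#endif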
3300 
3301 inline int
3302 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3303 {
3304 	int error;
3305 
3306 	error = copyin(addr, tsp, sizeof(struct timespec));
3307 	if (error == 0) {
3308 		if (tsp->tv_sec < 0 ||
3309 		    tsp->tv_nsec >= 1000000000 ||
3310 		    tsp->tv_nsec < 0)
3311 			error = EINVAL;
3312 	}
3313 	return (error);
3314 }
3315 
3316 static inline int
3317 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3318 {
3319 	int error;
3320 
3321 	if (size <= sizeof(struct timespec)) {
3322 		tp->_clockid = CLOCK_REALTIME;
3323 		tp->_flags = 0;
3324 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3325 	} else
3326 		error = copyin(addr, tp, sizeof(struct _umtx_time));
3327 	if (error != 0)
3328 		return (error);
3329 	if (tp->_timeout.tv_sec < 0 ||
3330 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3331 		return (EINVAL);
3332 	return (0);
3333 }
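
/*
 * Hedged usage sketch, not part of the original file: for the timed
 * operations below, uaddr1 carries the size of the timeout object and
 * uaddr2 points at it, which is how umtx_copyin_umtx_time() tells a
 * bare timespec from a full _umtx_time.  The wrapper name is made up.
 */
#if 0
#include <sys/umtx.h>

static int
example_timed_wait(u_long *addr, u_long val, const struct _umtx_time *tmp)
{
	return (_umtx_op(addr, UMTX_OP_WAIT, val,
	    (void *)(uintptr_t)sizeof(*tmp), __DECONST(void *, tmp)));
}
#endif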
3334 
3335 static int
3336 __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
3337 {
3338 
3339 	return (EOPNOTSUPP);
3340 }
3341 
3342 static int
3343 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3344 {
3345 	struct _umtx_time timeout, *tm_p;
3346 	int error;
3347 
3348 	if (uap->uaddr2 == NULL)
3349 		tm_p = NULL;
3350 	else {
3351 		error = umtx_copyin_umtx_time(
3352 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3353 		if (error != 0)
3354 			return (error);
3355 		tm_p = &timeout;
3356 	}
3357 	return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0));
3358 }
3359 
3360 static int
3361 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3362 {
3363 	struct _umtx_time timeout, *tm_p;
3364 	int error;
3365 
3366 	if (uap->uaddr2 == NULL)
3367 		tm_p = NULL;
3368 	else {
3369 		error = umtx_copyin_umtx_time(
3370 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3371 		if (error != 0)
3372 			return (error);
3373 		tm_p = &timeout;
3374 	}
3375 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
3376 }
3377 
3378 static int
3379 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3380 {
3381 	struct _umtx_time *tm_p, timeout;
3382 	int error;
3383 
3384 	if (uap->uaddr2 == NULL)
3385 		tm_p = NULL;
3386 	else {
3387 		error = umtx_copyin_umtx_time(
3388 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3389 		if (error != 0)
3390 			return (error);
3391 		tm_p = &timeout;
3392 	}
3393 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
3394 }
3395 
3396 static int
3397 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3398 {
3399 
3400 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3401 }
3402 
3403 #define BATCH_SIZE	128
3404 static int
3405 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3406 {
3407 	char *uaddrs[BATCH_SIZE], **upp;
3408 	int count, error, i, pos, tocopy;
3409 
3410 	upp = (char **)uap->obj;
3411 	error = 0;
3412 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
3413 	    pos += tocopy) {
3414 		tocopy = MIN(count, BATCH_SIZE);
3415 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
3416 		if (error != 0)
3417 			break;
3418 		for (i = 0; i < tocopy; ++i)
3419 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3420 		maybe_yield();
3421 	}
3422 	return (error);
3423 }
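
/*
 * Hedged usage sketch, not original: UMTX_OP_NWAKE_PRIVATE takes an
 * array of addresses in "obj" and its length in "val"; the loop above
 * copies them in BATCH_SIZE chunks and wakes all waiters on each.
 */
#if 0
#include <sys/umtx.h>

static int
example_nwake(void **addrs, int naddrs)
{
	return (_umtx_op(addrs, UMTX_OP_NWAKE_PRIVATE, naddrs, NULL, NULL));
}
#endif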
3424 
3425 static int
3426 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3427 {
3428 
3429 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3430 }
3431 
3432 static int
3433 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3434 {
3435 	struct _umtx_time *tm_p, timeout;
3436 	int error;
3437 
3438 	/* Allow a null timespec (wait forever). */
3439 	if (uap->uaddr2 == NULL)
3440 		tm_p = NULL;
3441 	else {
3442 		error = umtx_copyin_umtx_time(
3443 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3444 		if (error != 0)
3445 			return (error);
3446 		tm_p = &timeout;
3447 	}
3448 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
3449 }
3450 
3451 static int
3452 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3453 {
3454 
3455 	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3456 }
3457 
3458 static int
3459 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3460 {
3461 	struct _umtx_time *tm_p, timeout;
3462 	int error;
3463 
3464 	/* Allow a null timespec (wait forever). */
3465 	if (uap->uaddr2 == NULL)
3466 		tm_p = NULL;
3467 	else {
3468 		error = umtx_copyin_umtx_time(
3469 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3470 		if (error != 0)
3471 			return (error);
3472 		tm_p = &timeout;
3473 	}
3474 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
3475 }
3476 
3477 static int
3478 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3479 {
3480 
3481 	return (do_wake_umutex(td, uap->obj));
3482 }
3483 
3484 static int
3485 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3486 {
3487 
3488 	return (do_unlock_umutex(td, uap->obj, false));
3489 }
3490 
3491 static int
3492 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3493 {
3494 
3495 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3496 }
3497 
3498 static int
3499 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3500 {
3501 	struct timespec *ts, timeout;
3502 	int error;
3503 
3504 	/* Allow a null timespec (wait forever). */
3505 	if (uap->uaddr2 == NULL)
3506 		ts = NULL;
3507 	else {
3508 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3509 		if (error != 0)
3510 			return (error);
3511 		ts = &timeout;
3512 	}
3513 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3514 }
3515 
3516 static int
3517 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3518 {
3519 
3520 	return (do_cv_signal(td, uap->obj));
3521 }
3522 
3523 static int
3524 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3525 {
3526 
3527 	return (do_cv_broadcast(td, uap->obj));
3528 }
3529 
3530 static int
3531 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3532 {
3533 	struct _umtx_time timeout;
3534 	int error;
3535 
3536 	/* Allow a null timespec (wait forever). */
3537 	if (uap->uaddr2 == NULL) {
3538 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3539 	} else {
3540 		error = umtx_copyin_umtx_time(uap->uaddr2,
3541 		   (size_t)uap->uaddr1, &timeout);
3542 		if (error != 0)
3543 			return (error);
3544 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3545 	}
3546 	return (error);
3547 }
3548 
3549 static int
3550 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3551 {
3552 	struct _umtx_time timeout;
3553 	int error;
3554 
3555 	/* Allow a null timespec (wait forever). */
3556 	if (uap->uaddr2 == NULL) {
3557 		error = do_rw_wrlock(td, uap->obj, 0);
3558 	} else {
3559 		error = umtx_copyin_umtx_time(uap->uaddr2,
3560 		   (size_t)uap->uaddr1, &timeout);
3561 		if (error != 0)
3562 			return (error);
3563 
3564 		error = do_rw_wrlock(td, uap->obj, &timeout);
3565 	}
3566 	return (error);
3567 }
3568 
3569 static int
3570 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3571 {
3572 
3573 	return (do_rw_unlock(td, uap->obj));
3574 }
3575 
3576 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3577 static int
3578 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3579 {
3580 	struct _umtx_time *tm_p, timeout;
3581 	int error;
3582 
3583 	/* Allow a null timespec (wait forever). */
3584 	if (uap->uaddr2 == NULL)
3585 		tm_p = NULL;
3586 	else {
3587 		error = umtx_copyin_umtx_time(
3588 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3589 		if (error != 0)
3590 			return (error);
3591 		tm_p = &timeout;
3592 	}
3593 	return (do_sem_wait(td, uap->obj, tm_p));
3594 }
3595 
3596 static int
3597 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3598 {
3599 
3600 	return (do_sem_wake(td, uap->obj));
3601 }
3602 #endif
3603 
3604 static int
3605 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3606 {
3607 
3608 	return (do_wake2_umutex(td, uap->obj, uap->val));
3609 }
3610 
3611 static int
3612 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
3613 {
3614 	struct _umtx_time *tm_p, timeout;
3615 	size_t uasize;
3616 	int error;
3617 
3618 	/* Allow a null timespec (wait forever). */
3619 	if (uap->uaddr2 == NULL) {
3620 		uasize = 0;
3621 		tm_p = NULL;
3622 	} else {
3623 		uasize = (size_t)uap->uaddr1;
3624 		error = umtx_copyin_umtx_time(uap->uaddr2, uasize, &timeout);
3625 		if (error != 0)
3626 			return (error);
3627 		tm_p = &timeout;
3628 	}
3629 	error = do_sem2_wait(td, uap->obj, tm_p);
3630 	if (error == EINTR && uap->uaddr2 != NULL &&
3631 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
3632 	    uasize >= sizeof(struct _umtx_time) + sizeof(struct timespec)) {
3633 		error = copyout(&timeout._timeout,
3634 		    (struct _umtx_time *)uap->uaddr2 + 1,
3635 		    sizeof(struct timespec));
3636 		if (error == 0) {
3637 			error = EINTR;
3638 		}
3639 	}
3640 
3641 	return (error);
3642 }
3643 
3644 static int
3645 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
3646 {
3647 
3648 	return (do_sem2_wake(td, uap->obj));
3649 }
3650 
3651 #define	USHM_OBJ_UMTX(o)						\
3652     ((struct umtx_shm_obj_list *)(&(o)->umtx_data))
3653 
3654 #define	USHMF_REG_LINKED	0x0001
3655 #define	USHMF_OBJ_LINKED	0x0002
3656 struct umtx_shm_reg {
3657 	TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link;
3658 	LIST_ENTRY(umtx_shm_reg) ushm_obj_link;
3659 	struct umtx_key		ushm_key;
3660 	struct ucred		*ushm_cred;
3661 	struct shmfd		*ushm_obj;
3662 	u_int			ushm_refcnt;
3663 	u_int			ushm_flags;
3664 };
3665 
3666 LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg);
3667 TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg);
3668 
3669 static uma_zone_t umtx_shm_reg_zone;
3670 static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS];
3671 static struct mtx umtx_shm_lock;
3672 static struct umtx_shm_reg_head umtx_shm_reg_delfree =
3673     TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree);
3674 
3675 static void umtx_shm_free_reg(struct umtx_shm_reg *reg);
3676 
3677 static void
3678 umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused)
3679 {
3680 	struct umtx_shm_reg_head d;
3681 	struct umtx_shm_reg *reg, *reg1;
3682 
3683 	TAILQ_INIT(&d);
3684 	mtx_lock(&umtx_shm_lock);
3685 	TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link);
3686 	mtx_unlock(&umtx_shm_lock);
3687 	TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) {
3688 		TAILQ_REMOVE(&d, reg, ushm_reg_link);
3689 		umtx_shm_free_reg(reg);
3690 	}
3691 }
3692 
3693 static struct task umtx_shm_reg_delfree_task =
3694     TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL);
3695 
3696 static struct umtx_shm_reg *
3697 umtx_shm_find_reg_locked(const struct umtx_key *key)
3698 {
3699 	struct umtx_shm_reg *reg;
3700 	struct umtx_shm_reg_head *reg_head;
3701 
3702 	KASSERT(key->shared, ("umtx_shm_find_reg_locked: private key"));
3703 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3704 	reg_head = &umtx_shm_registry[key->hash];
3705 	TAILQ_FOREACH(reg, reg_head, ushm_reg_link) {
3706 		KASSERT(reg->ushm_key.shared,
3707 		    ("non-shared key on reg %p %d", reg, reg->ushm_key.shared));
3708 		if (reg->ushm_key.info.shared.object ==
3709 		    key->info.shared.object &&
3710 		    reg->ushm_key.info.shared.offset ==
3711 		    key->info.shared.offset) {
3712 			KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM"));
3713 			KASSERT(reg->ushm_refcnt > 0,
3714 			    ("reg %p refcnt 0 onlist", reg));
3715 			KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0,
3716 			    ("reg %p not linked", reg));
3717 			reg->ushm_refcnt++;
3718 			return (reg);
3719 		}
3720 	}
3721 	return (NULL);
3722 }
3723 
3724 static struct umtx_shm_reg *
3725 umtx_shm_find_reg(const struct umtx_key *key)
3726 {
3727 	struct umtx_shm_reg *reg;
3728 
3729 	mtx_lock(&umtx_shm_lock);
3730 	reg = umtx_shm_find_reg_locked(key);
3731 	mtx_unlock(&umtx_shm_lock);
3732 	return (reg);
3733 }
3734 
3735 static void
3736 umtx_shm_free_reg(struct umtx_shm_reg *reg)
3737 {
3738 
3739 	chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0);
3740 	crfree(reg->ushm_cred);
3741 	shm_drop(reg->ushm_obj);
3742 	uma_zfree(umtx_shm_reg_zone, reg);
3743 }
3744 
3745 static bool
3746 umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force)
3747 {
3748 	bool res;
3749 
3750 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3751 	KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg));
3752 	reg->ushm_refcnt--;
3753 	res = reg->ushm_refcnt == 0;
3754 	if (res || force) {
3755 		if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) {
3756 			TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash],
3757 			    reg, ushm_reg_link);
3758 			reg->ushm_flags &= ~USHMF_REG_LINKED;
3759 		}
3760 		if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) {
3761 			LIST_REMOVE(reg, ushm_obj_link);
3762 			reg->ushm_flags &= ~USHMF_OBJ_LINKED;
3763 		}
3764 	}
3765 	return (res);
3766 }
3767 
3768 static void
3769 umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force)
3770 {
3771 	vm_object_t object;
3772 	bool dofree;
3773 
3774 	if (force) {
3775 		object = reg->ushm_obj->shm_object;
3776 		VM_OBJECT_WLOCK(object);
3777 		object->flags |= OBJ_UMTXDEAD;
3778 		VM_OBJECT_WUNLOCK(object);
3779 	}
3780 	mtx_lock(&umtx_shm_lock);
3781 	dofree = umtx_shm_unref_reg_locked(reg, force);
3782 	mtx_unlock(&umtx_shm_lock);
3783 	if (dofree)
3784 		umtx_shm_free_reg(reg);
3785 }
3786 
3787 void
3788 umtx_shm_object_init(vm_object_t object)
3789 {
3790 
3791 	LIST_INIT(USHM_OBJ_UMTX(object));
3792 }
3793 
3794 void
3795 umtx_shm_object_terminated(vm_object_t object)
3796 {
3797 	struct umtx_shm_reg *reg, *reg1;
3798 	bool dofree;
3799 
3800 	if (LIST_EMPTY(USHM_OBJ_UMTX(object)))
3801 		return;
3802 
3803 	dofree = false;
3804 	mtx_lock(&umtx_shm_lock);
3805 	LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) {
3806 		if (umtx_shm_unref_reg_locked(reg, true)) {
3807 			TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg,
3808 			    ushm_reg_link);
3809 			dofree = true;
3810 		}
3811 	}
3812 	mtx_unlock(&umtx_shm_lock);
3813 	if (dofree)
3814 		taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task);
3815 }
3816 
3817 static int
3818 umtx_shm_create_reg(struct thread *td, const struct umtx_key *key,
3819     struct umtx_shm_reg **res)
3820 {
3821 	struct umtx_shm_reg *reg, *reg1;
3822 	struct ucred *cred;
3823 	int error;
3824 
3825 	reg = umtx_shm_find_reg(key);
3826 	if (reg != NULL) {
3827 		*res = reg;
3828 		return (0);
3829 	}
3830 	cred = td->td_ucred;
3831 	if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP)))
3832 		return (ENOMEM);
3833 	reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
3834 	reg->ushm_refcnt = 1;
3835 	bcopy(key, &reg->ushm_key, sizeof(*key));
3836 	reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
3837 	reg->ushm_cred = crhold(cred);
3838 	error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
3839 	if (error != 0) {
3840 		umtx_shm_free_reg(reg);
3841 		return (error);
3842 	}
3843 	mtx_lock(&umtx_shm_lock);
3844 	reg1 = umtx_shm_find_reg_locked(key);
3845 	if (reg1 != NULL) {
3846 		mtx_unlock(&umtx_shm_lock);
3847 		umtx_shm_free_reg(reg);
3848 		*res = reg1;
3849 		return (0);
3850 	}
3851 	reg->ushm_refcnt++;
3852 	TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link);
3853 	LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg,
3854 	    ushm_obj_link);
3855 	reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED;
3856 	mtx_unlock(&umtx_shm_lock);
3857 	*res = reg;
3858 	return (0);
3859 }
3860 
3861 static int
3862 umtx_shm_alive(struct thread *td, void *addr)
3863 {
3864 	vm_map_t map;
3865 	vm_map_entry_t entry;
3866 	vm_object_t object;
3867 	vm_pindex_t pindex;
3868 	vm_prot_t prot;
3869 	int res, ret;
3870 	boolean_t wired;
3871 
3872 	map = &td->td_proc->p_vmspace->vm_map;
3873 	res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry,
3874 	    &object, &pindex, &prot, &wired);
3875 	if (res != KERN_SUCCESS)
3876 		return (EFAULT);
3877 	if (object == NULL)
3878 		ret = EINVAL;
3879 	else
3880 		ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0;
3881 	vm_map_lookup_done(map, entry);
3882 	return (ret);
3883 }
3884 
3885 static void
3886 umtx_shm_init(void)
3887 {
3888 	int i;
3889 
3890 	umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg),
3891 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
3892 	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
3893 	for (i = 0; i < nitems(umtx_shm_registry); i++)
3894 		TAILQ_INIT(&umtx_shm_registry[i]);
3895 }
3896 
3897 static int
3898 umtx_shm(struct thread *td, void *addr, u_int flags)
3899 {
3900 	struct umtx_key key;
3901 	struct umtx_shm_reg *reg;
3902 	struct file *fp;
3903 	int error, fd;
3904 
3905 	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
3906 	    UMTX_SHM_DESTROY | UMTX_SHM_ALIVE)) != 1)
3907 		return (EINVAL);
3908 	if ((flags & UMTX_SHM_ALIVE) != 0)
3909 		return (umtx_shm_alive(td, addr));
3910 	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
3911 	if (error != 0)
3912 		return (error);
3913 	KASSERT(key.shared == 1, ("non-shared key"));
3914 	if ((flags & UMTX_SHM_CREAT) != 0) {
3915 		error = umtx_shm_create_reg(td, &key, &reg);
3916 	} else {
3917 		reg = umtx_shm_find_reg(&key);
3918 		if (reg == NULL)
3919 			error = ESRCH;
3920 	}
3921 	umtx_key_release(&key);
3922 	if (error != 0)
3923 		return (error);
3924 	KASSERT(reg != NULL, ("no reg"));
3925 	if ((flags & UMTX_SHM_DESTROY) != 0) {
3926 		umtx_shm_unref_reg(reg, true);
3927 	} else {
3928 #if 0
3929 #ifdef MAC
3930 		error = mac_posixshm_check_open(td->td_ucred,
3931 		    reg->ushm_obj, FFLAGS(O_RDWR));
3932 		if (error == 0)
3933 #endif
3934 			error = shm_access(reg->ushm_obj, td->td_ucred,
3935 			    FFLAGS(O_RDWR));
3936 		if (error == 0)
3937 #endif
3938 			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
3939 		if (error == 0) {
3940 			shm_hold(reg->ushm_obj);
3941 			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
3942 			    &shm_ops);
3943 			td->td_retval[0] = fd;
3944 			fdrop(fp, td);
3945 		}
3946 	}
3947 	umtx_shm_unref_reg(reg, false);
3948 	return (error);
3949 }
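
/*
 * Hedged usage sketch, not part of the original file: UMTX_SHM_CREAT
 * (and _LOOKUP) return in td_retval[0] a descriptor for the page-sized
 * shared object registered for the key at "addr"; mapping it gives
 * cooperating processes a stable home for a process-shared lock word.
 * The wrapper name is made up.
 */
#if 0
#include <sys/param.h>
#include <sys/mman.h>
#include <sys/umtx.h>

static void *
example_umtx_shm_map(void *key_addr)
{
	int fd;

	fd = _umtx_op(NULL, UMTX_OP_SHM, UMTX_SHM_CREAT, key_addr, NULL);
	if (fd == -1)
		return (MAP_FAILED);
	return (mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
	    fd, 0));
}
#endif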
3950 
3951 static int
3952 __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap)
3953 {
3954 
3955 	return (umtx_shm(td, uap->uaddr1, uap->val));
3956 }
3957 
3958 static int
3959 umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp)
3960 {
3961 
3962 	td->td_rb_list = rbp->robust_list_offset;
3963 	td->td_rbp_list = rbp->robust_priv_list_offset;
3964 	td->td_rb_inact = rbp->robust_inact_offset;
3965 	return (0);
3966 }
3967 
3968 static int
3969 __umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap)
3970 {
3971 	struct umtx_robust_lists_params rb;
3972 	int error;
3973 
3974 	if (uap->val > sizeof(rb))
3975 		return (EINVAL);
3976 	bzero(&rb, sizeof(rb));
3977 	error = copyin(uap->uaddr1, &rb, uap->val);
3978 	if (error != 0)
3979 		return (error);
3980 	return (umtx_robust_lists(td, &rb));
3981 }
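
/*
 * Hedged usage sketch, not original: a threading library registers its
 * per-thread robust mutex list heads once through
 * UMTX_OP_ROBUST_LISTS, passing the structure size in "val" so the
 * structure can grow compatibly.  The wrapper name is made up.
 */
#if 0
#include <sys/umtx.h>

static int
example_register_robust(uintptr_t list, uintptr_t priv_list,
    uintptr_t inact)
{
	struct umtx_robust_lists_params rb;

	rb.robust_list_offset = list;
	rb.robust_priv_list_offset = priv_list;
	rb.robust_inact_offset = inact;
	return (_umtx_op(NULL, UMTX_OP_ROBUST_LISTS, sizeof(rb), &rb, NULL));
}
#endif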
3982 
3983 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3984 
3985 static const _umtx_op_func op_table[] = {
3986 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
3987 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
3988 	[UMTX_OP_WAIT]		= __umtx_op_wait,
3989 	[UMTX_OP_WAKE]		= __umtx_op_wake,
3990 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
3991 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
3992 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
3993 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
3994 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
3995 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
3996 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
3997 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
3998 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
3999 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
4000 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4001 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
4002 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4003 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
4004 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4005 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4006 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
4007 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4008 #else
4009 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4010 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4011 #endif
4012 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
4013 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4014 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
4015 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4016 	[UMTX_OP_SHM]		= __umtx_op_shm,
4017 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists,
4018 };
4019 
4020 int
4021 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
4022 {
4023 
4024 	if ((unsigned)uap->op < nitems(op_table))
4025 		return (*op_table[uap->op])(td, uap);
4026 	return (EINVAL);
4027 }
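
/*
 * Hedged usage sketch, not part of the original file: the classic
 * futex-style pairing of UMTX_OP_WAIT_UINT_PRIVATE and
 * UMTX_OP_WAKE_PRIVATE, vectored through op_table above.  Function
 * names are hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>
#include <limits.h>

static void
example_wait_for_flag(volatile u_int *flagp)
{
	/* Sleep only while the flag still reads as zero. */
	while (atomic_load_acq_32(flagp) == 0)
		(void)_umtx_op(__DEVOLATILE(void *, flagp),
		    UMTX_OP_WAIT_UINT_PRIVATE, 0, NULL, NULL);
}

static void
example_set_flag(volatile u_int *flagp)
{
	atomic_store_rel_32(flagp, 1);
	(void)_umtx_op(__DEVOLATILE(void *, flagp),
	    UMTX_OP_WAKE_PRIVATE, INT_MAX, NULL, NULL);
}
#endif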
4028 
4029 #ifdef COMPAT_FREEBSD32
4030 
4031 struct timespec32 {
4032 	int32_t tv_sec;
4033 	int32_t tv_nsec;
4034 };
4035 
4036 struct umtx_time32 {
4037 	struct	timespec32	timeout;
4038 	uint32_t		flags;
4039 	uint32_t		clockid;
4040 };
4041 
4042 static inline int
4043 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
4044 {
4045 	struct timespec32 ts32;
4046 	int error;
4047 
4048 	error = copyin(addr, &ts32, sizeof(struct timespec32));
4049 	if (error == 0) {
4050 		if (ts32.tv_sec < 0 ||
4051 		    ts32.tv_nsec >= 1000000000 ||
4052 		    ts32.tv_nsec < 0)
4053 			error = EINVAL;
4054 		else {
4055 			tsp->tv_sec = ts32.tv_sec;
4056 			tsp->tv_nsec = ts32.tv_nsec;
4057 		}
4058 	}
4059 	return (error);
4060 }
4061 
4062 static inline int
4063 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
4064 {
4065 	struct umtx_time32 t32;
4066 	int error;
4067 
4068 	t32.clockid = CLOCK_REALTIME;
4069 	t32.flags   = 0;
4070 	if (size <= sizeof(struct timespec32))
4071 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
4072 	else
4073 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
4074 	if (error != 0)
4075 		return (error);
4076 	if (t32.timeout.tv_sec < 0 ||
4077 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
4078 		return (EINVAL);
4079 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
4080 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
4081 	tp->_flags = t32.flags;
4082 	tp->_clockid = t32.clockid;
4083 	return (0);
4084 }
4085 
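/*
 * Sketch of the two timeout encodings accepted above, as a 32-bit
 * caller would produce them.  The buffer size travels in the uaddr1
 * slot of _umtx_op() and selects the format: a bare timespec32 means a
 * relative CLOCK_REALTIME timeout, while a full umtx_time32 carries
 * explicit flags (e.g. UMTX_ABSTIME) and a clock id.
 *
 *	struct umtx_time32 t32;
 *	t32.timeout.tv_sec = 1;
 *	t32.timeout.tv_nsec = 0;
 *	t32.flags = 0;			// relative timeout
 *	t32.clockid = CLOCK_MONOTONIC;
 *	_umtx_op(&word, UMTX_OP_WAIT_UINT_PRIVATE, expected,
 *	    (void *)sizeof(t32), &t32);
 */
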
4086 static int
4087 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4088 {
4089 	struct _umtx_time *tm_p, timeout;
4090 	int error;
4091 
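	/* Allow a null timespec (wait forever). */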
4092 	if (uap->uaddr2 == NULL)
4093 		tm_p = NULL;
4094 	else {
4095 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4096 			(size_t)uap->uaddr1, &timeout);
4097 		if (error != 0)
4098 			return (error);
4099 		tm_p = &timeout;
4100 	}
4101 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
4102 }
4103 
4104 static int
4105 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
4106 {
4107 	struct _umtx_time *tm_p, timeout;
4108 	int error;
4109 
4110 	/* Allow a null timespec (wait forever). */
4111 	if (uap->uaddr2 == NULL)
4112 		tm_p = NULL;
4113 	else {
4114 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4115 			    (size_t)uap->uaddr1, &timeout);
4116 		if (error != 0)
4117 			return (error);
4118 		tm_p = &timeout;
4119 	}
4120 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
4121 }
4122 
4123 static int
4124 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
4125 {
4126 	struct _umtx_time *tm_p, timeout;
4127 	int error;
4128 
4129 	/* Allow a null timespec (wait forever). */
4130 	if (uap->uaddr2 == NULL)
4131 		tm_p = NULL;
4132 	else {
4133 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4134 		    (size_t)uap->uaddr1, &timeout);
4135 		if (error != 0)
4136 			return (error);
4137 		tm_p = &timeout;
4138 	}
4139 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
4140 }
4141 
4142 static int
4143 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4144 {
4145 	struct timespec *ts, timeout;
4146 	int error;
4147 
4148 	/* Allow a null timespec (wait forever). */
4149 	if (uap->uaddr2 == NULL)
4150 		ts = NULL;
4151 	else {
4152 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
4153 		if (error != 0)
4154 			return (error);
4155 		ts = &timeout;
4156 	}
4157 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
4158 }
4159 
4160 static int
4161 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
4162 {
4163 	struct _umtx_time timeout;
4164 	int error;
4165 
4166 	/* Allow a null timespec (wait forever). */
4167 	if (uap->uaddr2 == NULL) {
4168 		error = do_rw_rdlock(td, uap->obj, uap->val, NULL);
4169 	} else {
4170 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4171 		    (size_t)uap->uaddr1, &timeout);
4172 		if (error != 0)
4173 			return (error);
4174 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
4175 	}
4176 	return (error);
4177 }
4178 
4179 static int
4180 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
4181 {
4182 	struct _umtx_time timeout;
4183 	int error;
4184 
4185 	/* Allow a null timespec (wait forever). */
4186 	if (uap->uaddr2 == NULL) {
4187 		error = do_rw_wrlock(td, uap->obj, NULL);
4188 	} else {
4189 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4190 		    (size_t)uap->uaddr1, &timeout);
4191 		if (error != 0)
4192 			return (error);
4193 		error = do_rw_wrlock(td, uap->obj, &timeout);
4194 	}
4195 	return (error);
4196 }
4197 
4198 static int
4199 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
4200 {
4201 	struct _umtx_time *tm_p, timeout;
4202 	int error;
4203 
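	/* Allow a null timespec (wait forever). */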
4204 	if (uap->uaddr2 == NULL)
4205 		tm_p = NULL;
4206 	else {
4207 		error = umtx_copyin_umtx_time32(
4208 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
4209 		if (error != 0)
4210 			return (error);
4211 		tm_p = &timeout;
4212 	}
4213 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
4214 }
4215 
4216 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4217 static int
4218 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4219 {
4220 	struct _umtx_time *tm_p, timeout;
4221 	int error;
4222 
4223 	/* Allow a null timespec (wait forever). */
4224 	if (uap->uaddr2 == NULL)
4225 		tm_p = NULL;
4226 	else {
4227 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4228 		    (size_t)uap->uaddr1, &timeout);
4229 		if (error != 0)
4230 			return (error);
4231 		tm_p = &timeout;
4232 	}
4233 	return (do_sem_wait(td, uap->obj, tm_p));
4234 }
4235 #endif
4236 
4237 static int
4238 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4239 {
4240 	struct _umtx_time *tm_p, timeout;
4241 	size_t uasize;
4242 	int error;
4243 
4244 	/* Allow a null timespec (wait forever). */
4245 	if (uap->uaddr2 == NULL) {
4246 		uasize = 0;
4247 		tm_p = NULL;
4248 	} else {
4249 		uasize = (size_t)uap->uaddr1;
4250 		error = umtx_copyin_umtx_time32(uap->uaddr2, uasize, &timeout);
4251 		if (error != 0)
4252 			return (error);
4253 		tm_p = &timeout;
4254 	}
4255 	error = do_sem2_wait(td, uap->obj, tm_p);
4256 	if (error == EINTR && uap->uaddr2 != NULL &&
4257 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
4258 	    uasize >= sizeof(struct umtx_time32) + sizeof(struct timespec32)) {
4259 		struct timespec32 remain32 = {
4260 			.tv_sec = timeout._timeout.tv_sec,
4261 			.tv_nsec = timeout._timeout.tv_nsec
4262 		};
4263 		error = copyout(&remain32,
4264 		    (struct umtx_time32 *)uap->uaddr2 + 1,
4265 		    sizeof(struct timespec32));
4266 		if (error == 0) {
4267 			error = EINTR;
4268 		}
4269 	}
4270 
4271 	return (error);
4272 }
4273 
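/*
 * Buffer layout assumed by the copyout below (sketch): when the
 * caller's buffer is big enough, the time remaining on an interrupted
 * relative wait is written immediately after the input umtx_time32,
 * i.e. the user-visible buffer is effectively:
 *
 *	struct {
 *		struct umtx_time32 timeout;	// input
 *		struct timespec32 remain;	// output, valid on EINTR
 *	};
 *
 * The wait still reports EINTR; only a failed copyout replaces it.
 */
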
4274 static int
4275 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
4276 {
4277 	uint32_t uaddrs[BATCH_SIZE], *upp;
4278 	int count, error, i, pos, tocopy;
4279 
4280 	upp = (uint32_t *)uap->obj;
4281 	error = 0;
4282 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
4283 	    pos += tocopy) {
4284 		tocopy = MIN(count, BATCH_SIZE);
4285 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
4286 		if (error != 0)
4287 			break;
4288 		for (i = 0; i < tocopy; ++i)
4289 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
4290 			    INT_MAX, 1);
4291 		maybe_yield();
4292 	}
4293 	return (error);
4294 }
4295 
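/*
 * Usage sketch (32-bit userland, illustrative only): "obj" carries a
 * packed array of 4-byte wait-word addresses, hence the uint32_t
 * stride in the loop above, and "val" is the element count; every
 * waiter on each address is woken, BATCH_SIZE addresses per copyin().
 *
 *	u_int *addrs[2] = { &w0, &w1 };
 *	_umtx_op(addrs, UMTX_OP_NWAKE_PRIVATE, 2, NULL, NULL);
 */
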
4296 struct umtx_robust_lists_params_compat32 {
4297 	uint32_t	robust_list_offset;
4298 	uint32_t	robust_priv_list_offset;
4299 	uint32_t	robust_inact_offset;
4300 };
4301 
4302 static int
4303 __umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap)
4304 {
4305 	struct umtx_robust_lists_params rb;
4306 	struct umtx_robust_lists_params_compat32 rb32;
4307 	int error;
4308 
4309 	if (uap->val > sizeof(rb32))
4310 		return (EINVAL);
4311 	bzero(&rb, sizeof(rb));
4312 	bzero(&rb32, sizeof(rb32));
4313 	error = copyin(uap->uaddr1, &rb32, uap->val);
4314 	if (error != 0)
4315 		return (error);
4316 	rb.robust_list_offset = rb32.robust_list_offset;
4317 	rb.robust_priv_list_offset = rb32.robust_priv_list_offset;
4318 	rb.robust_inact_offset = rb32.robust_inact_offset;
4319 	return (umtx_robust_lists(td, &rb));
4320 }
4321 
4322 static const _umtx_op_func op_table_compat32[] = {
4323 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
4324 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
4325 	[UMTX_OP_WAIT]		= __umtx_op_wait_compat32,
4326 	[UMTX_OP_WAKE]		= __umtx_op_wake,
4327 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
4328 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex_compat32,
4329 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
4330 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
4331 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait_compat32,
4332 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
4333 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
4334 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_compat32,
4335 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock_compat32,
4336 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock_compat32,
4337 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4338 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
4339 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4340 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex_compat32,
4341 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4342 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4343 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait_compat32,
4344 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4345 #else
4346 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4347 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4348 #endif
4349 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private32,
4350 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4351 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait_compat32,
4352 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4353 	[UMTX_OP_SHM]		= __umtx_op_shm,
4354 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists_compat32,
4355 };
4356 
4357 int
4358 freebsd32__umtx_op(struct thread *td, struct freebsd32__umtx_op_args *uap)
4359 {
4360 
4361 	if ((unsigned)uap->op < nitems(op_table_compat32)) {
4362 		return (*op_table_compat32[uap->op])(td,
4363 		    (struct _umtx_op_args *)uap);
4364 	}
4365 	return (EINVAL);
4366 }
4367 #endif
4368 
4369 void
4370 umtx_thread_init(struct thread *td)
4371 {
4372 
4373 	td->td_umtxq = umtxq_alloc();
4374 	td->td_umtxq->uq_thread = td;
4375 }
4376 
4377 void
4378 umtx_thread_fini(struct thread *td)
4379 {
4380 
4381 	umtxq_free(td->td_umtxq);
4382 }
4383 
4384 /*
4385  * Called when a new thread is created, e.g. during fork().
4386  */
4387 void
4388 umtx_thread_alloc(struct thread *td)
4389 {
4390 	struct umtx_q *uq;
4391 
4392 	uq = td->td_umtxq;
4393 	uq->uq_inherited_pri = PRI_MAX;
4394 
4395 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
4396 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
4397 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
4398 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
4399 }
4400 
4401 /*
4402  * exec() hook.
4403  *
4404  * Clear robust lists for all process' threads, not delaying the
4405  * cleanup to thread_exit hook, since the relevant address space is
4406  * destroyed right now.
4407  */
4408 static void
4409 umtx_exec_hook(void *arg __unused, struct proc *p,
4410     struct image_params *imgp __unused)
4411 {
4412 	struct thread *td;
4413 
4414 	KASSERT(p == curproc, ("need curproc"));
4415 	KASSERT((p->p_flag & P_HADTHREADS) == 0 ||
4416 	    (p->p_flag & P_STOPPED_SINGLE) != 0,
4417 	    ("curproc must be single-threaded"));
4418 	/*
4419 	 * There is no need to lock the list as only this thread can be
4420 	 * running.
4421 	 */
4422 	FOREACH_THREAD_IN_PROC(p, td) {
4423 		KASSERT(td == curthread ||
4424 		    ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)),
4425 		    ("running thread %p %p", p, td));
4426 		umtx_thread_cleanup(td);
4427 		td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0;
4428 	}
4429 }
4430 
4431 /*
4432  * thread_exit() hook.
4433  */
4434 void
4435 umtx_thread_exit(struct thread *td)
4436 {
4437 
4438 	umtx_thread_cleanup(td);
4439 }
4440 
4441 static int
4442 umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res)
4443 {
4444 	u_long res1;
4445 #ifdef COMPAT_FREEBSD32
4446 	uint32_t res32;
4447 #endif
4448 	int error;
4449 
4450 #ifdef COMPAT_FREEBSD32
4451 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
4452 		error = fueword32((void *)ptr, &res32);
4453 		if (error == 0)
4454 			res1 = res32;
4455 	} else
4456 #endif
4457 	{
4458 		error = fueword((void *)ptr, &res1);
4459 	}
4460 	if (error == 0)
4461 		*res = res1;
4462 	else
4463 		error = EFAULT;
4464 	return (error);
4465 }
4466 
4467 static void
4468 umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list)
4469 {
4470 #ifdef COMPAT_FREEBSD32
4471 	struct umutex32 m32;
4472 
4473 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
4474 		memcpy(&m32, m, sizeof(m32));
4475 		*rb_list = m32.m_rb_lnk;
4476 	} else
4477 #endif
4478 		*rb_list = m->m_rb_lnk;
4479 }
4480 
4481 static int
4482 umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact)
4483 {
4484 	struct umutex m;
4485 	int error;
4486 
4487 	KASSERT(td->td_proc == curproc, ("need current vmspace"));
4488 	error = copyin((void *)rbp, &m, sizeof(m));
4489 	if (error != 0)
4490 		return (error);
4491 	if (rb_list != NULL)
4492 		umtx_read_rb_list(td, &m, rb_list);
4493 	if ((m.m_flags & UMUTEX_ROBUST) == 0)
4494 		return (EINVAL);
4495 	if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid)
4496 		/* inact is cleared after unlock, allow the inconsistency */
4497 		return (inact ? 0 : EINVAL);
4498 	return (do_unlock_umutex(td, (struct umutex *)rbp, true));
4499 }
4500 
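/*
 * Shape of the user memory walked below (sketch): the head word
 * registered via UMTX_OP_ROBUST_LISTS points at the first owned robust
 * umutex, and each umutex points at the next through its m_rb_lnk
 * member, terminated by zero:
 *
 *	head word -> umutex A.m_rb_lnk -> umutex B.m_rb_lnk -> 0
 *
 * The list lives entirely in the exiting thread's address space, hence
 * the copyin()-based traversal and the umtx_max_rb cap on its length.
 */
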
4501 static void
4502 umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact,
4503     const char *name)
4504 {
4505 	int error, i;
4506 	uintptr_t rbp;
4507 	bool inact;
4508 
4509 	if (rb_list == 0)
4510 		return;
4511 	error = umtx_read_uptr(td, rb_list, &rbp);
4512 	for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) {
4513 		if (rbp == *rb_inact) {
4514 			inact = true;
4515 			*rb_inact = 0;
4516 		} else
4517 			inact = false;
4518 		error = umtx_handle_rb(td, rbp, &rbp, inact);
4519 	}
4520 	if (i == umtx_max_rb && umtx_verbose_rb) {
4521 		uprintf("comm %s pid %d: reached umtx %smax rb %d\n",
4522 		    td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb);
4523 	}
4524 	if (error != 0 && umtx_verbose_rb) {
4525 		uprintf("comm %s pid %d: handling %srb error %d\n",
4526 		    td->td_proc->p_comm, td->td_proc->p_pid, name, error);
4527 	}
4528 }
4529 
4530 /*
4531  * Clean up umtx data.
4532  */
4533 static void
4534 umtx_thread_cleanup(struct thread *td)
4535 {
4536 	struct umtx_q *uq;
4537 	struct umtx_pi *pi;
4538 	uintptr_t rb_inact;
4539 
4540 	/*
4541 	 * Disown pi mutexes.
4542 	 */
4543 	uq = td->td_umtxq;
4544 	if (uq != NULL) {
4545 		if (uq->uq_inherited_pri != PRI_MAX ||
4546 		    !TAILQ_EMPTY(&uq->uq_pi_contested)) {
4547 			mtx_lock(&umtx_lock);
4548 			uq->uq_inherited_pri = PRI_MAX;
4549 			while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
4550 				pi->pi_owner = NULL;
4551 				TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
4552 			}
4553 			mtx_unlock(&umtx_lock);
4554 		}
4555 		sched_lend_user_prio_cond(td, PRI_MAX);
4556 	}
4557 
4558 	if (td->td_rb_inact == 0 && td->td_rb_list == 0 && td->td_rbp_list == 0)
4559 		return;
4560 
4561 	/*
4562 	 * Handle terminated robust mutexes.  Must be done after
4563 	 * robust pi disown, otherwise unlock could see unowned
4564 	 * entries.
4565 	 */
4566 	rb_inact = td->td_rb_inact;
4567 	if (rb_inact != 0)
4568 		(void)umtx_read_uptr(td, rb_inact, &rb_inact);
4569 	umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "");
4570 	umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ");
4571 	if (rb_inact != 0)
4572 		(void)umtx_handle_rb(td, rb_inact, NULL, true);
4573 }
4574