xref: /freebsd/sys/kern/kern_umtx.c (revision 8df8b2d3e51d1b816201d8a1fe8bc29fe192e562)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2015, 2016 The FreeBSD Foundation
5  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
6  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
7  * All rights reserved.
8  *
9  * Portions of this software were developed by Konstantin Belousov
10  * under sponsorship from the FreeBSD Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice unmodified, this list of conditions, and the following
17  *    disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include "opt_umtx_profiling.h"
38 
39 #include <sys/param.h>
40 #include <sys/kernel.h>
41 #include <sys/fcntl.h>
42 #include <sys/file.h>
43 #include <sys/filedesc.h>
44 #include <sys/limits.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mman.h>
48 #include <sys/mutex.h>
49 #include <sys/priv.h>
50 #include <sys/proc.h>
51 #include <sys/resource.h>
52 #include <sys/resourcevar.h>
53 #include <sys/rwlock.h>
54 #include <sys/sbuf.h>
55 #include <sys/sched.h>
56 #include <sys/smp.h>
57 #include <sys/sysctl.h>
58 #include <sys/sysent.h>
59 #include <sys/systm.h>
60 #include <sys/sysproto.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/taskqueue.h>
63 #include <sys/time.h>
64 #include <sys/eventhandler.h>
65 #include <sys/umtx.h>
66 
67 #include <security/mac/mac_framework.h>
68 
69 #include <vm/vm.h>
70 #include <vm/vm_param.h>
71 #include <vm/pmap.h>
72 #include <vm/vm_map.h>
73 #include <vm/vm_object.h>
74 
75 #include <machine/atomic.h>
76 #include <machine/cpu.h>
77 
78 #ifdef COMPAT_FREEBSD32
79 #include <compat/freebsd32/freebsd32_proto.h>
80 #endif
81 
82 #define _UMUTEX_TRY		1
83 #define _UMUTEX_WAIT		2
84 
85 #ifdef UMTX_PROFILING
86 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
87 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
88 #endif
89 
90 /* Priority inheritance mutex info. */
91 struct umtx_pi {
92 	/* Owner thread */
93 	struct thread		*pi_owner;
94 
95 	/* Reference count */
96 	int			pi_refcount;
97 
98  	/* List entry to link umtx holding by thread */
99 	TAILQ_ENTRY(umtx_pi)	pi_link;
100 
101 	/* List entry in hash */
102 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
103 
104 	/* List for waiters */
105 	TAILQ_HEAD(,umtx_q)	pi_blocked;
106 
107 	/* Identify a userland lock object */
108 	struct umtx_key		pi_key;
109 };
110 
111 /* A userland synchronous object user. */
112 struct umtx_q {
113 	/* Linked list for the hash. */
114 	TAILQ_ENTRY(umtx_q)	uq_link;
115 
116 	/* Umtx key. */
117 	struct umtx_key		uq_key;
118 
119 	/* Umtx flags. */
120 	int			uq_flags;
121 #define UQF_UMTXQ	0x0001
122 
123 	/* The thread waits on. */
124 	struct thread		*uq_thread;
125 
126 	/*
127 	 * Blocked on PI mutex. read can use chain lock
128 	 * or umtx_lock, write must have both chain lock and
129 	 * umtx_lock being hold.
130 	 */
131 	struct umtx_pi		*uq_pi_blocked;
132 
133 	/* On blocked list */
134 	TAILQ_ENTRY(umtx_q)	uq_lockq;
135 
136 	/* Thread contending with us */
137 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
138 
139 	/* Inherited priority from PP mutex */
140 	u_char			uq_inherited_pri;
141 
142 	/* Spare queue ready to be reused */
143 	struct umtxq_queue	*uq_spare_queue;
144 
145 	/* The queue we on */
146 	struct umtxq_queue	*uq_cur_queue;
147 };
148 
149 TAILQ_HEAD(umtxq_head, umtx_q);
150 
151 /* Per-key wait-queue */
152 struct umtxq_queue {
153 	struct umtxq_head	head;
154 	struct umtx_key		key;
155 	LIST_ENTRY(umtxq_queue)	link;
156 	int			length;
157 };
158 
159 LIST_HEAD(umtxq_list, umtxq_queue);
160 
161 /* Userland lock object's wait-queue chain */
162 struct umtxq_chain {
163 	/* Lock for this chain. */
164 	struct mtx		uc_lock;
165 
166 	/* List of sleep queues. */
167 	struct umtxq_list	uc_queue[2];
168 #define UMTX_SHARED_QUEUE	0
169 #define UMTX_EXCLUSIVE_QUEUE	1
170 
171 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
172 
173 	/* Busy flag */
174 	char			uc_busy;
175 
176 	/* Chain lock waiters */
177 	int			uc_waiters;
178 
179 	/* All PI in the list */
180 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
181 
182 #ifdef UMTX_PROFILING
183 	u_int 			length;
184 	u_int			max_length;
185 #endif
186 };
187 
188 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
189 
190 /*
191  * Don't propagate time-sharing priority, there is a security reason,
192  * a user can simply introduce PI-mutex, let thread A lock the mutex,
193  * and let another thread B block on the mutex, because B is
194  * sleeping, its priority will be boosted, this causes A's priority to
195  * be boosted via priority propagating too and will never be lowered even
196  * if it is using 100%CPU, this is unfair to other processes.
197  */
198 
199 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
200 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
201 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
202 
203 #define	GOLDEN_RATIO_PRIME	2654404609U
204 #ifndef	UMTX_CHAINS
205 #define	UMTX_CHAINS		512
206 #endif
207 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
208 
209 #define	GET_SHARE(flags)	\
210     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
211 
212 #define BUSY_SPINS		200
213 
214 struct abs_timeout {
215 	int clockid;
216 	bool is_abs_real;	/* TIMER_ABSTIME && CLOCK_REALTIME* */
217 	struct timespec cur;
218 	struct timespec end;
219 };
220 
221 #ifdef COMPAT_FREEBSD32
222 struct umutex32 {
223 	volatile __lwpid_t	m_owner;	/* Owner of the mutex */
224 	__uint32_t		m_flags;	/* Flags of the mutex */
225 	__uint32_t		m_ceilings[2];	/* Priority protect ceiling */
226 	__uint32_t		m_rb_lnk;	/* Robust linkage */
227 	__uint32_t		m_pad;
228 	__uint32_t		m_spare[2];
229 };
230 
231 _Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32");
232 _Static_assert(__offsetof(struct umutex, m_spare[0]) ==
233     __offsetof(struct umutex32, m_spare[0]), "m_spare32");
234 #endif
235 
236 int umtx_shm_vnobj_persistent = 0;
237 SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN,
238     &umtx_shm_vnobj_persistent, 0,
239     "False forces destruction of umtx attached to file, on last close");
240 static int umtx_max_rb = 1000;
241 SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN,
242     &umtx_max_rb, 0,
243     "");
244 
245 static uma_zone_t		umtx_pi_zone;
246 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
247 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
248 static int			umtx_pi_allocated;
249 
250 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
251 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
252     &umtx_pi_allocated, 0, "Allocated umtx_pi");
253 static int umtx_verbose_rb = 1;
254 SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN,
255     &umtx_verbose_rb, 0,
256     "");
257 
258 #ifdef UMTX_PROFILING
259 static long max_length;
260 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
261 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
262 #endif
263 
264 static void abs_timeout_update(struct abs_timeout *timo);
265 
266 static void umtx_shm_init(void);
267 static void umtxq_sysinit(void *);
268 static void umtxq_hash(struct umtx_key *key);
269 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
270 static void umtxq_lock(struct umtx_key *key);
271 static void umtxq_unlock(struct umtx_key *key);
272 static void umtxq_busy(struct umtx_key *key);
273 static void umtxq_unbusy(struct umtx_key *key);
274 static void umtxq_insert_queue(struct umtx_q *uq, int q);
275 static void umtxq_remove_queue(struct umtx_q *uq, int q);
276 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
277 static int umtxq_count(struct umtx_key *key);
278 static struct umtx_pi *umtx_pi_alloc(int);
279 static void umtx_pi_free(struct umtx_pi *pi);
280 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags,
281     bool rb);
282 static void umtx_thread_cleanup(struct thread *td);
283 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
284     struct image_params *imgp __unused);
285 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
286 
287 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
288 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
289 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
290 
291 static struct mtx umtx_lock;
292 
293 #ifdef UMTX_PROFILING
294 static void
295 umtx_init_profiling(void)
296 {
297 	struct sysctl_oid *chain_oid;
298 	char chain_name[10];
299 	int i;
300 
301 	for (i = 0; i < UMTX_CHAINS; ++i) {
302 		snprintf(chain_name, sizeof(chain_name), "%d", i);
303 		chain_oid = SYSCTL_ADD_NODE(NULL,
304 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
305 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
306 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
307 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
308 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
309 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
310 	}
311 }
312 
313 static int
314 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
315 {
316 	char buf[512];
317 	struct sbuf sb;
318 	struct umtxq_chain *uc;
319 	u_int fract, i, j, tot, whole;
320 	u_int sf0, sf1, sf2, sf3, sf4;
321 	u_int si0, si1, si2, si3, si4;
322 	u_int sw0, sw1, sw2, sw3, sw4;
323 
324 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
325 	for (i = 0; i < 2; i++) {
326 		tot = 0;
327 		for (j = 0; j < UMTX_CHAINS; ++j) {
328 			uc = &umtxq_chains[i][j];
329 			mtx_lock(&uc->uc_lock);
330 			tot += uc->max_length;
331 			mtx_unlock(&uc->uc_lock);
332 		}
333 		if (tot == 0)
334 			sbuf_printf(&sb, "%u) Empty ", i);
335 		else {
336 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
337 			si0 = si1 = si2 = si3 = si4 = 0;
338 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
339 			for (j = 0; j < UMTX_CHAINS; j++) {
340 				uc = &umtxq_chains[i][j];
341 				mtx_lock(&uc->uc_lock);
342 				whole = uc->max_length * 100;
343 				mtx_unlock(&uc->uc_lock);
344 				fract = (whole % tot) * 100;
345 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
346 					sf0 = fract;
347 					si0 = j;
348 					sw0 = whole;
349 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
350 				    sf1)) {
351 					sf1 = fract;
352 					si1 = j;
353 					sw1 = whole;
354 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
355 				    sf2)) {
356 					sf2 = fract;
357 					si2 = j;
358 					sw2 = whole;
359 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
360 				    sf3)) {
361 					sf3 = fract;
362 					si3 = j;
363 					sw3 = whole;
364 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
365 				    sf4)) {
366 					sf4 = fract;
367 					si4 = j;
368 					sw4 = whole;
369 				}
370 			}
371 			sbuf_printf(&sb, "queue %u:\n", i);
372 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
373 			    sf0 / tot, si0);
374 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
375 			    sf1 / tot, si1);
376 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
377 			    sf2 / tot, si2);
378 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
379 			    sf3 / tot, si3);
380 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
381 			    sf4 / tot, si4);
382 		}
383 	}
384 	sbuf_trim(&sb);
385 	sbuf_finish(&sb);
386 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
387 	sbuf_delete(&sb);
388 	return (0);
389 }
390 
391 static int
392 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
393 {
394 	struct umtxq_chain *uc;
395 	u_int i, j;
396 	int clear, error;
397 
398 	clear = 0;
399 	error = sysctl_handle_int(oidp, &clear, 0, req);
400 	if (error != 0 || req->newptr == NULL)
401 		return (error);
402 
403 	if (clear != 0) {
404 		for (i = 0; i < 2; ++i) {
405 			for (j = 0; j < UMTX_CHAINS; ++j) {
406 				uc = &umtxq_chains[i][j];
407 				mtx_lock(&uc->uc_lock);
408 				uc->length = 0;
409 				uc->max_length = 0;
410 				mtx_unlock(&uc->uc_lock);
411 			}
412 		}
413 	}
414 	return (0);
415 }
416 
417 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
418     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
419     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
420 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
421     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
422     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
423 #endif
424 
425 static void
426 umtxq_sysinit(void *arg __unused)
427 {
428 	int i, j;
429 
430 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
431 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
432 	for (i = 0; i < 2; ++i) {
433 		for (j = 0; j < UMTX_CHAINS; ++j) {
434 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
435 				 MTX_DEF | MTX_DUPOK);
436 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
437 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
438 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
439 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
440 			umtxq_chains[i][j].uc_busy = 0;
441 			umtxq_chains[i][j].uc_waiters = 0;
442 #ifdef UMTX_PROFILING
443 			umtxq_chains[i][j].length = 0;
444 			umtxq_chains[i][j].max_length = 0;
445 #endif
446 		}
447 	}
448 #ifdef UMTX_PROFILING
449 	umtx_init_profiling();
450 #endif
451 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
452 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
453 	    EVENTHANDLER_PRI_ANY);
454 	umtx_shm_init();
455 }
456 
457 struct umtx_q *
458 umtxq_alloc(void)
459 {
460 	struct umtx_q *uq;
461 
462 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
463 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX,
464 	    M_WAITOK | M_ZERO);
465 	TAILQ_INIT(&uq->uq_spare_queue->head);
466 	TAILQ_INIT(&uq->uq_pi_contested);
467 	uq->uq_inherited_pri = PRI_MAX;
468 	return (uq);
469 }
470 
471 void
472 umtxq_free(struct umtx_q *uq)
473 {
474 
475 	MPASS(uq->uq_spare_queue != NULL);
476 	free(uq->uq_spare_queue, M_UMTX);
477 	free(uq, M_UMTX);
478 }
479 
480 static inline void
481 umtxq_hash(struct umtx_key *key)
482 {
483 	unsigned n;
484 
485 	n = (uintptr_t)key->info.both.a + key->info.both.b;
486 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
487 }
488 
489 static inline struct umtxq_chain *
490 umtxq_getchain(struct umtx_key *key)
491 {
492 
493 	if (key->type <= TYPE_SEM)
494 		return (&umtxq_chains[1][key->hash]);
495 	return (&umtxq_chains[0][key->hash]);
496 }
497 
498 /*
499  * Lock a chain.
500  */
501 static inline void
502 umtxq_lock(struct umtx_key *key)
503 {
504 	struct umtxq_chain *uc;
505 
506 	uc = umtxq_getchain(key);
507 	mtx_lock(&uc->uc_lock);
508 }
509 
510 /*
511  * Unlock a chain.
512  */
513 static inline void
514 umtxq_unlock(struct umtx_key *key)
515 {
516 	struct umtxq_chain *uc;
517 
518 	uc = umtxq_getchain(key);
519 	mtx_unlock(&uc->uc_lock);
520 }
521 
522 /*
523  * Set chain to busy state when following operation
524  * may be blocked (kernel mutex can not be used).
525  */
526 static inline void
527 umtxq_busy(struct umtx_key *key)
528 {
529 	struct umtxq_chain *uc;
530 
531 	uc = umtxq_getchain(key);
532 	mtx_assert(&uc->uc_lock, MA_OWNED);
533 	if (uc->uc_busy) {
534 #ifdef SMP
535 		if (smp_cpus > 1) {
536 			int count = BUSY_SPINS;
537 			if (count > 0) {
538 				umtxq_unlock(key);
539 				while (uc->uc_busy && --count > 0)
540 					cpu_spinwait();
541 				umtxq_lock(key);
542 			}
543 		}
544 #endif
545 		while (uc->uc_busy) {
546 			uc->uc_waiters++;
547 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
548 			uc->uc_waiters--;
549 		}
550 	}
551 	uc->uc_busy = 1;
552 }
553 
554 /*
555  * Unbusy a chain.
556  */
557 static inline void
558 umtxq_unbusy(struct umtx_key *key)
559 {
560 	struct umtxq_chain *uc;
561 
562 	uc = umtxq_getchain(key);
563 	mtx_assert(&uc->uc_lock, MA_OWNED);
564 	KASSERT(uc->uc_busy != 0, ("not busy"));
565 	uc->uc_busy = 0;
566 	if (uc->uc_waiters)
567 		wakeup_one(uc);
568 }
569 
570 static inline void
571 umtxq_unbusy_unlocked(struct umtx_key *key)
572 {
573 
574 	umtxq_lock(key);
575 	umtxq_unbusy(key);
576 	umtxq_unlock(key);
577 }
578 
579 static struct umtxq_queue *
580 umtxq_queue_lookup(struct umtx_key *key, int q)
581 {
582 	struct umtxq_queue *uh;
583 	struct umtxq_chain *uc;
584 
585 	uc = umtxq_getchain(key);
586 	UMTXQ_LOCKED_ASSERT(uc);
587 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
588 		if (umtx_key_match(&uh->key, key))
589 			return (uh);
590 	}
591 
592 	return (NULL);
593 }
594 
595 static inline void
596 umtxq_insert_queue(struct umtx_q *uq, int q)
597 {
598 	struct umtxq_queue *uh;
599 	struct umtxq_chain *uc;
600 
601 	uc = umtxq_getchain(&uq->uq_key);
602 	UMTXQ_LOCKED_ASSERT(uc);
603 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
604 	uh = umtxq_queue_lookup(&uq->uq_key, q);
605 	if (uh != NULL) {
606 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
607 	} else {
608 		uh = uq->uq_spare_queue;
609 		uh->key = uq->uq_key;
610 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
611 #ifdef UMTX_PROFILING
612 		uc->length++;
613 		if (uc->length > uc->max_length) {
614 			uc->max_length = uc->length;
615 			if (uc->max_length > max_length)
616 				max_length = uc->max_length;
617 		}
618 #endif
619 	}
620 	uq->uq_spare_queue = NULL;
621 
622 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
623 	uh->length++;
624 	uq->uq_flags |= UQF_UMTXQ;
625 	uq->uq_cur_queue = uh;
626 	return;
627 }
628 
629 static inline void
630 umtxq_remove_queue(struct umtx_q *uq, int q)
631 {
632 	struct umtxq_chain *uc;
633 	struct umtxq_queue *uh;
634 
635 	uc = umtxq_getchain(&uq->uq_key);
636 	UMTXQ_LOCKED_ASSERT(uc);
637 	if (uq->uq_flags & UQF_UMTXQ) {
638 		uh = uq->uq_cur_queue;
639 		TAILQ_REMOVE(&uh->head, uq, uq_link);
640 		uh->length--;
641 		uq->uq_flags &= ~UQF_UMTXQ;
642 		if (TAILQ_EMPTY(&uh->head)) {
643 			KASSERT(uh->length == 0,
644 			    ("inconsistent umtxq_queue length"));
645 #ifdef UMTX_PROFILING
646 			uc->length--;
647 #endif
648 			LIST_REMOVE(uh, link);
649 		} else {
650 			uh = LIST_FIRST(&uc->uc_spare_queue);
651 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
652 			LIST_REMOVE(uh, link);
653 		}
654 		uq->uq_spare_queue = uh;
655 		uq->uq_cur_queue = NULL;
656 	}
657 }
658 
659 /*
660  * Check if there are multiple waiters
661  */
662 static int
663 umtxq_count(struct umtx_key *key)
664 {
665 	struct umtxq_queue *uh;
666 
667 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
668 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
669 	if (uh != NULL)
670 		return (uh->length);
671 	return (0);
672 }
673 
674 /*
675  * Check if there are multiple PI waiters and returns first
676  * waiter.
677  */
678 static int
679 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
680 {
681 	struct umtxq_queue *uh;
682 
683 	*first = NULL;
684 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
685 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
686 	if (uh != NULL) {
687 		*first = TAILQ_FIRST(&uh->head);
688 		return (uh->length);
689 	}
690 	return (0);
691 }
692 
693 static int
694 umtxq_check_susp(struct thread *td)
695 {
696 	struct proc *p;
697 	int error;
698 
699 	/*
700 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
701 	 * eventually break the lockstep loop.
702 	 */
703 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
704 		return (0);
705 	error = 0;
706 	p = td->td_proc;
707 	PROC_LOCK(p);
708 	if (P_SHOULDSTOP(p) ||
709 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
710 		if (p->p_flag & P_SINGLE_EXIT)
711 			error = EINTR;
712 		else
713 			error = ERESTART;
714 	}
715 	PROC_UNLOCK(p);
716 	return (error);
717 }
718 
719 /*
720  * Wake up threads waiting on an userland object.
721  */
722 
723 static int
724 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
725 {
726 	struct umtxq_queue *uh;
727 	struct umtx_q *uq;
728 	int ret;
729 
730 	ret = 0;
731 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
732 	uh = umtxq_queue_lookup(key, q);
733 	if (uh != NULL) {
734 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
735 			umtxq_remove_queue(uq, q);
736 			wakeup(uq);
737 			if (++ret >= n_wake)
738 				return (ret);
739 		}
740 	}
741 	return (ret);
742 }
743 
744 
745 /*
746  * Wake up specified thread.
747  */
748 static inline void
749 umtxq_signal_thread(struct umtx_q *uq)
750 {
751 
752 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
753 	umtxq_remove(uq);
754 	wakeup(uq);
755 }
756 
757 static inline int
758 tstohz(const struct timespec *tsp)
759 {
760 	struct timeval tv;
761 
762 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
763 	return tvtohz(&tv);
764 }
765 
766 static void
767 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
768 	const struct timespec *timeout)
769 {
770 
771 	timo->clockid = clockid;
772 	if (!absolute) {
773 		timo->is_abs_real = false;
774 		abs_timeout_update(timo);
775 		timespecadd(&timo->cur, timeout, &timo->end);
776 	} else {
777 		timo->end = *timeout;
778 		timo->is_abs_real = clockid == CLOCK_REALTIME ||
779 		    clockid == CLOCK_REALTIME_FAST ||
780 		    clockid == CLOCK_REALTIME_PRECISE;
781 		/*
782 		 * If is_abs_real, umtxq_sleep will read the clock
783 		 * after setting td_rtcgen; otherwise, read it here.
784 		 */
785 		if (!timo->is_abs_real) {
786 			abs_timeout_update(timo);
787 		}
788 	}
789 }
790 
791 static void
792 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
793 {
794 
795 	abs_timeout_init(timo, umtxtime->_clockid,
796 	    (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout);
797 }
798 
799 static inline void
800 abs_timeout_update(struct abs_timeout *timo)
801 {
802 
803 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
804 }
805 
806 static int
807 abs_timeout_gethz(struct abs_timeout *timo)
808 {
809 	struct timespec tts;
810 
811 	if (timespeccmp(&timo->end, &timo->cur, <=))
812 		return (-1);
813 	timespecsub(&timo->end, &timo->cur, &tts);
814 	return (tstohz(&tts));
815 }
816 
817 static uint32_t
818 umtx_unlock_val(uint32_t flags, bool rb)
819 {
820 
821 	if (rb)
822 		return (UMUTEX_RB_OWNERDEAD);
823 	else if ((flags & UMUTEX_NONCONSISTENT) != 0)
824 		return (UMUTEX_RB_NOTRECOV);
825 	else
826 		return (UMUTEX_UNOWNED);
827 
828 }
829 
830 /*
831  * Put thread into sleep state, before sleeping, check if
832  * thread was removed from umtx queue.
833  */
834 static inline int
835 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
836 {
837 	struct umtxq_chain *uc;
838 	int error, timo;
839 
840 	if (abstime != NULL && abstime->is_abs_real) {
841 		curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation);
842 		abs_timeout_update(abstime);
843 	}
844 
845 	uc = umtxq_getchain(&uq->uq_key);
846 	UMTXQ_LOCKED_ASSERT(uc);
847 	for (;;) {
848 		if (!(uq->uq_flags & UQF_UMTXQ)) {
849 			error = 0;
850 			break;
851 		}
852 		if (abstime != NULL) {
853 			timo = abs_timeout_gethz(abstime);
854 			if (timo < 0) {
855 				error = ETIMEDOUT;
856 				break;
857 			}
858 		} else
859 			timo = 0;
860 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
861 		if (error == EINTR || error == ERESTART) {
862 			umtxq_lock(&uq->uq_key);
863 			break;
864 		}
865 		if (abstime != NULL) {
866 			if (abstime->is_abs_real)
867 				curthread->td_rtcgen =
868 				    atomic_load_acq_int(&rtc_generation);
869 			abs_timeout_update(abstime);
870 		}
871 		umtxq_lock(&uq->uq_key);
872 	}
873 
874 	curthread->td_rtcgen = 0;
875 	return (error);
876 }
877 
878 /*
879  * Convert userspace address into unique logical address.
880  */
881 int
882 umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
883 {
884 	struct thread *td = curthread;
885 	vm_map_t map;
886 	vm_map_entry_t entry;
887 	vm_pindex_t pindex;
888 	vm_prot_t prot;
889 	boolean_t wired;
890 
891 	key->type = type;
892 	if (share == THREAD_SHARE) {
893 		key->shared = 0;
894 		key->info.private.vs = td->td_proc->p_vmspace;
895 		key->info.private.addr = (uintptr_t)addr;
896 	} else {
897 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
898 		map = &td->td_proc->p_vmspace->vm_map;
899 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
900 		    &entry, &key->info.shared.object, &pindex, &prot,
901 		    &wired) != KERN_SUCCESS) {
902 			return (EFAULT);
903 		}
904 
905 		if ((share == PROCESS_SHARE) ||
906 		    (share == AUTO_SHARE &&
907 		     VM_INHERIT_SHARE == entry->inheritance)) {
908 			key->shared = 1;
909 			key->info.shared.offset = (vm_offset_t)addr -
910 			    entry->start + entry->offset;
911 			vm_object_reference(key->info.shared.object);
912 		} else {
913 			key->shared = 0;
914 			key->info.private.vs = td->td_proc->p_vmspace;
915 			key->info.private.addr = (uintptr_t)addr;
916 		}
917 		vm_map_lookup_done(map, entry);
918 	}
919 
920 	umtxq_hash(key);
921 	return (0);
922 }
923 
924 /*
925  * Release key.
926  */
927 void
928 umtx_key_release(struct umtx_key *key)
929 {
930 	if (key->shared)
931 		vm_object_deallocate(key->info.shared.object);
932 }
933 
934 /*
935  * Fetch and compare value, sleep on the address if value is not changed.
936  */
937 static int
938 do_wait(struct thread *td, void *addr, u_long id,
939     struct _umtx_time *timeout, int compat32, int is_private)
940 {
941 	struct abs_timeout timo;
942 	struct umtx_q *uq;
943 	u_long tmp;
944 	uint32_t tmp32;
945 	int error = 0;
946 
947 	uq = td->td_umtxq;
948 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
949 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
950 		return (error);
951 
952 	if (timeout != NULL)
953 		abs_timeout_init2(&timo, timeout);
954 
955 	umtxq_lock(&uq->uq_key);
956 	umtxq_insert(uq);
957 	umtxq_unlock(&uq->uq_key);
958 	if (compat32 == 0) {
959 		error = fueword(addr, &tmp);
960 		if (error != 0)
961 			error = EFAULT;
962 	} else {
963 		error = fueword32(addr, &tmp32);
964 		if (error == 0)
965 			tmp = tmp32;
966 		else
967 			error = EFAULT;
968 	}
969 	umtxq_lock(&uq->uq_key);
970 	if (error == 0) {
971 		if (tmp == id)
972 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
973 			    NULL : &timo);
974 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
975 			error = 0;
976 		else
977 			umtxq_remove(uq);
978 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
979 		umtxq_remove(uq);
980 	}
981 	umtxq_unlock(&uq->uq_key);
982 	umtx_key_release(&uq->uq_key);
983 	if (error == ERESTART)
984 		error = EINTR;
985 	return (error);
986 }
987 
988 /*
989  * Wake up threads sleeping on the specified address.
990  */
991 int
992 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
993 {
994 	struct umtx_key key;
995 	int ret;
996 
997 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
998 	    is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
999 		return (ret);
1000 	umtxq_lock(&key);
1001 	umtxq_signal(&key, n_wake);
1002 	umtxq_unlock(&key);
1003 	umtx_key_release(&key);
1004 	return (0);
1005 }
1006 
1007 /*
1008  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1009  */
1010 static int
1011 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
1012     struct _umtx_time *timeout, int mode)
1013 {
1014 	struct abs_timeout timo;
1015 	struct umtx_q *uq;
1016 	uint32_t owner, old, id;
1017 	int error, rv;
1018 
1019 	id = td->td_tid;
1020 	uq = td->td_umtxq;
1021 	error = 0;
1022 	if (timeout != NULL)
1023 		abs_timeout_init2(&timo, timeout);
1024 
1025 	/*
1026 	 * Care must be exercised when dealing with umtx structure. It
1027 	 * can fault on any access.
1028 	 */
1029 	for (;;) {
1030 		rv = fueword32(&m->m_owner, &owner);
1031 		if (rv == -1)
1032 			return (EFAULT);
1033 		if (mode == _UMUTEX_WAIT) {
1034 			if (owner == UMUTEX_UNOWNED ||
1035 			    owner == UMUTEX_CONTESTED ||
1036 			    owner == UMUTEX_RB_OWNERDEAD ||
1037 			    owner == UMUTEX_RB_NOTRECOV)
1038 				return (0);
1039 		} else {
1040 			/*
1041 			 * Robust mutex terminated.  Kernel duty is to
1042 			 * return EOWNERDEAD to the userspace.  The
1043 			 * umutex.m_flags UMUTEX_NONCONSISTENT is set
1044 			 * by the common userspace code.
1045 			 */
1046 			if (owner == UMUTEX_RB_OWNERDEAD) {
1047 				rv = casueword32(&m->m_owner,
1048 				    UMUTEX_RB_OWNERDEAD, &owner,
1049 				    id | UMUTEX_CONTESTED);
1050 				if (rv == -1)
1051 					return (EFAULT);
1052 				if (owner == UMUTEX_RB_OWNERDEAD)
1053 					return (EOWNERDEAD); /* success */
1054 				rv = umtxq_check_susp(td);
1055 				if (rv != 0)
1056 					return (rv);
1057 				continue;
1058 			}
1059 			if (owner == UMUTEX_RB_NOTRECOV)
1060 				return (ENOTRECOVERABLE);
1061 
1062 
1063 			/*
1064 			 * Try the uncontested case.  This should be
1065 			 * done in userland.
1066 			 */
1067 			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
1068 			    &owner, id);
1069 			/* The address was invalid. */
1070 			if (rv == -1)
1071 				return (EFAULT);
1072 
1073 			/* The acquire succeeded. */
1074 			if (owner == UMUTEX_UNOWNED)
1075 				return (0);
1076 
1077 			/*
1078 			 * If no one owns it but it is contested try
1079 			 * to acquire it.
1080 			 */
1081 			if (owner == UMUTEX_CONTESTED) {
1082 				rv = casueword32(&m->m_owner,
1083 				    UMUTEX_CONTESTED, &owner,
1084 				    id | UMUTEX_CONTESTED);
1085 				/* The address was invalid. */
1086 				if (rv == -1)
1087 					return (EFAULT);
1088 
1089 				if (owner == UMUTEX_CONTESTED)
1090 					return (0);
1091 
1092 				rv = umtxq_check_susp(td);
1093 				if (rv != 0)
1094 					return (rv);
1095 
1096 				/*
1097 				 * If this failed the lock has
1098 				 * changed, restart.
1099 				 */
1100 				continue;
1101 			}
1102 		}
1103 
1104 		if (mode == _UMUTEX_TRY)
1105 			return (EBUSY);
1106 
1107 		/*
1108 		 * If we caught a signal, we have retried and now
1109 		 * exit immediately.
1110 		 */
1111 		if (error != 0)
1112 			return (error);
1113 
1114 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1115 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1116 			return (error);
1117 
1118 		umtxq_lock(&uq->uq_key);
1119 		umtxq_busy(&uq->uq_key);
1120 		umtxq_insert(uq);
1121 		umtxq_unlock(&uq->uq_key);
1122 
1123 		/*
1124 		 * Set the contested bit so that a release in user space
1125 		 * knows to use the system call for unlock.  If this fails
1126 		 * either some one else has acquired the lock or it has been
1127 		 * released.
1128 		 */
1129 		rv = casueword32(&m->m_owner, owner, &old,
1130 		    owner | UMUTEX_CONTESTED);
1131 
1132 		/* The address was invalid. */
1133 		if (rv == -1) {
1134 			umtxq_lock(&uq->uq_key);
1135 			umtxq_remove(uq);
1136 			umtxq_unbusy(&uq->uq_key);
1137 			umtxq_unlock(&uq->uq_key);
1138 			umtx_key_release(&uq->uq_key);
1139 			return (EFAULT);
1140 		}
1141 
1142 		/*
1143 		 * We set the contested bit, sleep. Otherwise the lock changed
1144 		 * and we need to retry or we lost a race to the thread
1145 		 * unlocking the umtx.
1146 		 */
1147 		umtxq_lock(&uq->uq_key);
1148 		umtxq_unbusy(&uq->uq_key);
1149 		if (old == owner)
1150 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1151 			    NULL : &timo);
1152 		umtxq_remove(uq);
1153 		umtxq_unlock(&uq->uq_key);
1154 		umtx_key_release(&uq->uq_key);
1155 
1156 		if (error == 0)
1157 			error = umtxq_check_susp(td);
1158 	}
1159 
1160 	return (0);
1161 }
1162 
1163 /*
1164  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1165  */
1166 static int
1167 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1168 {
1169 	struct umtx_key key;
1170 	uint32_t owner, old, id, newlock;
1171 	int error, count;
1172 
1173 	id = td->td_tid;
1174 	/*
1175 	 * Make sure we own this mtx.
1176 	 */
1177 	error = fueword32(&m->m_owner, &owner);
1178 	if (error == -1)
1179 		return (EFAULT);
1180 
1181 	if ((owner & ~UMUTEX_CONTESTED) != id)
1182 		return (EPERM);
1183 
1184 	newlock = umtx_unlock_val(flags, rb);
1185 	if ((owner & UMUTEX_CONTESTED) == 0) {
1186 		error = casueword32(&m->m_owner, owner, &old, newlock);
1187 		if (error == -1)
1188 			return (EFAULT);
1189 		if (old == owner)
1190 			return (0);
1191 		owner = old;
1192 	}
1193 
1194 	/* We should only ever be in here for contested locks */
1195 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1196 	    &key)) != 0)
1197 		return (error);
1198 
1199 	umtxq_lock(&key);
1200 	umtxq_busy(&key);
1201 	count = umtxq_count(&key);
1202 	umtxq_unlock(&key);
1203 
1204 	/*
1205 	 * When unlocking the umtx, it must be marked as unowned if
1206 	 * there is zero or one thread only waiting for it.
1207 	 * Otherwise, it must be marked as contested.
1208 	 */
1209 	if (count > 1)
1210 		newlock |= UMUTEX_CONTESTED;
1211 	error = casueword32(&m->m_owner, owner, &old, newlock);
1212 	umtxq_lock(&key);
1213 	umtxq_signal(&key, 1);
1214 	umtxq_unbusy(&key);
1215 	umtxq_unlock(&key);
1216 	umtx_key_release(&key);
1217 	if (error == -1)
1218 		return (EFAULT);
1219 	if (old != owner)
1220 		return (EINVAL);
1221 	return (0);
1222 }
1223 
1224 /*
1225  * Check if the mutex is available and wake up a waiter,
1226  * only for simple mutex.
1227  */
1228 static int
1229 do_wake_umutex(struct thread *td, struct umutex *m)
1230 {
1231 	struct umtx_key key;
1232 	uint32_t owner;
1233 	uint32_t flags;
1234 	int error;
1235 	int count;
1236 
1237 	error = fueword32(&m->m_owner, &owner);
1238 	if (error == -1)
1239 		return (EFAULT);
1240 
1241 	if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD &&
1242 	    owner != UMUTEX_RB_NOTRECOV)
1243 		return (0);
1244 
1245 	error = fueword32(&m->m_flags, &flags);
1246 	if (error == -1)
1247 		return (EFAULT);
1248 
1249 	/* We should only ever be in here for contested locks */
1250 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1251 	    &key)) != 0)
1252 		return (error);
1253 
1254 	umtxq_lock(&key);
1255 	umtxq_busy(&key);
1256 	count = umtxq_count(&key);
1257 	umtxq_unlock(&key);
1258 
1259 	if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD &&
1260 	    owner != UMUTEX_RB_NOTRECOV) {
1261 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1262 		    UMUTEX_UNOWNED);
1263 		if (error == -1)
1264 			error = EFAULT;
1265 	}
1266 
1267 	umtxq_lock(&key);
1268 	if (error == 0 && count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
1269 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
1270 		umtxq_signal(&key, 1);
1271 	umtxq_unbusy(&key);
1272 	umtxq_unlock(&key);
1273 	umtx_key_release(&key);
1274 	return (error);
1275 }
1276 
1277 /*
1278  * Check if the mutex has waiters and tries to fix contention bit.
1279  */
1280 static int
1281 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1282 {
1283 	struct umtx_key key;
1284 	uint32_t owner, old;
1285 	int type;
1286 	int error;
1287 	int count;
1288 
1289 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT |
1290 	    UMUTEX_ROBUST)) {
1291 	case 0:
1292 	case UMUTEX_ROBUST:
1293 		type = TYPE_NORMAL_UMUTEX;
1294 		break;
1295 	case UMUTEX_PRIO_INHERIT:
1296 		type = TYPE_PI_UMUTEX;
1297 		break;
1298 	case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST):
1299 		type = TYPE_PI_ROBUST_UMUTEX;
1300 		break;
1301 	case UMUTEX_PRIO_PROTECT:
1302 		type = TYPE_PP_UMUTEX;
1303 		break;
1304 	case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST):
1305 		type = TYPE_PP_ROBUST_UMUTEX;
1306 		break;
1307 	default:
1308 		return (EINVAL);
1309 	}
1310 	if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0)
1311 		return (error);
1312 
1313 	owner = 0;
1314 	umtxq_lock(&key);
1315 	umtxq_busy(&key);
1316 	count = umtxq_count(&key);
1317 	umtxq_unlock(&key);
1318 	/*
1319 	 * Only repair contention bit if there is a waiter, this means the mutex
1320 	 * is still being referenced by userland code, otherwise don't update
1321 	 * any memory.
1322 	 */
1323 	if (count > 1) {
1324 		error = fueword32(&m->m_owner, &owner);
1325 		if (error == -1)
1326 			error = EFAULT;
1327 		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
1328 			error = casueword32(&m->m_owner, owner, &old,
1329 			    owner | UMUTEX_CONTESTED);
1330 			if (error == -1) {
1331 				error = EFAULT;
1332 				break;
1333 			}
1334 			if (old == owner)
1335 				break;
1336 			owner = old;
1337 			error = umtxq_check_susp(td);
1338 			if (error != 0)
1339 				break;
1340 		}
1341 	} else if (count == 1) {
1342 		error = fueword32(&m->m_owner, &owner);
1343 		if (error == -1)
1344 			error = EFAULT;
1345 		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
1346 		    (owner & UMUTEX_CONTESTED) == 0) {
1347 			error = casueword32(&m->m_owner, owner, &old,
1348 			    owner | UMUTEX_CONTESTED);
1349 			if (error == -1) {
1350 				error = EFAULT;
1351 				break;
1352 			}
1353 			if (old == owner)
1354 				break;
1355 			owner = old;
1356 			error = umtxq_check_susp(td);
1357 			if (error != 0)
1358 				break;
1359 		}
1360 	}
1361 	umtxq_lock(&key);
1362 	if (error == EFAULT) {
1363 		umtxq_signal(&key, INT_MAX);
1364 	} else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
1365 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
1366 		umtxq_signal(&key, 1);
1367 	umtxq_unbusy(&key);
1368 	umtxq_unlock(&key);
1369 	umtx_key_release(&key);
1370 	return (error);
1371 }
1372 
1373 static inline struct umtx_pi *
1374 umtx_pi_alloc(int flags)
1375 {
1376 	struct umtx_pi *pi;
1377 
1378 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1379 	TAILQ_INIT(&pi->pi_blocked);
1380 	atomic_add_int(&umtx_pi_allocated, 1);
1381 	return (pi);
1382 }
1383 
1384 static inline void
1385 umtx_pi_free(struct umtx_pi *pi)
1386 {
1387 	uma_zfree(umtx_pi_zone, pi);
1388 	atomic_add_int(&umtx_pi_allocated, -1);
1389 }
1390 
1391 /*
1392  * Adjust the thread's position on a pi_state after its priority has been
1393  * changed.
1394  */
1395 static int
1396 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1397 {
1398 	struct umtx_q *uq, *uq1, *uq2;
1399 	struct thread *td1;
1400 
1401 	mtx_assert(&umtx_lock, MA_OWNED);
1402 	if (pi == NULL)
1403 		return (0);
1404 
1405 	uq = td->td_umtxq;
1406 
1407 	/*
1408 	 * Check if the thread needs to be moved on the blocked chain.
1409 	 * It needs to be moved if either its priority is lower than
1410 	 * the previous thread or higher than the next thread.
1411 	 */
1412 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1413 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1414 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1415 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1416 		/*
1417 		 * Remove thread from blocked chain and determine where
1418 		 * it should be moved to.
1419 		 */
1420 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1421 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1422 			td1 = uq1->uq_thread;
1423 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1424 			if (UPRI(td1) > UPRI(td))
1425 				break;
1426 		}
1427 
1428 		if (uq1 == NULL)
1429 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1430 		else
1431 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1432 	}
1433 	return (1);
1434 }
1435 
1436 static struct umtx_pi *
1437 umtx_pi_next(struct umtx_pi *pi)
1438 {
1439 	struct umtx_q *uq_owner;
1440 
1441 	if (pi->pi_owner == NULL)
1442 		return (NULL);
1443 	uq_owner = pi->pi_owner->td_umtxq;
1444 	if (uq_owner == NULL)
1445 		return (NULL);
1446 	return (uq_owner->uq_pi_blocked);
1447 }
1448 
1449 /*
1450  * Floyd's Cycle-Finding Algorithm.
1451  */
1452 static bool
1453 umtx_pi_check_loop(struct umtx_pi *pi)
1454 {
1455 	struct umtx_pi *pi1;	/* fast iterator */
1456 
1457 	mtx_assert(&umtx_lock, MA_OWNED);
1458 	if (pi == NULL)
1459 		return (false);
1460 	pi1 = pi;
1461 	for (;;) {
1462 		pi = umtx_pi_next(pi);
1463 		if (pi == NULL)
1464 			break;
1465 		pi1 = umtx_pi_next(pi1);
1466 		if (pi1 == NULL)
1467 			break;
1468 		pi1 = umtx_pi_next(pi1);
1469 		if (pi1 == NULL)
1470 			break;
1471 		if (pi == pi1)
1472 			return (true);
1473 	}
1474 	return (false);
1475 }
1476 
1477 /*
1478  * Propagate priority when a thread is blocked on POSIX
1479  * PI mutex.
1480  */
1481 static void
1482 umtx_propagate_priority(struct thread *td)
1483 {
1484 	struct umtx_q *uq;
1485 	struct umtx_pi *pi;
1486 	int pri;
1487 
1488 	mtx_assert(&umtx_lock, MA_OWNED);
1489 	pri = UPRI(td);
1490 	uq = td->td_umtxq;
1491 	pi = uq->uq_pi_blocked;
1492 	if (pi == NULL)
1493 		return;
1494 	if (umtx_pi_check_loop(pi))
1495 		return;
1496 
1497 	for (;;) {
1498 		td = pi->pi_owner;
1499 		if (td == NULL || td == curthread)
1500 			return;
1501 
1502 		MPASS(td->td_proc != NULL);
1503 		MPASS(td->td_proc->p_magic == P_MAGIC);
1504 
1505 		thread_lock(td);
1506 		if (td->td_lend_user_pri > pri)
1507 			sched_lend_user_prio(td, pri);
1508 		else {
1509 			thread_unlock(td);
1510 			break;
1511 		}
1512 		thread_unlock(td);
1513 
1514 		/*
1515 		 * Pick up the lock that td is blocked on.
1516 		 */
1517 		uq = td->td_umtxq;
1518 		pi = uq->uq_pi_blocked;
1519 		if (pi == NULL)
1520 			break;
1521 		/* Resort td on the list if needed. */
1522 		umtx_pi_adjust_thread(pi, td);
1523 	}
1524 }
1525 
1526 /*
1527  * Unpropagate priority for a PI mutex when a thread blocked on
1528  * it is interrupted by signal or resumed by others.
1529  */
1530 static void
1531 umtx_repropagate_priority(struct umtx_pi *pi)
1532 {
1533 	struct umtx_q *uq, *uq_owner;
1534 	struct umtx_pi *pi2;
1535 	int pri;
1536 
1537 	mtx_assert(&umtx_lock, MA_OWNED);
1538 
1539 	if (umtx_pi_check_loop(pi))
1540 		return;
1541 	while (pi != NULL && pi->pi_owner != NULL) {
1542 		pri = PRI_MAX;
1543 		uq_owner = pi->pi_owner->td_umtxq;
1544 
1545 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1546 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1547 			if (uq != NULL) {
1548 				if (pri > UPRI(uq->uq_thread))
1549 					pri = UPRI(uq->uq_thread);
1550 			}
1551 		}
1552 
1553 		if (pri > uq_owner->uq_inherited_pri)
1554 			pri = uq_owner->uq_inherited_pri;
1555 		thread_lock(pi->pi_owner);
1556 		sched_lend_user_prio(pi->pi_owner, pri);
1557 		thread_unlock(pi->pi_owner);
1558 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1559 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1560 	}
1561 }
1562 
1563 /*
1564  * Insert a PI mutex into owned list.
1565  */
1566 static void
1567 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1568 {
1569 	struct umtx_q *uq_owner;
1570 
1571 	uq_owner = owner->td_umtxq;
1572 	mtx_assert(&umtx_lock, MA_OWNED);
1573 	MPASS(pi->pi_owner == NULL);
1574 	pi->pi_owner = owner;
1575 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1576 }
1577 
1578 
1579 /*
1580  * Disown a PI mutex, and remove it from the owned list.
1581  */
1582 static void
1583 umtx_pi_disown(struct umtx_pi *pi)
1584 {
1585 
1586 	mtx_assert(&umtx_lock, MA_OWNED);
1587 	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
1588 	pi->pi_owner = NULL;
1589 }
1590 
1591 /*
1592  * Claim ownership of a PI mutex.
1593  */
1594 static int
1595 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1596 {
1597 	struct umtx_q *uq;
1598 	int pri;
1599 
1600 	mtx_lock(&umtx_lock);
1601 	if (pi->pi_owner == owner) {
1602 		mtx_unlock(&umtx_lock);
1603 		return (0);
1604 	}
1605 
1606 	if (pi->pi_owner != NULL) {
1607 		/*
1608 		 * userland may have already messed the mutex, sigh.
1609 		 */
1610 		mtx_unlock(&umtx_lock);
1611 		return (EPERM);
1612 	}
1613 	umtx_pi_setowner(pi, owner);
1614 	uq = TAILQ_FIRST(&pi->pi_blocked);
1615 	if (uq != NULL) {
1616 		pri = UPRI(uq->uq_thread);
1617 		thread_lock(owner);
1618 		if (pri < UPRI(owner))
1619 			sched_lend_user_prio(owner, pri);
1620 		thread_unlock(owner);
1621 	}
1622 	mtx_unlock(&umtx_lock);
1623 	return (0);
1624 }
1625 
1626 /*
1627  * Adjust a thread's order position in its blocked PI mutex,
1628  * this may result new priority propagating process.
1629  */
1630 void
1631 umtx_pi_adjust(struct thread *td, u_char oldpri)
1632 {
1633 	struct umtx_q *uq;
1634 	struct umtx_pi *pi;
1635 
1636 	uq = td->td_umtxq;
1637 	mtx_lock(&umtx_lock);
1638 	/*
1639 	 * Pick up the lock that td is blocked on.
1640 	 */
1641 	pi = uq->uq_pi_blocked;
1642 	if (pi != NULL) {
1643 		umtx_pi_adjust_thread(pi, td);
1644 		umtx_repropagate_priority(pi);
1645 	}
1646 	mtx_unlock(&umtx_lock);
1647 }
1648 
1649 /*
1650  * Sleep on a PI mutex.
1651  */
1652 static int
1653 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
1654     const char *wmesg, struct abs_timeout *timo, bool shared)
1655 {
1656 	struct thread *td, *td1;
1657 	struct umtx_q *uq1;
1658 	int error, pri;
1659 #ifdef INVARIANTS
1660 	struct umtxq_chain *uc;
1661 
1662 	uc = umtxq_getchain(&pi->pi_key);
1663 #endif
1664 	error = 0;
1665 	td = uq->uq_thread;
1666 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1667 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
1668 	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
1669 	umtxq_insert(uq);
1670 	mtx_lock(&umtx_lock);
1671 	if (pi->pi_owner == NULL) {
1672 		mtx_unlock(&umtx_lock);
1673 		td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid);
1674 		mtx_lock(&umtx_lock);
1675 		if (td1 != NULL) {
1676 			if (pi->pi_owner == NULL)
1677 				umtx_pi_setowner(pi, td1);
1678 			PROC_UNLOCK(td1->td_proc);
1679 		}
1680 	}
1681 
1682 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1683 		pri = UPRI(uq1->uq_thread);
1684 		if (pri > UPRI(td))
1685 			break;
1686 	}
1687 
1688 	if (uq1 != NULL)
1689 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1690 	else
1691 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1692 
1693 	uq->uq_pi_blocked = pi;
1694 	thread_lock(td);
1695 	td->td_flags |= TDF_UPIBLOCKED;
1696 	thread_unlock(td);
1697 	umtx_propagate_priority(td);
1698 	mtx_unlock(&umtx_lock);
1699 	umtxq_unbusy(&uq->uq_key);
1700 
1701 	error = umtxq_sleep(uq, wmesg, timo);
1702 	umtxq_remove(uq);
1703 
1704 	mtx_lock(&umtx_lock);
1705 	uq->uq_pi_blocked = NULL;
1706 	thread_lock(td);
1707 	td->td_flags &= ~TDF_UPIBLOCKED;
1708 	thread_unlock(td);
1709 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1710 	umtx_repropagate_priority(pi);
1711 	mtx_unlock(&umtx_lock);
1712 	umtxq_unlock(&uq->uq_key);
1713 
1714 	return (error);
1715 }
1716 
1717 /*
1718  * Add reference count for a PI mutex.
1719  */
1720 static void
1721 umtx_pi_ref(struct umtx_pi *pi)
1722 {
1723 
1724 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&pi->pi_key));
1725 	pi->pi_refcount++;
1726 }
1727 
1728 /*
1729  * Decrease reference count for a PI mutex, if the counter
1730  * is decreased to zero, its memory space is freed.
1731  */
1732 static void
1733 umtx_pi_unref(struct umtx_pi *pi)
1734 {
1735 	struct umtxq_chain *uc;
1736 
1737 	uc = umtxq_getchain(&pi->pi_key);
1738 	UMTXQ_LOCKED_ASSERT(uc);
1739 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1740 	if (--pi->pi_refcount == 0) {
1741 		mtx_lock(&umtx_lock);
1742 		if (pi->pi_owner != NULL)
1743 			umtx_pi_disown(pi);
1744 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1745 			("blocked queue not empty"));
1746 		mtx_unlock(&umtx_lock);
1747 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1748 		umtx_pi_free(pi);
1749 	}
1750 }
1751 
1752 /*
1753  * Find a PI mutex in hash table.
1754  */
1755 static struct umtx_pi *
1756 umtx_pi_lookup(struct umtx_key *key)
1757 {
1758 	struct umtxq_chain *uc;
1759 	struct umtx_pi *pi;
1760 
1761 	uc = umtxq_getchain(key);
1762 	UMTXQ_LOCKED_ASSERT(uc);
1763 
1764 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1765 		if (umtx_key_match(&pi->pi_key, key)) {
1766 			return (pi);
1767 		}
1768 	}
1769 	return (NULL);
1770 }
1771 
1772 /*
1773  * Insert a PI mutex into hash table.
1774  */
1775 static inline void
1776 umtx_pi_insert(struct umtx_pi *pi)
1777 {
1778 	struct umtxq_chain *uc;
1779 
1780 	uc = umtxq_getchain(&pi->pi_key);
1781 	UMTXQ_LOCKED_ASSERT(uc);
1782 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1783 }
1784 
1785 /*
1786  * Lock a PI mutex.
1787  */
1788 static int
1789 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1790     struct _umtx_time *timeout, int try)
1791 {
1792 	struct abs_timeout timo;
1793 	struct umtx_q *uq;
1794 	struct umtx_pi *pi, *new_pi;
1795 	uint32_t id, old_owner, owner, old;
1796 	int error, rv;
1797 
1798 	id = td->td_tid;
1799 	uq = td->td_umtxq;
1800 
1801 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
1802 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
1803 	    &uq->uq_key)) != 0)
1804 		return (error);
1805 
1806 	if (timeout != NULL)
1807 		abs_timeout_init2(&timo, timeout);
1808 
1809 	umtxq_lock(&uq->uq_key);
1810 	pi = umtx_pi_lookup(&uq->uq_key);
1811 	if (pi == NULL) {
1812 		new_pi = umtx_pi_alloc(M_NOWAIT);
1813 		if (new_pi == NULL) {
1814 			umtxq_unlock(&uq->uq_key);
1815 			new_pi = umtx_pi_alloc(M_WAITOK);
1816 			umtxq_lock(&uq->uq_key);
1817 			pi = umtx_pi_lookup(&uq->uq_key);
1818 			if (pi != NULL) {
1819 				umtx_pi_free(new_pi);
1820 				new_pi = NULL;
1821 			}
1822 		}
1823 		if (new_pi != NULL) {
1824 			new_pi->pi_key = uq->uq_key;
1825 			umtx_pi_insert(new_pi);
1826 			pi = new_pi;
1827 		}
1828 	}
1829 	umtx_pi_ref(pi);
1830 	umtxq_unlock(&uq->uq_key);
1831 
1832 	/*
1833 	 * Care must be exercised when dealing with umtx structure.  It
1834 	 * can fault on any access.
1835 	 */
1836 	for (;;) {
1837 		/*
1838 		 * Try the uncontested case.  This should be done in userland.
1839 		 */
1840 		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
1841 		/* The address was invalid. */
1842 		if (rv == -1) {
1843 			error = EFAULT;
1844 			break;
1845 		}
1846 
1847 		/* The acquire succeeded. */
1848 		if (owner == UMUTEX_UNOWNED) {
1849 			error = 0;
1850 			break;
1851 		}
1852 
1853 		if (owner == UMUTEX_RB_NOTRECOV) {
1854 			error = ENOTRECOVERABLE;
1855 			break;
1856 		}
1857 
1858 		/* If no one owns it but it is contested try to acquire it. */
1859 		if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) {
1860 			old_owner = owner;
1861 			rv = casueword32(&m->m_owner, owner, &owner,
1862 			    id | UMUTEX_CONTESTED);
1863 			/* The address was invalid. */
1864 			if (rv == -1) {
1865 				error = EFAULT;
1866 				break;
1867 			}
1868 
1869 			if (owner == old_owner) {
1870 				umtxq_lock(&uq->uq_key);
1871 				umtxq_busy(&uq->uq_key);
1872 				error = umtx_pi_claim(pi, td);
1873 				umtxq_unbusy(&uq->uq_key);
1874 				umtxq_unlock(&uq->uq_key);
1875 				if (error != 0) {
1876 					/*
1877 					 * Since we're going to return an
1878 					 * error, restore the m_owner to its
1879 					 * previous, unowned state to avoid
1880 					 * compounding the problem.
1881 					 */
1882 					(void)casuword32(&m->m_owner,
1883 					    id | UMUTEX_CONTESTED,
1884 					    old_owner);
1885 				}
1886 				if (error == 0 &&
1887 				    old_owner == UMUTEX_RB_OWNERDEAD)
1888 					error = EOWNERDEAD;
1889 				break;
1890 			}
1891 
1892 			error = umtxq_check_susp(td);
1893 			if (error != 0)
1894 				break;
1895 
1896 			/* If this failed the lock has changed, restart. */
1897 			continue;
1898 		}
1899 
1900 		if ((owner & ~UMUTEX_CONTESTED) == id) {
1901 			error = EDEADLK;
1902 			break;
1903 		}
1904 
1905 		if (try != 0) {
1906 			error = EBUSY;
1907 			break;
1908 		}
1909 
1910 		/*
1911 		 * If we caught a signal, we have retried and now
1912 		 * exit immediately.
1913 		 */
1914 		if (error != 0)
1915 			break;
1916 
1917 		umtxq_lock(&uq->uq_key);
1918 		umtxq_busy(&uq->uq_key);
1919 		umtxq_unlock(&uq->uq_key);
1920 
1921 		/*
1922 		 * Set the contested bit so that a release in user space
1923 		 * knows to use the system call for unlock.  If this fails
1924 		 * either some one else has acquired the lock or it has been
1925 		 * released.
1926 		 */
1927 		rv = casueword32(&m->m_owner, owner, &old, owner |
1928 		    UMUTEX_CONTESTED);
1929 
1930 		/* The address was invalid. */
1931 		if (rv == -1) {
1932 			umtxq_unbusy_unlocked(&uq->uq_key);
1933 			error = EFAULT;
1934 			break;
1935 		}
1936 
1937 		umtxq_lock(&uq->uq_key);
1938 		/*
1939 		 * We set the contested bit, sleep. Otherwise the lock changed
1940 		 * and we need to retry or we lost a race to the thread
1941 		 * unlocking the umtx.  Note that the UMUTEX_RB_OWNERDEAD
1942 		 * value for owner is impossible there.
1943 		 */
1944 		if (old == owner) {
1945 			error = umtxq_sleep_pi(uq, pi,
1946 			    owner & ~UMUTEX_CONTESTED,
1947 			    "umtxpi", timeout == NULL ? NULL : &timo,
1948 			    (flags & USYNC_PROCESS_SHARED) != 0);
1949 			if (error != 0)
1950 				continue;
1951 		} else {
1952 			umtxq_unbusy(&uq->uq_key);
1953 			umtxq_unlock(&uq->uq_key);
1954 		}
1955 
1956 		error = umtxq_check_susp(td);
1957 		if (error != 0)
1958 			break;
1959 	}
1960 
1961 	umtxq_lock(&uq->uq_key);
1962 	umtx_pi_unref(pi);
1963 	umtxq_unlock(&uq->uq_key);
1964 
1965 	umtx_key_release(&uq->uq_key);
1966 	return (error);
1967 }
1968 
1969 /*
1970  * Unlock a PI mutex.
1971  */
1972 static int
1973 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1974 {
1975 	struct umtx_key key;
1976 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1977 	struct umtx_pi *pi, *pi2;
1978 	uint32_t id, new_owner, old, owner;
1979 	int count, error, pri;
1980 
1981 	id = td->td_tid;
1982 	/*
1983 	 * Make sure we own this mtx.
1984 	 */
1985 	error = fueword32(&m->m_owner, &owner);
1986 	if (error == -1)
1987 		return (EFAULT);
1988 
1989 	if ((owner & ~UMUTEX_CONTESTED) != id)
1990 		return (EPERM);
1991 
1992 	new_owner = umtx_unlock_val(flags, rb);
1993 
1994 	/* This should be done in userland */
1995 	if ((owner & UMUTEX_CONTESTED) == 0) {
1996 		error = casueword32(&m->m_owner, owner, &old, new_owner);
1997 		if (error == -1)
1998 			return (EFAULT);
1999 		if (old == owner)
2000 			return (0);
2001 		owner = old;
2002 	}
2003 
2004 	/* We should only ever be in here for contested locks */
2005 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2006 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
2007 	    &key)) != 0)
2008 		return (error);
2009 
2010 	umtxq_lock(&key);
2011 	umtxq_busy(&key);
2012 	count = umtxq_count_pi(&key, &uq_first);
2013 	if (uq_first != NULL) {
2014 		mtx_lock(&umtx_lock);
2015 		pi = uq_first->uq_pi_blocked;
2016 		KASSERT(pi != NULL, ("pi == NULL?"));
2017 		if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) {
2018 			mtx_unlock(&umtx_lock);
2019 			umtxq_unbusy(&key);
2020 			umtxq_unlock(&key);
2021 			umtx_key_release(&key);
2022 			/* userland messed the mutex */
2023 			return (EPERM);
2024 		}
2025 		uq_me = td->td_umtxq;
2026 		if (pi->pi_owner == td)
2027 			umtx_pi_disown(pi);
2028 		/* get highest priority thread which is still sleeping. */
2029 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
2030 		while (uq_first != NULL &&
2031 		    (uq_first->uq_flags & UQF_UMTXQ) == 0) {
2032 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
2033 		}
2034 		pri = PRI_MAX;
2035 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
2036 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
2037 			if (uq_first2 != NULL) {
2038 				if (pri > UPRI(uq_first2->uq_thread))
2039 					pri = UPRI(uq_first2->uq_thread);
2040 			}
2041 		}
2042 		thread_lock(td);
2043 		sched_lend_user_prio(td, pri);
2044 		thread_unlock(td);
2045 		mtx_unlock(&umtx_lock);
2046 		if (uq_first)
2047 			umtxq_signal_thread(uq_first);
2048 	} else {
2049 		pi = umtx_pi_lookup(&key);
2050 		/*
2051 		 * A umtx_pi can exist if a signal or timeout removed the
2052 		 * last waiter from the umtxq, but there is still
2053 		 * a thread in do_lock_pi() holding the umtx_pi.
2054 		 */
2055 		if (pi != NULL) {
2056 			/*
2057 			 * The umtx_pi can be unowned, such as when a thread
2058 			 * has just entered do_lock_pi(), allocated the
2059 			 * umtx_pi, and unlocked the umtxq.
2060 			 * If the current thread owns it, it must disown it.
2061 			 */
2062 			mtx_lock(&umtx_lock);
2063 			if (pi->pi_owner == td)
2064 				umtx_pi_disown(pi);
2065 			mtx_unlock(&umtx_lock);
2066 		}
2067 	}
2068 	umtxq_unlock(&key);
2069 
2070 	/*
2071 	 * When unlocking the umtx, it must be marked as unowned if
2072 	 * there is zero or one thread only waiting for it.
2073 	 * Otherwise, it must be marked as contested.
2074 	 */
2075 
2076 	if (count > 1)
2077 		new_owner |= UMUTEX_CONTESTED;
2078 	error = casueword32(&m->m_owner, owner, &old, new_owner);
2079 
2080 	umtxq_unbusy_unlocked(&key);
2081 	umtx_key_release(&key);
2082 	if (error == -1)
2083 		return (EFAULT);
2084 	if (old != owner)
2085 		return (EINVAL);
2086 	return (0);
2087 }
2088 
2089 /*
2090  * Lock a PP mutex.
2091  */
2092 static int
2093 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2094     struct _umtx_time *timeout, int try)
2095 {
2096 	struct abs_timeout timo;
2097 	struct umtx_q *uq, *uq2;
2098 	struct umtx_pi *pi;
2099 	uint32_t ceiling;
2100 	uint32_t owner, id;
2101 	int error, pri, old_inherited_pri, su, rv;
2102 
2103 	id = td->td_tid;
2104 	uq = td->td_umtxq;
2105 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2106 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2107 	    &uq->uq_key)) != 0)
2108 		return (error);
2109 
2110 	if (timeout != NULL)
2111 		abs_timeout_init2(&timo, timeout);
2112 
2113 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2114 	for (;;) {
2115 		old_inherited_pri = uq->uq_inherited_pri;
2116 		umtxq_lock(&uq->uq_key);
2117 		umtxq_busy(&uq->uq_key);
2118 		umtxq_unlock(&uq->uq_key);
2119 
2120 		rv = fueword32(&m->m_ceilings[0], &ceiling);
2121 		if (rv == -1) {
2122 			error = EFAULT;
2123 			goto out;
2124 		}
2125 		ceiling = RTP_PRIO_MAX - ceiling;
2126 		if (ceiling > RTP_PRIO_MAX) {
2127 			error = EINVAL;
2128 			goto out;
2129 		}
2130 
2131 		mtx_lock(&umtx_lock);
2132 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2133 			mtx_unlock(&umtx_lock);
2134 			error = EINVAL;
2135 			goto out;
2136 		}
2137 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2138 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2139 			thread_lock(td);
2140 			if (uq->uq_inherited_pri < UPRI(td))
2141 				sched_lend_user_prio(td, uq->uq_inherited_pri);
2142 			thread_unlock(td);
2143 		}
2144 		mtx_unlock(&umtx_lock);
2145 
2146 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2147 		    id | UMUTEX_CONTESTED);
2148 		/* The address was invalid. */
2149 		if (rv == -1) {
2150 			error = EFAULT;
2151 			break;
2152 		}
2153 
2154 		if (owner == UMUTEX_CONTESTED) {
2155 			error = 0;
2156 			break;
2157 		} else if (owner == UMUTEX_RB_OWNERDEAD) {
2158 			rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD,
2159 			    &owner, id | UMUTEX_CONTESTED);
2160 			if (rv == -1) {
2161 				error = EFAULT;
2162 				break;
2163 			}
2164 			if (owner == UMUTEX_RB_OWNERDEAD) {
2165 				error = EOWNERDEAD; /* success */
2166 				break;
2167 			}
2168 			error = 0;
2169 		} else if (owner == UMUTEX_RB_NOTRECOV) {
2170 			error = ENOTRECOVERABLE;
2171 			break;
2172 		}
2173 
2174 		if (try != 0) {
2175 			error = EBUSY;
2176 			break;
2177 		}
2178 
2179 		/*
2180 		 * If we caught a signal, we have retried and now
2181 		 * exit immediately.
2182 		 */
2183 		if (error != 0)
2184 			break;
2185 
2186 		umtxq_lock(&uq->uq_key);
2187 		umtxq_insert(uq);
2188 		umtxq_unbusy(&uq->uq_key);
2189 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2190 		    NULL : &timo);
2191 		umtxq_remove(uq);
2192 		umtxq_unlock(&uq->uq_key);
2193 
2194 		mtx_lock(&umtx_lock);
2195 		uq->uq_inherited_pri = old_inherited_pri;
2196 		pri = PRI_MAX;
2197 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2198 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2199 			if (uq2 != NULL) {
2200 				if (pri > UPRI(uq2->uq_thread))
2201 					pri = UPRI(uq2->uq_thread);
2202 			}
2203 		}
2204 		if (pri > uq->uq_inherited_pri)
2205 			pri = uq->uq_inherited_pri;
2206 		thread_lock(td);
2207 		sched_lend_user_prio(td, pri);
2208 		thread_unlock(td);
2209 		mtx_unlock(&umtx_lock);
2210 	}
2211 
2212 	if (error != 0 && error != EOWNERDEAD) {
2213 		mtx_lock(&umtx_lock);
2214 		uq->uq_inherited_pri = old_inherited_pri;
2215 		pri = PRI_MAX;
2216 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2217 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2218 			if (uq2 != NULL) {
2219 				if (pri > UPRI(uq2->uq_thread))
2220 					pri = UPRI(uq2->uq_thread);
2221 			}
2222 		}
2223 		if (pri > uq->uq_inherited_pri)
2224 			pri = uq->uq_inherited_pri;
2225 		thread_lock(td);
2226 		sched_lend_user_prio(td, pri);
2227 		thread_unlock(td);
2228 		mtx_unlock(&umtx_lock);
2229 	}
2230 
2231 out:
2232 	umtxq_unbusy_unlocked(&uq->uq_key);
2233 	umtx_key_release(&uq->uq_key);
2234 	return (error);
2235 }
2236 
2237 /*
2238  * Unlock a PP mutex.
2239  */
2240 static int
2241 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
2242 {
2243 	struct umtx_key key;
2244 	struct umtx_q *uq, *uq2;
2245 	struct umtx_pi *pi;
2246 	uint32_t id, owner, rceiling;
2247 	int error, pri, new_inherited_pri, su;
2248 
2249 	id = td->td_tid;
2250 	uq = td->td_umtxq;
2251 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2252 
2253 	/*
2254 	 * Make sure we own this mtx.
2255 	 */
2256 	error = fueword32(&m->m_owner, &owner);
2257 	if (error == -1)
2258 		return (EFAULT);
2259 
2260 	if ((owner & ~UMUTEX_CONTESTED) != id)
2261 		return (EPERM);
2262 
2263 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2264 	if (error != 0)
2265 		return (error);
2266 
2267 	if (rceiling == -1)
2268 		new_inherited_pri = PRI_MAX;
2269 	else {
2270 		rceiling = RTP_PRIO_MAX - rceiling;
2271 		if (rceiling > RTP_PRIO_MAX)
2272 			return (EINVAL);
2273 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2274 	}
2275 
2276 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2277 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2278 	    &key)) != 0)
2279 		return (error);
2280 	umtxq_lock(&key);
2281 	umtxq_busy(&key);
2282 	umtxq_unlock(&key);
2283 	/*
2284 	 * For priority protected mutex, always set unlocked state
2285 	 * to UMUTEX_CONTESTED, so that userland always enters kernel
2286 	 * to lock the mutex, it is necessary because thread priority
2287 	 * has to be adjusted for such mutex.
2288 	 */
2289 	error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) |
2290 	    UMUTEX_CONTESTED);
2291 
2292 	umtxq_lock(&key);
2293 	if (error == 0)
2294 		umtxq_signal(&key, 1);
2295 	umtxq_unbusy(&key);
2296 	umtxq_unlock(&key);
2297 
2298 	if (error == -1)
2299 		error = EFAULT;
2300 	else {
2301 		mtx_lock(&umtx_lock);
2302 		if (su != 0)
2303 			uq->uq_inherited_pri = new_inherited_pri;
2304 		pri = PRI_MAX;
2305 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2306 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2307 			if (uq2 != NULL) {
2308 				if (pri > UPRI(uq2->uq_thread))
2309 					pri = UPRI(uq2->uq_thread);
2310 			}
2311 		}
2312 		if (pri > uq->uq_inherited_pri)
2313 			pri = uq->uq_inherited_pri;
2314 		thread_lock(td);
2315 		sched_lend_user_prio(td, pri);
2316 		thread_unlock(td);
2317 		mtx_unlock(&umtx_lock);
2318 	}
2319 	umtx_key_release(&key);
2320 	return (error);
2321 }
2322 
2323 static int
2324 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2325     uint32_t *old_ceiling)
2326 {
2327 	struct umtx_q *uq;
2328 	uint32_t flags, id, owner, save_ceiling;
2329 	int error, rv, rv1;
2330 
2331 	error = fueword32(&m->m_flags, &flags);
2332 	if (error == -1)
2333 		return (EFAULT);
2334 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2335 		return (EINVAL);
2336 	if (ceiling > RTP_PRIO_MAX)
2337 		return (EINVAL);
2338 	id = td->td_tid;
2339 	uq = td->td_umtxq;
2340 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2341 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2342 	    &uq->uq_key)) != 0)
2343 		return (error);
2344 	for (;;) {
2345 		umtxq_lock(&uq->uq_key);
2346 		umtxq_busy(&uq->uq_key);
2347 		umtxq_unlock(&uq->uq_key);
2348 
2349 		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2350 		if (rv == -1) {
2351 			error = EFAULT;
2352 			break;
2353 		}
2354 
2355 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2356 		    id | UMUTEX_CONTESTED);
2357 		if (rv == -1) {
2358 			error = EFAULT;
2359 			break;
2360 		}
2361 
2362 		if (owner == UMUTEX_CONTESTED) {
2363 			rv = suword32(&m->m_ceilings[0], ceiling);
2364 			rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED);
2365 			error = (rv == 0 && rv1 == 0) ? 0: EFAULT;
2366 			break;
2367 		}
2368 
2369 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2370 			rv = suword32(&m->m_ceilings[0], ceiling);
2371 			error = rv == 0 ? 0 : EFAULT;
2372 			break;
2373 		}
2374 
2375 		if (owner == UMUTEX_RB_OWNERDEAD) {
2376 			error = EOWNERDEAD;
2377 			break;
2378 		} else if (owner == UMUTEX_RB_NOTRECOV) {
2379 			error = ENOTRECOVERABLE;
2380 			break;
2381 		}
2382 
2383 		/*
2384 		 * If we caught a signal, we have retried and now
2385 		 * exit immediately.
2386 		 */
2387 		if (error != 0)
2388 			break;
2389 
2390 		/*
2391 		 * We set the contested bit, sleep. Otherwise the lock changed
2392 		 * and we need to retry or we lost a race to the thread
2393 		 * unlocking the umtx.
2394 		 */
2395 		umtxq_lock(&uq->uq_key);
2396 		umtxq_insert(uq);
2397 		umtxq_unbusy(&uq->uq_key);
2398 		error = umtxq_sleep(uq, "umtxpp", NULL);
2399 		umtxq_remove(uq);
2400 		umtxq_unlock(&uq->uq_key);
2401 	}
2402 	umtxq_lock(&uq->uq_key);
2403 	if (error == 0)
2404 		umtxq_signal(&uq->uq_key, INT_MAX);
2405 	umtxq_unbusy(&uq->uq_key);
2406 	umtxq_unlock(&uq->uq_key);
2407 	umtx_key_release(&uq->uq_key);
2408 	if (error == 0 && old_ceiling != NULL) {
2409 		rv = suword32(old_ceiling, save_ceiling);
2410 		error = rv == 0 ? 0 : EFAULT;
2411 	}
2412 	return (error);
2413 }
2414 
2415 /*
2416  * Lock a userland POSIX mutex.
2417  */
2418 static int
2419 do_lock_umutex(struct thread *td, struct umutex *m,
2420     struct _umtx_time *timeout, int mode)
2421 {
2422 	uint32_t flags;
2423 	int error;
2424 
2425 	error = fueword32(&m->m_flags, &flags);
2426 	if (error == -1)
2427 		return (EFAULT);
2428 
2429 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2430 	case 0:
2431 		error = do_lock_normal(td, m, flags, timeout, mode);
2432 		break;
2433 	case UMUTEX_PRIO_INHERIT:
2434 		error = do_lock_pi(td, m, flags, timeout, mode);
2435 		break;
2436 	case UMUTEX_PRIO_PROTECT:
2437 		error = do_lock_pp(td, m, flags, timeout, mode);
2438 		break;
2439 	default:
2440 		return (EINVAL);
2441 	}
2442 	if (timeout == NULL) {
2443 		if (error == EINTR && mode != _UMUTEX_WAIT)
2444 			error = ERESTART;
2445 	} else {
2446 		/* Timed-locking is not restarted. */
2447 		if (error == ERESTART)
2448 			error = EINTR;
2449 	}
2450 	return (error);
2451 }
2452 
2453 /*
2454  * Unlock a userland POSIX mutex.
2455  */
2456 static int
2457 do_unlock_umutex(struct thread *td, struct umutex *m, bool rb)
2458 {
2459 	uint32_t flags;
2460 	int error;
2461 
2462 	error = fueword32(&m->m_flags, &flags);
2463 	if (error == -1)
2464 		return (EFAULT);
2465 
2466 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2467 	case 0:
2468 		return (do_unlock_normal(td, m, flags, rb));
2469 	case UMUTEX_PRIO_INHERIT:
2470 		return (do_unlock_pi(td, m, flags, rb));
2471 	case UMUTEX_PRIO_PROTECT:
2472 		return (do_unlock_pp(td, m, flags, rb));
2473 	}
2474 
2475 	return (EINVAL);
2476 }
2477 
2478 static int
2479 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2480     struct timespec *timeout, u_long wflags)
2481 {
2482 	struct abs_timeout timo;
2483 	struct umtx_q *uq;
2484 	uint32_t flags, clockid, hasw;
2485 	int error;
2486 
2487 	uq = td->td_umtxq;
2488 	error = fueword32(&cv->c_flags, &flags);
2489 	if (error == -1)
2490 		return (EFAULT);
2491 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2492 	if (error != 0)
2493 		return (error);
2494 
2495 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2496 		error = fueword32(&cv->c_clockid, &clockid);
2497 		if (error == -1) {
2498 			umtx_key_release(&uq->uq_key);
2499 			return (EFAULT);
2500 		}
2501 		if (clockid < CLOCK_REALTIME ||
2502 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2503 			/* hmm, only HW clock id will work. */
2504 			umtx_key_release(&uq->uq_key);
2505 			return (EINVAL);
2506 		}
2507 	} else {
2508 		clockid = CLOCK_REALTIME;
2509 	}
2510 
2511 	umtxq_lock(&uq->uq_key);
2512 	umtxq_busy(&uq->uq_key);
2513 	umtxq_insert(uq);
2514 	umtxq_unlock(&uq->uq_key);
2515 
2516 	/*
2517 	 * Set c_has_waiters to 1 before releasing user mutex, also
2518 	 * don't modify cache line when unnecessary.
2519 	 */
2520 	error = fueword32(&cv->c_has_waiters, &hasw);
2521 	if (error == 0 && hasw == 0)
2522 		suword32(&cv->c_has_waiters, 1);
2523 
2524 	umtxq_unbusy_unlocked(&uq->uq_key);
2525 
2526 	error = do_unlock_umutex(td, m, false);
2527 
2528 	if (timeout != NULL)
2529 		abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0,
2530 		    timeout);
2531 
2532 	umtxq_lock(&uq->uq_key);
2533 	if (error == 0) {
2534 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2535 		    NULL : &timo);
2536 	}
2537 
2538 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2539 		error = 0;
2540 	else {
2541 		/*
2542 		 * This must be timeout,interrupted by signal or
2543 		 * surprious wakeup, clear c_has_waiter flag when
2544 		 * necessary.
2545 		 */
2546 		umtxq_busy(&uq->uq_key);
2547 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2548 			int oldlen = uq->uq_cur_queue->length;
2549 			umtxq_remove(uq);
2550 			if (oldlen == 1) {
2551 				umtxq_unlock(&uq->uq_key);
2552 				suword32(&cv->c_has_waiters, 0);
2553 				umtxq_lock(&uq->uq_key);
2554 			}
2555 		}
2556 		umtxq_unbusy(&uq->uq_key);
2557 		if (error == ERESTART)
2558 			error = EINTR;
2559 	}
2560 
2561 	umtxq_unlock(&uq->uq_key);
2562 	umtx_key_release(&uq->uq_key);
2563 	return (error);
2564 }
2565 
2566 /*
2567  * Signal a userland condition variable.
2568  */
2569 static int
2570 do_cv_signal(struct thread *td, struct ucond *cv)
2571 {
2572 	struct umtx_key key;
2573 	int error, cnt, nwake;
2574 	uint32_t flags;
2575 
2576 	error = fueword32(&cv->c_flags, &flags);
2577 	if (error == -1)
2578 		return (EFAULT);
2579 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2580 		return (error);
2581 	umtxq_lock(&key);
2582 	umtxq_busy(&key);
2583 	cnt = umtxq_count(&key);
2584 	nwake = umtxq_signal(&key, 1);
2585 	if (cnt <= nwake) {
2586 		umtxq_unlock(&key);
2587 		error = suword32(&cv->c_has_waiters, 0);
2588 		if (error == -1)
2589 			error = EFAULT;
2590 		umtxq_lock(&key);
2591 	}
2592 	umtxq_unbusy(&key);
2593 	umtxq_unlock(&key);
2594 	umtx_key_release(&key);
2595 	return (error);
2596 }
2597 
2598 static int
2599 do_cv_broadcast(struct thread *td, struct ucond *cv)
2600 {
2601 	struct umtx_key key;
2602 	int error;
2603 	uint32_t flags;
2604 
2605 	error = fueword32(&cv->c_flags, &flags);
2606 	if (error == -1)
2607 		return (EFAULT);
2608 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2609 		return (error);
2610 
2611 	umtxq_lock(&key);
2612 	umtxq_busy(&key);
2613 	umtxq_signal(&key, INT_MAX);
2614 	umtxq_unlock(&key);
2615 
2616 	error = suword32(&cv->c_has_waiters, 0);
2617 	if (error == -1)
2618 		error = EFAULT;
2619 
2620 	umtxq_unbusy_unlocked(&key);
2621 
2622 	umtx_key_release(&key);
2623 	return (error);
2624 }
2625 
2626 static int
2627 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2628 {
2629 	struct abs_timeout timo;
2630 	struct umtx_q *uq;
2631 	uint32_t flags, wrflags;
2632 	int32_t state, oldstate;
2633 	int32_t blocked_readers;
2634 	int error, error1, rv;
2635 
2636 	uq = td->td_umtxq;
2637 	error = fueword32(&rwlock->rw_flags, &flags);
2638 	if (error == -1)
2639 		return (EFAULT);
2640 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2641 	if (error != 0)
2642 		return (error);
2643 
2644 	if (timeout != NULL)
2645 		abs_timeout_init2(&timo, timeout);
2646 
2647 	wrflags = URWLOCK_WRITE_OWNER;
2648 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2649 		wrflags |= URWLOCK_WRITE_WAITERS;
2650 
2651 	for (;;) {
2652 		rv = fueword32(&rwlock->rw_state, &state);
2653 		if (rv == -1) {
2654 			umtx_key_release(&uq->uq_key);
2655 			return (EFAULT);
2656 		}
2657 
2658 		/* try to lock it */
2659 		while (!(state & wrflags)) {
2660 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2661 				umtx_key_release(&uq->uq_key);
2662 				return (EAGAIN);
2663 			}
2664 			rv = casueword32(&rwlock->rw_state, state,
2665 			    &oldstate, state + 1);
2666 			if (rv == -1) {
2667 				umtx_key_release(&uq->uq_key);
2668 				return (EFAULT);
2669 			}
2670 			if (oldstate == state) {
2671 				umtx_key_release(&uq->uq_key);
2672 				return (0);
2673 			}
2674 			error = umtxq_check_susp(td);
2675 			if (error != 0)
2676 				break;
2677 			state = oldstate;
2678 		}
2679 
2680 		if (error)
2681 			break;
2682 
2683 		/* grab monitor lock */
2684 		umtxq_lock(&uq->uq_key);
2685 		umtxq_busy(&uq->uq_key);
2686 		umtxq_unlock(&uq->uq_key);
2687 
2688 		/*
2689 		 * re-read the state, in case it changed between the try-lock above
2690 		 * and the check below
2691 		 */
2692 		rv = fueword32(&rwlock->rw_state, &state);
2693 		if (rv == -1)
2694 			error = EFAULT;
2695 
2696 		/* set read contention bit */
2697 		while (error == 0 && (state & wrflags) &&
2698 		    !(state & URWLOCK_READ_WAITERS)) {
2699 			rv = casueword32(&rwlock->rw_state, state,
2700 			    &oldstate, state | URWLOCK_READ_WAITERS);
2701 			if (rv == -1) {
2702 				error = EFAULT;
2703 				break;
2704 			}
2705 			if (oldstate == state)
2706 				goto sleep;
2707 			state = oldstate;
2708 			error = umtxq_check_susp(td);
2709 			if (error != 0)
2710 				break;
2711 		}
2712 		if (error != 0) {
2713 			umtxq_unbusy_unlocked(&uq->uq_key);
2714 			break;
2715 		}
2716 
2717 		/* state is changed while setting flags, restart */
2718 		if (!(state & wrflags)) {
2719 			umtxq_unbusy_unlocked(&uq->uq_key);
2720 			error = umtxq_check_susp(td);
2721 			if (error != 0)
2722 				break;
2723 			continue;
2724 		}
2725 
2726 sleep:
2727 		/* contention bit is set, before sleeping, increase read waiter count */
2728 		rv = fueword32(&rwlock->rw_blocked_readers,
2729 		    &blocked_readers);
2730 		if (rv == -1) {
2731 			umtxq_unbusy_unlocked(&uq->uq_key);
2732 			error = EFAULT;
2733 			break;
2734 		}
2735 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2736 
2737 		while (state & wrflags) {
2738 			umtxq_lock(&uq->uq_key);
2739 			umtxq_insert(uq);
2740 			umtxq_unbusy(&uq->uq_key);
2741 
2742 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2743 			    NULL : &timo);
2744 
2745 			umtxq_busy(&uq->uq_key);
2746 			umtxq_remove(uq);
2747 			umtxq_unlock(&uq->uq_key);
2748 			if (error)
2749 				break;
2750 			rv = fueword32(&rwlock->rw_state, &state);
2751 			if (rv == -1) {
2752 				error = EFAULT;
2753 				break;
2754 			}
2755 		}
2756 
2757 		/* decrease read waiter count, and may clear read contention bit */
2758 		rv = fueword32(&rwlock->rw_blocked_readers,
2759 		    &blocked_readers);
2760 		if (rv == -1) {
2761 			umtxq_unbusy_unlocked(&uq->uq_key);
2762 			error = EFAULT;
2763 			break;
2764 		}
2765 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2766 		if (blocked_readers == 1) {
2767 			rv = fueword32(&rwlock->rw_state, &state);
2768 			if (rv == -1) {
2769 				umtxq_unbusy_unlocked(&uq->uq_key);
2770 				error = EFAULT;
2771 				break;
2772 			}
2773 			for (;;) {
2774 				rv = casueword32(&rwlock->rw_state, state,
2775 				    &oldstate, state & ~URWLOCK_READ_WAITERS);
2776 				if (rv == -1) {
2777 					error = EFAULT;
2778 					break;
2779 				}
2780 				if (oldstate == state)
2781 					break;
2782 				state = oldstate;
2783 				error1 = umtxq_check_susp(td);
2784 				if (error1 != 0) {
2785 					if (error == 0)
2786 						error = error1;
2787 					break;
2788 				}
2789 			}
2790 		}
2791 
2792 		umtxq_unbusy_unlocked(&uq->uq_key);
2793 		if (error != 0)
2794 			break;
2795 	}
2796 	umtx_key_release(&uq->uq_key);
2797 	if (error == ERESTART)
2798 		error = EINTR;
2799 	return (error);
2800 }
2801 
2802 static int
2803 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2804 {
2805 	struct abs_timeout timo;
2806 	struct umtx_q *uq;
2807 	uint32_t flags;
2808 	int32_t state, oldstate;
2809 	int32_t blocked_writers;
2810 	int32_t blocked_readers;
2811 	int error, error1, rv;
2812 
2813 	uq = td->td_umtxq;
2814 	error = fueword32(&rwlock->rw_flags, &flags);
2815 	if (error == -1)
2816 		return (EFAULT);
2817 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2818 	if (error != 0)
2819 		return (error);
2820 
2821 	if (timeout != NULL)
2822 		abs_timeout_init2(&timo, timeout);
2823 
2824 	blocked_readers = 0;
2825 	for (;;) {
2826 		rv = fueword32(&rwlock->rw_state, &state);
2827 		if (rv == -1) {
2828 			umtx_key_release(&uq->uq_key);
2829 			return (EFAULT);
2830 		}
2831 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2832 			rv = casueword32(&rwlock->rw_state, state,
2833 			    &oldstate, state | URWLOCK_WRITE_OWNER);
2834 			if (rv == -1) {
2835 				umtx_key_release(&uq->uq_key);
2836 				return (EFAULT);
2837 			}
2838 			if (oldstate == state) {
2839 				umtx_key_release(&uq->uq_key);
2840 				return (0);
2841 			}
2842 			state = oldstate;
2843 			error = umtxq_check_susp(td);
2844 			if (error != 0)
2845 				break;
2846 		}
2847 
2848 		if (error) {
2849 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2850 			    blocked_readers != 0) {
2851 				umtxq_lock(&uq->uq_key);
2852 				umtxq_busy(&uq->uq_key);
2853 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2854 				umtxq_unbusy(&uq->uq_key);
2855 				umtxq_unlock(&uq->uq_key);
2856 			}
2857 
2858 			break;
2859 		}
2860 
2861 		/* grab monitor lock */
2862 		umtxq_lock(&uq->uq_key);
2863 		umtxq_busy(&uq->uq_key);
2864 		umtxq_unlock(&uq->uq_key);
2865 
2866 		/*
2867 		 * re-read the state, in case it changed between the try-lock above
2868 		 * and the check below
2869 		 */
2870 		rv = fueword32(&rwlock->rw_state, &state);
2871 		if (rv == -1)
2872 			error = EFAULT;
2873 
2874 		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
2875 		    URWLOCK_READER_COUNT(state) != 0) &&
2876 		    (state & URWLOCK_WRITE_WAITERS) == 0) {
2877 			rv = casueword32(&rwlock->rw_state, state,
2878 			    &oldstate, state | URWLOCK_WRITE_WAITERS);
2879 			if (rv == -1) {
2880 				error = EFAULT;
2881 				break;
2882 			}
2883 			if (oldstate == state)
2884 				goto sleep;
2885 			state = oldstate;
2886 			error = umtxq_check_susp(td);
2887 			if (error != 0)
2888 				break;
2889 		}
2890 		if (error != 0) {
2891 			umtxq_unbusy_unlocked(&uq->uq_key);
2892 			break;
2893 		}
2894 
2895 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2896 			umtxq_unbusy_unlocked(&uq->uq_key);
2897 			error = umtxq_check_susp(td);
2898 			if (error != 0)
2899 				break;
2900 			continue;
2901 		}
2902 sleep:
2903 		rv = fueword32(&rwlock->rw_blocked_writers,
2904 		    &blocked_writers);
2905 		if (rv == -1) {
2906 			umtxq_unbusy_unlocked(&uq->uq_key);
2907 			error = EFAULT;
2908 			break;
2909 		}
2910 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2911 
2912 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2913 			umtxq_lock(&uq->uq_key);
2914 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2915 			umtxq_unbusy(&uq->uq_key);
2916 
2917 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2918 			    NULL : &timo);
2919 
2920 			umtxq_busy(&uq->uq_key);
2921 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2922 			umtxq_unlock(&uq->uq_key);
2923 			if (error)
2924 				break;
2925 			rv = fueword32(&rwlock->rw_state, &state);
2926 			if (rv == -1) {
2927 				error = EFAULT;
2928 				break;
2929 			}
2930 		}
2931 
2932 		rv = fueword32(&rwlock->rw_blocked_writers,
2933 		    &blocked_writers);
2934 		if (rv == -1) {
2935 			umtxq_unbusy_unlocked(&uq->uq_key);
2936 			error = EFAULT;
2937 			break;
2938 		}
2939 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2940 		if (blocked_writers == 1) {
2941 			rv = fueword32(&rwlock->rw_state, &state);
2942 			if (rv == -1) {
2943 				umtxq_unbusy_unlocked(&uq->uq_key);
2944 				error = EFAULT;
2945 				break;
2946 			}
2947 			for (;;) {
2948 				rv = casueword32(&rwlock->rw_state, state,
2949 				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
2950 				if (rv == -1) {
2951 					error = EFAULT;
2952 					break;
2953 				}
2954 				if (oldstate == state)
2955 					break;
2956 				state = oldstate;
2957 				error1 = umtxq_check_susp(td);
2958 				/*
2959 				 * We are leaving the URWLOCK_WRITE_WAITERS
2960 				 * behind, but this should not harm the
2961 				 * correctness.
2962 				 */
2963 				if (error1 != 0) {
2964 					if (error == 0)
2965 						error = error1;
2966 					break;
2967 				}
2968 			}
2969 			rv = fueword32(&rwlock->rw_blocked_readers,
2970 			    &blocked_readers);
2971 			if (rv == -1) {
2972 				umtxq_unbusy_unlocked(&uq->uq_key);
2973 				error = EFAULT;
2974 				break;
2975 			}
2976 		} else
2977 			blocked_readers = 0;
2978 
2979 		umtxq_unbusy_unlocked(&uq->uq_key);
2980 	}
2981 
2982 	umtx_key_release(&uq->uq_key);
2983 	if (error == ERESTART)
2984 		error = EINTR;
2985 	return (error);
2986 }
2987 
2988 static int
2989 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2990 {
2991 	struct umtx_q *uq;
2992 	uint32_t flags;
2993 	int32_t state, oldstate;
2994 	int error, rv, q, count;
2995 
2996 	uq = td->td_umtxq;
2997 	error = fueword32(&rwlock->rw_flags, &flags);
2998 	if (error == -1)
2999 		return (EFAULT);
3000 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
3001 	if (error != 0)
3002 		return (error);
3003 
3004 	error = fueword32(&rwlock->rw_state, &state);
3005 	if (error == -1) {
3006 		error = EFAULT;
3007 		goto out;
3008 	}
3009 	if (state & URWLOCK_WRITE_OWNER) {
3010 		for (;;) {
3011 			rv = casueword32(&rwlock->rw_state, state,
3012 			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
3013 			if (rv == -1) {
3014 				error = EFAULT;
3015 				goto out;
3016 			}
3017 			if (oldstate != state) {
3018 				state = oldstate;
3019 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
3020 					error = EPERM;
3021 					goto out;
3022 				}
3023 				error = umtxq_check_susp(td);
3024 				if (error != 0)
3025 					goto out;
3026 			} else
3027 				break;
3028 		}
3029 	} else if (URWLOCK_READER_COUNT(state) != 0) {
3030 		for (;;) {
3031 			rv = casueword32(&rwlock->rw_state, state,
3032 			    &oldstate, state - 1);
3033 			if (rv == -1) {
3034 				error = EFAULT;
3035 				goto out;
3036 			}
3037 			if (oldstate != state) {
3038 				state = oldstate;
3039 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
3040 					error = EPERM;
3041 					goto out;
3042 				}
3043 				error = umtxq_check_susp(td);
3044 				if (error != 0)
3045 					goto out;
3046 			} else
3047 				break;
3048 		}
3049 	} else {
3050 		error = EPERM;
3051 		goto out;
3052 	}
3053 
3054 	count = 0;
3055 
3056 	if (!(flags & URWLOCK_PREFER_READER)) {
3057 		if (state & URWLOCK_WRITE_WAITERS) {
3058 			count = 1;
3059 			q = UMTX_EXCLUSIVE_QUEUE;
3060 		} else if (state & URWLOCK_READ_WAITERS) {
3061 			count = INT_MAX;
3062 			q = UMTX_SHARED_QUEUE;
3063 		}
3064 	} else {
3065 		if (state & URWLOCK_READ_WAITERS) {
3066 			count = INT_MAX;
3067 			q = UMTX_SHARED_QUEUE;
3068 		} else if (state & URWLOCK_WRITE_WAITERS) {
3069 			count = 1;
3070 			q = UMTX_EXCLUSIVE_QUEUE;
3071 		}
3072 	}
3073 
3074 	if (count) {
3075 		umtxq_lock(&uq->uq_key);
3076 		umtxq_busy(&uq->uq_key);
3077 		umtxq_signal_queue(&uq->uq_key, count, q);
3078 		umtxq_unbusy(&uq->uq_key);
3079 		umtxq_unlock(&uq->uq_key);
3080 	}
3081 out:
3082 	umtx_key_release(&uq->uq_key);
3083 	return (error);
3084 }
3085 
3086 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3087 static int
3088 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
3089 {
3090 	struct abs_timeout timo;
3091 	struct umtx_q *uq;
3092 	uint32_t flags, count, count1;
3093 	int error, rv;
3094 
3095 	uq = td->td_umtxq;
3096 	error = fueword32(&sem->_flags, &flags);
3097 	if (error == -1)
3098 		return (EFAULT);
3099 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3100 	if (error != 0)
3101 		return (error);
3102 
3103 	if (timeout != NULL)
3104 		abs_timeout_init2(&timo, timeout);
3105 
3106 	umtxq_lock(&uq->uq_key);
3107 	umtxq_busy(&uq->uq_key);
3108 	umtxq_insert(uq);
3109 	umtxq_unlock(&uq->uq_key);
3110 	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
3111 	if (rv == 0)
3112 		rv = fueword32(&sem->_count, &count);
3113 	if (rv == -1 || count != 0) {
3114 		umtxq_lock(&uq->uq_key);
3115 		umtxq_unbusy(&uq->uq_key);
3116 		umtxq_remove(uq);
3117 		umtxq_unlock(&uq->uq_key);
3118 		umtx_key_release(&uq->uq_key);
3119 		return (rv == -1 ? EFAULT : 0);
3120 	}
3121 	umtxq_lock(&uq->uq_key);
3122 	umtxq_unbusy(&uq->uq_key);
3123 
3124 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3125 
3126 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3127 		error = 0;
3128 	else {
3129 		umtxq_remove(uq);
3130 		/* A relative timeout cannot be restarted. */
3131 		if (error == ERESTART && timeout != NULL &&
3132 		    (timeout->_flags & UMTX_ABSTIME) == 0)
3133 			error = EINTR;
3134 	}
3135 	umtxq_unlock(&uq->uq_key);
3136 	umtx_key_release(&uq->uq_key);
3137 	return (error);
3138 }
3139 
3140 /*
3141  * Signal a userland semaphore.
3142  */
3143 static int
3144 do_sem_wake(struct thread *td, struct _usem *sem)
3145 {
3146 	struct umtx_key key;
3147 	int error, cnt;
3148 	uint32_t flags;
3149 
3150 	error = fueword32(&sem->_flags, &flags);
3151 	if (error == -1)
3152 		return (EFAULT);
3153 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3154 		return (error);
3155 	umtxq_lock(&key);
3156 	umtxq_busy(&key);
3157 	cnt = umtxq_count(&key);
3158 	if (cnt > 0) {
3159 		/*
3160 		 * Check if count is greater than 0, this means the memory is
3161 		 * still being referenced by user code, so we can safely
3162 		 * update _has_waiters flag.
3163 		 */
3164 		if (cnt == 1) {
3165 			umtxq_unlock(&key);
3166 			error = suword32(&sem->_has_waiters, 0);
3167 			umtxq_lock(&key);
3168 			if (error == -1)
3169 				error = EFAULT;
3170 		}
3171 		umtxq_signal(&key, 1);
3172 	}
3173 	umtxq_unbusy(&key);
3174 	umtxq_unlock(&key);
3175 	umtx_key_release(&key);
3176 	return (error);
3177 }
3178 #endif
3179 
3180 static int
3181 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
3182 {
3183 	struct abs_timeout timo;
3184 	struct umtx_q *uq;
3185 	uint32_t count, flags;
3186 	int error, rv;
3187 
3188 	uq = td->td_umtxq;
3189 	flags = fuword32(&sem->_flags);
3190 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3191 	if (error != 0)
3192 		return (error);
3193 
3194 	if (timeout != NULL)
3195 		abs_timeout_init2(&timo, timeout);
3196 
3197 	umtxq_lock(&uq->uq_key);
3198 	umtxq_busy(&uq->uq_key);
3199 	umtxq_insert(uq);
3200 	umtxq_unlock(&uq->uq_key);
3201 	rv = fueword32(&sem->_count, &count);
3202 	if (rv == -1) {
3203 		umtxq_lock(&uq->uq_key);
3204 		umtxq_unbusy(&uq->uq_key);
3205 		umtxq_remove(uq);
3206 		umtxq_unlock(&uq->uq_key);
3207 		umtx_key_release(&uq->uq_key);
3208 		return (EFAULT);
3209 	}
3210 	for (;;) {
3211 		if (USEM_COUNT(count) != 0) {
3212 			umtxq_lock(&uq->uq_key);
3213 			umtxq_unbusy(&uq->uq_key);
3214 			umtxq_remove(uq);
3215 			umtxq_unlock(&uq->uq_key);
3216 			umtx_key_release(&uq->uq_key);
3217 			return (0);
3218 		}
3219 		if (count == USEM_HAS_WAITERS)
3220 			break;
3221 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
3222 		if (rv == -1) {
3223 			umtxq_lock(&uq->uq_key);
3224 			umtxq_unbusy(&uq->uq_key);
3225 			umtxq_remove(uq);
3226 			umtxq_unlock(&uq->uq_key);
3227 			umtx_key_release(&uq->uq_key);
3228 			return (EFAULT);
3229 		}
3230 		if (count == 0)
3231 			break;
3232 	}
3233 	umtxq_lock(&uq->uq_key);
3234 	umtxq_unbusy(&uq->uq_key);
3235 
3236 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3237 
3238 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3239 		error = 0;
3240 	else {
3241 		umtxq_remove(uq);
3242 		if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) {
3243 			/* A relative timeout cannot be restarted. */
3244 			if (error == ERESTART)
3245 				error = EINTR;
3246 			if (error == EINTR) {
3247 				abs_timeout_update(&timo);
3248 				timespecsub(&timo.end, &timo.cur,
3249 				    &timeout->_timeout);
3250 			}
3251 		}
3252 	}
3253 	umtxq_unlock(&uq->uq_key);
3254 	umtx_key_release(&uq->uq_key);
3255 	return (error);
3256 }
3257 
3258 /*
3259  * Signal a userland semaphore.
3260  */
3261 static int
3262 do_sem2_wake(struct thread *td, struct _usem2 *sem)
3263 {
3264 	struct umtx_key key;
3265 	int error, cnt, rv;
3266 	uint32_t count, flags;
3267 
3268 	rv = fueword32(&sem->_flags, &flags);
3269 	if (rv == -1)
3270 		return (EFAULT);
3271 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3272 		return (error);
3273 	umtxq_lock(&key);
3274 	umtxq_busy(&key);
3275 	cnt = umtxq_count(&key);
3276 	if (cnt > 0) {
3277 		/*
3278 		 * If this was the last sleeping thread, clear the waiters
3279 		 * flag in _count.
3280 		 */
3281 		if (cnt == 1) {
3282 			umtxq_unlock(&key);
3283 			rv = fueword32(&sem->_count, &count);
3284 			while (rv != -1 && count & USEM_HAS_WAITERS)
3285 				rv = casueword32(&sem->_count, count, &count,
3286 				    count & ~USEM_HAS_WAITERS);
3287 			if (rv == -1)
3288 				error = EFAULT;
3289 			umtxq_lock(&key);
3290 		}
3291 
3292 		umtxq_signal(&key, 1);
3293 	}
3294 	umtxq_unbusy(&key);
3295 	umtxq_unlock(&key);
3296 	umtx_key_release(&key);
3297 	return (error);
3298 }
3299 
3300 inline int
3301 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3302 {
3303 	int error;
3304 
3305 	error = copyin(addr, tsp, sizeof(struct timespec));
3306 	if (error == 0) {
3307 		if (tsp->tv_sec < 0 ||
3308 		    tsp->tv_nsec >= 1000000000 ||
3309 		    tsp->tv_nsec < 0)
3310 			error = EINVAL;
3311 	}
3312 	return (error);
3313 }
3314 
3315 static inline int
3316 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3317 {
3318 	int error;
3319 
3320 	if (size <= sizeof(struct timespec)) {
3321 		tp->_clockid = CLOCK_REALTIME;
3322 		tp->_flags = 0;
3323 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3324 	} else
3325 		error = copyin(addr, tp, sizeof(struct _umtx_time));
3326 	if (error != 0)
3327 		return (error);
3328 	if (tp->_timeout.tv_sec < 0 ||
3329 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3330 		return (EINVAL);
3331 	return (0);
3332 }
3333 
3334 static int
3335 __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
3336 {
3337 
3338 	return (EOPNOTSUPP);
3339 }
3340 
3341 static int
3342 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3343 {
3344 	struct _umtx_time timeout, *tm_p;
3345 	int error;
3346 
3347 	if (uap->uaddr2 == NULL)
3348 		tm_p = NULL;
3349 	else {
3350 		error = umtx_copyin_umtx_time(
3351 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3352 		if (error != 0)
3353 			return (error);
3354 		tm_p = &timeout;
3355 	}
3356 	return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0));
3357 }
3358 
3359 static int
3360 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3361 {
3362 	struct _umtx_time timeout, *tm_p;
3363 	int error;
3364 
3365 	if (uap->uaddr2 == NULL)
3366 		tm_p = NULL;
3367 	else {
3368 		error = umtx_copyin_umtx_time(
3369 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3370 		if (error != 0)
3371 			return (error);
3372 		tm_p = &timeout;
3373 	}
3374 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
3375 }
3376 
3377 static int
3378 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3379 {
3380 	struct _umtx_time *tm_p, timeout;
3381 	int error;
3382 
3383 	if (uap->uaddr2 == NULL)
3384 		tm_p = NULL;
3385 	else {
3386 		error = umtx_copyin_umtx_time(
3387 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3388 		if (error != 0)
3389 			return (error);
3390 		tm_p = &timeout;
3391 	}
3392 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
3393 }
3394 
3395 static int
3396 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3397 {
3398 
3399 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3400 }
3401 
3402 #define BATCH_SIZE	128
3403 static int
3404 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3405 {
3406 	char *uaddrs[BATCH_SIZE], **upp;
3407 	int count, error, i, pos, tocopy;
3408 
3409 	upp = (char **)uap->obj;
3410 	error = 0;
3411 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
3412 	    pos += tocopy) {
3413 		tocopy = MIN(count, BATCH_SIZE);
3414 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
3415 		if (error != 0)
3416 			break;
3417 		for (i = 0; i < tocopy; ++i)
3418 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3419 		maybe_yield();
3420 	}
3421 	return (error);
3422 }
3423 
3424 static int
3425 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3426 {
3427 
3428 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3429 }
3430 
3431 static int
3432 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3433 {
3434 	struct _umtx_time *tm_p, timeout;
3435 	int error;
3436 
3437 	/* Allow a null timespec (wait forever). */
3438 	if (uap->uaddr2 == NULL)
3439 		tm_p = NULL;
3440 	else {
3441 		error = umtx_copyin_umtx_time(
3442 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3443 		if (error != 0)
3444 			return (error);
3445 		tm_p = &timeout;
3446 	}
3447 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
3448 }
3449 
3450 static int
3451 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3452 {
3453 
3454 	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3455 }
3456 
3457 static int
3458 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3459 {
3460 	struct _umtx_time *tm_p, timeout;
3461 	int error;
3462 
3463 	/* Allow a null timespec (wait forever). */
3464 	if (uap->uaddr2 == NULL)
3465 		tm_p = NULL;
3466 	else {
3467 		error = umtx_copyin_umtx_time(
3468 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3469 		if (error != 0)
3470 			return (error);
3471 		tm_p = &timeout;
3472 	}
3473 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
3474 }
3475 
3476 static int
3477 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3478 {
3479 
3480 	return (do_wake_umutex(td, uap->obj));
3481 }
3482 
3483 static int
3484 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3485 {
3486 
3487 	return (do_unlock_umutex(td, uap->obj, false));
3488 }
3489 
3490 static int
3491 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3492 {
3493 
3494 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3495 }
3496 
3497 static int
3498 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3499 {
3500 	struct timespec *ts, timeout;
3501 	int error;
3502 
3503 	/* Allow a null timespec (wait forever). */
3504 	if (uap->uaddr2 == NULL)
3505 		ts = NULL;
3506 	else {
3507 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3508 		if (error != 0)
3509 			return (error);
3510 		ts = &timeout;
3511 	}
3512 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3513 }
3514 
3515 static int
3516 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3517 {
3518 
3519 	return (do_cv_signal(td, uap->obj));
3520 }
3521 
3522 static int
3523 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3524 {
3525 
3526 	return (do_cv_broadcast(td, uap->obj));
3527 }
3528 
3529 static int
3530 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3531 {
3532 	struct _umtx_time timeout;
3533 	int error;
3534 
3535 	/* Allow a null timespec (wait forever). */
3536 	if (uap->uaddr2 == NULL) {
3537 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3538 	} else {
3539 		error = umtx_copyin_umtx_time(uap->uaddr2,
3540 		   (size_t)uap->uaddr1, &timeout);
3541 		if (error != 0)
3542 			return (error);
3543 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3544 	}
3545 	return (error);
3546 }
3547 
3548 static int
3549 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3550 {
3551 	struct _umtx_time timeout;
3552 	int error;
3553 
3554 	/* Allow a null timespec (wait forever). */
3555 	if (uap->uaddr2 == NULL) {
3556 		error = do_rw_wrlock(td, uap->obj, 0);
3557 	} else {
3558 		error = umtx_copyin_umtx_time(uap->uaddr2,
3559 		   (size_t)uap->uaddr1, &timeout);
3560 		if (error != 0)
3561 			return (error);
3562 
3563 		error = do_rw_wrlock(td, uap->obj, &timeout);
3564 	}
3565 	return (error);
3566 }
3567 
3568 static int
3569 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3570 {
3571 
3572 	return (do_rw_unlock(td, uap->obj));
3573 }
3574 
3575 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3576 static int
3577 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3578 {
3579 	struct _umtx_time *tm_p, timeout;
3580 	int error;
3581 
3582 	/* Allow a null timespec (wait forever). */
3583 	if (uap->uaddr2 == NULL)
3584 		tm_p = NULL;
3585 	else {
3586 		error = umtx_copyin_umtx_time(
3587 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3588 		if (error != 0)
3589 			return (error);
3590 		tm_p = &timeout;
3591 	}
3592 	return (do_sem_wait(td, uap->obj, tm_p));
3593 }
3594 
3595 static int
3596 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3597 {
3598 
3599 	return (do_sem_wake(td, uap->obj));
3600 }
3601 #endif
3602 
3603 static int
3604 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3605 {
3606 
3607 	return (do_wake2_umutex(td, uap->obj, uap->val));
3608 }
3609 
3610 static int
3611 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
3612 {
3613 	struct _umtx_time *tm_p, timeout;
3614 	size_t uasize;
3615 	int error;
3616 
3617 	/* Allow a null timespec (wait forever). */
3618 	if (uap->uaddr2 == NULL) {
3619 		uasize = 0;
3620 		tm_p = NULL;
3621 	} else {
3622 		uasize = (size_t)uap->uaddr1;
3623 		error = umtx_copyin_umtx_time(uap->uaddr2, uasize, &timeout);
3624 		if (error != 0)
3625 			return (error);
3626 		tm_p = &timeout;
3627 	}
3628 	error = do_sem2_wait(td, uap->obj, tm_p);
3629 	if (error == EINTR && uap->uaddr2 != NULL &&
3630 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
3631 	    uasize >= sizeof(struct _umtx_time) + sizeof(struct timespec)) {
3632 		error = copyout(&timeout._timeout,
3633 		    (struct _umtx_time *)uap->uaddr2 + 1,
3634 		    sizeof(struct timespec));
3635 		if (error == 0) {
3636 			error = EINTR;
3637 		}
3638 	}
3639 
3640 	return (error);
3641 }
3642 
3643 static int
3644 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
3645 {
3646 
3647 	return (do_sem2_wake(td, uap->obj));
3648 }
3649 
3650 #define	USHM_OBJ_UMTX(o)						\
3651     ((struct umtx_shm_obj_list *)(&(o)->umtx_data))
3652 
3653 #define	USHMF_REG_LINKED	0x0001
3654 #define	USHMF_OBJ_LINKED	0x0002
3655 struct umtx_shm_reg {
3656 	TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link;
3657 	LIST_ENTRY(umtx_shm_reg) ushm_obj_link;
3658 	struct umtx_key		ushm_key;
3659 	struct ucred		*ushm_cred;
3660 	struct shmfd		*ushm_obj;
3661 	u_int			ushm_refcnt;
3662 	u_int			ushm_flags;
3663 };
3664 
3665 LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg);
3666 TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg);
3667 
3668 static uma_zone_t umtx_shm_reg_zone;
3669 static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS];
3670 static struct mtx umtx_shm_lock;
3671 static struct umtx_shm_reg_head umtx_shm_reg_delfree =
3672     TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree);
3673 
3674 static void umtx_shm_free_reg(struct umtx_shm_reg *reg);
3675 
3676 static void
3677 umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused)
3678 {
3679 	struct umtx_shm_reg_head d;
3680 	struct umtx_shm_reg *reg, *reg1;
3681 
3682 	TAILQ_INIT(&d);
3683 	mtx_lock(&umtx_shm_lock);
3684 	TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link);
3685 	mtx_unlock(&umtx_shm_lock);
3686 	TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) {
3687 		TAILQ_REMOVE(&d, reg, ushm_reg_link);
3688 		umtx_shm_free_reg(reg);
3689 	}
3690 }
3691 
3692 static struct task umtx_shm_reg_delfree_task =
3693     TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL);
3694 
3695 static struct umtx_shm_reg *
3696 umtx_shm_find_reg_locked(const struct umtx_key *key)
3697 {
3698 	struct umtx_shm_reg *reg;
3699 	struct umtx_shm_reg_head *reg_head;
3700 
3701 	KASSERT(key->shared, ("umtx_p_find_rg: private key"));
3702 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3703 	reg_head = &umtx_shm_registry[key->hash];
3704 	TAILQ_FOREACH(reg, reg_head, ushm_reg_link) {
3705 		KASSERT(reg->ushm_key.shared,
3706 		    ("non-shared key on reg %p %d", reg, reg->ushm_key.shared));
3707 		if (reg->ushm_key.info.shared.object ==
3708 		    key->info.shared.object &&
3709 		    reg->ushm_key.info.shared.offset ==
3710 		    key->info.shared.offset) {
3711 			KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM"));
3712 			KASSERT(reg->ushm_refcnt > 0,
3713 			    ("reg %p refcnt 0 onlist", reg));
3714 			KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0,
3715 			    ("reg %p not linked", reg));
3716 			reg->ushm_refcnt++;
3717 			return (reg);
3718 		}
3719 	}
3720 	return (NULL);
3721 }
3722 
3723 static struct umtx_shm_reg *
3724 umtx_shm_find_reg(const struct umtx_key *key)
3725 {
3726 	struct umtx_shm_reg *reg;
3727 
3728 	mtx_lock(&umtx_shm_lock);
3729 	reg = umtx_shm_find_reg_locked(key);
3730 	mtx_unlock(&umtx_shm_lock);
3731 	return (reg);
3732 }
3733 
3734 static void
3735 umtx_shm_free_reg(struct umtx_shm_reg *reg)
3736 {
3737 
3738 	chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0);
3739 	crfree(reg->ushm_cred);
3740 	shm_drop(reg->ushm_obj);
3741 	uma_zfree(umtx_shm_reg_zone, reg);
3742 }
3743 
3744 static bool
3745 umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force)
3746 {
3747 	bool res;
3748 
3749 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3750 	KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg));
3751 	reg->ushm_refcnt--;
3752 	res = reg->ushm_refcnt == 0;
3753 	if (res || force) {
3754 		if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) {
3755 			TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash],
3756 			    reg, ushm_reg_link);
3757 			reg->ushm_flags &= ~USHMF_REG_LINKED;
3758 		}
3759 		if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) {
3760 			LIST_REMOVE(reg, ushm_obj_link);
3761 			reg->ushm_flags &= ~USHMF_OBJ_LINKED;
3762 		}
3763 	}
3764 	return (res);
3765 }
3766 
3767 static void
3768 umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force)
3769 {
3770 	vm_object_t object;
3771 	bool dofree;
3772 
3773 	if (force) {
3774 		object = reg->ushm_obj->shm_object;
3775 		VM_OBJECT_WLOCK(object);
3776 		object->flags |= OBJ_UMTXDEAD;
3777 		VM_OBJECT_WUNLOCK(object);
3778 	}
3779 	mtx_lock(&umtx_shm_lock);
3780 	dofree = umtx_shm_unref_reg_locked(reg, force);
3781 	mtx_unlock(&umtx_shm_lock);
3782 	if (dofree)
3783 		umtx_shm_free_reg(reg);
3784 }
3785 
3786 void
3787 umtx_shm_object_init(vm_object_t object)
3788 {
3789 
3790 	LIST_INIT(USHM_OBJ_UMTX(object));
3791 }
3792 
3793 void
3794 umtx_shm_object_terminated(vm_object_t object)
3795 {
3796 	struct umtx_shm_reg *reg, *reg1;
3797 	bool dofree;
3798 
3799 	if (LIST_EMPTY(USHM_OBJ_UMTX(object)))
3800 		return;
3801 
3802 	dofree = false;
3803 	mtx_lock(&umtx_shm_lock);
3804 	LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) {
3805 		if (umtx_shm_unref_reg_locked(reg, true)) {
3806 			TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg,
3807 			    ushm_reg_link);
3808 			dofree = true;
3809 		}
3810 	}
3811 	mtx_unlock(&umtx_shm_lock);
3812 	if (dofree)
3813 		taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task);
3814 }
3815 
3816 static int
3817 umtx_shm_create_reg(struct thread *td, const struct umtx_key *key,
3818     struct umtx_shm_reg **res)
3819 {
3820 	struct umtx_shm_reg *reg, *reg1;
3821 	struct ucred *cred;
3822 	int error;
3823 
3824 	reg = umtx_shm_find_reg(key);
3825 	if (reg != NULL) {
3826 		*res = reg;
3827 		return (0);
3828 	}
3829 	cred = td->td_ucred;
3830 	if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP)))
3831 		return (ENOMEM);
3832 	reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
3833 	reg->ushm_refcnt = 1;
3834 	bcopy(key, &reg->ushm_key, sizeof(*key));
3835 	reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
3836 	reg->ushm_cred = crhold(cred);
3837 	error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
3838 	if (error != 0) {
3839 		umtx_shm_free_reg(reg);
3840 		return (error);
3841 	}
3842 	mtx_lock(&umtx_shm_lock);
3843 	reg1 = umtx_shm_find_reg_locked(key);
3844 	if (reg1 != NULL) {
3845 		mtx_unlock(&umtx_shm_lock);
3846 		umtx_shm_free_reg(reg);
3847 		*res = reg1;
3848 		return (0);
3849 	}
3850 	reg->ushm_refcnt++;
3851 	TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link);
3852 	LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg,
3853 	    ushm_obj_link);
3854 	reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED;
3855 	mtx_unlock(&umtx_shm_lock);
3856 	*res = reg;
3857 	return (0);
3858 }
3859 
3860 static int
3861 umtx_shm_alive(struct thread *td, void *addr)
3862 {
3863 	vm_map_t map;
3864 	vm_map_entry_t entry;
3865 	vm_object_t object;
3866 	vm_pindex_t pindex;
3867 	vm_prot_t prot;
3868 	int res, ret;
3869 	boolean_t wired;
3870 
3871 	map = &td->td_proc->p_vmspace->vm_map;
3872 	res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry,
3873 	    &object, &pindex, &prot, &wired);
3874 	if (res != KERN_SUCCESS)
3875 		return (EFAULT);
3876 	if (object == NULL)
3877 		ret = EINVAL;
3878 	else
3879 		ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0;
3880 	vm_map_lookup_done(map, entry);
3881 	return (ret);
3882 }
3883 
3884 static void
3885 umtx_shm_init(void)
3886 {
3887 	int i;
3888 
3889 	umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg),
3890 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
3891 	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
3892 	for (i = 0; i < nitems(umtx_shm_registry); i++)
3893 		TAILQ_INIT(&umtx_shm_registry[i]);
3894 }
3895 
3896 static int
3897 umtx_shm(struct thread *td, void *addr, u_int flags)
3898 {
3899 	struct umtx_key key;
3900 	struct umtx_shm_reg *reg;
3901 	struct file *fp;
3902 	int error, fd;
3903 
3904 	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
3905 	    UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1)
3906 		return (EINVAL);
3907 	if ((flags & UMTX_SHM_ALIVE) != 0)
3908 		return (umtx_shm_alive(td, addr));
3909 	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
3910 	if (error != 0)
3911 		return (error);
3912 	KASSERT(key.shared == 1, ("non-shared key"));
3913 	if ((flags & UMTX_SHM_CREAT) != 0) {
3914 		error = umtx_shm_create_reg(td, &key, &reg);
3915 	} else {
3916 		reg = umtx_shm_find_reg(&key);
3917 		if (reg == NULL)
3918 			error = ESRCH;
3919 	}
3920 	umtx_key_release(&key);
3921 	if (error != 0)
3922 		return (error);
3923 	KASSERT(reg != NULL, ("no reg"));
3924 	if ((flags & UMTX_SHM_DESTROY) != 0) {
3925 		umtx_shm_unref_reg(reg, true);
3926 	} else {
3927 #if 0
3928 #ifdef MAC
3929 		error = mac_posixshm_check_open(td->td_ucred,
3930 		    reg->ushm_obj, FFLAGS(O_RDWR));
3931 		if (error == 0)
3932 #endif
3933 			error = shm_access(reg->ushm_obj, td->td_ucred,
3934 			    FFLAGS(O_RDWR));
3935 		if (error == 0)
3936 #endif
3937 			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
3938 		if (error == 0) {
3939 			shm_hold(reg->ushm_obj);
3940 			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
3941 			    &shm_ops);
3942 			td->td_retval[0] = fd;
3943 			fdrop(fp, td);
3944 		}
3945 	}
3946 	umtx_shm_unref_reg(reg, false);
3947 	return (error);
3948 }
3949 
3950 static int
3951 __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap)
3952 {
3953 
3954 	return (umtx_shm(td, uap->uaddr1, uap->val));
3955 }
3956 
3957 static int
3958 umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp)
3959 {
3960 
3961 	td->td_rb_list = rbp->robust_list_offset;
3962 	td->td_rbp_list = rbp->robust_priv_list_offset;
3963 	td->td_rb_inact = rbp->robust_inact_offset;
3964 	return (0);
3965 }
3966 
3967 static int
3968 __umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap)
3969 {
3970 	struct umtx_robust_lists_params rb;
3971 	int error;
3972 
3973 	if (uap->val > sizeof(rb))
3974 		return (EINVAL);
3975 	bzero(&rb, sizeof(rb));
3976 	error = copyin(uap->uaddr1, &rb, uap->val);
3977 	if (error != 0)
3978 		return (error);
3979 	return (umtx_robust_lists(td, &rb));
3980 }
3981 
3982 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3983 
3984 static const _umtx_op_func op_table[] = {
3985 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
3986 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
3987 	[UMTX_OP_WAIT]		= __umtx_op_wait,
3988 	[UMTX_OP_WAKE]		= __umtx_op_wake,
3989 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
3990 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
3991 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
3992 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
3993 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
3994 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
3995 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
3996 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
3997 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
3998 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
3999 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4000 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
4001 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4002 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
4003 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4004 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4005 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
4006 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4007 #else
4008 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4009 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4010 #endif
4011 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
4012 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4013 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
4014 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4015 	[UMTX_OP_SHM]		= __umtx_op_shm,
4016 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists,
4017 };
4018 
4019 int
4020 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
4021 {
4022 
4023 	if ((unsigned)uap->op < nitems(op_table))
4024 		return (*op_table[uap->op])(td, uap);
4025 	return (EINVAL);
4026 }
4027 
4028 #ifdef COMPAT_FREEBSD32
4029 
4030 struct timespec32 {
4031 	int32_t tv_sec;
4032 	int32_t tv_nsec;
4033 };
4034 
4035 struct umtx_time32 {
4036 	struct	timespec32	timeout;
4037 	uint32_t		flags;
4038 	uint32_t		clockid;
4039 };
4040 
4041 static inline int
4042 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
4043 {
4044 	struct timespec32 ts32;
4045 	int error;
4046 
4047 	error = copyin(addr, &ts32, sizeof(struct timespec32));
4048 	if (error == 0) {
4049 		if (ts32.tv_sec < 0 ||
4050 		    ts32.tv_nsec >= 1000000000 ||
4051 		    ts32.tv_nsec < 0)
4052 			error = EINVAL;
4053 		else {
4054 			tsp->tv_sec = ts32.tv_sec;
4055 			tsp->tv_nsec = ts32.tv_nsec;
4056 		}
4057 	}
4058 	return (error);
4059 }
4060 
4061 static inline int
4062 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
4063 {
4064 	struct umtx_time32 t32;
4065 	int error;
4066 
4067 	t32.clockid = CLOCK_REALTIME;
4068 	t32.flags   = 0;
4069 	if (size <= sizeof(struct timespec32))
4070 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
4071 	else
4072 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
4073 	if (error != 0)
4074 		return (error);
4075 	if (t32.timeout.tv_sec < 0 ||
4076 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
4077 		return (EINVAL);
4078 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
4079 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
4080 	tp->_flags = t32.flags;
4081 	tp->_clockid = t32.clockid;
4082 	return (0);
4083 }
4084 
4085 static int
4086 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4087 {
4088 	struct _umtx_time *tm_p, timeout;
4089 	int error;
4090 
4091 	if (uap->uaddr2 == NULL)
4092 		tm_p = NULL;
4093 	else {
4094 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4095 			(size_t)uap->uaddr1, &timeout);
4096 		if (error != 0)
4097 			return (error);
4098 		tm_p = &timeout;
4099 	}
4100 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
4101 }
4102 
4103 static int
4104 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
4105 {
4106 	struct _umtx_time *tm_p, timeout;
4107 	int error;
4108 
4109 	/* Allow a null timespec (wait forever). */
4110 	if (uap->uaddr2 == NULL)
4111 		tm_p = NULL;
4112 	else {
4113 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4114 			    (size_t)uap->uaddr1, &timeout);
4115 		if (error != 0)
4116 			return (error);
4117 		tm_p = &timeout;
4118 	}
4119 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
4120 }
4121 
4122 static int
4123 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
4124 {
4125 	struct _umtx_time *tm_p, timeout;
4126 	int error;
4127 
4128 	/* Allow a null timespec (wait forever). */
4129 	if (uap->uaddr2 == NULL)
4130 		tm_p = NULL;
4131 	else {
4132 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4133 		    (size_t)uap->uaddr1, &timeout);
4134 		if (error != 0)
4135 			return (error);
4136 		tm_p = &timeout;
4137 	}
4138 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
4139 }
4140 
4141 static int
4142 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4143 {
4144 	struct timespec *ts, timeout;
4145 	int error;
4146 
4147 	/* Allow a null timespec (wait forever). */
4148 	if (uap->uaddr2 == NULL)
4149 		ts = NULL;
4150 	else {
4151 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
4152 		if (error != 0)
4153 			return (error);
4154 		ts = &timeout;
4155 	}
4156 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
4157 }
4158 
4159 static int
4160 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
4161 {
4162 	struct _umtx_time timeout;
4163 	int error;
4164 
4165 	/* Allow a null timespec (wait forever). */
4166 	if (uap->uaddr2 == NULL) {
4167 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
4168 	} else {
4169 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4170 		    (size_t)uap->uaddr1, &timeout);
4171 		if (error != 0)
4172 			return (error);
4173 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
4174 	}
4175 	return (error);
4176 }
4177 
4178 static int
4179 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
4180 {
4181 	struct _umtx_time timeout;
4182 	int error;
4183 
4184 	/* Allow a null timespec (wait forever). */
4185 	if (uap->uaddr2 == NULL) {
4186 		error = do_rw_wrlock(td, uap->obj, 0);
4187 	} else {
4188 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4189 		    (size_t)uap->uaddr1, &timeout);
4190 		if (error != 0)
4191 			return (error);
4192 		error = do_rw_wrlock(td, uap->obj, &timeout);
4193 	}
4194 	return (error);
4195 }
4196 
4197 static int
4198 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
4199 {
4200 	struct _umtx_time *tm_p, timeout;
4201 	int error;
4202 
4203 	if (uap->uaddr2 == NULL)
4204 		tm_p = NULL;
4205 	else {
4206 		error = umtx_copyin_umtx_time32(
4207 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
4208 		if (error != 0)
4209 			return (error);
4210 		tm_p = &timeout;
4211 	}
4212 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
4213 }
4214 
4215 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4216 static int
4217 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4218 {
4219 	struct _umtx_time *tm_p, timeout;
4220 	int error;
4221 
4222 	/* Allow a null timespec (wait forever). */
4223 	if (uap->uaddr2 == NULL)
4224 		tm_p = NULL;
4225 	else {
4226 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4227 		    (size_t)uap->uaddr1, &timeout);
4228 		if (error != 0)
4229 			return (error);
4230 		tm_p = &timeout;
4231 	}
4232 	return (do_sem_wait(td, uap->obj, tm_p));
4233 }
4234 #endif
4235 
4236 static int
4237 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4238 {
4239 	struct _umtx_time *tm_p, timeout;
4240 	size_t uasize;
4241 	int error;
4242 
4243 	/* Allow a null timespec (wait forever). */
4244 	if (uap->uaddr2 == NULL) {
4245 		uasize = 0;
4246 		tm_p = NULL;
4247 	} else {
4248 		uasize = (size_t)uap->uaddr1;
4249 		error = umtx_copyin_umtx_time32(uap->uaddr2, uasize, &timeout);
4250 		if (error != 0)
4251 			return (error);
4252 		tm_p = &timeout;
4253 	}
4254 	error = do_sem2_wait(td, uap->obj, tm_p);
4255 	if (error == EINTR && uap->uaddr2 != NULL &&
4256 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
4257 	    uasize >= sizeof(struct umtx_time32) + sizeof(struct timespec32)) {
4258 		struct timespec32 remain32 = {
4259 			.tv_sec = timeout._timeout.tv_sec,
4260 			.tv_nsec = timeout._timeout.tv_nsec
4261 		};
4262 		error = copyout(&remain32,
4263 		    (struct umtx_time32 *)uap->uaddr2 + 1,
4264 		    sizeof(struct timespec32));
4265 		if (error == 0) {
4266 			error = EINTR;
4267 		}
4268 	}
4269 
4270 	return (error);
4271 }
4272 
4273 static int
4274 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
4275 {
4276 	uint32_t uaddrs[BATCH_SIZE], **upp;
4277 	int count, error, i, pos, tocopy;
4278 
4279 	upp = (uint32_t **)uap->obj;
4280 	error = 0;
4281 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
4282 	    pos += tocopy) {
4283 		tocopy = MIN(count, BATCH_SIZE);
4284 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
4285 		if (error != 0)
4286 			break;
4287 		for (i = 0; i < tocopy; ++i)
4288 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
4289 			    INT_MAX, 1);
4290 		maybe_yield();
4291 	}
4292 	return (error);
4293 }
4294 
4295 struct umtx_robust_lists_params_compat32 {
4296 	uint32_t	robust_list_offset;
4297 	uint32_t	robust_priv_list_offset;
4298 	uint32_t	robust_inact_offset;
4299 };
4300 
4301 static int
4302 __umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap)
4303 {
4304 	struct umtx_robust_lists_params rb;
4305 	struct umtx_robust_lists_params_compat32 rb32;
4306 	int error;
4307 
4308 	if (uap->val > sizeof(rb32))
4309 		return (EINVAL);
4310 	bzero(&rb, sizeof(rb));
4311 	bzero(&rb32, sizeof(rb32));
4312 	error = copyin(uap->uaddr1, &rb32, uap->val);
4313 	if (error != 0)
4314 		return (error);
4315 	rb.robust_list_offset = rb32.robust_list_offset;
4316 	rb.robust_priv_list_offset = rb32.robust_priv_list_offset;
4317 	rb.robust_inact_offset = rb32.robust_inact_offset;
4318 	return (umtx_robust_lists(td, &rb));
4319 }
4320 
4321 static const _umtx_op_func op_table_compat32[] = {
4322 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
4323 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
4324 	[UMTX_OP_WAIT]		= __umtx_op_wait_compat32,
4325 	[UMTX_OP_WAKE]		= __umtx_op_wake,
4326 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
4327 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex_compat32,
4328 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
4329 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
4330 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait_compat32,
4331 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
4332 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
4333 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_compat32,
4334 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock_compat32,
4335 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock_compat32,
4336 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4337 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
4338 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4339 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex_compat32,
4340 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4341 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4342 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait_compat32,
4343 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4344 #else
4345 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4346 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4347 #endif
4348 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private32,
4349 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4350 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait_compat32,
4351 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4352 	[UMTX_OP_SHM]		= __umtx_op_shm,
4353 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists_compat32,
4354 };
4355 
4356 int
4357 freebsd32__umtx_op(struct thread *td, struct freebsd32__umtx_op_args *uap)
4358 {
4359 
4360 	if ((unsigned)uap->op < nitems(op_table_compat32)) {
4361 		return (*op_table_compat32[uap->op])(td,
4362 		    (struct _umtx_op_args *)uap);
4363 	}
4364 	return (EINVAL);
4365 }
4366 #endif
4367 
4368 void
4369 umtx_thread_init(struct thread *td)
4370 {
4371 
4372 	td->td_umtxq = umtxq_alloc();
4373 	td->td_umtxq->uq_thread = td;
4374 }
4375 
4376 void
4377 umtx_thread_fini(struct thread *td)
4378 {
4379 
4380 	umtxq_free(td->td_umtxq);
4381 }
4382 
4383 /*
4384  * It will be called when new thread is created, e.g fork().
4385  */
4386 void
4387 umtx_thread_alloc(struct thread *td)
4388 {
4389 	struct umtx_q *uq;
4390 
4391 	uq = td->td_umtxq;
4392 	uq->uq_inherited_pri = PRI_MAX;
4393 
4394 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
4395 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
4396 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
4397 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
4398 }
4399 
4400 /*
4401  * exec() hook.
4402  *
4403  * Clear robust lists for all process' threads, not delaying the
4404  * cleanup to thread_exit hook, since the relevant address space is
4405  * destroyed right now.
4406  */
4407 static void
4408 umtx_exec_hook(void *arg __unused, struct proc *p,
4409     struct image_params *imgp __unused)
4410 {
4411 	struct thread *td;
4412 
4413 	KASSERT(p == curproc, ("need curproc"));
4414 	PROC_LOCK(p);
4415 	KASSERT((p->p_flag & P_HADTHREADS) == 0 ||
4416 	    (p->p_flag & P_STOPPED_SINGLE) != 0,
4417 	    ("curproc must be single-threaded"));
4418 	FOREACH_THREAD_IN_PROC(p, td) {
4419 		KASSERT(td == curthread ||
4420 		    ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)),
4421 		    ("running thread %p %p", p, td));
4422 		PROC_UNLOCK(p);
4423 		umtx_thread_cleanup(td);
4424 		PROC_LOCK(p);
4425 		td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0;
4426 	}
4427 	PROC_UNLOCK(p);
4428 }
4429 
4430 /*
4431  * thread_exit() hook.
4432  */
4433 void
4434 umtx_thread_exit(struct thread *td)
4435 {
4436 
4437 	umtx_thread_cleanup(td);
4438 }
4439 
4440 static int
4441 umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res)
4442 {
4443 	u_long res1;
4444 #ifdef COMPAT_FREEBSD32
4445 	uint32_t res32;
4446 #endif
4447 	int error;
4448 
4449 #ifdef COMPAT_FREEBSD32
4450 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
4451 		error = fueword32((void *)ptr, &res32);
4452 		if (error == 0)
4453 			res1 = res32;
4454 	} else
4455 #endif
4456 	{
4457 		error = fueword((void *)ptr, &res1);
4458 	}
4459 	if (error == 0)
4460 		*res = res1;
4461 	else
4462 		error = EFAULT;
4463 	return (error);
4464 }
4465 
4466 static void
4467 umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list)
4468 {
4469 #ifdef COMPAT_FREEBSD32
4470 	struct umutex32 m32;
4471 
4472 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
4473 		memcpy(&m32, m, sizeof(m32));
4474 		*rb_list = m32.m_rb_lnk;
4475 	} else
4476 #endif
4477 		*rb_list = m->m_rb_lnk;
4478 }
4479 
4480 static int
4481 umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact)
4482 {
4483 	struct umutex m;
4484 	int error;
4485 
4486 	KASSERT(td->td_proc == curproc, ("need current vmspace"));
4487 	error = copyin((void *)rbp, &m, sizeof(m));
4488 	if (error != 0)
4489 		return (error);
4490 	if (rb_list != NULL)
4491 		umtx_read_rb_list(td, &m, rb_list);
4492 	if ((m.m_flags & UMUTEX_ROBUST) == 0)
4493 		return (EINVAL);
4494 	if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid)
4495 		/* inact is cleared after unlock, allow the inconsistency */
4496 		return (inact ? 0 : EINVAL);
4497 	return (do_unlock_umutex(td, (struct umutex *)rbp, true));
4498 }
4499 
4500 static void
4501 umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact,
4502     const char *name)
4503 {
4504 	int error, i;
4505 	uintptr_t rbp;
4506 	bool inact;
4507 
4508 	if (rb_list == 0)
4509 		return;
4510 	error = umtx_read_uptr(td, rb_list, &rbp);
4511 	for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) {
4512 		if (rbp == *rb_inact) {
4513 			inact = true;
4514 			*rb_inact = 0;
4515 		} else
4516 			inact = false;
4517 		error = umtx_handle_rb(td, rbp, &rbp, inact);
4518 	}
4519 	if (i == umtx_max_rb && umtx_verbose_rb) {
4520 		uprintf("comm %s pid %d: reached umtx %smax rb %d\n",
4521 		    td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb);
4522 	}
4523 	if (error != 0 && umtx_verbose_rb) {
4524 		uprintf("comm %s pid %d: handling %srb error %d\n",
4525 		    td->td_proc->p_comm, td->td_proc->p_pid, name, error);
4526 	}
4527 }
4528 
4529 /*
4530  * Clean up umtx data.
4531  */
4532 static void
4533 umtx_thread_cleanup(struct thread *td)
4534 {
4535 	struct umtx_q *uq;
4536 	struct umtx_pi *pi;
4537 	uintptr_t rb_inact;
4538 
4539 	/*
4540 	 * Disown pi mutexes.
4541 	 */
4542 	uq = td->td_umtxq;
4543 	if (uq != NULL) {
4544 		mtx_lock(&umtx_lock);
4545 		uq->uq_inherited_pri = PRI_MAX;
4546 		while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
4547 			pi->pi_owner = NULL;
4548 			TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
4549 		}
4550 		mtx_unlock(&umtx_lock);
4551 		thread_lock(td);
4552 		sched_lend_user_prio(td, PRI_MAX);
4553 		thread_unlock(td);
4554 	}
4555 
4556 	/*
4557 	 * Handle terminated robust mutexes.  Must be done after
4558 	 * robust pi disown, otherwise unlock could see unowned
4559 	 * entries.
4560 	 */
4561 	rb_inact = td->td_rb_inact;
4562 	if (rb_inact != 0)
4563 		(void)umtx_read_uptr(td, rb_inact, &rb_inact);
4564 	umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "");
4565 	umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ");
4566 	if (rb_inact != 0)
4567 		(void)umtx_handle_rb(td, rb_inact, NULL, true);
4568 }
4569