xref: /freebsd/sys/kern/kern_umtx.c (revision 8c5a9161d16094d9db474fe78ddead1325246d05)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2015, 2016 The FreeBSD Foundation
5  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
6  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
7  * All rights reserved.
8  *
9  * Portions of this software were developed by Konstantin Belousov
10  * under sponsorship from the FreeBSD Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice unmodified, this list of conditions, and the following
17  *    disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include "opt_umtx_profiling.h"
38 
39 #include <sys/param.h>
40 #include <sys/kernel.h>
41 #include <sys/fcntl.h>
42 #include <sys/file.h>
43 #include <sys/filedesc.h>
44 #include <sys/limits.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mman.h>
48 #include <sys/mutex.h>
49 #include <sys/priv.h>
50 #include <sys/proc.h>
51 #include <sys/resource.h>
52 #include <sys/resourcevar.h>
53 #include <sys/rwlock.h>
54 #include <sys/sbuf.h>
55 #include <sys/sched.h>
56 #include <sys/smp.h>
57 #include <sys/sysctl.h>
58 #include <sys/sysent.h>
59 #include <sys/systm.h>
60 #include <sys/sysproto.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/taskqueue.h>
63 #include <sys/time.h>
64 #include <sys/eventhandler.h>
65 #include <sys/umtx.h>
66 
67 #include <security/mac/mac_framework.h>
68 
69 #include <vm/vm.h>
70 #include <vm/vm_param.h>
71 #include <vm/pmap.h>
72 #include <vm/vm_map.h>
73 #include <vm/vm_object.h>
74 
75 #include <machine/atomic.h>
76 #include <machine/cpu.h>
77 
78 #ifdef COMPAT_FREEBSD32
79 #include <compat/freebsd32/freebsd32_proto.h>
80 #endif
81 
/*
 * Acquisition modes understood by the mutex lock routines.  With
 * _UMUTEX_TRY a busy mutex yields EBUSY instead of a sleep; with
 * _UMUTEX_WAIT the caller only waits for the mutex to look available
 * and does not take ownership.
 */
#define _UMUTEX_TRY		1
#define _UMUTEX_WAIT		2

#ifdef UMTX_PROFILING
/*
 * True when the value (whole part w, fractional part f) is strictly
 * greater than the value (whole part sw, fractional part sf).
 */
#define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
#endif
89 
90 /* Priority inheritance mutex info. */
91 struct umtx_pi {
92 	/* Owner thread */
93 	struct thread		*pi_owner;
94 
95 	/* Reference count */
96 	int			pi_refcount;
97 
98  	/* List entry to link umtx holding by thread */
99 	TAILQ_ENTRY(umtx_pi)	pi_link;
100 
101 	/* List entry in hash */
102 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
103 
104 	/* List for waiters */
105 	TAILQ_HEAD(,umtx_q)	pi_blocked;
106 
107 	/* Identify a userland lock object */
108 	struct umtx_key		pi_key;
109 };
110 
111 /* A userland synchronous object user. */
112 struct umtx_q {
113 	/* Linked list for the hash. */
114 	TAILQ_ENTRY(umtx_q)	uq_link;
115 
116 	/* Umtx key. */
117 	struct umtx_key		uq_key;
118 
119 	/* Umtx flags. */
120 	int			uq_flags;
121 #define UQF_UMTXQ	0x0001
122 
123 	/* The thread waits on. */
124 	struct thread		*uq_thread;
125 
126 	/*
127 	 * Blocked on PI mutex. read can use chain lock
128 	 * or umtx_lock, write must have both chain lock and
129 	 * umtx_lock being hold.
130 	 */
131 	struct umtx_pi		*uq_pi_blocked;
132 
133 	/* On blocked list */
134 	TAILQ_ENTRY(umtx_q)	uq_lockq;
135 
136 	/* Thread contending with us */
137 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
138 
139 	/* Inherited priority from PP mutex */
140 	u_char			uq_inherited_pri;
141 
142 	/* Spare queue ready to be reused */
143 	struct umtxq_queue	*uq_spare_queue;
144 
145 	/* The queue we on */
146 	struct umtxq_queue	*uq_cur_queue;
147 };
148 
149 TAILQ_HEAD(umtxq_head, umtx_q);
150 
151 /* Per-key wait-queue */
152 struct umtxq_queue {
153 	struct umtxq_head	head;
154 	struct umtx_key		key;
155 	LIST_ENTRY(umtxq_queue)	link;
156 	int			length;
157 };
158 
159 LIST_HEAD(umtxq_list, umtxq_queue);
160 
161 /* Userland lock object's wait-queue chain */
162 struct umtxq_chain {
163 	/* Lock for this chain. */
164 	struct mtx		uc_lock;
165 
166 	/* List of sleep queues. */
167 	struct umtxq_list	uc_queue[2];
168 #define UMTX_SHARED_QUEUE	0
169 #define UMTX_EXCLUSIVE_QUEUE	1
170 
171 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
172 
173 	/* Busy flag */
174 	char			uc_busy;
175 
176 	/* Chain lock waiters */
177 	int			uc_waiters;
178 
179 	/* All PI in the list */
180 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
181 
182 #ifdef UMTX_PROFILING
183 	u_int 			length;
184 	u_int			max_length;
185 #endif
186 };
187 
/* Assert that the mutex of chain `uc' is owned by the caller. */
#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Time-sharing priorities are never propagated, for a security reason:
 * a user could create a PI mutex, have thread A lock it and thread B
 * block on it.  Because B is sleeping its priority is boosted, and
 * through priority propagation A would be boosted as well and never
 * lowered again, even while using 100% of the CPU.  That would be
 * unfair to other processes.
 */

/*
 * User priority of `td' for propagation purposes: any priority in the
 * time-sharing range is reported as PRI_MAX_TIMESHARE.
 */
#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)

/* Multiplier used when hashing a key. */
#define	GOLDEN_RATIO_PRIME	2654404609U
/* Number of hash chains per table; may be overridden at build time. */
#ifndef	UMTX_CHAINS
#define	UMTX_CHAINS		512
#endif
/* Right shift that keeps the 9 most significant bits of the hash. */
#define	UMTX_SHIFTS		(__WORD_BIT - 9)

/* Translate USYNC_PROCESS_SHARED in `flags' into a share mode. */
#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

/* Spin budget used while waiting for a busy chain before sleeping. */
#define BUSY_SPINS		200
213 
/*
 * Timeout tracked as an absolute deadline on a given clock.
 */
struct abs_timeout {
	int clockid;		/* clock the deadline is measured on */
	bool is_abs_real;	/* TIMER_ABSTIME && CLOCK_REALTIME* */
	struct timespec cur;	/* most recent reading of the clock */
	struct timespec end;	/* deadline */
};
220 
221 #ifdef COMPAT_FREEBSD32
222 struct umutex32 {
223 	volatile __lwpid_t	m_owner;	/* Owner of the mutex */
224 	__uint32_t		m_flags;	/* Flags of the mutex */
225 	__uint32_t		m_ceilings[2];	/* Priority protect ceiling */
226 	__uint32_t		m_rb_lnk;	/* Robust linkage */
227 	__uint32_t		m_pad;
228 	__uint32_t		m_spare[2];
229 };
230 
231 _Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32");
232 _Static_assert(__offsetof(struct umutex, m_spare[0]) ==
233     __offsetof(struct umutex32, m_spare[0]), "m_spare32");
234 #endif
235 
236 int umtx_shm_vnobj_persistent = 0;
237 SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN,
238     &umtx_shm_vnobj_persistent, 0,
239     "False forces destruction of umtx attached to file, on last close");
240 static int umtx_max_rb = 1000;
241 SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN,
242     &umtx_max_rb, 0,
243     "");
244 
245 static uma_zone_t		umtx_pi_zone;
246 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
247 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
248 static int			umtx_pi_allocated;
249 
250 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
251 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
252     &umtx_pi_allocated, 0, "Allocated umtx_pi");
253 static int umtx_verbose_rb = 1;
254 SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN,
255     &umtx_verbose_rb, 0,
256     "");
257 
258 #ifdef UMTX_PROFILING
259 static long max_length;
260 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
261 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
262 #endif
263 
264 static void abs_timeout_update(struct abs_timeout *timo);
265 
266 static void umtx_shm_init(void);
267 static void umtxq_sysinit(void *);
268 static void umtxq_hash(struct umtx_key *key);
269 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
270 static void umtxq_lock(struct umtx_key *key);
271 static void umtxq_unlock(struct umtx_key *key);
272 static void umtxq_busy(struct umtx_key *key);
273 static void umtxq_unbusy(struct umtx_key *key);
274 static void umtxq_insert_queue(struct umtx_q *uq, int q);
275 static void umtxq_remove_queue(struct umtx_q *uq, int q);
276 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
277 static int umtxq_count(struct umtx_key *key);
278 static struct umtx_pi *umtx_pi_alloc(int);
279 static void umtx_pi_free(struct umtx_pi *pi);
280 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags,
281     bool rb);
282 static void umtx_thread_cleanup(struct thread *td);
283 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
284     struct image_params *imgp __unused);
285 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
286 
287 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
288 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
289 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
290 
291 static struct mtx umtx_lock;
292 
293 #ifdef UMTX_PROFILING
294 static void
295 umtx_init_profiling(void)
296 {
297 	struct sysctl_oid *chain_oid;
298 	char chain_name[10];
299 	int i;
300 
301 	for (i = 0; i < UMTX_CHAINS; ++i) {
302 		snprintf(chain_name, sizeof(chain_name), "%d", i);
303 		chain_oid = SYSCTL_ADD_NODE(NULL,
304 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
305 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
306 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
307 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
308 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
309 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
310 	}
311 }
312 
313 static int
314 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
315 {
316 	char buf[512];
317 	struct sbuf sb;
318 	struct umtxq_chain *uc;
319 	u_int fract, i, j, tot, whole;
320 	u_int sf0, sf1, sf2, sf3, sf4;
321 	u_int si0, si1, si2, si3, si4;
322 	u_int sw0, sw1, sw2, sw3, sw4;
323 
324 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
325 	for (i = 0; i < 2; i++) {
326 		tot = 0;
327 		for (j = 0; j < UMTX_CHAINS; ++j) {
328 			uc = &umtxq_chains[i][j];
329 			mtx_lock(&uc->uc_lock);
330 			tot += uc->max_length;
331 			mtx_unlock(&uc->uc_lock);
332 		}
333 		if (tot == 0)
334 			sbuf_printf(&sb, "%u) Empty ", i);
335 		else {
336 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
337 			si0 = si1 = si2 = si3 = si4 = 0;
338 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
339 			for (j = 0; j < UMTX_CHAINS; j++) {
340 				uc = &umtxq_chains[i][j];
341 				mtx_lock(&uc->uc_lock);
342 				whole = uc->max_length * 100;
343 				mtx_unlock(&uc->uc_lock);
344 				fract = (whole % tot) * 100;
345 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
346 					sf0 = fract;
347 					si0 = j;
348 					sw0 = whole;
349 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
350 				    sf1)) {
351 					sf1 = fract;
352 					si1 = j;
353 					sw1 = whole;
354 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
355 				    sf2)) {
356 					sf2 = fract;
357 					si2 = j;
358 					sw2 = whole;
359 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
360 				    sf3)) {
361 					sf3 = fract;
362 					si3 = j;
363 					sw3 = whole;
364 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
365 				    sf4)) {
366 					sf4 = fract;
367 					si4 = j;
368 					sw4 = whole;
369 				}
370 			}
371 			sbuf_printf(&sb, "queue %u:\n", i);
372 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
373 			    sf0 / tot, si0);
374 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
375 			    sf1 / tot, si1);
376 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
377 			    sf2 / tot, si2);
378 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
379 			    sf3 / tot, si3);
380 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
381 			    sf4 / tot, si4);
382 		}
383 	}
384 	sbuf_trim(&sb);
385 	sbuf_finish(&sb);
386 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
387 	sbuf_delete(&sb);
388 	return (0);
389 }
390 
391 static int
392 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
393 {
394 	struct umtxq_chain *uc;
395 	u_int i, j;
396 	int clear, error;
397 
398 	clear = 0;
399 	error = sysctl_handle_int(oidp, &clear, 0, req);
400 	if (error != 0 || req->newptr == NULL)
401 		return (error);
402 
403 	if (clear != 0) {
404 		for (i = 0; i < 2; ++i) {
405 			for (j = 0; j < UMTX_CHAINS; ++j) {
406 				uc = &umtxq_chains[i][j];
407 				mtx_lock(&uc->uc_lock);
408 				uc->length = 0;
409 				uc->max_length = 0;
410 				mtx_unlock(&uc->uc_lock);
411 			}
412 		}
413 	}
414 	return (0);
415 }
416 
417 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
418     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
419     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
420 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
421     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
422     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
423 #endif
424 
425 static void
426 umtxq_sysinit(void *arg __unused)
427 {
428 	int i, j;
429 
430 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
431 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
432 	for (i = 0; i < 2; ++i) {
433 		for (j = 0; j < UMTX_CHAINS; ++j) {
434 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
435 				 MTX_DEF | MTX_DUPOK);
436 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
437 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
438 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
439 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
440 			umtxq_chains[i][j].uc_busy = 0;
441 			umtxq_chains[i][j].uc_waiters = 0;
442 #ifdef UMTX_PROFILING
443 			umtxq_chains[i][j].length = 0;
444 			umtxq_chains[i][j].max_length = 0;
445 #endif
446 		}
447 	}
448 #ifdef UMTX_PROFILING
449 	umtx_init_profiling();
450 #endif
451 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
452 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
453 	    EVENTHANDLER_PRI_ANY);
454 	umtx_shm_init();
455 }
456 
457 struct umtx_q *
458 umtxq_alloc(void)
459 {
460 	struct umtx_q *uq;
461 
462 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
463 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX,
464 	    M_WAITOK | M_ZERO);
465 	TAILQ_INIT(&uq->uq_spare_queue->head);
466 	TAILQ_INIT(&uq->uq_pi_contested);
467 	uq->uq_inherited_pri = PRI_MAX;
468 	return (uq);
469 }
470 
471 void
472 umtxq_free(struct umtx_q *uq)
473 {
474 
475 	MPASS(uq->uq_spare_queue != NULL);
476 	free(uq->uq_spare_queue, M_UMTX);
477 	free(uq, M_UMTX);
478 }
479 
480 static inline void
481 umtxq_hash(struct umtx_key *key)
482 {
483 	unsigned n;
484 
485 	n = (uintptr_t)key->info.both.a + key->info.both.b;
486 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
487 }
488 
489 static inline struct umtxq_chain *
490 umtxq_getchain(struct umtx_key *key)
491 {
492 
493 	if (key->type <= TYPE_SEM)
494 		return (&umtxq_chains[1][key->hash]);
495 	return (&umtxq_chains[0][key->hash]);
496 }
497 
498 /*
499  * Lock a chain.
500  */
501 static inline void
502 umtxq_lock(struct umtx_key *key)
503 {
504 	struct umtxq_chain *uc;
505 
506 	uc = umtxq_getchain(key);
507 	mtx_lock(&uc->uc_lock);
508 }
509 
510 /*
511  * Unlock a chain.
512  */
513 static inline void
514 umtxq_unlock(struct umtx_key *key)
515 {
516 	struct umtxq_chain *uc;
517 
518 	uc = umtxq_getchain(key);
519 	mtx_unlock(&uc->uc_lock);
520 }
521 
522 /*
523  * Set chain to busy state when following operation
524  * may be blocked (kernel mutex can not be used).
525  */
526 static inline void
527 umtxq_busy(struct umtx_key *key)
528 {
529 	struct umtxq_chain *uc;
530 
531 	uc = umtxq_getchain(key);
532 	mtx_assert(&uc->uc_lock, MA_OWNED);
533 	if (uc->uc_busy) {
534 #ifdef SMP
535 		if (smp_cpus > 1) {
536 			int count = BUSY_SPINS;
537 			if (count > 0) {
538 				umtxq_unlock(key);
539 				while (uc->uc_busy && --count > 0)
540 					cpu_spinwait();
541 				umtxq_lock(key);
542 			}
543 		}
544 #endif
545 		while (uc->uc_busy) {
546 			uc->uc_waiters++;
547 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
548 			uc->uc_waiters--;
549 		}
550 	}
551 	uc->uc_busy = 1;
552 }
553 
554 /*
555  * Unbusy a chain.
556  */
557 static inline void
558 umtxq_unbusy(struct umtx_key *key)
559 {
560 	struct umtxq_chain *uc;
561 
562 	uc = umtxq_getchain(key);
563 	mtx_assert(&uc->uc_lock, MA_OWNED);
564 	KASSERT(uc->uc_busy != 0, ("not busy"));
565 	uc->uc_busy = 0;
566 	if (uc->uc_waiters)
567 		wakeup_one(uc);
568 }
569 
/*
 * Variant of umtxq_unbusy() for callers that do not hold the chain
 * lock: the lock is taken and released around the unbusy operation.
 */
static inline void
umtxq_unbusy_unlocked(struct umtx_key *key)
{
	umtxq_lock(key);
	umtxq_unbusy(key);
	umtxq_unlock(key);
}
578 
579 static struct umtxq_queue *
580 umtxq_queue_lookup(struct umtx_key *key, int q)
581 {
582 	struct umtxq_queue *uh;
583 	struct umtxq_chain *uc;
584 
585 	uc = umtxq_getchain(key);
586 	UMTXQ_LOCKED_ASSERT(uc);
587 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
588 		if (umtx_key_match(&uh->key, key))
589 			return (uh);
590 	}
591 
592 	return (NULL);
593 }
594 
595 static inline void
596 umtxq_insert_queue(struct umtx_q *uq, int q)
597 {
598 	struct umtxq_queue *uh;
599 	struct umtxq_chain *uc;
600 
601 	uc = umtxq_getchain(&uq->uq_key);
602 	UMTXQ_LOCKED_ASSERT(uc);
603 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
604 	uh = umtxq_queue_lookup(&uq->uq_key, q);
605 	if (uh != NULL) {
606 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
607 	} else {
608 		uh = uq->uq_spare_queue;
609 		uh->key = uq->uq_key;
610 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
611 #ifdef UMTX_PROFILING
612 		uc->length++;
613 		if (uc->length > uc->max_length) {
614 			uc->max_length = uc->length;
615 			if (uc->max_length > max_length)
616 				max_length = uc->max_length;
617 		}
618 #endif
619 	}
620 	uq->uq_spare_queue = NULL;
621 
622 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
623 	uh->length++;
624 	uq->uq_flags |= UQF_UMTXQ;
625 	uq->uq_cur_queue = uh;
626 	return;
627 }
628 
629 static inline void
630 umtxq_remove_queue(struct umtx_q *uq, int q)
631 {
632 	struct umtxq_chain *uc;
633 	struct umtxq_queue *uh;
634 
635 	uc = umtxq_getchain(&uq->uq_key);
636 	UMTXQ_LOCKED_ASSERT(uc);
637 	if (uq->uq_flags & UQF_UMTXQ) {
638 		uh = uq->uq_cur_queue;
639 		TAILQ_REMOVE(&uh->head, uq, uq_link);
640 		uh->length--;
641 		uq->uq_flags &= ~UQF_UMTXQ;
642 		if (TAILQ_EMPTY(&uh->head)) {
643 			KASSERT(uh->length == 0,
644 			    ("inconsistent umtxq_queue length"));
645 #ifdef UMTX_PROFILING
646 			uc->length--;
647 #endif
648 			LIST_REMOVE(uh, link);
649 		} else {
650 			uh = LIST_FIRST(&uc->uc_spare_queue);
651 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
652 			LIST_REMOVE(uh, link);
653 		}
654 		uq->uq_spare_queue = uh;
655 		uq->uq_cur_queue = NULL;
656 	}
657 }
658 
659 /*
660  * Check if there are multiple waiters
661  */
662 static int
663 umtxq_count(struct umtx_key *key)
664 {
665 	struct umtxq_queue *uh;
666 
667 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
668 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
669 	if (uh != NULL)
670 		return (uh->length);
671 	return (0);
672 }
673 
674 /*
675  * Check if there are multiple PI waiters and returns first
676  * waiter.
677  */
678 static int
679 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
680 {
681 	struct umtxq_queue *uh;
682 
683 	*first = NULL;
684 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
685 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
686 	if (uh != NULL) {
687 		*first = TAILQ_FIRST(&uh->head);
688 		return (uh->length);
689 	}
690 	return (0);
691 }
692 
693 static int
694 umtxq_check_susp(struct thread *td)
695 {
696 	struct proc *p;
697 	int error;
698 
699 	/*
700 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
701 	 * eventually break the lockstep loop.
702 	 */
703 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
704 		return (0);
705 	error = 0;
706 	p = td->td_proc;
707 	PROC_LOCK(p);
708 	if (P_SHOULDSTOP(p) ||
709 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
710 		if (p->p_flag & P_SINGLE_EXIT)
711 			error = EINTR;
712 		else
713 			error = ERESTART;
714 	}
715 	PROC_UNLOCK(p);
716 	return (error);
717 }
718 
719 /*
720  * Wake up threads waiting on an userland object.
721  */
722 
723 static int
724 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
725 {
726 	struct umtxq_queue *uh;
727 	struct umtx_q *uq;
728 	int ret;
729 
730 	ret = 0;
731 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
732 	uh = umtxq_queue_lookup(key, q);
733 	if (uh != NULL) {
734 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
735 			umtxq_remove_queue(uq, q);
736 			wakeup(uq);
737 			if (++ret >= n_wake)
738 				return (ret);
739 		}
740 	}
741 	return (ret);
742 }
743 
744 
745 /*
746  * Wake up specified thread.
747  */
748 static inline void
749 umtxq_signal_thread(struct umtx_q *uq)
750 {
751 
752 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
753 	umtxq_remove(uq);
754 	wakeup(uq);
755 }
756 
/*
 * Convert a timespec interval into a tick count by way of a timeval.
 */
static inline int
tstohz(const struct timespec *tsp)
{
	struct timeval interval;

	TIMESPEC_TO_TIMEVAL(&interval, tsp);
	return (tvtohz(&interval));
}
765 
766 static void
767 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
768 	const struct timespec *timeout)
769 {
770 
771 	timo->clockid = clockid;
772 	if (!absolute) {
773 		timo->is_abs_real = false;
774 		abs_timeout_update(timo);
775 		timespecadd(&timo->cur, timeout, &timo->end);
776 	} else {
777 		timo->end = *timeout;
778 		timo->is_abs_real = clockid == CLOCK_REALTIME ||
779 		    clockid == CLOCK_REALTIME_FAST ||
780 		    clockid == CLOCK_REALTIME_PRECISE;
781 		/*
782 		 * If is_abs_real, umtxq_sleep will read the clock
783 		 * after setting td_rtcgen; otherwise, read it here.
784 		 */
785 		if (!timo->is_abs_real) {
786 			abs_timeout_update(timo);
787 		}
788 	}
789 }
790 
791 static void
792 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
793 {
794 
795 	abs_timeout_init(timo, umtxtime->_clockid,
796 	    (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout);
797 }
798 
799 static inline void
800 abs_timeout_update(struct abs_timeout *timo)
801 {
802 
803 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
804 }
805 
806 static int
807 abs_timeout_gethz(struct abs_timeout *timo)
808 {
809 	struct timespec tts;
810 
811 	if (timespeccmp(&timo->end, &timo->cur, <=))
812 		return (-1);
813 	timespecsub(&timo->end, &timo->cur, &tts);
814 	return (tstohz(&tts));
815 }
816 
817 static uint32_t
818 umtx_unlock_val(uint32_t flags, bool rb)
819 {
820 
821 	if (rb)
822 		return (UMUTEX_RB_OWNERDEAD);
823 	else if ((flags & UMUTEX_NONCONSISTENT) != 0)
824 		return (UMUTEX_RB_NOTRECOV);
825 	else
826 		return (UMUTEX_UNOWNED);
827 
828 }
829 
830 /*
831  * Put thread into sleep state, before sleeping, check if
832  * thread was removed from umtx queue.
833  */
834 static inline int
835 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
836 {
837 	struct umtxq_chain *uc;
838 	int error, timo;
839 
840 	if (abstime != NULL && abstime->is_abs_real) {
841 		curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation);
842 		abs_timeout_update(abstime);
843 	}
844 
845 	uc = umtxq_getchain(&uq->uq_key);
846 	UMTXQ_LOCKED_ASSERT(uc);
847 	for (;;) {
848 		if (!(uq->uq_flags & UQF_UMTXQ)) {
849 			error = 0;
850 			break;
851 		}
852 		if (abstime != NULL) {
853 			timo = abs_timeout_gethz(abstime);
854 			if (timo < 0) {
855 				error = ETIMEDOUT;
856 				break;
857 			}
858 		} else
859 			timo = 0;
860 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
861 		if (error == EINTR || error == ERESTART) {
862 			umtxq_lock(&uq->uq_key);
863 			break;
864 		}
865 		if (abstime != NULL) {
866 			if (abstime->is_abs_real)
867 				curthread->td_rtcgen =
868 				    atomic_load_acq_int(&rtc_generation);
869 			abs_timeout_update(abstime);
870 		}
871 		umtxq_lock(&uq->uq_key);
872 	}
873 
874 	curthread->td_rtcgen = 0;
875 	return (error);
876 }
877 
878 /*
879  * Convert userspace address into unique logical address.
880  */
881 int
882 umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
883 {
884 	struct thread *td = curthread;
885 	vm_map_t map;
886 	vm_map_entry_t entry;
887 	vm_pindex_t pindex;
888 	vm_prot_t prot;
889 	boolean_t wired;
890 
891 	key->type = type;
892 	if (share == THREAD_SHARE) {
893 		key->shared = 0;
894 		key->info.private.vs = td->td_proc->p_vmspace;
895 		key->info.private.addr = (uintptr_t)addr;
896 	} else {
897 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
898 		map = &td->td_proc->p_vmspace->vm_map;
899 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
900 		    &entry, &key->info.shared.object, &pindex, &prot,
901 		    &wired) != KERN_SUCCESS) {
902 			return (EFAULT);
903 		}
904 
905 		if ((share == PROCESS_SHARE) ||
906 		    (share == AUTO_SHARE &&
907 		     VM_INHERIT_SHARE == entry->inheritance)) {
908 			key->shared = 1;
909 			key->info.shared.offset = (vm_offset_t)addr -
910 			    entry->start + entry->offset;
911 			vm_object_reference(key->info.shared.object);
912 		} else {
913 			key->shared = 0;
914 			key->info.private.vs = td->td_proc->p_vmspace;
915 			key->info.private.addr = (uintptr_t)addr;
916 		}
917 		vm_map_lookup_done(map, entry);
918 	}
919 
920 	umtxq_hash(key);
921 	return (0);
922 }
923 
924 /*
925  * Release key.
926  */
927 void
928 umtx_key_release(struct umtx_key *key)
929 {
930 	if (key->shared)
931 		vm_object_deallocate(key->info.shared.object);
932 }
933 
934 /*
935  * Fetch and compare value, sleep on the address if value is not changed.
936  */
937 static int
938 do_wait(struct thread *td, void *addr, u_long id,
939     struct _umtx_time *timeout, int compat32, int is_private)
940 {
941 	struct abs_timeout timo;
942 	struct umtx_q *uq;
943 	u_long tmp;
944 	uint32_t tmp32;
945 	int error = 0;
946 
947 	uq = td->td_umtxq;
948 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
949 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
950 		return (error);
951 
952 	if (timeout != NULL)
953 		abs_timeout_init2(&timo, timeout);
954 
955 	umtxq_lock(&uq->uq_key);
956 	umtxq_insert(uq);
957 	umtxq_unlock(&uq->uq_key);
958 	if (compat32 == 0) {
959 		error = fueword(addr, &tmp);
960 		if (error != 0)
961 			error = EFAULT;
962 	} else {
963 		error = fueword32(addr, &tmp32);
964 		if (error == 0)
965 			tmp = tmp32;
966 		else
967 			error = EFAULT;
968 	}
969 	umtxq_lock(&uq->uq_key);
970 	if (error == 0) {
971 		if (tmp == id)
972 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
973 			    NULL : &timo);
974 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
975 			error = 0;
976 		else
977 			umtxq_remove(uq);
978 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
979 		umtxq_remove(uq);
980 	}
981 	umtxq_unlock(&uq->uq_key);
982 	umtx_key_release(&uq->uq_key);
983 	if (error == ERESTART)
984 		error = EINTR;
985 	return (error);
986 }
987 
988 /*
989  * Wake up threads sleeping on the specified address.
990  */
991 int
992 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
993 {
994 	struct umtx_key key;
995 	int ret;
996 
997 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
998 	    is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
999 		return (ret);
1000 	umtxq_lock(&key);
1001 	umtxq_signal(&key, n_wake);
1002 	umtxq_unlock(&key);
1003 	umtx_key_release(&key);
1004 	return (0);
1005 }
1006 
1007 /*
1008  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1009  */
1010 static int
1011 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
1012     struct _umtx_time *timeout, int mode)
1013 {
1014 	struct abs_timeout timo;
1015 	struct umtx_q *uq;
1016 	uint32_t owner, old, id;
1017 	int error, rv;
1018 
1019 	id = td->td_tid;
1020 	uq = td->td_umtxq;
1021 	error = 0;
1022 	if (timeout != NULL)
1023 		abs_timeout_init2(&timo, timeout);
1024 
1025 	/*
1026 	 * Care must be exercised when dealing with umtx structure. It
1027 	 * can fault on any access.
1028 	 */
1029 	for (;;) {
1030 		rv = fueword32(&m->m_owner, &owner);
1031 		if (rv == -1)
1032 			return (EFAULT);
1033 		if (mode == _UMUTEX_WAIT) {
1034 			if (owner == UMUTEX_UNOWNED ||
1035 			    owner == UMUTEX_CONTESTED ||
1036 			    owner == UMUTEX_RB_OWNERDEAD ||
1037 			    owner == UMUTEX_RB_NOTRECOV)
1038 				return (0);
1039 		} else {
1040 			/*
1041 			 * Robust mutex terminated.  Kernel duty is to
1042 			 * return EOWNERDEAD to the userspace.  The
1043 			 * umutex.m_flags UMUTEX_NONCONSISTENT is set
1044 			 * by the common userspace code.
1045 			 */
1046 			if (owner == UMUTEX_RB_OWNERDEAD) {
1047 				rv = casueword32(&m->m_owner,
1048 				    UMUTEX_RB_OWNERDEAD, &owner,
1049 				    id | UMUTEX_CONTESTED);
1050 				if (rv == -1)
1051 					return (EFAULT);
1052 				if (owner == UMUTEX_RB_OWNERDEAD)
1053 					return (EOWNERDEAD); /* success */
1054 				rv = umtxq_check_susp(td);
1055 				if (rv != 0)
1056 					return (rv);
1057 				continue;
1058 			}
1059 			if (owner == UMUTEX_RB_NOTRECOV)
1060 				return (ENOTRECOVERABLE);
1061 
1062 			/*
1063 			 * Try the uncontested case.  This should be
1064 			 * done in userland.
1065 			 */
1066 			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
1067 			    &owner, id);
1068 			/* The address was invalid. */
1069 			if (rv == -1)
1070 				return (EFAULT);
1071 
1072 			/* The acquire succeeded. */
1073 			if (owner == UMUTEX_UNOWNED)
1074 				return (0);
1075 
1076 			/*
1077 			 * If no one owns it but it is contested try
1078 			 * to acquire it.
1079 			 */
1080 			if (owner == UMUTEX_CONTESTED) {
1081 				rv = casueword32(&m->m_owner,
1082 				    UMUTEX_CONTESTED, &owner,
1083 				    id | UMUTEX_CONTESTED);
1084 				/* The address was invalid. */
1085 				if (rv == -1)
1086 					return (EFAULT);
1087 
1088 				if (owner == UMUTEX_CONTESTED)
1089 					return (0);
1090 
1091 				rv = umtxq_check_susp(td);
1092 				if (rv != 0)
1093 					return (rv);
1094 
1095 				/*
1096 				 * If this failed the lock has
1097 				 * changed, restart.
1098 				 */
1099 				continue;
1100 			}
1101 		}
1102 
1103 		if (mode == _UMUTEX_TRY)
1104 			return (EBUSY);
1105 
1106 		/*
1107 		 * If we caught a signal, we have retried and now
1108 		 * exit immediately.
1109 		 */
1110 		if (error != 0)
1111 			return (error);
1112 
1113 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1114 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1115 			return (error);
1116 
1117 		umtxq_lock(&uq->uq_key);
1118 		umtxq_busy(&uq->uq_key);
1119 		umtxq_insert(uq);
1120 		umtxq_unlock(&uq->uq_key);
1121 
1122 		/*
1123 		 * Set the contested bit so that a release in user space
1124 		 * knows to use the system call for unlock.  If this fails
1125 		 * either some one else has acquired the lock or it has been
1126 		 * released.
1127 		 */
1128 		rv = casueword32(&m->m_owner, owner, &old,
1129 		    owner | UMUTEX_CONTESTED);
1130 
1131 		/* The address was invalid. */
1132 		if (rv == -1) {
1133 			umtxq_lock(&uq->uq_key);
1134 			umtxq_remove(uq);
1135 			umtxq_unbusy(&uq->uq_key);
1136 			umtxq_unlock(&uq->uq_key);
1137 			umtx_key_release(&uq->uq_key);
1138 			return (EFAULT);
1139 		}
1140 
1141 		/*
1142 		 * We set the contested bit, sleep. Otherwise the lock changed
1143 		 * and we need to retry or we lost a race to the thread
1144 		 * unlocking the umtx.
1145 		 */
1146 		umtxq_lock(&uq->uq_key);
1147 		umtxq_unbusy(&uq->uq_key);
1148 		if (old == owner)
1149 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1150 			    NULL : &timo);
1151 		umtxq_remove(uq);
1152 		umtxq_unlock(&uq->uq_key);
1153 		umtx_key_release(&uq->uq_key);
1154 
1155 		if (error == 0)
1156 			error = umtxq_check_susp(td);
1157 	}
1158 
1159 	return (0);
1160 }
1161 
1162 /*
1163  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1164  */
1165 static int
1166 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1167 {
1168 	struct umtx_key key;
1169 	uint32_t owner, old, id, newlock;
1170 	int error, count;
1171 
1172 	id = td->td_tid;
1173 	/*
1174 	 * Make sure we own this mtx.
1175 	 */
1176 	error = fueword32(&m->m_owner, &owner);
1177 	if (error == -1)
1178 		return (EFAULT);
1179 
1180 	if ((owner & ~UMUTEX_CONTESTED) != id)
1181 		return (EPERM);
1182 
1183 	newlock = umtx_unlock_val(flags, rb);
1184 	if ((owner & UMUTEX_CONTESTED) == 0) {
1185 		error = casueword32(&m->m_owner, owner, &old, newlock);
1186 		if (error == -1)
1187 			return (EFAULT);
1188 		if (old == owner)
1189 			return (0);
1190 		owner = old;
1191 	}
1192 
1193 	/* We should only ever be in here for contested locks */
1194 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1195 	    &key)) != 0)
1196 		return (error);
1197 
1198 	umtxq_lock(&key);
1199 	umtxq_busy(&key);
1200 	count = umtxq_count(&key);
1201 	umtxq_unlock(&key);
1202 
1203 	/*
1204 	 * When unlocking the umtx, it must be marked as unowned if
1205 	 * there is zero or one thread only waiting for it.
1206 	 * Otherwise, it must be marked as contested.
1207 	 */
1208 	if (count > 1)
1209 		newlock |= UMUTEX_CONTESTED;
1210 	error = casueword32(&m->m_owner, owner, &old, newlock);
1211 	umtxq_lock(&key);
1212 	umtxq_signal(&key, 1);
1213 	umtxq_unbusy(&key);
1214 	umtxq_unlock(&key);
1215 	umtx_key_release(&key);
1216 	if (error == -1)
1217 		return (EFAULT);
1218 	if (old != owner)
1219 		return (EINVAL);
1220 	return (0);
1221 }
1222 
1223 /*
1224  * Check if the mutex is available and wake up a waiter,
1225  * only for simple mutex.
1226  */
1227 static int
1228 do_wake_umutex(struct thread *td, struct umutex *m)
1229 {
1230 	struct umtx_key key;
1231 	uint32_t owner;
1232 	uint32_t flags;
1233 	int error;
1234 	int count;
1235 
1236 	error = fueword32(&m->m_owner, &owner);
1237 	if (error == -1)
1238 		return (EFAULT);
1239 
1240 	if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD &&
1241 	    owner != UMUTEX_RB_NOTRECOV)
1242 		return (0);
1243 
1244 	error = fueword32(&m->m_flags, &flags);
1245 	if (error == -1)
1246 		return (EFAULT);
1247 
1248 	/* We should only ever be in here for contested locks */
1249 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1250 	    &key)) != 0)
1251 		return (error);
1252 
1253 	umtxq_lock(&key);
1254 	umtxq_busy(&key);
1255 	count = umtxq_count(&key);
1256 	umtxq_unlock(&key);
1257 
1258 	if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD &&
1259 	    owner != UMUTEX_RB_NOTRECOV) {
1260 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1261 		    UMUTEX_UNOWNED);
1262 		if (error == -1)
1263 			error = EFAULT;
1264 	}
1265 
1266 	umtxq_lock(&key);
1267 	if (error == 0 && count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
1268 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
1269 		umtxq_signal(&key, 1);
1270 	umtxq_unbusy(&key);
1271 	umtxq_unlock(&key);
1272 	umtx_key_release(&key);
1273 	return (error);
1274 }
1275 
1276 /*
1277  * Check if the mutex has waiters and tries to fix contention bit.
1278  */
1279 static int
1280 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1281 {
1282 	struct umtx_key key;
1283 	uint32_t owner, old;
1284 	int type;
1285 	int error;
1286 	int count;
1287 
1288 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT |
1289 	    UMUTEX_ROBUST)) {
1290 	case 0:
1291 	case UMUTEX_ROBUST:
1292 		type = TYPE_NORMAL_UMUTEX;
1293 		break;
1294 	case UMUTEX_PRIO_INHERIT:
1295 		type = TYPE_PI_UMUTEX;
1296 		break;
1297 	case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST):
1298 		type = TYPE_PI_ROBUST_UMUTEX;
1299 		break;
1300 	case UMUTEX_PRIO_PROTECT:
1301 		type = TYPE_PP_UMUTEX;
1302 		break;
1303 	case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST):
1304 		type = TYPE_PP_ROBUST_UMUTEX;
1305 		break;
1306 	default:
1307 		return (EINVAL);
1308 	}
1309 	if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0)
1310 		return (error);
1311 
1312 	owner = 0;
1313 	umtxq_lock(&key);
1314 	umtxq_busy(&key);
1315 	count = umtxq_count(&key);
1316 	umtxq_unlock(&key);
1317 	/*
1318 	 * Only repair contention bit if there is a waiter, this means the mutex
1319 	 * is still being referenced by userland code, otherwise don't update
1320 	 * any memory.
1321 	 */
1322 	if (count > 1) {
1323 		error = fueword32(&m->m_owner, &owner);
1324 		if (error == -1)
1325 			error = EFAULT;
1326 		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
1327 			error = casueword32(&m->m_owner, owner, &old,
1328 			    owner | UMUTEX_CONTESTED);
1329 			if (error == -1) {
1330 				error = EFAULT;
1331 				break;
1332 			}
1333 			if (old == owner)
1334 				break;
1335 			owner = old;
1336 			error = umtxq_check_susp(td);
1337 			if (error != 0)
1338 				break;
1339 		}
1340 	} else if (count == 1) {
1341 		error = fueword32(&m->m_owner, &owner);
1342 		if (error == -1)
1343 			error = EFAULT;
1344 		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
1345 		    (owner & UMUTEX_CONTESTED) == 0) {
1346 			error = casueword32(&m->m_owner, owner, &old,
1347 			    owner | UMUTEX_CONTESTED);
1348 			if (error == -1) {
1349 				error = EFAULT;
1350 				break;
1351 			}
1352 			if (old == owner)
1353 				break;
1354 			owner = old;
1355 			error = umtxq_check_susp(td);
1356 			if (error != 0)
1357 				break;
1358 		}
1359 	}
1360 	umtxq_lock(&key);
1361 	if (error == EFAULT) {
1362 		umtxq_signal(&key, INT_MAX);
1363 	} else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
1364 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
1365 		umtxq_signal(&key, 1);
1366 	umtxq_unbusy(&key);
1367 	umtxq_unlock(&key);
1368 	umtx_key_release(&key);
1369 	return (error);
1370 }
1371 
1372 static inline struct umtx_pi *
1373 umtx_pi_alloc(int flags)
1374 {
1375 	struct umtx_pi *pi;
1376 
1377 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1378 	TAILQ_INIT(&pi->pi_blocked);
1379 	atomic_add_int(&umtx_pi_allocated, 1);
1380 	return (pi);
1381 }
1382 
1383 static inline void
1384 umtx_pi_free(struct umtx_pi *pi)
1385 {
1386 	uma_zfree(umtx_pi_zone, pi);
1387 	atomic_add_int(&umtx_pi_allocated, -1);
1388 }
1389 
1390 /*
1391  * Adjust the thread's position on a pi_state after its priority has been
1392  * changed.
1393  */
1394 static int
1395 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1396 {
1397 	struct umtx_q *uq, *uq1, *uq2;
1398 	struct thread *td1;
1399 
1400 	mtx_assert(&umtx_lock, MA_OWNED);
1401 	if (pi == NULL)
1402 		return (0);
1403 
1404 	uq = td->td_umtxq;
1405 
1406 	/*
1407 	 * Check if the thread needs to be moved on the blocked chain.
1408 	 * It needs to be moved if either its priority is lower than
1409 	 * the previous thread or higher than the next thread.
1410 	 */
1411 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1412 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1413 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1414 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1415 		/*
1416 		 * Remove thread from blocked chain and determine where
1417 		 * it should be moved to.
1418 		 */
1419 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1420 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1421 			td1 = uq1->uq_thread;
1422 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1423 			if (UPRI(td1) > UPRI(td))
1424 				break;
1425 		}
1426 
1427 		if (uq1 == NULL)
1428 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1429 		else
1430 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1431 	}
1432 	return (1);
1433 }
1434 
1435 static struct umtx_pi *
1436 umtx_pi_next(struct umtx_pi *pi)
1437 {
1438 	struct umtx_q *uq_owner;
1439 
1440 	if (pi->pi_owner == NULL)
1441 		return (NULL);
1442 	uq_owner = pi->pi_owner->td_umtxq;
1443 	if (uq_owner == NULL)
1444 		return (NULL);
1445 	return (uq_owner->uq_pi_blocked);
1446 }
1447 
1448 /*
1449  * Floyd's Cycle-Finding Algorithm.
1450  */
1451 static bool
1452 umtx_pi_check_loop(struct umtx_pi *pi)
1453 {
1454 	struct umtx_pi *pi1;	/* fast iterator */
1455 
1456 	mtx_assert(&umtx_lock, MA_OWNED);
1457 	if (pi == NULL)
1458 		return (false);
1459 	pi1 = pi;
1460 	for (;;) {
1461 		pi = umtx_pi_next(pi);
1462 		if (pi == NULL)
1463 			break;
1464 		pi1 = umtx_pi_next(pi1);
1465 		if (pi1 == NULL)
1466 			break;
1467 		pi1 = umtx_pi_next(pi1);
1468 		if (pi1 == NULL)
1469 			break;
1470 		if (pi == pi1)
1471 			return (true);
1472 	}
1473 	return (false);
1474 }
1475 
1476 /*
1477  * Propagate priority when a thread is blocked on POSIX
1478  * PI mutex.
1479  */
1480 static void
1481 umtx_propagate_priority(struct thread *td)
1482 {
1483 	struct umtx_q *uq;
1484 	struct umtx_pi *pi;
1485 	int pri;
1486 
1487 	mtx_assert(&umtx_lock, MA_OWNED);
1488 	pri = UPRI(td);
1489 	uq = td->td_umtxq;
1490 	pi = uq->uq_pi_blocked;
1491 	if (pi == NULL)
1492 		return;
1493 	if (umtx_pi_check_loop(pi))
1494 		return;
1495 
1496 	for (;;) {
1497 		td = pi->pi_owner;
1498 		if (td == NULL || td == curthread)
1499 			return;
1500 
1501 		MPASS(td->td_proc != NULL);
1502 		MPASS(td->td_proc->p_magic == P_MAGIC);
1503 
1504 		thread_lock(td);
1505 		if (td->td_lend_user_pri > pri)
1506 			sched_lend_user_prio(td, pri);
1507 		else {
1508 			thread_unlock(td);
1509 			break;
1510 		}
1511 		thread_unlock(td);
1512 
1513 		/*
1514 		 * Pick up the lock that td is blocked on.
1515 		 */
1516 		uq = td->td_umtxq;
1517 		pi = uq->uq_pi_blocked;
1518 		if (pi == NULL)
1519 			break;
1520 		/* Resort td on the list if needed. */
1521 		umtx_pi_adjust_thread(pi, td);
1522 	}
1523 }
1524 
1525 /*
1526  * Unpropagate priority for a PI mutex when a thread blocked on
1527  * it is interrupted by signal or resumed by others.
1528  */
1529 static void
1530 umtx_repropagate_priority(struct umtx_pi *pi)
1531 {
1532 	struct umtx_q *uq, *uq_owner;
1533 	struct umtx_pi *pi2;
1534 	int pri;
1535 
1536 	mtx_assert(&umtx_lock, MA_OWNED);
1537 
1538 	if (umtx_pi_check_loop(pi))
1539 		return;
1540 	while (pi != NULL && pi->pi_owner != NULL) {
1541 		pri = PRI_MAX;
1542 		uq_owner = pi->pi_owner->td_umtxq;
1543 
1544 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1545 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1546 			if (uq != NULL) {
1547 				if (pri > UPRI(uq->uq_thread))
1548 					pri = UPRI(uq->uq_thread);
1549 			}
1550 		}
1551 
1552 		if (pri > uq_owner->uq_inherited_pri)
1553 			pri = uq_owner->uq_inherited_pri;
1554 		thread_lock(pi->pi_owner);
1555 		sched_lend_user_prio(pi->pi_owner, pri);
1556 		thread_unlock(pi->pi_owner);
1557 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1558 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1559 	}
1560 }
1561 
1562 /*
1563  * Insert a PI mutex into owned list.
1564  */
1565 static void
1566 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1567 {
1568 	struct umtx_q *uq_owner;
1569 
1570 	uq_owner = owner->td_umtxq;
1571 	mtx_assert(&umtx_lock, MA_OWNED);
1572 	MPASS(pi->pi_owner == NULL);
1573 	pi->pi_owner = owner;
1574 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1575 }
1576 
1577 
1578 /*
1579  * Disown a PI mutex, and remove it from the owned list.
1580  */
1581 static void
1582 umtx_pi_disown(struct umtx_pi *pi)
1583 {
1584 
1585 	mtx_assert(&umtx_lock, MA_OWNED);
1586 	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
1587 	pi->pi_owner = NULL;
1588 }
1589 
1590 /*
1591  * Claim ownership of a PI mutex.
1592  */
1593 static int
1594 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1595 {
1596 	struct umtx_q *uq;
1597 	int pri;
1598 
1599 	mtx_lock(&umtx_lock);
1600 	if (pi->pi_owner == owner) {
1601 		mtx_unlock(&umtx_lock);
1602 		return (0);
1603 	}
1604 
1605 	if (pi->pi_owner != NULL) {
1606 		/*
1607 		 * userland may have already messed the mutex, sigh.
1608 		 */
1609 		mtx_unlock(&umtx_lock);
1610 		return (EPERM);
1611 	}
1612 	umtx_pi_setowner(pi, owner);
1613 	uq = TAILQ_FIRST(&pi->pi_blocked);
1614 	if (uq != NULL) {
1615 		pri = UPRI(uq->uq_thread);
1616 		thread_lock(owner);
1617 		if (pri < UPRI(owner))
1618 			sched_lend_user_prio(owner, pri);
1619 		thread_unlock(owner);
1620 	}
1621 	mtx_unlock(&umtx_lock);
1622 	return (0);
1623 }
1624 
1625 /*
1626  * Adjust a thread's order position in its blocked PI mutex,
1627  * this may result new priority propagating process.
1628  */
1629 void
1630 umtx_pi_adjust(struct thread *td, u_char oldpri)
1631 {
1632 	struct umtx_q *uq;
1633 	struct umtx_pi *pi;
1634 
1635 	uq = td->td_umtxq;
1636 	mtx_lock(&umtx_lock);
1637 	/*
1638 	 * Pick up the lock that td is blocked on.
1639 	 */
1640 	pi = uq->uq_pi_blocked;
1641 	if (pi != NULL) {
1642 		umtx_pi_adjust_thread(pi, td);
1643 		umtx_repropagate_priority(pi);
1644 	}
1645 	mtx_unlock(&umtx_lock);
1646 }
1647 
/*
 * Sleep on a PI mutex.
 *
 * Queues uq on the umtx sleep queue and on pi's blocked list (kept in
 * ascending UPRI() order), propagates the caller's priority, sleeps via
 * umtxq_sleep() and undoes the queueing after waking.
 *
 * On entry the umtxq chain of uq->uq_key must be locked and marked busy
 * (both are asserted).  The chain is unbusied before sleeping and the
 * chain lock is dropped before returning.
 *
 * owner is the thread id passed to tdfind() when pi has no recorded
 * owner; shared selects the pid argument of that lookup (-1 instead of
 * the caller's pid).  wmesg and timo are handed to umtxq_sleep(), whose
 * return value is returned.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
    const char *wmesg, struct abs_timeout *timo, bool shared)
{
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int error, pri;
#ifdef INVARIANTS
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
#endif
	error = 0;
	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
	umtxq_insert(uq);
	mtx_lock(&umtx_lock);
	if (pi->pi_owner == NULL) {
		/*
		 * No owner recorded yet: look the owning thread up by id.
		 * umtx_lock is dropped around tdfind(), so the owner is
		 * re-checked before it is set.
		 */
		mtx_unlock(&umtx_lock);
		td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid);
		mtx_lock(&umtx_lock);
		if (td1 != NULL) {
			if (pi->pi_owner == NULL)
				umtx_pi_setowner(pi, td1);
			/*
			 * NOTE(review): presumably tdfind() returns with the
			 * process locked; it is unlocked here -- confirm.
			 */
			PROC_UNLOCK(td1->td_proc);
		}
	}

	/* Find the first blocked entry whose UPRI() value exceeds ours. */
	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	/* Insert in front of it, or at the tail if there is none. */
	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	umtx_propagate_priority(td);
	mtx_unlock(&umtx_lock);
	umtxq_unbusy(&uq->uq_key);

	error = umtxq_sleep(uq, wmesg, timo);
	umtxq_remove(uq);

	/* Awake again: leave the blocked list and recompute priorities. */
	mtx_lock(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_repropagate_priority(pi);
	mtx_unlock(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	return (error);
}
1715 
1716 /*
1717  * Add reference count for a PI mutex.
1718  */
1719 static void
1720 umtx_pi_ref(struct umtx_pi *pi)
1721 {
1722 
1723 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&pi->pi_key));
1724 	pi->pi_refcount++;
1725 }
1726 
1727 /*
1728  * Decrease reference count for a PI mutex, if the counter
1729  * is decreased to zero, its memory space is freed.
1730  */
1731 static void
1732 umtx_pi_unref(struct umtx_pi *pi)
1733 {
1734 	struct umtxq_chain *uc;
1735 
1736 	uc = umtxq_getchain(&pi->pi_key);
1737 	UMTXQ_LOCKED_ASSERT(uc);
1738 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1739 	if (--pi->pi_refcount == 0) {
1740 		mtx_lock(&umtx_lock);
1741 		if (pi->pi_owner != NULL)
1742 			umtx_pi_disown(pi);
1743 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1744 			("blocked queue not empty"));
1745 		mtx_unlock(&umtx_lock);
1746 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1747 		umtx_pi_free(pi);
1748 	}
1749 }
1750 
1751 /*
1752  * Find a PI mutex in hash table.
1753  */
1754 static struct umtx_pi *
1755 umtx_pi_lookup(struct umtx_key *key)
1756 {
1757 	struct umtxq_chain *uc;
1758 	struct umtx_pi *pi;
1759 
1760 	uc = umtxq_getchain(key);
1761 	UMTXQ_LOCKED_ASSERT(uc);
1762 
1763 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1764 		if (umtx_key_match(&pi->pi_key, key)) {
1765 			return (pi);
1766 		}
1767 	}
1768 	return (NULL);
1769 }
1770 
1771 /*
1772  * Insert a PI mutex into hash table.
1773  */
1774 static inline void
1775 umtx_pi_insert(struct umtx_pi *pi)
1776 {
1777 	struct umtxq_chain *uc;
1778 
1779 	uc = umtxq_getchain(&pi->pi_key);
1780 	UMTXQ_LOCKED_ASSERT(uc);
1781 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1782 }
1783 
/*
 * Lock a PI mutex.
 *
 * td is the locking thread and m the userland mutex.  flags selects the
 * robust/non-robust key type and the sharing mode.  timeout, when not
 * NULL, bounds the sleep.  A non-zero try makes the call fail with EBUSY
 * instead of sleeping.
 *
 * Returns 0 once the mutex is acquired, EOWNERDEAD when it was acquired
 * from the UMUTEX_RB_OWNERDEAD state, ENOTRECOVERABLE when the owner word
 * is UMUTEX_RB_NOTRECOV, EDEADLK when the caller already owns the mutex,
 * EBUSY in try mode, EFAULT when the owner word cannot be accessed, or an
 * error from umtx_key_get(), umtx_pi_claim(), umtxq_check_susp() or
 * umtxq_sleep_pi().
 */
static int
do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
    struct _umtx_time *timeout, int try)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, old_owner, owner, old;
	int error, rv;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * Find the umtx_pi for this key or create one.  A non-sleeping
	 * allocation is tried first with the chain locked; if that fails
	 * the chain lock is dropped for a sleeping allocation and the
	 * lookup is repeated, since another thread may have inserted a
	 * umtx_pi for the key in the meantime.
	 */
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	/* This reference is dropped again after the loop below. */
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with umtx structure.  It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
		/* The address was invalid. */
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		if (owner == UMUTEX_RB_NOTRECOV) {
			error = ENOTRECOVERABLE;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) {
			/* Keep the pre-CAS value; the CAS overwrites owner. */
			old_owner = owner;
			rv = casueword32(&m->m_owner, owner, &owner,
			    id | UMUTEX_CONTESTED);
			/* The address was invalid. */
			if (rv == -1) {
				error = EFAULT;
				break;
			}

			if (owner == old_owner) {
				/* The word is ours; record it in the kernel. */
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
				if (error != 0) {
					/*
					 * Since we're going to return an
					 * error, restore the m_owner to its
					 * previous, unowned state to avoid
					 * compounding the problem.
					 */
					(void)casuword32(&m->m_owner,
					    id | UMUTEX_CONTESTED,
					    old_owner);
				}
				if (error == 0 &&
				    old_owner == UMUTEX_RB_OWNERDEAD)
					error = EOWNERDEAD;
				break;
			}

			error = umtxq_check_susp(td);
			if (error != 0)
				break;

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/* The owner word already names the calling thread. */
		if ((owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/* Mark the chain busy while the owner word is updated. */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		rv = casueword32(&m->m_owner, owner, &old, owner |
		    UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.  Note that the UMUTEX_RB_OWNERDEAD
		 * value for owner is impossible there.
		 */
		if (old == owner) {
			/*
			 * umtxq_sleep_pi() unbusies the chain and drops the
			 * chain lock itself.  On error the loop is retried
			 * once with error still set, so the check above
			 * ends it if the mutex is still unavailable.
			 */
			error = umtxq_sleep_pi(uq, pi,
			    owner & ~UMUTEX_CONTESTED,
			    "umtxpi", timeout == NULL ? NULL : &timo,
			    (flags & USYNC_PROCESS_SHARED) != 0);
			if (error != 0)
				continue;
		} else {
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
		}

		error = umtxq_check_susp(td);
		if (error != 0)
			break;
	}

	/* Drop the reference taken before the loop. */
	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}
1967 
/*
 * Unlock a PI mutex.
 *
 * td is the unlocking thread, m the userland mutex, flags selects the
 * robust/non-robust key type and the sharing mode, and rb is passed to
 * umtx_unlock_val() to compute the released owner word.
 *
 * Returns 0 on success, EFAULT when the owner word cannot be accessed,
 * EPERM when the caller is not the owner according to the owner word or
 * to the kernel's umtx_pi, EINVAL when the owner word changed underneath
 * the final update, or the error from umtx_key_get().
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t id, new_owner, old, owner;
	int count, error, pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(&m->m_owner, &owner);
	if (error == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	new_owner = umtx_unlock_val(flags, rb);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		error = casueword32(&m->m_owner, owner, &old, new_owner);
		if (error == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		/* The word changed under us; continue with what was seen. */
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		mtx_lock(&umtx_lock);
		pi = uq_first->uq_pi_blocked;
		KASSERT(pi != NULL, ("pi == NULL?"));
		/*
		 * The kernel-side owner must be the caller; an unowned
		 * umtx_pi is tolerated only when rb is set.
		 */
		if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) {
			mtx_unlock(&umtx_lock);
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			umtx_key_release(&key);
			/* userland messed the mutex */
			return (EPERM);
		}
		uq_me = td->td_umtxq;
		if (pi->pi_owner == td)
			umtx_pi_disown(pi);
		/* get highest priority thread which is still sleeping. */
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		while (uq_first != NULL &&
		    (uq_first->uq_flags & UQF_UMTXQ) == 0) {
			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
		}
		/*
		 * Recompute the priority lent to the caller from the first
		 * waiters of the PI mutexes still on its contested list.
		 */
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock(&umtx_lock);
		if (uq_first)
			umtxq_signal_thread(uq_first);
	} else {
		pi = umtx_pi_lookup(&key);
		/*
		 * A umtx_pi can exist if a signal or timeout removed the
		 * last waiter from the umtxq, but there is still
		 * a thread in do_lock_pi() holding the umtx_pi.
		 */
		if (pi != NULL) {
			/*
			 * The umtx_pi can be unowned, such as when a thread
			 * has just entered do_lock_pi(), allocated the
			 * umtx_pi, and unlocked the umtxq.
			 * If the current thread owns it, it must disown it.
			 */
			mtx_lock(&umtx_lock);
			if (pi->pi_owner == td)
				umtx_pi_disown(pi);
			mtx_unlock(&umtx_lock);
		}
	}
	/* The chain stays busy until the owner word has been updated. */
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */

	if (count > 1)
		new_owner |= UMUTEX_CONTESTED;
	error = casueword32(&m->m_owner, owner, &old, new_owner);

	umtxq_unbusy_unlocked(&key);
	umtx_key_release(&key);
	if (error == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
2087 
/*
 * Lock a PP mutex.
 *
 * td is the locking thread, m the userland mutex, flags selects the
 * robust/non-robust key type and the sharing mode.  timeout, when not
 * NULL, bounds the sleep; a non-zero try makes the call fail with EBUSY
 * instead of sleeping.
 *
 * Returns 0 once the mutex is acquired, EOWNERDEAD when it was acquired
 * from the UMUTEX_RB_OWNERDEAD state, ENOTRECOVERABLE when the owner word
 * is UMUTEX_RB_NOTRECOV, EINVAL for an out-of-range ceiling or when the
 * caller's UPRI() value is below the ceiling priority, EBUSY in try mode,
 * EFAULT when user memory cannot be accessed, or an error from
 * umtx_key_get() or umtxq_sleep().
 */
static int
do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
    struct _umtx_time *timeout, int try)
{
	struct abs_timeout timo;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su, rv;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/* Non-zero when the PRIV_SCHED_RTPRIO privilege check succeeds. */
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		/* Saved so that the inherited priority can be restored. */
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		rv = fueword32(&m->m_ceilings[0], &ceiling);
		if (rv == -1) {
			error = EFAULT;
			goto out;
		}
		/*
		 * ceiling is unsigned, so a userland value above
		 * RTP_PRIO_MAX wraps around and is rejected below.
		 */
		ceiling = RTP_PRIO_MAX - ceiling;
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		/*
		 * With the privilege, record the ceiling as the inherited
		 * priority when it is numerically smaller, and lend it to
		 * the thread if that lowers the thread's UPRI() value.
		 */
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock(&umtx_lock);

		/* The unlocked state of the word is UMUTEX_CONTESTED here. */
		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
		    id | UMUTEX_CONTESTED);
		/* The address was invalid. */
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		} else if (owner == UMUTEX_RB_OWNERDEAD) {
			rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD,
			    &owner, id | UMUTEX_CONTESTED);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (owner == UMUTEX_RB_OWNERDEAD) {
				error = EOWNERDEAD; /* success */
				break;
			}
			/* The word changed; fall through to the wait path. */
			error = 0;
		} else if (owner == UMUTEX_RB_NOTRECOV) {
			error = ENOTRECOVERABLE;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
		    NULL : &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Undo the ceiling boost before retrying: restore the saved
		 * inherited priority and recompute the lent priority from
		 * the first waiters of the contested PI mutexes.
		 */
		mtx_lock(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock(&umtx_lock);
	}

	/*
	 * The mutex was not acquired (EOWNERDEAD counts as acquired):
	 * undo the ceiling boost in the same way as above.
	 */
	if (error != 0 && error != EOWNERDEAD) {
		mtx_lock(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock(&umtx_lock);
	}

	/* The goto paths above skip the priority restore and land here. */
out:
	umtxq_unbusy_unlocked(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
2235 
/*
 * Unlock a PP mutex.
 *
 * td is the unlocking thread, m the userland mutex, flags selects the
 * robust/non-robust key type and the sharing mode, and rb is passed to
 * umtx_unlock_val() to compute the released owner word.
 *
 * Returns 0 on success, EFAULT when the owner word cannot be read or
 * written, EPERM when the caller is not the recorded owner, EINVAL for an
 * out-of-range m_ceilings[1] value, or an error from copyin() or
 * umtx_key_get().
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t id, owner, rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	/* Non-zero when the PRIV_SCHED_RTPRIO privilege check succeeds. */
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(&m->m_owner, &owner);
	if (error == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	/*
	 * rceiling is unsigned, so the comparison with -1 matches the
	 * all-ones value; in that case the inherited priority becomes
	 * PRI_MAX.  Any other value is converted like a ceiling, with
	 * the unsigned wrap-around catching values above RTP_PRIO_MAX.
	 */
	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For priority protected mutex, always set unlocked state
	 * to UMUTEX_CONTESTED, so that userland always enters kernel
	 * to lock the mutex, it is necessary because thread priority
	 * has to be adjusted for such mutex.
	 */
	error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) |
	    UMUTEX_CONTESTED);

	/* A waiter is signalled only when the store succeeded. */
	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		/*
		 * Install the new inherited priority (only with the
		 * privilege) and recompute the lent priority from the
		 * first waiters of the contested PI mutexes.
		 */
		mtx_lock(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}
2321 
2322 static int
2323 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2324     uint32_t *old_ceiling)
2325 {
2326 	struct umtx_q *uq;
2327 	uint32_t flags, id, owner, save_ceiling;
2328 	int error, rv, rv1;
2329 
2330 	error = fueword32(&m->m_flags, &flags);
2331 	if (error == -1)
2332 		return (EFAULT);
2333 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2334 		return (EINVAL);
2335 	if (ceiling > RTP_PRIO_MAX)
2336 		return (EINVAL);
2337 	id = td->td_tid;
2338 	uq = td->td_umtxq;
2339 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2340 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2341 	    &uq->uq_key)) != 0)
2342 		return (error);
2343 	for (;;) {
2344 		umtxq_lock(&uq->uq_key);
2345 		umtxq_busy(&uq->uq_key);
2346 		umtxq_unlock(&uq->uq_key);
2347 
2348 		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2349 		if (rv == -1) {
2350 			error = EFAULT;
2351 			break;
2352 		}
2353 
2354 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2355 		    id | UMUTEX_CONTESTED);
2356 		if (rv == -1) {
2357 			error = EFAULT;
2358 			break;
2359 		}
2360 
2361 		if (owner == UMUTEX_CONTESTED) {
2362 			rv = suword32(&m->m_ceilings[0], ceiling);
2363 			rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED);
2364 			error = (rv == 0 && rv1 == 0) ? 0: EFAULT;
2365 			break;
2366 		}
2367 
2368 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2369 			rv = suword32(&m->m_ceilings[0], ceiling);
2370 			error = rv == 0 ? 0 : EFAULT;
2371 			break;
2372 		}
2373 
2374 		if (owner == UMUTEX_RB_OWNERDEAD) {
2375 			error = EOWNERDEAD;
2376 			break;
2377 		} else if (owner == UMUTEX_RB_NOTRECOV) {
2378 			error = ENOTRECOVERABLE;
2379 			break;
2380 		}
2381 
2382 		/*
2383 		 * If we caught a signal, we have retried and now
2384 		 * exit immediately.
2385 		 */
2386 		if (error != 0)
2387 			break;
2388 
2389 		/*
2390 		 * We set the contested bit, sleep. Otherwise the lock changed
2391 		 * and we need to retry or we lost a race to the thread
2392 		 * unlocking the umtx.
2393 		 */
2394 		umtxq_lock(&uq->uq_key);
2395 		umtxq_insert(uq);
2396 		umtxq_unbusy(&uq->uq_key);
2397 		error = umtxq_sleep(uq, "umtxpp", NULL);
2398 		umtxq_remove(uq);
2399 		umtxq_unlock(&uq->uq_key);
2400 	}
2401 	umtxq_lock(&uq->uq_key);
2402 	if (error == 0)
2403 		umtxq_signal(&uq->uq_key, INT_MAX);
2404 	umtxq_unbusy(&uq->uq_key);
2405 	umtxq_unlock(&uq->uq_key);
2406 	umtx_key_release(&uq->uq_key);
2407 	if (error == 0 && old_ceiling != NULL) {
2408 		rv = suword32(old_ceiling, save_ceiling);
2409 		error = rv == 0 ? 0 : EFAULT;
2410 	}
2411 	return (error);
2412 }
2413 
2414 /*
2415  * Lock a userland POSIX mutex.
2416  */
2417 static int
2418 do_lock_umutex(struct thread *td, struct umutex *m,
2419     struct _umtx_time *timeout, int mode)
2420 {
2421 	uint32_t flags;
2422 	int error;
2423 
2424 	error = fueword32(&m->m_flags, &flags);
2425 	if (error == -1)
2426 		return (EFAULT);
2427 
2428 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2429 	case 0:
2430 		error = do_lock_normal(td, m, flags, timeout, mode);
2431 		break;
2432 	case UMUTEX_PRIO_INHERIT:
2433 		error = do_lock_pi(td, m, flags, timeout, mode);
2434 		break;
2435 	case UMUTEX_PRIO_PROTECT:
2436 		error = do_lock_pp(td, m, flags, timeout, mode);
2437 		break;
2438 	default:
2439 		return (EINVAL);
2440 	}
2441 	if (timeout == NULL) {
2442 		if (error == EINTR && mode != _UMUTEX_WAIT)
2443 			error = ERESTART;
2444 	} else {
2445 		/* Timed-locking is not restarted. */
2446 		if (error == ERESTART)
2447 			error = EINTR;
2448 	}
2449 	return (error);
2450 }
2451 
2452 /*
2453  * Unlock a userland POSIX mutex.
2454  */
2455 static int
2456 do_unlock_umutex(struct thread *td, struct umutex *m, bool rb)
2457 {
2458 	uint32_t flags;
2459 	int error;
2460 
2461 	error = fueword32(&m->m_flags, &flags);
2462 	if (error == -1)
2463 		return (EFAULT);
2464 
2465 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2466 	case 0:
2467 		return (do_unlock_normal(td, m, flags, rb));
2468 	case UMUTEX_PRIO_INHERIT:
2469 		return (do_unlock_pi(td, m, flags, rb));
2470 	case UMUTEX_PRIO_PROTECT:
2471 		return (do_unlock_pp(td, m, flags, rb));
2472 	}
2473 
2474 	return (EINVAL);
2475 }
2476 
2477 static int
2478 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2479     struct timespec *timeout, u_long wflags)
2480 {
2481 	struct abs_timeout timo;
2482 	struct umtx_q *uq;
2483 	uint32_t flags, clockid, hasw;
2484 	int error;
2485 
2486 	uq = td->td_umtxq;
2487 	error = fueword32(&cv->c_flags, &flags);
2488 	if (error == -1)
2489 		return (EFAULT);
2490 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2491 	if (error != 0)
2492 		return (error);
2493 
2494 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2495 		error = fueword32(&cv->c_clockid, &clockid);
2496 		if (error == -1) {
2497 			umtx_key_release(&uq->uq_key);
2498 			return (EFAULT);
2499 		}
2500 		if (clockid < CLOCK_REALTIME ||
2501 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2502 			/* hmm, only HW clock id will work. */
2503 			umtx_key_release(&uq->uq_key);
2504 			return (EINVAL);
2505 		}
2506 	} else {
2507 		clockid = CLOCK_REALTIME;
2508 	}
2509 
2510 	umtxq_lock(&uq->uq_key);
2511 	umtxq_busy(&uq->uq_key);
2512 	umtxq_insert(uq);
2513 	umtxq_unlock(&uq->uq_key);
2514 
2515 	/*
2516 	 * Set c_has_waiters to 1 before releasing user mutex, also
2517 	 * don't modify cache line when unnecessary.
2518 	 */
2519 	error = fueword32(&cv->c_has_waiters, &hasw);
2520 	if (error == 0 && hasw == 0)
2521 		suword32(&cv->c_has_waiters, 1);
2522 
2523 	umtxq_unbusy_unlocked(&uq->uq_key);
2524 
2525 	error = do_unlock_umutex(td, m, false);
2526 
2527 	if (timeout != NULL)
2528 		abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0,
2529 		    timeout);
2530 
2531 	umtxq_lock(&uq->uq_key);
2532 	if (error == 0) {
2533 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2534 		    NULL : &timo);
2535 	}
2536 
2537 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2538 		error = 0;
2539 	else {
2540 		/*
2541 		 * This must be timeout,interrupted by signal or
2542 		 * surprious wakeup, clear c_has_waiter flag when
2543 		 * necessary.
2544 		 */
2545 		umtxq_busy(&uq->uq_key);
2546 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2547 			int oldlen = uq->uq_cur_queue->length;
2548 			umtxq_remove(uq);
2549 			if (oldlen == 1) {
2550 				umtxq_unlock(&uq->uq_key);
2551 				suword32(&cv->c_has_waiters, 0);
2552 				umtxq_lock(&uq->uq_key);
2553 			}
2554 		}
2555 		umtxq_unbusy(&uq->uq_key);
2556 		if (error == ERESTART)
2557 			error = EINTR;
2558 	}
2559 
2560 	umtxq_unlock(&uq->uq_key);
2561 	umtx_key_release(&uq->uq_key);
2562 	return (error);
2563 }
2564 
2565 /*
2566  * Signal a userland condition variable.
2567  */
2568 static int
2569 do_cv_signal(struct thread *td, struct ucond *cv)
2570 {
2571 	struct umtx_key key;
2572 	int error, cnt, nwake;
2573 	uint32_t flags;
2574 
2575 	error = fueword32(&cv->c_flags, &flags);
2576 	if (error == -1)
2577 		return (EFAULT);
2578 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2579 		return (error);
2580 	umtxq_lock(&key);
2581 	umtxq_busy(&key);
2582 	cnt = umtxq_count(&key);
2583 	nwake = umtxq_signal(&key, 1);
2584 	if (cnt <= nwake) {
2585 		umtxq_unlock(&key);
2586 		error = suword32(&cv->c_has_waiters, 0);
2587 		if (error == -1)
2588 			error = EFAULT;
2589 		umtxq_lock(&key);
2590 	}
2591 	umtxq_unbusy(&key);
2592 	umtxq_unlock(&key);
2593 	umtx_key_release(&key);
2594 	return (error);
2595 }
2596 
2597 static int
2598 do_cv_broadcast(struct thread *td, struct ucond *cv)
2599 {
2600 	struct umtx_key key;
2601 	int error;
2602 	uint32_t flags;
2603 
2604 	error = fueword32(&cv->c_flags, &flags);
2605 	if (error == -1)
2606 		return (EFAULT);
2607 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2608 		return (error);
2609 
2610 	umtxq_lock(&key);
2611 	umtxq_busy(&key);
2612 	umtxq_signal(&key, INT_MAX);
2613 	umtxq_unlock(&key);
2614 
2615 	error = suword32(&cv->c_has_waiters, 0);
2616 	if (error == -1)
2617 		error = EFAULT;
2618 
2619 	umtxq_unbusy_unlocked(&key);
2620 
2621 	umtx_key_release(&key);
2622 	return (error);
2623 }
2624 
2625 static int
2626 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag,
2627     struct _umtx_time *timeout)
2628 {
2629 	struct abs_timeout timo;
2630 	struct umtx_q *uq;
2631 	uint32_t flags, wrflags;
2632 	int32_t state, oldstate;
2633 	int32_t blocked_readers;
2634 	int error, error1, rv;
2635 
2636 	uq = td->td_umtxq;
2637 	error = fueword32(&rwlock->rw_flags, &flags);
2638 	if (error == -1)
2639 		return (EFAULT);
2640 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2641 	if (error != 0)
2642 		return (error);
2643 
2644 	if (timeout != NULL)
2645 		abs_timeout_init2(&timo, timeout);
2646 
2647 	wrflags = URWLOCK_WRITE_OWNER;
2648 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2649 		wrflags |= URWLOCK_WRITE_WAITERS;
2650 
2651 	for (;;) {
2652 		rv = fueword32(&rwlock->rw_state, &state);
2653 		if (rv == -1) {
2654 			umtx_key_release(&uq->uq_key);
2655 			return (EFAULT);
2656 		}
2657 
2658 		/* try to lock it */
2659 		while (!(state & wrflags)) {
2660 			if (__predict_false(URWLOCK_READER_COUNT(state) ==
2661 			    URWLOCK_MAX_READERS)) {
2662 				umtx_key_release(&uq->uq_key);
2663 				return (EAGAIN);
2664 			}
2665 			rv = casueword32(&rwlock->rw_state, state,
2666 			    &oldstate, state + 1);
2667 			if (rv == -1) {
2668 				umtx_key_release(&uq->uq_key);
2669 				return (EFAULT);
2670 			}
2671 			if (oldstate == state) {
2672 				umtx_key_release(&uq->uq_key);
2673 				return (0);
2674 			}
2675 			error = umtxq_check_susp(td);
2676 			if (error != 0)
2677 				break;
2678 			state = oldstate;
2679 		}
2680 
2681 		if (error)
2682 			break;
2683 
2684 		/* grab monitor lock */
2685 		umtxq_lock(&uq->uq_key);
2686 		umtxq_busy(&uq->uq_key);
2687 		umtxq_unlock(&uq->uq_key);
2688 
2689 		/*
2690 		 * re-read the state, in case it changed between the try-lock above
2691 		 * and the check below
2692 		 */
2693 		rv = fueword32(&rwlock->rw_state, &state);
2694 		if (rv == -1)
2695 			error = EFAULT;
2696 
2697 		/* set read contention bit */
2698 		while (error == 0 && (state & wrflags) &&
2699 		    !(state & URWLOCK_READ_WAITERS)) {
2700 			rv = casueword32(&rwlock->rw_state, state,
2701 			    &oldstate, state | URWLOCK_READ_WAITERS);
2702 			if (rv == -1) {
2703 				error = EFAULT;
2704 				break;
2705 			}
2706 			if (oldstate == state)
2707 				goto sleep;
2708 			state = oldstate;
2709 			error = umtxq_check_susp(td);
2710 			if (error != 0)
2711 				break;
2712 		}
2713 		if (error != 0) {
2714 			umtxq_unbusy_unlocked(&uq->uq_key);
2715 			break;
2716 		}
2717 
2718 		/* state is changed while setting flags, restart */
2719 		if (!(state & wrflags)) {
2720 			umtxq_unbusy_unlocked(&uq->uq_key);
2721 			error = umtxq_check_susp(td);
2722 			if (error != 0)
2723 				break;
2724 			continue;
2725 		}
2726 
2727 sleep:
2728 		/*
2729 		 * Contention bit is set, before sleeping, increase
2730 		 * read waiter count.
2731 		 */
2732 		rv = fueword32(&rwlock->rw_blocked_readers,
2733 		    &blocked_readers);
2734 		if (rv == -1) {
2735 			umtxq_unbusy_unlocked(&uq->uq_key);
2736 			error = EFAULT;
2737 			break;
2738 		}
2739 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2740 
2741 		while (state & wrflags) {
2742 			umtxq_lock(&uq->uq_key);
2743 			umtxq_insert(uq);
2744 			umtxq_unbusy(&uq->uq_key);
2745 
2746 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2747 			    NULL : &timo);
2748 
2749 			umtxq_busy(&uq->uq_key);
2750 			umtxq_remove(uq);
2751 			umtxq_unlock(&uq->uq_key);
2752 			if (error)
2753 				break;
2754 			rv = fueword32(&rwlock->rw_state, &state);
2755 			if (rv == -1) {
2756 				error = EFAULT;
2757 				break;
2758 			}
2759 		}
2760 
2761 		/* decrease read waiter count, and may clear read contention bit */
2762 		rv = fueword32(&rwlock->rw_blocked_readers,
2763 		    &blocked_readers);
2764 		if (rv == -1) {
2765 			umtxq_unbusy_unlocked(&uq->uq_key);
2766 			error = EFAULT;
2767 			break;
2768 		}
2769 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2770 		if (blocked_readers == 1) {
2771 			rv = fueword32(&rwlock->rw_state, &state);
2772 			if (rv == -1) {
2773 				umtxq_unbusy_unlocked(&uq->uq_key);
2774 				error = EFAULT;
2775 				break;
2776 			}
2777 			for (;;) {
2778 				rv = casueword32(&rwlock->rw_state, state,
2779 				    &oldstate, state & ~URWLOCK_READ_WAITERS);
2780 				if (rv == -1) {
2781 					error = EFAULT;
2782 					break;
2783 				}
2784 				if (oldstate == state)
2785 					break;
2786 				state = oldstate;
2787 				error1 = umtxq_check_susp(td);
2788 				if (error1 != 0) {
2789 					if (error == 0)
2790 						error = error1;
2791 					break;
2792 				}
2793 			}
2794 		}
2795 
2796 		umtxq_unbusy_unlocked(&uq->uq_key);
2797 		if (error != 0)
2798 			break;
2799 	}
2800 	umtx_key_release(&uq->uq_key);
2801 	if (error == ERESTART)
2802 		error = EINTR;
2803 	return (error);
2804 }
2805 
2806 static int
2807 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2808 {
2809 	struct abs_timeout timo;
2810 	struct umtx_q *uq;
2811 	uint32_t flags;
2812 	int32_t state, oldstate;
2813 	int32_t blocked_writers;
2814 	int32_t blocked_readers;
2815 	int error, error1, rv;
2816 
2817 	uq = td->td_umtxq;
2818 	error = fueword32(&rwlock->rw_flags, &flags);
2819 	if (error == -1)
2820 		return (EFAULT);
2821 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2822 	if (error != 0)
2823 		return (error);
2824 
2825 	if (timeout != NULL)
2826 		abs_timeout_init2(&timo, timeout);
2827 
2828 	blocked_readers = 0;
2829 	for (;;) {
2830 		rv = fueword32(&rwlock->rw_state, &state);
2831 		if (rv == -1) {
2832 			umtx_key_release(&uq->uq_key);
2833 			return (EFAULT);
2834 		}
2835 		while ((state & URWLOCK_WRITE_OWNER) == 0 &&
2836 		    URWLOCK_READER_COUNT(state) == 0) {
2837 			rv = casueword32(&rwlock->rw_state, state,
2838 			    &oldstate, state | URWLOCK_WRITE_OWNER);
2839 			if (rv == -1) {
2840 				umtx_key_release(&uq->uq_key);
2841 				return (EFAULT);
2842 			}
2843 			if (oldstate == state) {
2844 				umtx_key_release(&uq->uq_key);
2845 				return (0);
2846 			}
2847 			state = oldstate;
2848 			error = umtxq_check_susp(td);
2849 			if (error != 0)
2850 				break;
2851 		}
2852 
2853 		if (error) {
2854 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2855 			    blocked_readers != 0) {
2856 				umtxq_lock(&uq->uq_key);
2857 				umtxq_busy(&uq->uq_key);
2858 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2859 				umtxq_unbusy(&uq->uq_key);
2860 				umtxq_unlock(&uq->uq_key);
2861 			}
2862 
2863 			break;
2864 		}
2865 
2866 		/* grab monitor lock */
2867 		umtxq_lock(&uq->uq_key);
2868 		umtxq_busy(&uq->uq_key);
2869 		umtxq_unlock(&uq->uq_key);
2870 
2871 		/*
2872 		 * Re-read the state, in case it changed between the
2873 		 * try-lock above and the check below.
2874 		 */
2875 		rv = fueword32(&rwlock->rw_state, &state);
2876 		if (rv == -1)
2877 			error = EFAULT;
2878 
2879 		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
2880 		    URWLOCK_READER_COUNT(state) != 0) &&
2881 		    (state & URWLOCK_WRITE_WAITERS) == 0) {
2882 			rv = casueword32(&rwlock->rw_state, state,
2883 			    &oldstate, state | URWLOCK_WRITE_WAITERS);
2884 			if (rv == -1) {
2885 				error = EFAULT;
2886 				break;
2887 			}
2888 			if (oldstate == state)
2889 				goto sleep;
2890 			state = oldstate;
2891 			error = umtxq_check_susp(td);
2892 			if (error != 0)
2893 				break;
2894 		}
2895 		if (error != 0) {
2896 			umtxq_unbusy_unlocked(&uq->uq_key);
2897 			break;
2898 		}
2899 
2900 		if ((state & URWLOCK_WRITE_OWNER) == 0 &&
2901 		    URWLOCK_READER_COUNT(state) == 0) {
2902 			umtxq_unbusy_unlocked(&uq->uq_key);
2903 			error = umtxq_check_susp(td);
2904 			if (error != 0)
2905 				break;
2906 			continue;
2907 		}
2908 sleep:
2909 		rv = fueword32(&rwlock->rw_blocked_writers,
2910 		    &blocked_writers);
2911 		if (rv == -1) {
2912 			umtxq_unbusy_unlocked(&uq->uq_key);
2913 			error = EFAULT;
2914 			break;
2915 		}
2916 		suword32(&rwlock->rw_blocked_writers, blocked_writers + 1);
2917 
2918 		while ((state & URWLOCK_WRITE_OWNER) ||
2919 		    URWLOCK_READER_COUNT(state) != 0) {
2920 			umtxq_lock(&uq->uq_key);
2921 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2922 			umtxq_unbusy(&uq->uq_key);
2923 
2924 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2925 			    NULL : &timo);
2926 
2927 			umtxq_busy(&uq->uq_key);
2928 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2929 			umtxq_unlock(&uq->uq_key);
2930 			if (error)
2931 				break;
2932 			rv = fueword32(&rwlock->rw_state, &state);
2933 			if (rv == -1) {
2934 				error = EFAULT;
2935 				break;
2936 			}
2937 		}
2938 
2939 		rv = fueword32(&rwlock->rw_blocked_writers,
2940 		    &blocked_writers);
2941 		if (rv == -1) {
2942 			umtxq_unbusy_unlocked(&uq->uq_key);
2943 			error = EFAULT;
2944 			break;
2945 		}
2946 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2947 		if (blocked_writers == 1) {
2948 			rv = fueword32(&rwlock->rw_state, &state);
2949 			if (rv == -1) {
2950 				umtxq_unbusy_unlocked(&uq->uq_key);
2951 				error = EFAULT;
2952 				break;
2953 			}
2954 			for (;;) {
2955 				rv = casueword32(&rwlock->rw_state, state,
2956 				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
2957 				if (rv == -1) {
2958 					error = EFAULT;
2959 					break;
2960 				}
2961 				if (oldstate == state)
2962 					break;
2963 				state = oldstate;
2964 				error1 = umtxq_check_susp(td);
2965 				/*
2966 				 * We are leaving the URWLOCK_WRITE_WAITERS
2967 				 * behind, but this should not harm the
2968 				 * correctness.
2969 				 */
2970 				if (error1 != 0) {
2971 					if (error == 0)
2972 						error = error1;
2973 					break;
2974 				}
2975 			}
2976 			rv = fueword32(&rwlock->rw_blocked_readers,
2977 			    &blocked_readers);
2978 			if (rv == -1) {
2979 				umtxq_unbusy_unlocked(&uq->uq_key);
2980 				error = EFAULT;
2981 				break;
2982 			}
2983 		} else
2984 			blocked_readers = 0;
2985 
2986 		umtxq_unbusy_unlocked(&uq->uq_key);
2987 	}
2988 
2989 	umtx_key_release(&uq->uq_key);
2990 	if (error == ERESTART)
2991 		error = EINTR;
2992 	return (error);
2993 }
2994 
2995 static int
2996 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2997 {
2998 	struct umtx_q *uq;
2999 	uint32_t flags;
3000 	int32_t state, oldstate;
3001 	int error, rv, q, count;
3002 
3003 	uq = td->td_umtxq;
3004 	error = fueword32(&rwlock->rw_flags, &flags);
3005 	if (error == -1)
3006 		return (EFAULT);
3007 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
3008 	if (error != 0)
3009 		return (error);
3010 
3011 	error = fueword32(&rwlock->rw_state, &state);
3012 	if (error == -1) {
3013 		error = EFAULT;
3014 		goto out;
3015 	}
3016 	if (state & URWLOCK_WRITE_OWNER) {
3017 		for (;;) {
3018 			rv = casueword32(&rwlock->rw_state, state,
3019 			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
3020 			if (rv == -1) {
3021 				error = EFAULT;
3022 				goto out;
3023 			}
3024 			if (oldstate != state) {
3025 				state = oldstate;
3026 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
3027 					error = EPERM;
3028 					goto out;
3029 				}
3030 				error = umtxq_check_susp(td);
3031 				if (error != 0)
3032 					goto out;
3033 			} else
3034 				break;
3035 		}
3036 	} else if (URWLOCK_READER_COUNT(state) != 0) {
3037 		for (;;) {
3038 			rv = casueword32(&rwlock->rw_state, state,
3039 			    &oldstate, state - 1);
3040 			if (rv == -1) {
3041 				error = EFAULT;
3042 				goto out;
3043 			}
3044 			if (oldstate != state) {
3045 				state = oldstate;
3046 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
3047 					error = EPERM;
3048 					goto out;
3049 				}
3050 				error = umtxq_check_susp(td);
3051 				if (error != 0)
3052 					goto out;
3053 			} else
3054 				break;
3055 		}
3056 	} else {
3057 		error = EPERM;
3058 		goto out;
3059 	}
3060 
3061 	count = 0;
3062 
3063 	if (!(flags & URWLOCK_PREFER_READER)) {
3064 		if (state & URWLOCK_WRITE_WAITERS) {
3065 			count = 1;
3066 			q = UMTX_EXCLUSIVE_QUEUE;
3067 		} else if (state & URWLOCK_READ_WAITERS) {
3068 			count = INT_MAX;
3069 			q = UMTX_SHARED_QUEUE;
3070 		}
3071 	} else {
3072 		if (state & URWLOCK_READ_WAITERS) {
3073 			count = INT_MAX;
3074 			q = UMTX_SHARED_QUEUE;
3075 		} else if (state & URWLOCK_WRITE_WAITERS) {
3076 			count = 1;
3077 			q = UMTX_EXCLUSIVE_QUEUE;
3078 		}
3079 	}
3080 
3081 	if (count) {
3082 		umtxq_lock(&uq->uq_key);
3083 		umtxq_busy(&uq->uq_key);
3084 		umtxq_signal_queue(&uq->uq_key, count, q);
3085 		umtxq_unbusy(&uq->uq_key);
3086 		umtxq_unlock(&uq->uq_key);
3087 	}
3088 out:
3089 	umtx_key_release(&uq->uq_key);
3090 	return (error);
3091 }
3092 
3093 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3094 static int
3095 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
3096 {
3097 	struct abs_timeout timo;
3098 	struct umtx_q *uq;
3099 	uint32_t flags, count, count1;
3100 	int error, rv;
3101 
3102 	uq = td->td_umtxq;
3103 	error = fueword32(&sem->_flags, &flags);
3104 	if (error == -1)
3105 		return (EFAULT);
3106 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3107 	if (error != 0)
3108 		return (error);
3109 
3110 	if (timeout != NULL)
3111 		abs_timeout_init2(&timo, timeout);
3112 
3113 	umtxq_lock(&uq->uq_key);
3114 	umtxq_busy(&uq->uq_key);
3115 	umtxq_insert(uq);
3116 	umtxq_unlock(&uq->uq_key);
3117 	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
3118 	if (rv == 0)
3119 		rv = fueword32(&sem->_count, &count);
3120 	if (rv == -1 || count != 0) {
3121 		umtxq_lock(&uq->uq_key);
3122 		umtxq_unbusy(&uq->uq_key);
3123 		umtxq_remove(uq);
3124 		umtxq_unlock(&uq->uq_key);
3125 		umtx_key_release(&uq->uq_key);
3126 		return (rv == -1 ? EFAULT : 0);
3127 	}
3128 	umtxq_lock(&uq->uq_key);
3129 	umtxq_unbusy(&uq->uq_key);
3130 
3131 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3132 
3133 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3134 		error = 0;
3135 	else {
3136 		umtxq_remove(uq);
3137 		/* A relative timeout cannot be restarted. */
3138 		if (error == ERESTART && timeout != NULL &&
3139 		    (timeout->_flags & UMTX_ABSTIME) == 0)
3140 			error = EINTR;
3141 	}
3142 	umtxq_unlock(&uq->uq_key);
3143 	umtx_key_release(&uq->uq_key);
3144 	return (error);
3145 }
3146 
3147 /*
3148  * Signal a userland semaphore.
3149  */
3150 static int
3151 do_sem_wake(struct thread *td, struct _usem *sem)
3152 {
3153 	struct umtx_key key;
3154 	int error, cnt;
3155 	uint32_t flags;
3156 
3157 	error = fueword32(&sem->_flags, &flags);
3158 	if (error == -1)
3159 		return (EFAULT);
3160 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3161 		return (error);
3162 	umtxq_lock(&key);
3163 	umtxq_busy(&key);
3164 	cnt = umtxq_count(&key);
3165 	if (cnt > 0) {
3166 		/*
3167 		 * Check if count is greater than 0, this means the memory is
3168 		 * still being referenced by user code, so we can safely
3169 		 * update _has_waiters flag.
3170 		 */
3171 		if (cnt == 1) {
3172 			umtxq_unlock(&key);
3173 			error = suword32(&sem->_has_waiters, 0);
3174 			umtxq_lock(&key);
3175 			if (error == -1)
3176 				error = EFAULT;
3177 		}
3178 		umtxq_signal(&key, 1);
3179 	}
3180 	umtxq_unbusy(&key);
3181 	umtxq_unlock(&key);
3182 	umtx_key_release(&key);
3183 	return (error);
3184 }
3185 #endif
3186 
3187 static int
3188 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
3189 {
3190 	struct abs_timeout timo;
3191 	struct umtx_q *uq;
3192 	uint32_t count, flags;
3193 	int error, rv;
3194 
3195 	uq = td->td_umtxq;
3196 	flags = fuword32(&sem->_flags);
3197 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3198 	if (error != 0)
3199 		return (error);
3200 
3201 	if (timeout != NULL)
3202 		abs_timeout_init2(&timo, timeout);
3203 
3204 	umtxq_lock(&uq->uq_key);
3205 	umtxq_busy(&uq->uq_key);
3206 	umtxq_insert(uq);
3207 	umtxq_unlock(&uq->uq_key);
3208 	rv = fueword32(&sem->_count, &count);
3209 	if (rv == -1) {
3210 		umtxq_lock(&uq->uq_key);
3211 		umtxq_unbusy(&uq->uq_key);
3212 		umtxq_remove(uq);
3213 		umtxq_unlock(&uq->uq_key);
3214 		umtx_key_release(&uq->uq_key);
3215 		return (EFAULT);
3216 	}
3217 	for (;;) {
3218 		if (USEM_COUNT(count) != 0) {
3219 			umtxq_lock(&uq->uq_key);
3220 			umtxq_unbusy(&uq->uq_key);
3221 			umtxq_remove(uq);
3222 			umtxq_unlock(&uq->uq_key);
3223 			umtx_key_release(&uq->uq_key);
3224 			return (0);
3225 		}
3226 		if (count == USEM_HAS_WAITERS)
3227 			break;
3228 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
3229 		if (rv == -1) {
3230 			umtxq_lock(&uq->uq_key);
3231 			umtxq_unbusy(&uq->uq_key);
3232 			umtxq_remove(uq);
3233 			umtxq_unlock(&uq->uq_key);
3234 			umtx_key_release(&uq->uq_key);
3235 			return (EFAULT);
3236 		}
3237 		if (count == 0)
3238 			break;
3239 	}
3240 	umtxq_lock(&uq->uq_key);
3241 	umtxq_unbusy(&uq->uq_key);
3242 
3243 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3244 
3245 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3246 		error = 0;
3247 	else {
3248 		umtxq_remove(uq);
3249 		if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) {
3250 			/* A relative timeout cannot be restarted. */
3251 			if (error == ERESTART)
3252 				error = EINTR;
3253 			if (error == EINTR) {
3254 				abs_timeout_update(&timo);
3255 				timespecsub(&timo.end, &timo.cur,
3256 				    &timeout->_timeout);
3257 			}
3258 		}
3259 	}
3260 	umtxq_unlock(&uq->uq_key);
3261 	umtx_key_release(&uq->uq_key);
3262 	return (error);
3263 }
3264 
3265 /*
3266  * Signal a userland semaphore.
3267  */
3268 static int
3269 do_sem2_wake(struct thread *td, struct _usem2 *sem)
3270 {
3271 	struct umtx_key key;
3272 	int error, cnt, rv;
3273 	uint32_t count, flags;
3274 
3275 	rv = fueword32(&sem->_flags, &flags);
3276 	if (rv == -1)
3277 		return (EFAULT);
3278 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3279 		return (error);
3280 	umtxq_lock(&key);
3281 	umtxq_busy(&key);
3282 	cnt = umtxq_count(&key);
3283 	if (cnt > 0) {
3284 		/*
3285 		 * If this was the last sleeping thread, clear the waiters
3286 		 * flag in _count.
3287 		 */
3288 		if (cnt == 1) {
3289 			umtxq_unlock(&key);
3290 			rv = fueword32(&sem->_count, &count);
3291 			while (rv != -1 && count & USEM_HAS_WAITERS)
3292 				rv = casueword32(&sem->_count, count, &count,
3293 				    count & ~USEM_HAS_WAITERS);
3294 			if (rv == -1)
3295 				error = EFAULT;
3296 			umtxq_lock(&key);
3297 		}
3298 
3299 		umtxq_signal(&key, 1);
3300 	}
3301 	umtxq_unbusy(&key);
3302 	umtxq_unlock(&key);
3303 	umtx_key_release(&key);
3304 	return (error);
3305 }
3306 
/*
 * Copy a struct timespec in from user address 'addr' and validate it.
 *
 * Returns the copyin() error if the copy fails, EINVAL if tv_sec is
 * negative or tv_nsec is outside [0, 1000000000), and 0 otherwise.
 */
inline int
umtx_copyin_timeout(const void *addr, struct timespec *tsp)
{
	int error;

	error = copyin(addr, tsp, sizeof(struct timespec));
	if (error != 0)
		return (error);
	if (tsp->tv_sec < 0)
		return (EINVAL);
	if (tsp->tv_nsec >= 1000000000 || tsp->tv_nsec < 0)
		return (EINVAL);
	return (0);
}
3321 
3322 static inline int
3323 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3324 {
3325 	int error;
3326 
3327 	if (size <= sizeof(struct timespec)) {
3328 		tp->_clockid = CLOCK_REALTIME;
3329 		tp->_flags = 0;
3330 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3331 	} else
3332 		error = copyin(addr, tp, sizeof(struct _umtx_time));
3333 	if (error != 0)
3334 		return (error);
3335 	if (tp->_timeout.tv_sec < 0 ||
3336 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3337 		return (EINVAL);
3338 	return (0);
3339 }
3340 
/*
 * Handler for operations that are not implemented: ignores its
 * arguments and always fails with EOPNOTSUPP.
 */
static int
__umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
{
	int error;

	error = EOPNOTSUPP;
	return (error);
}
3347 
3348 static int
3349 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3350 {
3351 	struct _umtx_time timeout, *tm_p;
3352 	int error;
3353 
3354 	if (uap->uaddr2 == NULL)
3355 		tm_p = NULL;
3356 	else {
3357 		error = umtx_copyin_umtx_time(
3358 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3359 		if (error != 0)
3360 			return (error);
3361 		tm_p = &timeout;
3362 	}
3363 	return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0));
3364 }
3365 
3366 static int
3367 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3368 {
3369 	struct _umtx_time timeout, *tm_p;
3370 	int error;
3371 
3372 	if (uap->uaddr2 == NULL)
3373 		tm_p = NULL;
3374 	else {
3375 		error = umtx_copyin_umtx_time(
3376 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3377 		if (error != 0)
3378 			return (error);
3379 		tm_p = &timeout;
3380 	}
3381 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
3382 }
3383 
3384 static int
3385 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3386 {
3387 	struct _umtx_time *tm_p, timeout;
3388 	int error;
3389 
3390 	if (uap->uaddr2 == NULL)
3391 		tm_p = NULL;
3392 	else {
3393 		error = umtx_copyin_umtx_time(
3394 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3395 		if (error != 0)
3396 			return (error);
3397 		tm_p = &timeout;
3398 	}
3399 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
3400 }
3401 
3402 static int
3403 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3404 {
3405 
3406 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3407 }
3408 
3409 #define BATCH_SIZE	128
3410 static int
3411 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3412 {
3413 	char *uaddrs[BATCH_SIZE], **upp;
3414 	int count, error, i, pos, tocopy;
3415 
3416 	upp = (char **)uap->obj;
3417 	error = 0;
3418 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
3419 	    pos += tocopy) {
3420 		tocopy = MIN(count, BATCH_SIZE);
3421 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
3422 		if (error != 0)
3423 			break;
3424 		for (i = 0; i < tocopy; ++i)
3425 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3426 		maybe_yield();
3427 	}
3428 	return (error);
3429 }
3430 
3431 static int
3432 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3433 {
3434 
3435 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3436 }
3437 
3438 static int
3439 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3440 {
3441 	struct _umtx_time *tm_p, timeout;
3442 	int error;
3443 
3444 	/* Allow a null timespec (wait forever). */
3445 	if (uap->uaddr2 == NULL)
3446 		tm_p = NULL;
3447 	else {
3448 		error = umtx_copyin_umtx_time(
3449 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3450 		if (error != 0)
3451 			return (error);
3452 		tm_p = &timeout;
3453 	}
3454 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
3455 }
3456 
3457 static int
3458 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3459 {
3460 
3461 	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3462 }
3463 
3464 static int
3465 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3466 {
3467 	struct _umtx_time *tm_p, timeout;
3468 	int error;
3469 
3470 	/* Allow a null timespec (wait forever). */
3471 	if (uap->uaddr2 == NULL)
3472 		tm_p = NULL;
3473 	else {
3474 		error = umtx_copyin_umtx_time(
3475 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3476 		if (error != 0)
3477 			return (error);
3478 		tm_p = &timeout;
3479 	}
3480 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
3481 }
3482 
3483 static int
3484 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3485 {
3486 
3487 	return (do_wake_umutex(td, uap->obj));
3488 }
3489 
3490 static int
3491 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3492 {
3493 
3494 	return (do_unlock_umutex(td, uap->obj, false));
3495 }
3496 
3497 static int
3498 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3499 {
3500 
3501 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3502 }
3503 
3504 static int
3505 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3506 {
3507 	struct timespec *ts, timeout;
3508 	int error;
3509 
3510 	/* Allow a null timespec (wait forever). */
3511 	if (uap->uaddr2 == NULL)
3512 		ts = NULL;
3513 	else {
3514 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3515 		if (error != 0)
3516 			return (error);
3517 		ts = &timeout;
3518 	}
3519 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3520 }
3521 
3522 static int
3523 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3524 {
3525 
3526 	return (do_cv_signal(td, uap->obj));
3527 }
3528 
3529 static int
3530 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3531 {
3532 
3533 	return (do_cv_broadcast(td, uap->obj));
3534 }
3535 
3536 static int
3537 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3538 {
3539 	struct _umtx_time timeout;
3540 	int error;
3541 
3542 	/* Allow a null timespec (wait forever). */
3543 	if (uap->uaddr2 == NULL) {
3544 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3545 	} else {
3546 		error = umtx_copyin_umtx_time(uap->uaddr2,
3547 		   (size_t)uap->uaddr1, &timeout);
3548 		if (error != 0)
3549 			return (error);
3550 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3551 	}
3552 	return (error);
3553 }
3554 
3555 static int
3556 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3557 {
3558 	struct _umtx_time timeout;
3559 	int error;
3560 
3561 	/* Allow a null timespec (wait forever). */
3562 	if (uap->uaddr2 == NULL) {
3563 		error = do_rw_wrlock(td, uap->obj, 0);
3564 	} else {
3565 		error = umtx_copyin_umtx_time(uap->uaddr2,
3566 		   (size_t)uap->uaddr1, &timeout);
3567 		if (error != 0)
3568 			return (error);
3569 
3570 		error = do_rw_wrlock(td, uap->obj, &timeout);
3571 	}
3572 	return (error);
3573 }
3574 
3575 static int
3576 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3577 {
3578 
3579 	return (do_rw_unlock(td, uap->obj));
3580 }
3581 
3582 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3583 static int
3584 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3585 {
3586 	struct _umtx_time *tm_p, timeout;
3587 	int error;
3588 
3589 	/* Allow a null timespec (wait forever). */
3590 	if (uap->uaddr2 == NULL)
3591 		tm_p = NULL;
3592 	else {
3593 		error = umtx_copyin_umtx_time(
3594 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3595 		if (error != 0)
3596 			return (error);
3597 		tm_p = &timeout;
3598 	}
3599 	return (do_sem_wait(td, uap->obj, tm_p));
3600 }
3601 
3602 static int
3603 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3604 {
3605 
3606 	return (do_sem_wake(td, uap->obj));
3607 }
3608 #endif
3609 
3610 static int
3611 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3612 {
3613 
3614 	return (do_wake2_umutex(td, uap->obj, uap->val));
3615 }
3616 
3617 static int
3618 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
3619 {
3620 	struct _umtx_time *tm_p, timeout;
3621 	size_t uasize;
3622 	int error;
3623 
3624 	/* Allow a null timespec (wait forever). */
3625 	if (uap->uaddr2 == NULL) {
3626 		uasize = 0;
3627 		tm_p = NULL;
3628 	} else {
3629 		uasize = (size_t)uap->uaddr1;
3630 		error = umtx_copyin_umtx_time(uap->uaddr2, uasize, &timeout);
3631 		if (error != 0)
3632 			return (error);
3633 		tm_p = &timeout;
3634 	}
3635 	error = do_sem2_wait(td, uap->obj, tm_p);
3636 	if (error == EINTR && uap->uaddr2 != NULL &&
3637 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
3638 	    uasize >= sizeof(struct _umtx_time) + sizeof(struct timespec)) {
3639 		error = copyout(&timeout._timeout,
3640 		    (struct _umtx_time *)uap->uaddr2 + 1,
3641 		    sizeof(struct timespec));
3642 		if (error == 0) {
3643 			error = EINTR;
3644 		}
3645 	}
3646 
3647 	return (error);
3648 }
3649 
3650 static int
3651 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
3652 {
3653 
3654 	return (do_sem2_wake(td, uap->obj));
3655 }
3656 
/*
 * View the umtx_data member of object 'o' as the head of that object's
 * list of umtx_shm_reg registrations.
 */
#define	USHM_OBJ_UMTX(o)	((struct umtx_shm_obj_list *)(&(o)->umtx_data))
3659 
3660 #define	USHMF_REG_LINKED	0x0001
3661 #define	USHMF_OBJ_LINKED	0x0002
3662 struct umtx_shm_reg {
3663 	TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link;
3664 	LIST_ENTRY(umtx_shm_reg) ushm_obj_link;
3665 	struct umtx_key		ushm_key;
3666 	struct ucred		*ushm_cred;
3667 	struct shmfd		*ushm_obj;
3668 	u_int			ushm_refcnt;
3669 	u_int			ushm_flags;
3670 };
3671 
3672 LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg);
3673 TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg);
3674 
3675 static uma_zone_t umtx_shm_reg_zone;
3676 static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS];
3677 static struct mtx umtx_shm_lock;
3678 static struct umtx_shm_reg_head umtx_shm_reg_delfree =
3679     TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree);
3680 
3681 static void umtx_shm_free_reg(struct umtx_shm_reg *reg);
3682 
3683 static void
3684 umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused)
3685 {
3686 	struct umtx_shm_reg_head d;
3687 	struct umtx_shm_reg *reg, *reg1;
3688 
3689 	TAILQ_INIT(&d);
3690 	mtx_lock(&umtx_shm_lock);
3691 	TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link);
3692 	mtx_unlock(&umtx_shm_lock);
3693 	TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) {
3694 		TAILQ_REMOVE(&d, reg, ushm_reg_link);
3695 		umtx_shm_free_reg(reg);
3696 	}
3697 }
3698 
3699 static struct task umtx_shm_reg_delfree_task =
3700     TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL);
3701 
3702 static struct umtx_shm_reg *
3703 umtx_shm_find_reg_locked(const struct umtx_key *key)
3704 {
3705 	struct umtx_shm_reg *reg;
3706 	struct umtx_shm_reg_head *reg_head;
3707 
3708 	KASSERT(key->shared, ("umtx_p_find_rg: private key"));
3709 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3710 	reg_head = &umtx_shm_registry[key->hash];
3711 	TAILQ_FOREACH(reg, reg_head, ushm_reg_link) {
3712 		KASSERT(reg->ushm_key.shared,
3713 		    ("non-shared key on reg %p %d", reg, reg->ushm_key.shared));
3714 		if (reg->ushm_key.info.shared.object ==
3715 		    key->info.shared.object &&
3716 		    reg->ushm_key.info.shared.offset ==
3717 		    key->info.shared.offset) {
3718 			KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM"));
3719 			KASSERT(reg->ushm_refcnt > 0,
3720 			    ("reg %p refcnt 0 onlist", reg));
3721 			KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0,
3722 			    ("reg %p not linked", reg));
3723 			reg->ushm_refcnt++;
3724 			return (reg);
3725 		}
3726 	}
3727 	return (NULL);
3728 }
3729 
3730 static struct umtx_shm_reg *
3731 umtx_shm_find_reg(const struct umtx_key *key)
3732 {
3733 	struct umtx_shm_reg *reg;
3734 
3735 	mtx_lock(&umtx_shm_lock);
3736 	reg = umtx_shm_find_reg_locked(key);
3737 	mtx_unlock(&umtx_shm_lock);
3738 	return (reg);
3739 }
3740 
3741 static void
3742 umtx_shm_free_reg(struct umtx_shm_reg *reg)
3743 {
3744 
3745 	chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0);
3746 	crfree(reg->ushm_cred);
3747 	shm_drop(reg->ushm_obj);
3748 	uma_zfree(umtx_shm_reg_zone, reg);
3749 }
3750 
3751 static bool
3752 umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force)
3753 {
3754 	bool res;
3755 
3756 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3757 	KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg));
3758 	reg->ushm_refcnt--;
3759 	res = reg->ushm_refcnt == 0;
3760 	if (res || force) {
3761 		if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) {
3762 			TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash],
3763 			    reg, ushm_reg_link);
3764 			reg->ushm_flags &= ~USHMF_REG_LINKED;
3765 		}
3766 		if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) {
3767 			LIST_REMOVE(reg, ushm_obj_link);
3768 			reg->ushm_flags &= ~USHMF_OBJ_LINKED;
3769 		}
3770 	}
3771 	return (res);
3772 }
3773 
3774 static void
3775 umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force)
3776 {
3777 	vm_object_t object;
3778 	bool dofree;
3779 
3780 	if (force) {
3781 		object = reg->ushm_obj->shm_object;
3782 		VM_OBJECT_WLOCK(object);
3783 		object->flags |= OBJ_UMTXDEAD;
3784 		VM_OBJECT_WUNLOCK(object);
3785 	}
3786 	mtx_lock(&umtx_shm_lock);
3787 	dofree = umtx_shm_unref_reg_locked(reg, force);
3788 	mtx_unlock(&umtx_shm_lock);
3789 	if (dofree)
3790 		umtx_shm_free_reg(reg);
3791 }
3792 
3793 void
3794 umtx_shm_object_init(vm_object_t object)
3795 {
3796 
3797 	LIST_INIT(USHM_OBJ_UMTX(object));
3798 }
3799 
3800 void
3801 umtx_shm_object_terminated(vm_object_t object)
3802 {
3803 	struct umtx_shm_reg *reg, *reg1;
3804 	bool dofree;
3805 
3806 	if (LIST_EMPTY(USHM_OBJ_UMTX(object)))
3807 		return;
3808 
3809 	dofree = false;
3810 	mtx_lock(&umtx_shm_lock);
3811 	LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) {
3812 		if (umtx_shm_unref_reg_locked(reg, true)) {
3813 			TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg,
3814 			    ushm_reg_link);
3815 			dofree = true;
3816 		}
3817 	}
3818 	mtx_unlock(&umtx_shm_lock);
3819 	if (dofree)
3820 		taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task);
3821 }
3822 
3823 static int
3824 umtx_shm_create_reg(struct thread *td, const struct umtx_key *key,
3825     struct umtx_shm_reg **res)
3826 {
3827 	struct umtx_shm_reg *reg, *reg1;
3828 	struct ucred *cred;
3829 	int error;
3830 
3831 	reg = umtx_shm_find_reg(key);
3832 	if (reg != NULL) {
3833 		*res = reg;
3834 		return (0);
3835 	}
3836 	cred = td->td_ucred;
3837 	if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP)))
3838 		return (ENOMEM);
3839 	reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
3840 	reg->ushm_refcnt = 1;
3841 	bcopy(key, &reg->ushm_key, sizeof(*key));
3842 	reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
3843 	reg->ushm_cred = crhold(cred);
3844 	error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
3845 	if (error != 0) {
3846 		umtx_shm_free_reg(reg);
3847 		return (error);
3848 	}
3849 	mtx_lock(&umtx_shm_lock);
3850 	reg1 = umtx_shm_find_reg_locked(key);
3851 	if (reg1 != NULL) {
3852 		mtx_unlock(&umtx_shm_lock);
3853 		umtx_shm_free_reg(reg);
3854 		*res = reg1;
3855 		return (0);
3856 	}
3857 	reg->ushm_refcnt++;
3858 	TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link);
3859 	LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg,
3860 	    ushm_obj_link);
3861 	reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED;
3862 	mtx_unlock(&umtx_shm_lock);
3863 	*res = reg;
3864 	return (0);
3865 }
3866 
3867 static int
3868 umtx_shm_alive(struct thread *td, void *addr)
3869 {
3870 	vm_map_t map;
3871 	vm_map_entry_t entry;
3872 	vm_object_t object;
3873 	vm_pindex_t pindex;
3874 	vm_prot_t prot;
3875 	int res, ret;
3876 	boolean_t wired;
3877 
3878 	map = &td->td_proc->p_vmspace->vm_map;
3879 	res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry,
3880 	    &object, &pindex, &prot, &wired);
3881 	if (res != KERN_SUCCESS)
3882 		return (EFAULT);
3883 	if (object == NULL)
3884 		ret = EINVAL;
3885 	else
3886 		ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0;
3887 	vm_map_lookup_done(map, entry);
3888 	return (ret);
3889 }
3890 
3891 static void
3892 umtx_shm_init(void)
3893 {
3894 	int i;
3895 
3896 	umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg),
3897 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
3898 	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
3899 	for (i = 0; i < nitems(umtx_shm_registry); i++)
3900 		TAILQ_INIT(&umtx_shm_registry[i]);
3901 }
3902 
3903 static int
3904 umtx_shm(struct thread *td, void *addr, u_int flags)
3905 {
3906 	struct umtx_key key;
3907 	struct umtx_shm_reg *reg;
3908 	struct file *fp;
3909 	int error, fd;
3910 
3911 	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
3912 	    UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1)
3913 		return (EINVAL);
3914 	if ((flags & UMTX_SHM_ALIVE) != 0)
3915 		return (umtx_shm_alive(td, addr));
3916 	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
3917 	if (error != 0)
3918 		return (error);
3919 	KASSERT(key.shared == 1, ("non-shared key"));
3920 	if ((flags & UMTX_SHM_CREAT) != 0) {
3921 		error = umtx_shm_create_reg(td, &key, &reg);
3922 	} else {
3923 		reg = umtx_shm_find_reg(&key);
3924 		if (reg == NULL)
3925 			error = ESRCH;
3926 	}
3927 	umtx_key_release(&key);
3928 	if (error != 0)
3929 		return (error);
3930 	KASSERT(reg != NULL, ("no reg"));
3931 	if ((flags & UMTX_SHM_DESTROY) != 0) {
3932 		umtx_shm_unref_reg(reg, true);
3933 	} else {
3934 #if 0
3935 #ifdef MAC
3936 		error = mac_posixshm_check_open(td->td_ucred,
3937 		    reg->ushm_obj, FFLAGS(O_RDWR));
3938 		if (error == 0)
3939 #endif
3940 			error = shm_access(reg->ushm_obj, td->td_ucred,
3941 			    FFLAGS(O_RDWR));
3942 		if (error == 0)
3943 #endif
3944 			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
3945 		if (error == 0) {
3946 			shm_hold(reg->ushm_obj);
3947 			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
3948 			    &shm_ops);
3949 			td->td_retval[0] = fd;
3950 			fdrop(fp, td);
3951 		}
3952 	}
3953 	umtx_shm_unref_reg(reg, false);
3954 	return (error);
3955 }
3956 
3957 static int
3958 __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap)
3959 {
3960 
3961 	return (umtx_shm(td, uap->uaddr1, uap->val));
3962 }
3963 
3964 static int
3965 umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp)
3966 {
3967 
3968 	td->td_rb_list = rbp->robust_list_offset;
3969 	td->td_rbp_list = rbp->robust_priv_list_offset;
3970 	td->td_rb_inact = rbp->robust_inact_offset;
3971 	return (0);
3972 }
3973 
3974 static int
3975 __umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap)
3976 {
3977 	struct umtx_robust_lists_params rb;
3978 	int error;
3979 
3980 	if (uap->val > sizeof(rb))
3981 		return (EINVAL);
3982 	bzero(&rb, sizeof(rb));
3983 	error = copyin(uap->uaddr1, &rb, uap->val);
3984 	if (error != 0)
3985 		return (error);
3986 	return (umtx_robust_lists(td, &rb));
3987 }
3988 
3989 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3990 
3991 static const _umtx_op_func op_table[] = {
3992 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
3993 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
3994 	[UMTX_OP_WAIT]		= __umtx_op_wait,
3995 	[UMTX_OP_WAKE]		= __umtx_op_wake,
3996 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
3997 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
3998 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
3999 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
4000 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
4001 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
4002 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
4003 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
4004 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
4005 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
4006 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4007 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
4008 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4009 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
4010 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4011 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4012 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
4013 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4014 #else
4015 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4016 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4017 #endif
4018 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
4019 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4020 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
4021 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4022 	[UMTX_OP_SHM]		= __umtx_op_shm,
4023 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists,
4024 };
4025 
4026 int
4027 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
4028 {
4029 
4030 	if ((unsigned)uap->op < nitems(op_table))
4031 		return (*op_table[uap->op])(td, uap);
4032 	return (EINVAL);
4033 }
4034 
4035 #ifdef COMPAT_FREEBSD32
4036 
/* Layout of struct timespec as seen by 32-bit processes. */
struct timespec32 {
	int32_t	tv_sec;		/* seconds */
	int32_t	tv_nsec;	/* nanoseconds */
};
4041 
4042 struct umtx_time32 {
4043 	struct	timespec32	timeout;
4044 	uint32_t		flags;
4045 	uint32_t		clockid;
4046 };
4047 
4048 static inline int
4049 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
4050 {
4051 	struct timespec32 ts32;
4052 	int error;
4053 
4054 	error = copyin(addr, &ts32, sizeof(struct timespec32));
4055 	if (error == 0) {
4056 		if (ts32.tv_sec < 0 ||
4057 		    ts32.tv_nsec >= 1000000000 ||
4058 		    ts32.tv_nsec < 0)
4059 			error = EINVAL;
4060 		else {
4061 			tsp->tv_sec = ts32.tv_sec;
4062 			tsp->tv_nsec = ts32.tv_nsec;
4063 		}
4064 	}
4065 	return (error);
4066 }
4067 
4068 static inline int
4069 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
4070 {
4071 	struct umtx_time32 t32;
4072 	int error;
4073 
4074 	t32.clockid = CLOCK_REALTIME;
4075 	t32.flags   = 0;
4076 	if (size <= sizeof(struct timespec32))
4077 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
4078 	else
4079 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
4080 	if (error != 0)
4081 		return (error);
4082 	if (t32.timeout.tv_sec < 0 ||
4083 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
4084 		return (EINVAL);
4085 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
4086 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
4087 	tp->_flags = t32.flags;
4088 	tp->_clockid = t32.clockid;
4089 	return (0);
4090 }
4091 
4092 static int
4093 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4094 {
4095 	struct _umtx_time *tm_p, timeout;
4096 	int error;
4097 
4098 	if (uap->uaddr2 == NULL)
4099 		tm_p = NULL;
4100 	else {
4101 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4102 			(size_t)uap->uaddr1, &timeout);
4103 		if (error != 0)
4104 			return (error);
4105 		tm_p = &timeout;
4106 	}
4107 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
4108 }
4109 
4110 static int
4111 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
4112 {
4113 	struct _umtx_time *tm_p, timeout;
4114 	int error;
4115 
4116 	/* Allow a null timespec (wait forever). */
4117 	if (uap->uaddr2 == NULL)
4118 		tm_p = NULL;
4119 	else {
4120 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4121 			    (size_t)uap->uaddr1, &timeout);
4122 		if (error != 0)
4123 			return (error);
4124 		tm_p = &timeout;
4125 	}
4126 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
4127 }
4128 
4129 static int
4130 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
4131 {
4132 	struct _umtx_time *tm_p, timeout;
4133 	int error;
4134 
4135 	/* Allow a null timespec (wait forever). */
4136 	if (uap->uaddr2 == NULL)
4137 		tm_p = NULL;
4138 	else {
4139 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4140 		    (size_t)uap->uaddr1, &timeout);
4141 		if (error != 0)
4142 			return (error);
4143 		tm_p = &timeout;
4144 	}
4145 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
4146 }
4147 
4148 static int
4149 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4150 {
4151 	struct timespec *ts, timeout;
4152 	int error;
4153 
4154 	/* Allow a null timespec (wait forever). */
4155 	if (uap->uaddr2 == NULL)
4156 		ts = NULL;
4157 	else {
4158 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
4159 		if (error != 0)
4160 			return (error);
4161 		ts = &timeout;
4162 	}
4163 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
4164 }
4165 
4166 static int
4167 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
4168 {
4169 	struct _umtx_time timeout;
4170 	int error;
4171 
4172 	/* Allow a null timespec (wait forever). */
4173 	if (uap->uaddr2 == NULL) {
4174 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
4175 	} else {
4176 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4177 		    (size_t)uap->uaddr1, &timeout);
4178 		if (error != 0)
4179 			return (error);
4180 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
4181 	}
4182 	return (error);
4183 }
4184 
4185 static int
4186 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
4187 {
4188 	struct _umtx_time timeout;
4189 	int error;
4190 
4191 	/* Allow a null timespec (wait forever). */
4192 	if (uap->uaddr2 == NULL) {
4193 		error = do_rw_wrlock(td, uap->obj, 0);
4194 	} else {
4195 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4196 		    (size_t)uap->uaddr1, &timeout);
4197 		if (error != 0)
4198 			return (error);
4199 		error = do_rw_wrlock(td, uap->obj, &timeout);
4200 	}
4201 	return (error);
4202 }
4203 
4204 static int
4205 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
4206 {
4207 	struct _umtx_time *tm_p, timeout;
4208 	int error;
4209 
4210 	if (uap->uaddr2 == NULL)
4211 		tm_p = NULL;
4212 	else {
4213 		error = umtx_copyin_umtx_time32(
4214 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
4215 		if (error != 0)
4216 			return (error);
4217 		tm_p = &timeout;
4218 	}
4219 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
4220 }
4221 
4222 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4223 static int
4224 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4225 {
4226 	struct _umtx_time *tm_p, timeout;
4227 	int error;
4228 
4229 	/* Allow a null timespec (wait forever). */
4230 	if (uap->uaddr2 == NULL)
4231 		tm_p = NULL;
4232 	else {
4233 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4234 		    (size_t)uap->uaddr1, &timeout);
4235 		if (error != 0)
4236 			return (error);
4237 		tm_p = &timeout;
4238 	}
4239 	return (do_sem_wait(td, uap->obj, tm_p));
4240 }
4241 #endif
4242 
4243 static int
4244 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4245 {
4246 	struct _umtx_time *tm_p, timeout;
4247 	size_t uasize;
4248 	int error;
4249 
4250 	/* Allow a null timespec (wait forever). */
4251 	if (uap->uaddr2 == NULL) {
4252 		uasize = 0;
4253 		tm_p = NULL;
4254 	} else {
4255 		uasize = (size_t)uap->uaddr1;
4256 		error = umtx_copyin_umtx_time32(uap->uaddr2, uasize, &timeout);
4257 		if (error != 0)
4258 			return (error);
4259 		tm_p = &timeout;
4260 	}
4261 	error = do_sem2_wait(td, uap->obj, tm_p);
4262 	if (error == EINTR && uap->uaddr2 != NULL &&
4263 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
4264 	    uasize >= sizeof(struct umtx_time32) + sizeof(struct timespec32)) {
4265 		struct timespec32 remain32 = {
4266 			.tv_sec = timeout._timeout.tv_sec,
4267 			.tv_nsec = timeout._timeout.tv_nsec
4268 		};
4269 		error = copyout(&remain32,
4270 		    (struct umtx_time32 *)uap->uaddr2 + 1,
4271 		    sizeof(struct timespec32));
4272 		if (error == 0) {
4273 			error = EINTR;
4274 		}
4275 	}
4276 
4277 	return (error);
4278 }
4279 
4280 static int
4281 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
4282 {
4283 	uint32_t uaddrs[BATCH_SIZE], **upp;
4284 	int count, error, i, pos, tocopy;
4285 
4286 	upp = (uint32_t **)uap->obj;
4287 	error = 0;
4288 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
4289 	    pos += tocopy) {
4290 		tocopy = MIN(count, BATCH_SIZE);
4291 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
4292 		if (error != 0)
4293 			break;
4294 		for (i = 0; i < tocopy; ++i)
4295 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
4296 			    INT_MAX, 1);
4297 		maybe_yield();
4298 	}
4299 	return (error);
4300 }
4301 
/*
 * 32-bit layout of the UMTX_OP_ROBUST_LISTS parameter block; each field
 * is widened into the same-named field of struct umtx_robust_lists_params.
 */
struct umtx_robust_lists_params_compat32 {
	uint32_t robust_list_offset;
	uint32_t robust_priv_list_offset;
	uint32_t robust_inact_offset;
};
4307 
4308 static int
4309 __umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap)
4310 {
4311 	struct umtx_robust_lists_params rb;
4312 	struct umtx_robust_lists_params_compat32 rb32;
4313 	int error;
4314 
4315 	if (uap->val > sizeof(rb32))
4316 		return (EINVAL);
4317 	bzero(&rb, sizeof(rb));
4318 	bzero(&rb32, sizeof(rb32));
4319 	error = copyin(uap->uaddr1, &rb32, uap->val);
4320 	if (error != 0)
4321 		return (error);
4322 	rb.robust_list_offset = rb32.robust_list_offset;
4323 	rb.robust_priv_list_offset = rb32.robust_priv_list_offset;
4324 	rb.robust_inact_offset = rb32.robust_inact_offset;
4325 	return (umtx_robust_lists(td, &rb));
4326 }
4327 
4328 static const _umtx_op_func op_table_compat32[] = {
4329 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
4330 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
4331 	[UMTX_OP_WAIT]		= __umtx_op_wait_compat32,
4332 	[UMTX_OP_WAKE]		= __umtx_op_wake,
4333 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
4334 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex_compat32,
4335 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
4336 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
4337 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait_compat32,
4338 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
4339 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
4340 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_compat32,
4341 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock_compat32,
4342 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock_compat32,
4343 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4344 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
4345 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4346 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex_compat32,
4347 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4348 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4349 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait_compat32,
4350 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4351 #else
4352 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4353 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4354 #endif
4355 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private32,
4356 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4357 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait_compat32,
4358 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4359 	[UMTX_OP_SHM]		= __umtx_op_shm,
4360 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists_compat32,
4361 };
4362 
4363 int
4364 freebsd32__umtx_op(struct thread *td, struct freebsd32__umtx_op_args *uap)
4365 {
4366 
4367 	if ((unsigned)uap->op < nitems(op_table_compat32)) {
4368 		return (*op_table_compat32[uap->op])(td,
4369 		    (struct _umtx_op_args *)uap);
4370 	}
4371 	return (EINVAL);
4372 }
4373 #endif
4374 
4375 void
4376 umtx_thread_init(struct thread *td)
4377 {
4378 
4379 	td->td_umtxq = umtxq_alloc();
4380 	td->td_umtxq->uq_thread = td;
4381 }
4382 
4383 void
4384 umtx_thread_fini(struct thread *td)
4385 {
4386 
4387 	umtxq_free(td->td_umtxq);
4388 }
4389 
4390 /*
4391  * It will be called when new thread is created, e.g fork().
4392  */
4393 void
4394 umtx_thread_alloc(struct thread *td)
4395 {
4396 	struct umtx_q *uq;
4397 
4398 	uq = td->td_umtxq;
4399 	uq->uq_inherited_pri = PRI_MAX;
4400 
4401 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
4402 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
4403 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
4404 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
4405 }
4406 
4407 /*
4408  * exec() hook.
4409  *
4410  * Clear robust lists for all process' threads, not delaying the
4411  * cleanup to thread_exit hook, since the relevant address space is
4412  * destroyed right now.
4413  */
4414 static void
4415 umtx_exec_hook(void *arg __unused, struct proc *p,
4416     struct image_params *imgp __unused)
4417 {
4418 	struct thread *td;
4419 
4420 	KASSERT(p == curproc, ("need curproc"));
4421 	KASSERT((p->p_flag & P_HADTHREADS) == 0 ||
4422 	    (p->p_flag & P_STOPPED_SINGLE) != 0,
4423 	    ("curproc must be single-threaded"));
4424 	/*
4425 	 * There is no need to lock the list as only this thread can be
4426 	 * running.
4427 	 */
4428 	FOREACH_THREAD_IN_PROC(p, td) {
4429 		KASSERT(td == curthread ||
4430 		    ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)),
4431 		    ("running thread %p %p", p, td));
4432 		umtx_thread_cleanup(td);
4433 		td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0;
4434 	}
4435 }
4436 
/*
 * thread_exit() hook: run the umtx cleanup for the exiting thread.
 */
void
umtx_thread_exit(struct thread *td)
{

	/* All of the work lives in the shared cleanup routine. */
	umtx_thread_cleanup(td);
}
4446 
4447 static int
4448 umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res)
4449 {
4450 	u_long res1;
4451 #ifdef COMPAT_FREEBSD32
4452 	uint32_t res32;
4453 #endif
4454 	int error;
4455 
4456 #ifdef COMPAT_FREEBSD32
4457 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
4458 		error = fueword32((void *)ptr, &res32);
4459 		if (error == 0)
4460 			res1 = res32;
4461 	} else
4462 #endif
4463 	{
4464 		error = fueword((void *)ptr, &res1);
4465 	}
4466 	if (error == 0)
4467 		*res = res1;
4468 	else
4469 		error = EFAULT;
4470 	return (error);
4471 }
4472 
4473 static void
4474 umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list)
4475 {
4476 #ifdef COMPAT_FREEBSD32
4477 	struct umutex32 m32;
4478 
4479 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
4480 		memcpy(&m32, m, sizeof(m32));
4481 		*rb_list = m32.m_rb_lnk;
4482 	} else
4483 #endif
4484 		*rb_list = m->m_rb_lnk;
4485 }
4486 
4487 static int
4488 umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact)
4489 {
4490 	struct umutex m;
4491 	int error;
4492 
4493 	KASSERT(td->td_proc == curproc, ("need current vmspace"));
4494 	error = copyin((void *)rbp, &m, sizeof(m));
4495 	if (error != 0)
4496 		return (error);
4497 	if (rb_list != NULL)
4498 		umtx_read_rb_list(td, &m, rb_list);
4499 	if ((m.m_flags & UMUTEX_ROBUST) == 0)
4500 		return (EINVAL);
4501 	if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid)
4502 		/* inact is cleared after unlock, allow the inconsistency */
4503 		return (inact ? 0 : EINVAL);
4504 	return (do_unlock_umutex(td, (struct umutex *)rbp, true));
4505 }
4506 
4507 static void
4508 umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact,
4509     const char *name)
4510 {
4511 	int error, i;
4512 	uintptr_t rbp;
4513 	bool inact;
4514 
4515 	if (rb_list == 0)
4516 		return;
4517 	error = umtx_read_uptr(td, rb_list, &rbp);
4518 	for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) {
4519 		if (rbp == *rb_inact) {
4520 			inact = true;
4521 			*rb_inact = 0;
4522 		} else
4523 			inact = false;
4524 		error = umtx_handle_rb(td, rbp, &rbp, inact);
4525 	}
4526 	if (i == umtx_max_rb && umtx_verbose_rb) {
4527 		uprintf("comm %s pid %d: reached umtx %smax rb %d\n",
4528 		    td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb);
4529 	}
4530 	if (error != 0 && umtx_verbose_rb) {
4531 		uprintf("comm %s pid %d: handling %srb error %d\n",
4532 		    td->td_proc->p_comm, td->td_proc->p_pid, name, error);
4533 	}
4534 }
4535 
4536 /*
4537  * Clean up umtx data.
4538  */
4539 static void
4540 umtx_thread_cleanup(struct thread *td)
4541 {
4542 	struct umtx_q *uq;
4543 	struct umtx_pi *pi;
4544 	uintptr_t rb_inact;
4545 
4546 	/*
4547 	 * Disown pi mutexes.
4548 	 */
4549 	uq = td->td_umtxq;
4550 	if (uq != NULL) {
4551 		if (uq->uq_inherited_pri != PRI_MAX ||
4552 		    !TAILQ_EMPTY(&uq->uq_pi_contested)) {
4553 			mtx_lock(&umtx_lock);
4554 			uq->uq_inherited_pri = PRI_MAX;
4555 			while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
4556 				pi->pi_owner = NULL;
4557 				TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
4558 			}
4559 			mtx_unlock(&umtx_lock);
4560 		}
4561 		sched_lend_user_prio_cond(td, PRI_MAX);
4562 	}
4563 
4564 	if (td->td_rb_inact == 0 && td->td_rb_list == 0 && td->td_rbp_list == 0)
4565 		return;
4566 
4567 	/*
4568 	 * Handle terminated robust mutexes.  Must be done after
4569 	 * robust pi disown, otherwise unlock could see unowned
4570 	 * entries.
4571 	 */
4572 	rb_inact = td->td_rb_inact;
4573 	if (rb_inact != 0)
4574 		(void)umtx_read_uptr(td, rb_inact, &rb_inact);
4575 	umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "");
4576 	umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ");
4577 	if (rb_inact != 0)
4578 		(void)umtx_handle_rb(td, rb_inact, NULL, true);
4579 }
4580