xref: /freebsd/sys/kern/kern_umtx.c (revision 718cf2ccb9956613756ab15d7a0e28f2c8e91cab)
1 /*-
2  * Copyright (c) 2015, 2016 The FreeBSD Foundation
3  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
4  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
5  * All rights reserved.
6  *
7  * Portions of this software were developed by Konstantin Belousov
8  * under sponsorship from the FreeBSD Foundation.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice unmodified, this list of conditions, and the following
15  *    disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include "opt_compat.h"
36 #include "opt_umtx_profiling.h"
37 
38 #include <sys/param.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/file.h>
42 #include <sys/filedesc.h>
43 #include <sys/limits.h>
44 #include <sys/lock.h>
45 #include <sys/malloc.h>
46 #include <sys/mman.h>
47 #include <sys/mutex.h>
48 #include <sys/priv.h>
49 #include <sys/proc.h>
50 #include <sys/resource.h>
51 #include <sys/resourcevar.h>
52 #include <sys/rwlock.h>
53 #include <sys/sbuf.h>
54 #include <sys/sched.h>
55 #include <sys/smp.h>
56 #include <sys/sysctl.h>
57 #include <sys/sysent.h>
58 #include <sys/systm.h>
59 #include <sys/sysproto.h>
60 #include <sys/syscallsubr.h>
61 #include <sys/taskqueue.h>
62 #include <sys/time.h>
63 #include <sys/eventhandler.h>
64 #include <sys/umtx.h>
65 
66 #include <security/mac/mac_framework.h>
67 
68 #include <vm/vm.h>
69 #include <vm/vm_param.h>
70 #include <vm/pmap.h>
71 #include <vm/vm_map.h>
72 #include <vm/vm_object.h>
73 
74 #include <machine/atomic.h>
75 #include <machine/cpu.h>
76 
77 #ifdef COMPAT_FREEBSD32
78 #include <compat/freebsd32/freebsd32_proto.h>
79 #endif
80 
81 #define _UMUTEX_TRY		1
82 #define _UMUTEX_WAIT		2
83 
84 #ifdef UMTX_PROFILING
85 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
86 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
87 #endif
88 
89 /* Priority inheritance mutex info. */
90 struct umtx_pi {
91 	/* Owner thread */
92 	struct thread		*pi_owner;
93 
94 	/* Reference count */
95 	int			pi_refcount;
96 
97 	/* List entry to link umtx held by thread */
98 	TAILQ_ENTRY(umtx_pi)	pi_link;
99 
100 	/* List entry in hash */
101 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
102 
103 	/* List for waiters */
104 	TAILQ_HEAD(,umtx_q)	pi_blocked;
105 
106 	/* Identify a userland lock object */
107 	struct umtx_key		pi_key;
108 };
109 
110 /* A userland synchronization object user. */
111 struct umtx_q {
112 	/* Linked list for the hash. */
113 	TAILQ_ENTRY(umtx_q)	uq_link;
114 
115 	/* Umtx key. */
116 	struct umtx_key		uq_key;
117 
118 	/* Umtx flags. */
119 	int			uq_flags;
120 #define UQF_UMTXQ	0x0001
121 
122 	/* The waiting thread. */
123 	struct thread		*uq_thread;
124 
125 	/*
126 	 * Blocked on PI mutex.  Reads may use either the chain lock
127 	 * or umtx_lock; writes must hold both the chain lock and
128 	 * umtx_lock.
129 	 */
130 	struct umtx_pi		*uq_pi_blocked;
131 
132 	/* On blocked list */
133 	TAILQ_ENTRY(umtx_q)	uq_lockq;
134 
135 	/* PI mutexes owned by us that other threads contend for */
136 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
137 
138 	/* Inherited priority from PP mutex */
139 	u_char			uq_inherited_pri;
140 
141 	/* Spare queue ready to be reused */
142 	struct umtxq_queue	*uq_spare_queue;
143 
144 	/* The queue we are on */
145 	struct umtxq_queue	*uq_cur_queue;
146 };
147 
148 TAILQ_HEAD(umtxq_head, umtx_q);
149 
150 /* Per-key wait-queue */
151 struct umtxq_queue {
152 	struct umtxq_head	head;
153 	struct umtx_key		key;
154 	LIST_ENTRY(umtxq_queue)	link;
155 	int			length;
156 };
157 
158 LIST_HEAD(umtxq_list, umtxq_queue);
159 
160 /* Userland lock object's wait-queue chain */
161 struct umtxq_chain {
162 	/* Lock for this chain. */
163 	struct mtx		uc_lock;
164 
165 	/* List of sleep queues. */
166 	struct umtxq_list	uc_queue[2];
167 #define UMTX_SHARED_QUEUE	0
168 #define UMTX_EXCLUSIVE_QUEUE	1
169 
170 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
171 
172 	/* Busy flag */
173 	char			uc_busy;
174 
175 	/* Chain lock waiters */
176 	int			uc_waiters;
177 
178 	/* All PI mutexes hashed to this chain */
179 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
180 
181 #ifdef UMTX_PROFILING
182 	u_int 			length;
183 	u_int			max_length;
184 #endif
185 };
186 
187 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
188 
189 /*
190  * Don't propagate time-sharing priority; there is a security reason:
191  * a user can simply create a PI mutex, let thread A lock it, and let
192  * another thread B block on it.  Because B is sleeping, its priority
193  * would be boosted, causing A's priority to be boosted via priority
194  * propagation too, and it would never be lowered even if A were using
195  * 100% CPU; this is unfair to other processes.
196  */
197 
198 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
199 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
200 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
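
/*
 * For illustration (the priority values are hypothetical): a
 * time-sharing thread is always treated as having the weakest priority
 * in its band, so it can never lend a boosted priority, while a
 * realtime-class thread keeps its own user priority:
 *
 *	td_user_pri == PRI_MIN_TIMESHARE + 4  => UPRI(td) == PRI_MAX_TIMESHARE
 *	td_user_pri == PRI_MIN_REALTIME + 4   => UPRI(td) == PRI_MIN_REALTIME + 4
 */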
201 
202 #define	GOLDEN_RATIO_PRIME	2654404609U
203 #ifndef	UMTX_CHAINS
204 #define	UMTX_CHAINS		512
205 #endif
206 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
207 
208 #define	GET_SHARE(flags)	\
209     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
210 
211 #define BUSY_SPINS		200
212 
213 struct abs_timeout {
214 	int clockid;
215 	bool is_abs_real;	/* TIMER_ABSTIME && CLOCK_REALTIME* */
216 	struct timespec cur;
217 	struct timespec end;
218 };
219 
220 #ifdef COMPAT_FREEBSD32
221 struct umutex32 {
222 	volatile __lwpid_t	m_owner;	/* Owner of the mutex */
223 	__uint32_t		m_flags;	/* Flags of the mutex */
224 	__uint32_t		m_ceilings[2];	/* Priority protect ceiling */
225 	__uint32_t		m_rb_lnk;	/* Robust linkage */
226 	__uint32_t		m_pad;
227 	__uint32_t		m_spare[2];
228 };
229 
230 _Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32");
231 _Static_assert(__offsetof(struct umutex, m_spare[0]) ==
232     __offsetof(struct umutex32, m_spare[0]), "m_spare32");
233 #endif
234 
235 int umtx_shm_vnobj_persistent = 0;
236 SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN,
237     &umtx_shm_vnobj_persistent, 0,
238     "False forces destruction of umtx state attached to a file on last close");
239 static int umtx_max_rb = 1000;
240 SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN,
241     &umtx_max_rb, 0,
242     "");
243 
244 static uma_zone_t		umtx_pi_zone;
245 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
246 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
247 static int			umtx_pi_allocated;
248 
249 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
250 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
251     &umtx_pi_allocated, 0, "Allocated umtx_pi");
252 static int umtx_verbose_rb = 1;
253 SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN,
254     &umtx_verbose_rb, 0,
255     "");
256 
257 #ifdef UMTX_PROFILING
258 static long max_length;
259 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
260 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
261 #endif
262 
263 static void abs_timeout_update(struct abs_timeout *timo);
264 
265 static void umtx_shm_init(void);
266 static void umtxq_sysinit(void *);
267 static void umtxq_hash(struct umtx_key *key);
268 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
269 static void umtxq_lock(struct umtx_key *key);
270 static void umtxq_unlock(struct umtx_key *key);
271 static void umtxq_busy(struct umtx_key *key);
272 static void umtxq_unbusy(struct umtx_key *key);
273 static void umtxq_insert_queue(struct umtx_q *uq, int q);
274 static void umtxq_remove_queue(struct umtx_q *uq, int q);
275 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
276 static int umtxq_count(struct umtx_key *key);
277 static struct umtx_pi *umtx_pi_alloc(int);
278 static void umtx_pi_free(struct umtx_pi *pi);
279 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags,
280     bool rb);
281 static void umtx_thread_cleanup(struct thread *td);
282 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
283     struct image_params *imgp __unused);
284 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
285 
286 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
287 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
288 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
289 
290 static struct mtx umtx_lock;
291 
292 #ifdef UMTX_PROFILING
293 static void
294 umtx_init_profiling(void)
295 {
296 	struct sysctl_oid *chain_oid;
297 	char chain_name[10];
298 	int i;
299 
300 	for (i = 0; i < UMTX_CHAINS; ++i) {
301 		snprintf(chain_name, sizeof(chain_name), "%d", i);
302 		chain_oid = SYSCTL_ADD_NODE(NULL,
303 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
304 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
305 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
306 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
307 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
308 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
309 	}
310 }
311 
312 static int
313 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
314 {
315 	char buf[512];
316 	struct sbuf sb;
317 	struct umtxq_chain *uc;
318 	u_int fract, i, j, tot, whole;
319 	u_int sf0, sf1, sf2, sf3, sf4;
320 	u_int si0, si1, si2, si3, si4;
321 	u_int sw0, sw1, sw2, sw3, sw4;
322 
323 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
324 	for (i = 0; i < 2; i++) {
325 		tot = 0;
326 		for (j = 0; j < UMTX_CHAINS; ++j) {
327 			uc = &umtxq_chains[i][j];
328 			mtx_lock(&uc->uc_lock);
329 			tot += uc->max_length;
330 			mtx_unlock(&uc->uc_lock);
331 		}
332 		if (tot == 0)
333 			sbuf_printf(&sb, "%u) Empty ", i);
334 		else {
335 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
336 			si0 = si1 = si2 = si3 = si4 = 0;
337 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
338 			for (j = 0; j < UMTX_CHAINS; j++) {
339 				uc = &umtxq_chains[i][j];
340 				mtx_lock(&uc->uc_lock);
341 				whole = uc->max_length * 100;
342 				mtx_unlock(&uc->uc_lock);
343 				fract = (whole % tot) * 100;
344 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
345 					sf0 = fract;
346 					si0 = j;
347 					sw0 = whole;
348 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
349 				    sf1)) {
350 					sf1 = fract;
351 					si1 = j;
352 					sw1 = whole;
353 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
354 				    sf2)) {
355 					sf2 = fract;
356 					si2 = j;
357 					sw2 = whole;
358 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
359 				    sf3)) {
360 					sf3 = fract;
361 					si3 = j;
362 					sw3 = whole;
363 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
364 				    sf4)) {
365 					sf4 = fract;
366 					si4 = j;
367 					sw4 = whole;
368 				}
369 			}
370 			sbuf_printf(&sb, "queue %u:\n", i);
371 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
372 			    sf0 / tot, si0);
373 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
374 			    sf1 / tot, si1);
375 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
376 			    sf2 / tot, si2);
377 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
378 			    sf3 / tot, si3);
379 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
380 			    sf4 / tot, si4);
381 		}
382 	}
383 	sbuf_trim(&sb);
384 	sbuf_finish(&sb);
385 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
386 	sbuf_delete(&sb);
387 	return (0);
388 }
389 
390 static int
391 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
392 {
393 	struct umtxq_chain *uc;
394 	u_int i, j;
395 	int clear, error;
396 
397 	clear = 0;
398 	error = sysctl_handle_int(oidp, &clear, 0, req);
399 	if (error != 0 || req->newptr == NULL)
400 		return (error);
401 
402 	if (clear != 0) {
403 		for (i = 0; i < 2; ++i) {
404 			for (j = 0; j < UMTX_CHAINS; ++j) {
405 				uc = &umtxq_chains[i][j];
406 				mtx_lock(&uc->uc_lock);
407 				uc->length = 0;
408 				uc->max_length = 0;
409 				mtx_unlock(&uc->uc_lock);
410 			}
411 		}
412 	}
413 	return (0);
414 }
415 
416 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
417     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
418     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
419 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
420     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
421     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
422 #endif
423 
424 static void
425 umtxq_sysinit(void *arg __unused)
426 {
427 	int i, j;
428 
429 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
430 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
431 	for (i = 0; i < 2; ++i) {
432 		for (j = 0; j < UMTX_CHAINS; ++j) {
433 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
434 				 MTX_DEF | MTX_DUPOK);
435 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
436 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
437 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
438 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
439 			umtxq_chains[i][j].uc_busy = 0;
440 			umtxq_chains[i][j].uc_waiters = 0;
441 #ifdef UMTX_PROFILING
442 			umtxq_chains[i][j].length = 0;
443 			umtxq_chains[i][j].max_length = 0;
444 #endif
445 		}
446 	}
447 #ifdef UMTX_PROFILING
448 	umtx_init_profiling();
449 #endif
450 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
451 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
452 	    EVENTHANDLER_PRI_ANY);
453 	umtx_shm_init();
454 }
455 
456 struct umtx_q *
457 umtxq_alloc(void)
458 {
459 	struct umtx_q *uq;
460 
461 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
462 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX,
463 	    M_WAITOK | M_ZERO);
464 	TAILQ_INIT(&uq->uq_spare_queue->head);
465 	TAILQ_INIT(&uq->uq_pi_contested);
466 	uq->uq_inherited_pri = PRI_MAX;
467 	return (uq);
468 }
469 
470 void
471 umtxq_free(struct umtx_q *uq)
472 {
473 
474 	MPASS(uq->uq_spare_queue != NULL);
475 	free(uq->uq_spare_queue, M_UMTX);
476 	free(uq, M_UMTX);
477 }
478 
479 static inline void
480 umtxq_hash(struct umtx_key *key)
481 {
482 	unsigned n;
483 
484 	n = (uintptr_t)key->info.both.a + key->info.both.b;
485 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
486 }
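
/*
 * This is a multiplicative (Knuth-style) hash: with __WORD_BIT == 32,
 * UMTX_SHIFTS == 23, so the shift keeps the top 9 bits of the 32-bit
 * product, exactly enough to index UMTX_CHAINS == 512 chains; the
 * final '%' is only a safety net.  A standalone sketch of the same
 * computation (operand names are made up):
 *
 *	unsigned n = (uintptr_t)key_a + key_b;
 *	unsigned h = ((n * 2654404609U) >> 23) % 512;	(h in [0, 511])
 */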
487 
488 static inline struct umtxq_chain *
489 umtxq_getchain(struct umtx_key *key)
490 {
491 
492 	if (key->type <= TYPE_SEM)
493 		return (&umtxq_chains[1][key->hash]);
494 	return (&umtxq_chains[0][key->hash]);
495 }
496 
497 /*
498  * Lock a chain.
499  */
500 static inline void
501 umtxq_lock(struct umtx_key *key)
502 {
503 	struct umtxq_chain *uc;
504 
505 	uc = umtxq_getchain(key);
506 	mtx_lock(&uc->uc_lock);
507 }
508 
509 /*
510  * Unlock a chain.
511  */
512 static inline void
513 umtxq_unlock(struct umtx_key *key)
514 {
515 	struct umtxq_chain *uc;
516 
517 	uc = umtxq_getchain(key);
518 	mtx_unlock(&uc->uc_lock);
519 }
520 
521 /*
522  * Set chain to busy state when the following operation
523  * may block (a kernel mutex cannot be used).
524  */
525 static inline void
526 umtxq_busy(struct umtx_key *key)
527 {
528 	struct umtxq_chain *uc;
529 
530 	uc = umtxq_getchain(key);
531 	mtx_assert(&uc->uc_lock, MA_OWNED);
532 	if (uc->uc_busy) {
533 #ifdef SMP
534 		if (smp_cpus > 1) {
535 			int count = BUSY_SPINS;
536 			if (count > 0) {
537 				umtxq_unlock(key);
538 				while (uc->uc_busy && --count > 0)
539 					cpu_spinwait();
540 				umtxq_lock(key);
541 			}
542 		}
543 #endif
544 		while (uc->uc_busy) {
545 			uc->uc_waiters++;
546 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
547 			uc->uc_waiters--;
548 		}
549 	}
550 	uc->uc_busy = 1;
551 }
552 
553 /*
554  * Unbusy a chain.
555  */
556 static inline void
557 umtxq_unbusy(struct umtx_key *key)
558 {
559 	struct umtxq_chain *uc;
560 
561 	uc = umtxq_getchain(key);
562 	mtx_assert(&uc->uc_lock, MA_OWNED);
563 	KASSERT(uc->uc_busy != 0, ("not busy"));
564 	uc->uc_busy = 0;
565 	if (uc->uc_waiters)
566 		wakeup_one(uc);
567 }
568 
569 static inline void
570 umtxq_unbusy_unlocked(struct umtx_key *key)
571 {
572 
573 	umtxq_lock(key);
574 	umtxq_unbusy(key);
575 	umtxq_unlock(key);
576 }
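
/*
 * The busy bit acts as a sleepable lock on the chain, taken around
 * operations that may fault on user memory and therefore cannot be
 * done while holding uc_lock.  The typical calling pattern, in sketch
 * form, is:
 *
 *	umtxq_lock(key);
 *	umtxq_busy(key);		(may spin, then sleep)
 *	umtxq_unlock(key);
 *	... touch user memory, e.g. casueword32() ...
 *	umtxq_unbusy_unlocked(key);
 */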
577 
578 static struct umtxq_queue *
579 umtxq_queue_lookup(struct umtx_key *key, int q)
580 {
581 	struct umtxq_queue *uh;
582 	struct umtxq_chain *uc;
583 
584 	uc = umtxq_getchain(key);
585 	UMTXQ_LOCKED_ASSERT(uc);
586 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
587 		if (umtx_key_match(&uh->key, key))
588 			return (uh);
589 	}
590 
591 	return (NULL);
592 }
593 
594 static inline void
595 umtxq_insert_queue(struct umtx_q *uq, int q)
596 {
597 	struct umtxq_queue *uh;
598 	struct umtxq_chain *uc;
599 
600 	uc = umtxq_getchain(&uq->uq_key);
601 	UMTXQ_LOCKED_ASSERT(uc);
602 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
603 	uh = umtxq_queue_lookup(&uq->uq_key, q);
604 	if (uh != NULL) {
605 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
606 	} else {
607 		uh = uq->uq_spare_queue;
608 		uh->key = uq->uq_key;
609 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
610 #ifdef UMTX_PROFILING
611 		uc->length++;
612 		if (uc->length > uc->max_length) {
613 			uc->max_length = uc->length;
614 			if (uc->max_length > max_length)
615 				max_length = uc->max_length;
616 		}
617 #endif
618 	}
619 	uq->uq_spare_queue = NULL;
620 
621 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
622 	uh->length++;
623 	uq->uq_flags |= UQF_UMTXQ;
624 	uq->uq_cur_queue = uh;
626 }
627 
628 static inline void
629 umtxq_remove_queue(struct umtx_q *uq, int q)
630 {
631 	struct umtxq_chain *uc;
632 	struct umtxq_queue *uh;
633 
634 	uc = umtxq_getchain(&uq->uq_key);
635 	UMTXQ_LOCKED_ASSERT(uc);
636 	if (uq->uq_flags & UQF_UMTXQ) {
637 		uh = uq->uq_cur_queue;
638 		TAILQ_REMOVE(&uh->head, uq, uq_link);
639 		uh->length--;
640 		uq->uq_flags &= ~UQF_UMTXQ;
641 		if (TAILQ_EMPTY(&uh->head)) {
642 			KASSERT(uh->length == 0,
643 			    ("inconsistent umtxq_queue length"));
644 #ifdef UMTX_PROFILING
645 			uc->length--;
646 #endif
647 			LIST_REMOVE(uh, link);
648 		} else {
649 			uh = LIST_FIRST(&uc->uc_spare_queue);
650 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
651 			LIST_REMOVE(uh, link);
652 		}
653 		uq->uq_spare_queue = uh;
654 		uq->uq_cur_queue = NULL;
655 	}
656 }
657 
658 /*
659  * Return the number of waiters on the shared queue.
660  */
661 static int
662 umtxq_count(struct umtx_key *key)
663 {
664 	struct umtxq_chain *uc;
665 	struct umtxq_queue *uh;
666 
667 	uc = umtxq_getchain(key);
668 	UMTXQ_LOCKED_ASSERT(uc);
669 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
670 	if (uh != NULL)
671 		return (uh->length);
672 	return (0);
673 }
674 
675 /*
676  * Check if there are multiple PI waiters and return the first
677  * waiter.
678  */
679 static int
680 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
681 {
682 	struct umtxq_chain *uc;
683 	struct umtxq_queue *uh;
684 
685 	*first = NULL;
686 	uc = umtxq_getchain(key);
687 	UMTXQ_LOCKED_ASSERT(uc);
688 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
689 	if (uh != NULL) {
690 		*first = TAILQ_FIRST(&uh->head);
691 		return (uh->length);
692 	}
693 	return (0);
694 }
695 
696 static int
697 umtxq_check_susp(struct thread *td)
698 {
699 	struct proc *p;
700 	int error;
701 
702 	/*
703 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
704 	 * eventually break the lockstep loop.
705 	 */
706 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
707 		return (0);
708 	error = 0;
709 	p = td->td_proc;
710 	PROC_LOCK(p);
711 	if (P_SHOULDSTOP(p) ||
712 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
713 		if (p->p_flag & P_SINGLE_EXIT)
714 			error = EINTR;
715 		else
716 			error = ERESTART;
717 	}
718 	PROC_UNLOCK(p);
719 	return (error);
720 }
721 
722 /*
723  * Wake up threads waiting on a userland object.
724  */
725 
726 static int
727 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
728 {
729 	struct umtxq_chain *uc;
730 	struct umtxq_queue *uh;
731 	struct umtx_q *uq;
732 	int ret;
733 
734 	ret = 0;
735 	uc = umtxq_getchain(key);
736 	UMTXQ_LOCKED_ASSERT(uc);
737 	uh = umtxq_queue_lookup(key, q);
738 	if (uh != NULL) {
739 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
740 			umtxq_remove_queue(uq, q);
741 			wakeup(uq);
742 			if (++ret >= n_wake)
743 				return (ret);
744 		}
745 	}
746 	return (ret);
747 }
748 
750 /*
751  * Wake up specified thread.
752  */
753 static inline void
754 umtxq_signal_thread(struct umtx_q *uq)
755 {
756 	struct umtxq_chain *uc;
757 
758 	uc = umtxq_getchain(&uq->uq_key);
759 	UMTXQ_LOCKED_ASSERT(uc);
760 	umtxq_remove(uq);
761 	wakeup(uq);
762 }
763 
764 static inline int
765 tstohz(const struct timespec *tsp)
766 {
767 	struct timeval tv;
768 
769 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
770 	return tvtohz(&tv);
771 }
772 
773 static void
774 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
775 	const struct timespec *timeout)
776 {
777 
778 	timo->clockid = clockid;
779 	if (!absolute) {
780 		timo->is_abs_real = false;
781 		abs_timeout_update(timo);
782 		timo->end = timo->cur;
783 		timespecadd(&timo->end, timeout);
784 	} else {
785 		timo->end = *timeout;
786 		timo->is_abs_real = clockid == CLOCK_REALTIME ||
787 		    clockid == CLOCK_REALTIME_FAST ||
788 		    clockid == CLOCK_REALTIME_PRECISE;
789 		/*
790 		 * If is_abs_real, umtxq_sleep will read the clock
791 		 * after setting td_rtcgen; otherwise, read it here.
792 		 */
793 		if (!timo->is_abs_real) {
794 			abs_timeout_update(timo);
795 		}
796 	}
797 }
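
/*
 * Two hypothetical initializations, to make the two branches concrete:
 * a 1.5 s relative timeout on CLOCK_MONOTONIC computes end = cur + ts
 * immediately, while an absolute CLOCK_REALTIME deadline stores ts as
 * end and defers the clock read to umtxq_sleep(), after td_rtcgen is
 * set, so a concurrent clock step is not missed:
 *
 *	struct timespec ts = { .tv_sec = 1, .tv_nsec = 500000000 };
 *	abs_timeout_init(&timo, CLOCK_MONOTONIC, 0, &ts);
 *	abs_timeout_init(&timo, CLOCK_REALTIME, 1, &ts);
 */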
798 
799 static void
800 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
801 {
802 
803 	abs_timeout_init(timo, umtxtime->_clockid,
804 	    (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout);
805 }
806 
807 static inline void
808 abs_timeout_update(struct abs_timeout *timo)
809 {
810 
811 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
812 }
813 
814 static int
815 abs_timeout_gethz(struct abs_timeout *timo)
816 {
817 	struct timespec tts;
818 
819 	if (timespeccmp(&timo->end, &timo->cur, <=))
820 		return (-1);
821 	tts = timo->end;
822 	timespecsub(&tts, &timo->cur);
823 	return (tstohz(&tts));
824 }
825 
826 static uint32_t
827 umtx_unlock_val(uint32_t flags, bool rb)
828 {
829 
830 	if (rb)
831 		return (UMUTEX_RB_OWNERDEAD);
832 	else if ((flags & UMUTEX_NONCONSISTENT) != 0)
833 		return (UMUTEX_RB_NOTRECOV);
834 	else
835 		return (UMUTEX_UNOWNED);
837 }
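
/*
 * The mapping, in table form:
 *
 *	rb	UMUTEX_NONCONSISTENT	unlock value
 *	true	(any)			UMUTEX_RB_OWNERDEAD
 *	false	set			UMUTEX_RB_NOTRECOV
 *	false	clear			UMUTEX_UNOWNED
 */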
838 
839 /*
840  * Put the thread into a sleep state.  Before sleeping, check
841  * whether the thread was removed from the umtx queue.
842  */
843 static inline int
844 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
845 {
846 	struct umtxq_chain *uc;
847 	int error, timo;
848 
849 	if (abstime != NULL && abstime->is_abs_real) {
850 		curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation);
851 		abs_timeout_update(abstime);
852 	}
853 
854 	uc = umtxq_getchain(&uq->uq_key);
855 	UMTXQ_LOCKED_ASSERT(uc);
856 	for (;;) {
857 		if (!(uq->uq_flags & UQF_UMTXQ)) {
858 			error = 0;
859 			break;
860 		}
861 		if (abstime != NULL) {
862 			timo = abs_timeout_gethz(abstime);
863 			if (timo < 0) {
864 				error = ETIMEDOUT;
865 				break;
866 			}
867 		} else
868 			timo = 0;
869 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
870 		if (error == EINTR || error == ERESTART) {
871 			umtxq_lock(&uq->uq_key);
872 			break;
873 		}
874 		if (abstime != NULL) {
875 			if (abstime->is_abs_real)
876 				curthread->td_rtcgen =
877 				    atomic_load_acq_int(&rtc_generation);
878 			abs_timeout_update(abstime);
879 		}
880 		umtxq_lock(&uq->uq_key);
881 	}
882 
883 	curthread->td_rtcgen = 0;
884 	return (error);
885 }
886 
887 /*
888  * Convert userspace address into unique logical address.
889  */
890 int
891 umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
892 {
893 	struct thread *td = curthread;
894 	vm_map_t map;
895 	vm_map_entry_t entry;
896 	vm_pindex_t pindex;
897 	vm_prot_t prot;
898 	boolean_t wired;
899 
900 	key->type = type;
901 	if (share == THREAD_SHARE) {
902 		key->shared = 0;
903 		key->info.private.vs = td->td_proc->p_vmspace;
904 		key->info.private.addr = (uintptr_t)addr;
905 	} else {
906 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
907 		map = &td->td_proc->p_vmspace->vm_map;
908 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
909 		    &entry, &key->info.shared.object, &pindex, &prot,
910 		    &wired) != KERN_SUCCESS) {
911 			return (EFAULT);
912 		}
913 
914 		if ((share == PROCESS_SHARE) ||
915 		    (share == AUTO_SHARE &&
916 		     VM_INHERIT_SHARE == entry->inheritance)) {
917 			key->shared = 1;
918 			key->info.shared.offset = (vm_offset_t)addr -
919 			    entry->start + entry->offset;
920 			vm_object_reference(key->info.shared.object);
921 		} else {
922 			key->shared = 0;
923 			key->info.private.vs = td->td_proc->p_vmspace;
924 			key->info.private.addr = (uintptr_t)addr;
925 		}
926 		vm_map_lookup_done(map, entry);
927 	}
928 
929 	umtxq_hash(key);
930 	return (0);
931 }
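
/*
 * Concretely: with THREAD_SHARE the key is (vmspace, virtual address),
 * so two processes mapping the same page at the same address still get
 * distinct locks, while with PROCESS_SHARE (or AUTO_SHARE on a
 * VM_INHERIT_SHARE entry) the key is (vm_object, offset), so every
 * mapping of the page resolves to the same lock.  A sketch:
 *
 *	umtx_key_get(addr, TYPE_SIMPLE_WAIT, THREAD_SHARE, &k1);
 *	umtx_key_get(addr, TYPE_SIMPLE_WAIT, PROCESS_SHARE, &k2);
 *
 * k1 never matches another process's key; k2 can.
 */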
932 
933 /*
934  * Release key.
935  */
936 void
937 umtx_key_release(struct umtx_key *key)
938 {
939 	if (key->shared)
940 		vm_object_deallocate(key->info.shared.object);
941 }
942 
943 /*
944  * Fetch and compare value; sleep on the address if the value has not changed.
945  */
946 static int
947 do_wait(struct thread *td, void *addr, u_long id,
948     struct _umtx_time *timeout, int compat32, int is_private)
949 {
950 	struct abs_timeout timo;
951 	struct umtx_q *uq;
952 	u_long tmp;
953 	uint32_t tmp32;
954 	int error = 0;
955 
956 	uq = td->td_umtxq;
957 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
958 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
959 		return (error);
960 
961 	if (timeout != NULL)
962 		abs_timeout_init2(&timo, timeout);
963 
964 	umtxq_lock(&uq->uq_key);
965 	umtxq_insert(uq);
966 	umtxq_unlock(&uq->uq_key);
967 	if (compat32 == 0) {
968 		error = fueword(addr, &tmp);
969 		if (error != 0)
970 			error = EFAULT;
971 	} else {
972 		error = fueword32(addr, &tmp32);
973 		if (error == 0)
974 			tmp = tmp32;
975 		else
976 			error = EFAULT;
977 	}
978 	umtxq_lock(&uq->uq_key);
979 	if (error == 0) {
980 		if (tmp == id)
981 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
982 			    NULL : &timo);
983 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
984 			error = 0;
985 		else
986 			umtxq_remove(uq);
987 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
988 		umtxq_remove(uq);
989 	}
990 	umtxq_unlock(&uq->uq_key);
991 	umtx_key_release(&uq->uq_key);
992 	if (error == ERESTART)
993 		error = EINTR;
994 	return (error);
995 }
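
/*
 * Userland reaches do_wait() through _umtx_op(2).  A minimal sketch of
 * the compare-and-sleep idiom (error handling elided; 'futex' is a
 * made-up variable name):
 *
 *	unsigned int futex = 1;
 *	...
 *	while (atomic_load_acq_int(&futex) == 1)
 *		_umtx_op(&futex, UMTX_OP_WAIT_UINT_PRIVATE, 1,
 *		    NULL, NULL);
 *
 * Because the kernel queues the thread before re-reading the word, a
 * wakeup between the userland load and the sleep cannot be lost.
 */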
996 
997 /*
998  * Wake up threads sleeping on the specified address.
999  */
1000 int
1001 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1002 {
1003 	struct umtx_key key;
1004 	int ret;
1005 
1006 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1007 	    is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1008 		return (ret);
1009 	umtxq_lock(&key);
1010 	umtxq_signal(&key, n_wake);
1011 	umtxq_unlock(&key);
1012 	umtx_key_release(&key);
1013 	return (0);
1014 }
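
/*
 * The matching userland wakeup for the sketch above (again via
 * _umtx_op(2)):
 *
 *	atomic_store_rel_int(&futex, 0);
 *	_umtx_op(&futex, UMTX_OP_WAKE_PRIVATE, 1, NULL, NULL);
 */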
1015 
1016 /*
1017  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1018  */
1019 static int
1020 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
1021     struct _umtx_time *timeout, int mode)
1022 {
1023 	struct abs_timeout timo;
1024 	struct umtx_q *uq;
1025 	uint32_t owner, old, id;
1026 	int error, rv;
1027 
1028 	id = td->td_tid;
1029 	uq = td->td_umtxq;
1030 	error = 0;
1031 	if (timeout != NULL)
1032 		abs_timeout_init2(&timo, timeout);
1033 
1034 	/*
1035 	 * Care must be exercised when dealing with the umtx structure.  It
1036 	 * can fault on any access.
1037 	 */
1038 	for (;;) {
1039 		rv = fueword32(&m->m_owner, &owner);
1040 		if (rv == -1)
1041 			return (EFAULT);
1042 		if (mode == _UMUTEX_WAIT) {
1043 			if (owner == UMUTEX_UNOWNED ||
1044 			    owner == UMUTEX_CONTESTED ||
1045 			    owner == UMUTEX_RB_OWNERDEAD ||
1046 			    owner == UMUTEX_RB_NOTRECOV)
1047 				return (0);
1048 		} else {
1049 			/*
1050 			 * Robust mutex terminated.  The kernel's duty is to
1051 			 * return EOWNERDEAD to userspace.  The
1052 			 * umutex.m_flags UMUTEX_NONCONSISTENT is set
1053 			 * by the common userspace code.
1054 			 */
1055 			if (owner == UMUTEX_RB_OWNERDEAD) {
1056 				rv = casueword32(&m->m_owner,
1057 				    UMUTEX_RB_OWNERDEAD, &owner,
1058 				    id | UMUTEX_CONTESTED);
1059 				if (rv == -1)
1060 					return (EFAULT);
1061 				if (owner == UMUTEX_RB_OWNERDEAD)
1062 					return (EOWNERDEAD); /* success */
1063 				rv = umtxq_check_susp(td);
1064 				if (rv != 0)
1065 					return (rv);
1066 				continue;
1067 			}
1068 			if (owner == UMUTEX_RB_NOTRECOV)
1069 				return (ENOTRECOVERABLE);
1070 
1072 			/*
1073 			 * Try the uncontested case.  This should be
1074 			 * done in userland.
1075 			 */
1076 			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
1077 			    &owner, id);
1078 			/* The address was invalid. */
1079 			if (rv == -1)
1080 				return (EFAULT);
1081 
1082 			/* The acquire succeeded. */
1083 			if (owner == UMUTEX_UNOWNED)
1084 				return (0);
1085 
1086 			/*
1087 			 * If no one owns it but it is contested try
1088 			 * to acquire it.
1089 			 */
1090 			if (owner == UMUTEX_CONTESTED) {
1091 				rv = casueword32(&m->m_owner,
1092 				    UMUTEX_CONTESTED, &owner,
1093 				    id | UMUTEX_CONTESTED);
1094 				/* The address was invalid. */
1095 				if (rv == -1)
1096 					return (EFAULT);
1097 
1098 				if (owner == UMUTEX_CONTESTED)
1099 					return (0);
1100 
1101 				rv = umtxq_check_susp(td);
1102 				if (rv != 0)
1103 					return (rv);
1104 
1105 				/*
1106 				 * If this failed, the lock has
1107 				 * changed; restart.
1108 				 */
1109 				continue;
1110 			}
1111 		}
1112 
1113 		if (mode == _UMUTEX_TRY)
1114 			return (EBUSY);
1115 
1116 		/*
1117 		 * If we caught a signal, we have retried and now
1118 		 * exit immediately.
1119 		 */
1120 		if (error != 0)
1121 			return (error);
1122 
1123 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1124 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1125 			return (error);
1126 
1127 		umtxq_lock(&uq->uq_key);
1128 		umtxq_busy(&uq->uq_key);
1129 		umtxq_insert(uq);
1130 		umtxq_unlock(&uq->uq_key);
1131 
1132 		/*
1133 		 * Set the contested bit so that a release in user space
1134 		 * knows to use the system call for unlock.  If this fails
1135 		 * either someone else has acquired the lock or it has been
1136 		 * released.
1137 		 */
1138 		rv = casueword32(&m->m_owner, owner, &old,
1139 		    owner | UMUTEX_CONTESTED);
1140 
1141 		/* The address was invalid. */
1142 		if (rv == -1) {
1143 			umtxq_lock(&uq->uq_key);
1144 			umtxq_remove(uq);
1145 			umtxq_unbusy(&uq->uq_key);
1146 			umtxq_unlock(&uq->uq_key);
1147 			umtx_key_release(&uq->uq_key);
1148 			return (EFAULT);
1149 		}
1150 
1151 		/*
1152 		 * If we set the contested bit, sleep.  Otherwise the lock
1153 		 * changed and we need to retry, or we lost a race to the
1154 		 * thread unlocking the umtx.
1155 		 */
1156 		umtxq_lock(&uq->uq_key);
1157 		umtxq_unbusy(&uq->uq_key);
1158 		if (old == owner)
1159 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1160 			    NULL : &timo);
1161 		umtxq_remove(uq);
1162 		umtxq_unlock(&uq->uq_key);
1163 		umtx_key_release(&uq->uq_key);
1164 
1165 		if (error == 0)
1166 			error = umtxq_check_susp(td);
1167 	}
1168 
1169 	return (0);
1170 }
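
/*
 * For reference, the userland fast path that pairs with this function
 * is a single CAS; only on contention does libthr enter the kernel
 * (a sketch, not the literal libthr code):
 *
 *	if (atomic_cmpset_acq_32(&m->m_owner, UMUTEX_UNOWNED, tid))
 *		return (0);		(no syscall needed)
 *	return (_umtx_op(m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL));
 */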
1171 
1172 /*
1173  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1174  */
1175 static int
1176 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1177 {
1178 	struct umtx_key key;
1179 	uint32_t owner, old, id, newlock;
1180 	int error, count;
1181 
1182 	id = td->td_tid;
1183 	/*
1184 	 * Make sure we own this mtx.
1185 	 */
1186 	error = fueword32(&m->m_owner, &owner);
1187 	if (error == -1)
1188 		return (EFAULT);
1189 
1190 	if ((owner & ~UMUTEX_CONTESTED) != id)
1191 		return (EPERM);
1192 
1193 	newlock = umtx_unlock_val(flags, rb);
1194 	if ((owner & UMUTEX_CONTESTED) == 0) {
1195 		error = casueword32(&m->m_owner, owner, &old, newlock);
1196 		if (error == -1)
1197 			return (EFAULT);
1198 		if (old == owner)
1199 			return (0);
1200 		owner = old;
1201 	}
1202 
1203 	/* We should only ever be in here for contested locks */
1204 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1205 	    &key)) != 0)
1206 		return (error);
1207 
1208 	umtxq_lock(&key);
1209 	umtxq_busy(&key);
1210 	count = umtxq_count(&key);
1211 	umtxq_unlock(&key);
1212 
1213 	/*
1214 	 * When unlocking the umtx, it must be marked as unowned if
1215 	 * only zero or one thread is waiting for it.
1216 	 * Otherwise, it must be marked as contested.
1217 	 */
1218 	if (count > 1)
1219 		newlock |= UMUTEX_CONTESTED;
1220 	error = casueword32(&m->m_owner, owner, &old, newlock);
1221 	umtxq_lock(&key);
1222 	umtxq_signal(&key, 1);
1223 	umtxq_unbusy(&key);
1224 	umtxq_unlock(&key);
1225 	umtx_key_release(&key);
1226 	if (error == -1)
1227 		return (EFAULT);
1228 	if (old != owner)
1229 		return (EINVAL);
1230 	return (0);
1231 }
1232 
1233 /*
1234  * Check if the mutex is available and wake up a waiter;
1235  * for simple mutexes only.
1236  */
1237 static int
1238 do_wake_umutex(struct thread *td, struct umutex *m)
1239 {
1240 	struct umtx_key key;
1241 	uint32_t owner;
1242 	uint32_t flags;
1243 	int error;
1244 	int count;
1245 
1246 	error = fueword32(&m->m_owner, &owner);
1247 	if (error == -1)
1248 		return (EFAULT);
1249 
1250 	if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD &&
1251 	    owner != UMUTEX_RB_NOTRECOV)
1252 		return (0);
1253 
1254 	error = fueword32(&m->m_flags, &flags);
1255 	if (error == -1)
1256 		return (EFAULT);
1257 
1258 	/* We should only ever be in here for contested locks */
1259 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1260 	    &key)) != 0)
1261 		return (error);
1262 
1263 	umtxq_lock(&key);
1264 	umtxq_busy(&key);
1265 	count = umtxq_count(&key);
1266 	umtxq_unlock(&key);
1267 
1268 	if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD &&
1269 	    owner != UMUTEX_RB_NOTRECOV) {
1270 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1271 		    UMUTEX_UNOWNED);
1272 		if (error == -1)
1273 			error = EFAULT;
1274 	}
1275 
1276 	umtxq_lock(&key);
1277 	if (error == 0 && count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
1278 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
1279 		umtxq_signal(&key, 1);
1280 	umtxq_unbusy(&key);
1281 	umtxq_unlock(&key);
1282 	umtx_key_release(&key);
1283 	return (error);
1284 }
1285 
1286 /*
1287  * Check if the mutex has waiters and try to fix the contention bit.
1288  */
1289 static int
1290 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1291 {
1292 	struct umtx_key key;
1293 	uint32_t owner, old;
1294 	int type;
1295 	int error;
1296 	int count;
1297 
1298 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT |
1299 	    UMUTEX_ROBUST)) {
1300 	case 0:
1301 	case UMUTEX_ROBUST:
1302 		type = TYPE_NORMAL_UMUTEX;
1303 		break;
1304 	case UMUTEX_PRIO_INHERIT:
1305 		type = TYPE_PI_UMUTEX;
1306 		break;
1307 	case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST):
1308 		type = TYPE_PI_ROBUST_UMUTEX;
1309 		break;
1310 	case UMUTEX_PRIO_PROTECT:
1311 		type = TYPE_PP_UMUTEX;
1312 		break;
1313 	case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST):
1314 		type = TYPE_PP_ROBUST_UMUTEX;
1315 		break;
1316 	default:
1317 		return (EINVAL);
1318 	}
1319 	if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0)
1320 		return (error);
1321 
1322 	owner = 0;
1323 	umtxq_lock(&key);
1324 	umtxq_busy(&key);
1325 	count = umtxq_count(&key);
1326 	umtxq_unlock(&key);
1327 	/*
1328 	 * Only repair the contention bit if there is a waiter; this means
1329 	 * the mutex is still being referenced by userland code.  Otherwise,
1330 	 * don't update any memory.
1331 	 */
1332 	if (count > 1) {
1333 		error = fueword32(&m->m_owner, &owner);
1334 		if (error == -1)
1335 			error = EFAULT;
1336 		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
1337 			error = casueword32(&m->m_owner, owner, &old,
1338 			    owner | UMUTEX_CONTESTED);
1339 			if (error == -1) {
1340 				error = EFAULT;
1341 				break;
1342 			}
1343 			if (old == owner)
1344 				break;
1345 			owner = old;
1346 			error = umtxq_check_susp(td);
1347 			if (error != 0)
1348 				break;
1349 		}
1350 	} else if (count == 1) {
1351 		error = fueword32(&m->m_owner, &owner);
1352 		if (error == -1)
1353 			error = EFAULT;
1354 		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
1355 		    (owner & UMUTEX_CONTESTED) == 0) {
1356 			error = casueword32(&m->m_owner, owner, &old,
1357 			    owner | UMUTEX_CONTESTED);
1358 			if (error == -1) {
1359 				error = EFAULT;
1360 				break;
1361 			}
1362 			if (old == owner)
1363 				break;
1364 			owner = old;
1365 			error = umtxq_check_susp(td);
1366 			if (error != 0)
1367 				break;
1368 		}
1369 	}
1370 	umtxq_lock(&key);
1371 	if (error == EFAULT) {
1372 		umtxq_signal(&key, INT_MAX);
1373 	} else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
1374 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
1375 		umtxq_signal(&key, 1);
1376 	umtxq_unbusy(&key);
1377 	umtxq_unlock(&key);
1378 	umtx_key_release(&key);
1379 	return (error);
1380 }
1381 
1382 static inline struct umtx_pi *
1383 umtx_pi_alloc(int flags)
1384 {
1385 	struct umtx_pi *pi;
1386 
1387 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1388 	TAILQ_INIT(&pi->pi_blocked);
1389 	atomic_add_int(&umtx_pi_allocated, 1);
1390 	return (pi);
1391 }
1392 
1393 static inline void
1394 umtx_pi_free(struct umtx_pi *pi)
1395 {
1396 	uma_zfree(umtx_pi_zone, pi);
1397 	atomic_add_int(&umtx_pi_allocated, -1);
1398 }
1399 
1400 /*
1401  * Adjust the thread's position on the PI mutex's blocked queue after
1402  * its priority has been changed.
1403  */
1404 static int
1405 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1406 {
1407 	struct umtx_q *uq, *uq1, *uq2;
1408 	struct thread *td1;
1409 
1410 	mtx_assert(&umtx_lock, MA_OWNED);
1411 	if (pi == NULL)
1412 		return (0);
1413 
1414 	uq = td->td_umtxq;
1415 
1416 	/*
1417 	 * Check if the thread needs to be moved on the blocked chain.
1418 	 * It needs to be moved if either its priority is lower than
1419 	 * the previous thread's or higher than the next thread's.
1420 	 */
1421 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1422 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1423 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1424 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1425 		/*
1426 		 * Remove thread from blocked chain and determine where
1427 		 * it should be moved to.
1428 		 */
1429 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1430 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1431 			td1 = uq1->uq_thread;
1432 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1433 			if (UPRI(td1) > UPRI(td))
1434 				break;
1435 		}
1436 
1437 		if (uq1 == NULL)
1438 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1439 		else
1440 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1441 	}
1442 	return (1);
1443 }
1444 
1445 static struct umtx_pi *
1446 umtx_pi_next(struct umtx_pi *pi)
1447 {
1448 	struct umtx_q *uq_owner;
1449 
1450 	if (pi->pi_owner == NULL)
1451 		return (NULL);
1452 	uq_owner = pi->pi_owner->td_umtxq;
1453 	if (uq_owner == NULL)
1454 		return (NULL);
1455 	return (uq_owner->uq_pi_blocked);
1456 }
1457 
1458 /*
1459  * Floyd's Cycle-Finding Algorithm.
1460  */
1461 static bool
1462 umtx_pi_check_loop(struct umtx_pi *pi)
1463 {
1464 	struct umtx_pi *pi1;	/* fast iterator */
1465 
1466 	mtx_assert(&umtx_lock, MA_OWNED);
1467 	if (pi == NULL)
1468 		return (false);
1469 	pi1 = pi;
1470 	for (;;) {
1471 		pi = umtx_pi_next(pi);
1472 		if (pi == NULL)
1473 			break;
1474 		pi1 = umtx_pi_next(pi1);
1475 		if (pi1 == NULL)
1476 			break;
1477 		pi1 = umtx_pi_next(pi1);
1478 		if (pi1 == NULL)
1479 			break;
1480 		if (pi == pi1)
1481 			return (true);
1482 	}
1483 	return (false);
1484 }
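
/*
 * Example of the cycle being detected: thread A owns M1 and blocks on
 * M2 while thread B owns M2 and blocks on M1.  Following owner ->
 * blocked-on links then loops:
 *
 *	M1 -> A -> M2 -> B -> M1 -> ...
 *
 * The fast iterator must eventually meet the slow one inside the loop,
 * so propagation can bail out instead of walking forever.
 */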
1485 
1486 /*
1487  * Propagate priority when a thread is blocked on a POSIX
1488  * PI mutex.
1489  */
1490 static void
1491 umtx_propagate_priority(struct thread *td)
1492 {
1493 	struct umtx_q *uq;
1494 	struct umtx_pi *pi;
1495 	int pri;
1496 
1497 	mtx_assert(&umtx_lock, MA_OWNED);
1498 	pri = UPRI(td);
1499 	uq = td->td_umtxq;
1500 	pi = uq->uq_pi_blocked;
1501 	if (pi == NULL)
1502 		return;
1503 	if (umtx_pi_check_loop(pi))
1504 		return;
1505 
1506 	for (;;) {
1507 		td = pi->pi_owner;
1508 		if (td == NULL || td == curthread)
1509 			return;
1510 
1511 		MPASS(td->td_proc != NULL);
1512 		MPASS(td->td_proc->p_magic == P_MAGIC);
1513 
1514 		thread_lock(td);
1515 		if (td->td_lend_user_pri > pri)
1516 			sched_lend_user_prio(td, pri);
1517 		else {
1518 			thread_unlock(td);
1519 			break;
1520 		}
1521 		thread_unlock(td);
1522 
1523 		/*
1524 		 * Pick up the lock that td is blocked on.
1525 		 */
1526 		uq = td->td_umtxq;
1527 		pi = uq->uq_pi_blocked;
1528 		if (pi == NULL)
1529 			break;
1530 		/* Resort td on the list if needed. */
1531 		umtx_pi_adjust_thread(pi, td);
1532 	}
1533 }
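
/*
 * Worked example (a smaller value means higher priority): T1 at
 * priority 40 owns M1; T2 at 30 owns M2 and is blocked on M1; T3 at 20
 * blocks on M2.  Propagation from T3 lends 20 to T2, then follows T2's
 * block on M1 and lends 20 to T1 as well, so the whole ownership chain
 * runs at the highest waiter's priority until the locks are released.
 */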
1534 
1535 /*
1536  * Unpropagate priority for a PI mutex when a thread blocked on
1537  * it is interrupted by a signal or resumed by others.
1538  */
1539 static void
1540 umtx_repropagate_priority(struct umtx_pi *pi)
1541 {
1542 	struct umtx_q *uq, *uq_owner;
1543 	struct umtx_pi *pi2;
1544 	int pri;
1545 
1546 	mtx_assert(&umtx_lock, MA_OWNED);
1547 
1548 	if (umtx_pi_check_loop(pi))
1549 		return;
1550 	while (pi != NULL && pi->pi_owner != NULL) {
1551 		pri = PRI_MAX;
1552 		uq_owner = pi->pi_owner->td_umtxq;
1553 
1554 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1555 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1556 			if (uq != NULL) {
1557 				if (pri > UPRI(uq->uq_thread))
1558 					pri = UPRI(uq->uq_thread);
1559 			}
1560 		}
1561 
1562 		if (pri > uq_owner->uq_inherited_pri)
1563 			pri = uq_owner->uq_inherited_pri;
1564 		thread_lock(pi->pi_owner);
1565 		sched_lend_user_prio(pi->pi_owner, pri);
1566 		thread_unlock(pi->pi_owner);
1567 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1568 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1569 	}
1570 }
1571 
1572 /*
1573  * Insert a PI mutex into its owner's contested list.
1574  */
1575 static void
1576 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1577 {
1578 	struct umtx_q *uq_owner;
1579 
1580 	uq_owner = owner->td_umtxq;
1581 	mtx_assert(&umtx_lock, MA_OWNED);
1582 	MPASS(pi->pi_owner == NULL);
1583 	pi->pi_owner = owner;
1584 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1585 }
1586 
1588 /*
1589  * Disown a PI mutex, and remove it from the owned list.
1590  */
1591 static void
1592 umtx_pi_disown(struct umtx_pi *pi)
1593 {
1594 
1595 	mtx_assert(&umtx_lock, MA_OWNED);
1596 	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
1597 	pi->pi_owner = NULL;
1598 }
1599 
1600 /*
1601  * Claim ownership of a PI mutex.
1602  */
1603 static int
1604 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1605 {
1606 	struct umtx_q *uq;
1607 	int pri;
1608 
1609 	mtx_lock(&umtx_lock);
1610 	if (pi->pi_owner == owner) {
1611 		mtx_unlock(&umtx_lock);
1612 		return (0);
1613 	}
1614 
1615 	if (pi->pi_owner != NULL) {
1616 		/*
1617 		 * userland may have already messed up the mutex, sigh.
1618 		 */
1619 		mtx_unlock(&umtx_lock);
1620 		return (EPERM);
1621 	}
1622 	umtx_pi_setowner(pi, owner);
1623 	uq = TAILQ_FIRST(&pi->pi_blocked);
1624 	if (uq != NULL) {
1625 		pri = UPRI(uq->uq_thread);
1626 		thread_lock(owner);
1627 		if (pri < UPRI(owner))
1628 			sched_lend_user_prio(owner, pri);
1629 		thread_unlock(owner);
1630 	}
1631 	mtx_unlock(&umtx_lock);
1632 	return (0);
1633 }
1634 
1635 /*
1636  * Adjust a thread's position on the blocked list of its PI mutex;
1637  * this may trigger a new round of priority propagation.
1638  */
1639 void
1640 umtx_pi_adjust(struct thread *td, u_char oldpri)
1641 {
1642 	struct umtx_q *uq;
1643 	struct umtx_pi *pi;
1644 
1645 	uq = td->td_umtxq;
1646 	mtx_lock(&umtx_lock);
1647 	/*
1648 	 * Pick up the lock that td is blocked on.
1649 	 */
1650 	pi = uq->uq_pi_blocked;
1651 	if (pi != NULL) {
1652 		umtx_pi_adjust_thread(pi, td);
1653 		umtx_repropagate_priority(pi);
1654 	}
1655 	mtx_unlock(&umtx_lock);
1656 }
1657 
1658 /*
1659  * Sleep on a PI mutex.
1660  */
1661 static int
1662 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
1663     const char *wmesg, struct abs_timeout *timo, bool shared)
1664 {
1665 	struct umtxq_chain *uc;
1666 	struct thread *td, *td1;
1667 	struct umtx_q *uq1;
1668 	int error, pri;
1669 
1670 	error = 0;
1671 	td = uq->uq_thread;
1672 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1673 	uc = umtxq_getchain(&uq->uq_key);
1674 	UMTXQ_LOCKED_ASSERT(uc);
1675 	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
1676 	umtxq_insert(uq);
1677 	mtx_lock(&umtx_lock);
1678 	if (pi->pi_owner == NULL) {
1679 		mtx_unlock(&umtx_lock);
1680 		td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid);
1681 		mtx_lock(&umtx_lock);
1682 		if (td1 != NULL) {
1683 			if (pi->pi_owner == NULL)
1684 				umtx_pi_setowner(pi, td1);
1685 			PROC_UNLOCK(td1->td_proc);
1686 		}
1687 	}
1688 
1689 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1690 		pri = UPRI(uq1->uq_thread);
1691 		if (pri > UPRI(td))
1692 			break;
1693 	}
1694 
1695 	if (uq1 != NULL)
1696 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1697 	else
1698 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1699 
1700 	uq->uq_pi_blocked = pi;
1701 	thread_lock(td);
1702 	td->td_flags |= TDF_UPIBLOCKED;
1703 	thread_unlock(td);
1704 	umtx_propagate_priority(td);
1705 	mtx_unlock(&umtx_lock);
1706 	umtxq_unbusy(&uq->uq_key);
1707 
1708 	error = umtxq_sleep(uq, wmesg, timo);
1709 	umtxq_remove(uq);
1710 
1711 	mtx_lock(&umtx_lock);
1712 	uq->uq_pi_blocked = NULL;
1713 	thread_lock(td);
1714 	td->td_flags &= ~TDF_UPIBLOCKED;
1715 	thread_unlock(td);
1716 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1717 	umtx_repropagate_priority(pi);
1718 	mtx_unlock(&umtx_lock);
1719 	umtxq_unlock(&uq->uq_key);
1720 
1721 	return (error);
1722 }
1723 
1724 /*
1725  * Add a reference to a PI mutex.
1726  */
1727 static void
1728 umtx_pi_ref(struct umtx_pi *pi)
1729 {
1730 	struct umtxq_chain *uc;
1731 
1732 	uc = umtxq_getchain(&pi->pi_key);
1733 	UMTXQ_LOCKED_ASSERT(uc);
1734 	pi->pi_refcount++;
1735 }
1736 
1737 /*
1738  * Decrease the reference count of a PI mutex; if the count
1739  * drops to zero, its memory is freed.
1740  */
1741 static void
1742 umtx_pi_unref(struct umtx_pi *pi)
1743 {
1744 	struct umtxq_chain *uc;
1745 
1746 	uc = umtxq_getchain(&pi->pi_key);
1747 	UMTXQ_LOCKED_ASSERT(uc);
1748 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1749 	if (--pi->pi_refcount == 0) {
1750 		mtx_lock(&umtx_lock);
1751 		if (pi->pi_owner != NULL)
1752 			umtx_pi_disown(pi);
1753 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1754 			("blocked queue not empty"));
1755 		mtx_unlock(&umtx_lock);
1756 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1757 		umtx_pi_free(pi);
1758 	}
1759 }
1760 
1761 /*
1762  * Find a PI mutex in the hash table.
1763  */
1764 static struct umtx_pi *
1765 umtx_pi_lookup(struct umtx_key *key)
1766 {
1767 	struct umtxq_chain *uc;
1768 	struct umtx_pi *pi;
1769 
1770 	uc = umtxq_getchain(key);
1771 	UMTXQ_LOCKED_ASSERT(uc);
1772 
1773 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1774 		if (umtx_key_match(&pi->pi_key, key)) {
1775 			return (pi);
1776 		}
1777 	}
1778 	return (NULL);
1779 }
1780 
1781 /*
1782  * Insert a PI mutex into hash table.
1783  */
1784 static inline void
1785 umtx_pi_insert(struct umtx_pi *pi)
1786 {
1787 	struct umtxq_chain *uc;
1788 
1789 	uc = umtxq_getchain(&pi->pi_key);
1790 	UMTXQ_LOCKED_ASSERT(uc);
1791 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1792 }
1793 
1794 /*
1795  * Lock a PI mutex.
1796  */
1797 static int
1798 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1799     struct _umtx_time *timeout, int try)
1800 {
1801 	struct abs_timeout timo;
1802 	struct umtx_q *uq;
1803 	struct umtx_pi *pi, *new_pi;
1804 	uint32_t id, old_owner, owner, old;
1805 	int error, rv;
1806 
1807 	id = td->td_tid;
1808 	uq = td->td_umtxq;
1809 
1810 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
1811 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
1812 	    &uq->uq_key)) != 0)
1813 		return (error);
1814 
1815 	if (timeout != NULL)
1816 		abs_timeout_init2(&timo, timeout);
1817 
1818 	umtxq_lock(&uq->uq_key);
1819 	pi = umtx_pi_lookup(&uq->uq_key);
1820 	if (pi == NULL) {
1821 		new_pi = umtx_pi_alloc(M_NOWAIT);
1822 		if (new_pi == NULL) {
1823 			umtxq_unlock(&uq->uq_key);
1824 			new_pi = umtx_pi_alloc(M_WAITOK);
1825 			umtxq_lock(&uq->uq_key);
1826 			pi = umtx_pi_lookup(&uq->uq_key);
1827 			if (pi != NULL) {
1828 				umtx_pi_free(new_pi);
1829 				new_pi = NULL;
1830 			}
1831 		}
1832 		if (new_pi != NULL) {
1833 			new_pi->pi_key = uq->uq_key;
1834 			umtx_pi_insert(new_pi);
1835 			pi = new_pi;
1836 		}
1837 	}
1838 	umtx_pi_ref(pi);
1839 	umtxq_unlock(&uq->uq_key);
1840 
1841 	/*
1842 	 * Care must be exercised when dealing with the umtx structure.  It
1843 	 * can fault on any access.
1844 	 */
1845 	for (;;) {
1846 		/*
1847 		 * Try the uncontested case.  This should be done in userland.
1848 		 */
1849 		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
1850 		/* The address was invalid. */
1851 		if (rv == -1) {
1852 			error = EFAULT;
1853 			break;
1854 		}
1855 
1856 		/* The acquire succeeded. */
1857 		if (owner == UMUTEX_UNOWNED) {
1858 			error = 0;
1859 			break;
1860 		}
1861 
1862 		if (owner == UMUTEX_RB_NOTRECOV) {
1863 			error = ENOTRECOVERABLE;
1864 			break;
1865 		}
1866 
1867 		/* If no one owns it but it is contested, try to acquire it. */
1868 		if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) {
1869 			old_owner = owner;
1870 			rv = casueword32(&m->m_owner, owner, &owner,
1871 			    id | UMUTEX_CONTESTED);
1872 			/* The address was invalid. */
1873 			if (rv == -1) {
1874 				error = EFAULT;
1875 				break;
1876 			}
1877 
1878 			if (owner == old_owner) {
1879 				umtxq_lock(&uq->uq_key);
1880 				umtxq_busy(&uq->uq_key);
1881 				error = umtx_pi_claim(pi, td);
1882 				umtxq_unbusy(&uq->uq_key);
1883 				umtxq_unlock(&uq->uq_key);
1884 				if (error != 0) {
1885 					/*
1886 					 * Since we're going to return an
1887 					 * error, restore the m_owner to its
1888 					 * previous, unowned state to avoid
1889 					 * compounding the problem.
1890 					 */
1891 					(void)casuword32(&m->m_owner,
1892 					    id | UMUTEX_CONTESTED,
1893 					    old_owner);
1894 				}
1895 				if (error == 0 &&
1896 				    old_owner == UMUTEX_RB_OWNERDEAD)
1897 					error = EOWNERDEAD;
1898 				break;
1899 			}
1900 
1901 			error = umtxq_check_susp(td);
1902 			if (error != 0)
1903 				break;
1904 
1905 		/* If this failed, the lock has changed; restart. */
1906 			continue;
1907 		}
1908 
1909 		if ((owner & ~UMUTEX_CONTESTED) == id) {
1910 			error = EDEADLK;
1911 			break;
1912 		}
1913 
1914 		if (try != 0) {
1915 			error = EBUSY;
1916 			break;
1917 		}
1918 
1919 		/*
1920 		 * If we caught a signal, we have retried and now
1921 		 * exit immediately.
1922 		 */
1923 		if (error != 0)
1924 			break;
1925 
1926 		umtxq_lock(&uq->uq_key);
1927 		umtxq_busy(&uq->uq_key);
1928 		umtxq_unlock(&uq->uq_key);
1929 
1930 		/*
1931 		 * Set the contested bit so that a release in user space
1932 		 * knows to use the system call for unlock.  If this fails
1933 		 * either someone else has acquired the lock or it has been
1934 		 * released.
1935 		 */
1936 		rv = casueword32(&m->m_owner, owner, &old, owner |
1937 		    UMUTEX_CONTESTED);
1938 
1939 		/* The address was invalid. */
1940 		if (rv == -1) {
1941 			umtxq_unbusy_unlocked(&uq->uq_key);
1942 			error = EFAULT;
1943 			break;
1944 		}
1945 
1946 		umtxq_lock(&uq->uq_key);
1947 		/*
1948 		 * If we set the contested bit, sleep.  Otherwise the lock
1949 		 * changed and we need to retry, or we lost a race to the
1950 		 * thread unlocking the umtx.  Note that the UMUTEX_RB_OWNERDEAD
1951 		 * value for owner is impossible here.
1952 		 */
1953 		if (old == owner) {
1954 			error = umtxq_sleep_pi(uq, pi,
1955 			    owner & ~UMUTEX_CONTESTED,
1956 			    "umtxpi", timeout == NULL ? NULL : &timo,
1957 			    (flags & USYNC_PROCESS_SHARED) != 0);
1958 			if (error != 0)
1959 				continue;
1960 		} else {
1961 			umtxq_unbusy(&uq->uq_key);
1962 			umtxq_unlock(&uq->uq_key);
1963 		}
1964 
1965 		error = umtxq_check_susp(td);
1966 		if (error != 0)
1967 			break;
1968 	}
1969 
1970 	umtxq_lock(&uq->uq_key);
1971 	umtx_pi_unref(pi);
1972 	umtxq_unlock(&uq->uq_key);
1973 
1974 	umtx_key_release(&uq->uq_key);
1975 	return (error);
1976 }
1977 
1978 /*
1979  * Unlock a PI mutex.
1980  */
1981 static int
1982 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1983 {
1984 	struct umtx_key key;
1985 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1986 	struct umtx_pi *pi, *pi2;
1987 	uint32_t id, new_owner, old, owner;
1988 	int count, error, pri;
1989 
1990 	id = td->td_tid;
1991 	/*
1992 	 * Make sure we own this mtx.
1993 	 */
1994 	error = fueword32(&m->m_owner, &owner);
1995 	if (error == -1)
1996 		return (EFAULT);
1997 
1998 	if ((owner & ~UMUTEX_CONTESTED) != id)
1999 		return (EPERM);
2000 
2001 	new_owner = umtx_unlock_val(flags, rb);
2002 
2003 	/* This should be done in userland */
2004 	if ((owner & UMUTEX_CONTESTED) == 0) {
2005 		error = casueword32(&m->m_owner, owner, &old, new_owner);
2006 		if (error == -1)
2007 			return (EFAULT);
2008 		if (old == owner)
2009 			return (0);
2010 		owner = old;
2011 	}
2012 
2013 	/* We should only ever be in here for contested locks */
2014 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2015 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
2016 	    &key)) != 0)
2017 		return (error);
2018 
2019 	umtxq_lock(&key);
2020 	umtxq_busy(&key);
2021 	count = umtxq_count_pi(&key, &uq_first);
2022 	if (uq_first != NULL) {
2023 		mtx_lock(&umtx_lock);
2024 		pi = uq_first->uq_pi_blocked;
2025 		KASSERT(pi != NULL, ("pi == NULL?"));
2026 		if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) {
2027 			mtx_unlock(&umtx_lock);
2028 			umtxq_unbusy(&key);
2029 			umtxq_unlock(&key);
2030 			umtx_key_release(&key);
2031 			/* userland messed up the mutex */
2032 			return (EPERM);
2033 		}
2034 		uq_me = td->td_umtxq;
2035 		if (pi->pi_owner == td)
2036 			umtx_pi_disown(pi);
2037 		/* Get the highest-priority thread that is still sleeping. */
2038 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
2039 		while (uq_first != NULL &&
2040 		    (uq_first->uq_flags & UQF_UMTXQ) == 0) {
2041 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
2042 		}
2043 		pri = PRI_MAX;
2044 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
2045 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
2046 			if (uq_first2 != NULL) {
2047 				if (pri > UPRI(uq_first2->uq_thread))
2048 					pri = UPRI(uq_first2->uq_thread);
2049 			}
2050 		}
2051 		thread_lock(td);
2052 		sched_lend_user_prio(td, pri);
2053 		thread_unlock(td);
2054 		mtx_unlock(&umtx_lock);
2055 		if (uq_first)
2056 			umtxq_signal_thread(uq_first);
2057 	} else {
2058 		pi = umtx_pi_lookup(&key);
2059 		/*
2060 		 * A umtx_pi can exist if a signal or timeout removed the
2061 		 * last waiter from the umtxq, but there is still
2062 		 * a thread in do_lock_pi() holding the umtx_pi.
2063 		 */
2064 		if (pi != NULL) {
2065 			/*
2066 			 * The umtx_pi can be unowned, such as when a thread
2067 			 * has just entered do_lock_pi(), allocated the
2068 			 * umtx_pi, and unlocked the umtxq.
2069 			 * If the current thread owns it, it must disown it.
2070 			 */
2071 			mtx_lock(&umtx_lock);
2072 			if (pi->pi_owner == td)
2073 				umtx_pi_disown(pi);
2074 			mtx_unlock(&umtx_lock);
2075 		}
2076 	}
2077 	umtxq_unlock(&key);
2078 
2079 	/*
2080 	 * When unlocking the umtx, it must be marked as unowned if
2081 	 * there is at most one thread waiting for it.
2082 	 * Otherwise, it must be marked as contested.
2083 	 */
2084 
2085 	if (count > 1)
2086 		new_owner |= UMUTEX_CONTESTED;
2087 	error = casueword32(&m->m_owner, owner, &old, new_owner);
2088 
2089 	umtxq_unbusy_unlocked(&key);
2090 	umtx_key_release(&key);
2091 	if (error == -1)
2092 		return (EFAULT);
2093 	if (old != owner)
2094 		return (EINVAL);
2095 	return (0);
2096 }
2097 
2098 /*
2099  * Lock a PP mutex.
2100  */
2101 static int
2102 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2103     struct _umtx_time *timeout, int try)
2104 {
2105 	struct abs_timeout timo;
2106 	struct umtx_q *uq, *uq2;
2107 	struct umtx_pi *pi;
2108 	uint32_t ceiling;
2109 	uint32_t owner, id;
2110 	int error, pri, old_inherited_pri, su, rv;
2111 
2112 	id = td->td_tid;
2113 	uq = td->td_umtxq;
2114 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2115 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2116 	    &uq->uq_key)) != 0)
2117 		return (error);
2118 
2119 	if (timeout != NULL)
2120 		abs_timeout_init2(&timo, timeout);
2121 
2122 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2123 	for (;;) {
2124 		old_inherited_pri = uq->uq_inherited_pri;
2125 		umtxq_lock(&uq->uq_key);
2126 		umtxq_busy(&uq->uq_key);
2127 		umtxq_unlock(&uq->uq_key);
2128 
2129 		rv = fueword32(&m->m_ceilings[0], &ceiling);
2130 		if (rv == -1) {
2131 			error = EFAULT;
2132 			goto out;
2133 		}
2134 		ceiling = RTP_PRIO_MAX - ceiling;
2135 		if (ceiling > RTP_PRIO_MAX) {
2136 			error = EINVAL;
2137 			goto out;
2138 		}
2139 
2140 		mtx_lock(&umtx_lock);
2141 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2142 			mtx_unlock(&umtx_lock);
2143 			error = EINVAL;
2144 			goto out;
2145 		}
2146 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2147 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2148 			thread_lock(td);
2149 			if (uq->uq_inherited_pri < UPRI(td))
2150 				sched_lend_user_prio(td, uq->uq_inherited_pri);
2151 			thread_unlock(td);
2152 		}
2153 		mtx_unlock(&umtx_lock);
2154 
2155 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2156 		    id | UMUTEX_CONTESTED);
2157 		/* The address was invalid. */
2158 		if (rv == -1) {
2159 			error = EFAULT;
2160 			break;
2161 		}
2162 
2163 		if (owner == UMUTEX_CONTESTED) {
2164 			error = 0;
2165 			break;
2166 		} else if (owner == UMUTEX_RB_OWNERDEAD) {
2167 			rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD,
2168 			    &owner, id | UMUTEX_CONTESTED);
2169 			if (rv == -1) {
2170 				error = EFAULT;
2171 				break;
2172 			}
2173 			if (owner == UMUTEX_RB_OWNERDEAD) {
2174 				error = EOWNERDEAD; /* success */
2175 				break;
2176 			}
2177 			error = 0;
2178 		} else if (owner == UMUTEX_RB_NOTRECOV) {
2179 			error = ENOTRECOVERABLE;
2180 			break;
2181 		}
2182 
2183 		if (try != 0) {
2184 			error = EBUSY;
2185 			break;
2186 		}
2187 
2188 		/*
2189 		 * If we caught a signal during the earlier sleep, we have
2190 		 * already retried; exit immediately.
2191 		 */
2192 		if (error != 0)
2193 			break;
2194 
2195 		umtxq_lock(&uq->uq_key);
2196 		umtxq_insert(uq);
2197 		umtxq_unbusy(&uq->uq_key);
2198 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2199 		    NULL : &timo);
2200 		umtxq_remove(uq);
2201 		umtxq_unlock(&uq->uq_key);
2202 
2203 		mtx_lock(&umtx_lock);
2204 		uq->uq_inherited_pri = old_inherited_pri;
2205 		pri = PRI_MAX;
2206 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2207 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2208 			if (uq2 != NULL) {
2209 				if (pri > UPRI(uq2->uq_thread))
2210 					pri = UPRI(uq2->uq_thread);
2211 			}
2212 		}
2213 		if (pri > uq->uq_inherited_pri)
2214 			pri = uq->uq_inherited_pri;
2215 		thread_lock(td);
2216 		sched_lend_user_prio(td, pri);
2217 		thread_unlock(td);
2218 		mtx_unlock(&umtx_lock);
2219 	}
2220 
2221 	if (error != 0 && error != EOWNERDEAD) {
2222 		mtx_lock(&umtx_lock);
2223 		uq->uq_inherited_pri = old_inherited_pri;
2224 		pri = PRI_MAX;
2225 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2226 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2227 			if (uq2 != NULL) {
2228 				if (pri > UPRI(uq2->uq_thread))
2229 					pri = UPRI(uq2->uq_thread);
2230 			}
2231 		}
2232 		if (pri > uq->uq_inherited_pri)
2233 			pri = uq->uq_inherited_pri;
2234 		thread_lock(td);
2235 		sched_lend_user_prio(td, pri);
2236 		thread_unlock(td);
2237 		mtx_unlock(&umtx_lock);
2238 	}
2239 
2240 out:
2241 	umtxq_unbusy_unlocked(&uq->uq_key);
2242 	umtx_key_release(&uq->uq_key);
2243 	return (error);
2244 }
2245 
2246 /*
2247  * Unlock a PP mutex.
2248  */
2249 static int
2250 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
2251 {
2252 	struct umtx_key key;
2253 	struct umtx_q *uq, *uq2;
2254 	struct umtx_pi *pi;
2255 	uint32_t id, owner, rceiling;
2256 	int error, pri, new_inherited_pri, su;
2257 
2258 	id = td->td_tid;
2259 	uq = td->td_umtxq;
2260 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2261 
2262 	/*
2263 	 * Make sure we own this mtx.
2264 	 */
2265 	error = fueword32(&m->m_owner, &owner);
2266 	if (error == -1)
2267 		return (EFAULT);
2268 
2269 	if ((owner & ~UMUTEX_CONTESTED) != id)
2270 		return (EPERM);
2271 
2272 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2273 	if (error != 0)
2274 		return (error);
2275 
2276 	if (rceiling == -1)
2277 		new_inherited_pri = PRI_MAX;
2278 	else {
2279 		rceiling = RTP_PRIO_MAX - rceiling;
2280 		if (rceiling > RTP_PRIO_MAX)
2281 			return (EINVAL);
2282 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2283 	}
2284 
2285 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2286 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2287 	    &key)) != 0)
2288 		return (error);
2289 	umtxq_lock(&key);
2290 	umtxq_busy(&key);
2291 	umtxq_unlock(&key);
2292 	/*
2293 	 * For a priority-protected mutex, always set the unlocked state
2294 	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2295 	 * to lock the mutex.  This is necessary because the thread
2296 	 * priority has to be adjusted for such a mutex.
2297 	 */
2298 	error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) |
2299 	    UMUTEX_CONTESTED);
2300 
2301 	umtxq_lock(&key);
2302 	if (error == 0)
2303 		umtxq_signal(&key, 1);
2304 	umtxq_unbusy(&key);
2305 	umtxq_unlock(&key);
2306 
2307 	if (error == -1)
2308 		error = EFAULT;
2309 	else {
2310 		mtx_lock(&umtx_lock);
2311 		if (su != 0)
2312 			uq->uq_inherited_pri = new_inherited_pri;
2313 		pri = PRI_MAX;
2314 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2315 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2316 			if (uq2 != NULL) {
2317 				if (pri > UPRI(uq2->uq_thread))
2318 					pri = UPRI(uq2->uq_thread);
2319 			}
2320 		}
2321 		if (pri > uq->uq_inherited_pri)
2322 			pri = uq->uq_inherited_pri;
2323 		thread_lock(td);
2324 		sched_lend_user_prio(td, pri);
2325 		thread_unlock(td);
2326 		mtx_unlock(&umtx_lock);
2327 	}
2328 	umtx_key_release(&key);
2329 	return (error);
2330 }
2331 
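/*
 * Change the ceiling of a PP mutex and return the old ceiling
 * through old_ceiling.  The mutex is transiently locked (or must
 * already be owned by the caller) while m_ceilings[0] is updated,
 * so the store cannot race with lock and unlock operations.
 */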
2332 static int
2333 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2334     uint32_t *old_ceiling)
2335 {
2336 	struct umtx_q *uq;
2337 	uint32_t flags, id, owner, save_ceiling;
2338 	int error, rv, rv1;
2339 
2340 	error = fueword32(&m->m_flags, &flags);
2341 	if (error == -1)
2342 		return (EFAULT);
2343 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2344 		return (EINVAL);
2345 	if (ceiling > RTP_PRIO_MAX)
2346 		return (EINVAL);
2347 	id = td->td_tid;
2348 	uq = td->td_umtxq;
2349 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2350 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2351 	    &uq->uq_key)) != 0)
2352 		return (error);
2353 	for (;;) {
2354 		umtxq_lock(&uq->uq_key);
2355 		umtxq_busy(&uq->uq_key);
2356 		umtxq_unlock(&uq->uq_key);
2357 
2358 		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2359 		if (rv == -1) {
2360 			error = EFAULT;
2361 			break;
2362 		}
2363 
2364 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2365 		    id | UMUTEX_CONTESTED);
2366 		if (rv == -1) {
2367 			error = EFAULT;
2368 			break;
2369 		}
2370 
2371 		if (owner == UMUTEX_CONTESTED) {
2372 			rv = suword32(&m->m_ceilings[0], ceiling);
2373 			rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED);
2374 			error = (rv == 0 && rv1 == 0) ? 0: EFAULT;
2375 			break;
2376 		}
2377 
2378 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2379 			rv = suword32(&m->m_ceilings[0], ceiling);
2380 			error = rv == 0 ? 0 : EFAULT;
2381 			break;
2382 		}
2383 
2384 		if (owner == UMUTEX_RB_OWNERDEAD) {
2385 			error = EOWNERDEAD;
2386 			break;
2387 		} else if (owner == UMUTEX_RB_NOTRECOV) {
2388 			error = ENOTRECOVERABLE;
2389 			break;
2390 		}
2391 
2392 		/*
2393 		 * If we caught a signal during the earlier sleep, we have
2394 		 * already retried; exit immediately.
2395 		 */
2396 		if (error != 0)
2397 			break;
2398 
2399 		/*
2400 		 * If we successfully set the contested bit, sleep.  Otherwise
2401 		 * the lock changed and we need to retry, or we lost a race to
2402 		 * the thread unlocking the umtx.
2403 		 */
2404 		umtxq_lock(&uq->uq_key);
2405 		umtxq_insert(uq);
2406 		umtxq_unbusy(&uq->uq_key);
2407 		error = umtxq_sleep(uq, "umtxpp", NULL);
2408 		umtxq_remove(uq);
2409 		umtxq_unlock(&uq->uq_key);
2410 	}
2411 	umtxq_lock(&uq->uq_key);
2412 	if (error == 0)
2413 		umtxq_signal(&uq->uq_key, INT_MAX);
2414 	umtxq_unbusy(&uq->uq_key);
2415 	umtxq_unlock(&uq->uq_key);
2416 	umtx_key_release(&uq->uq_key);
2417 	if (error == 0 && old_ceiling != NULL) {
2418 		rv = suword32(old_ceiling, save_ceiling);
2419 		error = rv == 0 ? 0 : EFAULT;
2420 	}
2421 	return (error);
2422 }
2423 
2424 /*
2425  * Lock a userland POSIX mutex.
2426  */
2427 static int
2428 do_lock_umutex(struct thread *td, struct umutex *m,
2429     struct _umtx_time *timeout, int mode)
2430 {
2431 	uint32_t flags;
2432 	int error;
2433 
2434 	error = fueword32(&m->m_flags, &flags);
2435 	if (error == -1)
2436 		return (EFAULT);
2437 
2438 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2439 	case 0:
2440 		error = do_lock_normal(td, m, flags, timeout, mode);
2441 		break;
2442 	case UMUTEX_PRIO_INHERIT:
2443 		error = do_lock_pi(td, m, flags, timeout, mode);
2444 		break;
2445 	case UMUTEX_PRIO_PROTECT:
2446 		error = do_lock_pp(td, m, flags, timeout, mode);
2447 		break;
2448 	default:
2449 		return (EINVAL);
2450 	}
2451 	if (timeout == NULL) {
2452 		if (error == EINTR && mode != _UMUTEX_WAIT)
2453 			error = ERESTART;
2454 	} else {
2455 		/* Timed-locking is not restarted. */
2456 		if (error == ERESTART)
2457 			error = EINTR;
2458 	}
2459 	return (error);
2460 }
2461 
2462 /*
2463  * Unlock a userland POSIX mutex.
2464  */
2465 static int
2466 do_unlock_umutex(struct thread *td, struct umutex *m, bool rb)
2467 {
2468 	uint32_t flags;
2469 	int error;
2470 
2471 	error = fueword32(&m->m_flags, &flags);
2472 	if (error == -1)
2473 		return (EFAULT);
2474 
2475 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2476 	case 0:
2477 		return (do_unlock_normal(td, m, flags, rb));
2478 	case UMUTEX_PRIO_INHERIT:
2479 		return (do_unlock_pi(td, m, flags, rb));
2480 	case UMUTEX_PRIO_PROTECT:
2481 		return (do_unlock_pp(td, m, flags, rb));
2482 	}
2483 
2484 	return (EINVAL);
2485 }
2486 
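/*
 * Wait on a userland condition variable.  The thread is put on the
 * sleep queue before the associated mutex m is unlocked, so that a
 * concurrent do_cv_signal() cannot be missed.
 */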
2487 static int
2488 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2489     struct timespec *timeout, u_long wflags)
2490 {
2491 	struct abs_timeout timo;
2492 	struct umtx_q *uq;
2493 	uint32_t flags, clockid, hasw;
2494 	int error;
2495 
2496 	uq = td->td_umtxq;
2497 	error = fueword32(&cv->c_flags, &flags);
2498 	if (error == -1)
2499 		return (EFAULT);
2500 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2501 	if (error != 0)
2502 		return (error);
2503 
2504 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2505 		error = fueword32(&cv->c_clockid, &clockid);
2506 		if (error == -1) {
2507 			umtx_key_release(&uq->uq_key);
2508 			return (EFAULT);
2509 		}
2510 		if (clockid < CLOCK_REALTIME ||
2511 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2512 			/* Only the predefined hardware clock ids are usable here. */
2513 			umtx_key_release(&uq->uq_key);
2514 			return (EINVAL);
2515 		}
2516 	} else {
2517 		clockid = CLOCK_REALTIME;
2518 	}
2519 
2520 	umtxq_lock(&uq->uq_key);
2521 	umtxq_busy(&uq->uq_key);
2522 	umtxq_insert(uq);
2523 	umtxq_unlock(&uq->uq_key);
2524 
2525 	/*
2526 	 * Set c_has_waiters to 1 before releasing the user mutex, but
2527 	 * avoid dirtying the cache line when the flag is already set.
2528 	 */
2529 	error = fueword32(&cv->c_has_waiters, &hasw);
2530 	if (error == 0 && hasw == 0)
2531 		suword32(&cv->c_has_waiters, 1);
2532 
2533 	umtxq_unbusy_unlocked(&uq->uq_key);
2534 
2535 	error = do_unlock_umutex(td, m, false);
2536 
2537 	if (timeout != NULL)
2538 		abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0,
2539 		    timeout);
2540 
2541 	umtxq_lock(&uq->uq_key);
2542 	if (error == 0) {
2543 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2544 		    NULL : &timo);
2545 	}
2546 
2547 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2548 		error = 0;
2549 	else {
2550 		/*
2551 		 * This must be a timeout, an interruption by a signal, or a
2552 		 * spurious wakeup; clear the c_has_waiters flag when
2553 		 * necessary.
2554 		 */
2555 		umtxq_busy(&uq->uq_key);
2556 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2557 			int oldlen = uq->uq_cur_queue->length;
2558 			umtxq_remove(uq);
2559 			if (oldlen == 1) {
2560 				umtxq_unlock(&uq->uq_key);
2561 				suword32(&cv->c_has_waiters, 0);
2562 				umtxq_lock(&uq->uq_key);
2563 			}
2564 		}
2565 		umtxq_unbusy(&uq->uq_key);
2566 		if (error == ERESTART)
2567 			error = EINTR;
2568 	}
2569 
2570 	umtxq_unlock(&uq->uq_key);
2571 	umtx_key_release(&uq->uq_key);
2572 	return (error);
2573 }
2574 
2575 /*
2576  * Signal a userland condition variable.
2577  */
2578 static int
2579 do_cv_signal(struct thread *td, struct ucond *cv)
2580 {
2581 	struct umtx_key key;
2582 	int error, cnt, nwake;
2583 	uint32_t flags;
2584 
2585 	error = fueword32(&cv->c_flags, &flags);
2586 	if (error == -1)
2587 		return (EFAULT);
2588 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2589 		return (error);
2590 	umtxq_lock(&key);
2591 	umtxq_busy(&key);
2592 	cnt = umtxq_count(&key);
2593 	nwake = umtxq_signal(&key, 1);
2594 	if (cnt <= nwake) {
2595 		umtxq_unlock(&key);
2596 		error = suword32(&cv->c_has_waiters, 0);
2597 		if (error == -1)
2598 			error = EFAULT;
2599 		umtxq_lock(&key);
2600 	}
2601 	umtxq_unbusy(&key);
2602 	umtxq_unlock(&key);
2603 	umtx_key_release(&key);
2604 	return (error);
2605 }
2606 
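/*
 * Broadcast a userland condition variable: wake all waiters and
 * clear the c_has_waiters flag.
 */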
2607 static int
2608 do_cv_broadcast(struct thread *td, struct ucond *cv)
2609 {
2610 	struct umtx_key key;
2611 	int error;
2612 	uint32_t flags;
2613 
2614 	error = fueword32(&cv->c_flags, &flags);
2615 	if (error == -1)
2616 		return (EFAULT);
2617 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2618 		return (error);
2619 
2620 	umtxq_lock(&key);
2621 	umtxq_busy(&key);
2622 	umtxq_signal(&key, INT_MAX);
2623 	umtxq_unlock(&key);
2624 
2625 	error = suword32(&cv->c_has_waiters, 0);
2626 	if (error == -1)
2627 		error = EFAULT;
2628 
2629 	umtxq_unbusy_unlocked(&key);
2630 
2631 	umtx_key_release(&key);
2632 	return (error);
2633 }
2634 
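/*
 * Take a read lock on a userland rwlock.  Unless the lock (or the
 * caller, via fflag) prefers readers, waiting writers block new
 * read locks in addition to a write owner.
 */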
2635 static int
2636 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2637 {
2638 	struct abs_timeout timo;
2639 	struct umtx_q *uq;
2640 	uint32_t flags, wrflags;
2641 	int32_t state, oldstate;
2642 	int32_t blocked_readers;
2643 	int error, error1, rv;
2644 
2645 	uq = td->td_umtxq;
2646 	error = fueword32(&rwlock->rw_flags, &flags);
2647 	if (error == -1)
2648 		return (EFAULT);
2649 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2650 	if (error != 0)
2651 		return (error);
2652 
2653 	if (timeout != NULL)
2654 		abs_timeout_init2(&timo, timeout);
2655 
2656 	wrflags = URWLOCK_WRITE_OWNER;
2657 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2658 		wrflags |= URWLOCK_WRITE_WAITERS;
2659 
2660 	for (;;) {
2661 		rv = fueword32(&rwlock->rw_state, &state);
2662 		if (rv == -1) {
2663 			umtx_key_release(&uq->uq_key);
2664 			return (EFAULT);
2665 		}
2666 
2667 		/* try to lock it */
2668 		while (!(state & wrflags)) {
2669 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2670 				umtx_key_release(&uq->uq_key);
2671 				return (EAGAIN);
2672 			}
2673 			rv = casueword32(&rwlock->rw_state, state,
2674 			    &oldstate, state + 1);
2675 			if (rv == -1) {
2676 				umtx_key_release(&uq->uq_key);
2677 				return (EFAULT);
2678 			}
2679 			if (oldstate == state) {
2680 				umtx_key_release(&uq->uq_key);
2681 				return (0);
2682 			}
2683 			error = umtxq_check_susp(td);
2684 			if (error != 0)
2685 				break;
2686 			state = oldstate;
2687 		}
2688 
2689 		if (error)
2690 			break;
2691 
2692 		/* grab monitor lock */
2693 		umtxq_lock(&uq->uq_key);
2694 		umtxq_busy(&uq->uq_key);
2695 		umtxq_unlock(&uq->uq_key);
2696 
2697 		/*
2698 		 * re-read the state, in case it changed between the try-lock above
2699 		 * and the check below
2700 		 */
2701 		rv = fueword32(&rwlock->rw_state, &state);
2702 		if (rv == -1)
2703 			error = EFAULT;
2704 
2705 		/* set read contention bit */
2706 		while (error == 0 && (state & wrflags) &&
2707 		    !(state & URWLOCK_READ_WAITERS)) {
2708 			rv = casueword32(&rwlock->rw_state, state,
2709 			    &oldstate, state | URWLOCK_READ_WAITERS);
2710 			if (rv == -1) {
2711 				error = EFAULT;
2712 				break;
2713 			}
2714 			if (oldstate == state)
2715 				goto sleep;
2716 			state = oldstate;
2717 			error = umtxq_check_susp(td);
2718 			if (error != 0)
2719 				break;
2720 		}
2721 		if (error != 0) {
2722 			umtxq_unbusy_unlocked(&uq->uq_key);
2723 			break;
2724 		}
2725 
2726 		/* state is changed while setting flags, restart */
2727 		if (!(state & wrflags)) {
2728 			umtxq_unbusy_unlocked(&uq->uq_key);
2729 			error = umtxq_check_susp(td);
2730 			if (error != 0)
2731 				break;
2732 			continue;
2733 		}
2734 
2735 sleep:
2736 		/* The contention bit is set; increase the read waiter count before sleeping. */
2737 		rv = fueword32(&rwlock->rw_blocked_readers,
2738 		    &blocked_readers);
2739 		if (rv == -1) {
2740 			umtxq_unbusy_unlocked(&uq->uq_key);
2741 			error = EFAULT;
2742 			break;
2743 		}
2744 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2745 
2746 		while (state & wrflags) {
2747 			umtxq_lock(&uq->uq_key);
2748 			umtxq_insert(uq);
2749 			umtxq_unbusy(&uq->uq_key);
2750 
2751 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2752 			    NULL : &timo);
2753 
2754 			umtxq_busy(&uq->uq_key);
2755 			umtxq_remove(uq);
2756 			umtxq_unlock(&uq->uq_key);
2757 			if (error)
2758 				break;
2759 			rv = fueword32(&rwlock->rw_state, &state);
2760 			if (rv == -1) {
2761 				error = EFAULT;
2762 				break;
2763 			}
2764 		}
2765 
2766 		/* Decrease the read waiter count, and maybe clear the read contention bit. */
2767 		rv = fueword32(&rwlock->rw_blocked_readers,
2768 		    &blocked_readers);
2769 		if (rv == -1) {
2770 			umtxq_unbusy_unlocked(&uq->uq_key);
2771 			error = EFAULT;
2772 			break;
2773 		}
2774 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2775 		if (blocked_readers == 1) {
2776 			rv = fueword32(&rwlock->rw_state, &state);
2777 			if (rv == -1) {
2778 				umtxq_unbusy_unlocked(&uq->uq_key);
2779 				error = EFAULT;
2780 				break;
2781 			}
2782 			for (;;) {
2783 				rv = casueword32(&rwlock->rw_state, state,
2784 				    &oldstate, state & ~URWLOCK_READ_WAITERS);
2785 				if (rv == -1) {
2786 					error = EFAULT;
2787 					break;
2788 				}
2789 				if (oldstate == state)
2790 					break;
2791 				state = oldstate;
2792 				error1 = umtxq_check_susp(td);
2793 				if (error1 != 0) {
2794 					if (error == 0)
2795 						error = error1;
2796 					break;
2797 				}
2798 			}
2799 		}
2800 
2801 		umtxq_unbusy_unlocked(&uq->uq_key);
2802 		if (error != 0)
2803 			break;
2804 	}
2805 	umtx_key_release(&uq->uq_key);
2806 	if (error == ERESTART)
2807 		error = EINTR;
2808 	return (error);
2809 }
2810 
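/*
 * Take the write lock on a userland rwlock, sleeping while the lock
 * is owned by a writer or is held by readers.
 */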
2811 static int
2812 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2813 {
2814 	struct abs_timeout timo;
2815 	struct umtx_q *uq;
2816 	uint32_t flags;
2817 	int32_t state, oldstate;
2818 	int32_t blocked_writers;
2819 	int32_t blocked_readers;
2820 	int error, error1, rv;
2821 
2822 	uq = td->td_umtxq;
2823 	error = fueword32(&rwlock->rw_flags, &flags);
2824 	if (error == -1)
2825 		return (EFAULT);
2826 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2827 	if (error != 0)
2828 		return (error);
2829 
2830 	if (timeout != NULL)
2831 		abs_timeout_init2(&timo, timeout);
2832 
2833 	blocked_readers = 0;
2834 	for (;;) {
2835 		rv = fueword32(&rwlock->rw_state, &state);
2836 		if (rv == -1) {
2837 			umtx_key_release(&uq->uq_key);
2838 			return (EFAULT);
2839 		}
2840 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2841 			rv = casueword32(&rwlock->rw_state, state,
2842 			    &oldstate, state | URWLOCK_WRITE_OWNER);
2843 			if (rv == -1) {
2844 				umtx_key_release(&uq->uq_key);
2845 				return (EFAULT);
2846 			}
2847 			if (oldstate == state) {
2848 				umtx_key_release(&uq->uq_key);
2849 				return (0);
2850 			}
2851 			state = oldstate;
2852 			error = umtxq_check_susp(td);
2853 			if (error != 0)
2854 				break;
2855 		}
2856 
2857 		if (error) {
2858 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2859 			    blocked_readers != 0) {
2860 				umtxq_lock(&uq->uq_key);
2861 				umtxq_busy(&uq->uq_key);
2862 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2863 				umtxq_unbusy(&uq->uq_key);
2864 				umtxq_unlock(&uq->uq_key);
2865 			}
2866 
2867 			break;
2868 		}
2869 
2870 		/* grab monitor lock */
2871 		umtxq_lock(&uq->uq_key);
2872 		umtxq_busy(&uq->uq_key);
2873 		umtxq_unlock(&uq->uq_key);
2874 
2875 		/*
2876 		 * re-read the state, in case it changed between the try-lock above
2877 		 * and the check below
2878 		 */
2879 		rv = fueword32(&rwlock->rw_state, &state);
2880 		if (rv == -1)
2881 			error = EFAULT;
2882 
2883 		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
2884 		    URWLOCK_READER_COUNT(state) != 0) &&
2885 		    (state & URWLOCK_WRITE_WAITERS) == 0) {
2886 			rv = casueword32(&rwlock->rw_state, state,
2887 			    &oldstate, state | URWLOCK_WRITE_WAITERS);
2888 			if (rv == -1) {
2889 				error = EFAULT;
2890 				break;
2891 			}
2892 			if (oldstate == state)
2893 				goto sleep;
2894 			state = oldstate;
2895 			error = umtxq_check_susp(td);
2896 			if (error != 0)
2897 				break;
2898 		}
2899 		if (error != 0) {
2900 			umtxq_unbusy_unlocked(&uq->uq_key);
2901 			break;
2902 		}
2903 
2904 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2905 			umtxq_unbusy_unlocked(&uq->uq_key);
2906 			error = umtxq_check_susp(td);
2907 			if (error != 0)
2908 				break;
2909 			continue;
2910 		}
2911 sleep:
2912 		rv = fueword32(&rwlock->rw_blocked_writers,
2913 		    &blocked_writers);
2914 		if (rv == -1) {
2915 			umtxq_unbusy_unlocked(&uq->uq_key);
2916 			error = EFAULT;
2917 			break;
2918 		}
2919 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2920 
2921 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2922 			umtxq_lock(&uq->uq_key);
2923 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2924 			umtxq_unbusy(&uq->uq_key);
2925 
2926 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2927 			    NULL : &timo);
2928 
2929 			umtxq_busy(&uq->uq_key);
2930 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2931 			umtxq_unlock(&uq->uq_key);
2932 			if (error)
2933 				break;
2934 			rv = fueword32(&rwlock->rw_state, &state);
2935 			if (rv == -1) {
2936 				error = EFAULT;
2937 				break;
2938 			}
2939 		}
2940 
2941 		rv = fueword32(&rwlock->rw_blocked_writers,
2942 		    &blocked_writers);
2943 		if (rv == -1) {
2944 			umtxq_unbusy_unlocked(&uq->uq_key);
2945 			error = EFAULT;
2946 			break;
2947 		}
2948 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2949 		if (blocked_writers == 1) {
2950 			rv = fueword32(&rwlock->rw_state, &state);
2951 			if (rv == -1) {
2952 				umtxq_unbusy_unlocked(&uq->uq_key);
2953 				error = EFAULT;
2954 				break;
2955 			}
2956 			for (;;) {
2957 				rv = casueword32(&rwlock->rw_state, state,
2958 				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
2959 				if (rv == -1) {
2960 					error = EFAULT;
2961 					break;
2962 				}
2963 				if (oldstate == state)
2964 					break;
2965 				state = oldstate;
2966 				error1 = umtxq_check_susp(td);
2967 				/*
2968 				 * We are leaving the URWLOCK_WRITE_WAITERS
2969 				 * flag behind, but this should not harm
2970 				 * correctness.
2971 				 */
2972 				if (error1 != 0) {
2973 					if (error == 0)
2974 						error = error1;
2975 					break;
2976 				}
2977 			}
2978 			rv = fueword32(&rwlock->rw_blocked_readers,
2979 			    &blocked_readers);
2980 			if (rv == -1) {
2981 				umtxq_unbusy_unlocked(&uq->uq_key);
2982 				error = EFAULT;
2983 				break;
2984 			}
2985 		} else
2986 			blocked_readers = 0;
2987 
2988 		umtxq_unbusy_unlocked(&uq->uq_key);
2989 	}
2990 
2991 	umtx_key_release(&uq->uq_key);
2992 	if (error == ERESTART)
2993 		error = EINTR;
2994 	return (error);
2995 }
2996 
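/*
 * Release a userland rwlock held for reading or writing, and wake
 * either one blocked writer or all blocked readers, honoring the
 * lock's reader/writer preference.
 */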
2997 static int
2998 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2999 {
3000 	struct umtx_q *uq;
3001 	uint32_t flags;
3002 	int32_t state, oldstate;
3003 	int error, rv, q, count;
3004 
3005 	uq = td->td_umtxq;
3006 	error = fueword32(&rwlock->rw_flags, &flags);
3007 	if (error == -1)
3008 		return (EFAULT);
3009 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
3010 	if (error != 0)
3011 		return (error);
3012 
3013 	error = fueword32(&rwlock->rw_state, &state);
3014 	if (error == -1) {
3015 		error = EFAULT;
3016 		goto out;
3017 	}
3018 	if (state & URWLOCK_WRITE_OWNER) {
3019 		for (;;) {
3020 			rv = casueword32(&rwlock->rw_state, state,
3021 			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
3022 			if (rv == -1) {
3023 				error = EFAULT;
3024 				goto out;
3025 			}
3026 			if (oldstate != state) {
3027 				state = oldstate;
3028 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
3029 					error = EPERM;
3030 					goto out;
3031 				}
3032 				error = umtxq_check_susp(td);
3033 				if (error != 0)
3034 					goto out;
3035 			} else
3036 				break;
3037 		}
3038 	} else if (URWLOCK_READER_COUNT(state) != 0) {
3039 		for (;;) {
3040 			rv = casueword32(&rwlock->rw_state, state,
3041 			    &oldstate, state - 1);
3042 			if (rv == -1) {
3043 				error = EFAULT;
3044 				goto out;
3045 			}
3046 			if (oldstate != state) {
3047 				state = oldstate;
3048 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
3049 					error = EPERM;
3050 					goto out;
3051 				}
3052 				error = umtxq_check_susp(td);
3053 				if (error != 0)
3054 					goto out;
3055 			} else
3056 				break;
3057 		}
3058 	} else {
3059 		error = EPERM;
3060 		goto out;
3061 	}
3062 
3063 	count = 0;
3064 
3065 	if (!(flags & URWLOCK_PREFER_READER)) {
3066 		if (state & URWLOCK_WRITE_WAITERS) {
3067 			count = 1;
3068 			q = UMTX_EXCLUSIVE_QUEUE;
3069 		} else if (state & URWLOCK_READ_WAITERS) {
3070 			count = INT_MAX;
3071 			q = UMTX_SHARED_QUEUE;
3072 		}
3073 	} else {
3074 		if (state & URWLOCK_READ_WAITERS) {
3075 			count = INT_MAX;
3076 			q = UMTX_SHARED_QUEUE;
3077 		} else if (state & URWLOCK_WRITE_WAITERS) {
3078 			count = 1;
3079 			q = UMTX_EXCLUSIVE_QUEUE;
3080 		}
3081 	}
3082 
3083 	if (count) {
3084 		umtxq_lock(&uq->uq_key);
3085 		umtxq_busy(&uq->uq_key);
3086 		umtxq_signal_queue(&uq->uq_key, count, q);
3087 		umtxq_unbusy(&uq->uq_key);
3088 		umtxq_unlock(&uq->uq_key);
3089 	}
3090 out:
3091 	umtx_key_release(&uq->uq_key);
3092 	return (error);
3093 }
3094 
3095 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
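/*
 * Wait on an old-style (struct _usem) userland semaphore; retained
 * for COMPAT_FREEBSD9/COMPAT_FREEBSD10 binaries.
 */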
3096 static int
3097 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
3098 {
3099 	struct abs_timeout timo;
3100 	struct umtx_q *uq;
3101 	uint32_t flags, count, count1;
3102 	int error, rv;
3103 
3104 	uq = td->td_umtxq;
3105 	error = fueword32(&sem->_flags, &flags);
3106 	if (error == -1)
3107 		return (EFAULT);
3108 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3109 	if (error != 0)
3110 		return (error);
3111 
3112 	if (timeout != NULL)
3113 		abs_timeout_init2(&timo, timeout);
3114 
3115 	umtxq_lock(&uq->uq_key);
3116 	umtxq_busy(&uq->uq_key);
3117 	umtxq_insert(uq);
3118 	umtxq_unlock(&uq->uq_key);
3119 	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
3120 	if (rv == 0)
3121 		rv = fueword32(&sem->_count, &count);
3122 	if (rv == -1 || count != 0) {
3123 		umtxq_lock(&uq->uq_key);
3124 		umtxq_unbusy(&uq->uq_key);
3125 		umtxq_remove(uq);
3126 		umtxq_unlock(&uq->uq_key);
3127 		umtx_key_release(&uq->uq_key);
3128 		return (rv == -1 ? EFAULT : 0);
3129 	}
3130 	umtxq_lock(&uq->uq_key);
3131 	umtxq_unbusy(&uq->uq_key);
3132 
3133 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3134 
3135 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3136 		error = 0;
3137 	else {
3138 		umtxq_remove(uq);
3139 		/* A relative timeout cannot be restarted. */
3140 		if (error == ERESTART && timeout != NULL &&
3141 		    (timeout->_flags & UMTX_ABSTIME) == 0)
3142 			error = EINTR;
3143 	}
3144 	umtxq_unlock(&uq->uq_key);
3145 	umtx_key_release(&uq->uq_key);
3146 	return (error);
3147 }
3148 
3149 /*
3150  * Signal a userland semaphore.
3151  */
3152 static int
3153 do_sem_wake(struct thread *td, struct _usem *sem)
3154 {
3155 	struct umtx_key key;
3156 	int error, cnt;
3157 	uint32_t flags;
3158 
3159 	error = fueword32(&sem->_flags, &flags);
3160 	if (error == -1)
3161 		return (EFAULT);
3162 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3163 		return (error);
3164 	umtxq_lock(&key);
3165 	umtxq_busy(&key);
3166 	cnt = umtxq_count(&key);
3167 	if (cnt > 0) {
3168 		/*
3169 		 * The count is greater than 0, which means the memory is
3170 		 * still being referenced by user code, so the _has_waiters
3171 		 * flag can be safely updated.
3172 		 */
3173 		if (cnt == 1) {
3174 			umtxq_unlock(&key);
3175 			error = suword32(&sem->_has_waiters, 0);
3176 			umtxq_lock(&key);
3177 			if (error == -1)
3178 				error = EFAULT;
3179 		}
3180 		umtxq_signal(&key, 1);
3181 	}
3182 	umtxq_unbusy(&key);
3183 	umtxq_unlock(&key);
3184 	umtx_key_release(&key);
3185 	return (error);
3186 }
3187 #endif
3188 
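/*
 * Wait on a struct _usem2 semaphore.  The USEM_HAS_WAITERS bit of
 * _count is set before sleeping so that do_sem2_wake() knows it must
 * enter the kernel; if a relative sleep is interrupted, the time
 * remaining is written back to *timeout for the caller.
 */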
3189 static int
3190 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
3191 {
3192 	struct abs_timeout timo;
3193 	struct umtx_q *uq;
3194 	uint32_t count, flags;
3195 	int error, rv;
3196 
3197 	uq = td->td_umtxq;
3198 	rv = fueword32(&sem->_flags, &flags);
	if (rv == -1)
		return (EFAULT);
3199 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3200 	if (error != 0)
3201 		return (error);
3202 
3203 	if (timeout != NULL)
3204 		abs_timeout_init2(&timo, timeout);
3205 
3206 	umtxq_lock(&uq->uq_key);
3207 	umtxq_busy(&uq->uq_key);
3208 	umtxq_insert(uq);
3209 	umtxq_unlock(&uq->uq_key);
3210 	rv = fueword32(&sem->_count, &count);
3211 	if (rv == -1) {
3212 		umtxq_lock(&uq->uq_key);
3213 		umtxq_unbusy(&uq->uq_key);
3214 		umtxq_remove(uq);
3215 		umtxq_unlock(&uq->uq_key);
3216 		umtx_key_release(&uq->uq_key);
3217 		return (EFAULT);
3218 	}
3219 	for (;;) {
3220 		if (USEM_COUNT(count) != 0) {
3221 			umtxq_lock(&uq->uq_key);
3222 			umtxq_unbusy(&uq->uq_key);
3223 			umtxq_remove(uq);
3224 			umtxq_unlock(&uq->uq_key);
3225 			umtx_key_release(&uq->uq_key);
3226 			return (0);
3227 		}
3228 		if (count == USEM_HAS_WAITERS)
3229 			break;
3230 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
3231 		if (rv == -1) {
3232 			umtxq_lock(&uq->uq_key);
3233 			umtxq_unbusy(&uq->uq_key);
3234 			umtxq_remove(uq);
3235 			umtxq_unlock(&uq->uq_key);
3236 			umtx_key_release(&uq->uq_key);
3237 			return (EFAULT);
3238 		}
3239 		if (count == 0)
3240 			break;
3241 	}
3242 	umtxq_lock(&uq->uq_key);
3243 	umtxq_unbusy(&uq->uq_key);
3244 
3245 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3246 
3247 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3248 		error = 0;
3249 	else {
3250 		umtxq_remove(uq);
3251 		if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) {
3252 			/* A relative timeout cannot be restarted. */
3253 			if (error == ERESTART)
3254 				error = EINTR;
3255 			if (error == EINTR) {
3256 				abs_timeout_update(&timo);
3257 				timeout->_timeout = timo.end;
3258 				timespecsub(&timeout->_timeout, &timo.cur);
3259 			}
3260 		}
3261 	}
3262 	umtxq_unlock(&uq->uq_key);
3263 	umtx_key_release(&uq->uq_key);
3264 	return (error);
3265 }
3266 
3267 /*
3268  * Signal a userland semaphore.
3269  */
3270 static int
3271 do_sem2_wake(struct thread *td, struct _usem2 *sem)
3272 {
3273 	struct umtx_key key;
3274 	int error, cnt, rv;
3275 	uint32_t count, flags;
3276 
3277 	rv = fueword32(&sem->_flags, &flags);
3278 	if (rv == -1)
3279 		return (EFAULT);
3280 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3281 		return (error);
3282 	umtxq_lock(&key);
3283 	umtxq_busy(&key);
3284 	cnt = umtxq_count(&key);
3285 	if (cnt > 0) {
3286 		/*
3287 		 * If this was the last sleeping thread, clear the waiters
3288 		 * flag in _count.
3289 		 */
3290 		if (cnt == 1) {
3291 			umtxq_unlock(&key);
3292 			rv = fueword32(&sem->_count, &count);
3293 			while (rv != -1 && count & USEM_HAS_WAITERS)
3294 				rv = casueword32(&sem->_count, count, &count,
3295 				    count & ~USEM_HAS_WAITERS);
3296 			if (rv == -1)
3297 				error = EFAULT;
3298 			umtxq_lock(&key);
3299 		}
3300 
3301 		umtxq_signal(&key, 1);
3302 	}
3303 	umtxq_unbusy(&key);
3304 	umtxq_unlock(&key);
3305 	umtx_key_release(&key);
3306 	return (error);
3307 }
3308 
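/*
 * Copy in a timespec from userland and check it for validity.
 */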
3309 inline int
3310 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3311 {
3312 	int error;
3313 
3314 	error = copyin(addr, tsp, sizeof(struct timespec));
3315 	if (error == 0) {
3316 		if (tsp->tv_sec < 0 ||
3317 		    tsp->tv_nsec >= 1000000000 ||
3318 		    tsp->tv_nsec < 0)
3319 			error = EINVAL;
3320 	}
3321 	return (error);
3322 }
3323 
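/*
 * Copy in a struct _umtx_time from userland.  For compatibility,
 * callers that pass only a struct timespec get CLOCK_REALTIME and
 * no flags.
 */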
3324 static inline int
3325 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3326 {
3327 	int error;
3328 
3329 	if (size <= sizeof(struct timespec)) {
3330 		tp->_clockid = CLOCK_REALTIME;
3331 		tp->_flags = 0;
3332 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3333 	} else
3334 		error = copyin(addr, tp, sizeof(struct _umtx_time));
3335 	if (error != 0)
3336 		return (error);
3337 	if (tp->_timeout.tv_sec < 0 ||
3338 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3339 		return (EINVAL);
3340 	return (0);
3341 }
3342 
3343 static int
3344 __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
3345 {
3346 
3347 	return (EOPNOTSUPP);
3348 }
3349 
3350 static int
3351 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3352 {
3353 	struct _umtx_time timeout, *tm_p;
3354 	int error;
3355 
3356 	if (uap->uaddr2 == NULL)
3357 		tm_p = NULL;
3358 	else {
3359 		error = umtx_copyin_umtx_time(
3360 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3361 		if (error != 0)
3362 			return (error);
3363 		tm_p = &timeout;
3364 	}
3365 	return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0));
3366 }
3367 
3368 static int
3369 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3370 {
3371 	struct _umtx_time timeout, *tm_p;
3372 	int error;
3373 
3374 	if (uap->uaddr2 == NULL)
3375 		tm_p = NULL;
3376 	else {
3377 		error = umtx_copyin_umtx_time(
3378 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3379 		if (error != 0)
3380 			return (error);
3381 		tm_p = &timeout;
3382 	}
3383 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
3384 }
3385 
3386 static int
3387 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3388 {
3389 	struct _umtx_time *tm_p, timeout;
3390 	int error;
3391 
3392 	if (uap->uaddr2 == NULL)
3393 		tm_p = NULL;
3394 	else {
3395 		error = umtx_copyin_umtx_time(
3396 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3397 		if (error != 0)
3398 			return (error);
3399 		tm_p = &timeout;
3400 	}
3401 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
3402 }
3403 
3404 static int
3405 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3406 {
3407 
3408 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3409 }
3410 
3411 #define BATCH_SIZE	128
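/*
 * Wake all waiters at each address in a user-supplied array of
 * private umtx addresses, copying the pointers in from userland
 * BATCH_SIZE at a time.
 */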
3412 static int
3413 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3414 {
3415 	char *uaddrs[BATCH_SIZE], **upp;
3416 	int count, error, i, pos, tocopy;
3417 
3418 	upp = (char **)uap->obj;
3419 	error = 0;
3420 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
3421 	    pos += tocopy) {
3422 		tocopy = MIN(count, BATCH_SIZE);
3423 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
3424 		if (error != 0)
3425 			break;
3426 		for (i = 0; i < tocopy; ++i)
3427 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3428 		maybe_yield();
3429 	}
3430 	return (error);
3431 }
3432 
3433 static int
3434 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3435 {
3436 
3437 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3438 }
3439 
3440 static int
3441 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3442 {
3443 	struct _umtx_time *tm_p, timeout;
3444 	int error;
3445 
3446 	/* Allow a null timespec (wait forever). */
3447 	if (uap->uaddr2 == NULL)
3448 		tm_p = NULL;
3449 	else {
3450 		error = umtx_copyin_umtx_time(
3451 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3452 		if (error != 0)
3453 			return (error);
3454 		tm_p = &timeout;
3455 	}
3456 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
3457 }
3458 
3459 static int
3460 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3461 {
3462 
3463 	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3464 }
3465 
3466 static int
3467 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3468 {
3469 	struct _umtx_time *tm_p, timeout;
3470 	int error;
3471 
3472 	/* Allow a null timespec (wait forever). */
3473 	if (uap->uaddr2 == NULL)
3474 		tm_p = NULL;
3475 	else {
3476 		error = umtx_copyin_umtx_time(
3477 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3478 		if (error != 0)
3479 			return (error);
3480 		tm_p = &timeout;
3481 	}
3482 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
3483 }
3484 
3485 static int
3486 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3487 {
3488 
3489 	return (do_wake_umutex(td, uap->obj));
3490 }
3491 
3492 static int
3493 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3494 {
3495 
3496 	return (do_unlock_umutex(td, uap->obj, false));
3497 }
3498 
3499 static int
3500 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3501 {
3502 
3503 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3504 }
3505 
3506 static int
3507 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3508 {
3509 	struct timespec *ts, timeout;
3510 	int error;
3511 
3512 	/* Allow a null timespec (wait forever). */
3513 	if (uap->uaddr2 == NULL)
3514 		ts = NULL;
3515 	else {
3516 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3517 		if (error != 0)
3518 			return (error);
3519 		ts = &timeout;
3520 	}
3521 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3522 }
3523 
3524 static int
3525 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3526 {
3527 
3528 	return (do_cv_signal(td, uap->obj));
3529 }
3530 
3531 static int
3532 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3533 {
3534 
3535 	return (do_cv_broadcast(td, uap->obj));
3536 }
3537 
3538 static int
3539 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3540 {
3541 	struct _umtx_time timeout;
3542 	int error;
3543 
3544 	/* Allow a null timespec (wait forever). */
3545 	if (uap->uaddr2 == NULL) {
3546 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3547 	} else {
3548 		error = umtx_copyin_umtx_time(uap->uaddr2,
3549 		   (size_t)uap->uaddr1, &timeout);
3550 		if (error != 0)
3551 			return (error);
3552 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3553 	}
3554 	return (error);
3555 }
3556 
3557 static int
3558 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3559 {
3560 	struct _umtx_time timeout;
3561 	int error;
3562 
3563 	/* Allow a null timespec (wait forever). */
3564 	if (uap->uaddr2 == NULL) {
3565 		error = do_rw_wrlock(td, uap->obj, 0);
3566 	} else {
3567 		error = umtx_copyin_umtx_time(uap->uaddr2,
3568 		   (size_t)uap->uaddr1, &timeout);
3569 		if (error != 0)
3570 			return (error);
3571 
3572 		error = do_rw_wrlock(td, uap->obj, &timeout);
3573 	}
3574 	return (error);
3575 }
3576 
3577 static int
3578 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3579 {
3580 
3581 	return (do_rw_unlock(td, uap->obj));
3582 }
3583 
3584 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3585 static int
3586 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3587 {
3588 	struct _umtx_time *tm_p, timeout;
3589 	int error;
3590 
3591 	/* Allow a null timespec (wait forever). */
3592 	if (uap->uaddr2 == NULL)
3593 		tm_p = NULL;
3594 	else {
3595 		error = umtx_copyin_umtx_time(
3596 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3597 		if (error != 0)
3598 			return (error);
3599 		tm_p = &timeout;
3600 	}
3601 	return (do_sem_wait(td, uap->obj, tm_p));
3602 }
3603 
3604 static int
3605 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3606 {
3607 
3608 	return (do_sem_wake(td, uap->obj));
3609 }
3610 #endif
3611 
3612 static int
3613 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3614 {
3615 
3616 	return (do_wake2_umutex(td, uap->obj, uap->val));
3617 }
3618 
3619 static int
3620 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
3621 {
3622 	struct _umtx_time *tm_p, timeout;
3623 	size_t uasize;
3624 	int error;
3625 
3626 	/* Allow a null timespec (wait forever). */
3627 	if (uap->uaddr2 == NULL) {
3628 		uasize = 0;
3629 		tm_p = NULL;
3630 	} else {
3631 		uasize = (size_t)uap->uaddr1;
3632 		error = umtx_copyin_umtx_time(uap->uaddr2, uasize, &timeout);
3633 		if (error != 0)
3634 			return (error);
3635 		tm_p = &timeout;
3636 	}
3637 	error = do_sem2_wait(td, uap->obj, tm_p);
3638 	if (error == EINTR && uap->uaddr2 != NULL &&
3639 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
3640 	    uasize >= sizeof(struct _umtx_time) + sizeof(struct timespec)) {
3641 		error = copyout(&timeout._timeout,
3642 		    (struct _umtx_time *)uap->uaddr2 + 1,
3643 		    sizeof(struct timespec));
3644 		if (error == 0) {
3645 			error = EINTR;
3646 		}
3647 	}
3648 
3649 	return (error);
3650 }
3651 
3652 static int
3653 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
3654 {
3655 
3656 	return (do_sem2_wake(td, uap->obj));
3657 }
3658 
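/*
 * Support for UMTX_OP_SHM: a registry of anonymous POSIX shared
 * memory objects, keyed by the umtx key of a userland address and
 * created on demand.
 */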
3659 #define	USHM_OBJ_UMTX(o)						\
3660     ((struct umtx_shm_obj_list *)(&(o)->umtx_data))
3661 
3662 #define	USHMF_REG_LINKED	0x0001
3663 #define	USHMF_OBJ_LINKED	0x0002
3664 struct umtx_shm_reg {
3665 	TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link;
3666 	LIST_ENTRY(umtx_shm_reg) ushm_obj_link;
3667 	struct umtx_key		ushm_key;
3668 	struct ucred		*ushm_cred;
3669 	struct shmfd		*ushm_obj;
3670 	u_int			ushm_refcnt;
3671 	u_int			ushm_flags;
3672 };
3673 
3674 LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg);
3675 TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg);
3676 
3677 static uma_zone_t umtx_shm_reg_zone;
3678 static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS];
3679 static struct mtx umtx_shm_lock;
3680 static struct umtx_shm_reg_head umtx_shm_reg_delfree =
3681     TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree);
3682 
3683 static void umtx_shm_free_reg(struct umtx_shm_reg *reg);
3684 
3685 static void
3686 umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused)
3687 {
3688 	struct umtx_shm_reg_head d;
3689 	struct umtx_shm_reg *reg, *reg1;
3690 
3691 	TAILQ_INIT(&d);
3692 	mtx_lock(&umtx_shm_lock);
3693 	TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link);
3694 	mtx_unlock(&umtx_shm_lock);
3695 	TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) {
3696 		TAILQ_REMOVE(&d, reg, ushm_reg_link);
3697 		umtx_shm_free_reg(reg);
3698 	}
3699 }
3700 
3701 static struct task umtx_shm_reg_delfree_task =
3702     TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL);
3703 
3704 static struct umtx_shm_reg *
3705 umtx_shm_find_reg_locked(const struct umtx_key *key)
3706 {
3707 	struct umtx_shm_reg *reg;
3708 	struct umtx_shm_reg_head *reg_head;
3709 
3710 	KASSERT(key->shared, ("umtx_p_find_rg: private key"));
3711 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3712 	reg_head = &umtx_shm_registry[key->hash];
3713 	TAILQ_FOREACH(reg, reg_head, ushm_reg_link) {
3714 		KASSERT(reg->ushm_key.shared,
3715 		    ("non-shared key on reg %p %d", reg, reg->ushm_key.shared));
3716 		if (reg->ushm_key.info.shared.object ==
3717 		    key->info.shared.object &&
3718 		    reg->ushm_key.info.shared.offset ==
3719 		    key->info.shared.offset) {
3720 			KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM"));
3721 			KASSERT(reg->ushm_refcnt > 0,
3722 			    ("reg %p refcnt 0 onlist", reg));
3723 			KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0,
3724 			    ("reg %p not linked", reg));
3725 			reg->ushm_refcnt++;
3726 			return (reg);
3727 		}
3728 	}
3729 	return (NULL);
3730 }
3731 
3732 static struct umtx_shm_reg *
3733 umtx_shm_find_reg(const struct umtx_key *key)
3734 {
3735 	struct umtx_shm_reg *reg;
3736 
3737 	mtx_lock(&umtx_shm_lock);
3738 	reg = umtx_shm_find_reg_locked(key);
3739 	mtx_unlock(&umtx_shm_lock);
3740 	return (reg);
3741 }
3742 
3743 static void
3744 umtx_shm_free_reg(struct umtx_shm_reg *reg)
3745 {
3746 
3747 	chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0);
3748 	crfree(reg->ushm_cred);
3749 	shm_drop(reg->ushm_obj);
3750 	uma_zfree(umtx_shm_reg_zone, reg);
3751 }
3752 
3753 static bool
3754 umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force)
3755 {
3756 	bool res;
3757 
3758 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3759 	KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg));
3760 	reg->ushm_refcnt--;
3761 	res = reg->ushm_refcnt == 0;
3762 	if (res || force) {
3763 		if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) {
3764 			TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash],
3765 			    reg, ushm_reg_link);
3766 			reg->ushm_flags &= ~USHMF_REG_LINKED;
3767 		}
3768 		if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) {
3769 			LIST_REMOVE(reg, ushm_obj_link);
3770 			reg->ushm_flags &= ~USHMF_OBJ_LINKED;
3771 		}
3772 	}
3773 	return (res);
3774 }
3775 
3776 static void
3777 umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force)
3778 {
3779 	vm_object_t object;
3780 	bool dofree;
3781 
3782 	if (force) {
3783 		object = reg->ushm_obj->shm_object;
3784 		VM_OBJECT_WLOCK(object);
3785 		object->flags |= OBJ_UMTXDEAD;
3786 		VM_OBJECT_WUNLOCK(object);
3787 	}
3788 	mtx_lock(&umtx_shm_lock);
3789 	dofree = umtx_shm_unref_reg_locked(reg, force);
3790 	mtx_unlock(&umtx_shm_lock);
3791 	if (dofree)
3792 		umtx_shm_free_reg(reg);
3793 }
3794 
3795 void
3796 umtx_shm_object_init(vm_object_t object)
3797 {
3798 
3799 	LIST_INIT(USHM_OBJ_UMTX(object));
3800 }
3801 
3802 void
3803 umtx_shm_object_terminated(vm_object_t object)
3804 {
3805 	struct umtx_shm_reg *reg, *reg1;
3806 	bool dofree;
3807 
3808 	dofree = false;
3809 	mtx_lock(&umtx_shm_lock);
3810 	LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) {
3811 		if (umtx_shm_unref_reg_locked(reg, true)) {
3812 			TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg,
3813 			    ushm_reg_link);
3814 			dofree = true;
3815 		}
3816 	}
3817 	mtx_unlock(&umtx_shm_lock);
3818 	if (dofree)
3819 		taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task);
3820 }
3821 
3822 static int
3823 umtx_shm_create_reg(struct thread *td, const struct umtx_key *key,
3824     struct umtx_shm_reg **res)
3825 {
3826 	struct umtx_shm_reg *reg, *reg1;
3827 	struct ucred *cred;
3828 	int error;
3829 
3830 	reg = umtx_shm_find_reg(key);
3831 	if (reg != NULL) {
3832 		*res = reg;
3833 		return (0);
3834 	}
3835 	cred = td->td_ucred;
3836 	if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP)))
3837 		return (ENOMEM);
3838 	reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
3839 	reg->ushm_refcnt = 1;
3840 	bcopy(key, &reg->ushm_key, sizeof(*key));
3841 	reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
3842 	reg->ushm_cred = crhold(cred);
3843 	error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
3844 	if (error != 0) {
3845 		umtx_shm_free_reg(reg);
3846 		return (error);
3847 	}
3848 	mtx_lock(&umtx_shm_lock);
3849 	reg1 = umtx_shm_find_reg_locked(key);
3850 	if (reg1 != NULL) {
3851 		mtx_unlock(&umtx_shm_lock);
3852 		umtx_shm_free_reg(reg);
3853 		*res = reg1;
3854 		return (0);
3855 	}
3856 	reg->ushm_refcnt++;
3857 	TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link);
3858 	LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg,
3859 	    ushm_obj_link);
3860 	reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED;
3861 	mtx_unlock(&umtx_shm_lock);
3862 	*res = reg;
3863 	return (0);
3864 }
3865 
3866 static int
3867 umtx_shm_alive(struct thread *td, void *addr)
3868 {
3869 	vm_map_t map;
3870 	vm_map_entry_t entry;
3871 	vm_object_t object;
3872 	vm_pindex_t pindex;
3873 	vm_prot_t prot;
3874 	int res, ret;
3875 	boolean_t wired;
3876 
3877 	map = &td->td_proc->p_vmspace->vm_map;
3878 	res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry,
3879 	    &object, &pindex, &prot, &wired);
3880 	if (res != KERN_SUCCESS)
3881 		return (EFAULT);
3882 	if (object == NULL)
3883 		ret = EINVAL;
3884 	else
3885 		ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0;
3886 	vm_map_lookup_done(map, entry);
3887 	return (ret);
3888 }
3889 
3890 static void
3891 umtx_shm_init(void)
3892 {
3893 	int i;
3894 
3895 	umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg),
3896 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
3897 	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
3898 	for (i = 0; i < nitems(umtx_shm_registry); i++)
3899 		TAILQ_INIT(&umtx_shm_registry[i]);
3900 }
3901 
3902 static int
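/*
 * Implement UMTX_OP_SHM: create, look up, destroy, or probe the
 * shared memory object registered for the given userland address.
 * Create and lookup return a file descriptor referencing the object
 * in td_retval[0].
 */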
3903 umtx_shm(struct thread *td, void *addr, u_int flags)
3904 {
3905 	struct umtx_key key;
3906 	struct umtx_shm_reg *reg;
3907 	struct file *fp;
3908 	int error, fd;
3909 
3910 	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
3911 	    UMTX_SHM_DESTROY | UMTX_SHM_ALIVE)) != 1)
3912 		return (EINVAL);
3913 	if ((flags & UMTX_SHM_ALIVE) != 0)
3914 		return (umtx_shm_alive(td, addr));
3915 	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
3916 	if (error != 0)
3917 		return (error);
3918 	KASSERT(key.shared == 1, ("non-shared key"));
3919 	if ((flags & UMTX_SHM_CREAT) != 0) {
3920 		error = umtx_shm_create_reg(td, &key, &reg);
3921 	} else {
3922 		reg = umtx_shm_find_reg(&key);
3923 		if (reg == NULL)
3924 			error = ESRCH;
3925 	}
3926 	umtx_key_release(&key);
3927 	if (error != 0)
3928 		return (error);
3929 	KASSERT(reg != NULL, ("no reg"));
3930 	if ((flags & UMTX_SHM_DESTROY) != 0) {
3931 		umtx_shm_unref_reg(reg, true);
3932 	} else {
3933 #if 0
3934 #ifdef MAC
3935 		error = mac_posixshm_check_open(td->td_ucred,
3936 		    reg->ushm_obj, FFLAGS(O_RDWR));
3937 		if (error == 0)
3938 #endif
3939 			error = shm_access(reg->ushm_obj, td->td_ucred,
3940 			    FFLAGS(O_RDWR));
3941 		if (error == 0)
3942 #endif
3943 			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
3944 		if (error == 0) {
3945 			shm_hold(reg->ushm_obj);
3946 			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
3947 			    &shm_ops);
3948 			td->td_retval[0] = fd;
3949 			fdrop(fp, td);
3950 		}
3951 	}
3952 	umtx_shm_unref_reg(reg, false);
3953 	return (error);
3954 }
3955 
3956 static int
3957 __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap)
3958 {
3959 
3960 	return (umtx_shm(td, uap->uaddr1, uap->val));
3961 }
3962 
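/*
 * Record the userland addresses of the calling thread's robust mutex
 * lists; they are consulted at thread exit so that mutexes still
 * held by the thread can be marked owner-dead.
 */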
3963 static int
3964 umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp)
3965 {
3966 
3967 	td->td_rb_list = rbp->robust_list_offset;
3968 	td->td_rbp_list = rbp->robust_priv_list_offset;
3969 	td->td_rb_inact = rbp->robust_inact_offset;
3970 	return (0);
3971 }
3972 
3973 static int
3974 __umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap)
3975 {
3976 	struct umtx_robust_lists_params rb;
3977 	int error;
3978 
3979 	if (uap->val > sizeof(rb))
3980 		return (EINVAL);
3981 	bzero(&rb, sizeof(rb));
3982 	error = copyin(uap->uaddr1, &rb, uap->val);
3983 	if (error != 0)
3984 		return (error);
3985 	return (umtx_robust_lists(td, &rb));
3986 }
3987 
3988 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3989 
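/*
 * Dispatch table for the _umtx_op(2) system call, indexed by the op
 * argument.
 */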
3990 static const _umtx_op_func op_table[] = {
3991 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
3992 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
3993 	[UMTX_OP_WAIT]		= __umtx_op_wait,
3994 	[UMTX_OP_WAKE]		= __umtx_op_wake,
3995 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
3996 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
3997 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
3998 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
3999 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
4000 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
4001 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
4002 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
4003 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
4004 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
4005 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4006 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
4007 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4008 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
4009 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4010 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4011 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
4012 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4013 #else
4014 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4015 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4016 #endif
4017 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
4018 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4019 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
4020 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4021 	[UMTX_OP_SHM]		= __umtx_op_shm,
4022 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists,
4023 };
4024 
4025 int
4026 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
4027 {
4028 
4029 	if ((unsigned)uap->op < nitems(op_table))
4030 		return (*op_table[uap->op])(td, uap);
4031 	return (EINVAL);
4032 }
4033 
4034 #ifdef COMPAT_FREEBSD32
4035 
4036 struct timespec32 {
4037 	int32_t tv_sec;
4038 	int32_t tv_nsec;
4039 };
4040 
4041 struct umtx_time32 {
4042 	struct	timespec32	timeout;
4043 	uint32_t		flags;
4044 	uint32_t		clockid;
4045 };
4046 
4047 static inline int
4048 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
4049 {
4050 	struct timespec32 ts32;
4051 	int error;
4052 
4053 	error = copyin(addr, &ts32, sizeof(struct timespec32));
4054 	if (error == 0) {
4055 		if (ts32.tv_sec < 0 ||
4056 		    ts32.tv_nsec >= 1000000000 ||
4057 		    ts32.tv_nsec < 0)
4058 			error = EINVAL;
4059 		else {
4060 			tsp->tv_sec = ts32.tv_sec;
4061 			tsp->tv_nsec = ts32.tv_nsec;
4062 		}
4063 	}
4064 	return (error);
4065 }
4066 
4067 static inline int
4068 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
4069 {
4070 	struct umtx_time32 t32;
4071 	int error;
4072 
4073 	t32.clockid = CLOCK_REALTIME;
4074 	t32.flags   = 0;
4075 	if (size <= sizeof(struct timespec32))
4076 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
4077 	else
4078 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
4079 	if (error != 0)
4080 		return (error);
4081 	if (t32.timeout.tv_sec < 0 ||
4082 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
4083 		return (EINVAL);
4084 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
4085 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
4086 	tp->_flags = t32.flags;
4087 	tp->_clockid = t32.clockid;
4088 	return (0);
4089 }
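
/*
 * The size passed via uap->uaddr1 selects between the two layouts
 * accepted above.  A hedged sketch of the 32-bit caller's side,
 * using the structure names defined in this file; "obj", "val" and
 * "deadline" are hypothetical:
 *
 *	// Bare timespec32: relative timeout on CLOCK_REALTIME.
 *	struct timespec32 ts32 = { .tv_sec = 1, .tv_nsec = 0 };
 *	_umtx_op(obj, UMTX_OP_WAIT_UINT, val,
 *	    (void *)sizeof(ts32), &ts32);
 *
 *	// Full umtx_time32: explicit clock plus UMTX_ABSTIME.
 *	struct umtx_time32 to = {
 *		.timeout = { .tv_sec = deadline, .tv_nsec = 0 },
 *		.flags = UMTX_ABSTIME,
 *		.clockid = CLOCK_MONOTONIC,
 *	};
 *	_umtx_op(obj, UMTX_OP_WAIT_UINT, val, (void *)sizeof(to), &to);
 */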
4090 
4091 static int
4092 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4093 {
4094 	struct _umtx_time *tm_p, timeout;
4095 	int error;
4096 
4097 	if (uap->uaddr2 == NULL)
4098 		tm_p = NULL;
4099 	else {
4100 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4101 		    (size_t)uap->uaddr1, &timeout);
4102 		if (error != 0)
4103 			return (error);
4104 		tm_p = &timeout;
4105 	}
4106 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
4107 }
4108 
4109 static int
4110 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
4111 {
4112 	struct _umtx_time *tm_p, timeout;
4113 	int error;
4114 
4115 	/* Allow a null timespec (wait forever). */
4116 	if (uap->uaddr2 == NULL)
4117 		tm_p = NULL;
4118 	else {
4119 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4120 		    (size_t)uap->uaddr1, &timeout);
4121 		if (error != 0)
4122 			return (error);
4123 		tm_p = &timeout;
4124 	}
4125 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
4126 }
4127 
4128 static int
4129 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
4130 {
4131 	struct _umtx_time *tm_p, timeout;
4132 	int error;
4133 
4134 	/* Allow a null timespec (wait forever). */
4135 	if (uap->uaddr2 == NULL)
4136 		tm_p = NULL;
4137 	else {
4138 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4139 		    (size_t)uap->uaddr1, &timeout);
4140 		if (error != 0)
4141 			return (error);
4142 		tm_p = &timeout;
4143 	}
4144 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
4145 }
4146 
4147 static int
4148 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4149 {
4150 	struct timespec *ts, timeout;
4151 	int error;
4152 
4153 	/* Allow a null timespec (wait forever). */
4154 	if (uap->uaddr2 == NULL)
4155 		ts = NULL;
4156 	else {
4157 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
4158 		if (error != 0)
4159 			return (error);
4160 		ts = &timeout;
4161 	}
4162 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
4163 }
4164 
4165 static int
4166 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
4167 {
4168 	struct _umtx_time timeout;
4169 	int error;
4170 
4171 	/* Allow a null timespec (wait forever). */
4172 	if (uap->uaddr2 == NULL) {
4173 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
4174 	} else {
4175 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4176 		    (size_t)uap->uaddr1, &timeout);
4177 		if (error != 0)
4178 			return (error);
4179 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
4180 	}
4181 	return (error);
4182 }
4183 
4184 static int
4185 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
4186 {
4187 	struct _umtx_time timeout;
4188 	int error;
4189 
4190 	/* Allow a null timespec (wait forever). */
4191 	if (uap->uaddr2 == NULL) {
4192 		error = do_rw_wrlock(td, uap->obj, 0);
4193 	} else {
4194 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4195 		    (size_t)uap->uaddr1, &timeout);
4196 		if (error != 0)
4197 			return (error);
4198 		error = do_rw_wrlock(td, uap->obj, &timeout);
4199 	}
4200 	return (error);
4201 }
4202 
4203 static int
4204 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
4205 {
4206 	struct _umtx_time *tm_p, timeout;
4207 	int error;
4208 
4209 	if (uap->uaddr2 == NULL)
4210 		tm_p = NULL;
4211 	else {
4212 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4213 		    (size_t)uap->uaddr1, &timeout);
4214 		if (error != 0)
4215 			return (error);
4216 		tm_p = &timeout;
4217 	}
4218 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
4219 }
4220 
4221 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4222 static int
4223 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4224 {
4225 	struct _umtx_time *tm_p, timeout;
4226 	int error;
4227 
4228 	/* Allow a null timespec (wait forever). */
4229 	if (uap->uaddr2 == NULL)
4230 		tm_p = NULL;
4231 	else {
4232 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4233 		    (size_t)uap->uaddr1, &timeout);
4234 		if (error != 0)
4235 			return (error);
4236 		tm_p = &timeout;
4237 	}
4238 	return (do_sem_wait(td, uap->obj, tm_p));
4239 }
4240 #endif
4241 
4242 static int
4243 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4244 {
4245 	struct _umtx_time *tm_p, timeout;
4246 	size_t uasize;
4247 	int error;
4248 
4249 	/* Allow a null timespec (wait forever). */
4250 	if (uap->uaddr2 == NULL) {
4251 		uasize = 0;
4252 		tm_p = NULL;
4253 	} else {
4254 		uasize = (size_t)uap->uaddr1;
4255 		error = umtx_copyin_umtx_time32(uap->uaddr2, uasize, &timeout);
4256 		if (error != 0)
4257 			return (error);
4258 		tm_p = &timeout;
4259 	}
4260 	error = do_sem2_wait(td, uap->obj, tm_p);
4261 	if (error == EINTR && uap->uaddr2 != NULL &&
4262 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
4263 	    uasize >= sizeof(struct umtx_time32) + sizeof(struct timespec32)) {
4264 		struct timespec32 remain32 = {
4265 			.tv_sec = timeout._timeout.tv_sec,
4266 			.tv_nsec = timeout._timeout.tv_nsec
4267 		};
4268 		error = copyout(&remain32,
4269 		    (struct umtx_time32 *)uap->uaddr2 + 1,
4270 		    sizeof(struct timespec32));
4271 		if (error == 0) {
4272 			error = EINTR;
4273 		}
4274 	}
4275 
4276 	return (error);
4277 }
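
/*
 * The EINTR path above implements a small protocol: when a relative
 * timeout is interrupted, the remaining time is copied out directly
 * after the caller's umtx_time32.  A hedged sketch of a 32-bit
 * caller that opts in by reserving the extra space ("sem" and "tmo"
 * are hypothetical):
 *
 *	struct {
 *		struct umtx_time32 to;		// relative timeout, in
 *		struct timespec32 remain;	// remaining time, out
 *	} tmo = { .to = { .timeout = { .tv_sec = 5 } } };
 *
 *	if (_umtx_op(sem, UMTX_OP_SEM2_WAIT, 0,
 *	    (void *)sizeof(tmo), &tmo) == -1 && errno == EINTR)
 *		; // tmo.remain now holds the unslept time
 */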
4278 
4279 static int
4280 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
4281 {
4282 	uint32_t uaddrs[BATCH_SIZE], **upp;
4283 	int count, error, i, pos, tocopy;
4284 
4285 	upp = (uint32_t **)uap->obj;
4286 	error = 0;
4287 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
4288 	    pos += tocopy) {
4289 		tocopy = MIN(count, BATCH_SIZE);
4290 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
4291 		if (error != 0)
4292 			break;
4293 		for (i = 0; i < tocopy; ++i)
4294 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
4295 			    INT_MAX, 1);
4296 		maybe_yield();
4297 	}
4298 	return (error);
4299 }
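
/*
 * Userland hands UMTX_OP_NWAKE_PRIVATE an array of wait-word
 * addresses via obj and a count via val; every address in the batch
 * is woken.  A hedged sketch for a 32-bit process ("w0".."w2" are
 * hypothetical wait words):
 *
 *	uint32_t *words[] = { &w0, &w1, &w2 };
 *	_umtx_op(words, UMTX_OP_NWAKE_PRIVATE, nitems(words),
 *	    NULL, NULL);
 */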
4300 
4301 struct umtx_robust_lists_params_compat32 {
4302 	uint32_t	robust_list_offset;
4303 	uint32_t	robust_priv_list_offset;
4304 	uint32_t	robust_inact_offset;
4305 };
4306 
4307 static int
4308 __umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap)
4309 {
4310 	struct umtx_robust_lists_params rb;
4311 	struct umtx_robust_lists_params_compat32 rb32;
4312 	int error;
4313 
4314 	if (uap->val > sizeof(rb32))
4315 		return (EINVAL);
4316 	bzero(&rb, sizeof(rb));
4317 	bzero(&rb32, sizeof(rb32));
4318 	error = copyin(uap->uaddr1, &rb32, uap->val);
4319 	if (error != 0)
4320 		return (error);
4321 	rb.robust_list_offset = rb32.robust_list_offset;
4322 	rb.robust_priv_list_offset = rb32.robust_priv_list_offset;
4323 	rb.robust_inact_offset = rb32.robust_inact_offset;
4324 	return (umtx_robust_lists(td, &rb));
4325 }
4326 
4327 static const _umtx_op_func op_table_compat32[] = {
4328 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
4329 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
4330 	[UMTX_OP_WAIT]		= __umtx_op_wait_compat32,
4331 	[UMTX_OP_WAKE]		= __umtx_op_wake,
4332 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
4333 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex_compat32,
4334 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
4335 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
4336 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait_compat32,
4337 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
4338 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
4339 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_compat32,
4340 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock_compat32,
4341 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock_compat32,
4342 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4343 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
4344 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4345 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex_compat32,
4346 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4347 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4348 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait_compat32,
4349 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4350 #else
4351 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4352 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4353 #endif
4354 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private32,
4355 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4356 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait_compat32,
4357 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4358 	[UMTX_OP_SHM]		= __umtx_op_shm,
4359 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists_compat32,
4360 };
4361 
4362 int
4363 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
4364 {
4365 
4366 	if ((unsigned)uap->op < nitems(op_table_compat32)) {
4367 		return (*op_table_compat32[uap->op])(td,
4368 		    (struct _umtx_op_args *)uap);
4369 	}
4370 	return (EINVAL);
4371 }
4372 #endif
4373 
4374 void
4375 umtx_thread_init(struct thread *td)
4376 {
4377 
4378 	td->td_umtxq = umtxq_alloc();
4379 	td->td_umtxq->uq_thread = td;
4380 }
4381 
4382 void
4383 umtx_thread_fini(struct thread *td)
4384 {
4385 
4386 	umtxq_free(td->td_umtxq);
4387 }
4388 
4389 /*
4390  * Called when a new thread is created, e.g., by fork().
4391  */
4392 void
4393 umtx_thread_alloc(struct thread *td)
4394 {
4395 	struct umtx_q *uq;
4396 
4397 	uq = td->td_umtxq;
4398 	uq->uq_inherited_pri = PRI_MAX;
4399 
4400 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
4401 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
4402 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
4403 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
4404 }
4405 
4406 /*
4407  * exec() hook.
4408  *
4409  * Clear the robust lists for all of the process's threads, rather
4410  * than delaying the cleanup to the thread_exit hook, since the
4411  * relevant address space is destroyed right now.
4412  */
4413 static void
4414 umtx_exec_hook(void *arg __unused, struct proc *p,
4415     struct image_params *imgp __unused)
4416 {
4417 	struct thread *td;
4418 
4419 	KASSERT(p == curproc, ("need curproc"));
4420 	PROC_LOCK(p);
4421 	KASSERT((p->p_flag & P_HADTHREADS) == 0 ||
4422 	    (p->p_flag & P_STOPPED_SINGLE) != 0,
4423 	    ("curproc must be single-threaded"));
4424 	FOREACH_THREAD_IN_PROC(p, td) {
4425 		KASSERT(td == curthread ||
4426 		    ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)),
4427 		    ("running thread %p %p", p, td));
4428 		PROC_UNLOCK(p);
4429 		umtx_thread_cleanup(td);
4430 		PROC_LOCK(p);
4431 		td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0;
4432 	}
4433 	PROC_UNLOCK(p);
4434 }
4435 
4436 /*
4437  * thread_exit() hook.
4438  */
4439 void
4440 umtx_thread_exit(struct thread *td)
4441 {
4442 
4443 	umtx_thread_cleanup(td);
4444 }
4445 
4446 static int
4447 umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res)
4448 {
4449 	u_long res1;
4450 #ifdef COMPAT_FREEBSD32
4451 	uint32_t res32;
4452 #endif
4453 	int error;
4454 
4455 #ifdef COMPAT_FREEBSD32
4456 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
4457 		error = fueword32((void *)ptr, &res32);
4458 		if (error == 0)
4459 			res1 = res32;
4460 	} else
4461 #endif
4462 	{
4463 		error = fueword((void *)ptr, &res1);
4464 	}
4465 	if (error == 0)
4466 		*res = res1;
4467 	else
4468 		error = EFAULT;
4469 	return (error);
4470 }
4471 
4472 static void
4473 umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list)
4474 {
4475 #ifdef COMPAT_FREEBSD32
4476 	struct umutex32 m32;
4477 
4478 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
4479 		memcpy(&m32, m, sizeof(m32));
4480 		*rb_list = m32.m_rb_lnk;
4481 	} else
4482 #endif
4483 		*rb_list = m->m_rb_lnk;
4484 }
4485 
4486 static int
4487 umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact)
4488 {
4489 	struct umutex m;
4490 	int error;
4491 
4492 	KASSERT(td->td_proc == curproc, ("need current vmspace"));
4493 	error = copyin((void *)rbp, &m, sizeof(m));
4494 	if (error != 0)
4495 		return (error);
4496 	if (rb_list != NULL)
4497 		umtx_read_rb_list(td, &m, rb_list);
4498 	if ((m.m_flags & UMUTEX_ROBUST) == 0)
4499 		return (EINVAL);
4500 	if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid)
4501 		/* inact is cleared after unlock, allow the inconsistency */
4502 		return (inact ? 0 : EINVAL);
4503 	return (do_unlock_umutex(td, (struct umutex *)rbp, true));
4504 }
4505 
4506 static void
4507 umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact,
4508     const char *name)
4509 {
4510 	int error, i;
4511 	uintptr_t rbp;
4512 	bool inact;
4513 
4514 	if (rb_list == 0)
4515 		return;
4516 	error = umtx_read_uptr(td, rb_list, &rbp);
4517 	for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) {
4518 		if (rbp == *rb_inact) {
4519 			inact = true;
4520 			*rb_inact = 0;
4521 		} else
4522 			inact = false;
4523 		error = umtx_handle_rb(td, rbp, &rbp, inact);
4524 	}
4525 	if (i == umtx_max_rb && umtx_verbose_rb) {
4526 		uprintf("comm %s pid %d: reached umtx %smax rb %d\n",
4527 		    td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb);
4528 	}
4529 	if (error != 0 && umtx_verbose_rb) {
4530 		uprintf("comm %s pid %d: handling %srb error %d\n",
4531 		    td->td_proc->p_comm, td->td_proc->p_pid, name, error);
4532 	}
4533 }
4534 
4535 /*
4536  * Clean up umtx data: disown PI mutexes and unlock robust mutexes.
4537  */
4538 static void
4539 umtx_thread_cleanup(struct thread *td)
4540 {
4541 	struct umtx_q *uq;
4542 	struct umtx_pi *pi;
4543 	uintptr_t rb_inact;
4544 
4545 	/*
4546 	 * Disown pi mutexes.
4547 	 */
4548 	uq = td->td_umtxq;
4549 	if (uq != NULL) {
4550 		mtx_lock(&umtx_lock);
4551 		uq->uq_inherited_pri = PRI_MAX;
4552 		while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
4553 			pi->pi_owner = NULL;
4554 			TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
4555 		}
4556 		mtx_unlock(&umtx_lock);
4557 		thread_lock(td);
4558 		sched_lend_user_prio(td, PRI_MAX);
4559 		thread_unlock(td);
4560 	}
4561 
4562 	/*
4563 	 * Handle terminated robust mutexes.  Must be done after
4564 	 * robust pi disown, otherwise unlock could see unowned
4565 	 * entries.
4566 	 */
4567 	rb_inact = td->td_rb_inact;
4568 	if (rb_inact != 0)
4569 		(void)umtx_read_uptr(td, rb_inact, &rb_inact);
4570 	umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "");
4571 	umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ");
4572 	if (rb_inact != 0)
4573 		(void)umtx_handle_rb(td, rb_inact, NULL, true);
4574 }
4575