xref: /freebsd/sys/kern/kern_umtx.c (revision 1171c633fb097a19e1da87128604190bc6d27341)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2015, 2016 The FreeBSD Foundation
5  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
6  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
7  * All rights reserved.
8  *
9  * Portions of this software were developed by Konstantin Belousov
10  * under sponsorship from the FreeBSD Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice unmodified, this list of conditions, and the following
17  *    disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include "opt_umtx_profiling.h"
38 
39 #include <sys/param.h>
40 #include <sys/kernel.h>
41 #include <sys/fcntl.h>
42 #include <sys/file.h>
43 #include <sys/filedesc.h>
44 #include <sys/limits.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mman.h>
48 #include <sys/mutex.h>
49 #include <sys/priv.h>
50 #include <sys/proc.h>
51 #include <sys/resource.h>
52 #include <sys/resourcevar.h>
53 #include <sys/rwlock.h>
54 #include <sys/sbuf.h>
55 #include <sys/sched.h>
56 #include <sys/smp.h>
57 #include <sys/sysctl.h>
58 #include <sys/sysent.h>
59 #include <sys/systm.h>
60 #include <sys/sysproto.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/taskqueue.h>
63 #include <sys/time.h>
64 #include <sys/eventhandler.h>
65 #include <sys/umtx.h>
66 
67 #include <security/mac/mac_framework.h>
68 
69 #include <vm/vm.h>
70 #include <vm/vm_param.h>
71 #include <vm/pmap.h>
72 #include <vm/vm_map.h>
73 #include <vm/vm_object.h>
74 
75 #include <machine/atomic.h>
76 #include <machine/cpu.h>
77 
78 #ifdef COMPAT_FREEBSD32
79 #include <compat/freebsd32/freebsd32_proto.h>
80 #endif
81 
/*
 * Lock modes understood by do_lock_normal() in addition to a plain
 * blocking lock.
 */
#define _UMUTEX_TRY		1	/* return EBUSY instead of sleeping */
#define _UMUTEX_WAIT		2	/* return once the mutex looks available, without taking it */

#ifdef UMTX_PROFILING
/*
 * True when the pair (w, f) is strictly greater than the pair (sw, sf),
 * comparing the first components and using the second ones as tie-breaker.
 */
#define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
#endif
89 
/*
 * Priority inheritance mutex info.
 */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link the umtx_pi objects held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash (presumably a chain's uc_pi_list -- confirm) */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List of waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identifies a userland lock object */
	struct umtx_key		pi_key;
};
110 
/*
 * A user of a userland synchronization object; umtxq_alloc() creates one
 * and the sleeping code in this file reaches it through td->td_umtxq.
 */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001	/* currently linked on a umtxq_queue */

	/* The thread that waits. */
	struct thread		*uq_thread;

	/*
	 * The PI mutex we are blocked on.  Reads may be done under
	 * either the chain lock or umtx_lock; writes must hold both the
	 * chain lock and umtx_lock.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes contended by other threads */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;

	/* Spare queue ready to be reused; NULL while we are enqueued */
	struct umtxq_queue	*uq_spare_queue;

	/* The queue we are on; NULL while we are not enqueued */
	struct umtxq_queue	*uq_cur_queue;
};
148 
/* Head type for a tail queue of umtx_q waiters. */
TAILQ_HEAD(umtxq_head, umtx_q);

/*
 * Per-key wait-queue.  Every umtx_q carries one spare instance; the first
 * waiter on a key donates its spare as the active queue for that key.
 */
struct umtxq_queue {
	struct umtxq_head	head;	/* waiters on this key, FIFO order */
	struct umtx_key		key;	/* key shared by all waiters on head */
	LIST_ENTRY(umtxq_queue)	link;	/* on a chain's uc_queue[] or uc_spare_queue */
	int			length;	/* number of waiters on head */
};
158 
/* Head type for a list of per-key wait-queues. */
LIST_HEAD(umtxq_list, umtxq_queue);

/*
 * Userland lock object's wait-queue chain: one hash bucket of the
 * umtxq_chains[][] table.
 */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues, indexed by the queue constants below. */
	struct umtxq_list	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	/* Spare umtxq_queue objects parked here by enqueued waiters. */
	LIST_HEAD(, umtxq_queue) uc_spare_queue;

	/* Busy flag, set by umtxq_busy() and cleared by umtxq_unbusy() */
	char			uc_busy;

	/* Number of threads sleeping in umtxq_busy() for uc_busy to clear */
	int			uc_waiters;

	/* All PI in the list */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;

#ifdef UMTX_PROFILING
	/* Current and peak number of active per-key queues on the chain. */
	u_int			length;
	u_int			max_length;
#endif
};
187 
/* Assert that the current thread owns the lock of chain uc. */
#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Do not propagate time-sharing priorities, for a security reason:
 * a user could simply create a PI mutex, let thread A lock it and let
 * another thread B block on it.  Because B is sleeping its priority
 * would be boosted, which would boost A's priority through priority
 * propagation as well, and A's priority would never be lowered even
 * if it were using 100% CPU.  This is unfair to other processes.
 */

/*
 * User priority of td as used in this file: any priority inside the
 * time-sharing range is reported as PRI_MAX_TIMESHARE.
 */
#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)

/* Multiplier used by umtxq_hash(). */
#define	GOLDEN_RATIO_PRIME	2654404609U
/* Number of hash chains in each of the two umtxq_chains[] tables. */
#ifndef	UMTX_CHAINS
#define	UMTX_CHAINS		512
#endif
/*
 * Right shift applied to the hash product; it keeps the top 9 bits,
 * i.e. 512 distinct values, which matches the default UMTX_CHAINS.
 */
#define	UMTX_SHIFTS		(__WORD_BIT - 9)

/* Map the USYNC_PROCESS_SHARED flag to a share mode for umtx_key_get(). */
#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

/* Upper bound on the spin iterations umtxq_busy() does before sleeping. */
#define BUSY_SPINS		200
212 #define BUSY_SPINS		200
213 
214 struct abs_timeout {
215 	int clockid;
216 	bool is_abs_real;	/* TIMER_ABSTIME && CLOCK_REALTIME* */
217 	struct timespec cur;
218 	struct timespec end;
219 };
220 
#ifdef COMPAT_FREEBSD32
/*
 * Layout of struct umutex for COMPAT_FREEBSD32.  The static assertions
 * below check that its size and the offset of m_spare match the native
 * struct umutex.
 */
struct umutex32 {
	volatile __lwpid_t	m_owner;	/* Owner of the mutex */
	__uint32_t		m_flags;	/* Flags of the mutex */
	__uint32_t		m_ceilings[2];	/* Priority protect ceiling */
	__uint32_t		m_rb_lnk;	/* Robust linkage */
	__uint32_t		m_pad;
	__uint32_t		m_spare[2];
};

_Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32");
_Static_assert(__offsetof(struct umutex, m_spare[0]) ==
    __offsetof(struct umutex32, m_spare[0]), "m_spare32");
#endif
235 
/* Exported as kern.ipc.umtx_vnode_persistent; see the description string. */
int umtx_shm_vnobj_persistent = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN,
    &umtx_shm_vnobj_persistent, 0,
    "False forces destruction of umtx attached to file, on last close");
/*
 * Exported as kern.ipc.umtx_max_robust.
 * NOTE(review): the description string is empty; presumably a limit on
 * robust mutex list processing -- confirm against the users of umtx_max_rb.
 */
static int umtx_max_rb = 1000;
SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN,
    &umtx_max_rb, 0,
    "");

/* UMA zone backing struct umtx_pi, created in umtxq_sysinit(). */
static uma_zone_t		umtx_pi_zone;
/*
 * Two hash tables of wait-queue chains; umtxq_getchain() picks the table
 * from the key type and the chain from the key hash.
 */
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");
/*
 * Exported as debug.umtx.robust_faults_verbose.
 * NOTE(review): the description string is empty -- confirm the meaning
 * against the users of umtx_verbose_rb.
 */
static int umtx_verbose_rb = 1;
SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN,
    &umtx_verbose_rb, 0,
    "");

#ifdef UMTX_PROFILING
/* Largest per-chain max_length observed by umtxq_insert_queue(). */
static long max_length;
SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
#endif
263 
/* Forward declarations for helpers used before their definition. */
static void abs_timeout_update(struct abs_timeout *timo);

static void umtx_shm_init(void);
static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
static int umtxq_count(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags,
    bool rb);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
    struct image_params *imgp __unused);
/* Run umtxq_sysinit() during boot. */
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

/* Shorthands that operate on the shared queue of a chain. */
#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

/* Global mutex; initialized in umtxq_sysinit(). */
static struct mtx umtx_lock;
292 
293 #ifdef UMTX_PROFILING
294 static void
295 umtx_init_profiling(void)
296 {
297 	struct sysctl_oid *chain_oid;
298 	char chain_name[10];
299 	int i;
300 
301 	for (i = 0; i < UMTX_CHAINS; ++i) {
302 		snprintf(chain_name, sizeof(chain_name), "%d", i);
303 		chain_oid = SYSCTL_ADD_NODE(NULL,
304 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
305 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
306 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
307 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
308 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
309 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
310 	}
311 }
312 
313 static int
314 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
315 {
316 	char buf[512];
317 	struct sbuf sb;
318 	struct umtxq_chain *uc;
319 	u_int fract, i, j, tot, whole;
320 	u_int sf0, sf1, sf2, sf3, sf4;
321 	u_int si0, si1, si2, si3, si4;
322 	u_int sw0, sw1, sw2, sw3, sw4;
323 
324 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
325 	for (i = 0; i < 2; i++) {
326 		tot = 0;
327 		for (j = 0; j < UMTX_CHAINS; ++j) {
328 			uc = &umtxq_chains[i][j];
329 			mtx_lock(&uc->uc_lock);
330 			tot += uc->max_length;
331 			mtx_unlock(&uc->uc_lock);
332 		}
333 		if (tot == 0)
334 			sbuf_printf(&sb, "%u) Empty ", i);
335 		else {
336 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
337 			si0 = si1 = si2 = si3 = si4 = 0;
338 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
339 			for (j = 0; j < UMTX_CHAINS; j++) {
340 				uc = &umtxq_chains[i][j];
341 				mtx_lock(&uc->uc_lock);
342 				whole = uc->max_length * 100;
343 				mtx_unlock(&uc->uc_lock);
344 				fract = (whole % tot) * 100;
345 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
346 					sf0 = fract;
347 					si0 = j;
348 					sw0 = whole;
349 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
350 				    sf1)) {
351 					sf1 = fract;
352 					si1 = j;
353 					sw1 = whole;
354 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
355 				    sf2)) {
356 					sf2 = fract;
357 					si2 = j;
358 					sw2 = whole;
359 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
360 				    sf3)) {
361 					sf3 = fract;
362 					si3 = j;
363 					sw3 = whole;
364 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
365 				    sf4)) {
366 					sf4 = fract;
367 					si4 = j;
368 					sw4 = whole;
369 				}
370 			}
371 			sbuf_printf(&sb, "queue %u:\n", i);
372 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
373 			    sf0 / tot, si0);
374 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
375 			    sf1 / tot, si1);
376 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
377 			    sf2 / tot, si2);
378 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
379 			    sf3 / tot, si3);
380 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
381 			    sf4 / tot, si4);
382 		}
383 	}
384 	sbuf_trim(&sb);
385 	sbuf_finish(&sb);
386 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
387 	sbuf_delete(&sb);
388 	return (0);
389 }
390 
391 static int
392 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
393 {
394 	struct umtxq_chain *uc;
395 	u_int i, j;
396 	int clear, error;
397 
398 	clear = 0;
399 	error = sysctl_handle_int(oidp, &clear, 0, req);
400 	if (error != 0 || req->newptr == NULL)
401 		return (error);
402 
403 	if (clear != 0) {
404 		for (i = 0; i < 2; ++i) {
405 			for (j = 0; j < UMTX_CHAINS; ++j) {
406 				uc = &umtxq_chains[i][j];
407 				mtx_lock(&uc->uc_lock);
408 				uc->length = 0;
409 				uc->max_length = 0;
410 				mtx_unlock(&uc->uc_lock);
411 			}
412 		}
413 	}
414 	return (0);
415 }
416 
/* debug.umtx.chains.clear: writable integer handled by the clear handler. */
SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
    sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
/* debug.umtx.chains.peaks: read-only string built by the peaks handler. */
SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
    sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
423 #endif
424 
425 static void
426 umtxq_sysinit(void *arg __unused)
427 {
428 	int i, j;
429 
430 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
431 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
432 	for (i = 0; i < 2; ++i) {
433 		for (j = 0; j < UMTX_CHAINS; ++j) {
434 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
435 				 MTX_DEF | MTX_DUPOK);
436 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
437 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
438 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
439 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
440 			umtxq_chains[i][j].uc_busy = 0;
441 			umtxq_chains[i][j].uc_waiters = 0;
442 #ifdef UMTX_PROFILING
443 			umtxq_chains[i][j].length = 0;
444 			umtxq_chains[i][j].max_length = 0;
445 #endif
446 		}
447 	}
448 #ifdef UMTX_PROFILING
449 	umtx_init_profiling();
450 #endif
451 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
452 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
453 	    EVENTHANDLER_PRI_ANY);
454 	umtx_shm_init();
455 }
456 
457 struct umtx_q *
458 umtxq_alloc(void)
459 {
460 	struct umtx_q *uq;
461 
462 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
463 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX,
464 	    M_WAITOK | M_ZERO);
465 	TAILQ_INIT(&uq->uq_spare_queue->head);
466 	TAILQ_INIT(&uq->uq_pi_contested);
467 	uq->uq_inherited_pri = PRI_MAX;
468 	return (uq);
469 }
470 
471 void
472 umtxq_free(struct umtx_q *uq)
473 {
474 
475 	MPASS(uq->uq_spare_queue != NULL);
476 	free(uq->uq_spare_queue, M_UMTX);
477 	free(uq, M_UMTX);
478 }
479 
480 static inline void
481 umtxq_hash(struct umtx_key *key)
482 {
483 	unsigned n;
484 
485 	n = (uintptr_t)key->info.both.a + key->info.both.b;
486 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
487 }
488 
489 static inline struct umtxq_chain *
490 umtxq_getchain(struct umtx_key *key)
491 {
492 
493 	if (key->type <= TYPE_SEM)
494 		return (&umtxq_chains[1][key->hash]);
495 	return (&umtxq_chains[0][key->hash]);
496 }
497 
498 /*
499  * Lock a chain.
500  */
501 static inline void
502 umtxq_lock(struct umtx_key *key)
503 {
504 	struct umtxq_chain *uc;
505 
506 	uc = umtxq_getchain(key);
507 	mtx_lock(&uc->uc_lock);
508 }
509 
510 /*
511  * Unlock a chain.
512  */
513 static inline void
514 umtxq_unlock(struct umtx_key *key)
515 {
516 	struct umtxq_chain *uc;
517 
518 	uc = umtxq_getchain(key);
519 	mtx_unlock(&uc->uc_lock);
520 }
521 
522 /*
523  * Set chain to busy state when following operation
524  * may be blocked (kernel mutex can not be used).
525  */
526 static inline void
527 umtxq_busy(struct umtx_key *key)
528 {
529 	struct umtxq_chain *uc;
530 
531 	uc = umtxq_getchain(key);
532 	mtx_assert(&uc->uc_lock, MA_OWNED);
533 	if (uc->uc_busy) {
534 #ifdef SMP
535 		if (smp_cpus > 1) {
536 			int count = BUSY_SPINS;
537 			if (count > 0) {
538 				umtxq_unlock(key);
539 				while (uc->uc_busy && --count > 0)
540 					cpu_spinwait();
541 				umtxq_lock(key);
542 			}
543 		}
544 #endif
545 		while (uc->uc_busy) {
546 			uc->uc_waiters++;
547 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
548 			uc->uc_waiters--;
549 		}
550 	}
551 	uc->uc_busy = 1;
552 }
553 
554 /*
555  * Unbusy a chain.
556  */
557 static inline void
558 umtxq_unbusy(struct umtx_key *key)
559 {
560 	struct umtxq_chain *uc;
561 
562 	uc = umtxq_getchain(key);
563 	mtx_assert(&uc->uc_lock, MA_OWNED);
564 	KASSERT(uc->uc_busy != 0, ("not busy"));
565 	uc->uc_busy = 0;
566 	if (uc->uc_waiters)
567 		wakeup_one(uc);
568 }
569 
570 static inline void
571 umtxq_unbusy_unlocked(struct umtx_key *key)
572 {
573 
574 	umtxq_lock(key);
575 	umtxq_unbusy(key);
576 	umtxq_unlock(key);
577 }
578 
579 static struct umtxq_queue *
580 umtxq_queue_lookup(struct umtx_key *key, int q)
581 {
582 	struct umtxq_queue *uh;
583 	struct umtxq_chain *uc;
584 
585 	uc = umtxq_getchain(key);
586 	UMTXQ_LOCKED_ASSERT(uc);
587 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
588 		if (umtx_key_match(&uh->key, key))
589 			return (uh);
590 	}
591 
592 	return (NULL);
593 }
594 
595 static inline void
596 umtxq_insert_queue(struct umtx_q *uq, int q)
597 {
598 	struct umtxq_queue *uh;
599 	struct umtxq_chain *uc;
600 
601 	uc = umtxq_getchain(&uq->uq_key);
602 	UMTXQ_LOCKED_ASSERT(uc);
603 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
604 	uh = umtxq_queue_lookup(&uq->uq_key, q);
605 	if (uh != NULL) {
606 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
607 	} else {
608 		uh = uq->uq_spare_queue;
609 		uh->key = uq->uq_key;
610 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
611 #ifdef UMTX_PROFILING
612 		uc->length++;
613 		if (uc->length > uc->max_length) {
614 			uc->max_length = uc->length;
615 			if (uc->max_length > max_length)
616 				max_length = uc->max_length;
617 		}
618 #endif
619 	}
620 	uq->uq_spare_queue = NULL;
621 
622 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
623 	uh->length++;
624 	uq->uq_flags |= UQF_UMTXQ;
625 	uq->uq_cur_queue = uh;
626 	return;
627 }
628 
629 static inline void
630 umtxq_remove_queue(struct umtx_q *uq, int q)
631 {
632 	struct umtxq_chain *uc;
633 	struct umtxq_queue *uh;
634 
635 	uc = umtxq_getchain(&uq->uq_key);
636 	UMTXQ_LOCKED_ASSERT(uc);
637 	if (uq->uq_flags & UQF_UMTXQ) {
638 		uh = uq->uq_cur_queue;
639 		TAILQ_REMOVE(&uh->head, uq, uq_link);
640 		uh->length--;
641 		uq->uq_flags &= ~UQF_UMTXQ;
642 		if (TAILQ_EMPTY(&uh->head)) {
643 			KASSERT(uh->length == 0,
644 			    ("inconsistent umtxq_queue length"));
645 #ifdef UMTX_PROFILING
646 			uc->length--;
647 #endif
648 			LIST_REMOVE(uh, link);
649 		} else {
650 			uh = LIST_FIRST(&uc->uc_spare_queue);
651 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
652 			LIST_REMOVE(uh, link);
653 		}
654 		uq->uq_spare_queue = uh;
655 		uq->uq_cur_queue = NULL;
656 	}
657 }
658 
659 /*
660  * Check if there are multiple waiters
661  */
662 static int
663 umtxq_count(struct umtx_key *key)
664 {
665 	struct umtxq_queue *uh;
666 
667 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
668 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
669 	if (uh != NULL)
670 		return (uh->length);
671 	return (0);
672 }
673 
674 /*
675  * Check if there are multiple PI waiters and returns first
676  * waiter.
677  */
678 static int
679 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
680 {
681 	struct umtxq_queue *uh;
682 
683 	*first = NULL;
684 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
685 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
686 	if (uh != NULL) {
687 		*first = TAILQ_FIRST(&uh->head);
688 		return (uh->length);
689 	}
690 	return (0);
691 }
692 
693 /*
694  * Wake up threads waiting on an userland object.
695  */
696 
697 static int
698 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
699 {
700 	struct umtxq_queue *uh;
701 	struct umtx_q *uq;
702 	int ret;
703 
704 	ret = 0;
705 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
706 	uh = umtxq_queue_lookup(key, q);
707 	if (uh != NULL) {
708 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
709 			umtxq_remove_queue(uq, q);
710 			wakeup(uq);
711 			if (++ret >= n_wake)
712 				return (ret);
713 		}
714 	}
715 	return (ret);
716 }
717 
718 
719 /*
720  * Wake up specified thread.
721  */
722 static inline void
723 umtxq_signal_thread(struct umtx_q *uq)
724 {
725 
726 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
727 	umtxq_remove(uq);
728 	wakeup(uq);
729 }
730 
731 static inline int
732 tstohz(const struct timespec *tsp)
733 {
734 	struct timeval tv;
735 
736 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
737 	return tvtohz(&tv);
738 }
739 
740 static void
741 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
742 	const struct timespec *timeout)
743 {
744 
745 	timo->clockid = clockid;
746 	if (!absolute) {
747 		timo->is_abs_real = false;
748 		abs_timeout_update(timo);
749 		timespecadd(&timo->cur, timeout, &timo->end);
750 	} else {
751 		timo->end = *timeout;
752 		timo->is_abs_real = clockid == CLOCK_REALTIME ||
753 		    clockid == CLOCK_REALTIME_FAST ||
754 		    clockid == CLOCK_REALTIME_PRECISE;
755 		/*
756 		 * If is_abs_real, umtxq_sleep will read the clock
757 		 * after setting td_rtcgen; otherwise, read it here.
758 		 */
759 		if (!timo->is_abs_real) {
760 			abs_timeout_update(timo);
761 		}
762 	}
763 }
764 
765 static void
766 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
767 {
768 
769 	abs_timeout_init(timo, umtxtime->_clockid,
770 	    (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout);
771 }
772 
773 static inline void
774 abs_timeout_update(struct abs_timeout *timo)
775 {
776 
777 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
778 }
779 
780 static int
781 abs_timeout_gethz(struct abs_timeout *timo)
782 {
783 	struct timespec tts;
784 
785 	if (timespeccmp(&timo->end, &timo->cur, <=))
786 		return (-1);
787 	timespecsub(&timo->end, &timo->cur, &tts);
788 	return (tstohz(&tts));
789 }
790 
791 static uint32_t
792 umtx_unlock_val(uint32_t flags, bool rb)
793 {
794 
795 	if (rb)
796 		return (UMUTEX_RB_OWNERDEAD);
797 	else if ((flags & UMUTEX_NONCONSISTENT) != 0)
798 		return (UMUTEX_RB_NOTRECOV);
799 	else
800 		return (UMUTEX_UNOWNED);
801 
802 }
803 
/*
 * Put the thread to sleep on uq until it is removed from its umtx queue,
 * the timeout expires or the sleep is interrupted.  Before every sleep,
 * check whether the thread was already removed from the queue.
 *
 * uq:      the waiter; its chain must be locked by the caller.
 * wmesg:   wait message passed to msleep().
 * abstime: timeout descriptor, or NULL to sleep without a timeout.
 *
 * Returns 0 when uq is no longer queued, ETIMEDOUT when the deadline has
 * passed, or EINTR/ERESTART when msleep() reports an interrupted sleep.
 * The chain is locked again on return.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
{
	struct umtxq_chain *uc;
	int error, timo;

	/*
	 * For absolute CLOCK_REALTIME* timeouts, record the current
	 * rtc_generation in the thread before sampling the clock.
	 */
	if (abstime != NULL && abstime->is_abs_real) {
		curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation);
		abs_timeout_update(abstime);
	}

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	for (;;) {
		/* Already dequeued by a waker: nothing to wait for. */
		if (!(uq->uq_flags & UQF_UMTXQ)) {
			error = 0;
			break;
		}
		if (abstime != NULL) {
			timo = abs_timeout_gethz(abstime);
			if (timo < 0) {
				error = ETIMEDOUT;
				break;
			}
		} else
			timo = 0;
		/* PDROP: msleep() returns with the chain lock released. */
		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
		if (error == EINTR || error == ERESTART) {
			umtxq_lock(&uq->uq_key);
			break;
		}
		/*
		 * Refresh the clock sample while the chain is still
		 * unlocked, then retake the lock and re-evaluate.
		 */
		if (abstime != NULL) {
			if (abstime->is_abs_real)
				curthread->td_rtcgen =
				    atomic_load_acq_int(&rtc_generation);
			abs_timeout_update(abstime);
		}
		umtxq_lock(&uq->uq_key);
	}

	curthread->td_rtcgen = 0;
	return (error);
}
851 
852 /*
853  * Convert userspace address into unique logical address.
854  */
855 int
856 umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
857 {
858 	struct thread *td = curthread;
859 	vm_map_t map;
860 	vm_map_entry_t entry;
861 	vm_pindex_t pindex;
862 	vm_prot_t prot;
863 	boolean_t wired;
864 
865 	key->type = type;
866 	if (share == THREAD_SHARE) {
867 		key->shared = 0;
868 		key->info.private.vs = td->td_proc->p_vmspace;
869 		key->info.private.addr = (uintptr_t)addr;
870 	} else {
871 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
872 		map = &td->td_proc->p_vmspace->vm_map;
873 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
874 		    &entry, &key->info.shared.object, &pindex, &prot,
875 		    &wired) != KERN_SUCCESS) {
876 			return (EFAULT);
877 		}
878 
879 		if ((share == PROCESS_SHARE) ||
880 		    (share == AUTO_SHARE &&
881 		     VM_INHERIT_SHARE == entry->inheritance)) {
882 			key->shared = 1;
883 			key->info.shared.offset = (vm_offset_t)addr -
884 			    entry->start + entry->offset;
885 			vm_object_reference(key->info.shared.object);
886 		} else {
887 			key->shared = 0;
888 			key->info.private.vs = td->td_proc->p_vmspace;
889 			key->info.private.addr = (uintptr_t)addr;
890 		}
891 		vm_map_lookup_done(map, entry);
892 	}
893 
894 	umtxq_hash(key);
895 	return (0);
896 }
897 
898 /*
899  * Release key.
900  */
901 void
902 umtx_key_release(struct umtx_key *key)
903 {
904 	if (key->shared)
905 		vm_object_deallocate(key->info.shared.object);
906 }
907 
/*
 * Fetch and compare value, sleep on the address if value is not changed.
 *
 * td:         calling thread; its td_umtxq is used as the waiter.
 * addr:       userspace address of the word to watch.
 * id:         value the word is expected to hold.
 * timeout:    optional timeout, NULL to wait without one.
 * compat32:   non-zero to fetch a 32-bit word instead of a long.
 * is_private: non-zero to use a THREAD_SHARE key, else AUTO_SHARE.
 *
 * Returns 0 if the thread was woken or the value did not match, EFAULT
 * if the word cannot be read, or the error from umtxq_sleep() with
 * ERESTART converted to EINTR.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
    struct _umtx_time *timeout, int compat32, int is_private)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	u_long tmp;
	uint32_t tmp32;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * Enqueue before reading the word, so that a waker which changes
	 * the value after our read still finds us on the queue.
	 */
	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	/* The userspace fetch is done without the chain lock held. */
	if (compat32 == 0) {
		error = fueword(addr, &tmp);
		if (error != 0)
			error = EFAULT;
	} else {
		error = fueword32(addr, &tmp32);
		if (error == 0)
			tmp = tmp32;
		else
			error = EFAULT;
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0) {
		if (tmp == id)
			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
			    NULL : &timo);
		/*
		 * Being dequeued means a waker got to us: report success
		 * regardless of how the sleep ended.
		 */
		if ((uq->uq_flags & UQF_UMTXQ) == 0)
			error = 0;
		else
			umtxq_remove(uq);
	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
		umtxq_remove(uq);
	}
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
961 
962 /*
963  * Wake up threads sleeping on the specified address.
964  */
965 int
966 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
967 {
968 	struct umtx_key key;
969 	int ret;
970 
971 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
972 	    is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
973 		return (ret);
974 	umtxq_lock(&key);
975 	umtxq_signal(&key, n_wake);
976 	umtxq_unlock(&key);
977 	umtx_key_release(&key);
978 	return (0);
979 }
980 
/*
 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
 *
 * td:      calling thread; its tid is the owner id stored in the mutex.
 * m:       userspace mutex.
 * flags:   mutex flags, used to choose the key share mode.
 * timeout: optional timeout for the sleep, NULL for none.
 * mode:    0 to lock, _UMUTEX_TRY to fail with EBUSY instead of
 *          sleeping, _UMUTEX_WAIT to only wait until the owner word
 *          shows the mutex as available.
 *
 * Returns 0 on success, EOWNERDEAD when a mutex in the
 * UMUTEX_RB_OWNERDEAD state was acquired, ENOTRECOVERABLE, EBUSY,
 * EFAULT, or an error from umtx_key_get(), umtxq_sleep() or
 * thread_check_susp().
 */
static int
do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
    struct _umtx_time *timeout, int mode)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error, rv;

	id = td->td_tid;
	uq = td->td_umtxq;
	error = 0;
	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		rv = fueword32(&m->m_owner, &owner);
		if (rv == -1)
			return (EFAULT);
		if (mode == _UMUTEX_WAIT) {
			/* Wait mode never acquires; done once it is free. */
			if (owner == UMUTEX_UNOWNED ||
			    owner == UMUTEX_CONTESTED ||
			    owner == UMUTEX_RB_OWNERDEAD ||
			    owner == UMUTEX_RB_NOTRECOV)
				return (0);
		} else {
			/*
			 * Robust mutex terminated.  Kernel duty is to
			 * return EOWNERDEAD to the userspace.  The
			 * umutex.m_flags UMUTEX_NONCONSISTENT is set
			 * by the common userspace code.
			 */
			if (owner == UMUTEX_RB_OWNERDEAD) {
				rv = casueword32(&m->m_owner,
				    UMUTEX_RB_OWNERDEAD, &owner,
				    id | UMUTEX_CONTESTED);
				if (rv == -1)
					return (EFAULT);
				if (rv == 0) {
					MPASS(owner == UMUTEX_RB_OWNERDEAD);
					return (EOWNERDEAD); /* success */
				}
				MPASS(rv == 1);
				rv = thread_check_susp(td, false);
				if (rv != 0)
					return (rv);
				continue;
			}
			if (owner == UMUTEX_RB_NOTRECOV)
				return (ENOTRECOVERABLE);

			/*
			 * Try the uncontested case.  This should be
			 * done in userland.
			 */
			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
			    &owner, id);
			/* The address was invalid. */
			if (rv == -1)
				return (EFAULT);

			/* The acquire succeeded. */
			if (rv == 0) {
				MPASS(owner == UMUTEX_UNOWNED);
				return (0);
			}

			/*
			 * If no one owns it but it is contested try
			 * to acquire it.
			 */
			MPASS(rv == 1);
			if (owner == UMUTEX_CONTESTED) {
				rv = casueword32(&m->m_owner,
				    UMUTEX_CONTESTED, &owner,
				    id | UMUTEX_CONTESTED);
				/* The address was invalid. */
				if (rv == -1)
					return (EFAULT);
				if (rv == 0) {
					MPASS(owner == UMUTEX_CONTESTED);
					return (0);
				}
				if (rv == 1) {
					rv = thread_check_susp(td, false);
					if (rv != 0)
						return (rv);
				}

				/*
				 * If this failed the lock has
				 * changed, restart.
				 */
				continue;
			}

			/* rv == 1 but not contested, likely store failure */
			rv = thread_check_susp(td, false);
			if (rv != 0)
				return (rv);
		}

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		/*
		 * Enqueue ourselves and keep the chain busy while the
		 * owner word is updated with the chain lock dropped.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		rv = casueword32(&m->m_owner, owner, &old,
		    owner | UMUTEX_CONTESTED);

		/* The address was invalid or casueword failed to store. */
		if (rv == -1 || rv == 1) {
			/* Undo the enqueue before failing or retrying. */
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			if (rv == -1)
				return (EFAULT);
			if (rv == 1) {
				rv = thread_check_susp(td, false);
				if (rv != 0)
					return (rv);
			}
			continue;
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		MPASS(old == owner);
		error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
		    NULL : &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);

		/*
		 * A sleep error is kept in 'error' for one more pass of
		 * the loop; see the check after the _UMUTEX_TRY test.
		 */
		if (error == 0)
			error = thread_check_susp(td, false);
	}

	/* Not reached: the loop above only exits through return. */
	return (0);
}
1155 
1156 /*
1157  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1158  */
1159 static int
1160 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1161 {
1162 	struct umtx_key key;
1163 	uint32_t owner, old, id, newlock;
1164 	int error, count;
1165 
1166 	id = td->td_tid;
1167 
1168 again:
1169 	/*
1170 	 * Make sure we own this mtx.
1171 	 */
1172 	error = fueword32(&m->m_owner, &owner);
1173 	if (error == -1)
1174 		return (EFAULT);
1175 
1176 	if ((owner & ~UMUTEX_CONTESTED) != id)
1177 		return (EPERM);
1178 
1179 	newlock = umtx_unlock_val(flags, rb);
1180 	if ((owner & UMUTEX_CONTESTED) == 0) {
1181 		error = casueword32(&m->m_owner, owner, &old, newlock);
1182 		if (error == -1)
1183 			return (EFAULT);
1184 		if (error == 1) {
1185 			error = thread_check_susp(td, false);
1186 			if (error != 0)
1187 				return (error);
1188 			goto again;
1189 		}
1190 		MPASS(old == owner);
1191 		return (0);
1192 	}
1193 
1194 	/* We should only ever be in here for contested locks */
1195 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1196 	    &key)) != 0)
1197 		return (error);
1198 
1199 	umtxq_lock(&key);
1200 	umtxq_busy(&key);
1201 	count = umtxq_count(&key);
1202 	umtxq_unlock(&key);
1203 
1204 	/*
1205 	 * When unlocking the umtx, it must be marked as unowned if
1206 	 * there is zero or one thread only waiting for it.
1207 	 * Otherwise, it must be marked as contested.
1208 	 */
1209 	if (count > 1)
1210 		newlock |= UMUTEX_CONTESTED;
1211 	error = casueword32(&m->m_owner, owner, &old, newlock);
1212 	umtxq_lock(&key);
1213 	umtxq_signal(&key, 1);
1214 	umtxq_unbusy(&key);
1215 	umtxq_unlock(&key);
1216 	umtx_key_release(&key);
1217 	if (error == -1)
1218 		return (EFAULT);
1219 	if (error == 1) {
1220 		if (old != owner)
1221 			return (EINVAL);
1222 		error = thread_check_susp(td, false);
1223 		if (error != 0)
1224 			return (error);
1225 		goto again;
1226 	}
1227 	return (0);
1228 }
1229 
1230 /*
1231  * Check if the mutex is available and wake up a waiter,
1232  * only for simple mutex.
1233  */
1234 static int
1235 do_wake_umutex(struct thread *td, struct umutex *m)
1236 {
1237 	struct umtx_key key;
1238 	uint32_t owner;
1239 	uint32_t flags;
1240 	int error;
1241 	int count;
1242 
1243 again:
1244 	error = fueword32(&m->m_owner, &owner);
1245 	if (error == -1)
1246 		return (EFAULT);
1247 
1248 	if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD &&
1249 	    owner != UMUTEX_RB_NOTRECOV)
1250 		return (0);
1251 
1252 	error = fueword32(&m->m_flags, &flags);
1253 	if (error == -1)
1254 		return (EFAULT);
1255 
1256 	/* We should only ever be in here for contested locks */
1257 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1258 	    &key)) != 0)
1259 		return (error);
1260 
1261 	umtxq_lock(&key);
1262 	umtxq_busy(&key);
1263 	count = umtxq_count(&key);
1264 	umtxq_unlock(&key);
1265 
1266 	if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD &&
1267 	    owner != UMUTEX_RB_NOTRECOV) {
1268 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1269 		    UMUTEX_UNOWNED);
1270 		if (error == -1) {
1271 			error = EFAULT;
1272 		} else if (error == 1) {
1273 			umtxq_lock(&key);
1274 			umtxq_unbusy(&key);
1275 			umtxq_unlock(&key);
1276 			umtx_key_release(&key);
1277 			error = thread_check_susp(td, false);
1278 			if (error != 0)
1279 				return (error);
1280 			goto again;
1281 		}
1282 	}
1283 
1284 	umtxq_lock(&key);
1285 	if (error == 0 && count != 0) {
1286 		MPASS((owner & ~UMUTEX_CONTESTED) == 0 ||
1287 		    owner == UMUTEX_RB_OWNERDEAD ||
1288 		    owner == UMUTEX_RB_NOTRECOV);
1289 		umtxq_signal(&key, 1);
1290 	}
1291 	umtxq_unbusy(&key);
1292 	umtxq_unlock(&key);
1293 	umtx_key_release(&key);
1294 	return (error);
1295 }
1296 
1297 /*
1298  * Check if the mutex has waiters and tries to fix contention bit.
1299  */
1300 static int
1301 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1302 {
1303 	struct umtx_key key;
1304 	uint32_t owner, old;
1305 	int type;
1306 	int error;
1307 	int count;
1308 
1309 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT |
1310 	    UMUTEX_ROBUST)) {
1311 	case 0:
1312 	case UMUTEX_ROBUST:
1313 		type = TYPE_NORMAL_UMUTEX;
1314 		break;
1315 	case UMUTEX_PRIO_INHERIT:
1316 		type = TYPE_PI_UMUTEX;
1317 		break;
1318 	case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST):
1319 		type = TYPE_PI_ROBUST_UMUTEX;
1320 		break;
1321 	case UMUTEX_PRIO_PROTECT:
1322 		type = TYPE_PP_UMUTEX;
1323 		break;
1324 	case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST):
1325 		type = TYPE_PP_ROBUST_UMUTEX;
1326 		break;
1327 	default:
1328 		return (EINVAL);
1329 	}
1330 	if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0)
1331 		return (error);
1332 
1333 	owner = 0;
1334 	umtxq_lock(&key);
1335 	umtxq_busy(&key);
1336 	count = umtxq_count(&key);
1337 	umtxq_unlock(&key);
1338 
1339 	error = fueword32(&m->m_owner, &owner);
1340 	if (error == -1)
1341 		error = EFAULT;
1342 
1343 	/*
1344 	 * Only repair contention bit if there is a waiter, this means
1345 	 * the mutex is still being referenced by userland code,
1346 	 * otherwise don't update any memory.
1347 	 */
1348 	while (error == 0 && (owner & UMUTEX_CONTESTED) == 0 &&
1349 	    (count > 1 || (count == 1 && (owner & ~UMUTEX_CONTESTED) != 0))) {
1350 		error = casueword32(&m->m_owner, owner, &old,
1351 		    owner | UMUTEX_CONTESTED);
1352 		if (error == -1) {
1353 			error = EFAULT;
1354 			break;
1355 		}
1356 		if (error == 0) {
1357 			MPASS(old == owner);
1358 			break;
1359 		}
1360 		owner = old;
1361 		error = thread_check_susp(td, false);
1362 	}
1363 
1364 	umtxq_lock(&key);
1365 	if (error == EFAULT) {
1366 		umtxq_signal(&key, INT_MAX);
1367 	} else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
1368 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
1369 		umtxq_signal(&key, 1);
1370 	umtxq_unbusy(&key);
1371 	umtxq_unlock(&key);
1372 	umtx_key_release(&key);
1373 	return (error);
1374 }
1375 
1376 static inline struct umtx_pi *
1377 umtx_pi_alloc(int flags)
1378 {
1379 	struct umtx_pi *pi;
1380 
1381 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1382 	TAILQ_INIT(&pi->pi_blocked);
1383 	atomic_add_int(&umtx_pi_allocated, 1);
1384 	return (pi);
1385 }
1386 
1387 static inline void
1388 umtx_pi_free(struct umtx_pi *pi)
1389 {
1390 	uma_zfree(umtx_pi_zone, pi);
1391 	atomic_add_int(&umtx_pi_allocated, -1);
1392 }
1393 
1394 /*
1395  * Adjust the thread's position on a pi_state after its priority has been
1396  * changed.
1397  */
1398 static int
1399 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1400 {
1401 	struct umtx_q *uq, *uq1, *uq2;
1402 	struct thread *td1;
1403 
1404 	mtx_assert(&umtx_lock, MA_OWNED);
1405 	if (pi == NULL)
1406 		return (0);
1407 
1408 	uq = td->td_umtxq;
1409 
1410 	/*
1411 	 * Check if the thread needs to be moved on the blocked chain.
1412 	 * It needs to be moved if either its priority is lower than
1413 	 * the previous thread or higher than the next thread.
1414 	 */
1415 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1416 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1417 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1418 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1419 		/*
1420 		 * Remove thread from blocked chain and determine where
1421 		 * it should be moved to.
1422 		 */
1423 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1424 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1425 			td1 = uq1->uq_thread;
1426 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1427 			if (UPRI(td1) > UPRI(td))
1428 				break;
1429 		}
1430 
1431 		if (uq1 == NULL)
1432 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1433 		else
1434 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1435 	}
1436 	return (1);
1437 }
1438 
1439 static struct umtx_pi *
1440 umtx_pi_next(struct umtx_pi *pi)
1441 {
1442 	struct umtx_q *uq_owner;
1443 
1444 	if (pi->pi_owner == NULL)
1445 		return (NULL);
1446 	uq_owner = pi->pi_owner->td_umtxq;
1447 	if (uq_owner == NULL)
1448 		return (NULL);
1449 	return (uq_owner->uq_pi_blocked);
1450 }
1451 
1452 /*
1453  * Floyd's Cycle-Finding Algorithm.
1454  */
1455 static bool
1456 umtx_pi_check_loop(struct umtx_pi *pi)
1457 {
1458 	struct umtx_pi *pi1;	/* fast iterator */
1459 
1460 	mtx_assert(&umtx_lock, MA_OWNED);
1461 	if (pi == NULL)
1462 		return (false);
1463 	pi1 = pi;
1464 	for (;;) {
1465 		pi = umtx_pi_next(pi);
1466 		if (pi == NULL)
1467 			break;
1468 		pi1 = umtx_pi_next(pi1);
1469 		if (pi1 == NULL)
1470 			break;
1471 		pi1 = umtx_pi_next(pi1);
1472 		if (pi1 == NULL)
1473 			break;
1474 		if (pi == pi1)
1475 			return (true);
1476 	}
1477 	return (false);
1478 }
1479 
1480 /*
1481  * Propagate priority when a thread is blocked on POSIX
1482  * PI mutex.
1483  */
1484 static void
1485 umtx_propagate_priority(struct thread *td)
1486 {
1487 	struct umtx_q *uq;
1488 	struct umtx_pi *pi;
1489 	int pri;
1490 
1491 	mtx_assert(&umtx_lock, MA_OWNED);
1492 	pri = UPRI(td);
1493 	uq = td->td_umtxq;
1494 	pi = uq->uq_pi_blocked;
1495 	if (pi == NULL)
1496 		return;
1497 	if (umtx_pi_check_loop(pi))
1498 		return;
1499 
1500 	for (;;) {
1501 		td = pi->pi_owner;
1502 		if (td == NULL || td == curthread)
1503 			return;
1504 
1505 		MPASS(td->td_proc != NULL);
1506 		MPASS(td->td_proc->p_magic == P_MAGIC);
1507 
1508 		thread_lock(td);
1509 		if (td->td_lend_user_pri > pri)
1510 			sched_lend_user_prio(td, pri);
1511 		else {
1512 			thread_unlock(td);
1513 			break;
1514 		}
1515 		thread_unlock(td);
1516 
1517 		/*
1518 		 * Pick up the lock that td is blocked on.
1519 		 */
1520 		uq = td->td_umtxq;
1521 		pi = uq->uq_pi_blocked;
1522 		if (pi == NULL)
1523 			break;
1524 		/* Resort td on the list if needed. */
1525 		umtx_pi_adjust_thread(pi, td);
1526 	}
1527 }
1528 
1529 /*
1530  * Unpropagate priority for a PI mutex when a thread blocked on
1531  * it is interrupted by signal or resumed by others.
1532  */
1533 static void
1534 umtx_repropagate_priority(struct umtx_pi *pi)
1535 {
1536 	struct umtx_q *uq, *uq_owner;
1537 	struct umtx_pi *pi2;
1538 	int pri;
1539 
1540 	mtx_assert(&umtx_lock, MA_OWNED);
1541 
1542 	if (umtx_pi_check_loop(pi))
1543 		return;
1544 	while (pi != NULL && pi->pi_owner != NULL) {
1545 		pri = PRI_MAX;
1546 		uq_owner = pi->pi_owner->td_umtxq;
1547 
1548 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1549 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1550 			if (uq != NULL) {
1551 				if (pri > UPRI(uq->uq_thread))
1552 					pri = UPRI(uq->uq_thread);
1553 			}
1554 		}
1555 
1556 		if (pri > uq_owner->uq_inherited_pri)
1557 			pri = uq_owner->uq_inherited_pri;
1558 		thread_lock(pi->pi_owner);
1559 		sched_lend_user_prio(pi->pi_owner, pri);
1560 		thread_unlock(pi->pi_owner);
1561 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1562 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1563 	}
1564 }
1565 
1566 /*
1567  * Insert a PI mutex into owned list.
1568  */
1569 static void
1570 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1571 {
1572 	struct umtx_q *uq_owner;
1573 
1574 	uq_owner = owner->td_umtxq;
1575 	mtx_assert(&umtx_lock, MA_OWNED);
1576 	MPASS(pi->pi_owner == NULL);
1577 	pi->pi_owner = owner;
1578 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1579 }
1580 
1581 
1582 /*
1583  * Disown a PI mutex, and remove it from the owned list.
1584  */
1585 static void
1586 umtx_pi_disown(struct umtx_pi *pi)
1587 {
1588 
1589 	mtx_assert(&umtx_lock, MA_OWNED);
1590 	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
1591 	pi->pi_owner = NULL;
1592 }
1593 
1594 /*
1595  * Claim ownership of a PI mutex.
1596  */
1597 static int
1598 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1599 {
1600 	struct umtx_q *uq;
1601 	int pri;
1602 
1603 	mtx_lock(&umtx_lock);
1604 	if (pi->pi_owner == owner) {
1605 		mtx_unlock(&umtx_lock);
1606 		return (0);
1607 	}
1608 
1609 	if (pi->pi_owner != NULL) {
1610 		/*
1611 		 * userland may have already messed the mutex, sigh.
1612 		 */
1613 		mtx_unlock(&umtx_lock);
1614 		return (EPERM);
1615 	}
1616 	umtx_pi_setowner(pi, owner);
1617 	uq = TAILQ_FIRST(&pi->pi_blocked);
1618 	if (uq != NULL) {
1619 		pri = UPRI(uq->uq_thread);
1620 		thread_lock(owner);
1621 		if (pri < UPRI(owner))
1622 			sched_lend_user_prio(owner, pri);
1623 		thread_unlock(owner);
1624 	}
1625 	mtx_unlock(&umtx_lock);
1626 	return (0);
1627 }
1628 
1629 /*
1630  * Adjust a thread's order position in its blocked PI mutex,
1631  * this may result new priority propagating process.
1632  */
1633 void
1634 umtx_pi_adjust(struct thread *td, u_char oldpri)
1635 {
1636 	struct umtx_q *uq;
1637 	struct umtx_pi *pi;
1638 
1639 	uq = td->td_umtxq;
1640 	mtx_lock(&umtx_lock);
1641 	/*
1642 	 * Pick up the lock that td is blocked on.
1643 	 */
1644 	pi = uq->uq_pi_blocked;
1645 	if (pi != NULL) {
1646 		umtx_pi_adjust_thread(pi, td);
1647 		umtx_repropagate_priority(pi);
1648 	}
1649 	mtx_unlock(&umtx_lock);
1650 }
1651 
1652 /*
1653  * Sleep on a PI mutex.
1654  */
1655 static int
1656 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
1657     const char *wmesg, struct abs_timeout *timo, bool shared)
1658 {
1659 	struct thread *td, *td1;
1660 	struct umtx_q *uq1;
1661 	int error, pri;
1662 #ifdef INVARIANTS
1663 	struct umtxq_chain *uc;
1664 
1665 	uc = umtxq_getchain(&pi->pi_key);
1666 #endif
1667 	error = 0;
1668 	td = uq->uq_thread;
1669 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1670 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
1671 	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
1672 	umtxq_insert(uq);
1673 	mtx_lock(&umtx_lock);
1674 	if (pi->pi_owner == NULL) {
1675 		mtx_unlock(&umtx_lock);
1676 		td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid);
1677 		mtx_lock(&umtx_lock);
1678 		if (td1 != NULL) {
1679 			if (pi->pi_owner == NULL)
1680 				umtx_pi_setowner(pi, td1);
1681 			PROC_UNLOCK(td1->td_proc);
1682 		}
1683 	}
1684 
1685 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1686 		pri = UPRI(uq1->uq_thread);
1687 		if (pri > UPRI(td))
1688 			break;
1689 	}
1690 
1691 	if (uq1 != NULL)
1692 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1693 	else
1694 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1695 
1696 	uq->uq_pi_blocked = pi;
1697 	thread_lock(td);
1698 	td->td_flags |= TDF_UPIBLOCKED;
1699 	thread_unlock(td);
1700 	umtx_propagate_priority(td);
1701 	mtx_unlock(&umtx_lock);
1702 	umtxq_unbusy(&uq->uq_key);
1703 
1704 	error = umtxq_sleep(uq, wmesg, timo);
1705 	umtxq_remove(uq);
1706 
1707 	mtx_lock(&umtx_lock);
1708 	uq->uq_pi_blocked = NULL;
1709 	thread_lock(td);
1710 	td->td_flags &= ~TDF_UPIBLOCKED;
1711 	thread_unlock(td);
1712 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1713 	umtx_repropagate_priority(pi);
1714 	mtx_unlock(&umtx_lock);
1715 	umtxq_unlock(&uq->uq_key);
1716 
1717 	return (error);
1718 }
1719 
1720 /*
1721  * Add reference count for a PI mutex.
1722  */
1723 static void
1724 umtx_pi_ref(struct umtx_pi *pi)
1725 {
1726 
1727 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&pi->pi_key));
1728 	pi->pi_refcount++;
1729 }
1730 
1731 /*
1732  * Decrease reference count for a PI mutex, if the counter
1733  * is decreased to zero, its memory space is freed.
1734  */
1735 static void
1736 umtx_pi_unref(struct umtx_pi *pi)
1737 {
1738 	struct umtxq_chain *uc;
1739 
1740 	uc = umtxq_getchain(&pi->pi_key);
1741 	UMTXQ_LOCKED_ASSERT(uc);
1742 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1743 	if (--pi->pi_refcount == 0) {
1744 		mtx_lock(&umtx_lock);
1745 		if (pi->pi_owner != NULL)
1746 			umtx_pi_disown(pi);
1747 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1748 			("blocked queue not empty"));
1749 		mtx_unlock(&umtx_lock);
1750 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1751 		umtx_pi_free(pi);
1752 	}
1753 }
1754 
1755 /*
1756  * Find a PI mutex in hash table.
1757  */
1758 static struct umtx_pi *
1759 umtx_pi_lookup(struct umtx_key *key)
1760 {
1761 	struct umtxq_chain *uc;
1762 	struct umtx_pi *pi;
1763 
1764 	uc = umtxq_getchain(key);
1765 	UMTXQ_LOCKED_ASSERT(uc);
1766 
1767 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1768 		if (umtx_key_match(&pi->pi_key, key)) {
1769 			return (pi);
1770 		}
1771 	}
1772 	return (NULL);
1773 }
1774 
1775 /*
1776  * Insert a PI mutex into hash table.
1777  */
1778 static inline void
1779 umtx_pi_insert(struct umtx_pi *pi)
1780 {
1781 	struct umtxq_chain *uc;
1782 
1783 	uc = umtxq_getchain(&pi->pi_key);
1784 	UMTXQ_LOCKED_ASSERT(uc);
1785 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1786 }
1787 
1788 /*
1789  * Lock a PI mutex.
1790  */
1791 static int
1792 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1793     struct _umtx_time *timeout, int try)
1794 {
1795 	struct abs_timeout timo;
1796 	struct umtx_q *uq;
1797 	struct umtx_pi *pi, *new_pi;
1798 	uint32_t id, old_owner, owner, old;
1799 	int error, rv;
1800 
1801 	id = td->td_tid;
1802 	uq = td->td_umtxq;
1803 
1804 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
1805 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
1806 	    &uq->uq_key)) != 0)
1807 		return (error);
1808 
1809 	if (timeout != NULL)
1810 		abs_timeout_init2(&timo, timeout);
1811 
1812 	umtxq_lock(&uq->uq_key);
1813 	pi = umtx_pi_lookup(&uq->uq_key);
1814 	if (pi == NULL) {
1815 		new_pi = umtx_pi_alloc(M_NOWAIT);
1816 		if (new_pi == NULL) {
1817 			umtxq_unlock(&uq->uq_key);
1818 			new_pi = umtx_pi_alloc(M_WAITOK);
1819 			umtxq_lock(&uq->uq_key);
1820 			pi = umtx_pi_lookup(&uq->uq_key);
1821 			if (pi != NULL) {
1822 				umtx_pi_free(new_pi);
1823 				new_pi = NULL;
1824 			}
1825 		}
1826 		if (new_pi != NULL) {
1827 			new_pi->pi_key = uq->uq_key;
1828 			umtx_pi_insert(new_pi);
1829 			pi = new_pi;
1830 		}
1831 	}
1832 	umtx_pi_ref(pi);
1833 	umtxq_unlock(&uq->uq_key);
1834 
1835 	/*
1836 	 * Care must be exercised when dealing with umtx structure.  It
1837 	 * can fault on any access.
1838 	 */
1839 	for (;;) {
1840 		/*
1841 		 * Try the uncontested case.  This should be done in userland.
1842 		 */
1843 		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
1844 		/* The address was invalid. */
1845 		if (rv == -1) {
1846 			error = EFAULT;
1847 			break;
1848 		}
1849 		/* The acquire succeeded. */
1850 		if (rv == 0) {
1851 			MPASS(owner == UMUTEX_UNOWNED);
1852 			error = 0;
1853 			break;
1854 		}
1855 
1856 		if (owner == UMUTEX_RB_NOTRECOV) {
1857 			error = ENOTRECOVERABLE;
1858 			break;
1859 		}
1860 
1861 		/*
1862 		 * Avoid overwriting a possible error from sleep due
1863 		 * to the pending signal with suspension check result.
1864 		 */
1865 		if (error == 0) {
1866 			error = thread_check_susp(td, true);
1867 			if (error != 0)
1868 				break;
1869 		}
1870 
1871 		/* If no one owns it but it is contested try to acquire it. */
1872 		if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) {
1873 			old_owner = owner;
1874 			rv = casueword32(&m->m_owner, owner, &owner,
1875 			    id | UMUTEX_CONTESTED);
1876 			/* The address was invalid. */
1877 			if (rv == -1) {
1878 				error = EFAULT;
1879 				break;
1880 			}
1881 			if (rv == 1) {
1882 				if (error == 0) {
1883 					error = thread_check_susp(td, true);
1884 					if (error != 0)
1885 						break;
1886 				}
1887 
1888 				/*
1889 				 * If this failed the lock could
1890 				 * changed, restart.
1891 				 */
1892 				continue;
1893 			}
1894 
1895 			MPASS(rv == 0);
1896 			MPASS(owner == old_owner);
1897 			umtxq_lock(&uq->uq_key);
1898 			umtxq_busy(&uq->uq_key);
1899 			error = umtx_pi_claim(pi, td);
1900 			umtxq_unbusy(&uq->uq_key);
1901 			umtxq_unlock(&uq->uq_key);
1902 			if (error != 0) {
1903 				/*
1904 				 * Since we're going to return an
1905 				 * error, restore the m_owner to its
1906 				 * previous, unowned state to avoid
1907 				 * compounding the problem.
1908 				 */
1909 				(void)casuword32(&m->m_owner,
1910 				    id | UMUTEX_CONTESTED, old_owner);
1911 			}
1912 			if (error == 0 && old_owner == UMUTEX_RB_OWNERDEAD)
1913 				error = EOWNERDEAD;
1914 			break;
1915 		}
1916 
1917 		if ((owner & ~UMUTEX_CONTESTED) == id) {
1918 			error = EDEADLK;
1919 			break;
1920 		}
1921 
1922 		if (try != 0) {
1923 			error = EBUSY;
1924 			break;
1925 		}
1926 
1927 		/*
1928 		 * If we caught a signal, we have retried and now
1929 		 * exit immediately.
1930 		 */
1931 		if (error != 0)
1932 			break;
1933 
1934 		umtxq_lock(&uq->uq_key);
1935 		umtxq_busy(&uq->uq_key);
1936 		umtxq_unlock(&uq->uq_key);
1937 
1938 		/*
1939 		 * Set the contested bit so that a release in user space
1940 		 * knows to use the system call for unlock.  If this fails
1941 		 * either some one else has acquired the lock or it has been
1942 		 * released.
1943 		 */
1944 		rv = casueword32(&m->m_owner, owner, &old, owner |
1945 		    UMUTEX_CONTESTED);
1946 
1947 		/* The address was invalid. */
1948 		if (rv == -1) {
1949 			umtxq_unbusy_unlocked(&uq->uq_key);
1950 			error = EFAULT;
1951 			break;
1952 		}
1953 		if (rv == 1) {
1954 			umtxq_unbusy_unlocked(&uq->uq_key);
1955 			error = thread_check_susp(td, true);
1956 			if (error != 0)
1957 				break;
1958 
1959 			/*
1960 			 * The lock changed and we need to retry or we
1961 			 * lost a race to the thread unlocking the
1962 			 * umtx.  Note that the UMUTEX_RB_OWNERDEAD
1963 			 * value for owner is impossible there.
1964 			 */
1965 			continue;
1966 		}
1967 
1968 		umtxq_lock(&uq->uq_key);
1969 
1970 		/* We set the contested bit, sleep. */
1971 		MPASS(old == owner);
1972 		error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1973 		    "umtxpi", timeout == NULL ? NULL : &timo,
1974 		    (flags & USYNC_PROCESS_SHARED) != 0);
1975 		if (error != 0)
1976 			continue;
1977 
1978 		error = thread_check_susp(td, false);
1979 		if (error != 0)
1980 			break;
1981 	}
1982 
1983 	umtxq_lock(&uq->uq_key);
1984 	umtx_pi_unref(pi);
1985 	umtxq_unlock(&uq->uq_key);
1986 
1987 	umtx_key_release(&uq->uq_key);
1988 	return (error);
1989 }
1990 
1991 /*
1992  * Unlock a PI mutex.
1993  */
1994 static int
1995 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1996 {
1997 	struct umtx_key key;
1998 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1999 	struct umtx_pi *pi, *pi2;
2000 	uint32_t id, new_owner, old, owner;
2001 	int count, error, pri;
2002 
2003 	id = td->td_tid;
2004 
2005 usrloop:
2006 	/*
2007 	 * Make sure we own this mtx.
2008 	 */
2009 	error = fueword32(&m->m_owner, &owner);
2010 	if (error == -1)
2011 		return (EFAULT);
2012 
2013 	if ((owner & ~UMUTEX_CONTESTED) != id)
2014 		return (EPERM);
2015 
2016 	new_owner = umtx_unlock_val(flags, rb);
2017 
2018 	/* This should be done in userland */
2019 	if ((owner & UMUTEX_CONTESTED) == 0) {
2020 		error = casueword32(&m->m_owner, owner, &old, new_owner);
2021 		if (error == -1)
2022 			return (EFAULT);
2023 		if (error == 1) {
2024 			error = thread_check_susp(td, true);
2025 			if (error != 0)
2026 				return (error);
2027 			goto usrloop;
2028 		}
2029 		if (old == owner)
2030 			return (0);
2031 		owner = old;
2032 	}
2033 
2034 	/* We should only ever be in here for contested locks */
2035 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2036 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
2037 	    &key)) != 0)
2038 		return (error);
2039 
2040 	umtxq_lock(&key);
2041 	umtxq_busy(&key);
2042 	count = umtxq_count_pi(&key, &uq_first);
2043 	if (uq_first != NULL) {
2044 		mtx_lock(&umtx_lock);
2045 		pi = uq_first->uq_pi_blocked;
2046 		KASSERT(pi != NULL, ("pi == NULL?"));
2047 		if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) {
2048 			mtx_unlock(&umtx_lock);
2049 			umtxq_unbusy(&key);
2050 			umtxq_unlock(&key);
2051 			umtx_key_release(&key);
2052 			/* userland messed the mutex */
2053 			return (EPERM);
2054 		}
2055 		uq_me = td->td_umtxq;
2056 		if (pi->pi_owner == td)
2057 			umtx_pi_disown(pi);
2058 		/* get highest priority thread which is still sleeping. */
2059 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
2060 		while (uq_first != NULL &&
2061 		    (uq_first->uq_flags & UQF_UMTXQ) == 0) {
2062 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
2063 		}
2064 		pri = PRI_MAX;
2065 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
2066 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
2067 			if (uq_first2 != NULL) {
2068 				if (pri > UPRI(uq_first2->uq_thread))
2069 					pri = UPRI(uq_first2->uq_thread);
2070 			}
2071 		}
2072 		thread_lock(td);
2073 		sched_lend_user_prio(td, pri);
2074 		thread_unlock(td);
2075 		mtx_unlock(&umtx_lock);
2076 		if (uq_first)
2077 			umtxq_signal_thread(uq_first);
2078 	} else {
2079 		pi = umtx_pi_lookup(&key);
2080 		/*
2081 		 * A umtx_pi can exist if a signal or timeout removed the
2082 		 * last waiter from the umtxq, but there is still
2083 		 * a thread in do_lock_pi() holding the umtx_pi.
2084 		 */
2085 		if (pi != NULL) {
2086 			/*
2087 			 * The umtx_pi can be unowned, such as when a thread
2088 			 * has just entered do_lock_pi(), allocated the
2089 			 * umtx_pi, and unlocked the umtxq.
2090 			 * If the current thread owns it, it must disown it.
2091 			 */
2092 			mtx_lock(&umtx_lock);
2093 			if (pi->pi_owner == td)
2094 				umtx_pi_disown(pi);
2095 			mtx_unlock(&umtx_lock);
2096 		}
2097 	}
2098 	umtxq_unlock(&key);
2099 
2100 	/*
2101 	 * When unlocking the umtx, it must be marked as unowned if
2102 	 * there is zero or one thread only waiting for it.
2103 	 * Otherwise, it must be marked as contested.
2104 	 */
2105 
2106 	if (count > 1)
2107 		new_owner |= UMUTEX_CONTESTED;
2108 again:
2109 	error = casueword32(&m->m_owner, owner, &old, new_owner);
2110 	if (error == 1) {
2111 		error = thread_check_susp(td, false);
2112 		if (error == 0)
2113 			goto again;
2114 	}
2115 	umtxq_unbusy_unlocked(&key);
2116 	umtx_key_release(&key);
2117 	if (error == -1)
2118 		return (EFAULT);
2119 	if (error == 0 && old != owner)
2120 		return (EINVAL);
2121 	return (error);
2122 }
2123 
2124 /*
2125  * Lock a PP mutex.
2126  */
2127 static int
2128 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2129     struct _umtx_time *timeout, int try)
2130 {
2131 	struct abs_timeout timo;
2132 	struct umtx_q *uq, *uq2;
2133 	struct umtx_pi *pi;
2134 	uint32_t ceiling;
2135 	uint32_t owner, id;
2136 	int error, pri, old_inherited_pri, su, rv;
2137 
2138 	id = td->td_tid;
2139 	uq = td->td_umtxq;
2140 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2141 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2142 	    &uq->uq_key)) != 0)
2143 		return (error);
2144 
2145 	if (timeout != NULL)
2146 		abs_timeout_init2(&timo, timeout);
2147 
2148 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2149 	for (;;) {
2150 		old_inherited_pri = uq->uq_inherited_pri;
2151 		umtxq_lock(&uq->uq_key);
2152 		umtxq_busy(&uq->uq_key);
2153 		umtxq_unlock(&uq->uq_key);
2154 
2155 		rv = fueword32(&m->m_ceilings[0], &ceiling);
2156 		if (rv == -1) {
2157 			error = EFAULT;
2158 			goto out;
2159 		}
2160 		ceiling = RTP_PRIO_MAX - ceiling;
2161 		if (ceiling > RTP_PRIO_MAX) {
2162 			error = EINVAL;
2163 			goto out;
2164 		}
2165 
2166 		mtx_lock(&umtx_lock);
2167 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2168 			mtx_unlock(&umtx_lock);
2169 			error = EINVAL;
2170 			goto out;
2171 		}
2172 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2173 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2174 			thread_lock(td);
2175 			if (uq->uq_inherited_pri < UPRI(td))
2176 				sched_lend_user_prio(td, uq->uq_inherited_pri);
2177 			thread_unlock(td);
2178 		}
2179 		mtx_unlock(&umtx_lock);
2180 
2181 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2182 		    id | UMUTEX_CONTESTED);
2183 		/* The address was invalid. */
2184 		if (rv == -1) {
2185 			error = EFAULT;
2186 			break;
2187 		}
2188 		if (rv == 0) {
2189 			MPASS(owner == UMUTEX_CONTESTED);
2190 			error = 0;
2191 			break;
2192 		}
2193 		/* rv == 1 */
2194 		if (owner == UMUTEX_RB_OWNERDEAD) {
2195 			rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD,
2196 			    &owner, id | UMUTEX_CONTESTED);
2197 			if (rv == -1) {
2198 				error = EFAULT;
2199 				break;
2200 			}
2201 			if (rv == 0) {
2202 				MPASS(owner == UMUTEX_RB_OWNERDEAD);
2203 				error = EOWNERDEAD; /* success */
2204 				break;
2205 			}
2206 
2207 			/*
2208 			 *  rv == 1, only check for suspension if we
2209 			 *  did not already catched a signal.  If we
2210 			 *  get an error from the check, the same
2211 			 *  condition is checked by the umtxq_sleep()
2212 			 *  call below, so we should obliterate the
2213 			 *  error to not skip the last loop iteration.
2214 			 */
2215 			if (error == 0) {
2216 				error = thread_check_susp(td, false);
2217 				if (error == 0) {
2218 					if (try != 0)
2219 						error = EBUSY;
2220 					else
2221 						continue;
2222 				}
2223 				error = 0;
2224 			}
2225 		} else if (owner == UMUTEX_RB_NOTRECOV) {
2226 			error = ENOTRECOVERABLE;
2227 		}
2228 
2229 		if (try != 0)
2230 			error = EBUSY;
2231 
2232 		/*
2233 		 * If we caught a signal, we have retried and now
2234 		 * exit immediately.
2235 		 */
2236 		if (error != 0)
2237 			break;
2238 
2239 		umtxq_lock(&uq->uq_key);
2240 		umtxq_insert(uq);
2241 		umtxq_unbusy(&uq->uq_key);
2242 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2243 		    NULL : &timo);
2244 		umtxq_remove(uq);
2245 		umtxq_unlock(&uq->uq_key);
2246 
2247 		mtx_lock(&umtx_lock);
2248 		uq->uq_inherited_pri = old_inherited_pri;
2249 		pri = PRI_MAX;
2250 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2251 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2252 			if (uq2 != NULL) {
2253 				if (pri > UPRI(uq2->uq_thread))
2254 					pri = UPRI(uq2->uq_thread);
2255 			}
2256 		}
2257 		if (pri > uq->uq_inherited_pri)
2258 			pri = uq->uq_inherited_pri;
2259 		thread_lock(td);
2260 		sched_lend_user_prio(td, pri);
2261 		thread_unlock(td);
2262 		mtx_unlock(&umtx_lock);
2263 	}
2264 
2265 	if (error != 0 && error != EOWNERDEAD) {
2266 		mtx_lock(&umtx_lock);
2267 		uq->uq_inherited_pri = old_inherited_pri;
2268 		pri = PRI_MAX;
2269 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2270 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2271 			if (uq2 != NULL) {
2272 				if (pri > UPRI(uq2->uq_thread))
2273 					pri = UPRI(uq2->uq_thread);
2274 			}
2275 		}
2276 		if (pri > uq->uq_inherited_pri)
2277 			pri = uq->uq_inherited_pri;
2278 		thread_lock(td);
2279 		sched_lend_user_prio(td, pri);
2280 		thread_unlock(td);
2281 		mtx_unlock(&umtx_lock);
2282 	}
2283 
2284 out:
2285 	umtxq_unbusy_unlocked(&uq->uq_key);
2286 	umtx_key_release(&uq->uq_key);
2287 	return (error);
2288 }
2289 
/*
 * Unlock a PP mutex.
 *
 * Verifies that the calling thread is the owner recorded in m_owner,
 * writes the unlocked value (always with UMUTEX_CONTESTED set) back to
 * m_owner, wakes one waiter and then recomputes the user priority lent
 * to the calling thread.
 *
 * Returns 0 on success, EFAULT if user memory cannot be accessed, EPERM
 * if the caller does not own the mutex, EINVAL if the value read from
 * m_ceilings[1] is out of range, or the error reported by copyin() or
 * umtx_key_get().
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t id, owner, rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	/* su is non-zero when the thread passes the PRIV_SCHED_RTPRIO check. */
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(&m->m_owner, &owner);
	if (error == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* Read the value from which the new inherited priority is derived. */
	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	/* An all-ones value selects PRI_MAX as the new inherited priority. */
	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		/*
		 * rceiling is unsigned, so a stored value larger than
		 * RTP_PRIO_MAX wraps around here and is rejected by the
		 * range check that follows.
		 */
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	/* Mark the queue busy, then drop the queue lock for the user store. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For priority protected mutex, always set unlocked state
	 * to UMUTEX_CONTESTED, so that userland always enters kernel
	 * to lock the mutex, it is necessary because thread priority
	 * has to be adjusted for such mutex.
	 */
	error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) |
	    UMUTEX_CONTESTED);

	/* Wake one waiter only if the owner word was actually updated. */
	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock(&umtx_lock);
		/*
		 * The inherited priority is only replaced when the
		 * PRIV_SCHED_RTPRIO check above succeeded.
		 */
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		/*
		 * Pick the numerically smallest priority among the first
		 * blocked thread of every entry on uq_pi_contested and the
		 * inherited priority, and lend it to the calling thread.
		 */
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}
2375 
/*
 * Store a new value into m_ceilings[0] of a PP mutex.
 *
 * The mutex must have UMUTEX_PRIO_PROTECT set in m_flags and ceiling must
 * not exceed RTP_PRIO_MAX.  The value is written either while this thread
 * temporarily holds the mutex (acquired here by a compare-and-set on
 * m_owner) or when the calling thread already owns it; otherwise the
 * thread sleeps on the mutex queue and retries.  On success all queued
 * waiters are woken and, if old_ceiling is not NULL, the previous value
 * is stored to that user address.
 *
 * Returns 0 on success, EFAULT on a user memory access failure, EINVAL
 * for a non-PP mutex or an out-of-range ceiling, EOWNERDEAD or
 * ENOTRECOVERABLE when m_owner holds the corresponding robust marker,
 * or the error reported by umtx_key_get() or umtxq_sleep().
 */
static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
    uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t flags, id, owner, save_ceiling;
	int error, rv, rv1;

	error = fueword32(&m->m_flags, &flags);
	if (error == -1)
		return (EFAULT);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	/*
	 * Every exit from this loop leaves the queue marked busy; the code
	 * after the loop clears the busy state.
	 */
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/* Remember the current ceiling so it can be reported back. */
		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/* Try to take the unowned mutex for ourselves. */
		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
		    id | UMUTEX_CONTESTED);
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		if (rv == 0) {
			/*
			 * We hold the mutex now: store the new ceiling and
			 * put m_owner back into the unowned state.
			 */
			MPASS(owner == UMUTEX_CONTESTED);
			rv = suword32(&m->m_ceilings[0], ceiling);
			rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED);
			error = (rv == 0 && rv1 == 0) ? 0: EFAULT;
			break;
		}

		/* The caller already owns the mutex; just store the value. */
		if ((owner & ~UMUTEX_CONTESTED) == id) {
			rv = suword32(&m->m_ceilings[0], ceiling);
			error = rv == 0 ? 0 : EFAULT;
			break;
		}

		if (owner == UMUTEX_RB_OWNERDEAD) {
			error = EOWNERDEAD;
			break;
		} else if (owner == UMUTEX_RB_NOTRECOV) {
			error = ENOTRECOVERABLE;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", NULL);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	/* On success wake every waiter, then release the busy state. */
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	/* Report the previous ceiling only when the update succeeded. */
	if (error == 0 && old_ceiling != NULL) {
		rv = suword32(old_ceiling, save_ceiling);
		error = rv == 0 ? 0 : EFAULT;
	}
	return (error);
}
2468 
2469 /*
2470  * Lock a userland POSIX mutex.
2471  */
2472 static int
2473 do_lock_umutex(struct thread *td, struct umutex *m,
2474     struct _umtx_time *timeout, int mode)
2475 {
2476 	uint32_t flags;
2477 	int error;
2478 
2479 	error = fueword32(&m->m_flags, &flags);
2480 	if (error == -1)
2481 		return (EFAULT);
2482 
2483 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2484 	case 0:
2485 		error = do_lock_normal(td, m, flags, timeout, mode);
2486 		break;
2487 	case UMUTEX_PRIO_INHERIT:
2488 		error = do_lock_pi(td, m, flags, timeout, mode);
2489 		break;
2490 	case UMUTEX_PRIO_PROTECT:
2491 		error = do_lock_pp(td, m, flags, timeout, mode);
2492 		break;
2493 	default:
2494 		return (EINVAL);
2495 	}
2496 	if (timeout == NULL) {
2497 		if (error == EINTR && mode != _UMUTEX_WAIT)
2498 			error = ERESTART;
2499 	} else {
2500 		/* Timed-locking is not restarted. */
2501 		if (error == ERESTART)
2502 			error = EINTR;
2503 	}
2504 	return (error);
2505 }
2506 
2507 /*
2508  * Unlock a userland POSIX mutex.
2509  */
2510 static int
2511 do_unlock_umutex(struct thread *td, struct umutex *m, bool rb)
2512 {
2513 	uint32_t flags;
2514 	int error;
2515 
2516 	error = fueword32(&m->m_flags, &flags);
2517 	if (error == -1)
2518 		return (EFAULT);
2519 
2520 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2521 	case 0:
2522 		return (do_unlock_normal(td, m, flags, rb));
2523 	case UMUTEX_PRIO_INHERIT:
2524 		return (do_unlock_pi(td, m, flags, rb));
2525 	case UMUTEX_PRIO_PROTECT:
2526 		return (do_unlock_pp(td, m, flags, rb));
2527 	}
2528 
2529 	return (EINVAL);
2530 }
2531 
/*
 * Wait on a userland condition variable.
 *
 * The thread is put on the condition variable's queue and c_has_waiters
 * is set before the user mutex m is released, after which the thread
 * sleeps (optionally bounded by timeout).  wflags may carry
 * CVWAIT_CLOCKID, in which case the clock is read from c_clockid, and
 * CVWAIT_ABSTIME, which is forwarded to abs_timeout_init().
 *
 * Returns 0 when the thread is found to be off the queue after the
 * sleep, EFAULT/EINVAL for an unreadable or unsupported clock id, or
 * the error from umtx_key_get(), do_unlock_umutex() or umtxq_sleep()
 * with ERESTART replaced by EINTR.
 */
static int
do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
    struct timespec *timeout, u_long wflags)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags, clockid, hasw;
	int error;

	uq = td->td_umtxq;
	error = fueword32(&cv->c_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if ((wflags & CVWAIT_CLOCKID) != 0) {
		error = fueword32(&cv->c_clockid, &clockid);
		if (error == -1) {
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}
		if (clockid < CLOCK_REALTIME ||
		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
			/* hmm, only HW clock id will work. */
			umtx_key_release(&uq->uq_key);
			return (EINVAL);
		}
	} else {
		clockid = CLOCK_REALTIME;
	}

	/* Queue ourselves before the mutex is released below. */
	umtxq_lock(&uq->uq_key);
	umtxq_busy(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Set c_has_waiters to 1 before releasing user mutex, also
	 * don't modify cache line when unnecessary.
	 */
	error = fueword32(&cv->c_has_waiters, &hasw);
	if (error == 0 && hasw == 0)
		suword32(&cv->c_has_waiters, 1);

	umtxq_unbusy_unlocked(&uq->uq_key);

	/* A failure here skips the sleep but still runs the cleanup below. */
	error = do_unlock_umutex(td, m, false);

	if (timeout != NULL)
		abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0,
		    timeout);

	umtxq_lock(&uq->uq_key);
	if (error == 0) {
		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
		    NULL : &timo);
	}

	/*
	 * UQF_UMTXQ being clear means the thread is no longer queued;
	 * that is reported as success regardless of the earlier error.
	 */
	if ((uq->uq_flags & UQF_UMTXQ) == 0)
		error = 0;
	else {
		/*
		 * This must be a timeout, an interruption by a signal or
		 * a spurious wakeup; clear the c_has_waiters flag when
		 * necessary.
		 */
		umtxq_busy(&uq->uq_key);
		/* Re-check the flag after umtxq_busy() has returned. */
		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
			int oldlen = uq->uq_cur_queue->length;
			umtxq_remove(uq);
			/* We were the only entry: no waiters remain. */
			if (oldlen == 1) {
				umtxq_unlock(&uq->uq_key);
				suword32(&cv->c_has_waiters, 0);
				umtxq_lock(&uq->uq_key);
			}
		}
		umtxq_unbusy(&uq->uq_key);
		if (error == ERESTART)
			error = EINTR;
	}

	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
2619 
2620 /*
2621  * Signal a userland condition variable.
2622  */
2623 static int
2624 do_cv_signal(struct thread *td, struct ucond *cv)
2625 {
2626 	struct umtx_key key;
2627 	int error, cnt, nwake;
2628 	uint32_t flags;
2629 
2630 	error = fueword32(&cv->c_flags, &flags);
2631 	if (error == -1)
2632 		return (EFAULT);
2633 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2634 		return (error);
2635 	umtxq_lock(&key);
2636 	umtxq_busy(&key);
2637 	cnt = umtxq_count(&key);
2638 	nwake = umtxq_signal(&key, 1);
2639 	if (cnt <= nwake) {
2640 		umtxq_unlock(&key);
2641 		error = suword32(&cv->c_has_waiters, 0);
2642 		if (error == -1)
2643 			error = EFAULT;
2644 		umtxq_lock(&key);
2645 	}
2646 	umtxq_unbusy(&key);
2647 	umtxq_unlock(&key);
2648 	umtx_key_release(&key);
2649 	return (error);
2650 }
2651 
2652 static int
2653 do_cv_broadcast(struct thread *td, struct ucond *cv)
2654 {
2655 	struct umtx_key key;
2656 	int error;
2657 	uint32_t flags;
2658 
2659 	error = fueword32(&cv->c_flags, &flags);
2660 	if (error == -1)
2661 		return (EFAULT);
2662 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2663 		return (error);
2664 
2665 	umtxq_lock(&key);
2666 	umtxq_busy(&key);
2667 	umtxq_signal(&key, INT_MAX);
2668 	umtxq_unlock(&key);
2669 
2670 	error = suword32(&cv->c_has_waiters, 0);
2671 	if (error == -1)
2672 		error = EFAULT;
2673 
2674 	umtxq_unbusy_unlocked(&key);
2675 
2676 	umtx_key_release(&key);
2677 	return (error);
2678 }
2679 
/*
 * Acquire a userland rwlock for reading.
 *
 * While none of the bits in wrflags are set in rw_state the reader count
 * is incremented with a compare-and-set.  Otherwise the thread sets
 * URWLOCK_READ_WAITERS, increments rw_blocked_readers and sleeps on the
 * lock's queue until those bits clear, then undoes the bookkeeping and
 * starts over.  wrflags always contains URWLOCK_WRITE_OWNER; it also
 * contains URWLOCK_WRITE_WAITERS unless URWLOCK_PREFER_READER is set in
 * either fflag or the lock's rw_flags.
 *
 * Returns 0 once the read lock is held, EFAULT on a user memory access
 * failure, EAGAIN when the reader count is at URWLOCK_MAX_READERS, or
 * the error from umtx_key_get(), thread_check_susp() or umtxq_sleep().
 * An ERESTART that reaches the end of the function is returned as EINTR.
 */
static int
do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag,
    struct _umtx_time *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags, wrflags;
	int32_t state, oldstate;
	int32_t blocked_readers;
	int error, error1, rv;

	uq = td->td_umtxq;
	error = fueword32(&rwlock->rw_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/* Bits in rw_state that prevent a reader from taking the lock. */
	wrflags = URWLOCK_WRITE_OWNER;
	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
		wrflags |= URWLOCK_WRITE_WAITERS;

	for (;;) {
		rv = fueword32(&rwlock->rw_state, &state);
		if (rv == -1) {
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/* try to lock it */
		while (!(state & wrflags)) {
			if (__predict_false(URWLOCK_READER_COUNT(state) ==
			    URWLOCK_MAX_READERS)) {
				umtx_key_release(&uq->uq_key);
				return (EAGAIN);
			}
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state + 1);
			if (rv == -1) {
				umtx_key_release(&uq->uq_key);
				return (EFAULT);
			}
			if (rv == 0) {
				MPASS(oldstate == state);
				umtx_key_release(&uq->uq_key);
				return (0);
			}
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
			/* Retry with the value the failed CAS reported. */
			state = oldstate;
		}

		if (error)
			break;

		/* grab monitor lock */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * re-read the state, in case it changed between the try-lock above
		 * and the check below
		 */
		rv = fueword32(&rwlock->rw_state, &state);
		if (rv == -1)
			error = EFAULT;

		/* set read contention bit */
		while (error == 0 && (state & wrflags) &&
		    !(state & URWLOCK_READ_WAITERS)) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state | URWLOCK_READ_WAITERS);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 0) {
				MPASS(oldstate == state);
				goto sleep;
			}
			state = oldstate;
			error = thread_check_susp(td, false);
			if (error != 0)
				break;
		}
		if (error != 0) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			break;
		}

		/* state is changed while setting flags, restart */
		if (!(state & wrflags)) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
			continue;
		}

sleep:
		/*
		 * Contention bit is set, before sleeping, increase
		 * read waiter count.
		 */
		rv = fueword32(&rwlock->rw_blocked_readers,
		    &blocked_readers);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);

		/*
		 * Sleep until the blocking bits are gone.  The queue is
		 * busy on entry to each iteration and busy again when the
		 * loop is left.
		 */
		while (state & wrflags) {
			umtxq_lock(&uq->uq_key);
			umtxq_insert(uq);
			umtxq_unbusy(&uq->uq_key);

			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
			    NULL : &timo);

			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			if (error)
				break;
			rv = fueword32(&rwlock->rw_state, &state);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
		}

		/* decrease read waiter count, and may clear read contention bit */
		rv = fueword32(&rwlock->rw_blocked_readers,
		    &blocked_readers);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
		/* We were the last blocked reader: drop URWLOCK_READ_WAITERS. */
		if (blocked_readers == 1) {
			rv = fueword32(&rwlock->rw_state, &state);
			if (rv == -1) {
				umtxq_unbusy_unlocked(&uq->uq_key);
				error = EFAULT;
				break;
			}
			for (;;) {
				rv = casueword32(&rwlock->rw_state, state,
				    &oldstate, state & ~URWLOCK_READ_WAITERS);
				if (rv == -1) {
					error = EFAULT;
					break;
				}
				if (rv == 0) {
					MPASS(oldstate == state);
					break;
				}
				state = oldstate;
				/*
				 * Keep an error from the sleep above; only
				 * record the suspension error if none is
				 * pending.
				 */
				error1 = thread_check_susp(td, false);
				if (error1 != 0) {
					if (error == 0)
						error = error1;
					break;
				}
			}
		}

		umtxq_unbusy_unlocked(&uq->uq_key);
		if (error != 0)
			break;
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
2865 
/*
 * Acquire a userland rwlock for writing.
 *
 * While rw_state shows neither a write owner nor any readers, the thread
 * tries to set URWLOCK_WRITE_OWNER with a compare-and-set.  Otherwise it
 * sets URWLOCK_WRITE_WAITERS, increments rw_blocked_writers and sleeps
 * on the exclusive queue until the lock looks free, then undoes the
 * bookkeeping and starts over.
 *
 * An error raised while sleeping does not leave the main loop directly:
 * the loop makes one more try-lock pass first, and only if that pass
 * does not obtain the lock is the error acted upon (waking queued
 * readers when appropriate).
 *
 * Returns 0 once the write lock is held, EFAULT on a user memory access
 * failure, or the error from umtx_key_get(), thread_check_susp() or
 * umtxq_sleep().  An ERESTART that reaches the end of the function is
 * returned as EINTR.
 */
static int
do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags;
	int32_t state, oldstate;
	int32_t blocked_writers;
	int32_t blocked_readers;
	int error, error1, rv;

	uq = td->td_umtxq;
	error = fueword32(&rwlock->rw_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	blocked_readers = 0;
	for (;;) {
		rv = fueword32(&rwlock->rw_state, &state);
		if (rv == -1) {
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}
		/* Try to take the lock while it has no owner and no readers. */
		while ((state & URWLOCK_WRITE_OWNER) == 0 &&
		    URWLOCK_READER_COUNT(state) == 0) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state | URWLOCK_WRITE_OWNER);
			if (rv == -1) {
				umtx_key_release(&uq->uq_key);
				return (EFAULT);
			}
			if (rv == 0) {
				MPASS(oldstate == state);
				umtx_key_release(&uq->uq_key);
				return (0);
			}
			state = oldstate;
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
		}

		if (error) {
			/*
			 * Giving up.  If no writer owns or waits for the
			 * lock and the last pass saw blocked readers, wake
			 * everything on the shared queue before leaving.
			 */
			if ((state & (URWLOCK_WRITE_OWNER |
			    URWLOCK_WRITE_WAITERS)) == 0 &&
			    blocked_readers != 0) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				umtxq_signal_queue(&uq->uq_key, INT_MAX,
				    UMTX_SHARED_QUEUE);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
			}

			break;
		}

		/* grab monitor lock */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Re-read the state, in case it changed between the
		 * try-lock above and the check below.
		 */
		rv = fueword32(&rwlock->rw_state, &state);
		if (rv == -1)
			error = EFAULT;

		/* Set the write contention bit while the lock is held. */
		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
		    URWLOCK_READER_COUNT(state) != 0) &&
		    (state & URWLOCK_WRITE_WAITERS) == 0) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state | URWLOCK_WRITE_WAITERS);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 0) {
				MPASS(oldstate == state);
				goto sleep;
			}
			state = oldstate;
			error = thread_check_susp(td, false);
			if (error != 0)
				break;
		}
		if (error != 0) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			break;
		}

		/* The lock became free in the meantime; start over. */
		if ((state & URWLOCK_WRITE_OWNER) == 0 &&
		    URWLOCK_READER_COUNT(state) == 0) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, false);
			if (error != 0)
				break;
			continue;
		}
sleep:
		/* Account for ourselves in rw_blocked_writers before sleeping. */
		rv = fueword32(&rwlock->rw_blocked_writers,
		    &blocked_writers);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		suword32(&rwlock->rw_blocked_writers, blocked_writers + 1);

		/*
		 * Sleep on the exclusive queue until the lock has neither
		 * owner nor readers.  The queue is busy on entry to each
		 * iteration and busy again when the loop is left.
		 */
		while ((state & URWLOCK_WRITE_OWNER) ||
		    URWLOCK_READER_COUNT(state) != 0) {
			umtxq_lock(&uq->uq_key);
			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
			umtxq_unbusy(&uq->uq_key);

			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
			    NULL : &timo);

			umtxq_busy(&uq->uq_key);
			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
			umtxq_unlock(&uq->uq_key);
			if (error)
				break;
			rv = fueword32(&rwlock->rw_state, &state);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
		}

		/* Undo the blocked-writer accounting done before the sleep. */
		rv = fueword32(&rwlock->rw_blocked_writers,
		    &blocked_writers);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
		/* We were the last blocked writer: drop URWLOCK_WRITE_WAITERS. */
		if (blocked_writers == 1) {
			rv = fueword32(&rwlock->rw_state, &state);
			if (rv == -1) {
				umtxq_unbusy_unlocked(&uq->uq_key);
				error = EFAULT;
				break;
			}
			for (;;) {
				rv = casueword32(&rwlock->rw_state, state,
				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
				if (rv == -1) {
					error = EFAULT;
					break;
				}
				if (rv == 0) {
					MPASS(oldstate == state);
					break;
				}
				state = oldstate;
				error1 = thread_check_susp(td, false);
				/*
				 * We are leaving the URWLOCK_WRITE_WAITERS
				 * behind, but this should not harm the
				 * correctness.
				 */
				if (error1 != 0) {
					if (error == 0)
						error = error1;
					break;
				}
			}
			/*
			 * Remember how many readers are blocked; the value
			 * is consulted at the top of the loop if this
			 * attempt ends with an error.
			 */
			rv = fueword32(&rwlock->rw_blocked_readers,
			    &blocked_readers);
			if (rv == -1) {
				umtxq_unbusy_unlocked(&uq->uq_key);
				error = EFAULT;
				break;
			}
		} else
			blocked_readers = 0;

		/*
		 * No error check here: a pending error is handled after
		 * the try-lock pass at the top of the loop.
		 */
		umtxq_unbusy_unlocked(&uq->uq_key);
	}

	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
3061 
/*
 * Release a userland rwlock.
 *
 * If rw_state has URWLOCK_WRITE_OWNER set, that bit is cleared;
 * otherwise, if the reader count is non-zero, it is decremented.  A lock
 * that is neither write-owned nor read-held yields EPERM.  After the
 * state change, waiters are woken based on the waiter bits seen in the
 * pre-unlock state: one thread from the exclusive queue or all threads
 * from the shared queue, with the order of preference selected by
 * URWLOCK_PREFER_READER in rw_flags.
 *
 * Returns 0 on success, EFAULT on a user memory access failure, EPERM
 * as described above, or the error from umtx_key_get() or
 * thread_check_susp().
 */
static int
do_rw_unlock(struct thread *td, struct urwlock *rwlock)
{
	struct umtx_q *uq;
	uint32_t flags;
	int32_t state, oldstate;
	int error, rv, q, count;

	uq = td->td_umtxq;
	error = fueword32(&rwlock->rw_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	error = fueword32(&rwlock->rw_state, &state);
	if (error == -1) {
		error = EFAULT;
		goto out;
	}
	if (state & URWLOCK_WRITE_OWNER) {
		/* Write-owned: clear the owner bit, retrying on CAS failure. */
		for (;;) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
			if (rv == -1) {
				error = EFAULT;
				goto out;
			}
			if (rv == 1) {
				state = oldstate;
				/* The owner bit vanished under us. */
				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
					error = EPERM;
					goto out;
				}
				error = thread_check_susp(td, true);
				if (error != 0)
					goto out;
			} else
				break;
		}
	} else if (URWLOCK_READER_COUNT(state) != 0) {
		/* Read-held: drop one reader, retrying on CAS failure. */
		for (;;) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state - 1);
			if (rv == -1) {
				error = EFAULT;
				goto out;
			}
			if (rv == 1) {
				state = oldstate;
				/* The reader count dropped to zero under us. */
				if (URWLOCK_READER_COUNT(oldstate) == 0) {
					error = EPERM;
					goto out;
				}
				error = thread_check_susp(td, true);
				if (error != 0)
					goto out;
			} else
				break;
		}
	} else {
		error = EPERM;
		goto out;
	}

	/*
	 * From here on state holds the value that was in rw_state just
	 * before the successful CAS.  q is only assigned, and only read,
	 * when count ends up non-zero.
	 */
	count = 0;

	if (!(flags & URWLOCK_PREFER_READER)) {
		if (state & URWLOCK_WRITE_WAITERS) {
			count = 1;
			q = UMTX_EXCLUSIVE_QUEUE;
		} else if (state & URWLOCK_READ_WAITERS) {
			count = INT_MAX;
			q = UMTX_SHARED_QUEUE;
		}
	} else {
		if (state & URWLOCK_READ_WAITERS) {
			count = INT_MAX;
			q = UMTX_SHARED_QUEUE;
		} else if (state & URWLOCK_WRITE_WAITERS) {
			count = 1;
			q = UMTX_EXCLUSIVE_QUEUE;
		}
	}

	if (count) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_signal_queue(&uq->uq_key, count, q);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);
	}
out:
	umtx_key_release(&uq->uq_key);
	return (error);
}
3159 
3160 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
/*
 * Wait on a userland semaphore (struct _usem).
 *
 * The thread queues itself, then tries to switch _has_waiters from 0 to
 * 1.  It returns without sleeping when user memory faults or when _count
 * is found to be non-zero right after the flag was set; it retries when
 * the compare-and-set failed although the flag read back as 0.
 * Otherwise it sleeps, optionally bounded by timeout.
 *
 * Returns 0 when no sleep was needed or when the thread is found to be
 * off the queue after the sleep, EFAULT on a user memory access failure,
 * or the error from umtx_key_get(), thread_check_susp() or
 * umtxq_sleep().  ERESTART from the sleep is turned into EINTR when a
 * relative timeout was supplied.
 */
static int
do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags, count, count1;
	int error, rv, rv1;

	uq = td->td_umtxq;
	error = fueword32(&sem->_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

again:
	/* Queue ourselves before looking at the semaphore words. */
	umtxq_lock(&uq->uq_key);
	umtxq_busy(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
	/* rv1 and count are only valid, and only examined, when rv == 0. */
	if (rv == 0)
		rv1 = fueword32(&sem->_count, &count);
	/*
	 * Do not sleep if: the CAS faulted; or the CAS succeeded but the
	 * count could not be read or is non-zero; or the CAS failed even
	 * though _has_waiters read back as 0.
	 */
	if (rv == -1 || (rv == 0 && (rv1 == -1 || count != 0)) ||
	    (rv == 1 && count1 == 0)) {
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		if (rv == 1) {
			/* Failed CAS with the flag still 0: retry. */
			rv = thread_check_susp(td, true);
			if (rv == 0)
				goto again;
			error = rv;
			goto out;
		}
		/* Fold the result of the count read into rv. */
		if (rv == 0)
			rv = rv1;
		error = rv == -1 ? EFAULT : 0;
		goto out;
	}
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);

	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);

	/* No longer queued: report success whatever the sleep returned. */
	if ((uq->uq_flags & UQF_UMTXQ) == 0)
		error = 0;
	else {
		umtxq_remove(uq);
		/* A relative timeout cannot be restarted. */
		if (error == ERESTART && timeout != NULL &&
		    (timeout->_flags & UMTX_ABSTIME) == 0)
			error = EINTR;
	}
	umtxq_unlock(&uq->uq_key);
out:
	umtx_key_release(&uq->uq_key);
	return (error);
}
3225 
3226 /*
3227  * Signal a userland semaphore.
3228  */
3229 static int
3230 do_sem_wake(struct thread *td, struct _usem *sem)
3231 {
3232 	struct umtx_key key;
3233 	int error, cnt;
3234 	uint32_t flags;
3235 
3236 	error = fueword32(&sem->_flags, &flags);
3237 	if (error == -1)
3238 		return (EFAULT);
3239 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3240 		return (error);
3241 	umtxq_lock(&key);
3242 	umtxq_busy(&key);
3243 	cnt = umtxq_count(&key);
3244 	if (cnt > 0) {
3245 		/*
3246 		 * Check if count is greater than 0, this means the memory is
3247 		 * still being referenced by user code, so we can safely
3248 		 * update _has_waiters flag.
3249 		 */
3250 		if (cnt == 1) {
3251 			umtxq_unlock(&key);
3252 			error = suword32(&sem->_has_waiters, 0);
3253 			umtxq_lock(&key);
3254 			if (error == -1)
3255 				error = EFAULT;
3256 		}
3257 		umtxq_signal(&key, 1);
3258 	}
3259 	umtxq_unbusy(&key);
3260 	umtxq_unlock(&key);
3261 	umtx_key_release(&key);
3262 	return (error);
3263 }
3264 #endif
3265 
3266 static int
3267 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
3268 {
3269 	struct abs_timeout timo;
3270 	struct umtx_q *uq;
3271 	uint32_t count, flags;
3272 	int error, rv;
3273 
3274 	uq = td->td_umtxq;
3275 	flags = fuword32(&sem->_flags);
3276 	if (timeout != NULL)
3277 		abs_timeout_init2(&timo, timeout);
3278 
3279 again:
3280 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3281 	if (error != 0)
3282 		return (error);
3283 	umtxq_lock(&uq->uq_key);
3284 	umtxq_busy(&uq->uq_key);
3285 	umtxq_insert(uq);
3286 	umtxq_unlock(&uq->uq_key);
3287 	rv = fueword32(&sem->_count, &count);
3288 	if (rv == -1) {
3289 		umtxq_lock(&uq->uq_key);
3290 		umtxq_unbusy(&uq->uq_key);
3291 		umtxq_remove(uq);
3292 		umtxq_unlock(&uq->uq_key);
3293 		umtx_key_release(&uq->uq_key);
3294 		return (EFAULT);
3295 	}
3296 	for (;;) {
3297 		if (USEM_COUNT(count) != 0) {
3298 			umtxq_lock(&uq->uq_key);
3299 			umtxq_unbusy(&uq->uq_key);
3300 			umtxq_remove(uq);
3301 			umtxq_unlock(&uq->uq_key);
3302 			umtx_key_release(&uq->uq_key);
3303 			return (0);
3304 		}
3305 		if (count == USEM_HAS_WAITERS)
3306 			break;
3307 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
3308 		if (rv == 0)
3309 			break;
3310 		umtxq_lock(&uq->uq_key);
3311 		umtxq_unbusy(&uq->uq_key);
3312 		umtxq_remove(uq);
3313 		umtxq_unlock(&uq->uq_key);
3314 		umtx_key_release(&uq->uq_key);
3315 		if (rv == -1)
3316 			return (EFAULT);
3317 		rv = thread_check_susp(td, true);
3318 		if (rv != 0)
3319 			return (rv);
3320 		goto again;
3321 	}
3322 	umtxq_lock(&uq->uq_key);
3323 	umtxq_unbusy(&uq->uq_key);
3324 
3325 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3326 
3327 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3328 		error = 0;
3329 	else {
3330 		umtxq_remove(uq);
3331 		if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) {
3332 			/* A relative timeout cannot be restarted. */
3333 			if (error == ERESTART)
3334 				error = EINTR;
3335 			if (error == EINTR) {
3336 				abs_timeout_update(&timo);
3337 				timespecsub(&timo.end, &timo.cur,
3338 				    &timeout->_timeout);
3339 			}
3340 		}
3341 	}
3342 	umtxq_unlock(&uq->uq_key);
3343 	umtx_key_release(&uq->uq_key);
3344 	return (error);
3345 }
3346 
/*
 * Signal a userland semaphore.
 *
 * Wakes one thread queued on sem, if there is any.  When exactly one
 * thread is queued, USEM_HAS_WAITERS is cleared from _count before the
 * wakeup.  The wakeup is issued even if clearing the flag failed.
 *
 * Returns 0 on success, EFAULT if user memory cannot be accessed, or
 * the error from umtx_key_get() or thread_check_susp().
 */
static int
do_sem2_wake(struct thread *td, struct _usem2 *sem)
{
	struct umtx_key key;
	int error, cnt, rv;
	uint32_t count, flags;

	rv = fueword32(&sem->_flags, &flags);
	if (rv == -1)
		return (EFAULT);
	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	cnt = umtxq_count(&key);
	if (cnt > 0) {
		/*
		 * If this was the last sleeping thread, clear the waiters
		 * flag in _count.
		 */
		if (cnt == 1) {
			/* The queue lock is dropped around the user access. */
			umtxq_unlock(&key);
			rv = fueword32(&sem->_count, &count);
			/*
			 * Retry the compare-and-set until the flag is gone.
			 * A failed attempt (rv == 1) replaces rv with the
			 * result of thread_check_susp(); a failed
			 * casueword32() also refreshes count.
			 */
			while (rv != -1 && count & USEM_HAS_WAITERS) {
				rv = casueword32(&sem->_count, count, &count,
				    count & ~USEM_HAS_WAITERS);
				if (rv == 1) {
					rv = thread_check_susp(td, true);
					if (rv != 0)
						break;
				}
			}
			/*
			 * rv is now -1 for a fault, 0 for success, or the
			 * value returned by thread_check_susp().
			 */
			if (rv == -1)
				error = EFAULT;
			else if (rv > 0) {
				error = rv;
			}
			umtxq_lock(&key);
		}

		umtxq_signal(&key, 1);
	}
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (error);
}
3397 
/*
 * Copy a struct timespec in from user address addr and validate it.
 *
 * Returns the copyin() error if the copy fails, EINVAL if tv_sec is
 * negative or tv_nsec lies outside [0, 1000000000), and 0 otherwise.
 */
inline int
umtx_copyin_timeout(const void *addr, struct timespec *tsp)
{
	int rc;

	rc = copyin(addr, tsp, sizeof(struct timespec));
	if (rc != 0)
		return (rc);
	if (tsp->tv_sec >= 0 && tsp->tv_nsec >= 0 &&
	    tsp->tv_nsec < 1000000000)
		return (0);
	return (EINVAL);
}
3412 
3413 static inline int
3414 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3415 {
3416 	int error;
3417 
3418 	if (size <= sizeof(struct timespec)) {
3419 		tp->_clockid = CLOCK_REALTIME;
3420 		tp->_flags = 0;
3421 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3422 	} else
3423 		error = copyin(addr, tp, sizeof(struct _umtx_time));
3424 	if (error != 0)
3425 		return (error);
3426 	if (tp->_timeout.tv_sec < 0 ||
3427 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3428 		return (EINVAL);
3429 	return (0);
3430 }
3431 
/*
 * Handler for operations that are not implemented; ignores its
 * arguments and always fails with EOPNOTSUPP.
 */
static int
__umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
{
	int rc = EOPNOTSUPP;

	return (rc);
}
3438 
3439 static int
3440 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3441 {
3442 	struct _umtx_time timeout, *tm_p;
3443 	int error;
3444 
3445 	if (uap->uaddr2 == NULL)
3446 		tm_p = NULL;
3447 	else {
3448 		error = umtx_copyin_umtx_time(
3449 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3450 		if (error != 0)
3451 			return (error);
3452 		tm_p = &timeout;
3453 	}
3454 	return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0));
3455 }
3456 
3457 static int
3458 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3459 {
3460 	struct _umtx_time timeout, *tm_p;
3461 	int error;
3462 
3463 	if (uap->uaddr2 == NULL)
3464 		tm_p = NULL;
3465 	else {
3466 		error = umtx_copyin_umtx_time(
3467 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3468 		if (error != 0)
3469 			return (error);
3470 		tm_p = &timeout;
3471 	}
3472 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
3473 }
3474 
3475 static int
3476 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3477 {
3478 	struct _umtx_time *tm_p, timeout;
3479 	int error;
3480 
3481 	if (uap->uaddr2 == NULL)
3482 		tm_p = NULL;
3483 	else {
3484 		error = umtx_copyin_umtx_time(
3485 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3486 		if (error != 0)
3487 			return (error);
3488 		tm_p = &timeout;
3489 	}
3490 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
3491 }
3492 
3493 static int
3494 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3495 {
3496 
3497 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3498 }
3499 
3500 #define BATCH_SIZE	128
3501 static int
3502 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3503 {
3504 	char *uaddrs[BATCH_SIZE], **upp;
3505 	int count, error, i, pos, tocopy;
3506 
3507 	upp = (char **)uap->obj;
3508 	error = 0;
3509 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
3510 	    pos += tocopy) {
3511 		tocopy = MIN(count, BATCH_SIZE);
3512 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
3513 		if (error != 0)
3514 			break;
3515 		for (i = 0; i < tocopy; ++i)
3516 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3517 		maybe_yield();
3518 	}
3519 	return (error);
3520 }
3521 
3522 static int
3523 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3524 {
3525 
3526 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3527 }
3528 
3529 static int
3530 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3531 {
3532 	struct _umtx_time *tm_p, timeout;
3533 	int error;
3534 
3535 	/* Allow a null timespec (wait forever). */
3536 	if (uap->uaddr2 == NULL)
3537 		tm_p = NULL;
3538 	else {
3539 		error = umtx_copyin_umtx_time(
3540 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3541 		if (error != 0)
3542 			return (error);
3543 		tm_p = &timeout;
3544 	}
3545 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
3546 }
3547 
3548 static int
3549 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3550 {
3551 
3552 	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3553 }
3554 
3555 static int
3556 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3557 {
3558 	struct _umtx_time *tm_p, timeout;
3559 	int error;
3560 
3561 	/* Allow a null timespec (wait forever). */
3562 	if (uap->uaddr2 == NULL)
3563 		tm_p = NULL;
3564 	else {
3565 		error = umtx_copyin_umtx_time(
3566 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3567 		if (error != 0)
3568 			return (error);
3569 		tm_p = &timeout;
3570 	}
3571 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
3572 }
3573 
3574 static int
3575 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3576 {
3577 
3578 	return (do_wake_umutex(td, uap->obj));
3579 }
3580 
3581 static int
3582 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3583 {
3584 
3585 	return (do_unlock_umutex(td, uap->obj, false));
3586 }
3587 
3588 static int
3589 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3590 {
3591 
3592 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3593 }
3594 
3595 static int
3596 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3597 {
3598 	struct timespec *ts, timeout;
3599 	int error;
3600 
3601 	/* Allow a null timespec (wait forever). */
3602 	if (uap->uaddr2 == NULL)
3603 		ts = NULL;
3604 	else {
3605 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3606 		if (error != 0)
3607 			return (error);
3608 		ts = &timeout;
3609 	}
3610 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3611 }
3612 
3613 static int
3614 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3615 {
3616 
3617 	return (do_cv_signal(td, uap->obj));
3618 }
3619 
3620 static int
3621 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3622 {
3623 
3624 	return (do_cv_broadcast(td, uap->obj));
3625 }
3626 
3627 static int
3628 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3629 {
3630 	struct _umtx_time timeout;
3631 	int error;
3632 
3633 	/* Allow a null timespec (wait forever). */
3634 	if (uap->uaddr2 == NULL) {
3635 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3636 	} else {
3637 		error = umtx_copyin_umtx_time(uap->uaddr2,
3638 		   (size_t)uap->uaddr1, &timeout);
3639 		if (error != 0)
3640 			return (error);
3641 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3642 	}
3643 	return (error);
3644 }
3645 
3646 static int
3647 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3648 {
3649 	struct _umtx_time timeout;
3650 	int error;
3651 
3652 	/* Allow a null timespec (wait forever). */
3653 	if (uap->uaddr2 == NULL) {
3654 		error = do_rw_wrlock(td, uap->obj, 0);
3655 	} else {
3656 		error = umtx_copyin_umtx_time(uap->uaddr2,
3657 		   (size_t)uap->uaddr1, &timeout);
3658 		if (error != 0)
3659 			return (error);
3660 
3661 		error = do_rw_wrlock(td, uap->obj, &timeout);
3662 	}
3663 	return (error);
3664 }
3665 
3666 static int
3667 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3668 {
3669 
3670 	return (do_rw_unlock(td, uap->obj));
3671 }
3672 
3673 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3674 static int
3675 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3676 {
3677 	struct _umtx_time *tm_p, timeout;
3678 	int error;
3679 
3680 	/* Allow a null timespec (wait forever). */
3681 	if (uap->uaddr2 == NULL)
3682 		tm_p = NULL;
3683 	else {
3684 		error = umtx_copyin_umtx_time(
3685 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3686 		if (error != 0)
3687 			return (error);
3688 		tm_p = &timeout;
3689 	}
3690 	return (do_sem_wait(td, uap->obj, tm_p));
3691 }
3692 
3693 static int
3694 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3695 {
3696 
3697 	return (do_sem_wake(td, uap->obj));
3698 }
3699 #endif
3700 
3701 static int
3702 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3703 {
3704 
3705 	return (do_wake2_umutex(td, uap->obj, uap->val));
3706 }
3707 
3708 static int
3709 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
3710 {
3711 	struct _umtx_time *tm_p, timeout;
3712 	size_t uasize;
3713 	int error;
3714 
3715 	/* Allow a null timespec (wait forever). */
3716 	if (uap->uaddr2 == NULL) {
3717 		uasize = 0;
3718 		tm_p = NULL;
3719 	} else {
3720 		uasize = (size_t)uap->uaddr1;
3721 		error = umtx_copyin_umtx_time(uap->uaddr2, uasize, &timeout);
3722 		if (error != 0)
3723 			return (error);
3724 		tm_p = &timeout;
3725 	}
3726 	error = do_sem2_wait(td, uap->obj, tm_p);
3727 	if (error == EINTR && uap->uaddr2 != NULL &&
3728 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
3729 	    uasize >= sizeof(struct _umtx_time) + sizeof(struct timespec)) {
3730 		error = copyout(&timeout._timeout,
3731 		    (struct _umtx_time *)uap->uaddr2 + 1,
3732 		    sizeof(struct timespec));
3733 		if (error == 0) {
3734 			error = EINTR;
3735 		}
3736 	}
3737 
3738 	return (error);
3739 }
3740 
3741 static int
3742 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
3743 {
3744 
3745 	return (do_sem2_wake(td, uap->obj));
3746 }
3747 
3748 #define	USHM_OBJ_UMTX(o)						\
3749     ((struct umtx_shm_obj_list *)(&(o)->umtx_data))
3750 
3751 #define	USHMF_REG_LINKED	0x0001
3752 #define	USHMF_OBJ_LINKED	0x0002
3753 struct umtx_shm_reg {
3754 	TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link;
3755 	LIST_ENTRY(umtx_shm_reg) ushm_obj_link;
3756 	struct umtx_key		ushm_key;
3757 	struct ucred		*ushm_cred;
3758 	struct shmfd		*ushm_obj;
3759 	u_int			ushm_refcnt;
3760 	u_int			ushm_flags;
3761 };
3762 
3763 LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg);
3764 TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg);
3765 
3766 static uma_zone_t umtx_shm_reg_zone;
3767 static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS];
3768 static struct mtx umtx_shm_lock;
3769 static struct umtx_shm_reg_head umtx_shm_reg_delfree =
3770     TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree);
3771 
3772 static void umtx_shm_free_reg(struct umtx_shm_reg *reg);
3773 
3774 static void
3775 umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused)
3776 {
3777 	struct umtx_shm_reg_head d;
3778 	struct umtx_shm_reg *reg, *reg1;
3779 
3780 	TAILQ_INIT(&d);
3781 	mtx_lock(&umtx_shm_lock);
3782 	TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link);
3783 	mtx_unlock(&umtx_shm_lock);
3784 	TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) {
3785 		TAILQ_REMOVE(&d, reg, ushm_reg_link);
3786 		umtx_shm_free_reg(reg);
3787 	}
3788 }
3789 
3790 static struct task umtx_shm_reg_delfree_task =
3791     TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL);
3792 
3793 static struct umtx_shm_reg *
3794 umtx_shm_find_reg_locked(const struct umtx_key *key)
3795 {
3796 	struct umtx_shm_reg *reg;
3797 	struct umtx_shm_reg_head *reg_head;
3798 
3799 	KASSERT(key->shared, ("umtx_p_find_rg: private key"));
3800 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3801 	reg_head = &umtx_shm_registry[key->hash];
3802 	TAILQ_FOREACH(reg, reg_head, ushm_reg_link) {
3803 		KASSERT(reg->ushm_key.shared,
3804 		    ("non-shared key on reg %p %d", reg, reg->ushm_key.shared));
3805 		if (reg->ushm_key.info.shared.object ==
3806 		    key->info.shared.object &&
3807 		    reg->ushm_key.info.shared.offset ==
3808 		    key->info.shared.offset) {
3809 			KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM"));
3810 			KASSERT(reg->ushm_refcnt > 0,
3811 			    ("reg %p refcnt 0 onlist", reg));
3812 			KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0,
3813 			    ("reg %p not linked", reg));
3814 			reg->ushm_refcnt++;
3815 			return (reg);
3816 		}
3817 	}
3818 	return (NULL);
3819 }
3820 
3821 static struct umtx_shm_reg *
3822 umtx_shm_find_reg(const struct umtx_key *key)
3823 {
3824 	struct umtx_shm_reg *reg;
3825 
3826 	mtx_lock(&umtx_shm_lock);
3827 	reg = umtx_shm_find_reg_locked(key);
3828 	mtx_unlock(&umtx_shm_lock);
3829 	return (reg);
3830 }
3831 
3832 static void
3833 umtx_shm_free_reg(struct umtx_shm_reg *reg)
3834 {
3835 
3836 	chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0);
3837 	crfree(reg->ushm_cred);
3838 	shm_drop(reg->ushm_obj);
3839 	uma_zfree(umtx_shm_reg_zone, reg);
3840 }
3841 
3842 static bool
3843 umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force)
3844 {
3845 	bool res;
3846 
3847 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3848 	KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg));
3849 	reg->ushm_refcnt--;
3850 	res = reg->ushm_refcnt == 0;
3851 	if (res || force) {
3852 		if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) {
3853 			TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash],
3854 			    reg, ushm_reg_link);
3855 			reg->ushm_flags &= ~USHMF_REG_LINKED;
3856 		}
3857 		if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) {
3858 			LIST_REMOVE(reg, ushm_obj_link);
3859 			reg->ushm_flags &= ~USHMF_OBJ_LINKED;
3860 		}
3861 	}
3862 	return (res);
3863 }
3864 
3865 static void
3866 umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force)
3867 {
3868 	vm_object_t object;
3869 	bool dofree;
3870 
3871 	if (force) {
3872 		object = reg->ushm_obj->shm_object;
3873 		VM_OBJECT_WLOCK(object);
3874 		object->flags |= OBJ_UMTXDEAD;
3875 		VM_OBJECT_WUNLOCK(object);
3876 	}
3877 	mtx_lock(&umtx_shm_lock);
3878 	dofree = umtx_shm_unref_reg_locked(reg, force);
3879 	mtx_unlock(&umtx_shm_lock);
3880 	if (dofree)
3881 		umtx_shm_free_reg(reg);
3882 }
3883 
3884 void
3885 umtx_shm_object_init(vm_object_t object)
3886 {
3887 
3888 	LIST_INIT(USHM_OBJ_UMTX(object));
3889 }
3890 
3891 void
3892 umtx_shm_object_terminated(vm_object_t object)
3893 {
3894 	struct umtx_shm_reg *reg, *reg1;
3895 	bool dofree;
3896 
3897 	if (LIST_EMPTY(USHM_OBJ_UMTX(object)))
3898 		return;
3899 
3900 	dofree = false;
3901 	mtx_lock(&umtx_shm_lock);
3902 	LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) {
3903 		if (umtx_shm_unref_reg_locked(reg, true)) {
3904 			TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg,
3905 			    ushm_reg_link);
3906 			dofree = true;
3907 		}
3908 	}
3909 	mtx_unlock(&umtx_shm_lock);
3910 	if (dofree)
3911 		taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task);
3912 }
3913 
3914 static int
3915 umtx_shm_create_reg(struct thread *td, const struct umtx_key *key,
3916     struct umtx_shm_reg **res)
3917 {
3918 	struct umtx_shm_reg *reg, *reg1;
3919 	struct ucred *cred;
3920 	int error;
3921 
3922 	reg = umtx_shm_find_reg(key);
3923 	if (reg != NULL) {
3924 		*res = reg;
3925 		return (0);
3926 	}
3927 	cred = td->td_ucred;
3928 	if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP)))
3929 		return (ENOMEM);
3930 	reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
3931 	reg->ushm_refcnt = 1;
3932 	bcopy(key, &reg->ushm_key, sizeof(*key));
3933 	reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
3934 	reg->ushm_cred = crhold(cred);
3935 	error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
3936 	if (error != 0) {
3937 		umtx_shm_free_reg(reg);
3938 		return (error);
3939 	}
3940 	mtx_lock(&umtx_shm_lock);
3941 	reg1 = umtx_shm_find_reg_locked(key);
3942 	if (reg1 != NULL) {
3943 		mtx_unlock(&umtx_shm_lock);
3944 		umtx_shm_free_reg(reg);
3945 		*res = reg1;
3946 		return (0);
3947 	}
3948 	reg->ushm_refcnt++;
3949 	TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link);
3950 	LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg,
3951 	    ushm_obj_link);
3952 	reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED;
3953 	mtx_unlock(&umtx_shm_lock);
3954 	*res = reg;
3955 	return (0);
3956 }
3957 
3958 static int
3959 umtx_shm_alive(struct thread *td, void *addr)
3960 {
3961 	vm_map_t map;
3962 	vm_map_entry_t entry;
3963 	vm_object_t object;
3964 	vm_pindex_t pindex;
3965 	vm_prot_t prot;
3966 	int res, ret;
3967 	boolean_t wired;
3968 
3969 	map = &td->td_proc->p_vmspace->vm_map;
3970 	res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry,
3971 	    &object, &pindex, &prot, &wired);
3972 	if (res != KERN_SUCCESS)
3973 		return (EFAULT);
3974 	if (object == NULL)
3975 		ret = EINVAL;
3976 	else
3977 		ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0;
3978 	vm_map_lookup_done(map, entry);
3979 	return (ret);
3980 }
3981 
3982 static void
3983 umtx_shm_init(void)
3984 {
3985 	int i;
3986 
3987 	umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg),
3988 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
3989 	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
3990 	for (i = 0; i < nitems(umtx_shm_registry); i++)
3991 		TAILQ_INIT(&umtx_shm_registry[i]);
3992 }
3993 
3994 static int
3995 umtx_shm(struct thread *td, void *addr, u_int flags)
3996 {
3997 	struct umtx_key key;
3998 	struct umtx_shm_reg *reg;
3999 	struct file *fp;
4000 	int error, fd;
4001 
4002 	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
4003 	    UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1)
4004 		return (EINVAL);
4005 	if ((flags & UMTX_SHM_ALIVE) != 0)
4006 		return (umtx_shm_alive(td, addr));
4007 	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
4008 	if (error != 0)
4009 		return (error);
4010 	KASSERT(key.shared == 1, ("non-shared key"));
4011 	if ((flags & UMTX_SHM_CREAT) != 0) {
4012 		error = umtx_shm_create_reg(td, &key, &reg);
4013 	} else {
4014 		reg = umtx_shm_find_reg(&key);
4015 		if (reg == NULL)
4016 			error = ESRCH;
4017 	}
4018 	umtx_key_release(&key);
4019 	if (error != 0)
4020 		return (error);
4021 	KASSERT(reg != NULL, ("no reg"));
4022 	if ((flags & UMTX_SHM_DESTROY) != 0) {
4023 		umtx_shm_unref_reg(reg, true);
4024 	} else {
4025 #if 0
4026 #ifdef MAC
4027 		error = mac_posixshm_check_open(td->td_ucred,
4028 		    reg->ushm_obj, FFLAGS(O_RDWR));
4029 		if (error == 0)
4030 #endif
4031 			error = shm_access(reg->ushm_obj, td->td_ucred,
4032 			    FFLAGS(O_RDWR));
4033 		if (error == 0)
4034 #endif
4035 			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
4036 		if (error == 0) {
4037 			shm_hold(reg->ushm_obj);
4038 			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
4039 			    &shm_ops);
4040 			td->td_retval[0] = fd;
4041 			fdrop(fp, td);
4042 		}
4043 	}
4044 	umtx_shm_unref_reg(reg, false);
4045 	return (error);
4046 }
4047 
4048 static int
4049 __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap)
4050 {
4051 
4052 	return (umtx_shm(td, uap->uaddr1, uap->val));
4053 }
4054 
4055 static int
4056 umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp)
4057 {
4058 
4059 	td->td_rb_list = rbp->robust_list_offset;
4060 	td->td_rbp_list = rbp->robust_priv_list_offset;
4061 	td->td_rb_inact = rbp->robust_inact_offset;
4062 	return (0);
4063 }
4064 
4065 static int
4066 __umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap)
4067 {
4068 	struct umtx_robust_lists_params rb;
4069 	int error;
4070 
4071 	if (uap->val > sizeof(rb))
4072 		return (EINVAL);
4073 	bzero(&rb, sizeof(rb));
4074 	error = copyin(uap->uaddr1, &rb, uap->val);
4075 	if (error != 0)
4076 		return (error);
4077 	return (umtx_robust_lists(td, &rb));
4078 }
4079 
4080 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
4081 
4082 static const _umtx_op_func op_table[] = {
4083 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
4084 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
4085 	[UMTX_OP_WAIT]		= __umtx_op_wait,
4086 	[UMTX_OP_WAKE]		= __umtx_op_wake,
4087 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
4088 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
4089 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
4090 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
4091 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
4092 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
4093 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
4094 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
4095 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
4096 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
4097 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4098 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
4099 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4100 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
4101 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4102 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4103 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
4104 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4105 #else
4106 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4107 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4108 #endif
4109 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
4110 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4111 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
4112 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4113 	[UMTX_OP_SHM]		= __umtx_op_shm,
4114 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists,
4115 };
4116 
4117 int
4118 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
4119 {
4120 
4121 	if ((unsigned)uap->op < nitems(op_table))
4122 		return (*op_table[uap->op])(td, uap);
4123 	return (EINVAL);
4124 }
4125 
4126 #ifdef COMPAT_FREEBSD32
4127 
4128 struct timespec32 {
4129 	int32_t tv_sec;
4130 	int32_t tv_nsec;
4131 };
4132 
4133 struct umtx_time32 {
4134 	struct	timespec32	timeout;
4135 	uint32_t		flags;
4136 	uint32_t		clockid;
4137 };
4138 
4139 static inline int
4140 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
4141 {
4142 	struct timespec32 ts32;
4143 	int error;
4144 
4145 	error = copyin(addr, &ts32, sizeof(struct timespec32));
4146 	if (error == 0) {
4147 		if (ts32.tv_sec < 0 ||
4148 		    ts32.tv_nsec >= 1000000000 ||
4149 		    ts32.tv_nsec < 0)
4150 			error = EINVAL;
4151 		else {
4152 			tsp->tv_sec = ts32.tv_sec;
4153 			tsp->tv_nsec = ts32.tv_nsec;
4154 		}
4155 	}
4156 	return (error);
4157 }
4158 
4159 static inline int
4160 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
4161 {
4162 	struct umtx_time32 t32;
4163 	int error;
4164 
4165 	t32.clockid = CLOCK_REALTIME;
4166 	t32.flags   = 0;
4167 	if (size <= sizeof(struct timespec32))
4168 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
4169 	else
4170 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
4171 	if (error != 0)
4172 		return (error);
4173 	if (t32.timeout.tv_sec < 0 ||
4174 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
4175 		return (EINVAL);
4176 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
4177 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
4178 	tp->_flags = t32.flags;
4179 	tp->_clockid = t32.clockid;
4180 	return (0);
4181 }
4182 
4183 static int
4184 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4185 {
4186 	struct _umtx_time *tm_p, timeout;
4187 	int error;
4188 
4189 	if (uap->uaddr2 == NULL)
4190 		tm_p = NULL;
4191 	else {
4192 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4193 			(size_t)uap->uaddr1, &timeout);
4194 		if (error != 0)
4195 			return (error);
4196 		tm_p = &timeout;
4197 	}
4198 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
4199 }
4200 
4201 static int
4202 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
4203 {
4204 	struct _umtx_time *tm_p, timeout;
4205 	int error;
4206 
4207 	/* Allow a null timespec (wait forever). */
4208 	if (uap->uaddr2 == NULL)
4209 		tm_p = NULL;
4210 	else {
4211 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4212 			    (size_t)uap->uaddr1, &timeout);
4213 		if (error != 0)
4214 			return (error);
4215 		tm_p = &timeout;
4216 	}
4217 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
4218 }
4219 
4220 static int
4221 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
4222 {
4223 	struct _umtx_time *tm_p, timeout;
4224 	int error;
4225 
4226 	/* Allow a null timespec (wait forever). */
4227 	if (uap->uaddr2 == NULL)
4228 		tm_p = NULL;
4229 	else {
4230 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4231 		    (size_t)uap->uaddr1, &timeout);
4232 		if (error != 0)
4233 			return (error);
4234 		tm_p = &timeout;
4235 	}
4236 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
4237 }
4238 
4239 static int
4240 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4241 {
4242 	struct timespec *ts, timeout;
4243 	int error;
4244 
4245 	/* Allow a null timespec (wait forever). */
4246 	if (uap->uaddr2 == NULL)
4247 		ts = NULL;
4248 	else {
4249 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
4250 		if (error != 0)
4251 			return (error);
4252 		ts = &timeout;
4253 	}
4254 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
4255 }
4256 
4257 static int
4258 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
4259 {
4260 	struct _umtx_time timeout;
4261 	int error;
4262 
4263 	/* Allow a null timespec (wait forever). */
4264 	if (uap->uaddr2 == NULL) {
4265 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
4266 	} else {
4267 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4268 		    (size_t)uap->uaddr1, &timeout);
4269 		if (error != 0)
4270 			return (error);
4271 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
4272 	}
4273 	return (error);
4274 }
4275 
4276 static int
4277 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
4278 {
4279 	struct _umtx_time timeout;
4280 	int error;
4281 
4282 	/* Allow a null timespec (wait forever). */
4283 	if (uap->uaddr2 == NULL) {
4284 		error = do_rw_wrlock(td, uap->obj, 0);
4285 	} else {
4286 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4287 		    (size_t)uap->uaddr1, &timeout);
4288 		if (error != 0)
4289 			return (error);
4290 		error = do_rw_wrlock(td, uap->obj, &timeout);
4291 	}
4292 	return (error);
4293 }
4294 
4295 static int
4296 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
4297 {
4298 	struct _umtx_time *tm_p, timeout;
4299 	int error;
4300 
4301 	if (uap->uaddr2 == NULL)
4302 		tm_p = NULL;
4303 	else {
4304 		error = umtx_copyin_umtx_time32(
4305 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
4306 		if (error != 0)
4307 			return (error);
4308 		tm_p = &timeout;
4309 	}
4310 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
4311 }
4312 
4313 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4314 static int
4315 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4316 {
4317 	struct _umtx_time *tm_p, timeout;
4318 	int error;
4319 
4320 	/* Allow a null timespec (wait forever). */
4321 	if (uap->uaddr2 == NULL)
4322 		tm_p = NULL;
4323 	else {
4324 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4325 		    (size_t)uap->uaddr1, &timeout);
4326 		if (error != 0)
4327 			return (error);
4328 		tm_p = &timeout;
4329 	}
4330 	return (do_sem_wait(td, uap->obj, tm_p));
4331 }
4332 #endif
4333 
4334 static int
4335 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4336 {
4337 	struct _umtx_time *tm_p, timeout;
4338 	size_t uasize;
4339 	int error;
4340 
4341 	/* Allow a null timespec (wait forever). */
4342 	if (uap->uaddr2 == NULL) {
4343 		uasize = 0;
4344 		tm_p = NULL;
4345 	} else {
4346 		uasize = (size_t)uap->uaddr1;
4347 		error = umtx_copyin_umtx_time32(uap->uaddr2, uasize, &timeout);
4348 		if (error != 0)
4349 			return (error);
4350 		tm_p = &timeout;
4351 	}
4352 	error = do_sem2_wait(td, uap->obj, tm_p);
4353 	if (error == EINTR && uap->uaddr2 != NULL &&
4354 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
4355 	    uasize >= sizeof(struct umtx_time32) + sizeof(struct timespec32)) {
4356 		struct timespec32 remain32 = {
4357 			.tv_sec = timeout._timeout.tv_sec,
4358 			.tv_nsec = timeout._timeout.tv_nsec
4359 		};
4360 		error = copyout(&remain32,
4361 		    (struct umtx_time32 *)uap->uaddr2 + 1,
4362 		    sizeof(struct timespec32));
4363 		if (error == 0) {
4364 			error = EINTR;
4365 		}
4366 	}
4367 
4368 	return (error);
4369 }
4370 
4371 static int
4372 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
4373 {
4374 	uint32_t uaddrs[BATCH_SIZE], **upp;
4375 	int count, error, i, pos, tocopy;
4376 
4377 	upp = (uint32_t **)uap->obj;
4378 	error = 0;
4379 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
4380 	    pos += tocopy) {
4381 		tocopy = MIN(count, BATCH_SIZE);
4382 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
4383 		if (error != 0)
4384 			break;
4385 		for (i = 0; i < tocopy; ++i)
4386 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
4387 			    INT_MAX, 1);
4388 		maybe_yield();
4389 	}
4390 	return (error);
4391 }
4392 
4393 struct umtx_robust_lists_params_compat32 {
4394 	uint32_t	robust_list_offset;
4395 	uint32_t	robust_priv_list_offset;
4396 	uint32_t	robust_inact_offset;
4397 };
4398 
4399 static int
4400 __umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap)
4401 {
4402 	struct umtx_robust_lists_params rb;
4403 	struct umtx_robust_lists_params_compat32 rb32;
4404 	int error;
4405 
4406 	if (uap->val > sizeof(rb32))
4407 		return (EINVAL);
4408 	bzero(&rb, sizeof(rb));
4409 	bzero(&rb32, sizeof(rb32));
4410 	error = copyin(uap->uaddr1, &rb32, uap->val);
4411 	if (error != 0)
4412 		return (error);
4413 	rb.robust_list_offset = rb32.robust_list_offset;
4414 	rb.robust_priv_list_offset = rb32.robust_priv_list_offset;
4415 	rb.robust_inact_offset = rb32.robust_inact_offset;
4416 	return (umtx_robust_lists(td, &rb));
4417 }
4418 
4419 static const _umtx_op_func op_table_compat32[] = {
4420 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
4421 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
4422 	[UMTX_OP_WAIT]		= __umtx_op_wait_compat32,
4423 	[UMTX_OP_WAKE]		= __umtx_op_wake,
4424 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
4425 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex_compat32,
4426 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
4427 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
4428 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait_compat32,
4429 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
4430 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
4431 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_compat32,
4432 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock_compat32,
4433 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock_compat32,
4434 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
4435 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
4436 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
4437 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex_compat32,
4438 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
4439 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4440 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait_compat32,
4441 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
4442 #else
4443 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
4444 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
4445 #endif
4446 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private32,
4447 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
4448 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait_compat32,
4449 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
4450 	[UMTX_OP_SHM]		= __umtx_op_shm,
4451 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists_compat32,
4452 };
4453 
4454 int
4455 freebsd32__umtx_op(struct thread *td, struct freebsd32__umtx_op_args *uap)
4456 {
4457 
4458 	if ((unsigned)uap->op < nitems(op_table_compat32)) {
4459 		return (*op_table_compat32[uap->op])(td,
4460 		    (struct _umtx_op_args *)uap);
4461 	}
4462 	return (EINVAL);
4463 }
4464 #endif
4465 
4466 void
4467 umtx_thread_init(struct thread *td)
4468 {
4469 
4470 	td->td_umtxq = umtxq_alloc();
4471 	td->td_umtxq->uq_thread = td;
4472 }
4473 
4474 void
4475 umtx_thread_fini(struct thread *td)
4476 {
4477 
4478 	umtxq_free(td->td_umtxq);
4479 }
4480 
4481 /*
4482  * It will be called when new thread is created, e.g fork().
4483  */
4484 void
4485 umtx_thread_alloc(struct thread *td)
4486 {
4487 	struct umtx_q *uq;
4488 
4489 	uq = td->td_umtxq;
4490 	uq->uq_inherited_pri = PRI_MAX;
4491 
4492 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
4493 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
4494 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
4495 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
4496 }
4497 
4498 /*
4499  * exec() hook.
4500  *
4501  * Clear robust lists for all process' threads, not delaying the
4502  * cleanup to thread_exit hook, since the relevant address space is
4503  * destroyed right now.
4504  */
4505 static void
4506 umtx_exec_hook(void *arg __unused, struct proc *p,
4507     struct image_params *imgp __unused)
4508 {
4509 	struct thread *td;
4510 
4511 	KASSERT(p == curproc, ("need curproc"));
4512 	KASSERT((p->p_flag & P_HADTHREADS) == 0 ||
4513 	    (p->p_flag & P_STOPPED_SINGLE) != 0,
4514 	    ("curproc must be single-threaded"));
4515 	/*
4516 	 * There is no need to lock the list as only this thread can be
4517 	 * running.
4518 	 */
4519 	FOREACH_THREAD_IN_PROC(p, td) {
4520 		KASSERT(td == curthread ||
4521 		    ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)),
4522 		    ("running thread %p %p", p, td));
4523 		umtx_thread_cleanup(td);
4524 		td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0;
4525 	}
4526 }
4527 
/*
 * thread_exit() hook: run the umtx cleanup for the exiting thread.
 */
void
umtx_thread_exit(struct thread *td)
{

	umtx_thread_cleanup(td);
	return;
}
4537 
4538 static int
4539 umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res)
4540 {
4541 	u_long res1;
4542 #ifdef COMPAT_FREEBSD32
4543 	uint32_t res32;
4544 #endif
4545 	int error;
4546 
4547 #ifdef COMPAT_FREEBSD32
4548 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
4549 		error = fueword32((void *)ptr, &res32);
4550 		if (error == 0)
4551 			res1 = res32;
4552 	} else
4553 #endif
4554 	{
4555 		error = fueword((void *)ptr, &res1);
4556 	}
4557 	if (error == 0)
4558 		*res = res1;
4559 	else
4560 		error = EFAULT;
4561 	return (error);
4562 }
4563 
4564 static void
4565 umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list)
4566 {
4567 #ifdef COMPAT_FREEBSD32
4568 	struct umutex32 m32;
4569 
4570 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
4571 		memcpy(&m32, m, sizeof(m32));
4572 		*rb_list = m32.m_rb_lnk;
4573 	} else
4574 #endif
4575 		*rb_list = m->m_rb_lnk;
4576 }
4577 
4578 static int
4579 umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact)
4580 {
4581 	struct umutex m;
4582 	int error;
4583 
4584 	KASSERT(td->td_proc == curproc, ("need current vmspace"));
4585 	error = copyin((void *)rbp, &m, sizeof(m));
4586 	if (error != 0)
4587 		return (error);
4588 	if (rb_list != NULL)
4589 		umtx_read_rb_list(td, &m, rb_list);
4590 	if ((m.m_flags & UMUTEX_ROBUST) == 0)
4591 		return (EINVAL);
4592 	if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid)
4593 		/* inact is cleared after unlock, allow the inconsistency */
4594 		return (inact ? 0 : EINVAL);
4595 	return (do_unlock_umutex(td, (struct umutex *)rbp, true));
4596 }
4597 
4598 static void
4599 umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact,
4600     const char *name)
4601 {
4602 	int error, i;
4603 	uintptr_t rbp;
4604 	bool inact;
4605 
4606 	if (rb_list == 0)
4607 		return;
4608 	error = umtx_read_uptr(td, rb_list, &rbp);
4609 	for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) {
4610 		if (rbp == *rb_inact) {
4611 			inact = true;
4612 			*rb_inact = 0;
4613 		} else
4614 			inact = false;
4615 		error = umtx_handle_rb(td, rbp, &rbp, inact);
4616 	}
4617 	if (i == umtx_max_rb && umtx_verbose_rb) {
4618 		uprintf("comm %s pid %d: reached umtx %smax rb %d\n",
4619 		    td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb);
4620 	}
4621 	if (error != 0 && umtx_verbose_rb) {
4622 		uprintf("comm %s pid %d: handling %srb error %d\n",
4623 		    td->td_proc->p_comm, td->td_proc->p_pid, name, error);
4624 	}
4625 }
4626 
4627 /*
4628  * Clean up umtx data.
4629  */
4630 static void
4631 umtx_thread_cleanup(struct thread *td)
4632 {
4633 	struct umtx_q *uq;
4634 	struct umtx_pi *pi;
4635 	uintptr_t rb_inact;
4636 
4637 	/*
4638 	 * Disown pi mutexes.
4639 	 */
4640 	uq = td->td_umtxq;
4641 	if (uq != NULL) {
4642 		if (uq->uq_inherited_pri != PRI_MAX ||
4643 		    !TAILQ_EMPTY(&uq->uq_pi_contested)) {
4644 			mtx_lock(&umtx_lock);
4645 			uq->uq_inherited_pri = PRI_MAX;
4646 			while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
4647 				pi->pi_owner = NULL;
4648 				TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
4649 			}
4650 			mtx_unlock(&umtx_lock);
4651 		}
4652 		sched_lend_user_prio_cond(td, PRI_MAX);
4653 	}
4654 
4655 	if (td->td_rb_inact == 0 && td->td_rb_list == 0 && td->td_rbp_list == 0)
4656 		return;
4657 
4658 	/*
4659 	 * Handle terminated robust mutexes.  Must be done after
4660 	 * robust pi disown, otherwise unlock could see unowned
4661 	 * entries.
4662 	 */
4663 	rb_inact = td->td_rb_inact;
4664 	if (rb_inact != 0)
4665 		(void)umtx_read_uptr(td, rb_inact, &rb_inact);
4666 	umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "");
4667 	umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ");
4668 	if (rb_inact != 0)
4669 		(void)umtx_handle_rb(td, rb_inact, NULL, true);
4670 }
4671