xref: /freebsd/sys/kern/kern_umtx.c (revision d38c30c092828f4882ce13b08d0bd3fd6dc7afb5)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2015, 2016 The FreeBSD Foundation
5  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
6  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
7  * All rights reserved.
8  *
9  * Portions of this software were developed by Konstantin Belousov
10  * under sponsorship from the FreeBSD Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice unmodified, this list of conditions, and the following
17  *    disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
23  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
24  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
25  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
27  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
31  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include "opt_umtx_profiling.h"
38 
39 #include <sys/param.h>
40 #include <sys/kernel.h>
41 #include <sys/fcntl.h>
42 #include <sys/file.h>
43 #include <sys/filedesc.h>
44 #include <sys/limits.h>
45 #include <sys/lock.h>
46 #include <sys/malloc.h>
47 #include <sys/mman.h>
48 #include <sys/mutex.h>
49 #include <sys/priv.h>
50 #include <sys/proc.h>
51 #include <sys/resource.h>
52 #include <sys/resourcevar.h>
53 #include <sys/rwlock.h>
54 #include <sys/sbuf.h>
55 #include <sys/sched.h>
56 #include <sys/smp.h>
57 #include <sys/sysctl.h>
58 #include <sys/sysent.h>
59 #include <sys/systm.h>
60 #include <sys/sysproto.h>
61 #include <sys/syscallsubr.h>
62 #include <sys/taskqueue.h>
63 #include <sys/time.h>
64 #include <sys/eventhandler.h>
65 #include <sys/umtx.h>
66 
67 #include <security/mac/mac_framework.h>
68 
69 #include <vm/vm.h>
70 #include <vm/vm_param.h>
71 #include <vm/pmap.h>
72 #include <vm/vm_map.h>
73 #include <vm/vm_object.h>
74 
75 #include <machine/atomic.h>
76 #include <machine/cpu.h>
77 
78 #ifdef COMPAT_FREEBSD32
79 #include <compat/freebsd32/freebsd32_proto.h>
80 #endif
81 
/*
 * Values for the 'mode' argument of do_lock_normal().  Any other value
 * requests an ordinary blocking lock.
 */
#define _UMUTEX_TRY		1	/* return EBUSY instead of sleeping */
#define _UMUTEX_WAIT		2	/* wait until the mutex looks free, do not take it */

#ifdef UMTX_PROFILING
/*
 * True when the percentage with whole part 'w' and fractional part 'f'
 * is strictly greater than the one described by 'sw' and 'sf'.
 */
#define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
#endif
89 
/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread. */
	struct thread		*pi_owner;

	/* Reference count. */
	int			pi_refcount;

	/* List entry linking the umtx_pi objects held by a thread. */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in the hash (presumably umtxq_chain.uc_pi_list). */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List of waiters. */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identifies the userland lock object. */
	struct umtx_key		pi_key;
};
110 
/*
 * A user of a userland synchronization object.  Allocated by
 * umtxq_alloc() and released by umtxq_free().
 */
struct umtx_q {
	/* Linked list for the hash (entry in umtxq_queue.head). */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001		/* currently linked on a wait queue */

	/* The thread that waits. */
	struct thread		*uq_thread;

	/*
	 * PI mutex this entry is blocked on.  Readers may hold either
	 * the chain lock or umtx_lock; writers must hold both the chain
	 * lock and umtx_lock.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* Entry on a blocked list (presumably umtx_pi.pi_blocked). */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* List of umtx_pi objects; original note: "thread contending with us". */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Priority inherited from a PP mutex; PRI_MAX after umtxq_alloc(). */
	u_char			uq_inherited_pri;

	/* Spare queue ready to be reused; NULL while we are queued. */
	struct umtxq_queue	*uq_spare_queue;

	/* The queue we are on; NULL when not queued. */
	struct umtxq_queue	*uq_cur_queue;
};
148 
/* Tail queue of waiters (struct umtx_q). */
TAILQ_HEAD(umtxq_head, umtx_q);

/* Per-key wait-queue */
struct umtxq_queue {
	struct umtxq_head	head;	/* waiters, in arrival order */
	struct umtx_key		key;	/* key shared by all waiters in 'head' */
	LIST_ENTRY(umtxq_queue)	link;	/* entry in uc_queue[] or uc_spare_queue */
	int			length;	/* number of waiters in 'head' */
};

/* List of per-key wait-queues. */
LIST_HEAD(umtxq_list, umtxq_queue);
160 
/* Userland lock object's wait-queue chain (one hash bucket). */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* Lists of per-key sleep queues, indexed by queue kind. */
	struct umtxq_list	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	/* Spare umtxq_queue structures donated by queued waiters. */
	LIST_HEAD(, umtxq_queue) uc_spare_queue;

	/* Busy flag; set by umtxq_busy(), cleared by umtxq_unbusy(). */
	char			uc_busy;

	/* Number of threads sleeping in umtxq_busy() for this chain. */
	int			uc_waiters;

	/* All PI in the list */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;

#ifdef UMTX_PROFILING
	/* Number of per-key queues currently linked on uc_queue[]. */
	u_int			length;
	/* Highest value 'length' has reached. */
	u_int			max_length;
#endif
};
187 
/* Assert that the chain's mutex is held by the current thread. */
#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Do not propagate time-sharing priority; there is a security reason.
 * A user could simply create a PI mutex, let thread A lock it and let
 * another thread B block on it.  Because B is sleeping its priority is
 * boosted, which would boost A's priority through priority propagation
 * as well, and A's priority would never be lowered even if it used
 * 100% of the CPU.  That is unfair to other processes.
 */

/*
 * User priority of 'td' for propagation purposes: any time-sharing
 * priority collapses to PRI_MAX_TIMESHARE, other priorities are used
 * unchanged.
 */
#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)

/* Multiplier used by umtxq_hash(). */
#define	GOLDEN_RATIO_PRIME	2654404609U
/* Number of hash buckets per chain array; may be overridden at build time. */
#ifndef	UMTX_CHAINS
#define	UMTX_CHAINS		512
#endif
/* Right shift applied by umtxq_hash(); leaves the top 9 bits of a word. */
#define	UMTX_SHIFTS		(__WORD_BIT - 9)

/* Map USYNC_PROCESS_SHARED in 'flags' to a umtx_key_get() share mode. */
#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

/* Spin iterations umtxq_busy() tries on SMP before sleeping. */
#define BUSY_SPINS		200
213 
/*
 * Timeout tracked as an absolute deadline on a given clock; filled in
 * by abs_timeout_init() and refreshed by abs_timeout_update().
 */
struct abs_timeout {
	int clockid;		/* clock that 'cur' and 'end' are read from */
	bool is_abs_real;	/* TIMER_ABSTIME && CLOCK_REALTIME* */
	struct timespec cur;	/* last clock reading */
	struct timespec end;	/* deadline */
};
220 
#ifdef COMPAT_FREEBSD32
/*
 * 32-bit compatibility view of struct umutex.  The static assertions
 * below check that the size and the offset of m_spare match the
 * native structure.
 */
struct umutex32 {
	volatile __lwpid_t	m_owner;	/* Owner of the mutex */
	__uint32_t		m_flags;	/* Flags of the mutex */
	__uint32_t		m_ceilings[2];	/* Priority protect ceiling */
	__uint32_t		m_rb_lnk;	/* Robust linkage */
	__uint32_t		m_pad;
	__uint32_t		m_spare[2];
};

_Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32");
_Static_assert(__offsetof(struct umutex, m_spare[0]) ==
    __offsetof(struct umutex32, m_spare[0]), "m_spare32");
#endif
235 
/* kern.ipc.umtx_vnode_persistent tunable; see the description string. */
int umtx_shm_vnobj_persistent = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN,
    &umtx_shm_vnobj_persistent, 0,
    "False forces destruction of umtx attached to file, on last close");
/*
 * kern.ipc.umtx_max_robust tunable.
 * NOTE(review): presumably a limit related to robust mutexes; the
 * consumer is not visible here -- confirm before documenting further.
 */
static int umtx_max_rb = 1000;
SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN,
    &umtx_max_rb, 0,
    "");

/* UMA zone backing struct umtx_pi; created in umtxq_sysinit(). */
static uma_zone_t		umtx_pi_zone;
/*
 * Two arrays of hash buckets.  umtxq_getchain() selects array 1 for key
 * types up to TYPE_SEM and array 0 for the rest.
 */
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
/* Exported read-only as debug.umtx.umtx_pi_allocated. */
static int			umtx_pi_allocated;

static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");
/*
 * debug.umtx.robust_faults_verbose tunable.
 * NOTE(review): presumably enables diagnostics on robust-list faults;
 * the consumer is not visible here -- confirm.
 */
static int umtx_verbose_rb = 1;
SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN,
    &umtx_verbose_rb, 0,
    "");

#ifdef UMTX_PROFILING
/* Largest per-chain 'length' seen so far; see umtxq_insert_queue(). */
static long max_length;
SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
#endif
263 
/* Defined after its first user, abs_timeout_init(). */
static void abs_timeout_update(struct abs_timeout *timo);

/* Forward declarations for file-local helpers. */
static void umtx_shm_init(void);
static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
static int umtxq_count(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags,
    bool rb);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
    struct image_params *imgp __unused);
/* Run umtxq_sysinit() once during boot. */
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

/* Convenience wrappers that operate on the shared queue. */
#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

/* Global mutex; initialized in umtxq_sysinit(). */
static struct mtx umtx_lock;
292 
293 #ifdef UMTX_PROFILING
294 static void
295 umtx_init_profiling(void)
296 {
297 	struct sysctl_oid *chain_oid;
298 	char chain_name[10];
299 	int i;
300 
301 	for (i = 0; i < UMTX_CHAINS; ++i) {
302 		snprintf(chain_name, sizeof(chain_name), "%d", i);
303 		chain_oid = SYSCTL_ADD_NODE(NULL,
304 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
305 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
306 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
307 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
308 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
309 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
310 	}
311 }
312 
313 static int
314 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
315 {
316 	char buf[512];
317 	struct sbuf sb;
318 	struct umtxq_chain *uc;
319 	u_int fract, i, j, tot, whole;
320 	u_int sf0, sf1, sf2, sf3, sf4;
321 	u_int si0, si1, si2, si3, si4;
322 	u_int sw0, sw1, sw2, sw3, sw4;
323 
324 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
325 	for (i = 0; i < 2; i++) {
326 		tot = 0;
327 		for (j = 0; j < UMTX_CHAINS; ++j) {
328 			uc = &umtxq_chains[i][j];
329 			mtx_lock(&uc->uc_lock);
330 			tot += uc->max_length;
331 			mtx_unlock(&uc->uc_lock);
332 		}
333 		if (tot == 0)
334 			sbuf_printf(&sb, "%u) Empty ", i);
335 		else {
336 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
337 			si0 = si1 = si2 = si3 = si4 = 0;
338 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
339 			for (j = 0; j < UMTX_CHAINS; j++) {
340 				uc = &umtxq_chains[i][j];
341 				mtx_lock(&uc->uc_lock);
342 				whole = uc->max_length * 100;
343 				mtx_unlock(&uc->uc_lock);
344 				fract = (whole % tot) * 100;
345 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
346 					sf0 = fract;
347 					si0 = j;
348 					sw0 = whole;
349 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
350 				    sf1)) {
351 					sf1 = fract;
352 					si1 = j;
353 					sw1 = whole;
354 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
355 				    sf2)) {
356 					sf2 = fract;
357 					si2 = j;
358 					sw2 = whole;
359 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
360 				    sf3)) {
361 					sf3 = fract;
362 					si3 = j;
363 					sw3 = whole;
364 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
365 				    sf4)) {
366 					sf4 = fract;
367 					si4 = j;
368 					sw4 = whole;
369 				}
370 			}
371 			sbuf_printf(&sb, "queue %u:\n", i);
372 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
373 			    sf0 / tot, si0);
374 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
375 			    sf1 / tot, si1);
376 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
377 			    sf2 / tot, si2);
378 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
379 			    sf3 / tot, si3);
380 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
381 			    sf4 / tot, si4);
382 		}
383 	}
384 	sbuf_trim(&sb);
385 	sbuf_finish(&sb);
386 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
387 	sbuf_delete(&sb);
388 	return (0);
389 }
390 
391 static int
392 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
393 {
394 	struct umtxq_chain *uc;
395 	u_int i, j;
396 	int clear, error;
397 
398 	clear = 0;
399 	error = sysctl_handle_int(oidp, &clear, 0, req);
400 	if (error != 0 || req->newptr == NULL)
401 		return (error);
402 
403 	if (clear != 0) {
404 		for (i = 0; i < 2; ++i) {
405 			for (j = 0; j < UMTX_CHAINS; ++j) {
406 				uc = &umtxq_chains[i][j];
407 				mtx_lock(&uc->uc_lock);
408 				uc->length = 0;
409 				uc->max_length = 0;
410 				mtx_unlock(&uc->uc_lock);
411 			}
412 		}
413 	}
414 	return (0);
415 }
416 
/* debug.umtx.chains.clear: write-triggered reset of the chain statistics. */
SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
    sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
/* debug.umtx.chains.peaks: read-only text report of the largest peaks. */
SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
    sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
423 #endif
424 
425 static void
426 umtxq_sysinit(void *arg __unused)
427 {
428 	int i, j;
429 
430 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
431 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
432 	for (i = 0; i < 2; ++i) {
433 		for (j = 0; j < UMTX_CHAINS; ++j) {
434 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
435 				 MTX_DEF | MTX_DUPOK);
436 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
437 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
438 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
439 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
440 			umtxq_chains[i][j].uc_busy = 0;
441 			umtxq_chains[i][j].uc_waiters = 0;
442 #ifdef UMTX_PROFILING
443 			umtxq_chains[i][j].length = 0;
444 			umtxq_chains[i][j].max_length = 0;
445 #endif
446 		}
447 	}
448 #ifdef UMTX_PROFILING
449 	umtx_init_profiling();
450 #endif
451 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
452 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
453 	    EVENTHANDLER_PRI_ANY);
454 	umtx_shm_init();
455 }
456 
457 struct umtx_q *
458 umtxq_alloc(void)
459 {
460 	struct umtx_q *uq;
461 
462 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
463 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX,
464 	    M_WAITOK | M_ZERO);
465 	TAILQ_INIT(&uq->uq_spare_queue->head);
466 	TAILQ_INIT(&uq->uq_pi_contested);
467 	uq->uq_inherited_pri = PRI_MAX;
468 	return (uq);
469 }
470 
471 void
472 umtxq_free(struct umtx_q *uq)
473 {
474 
475 	MPASS(uq->uq_spare_queue != NULL);
476 	free(uq->uq_spare_queue, M_UMTX);
477 	free(uq, M_UMTX);
478 }
479 
480 static inline void
481 umtxq_hash(struct umtx_key *key)
482 {
483 	unsigned n;
484 
485 	n = (uintptr_t)key->info.both.a + key->info.both.b;
486 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
487 }
488 
489 static inline struct umtxq_chain *
490 umtxq_getchain(struct umtx_key *key)
491 {
492 
493 	if (key->type <= TYPE_SEM)
494 		return (&umtxq_chains[1][key->hash]);
495 	return (&umtxq_chains[0][key->hash]);
496 }
497 
498 /*
499  * Lock a chain.
500  */
501 static inline void
502 umtxq_lock(struct umtx_key *key)
503 {
504 	struct umtxq_chain *uc;
505 
506 	uc = umtxq_getchain(key);
507 	mtx_lock(&uc->uc_lock);
508 }
509 
510 /*
511  * Unlock a chain.
512  */
513 static inline void
514 umtxq_unlock(struct umtx_key *key)
515 {
516 	struct umtxq_chain *uc;
517 
518 	uc = umtxq_getchain(key);
519 	mtx_unlock(&uc->uc_lock);
520 }
521 
522 /*
523  * Set chain to busy state when following operation
524  * may be blocked (kernel mutex can not be used).
525  */
526 static inline void
527 umtxq_busy(struct umtx_key *key)
528 {
529 	struct umtxq_chain *uc;
530 
531 	uc = umtxq_getchain(key);
532 	mtx_assert(&uc->uc_lock, MA_OWNED);
533 	if (uc->uc_busy) {
534 #ifdef SMP
535 		if (smp_cpus > 1) {
536 			int count = BUSY_SPINS;
537 			if (count > 0) {
538 				umtxq_unlock(key);
539 				while (uc->uc_busy && --count > 0)
540 					cpu_spinwait();
541 				umtxq_lock(key);
542 			}
543 		}
544 #endif
545 		while (uc->uc_busy) {
546 			uc->uc_waiters++;
547 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
548 			uc->uc_waiters--;
549 		}
550 	}
551 	uc->uc_busy = 1;
552 }
553 
554 /*
555  * Unbusy a chain.
556  */
557 static inline void
558 umtxq_unbusy(struct umtx_key *key)
559 {
560 	struct umtxq_chain *uc;
561 
562 	uc = umtxq_getchain(key);
563 	mtx_assert(&uc->uc_lock, MA_OWNED);
564 	KASSERT(uc->uc_busy != 0, ("not busy"));
565 	uc->uc_busy = 0;
566 	if (uc->uc_waiters)
567 		wakeup_one(uc);
568 }
569 
570 static inline void
571 umtxq_unbusy_unlocked(struct umtx_key *key)
572 {
573 
574 	umtxq_lock(key);
575 	umtxq_unbusy(key);
576 	umtxq_unlock(key);
577 }
578 
579 static struct umtxq_queue *
580 umtxq_queue_lookup(struct umtx_key *key, int q)
581 {
582 	struct umtxq_queue *uh;
583 	struct umtxq_chain *uc;
584 
585 	uc = umtxq_getchain(key);
586 	UMTXQ_LOCKED_ASSERT(uc);
587 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
588 		if (umtx_key_match(&uh->key, key))
589 			return (uh);
590 	}
591 
592 	return (NULL);
593 }
594 
595 static inline void
596 umtxq_insert_queue(struct umtx_q *uq, int q)
597 {
598 	struct umtxq_queue *uh;
599 	struct umtxq_chain *uc;
600 
601 	uc = umtxq_getchain(&uq->uq_key);
602 	UMTXQ_LOCKED_ASSERT(uc);
603 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
604 	uh = umtxq_queue_lookup(&uq->uq_key, q);
605 	if (uh != NULL) {
606 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
607 	} else {
608 		uh = uq->uq_spare_queue;
609 		uh->key = uq->uq_key;
610 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
611 #ifdef UMTX_PROFILING
612 		uc->length++;
613 		if (uc->length > uc->max_length) {
614 			uc->max_length = uc->length;
615 			if (uc->max_length > max_length)
616 				max_length = uc->max_length;
617 		}
618 #endif
619 	}
620 	uq->uq_spare_queue = NULL;
621 
622 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
623 	uh->length++;
624 	uq->uq_flags |= UQF_UMTXQ;
625 	uq->uq_cur_queue = uh;
626 	return;
627 }
628 
629 static inline void
630 umtxq_remove_queue(struct umtx_q *uq, int q)
631 {
632 	struct umtxq_chain *uc;
633 	struct umtxq_queue *uh;
634 
635 	uc = umtxq_getchain(&uq->uq_key);
636 	UMTXQ_LOCKED_ASSERT(uc);
637 	if (uq->uq_flags & UQF_UMTXQ) {
638 		uh = uq->uq_cur_queue;
639 		TAILQ_REMOVE(&uh->head, uq, uq_link);
640 		uh->length--;
641 		uq->uq_flags &= ~UQF_UMTXQ;
642 		if (TAILQ_EMPTY(&uh->head)) {
643 			KASSERT(uh->length == 0,
644 			    ("inconsistent umtxq_queue length"));
645 #ifdef UMTX_PROFILING
646 			uc->length--;
647 #endif
648 			LIST_REMOVE(uh, link);
649 		} else {
650 			uh = LIST_FIRST(&uc->uc_spare_queue);
651 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
652 			LIST_REMOVE(uh, link);
653 		}
654 		uq->uq_spare_queue = uh;
655 		uq->uq_cur_queue = NULL;
656 	}
657 }
658 
659 /*
660  * Check if there are multiple waiters
661  */
662 static int
663 umtxq_count(struct umtx_key *key)
664 {
665 	struct umtxq_queue *uh;
666 
667 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
668 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
669 	if (uh != NULL)
670 		return (uh->length);
671 	return (0);
672 }
673 
674 /*
675  * Check if there are multiple PI waiters and returns first
676  * waiter.
677  */
678 static int
679 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
680 {
681 	struct umtxq_queue *uh;
682 
683 	*first = NULL;
684 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
685 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
686 	if (uh != NULL) {
687 		*first = TAILQ_FIRST(&uh->head);
688 		return (uh->length);
689 	}
690 	return (0);
691 }
692 
693 /*
694  * Wake up threads waiting on an userland object.
695  */
696 
697 static int
698 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
699 {
700 	struct umtxq_queue *uh;
701 	struct umtx_q *uq;
702 	int ret;
703 
704 	ret = 0;
705 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(key));
706 	uh = umtxq_queue_lookup(key, q);
707 	if (uh != NULL) {
708 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
709 			umtxq_remove_queue(uq, q);
710 			wakeup(uq);
711 			if (++ret >= n_wake)
712 				return (ret);
713 		}
714 	}
715 	return (ret);
716 }
717 
718 /*
719  * Wake up specified thread.
720  */
721 static inline void
722 umtxq_signal_thread(struct umtx_q *uq)
723 {
724 
725 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
726 	umtxq_remove(uq);
727 	wakeup(uq);
728 }
729 
730 static inline int
731 tstohz(const struct timespec *tsp)
732 {
733 	struct timeval tv;
734 
735 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
736 	return tvtohz(&tv);
737 }
738 
739 static void
740 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
741 	const struct timespec *timeout)
742 {
743 
744 	timo->clockid = clockid;
745 	if (!absolute) {
746 		timo->is_abs_real = false;
747 		abs_timeout_update(timo);
748 		timespecadd(&timo->cur, timeout, &timo->end);
749 	} else {
750 		timo->end = *timeout;
751 		timo->is_abs_real = clockid == CLOCK_REALTIME ||
752 		    clockid == CLOCK_REALTIME_FAST ||
753 		    clockid == CLOCK_REALTIME_PRECISE;
754 		/*
755 		 * If is_abs_real, umtxq_sleep will read the clock
756 		 * after setting td_rtcgen; otherwise, read it here.
757 		 */
758 		if (!timo->is_abs_real) {
759 			abs_timeout_update(timo);
760 		}
761 	}
762 }
763 
764 static void
765 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
766 {
767 
768 	abs_timeout_init(timo, umtxtime->_clockid,
769 	    (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout);
770 }
771 
772 static inline void
773 abs_timeout_update(struct abs_timeout *timo)
774 {
775 
776 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
777 }
778 
779 static int
780 abs_timeout_gethz(struct abs_timeout *timo)
781 {
782 	struct timespec tts;
783 
784 	if (timespeccmp(&timo->end, &timo->cur, <=))
785 		return (-1);
786 	timespecsub(&timo->end, &timo->cur, &tts);
787 	return (tstohz(&tts));
788 }
789 
790 static uint32_t
791 umtx_unlock_val(uint32_t flags, bool rb)
792 {
793 
794 	if (rb)
795 		return (UMUTEX_RB_OWNERDEAD);
796 	else if ((flags & UMUTEX_NONCONSISTENT) != 0)
797 		return (UMUTEX_RB_NOTRECOV);
798 	else
799 		return (UMUTEX_UNOWNED);
800 
801 }
802 
803 /*
804  * Put thread into sleep state, before sleeping, check if
805  * thread was removed from umtx queue.
806  */
807 static inline int
808 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
809 {
810 	struct umtxq_chain *uc;
811 	int error, timo;
812 
813 	if (abstime != NULL && abstime->is_abs_real) {
814 		curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation);
815 		abs_timeout_update(abstime);
816 	}
817 
818 	uc = umtxq_getchain(&uq->uq_key);
819 	UMTXQ_LOCKED_ASSERT(uc);
820 	for (;;) {
821 		if (!(uq->uq_flags & UQF_UMTXQ)) {
822 			error = 0;
823 			break;
824 		}
825 		if (abstime != NULL) {
826 			timo = abs_timeout_gethz(abstime);
827 			if (timo < 0) {
828 				error = ETIMEDOUT;
829 				break;
830 			}
831 		} else
832 			timo = 0;
833 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
834 		if (error == EINTR || error == ERESTART) {
835 			umtxq_lock(&uq->uq_key);
836 			break;
837 		}
838 		if (abstime != NULL) {
839 			if (abstime->is_abs_real)
840 				curthread->td_rtcgen =
841 				    atomic_load_acq_int(&rtc_generation);
842 			abs_timeout_update(abstime);
843 		}
844 		umtxq_lock(&uq->uq_key);
845 	}
846 
847 	curthread->td_rtcgen = 0;
848 	return (error);
849 }
850 
851 /*
852  * Convert userspace address into unique logical address.
853  */
854 int
855 umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
856 {
857 	struct thread *td = curthread;
858 	vm_map_t map;
859 	vm_map_entry_t entry;
860 	vm_pindex_t pindex;
861 	vm_prot_t prot;
862 	boolean_t wired;
863 
864 	key->type = type;
865 	if (share == THREAD_SHARE) {
866 		key->shared = 0;
867 		key->info.private.vs = td->td_proc->p_vmspace;
868 		key->info.private.addr = (uintptr_t)addr;
869 	} else {
870 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
871 		map = &td->td_proc->p_vmspace->vm_map;
872 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
873 		    &entry, &key->info.shared.object, &pindex, &prot,
874 		    &wired) != KERN_SUCCESS) {
875 			return (EFAULT);
876 		}
877 
878 		if ((share == PROCESS_SHARE) ||
879 		    (share == AUTO_SHARE &&
880 		     VM_INHERIT_SHARE == entry->inheritance)) {
881 			key->shared = 1;
882 			key->info.shared.offset = (vm_offset_t)addr -
883 			    entry->start + entry->offset;
884 			vm_object_reference(key->info.shared.object);
885 		} else {
886 			key->shared = 0;
887 			key->info.private.vs = td->td_proc->p_vmspace;
888 			key->info.private.addr = (uintptr_t)addr;
889 		}
890 		vm_map_lookup_done(map, entry);
891 	}
892 
893 	umtxq_hash(key);
894 	return (0);
895 }
896 
897 /*
898  * Release key.
899  */
900 void
901 umtx_key_release(struct umtx_key *key)
902 {
903 	if (key->shared)
904 		vm_object_deallocate(key->info.shared.object);
905 }
906 
907 /*
908  * Fetch and compare value, sleep on the address if value is not changed.
909  */
910 static int
911 do_wait(struct thread *td, void *addr, u_long id,
912     struct _umtx_time *timeout, int compat32, int is_private)
913 {
914 	struct abs_timeout timo;
915 	struct umtx_q *uq;
916 	u_long tmp;
917 	uint32_t tmp32;
918 	int error = 0;
919 
920 	uq = td->td_umtxq;
921 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
922 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
923 		return (error);
924 
925 	if (timeout != NULL)
926 		abs_timeout_init2(&timo, timeout);
927 
928 	umtxq_lock(&uq->uq_key);
929 	umtxq_insert(uq);
930 	umtxq_unlock(&uq->uq_key);
931 	if (compat32 == 0) {
932 		error = fueword(addr, &tmp);
933 		if (error != 0)
934 			error = EFAULT;
935 	} else {
936 		error = fueword32(addr, &tmp32);
937 		if (error == 0)
938 			tmp = tmp32;
939 		else
940 			error = EFAULT;
941 	}
942 	umtxq_lock(&uq->uq_key);
943 	if (error == 0) {
944 		if (tmp == id)
945 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
946 			    NULL : &timo);
947 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
948 			error = 0;
949 		else
950 			umtxq_remove(uq);
951 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
952 		umtxq_remove(uq);
953 	}
954 	umtxq_unlock(&uq->uq_key);
955 	umtx_key_release(&uq->uq_key);
956 	if (error == ERESTART)
957 		error = EINTR;
958 	return (error);
959 }
960 
961 /*
962  * Wake up threads sleeping on the specified address.
963  */
964 int
965 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
966 {
967 	struct umtx_key key;
968 	int ret;
969 
970 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
971 	    is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
972 		return (ret);
973 	umtxq_lock(&key);
974 	umtxq_signal(&key, n_wake);
975 	umtxq_unlock(&key);
976 	umtx_key_release(&key);
977 	return (0);
978 }
979 
/*
 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
 *
 * td       calling thread; its tid is the owner id stored in m_owner
 * m        userspace mutex; every access to it may fault
 * flags    mutex flags, used through GET_SHARE() to pick the key type
 * timeout  optional timeout for the sleep, NULL for none
 * mode     _UMUTEX_TRY: return EBUSY instead of sleeping;
 *          _UMUTEX_WAIT: only wait until the mutex looks free, without
 *          acquiring it; any other value: acquire, sleeping as needed
 *
 * Returns 0 on success; EOWNERDEAD when ownership was taken over from
 * a dead owner (still a successful acquire); ENOTRECOVERABLE when the
 * mutex is marked UMUTEX_RB_NOTRECOV; EBUSY in try mode; EFAULT on a
 * bad address; otherwise the error from umtx_key_get(), umtxq_sleep()
 * or thread_check_susp().
 */
static int
do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
    struct _umtx_time *timeout, int mode)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error, rv;

	id = td->td_tid;
	uq = td->td_umtxq;
	error = 0;
	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		rv = fueword32(&m->m_owner, &owner);
		if (rv == -1)
			return (EFAULT);
		if (mode == _UMUTEX_WAIT) {
			/* Wait mode is done as soon as nobody owns the mutex. */
			if (owner == UMUTEX_UNOWNED ||
			    owner == UMUTEX_CONTESTED ||
			    owner == UMUTEX_RB_OWNERDEAD ||
			    owner == UMUTEX_RB_NOTRECOV)
				return (0);
		} else {
			/*
			 * Robust mutex terminated.  Kernel duty is to
			 * return EOWNERDEAD to the userspace.  The
			 * umutex.m_flags UMUTEX_NONCONSISTENT is set
			 * by the common userspace code.
			 */
			if (owner == UMUTEX_RB_OWNERDEAD) {
				rv = casueword32(&m->m_owner,
				    UMUTEX_RB_OWNERDEAD, &owner,
				    id | UMUTEX_CONTESTED);
				if (rv == -1)
					return (EFAULT);
				if (rv == 0) {
					MPASS(owner == UMUTEX_RB_OWNERDEAD);
					return (EOWNERDEAD); /* success */
				}
				/* The CAS did not store; check for suspension, retry. */
				MPASS(rv == 1);
				rv = thread_check_susp(td, false);
				if (rv != 0)
					return (rv);
				continue;
			}
			if (owner == UMUTEX_RB_NOTRECOV)
				return (ENOTRECOVERABLE);

			/*
			 * Try the uncontested case.  This should be
			 * done in userland.
			 */
			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
			    &owner, id);
			/* The address was invalid. */
			if (rv == -1)
				return (EFAULT);

			/* The acquire succeeded. */
			if (rv == 0) {
				MPASS(owner == UMUTEX_UNOWNED);
				return (0);
			}

			/*
			 * If no one owns it but it is contested try
			 * to acquire it.
			 */
			MPASS(rv == 1);
			if (owner == UMUTEX_CONTESTED) {
				rv = casueword32(&m->m_owner,
				    UMUTEX_CONTESTED, &owner,
				    id | UMUTEX_CONTESTED);
				/* The address was invalid. */
				if (rv == -1)
					return (EFAULT);
				if (rv == 0) {
					MPASS(owner == UMUTEX_CONTESTED);
					return (0);
				}
				if (rv == 1) {
					rv = thread_check_susp(td, false);
					if (rv != 0)
						return (rv);
				}

				/*
				 * If this failed the lock has
				 * changed, restart.
				 */
				continue;
			}

			/* rv == 1 but not contested, likely store failure */
			rv = thread_check_susp(td, false);
			if (rv != 0)
				return (rv);
		}

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		/*
		 * Queue ourselves and keep the chain busy while the
		 * userspace word is updated with the chain lock dropped.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		rv = casueword32(&m->m_owner, owner, &old,
		    owner | UMUTEX_CONTESTED);

		/* The address was invalid or casueword failed to store. */
		if (rv == -1 || rv == 1) {
			/* Undo the queueing before failing or retrying. */
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			if (rv == -1)
				return (EFAULT);
			if (rv == 1) {
				rv = thread_check_susp(td, false);
				if (rv != 0)
					return (rv);
			}
			continue;
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		MPASS(old == owner);
		error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
		    NULL : &timo);
		/* No-op if a waker already dequeued us. */
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);

		/*
		 * A sleep error is kept in 'error'; the loop retries the
		 * lock once more and the check above then returns it.
		 */
		if (error == 0)
			error = thread_check_susp(td, false);
	}

	/* Not reached: the loop above only exits through return. */
	return (0);
}
1154 
1155 /*
1156  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1157  */
1158 static int
1159 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1160 {
1161 	struct umtx_key key;
1162 	uint32_t owner, old, id, newlock;
1163 	int error, count;
1164 
1165 	id = td->td_tid;
1166 
1167 again:
1168 	/*
1169 	 * Make sure we own this mtx.
1170 	 */
1171 	error = fueword32(&m->m_owner, &owner);
1172 	if (error == -1)
1173 		return (EFAULT);
1174 
1175 	if ((owner & ~UMUTEX_CONTESTED) != id)
1176 		return (EPERM);
1177 
1178 	newlock = umtx_unlock_val(flags, rb);
1179 	if ((owner & UMUTEX_CONTESTED) == 0) {
1180 		error = casueword32(&m->m_owner, owner, &old, newlock);
1181 		if (error == -1)
1182 			return (EFAULT);
1183 		if (error == 1) {
1184 			error = thread_check_susp(td, false);
1185 			if (error != 0)
1186 				return (error);
1187 			goto again;
1188 		}
1189 		MPASS(old == owner);
1190 		return (0);
1191 	}
1192 
1193 	/* We should only ever be in here for contested locks */
1194 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1195 	    &key)) != 0)
1196 		return (error);
1197 
1198 	umtxq_lock(&key);
1199 	umtxq_busy(&key);
1200 	count = umtxq_count(&key);
1201 	umtxq_unlock(&key);
1202 
1203 	/*
1204 	 * When unlocking the umtx, it must be marked as unowned if
1205 	 * there is zero or one thread only waiting for it.
1206 	 * Otherwise, it must be marked as contested.
1207 	 */
1208 	if (count > 1)
1209 		newlock |= UMUTEX_CONTESTED;
1210 	error = casueword32(&m->m_owner, owner, &old, newlock);
1211 	umtxq_lock(&key);
1212 	umtxq_signal(&key, 1);
1213 	umtxq_unbusy(&key);
1214 	umtxq_unlock(&key);
1215 	umtx_key_release(&key);
1216 	if (error == -1)
1217 		return (EFAULT);
1218 	if (error == 1) {
1219 		if (old != owner)
1220 			return (EINVAL);
1221 		error = thread_check_susp(td, false);
1222 		if (error != 0)
1223 			return (error);
1224 		goto again;
1225 	}
1226 	return (0);
1227 }
1228 
1229 /*
1230  * Check if the mutex is available and wake up a waiter,
1231  * only for simple mutex.
1232  */
1233 static int
1234 do_wake_umutex(struct thread *td, struct umutex *m)
1235 {
1236 	struct umtx_key key;
1237 	uint32_t owner;
1238 	uint32_t flags;
1239 	int error;
1240 	int count;
1241 
1242 again:
1243 	error = fueword32(&m->m_owner, &owner);
1244 	if (error == -1)
1245 		return (EFAULT);
1246 
1247 	if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD &&
1248 	    owner != UMUTEX_RB_NOTRECOV)
1249 		return (0);
1250 
1251 	error = fueword32(&m->m_flags, &flags);
1252 	if (error == -1)
1253 		return (EFAULT);
1254 
1255 	/* We should only ever be in here for contested locks */
1256 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1257 	    &key)) != 0)
1258 		return (error);
1259 
1260 	umtxq_lock(&key);
1261 	umtxq_busy(&key);
1262 	count = umtxq_count(&key);
1263 	umtxq_unlock(&key);
1264 
1265 	if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD &&
1266 	    owner != UMUTEX_RB_NOTRECOV) {
1267 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1268 		    UMUTEX_UNOWNED);
1269 		if (error == -1) {
1270 			error = EFAULT;
1271 		} else if (error == 1) {
1272 			umtxq_lock(&key);
1273 			umtxq_unbusy(&key);
1274 			umtxq_unlock(&key);
1275 			umtx_key_release(&key);
1276 			error = thread_check_susp(td, false);
1277 			if (error != 0)
1278 				return (error);
1279 			goto again;
1280 		}
1281 	}
1282 
1283 	umtxq_lock(&key);
1284 	if (error == 0 && count != 0) {
1285 		MPASS((owner & ~UMUTEX_CONTESTED) == 0 ||
1286 		    owner == UMUTEX_RB_OWNERDEAD ||
1287 		    owner == UMUTEX_RB_NOTRECOV);
1288 		umtxq_signal(&key, 1);
1289 	}
1290 	umtxq_unbusy(&key);
1291 	umtxq_unlock(&key);
1292 	umtx_key_release(&key);
1293 	return (error);
1294 }
1295 
1296 /*
1297  * Check if the mutex has waiters and tries to fix contention bit.
1298  */
1299 static int
1300 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1301 {
1302 	struct umtx_key key;
1303 	uint32_t owner, old;
1304 	int type;
1305 	int error;
1306 	int count;
1307 
1308 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT |
1309 	    UMUTEX_ROBUST)) {
1310 	case 0:
1311 	case UMUTEX_ROBUST:
1312 		type = TYPE_NORMAL_UMUTEX;
1313 		break;
1314 	case UMUTEX_PRIO_INHERIT:
1315 		type = TYPE_PI_UMUTEX;
1316 		break;
1317 	case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST):
1318 		type = TYPE_PI_ROBUST_UMUTEX;
1319 		break;
1320 	case UMUTEX_PRIO_PROTECT:
1321 		type = TYPE_PP_UMUTEX;
1322 		break;
1323 	case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST):
1324 		type = TYPE_PP_ROBUST_UMUTEX;
1325 		break;
1326 	default:
1327 		return (EINVAL);
1328 	}
1329 	if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0)
1330 		return (error);
1331 
1332 	owner = 0;
1333 	umtxq_lock(&key);
1334 	umtxq_busy(&key);
1335 	count = umtxq_count(&key);
1336 	umtxq_unlock(&key);
1337 
1338 	error = fueword32(&m->m_owner, &owner);
1339 	if (error == -1)
1340 		error = EFAULT;
1341 
1342 	/*
1343 	 * Only repair contention bit if there is a waiter, this means
1344 	 * the mutex is still being referenced by userland code,
1345 	 * otherwise don't update any memory.
1346 	 */
1347 	while (error == 0 && (owner & UMUTEX_CONTESTED) == 0 &&
1348 	    (count > 1 || (count == 1 && (owner & ~UMUTEX_CONTESTED) != 0))) {
1349 		error = casueword32(&m->m_owner, owner, &old,
1350 		    owner | UMUTEX_CONTESTED);
1351 		if (error == -1) {
1352 			error = EFAULT;
1353 			break;
1354 		}
1355 		if (error == 0) {
1356 			MPASS(old == owner);
1357 			break;
1358 		}
1359 		owner = old;
1360 		error = thread_check_susp(td, false);
1361 	}
1362 
1363 	umtxq_lock(&key);
1364 	if (error == EFAULT) {
1365 		umtxq_signal(&key, INT_MAX);
1366 	} else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
1367 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
1368 		umtxq_signal(&key, 1);
1369 	umtxq_unbusy(&key);
1370 	umtxq_unlock(&key);
1371 	umtx_key_release(&key);
1372 	return (error);
1373 }
1374 
1375 static inline struct umtx_pi *
1376 umtx_pi_alloc(int flags)
1377 {
1378 	struct umtx_pi *pi;
1379 
1380 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1381 	TAILQ_INIT(&pi->pi_blocked);
1382 	atomic_add_int(&umtx_pi_allocated, 1);
1383 	return (pi);
1384 }
1385 
1386 static inline void
1387 umtx_pi_free(struct umtx_pi *pi)
1388 {
1389 	uma_zfree(umtx_pi_zone, pi);
1390 	atomic_add_int(&umtx_pi_allocated, -1);
1391 }
1392 
1393 /*
1394  * Adjust the thread's position on a pi_state after its priority has been
1395  * changed.
1396  */
1397 static int
1398 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1399 {
1400 	struct umtx_q *uq, *uq1, *uq2;
1401 	struct thread *td1;
1402 
1403 	mtx_assert(&umtx_lock, MA_OWNED);
1404 	if (pi == NULL)
1405 		return (0);
1406 
1407 	uq = td->td_umtxq;
1408 
1409 	/*
1410 	 * Check if the thread needs to be moved on the blocked chain.
1411 	 * It needs to be moved if either its priority is lower than
1412 	 * the previous thread or higher than the next thread.
1413 	 */
1414 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1415 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1416 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1417 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1418 		/*
1419 		 * Remove thread from blocked chain and determine where
1420 		 * it should be moved to.
1421 		 */
1422 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1423 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1424 			td1 = uq1->uq_thread;
1425 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1426 			if (UPRI(td1) > UPRI(td))
1427 				break;
1428 		}
1429 
1430 		if (uq1 == NULL)
1431 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1432 		else
1433 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1434 	}
1435 	return (1);
1436 }
1437 
1438 static struct umtx_pi *
1439 umtx_pi_next(struct umtx_pi *pi)
1440 {
1441 	struct umtx_q *uq_owner;
1442 
1443 	if (pi->pi_owner == NULL)
1444 		return (NULL);
1445 	uq_owner = pi->pi_owner->td_umtxq;
1446 	if (uq_owner == NULL)
1447 		return (NULL);
1448 	return (uq_owner->uq_pi_blocked);
1449 }
1450 
1451 /*
1452  * Floyd's Cycle-Finding Algorithm.
1453  */
1454 static bool
1455 umtx_pi_check_loop(struct umtx_pi *pi)
1456 {
1457 	struct umtx_pi *pi1;	/* fast iterator */
1458 
1459 	mtx_assert(&umtx_lock, MA_OWNED);
1460 	if (pi == NULL)
1461 		return (false);
1462 	pi1 = pi;
1463 	for (;;) {
1464 		pi = umtx_pi_next(pi);
1465 		if (pi == NULL)
1466 			break;
1467 		pi1 = umtx_pi_next(pi1);
1468 		if (pi1 == NULL)
1469 			break;
1470 		pi1 = umtx_pi_next(pi1);
1471 		if (pi1 == NULL)
1472 			break;
1473 		if (pi == pi1)
1474 			return (true);
1475 	}
1476 	return (false);
1477 }
1478 
1479 /*
1480  * Propagate priority when a thread is blocked on POSIX
1481  * PI mutex.
1482  */
1483 static void
1484 umtx_propagate_priority(struct thread *td)
1485 {
1486 	struct umtx_q *uq;
1487 	struct umtx_pi *pi;
1488 	int pri;
1489 
1490 	mtx_assert(&umtx_lock, MA_OWNED);
1491 	pri = UPRI(td);
1492 	uq = td->td_umtxq;
1493 	pi = uq->uq_pi_blocked;
1494 	if (pi == NULL)
1495 		return;
1496 	if (umtx_pi_check_loop(pi))
1497 		return;
1498 
1499 	for (;;) {
1500 		td = pi->pi_owner;
1501 		if (td == NULL || td == curthread)
1502 			return;
1503 
1504 		MPASS(td->td_proc != NULL);
1505 		MPASS(td->td_proc->p_magic == P_MAGIC);
1506 
1507 		thread_lock(td);
1508 		if (td->td_lend_user_pri > pri)
1509 			sched_lend_user_prio(td, pri);
1510 		else {
1511 			thread_unlock(td);
1512 			break;
1513 		}
1514 		thread_unlock(td);
1515 
1516 		/*
1517 		 * Pick up the lock that td is blocked on.
1518 		 */
1519 		uq = td->td_umtxq;
1520 		pi = uq->uq_pi_blocked;
1521 		if (pi == NULL)
1522 			break;
1523 		/* Resort td on the list if needed. */
1524 		umtx_pi_adjust_thread(pi, td);
1525 	}
1526 }
1527 
1528 /*
1529  * Unpropagate priority for a PI mutex when a thread blocked on
1530  * it is interrupted by signal or resumed by others.
1531  */
1532 static void
1533 umtx_repropagate_priority(struct umtx_pi *pi)
1534 {
1535 	struct umtx_q *uq, *uq_owner;
1536 	struct umtx_pi *pi2;
1537 	int pri;
1538 
1539 	mtx_assert(&umtx_lock, MA_OWNED);
1540 
1541 	if (umtx_pi_check_loop(pi))
1542 		return;
1543 	while (pi != NULL && pi->pi_owner != NULL) {
1544 		pri = PRI_MAX;
1545 		uq_owner = pi->pi_owner->td_umtxq;
1546 
1547 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1548 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1549 			if (uq != NULL) {
1550 				if (pri > UPRI(uq->uq_thread))
1551 					pri = UPRI(uq->uq_thread);
1552 			}
1553 		}
1554 
1555 		if (pri > uq_owner->uq_inherited_pri)
1556 			pri = uq_owner->uq_inherited_pri;
1557 		thread_lock(pi->pi_owner);
1558 		sched_lend_user_prio(pi->pi_owner, pri);
1559 		thread_unlock(pi->pi_owner);
1560 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1561 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1562 	}
1563 }
1564 
1565 /*
1566  * Insert a PI mutex into owned list.
1567  */
1568 static void
1569 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1570 {
1571 	struct umtx_q *uq_owner;
1572 
1573 	uq_owner = owner->td_umtxq;
1574 	mtx_assert(&umtx_lock, MA_OWNED);
1575 	MPASS(pi->pi_owner == NULL);
1576 	pi->pi_owner = owner;
1577 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1578 }
1579 
1580 /*
1581  * Disown a PI mutex, and remove it from the owned list.
1582  */
1583 static void
1584 umtx_pi_disown(struct umtx_pi *pi)
1585 {
1586 
1587 	mtx_assert(&umtx_lock, MA_OWNED);
1588 	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
1589 	pi->pi_owner = NULL;
1590 }
1591 
1592 /*
1593  * Claim ownership of a PI mutex.
1594  */
1595 static int
1596 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1597 {
1598 	struct umtx_q *uq;
1599 	int pri;
1600 
1601 	mtx_lock(&umtx_lock);
1602 	if (pi->pi_owner == owner) {
1603 		mtx_unlock(&umtx_lock);
1604 		return (0);
1605 	}
1606 
1607 	if (pi->pi_owner != NULL) {
1608 		/*
1609 		 * userland may have already messed the mutex, sigh.
1610 		 */
1611 		mtx_unlock(&umtx_lock);
1612 		return (EPERM);
1613 	}
1614 	umtx_pi_setowner(pi, owner);
1615 	uq = TAILQ_FIRST(&pi->pi_blocked);
1616 	if (uq != NULL) {
1617 		pri = UPRI(uq->uq_thread);
1618 		thread_lock(owner);
1619 		if (pri < UPRI(owner))
1620 			sched_lend_user_prio(owner, pri);
1621 		thread_unlock(owner);
1622 	}
1623 	mtx_unlock(&umtx_lock);
1624 	return (0);
1625 }
1626 
1627 /*
1628  * Adjust a thread's order position in its blocked PI mutex,
1629  * this may result new priority propagating process.
1630  */
1631 void
1632 umtx_pi_adjust(struct thread *td, u_char oldpri)
1633 {
1634 	struct umtx_q *uq;
1635 	struct umtx_pi *pi;
1636 
1637 	uq = td->td_umtxq;
1638 	mtx_lock(&umtx_lock);
1639 	/*
1640 	 * Pick up the lock that td is blocked on.
1641 	 */
1642 	pi = uq->uq_pi_blocked;
1643 	if (pi != NULL) {
1644 		umtx_pi_adjust_thread(pi, td);
1645 		umtx_repropagate_priority(pi);
1646 	}
1647 	mtx_unlock(&umtx_lock);
1648 }
1649 
1650 /*
1651  * Sleep on a PI mutex.
1652  */
1653 static int
1654 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
1655     const char *wmesg, struct abs_timeout *timo, bool shared)
1656 {
1657 	struct thread *td, *td1;
1658 	struct umtx_q *uq1;
1659 	int error, pri;
1660 #ifdef INVARIANTS
1661 	struct umtxq_chain *uc;
1662 
1663 	uc = umtxq_getchain(&pi->pi_key);
1664 #endif
1665 	error = 0;
1666 	td = uq->uq_thread;
1667 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1668 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key));
1669 	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
1670 	umtxq_insert(uq);
1671 	mtx_lock(&umtx_lock);
1672 	if (pi->pi_owner == NULL) {
1673 		mtx_unlock(&umtx_lock);
1674 		td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid);
1675 		mtx_lock(&umtx_lock);
1676 		if (td1 != NULL) {
1677 			if (pi->pi_owner == NULL)
1678 				umtx_pi_setowner(pi, td1);
1679 			PROC_UNLOCK(td1->td_proc);
1680 		}
1681 	}
1682 
1683 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1684 		pri = UPRI(uq1->uq_thread);
1685 		if (pri > UPRI(td))
1686 			break;
1687 	}
1688 
1689 	if (uq1 != NULL)
1690 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1691 	else
1692 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1693 
1694 	uq->uq_pi_blocked = pi;
1695 	thread_lock(td);
1696 	td->td_flags |= TDF_UPIBLOCKED;
1697 	thread_unlock(td);
1698 	umtx_propagate_priority(td);
1699 	mtx_unlock(&umtx_lock);
1700 	umtxq_unbusy(&uq->uq_key);
1701 
1702 	error = umtxq_sleep(uq, wmesg, timo);
1703 	umtxq_remove(uq);
1704 
1705 	mtx_lock(&umtx_lock);
1706 	uq->uq_pi_blocked = NULL;
1707 	thread_lock(td);
1708 	td->td_flags &= ~TDF_UPIBLOCKED;
1709 	thread_unlock(td);
1710 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1711 	umtx_repropagate_priority(pi);
1712 	mtx_unlock(&umtx_lock);
1713 	umtxq_unlock(&uq->uq_key);
1714 
1715 	return (error);
1716 }
1717 
1718 /*
1719  * Add reference count for a PI mutex.
1720  */
1721 static void
1722 umtx_pi_ref(struct umtx_pi *pi)
1723 {
1724 
1725 	UMTXQ_LOCKED_ASSERT(umtxq_getchain(&pi->pi_key));
1726 	pi->pi_refcount++;
1727 }
1728 
1729 /*
1730  * Decrease reference count for a PI mutex, if the counter
1731  * is decreased to zero, its memory space is freed.
1732  */
1733 static void
1734 umtx_pi_unref(struct umtx_pi *pi)
1735 {
1736 	struct umtxq_chain *uc;
1737 
1738 	uc = umtxq_getchain(&pi->pi_key);
1739 	UMTXQ_LOCKED_ASSERT(uc);
1740 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1741 	if (--pi->pi_refcount == 0) {
1742 		mtx_lock(&umtx_lock);
1743 		if (pi->pi_owner != NULL)
1744 			umtx_pi_disown(pi);
1745 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1746 			("blocked queue not empty"));
1747 		mtx_unlock(&umtx_lock);
1748 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1749 		umtx_pi_free(pi);
1750 	}
1751 }
1752 
1753 /*
1754  * Find a PI mutex in hash table.
1755  */
1756 static struct umtx_pi *
1757 umtx_pi_lookup(struct umtx_key *key)
1758 {
1759 	struct umtxq_chain *uc;
1760 	struct umtx_pi *pi;
1761 
1762 	uc = umtxq_getchain(key);
1763 	UMTXQ_LOCKED_ASSERT(uc);
1764 
1765 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1766 		if (umtx_key_match(&pi->pi_key, key)) {
1767 			return (pi);
1768 		}
1769 	}
1770 	return (NULL);
1771 }
1772 
1773 /*
1774  * Insert a PI mutex into hash table.
1775  */
1776 static inline void
1777 umtx_pi_insert(struct umtx_pi *pi)
1778 {
1779 	struct umtxq_chain *uc;
1780 
1781 	uc = umtxq_getchain(&pi->pi_key);
1782 	UMTXQ_LOCKED_ASSERT(uc);
1783 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1784 }
1785 
1786 /*
1787  * Lock a PI mutex.
1788  */
1789 static int
1790 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1791     struct _umtx_time *timeout, int try)
1792 {
1793 	struct abs_timeout timo;
1794 	struct umtx_q *uq;
1795 	struct umtx_pi *pi, *new_pi;
1796 	uint32_t id, old_owner, owner, old;
1797 	int error, rv;
1798 
1799 	id = td->td_tid;
1800 	uq = td->td_umtxq;
1801 
1802 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
1803 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
1804 	    &uq->uq_key)) != 0)
1805 		return (error);
1806 
1807 	if (timeout != NULL)
1808 		abs_timeout_init2(&timo, timeout);
1809 
1810 	umtxq_lock(&uq->uq_key);
1811 	pi = umtx_pi_lookup(&uq->uq_key);
1812 	if (pi == NULL) {
1813 		new_pi = umtx_pi_alloc(M_NOWAIT);
1814 		if (new_pi == NULL) {
1815 			umtxq_unlock(&uq->uq_key);
1816 			new_pi = umtx_pi_alloc(M_WAITOK);
1817 			umtxq_lock(&uq->uq_key);
1818 			pi = umtx_pi_lookup(&uq->uq_key);
1819 			if (pi != NULL) {
1820 				umtx_pi_free(new_pi);
1821 				new_pi = NULL;
1822 			}
1823 		}
1824 		if (new_pi != NULL) {
1825 			new_pi->pi_key = uq->uq_key;
1826 			umtx_pi_insert(new_pi);
1827 			pi = new_pi;
1828 		}
1829 	}
1830 	umtx_pi_ref(pi);
1831 	umtxq_unlock(&uq->uq_key);
1832 
1833 	/*
1834 	 * Care must be exercised when dealing with umtx structure.  It
1835 	 * can fault on any access.
1836 	 */
1837 	for (;;) {
1838 		/*
1839 		 * Try the uncontested case.  This should be done in userland.
1840 		 */
1841 		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
1842 		/* The address was invalid. */
1843 		if (rv == -1) {
1844 			error = EFAULT;
1845 			break;
1846 		}
1847 		/* The acquire succeeded. */
1848 		if (rv == 0) {
1849 			MPASS(owner == UMUTEX_UNOWNED);
1850 			error = 0;
1851 			break;
1852 		}
1853 
1854 		if (owner == UMUTEX_RB_NOTRECOV) {
1855 			error = ENOTRECOVERABLE;
1856 			break;
1857 		}
1858 
1859 		/*
1860 		 * Avoid overwriting a possible error from sleep due
1861 		 * to the pending signal with suspension check result.
1862 		 */
1863 		if (error == 0) {
1864 			error = thread_check_susp(td, true);
1865 			if (error != 0)
1866 				break;
1867 		}
1868 
1869 		/* If no one owns it but it is contested try to acquire it. */
1870 		if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) {
1871 			old_owner = owner;
1872 			rv = casueword32(&m->m_owner, owner, &owner,
1873 			    id | UMUTEX_CONTESTED);
1874 			/* The address was invalid. */
1875 			if (rv == -1) {
1876 				error = EFAULT;
1877 				break;
1878 			}
1879 			if (rv == 1) {
1880 				if (error == 0) {
1881 					error = thread_check_susp(td, true);
1882 					if (error != 0)
1883 						break;
1884 				}
1885 
1886 				/*
1887 				 * If this failed the lock could
1888 				 * changed, restart.
1889 				 */
1890 				continue;
1891 			}
1892 
1893 			MPASS(rv == 0);
1894 			MPASS(owner == old_owner);
1895 			umtxq_lock(&uq->uq_key);
1896 			umtxq_busy(&uq->uq_key);
1897 			error = umtx_pi_claim(pi, td);
1898 			umtxq_unbusy(&uq->uq_key);
1899 			umtxq_unlock(&uq->uq_key);
1900 			if (error != 0) {
1901 				/*
1902 				 * Since we're going to return an
1903 				 * error, restore the m_owner to its
1904 				 * previous, unowned state to avoid
1905 				 * compounding the problem.
1906 				 */
1907 				(void)casuword32(&m->m_owner,
1908 				    id | UMUTEX_CONTESTED, old_owner);
1909 			}
1910 			if (error == 0 && old_owner == UMUTEX_RB_OWNERDEAD)
1911 				error = EOWNERDEAD;
1912 			break;
1913 		}
1914 
1915 		if ((owner & ~UMUTEX_CONTESTED) == id) {
1916 			error = EDEADLK;
1917 			break;
1918 		}
1919 
1920 		if (try != 0) {
1921 			error = EBUSY;
1922 			break;
1923 		}
1924 
1925 		/*
1926 		 * If we caught a signal, we have retried and now
1927 		 * exit immediately.
1928 		 */
1929 		if (error != 0)
1930 			break;
1931 
1932 		umtxq_lock(&uq->uq_key);
1933 		umtxq_busy(&uq->uq_key);
1934 		umtxq_unlock(&uq->uq_key);
1935 
1936 		/*
1937 		 * Set the contested bit so that a release in user space
1938 		 * knows to use the system call for unlock.  If this fails
1939 		 * either some one else has acquired the lock or it has been
1940 		 * released.
1941 		 */
1942 		rv = casueword32(&m->m_owner, owner, &old, owner |
1943 		    UMUTEX_CONTESTED);
1944 
1945 		/* The address was invalid. */
1946 		if (rv == -1) {
1947 			umtxq_unbusy_unlocked(&uq->uq_key);
1948 			error = EFAULT;
1949 			break;
1950 		}
1951 		if (rv == 1) {
1952 			umtxq_unbusy_unlocked(&uq->uq_key);
1953 			error = thread_check_susp(td, true);
1954 			if (error != 0)
1955 				break;
1956 
1957 			/*
1958 			 * The lock changed and we need to retry or we
1959 			 * lost a race to the thread unlocking the
1960 			 * umtx.  Note that the UMUTEX_RB_OWNERDEAD
1961 			 * value for owner is impossible there.
1962 			 */
1963 			continue;
1964 		}
1965 
1966 		umtxq_lock(&uq->uq_key);
1967 
1968 		/* We set the contested bit, sleep. */
1969 		MPASS(old == owner);
1970 		error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1971 		    "umtxpi", timeout == NULL ? NULL : &timo,
1972 		    (flags & USYNC_PROCESS_SHARED) != 0);
1973 		if (error != 0)
1974 			continue;
1975 
1976 		error = thread_check_susp(td, false);
1977 		if (error != 0)
1978 			break;
1979 	}
1980 
1981 	umtxq_lock(&uq->uq_key);
1982 	umtx_pi_unref(pi);
1983 	umtxq_unlock(&uq->uq_key);
1984 
1985 	umtx_key_release(&uq->uq_key);
1986 	return (error);
1987 }
1988 
1989 /*
1990  * Unlock a PI mutex.
1991  */
1992 static int
1993 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
1994 {
1995 	struct umtx_key key;
1996 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1997 	struct umtx_pi *pi, *pi2;
1998 	uint32_t id, new_owner, old, owner;
1999 	int count, error, pri;
2000 
2001 	id = td->td_tid;
2002 
2003 usrloop:
2004 	/*
2005 	 * Make sure we own this mtx.
2006 	 */
2007 	error = fueword32(&m->m_owner, &owner);
2008 	if (error == -1)
2009 		return (EFAULT);
2010 
2011 	if ((owner & ~UMUTEX_CONTESTED) != id)
2012 		return (EPERM);
2013 
2014 	new_owner = umtx_unlock_val(flags, rb);
2015 
2016 	/* This should be done in userland */
2017 	if ((owner & UMUTEX_CONTESTED) == 0) {
2018 		error = casueword32(&m->m_owner, owner, &old, new_owner);
2019 		if (error == -1)
2020 			return (EFAULT);
2021 		if (error == 1) {
2022 			error = thread_check_susp(td, true);
2023 			if (error != 0)
2024 				return (error);
2025 			goto usrloop;
2026 		}
2027 		if (old == owner)
2028 			return (0);
2029 		owner = old;
2030 	}
2031 
2032 	/* We should only ever be in here for contested locks */
2033 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2034 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
2035 	    &key)) != 0)
2036 		return (error);
2037 
2038 	umtxq_lock(&key);
2039 	umtxq_busy(&key);
2040 	count = umtxq_count_pi(&key, &uq_first);
2041 	if (uq_first != NULL) {
2042 		mtx_lock(&umtx_lock);
2043 		pi = uq_first->uq_pi_blocked;
2044 		KASSERT(pi != NULL, ("pi == NULL?"));
2045 		if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) {
2046 			mtx_unlock(&umtx_lock);
2047 			umtxq_unbusy(&key);
2048 			umtxq_unlock(&key);
2049 			umtx_key_release(&key);
2050 			/* userland messed the mutex */
2051 			return (EPERM);
2052 		}
2053 		uq_me = td->td_umtxq;
2054 		if (pi->pi_owner == td)
2055 			umtx_pi_disown(pi);
2056 		/* get highest priority thread which is still sleeping. */
2057 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
2058 		while (uq_first != NULL &&
2059 		    (uq_first->uq_flags & UQF_UMTXQ) == 0) {
2060 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
2061 		}
2062 		pri = PRI_MAX;
2063 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
2064 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
2065 			if (uq_first2 != NULL) {
2066 				if (pri > UPRI(uq_first2->uq_thread))
2067 					pri = UPRI(uq_first2->uq_thread);
2068 			}
2069 		}
2070 		thread_lock(td);
2071 		sched_lend_user_prio(td, pri);
2072 		thread_unlock(td);
2073 		mtx_unlock(&umtx_lock);
2074 		if (uq_first)
2075 			umtxq_signal_thread(uq_first);
2076 	} else {
2077 		pi = umtx_pi_lookup(&key);
2078 		/*
2079 		 * A umtx_pi can exist if a signal or timeout removed the
2080 		 * last waiter from the umtxq, but there is still
2081 		 * a thread in do_lock_pi() holding the umtx_pi.
2082 		 */
2083 		if (pi != NULL) {
2084 			/*
2085 			 * The umtx_pi can be unowned, such as when a thread
2086 			 * has just entered do_lock_pi(), allocated the
2087 			 * umtx_pi, and unlocked the umtxq.
2088 			 * If the current thread owns it, it must disown it.
2089 			 */
2090 			mtx_lock(&umtx_lock);
2091 			if (pi->pi_owner == td)
2092 				umtx_pi_disown(pi);
2093 			mtx_unlock(&umtx_lock);
2094 		}
2095 	}
2096 	umtxq_unlock(&key);
2097 
2098 	/*
2099 	 * When unlocking the umtx, it must be marked as unowned if
2100 	 * there is zero or one thread only waiting for it.
2101 	 * Otherwise, it must be marked as contested.
2102 	 */
2103 
2104 	if (count > 1)
2105 		new_owner |= UMUTEX_CONTESTED;
2106 again:
2107 	error = casueword32(&m->m_owner, owner, &old, new_owner);
2108 	if (error == 1) {
2109 		error = thread_check_susp(td, false);
2110 		if (error == 0)
2111 			goto again;
2112 	}
2113 	umtxq_unbusy_unlocked(&key);
2114 	umtx_key_release(&key);
2115 	if (error == -1)
2116 		return (EFAULT);
2117 	if (error == 0 && old != owner)
2118 		return (EINVAL);
2119 	return (error);
2120 }
2121 
2122 /*
2123  * Lock a PP mutex.
2124  */
2125 static int
2126 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2127     struct _umtx_time *timeout, int try)
2128 {
2129 	struct abs_timeout timo;
2130 	struct umtx_q *uq, *uq2;
2131 	struct umtx_pi *pi;
2132 	uint32_t ceiling;
2133 	uint32_t owner, id;
2134 	int error, pri, old_inherited_pri, su, rv;
2135 
2136 	id = td->td_tid;
2137 	uq = td->td_umtxq;
2138 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
2139 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
2140 	    &uq->uq_key)) != 0)
2141 		return (error);
2142 
2143 	if (timeout != NULL)
2144 		abs_timeout_init2(&timo, timeout);
2145 
2146 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2147 	for (;;) {
2148 		old_inherited_pri = uq->uq_inherited_pri;
2149 		umtxq_lock(&uq->uq_key);
2150 		umtxq_busy(&uq->uq_key);
2151 		umtxq_unlock(&uq->uq_key);
2152 
2153 		rv = fueword32(&m->m_ceilings[0], &ceiling);
2154 		if (rv == -1) {
2155 			error = EFAULT;
2156 			goto out;
2157 		}
2158 		ceiling = RTP_PRIO_MAX - ceiling;
2159 		if (ceiling > RTP_PRIO_MAX) {
2160 			error = EINVAL;
2161 			goto out;
2162 		}
2163 
2164 		mtx_lock(&umtx_lock);
2165 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2166 			mtx_unlock(&umtx_lock);
2167 			error = EINVAL;
2168 			goto out;
2169 		}
2170 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2171 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2172 			thread_lock(td);
2173 			if (uq->uq_inherited_pri < UPRI(td))
2174 				sched_lend_user_prio(td, uq->uq_inherited_pri);
2175 			thread_unlock(td);
2176 		}
2177 		mtx_unlock(&umtx_lock);
2178 
2179 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
2180 		    id | UMUTEX_CONTESTED);
2181 		/* The address was invalid. */
2182 		if (rv == -1) {
2183 			error = EFAULT;
2184 			break;
2185 		}
2186 		if (rv == 0) {
2187 			MPASS(owner == UMUTEX_CONTESTED);
2188 			error = 0;
2189 			break;
2190 		}
2191 		/* rv == 1 */
2192 		if (owner == UMUTEX_RB_OWNERDEAD) {
2193 			rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD,
2194 			    &owner, id | UMUTEX_CONTESTED);
2195 			if (rv == -1) {
2196 				error = EFAULT;
2197 				break;
2198 			}
2199 			if (rv == 0) {
2200 				MPASS(owner == UMUTEX_RB_OWNERDEAD);
2201 				error = EOWNERDEAD; /* success */
2202 				break;
2203 			}
2204 
2205 			/*
2206 			 *  rv == 1, only check for suspension if we
2207 			 *  did not already catched a signal.  If we
2208 			 *  get an error from the check, the same
2209 			 *  condition is checked by the umtxq_sleep()
2210 			 *  call below, so we should obliterate the
2211 			 *  error to not skip the last loop iteration.
2212 			 */
2213 			if (error == 0) {
2214 				error = thread_check_susp(td, false);
2215 				if (error == 0) {
2216 					if (try != 0)
2217 						error = EBUSY;
2218 					else
2219 						continue;
2220 				}
2221 				error = 0;
2222 			}
2223 		} else if (owner == UMUTEX_RB_NOTRECOV) {
2224 			error = ENOTRECOVERABLE;
2225 		}
2226 
2227 		if (try != 0)
2228 			error = EBUSY;
2229 
2230 		/*
2231 		 * If we caught a signal, we have retried and now
2232 		 * exit immediately.
2233 		 */
2234 		if (error != 0)
2235 			break;
2236 
2237 		umtxq_lock(&uq->uq_key);
2238 		umtxq_insert(uq);
2239 		umtxq_unbusy(&uq->uq_key);
2240 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2241 		    NULL : &timo);
2242 		umtxq_remove(uq);
2243 		umtxq_unlock(&uq->uq_key);
2244 
2245 		mtx_lock(&umtx_lock);
2246 		uq->uq_inherited_pri = old_inherited_pri;
2247 		pri = PRI_MAX;
2248 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2249 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2250 			if (uq2 != NULL) {
2251 				if (pri > UPRI(uq2->uq_thread))
2252 					pri = UPRI(uq2->uq_thread);
2253 			}
2254 		}
2255 		if (pri > uq->uq_inherited_pri)
2256 			pri = uq->uq_inherited_pri;
2257 		thread_lock(td);
2258 		sched_lend_user_prio(td, pri);
2259 		thread_unlock(td);
2260 		mtx_unlock(&umtx_lock);
2261 	}
2262 
2263 	if (error != 0 && error != EOWNERDEAD) {
2264 		mtx_lock(&umtx_lock);
2265 		uq->uq_inherited_pri = old_inherited_pri;
2266 		pri = PRI_MAX;
2267 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2268 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2269 			if (uq2 != NULL) {
2270 				if (pri > UPRI(uq2->uq_thread))
2271 					pri = UPRI(uq2->uq_thread);
2272 			}
2273 		}
2274 		if (pri > uq->uq_inherited_pri)
2275 			pri = uq->uq_inherited_pri;
2276 		thread_lock(td);
2277 		sched_lend_user_prio(td, pri);
2278 		thread_unlock(td);
2279 		mtx_unlock(&umtx_lock);
2280 	}
2281 
2282 out:
2283 	umtxq_unbusy_unlocked(&uq->uq_key);
2284 	umtx_key_release(&uq->uq_key);
2285 	return (error);
2286 }
2287 
/*
 * Unlock a PP mutex.
 *
 * Checks that the calling thread is the owner recorded in m_owner,
 * stores the unlocked value (always with UMUTEX_CONTESTED set) into
 * m_owner, wakes at most one waiter and then recomputes the user
 * priority lent to the calling thread.
 *
 * td    - calling thread.
 * m     - userspace address of the mutex.
 * flags - mutex flags; used for UMUTEX_ROBUST and the share mode.
 * rb    - passed through to umtx_unlock_val() to pick the stored value.
 *
 * Returns 0 on success, EFAULT if m_owner cannot be read or written,
 * EPERM if the caller is not the owner, EINVAL if the ceiling read
 * from m_ceilings[1] is out of range, or the error returned by
 * copyin() or umtx_key_get().
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t id, owner, rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	/* su is non-zero when the PRIV_SCHED_RTPRIO check succeeds. */
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(&m->m_owner, &owner);
	if (error == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	/*
	 * A stored ceiling of (uint32_t)-1 selects PRI_MAX.  Any other
	 * value is mirrored against RTP_PRIO_MAX; rceiling is unsigned, so
	 * an input above RTP_PRIO_MAX wraps in the subtraction and is
	 * rejected by the range check.
	 */
	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	/* Mark the queue busy, then drop the queue lock for the user store. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For priority protected mutex, always set unlocked state
	 * to UMUTEX_CONTESTED, so that userland always enters kernel
	 * to lock the mutex, it is necessary because thread priority
	 * has to be adjusted for such mutex.
	 */
	error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) |
	    UMUTEX_CONTESTED);

	/* Wake one waiter only if the owner word was actually updated. */
	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		/*
		 * Recompute the lent priority under umtx_lock: the numerically
		 * smallest of the first blocked waiter's UPRI() on each
		 * contested PI mutex and uq_inherited_pri.  uq_inherited_pri
		 * is replaced by the value derived from m_ceilings[1] only
		 * when su is set.
		 */
		mtx_lock(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}
2373 
/*
 * Change the priority ceiling stored in m_ceilings[0] of a PP mutex.
 *
 * The ceiling is updated either while briefly holding the mutex on
 * behalf of the caller (owner word moved from UMUTEX_CONTESTED to
 * id | UMUTEX_CONTESTED and back) or directly when the caller already
 * owns it.  Otherwise the thread sleeps on the mutex queue and retries.
 *
 * td          - calling thread.
 * m           - userspace address of the mutex; must have
 *               UMUTEX_PRIO_PROTECT set in m_flags.
 * ceiling     - new ceiling, at most RTP_PRIO_MAX.
 * old_ceiling - optional userspace address that receives the previous
 *               ceiling on success; may be NULL.
 *
 * Returns 0 on success, EFAULT on a userspace access failure, EINVAL
 * for a non-PP mutex or out-of-range ceiling, EOWNERDEAD or
 * ENOTRECOVERABLE when the owner word holds the matching robust marker,
 * or the error from umtx_key_get() or umtxq_sleep().
 */
static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
    uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t flags, id, owner, save_ceiling;
	int error, rv, rv1;

	error = fueword32(&m->m_flags, &flags);
	if (error == -1)
		return (EFAULT);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	/*
	 * Every break out of this loop leaves the queue marked busy; the
	 * code after the loop clears it.
	 */
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/* Remember the current ceiling for the old_ceiling copy-out. */
		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/* Try to take the unowned mutex for ourselves. */
		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
		    id | UMUTEX_CONTESTED);
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		if (rv == 0) {
			/* Acquired: store the ceiling, then release again. */
			MPASS(owner == UMUTEX_CONTESTED);
			rv = suword32(&m->m_ceilings[0], ceiling);
			rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED);
			error = (rv == 0 && rv1 == 0) ? 0: EFAULT;
			break;
		}

		/* The caller already owns the mutex: just store the ceiling. */
		if ((owner & ~UMUTEX_CONTESTED) == id) {
			rv = suword32(&m->m_ceilings[0], ceiling);
			error = rv == 0 ? 0 : EFAULT;
			break;
		}

		if (owner == UMUTEX_RB_OWNERDEAD) {
			error = EOWNERDEAD;
			break;
		} else if (owner == UMUTEX_RB_NOTRECOV) {
			error = ENOTRECOVERABLE;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", NULL);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	/* On success wake every waiter queued on this mutex. */
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL) {
		rv = suword32(old_ceiling, save_ceiling);
		error = rv == 0 ? 0 : EFAULT;
	}
	return (error);
}
2466 
2467 /*
2468  * Lock a userland POSIX mutex.
2469  */
2470 static int
2471 do_lock_umutex(struct thread *td, struct umutex *m,
2472     struct _umtx_time *timeout, int mode)
2473 {
2474 	uint32_t flags;
2475 	int error;
2476 
2477 	error = fueword32(&m->m_flags, &flags);
2478 	if (error == -1)
2479 		return (EFAULT);
2480 
2481 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2482 	case 0:
2483 		error = do_lock_normal(td, m, flags, timeout, mode);
2484 		break;
2485 	case UMUTEX_PRIO_INHERIT:
2486 		error = do_lock_pi(td, m, flags, timeout, mode);
2487 		break;
2488 	case UMUTEX_PRIO_PROTECT:
2489 		error = do_lock_pp(td, m, flags, timeout, mode);
2490 		break;
2491 	default:
2492 		return (EINVAL);
2493 	}
2494 	if (timeout == NULL) {
2495 		if (error == EINTR && mode != _UMUTEX_WAIT)
2496 			error = ERESTART;
2497 	} else {
2498 		/* Timed-locking is not restarted. */
2499 		if (error == ERESTART)
2500 			error = EINTR;
2501 	}
2502 	return (error);
2503 }
2504 
2505 /*
2506  * Unlock a userland POSIX mutex.
2507  */
2508 static int
2509 do_unlock_umutex(struct thread *td, struct umutex *m, bool rb)
2510 {
2511 	uint32_t flags;
2512 	int error;
2513 
2514 	error = fueword32(&m->m_flags, &flags);
2515 	if (error == -1)
2516 		return (EFAULT);
2517 
2518 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2519 	case 0:
2520 		return (do_unlock_normal(td, m, flags, rb));
2521 	case UMUTEX_PRIO_INHERIT:
2522 		return (do_unlock_pi(td, m, flags, rb));
2523 	case UMUTEX_PRIO_PROTECT:
2524 		return (do_unlock_pp(td, m, flags, rb));
2525 	}
2526 
2527 	return (EINVAL);
2528 }
2529 
/*
 * Wait on a userland condition variable.
 *
 * Queues the thread on cv, sets c_has_waiters, unlocks the user mutex m
 * and then sleeps until woken, interrupted or timed out.
 *
 * td      - calling thread.
 * cv      - userspace address of the condition variable.
 * m       - userspace address of the mutex to release before sleeping.
 * timeout - optional timeout; NULL waits without a time limit.
 * wflags  - CVWAIT_CLOCKID selects the clock stored in cv->c_clockid
 *           (otherwise CLOCK_REALTIME); CVWAIT_ABSTIME marks timeout as
 *           absolute.
 *
 * Returns 0 if the thread was taken off the queue by someone else,
 * EFAULT or EINVAL for bad userspace data, the error from
 * umtx_key_get() or do_unlock_umutex(), or the error from umtxq_sleep()
 * with ERESTART converted to EINTR.
 */
static int
do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
    struct timespec *timeout, u_long wflags)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags, clockid, hasw;
	int error;

	uq = td->td_umtxq;
	error = fueword32(&cv->c_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if ((wflags & CVWAIT_CLOCKID) != 0) {
		error = fueword32(&cv->c_clockid, &clockid);
		if (error == -1) {
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}
		if (clockid < CLOCK_REALTIME ||
		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
			/* hmm, only HW clock id will work. */
			umtx_key_release(&uq->uq_key);
			return (EINVAL);
		}
	} else {
		clockid = CLOCK_REALTIME;
	}

	/* Queue ourselves before the mutex is released below. */
	umtxq_lock(&uq->uq_key);
	umtxq_busy(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Set c_has_waiters to 1 before releasing user mutex, also
	 * don't modify cache line when unnecessary.
	 */
	error = fueword32(&cv->c_has_waiters, &hasw);
	if (error == 0 && hasw == 0)
		suword32(&cv->c_has_waiters, 1);

	umtxq_unbusy_unlocked(&uq->uq_key);

	error = do_unlock_umutex(td, m, false);

	if (timeout != NULL)
		abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0,
		    timeout);

	/* Sleep only if the mutex was released successfully. */
	umtxq_lock(&uq->uq_key);
	if (error == 0) {
		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
		    NULL : &timo);
	}

	/*
	 * UQF_UMTXQ clear means we are no longer on the queue; report
	 * success regardless of what the sleep returned.
	 */
	if ((uq->uq_flags & UQF_UMTXQ) == 0)
		error = 0;
	else {
		/*
		 * This must be a timeout, an interruption by a signal or
		 * a spurious wakeup; clear the c_has_waiters flag when
		 * necessary.
		 */
		umtxq_busy(&uq->uq_key);
		/* Re-check the flag after umtxq_busy(). */
		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
			int oldlen = uq->uq_cur_queue->length;
			umtxq_remove(uq);
			/* We were the only entry on the queue. */
			if (oldlen == 1) {
				umtxq_unlock(&uq->uq_key);
				suword32(&cv->c_has_waiters, 0);
				umtxq_lock(&uq->uq_key);
			}
		}
		umtxq_unbusy(&uq->uq_key);
		if (error == ERESTART)
			error = EINTR;
	}

	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
2617 
2618 /*
2619  * Signal a userland condition variable.
2620  */
2621 static int
2622 do_cv_signal(struct thread *td, struct ucond *cv)
2623 {
2624 	struct umtx_key key;
2625 	int error, cnt, nwake;
2626 	uint32_t flags;
2627 
2628 	error = fueword32(&cv->c_flags, &flags);
2629 	if (error == -1)
2630 		return (EFAULT);
2631 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2632 		return (error);
2633 	umtxq_lock(&key);
2634 	umtxq_busy(&key);
2635 	cnt = umtxq_count(&key);
2636 	nwake = umtxq_signal(&key, 1);
2637 	if (cnt <= nwake) {
2638 		umtxq_unlock(&key);
2639 		error = suword32(&cv->c_has_waiters, 0);
2640 		if (error == -1)
2641 			error = EFAULT;
2642 		umtxq_lock(&key);
2643 	}
2644 	umtxq_unbusy(&key);
2645 	umtxq_unlock(&key);
2646 	umtx_key_release(&key);
2647 	return (error);
2648 }
2649 
2650 static int
2651 do_cv_broadcast(struct thread *td, struct ucond *cv)
2652 {
2653 	struct umtx_key key;
2654 	int error;
2655 	uint32_t flags;
2656 
2657 	error = fueword32(&cv->c_flags, &flags);
2658 	if (error == -1)
2659 		return (EFAULT);
2660 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2661 		return (error);
2662 
2663 	umtxq_lock(&key);
2664 	umtxq_busy(&key);
2665 	umtxq_signal(&key, INT_MAX);
2666 	umtxq_unlock(&key);
2667 
2668 	error = suword32(&cv->c_has_waiters, 0);
2669 	if (error == -1)
2670 		error = EFAULT;
2671 
2672 	umtxq_unbusy_unlocked(&key);
2673 
2674 	umtx_key_release(&key);
2675 	return (error);
2676 }
2677 
/*
 * Acquire a userland read/write lock for reading.
 *
 * Tries to increment the reader count in rw_state with a CAS.  When the
 * lock is blocked for readers, sets URWLOCK_READ_WAITERS, bumps
 * rw_blocked_readers and sleeps on the queue until the blocking bits
 * clear, then retries.
 *
 * td      - calling thread.
 * rwlock  - userspace address of the lock.
 * fflag   - per-call flags; URWLOCK_PREFER_READER here or in rw_flags
 *           makes readers ignore URWLOCK_WRITE_WAITERS.
 * timeout - optional timeout; NULL waits without a time limit.
 *
 * Returns 0 once the read lock is held, EAGAIN if the reader count is
 * already URWLOCK_MAX_READERS, EFAULT on a userspace access failure, or
 * the error from umtx_key_get(), thread_check_susp() or umtxq_sleep()
 * (ERESTART is converted to EINTR).
 */
static int
do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag,
    struct _umtx_time *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags, wrflags;
	int32_t state, oldstate;
	int32_t blocked_readers;
	int error, error1, rv;

	uq = td->td_umtxq;
	error = fueword32(&rwlock->rw_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * wrflags is the set of state bits that block a reader: always the
	 * write owner, plus waiting writers unless readers are preferred.
	 */
	wrflags = URWLOCK_WRITE_OWNER;
	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
		wrflags |= URWLOCK_WRITE_WAITERS;

	for (;;) {
		rv = fueword32(&rwlock->rw_state, &state);
		if (rv == -1) {
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/* try to lock it */
		while (!(state & wrflags)) {
			if (__predict_false(URWLOCK_READER_COUNT(state) ==
			    URWLOCK_MAX_READERS)) {
				umtx_key_release(&uq->uq_key);
				return (EAGAIN);
			}
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state + 1);
			if (rv == -1) {
				umtx_key_release(&uq->uq_key);
				return (EFAULT);
			}
			if (rv == 0) {
				MPASS(oldstate == state);
				umtx_key_release(&uq->uq_key);
				return (0);
			}
			/* CAS did not store; check for suspension, then retry. */
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
			state = oldstate;
		}

		if (error)
			break;

		/* grab monitor lock */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * re-read the state, in case it changed between the try-lock above
		 * and the check below
		 */
		rv = fueword32(&rwlock->rw_state, &state);
		if (rv == -1)
			error = EFAULT;

		/* set read contention bit */
		while (error == 0 && (state & wrflags) &&
		    !(state & URWLOCK_READ_WAITERS)) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state | URWLOCK_READ_WAITERS);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 0) {
				MPASS(oldstate == state);
				goto sleep;
			}
			state = oldstate;
			error = thread_check_susp(td, false);
			if (error != 0)
				break;
		}
		if (error != 0) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			break;
		}

		/* state is changed while setting flags, restart */
		if (!(state & wrflags)) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
			continue;
		}

sleep:
		/*
		 * Contention bit is set, before sleeping, increase
		 * read waiter count.
		 */
		rv = fueword32(&rwlock->rw_blocked_readers,
		    &blocked_readers);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		/* NOTE(review): the result of this suword32() is not checked. */
		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);

		/*
		 * Sleep until the blocking bits clear.  The queue is marked
		 * busy again after every wakeup, so the code below this loop
		 * runs in the busy state.
		 */
		while (state & wrflags) {
			umtxq_lock(&uq->uq_key);
			umtxq_insert(uq);
			umtxq_unbusy(&uq->uq_key);

			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
			    NULL : &timo);

			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			if (error)
				break;
			rv = fueword32(&rwlock->rw_state, &state);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
		}

		/* decrease read waiter count, and may clear read contention bit */
		rv = fueword32(&rwlock->rw_blocked_readers,
		    &blocked_readers);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		/* NOTE(review): the result of this suword32() is not checked. */
		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
		/* We were the last blocked reader: clear URWLOCK_READ_WAITERS. */
		if (blocked_readers == 1) {
			rv = fueword32(&rwlock->rw_state, &state);
			if (rv == -1) {
				umtxq_unbusy_unlocked(&uq->uq_key);
				error = EFAULT;
				break;
			}
			for (;;) {
				rv = casueword32(&rwlock->rw_state, state,
				    &oldstate, state & ~URWLOCK_READ_WAITERS);
				if (rv == -1) {
					error = EFAULT;
					break;
				}
				if (rv == 0) {
					MPASS(oldstate == state);
					break;
				}
				state = oldstate;
				/*
				 * error1 keeps a suspension error from hiding
				 * an error already recorded by the sleep above.
				 */
				error1 = thread_check_susp(td, false);
				if (error1 != 0) {
					if (error == 0)
						error = error1;
					break;
				}
			}
		}

		umtxq_unbusy_unlocked(&uq->uq_key);
		if (error != 0)
			break;
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
2863 
/*
 * Acquire a userland read/write lock for writing.
 *
 * Tries to set URWLOCK_WRITE_OWNER in rw_state with a CAS while the
 * lock has neither a write owner nor readers.  Otherwise sets
 * URWLOCK_WRITE_WAITERS, bumps rw_blocked_writers and sleeps on the
 * exclusive queue until the lock looks free, then retries.
 *
 * td      - calling thread.
 * rwlock  - userspace address of the lock.
 * timeout - optional timeout; NULL waits without a time limit.
 *
 * Returns 0 once the write lock is held, EFAULT on a userspace access
 * failure, or the error from umtx_key_get(), thread_check_susp() or
 * umtxq_sleep() (ERESTART is converted to EINTR).
 */
static int
do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags;
	int32_t state, oldstate;
	int32_t blocked_writers;
	int32_t blocked_readers;
	int error, error1, rv;

	uq = td->td_umtxq;
	error = fueword32(&rwlock->rw_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * blocked_readers holds the rw_blocked_readers value sampled when
	 * this thread was the last blocked writer in the previous
	 * iteration; it is zero otherwise.
	 */
	blocked_readers = 0;
	for (;;) {
		rv = fueword32(&rwlock->rw_state, &state);
		if (rv == -1) {
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}
		/* Try to take the lock while it has no owner and no readers. */
		while ((state & URWLOCK_WRITE_OWNER) == 0 &&
		    URWLOCK_READER_COUNT(state) == 0) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state | URWLOCK_WRITE_OWNER);
			if (rv == -1) {
				umtx_key_release(&uq->uq_key);
				return (EFAULT);
			}
			if (rv == 0) {
				MPASS(oldstate == state);
				umtx_key_release(&uq->uq_key);
				return (0);
			}
			state = oldstate;
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
		}

		/*
		 * error is set either by thread_check_susp() above or by the
		 * sleep at the bottom of the previous iteration, which does
		 * not leave the loop on its own.  Before giving up, wake all
		 * blocked readers if no writer owns or waits for the lock.
		 */
		if (error) {
			if ((state & (URWLOCK_WRITE_OWNER |
			    URWLOCK_WRITE_WAITERS)) == 0 &&
			    blocked_readers != 0) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				umtxq_signal_queue(&uq->uq_key, INT_MAX,
				    UMTX_SHARED_QUEUE);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
			}

			break;
		}

		/* grab monitor lock */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Re-read the state, in case it changed between the
		 * try-lock above and the check below.
		 */
		rv = fueword32(&rwlock->rw_state, &state);
		if (rv == -1)
			error = EFAULT;

		/* Set the write contention bit while the lock is held. */
		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
		    URWLOCK_READER_COUNT(state) != 0) &&
		    (state & URWLOCK_WRITE_WAITERS) == 0) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state | URWLOCK_WRITE_WAITERS);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 0) {
				MPASS(oldstate == state);
				goto sleep;
			}
			state = oldstate;
			error = thread_check_susp(td, false);
			if (error != 0)
				break;
		}
		if (error != 0) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			break;
		}

		/* The lock became free while setting the flag; restart. */
		if ((state & URWLOCK_WRITE_OWNER) == 0 &&
		    URWLOCK_READER_COUNT(state) == 0) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, false);
			if (error != 0)
				break;
			continue;
		}
sleep:
		/* Contention bit is set; count ourselves as a blocked writer. */
		rv = fueword32(&rwlock->rw_blocked_writers,
		    &blocked_writers);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		/* NOTE(review): the result of this suword32() is not checked. */
		suword32(&rwlock->rw_blocked_writers, blocked_writers + 1);

		/*
		 * Sleep on the exclusive queue until the lock has neither a
		 * write owner nor readers.  The queue is marked busy again
		 * after every wakeup.
		 */
		while ((state & URWLOCK_WRITE_OWNER) ||
		    URWLOCK_READER_COUNT(state) != 0) {
			umtxq_lock(&uq->uq_key);
			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
			umtxq_unbusy(&uq->uq_key);

			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
			    NULL : &timo);

			umtxq_busy(&uq->uq_key);
			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
			umtxq_unlock(&uq->uq_key);
			if (error)
				break;
			rv = fueword32(&rwlock->rw_state, &state);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
		}

		/* Drop our contribution to the blocked writer count. */
		rv = fueword32(&rwlock->rw_blocked_writers,
		    &blocked_writers);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		/* NOTE(review): the result of this suword32() is not checked. */
		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
		/* We were the last blocked writer: clear URWLOCK_WRITE_WAITERS. */
		if (blocked_writers == 1) {
			rv = fueword32(&rwlock->rw_state, &state);
			if (rv == -1) {
				umtxq_unbusy_unlocked(&uq->uq_key);
				error = EFAULT;
				break;
			}
			for (;;) {
				rv = casueword32(&rwlock->rw_state, state,
				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
				if (rv == -1) {
					error = EFAULT;
					break;
				}
				if (rv == 0) {
					MPASS(oldstate == state);
					break;
				}
				state = oldstate;
				error1 = thread_check_susp(td, false);
				/*
				 * We are leaving the URWLOCK_WRITE_WAITERS
				 * behind, but this should not harm the
				 * correctness.
				 */
				if (error1 != 0) {
					if (error == 0)
						error = error1;
					break;
				}
			}
			/* Sample the blocked reader count for the next pass. */
			rv = fueword32(&rwlock->rw_blocked_readers,
			    &blocked_readers);
			if (rv == -1) {
				umtxq_unbusy_unlocked(&uq->uq_key);
				error = EFAULT;
				break;
			}
		} else
			blocked_readers = 0;

		umtxq_unbusy_unlocked(&uq->uq_key);
	}

	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
3059 
/*
 * Release a userland read/write lock.
 *
 * If URWLOCK_WRITE_OWNER is set in rw_state the write ownership is
 * cleared; otherwise, if the reader count is non-zero, it is
 * decremented.  Afterwards waiters are woken according to the lock's
 * preference: one writer, or all readers.
 *
 * Returns 0 on success, EPERM if the lock is not held in the mode being
 * released, EFAULT on a userspace access failure, or the error from
 * umtx_key_get() or thread_check_susp().
 */
static int
do_rw_unlock(struct thread *td, struct urwlock *rwlock)
{
	struct umtx_q *uq;
	uint32_t flags;
	int32_t state, oldstate;
	int error, rv, q, count;

	uq = td->td_umtxq;
	error = fueword32(&rwlock->rw_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	error = fueword32(&rwlock->rw_state, &state);
	if (error == -1) {
		error = EFAULT;
		goto out;
	}
	if (state & URWLOCK_WRITE_OWNER) {
		/* Write-locked: clear the owner bit, retrying a failed CAS. */
		for (;;) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
			if (rv == -1) {
				error = EFAULT;
				goto out;
			}
			if (rv == 1) {
				state = oldstate;
				/* The owner bit vanished under us. */
				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
					error = EPERM;
					goto out;
				}
				error = thread_check_susp(td, true);
				if (error != 0)
					goto out;
			} else
				break;
		}
	} else if (URWLOCK_READER_COUNT(state) != 0) {
		/* Read-locked: drop one reader, retrying a failed CAS. */
		for (;;) {
			rv = casueword32(&rwlock->rw_state, state,
			    &oldstate, state - 1);
			if (rv == -1) {
				error = EFAULT;
				goto out;
			}
			if (rv == 1) {
				state = oldstate;
				/* No readers are left to release. */
				if (URWLOCK_READER_COUNT(oldstate) == 0) {
					error = EPERM;
					goto out;
				}
				error = thread_check_susp(td, true);
				if (error != 0)
					goto out;
			} else
				break;
		}
	} else {
		error = EPERM;
		goto out;
	}

	/*
	 * Pick the queue to wake from the state value used by the
	 * successful CAS.  q is only assigned, and only read, when count
	 * is non-zero.
	 */
	count = 0;

	if (!(flags & URWLOCK_PREFER_READER)) {
		if (state & URWLOCK_WRITE_WAITERS) {
			count = 1;
			q = UMTX_EXCLUSIVE_QUEUE;
		} else if (state & URWLOCK_READ_WAITERS) {
			count = INT_MAX;
			q = UMTX_SHARED_QUEUE;
		}
	} else {
		if (state & URWLOCK_READ_WAITERS) {
			count = INT_MAX;
			q = UMTX_SHARED_QUEUE;
		} else if (state & URWLOCK_WRITE_WAITERS) {
			count = 1;
			q = UMTX_EXCLUSIVE_QUEUE;
		}
	}

	if (count) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_signal_queue(&uq->uq_key, count, q);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);
	}
out:
	umtx_key_release(&uq->uq_key);
	return (error);
}
3157 
3158 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
/*
 * Wait on a userland semaphore (struct _usem).
 *
 * Queues the thread, sets _has_waiters with a CAS and sleeps if _count
 * is zero.
 *
 * td      - calling thread.
 * sem     - userspace address of the semaphore.
 * timeout - optional timeout; NULL waits without a time limit.
 *
 * Returns 0 if the thread was taken off the queue by someone else or
 * if _count was found non-zero before sleeping, EFAULT on a userspace
 * access failure, or the error from umtx_key_get(), thread_check_susp()
 * or umtxq_sleep().  ERESTART becomes EINTR when the timeout is
 * relative.
 */
static int
do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags, count, count1;
	int error, rv, rv1;

	uq = td->td_umtxq;
	error = fueword32(&sem->_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

again:
	/* Queue ourselves before looking at the semaphore words. */
	umtxq_lock(&uq->uq_key);
	umtxq_busy(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
	/* rv1 and count are only assigned, and only read, when rv == 0. */
	if (rv == 0)
		rv1 = fueword32(&sem->_count, &count);
	/*
	 * Do not sleep if the CAS faulted, if it stored but _count could
	 * not be read or is non-zero, or if it failed although
	 * _has_waiters was still 0 (retried below).
	 */
	if (rv == -1 || (rv == 0 && (rv1 == -1 || count != 0)) ||
	    (rv == 1 && count1 == 0)) {
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		if (rv == 1) {
			rv = thread_check_susp(td, true);
			if (rv == 0)
				goto again;
			error = rv;
			goto out;
		}
		if (rv == 0)
			rv = rv1;
		error = rv == -1 ? EFAULT : 0;
		goto out;
	}
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);

	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);

	/* UQF_UMTXQ clear means we are no longer on the queue. */
	if ((uq->uq_flags & UQF_UMTXQ) == 0)
		error = 0;
	else {
		umtxq_remove(uq);
		/* A relative timeout cannot be restarted. */
		if (error == ERESTART && timeout != NULL &&
		    (timeout->_flags & UMTX_ABSTIME) == 0)
			error = EINTR;
	}
	umtxq_unlock(&uq->uq_key);
out:
	umtx_key_release(&uq->uq_key);
	return (error);
}
3223 
3224 /*
3225  * Signal a userland semaphore.
3226  */
3227 static int
3228 do_sem_wake(struct thread *td, struct _usem *sem)
3229 {
3230 	struct umtx_key key;
3231 	int error, cnt;
3232 	uint32_t flags;
3233 
3234 	error = fueword32(&sem->_flags, &flags);
3235 	if (error == -1)
3236 		return (EFAULT);
3237 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3238 		return (error);
3239 	umtxq_lock(&key);
3240 	umtxq_busy(&key);
3241 	cnt = umtxq_count(&key);
3242 	if (cnt > 0) {
3243 		/*
3244 		 * Check if count is greater than 0, this means the memory is
3245 		 * still being referenced by user code, so we can safely
3246 		 * update _has_waiters flag.
3247 		 */
3248 		if (cnt == 1) {
3249 			umtxq_unlock(&key);
3250 			error = suword32(&sem->_has_waiters, 0);
3251 			umtxq_lock(&key);
3252 			if (error == -1)
3253 				error = EFAULT;
3254 		}
3255 		umtxq_signal(&key, 1);
3256 	}
3257 	umtxq_unbusy(&key);
3258 	umtxq_unlock(&key);
3259 	umtx_key_release(&key);
3260 	return (error);
3261 }
3262 #endif
3263 
3264 static int
3265 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
3266 {
3267 	struct abs_timeout timo;
3268 	struct umtx_q *uq;
3269 	uint32_t count, flags;
3270 	int error, rv;
3271 
3272 	uq = td->td_umtxq;
3273 	flags = fuword32(&sem->_flags);
3274 	if (timeout != NULL)
3275 		abs_timeout_init2(&timo, timeout);
3276 
3277 again:
3278 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3279 	if (error != 0)
3280 		return (error);
3281 	umtxq_lock(&uq->uq_key);
3282 	umtxq_busy(&uq->uq_key);
3283 	umtxq_insert(uq);
3284 	umtxq_unlock(&uq->uq_key);
3285 	rv = fueword32(&sem->_count, &count);
3286 	if (rv == -1) {
3287 		umtxq_lock(&uq->uq_key);
3288 		umtxq_unbusy(&uq->uq_key);
3289 		umtxq_remove(uq);
3290 		umtxq_unlock(&uq->uq_key);
3291 		umtx_key_release(&uq->uq_key);
3292 		return (EFAULT);
3293 	}
3294 	for (;;) {
3295 		if (USEM_COUNT(count) != 0) {
3296 			umtxq_lock(&uq->uq_key);
3297 			umtxq_unbusy(&uq->uq_key);
3298 			umtxq_remove(uq);
3299 			umtxq_unlock(&uq->uq_key);
3300 			umtx_key_release(&uq->uq_key);
3301 			return (0);
3302 		}
3303 		if (count == USEM_HAS_WAITERS)
3304 			break;
3305 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
3306 		if (rv == 0)
3307 			break;
3308 		umtxq_lock(&uq->uq_key);
3309 		umtxq_unbusy(&uq->uq_key);
3310 		umtxq_remove(uq);
3311 		umtxq_unlock(&uq->uq_key);
3312 		umtx_key_release(&uq->uq_key);
3313 		if (rv == -1)
3314 			return (EFAULT);
3315 		rv = thread_check_susp(td, true);
3316 		if (rv != 0)
3317 			return (rv);
3318 		goto again;
3319 	}
3320 	umtxq_lock(&uq->uq_key);
3321 	umtxq_unbusy(&uq->uq_key);
3322 
3323 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3324 
3325 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3326 		error = 0;
3327 	else {
3328 		umtxq_remove(uq);
3329 		if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) {
3330 			/* A relative timeout cannot be restarted. */
3331 			if (error == ERESTART)
3332 				error = EINTR;
3333 			if (error == EINTR) {
3334 				abs_timeout_update(&timo);
3335 				timespecsub(&timo.end, &timo.cur,
3336 				    &timeout->_timeout);
3337 			}
3338 		}
3339 	}
3340 	umtxq_unlock(&uq->uq_key);
3341 	umtx_key_release(&uq->uq_key);
3342 	return (error);
3343 }
3344 
/*
 * Signal a userland semaphore (struct _usem2).
 *
 * Wakes at most one thread queued on sem.  When exactly one thread is
 * queued, USEM_HAS_WAITERS is cleared from _count before the wakeup.
 *
 * Returns 0 on success, EFAULT on a userspace access failure, or the
 * error from umtx_key_get() or thread_check_susp().  The wakeup is
 * issued even when clearing the flag failed.
 */
static int
do_sem2_wake(struct thread *td, struct _usem2 *sem)
{
	struct umtx_key key;
	int error, cnt, rv;
	uint32_t count, flags;

	rv = fueword32(&sem->_flags, &flags);
	if (rv == -1)
		return (EFAULT);
	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	cnt = umtxq_count(&key);
	if (cnt > 0) {
		/*
		 * If this was the last sleeping thread, clear the waiters
		 * flag in _count.
		 */
		if (cnt == 1) {
			/* Drop the queue lock around the userspace accesses. */
			umtxq_unlock(&key);
			rv = fueword32(&sem->_count, &count);
			/*
			 * count is both the expected value and the location
			 * that receives the value read by casueword32(), so
			 * the loop condition always tests the latest value.
			 */
			while (rv != -1 && count & USEM_HAS_WAITERS) {
				rv = casueword32(&sem->_count, count, &count,
				    count & ~USEM_HAS_WAITERS);
				if (rv == 1) {
					rv = thread_check_susp(td, true);
					if (rv != 0)
						break;
				}
			}
			/* A positive rv here is a thread_check_susp() error. */
			if (rv == -1)
				error = EFAULT;
			else if (rv > 0) {
				error = rv;
			}
			umtxq_lock(&key);
		}

		umtxq_signal(&key, 1);
	}
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (error);
}
3395 
3396 inline int
3397 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3398 {
3399 	int error;
3400 
3401 	error = copyin(addr, tsp, sizeof(struct timespec));
3402 	if (error == 0) {
3403 		if (tsp->tv_sec < 0 ||
3404 		    tsp->tv_nsec >= 1000000000 ||
3405 		    tsp->tv_nsec < 0)
3406 			error = EINVAL;
3407 	}
3408 	return (error);
3409 }
3410 
3411 static inline int
3412 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3413 {
3414 	int error;
3415 
3416 	if (size <= sizeof(struct timespec)) {
3417 		tp->_clockid = CLOCK_REALTIME;
3418 		tp->_flags = 0;
3419 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3420 	} else
3421 		error = copyin(addr, tp, sizeof(struct _umtx_time));
3422 	if (error != 0)
3423 		return (error);
3424 	if (tp->_timeout.tv_sec < 0 ||
3425 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3426 		return (EINVAL);
3427 	return (0);
3428 }
3429 
/*
 * Handler for operations that are not implemented; always fails with
 * EOPNOTSUPP and touches neither argument.
 */
static int
__umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
{
	return (EOPNOTSUPP);
}
3436 
3437 static int
3438 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3439 {
3440 	struct _umtx_time timeout, *tm_p;
3441 	int error;
3442 
3443 	if (uap->uaddr2 == NULL)
3444 		tm_p = NULL;
3445 	else {
3446 		error = umtx_copyin_umtx_time(
3447 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3448 		if (error != 0)
3449 			return (error);
3450 		tm_p = &timeout;
3451 	}
3452 	return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0));
3453 }
3454 
3455 static int
3456 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3457 {
3458 	struct _umtx_time timeout, *tm_p;
3459 	int error;
3460 
3461 	if (uap->uaddr2 == NULL)
3462 		tm_p = NULL;
3463 	else {
3464 		error = umtx_copyin_umtx_time(
3465 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3466 		if (error != 0)
3467 			return (error);
3468 		tm_p = &timeout;
3469 	}
3470 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
3471 }
3472 
3473 static int
3474 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3475 {
3476 	struct _umtx_time *tm_p, timeout;
3477 	int error;
3478 
3479 	if (uap->uaddr2 == NULL)
3480 		tm_p = NULL;
3481 	else {
3482 		error = umtx_copyin_umtx_time(
3483 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3484 		if (error != 0)
3485 			return (error);
3486 		tm_p = &timeout;
3487 	}
3488 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
3489 }
3490 
3491 static int
3492 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3493 {
3494 
3495 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3496 }
3497 
3498 #define BATCH_SIZE	128
3499 static int
3500 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3501 {
3502 	char *uaddrs[BATCH_SIZE], **upp;
3503 	int count, error, i, pos, tocopy;
3504 
3505 	upp = (char **)uap->obj;
3506 	error = 0;
3507 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
3508 	    pos += tocopy) {
3509 		tocopy = MIN(count, BATCH_SIZE);
3510 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
3511 		if (error != 0)
3512 			break;
3513 		for (i = 0; i < tocopy; ++i)
3514 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3515 		maybe_yield();
3516 	}
3517 	return (error);
3518 }
3519 
3520 static int
3521 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3522 {
3523 
3524 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3525 }
3526 
3527 static int
3528 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3529 {
3530 	struct _umtx_time *tm_p, timeout;
3531 	int error;
3532 
3533 	/* Allow a null timespec (wait forever). */
3534 	if (uap->uaddr2 == NULL)
3535 		tm_p = NULL;
3536 	else {
3537 		error = umtx_copyin_umtx_time(
3538 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3539 		if (error != 0)
3540 			return (error);
3541 		tm_p = &timeout;
3542 	}
3543 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
3544 }
3545 
3546 static int
3547 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3548 {
3549 
3550 	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3551 }
3552 
3553 static int
3554 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3555 {
3556 	struct _umtx_time *tm_p, timeout;
3557 	int error;
3558 
3559 	/* Allow a null timespec (wait forever). */
3560 	if (uap->uaddr2 == NULL)
3561 		tm_p = NULL;
3562 	else {
3563 		error = umtx_copyin_umtx_time(
3564 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3565 		if (error != 0)
3566 			return (error);
3567 		tm_p = &timeout;
3568 	}
3569 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
3570 }
3571 
3572 static int
3573 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3574 {
3575 
3576 	return (do_wake_umutex(td, uap->obj));
3577 }
3578 
3579 static int
3580 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3581 {
3582 
3583 	return (do_unlock_umutex(td, uap->obj, false));
3584 }
3585 
3586 static int
3587 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3588 {
3589 
3590 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3591 }
3592 
3593 static int
3594 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3595 {
3596 	struct timespec *ts, timeout;
3597 	int error;
3598 
3599 	/* Allow a null timespec (wait forever). */
3600 	if (uap->uaddr2 == NULL)
3601 		ts = NULL;
3602 	else {
3603 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3604 		if (error != 0)
3605 			return (error);
3606 		ts = &timeout;
3607 	}
3608 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3609 }
3610 
3611 static int
3612 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3613 {
3614 
3615 	return (do_cv_signal(td, uap->obj));
3616 }
3617 
3618 static int
3619 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3620 {
3621 
3622 	return (do_cv_broadcast(td, uap->obj));
3623 }
3624 
3625 static int
3626 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3627 {
3628 	struct _umtx_time timeout;
3629 	int error;
3630 
3631 	/* Allow a null timespec (wait forever). */
3632 	if (uap->uaddr2 == NULL) {
3633 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3634 	} else {
3635 		error = umtx_copyin_umtx_time(uap->uaddr2,
3636 		   (size_t)uap->uaddr1, &timeout);
3637 		if (error != 0)
3638 			return (error);
3639 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3640 	}
3641 	return (error);
3642 }
3643 
3644 static int
3645 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3646 {
3647 	struct _umtx_time timeout;
3648 	int error;
3649 
3650 	/* Allow a null timespec (wait forever). */
3651 	if (uap->uaddr2 == NULL) {
3652 		error = do_rw_wrlock(td, uap->obj, 0);
3653 	} else {
3654 		error = umtx_copyin_umtx_time(uap->uaddr2,
3655 		   (size_t)uap->uaddr1, &timeout);
3656 		if (error != 0)
3657 			return (error);
3658 
3659 		error = do_rw_wrlock(td, uap->obj, &timeout);
3660 	}
3661 	return (error);
3662 }
3663 
3664 static int
3665 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3666 {
3667 
3668 	return (do_rw_unlock(td, uap->obj));
3669 }
3670 
3671 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3672 static int
3673 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3674 {
3675 	struct _umtx_time *tm_p, timeout;
3676 	int error;
3677 
3678 	/* Allow a null timespec (wait forever). */
3679 	if (uap->uaddr2 == NULL)
3680 		tm_p = NULL;
3681 	else {
3682 		error = umtx_copyin_umtx_time(
3683 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3684 		if (error != 0)
3685 			return (error);
3686 		tm_p = &timeout;
3687 	}
3688 	return (do_sem_wait(td, uap->obj, tm_p));
3689 }
3690 
3691 static int
3692 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3693 {
3694 
3695 	return (do_sem_wake(td, uap->obj));
3696 }
3697 #endif
3698 
3699 static int
3700 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3701 {
3702 
3703 	return (do_wake2_umutex(td, uap->obj, uap->val));
3704 }
3705 
3706 static int
3707 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
3708 {
3709 	struct _umtx_time *tm_p, timeout;
3710 	size_t uasize;
3711 	int error;
3712 
3713 	/* Allow a null timespec (wait forever). */
3714 	if (uap->uaddr2 == NULL) {
3715 		uasize = 0;
3716 		tm_p = NULL;
3717 	} else {
3718 		uasize = (size_t)uap->uaddr1;
3719 		error = umtx_copyin_umtx_time(uap->uaddr2, uasize, &timeout);
3720 		if (error != 0)
3721 			return (error);
3722 		tm_p = &timeout;
3723 	}
3724 	error = do_sem2_wait(td, uap->obj, tm_p);
3725 	if (error == EINTR && uap->uaddr2 != NULL &&
3726 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
3727 	    uasize >= sizeof(struct _umtx_time) + sizeof(struct timespec)) {
3728 		error = copyout(&timeout._timeout,
3729 		    (struct _umtx_time *)uap->uaddr2 + 1,
3730 		    sizeof(struct timespec));
3731 		if (error == 0) {
3732 			error = EINTR;
3733 		}
3734 	}
3735 
3736 	return (error);
3737 }
3738 
3739 static int
3740 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
3741 {
3742 
3743 	return (do_sem2_wake(td, uap->obj));
3744 }
3745 
/*
 * View the umtx_data member of a VM object as the head of the list of
 * umtx_shm_reg entries attached to that object.
 */
#define	USHM_OBJ_UMTX(o)						\
    ((struct umtx_shm_obj_list *)(&(o)->umtx_data))

/* Bits for umtx_shm_reg.ushm_flags. */
#define	USHMF_REG_LINKED	0x0001	/* on a umtx_shm_registry[] chain */
#define	USHMF_OBJ_LINKED	0x0002	/* on the owning object's list */
/*
 * One registry entry, associating a shared umtx key with an anonymous
 * shared memory object.
 */
struct umtx_shm_reg {
	TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link;	/* hash chain / delfree */
	LIST_ENTRY(umtx_shm_reg) ushm_obj_link;	/* per-object list */
	struct umtx_key		ushm_key;	/* lookup key */
	struct ucred		*ushm_cred;	/* creator, charged via chgumtxcnt */
	struct shmfd		*ushm_obj;	/* backing shm object */
	u_int			ushm_refcnt;	/* guarded by umtx_shm_lock */
	u_int			ushm_flags;	/* USHMF_* */
};

LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg);
TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg);

/* Allocation zone for struct umtx_shm_reg. */
static uma_zone_t umtx_shm_reg_zone;
/* Hash chains of linked entries, indexed by the key's hash. */
static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS];
/* Protects the registry chains, per-object lists and reference counts. */
static struct mtx umtx_shm_lock;
/* Entries queued for freeing by umtx_shm_reg_delfree_tq(). */
static struct umtx_shm_reg_head umtx_shm_reg_delfree =
    TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree);

static void umtx_shm_free_reg(struct umtx_shm_reg *reg);
3771 
3772 static void
3773 umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused)
3774 {
3775 	struct umtx_shm_reg_head d;
3776 	struct umtx_shm_reg *reg, *reg1;
3777 
3778 	TAILQ_INIT(&d);
3779 	mtx_lock(&umtx_shm_lock);
3780 	TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link);
3781 	mtx_unlock(&umtx_shm_lock);
3782 	TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) {
3783 		TAILQ_REMOVE(&d, reg, ushm_reg_link);
3784 		umtx_shm_free_reg(reg);
3785 	}
3786 }
3787 
3788 static struct task umtx_shm_reg_delfree_task =
3789     TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL);
3790 
3791 static struct umtx_shm_reg *
3792 umtx_shm_find_reg_locked(const struct umtx_key *key)
3793 {
3794 	struct umtx_shm_reg *reg;
3795 	struct umtx_shm_reg_head *reg_head;
3796 
3797 	KASSERT(key->shared, ("umtx_p_find_rg: private key"));
3798 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3799 	reg_head = &umtx_shm_registry[key->hash];
3800 	TAILQ_FOREACH(reg, reg_head, ushm_reg_link) {
3801 		KASSERT(reg->ushm_key.shared,
3802 		    ("non-shared key on reg %p %d", reg, reg->ushm_key.shared));
3803 		if (reg->ushm_key.info.shared.object ==
3804 		    key->info.shared.object &&
3805 		    reg->ushm_key.info.shared.offset ==
3806 		    key->info.shared.offset) {
3807 			KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM"));
3808 			KASSERT(reg->ushm_refcnt > 0,
3809 			    ("reg %p refcnt 0 onlist", reg));
3810 			KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0,
3811 			    ("reg %p not linked", reg));
3812 			reg->ushm_refcnt++;
3813 			return (reg);
3814 		}
3815 	}
3816 	return (NULL);
3817 }
3818 
3819 static struct umtx_shm_reg *
3820 umtx_shm_find_reg(const struct umtx_key *key)
3821 {
3822 	struct umtx_shm_reg *reg;
3823 
3824 	mtx_lock(&umtx_shm_lock);
3825 	reg = umtx_shm_find_reg_locked(key);
3826 	mtx_unlock(&umtx_shm_lock);
3827 	return (reg);
3828 }
3829 
3830 static void
3831 umtx_shm_free_reg(struct umtx_shm_reg *reg)
3832 {
3833 
3834 	chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0);
3835 	crfree(reg->ushm_cred);
3836 	shm_drop(reg->ushm_obj);
3837 	uma_zfree(umtx_shm_reg_zone, reg);
3838 }
3839 
3840 static bool
3841 umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force)
3842 {
3843 	bool res;
3844 
3845 	mtx_assert(&umtx_shm_lock, MA_OWNED);
3846 	KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg));
3847 	reg->ushm_refcnt--;
3848 	res = reg->ushm_refcnt == 0;
3849 	if (res || force) {
3850 		if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) {
3851 			TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash],
3852 			    reg, ushm_reg_link);
3853 			reg->ushm_flags &= ~USHMF_REG_LINKED;
3854 		}
3855 		if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) {
3856 			LIST_REMOVE(reg, ushm_obj_link);
3857 			reg->ushm_flags &= ~USHMF_OBJ_LINKED;
3858 		}
3859 	}
3860 	return (res);
3861 }
3862 
3863 static void
3864 umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force)
3865 {
3866 	vm_object_t object;
3867 	bool dofree;
3868 
3869 	if (force) {
3870 		object = reg->ushm_obj->shm_object;
3871 		VM_OBJECT_WLOCK(object);
3872 		object->flags |= OBJ_UMTXDEAD;
3873 		VM_OBJECT_WUNLOCK(object);
3874 	}
3875 	mtx_lock(&umtx_shm_lock);
3876 	dofree = umtx_shm_unref_reg_locked(reg, force);
3877 	mtx_unlock(&umtx_shm_lock);
3878 	if (dofree)
3879 		umtx_shm_free_reg(reg);
3880 }
3881 
3882 void
3883 umtx_shm_object_init(vm_object_t object)
3884 {
3885 
3886 	LIST_INIT(USHM_OBJ_UMTX(object));
3887 }
3888 
3889 void
3890 umtx_shm_object_terminated(vm_object_t object)
3891 {
3892 	struct umtx_shm_reg *reg, *reg1;
3893 	bool dofree;
3894 
3895 	if (LIST_EMPTY(USHM_OBJ_UMTX(object)))
3896 		return;
3897 
3898 	dofree = false;
3899 	mtx_lock(&umtx_shm_lock);
3900 	LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) {
3901 		if (umtx_shm_unref_reg_locked(reg, true)) {
3902 			TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg,
3903 			    ushm_reg_link);
3904 			dofree = true;
3905 		}
3906 	}
3907 	mtx_unlock(&umtx_shm_lock);
3908 	if (dofree)
3909 		taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task);
3910 }
3911 
3912 static int
3913 umtx_shm_create_reg(struct thread *td, const struct umtx_key *key,
3914     struct umtx_shm_reg **res)
3915 {
3916 	struct umtx_shm_reg *reg, *reg1;
3917 	struct ucred *cred;
3918 	int error;
3919 
3920 	reg = umtx_shm_find_reg(key);
3921 	if (reg != NULL) {
3922 		*res = reg;
3923 		return (0);
3924 	}
3925 	cred = td->td_ucred;
3926 	if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP)))
3927 		return (ENOMEM);
3928 	reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
3929 	reg->ushm_refcnt = 1;
3930 	bcopy(key, &reg->ushm_key, sizeof(*key));
3931 	reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
3932 	reg->ushm_cred = crhold(cred);
3933 	error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
3934 	if (error != 0) {
3935 		umtx_shm_free_reg(reg);
3936 		return (error);
3937 	}
3938 	mtx_lock(&umtx_shm_lock);
3939 	reg1 = umtx_shm_find_reg_locked(key);
3940 	if (reg1 != NULL) {
3941 		mtx_unlock(&umtx_shm_lock);
3942 		umtx_shm_free_reg(reg);
3943 		*res = reg1;
3944 		return (0);
3945 	}
3946 	reg->ushm_refcnt++;
3947 	TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link);
3948 	LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg,
3949 	    ushm_obj_link);
3950 	reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED;
3951 	mtx_unlock(&umtx_shm_lock);
3952 	*res = reg;
3953 	return (0);
3954 }
3955 
3956 static int
3957 umtx_shm_alive(struct thread *td, void *addr)
3958 {
3959 	vm_map_t map;
3960 	vm_map_entry_t entry;
3961 	vm_object_t object;
3962 	vm_pindex_t pindex;
3963 	vm_prot_t prot;
3964 	int res, ret;
3965 	boolean_t wired;
3966 
3967 	map = &td->td_proc->p_vmspace->vm_map;
3968 	res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry,
3969 	    &object, &pindex, &prot, &wired);
3970 	if (res != KERN_SUCCESS)
3971 		return (EFAULT);
3972 	if (object == NULL)
3973 		ret = EINVAL;
3974 	else
3975 		ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0;
3976 	vm_map_lookup_done(map, entry);
3977 	return (ret);
3978 }
3979 
3980 static void
3981 umtx_shm_init(void)
3982 {
3983 	int i;
3984 
3985 	umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg),
3986 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
3987 	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
3988 	for (i = 0; i < nitems(umtx_shm_registry); i++)
3989 		TAILQ_INIT(&umtx_shm_registry[i]);
3990 }
3991 
3992 static int
3993 umtx_shm(struct thread *td, void *addr, u_int flags)
3994 {
3995 	struct umtx_key key;
3996 	struct umtx_shm_reg *reg;
3997 	struct file *fp;
3998 	int error, fd;
3999 
4000 	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
4001 	    UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1)
4002 		return (EINVAL);
4003 	if ((flags & UMTX_SHM_ALIVE) != 0)
4004 		return (umtx_shm_alive(td, addr));
4005 	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
4006 	if (error != 0)
4007 		return (error);
4008 	KASSERT(key.shared == 1, ("non-shared key"));
4009 	if ((flags & UMTX_SHM_CREAT) != 0) {
4010 		error = umtx_shm_create_reg(td, &key, &reg);
4011 	} else {
4012 		reg = umtx_shm_find_reg(&key);
4013 		if (reg == NULL)
4014 			error = ESRCH;
4015 	}
4016 	umtx_key_release(&key);
4017 	if (error != 0)
4018 		return (error);
4019 	KASSERT(reg != NULL, ("no reg"));
4020 	if ((flags & UMTX_SHM_DESTROY) != 0) {
4021 		umtx_shm_unref_reg(reg, true);
4022 	} else {
4023 #if 0
4024 #ifdef MAC
4025 		error = mac_posixshm_check_open(td->td_ucred,
4026 		    reg->ushm_obj, FFLAGS(O_RDWR));
4027 		if (error == 0)
4028 #endif
4029 			error = shm_access(reg->ushm_obj, td->td_ucred,
4030 			    FFLAGS(O_RDWR));
4031 		if (error == 0)
4032 #endif
4033 			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
4034 		if (error == 0) {
4035 			shm_hold(reg->ushm_obj);
4036 			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
4037 			    &shm_ops);
4038 			td->td_retval[0] = fd;
4039 			fdrop(fp, td);
4040 		}
4041 	}
4042 	umtx_shm_unref_reg(reg, false);
4043 	return (error);
4044 }
4045 
4046 static int
4047 __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap)
4048 {
4049 
4050 	return (umtx_shm(td, uap->uaddr1, uap->val));
4051 }
4052 
4053 static int
4054 umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp)
4055 {
4056 
4057 	td->td_rb_list = rbp->robust_list_offset;
4058 	td->td_rbp_list = rbp->robust_priv_list_offset;
4059 	td->td_rb_inact = rbp->robust_inact_offset;
4060 	return (0);
4061 }
4062 
4063 static int
4064 __umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap)
4065 {
4066 	struct umtx_robust_lists_params rb;
4067 	int error;
4068 
4069 	if (uap->val > sizeof(rb))
4070 		return (EINVAL);
4071 	bzero(&rb, sizeof(rb));
4072 	error = copyin(uap->uaddr1, &rb, uap->val);
4073 	if (error != 0)
4074 		return (error);
4075 	return (umtx_robust_lists(td, &rb));
4076 }
4077 
/* Signature shared by every _umtx_op request handler. */
typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);

/*
 * Native dispatch table, indexed by the UMTX_OP_* request number.
 * The two legacy semaphore requests are only implemented when
 * COMPAT_FREEBSD9 or COMPAT_FREEBSD10 is configured.
 */
static const _umtx_op_func op_table[] = {
	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
	[UMTX_OP_WAIT]		= __umtx_op_wait,
	[UMTX_OP_WAKE]		= __umtx_op_wake,
	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
#else
	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
#endif
	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
	[UMTX_OP_SHM]		= __umtx_op_shm,
	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists,
};
4114 
4115 int
4116 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
4117 {
4118 
4119 	if ((unsigned)uap->op < nitems(op_table))
4120 		return (*op_table[uap->op])(td, uap);
4121 	return (EINVAL);
4122 }
4123 
4124 #ifdef COMPAT_FREEBSD32
4125 
/* Timespec layout with 32-bit fields, used when reading compat32 timeouts. */
struct timespec32 {
	int32_t tv_sec;
	int32_t tv_nsec;
};

/* 32-bit counterpart of struct _umtx_time, as copied in from userspace. */
struct umtx_time32 {
	struct	timespec32	timeout;
	uint32_t		flags;
	uint32_t		clockid;
};
4136 
4137 static inline int
4138 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
4139 {
4140 	struct timespec32 ts32;
4141 	int error;
4142 
4143 	error = copyin(addr, &ts32, sizeof(struct timespec32));
4144 	if (error == 0) {
4145 		if (ts32.tv_sec < 0 ||
4146 		    ts32.tv_nsec >= 1000000000 ||
4147 		    ts32.tv_nsec < 0)
4148 			error = EINVAL;
4149 		else {
4150 			tsp->tv_sec = ts32.tv_sec;
4151 			tsp->tv_nsec = ts32.tv_nsec;
4152 		}
4153 	}
4154 	return (error);
4155 }
4156 
4157 static inline int
4158 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
4159 {
4160 	struct umtx_time32 t32;
4161 	int error;
4162 
4163 	t32.clockid = CLOCK_REALTIME;
4164 	t32.flags   = 0;
4165 	if (size <= sizeof(struct timespec32))
4166 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
4167 	else
4168 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
4169 	if (error != 0)
4170 		return (error);
4171 	if (t32.timeout.tv_sec < 0 ||
4172 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
4173 		return (EINVAL);
4174 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
4175 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
4176 	tp->_flags = t32.flags;
4177 	tp->_clockid = t32.clockid;
4178 	return (0);
4179 }
4180 
4181 static int
4182 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4183 {
4184 	struct _umtx_time *tm_p, timeout;
4185 	int error;
4186 
4187 	if (uap->uaddr2 == NULL)
4188 		tm_p = NULL;
4189 	else {
4190 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4191 			(size_t)uap->uaddr1, &timeout);
4192 		if (error != 0)
4193 			return (error);
4194 		tm_p = &timeout;
4195 	}
4196 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
4197 }
4198 
4199 static int
4200 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
4201 {
4202 	struct _umtx_time *tm_p, timeout;
4203 	int error;
4204 
4205 	/* Allow a null timespec (wait forever). */
4206 	if (uap->uaddr2 == NULL)
4207 		tm_p = NULL;
4208 	else {
4209 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4210 			    (size_t)uap->uaddr1, &timeout);
4211 		if (error != 0)
4212 			return (error);
4213 		tm_p = &timeout;
4214 	}
4215 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
4216 }
4217 
4218 static int
4219 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
4220 {
4221 	struct _umtx_time *tm_p, timeout;
4222 	int error;
4223 
4224 	/* Allow a null timespec (wait forever). */
4225 	if (uap->uaddr2 == NULL)
4226 		tm_p = NULL;
4227 	else {
4228 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4229 		    (size_t)uap->uaddr1, &timeout);
4230 		if (error != 0)
4231 			return (error);
4232 		tm_p = &timeout;
4233 	}
4234 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
4235 }
4236 
4237 static int
4238 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4239 {
4240 	struct timespec *ts, timeout;
4241 	int error;
4242 
4243 	/* Allow a null timespec (wait forever). */
4244 	if (uap->uaddr2 == NULL)
4245 		ts = NULL;
4246 	else {
4247 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
4248 		if (error != 0)
4249 			return (error);
4250 		ts = &timeout;
4251 	}
4252 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
4253 }
4254 
4255 static int
4256 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
4257 {
4258 	struct _umtx_time timeout;
4259 	int error;
4260 
4261 	/* Allow a null timespec (wait forever). */
4262 	if (uap->uaddr2 == NULL) {
4263 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
4264 	} else {
4265 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4266 		    (size_t)uap->uaddr1, &timeout);
4267 		if (error != 0)
4268 			return (error);
4269 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
4270 	}
4271 	return (error);
4272 }
4273 
4274 static int
4275 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
4276 {
4277 	struct _umtx_time timeout;
4278 	int error;
4279 
4280 	/* Allow a null timespec (wait forever). */
4281 	if (uap->uaddr2 == NULL) {
4282 		error = do_rw_wrlock(td, uap->obj, 0);
4283 	} else {
4284 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4285 		    (size_t)uap->uaddr1, &timeout);
4286 		if (error != 0)
4287 			return (error);
4288 		error = do_rw_wrlock(td, uap->obj, &timeout);
4289 	}
4290 	return (error);
4291 }
4292 
4293 static int
4294 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
4295 {
4296 	struct _umtx_time *tm_p, timeout;
4297 	int error;
4298 
4299 	if (uap->uaddr2 == NULL)
4300 		tm_p = NULL;
4301 	else {
4302 		error = umtx_copyin_umtx_time32(
4303 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
4304 		if (error != 0)
4305 			return (error);
4306 		tm_p = &timeout;
4307 	}
4308 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
4309 }
4310 
4311 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
4312 static int
4313 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4314 {
4315 	struct _umtx_time *tm_p, timeout;
4316 	int error;
4317 
4318 	/* Allow a null timespec (wait forever). */
4319 	if (uap->uaddr2 == NULL)
4320 		tm_p = NULL;
4321 	else {
4322 		error = umtx_copyin_umtx_time32(uap->uaddr2,
4323 		    (size_t)uap->uaddr1, &timeout);
4324 		if (error != 0)
4325 			return (error);
4326 		tm_p = &timeout;
4327 	}
4328 	return (do_sem_wait(td, uap->obj, tm_p));
4329 }
4330 #endif
4331 
4332 static int
4333 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
4334 {
4335 	struct _umtx_time *tm_p, timeout;
4336 	size_t uasize;
4337 	int error;
4338 
4339 	/* Allow a null timespec (wait forever). */
4340 	if (uap->uaddr2 == NULL) {
4341 		uasize = 0;
4342 		tm_p = NULL;
4343 	} else {
4344 		uasize = (size_t)uap->uaddr1;
4345 		error = umtx_copyin_umtx_time32(uap->uaddr2, uasize, &timeout);
4346 		if (error != 0)
4347 			return (error);
4348 		tm_p = &timeout;
4349 	}
4350 	error = do_sem2_wait(td, uap->obj, tm_p);
4351 	if (error == EINTR && uap->uaddr2 != NULL &&
4352 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
4353 	    uasize >= sizeof(struct umtx_time32) + sizeof(struct timespec32)) {
4354 		struct timespec32 remain32 = {
4355 			.tv_sec = timeout._timeout.tv_sec,
4356 			.tv_nsec = timeout._timeout.tv_nsec
4357 		};
4358 		error = copyout(&remain32,
4359 		    (struct umtx_time32 *)uap->uaddr2 + 1,
4360 		    sizeof(struct timespec32));
4361 		if (error == 0) {
4362 			error = EINTR;
4363 		}
4364 	}
4365 
4366 	return (error);
4367 }
4368 
4369 static int
4370 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
4371 {
4372 	uint32_t uaddrs[BATCH_SIZE], **upp;
4373 	int count, error, i, pos, tocopy;
4374 
4375 	upp = (uint32_t **)uap->obj;
4376 	error = 0;
4377 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
4378 	    pos += tocopy) {
4379 		tocopy = MIN(count, BATCH_SIZE);
4380 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
4381 		if (error != 0)
4382 			break;
4383 		for (i = 0; i < tocopy; ++i)
4384 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
4385 			    INT_MAX, 1);
4386 		maybe_yield();
4387 	}
4388 	return (error);
4389 }
4390 
4391 struct umtx_robust_lists_params_compat32 {
4392 	uint32_t	robust_list_offset;
4393 	uint32_t	robust_priv_list_offset;
4394 	uint32_t	robust_inact_offset;
4395 };
4396 
4397 static int
4398 __umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap)
4399 {
4400 	struct umtx_robust_lists_params rb;
4401 	struct umtx_robust_lists_params_compat32 rb32;
4402 	int error;
4403 
4404 	if (uap->val > sizeof(rb32))
4405 		return (EINVAL);
4406 	bzero(&rb, sizeof(rb));
4407 	bzero(&rb32, sizeof(rb32));
4408 	error = copyin(uap->uaddr1, &rb32, uap->val);
4409 	if (error != 0)
4410 		return (error);
4411 	rb.robust_list_offset = rb32.robust_list_offset;
4412 	rb.robust_priv_list_offset = rb32.robust_priv_list_offset;
4413 	rb.robust_inact_offset = rb32.robust_inact_offset;
4414 	return (umtx_robust_lists(td, &rb));
4415 }
4416 
/*
 * Dispatch table for 32-bit processes, indexed by the UMTX_OP_* request
 * number.  Requests with a _compat32 / 32 handler use it; the remaining
 * slots reuse the native handlers.  UMTX_OP_WAIT and UMTX_OP_WAIT_UINT
 * share one handler here.
 */
static const _umtx_op_func op_table_compat32[] = {
	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
	[UMTX_OP_WAIT]		= __umtx_op_wait_compat32,
	[UMTX_OP_WAKE]		= __umtx_op_wake,
	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex_compat32,
	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait_compat32,
	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_compat32,
	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock_compat32,
	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock_compat32,
	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex_compat32,
	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait_compat32,
	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
#else
	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
#endif
	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private32,
	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait_compat32,
	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
	[UMTX_OP_SHM]		= __umtx_op_shm,
	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists_compat32,
};
4451 
4452 int
4453 freebsd32__umtx_op(struct thread *td, struct freebsd32__umtx_op_args *uap)
4454 {
4455 
4456 	if ((unsigned)uap->op < nitems(op_table_compat32)) {
4457 		return (*op_table_compat32[uap->op])(td,
4458 		    (struct _umtx_op_args *)uap);
4459 	}
4460 	return (EINVAL);
4461 }
4462 #endif
4463 
4464 void
4465 umtx_thread_init(struct thread *td)
4466 {
4467 
4468 	td->td_umtxq = umtxq_alloc();
4469 	td->td_umtxq->uq_thread = td;
4470 }
4471 
4472 void
4473 umtx_thread_fini(struct thread *td)
4474 {
4475 
4476 	umtxq_free(td->td_umtxq);
4477 }
4478 
4479 /*
4480  * It will be called when new thread is created, e.g fork().
4481  */
4482 void
4483 umtx_thread_alloc(struct thread *td)
4484 {
4485 	struct umtx_q *uq;
4486 
4487 	uq = td->td_umtxq;
4488 	uq->uq_inherited_pri = PRI_MAX;
4489 
4490 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
4491 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
4492 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
4493 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
4494 }
4495 
4496 /*
4497  * exec() hook.
4498  *
4499  * Clear robust lists for all process' threads, not delaying the
4500  * cleanup to thread_exit hook, since the relevant address space is
4501  * destroyed right now.
4502  */
4503 static void
4504 umtx_exec_hook(void *arg __unused, struct proc *p,
4505     struct image_params *imgp __unused)
4506 {
4507 	struct thread *td;
4508 
4509 	KASSERT(p == curproc, ("need curproc"));
4510 	KASSERT((p->p_flag & P_HADTHREADS) == 0 ||
4511 	    (p->p_flag & P_STOPPED_SINGLE) != 0,
4512 	    ("curproc must be single-threaded"));
4513 	/*
4514 	 * There is no need to lock the list as only this thread can be
4515 	 * running.
4516 	 */
4517 	FOREACH_THREAD_IN_PROC(p, td) {
4518 		KASSERT(td == curthread ||
4519 		    ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)),
4520 		    ("running thread %p %p", p, td));
4521 		umtx_thread_cleanup(td);
4522 		td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0;
4523 	}
4524 }
4525 
/*
 * thread_exit() hook: runs the umtx cleanup for the exiting thread.
 */
void
umtx_thread_exit(struct thread *td)
{

	umtx_thread_cleanup(td);
}
4535 
4536 static int
4537 umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res)
4538 {
4539 	u_long res1;
4540 #ifdef COMPAT_FREEBSD32
4541 	uint32_t res32;
4542 #endif
4543 	int error;
4544 
4545 #ifdef COMPAT_FREEBSD32
4546 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
4547 		error = fueword32((void *)ptr, &res32);
4548 		if (error == 0)
4549 			res1 = res32;
4550 	} else
4551 #endif
4552 	{
4553 		error = fueword((void *)ptr, &res1);
4554 	}
4555 	if (error == 0)
4556 		*res = res1;
4557 	else
4558 		error = EFAULT;
4559 	return (error);
4560 }
4561 
4562 static void
4563 umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list)
4564 {
4565 #ifdef COMPAT_FREEBSD32
4566 	struct umutex32 m32;
4567 
4568 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
4569 		memcpy(&m32, m, sizeof(m32));
4570 		*rb_list = m32.m_rb_lnk;
4571 	} else
4572 #endif
4573 		*rb_list = m->m_rb_lnk;
4574 }
4575 
4576 static int
4577 umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact)
4578 {
4579 	struct umutex m;
4580 	int error;
4581 
4582 	KASSERT(td->td_proc == curproc, ("need current vmspace"));
4583 	error = copyin((void *)rbp, &m, sizeof(m));
4584 	if (error != 0)
4585 		return (error);
4586 	if (rb_list != NULL)
4587 		umtx_read_rb_list(td, &m, rb_list);
4588 	if ((m.m_flags & UMUTEX_ROBUST) == 0)
4589 		return (EINVAL);
4590 	if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid)
4591 		/* inact is cleared after unlock, allow the inconsistency */
4592 		return (inact ? 0 : EINVAL);
4593 	return (do_unlock_umutex(td, (struct umutex *)rbp, true));
4594 }
4595 
4596 static void
4597 umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact,
4598     const char *name)
4599 {
4600 	int error, i;
4601 	uintptr_t rbp;
4602 	bool inact;
4603 
4604 	if (rb_list == 0)
4605 		return;
4606 	error = umtx_read_uptr(td, rb_list, &rbp);
4607 	for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) {
4608 		if (rbp == *rb_inact) {
4609 			inact = true;
4610 			*rb_inact = 0;
4611 		} else
4612 			inact = false;
4613 		error = umtx_handle_rb(td, rbp, &rbp, inact);
4614 	}
4615 	if (i == umtx_max_rb && umtx_verbose_rb) {
4616 		uprintf("comm %s pid %d: reached umtx %smax rb %d\n",
4617 		    td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb);
4618 	}
4619 	if (error != 0 && umtx_verbose_rb) {
4620 		uprintf("comm %s pid %d: handling %srb error %d\n",
4621 		    td->td_proc->p_comm, td->td_proc->p_pid, name, error);
4622 	}
4623 }
4624 
4625 /*
4626  * Clean up umtx data.
4627  */
4628 static void
4629 umtx_thread_cleanup(struct thread *td)
4630 {
4631 	struct umtx_q *uq;
4632 	struct umtx_pi *pi;
4633 	uintptr_t rb_inact;
4634 
4635 	/*
4636 	 * Disown pi mutexes.
4637 	 */
4638 	uq = td->td_umtxq;
4639 	if (uq != NULL) {
4640 		if (uq->uq_inherited_pri != PRI_MAX ||
4641 		    !TAILQ_EMPTY(&uq->uq_pi_contested)) {
4642 			mtx_lock(&umtx_lock);
4643 			uq->uq_inherited_pri = PRI_MAX;
4644 			while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
4645 				pi->pi_owner = NULL;
4646 				TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
4647 			}
4648 			mtx_unlock(&umtx_lock);
4649 		}
4650 		sched_lend_user_prio_cond(td, PRI_MAX);
4651 	}
4652 
4653 	if (td->td_rb_inact == 0 && td->td_rb_list == 0 && td->td_rbp_list == 0)
4654 		return;
4655 
4656 	/*
4657 	 * Handle terminated robust mutexes.  Must be done after
4658 	 * robust pi disown, otherwise unlock could see unowned
4659 	 * entries.
4660 	 */
4661 	rb_inact = td->td_rb_inact;
4662 	if (rb_inact != 0)
4663 		(void)umtx_read_uptr(td, rb_inact, &rb_inact);
4664 	umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "");
4665 	umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ");
4666 	if (rb_inact != 0)
4667 		(void)umtx_handle_rb(td, rb_inact, NULL, true);
4668 }
4669