xref: /freebsd/sys/kern/kern_umtx.c (revision 595e514d0df2bac5b813d35f83e32875dbf16a83)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include "opt_umtx_profiling.h"
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/priv.h>
41 #include <sys/proc.h>
42 #include <sys/sbuf.h>
43 #include <sys/sched.h>
44 #include <sys/smp.h>
45 #include <sys/sysctl.h>
46 #include <sys/sysent.h>
47 #include <sys/systm.h>
48 #include <sys/sysproto.h>
49 #include <sys/syscallsubr.h>
50 #include <sys/eventhandler.h>
51 #include <sys/umtx.h>
52 
53 #include <vm/vm.h>
54 #include <vm/vm_param.h>
55 #include <vm/pmap.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_object.h>
58 
59 #include <machine/cpu.h>
60 
61 #ifdef COMPAT_FREEBSD32
62 #include <compat/freebsd32/freebsd32_proto.h>
63 #endif
64 
65 #define _UMUTEX_TRY		1
66 #define _UMUTEX_WAIT		2
67 
68 #ifdef UMTX_PROFILING
69 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
70 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
71 #endif
72 
73 /* Priority inheritance mutex info. */
74 struct umtx_pi {
75 	/* Owner thread */
76 	struct thread		*pi_owner;
77 
78 	/* Reference count */
79 	int			pi_refcount;
80 
81 	/* List entry to link umtx objects held by a thread */
82 	TAILQ_ENTRY(umtx_pi)	pi_link;
83 
84 	/* List entry in hash */
85 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
86 
87 	/* List for waiters */
88 	TAILQ_HEAD(,umtx_q)	pi_blocked;
89 
90 	/* Identify a userland lock object */
91 	struct umtx_key		pi_key;
92 };
93 
94 /* A waiter on a userland synchronization object. */
95 struct umtx_q {
96 	/* Linked list for the hash. */
97 	TAILQ_ENTRY(umtx_q)	uq_link;
98 
99 	/* Umtx key. */
100 	struct umtx_key		uq_key;
101 
102 	/* Umtx flags. */
103 	int			uq_flags;
104 #define UQF_UMTXQ	0x0001
105 
106 	/* The thread that is waiting. */
107 	struct thread		*uq_thread;
108 
109 	/*
110 	 * The PI mutex this thread is blocked on.  Reads may hold
111 	 * either the chain lock or umtx_lock; writes must hold
112 	 * both locks.
113 	 */
114 	struct umtx_pi		*uq_pi_blocked;
115 
116 	/* On blocked list */
117 	TAILQ_ENTRY(umtx_q)	uq_lockq;
118 
119 	/* List of contested PI mutexes owned by this thread */
120 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
121 
122 	/* Inherited priority from PP mutex */
123 	u_char			uq_inherited_pri;
124 
125 	/* Spare queue ready to be reused */
126 	struct umtxq_queue	*uq_spare_queue;
127 
128 	/* The queue we are on */
129 	struct umtxq_queue	*uq_cur_queue;
130 };
131 
132 TAILQ_HEAD(umtxq_head, umtx_q);
133 
134 /* Per-key wait-queue */
135 struct umtxq_queue {
136 	struct umtxq_head	head;
137 	struct umtx_key		key;
138 	LIST_ENTRY(umtxq_queue)	link;
139 	int			length;
140 };
141 
142 LIST_HEAD(umtxq_list, umtxq_queue);
143 
144 /* Userland lock object's wait-queue chain */
145 struct umtxq_chain {
146 	/* Lock for this chain. */
147 	struct mtx		uc_lock;
148 
149 	/* List of sleep queues. */
150 	struct umtxq_list	uc_queue[2];
151 #define UMTX_SHARED_QUEUE	0
152 #define UMTX_EXCLUSIVE_QUEUE	1
153 
154 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
155 
156 	/* Busy flag */
157 	char			uc_busy;
158 
159 	/* Chain lock waiters */
160 	int			uc_waiters;
161 
162 	/* All PI mutexes hashed to this chain */
163 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
164 
165 #ifdef UMTX_PROFILING
166 	u_int 			length;
167 	u_int			max_length;
168 #endif
169 };
170 
171 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
172 #define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
173 
174 /*
175  * Don't propagate time-sharing priority; there is a security reason:
176  * a user can simply create a PI mutex, let thread A lock it, and let
177  * another thread B block on it.  Because B is sleeping, its priority
178  * will be boosted, which boosts A's priority via priority propagation
179  * as well; A's priority would then never be lowered, even if A were
180  * using 100% CPU, which is unfair to other processes.
181  */
182 
183 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
184 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
185 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
186 
187 #define	GOLDEN_RATIO_PRIME	2654404609U
188 #define	UMTX_CHAINS		512
189 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
190 
191 #define	GET_SHARE(flags)	\
192     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
193 
194 #define BUSY_SPINS		200
195 
196 struct abs_timeout {
197 	int clockid;
198 	struct timespec cur;
199 	struct timespec end;
200 };
201 
202 static uma_zone_t		umtx_pi_zone;
203 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
204 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
205 static int			umtx_pi_allocated;
206 
207 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
208 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
209     &umtx_pi_allocated, 0, "Allocated umtx_pi");
210 
211 #ifdef UMTX_PROFILING
212 static long max_length;
213 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
214 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
215 #endif
216 
217 static void umtxq_sysinit(void *);
218 static void umtxq_hash(struct umtx_key *key);
219 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
220 static void umtxq_lock(struct umtx_key *key);
221 static void umtxq_unlock(struct umtx_key *key);
222 static void umtxq_busy(struct umtx_key *key);
223 static void umtxq_unbusy(struct umtx_key *key);
224 static void umtxq_insert_queue(struct umtx_q *uq, int q);
225 static void umtxq_remove_queue(struct umtx_q *uq, int q);
226 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
227 static int umtxq_count(struct umtx_key *key);
228 static struct umtx_pi *umtx_pi_alloc(int);
229 static void umtx_pi_free(struct umtx_pi *pi);
230 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
231 static void umtx_thread_cleanup(struct thread *td);
232 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
233 	struct image_params *imgp __unused);
234 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
235 
236 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
237 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
238 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
239 
240 static struct mtx umtx_lock;
241 
242 #ifdef UMTX_PROFILING
243 static void
244 umtx_init_profiling(void)
245 {
246 	struct sysctl_oid *chain_oid;
247 	char chain_name[10];
248 	int i;
249 
250 	for (i = 0; i < UMTX_CHAINS; ++i) {
251 		snprintf(chain_name, sizeof(chain_name), "%d", i);
252 		chain_oid = SYSCTL_ADD_NODE(NULL,
253 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
254 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
255 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
256 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
257 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
258 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
259 	}
260 }
261 
262 static int
263 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
264 {
265 	char buf[512];
266 	struct sbuf sb;
267 	struct umtxq_chain *uc;
268 	u_int fract, i, j, tot, whole;
269 	u_int sf0, sf1, sf2, sf3, sf4;
270 	u_int si0, si1, si2, si3, si4;
271 	u_int sw0, sw1, sw2, sw3, sw4;
272 
273 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
274 	for (i = 0; i < 2; i++) {
275 		tot = 0;
276 		for (j = 0; j < UMTX_CHAINS; ++j) {
277 			uc = &umtxq_chains[i][j];
278 			mtx_lock(&uc->uc_lock);
279 			tot += uc->max_length;
280 			mtx_unlock(&uc->uc_lock);
281 		}
282 		if (tot == 0)
283 			sbuf_printf(&sb, "%u) Empty ", i);
284 		else {
285 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
286 			si0 = si1 = si2 = si3 = si4 = 0;
287 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
288 			for (j = 0; j < UMTX_CHAINS; j++) {
289 				uc = &umtxq_chains[i][j];
290 				mtx_lock(&uc->uc_lock);
291 				whole = uc->max_length * 100;
292 				mtx_unlock(&uc->uc_lock);
293 				fract = (whole % tot) * 100;
294 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
295 					sf0 = fract;
296 					si0 = j;
297 					sw0 = whole;
298 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
299 				    sf1)) {
300 					sf1 = fract;
301 					si1 = j;
302 					sw1 = whole;
303 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
304 				    sf2)) {
305 					sf2 = fract;
306 					si2 = j;
307 					sw2 = whole;
308 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
309 				    sf3)) {
310 					sf3 = fract;
311 					si3 = j;
312 					sw3 = whole;
313 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
314 				    sf4)) {
315 					sf4 = fract;
316 					si4 = j;
317 					sw4 = whole;
318 				}
319 			}
320 			sbuf_printf(&sb, "queue %u:\n", i);
321 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
322 			    sf0 / tot, si0);
323 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
324 			    sf1 / tot, si1);
325 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
326 			    sf2 / tot, si2);
327 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
328 			    sf3 / tot, si3);
329 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
330 			    sf4 / tot, si4);
331 		}
332 	}
333 	sbuf_trim(&sb);
334 	sbuf_finish(&sb);
335 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
336 	sbuf_delete(&sb);
337 	return (0);
338 }
339 
340 static int
341 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
342 {
343 	struct umtxq_chain *uc;
344 	u_int i, j;
345 	int clear, error;
346 
347 	clear = 0;
348 	error = sysctl_handle_int(oidp, &clear, 0, req);
349 	if (error != 0 || req->newptr == NULL)
350 		return (error);
351 
352 	if (clear != 0) {
353 		for (i = 0; i < 2; ++i) {
354 			for (j = 0; j < UMTX_CHAINS; ++j) {
355 				uc = &umtxq_chains[i][j];
356 				mtx_lock(&uc->uc_lock);
357 				uc->length = 0;
358 				uc->max_length = 0;
359 				mtx_unlock(&uc->uc_lock);
360 			}
361 		}
362 	}
363 	return (0);
364 }
365 
366 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
367     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
368     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
369 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
370     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
371     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
372 #endif
373 
374 static void
375 umtxq_sysinit(void *arg __unused)
376 {
377 	int i, j;
378 
379 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
380 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
381 	for (i = 0; i < 2; ++i) {
382 		for (j = 0; j < UMTX_CHAINS; ++j) {
383 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
384 				 MTX_DEF | MTX_DUPOK);
385 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
386 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
387 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
388 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
389 			umtxq_chains[i][j].uc_busy = 0;
390 			umtxq_chains[i][j].uc_waiters = 0;
391 #ifdef UMTX_PROFILING
392 			umtxq_chains[i][j].length = 0;
393 			umtxq_chains[i][j].max_length = 0;
394 #endif
395 		}
396 	}
397 #ifdef UMTX_PROFILING
398 	umtx_init_profiling();
399 #endif
400 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
401 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
402 	    EVENTHANDLER_PRI_ANY);
403 }
404 
405 struct umtx_q *
406 umtxq_alloc(void)
407 {
408 	struct umtx_q *uq;
409 
410 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
411 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
412 	TAILQ_INIT(&uq->uq_spare_queue->head);
413 	TAILQ_INIT(&uq->uq_pi_contested);
414 	uq->uq_inherited_pri = PRI_MAX;
415 	return (uq);
416 }
417 
418 void
419 umtxq_free(struct umtx_q *uq)
420 {
421 	MPASS(uq->uq_spare_queue != NULL);
422 	free(uq->uq_spare_queue, M_UMTX);
423 	free(uq, M_UMTX);
424 }
425 
426 static inline void
427 umtxq_hash(struct umtx_key *key)
428 {
429 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
430 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
431 }
432 
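/*
 * Illustrative note (not part of the kernel source): umtxq_hash() is a
 * Fibonacci-style multiplicative hash.  Multiplying by
 * GOLDEN_RATIO_PRIME diffuses the key bits into the high bits of the
 * 32-bit product, the shift by UMTX_SHIFTS (__WORD_BIT - 9) keeps the
 * top 9 bits, and because UMTX_CHAINS == 512 == 1 << 9 the final
 * modulo is only a safety net.  A minimal userland model of the same
 * computation:
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static unsigned
chain_index(uintptr_t a, uintptr_t b)
{
	unsigned n = (unsigned)(a + b);	/* same mix as umtxq_hash() */

	return (((n * 2654404609U) >> (32 - 9)) % 512);
}

int
main(void)
{
	/* Nearby addresses usually land on different chains. */
	printf("%u %u\n", chain_index(0x20001000UL, 0),
	    chain_index(0x20001040UL, 0));
	return (0);
}
#endif
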
433 static inline struct umtxq_chain *
434 umtxq_getchain(struct umtx_key *key)
435 {
436 	if (key->type <= TYPE_SEM)
437 		return (&umtxq_chains[1][key->hash]);
438 	return (&umtxq_chains[0][key->hash]);
439 }
440 
441 /*
442  * Lock a chain.
443  */
444 static inline void
445 umtxq_lock(struct umtx_key *key)
446 {
447 	struct umtxq_chain *uc;
448 
449 	uc = umtxq_getchain(key);
450 	mtx_lock(&uc->uc_lock);
451 }
452 
453 /*
454  * Unlock a chain.
455  */
456 static inline void
457 umtxq_unlock(struct umtx_key *key)
458 {
459 	struct umtxq_chain *uc;
460 
461 	uc = umtxq_getchain(key);
462 	mtx_unlock(&uc->uc_lock);
463 }
464 
465 /*
466  * Mark the chain busy when the following operation
467  * may block (a kernel mutex cannot be used).
468  */
469 static inline void
470 umtxq_busy(struct umtx_key *key)
471 {
472 	struct umtxq_chain *uc;
473 
474 	uc = umtxq_getchain(key);
475 	mtx_assert(&uc->uc_lock, MA_OWNED);
476 	if (uc->uc_busy) {
477 #ifdef SMP
478 		if (smp_cpus > 1) {
479 			int count = BUSY_SPINS;
480 			if (count > 0) {
481 				umtxq_unlock(key);
482 				while (uc->uc_busy && --count > 0)
483 					cpu_spinwait();
484 				umtxq_lock(key);
485 			}
486 		}
487 #endif
488 		while (uc->uc_busy) {
489 			uc->uc_waiters++;
490 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
491 			uc->uc_waiters--;
492 		}
493 	}
494 	uc->uc_busy = 1;
495 }
496 
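/*
 * Illustrative sketch (not part of the kernel source; the names are
 * hypothetical): umtxq_busy() is an adaptive wait.  On SMP it first
 * spins a bounded number of times with the chain lock dropped, hoping
 * the holder clears uc_busy quickly, and only then sleeps.  The same
 * shape in portable userland C:
 */
#if 0
#include <pthread.h>
#include <sched.h>

struct busy_gate {
	pthread_mutex_t	bg_lock;
	pthread_cond_t	bg_cv;
	int		bg_busy;
};

static void
gate_enter(struct busy_gate *g)
{
	int spins;

	pthread_mutex_lock(&g->bg_lock);
	for (spins = 200; g->bg_busy && spins > 0; spins--) {
		/* Spin with the lock dropped, like cpu_spinwait(). */
		pthread_mutex_unlock(&g->bg_lock);
		sched_yield();
		pthread_mutex_lock(&g->bg_lock);
	}
	while (g->bg_busy)		/* Out of spins: block. */
		pthread_cond_wait(&g->bg_cv, &g->bg_lock);
	g->bg_busy = 1;			/* We own the gate now. */
	pthread_mutex_unlock(&g->bg_lock);
}
#endif
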
497 /*
498  * Unbusy a chain.
499  */
500 static inline void
501 umtxq_unbusy(struct umtx_key *key)
502 {
503 	struct umtxq_chain *uc;
504 
505 	uc = umtxq_getchain(key);
506 	mtx_assert(&uc->uc_lock, MA_OWNED);
507 	KASSERT(uc->uc_busy != 0, ("not busy"));
508 	uc->uc_busy = 0;
509 	if (uc->uc_waiters)
510 		wakeup_one(uc);
511 }
512 
513 static struct umtxq_queue *
514 umtxq_queue_lookup(struct umtx_key *key, int q)
515 {
516 	struct umtxq_queue *uh;
517 	struct umtxq_chain *uc;
518 
519 	uc = umtxq_getchain(key);
520 	UMTXQ_LOCKED_ASSERT(uc);
521 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
522 		if (umtx_key_match(&uh->key, key))
523 			return (uh);
524 	}
525 
526 	return (NULL);
527 }
528 
529 static inline void
530 umtxq_insert_queue(struct umtx_q *uq, int q)
531 {
532 	struct umtxq_queue *uh;
533 	struct umtxq_chain *uc;
534 
535 	uc = umtxq_getchain(&uq->uq_key);
536 	UMTXQ_LOCKED_ASSERT(uc);
537 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
538 	uh = umtxq_queue_lookup(&uq->uq_key, q);
539 	if (uh != NULL) {
540 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
541 	} else {
542 		uh = uq->uq_spare_queue;
543 		uh->key = uq->uq_key;
544 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
545 #ifdef UMTX_PROFILING
546 		uc->length++;
547 		if (uc->length > uc->max_length) {
548 			uc->max_length = uc->length;
549 			if (uc->max_length > max_length)
550 				max_length = uc->max_length;
551 		}
552 #endif
553 	}
554 	uq->uq_spare_queue = NULL;
555 
556 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
557 	uh->length++;
558 	uq->uq_flags |= UQF_UMTXQ;
559 	uq->uq_cur_queue = uh;
560 	return;
561 }
562 
563 static inline void
564 umtxq_remove_queue(struct umtx_q *uq, int q)
565 {
566 	struct umtxq_chain *uc;
567 	struct umtxq_queue *uh;
568 
569 	uc = umtxq_getchain(&uq->uq_key);
570 	UMTXQ_LOCKED_ASSERT(uc);
571 	if (uq->uq_flags & UQF_UMTXQ) {
572 		uh = uq->uq_cur_queue;
573 		TAILQ_REMOVE(&uh->head, uq, uq_link);
574 		uh->length--;
575 		uq->uq_flags &= ~UQF_UMTXQ;
576 		if (TAILQ_EMPTY(&uh->head)) {
577 			KASSERT(uh->length == 0,
578 			    ("inconsistent umtxq_queue length"));
579 #ifdef UMTX_PROFILING
580 			uc->length--;
581 #endif
582 			LIST_REMOVE(uh, link);
583 		} else {
584 			uh = LIST_FIRST(&uc->uc_spare_queue);
585 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
586 			LIST_REMOVE(uh, link);
587 		}
588 		uq->uq_spare_queue = uh;
589 		uq->uq_cur_queue = NULL;
590 	}
591 }
592 
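/*
 * Illustrative note (not part of the kernel source): every umtx_q
 * donates one pre-allocated umtxq_queue.  The first waiter's spare
 * becomes the per-key wait queue; later waiters park theirs on
 * uc_spare_queue.  On removal a thread takes back either the now-empty
 * per-key queue or any spare, so the counts always balance and
 * umtxq_insert_queue() never has to allocate while a chain is locked.
 */
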
593 /*
594  * Return the number of waiters on the shared queue.
595  */
596 static int
597 umtxq_count(struct umtx_key *key)
598 {
599 	struct umtxq_chain *uc;
600 	struct umtxq_queue *uh;
601 
602 	uc = umtxq_getchain(key);
603 	UMTXQ_LOCKED_ASSERT(uc);
604 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
605 	if (uh != NULL)
606 		return (uh->length);
607 	return (0);
608 }
609 
610 /*
611  * Return the number of PI waiters and, through *first, the
612  * first waiter.
613  */
614 static int
615 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
616 {
617 	struct umtxq_chain *uc;
618 	struct umtxq_queue *uh;
619 
620 	*first = NULL;
621 	uc = umtxq_getchain(key);
622 	UMTXQ_LOCKED_ASSERT(uc);
623 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
624 	if (uh != NULL) {
625 		*first = TAILQ_FIRST(&uh->head);
626 		return (uh->length);
627 	}
628 	return (0);
629 }
630 
631 /*
632  * Wake up threads waiting on a userland object.
633  */
634 
635 static int
636 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
637 {
638 	struct umtxq_chain *uc;
639 	struct umtxq_queue *uh;
640 	struct umtx_q *uq;
641 	int ret;
642 
643 	ret = 0;
644 	uc = umtxq_getchain(key);
645 	UMTXQ_LOCKED_ASSERT(uc);
646 	uh = umtxq_queue_lookup(key, q);
647 	if (uh != NULL) {
648 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
649 			umtxq_remove_queue(uq, q);
650 			wakeup(uq);
651 			if (++ret >= n_wake)
652 				return (ret);
653 		}
654 	}
655 	return (ret);
656 }
657 
658 
659 /*
660  * Wake up the specified thread.
661  */
662 static inline void
663 umtxq_signal_thread(struct umtx_q *uq)
664 {
665 	struct umtxq_chain *uc;
666 
667 	uc = umtxq_getchain(&uq->uq_key);
668 	UMTXQ_LOCKED_ASSERT(uc);
669 	umtxq_remove(uq);
670 	wakeup(uq);
671 }
672 
673 static inline int
674 tstohz(const struct timespec *tsp)
675 {
676 	struct timeval tv;
677 
678 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
679 	return tvtohz(&tv);
680 }
681 
682 static void
683 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
684 	const struct timespec *timeout)
685 {
686 
687 	timo->clockid = clockid;
688 	if (!absolute) {
689 		kern_clock_gettime(curthread, clockid, &timo->end);
690 		timo->cur = timo->end;
691 		timespecadd(&timo->end, timeout);
692 	} else {
693 		timo->end = *timeout;
694 		kern_clock_gettime(curthread, clockid, &timo->cur);
695 	}
696 }
697 
698 static void
699 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
700 {
701 
702 	abs_timeout_init(timo, umtxtime->_clockid,
703 		(umtxtime->_flags & UMTX_ABSTIME) != 0,
704 		&umtxtime->_timeout);
705 }
706 
707 static inline void
708 abs_timeout_update(struct abs_timeout *timo)
709 {
710 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
711 }
712 
713 static int
714 abs_timeout_gethz(struct abs_timeout *timo)
715 {
716 	struct timespec tts;
717 
718 	if (timespeccmp(&timo->end, &timo->cur, <=))
719 		return (-1);
720 	tts = timo->end;
721 	timespecsub(&tts, &timo->cur);
722 	return (tstohz(&tts));
723 }
724 
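/*
 * Illustrative usage sketch (not part of the kernel source): a 1.5 s
 * relative timeout becomes an absolute deadline at init time, and the
 * remaining ticks are recomputed after every wakeup:
 */
#if 0
	struct abs_timeout timo;
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 500000000 };
	int ticks;

	abs_timeout_init(&timo, CLOCK_MONOTONIC, 0, &ts);
	for (;;) {
		ticks = abs_timeout_gethz(&timo);
		if (ticks < 0)		/* deadline already passed */
			break;
		/* ... sleep up to 'ticks', then on timeout refresh: */
		abs_timeout_update(&timo);
	}
#endif
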
725 /*
726  * Put the thread into a sleep state.  Before sleeping, check
727  * whether the thread was removed from the umtx queue.
728  */
729 static inline int
730 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
731 {
732 	struct umtxq_chain *uc;
733 	int error, timo;
734 
735 	uc = umtxq_getchain(&uq->uq_key);
736 	UMTXQ_LOCKED_ASSERT(uc);
737 	for (;;) {
738 		if (!(uq->uq_flags & UQF_UMTXQ))
739 			return (0);
740 		if (abstime != NULL) {
741 			timo = abs_timeout_gethz(abstime);
742 			if (timo < 0)
743 				return (ETIMEDOUT);
744 		} else
745 			timo = 0;
746 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
747 		if (error != EWOULDBLOCK) {
748 			umtxq_lock(&uq->uq_key);
749 			break;
750 		}
751 		if (abstime != NULL)
752 			abs_timeout_update(abstime);
753 		umtxq_lock(&uq->uq_key);
754 	}
755 	return (error);
756 }
757 
758 /*
759  * Convert a userspace address into a unique logical address.
760  */
761 int
762 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
763 {
764 	struct thread *td = curthread;
765 	vm_map_t map;
766 	vm_map_entry_t entry;
767 	vm_pindex_t pindex;
768 	vm_prot_t prot;
769 	boolean_t wired;
770 
771 	key->type = type;
772 	if (share == THREAD_SHARE) {
773 		key->shared = 0;
774 		key->info.private.vs = td->td_proc->p_vmspace;
775 		key->info.private.addr = (uintptr_t)addr;
776 	} else {
777 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
778 		map = &td->td_proc->p_vmspace->vm_map;
779 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
780 		    &entry, &key->info.shared.object, &pindex, &prot,
781 		    &wired) != KERN_SUCCESS) {
782 			return EFAULT;
783 		}
784 
785 		if ((share == PROCESS_SHARE) ||
786 		    (share == AUTO_SHARE &&
787 		     VM_INHERIT_SHARE == entry->inheritance)) {
788 			key->shared = 1;
789 			key->info.shared.offset = entry->offset + entry->start -
790 				(vm_offset_t)addr;
791 			vm_object_reference(key->info.shared.object);
792 		} else {
793 			key->shared = 0;
794 			key->info.private.vs = td->td_proc->p_vmspace;
795 			key->info.private.addr = (uintptr_t)addr;
796 		}
797 		vm_map_lookup_done(map, entry);
798 	}
799 
800 	umtxq_hash(key);
801 	return (0);
802 }
803 
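/*
 * Illustrative note (not part of the kernel source): a process-shared
 * key is (vm_object, offset), so two processes that map the same
 * shared object at different addresses still produce matching keys; a
 * private key is (vmspace, vaddr) and needs no vm_map lookup.
 * AUTO_SHARE picks between the two based on the mapping's inheritance.
 */
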
804 /*
805  * Release key.
806  */
807 void
808 umtx_key_release(struct umtx_key *key)
809 {
810 	if (key->shared)
811 		vm_object_deallocate(key->info.shared.object);
812 }
813 
814 /*
815  * Lock a umtx object.
816  */
817 static int
818 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
819 	const struct timespec *timeout)
820 {
821 	struct abs_timeout timo;
822 	struct umtx_q *uq;
823 	u_long owner;
824 	u_long old;
825 	int error = 0;
826 
827 	uq = td->td_umtxq;
828 	if (timeout != NULL)
829 		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
830 
831 	/*
832 	 * Care must be exercised when dealing with the umtx structure; it
833 	 * can fault on any access.
834 	 */
835 	for (;;) {
836 		/*
837 		 * Try the uncontested case.  This should be done in userland.
838 		 */
839 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
840 
841 		/* The acquire succeeded. */
842 		if (owner == UMTX_UNOWNED)
843 			return (0);
844 
845 		/* The address was invalid. */
846 		if (owner == -1)
847 			return (EFAULT);
848 
849 		/* If no one owns it but it is contested, try to acquire it. */
850 		if (owner == UMTX_CONTESTED) {
851 			owner = casuword(&umtx->u_owner,
852 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
853 
854 			if (owner == UMTX_CONTESTED)
855 				return (0);
856 
857 			/* The address was invalid. */
858 			if (owner == -1)
859 				return (EFAULT);
860 
861 			/* If this failed the lock has changed, restart. */
862 			continue;
863 		}
864 
865 		/*
866 		 * If we caught a signal, we have retried and now
867 		 * exit immediately.
868 		 */
869 		if (error != 0)
870 			break;
871 
872 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
873 			AUTO_SHARE, &uq->uq_key)) != 0)
874 			return (error);
875 
876 		umtxq_lock(&uq->uq_key);
877 		umtxq_busy(&uq->uq_key);
878 		umtxq_insert(uq);
879 		umtxq_unbusy(&uq->uq_key);
880 		umtxq_unlock(&uq->uq_key);
881 
882 		/*
883 		 * Set the contested bit so that a release in user space
884 		 * knows to use the system call for unlock.  If this fails
885 		 * either someone else has acquired the lock or it has been
886 		 * released.
887 		 */
888 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
889 
890 		/* The address was invalid. */
891 		if (old == -1) {
892 			umtxq_lock(&uq->uq_key);
893 			umtxq_remove(uq);
894 			umtxq_unlock(&uq->uq_key);
895 			umtx_key_release(&uq->uq_key);
896 			return (EFAULT);
897 		}
898 
899 		/*
900 		 * If we set the contested bit, sleep; otherwise the lock
901 		 * changed and we need to retry, or we lost a race with the
902 		 * thread unlocking the umtx.
903 		 */
904 		umtxq_lock(&uq->uq_key);
905 		if (old == owner)
906 			error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
907 			    &timo);
908 		umtxq_remove(uq);
909 		umtxq_unlock(&uq->uq_key);
910 		umtx_key_release(&uq->uq_key);
911 	}
912 
913 	if (timeout == NULL) {
914 		/* Mutex locking is restarted if it is interrupted. */
915 		if (error == EINTR)
916 			error = ERESTART;
917 	} else {
918 		/* Timed-locking is not restarted. */
919 		if (error == ERESTART)
920 			error = EINTR;
921 	}
922 	return (error);
923 }
924 
925 /*
926  * Unlock a umtx object.
927  */
928 static int
929 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
930 {
931 	struct umtx_key key;
932 	u_long owner;
933 	u_long old;
934 	int error;
935 	int count;
936 
937 	/*
938 	 * Make sure we own this mtx.
939 	 */
940 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
941 	if (owner == -1)
942 		return (EFAULT);
943 
944 	if ((owner & ~UMTX_CONTESTED) != id)
945 		return (EPERM);
946 
947 	/* This should be done in userland */
948 	if ((owner & UMTX_CONTESTED) == 0) {
949 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
950 		if (old == -1)
951 			return (EFAULT);
952 		if (old == owner)
953 			return (0);
954 		owner = old;
955 	}
956 
957 	/* We should only ever be in here for contested locks */
958 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
959 		&key)) != 0)
960 		return (error);
961 
962 	umtxq_lock(&key);
963 	umtxq_busy(&key);
964 	count = umtxq_count(&key);
965 	umtxq_unlock(&key);
966 
967 	/*
968 	 * When unlocking the umtx, it must be marked as unowned if
969 	 * at most one thread is waiting for it.
970 	 * Otherwise, it must be marked as contested.
971 	 */
972 	old = casuword(&umtx->u_owner, owner,
973 		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
974 	umtxq_lock(&key);
975 	umtxq_signal(&key, 1);
976 	umtxq_unbusy(&key);
977 	umtxq_unlock(&key);
978 	umtx_key_release(&key);
979 	if (old == -1)
980 		return (EFAULT);
981 	if (old != owner)
982 		return (EINVAL);
983 	return (0);
984 }
985 
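/*
 * Illustrative sketch (not part of the kernel source; the wrappers are
 * hypothetical and libthr's real code differs): the "should be done in
 * userland" fast paths above pair with userland CAS attempts, and the
 * kernel slow paths only manage the UMTX_CONTESTED bit and the sleep
 * queues:
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>

static void
user_lock(struct umtx *mtx, u_long id)
{
	/* Uncontested: CAS UMTX_UNOWNED -> id, no kernel entry. */
	if (atomic_cmpset_acq_long(&mtx->u_owner, UMTX_UNOWNED, id))
		return;
	/* Contested: the kernel sets UMTX_CONTESTED and sleeps. */
	_umtx_lock(mtx);
}

static void
user_unlock(struct umtx *mtx, u_long id)
{
	/* Uncontested: CAS id -> UMTX_UNOWNED. */
	if (atomic_cmpset_rel_long(&mtx->u_owner, id, UMTX_UNOWNED))
		return;
	/* UMTX_CONTESTED is set; let the kernel wake a waiter. */
	_umtx_unlock(mtx);
}
#endif
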
986 #ifdef COMPAT_FREEBSD32
987 
988 /*
989  * Lock a umtx object.
990  */
991 static int
992 do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
993 	const struct timespec *timeout)
994 {
995 	struct abs_timeout timo;
996 	struct umtx_q *uq;
997 	uint32_t owner;
998 	uint32_t old;
999 	int error = 0;
1000 
1001 	uq = td->td_umtxq;
1002 
1003 	if (timeout != NULL)
1004 		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
1005 
1006 	/*
1007 	 * Care must be exercised when dealing with the umtx structure; it
1008 	 * can fault on any access.
1009 	 */
1010 	for (;;) {
1011 		/*
1012 		 * Try the uncontested case.  This should be done in userland.
1013 		 */
1014 		owner = casuword32(m, UMUTEX_UNOWNED, id);
1015 
1016 		/* The acquire succeeded. */
1017 		if (owner == UMUTEX_UNOWNED)
1018 			return (0);
1019 
1020 		/* The address was invalid. */
1021 		if (owner == -1)
1022 			return (EFAULT);
1023 
1024 		/* If no one owns it but it is contested, try to acquire it. */
1025 		if (owner == UMUTEX_CONTESTED) {
1026 			owner = casuword32(m,
1027 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1028 			if (owner == UMUTEX_CONTESTED)
1029 				return (0);
1030 
1031 			/* The address was invalid. */
1032 			if (owner == -1)
1033 				return (EFAULT);
1034 
1035 			/* If this failed the lock has changed, restart. */
1036 			continue;
1037 		}
1038 
1039 		/*
1040 		 * If we caught a signal, we have retried and now
1041 		 * exit immediately.
1042 		 */
1043 		if (error != 0)
1044 			return (error);
1045 
1046 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
1047 			AUTO_SHARE, &uq->uq_key)) != 0)
1048 			return (error);
1049 
1050 		umtxq_lock(&uq->uq_key);
1051 		umtxq_busy(&uq->uq_key);
1052 		umtxq_insert(uq);
1053 		umtxq_unbusy(&uq->uq_key);
1054 		umtxq_unlock(&uq->uq_key);
1055 
1056 		/*
1057 		 * Set the contested bit so that a release in user space
1058 		 * knows to use the system call for unlock.  If this fails
1059 		 * either someone else has acquired the lock or it has been
1060 		 * released.
1061 		 */
1062 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
1063 
1064 		/* The address was invalid. */
1065 		if (old == -1) {
1066 			umtxq_lock(&uq->uq_key);
1067 			umtxq_remove(uq);
1068 			umtxq_unlock(&uq->uq_key);
1069 			umtx_key_release(&uq->uq_key);
1070 			return (EFAULT);
1071 		}
1072 
1073 		/*
1074 		 * If we set the contested bit, sleep; otherwise the lock
1075 		 * changed and we need to retry, or we lost a race with the
1076 		 * thread unlocking the umtx.
1077 		 */
1078 		umtxq_lock(&uq->uq_key);
1079 		if (old == owner)
1080 			error = umtxq_sleep(uq, "umtx", timeout == NULL ?
1081 			    NULL : &timo);
1082 		umtxq_remove(uq);
1083 		umtxq_unlock(&uq->uq_key);
1084 		umtx_key_release(&uq->uq_key);
1085 	}
1086 
1087 	if (timeout == NULL) {
1088 		/* Mutex locking is restarted if it is interrupted. */
1089 		if (error == EINTR)
1090 			error = ERESTART;
1091 	} else {
1092 		/* Timed-locking is not restarted. */
1093 		if (error == ERESTART)
1094 			error = EINTR;
1095 	}
1096 	return (error);
1097 }
1098 
1099 /*
1100  * Unlock a umtx object.
1101  */
1102 static int
1103 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
1104 {
1105 	struct umtx_key key;
1106 	uint32_t owner;
1107 	uint32_t old;
1108 	int error;
1109 	int count;
1110 
1111 	/*
1112 	 * Make sure we own this mtx.
1113 	 */
1114 	owner = fuword32(m);
1115 	if (owner == -1)
1116 		return (EFAULT);
1117 
1118 	if ((owner & ~UMUTEX_CONTESTED) != id)
1119 		return (EPERM);
1120 
1121 	/* This should be done in userland */
1122 	if ((owner & UMUTEX_CONTESTED) == 0) {
1123 		old = casuword32(m, owner, UMUTEX_UNOWNED);
1124 		if (old == -1)
1125 			return (EFAULT);
1126 		if (old == owner)
1127 			return (0);
1128 		owner = old;
1129 	}
1130 
1131 	/* We should only ever be in here for contested locks */
1132 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
1133 		&key)) != 0)
1134 		return (error);
1135 
1136 	umtxq_lock(&key);
1137 	umtxq_busy(&key);
1138 	count = umtxq_count(&key);
1139 	umtxq_unlock(&key);
1140 
1141 	/*
1142 	 * When unlocking the umtx, it must be marked as unowned if
1143 	 * at most one thread is waiting for it.
1144 	 * Otherwise, it must be marked as contested.
1145 	 */
1146 	old = casuword32(m, owner,
1147 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1148 	umtxq_lock(&key);
1149 	umtxq_signal(&key, 1);
1150 	umtxq_unbusy(&key);
1151 	umtxq_unlock(&key);
1152 	umtx_key_release(&key);
1153 	if (old == -1)
1154 		return (EFAULT);
1155 	if (old != owner)
1156 		return (EINVAL);
1157 	return (0);
1158 }
1159 #endif
1160 
1161 /*
1162  * Fetch and compare value, sleep on the address if value is not changed.
1163  */
1164 static int
1165 do_wait(struct thread *td, void *addr, u_long id,
1166 	struct _umtx_time *timeout, int compat32, int is_private)
1167 {
1168 	struct abs_timeout timo;
1169 	struct umtx_q *uq;
1170 	u_long tmp;
1171 	int error = 0;
1172 
1173 	uq = td->td_umtxq;
1174 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
1175 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
1176 		return (error);
1177 
1178 	if (timeout != NULL)
1179 		abs_timeout_init2(&timo, timeout);
1180 
1181 	umtxq_lock(&uq->uq_key);
1182 	umtxq_insert(uq);
1183 	umtxq_unlock(&uq->uq_key);
1184 	if (compat32 == 0)
1185 		tmp = fuword(addr);
1186 	else
1187 		tmp = (unsigned int)fuword32(addr);
1188 	umtxq_lock(&uq->uq_key);
1189 	if (tmp == id)
1190 		error = umtxq_sleep(uq, "uwait", timeout == NULL ?
1191 		    NULL : &timo);
1192 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
1193 		error = 0;
1194 	else
1195 		umtxq_remove(uq);
1196 	umtxq_unlock(&uq->uq_key);
1197 	umtx_key_release(&uq->uq_key);
1198 	if (error == ERESTART)
1199 		error = EINTR;
1200 	return (error);
1201 }
1202 
1203 /*
1204  * Wake up threads sleeping on the specified address.
1205  */
1206 int
1207 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1208 {
1209 	struct umtx_key key;
1210 	int ret;
1211 
1212 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1213 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1214 		return (ret);
1215 	umtxq_lock(&key);
1216 	ret = umtxq_signal(&key, n_wake);
1217 	umtxq_unlock(&key);
1218 	umtx_key_release(&key);
1219 	return (0);
1220 }
1221 
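/*
 * Illustrative sketch (not part of the kernel source; the helpers are
 * hypothetical): do_wait() and kern_umtx_wake() back the futex-style
 * UMTX_OP_WAIT/UMTX_OP_WAKE pair.  A minimal userland event counter
 * built on the _umtx_op(2) interface:
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <limits.h>

static volatile u_long event_seq;

static void
event_wait(u_long observed)
{
	/* Sleeps only while event_seq is still 'observed'. */
	_umtx_op(__DEVOLATILE(u_long *, &event_seq), UMTX_OP_WAIT,
	    observed, NULL, NULL);
}

static void
event_post(void)
{
	event_seq++;			/* not atomic; sketch only */
	_umtx_op(__DEVOLATILE(u_long *, &event_seq), UMTX_OP_WAKE,
	    INT_MAX, NULL, NULL);
}
#endif
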
1222 /*
1223  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1224  */
1225 static int
1226 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
1227 	struct _umtx_time *timeout, int mode)
1228 {
1229 	struct abs_timeout timo;
1230 	struct umtx_q *uq;
1231 	uint32_t owner, old, id;
1232 	int error = 0;
1233 
1234 	id = td->td_tid;
1235 	uq = td->td_umtxq;
1236 
1237 	if (timeout != NULL)
1238 		abs_timeout_init2(&timo, timeout);
1239 
1240 	/*
1241 	 * Care must be exercised when dealing with the umtx structure; it
1242 	 * can fault on any access.
1243 	 */
1244 	for (;;) {
1245 		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
1246 		if (mode == _UMUTEX_WAIT) {
1247 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1248 				return (0);
1249 		} else {
1250 			/*
1251 			 * Try the uncontested case.  This should be done in userland.
1252 			 */
1253 			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1254 
1255 			/* The acquire succeeded. */
1256 			if (owner == UMUTEX_UNOWNED)
1257 				return (0);
1258 
1259 			/* The address was invalid. */
1260 			if (owner == -1)
1261 				return (EFAULT);
1262 
1263 			/* If no one owns it but it is contested, try to acquire it. */
1264 			if (owner == UMUTEX_CONTESTED) {
1265 				owner = casuword32(&m->m_owner,
1266 				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1267 
1268 				if (owner == UMUTEX_CONTESTED)
1269 					return (0);
1270 
1271 				/* The address was invalid. */
1272 				if (owner == -1)
1273 					return (EFAULT);
1274 
1275 				/* If this failed the lock has changed, restart. */
1276 				continue;
1277 			}
1278 		}
1279 
1280 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1281 		    (owner & ~UMUTEX_CONTESTED) == id)
1282 			return (EDEADLK);
1283 
1284 		if (mode == _UMUTEX_TRY)
1285 			return (EBUSY);
1286 
1287 		/*
1288 		 * If we caught a signal, we have retried and now
1289 		 * exit immediately.
1290 		 */
1291 		if (error != 0)
1292 			return (error);
1293 
1294 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1295 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1296 			return (error);
1297 
1298 		umtxq_lock(&uq->uq_key);
1299 		umtxq_busy(&uq->uq_key);
1300 		umtxq_insert(uq);
1301 		umtxq_unlock(&uq->uq_key);
1302 
1303 		/*
1304 		 * Set the contested bit so that a release in user space
1305 		 * knows to use the system call for unlock.  If this fails
1306 		 * either someone else has acquired the lock or it has been
1307 		 * released.
1308 		 */
1309 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1310 
1311 		/* The address was invalid. */
1312 		if (old == -1) {
1313 			umtxq_lock(&uq->uq_key);
1314 			umtxq_remove(uq);
1315 			umtxq_unbusy(&uq->uq_key);
1316 			umtxq_unlock(&uq->uq_key);
1317 			umtx_key_release(&uq->uq_key);
1318 			return (EFAULT);
1319 		}
1320 
1321 		/*
1322 		 * If we set the contested bit, sleep; otherwise the lock
1323 		 * changed and we need to retry, or we lost a race with the
1324 		 * thread unlocking the umtx.
1325 		 */
1326 		umtxq_lock(&uq->uq_key);
1327 		umtxq_unbusy(&uq->uq_key);
1328 		if (old == owner)
1329 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1330 			    NULL : &timo);
1331 		umtxq_remove(uq);
1332 		umtxq_unlock(&uq->uq_key);
1333 		umtx_key_release(&uq->uq_key);
1334 	}
1335 
1336 	return (0);
1337 }
1338 
1339 /*
1340  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1341  */
1342 static int
1343 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1344 {
1345 	struct umtx_key key;
1346 	uint32_t owner, old, id;
1347 	int error;
1348 	int count;
1349 
1350 	id = td->td_tid;
1351 	/*
1352 	 * Make sure we own this mtx.
1353 	 */
1354 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1355 	if (owner == -1)
1356 		return (EFAULT);
1357 
1358 	if ((owner & ~UMUTEX_CONTESTED) != id)
1359 		return (EPERM);
1360 
1361 	if ((owner & UMUTEX_CONTESTED) == 0) {
1362 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1363 		if (old == -1)
1364 			return (EFAULT);
1365 		if (old == owner)
1366 			return (0);
1367 		owner = old;
1368 	}
1369 
1370 	/* We should only ever be in here for contested locks */
1371 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1372 	    &key)) != 0)
1373 		return (error);
1374 
1375 	umtxq_lock(&key);
1376 	umtxq_busy(&key);
1377 	count = umtxq_count(&key);
1378 	umtxq_unlock(&key);
1379 
1380 	/*
1381 	 * When unlocking the umtx, it must be marked as unowned if
1382 	 * at most one thread is waiting for it.
1383 	 * Otherwise, it must be marked as contested.
1384 	 */
1385 	old = casuword32(&m->m_owner, owner,
1386 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1387 	umtxq_lock(&key);
1388 	umtxq_signal(&key, 1);
1389 	umtxq_unbusy(&key);
1390 	umtxq_unlock(&key);
1391 	umtx_key_release(&key);
1392 	if (old == -1)
1393 		return (EFAULT);
1394 	if (old != owner)
1395 		return (EINVAL);
1396 	return (0);
1397 }
1398 
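/*
 * Illustrative sketch (not part of the kernel source; the helper is
 * hypothetical and libthr's real code differs): a struct umutex is
 * owned by a thread id, so the userland fast path is a CAS of the
 * caller's tid and the kernel is entered only on contention:
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <sys/thr.h>
#include <machine/atomic.h>

static int
user_mutex_trylock(struct umutex *m)
{
	long tid;

	thr_self(&tid);
	/* Uncontested: UMUTEX_UNOWNED -> tid, no kernel entry. */
	if (atomic_cmpset_acq_32((volatile uint32_t *)&m->m_owner,
	    UMUTEX_UNOWNED, (uint32_t)tid))
		return (0);
	/* Contested (or error-checking): ask the kernel. */
	return (_umtx_op(m, UMTX_OP_MUTEX_TRYLOCK, 0, NULL, NULL));
}
#endif
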
1399 /*
1400  * Check if the mutex is available and wake up a waiter;
1401  * this applies only to simple mutexes.
1402  */
1403 static int
1404 do_wake_umutex(struct thread *td, struct umutex *m)
1405 {
1406 	struct umtx_key key;
1407 	uint32_t owner;
1408 	uint32_t flags;
1409 	int error;
1410 	int count;
1411 
1412 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1413 	if (owner == -1)
1414 		return (EFAULT);
1415 
1416 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1417 		return (0);
1418 
1419 	flags = fuword32(&m->m_flags);
1420 
1421 	/* We should only ever be in here for contested locks */
1422 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1423 	    &key)) != 0)
1424 		return (error);
1425 
1426 	umtxq_lock(&key);
1427 	umtxq_busy(&key);
1428 	count = umtxq_count(&key);
1429 	umtxq_unlock(&key);
1430 
1431 	if (count <= 1)
1432 		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1433 
1434 	umtxq_lock(&key);
1435 	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1436 		umtxq_signal(&key, 1);
1437 	umtxq_unbusy(&key);
1438 	umtxq_unlock(&key);
1439 	umtx_key_release(&key);
1440 	return (0);
1441 }
1442 
1443 /*
1444  * Check if the mutex has waiters and try to fix the contention bit.
1445  */
1446 static int
1447 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1448 {
1449 	struct umtx_key key;
1450 	uint32_t owner, old;
1451 	int type;
1452 	int error;
1453 	int count;
1454 
1455 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1456 	case 0:
1457 		type = TYPE_NORMAL_UMUTEX;
1458 		break;
1459 	case UMUTEX_PRIO_INHERIT:
1460 		type = TYPE_PI_UMUTEX;
1461 		break;
1462 	case UMUTEX_PRIO_PROTECT:
1463 		type = TYPE_PP_UMUTEX;
1464 		break;
1465 	default:
1466 		return (EINVAL);
1467 	}
1468 	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
1469 	    &key)) != 0)
1470 		return (error);
1471 
1472 	owner = 0;
1473 	umtxq_lock(&key);
1474 	umtxq_busy(&key);
1475 	count = umtxq_count(&key);
1476 	umtxq_unlock(&key);
1477 	/*
1478 	 * Only repair the contention bit if there is a waiter; this means
1479 	 * the mutex is still being referenced by userland code.  Otherwise,
1480 	 * don't update any memory.
1481 	 */
1482 	if (count > 1) {
1483 		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1484 		while ((owner & UMUTEX_CONTESTED) == 0) {
1485 			old = casuword32(&m->m_owner, owner,
1486 			    owner | UMUTEX_CONTESTED);
1487 			if (old == owner)
1488 				break;
1489 			owner = old;
1490 		}
1491 	} else if (count == 1) {
1492 		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1493 		while ((owner & ~UMUTEX_CONTESTED) != 0 &&
1494 		       (owner & UMUTEX_CONTESTED) == 0) {
1495 			old = casuword32(&m->m_owner, owner,
1496 			    owner | UMUTEX_CONTESTED);
1497 			if (old == owner)
1498 				break;
1499 			owner = old;
1500 		}
1501 	}
1502 	umtxq_lock(&key);
1503 	if (owner == -1) {
1504 		error = EFAULT;
1505 		umtxq_signal(&key, INT_MAX);
1506 	} else if (count != 0 &&
1507 	    (owner & ~UMUTEX_CONTESTED) == 0)
1508 		umtxq_signal(&key, 1);
1509 	umtxq_unbusy(&key);
1510 	umtxq_unlock(&key);
1511 	umtx_key_release(&key);
1512 	return (error);
1513 }
1514 
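/*
 * Illustrative note (not part of the kernel source): the repair above
 * guards against a userland/kernel race in which a waiter queues after
 * an unlocking owner has already cleared UMUTEX_CONTESTED; without the
 * bit, the next unlock would take the userland fast path and never
 * wake anyone.  Re-asserting the bit while waiters exist keeps the
 * slow path armed.
 */
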
1515 static inline struct umtx_pi *
1516 umtx_pi_alloc(int flags)
1517 {
1518 	struct umtx_pi *pi;
1519 
1520 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1521 	TAILQ_INIT(&pi->pi_blocked);
1522 	atomic_add_int(&umtx_pi_allocated, 1);
1523 	return (pi);
1524 }
1525 
1526 static inline void
1527 umtx_pi_free(struct umtx_pi *pi)
1528 {
1529 	uma_zfree(umtx_pi_zone, pi);
1530 	atomic_add_int(&umtx_pi_allocated, -1);
1531 }
1532 
1533 /*
1534  * Adjust the thread's position on the PI mutex's blocked list after
1535  * its priority has been changed.
1536  */
1537 static int
1538 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1539 {
1540 	struct umtx_q *uq, *uq1, *uq2;
1541 	struct thread *td1;
1542 
1543 	mtx_assert(&umtx_lock, MA_OWNED);
1544 	if (pi == NULL)
1545 		return (0);
1546 
1547 	uq = td->td_umtxq;
1548 
1549 	/*
1550 	 * Check if the thread needs to be moved on the blocked chain.
1551 	 * It needs to be moved if either its priority is lower than
1552 	 * the previous thread or higher than the next thread.
1553 	 */
1554 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1555 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1556 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1557 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1558 		/*
1559 		 * Remove thread from blocked chain and determine where
1560 		 * it should be moved to.
1561 		 */
1562 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1563 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1564 			td1 = uq1->uq_thread;
1565 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1566 			if (UPRI(td1) > UPRI(td))
1567 				break;
1568 		}
1569 
1570 		if (uq1 == NULL)
1571 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1572 		else
1573 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1574 	}
1575 	return (1);
1576 }
1577 
1578 /*
1579  * Propagate priority when a thread is blocked on POSIX
1580  * PI mutex.
1581  */
1582 static void
1583 umtx_propagate_priority(struct thread *td)
1584 {
1585 	struct umtx_q *uq;
1586 	struct umtx_pi *pi;
1587 	int pri;
1588 
1589 	mtx_assert(&umtx_lock, MA_OWNED);
1590 	pri = UPRI(td);
1591 	uq = td->td_umtxq;
1592 	pi = uq->uq_pi_blocked;
1593 	if (pi == NULL)
1594 		return;
1595 
1596 	for (;;) {
1597 		td = pi->pi_owner;
1598 		if (td == NULL || td == curthread)
1599 			return;
1600 
1601 		MPASS(td->td_proc != NULL);
1602 		MPASS(td->td_proc->p_magic == P_MAGIC);
1603 
1604 		thread_lock(td);
1605 		if (td->td_lend_user_pri > pri)
1606 			sched_lend_user_prio(td, pri);
1607 		else {
1608 			thread_unlock(td);
1609 			break;
1610 		}
1611 		thread_unlock(td);
1612 
1613 		/*
1614 		 * Pick up the lock that td is blocked on.
1615 		 */
1616 		uq = td->td_umtxq;
1617 		pi = uq->uq_pi_blocked;
1618 		if (pi == NULL)
1619 			break;
1620 		/* Resort td on the list if needed. */
1621 		umtx_pi_adjust_thread(pi, td);
1622 	}
1623 }
1624 
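/*
 * Worked example (illustrative): let T1 own PI mutex M1 and be blocked
 * on M2, which T2 owns.  When a higher-priority T3 blocks on M1,
 * umtx_propagate_priority(T3) first lends T3's priority to T1 (M1's
 * owner); since T1 is itself blocked on M2, the loop follows
 * uq_pi_blocked to M2 and lends the same priority to T2, re-sorting
 * each blocked list along the way.  The walk stops at a thread that is
 * not blocked on a PI mutex or that already runs at least as urgently.
 */
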
1625 /*
1626  * Unpropagate priority for a PI mutex when a thread blocked on
1627  * it is interrupted by a signal or resumed by others.
1628  */
1629 static void
1630 umtx_repropagate_priority(struct umtx_pi *pi)
1631 {
1632 	struct umtx_q *uq, *uq_owner;
1633 	struct umtx_pi *pi2;
1634 	int pri;
1635 
1636 	mtx_assert(&umtx_lock, MA_OWNED);
1637 
1638 	while (pi != NULL && pi->pi_owner != NULL) {
1639 		pri = PRI_MAX;
1640 		uq_owner = pi->pi_owner->td_umtxq;
1641 
1642 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1643 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1644 			if (uq != NULL) {
1645 				if (pri > UPRI(uq->uq_thread))
1646 					pri = UPRI(uq->uq_thread);
1647 			}
1648 		}
1649 
1650 		if (pri > uq_owner->uq_inherited_pri)
1651 			pri = uq_owner->uq_inherited_pri;
1652 		thread_lock(pi->pi_owner);
1653 		sched_lend_user_prio(pi->pi_owner, pri);
1654 		thread_unlock(pi->pi_owner);
1655 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1656 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1657 	}
1658 }
1659 
1660 /*
1661  * Insert a PI mutex into owned list.
1662  */
1663 static void
1664 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1665 {
1666 	struct umtx_q *uq_owner;
1667 
1668 	uq_owner = owner->td_umtxq;
1669 	mtx_assert(&umtx_lock, MA_OWNED);
1670 	if (pi->pi_owner != NULL)
1671 		panic("pi_owner != NULL");
1672 	pi->pi_owner = owner;
1673 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1674 }
1675 
1676 /*
1677  * Claim ownership of a PI mutex.
1678  */
1679 static int
1680 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1681 {
1682 	struct umtx_q *uq, *uq_owner;
1683 
1684 	uq_owner = owner->td_umtxq;
1685 	mtx_lock_spin(&umtx_lock);
1686 	if (pi->pi_owner == owner) {
1687 		mtx_unlock_spin(&umtx_lock);
1688 		return (0);
1689 	}
1690 
1691 	if (pi->pi_owner != NULL) {
1692 		/*
1693 		 * Userland may have already messed up the mutex, sigh.
1694 		 */
1695 		mtx_unlock_spin(&umtx_lock);
1696 		return (EPERM);
1697 	}
1698 	umtx_pi_setowner(pi, owner);
1699 	uq = TAILQ_FIRST(&pi->pi_blocked);
1700 	if (uq != NULL) {
1701 		int pri;
1702 
1703 		pri = UPRI(uq->uq_thread);
1704 		thread_lock(owner);
1705 		if (pri < UPRI(owner))
1706 			sched_lend_user_prio(owner, pri);
1707 		thread_unlock(owner);
1708 	}
1709 	mtx_unlock_spin(&umtx_lock);
1710 	return (0);
1711 }
1712 
1713 /*
1714  * Adjust a thread's position in the blocked list of its PI mutex;
1715  * this may trigger a new round of priority propagation.
1716  */
1717 void
1718 umtx_pi_adjust(struct thread *td, u_char oldpri)
1719 {
1720 	struct umtx_q *uq;
1721 	struct umtx_pi *pi;
1722 
1723 	uq = td->td_umtxq;
1724 	mtx_lock_spin(&umtx_lock);
1725 	/*
1726 	 * Pick up the lock that td is blocked on.
1727 	 */
1728 	pi = uq->uq_pi_blocked;
1729 	if (pi != NULL) {
1730 		umtx_pi_adjust_thread(pi, td);
1731 		umtx_repropagate_priority(pi);
1732 	}
1733 	mtx_unlock_spin(&umtx_lock);
1734 }
1735 
1736 /*
1737  * Sleep on a PI mutex.
1738  */
1739 static int
1740 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1741 	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
1742 {
1743 	struct umtxq_chain *uc;
1744 	struct thread *td, *td1;
1745 	struct umtx_q *uq1;
1746 	int pri;
1747 	int error = 0;
1748 
1749 	td = uq->uq_thread;
1750 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1751 	uc = umtxq_getchain(&uq->uq_key);
1752 	UMTXQ_LOCKED_ASSERT(uc);
1753 	UMTXQ_BUSY_ASSERT(uc);
1754 	umtxq_insert(uq);
1755 	mtx_lock_spin(&umtx_lock);
1756 	if (pi->pi_owner == NULL) {
1757 		mtx_unlock_spin(&umtx_lock);
1758 		/* XXX Only look up thread in current process. */
1759 		td1 = tdfind(owner, curproc->p_pid);
1760 		mtx_lock_spin(&umtx_lock);
1761 		if (td1 != NULL) {
1762 			if (pi->pi_owner == NULL)
1763 				umtx_pi_setowner(pi, td1);
1764 			PROC_UNLOCK(td1->td_proc);
1765 		}
1766 	}
1767 
1768 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1769 		pri = UPRI(uq1->uq_thread);
1770 		if (pri > UPRI(td))
1771 			break;
1772 	}
1773 
1774 	if (uq1 != NULL)
1775 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1776 	else
1777 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1778 
1779 	uq->uq_pi_blocked = pi;
1780 	thread_lock(td);
1781 	td->td_flags |= TDF_UPIBLOCKED;
1782 	thread_unlock(td);
1783 	umtx_propagate_priority(td);
1784 	mtx_unlock_spin(&umtx_lock);
1785 	umtxq_unbusy(&uq->uq_key);
1786 
1787 	error = umtxq_sleep(uq, wmesg, timo);
1788 	umtxq_remove(uq);
1789 
1790 	mtx_lock_spin(&umtx_lock);
1791 	uq->uq_pi_blocked = NULL;
1792 	thread_lock(td);
1793 	td->td_flags &= ~TDF_UPIBLOCKED;
1794 	thread_unlock(td);
1795 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1796 	umtx_repropagate_priority(pi);
1797 	mtx_unlock_spin(&umtx_lock);
1798 	umtxq_unlock(&uq->uq_key);
1799 
1800 	return (error);
1801 }
1802 
1803 /*
1804  * Add reference count for a PI mutex.
1805  */
1806 static void
1807 umtx_pi_ref(struct umtx_pi *pi)
1808 {
1809 	struct umtxq_chain *uc;
1810 
1811 	uc = umtxq_getchain(&pi->pi_key);
1812 	UMTXQ_LOCKED_ASSERT(uc);
1813 	pi->pi_refcount++;
1814 }
1815 
1816 /*
1817  * Decrease the reference count for a PI mutex; if the count
1818  * drops to zero, its memory is freed.
1819  */
1820 static void
1821 umtx_pi_unref(struct umtx_pi *pi)
1822 {
1823 	struct umtxq_chain *uc;
1824 
1825 	uc = umtxq_getchain(&pi->pi_key);
1826 	UMTXQ_LOCKED_ASSERT(uc);
1827 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1828 	if (--pi->pi_refcount == 0) {
1829 		mtx_lock_spin(&umtx_lock);
1830 		if (pi->pi_owner != NULL) {
1831 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1832 				pi, pi_link);
1833 			pi->pi_owner = NULL;
1834 		}
1835 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1836 			("blocked queue not empty"));
1837 		mtx_unlock_spin(&umtx_lock);
1838 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1839 		umtx_pi_free(pi);
1840 	}
1841 }
1842 
1843 /*
1844  * Find a PI mutex in hash table.
1845  */
1846 static struct umtx_pi *
1847 umtx_pi_lookup(struct umtx_key *key)
1848 {
1849 	struct umtxq_chain *uc;
1850 	struct umtx_pi *pi;
1851 
1852 	uc = umtxq_getchain(key);
1853 	UMTXQ_LOCKED_ASSERT(uc);
1854 
1855 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1856 		if (umtx_key_match(&pi->pi_key, key)) {
1857 			return (pi);
1858 		}
1859 	}
1860 	return (NULL);
1861 }
1862 
1863 /*
1864  * Insert a PI mutex into hash table.
1865  */
1866 static inline void
1867 umtx_pi_insert(struct umtx_pi *pi)
1868 {
1869 	struct umtxq_chain *uc;
1870 
1871 	uc = umtxq_getchain(&pi->pi_key);
1872 	UMTXQ_LOCKED_ASSERT(uc);
1873 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1874 }
1875 
1876 /*
1877  * Lock a PI mutex.
1878  */
1879 static int
1880 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1881     struct _umtx_time *timeout, int try)
1882 {
1883 	struct abs_timeout timo;
1884 	struct umtx_q *uq;
1885 	struct umtx_pi *pi, *new_pi;
1886 	uint32_t id, owner, old;
1887 	int error;
1888 
1889 	id = td->td_tid;
1890 	uq = td->td_umtxq;
1891 
1892 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1893 	    &uq->uq_key)) != 0)
1894 		return (error);
1895 
1896 	if (timeout != NULL)
1897 		abs_timeout_init2(&timo, timeout);
1898 
1899 	umtxq_lock(&uq->uq_key);
1900 	pi = umtx_pi_lookup(&uq->uq_key);
1901 	if (pi == NULL) {
1902 		new_pi = umtx_pi_alloc(M_NOWAIT);
1903 		if (new_pi == NULL) {
1904 			umtxq_unlock(&uq->uq_key);
1905 			new_pi = umtx_pi_alloc(M_WAITOK);
1906 			umtxq_lock(&uq->uq_key);
1907 			pi = umtx_pi_lookup(&uq->uq_key);
1908 			if (pi != NULL) {
1909 				umtx_pi_free(new_pi);
1910 				new_pi = NULL;
1911 			}
1912 		}
1913 		if (new_pi != NULL) {
1914 			new_pi->pi_key = uq->uq_key;
1915 			umtx_pi_insert(new_pi);
1916 			pi = new_pi;
1917 		}
1918 	}
1919 	umtx_pi_ref(pi);
1920 	umtxq_unlock(&uq->uq_key);
1921 
1922 	/*
1923 	 * Care must be exercised when dealing with the umtx structure; it
1924 	 * can fault on any access.
1925 	 */
1926 	for (;;) {
1927 		/*
1928 		 * Try the uncontested case.  This should be done in userland.
1929 		 */
1930 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1931 
1932 		/* The acquire succeeded. */
1933 		if (owner == UMUTEX_UNOWNED) {
1934 			error = 0;
1935 			break;
1936 		}
1937 
1938 		/* The address was invalid. */
1939 		if (owner == -1) {
1940 			error = EFAULT;
1941 			break;
1942 		}
1943 
1944 		/* If no one owns it but it is contested, try to acquire it. */
1945 		if (owner == UMUTEX_CONTESTED) {
1946 			owner = casuword32(&m->m_owner,
1947 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1948 
1949 			if (owner == UMUTEX_CONTESTED) {
1950 				umtxq_lock(&uq->uq_key);
1951 				umtxq_busy(&uq->uq_key);
1952 				error = umtx_pi_claim(pi, td);
1953 				umtxq_unbusy(&uq->uq_key);
1954 				umtxq_unlock(&uq->uq_key);
1955 				break;
1956 			}
1957 
1958 			/* The address was invalid. */
1959 			if (owner == -1) {
1960 				error = EFAULT;
1961 				break;
1962 			}
1963 
1964 			/* If this failed the lock has changed, restart. */
1965 			continue;
1966 		}
1967 
1968 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1969 		    (owner & ~UMUTEX_CONTESTED) == id) {
1970 			error = EDEADLK;
1971 			break;
1972 		}
1973 
1974 		if (try != 0) {
1975 			error = EBUSY;
1976 			break;
1977 		}
1978 
1979 		/*
1980 		 * If we caught a signal, we have retried and now
1981 		 * exit immediately.
1982 		 */
1983 		if (error != 0)
1984 			break;
1985 
1986 		umtxq_lock(&uq->uq_key);
1987 		umtxq_busy(&uq->uq_key);
1988 		umtxq_unlock(&uq->uq_key);
1989 
1990 		/*
1991 		 * Set the contested bit so that a release in user space
1992 		 * knows to use the system call for unlock.  If this fails
1993 		 * either someone else has acquired the lock or it has been
1994 		 * released.
1995 		 */
1996 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1997 
1998 		/* The address was invalid. */
1999 		if (old == -1) {
2000 			umtxq_lock(&uq->uq_key);
2001 			umtxq_unbusy(&uq->uq_key);
2002 			umtxq_unlock(&uq->uq_key);
2003 			error = EFAULT;
2004 			break;
2005 		}
2006 
2007 		umtxq_lock(&uq->uq_key);
2008 		/*
2009 		 * If we set the contested bit, sleep; otherwise the lock
2010 		 * changed and we need to retry, or we lost a race with the
2011 		 * thread unlocking the umtx.
2012 		 */
2013 		if (old == owner)
2014 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
2015 			    "umtxpi", timeout == NULL ? NULL : &timo);
2016 		else {
2017 			umtxq_unbusy(&uq->uq_key);
2018 			umtxq_unlock(&uq->uq_key);
2019 		}
2020 	}
2021 
2022 	umtxq_lock(&uq->uq_key);
2023 	umtx_pi_unref(pi);
2024 	umtxq_unlock(&uq->uq_key);
2025 
2026 	umtx_key_release(&uq->uq_key);
2027 	return (error);
2028 }
2029 
2030 /*
2031  * Unlock a PI mutex.
2032  */
2033 static int
2034 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
2035 {
2036 	struct umtx_key key;
2037 	struct umtx_q *uq_first, *uq_first2, *uq_me;
2038 	struct umtx_pi *pi, *pi2;
2039 	uint32_t owner, old, id;
2040 	int error;
2041 	int count;
2042 	int pri;
2043 
2044 	id = td->td_tid;
2045 	/*
2046 	 * Make sure we own this mtx.
2047 	 */
2048 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2049 	if (owner == -1)
2050 		return (EFAULT);
2051 
2052 	if ((owner & ~UMUTEX_CONTESTED) != id)
2053 		return (EPERM);
2054 
2055 	/* This should be done in userland */
2056 	if ((owner & UMUTEX_CONTESTED) == 0) {
2057 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
2058 		if (old == -1)
2059 			return (EFAULT);
2060 		if (old == owner)
2061 			return (0);
2062 		owner = old;
2063 	}
2064 
2065 	/* We should only ever be in here for contested locks */
2066 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
2067 	    &key)) != 0)
2068 		return (error);
2069 
2070 	umtxq_lock(&key);
2071 	umtxq_busy(&key);
2072 	count = umtxq_count_pi(&key, &uq_first);
2073 	if (uq_first != NULL) {
2074 		mtx_lock_spin(&umtx_lock);
2075 		pi = uq_first->uq_pi_blocked;
2076 		KASSERT(pi != NULL, ("pi == NULL?"));
2077 		if (pi->pi_owner != curthread) {
2078 			mtx_unlock_spin(&umtx_lock);
2079 			umtxq_unbusy(&key);
2080 			umtxq_unlock(&key);
2081 			umtx_key_release(&key);
2082 			/* Userland corrupted the mutex state. */
2083 			return (EPERM);
2084 		}
2085 		uq_me = curthread->td_umtxq;
2086 		pi->pi_owner = NULL;
2087 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
2088 		/* Find the highest-priority thread that is still sleeping. */
2089 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
2090 		while (uq_first != NULL &&
2091 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
2092 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
2093 		}
2094 		pri = PRI_MAX;
2095 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
2096 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
2097 			if (uq_first2 != NULL) {
2098 				if (pri > UPRI(uq_first2->uq_thread))
2099 					pri = UPRI(uq_first2->uq_thread);
2100 			}
2101 		}
2102 		thread_lock(curthread);
2103 		sched_lend_user_prio(curthread, pri);
2104 		thread_unlock(curthread);
2105 		mtx_unlock_spin(&umtx_lock);
2106 		if (uq_first)
2107 			umtxq_signal_thread(uq_first);
2108 	}
2109 	umtxq_unlock(&key);
2110 
2111 	/*
2112 	 * When unlocking the umtx, it must be marked as unowned if
2113 	 * no more than one thread is waiting for it.
2114 	 * Otherwise, it must be marked as contested.
2115 	 */
2116 	old = casuword32(&m->m_owner, owner,
2117 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
2118 
2119 	umtxq_lock(&key);
2120 	umtxq_unbusy(&key);
2121 	umtxq_unlock(&key);
2122 	umtx_key_release(&key);
2123 	if (old == -1)
2124 		return (EFAULT);
2125 	if (old != owner)
2126 		return (EINVAL);
2127 	return (0);
2128 }
2129 
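/*
 * Worked example of the re-lending above (illustrative numbers):
 * suppose the releasing thread holds two contested PI mutexes, with
 * the best blocked waiters at UPRI 80 and UPRI 60.  Its lent
 * priority is 60.  After the mutex with the UPRI-60 waiter is
 * released here, the loop over uq_pi_contested recomputes the
 * minimum over the remaining mutexes, and the lent priority drops
 * back to 80 (or to PRI_MAX if nothing contested remains).
 */
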
2130 /*
2131  * Lock a PP mutex.
2132  */
2133 static int
2134 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2135     struct _umtx_time *timeout, int try)
2136 {
2137 	struct abs_timeout timo;
2138 	struct umtx_q *uq, *uq2;
2139 	struct umtx_pi *pi;
2140 	uint32_t ceiling;
2141 	uint32_t owner, id;
2142 	int error, pri, old_inherited_pri, su;
2143 
2144 	id = td->td_tid;
2145 	uq = td->td_umtxq;
2146 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2147 	    &uq->uq_key)) != 0)
2148 		return (error);
2149 
2150 	if (timeout != NULL)
2151 		abs_timeout_init2(&timo, timeout);
2152 
2153 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2154 	for (;;) {
2155 		old_inherited_pri = uq->uq_inherited_pri;
2156 		umtxq_lock(&uq->uq_key);
2157 		umtxq_busy(&uq->uq_key);
2158 		umtxq_unlock(&uq->uq_key);
2159 
2160 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
2161 		if (ceiling > RTP_PRIO_MAX) {
2162 			error = EINVAL;
2163 			goto out;
2164 		}
2165 
2166 		mtx_lock_spin(&umtx_lock);
2167 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2168 			mtx_unlock_spin(&umtx_lock);
2169 			error = EINVAL;
2170 			goto out;
2171 		}
2172 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2173 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2174 			thread_lock(td);
2175 			if (uq->uq_inherited_pri < UPRI(td))
2176 				sched_lend_user_prio(td, uq->uq_inherited_pri);
2177 			thread_unlock(td);
2178 		}
2179 		mtx_unlock_spin(&umtx_lock);
2180 
2181 		owner = casuword32(&m->m_owner,
2182 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2183 
2184 		if (owner == UMUTEX_CONTESTED) {
2185 			error = 0;
2186 			break;
2187 		}
2188 
2189 		/* The address was invalid. */
2190 		if (owner == -1) {
2191 			error = EFAULT;
2192 			break;
2193 		}
2194 
2195 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2196 		    (owner & ~UMUTEX_CONTESTED) == id) {
2197 			error = EDEADLK;
2198 			break;
2199 		}
2200 
2201 		if (try != 0) {
2202 			error = EBUSY;
2203 			break;
2204 		}
2205 
2206 		/*
2207 		 * If we caught a signal, we have already retried once
2208 		 * above; exit immediately now.
2209 		 */
2210 		if (error != 0)
2211 			break;
2212 
2213 		umtxq_lock(&uq->uq_key);
2214 		umtxq_insert(uq);
2215 		umtxq_unbusy(&uq->uq_key);
2216 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2217 		    NULL : &timo);
2218 		umtxq_remove(uq);
2219 		umtxq_unlock(&uq->uq_key);
2220 
2221 		mtx_lock_spin(&umtx_lock);
2222 		uq->uq_inherited_pri = old_inherited_pri;
2223 		pri = PRI_MAX;
2224 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2225 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2226 			if (uq2 != NULL) {
2227 				if (pri > UPRI(uq2->uq_thread))
2228 					pri = UPRI(uq2->uq_thread);
2229 			}
2230 		}
2231 		if (pri > uq->uq_inherited_pri)
2232 			pri = uq->uq_inherited_pri;
2233 		thread_lock(td);
2234 		sched_lend_user_prio(td, pri);
2235 		thread_unlock(td);
2236 		mtx_unlock_spin(&umtx_lock);
2237 	}
2238 
2239 	if (error != 0) {
2240 		mtx_lock_spin(&umtx_lock);
2241 		uq->uq_inherited_pri = old_inherited_pri;
2242 		pri = PRI_MAX;
2243 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2244 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2245 			if (uq2 != NULL) {
2246 				if (pri > UPRI(uq2->uq_thread))
2247 					pri = UPRI(uq2->uq_thread);
2248 			}
2249 		}
2250 		if (pri > uq->uq_inherited_pri)
2251 			pri = uq->uq_inherited_pri;
2252 		thread_lock(td);
2253 		sched_lend_user_prio(td, pri);
2254 		thread_unlock(td);
2255 		mtx_unlock_spin(&umtx_lock);
2256 	}
2257 
2258 out:
2259 	umtxq_lock(&uq->uq_key);
2260 	umtxq_unbusy(&uq->uq_key);
2261 	umtxq_unlock(&uq->uq_key);
2262 	umtx_key_release(&uq->uq_key);
2263 	return (error);
2264 }
2265 
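/*
 * The ceiling arithmetic above, spelled out (illustrative): userland
 * stores a real-time ceiling c, 0 <= c <= RTP_PRIO_MAX, in
 * m_ceilings[0].  The kernel converts it to its own scale, where a
 * numerically smaller value is a higher priority:
 *
 *	kernel priority = PRI_MIN_REALTIME + (RTP_PRIO_MAX - c)
 *
 * so c == RTP_PRIO_MAX maps to PRI_MIN_REALTIME, and a locking
 * thread whose UPRI is already numerically below that ceiling is
 * refused with EINVAL.
 */
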
2266 /*
2267  * Unlock a PP mutex.
2268  */
2269 static int
2270 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2271 {
2272 	struct umtx_key key;
2273 	struct umtx_q *uq, *uq2;
2274 	struct umtx_pi *pi;
2275 	uint32_t owner, id;
2276 	uint32_t rceiling;
2277 	int error, pri, new_inherited_pri, su;
2278 
2279 	id = td->td_tid;
2280 	uq = td->td_umtxq;
2281 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2282 
2283 	/*
2284 	 * Make sure we own this mutex.
2285 	 */
2286 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2287 	if (owner == -1)
2288 		return (EFAULT);
2289 
2290 	if ((owner & ~UMUTEX_CONTESTED) != id)
2291 		return (EPERM);
2292 
2293 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2294 	if (error != 0)
2295 		return (error);
2296 
2297 	if (rceiling == -1)
2298 		new_inherited_pri = PRI_MAX;
2299 	else {
2300 		rceiling = RTP_PRIO_MAX - rceiling;
2301 		if (rceiling > RTP_PRIO_MAX)
2302 			return (EINVAL);
2303 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2304 	}
2305 
2306 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2307 	    &key)) != 0)
2308 		return (error);
2309 	umtxq_lock(&key);
2310 	umtxq_busy(&key);
2311 	umtxq_unlock(&key);
2312 	/*
2313 	 * For a priority-protected mutex, always set the unlocked state
2314 	 * to UMUTEX_CONTESTED, so that userland always enters the kernel
2315 	 * to lock the mutex.  This is necessary because the thread
2316 	 * priority has to be adjusted for such a mutex.
2317 	 */
2318 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2319 		UMUTEX_CONTESTED);
2320 
2321 	umtxq_lock(&key);
2322 	if (error == 0)
2323 		umtxq_signal(&key, 1);
2324 	umtxq_unbusy(&key);
2325 	umtxq_unlock(&key);
2326 
2327 	if (error == -1)
2328 		error = EFAULT;
2329 	else {
2330 		mtx_lock_spin(&umtx_lock);
2331 		if (su != 0)
2332 			uq->uq_inherited_pri = new_inherited_pri;
2333 		pri = PRI_MAX;
2334 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2335 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2336 			if (uq2 != NULL) {
2337 				if (pri > UPRI(uq2->uq_thread))
2338 					pri = UPRI(uq2->uq_thread);
2339 			}
2340 		}
2341 		if (pri > uq->uq_inherited_pri)
2342 			pri = uq->uq_inherited_pri;
2343 		thread_lock(td);
2344 		sched_lend_user_prio(td, pri);
2345 		thread_unlock(td);
2346 		mtx_unlock_spin(&umtx_lock);
2347 	}
2348 	umtx_key_release(&key);
2349 	return (error);
2350 }
2351 
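/*
 * Because the unlocked state of a PP mutex is UMUTEX_CONTESTED,
 * there is no userland fast path for this type: every lock and
 * unlock enters the kernel, so the ceiling and priority bookkeeping
 * above is always performed.
 */

/*
 * Change the priority ceiling of a PP mutex, returning the old
 * ceiling to userland.
 */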
2352 static int
2353 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2354 	uint32_t *old_ceiling)
2355 {
2356 	struct umtx_q *uq;
2357 	uint32_t save_ceiling;
2358 	uint32_t owner, id;
2359 	uint32_t flags;
2360 	int error;
2361 
2362 	flags = fuword32(&m->m_flags);
2363 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2364 		return (EINVAL);
2365 	if (ceiling > RTP_PRIO_MAX)
2366 		return (EINVAL);
2367 	id = td->td_tid;
2368 	uq = td->td_umtxq;
2369 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2370 	   &uq->uq_key)) != 0)
2371 		return (error);
2372 	for (;;) {
2373 		umtxq_lock(&uq->uq_key);
2374 		umtxq_busy(&uq->uq_key);
2375 		umtxq_unlock(&uq->uq_key);
2376 
2377 		save_ceiling = fuword32(&m->m_ceilings[0]);
2378 
2379 		owner = casuword32(&m->m_owner,
2380 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2381 
2382 		if (owner == UMUTEX_CONTESTED) {
2383 			suword32(&m->m_ceilings[0], ceiling);
2384 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2385 				UMUTEX_CONTESTED);
2386 			error = 0;
2387 			break;
2388 		}
2389 
2390 		/* The address was invalid. */
2391 		if (owner == -1) {
2392 			error = EFAULT;
2393 			break;
2394 		}
2395 
2396 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2397 			suword32(&m->m_ceilings[0], ceiling);
2398 			error = 0;
2399 			break;
2400 		}
2401 
2402 		/*
2403 		 * If we caught a signal, we have already retried once
2404 		 * above; exit immediately now.
2405 		 */
2406 		if (error != 0)
2407 			break;
2408 
2409 		/*
2410 		 * The mutex is currently held by another thread; sleep
2411 		 * until we are woken by an unlock or by a successful
2412 		 * ceiling change, then retry the acquisition above.
2413 		 */
2414 		umtxq_lock(&uq->uq_key);
2415 		umtxq_insert(uq);
2416 		umtxq_unbusy(&uq->uq_key);
2417 		error = umtxq_sleep(uq, "umtxpp", NULL);
2418 		umtxq_remove(uq);
2419 		umtxq_unlock(&uq->uq_key);
2420 	}
2421 	umtxq_lock(&uq->uq_key);
2422 	if (error == 0)
2423 		umtxq_signal(&uq->uq_key, INT_MAX);
2424 	umtxq_unbusy(&uq->uq_key);
2425 	umtxq_unlock(&uq->uq_key);
2426 	umtx_key_release(&uq->uq_key);
2427 	if (error == 0 && old_ceiling != NULL)
2428 		suword32(old_ceiling, save_ceiling);
2429 	return (error);
2430 }
2431 
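/*
 * Illustrative userland use of the operation above (a sketch; the
 * exact library code is not reproduced here):
 *
 *	uint32_t old;
 *	_umtx_op(m, UMTX_OP_SET_CEILING, new_ceiling, &old, NULL);
 *
 * val carries the new ceiling and uaddr1 receives the previous one,
 * matching the __umtx_op_set_ceiling() dispatcher below.
 */
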
2432 /*
2433  * Lock a userland POSIX mutex.
2434  */
2435 static int
2436 do_lock_umutex(struct thread *td, struct umutex *m,
2437     struct _umtx_time *timeout, int mode)
2438 {
2439 	uint32_t flags;
2440 	int error;
2441 
2442 	flags = fuword32(&m->m_flags);
2443 	if (flags == -1)
2444 		return (EFAULT);
2445 
2446 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2447 	case 0:
2448 		error = do_lock_normal(td, m, flags, timeout, mode);
2449 		break;
2450 	case UMUTEX_PRIO_INHERIT:
2451 		error = do_lock_pi(td, m, flags, timeout, mode);
2452 		break;
2453 	case UMUTEX_PRIO_PROTECT:
2454 		error = do_lock_pp(td, m, flags, timeout, mode);
2455 		break;
2456 	default:
2457 		return (EINVAL);
2458 	}
2459 	if (timeout == NULL) {
2460 		if (error == EINTR && mode != _UMUTEX_WAIT)
2461 			error = ERESTART;
2462 	} else {
2463 		/* Timed-locking is not restarted. */
2464 		if (error == ERESTART)
2465 			error = EINTR;
2466 	}
2467 	return (error);
2468 }
2469 
2470 /*
2471  * Unlock a userland POSIX mutex.
2472  */
2473 static int
2474 do_unlock_umutex(struct thread *td, struct umutex *m)
2475 {
2476 	uint32_t flags;
2477 
2478 	flags = fuword32(&m->m_flags);
2479 	if (flags == -1)
2480 		return (EFAULT);
2481 
2482 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2483 	case 0:
2484 		return (do_unlock_normal(td, m, flags));
2485 	case UMUTEX_PRIO_INHERIT:
2486 		return (do_unlock_pi(td, m, flags));
2487 	case UMUTEX_PRIO_PROTECT:
2488 		return (do_unlock_pp(td, m, flags));
2489 	}
2490 
2491 	return (EINVAL);
2492 }
2493 
2494 static int
2495 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2496 	struct timespec *timeout, u_long wflags)
2497 {
2498 	struct abs_timeout timo;
2499 	struct umtx_q *uq;
2500 	uint32_t flags;
2501 	uint32_t clockid;
2502 	int error;
2503 
2504 	uq = td->td_umtxq;
2505 	flags = fuword32(&cv->c_flags);
2506 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2507 	if (error != 0)
2508 		return (error);
2509 
2510 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2511 		clockid = fuword32(&cv->c_clockid);
2512 		if (clockid < CLOCK_REALTIME ||
2513 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2514 			/* Only the predefined clock ids are valid here. */
			umtx_key_release(&uq->uq_key);
2515 			return (EINVAL);
2516 		}
2517 	} else {
2518 		clockid = CLOCK_REALTIME;
2519 	}
2520 
2521 	umtxq_lock(&uq->uq_key);
2522 	umtxq_busy(&uq->uq_key);
2523 	umtxq_insert(uq);
2524 	umtxq_unlock(&uq->uq_key);
2525 
2526 	/*
2527 	 * Set c_has_waiters to 1 before releasing the user mutex; also
2528 	 * avoid dirtying the cache line when that is unnecessary.
2529 	 */
2530 	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2531 		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2532 
2533 	umtxq_lock(&uq->uq_key);
2534 	umtxq_unbusy(&uq->uq_key);
2535 	umtxq_unlock(&uq->uq_key);
2536 
2537 	error = do_unlock_umutex(td, m);
2538 
2539 	if (timeout != NULL)
2540 		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2541 			timeout);
2542 
2543 	umtxq_lock(&uq->uq_key);
2544 	if (error == 0) {
2545 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2546 		    NULL : &timo);
2547 	}
2548 
2549 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2550 		error = 0;
2551 	else {
2552 		/*
2553 		 * This must be a timeout, an interruption by a signal, or
2554 		 * a spurious wakeup; clear the c_has_waiters flag when
2555 		 * necessary.
2556 		 */
2557 		umtxq_busy(&uq->uq_key);
2558 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2559 			int oldlen = uq->uq_cur_queue->length;
2560 			umtxq_remove(uq);
2561 			if (oldlen == 1) {
2562 				umtxq_unlock(&uq->uq_key);
2563 				suword32(
2564 				    __DEVOLATILE(uint32_t *,
2565 					 &cv->c_has_waiters), 0);
2566 				umtxq_lock(&uq->uq_key);
2567 			}
2568 		}
2569 		umtxq_unbusy(&uq->uq_key);
2570 		if (error == ERESTART)
2571 			error = EINTR;
2572 	}
2573 
2574 	umtxq_unlock(&uq->uq_key);
2575 	umtx_key_release(&uq->uq_key);
2576 	return (error);
2577 }
2578 
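/*
 * The c_has_waiters handshake above pairs with a userland fast path
 * of roughly this shape (a sketch, not code from this file):
 *
 *	if (cv->c_has_waiters)
 *		_umtx_op(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL);
 *
 * which is why the flag is raised before the mutex is dropped and
 * cleared again once the last waiter is gone.
 */
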
2579 /*
2580  * Signal a userland condition variable.
2581  */
2582 static int
2583 do_cv_signal(struct thread *td, struct ucond *cv)
2584 {
2585 	struct umtx_key key;
2586 	int error, cnt, nwake;
2587 	uint32_t flags;
2588 
2589 	flags = fuword32(&cv->c_flags);
2590 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2591 		return (error);
2592 	umtxq_lock(&key);
2593 	umtxq_busy(&key);
2594 	cnt = umtxq_count(&key);
2595 	nwake = umtxq_signal(&key, 1);
2596 	if (cnt <= nwake) {
2597 		umtxq_unlock(&key);
2598 		error = suword32(
2599 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2600 		umtxq_lock(&key);
2601 	}
2602 	umtxq_unbusy(&key);
2603 	umtxq_unlock(&key);
2604 	umtx_key_release(&key);
2605 	return (error);
2606 }
2607 
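/*
 * Broadcast a userland condition variable, waking all waiters.
 */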
2608 static int
2609 do_cv_broadcast(struct thread *td, struct ucond *cv)
2610 {
2611 	struct umtx_key key;
2612 	int error;
2613 	uint32_t flags;
2614 
2615 	flags = fuword32(&cv->c_flags);
2616 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2617 		return (error);
2618 
2619 	umtxq_lock(&key);
2620 	umtxq_busy(&key);
2621 	umtxq_signal(&key, INT_MAX);
2622 	umtxq_unlock(&key);
2623 
2624 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2625 
2626 	umtxq_lock(&key);
2627 	umtxq_unbusy(&key);
2628 	umtxq_unlock(&key);
2629 
2630 	umtx_key_release(&key);
2631 	return (error);
2632 }
2633 
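/*
 * Note the ordering in do_cv_broadcast(): all waiters are signalled
 * while the chain is busied, and c_has_waiters is cleared afterwards;
 * the busy state keeps a concurrent do_cv_wait() from re-raising the
 * flag in between.
 */

/*
 * Acquire a read lock on a userland rwlock.
 */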
2634 static int
2635 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2636 {
2637 	struct abs_timeout timo;
2638 	struct umtx_q *uq;
2639 	uint32_t flags, wrflags;
2640 	int32_t state, oldstate;
2641 	int32_t blocked_readers;
2642 	int error;
2643 
2644 	uq = td->td_umtxq;
2645 	flags = fuword32(&rwlock->rw_flags);
2646 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2647 	if (error != 0)
2648 		return (error);
2649 
2650 	if (timeout != NULL)
2651 		abs_timeout_init2(&timo, timeout);
2652 
2653 	wrflags = URWLOCK_WRITE_OWNER;
2654 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2655 		wrflags |= URWLOCK_WRITE_WAITERS;
2656 
2657 	for (;;) {
2658 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2659 		/* try to lock it */
2660 		while (!(state & wrflags)) {
2661 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2662 				umtx_key_release(&uq->uq_key);
2663 				return (EAGAIN);
2664 			}
2665 			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2666 			if (oldstate == state) {
2667 				umtx_key_release(&uq->uq_key);
2668 				return (0);
2669 			}
2670 			state = oldstate;
2671 		}
2672 
2673 		if (error)
2674 			break;
2675 
2676 		/* grab monitor lock */
2677 		umtxq_lock(&uq->uq_key);
2678 		umtxq_busy(&uq->uq_key);
2679 		umtxq_unlock(&uq->uq_key);
2680 
2681 		/*
2682 		 * re-read the state, in case it changed between the try-lock above
2683 		 * and the check below
2684 		 */
2685 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2686 
2687 		/* set read contention bit */
2688 		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2689 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2690 			if (oldstate == state)
2691 				goto sleep;
2692 			state = oldstate;
2693 		}
2694 
2695 		/* The state changed while we were setting the flag; restart. */
2696 		if (!(state & wrflags)) {
2697 			umtxq_lock(&uq->uq_key);
2698 			umtxq_unbusy(&uq->uq_key);
2699 			umtxq_unlock(&uq->uq_key);
2700 			continue;
2701 		}
2702 
2703 sleep:
2704 		/* The contention bit is set; increase the read-waiter count before sleeping. */
2705 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2706 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2707 
2708 		while (state & wrflags) {
2709 			umtxq_lock(&uq->uq_key);
2710 			umtxq_insert(uq);
2711 			umtxq_unbusy(&uq->uq_key);
2712 
2713 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2714 			    NULL : &timo);
2715 
2716 			umtxq_busy(&uq->uq_key);
2717 			umtxq_remove(uq);
2718 			umtxq_unlock(&uq->uq_key);
2719 			if (error)
2720 				break;
2721 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2722 		}
2723 
2724 		/* Decrease the read-waiter count and possibly clear the read-contention bit. */
2725 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2726 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2727 		if (blocked_readers == 1) {
2728 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2729 			for (;;) {
2730 				oldstate = casuword32(&rwlock->rw_state, state,
2731 					 state & ~URWLOCK_READ_WAITERS);
2732 				if (oldstate == state)
2733 					break;
2734 				state = oldstate;
2735 			}
2736 		}
2737 
2738 		umtxq_lock(&uq->uq_key);
2739 		umtxq_unbusy(&uq->uq_key);
2740 		umtxq_unlock(&uq->uq_key);
2741 	}
2742 	umtx_key_release(&uq->uq_key);
2743 	if (error == ERESTART)
2744 		error = EINTR;
2745 	return (error);
2746 }
2747 
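/*
 * Layout of rw_state as used above (derived from the URWLOCK_*
 * macros): the low bits hold the reader count, extracted with
 * URWLOCK_READER_COUNT(), while URWLOCK_WRITE_OWNER,
 * URWLOCK_WRITE_WAITERS and URWLOCK_READ_WAITERS are flag bits.
 * Taking a read lock is therefore the single CAS
 *
 *	casuword32(&rwlock->rw_state, state, state + 1)
 *
 * once no blocking write bits are set, as in the try-lock loop above.
 */

/*
 * Acquire a write lock on a userland rwlock.
 */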
2748 static int
2749 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2750 {
2751 	struct abs_timeout timo;
2752 	struct umtx_q *uq;
2753 	uint32_t flags;
2754 	int32_t state, oldstate;
2755 	int32_t blocked_writers;
2756 	int32_t blocked_readers;
2757 	int error;
2758 
2759 	uq = td->td_umtxq;
2760 	flags = fuword32(&rwlock->rw_flags);
2761 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2762 	if (error != 0)
2763 		return (error);
2764 
2765 	if (timeout != NULL)
2766 		abs_timeout_init2(&timo, timeout);
2767 
2768 	blocked_readers = 0;
2769 	for (;;) {
2770 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2771 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2772 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2773 			if (oldstate == state) {
2774 				umtx_key_release(&uq->uq_key);
2775 				return (0);
2776 			}
2777 			state = oldstate;
2778 		}
2779 
2780 		if (error) {
2781 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2782 			    blocked_readers != 0) {
2783 				umtxq_lock(&uq->uq_key);
2784 				umtxq_busy(&uq->uq_key);
2785 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2786 				umtxq_unbusy(&uq->uq_key);
2787 				umtxq_unlock(&uq->uq_key);
2788 			}
2789 
2790 			break;
2791 		}
2792 
2793 		/* grab monitor lock */
2794 		umtxq_lock(&uq->uq_key);
2795 		umtxq_busy(&uq->uq_key);
2796 		umtxq_unlock(&uq->uq_key);
2797 
2798 		/*
2799 		 * re-read the state, in case it changed between the try-lock above
2800 		 * and the check below
2801 		 */
2802 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2803 
2804 		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2805 		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2806 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2807 			if (oldstate == state)
2808 				goto sleep;
2809 			state = oldstate;
2810 		}
2811 
2812 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2813 			umtxq_lock(&uq->uq_key);
2814 			umtxq_unbusy(&uq->uq_key);
2815 			umtxq_unlock(&uq->uq_key);
2816 			continue;
2817 		}
2818 sleep:
2819 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2820 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2821 
2822 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2823 			umtxq_lock(&uq->uq_key);
2824 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2825 			umtxq_unbusy(&uq->uq_key);
2826 
2827 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2828 			    NULL : &timo);
2829 
2830 			umtxq_busy(&uq->uq_key);
2831 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2832 			umtxq_unlock(&uq->uq_key);
2833 			if (error)
2834 				break;
2835 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2836 		}
2837 
2838 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2839 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2840 		if (blocked_writers == 1) {
2841 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2842 			for (;;) {
2843 				oldstate = casuword32(&rwlock->rw_state, state,
2844 					 state & ~URWLOCK_WRITE_WAITERS);
2845 				if (oldstate == state)
2846 					break;
2847 				state = oldstate;
2848 			}
2849 			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2850 		} else
2851 			blocked_readers = 0;
2852 
2853 		umtxq_lock(&uq->uq_key);
2854 		umtxq_unbusy(&uq->uq_key);
2855 		umtxq_unlock(&uq->uq_key);
2856 	}
2857 
2858 	umtx_key_release(&uq->uq_key);
2859 	if (error == ERESTART)
2860 		error = EINTR;
2861 	return (error);
2862 }
2863 
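/*
 * The error path above matters for fairness: when the failing writer
 * was the last write waiter (URWLOCK_WRITE_WAITERS has just been
 * cleared) and readers are still blocked, they are all woken on the
 * shared queue so they do not stay parked behind a write waiter that
 * no longer exists.
 */

/*
 * Unlock a userland rwlock, waking waiters according to the lock's
 * preference mode.
 */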
2864 static int
2865 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2866 {
2867 	struct umtx_q *uq;
2868 	uint32_t flags;
2869 	int32_t state, oldstate;
2870 	int error, q, count;
2871 
2872 	uq = td->td_umtxq;
2873 	flags = fuword32(&rwlock->rw_flags);
2874 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2875 	if (error != 0)
2876 		return (error);
2877 
2878 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2879 	if (state & URWLOCK_WRITE_OWNER) {
2880 		for (;;) {
2881 			oldstate = casuword32(&rwlock->rw_state, state,
2882 				state & ~URWLOCK_WRITE_OWNER);
2883 			if (oldstate != state) {
2884 				state = oldstate;
2885 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2886 					error = EPERM;
2887 					goto out;
2888 				}
2889 			} else
2890 				break;
2891 		}
2892 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2893 		for (;;) {
2894 			oldstate = casuword32(&rwlock->rw_state, state,
2895 				state - 1);
2896 			if (oldstate != state) {
2897 				state = oldstate;
2898 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2899 					error = EPERM;
2900 					goto out;
2901 				}
2902 			}
2903 			else
2904 				break;
2905 		}
2906 	} else {
2907 		error = EPERM;
2908 		goto out;
2909 	}
2910 
2911 	count = 0;
2912 
2913 	if (!(flags & URWLOCK_PREFER_READER)) {
2914 		if (state & URWLOCK_WRITE_WAITERS) {
2915 			count = 1;
2916 			q = UMTX_EXCLUSIVE_QUEUE;
2917 		} else if (state & URWLOCK_READ_WAITERS) {
2918 			count = INT_MAX;
2919 			q = UMTX_SHARED_QUEUE;
2920 		}
2921 	} else {
2922 		if (state & URWLOCK_READ_WAITERS) {
2923 			count = INT_MAX;
2924 			q = UMTX_SHARED_QUEUE;
2925 		} else if (state & URWLOCK_WRITE_WAITERS) {
2926 			count = 1;
2927 			q = UMTX_EXCLUSIVE_QUEUE;
2928 		}
2929 	}
2930 
2931 	if (count) {
2932 		umtxq_lock(&uq->uq_key);
2933 		umtxq_busy(&uq->uq_key);
2934 		umtxq_signal_queue(&uq->uq_key, count, q);
2935 		umtxq_unbusy(&uq->uq_key);
2936 		umtxq_unlock(&uq->uq_key);
2937 	}
2938 out:
2939 	umtx_key_release(&uq->uq_key);
2940 	return (error);
2941 }
2942 
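/*
 * Wakeup policy of do_rw_unlock(): by default one blocked writer is
 * preferred over the readers; with URWLOCK_PREFER_READER set, all
 * blocked readers are woken first and a writer only if no reader is
 * waiting.
 */

/*
 * Wait on a userland semaphore.
 */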
2943 static int
2944 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2945 {
2946 	struct abs_timeout timo;
2947 	struct umtx_q *uq;
2948 	uint32_t flags, count;
2949 	int error;
2950 
2951 	uq = td->td_umtxq;
2952 	flags = fuword32(&sem->_flags);
2953 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2954 	if (error != 0)
2955 		return (error);
2956 
2957 	if (timeout != NULL)
2958 		abs_timeout_init2(&timo, timeout);
2959 
2960 	umtxq_lock(&uq->uq_key);
2961 	umtxq_busy(&uq->uq_key);
2962 	umtxq_insert(uq);
2963 	umtxq_unlock(&uq->uq_key);
2964 	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2965 	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2966 	if (count != 0) {
2967 		umtxq_lock(&uq->uq_key);
2968 		umtxq_unbusy(&uq->uq_key);
2969 		umtxq_remove(uq);
2970 		umtxq_unlock(&uq->uq_key);
2971 		umtx_key_release(&uq->uq_key);
2972 		return (0);
2973 	}
2974 	umtxq_lock(&uq->uq_key);
2975 	umtxq_unbusy(&uq->uq_key);
2976 
2977 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2978 
2979 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2980 		error = 0;
2981 	else {
2982 		umtxq_remove(uq);
2983 		/* A relative timeout cannot be restarted. */
2984 		if (error == ERESTART && timeout != NULL &&
2985 		    (timeout->_flags & UMTX_ABSTIME) == 0)
2986 			error = EINTR;
2987 	}
2988 	umtxq_unlock(&uq->uq_key);
2989 	umtx_key_release(&uq->uq_key);
2990 	return (error);
2991 }
2992 
2993 /*
2994  * Wake up a waiter on a userland semaphore.
2995  */
2996 static int
2997 do_sem_wake(struct thread *td, struct _usem *sem)
2998 {
2999 	struct umtx_key key;
3000 	int error, cnt;
3001 	uint32_t flags;
3002 
3003 	flags = fuword32(&sem->_flags);
3004 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3005 		return (error);
3006 	umtxq_lock(&key);
3007 	umtxq_busy(&key);
3008 	cnt = umtxq_count(&key);
3009 	if (cnt > 0) {
3010 		umtxq_signal(&key, 1);
3011 		/*
3012 		 * A count greater than zero means the memory is still
3013 		 * referenced by user code, so we can safely update the
3014 		 * _has_waiters flag; clear it when waking the last waiter.
3015 		 */
3016 		if (cnt == 1) {
3017 			umtxq_unlock(&key);
3018 			error = suword32(
3019 			    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
3020 			umtxq_lock(&key);
3021 		}
3022 	}
3023 	umtxq_unbusy(&key);
3024 	umtxq_unlock(&key);
3025 	umtx_key_release(&key);
3026 	return (error);
3027 }
3028 
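/*
 * As with condition variables, _has_waiters lets userland skip the
 * syscall when nobody sleeps.  An illustrative post operation (a
 * sketch, not code from this file):
 *
 *	atomic_add_rel_32(&sem->_count, 1);
 *	if (sem->_has_waiters)
 *		_umtx_op(sem, UMTX_OP_SEM_WAKE, 0, NULL, NULL);
 */
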
3029 int
3030 sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
3031     /* struct umtx *umtx */
3032 {
3033 	return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
3034 }
3035 
3036 int
3037 sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
3038     /* struct umtx *umtx */
3039 {
3040 	return do_unlock_umtx(td, uap->umtx, td->td_tid);
3041 }
3042 
3043 inline int
3044 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3045 {
3046 	int error;
3047 
3048 	error = copyin(addr, tsp, sizeof(struct timespec));
3049 	if (error == 0) {
3050 		if (tsp->tv_sec < 0 ||
3051 		    tsp->tv_nsec >= 1000000000 ||
3052 		    tsp->tv_nsec < 0)
3053 			error = EINVAL;
3054 	}
3055 	return (error);
3056 }
3057 
3058 static inline int
3059 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3060 {
3061 	int error;
3062 
3063 	if (size <= sizeof(struct timespec)) {
3064 		tp->_clockid = CLOCK_REALTIME;
3065 		tp->_flags = 0;
3066 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3067 	} else
3068 		error = copyin(addr, tp, sizeof(struct _umtx_time));
3069 	if (error != 0)
3070 		return (error);
3071 	if (tp->_timeout.tv_sec < 0 ||
3072 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3073 		return (EINVAL);
3074 	return (0);
3075 }
3076 
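/*
 * The size-switched copyin above gives _umtx_op() a backward
 * compatible timeout ABI: old callers pass a bare struct timespec,
 * newer callers pass a full struct _umtx_time that also carries
 * _flags (e.g. UMTX_ABSTIME) and _clockid.  The uaddr1 argument of
 * the timed operations below is overloaded to carry that size.
 */
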
3077 static int
3078 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
3079 {
3080 	struct timespec *ts, timeout;
3081 	int error;
3082 
3083 	/* Allow a null timespec (wait forever). */
3084 	if (uap->uaddr2 == NULL)
3085 		ts = NULL;
3086 	else {
3087 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3088 		if (error != 0)
3089 			return (error);
3090 		ts = &timeout;
3091 	}
3092 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
3093 }
3094 
3095 static int
3096 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
3097 {
3098 	return (do_unlock_umtx(td, uap->obj, uap->val));
3099 }
3100 
3101 static int
3102 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3103 {
3104 	struct _umtx_time timeout, *tm_p;
3105 	int error;
3106 
3107 	if (uap->uaddr2 == NULL)
3108 		tm_p = NULL;
3109 	else {
3110 		error = umtx_copyin_umtx_time(
3111 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3112 		if (error != 0)
3113 			return (error);
3114 		tm_p = &timeout;
3115 	}
3116 	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
3117 }
3118 
3119 static int
3120 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3121 {
3122 	struct _umtx_time timeout, *tm_p;
3123 	int error;
3124 
3125 	if (uap->uaddr2 == NULL)
3126 		tm_p = NULL;
3127 	else {
3128 		error = umtx_copyin_umtx_time(
3129 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3130 		if (error != 0)
3131 			return (error);
3132 		tm_p = &timeout;
3133 	}
3134 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3135 }
3136 
3137 static int
3138 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3139 {
3140 	struct _umtx_time *tm_p, timeout;
3141 	int error;
3142 
3143 	if (uap->uaddr2 == NULL)
3144 		tm_p = NULL;
3145 	else {
3146 		error = umtx_copyin_umtx_time(
3147 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3148 		if (error != 0)
3149 			return (error);
3150 		tm_p = &timeout;
3151 	}
3152 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3153 }
3154 
3155 static int
3156 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3157 {
3158 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3159 }
3160 
3161 #define BATCH_SIZE	128
3162 static int
3163 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3164 {
3165 	int count = uap->val;
3166 	void *uaddrs[BATCH_SIZE];
3167 	char **upp = (char **)uap->obj;
3168 	int tocopy;
3169 	int error = 0;
3170 	int i, pos = 0;
3171 
3172 	while (count > 0) {
3173 		tocopy = count;
3174 		if (tocopy > BATCH_SIZE)
3175 			tocopy = BATCH_SIZE;
3176 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3177 		if (error != 0)
3178 			break;
3179 		for (i = 0; i < tocopy; ++i)
3180 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3181 		count -= tocopy;
3182 		pos += tocopy;
3183 	}
3184 	return (error);
3185 }
3186 
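/*
 * The BATCH_SIZE chunking above bounds the on-stack kernel buffer
 * while still letting userland pass an arbitrarily long address
 * array; each copied address receives a private kern_umtx_wake()
 * with INT_MAX.
 */
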
3187 static int
3188 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3189 {
3190 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3191 }
3192 
3193 static int
3194 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3195 {
3196 	struct _umtx_time *tm_p, timeout;
3197 	int error;
3198 
3199 	/* Allow a null timespec (wait forever). */
3200 	if (uap->uaddr2 == NULL)
3201 		tm_p = NULL;
3202 	else {
3203 		error = umtx_copyin_umtx_time(
3204 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3205 		if (error != 0)
3206 			return (error);
3207 		tm_p = &timeout;
3208 	}
3209 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3210 }
3211 
3212 static int
3213 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3214 {
3215 	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3216 }
3217 
3218 static int
3219 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3220 {
3221 	struct _umtx_time *tm_p, timeout;
3222 	int error;
3223 
3224 	/* Allow a null timespec (wait forever). */
3225 	if (uap->uaddr2 == NULL)
3226 		tm_p = NULL;
3227 	else {
3228 		error = umtx_copyin_umtx_time(
3229 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3230 		if (error != 0)
3231 			return (error);
3232 		tm_p = &timeout;
3233 	}
3234 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3235 }
3236 
3237 static int
3238 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3239 {
3240 	return do_wake_umutex(td, uap->obj);
3241 }
3242 
3243 static int
3244 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3245 {
3246 	return do_unlock_umutex(td, uap->obj);
3247 }
3248 
3249 static int
3250 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3251 {
3252 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3253 }
3254 
3255 static int
3256 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3257 {
3258 	struct timespec *ts, timeout;
3259 	int error;
3260 
3261 	/* Allow a null timespec (wait forever). */
3262 	if (uap->uaddr2 == NULL)
3263 		ts = NULL;
3264 	else {
3265 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3266 		if (error != 0)
3267 			return (error);
3268 		ts = &timeout;
3269 	}
3270 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3271 }
3272 
3273 static int
3274 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3275 {
3276 	return do_cv_signal(td, uap->obj);
3277 }
3278 
3279 static int
3280 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3281 {
3282 	return do_cv_broadcast(td, uap->obj);
3283 }
3284 
3285 static int
3286 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3287 {
3288 	struct _umtx_time timeout;
3289 	int error;
3290 
3291 	/* Allow a null timespec (wait forever). */
3292 	if (uap->uaddr2 == NULL) {
3293 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3294 	} else {
3295 		error = umtx_copyin_umtx_time(uap->uaddr2,
3296 		   (size_t)uap->uaddr1, &timeout);
3297 		if (error != 0)
3298 			return (error);
3299 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3300 	}
3301 	return (error);
3302 }
3303 
3304 static int
3305 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3306 {
3307 	struct _umtx_time timeout;
3308 	int error;
3309 
3310 	/* Allow a null timespec (wait forever). */
3311 	if (uap->uaddr2 == NULL) {
3312 		error = do_rw_wrlock(td, uap->obj, 0);
3313 	} else {
3314 		error = umtx_copyin_umtx_time(uap->uaddr2,
3315 		   (size_t)uap->uaddr1, &timeout);
3316 		if (error != 0)
3317 			return (error);
3318 
3319 		error = do_rw_wrlock(td, uap->obj, &timeout);
3320 	}
3321 	return (error);
3322 }
3323 
3324 static int
3325 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3326 {
3327 	return do_rw_unlock(td, uap->obj);
3328 }
3329 
3330 static int
3331 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3332 {
3333 	struct _umtx_time *tm_p, timeout;
3334 	int error;
3335 
3336 	/* Allow a null timespec (wait forever). */
3337 	if (uap->uaddr2 == NULL)
3338 		tm_p = NULL;
3339 	else {
3340 		error = umtx_copyin_umtx_time(
3341 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3342 		if (error != 0)
3343 			return (error);
3344 		tm_p = &timeout;
3345 	}
3346 	return (do_sem_wait(td, uap->obj, tm_p));
3347 }
3348 
3349 static int
3350 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3351 {
3352 	return do_sem_wake(td, uap->obj);
3353 }
3354 
3355 static int
3356 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3357 {
3358 	return do_wake2_umutex(td, uap->obj, uap->val);
3359 }
3360 
3361 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3362 
3363 static _umtx_op_func op_table[] = {
3364 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3365 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3366 	__umtx_op_wait,			/* UMTX_OP_WAIT */
3367 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3368 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3369 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3370 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3371 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3372 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT*/
3373 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3374 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3375 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3376 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3377 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3378 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3379 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3380 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3381 	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3382 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3383 	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3384 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3385 	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
3386 	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3387 };
3388 
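/*
 * The table is indexed directly by the UMTX_OP_* constants, so the
 * order of the entries must match their numeric values;
 * sys__umtx_op() below range checks op against UMTX_OP_MAX before
 * dispatching.
 */
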
3389 int
3390 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3391 {
3392 	if ((unsigned)uap->op < UMTX_OP_MAX)
3393 		return (*op_table[uap->op])(td, uap);
3394 	return (EINVAL);
3395 }
3396 
3397 #ifdef COMPAT_FREEBSD32
3398 int
3399 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3400     /* struct umtx *umtx */
3401 {
3402 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3403 }
3404 
3405 int
3406 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3407     /* struct umtx *umtx */
3408 {
3409 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3410 }
3411 
3412 struct timespec32 {
3413 	int32_t tv_sec;
3414 	int32_t tv_nsec;
3415 };
3416 
3417 struct umtx_time32 {
3418 	struct	timespec32	timeout;
3419 	uint32_t		flags;
3420 	uint32_t		clockid;
3421 };
3422 
3423 static inline int
3424 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3425 {
3426 	struct timespec32 ts32;
3427 	int error;
3428 
3429 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3430 	if (error == 0) {
3431 		if (ts32.tv_sec < 0 ||
3432 		    ts32.tv_nsec >= 1000000000 ||
3433 		    ts32.tv_nsec < 0)
3434 			error = EINVAL;
3435 		else {
3436 			tsp->tv_sec = ts32.tv_sec;
3437 			tsp->tv_nsec = ts32.tv_nsec;
3438 		}
3439 	}
3440 	return (error);
3441 }
3442 
3443 static inline int
3444 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3445 {
3446 	struct umtx_time32 t32;
3447 	int error;
3448 
3449 	t32.clockid = CLOCK_REALTIME;
3450 	t32.flags   = 0;
3451 	if (size <= sizeof(struct timespec32))
3452 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3453 	else
3454 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3455 	if (error != 0)
3456 		return (error);
3457 	if (t32.timeout.tv_sec < 0 ||
3458 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3459 		return (EINVAL);
3460 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3461 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3462 	tp->_flags = t32.flags;
3463 	tp->_clockid = t32.clockid;
3464 	return (0);
3465 }
3466 
3467 static int
3468 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3469 {
3470 	struct timespec *ts, timeout;
3471 	int error;
3472 
3473 	/* Allow a null timespec (wait forever). */
3474 	if (uap->uaddr2 == NULL)
3475 		ts = NULL;
3476 	else {
3477 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3478 		if (error != 0)
3479 			return (error);
3480 		ts = &timeout;
3481 	}
3482 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3483 }
3484 
3485 static int
3486 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3487 {
3488 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3489 }
3490 
3491 static int
3492 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3493 {
3494 	struct _umtx_time *tm_p, timeout;
3495 	int error;
3496 
3497 	if (uap->uaddr2 == NULL)
3498 		tm_p = NULL;
3499 	else {
3500 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3501 			(size_t)uap->uaddr1, &timeout);
3502 		if (error != 0)
3503 			return (error);
3504 		tm_p = &timeout;
3505 	}
3506 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3507 }
3508 
3509 static int
3510 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3511 {
3512 	struct _umtx_time *tm_p, timeout;
3513 	int error;
3514 
3515 	/* Allow a null timespec (wait forever). */
3516 	if (uap->uaddr2 == NULL)
3517 		tm_p = NULL;
3518 	else {
3519 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3520 			    (size_t)uap->uaddr1, &timeout);
3521 		if (error != 0)
3522 			return (error);
3523 		tm_p = &timeout;
3524 	}
3525 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3526 }
3527 
3528 static int
3529 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3530 {
3531 	struct _umtx_time *tm_p, timeout;
3532 	int error;
3533 
3534 	/* Allow a null timespec (wait forever). */
3535 	if (uap->uaddr2 == NULL)
3536 		tm_p = NULL;
3537 	else {
3538 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3539 		    (size_t)uap->uaddr1, &timeout);
3540 		if (error != 0)
3541 			return (error);
3542 		tm_p = &timeout;
3543 	}
3544 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3545 }
3546 
3547 static int
3548 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3549 {
3550 	struct timespec *ts, timeout;
3551 	int error;
3552 
3553 	/* Allow a null timespec (wait forever). */
3554 	if (uap->uaddr2 == NULL)
3555 		ts = NULL;
3556 	else {
3557 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3558 		if (error != 0)
3559 			return (error);
3560 		ts = &timeout;
3561 	}
3562 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3563 }
3564 
3565 static int
3566 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3567 {
3568 	struct _umtx_time timeout;
3569 	int error;
3570 
3571 	/* Allow a null timespec (wait forever). */
3572 	if (uap->uaddr2 == NULL) {
3573 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3574 	} else {
3575 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3576 		    (size_t)uap->uaddr1, &timeout);
3577 		if (error != 0)
3578 			return (error);
3579 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3580 	}
3581 	return (error);
3582 }
3583 
3584 static int
3585 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3586 {
3587 	struct _umtx_time timeout;
3588 	int error;
3589 
3590 	/* Allow a null timespec (wait forever). */
3591 	if (uap->uaddr2 == NULL) {
3592 		error = do_rw_wrlock(td, uap->obj, 0);
3593 	} else {
3594 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3595 		    (size_t)uap->uaddr1, &timeout);
3596 		if (error != 0)
3597 			return (error);
3598 		error = do_rw_wrlock(td, uap->obj, &timeout);
3599 	}
3600 	return (error);
3601 }
3602 
3603 static int
3604 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3605 {
3606 	struct _umtx_time *tm_p, timeout;
3607 	int error;
3608 
3609 	if (uap->uaddr2 == NULL)
3610 		tm_p = NULL;
3611 	else {
3612 		error = umtx_copyin_umtx_time32(
3613 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3614 		if (error != 0)
3615 			return (error);
3616 		tm_p = &timeout;
3617 	}
3618 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3619 }
3620 
3621 static int
3622 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3623 {
3624 	struct _umtx_time *tm_p, timeout;
3625 	int error;
3626 
3627 	/* Allow a null timespec (wait forever). */
3628 	if (uap->uaddr2 == NULL)
3629 		tm_p = NULL;
3630 	else {
3631 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3632 		    (size_t)uap->uaddr1, &timeout);
3633 		if (error != 0)
3634 			return (error);
3635 		tm_p = &timeout;
3636 	}
3637 	return (do_sem_wait(td, uap->obj, tm_p));
3638 }
3639 
3640 static int
3641 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3642 {
3643 	int count = uap->val;
3644 	uint32_t uaddrs[BATCH_SIZE];
3645 	uint32_t **upp = (uint32_t **)uap->obj;
3646 	int tocopy;
3647 	int error = 0;
3648 	int i, pos = 0;
3649 
3650 	while (count > 0) {
3651 		tocopy = count;
3652 		if (tocopy > BATCH_SIZE)
3653 			tocopy = BATCH_SIZE;
3654 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3655 		if (error != 0)
3656 			break;
3657 		for (i = 0; i < tocopy; ++i)
3658 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3659 				INT_MAX, 1);
3660 		count -= tocopy;
3661 		pos += tocopy;
3662 	}
3663 	return (error);
3664 }
3665 
3666 static _umtx_op_func op_table_compat32[] = {
3667 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3668 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3669 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3670 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3671 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3672 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3673 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
3674 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3675 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT*/
3676 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3677 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3678 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3679 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3680 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3681 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3682 	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3683 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3684 	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3685 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3686 	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3687 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3688 	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
3689 	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3690 };
3691 
3692 int
3693 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3694 {
3695 	if ((unsigned)uap->op < UMTX_OP_MAX)
3696 		return (*op_table_compat32[uap->op])(td,
3697 			(struct _umtx_op_args *)uap);
3698 	return (EINVAL);
3699 }
3700 #endif
3701 
3702 void
3703 umtx_thread_init(struct thread *td)
3704 {
3705 	td->td_umtxq = umtxq_alloc();
3706 	td->td_umtxq->uq_thread = td;
3707 }
3708 
3709 void
3710 umtx_thread_fini(struct thread *td)
3711 {
3712 	umtxq_free(td->td_umtxq);
3713 }
3714 
3715 /*
3716  * Called when a new thread is created, e.g. by fork().
3717  */
3718 void
3719 umtx_thread_alloc(struct thread *td)
3720 {
3721 	struct umtx_q *uq;
3722 
3723 	uq = td->td_umtxq;
3724 	uq->uq_inherited_pri = PRI_MAX;
3725 
3726 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3727 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3728 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3729 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3730 }
3731 
3732 /*
3733  * exec() hook.
3734  */
3735 static void
3736 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3737 	struct image_params *imgp __unused)
3738 {
3739 	umtx_thread_cleanup(curthread);
3740 }
3741 
3742 /*
3743  * thread_exit() hook.
3744  */
3745 void
3746 umtx_thread_exit(struct thread *td)
3747 {
3748 	umtx_thread_cleanup(td);
3749 }
3750 
3751 /*
3752  * Clean up the thread's umtx data.
3753  */
3754 static void
3755 umtx_thread_cleanup(struct thread *td)
3756 {
3757 	struct umtx_q *uq;
3758 	struct umtx_pi *pi;
3759 
3760 	if ((uq = td->td_umtxq) == NULL)
3761 		return;
3762 
3763 	mtx_lock_spin(&umtx_lock);
3764 	uq->uq_inherited_pri = PRI_MAX;
3765 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3766 		pi->pi_owner = NULL;
3767 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3768 	}
3769 	mtx_unlock_spin(&umtx_lock);
3770 	thread_lock(td);
3771 	sched_lend_user_prio(td, PRI_MAX);
3772 	thread_unlock(td);
3773 }
3774