xref: /freebsd/sys/kern/kern_umtx.c (revision 0572ccaa4543b0abef8ef81e384c1d04de9f3da1)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include "opt_umtx_profiling.h"
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/priv.h>
41 #include <sys/proc.h>
42 #include <sys/sbuf.h>
43 #include <sys/sched.h>
44 #include <sys/smp.h>
45 #include <sys/sysctl.h>
46 #include <sys/sysent.h>
47 #include <sys/systm.h>
48 #include <sys/sysproto.h>
49 #include <sys/syscallsubr.h>
50 #include <sys/eventhandler.h>
51 #include <sys/umtx.h>
52 
53 #include <vm/vm.h>
54 #include <vm/vm_param.h>
55 #include <vm/pmap.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_object.h>
58 
59 #include <machine/cpu.h>
60 
61 #ifdef COMPAT_FREEBSD32
62 #include <compat/freebsd32/freebsd32_proto.h>
63 #endif
64 
65 #define _UMUTEX_TRY		1
66 #define _UMUTEX_WAIT		2
67 
68 #ifdef UMTX_PROFILING
69 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
70 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
71 #endif
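/*
 * Worked example with illustrative values: with tot = 400 and a chain
 * whose max_length is 50, the peaks handler below computes
 * whole = 50 * 100 = 5000 and fract = (5000 % 400) * 100 = 20000,
 * printed as 5000/400 = 12 and 20000/400 = 50, i.e. "12.50%".
 * UPROF_PERC_BIGGER orders two such (whole, fract) fixed-point pairs
 * without using floating point.
 */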
72 
73 /* Priority inheritance mutex info. */
74 struct umtx_pi {
75 	/* Owner thread */
76 	struct thread		*pi_owner;
77 
78 	/* Reference count */
79 	int			pi_refcount;
80 
81 	/* List entry to link PI mutexes held by a thread */
82 	TAILQ_ENTRY(umtx_pi)	pi_link;
83 
84 	/* List entry in hash */
85 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
86 
87 	/* List for waiters */
88 	TAILQ_HEAD(,umtx_q)	pi_blocked;
89 
90 	/* Identify a userland lock object */
91 	struct umtx_key		pi_key;
92 };
93 
94 /* A userland synchronization object user. */
95 struct umtx_q {
96 	/* Linked list for the hash. */
97 	TAILQ_ENTRY(umtx_q)	uq_link;
98 
99 	/* Umtx key. */
100 	struct umtx_key		uq_key;
101 
102 	/* Umtx flags. */
103 	int			uq_flags;
104 #define UQF_UMTXQ	0x0001
105 
106 	/* The thread this entry belongs to. */
107 	struct thread		*uq_thread;
108 
109 	/*
110 	 * The PI mutex this thread is blocked on.  Reads may hold
111 	 * either the chain lock or umtx_lock; writes must hold both
112 	 * the chain lock and umtx_lock.
113 	 */
114 	struct umtx_pi		*uq_pi_blocked;
115 
116 	/* On blocked list */
117 	TAILQ_ENTRY(umtx_q)	uq_lockq;
118 
119 	/* List of PI mutexes we own that other threads contend for */
120 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
121 
122 	/* Inherited priority from PP mutex */
123 	u_char			uq_inherited_pri;
124 
125 	/* Spare queue ready to be reused */
126 	struct umtxq_queue	*uq_spare_queue;
127 
128 	/* The queue we are on */
129 	struct umtxq_queue	*uq_cur_queue;
130 };
131 
132 TAILQ_HEAD(umtxq_head, umtx_q);
133 
134 /* Per-key wait-queue */
135 struct umtxq_queue {
136 	struct umtxq_head	head;
137 	struct umtx_key		key;
138 	LIST_ENTRY(umtxq_queue)	link;
139 	int			length;
140 };
141 
142 LIST_HEAD(umtxq_list, umtxq_queue);
143 
144 /* Userland lock object's wait-queue chain */
145 struct umtxq_chain {
146 	/* Lock for this chain. */
147 	struct mtx		uc_lock;
148 
149 	/* List of sleep queues. */
150 	struct umtxq_list	uc_queue[2];
151 #define UMTX_SHARED_QUEUE	0
152 #define UMTX_EXCLUSIVE_QUEUE	1
153 
154 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
155 
156 	/* Busy flag */
157 	char			uc_busy;
158 
159 	/* Chain lock waiters */
160 	int			uc_waiters;
161 
162 	/* All PI mutexes hashed onto this chain */
163 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
164 
165 #ifdef UMTX_PROFILING
166 	u_int 			length;
167 	u_int			max_length;
168 #endif
169 };
170 
171 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
172 #define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
173 
174 /*
175  * Don't propagate time-sharing priority; there is a security reason.
176  * A user can simply create a PI mutex, let thread A lock it, and let
177  * another thread B block on it.  Because B is sleeping, its priority
178  * would be boosted, and priority propagation would boost A's priority
179  * too; A's priority would then never be lowered even while it uses
180  * 100% CPU, which is unfair to other processes.
181  */
182 
183 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
184 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
185 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
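/*
 * For example (illustrative), a time-sharing thread whose td_user_pri
 * momentarily sits anywhere in [PRI_MIN_TIMESHARE, PRI_MAX_TIMESHARE]
 * is treated as PRI_MAX_TIMESHARE by UPRI(), so blocking on a PI mutex
 * never lends the owner more than the weakest time-sharing priority.
 */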
186 
187 #define	GOLDEN_RATIO_PRIME	2654404609U
188 #define	UMTX_CHAINS		512
189 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
190 
191 #define	GET_SHARE(flags)	\
192     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
193 
194 #define BUSY_SPINS		200
195 
196 struct abs_timeout {
197 	int clockid;
198 	struct timespec cur;
199 	struct timespec end;
200 };
201 
202 static uma_zone_t		umtx_pi_zone;
203 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
204 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
205 static int			umtx_pi_allocated;
206 
207 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
208 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
209     &umtx_pi_allocated, 0, "Allocated umtx_pi");
210 
211 #ifdef UMTX_PROFILING
212 static long max_length;
213 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
214 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
215 #endif
216 
217 static void umtxq_sysinit(void *);
218 static void umtxq_hash(struct umtx_key *key);
219 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
220 static void umtxq_lock(struct umtx_key *key);
221 static void umtxq_unlock(struct umtx_key *key);
222 static void umtxq_busy(struct umtx_key *key);
223 static void umtxq_unbusy(struct umtx_key *key);
224 static void umtxq_insert_queue(struct umtx_q *uq, int q);
225 static void umtxq_remove_queue(struct umtx_q *uq, int q);
226 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
227 static int umtxq_count(struct umtx_key *key);
228 static struct umtx_pi *umtx_pi_alloc(int);
229 static void umtx_pi_free(struct umtx_pi *pi);
230 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
231 static void umtx_thread_cleanup(struct thread *td);
232 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
233 	struct image_params *imgp __unused);
234 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
235 
236 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
237 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
238 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
239 
240 static struct mtx umtx_lock;
241 
242 #ifdef UMTX_PROFILING
243 static void
244 umtx_init_profiling(void)
245 {
246 	struct sysctl_oid *chain_oid;
247 	char chain_name[10];
248 	int i;
249 
250 	for (i = 0; i < UMTX_CHAINS; ++i) {
251 		snprintf(chain_name, sizeof(chain_name), "%d", i);
252 		chain_oid = SYSCTL_ADD_NODE(NULL,
253 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
254 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
255 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
256 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
257 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
258 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
259 	}
260 }
261 
262 static int
263 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
264 {
265 	char buf[512];
266 	struct sbuf sb;
267 	struct umtxq_chain *uc;
268 	u_int fract, i, j, tot, whole;
269 	u_int sf0, sf1, sf2, sf3, sf4;
270 	u_int si0, si1, si2, si3, si4;
271 	u_int sw0, sw1, sw2, sw3, sw4;
272 
273 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
274 	for (i = 0; i < 2; i++) {
275 		tot = 0;
276 		for (j = 0; j < UMTX_CHAINS; ++j) {
277 			uc = &umtxq_chains[i][j];
278 			mtx_lock(&uc->uc_lock);
279 			tot += uc->max_length;
280 			mtx_unlock(&uc->uc_lock);
281 		}
282 		if (tot == 0)
283 			sbuf_printf(&sb, "%u) Empty ", i);
284 		else {
285 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
286 			si0 = si1 = si2 = si3 = si4 = 0;
287 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
288 			for (j = 0; j < UMTX_CHAINS; j++) {
289 				uc = &umtxq_chains[i][j];
290 				mtx_lock(&uc->uc_lock);
291 				whole = uc->max_length * 100;
292 				mtx_unlock(&uc->uc_lock);
293 				fract = (whole % tot) * 100;
294 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
295 					sf0 = fract;
296 					si0 = j;
297 					sw0 = whole;
298 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
299 				    sf1)) {
300 					sf1 = fract;
301 					si1 = j;
302 					sw1 = whole;
303 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
304 				    sf2)) {
305 					sf2 = fract;
306 					si2 = j;
307 					sw2 = whole;
308 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
309 				    sf3)) {
310 					sf3 = fract;
311 					si3 = j;
312 					sw3 = whole;
313 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
314 				    sf4)) {
315 					sf4 = fract;
316 					si4 = j;
317 					sw4 = whole;
318 				}
319 			}
320 			sbuf_printf(&sb, "queue %u:\n", i);
321 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
322 			    sf0 / tot, si0);
323 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
324 			    sf1 / tot, si1);
325 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
326 			    sf2 / tot, si2);
327 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
328 			    sf3 / tot, si3);
329 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
330 			    sf4 / tot, si4);
331 		}
332 	}
333 	sbuf_trim(&sb);
334 	sbuf_finish(&sb);
335 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
336 	sbuf_delete(&sb);
337 	return (0);
338 }
339 
340 static int
341 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
342 {
343 	struct umtxq_chain *uc;
344 	u_int i, j;
345 	int clear, error;
346 
347 	clear = 0;
348 	error = sysctl_handle_int(oidp, &clear, 0, req);
349 	if (error != 0 || req->newptr == NULL)
350 		return (error);
351 
352 	if (clear != 0) {
353 		for (i = 0; i < 2; ++i) {
354 			for (j = 0; j < UMTX_CHAINS; ++j) {
355 				uc = &umtxq_chains[i][j];
356 				mtx_lock(&uc->uc_lock);
357 				uc->length = 0;
358 				uc->max_length = 0;
359 				mtx_unlock(&uc->uc_lock);
360 			}
361 		}
362 	}
363 	return (0);
364 }
365 
366 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
367     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
368     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
369 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
370     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
371     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
372 #endif
373 
374 static void
375 umtxq_sysinit(void *arg __unused)
376 {
377 	int i, j;
378 
379 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
380 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
381 	for (i = 0; i < 2; ++i) {
382 		for (j = 0; j < UMTX_CHAINS; ++j) {
383 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
384 				 MTX_DEF | MTX_DUPOK);
385 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
386 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
387 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
388 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
389 			umtxq_chains[i][j].uc_busy = 0;
390 			umtxq_chains[i][j].uc_waiters = 0;
391 #ifdef UMTX_PROFILING
392 			umtxq_chains[i][j].length = 0;
393 			umtxq_chains[i][j].max_length = 0;
394 #endif
395 		}
396 	}
397 #ifdef UMTX_PROFILING
398 	umtx_init_profiling();
399 #endif
400 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
401 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
402 	    EVENTHANDLER_PRI_ANY);
403 }
404 
405 struct umtx_q *
406 umtxq_alloc(void)
407 {
408 	struct umtx_q *uq;
409 
410 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
411 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
412 	TAILQ_INIT(&uq->uq_spare_queue->head);
413 	TAILQ_INIT(&uq->uq_pi_contested);
414 	uq->uq_inherited_pri = PRI_MAX;
415 	return (uq);
416 }
417 
418 void
419 umtxq_free(struct umtx_q *uq)
420 {
421 	MPASS(uq->uq_spare_queue != NULL);
422 	free(uq->uq_spare_queue, M_UMTX);
423 	free(uq, M_UMTX);
424 }
425 
426 static inline void
427 umtxq_hash(struct umtx_key *key)
428 {
429 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
430 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
431 }
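/*
 * Sketch of the multiplicative hash above: with __WORD_BIT == 32,
 * UMTX_SHIFTS is 32 - 9 = 23, so the shift keeps the top 9 bits of
 * n * GOLDEN_RATIO_PRIME, already in [0, 511]; the final modulo by
 * UMTX_CHAINS (512) merely guards the result.
 */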
432 
433 static inline struct umtxq_chain *
434 umtxq_getchain(struct umtx_key *key)
435 {
436 	if (key->type <= TYPE_SEM)
437 		return (&umtxq_chains[1][key->hash]);
438 	return (&umtxq_chains[0][key->hash]);
439 }
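/*
 * The test above splits object classes across the two chain arrays:
 * types ordered at or below TYPE_SEM (the wait/condvar/semaphore class
 * in this tree) use umtxq_chains[1], while mutex and rwlock types use
 * umtxq_chains[0], so e.g. a condition variable and a mutex hashing to
 * the same bucket never contend for one chain lock.
 */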
440 
441 /*
442  * Lock a chain.
443  */
444 static inline void
445 umtxq_lock(struct umtx_key *key)
446 {
447 	struct umtxq_chain *uc;
448 
449 	uc = umtxq_getchain(key);
450 	mtx_lock(&uc->uc_lock);
451 }
452 
453 /*
454  * Unlock a chain.
455  */
456 static inline void
457 umtxq_unlock(struct umtx_key *key)
458 {
459 	struct umtxq_chain *uc;
460 
461 	uc = umtxq_getchain(key);
462 	mtx_unlock(&uc->uc_lock);
463 }
464 
465 /*
466  * Set the chain to the busy state when the following operation
467  * may block (a kernel mutex cannot be used).
468  */
469 static inline void
470 umtxq_busy(struct umtx_key *key)
471 {
472 	struct umtxq_chain *uc;
473 
474 	uc = umtxq_getchain(key);
475 	mtx_assert(&uc->uc_lock, MA_OWNED);
476 	if (uc->uc_busy) {
477 #ifdef SMP
478 		if (smp_cpus > 1) {
479 			int count = BUSY_SPINS;
480 			if (count > 0) {
481 				umtxq_unlock(key);
482 				while (uc->uc_busy && --count > 0)
483 					cpu_spinwait();
484 				umtxq_lock(key);
485 			}
486 		}
487 #endif
488 		while (uc->uc_busy) {
489 			uc->uc_waiters++;
490 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
491 			uc->uc_waiters--;
492 		}
493 	}
494 	uc->uc_busy = 1;
495 }
496 
497 /*
498  * Unbusy a chain.
499  */
500 static inline void
501 umtxq_unbusy(struct umtx_key *key)
502 {
503 	struct umtxq_chain *uc;
504 
505 	uc = umtxq_getchain(key);
506 	mtx_assert(&uc->uc_lock, MA_OWNED);
507 	KASSERT(uc->uc_busy != 0, ("not busy"));
508 	uc->uc_busy = 0;
509 	if (uc->uc_waiters)
510 		wakeup_one(uc);
511 }
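/*
 * Typical usage, mirroring the lock paths below: mark the chain busy,
 * drop the chain mutex around accesses to userland memory (casuword32
 * and friends may fault and sleep), then reacquire and unbusy:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);
 *	umtxq_unlock(&key);
 *	owner = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);
 *	umtxq_unlock(&key);
 */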
512 
513 static struct umtxq_queue *
514 umtxq_queue_lookup(struct umtx_key *key, int q)
515 {
516 	struct umtxq_queue *uh;
517 	struct umtxq_chain *uc;
518 
519 	uc = umtxq_getchain(key);
520 	UMTXQ_LOCKED_ASSERT(uc);
521 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
522 		if (umtx_key_match(&uh->key, key))
523 			return (uh);
524 	}
525 
526 	return (NULL);
527 }
528 
529 static inline void
530 umtxq_insert_queue(struct umtx_q *uq, int q)
531 {
532 	struct umtxq_queue *uh;
533 	struct umtxq_chain *uc;
534 
535 	uc = umtxq_getchain(&uq->uq_key);
536 	UMTXQ_LOCKED_ASSERT(uc);
537 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
538 	uh = umtxq_queue_lookup(&uq->uq_key, q);
539 	if (uh != NULL) {
540 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
541 	} else {
542 		uh = uq->uq_spare_queue;
543 		uh->key = uq->uq_key;
544 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
545 #ifdef UMTX_PROFILING
546 		uc->length++;
547 		if (uc->length > uc->max_length) {
548 			uc->max_length = uc->length;
549 			if (uc->max_length > max_length)
550 				max_length = uc->max_length;
551 		}
552 #endif
553 	}
554 	uq->uq_spare_queue = NULL;
555 
556 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
557 	uh->length++;
558 	uq->uq_flags |= UQF_UMTXQ;
559 	uq->uq_cur_queue = uh;
560 	return;
561 }
562 
563 static inline void
564 umtxq_remove_queue(struct umtx_q *uq, int q)
565 {
566 	struct umtxq_chain *uc;
567 	struct umtxq_queue *uh;
568 
569 	uc = umtxq_getchain(&uq->uq_key);
570 	UMTXQ_LOCKED_ASSERT(uc);
571 	if (uq->uq_flags & UQF_UMTXQ) {
572 		uh = uq->uq_cur_queue;
573 		TAILQ_REMOVE(&uh->head, uq, uq_link);
574 		uh->length--;
575 		uq->uq_flags &= ~UQF_UMTXQ;
576 		if (TAILQ_EMPTY(&uh->head)) {
577 			KASSERT(uh->length == 0,
578 			    ("inconsistent umtxq_queue length"));
579 #ifdef UMTX_PROFILING
580 			uc->length--;
581 #endif
582 			LIST_REMOVE(uh, link);
583 		} else {
584 			uh = LIST_FIRST(&uc->uc_spare_queue);
585 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
586 			LIST_REMOVE(uh, link);
587 		}
588 		uq->uq_spare_queue = uh;
589 		uq->uq_cur_queue = NULL;
590 	}
591 }
592 
593 /*
594  * Count the waiters on the shared wait queue.
595  */
596 static int
597 umtxq_count(struct umtx_key *key)
598 {
599 	struct umtxq_chain *uc;
600 	struct umtxq_queue *uh;
601 
602 	uc = umtxq_getchain(key);
603 	UMTXQ_LOCKED_ASSERT(uc);
604 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
605 	if (uh != NULL)
606 		return (uh->length);
607 	return (0);
608 }
609 
610 /*
611  * Count the PI waiters and return the first waiter
612  * through *first.
613  */
614 static int
615 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
616 {
617 	struct umtxq_chain *uc;
618 	struct umtxq_queue *uh;
619 
620 	*first = NULL;
621 	uc = umtxq_getchain(key);
622 	UMTXQ_LOCKED_ASSERT(uc);
623 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
624 	if (uh != NULL) {
625 		*first = TAILQ_FIRST(&uh->head);
626 		return (uh->length);
627 	}
628 	return (0);
629 }
630 
631 static int
632 umtxq_check_susp(struct thread *td)
633 {
634 	struct proc *p;
635 	int error;
636 
637 	/*
638 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
639 	 * eventually break the lockstep loop.
640 	 */
641 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
642 		return (0);
643 	error = 0;
644 	p = td->td_proc;
645 	PROC_LOCK(p);
646 	if (P_SHOULDSTOP(p) ||
647 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
648 		if (p->p_flag & P_SINGLE_EXIT)
649 			error = EINTR;
650 		else
651 			error = ERESTART;
652 	}
653 	PROC_UNLOCK(p);
654 	return (error);
655 }
656 
657 /*
658  * Wake up threads waiting on a userland object.
659  */
660 
661 static int
662 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
663 {
664 	struct umtxq_chain *uc;
665 	struct umtxq_queue *uh;
666 	struct umtx_q *uq;
667 	int ret;
668 
669 	ret = 0;
670 	uc = umtxq_getchain(key);
671 	UMTXQ_LOCKED_ASSERT(uc);
672 	uh = umtxq_queue_lookup(key, q);
673 	if (uh != NULL) {
674 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
675 			umtxq_remove_queue(uq, q);
676 			wakeup(uq);
677 			if (++ret >= n_wake)
678 				return (ret);
679 		}
680 	}
681 	return (ret);
682 }
683 
684 
685 /*
686  * Wake up specified thread.
687  */
688 static inline void
689 umtxq_signal_thread(struct umtx_q *uq)
690 {
691 	struct umtxq_chain *uc;
692 
693 	uc = umtxq_getchain(&uq->uq_key);
694 	UMTXQ_LOCKED_ASSERT(uc);
695 	umtxq_remove(uq);
696 	wakeup(uq);
697 }
698 
699 static inline int
700 tstohz(const struct timespec *tsp)
701 {
702 	struct timeval tv;
703 
704 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
705 	return tvtohz(&tv);
706 }
707 
708 static void
709 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
710 	const struct timespec *timeout)
711 {
712 
713 	timo->clockid = clockid;
714 	if (!absolute) {
715 		kern_clock_gettime(curthread, clockid, &timo->end);
716 		timo->cur = timo->end;
717 		timespecadd(&timo->end, timeout);
718 	} else {
719 		timo->end = *timeout;
720 		kern_clock_gettime(curthread, clockid, &timo->cur);
721 	}
722 }
723 
724 static void
725 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
726 {
727 
728 	abs_timeout_init(timo, umtxtime->_clockid,
729 		(umtxtime->_flags & UMTX_ABSTIME) != 0,
730 		&umtxtime->_timeout);
731 }
732 
733 static inline void
734 abs_timeout_update(struct abs_timeout *timo)
735 {
736 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
737 }
738 
739 static int
740 abs_timeout_gethz(struct abs_timeout *timo)
741 {
742 	struct timespec tts;
743 
744 	if (timespeccmp(&timo->end, &timo->cur, <=))
745 		return (-1);
746 	tts = timo->end;
747 	timespecsub(&tts, &timo->cur);
748 	return (tstohz(&tts));
749 }
750 
751 /*
752  * Put the thread into a sleep state; before sleeping, check
753  * whether the thread was removed from the umtx queue.
754  */
755 static inline int
756 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
757 {
758 	struct umtxq_chain *uc;
759 	int error, timo;
760 
761 	uc = umtxq_getchain(&uq->uq_key);
762 	UMTXQ_LOCKED_ASSERT(uc);
763 	for (;;) {
764 		if (!(uq->uq_flags & UQF_UMTXQ))
765 			return (0);
766 		if (abstime != NULL) {
767 			timo = abs_timeout_gethz(abstime);
768 			if (timo < 0)
769 				return (ETIMEDOUT);
770 		} else
771 			timo = 0;
772 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
773 		if (error != EWOULDBLOCK) {
774 			umtxq_lock(&uq->uq_key);
775 			break;
776 		}
777 		if (abstime != NULL)
778 			abs_timeout_update(abstime);
779 		umtxq_lock(&uq->uq_key);
780 	}
781 	return (error);
782 }
783 
784 /*
785  * Convert a userspace address into a unique logical address.
786  */
787 int
788 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
789 {
790 	struct thread *td = curthread;
791 	vm_map_t map;
792 	vm_map_entry_t entry;
793 	vm_pindex_t pindex;
794 	vm_prot_t prot;
795 	boolean_t wired;
796 
797 	key->type = type;
798 	if (share == THREAD_SHARE) {
799 		key->shared = 0;
800 		key->info.private.vs = td->td_proc->p_vmspace;
801 		key->info.private.addr = (uintptr_t)addr;
802 	} else {
803 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
804 		map = &td->td_proc->p_vmspace->vm_map;
805 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
806 		    &entry, &key->info.shared.object, &pindex, &prot,
807 		    &wired) != KERN_SUCCESS) {
808 			return (EFAULT);
809 		}
810 
811 		if ((share == PROCESS_SHARE) ||
812 		    (share == AUTO_SHARE &&
813 		     VM_INHERIT_SHARE == entry->inheritance)) {
814 			key->shared = 1;
815 			key->info.shared.offset = entry->offset + entry->start -
816 				(vm_offset_t)addr;
817 			vm_object_reference(key->info.shared.object);
818 		} else {
819 			key->shared = 0;
820 			key->info.private.vs = td->td_proc->p_vmspace;
821 			key->info.private.addr = (uintptr_t)addr;
822 		}
823 		vm_map_lookup_done(map, entry);
824 	}
825 
826 	umtxq_hash(key);
827 	return (0);
828 }
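/*
 * Sketch of the resulting key identity, assuming two processes map the
 * same shared object at different addresses:
 *
 *	Process A: umtx_key_get(addrA, type, PROCESS_SHARE, &ka);
 *	Process B: umtx_key_get(addrB, type, PROCESS_SHARE, &kb);
 *
 * Both keys carry the (vm_object, offset) pair, so umtx_key_match(&ka,
 * &kb) holds even though addrA != addrB.  Private keys instead carry
 * (vmspace, address) and can only match within a single process.
 */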
829 
830 /*
831  * Release key.
832  */
833 void
834 umtx_key_release(struct umtx_key *key)
835 {
836 	if (key->shared)
837 		vm_object_deallocate(key->info.shared.object);
838 }
839 
840 /*
841  * Fetch and compare the value; sleep on the address if the value is unchanged.
842  */
843 static int
844 do_wait(struct thread *td, void *addr, u_long id,
845 	struct _umtx_time *timeout, int compat32, int is_private)
846 {
847 	struct abs_timeout timo;
848 	struct umtx_q *uq;
849 	u_long tmp;
850 	int error = 0;
851 
852 	uq = td->td_umtxq;
853 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
854 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
855 		return (error);
856 
857 	if (timeout != NULL)
858 		abs_timeout_init2(&timo, timeout);
859 
860 	umtxq_lock(&uq->uq_key);
861 	umtxq_insert(uq);
862 	umtxq_unlock(&uq->uq_key);
863 	if (compat32 == 0)
864 		tmp = fuword(addr);
865 	else
866 		tmp = (unsigned int)fuword32(addr);
867 	umtxq_lock(&uq->uq_key);
868 	if (tmp == id)
869 		error = umtxq_sleep(uq, "uwait", timeout == NULL ?
870 		    NULL : &timo);
871 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
872 		error = 0;
873 	else
874 		umtxq_remove(uq);
875 	umtxq_unlock(&uq->uq_key);
876 	umtx_key_release(&uq->uq_key);
877 	if (error == ERESTART)
878 		error = EINTR;
879 	return (error);
880 }
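/*
 * Userland counterpart, a minimal sketch (headers <sys/types.h> and
 * <sys/umtx.h>, error handling omitted): a thread parks itself until
 * the word changes from the given value.
 *
 *	u_long word = 0;
 *	_umtx_op(&word, UMTX_OP_WAIT, 0, NULL, NULL);
 *
 * The kernel inserts the waiter into the queue before re-reading the
 * word, so a waker that stores a new value and then issues a wake
 * cannot be missed.
 */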
881 
882 /*
883  * Wake up threads sleeping on the specified address.
884  */
885 int
886 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
887 {
888 	struct umtx_key key;
889 	int ret;
890 
891 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
892 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
893 		return (ret);
894 	umtxq_lock(&key);
895 	ret = umtxq_signal(&key, n_wake);
896 	umtxq_unlock(&key);
897 	umtx_key_release(&key);
898 	return (0);
899 }
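/*
 * Waker-side sketch matching the example above: after updating the
 * word, wake any sleepers (INT_MAX wakes them all).
 *
 *	word = 1;
 *	_umtx_op(&word, UMTX_OP_WAKE, INT_MAX, NULL, NULL);
 */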
900 
901 /*
902  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
903  */
904 static int
905 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
906 	struct _umtx_time *timeout, int mode)
907 {
908 	struct abs_timeout timo;
909 	struct umtx_q *uq;
910 	uint32_t owner, old, id;
911 	int error = 0;
912 
913 	id = td->td_tid;
914 	uq = td->td_umtxq;
915 
916 	if (timeout != NULL)
917 		abs_timeout_init2(&timo, timeout);
918 
919 	/*
920 	 * Care must be exercised when dealing with the umtx structure.  It
921 	 * can fault on any access.
922 	 */
923 	for (;;) {
924 		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
925 		if (mode == _UMUTEX_WAIT) {
926 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
927 				return (0);
928 		} else {
929 			/*
930 			 * Try the uncontested case.  This should be done in userland.
931 			 */
932 			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
933 
934 			/* The acquire succeeded. */
935 			if (owner == UMUTEX_UNOWNED)
936 				return (0);
937 
938 			/* The address was invalid. */
939 			if (owner == -1)
940 				return (EFAULT);
941 
942 			/* If no one owns it but it is contested, try to acquire it. */
943 			if (owner == UMUTEX_CONTESTED) {
944 				owner = casuword32(&m->m_owner,
945 				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
946 
947 				if (owner == UMUTEX_CONTESTED)
948 					return (0);
949 
950 				/* The address was invalid. */
951 				if (owner == -1)
952 					return (EFAULT);
953 
954 				error = umtxq_check_susp(td);
955 				if (error != 0)
956 					return (error);
957 
958 				/* If this failed, the lock has changed; restart. */
959 				continue;
960 			}
961 		}
962 
963 		if (mode == _UMUTEX_TRY)
964 			return (EBUSY);
965 
966 		/*
967 		 * If we caught a signal, we have already retried; now
968 		 * exit immediately.
969 		 */
970 		if (error != 0)
971 			return (error);
972 
973 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
974 		    GET_SHARE(flags), &uq->uq_key)) != 0)
975 			return (error);
976 
977 		umtxq_lock(&uq->uq_key);
978 		umtxq_busy(&uq->uq_key);
979 		umtxq_insert(uq);
980 		umtxq_unlock(&uq->uq_key);
981 
982 		/*
983 		 * Set the contested bit so that a release in user space
984 		 * knows to use the system call for unlock.  If this fails
985 		 * either someone else has acquired the lock or it has been
986 		 * released.
987 		 */
988 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
989 
990 		/* The address was invalid. */
991 		if (old == -1) {
992 			umtxq_lock(&uq->uq_key);
993 			umtxq_remove(uq);
994 			umtxq_unbusy(&uq->uq_key);
995 			umtxq_unlock(&uq->uq_key);
996 			umtx_key_release(&uq->uq_key);
997 			return (EFAULT);
998 		}
999 
1000 		/*
1001 		 * If we set the contested bit, sleep.  Otherwise the lock
1002 		 * changed and we need to retry, or we lost a race with the
1003 		 * thread unlocking the umtx.
1004 		 */
1005 		umtxq_lock(&uq->uq_key);
1006 		umtxq_unbusy(&uq->uq_key);
1007 		if (old == owner)
1008 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1009 			    NULL : &timo);
1010 		umtxq_remove(uq);
1011 		umtxq_unlock(&uq->uq_key);
1012 		umtx_key_release(&uq->uq_key);
1013 
1014 		if (error == 0)
1015 			error = umtxq_check_susp(td);
1016 	}
1017 
1018 	return (0);
1019 }
1020 
1021 /*
1022  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1023  */
1024 static int
1025 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1026 {
1027 	struct umtx_key key;
1028 	uint32_t owner, old, id;
1029 	int error;
1030 	int count;
1031 
1032 	id = td->td_tid;
1033 	/*
1034 	 * Make sure we own this mtx.
1035 	 */
1036 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1037 	if (owner == -1)
1038 		return (EFAULT);
1039 
1040 	if ((owner & ~UMUTEX_CONTESTED) != id)
1041 		return (EPERM);
1042 
1043 	if ((owner & UMUTEX_CONTESTED) == 0) {
1044 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1045 		if (old == -1)
1046 			return (EFAULT);
1047 		if (old == owner)
1048 			return (0);
1049 		owner = old;
1050 	}
1051 
1052 	/* We should only ever be in here for contested locks */
1053 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1054 	    &key)) != 0)
1055 		return (error);
1056 
1057 	umtxq_lock(&key);
1058 	umtxq_busy(&key);
1059 	count = umtxq_count(&key);
1060 	umtxq_unlock(&key);
1061 
1062 	/*
1063 	 * When unlocking the umtx, it must be marked as unowned if
1064 	 * there is zero or one thread only waiting for it.
1065 	 * Otherwise, it must be marked as contested.
1066 	 */
1067 	old = casuword32(&m->m_owner, owner,
1068 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1069 	umtxq_lock(&key);
1070 	umtxq_signal(&key, 1);
1071 	umtxq_unbusy(&key);
1072 	umtxq_unlock(&key);
1073 	umtx_key_release(&key);
1074 	if (old == -1)
1075 		return (EFAULT);
1076 	if (old != owner)
1077 		return (EINVAL);
1078 	return (0);
1079 }
1080 
1081 /*
1082  * Check if the mutex is available and wake up a waiter;
1083  * for simple (PTHREAD_PRIO_NONE) mutexes only.
1084  */
1085 static int
1086 do_wake_umutex(struct thread *td, struct umutex *m)
1087 {
1088 	struct umtx_key key;
1089 	uint32_t owner;
1090 	uint32_t flags;
1091 	int error;
1092 	int count;
1093 
1094 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1095 	if (owner == -1)
1096 		return (EFAULT);
1097 
1098 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1099 		return (0);
1100 
1101 	flags = fuword32(&m->m_flags);
1102 
1103 	/* We should only ever be in here for contested locks */
1104 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1105 	    &key)) != 0)
1106 		return (error);
1107 
1108 	umtxq_lock(&key);
1109 	umtxq_busy(&key);
1110 	count = umtxq_count(&key);
1111 	umtxq_unlock(&key);
1112 
1113 	if (count <= 1)
1114 		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1115 
1116 	umtxq_lock(&key);
1117 	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1118 		umtxq_signal(&key, 1);
1119 	umtxq_unbusy(&key);
1120 	umtxq_unlock(&key);
1121 	umtx_key_release(&key);
1122 	return (0);
1123 }
1124 
1125 /*
1126  * Check if the mutex has waiters and try to fix the contention bit.
1127  */
1128 static int
1129 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1130 {
1131 	struct umtx_key key;
1132 	uint32_t owner, old;
1133 	int type;
1134 	int error;
1135 	int count;
1136 
1137 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1138 	case 0:
1139 		type = TYPE_NORMAL_UMUTEX;
1140 		break;
1141 	case UMUTEX_PRIO_INHERIT:
1142 		type = TYPE_PI_UMUTEX;
1143 		break;
1144 	case UMUTEX_PRIO_PROTECT:
1145 		type = TYPE_PP_UMUTEX;
1146 		break;
1147 	default:
1148 		return (EINVAL);
1149 	}
1150 	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
1151 	    &key)) != 0)
1152 		return (error);
1153 
1154 	owner = 0;
1155 	umtxq_lock(&key);
1156 	umtxq_busy(&key);
1157 	count = umtxq_count(&key);
1158 	umtxq_unlock(&key);
1159 	/*
1160 	 * Only repair the contention bit if there is a waiter; that means
1161 	 * the mutex is still being referenced by userland code.  Otherwise,
1162 	 * don't update any memory.
1163 	 */
1164 	if (count > 1) {
1165 		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1166 		while ((owner & UMUTEX_CONTESTED) == 0) {
1167 			old = casuword32(&m->m_owner, owner,
1168 			    owner | UMUTEX_CONTESTED);
1169 			if (old == owner)
1170 				break;
1171 			owner = old;
1172 			if (old == -1)
1173 				break;
1174 			error = umtxq_check_susp(td);
1175 			if (error != 0)
1176 				break;
1177 		}
1178 	} else if (count == 1) {
1179 		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1180 		while ((owner & ~UMUTEX_CONTESTED) != 0 &&
1181 		       (owner & UMUTEX_CONTESTED) == 0) {
1182 			old = casuword32(&m->m_owner, owner,
1183 			    owner | UMUTEX_CONTESTED);
1184 			if (old == owner)
1185 				break;
1186 			owner = old;
1187 			if (old == -1)
1188 				break;
1189 			error = umtxq_check_susp(td);
1190 			if (error != 0)
1191 				break;
1192 		}
1193 	}
1194 	umtxq_lock(&key);
1195 	if (owner == -1) {
1196 		error = EFAULT;
1197 		umtxq_signal(&key, INT_MAX);
1198 	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1200 		umtxq_signal(&key, 1);
1201 	umtxq_unbusy(&key);
1202 	umtxq_unlock(&key);
1203 	umtx_key_release(&key);
1204 	return (error);
1205 }
1206 
1207 static inline struct umtx_pi *
1208 umtx_pi_alloc(int flags)
1209 {
1210 	struct umtx_pi *pi;
1211 
1212 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1213 	TAILQ_INIT(&pi->pi_blocked);
1214 	atomic_add_int(&umtx_pi_allocated, 1);
1215 	return (pi);
1216 }
1217 
1218 static inline void
1219 umtx_pi_free(struct umtx_pi *pi)
1220 {
1221 	uma_zfree(umtx_pi_zone, pi);
1222 	atomic_add_int(&umtx_pi_allocated, -1);
1223 }
1224 
1225 /*
1226  * Adjust the thread's position on the PI mutex's blocked list after
1227  * its priority has been changed.
1228  */
1229 static int
1230 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1231 {
1232 	struct umtx_q *uq, *uq1, *uq2;
1233 	struct thread *td1;
1234 
1235 	mtx_assert(&umtx_lock, MA_OWNED);
1236 	if (pi == NULL)
1237 		return (0);
1238 
1239 	uq = td->td_umtxq;
1240 
1241 	/*
1242 	 * Check if the thread needs to be moved on the blocked chain.
1243 	 * It needs to be moved if either its priority is lower than
1244 	 * the previous thread or higher than the next thread.
1245 	 */
1246 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1247 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1248 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1249 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1250 		/*
1251 		 * Remove thread from blocked chain and determine where
1252 		 * it should be moved to.
1253 		 */
1254 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1255 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1256 			td1 = uq1->uq_thread;
1257 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1258 			if (UPRI(td1) > UPRI(td))
1259 				break;
1260 		}
1261 
1262 		if (uq1 == NULL)
1263 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1264 		else
1265 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1266 	}
1267 	return (1);
1268 }
1269 
1270 /*
1271  * Propagate priority when a thread is blocked on a POSIX
1272  * PI mutex.
1273  */
1274 static void
1275 umtx_propagate_priority(struct thread *td)
1276 {
1277 	struct umtx_q *uq;
1278 	struct umtx_pi *pi;
1279 	int pri;
1280 
1281 	mtx_assert(&umtx_lock, MA_OWNED);
1282 	pri = UPRI(td);
1283 	uq = td->td_umtxq;
1284 	pi = uq->uq_pi_blocked;
1285 	if (pi == NULL)
1286 		return;
1287 
1288 	for (;;) {
1289 		td = pi->pi_owner;
1290 		if (td == NULL || td == curthread)
1291 			return;
1292 
1293 		MPASS(td->td_proc != NULL);
1294 		MPASS(td->td_proc->p_magic == P_MAGIC);
1295 
1296 		thread_lock(td);
1297 		if (td->td_lend_user_pri > pri)
1298 			sched_lend_user_prio(td, pri);
1299 		else {
1300 			thread_unlock(td);
1301 			break;
1302 		}
1303 		thread_unlock(td);
1304 
1305 		/*
1306 		 * Pick up the lock that td is blocked on.
1307 		 */
1308 		uq = td->td_umtxq;
1309 		pi = uq->uq_pi_blocked;
1310 		if (pi == NULL)
1311 			break;
1312 		/* Resort td on the list if needed. */
1313 		umtx_pi_adjust_thread(pi, td);
1314 	}
1315 }
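/*
 * Worked example (illustrative priorities; a lower value is a higher
 * priority): T1 with UPRI 100 blocks on PI mutex M1 owned by T2
 * (UPRI 140), and T2 is itself blocked on M2 owned by T3 (UPRI 160).
 * The loop above lends priority 100 to T2, then follows T2's
 * uq_pi_blocked to M2 and lends 100 to T3 as well, so the whole
 * ownership chain runs at T1's priority.
 */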
1316 
1317 /*
1318  * Unpropagate priority for a PI mutex when a thread blocked on
1319  * it is interrupted by a signal or resumed by another thread.
1320  */
1321 static void
1322 umtx_repropagate_priority(struct umtx_pi *pi)
1323 {
1324 	struct umtx_q *uq, *uq_owner;
1325 	struct umtx_pi *pi2;
1326 	int pri;
1327 
1328 	mtx_assert(&umtx_lock, MA_OWNED);
1329 
1330 	while (pi != NULL && pi->pi_owner != NULL) {
1331 		pri = PRI_MAX;
1332 		uq_owner = pi->pi_owner->td_umtxq;
1333 
1334 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1335 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1336 			if (uq != NULL) {
1337 				if (pri > UPRI(uq->uq_thread))
1338 					pri = UPRI(uq->uq_thread);
1339 			}
1340 		}
1341 
1342 		if (pri > uq_owner->uq_inherited_pri)
1343 			pri = uq_owner->uq_inherited_pri;
1344 		thread_lock(pi->pi_owner);
1345 		sched_lend_user_prio(pi->pi_owner, pri);
1346 		thread_unlock(pi->pi_owner);
1347 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1348 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1349 	}
1350 }
1351 
1352 /*
1353  * Insert a PI mutex into the owner thread's held list.
1354  */
1355 static void
1356 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1357 {
1358 	struct umtx_q *uq_owner;
1359 
1360 	uq_owner = owner->td_umtxq;
1361 	mtx_assert(&umtx_lock, MA_OWNED);
1362 	if (pi->pi_owner != NULL)
1363 		panic("pi_owner != NULL");
1364 	pi->pi_owner = owner;
1365 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1366 }
1367 
1368 /*
1369  * Claim ownership of a PI mutex.
1370  */
1371 static int
1372 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1373 {
1374 	struct umtx_q *uq, *uq_owner;
1375 
1376 	uq_owner = owner->td_umtxq;
1377 	mtx_lock_spin(&umtx_lock);
1378 	if (pi->pi_owner == owner) {
1379 		mtx_unlock_spin(&umtx_lock);
1380 		return (0);
1381 	}
1382 
1383 	if (pi->pi_owner != NULL) {
1384 		/*
1385 		 * Userland may have already messed up the mutex, sigh.
1386 		 */
1387 		mtx_unlock_spin(&umtx_lock);
1388 		return (EPERM);
1389 	}
1390 	umtx_pi_setowner(pi, owner);
1391 	uq = TAILQ_FIRST(&pi->pi_blocked);
1392 	if (uq != NULL) {
1393 		int pri;
1394 
1395 		pri = UPRI(uq->uq_thread);
1396 		thread_lock(owner);
1397 		if (pri < UPRI(owner))
1398 			sched_lend_user_prio(owner, pri);
1399 		thread_unlock(owner);
1400 	}
1401 	mtx_unlock_spin(&umtx_lock);
1402 	return (0);
1403 }
1404 
1405 /*
1406  * Adjust a thread's position on the blocked list of its PI mutex;
1407  * this may trigger a new round of priority propagation.
1408  */
1409 void
1410 umtx_pi_adjust(struct thread *td, u_char oldpri)
1411 {
1412 	struct umtx_q *uq;
1413 	struct umtx_pi *pi;
1414 
1415 	uq = td->td_umtxq;
1416 	mtx_lock_spin(&umtx_lock);
1417 	/*
1418 	 * Pick up the lock that td is blocked on.
1419 	 */
1420 	pi = uq->uq_pi_blocked;
1421 	if (pi != NULL) {
1422 		umtx_pi_adjust_thread(pi, td);
1423 		umtx_repropagate_priority(pi);
1424 	}
1425 	mtx_unlock_spin(&umtx_lock);
1426 }
1427 
1428 /*
1429  * Sleep on a PI mutex.
1430  */
1431 static int
1432 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1433 	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
1434 {
1435 	struct umtxq_chain *uc;
1436 	struct thread *td, *td1;
1437 	struct umtx_q *uq1;
1438 	int pri;
1439 	int error = 0;
1440 
1441 	td = uq->uq_thread;
1442 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1443 	uc = umtxq_getchain(&uq->uq_key);
1444 	UMTXQ_LOCKED_ASSERT(uc);
1445 	UMTXQ_BUSY_ASSERT(uc);
1446 	umtxq_insert(uq);
1447 	mtx_lock_spin(&umtx_lock);
1448 	if (pi->pi_owner == NULL) {
1449 		mtx_unlock_spin(&umtx_lock);
1450 		/* XXX Only look up thread in current process. */
1451 		td1 = tdfind(owner, curproc->p_pid);
1452 		mtx_lock_spin(&umtx_lock);
1453 		if (td1 != NULL) {
1454 			if (pi->pi_owner == NULL)
1455 				umtx_pi_setowner(pi, td1);
1456 			PROC_UNLOCK(td1->td_proc);
1457 		}
1458 	}
1459 
1460 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1461 		pri = UPRI(uq1->uq_thread);
1462 		if (pri > UPRI(td))
1463 			break;
1464 	}
1465 
1466 	if (uq1 != NULL)
1467 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1468 	else
1469 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1470 
1471 	uq->uq_pi_blocked = pi;
1472 	thread_lock(td);
1473 	td->td_flags |= TDF_UPIBLOCKED;
1474 	thread_unlock(td);
1475 	umtx_propagate_priority(td);
1476 	mtx_unlock_spin(&umtx_lock);
1477 	umtxq_unbusy(&uq->uq_key);
1478 
1479 	error = umtxq_sleep(uq, wmesg, timo);
1480 	umtxq_remove(uq);
1481 
1482 	mtx_lock_spin(&umtx_lock);
1483 	uq->uq_pi_blocked = NULL;
1484 	thread_lock(td);
1485 	td->td_flags &= ~TDF_UPIBLOCKED;
1486 	thread_unlock(td);
1487 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1488 	umtx_repropagate_priority(pi);
1489 	mtx_unlock_spin(&umtx_lock);
1490 	umtxq_unlock(&uq->uq_key);
1491 
1492 	return (error);
1493 }
1494 
1495 /*
1496  * Add reference count for a PI mutex.
1497  */
1498 static void
1499 umtx_pi_ref(struct umtx_pi *pi)
1500 {
1501 	struct umtxq_chain *uc;
1502 
1503 	uc = umtxq_getchain(&pi->pi_key);
1504 	UMTXQ_LOCKED_ASSERT(uc);
1505 	pi->pi_refcount++;
1506 }
1507 
1508 /*
1509  * Decrease the reference count of a PI mutex; when the count
1510  * drops to zero, its memory is freed.
1511  */
1512 static void
1513 umtx_pi_unref(struct umtx_pi *pi)
1514 {
1515 	struct umtxq_chain *uc;
1516 
1517 	uc = umtxq_getchain(&pi->pi_key);
1518 	UMTXQ_LOCKED_ASSERT(uc);
1519 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1520 	if (--pi->pi_refcount == 0) {
1521 		mtx_lock_spin(&umtx_lock);
1522 		if (pi->pi_owner != NULL) {
1523 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1524 				pi, pi_link);
1525 			pi->pi_owner = NULL;
1526 		}
1527 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1528 			("blocked queue not empty"));
1529 		mtx_unlock_spin(&umtx_lock);
1530 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1531 		umtx_pi_free(pi);
1532 	}
1533 }
1534 
1535 /*
1536  * Find a PI mutex in the hash table.
1537  */
1538 static struct umtx_pi *
1539 umtx_pi_lookup(struct umtx_key *key)
1540 {
1541 	struct umtxq_chain *uc;
1542 	struct umtx_pi *pi;
1543 
1544 	uc = umtxq_getchain(key);
1545 	UMTXQ_LOCKED_ASSERT(uc);
1546 
1547 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1548 		if (umtx_key_match(&pi->pi_key, key)) {
1549 			return (pi);
1550 		}
1551 	}
1552 	return (NULL);
1553 }
1554 
1555 /*
1556  * Insert a PI mutex into the hash table.
1557  */
1558 static inline void
1559 umtx_pi_insert(struct umtx_pi *pi)
1560 {
1561 	struct umtxq_chain *uc;
1562 
1563 	uc = umtxq_getchain(&pi->pi_key);
1564 	UMTXQ_LOCKED_ASSERT(uc);
1565 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1566 }
1567 
1568 /*
1569  * Lock a PI mutex.
1570  */
1571 static int
1572 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1573     struct _umtx_time *timeout, int try)
1574 {
1575 	struct abs_timeout timo;
1576 	struct umtx_q *uq;
1577 	struct umtx_pi *pi, *new_pi;
1578 	uint32_t id, owner, old;
1579 	int error;
1580 
1581 	id = td->td_tid;
1582 	uq = td->td_umtxq;
1583 
1584 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1585 	    &uq->uq_key)) != 0)
1586 		return (error);
1587 
1588 	if (timeout != NULL)
1589 		abs_timeout_init2(&timo, timeout);
1590 
1591 	umtxq_lock(&uq->uq_key);
1592 	pi = umtx_pi_lookup(&uq->uq_key);
1593 	if (pi == NULL) {
1594 		new_pi = umtx_pi_alloc(M_NOWAIT);
1595 		if (new_pi == NULL) {
1596 			umtxq_unlock(&uq->uq_key);
1597 			new_pi = umtx_pi_alloc(M_WAITOK);
1598 			umtxq_lock(&uq->uq_key);
1599 			pi = umtx_pi_lookup(&uq->uq_key);
1600 			if (pi != NULL) {
1601 				umtx_pi_free(new_pi);
1602 				new_pi = NULL;
1603 			}
1604 		}
1605 		if (new_pi != NULL) {
1606 			new_pi->pi_key = uq->uq_key;
1607 			umtx_pi_insert(new_pi);
1608 			pi = new_pi;
1609 		}
1610 	}
1611 	umtx_pi_ref(pi);
1612 	umtxq_unlock(&uq->uq_key);
1613 
1614 	/*
1615 	 * Care must be exercised when dealing with the umtx structure.  It
1616 	 * can fault on any access.
1617 	 */
1618 	for (;;) {
1619 		/*
1620 		 * Try the uncontested case.  This should be done in userland.
1621 		 */
1622 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1623 
1624 		/* The acquire succeeded. */
1625 		if (owner == UMUTEX_UNOWNED) {
1626 			error = 0;
1627 			break;
1628 		}
1629 
1630 		/* The address was invalid. */
1631 		if (owner == -1) {
1632 			error = EFAULT;
1633 			break;
1634 		}
1635 
1636 		/* If no one owns it but it is contested, try to acquire it. */
1637 		if (owner == UMUTEX_CONTESTED) {
1638 			owner = casuword32(&m->m_owner,
1639 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1640 
1641 			if (owner == UMUTEX_CONTESTED) {
1642 				umtxq_lock(&uq->uq_key);
1643 				umtxq_busy(&uq->uq_key);
1644 				error = umtx_pi_claim(pi, td);
1645 				umtxq_unbusy(&uq->uq_key);
1646 				umtxq_unlock(&uq->uq_key);
1647 				break;
1648 			}
1649 
1650 			/* The address was invalid. */
1651 			if (owner == -1) {
1652 				error = EFAULT;
1653 				break;
1654 			}
1655 
1656 			error = umtxq_check_susp(td);
1657 			if (error != 0)
1658 				break;
1659 
1660 			/* If this failed, the lock has changed; restart. */
1661 			continue;
1662 		}
1663 
1664 		if (try != 0) {
1665 			error = EBUSY;
1666 			break;
1667 		}
1668 
1669 		/*
1670 		 * If we caught a signal, we have already retried; now
1671 		 * exit immediately.
1672 		 */
1673 		if (error != 0)
1674 			break;
1675 
1676 		umtxq_lock(&uq->uq_key);
1677 		umtxq_busy(&uq->uq_key);
1678 		umtxq_unlock(&uq->uq_key);
1679 
1680 		/*
1681 		 * Set the contested bit so that a release in user space
1682 		 * knows to use the system call for unlock.  If this fails
1683 		 * either someone else has acquired the lock or it has been
1684 		 * released.
1685 		 */
1686 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1687 
1688 		/* The address was invalid. */
1689 		if (old == -1) {
1690 			umtxq_lock(&uq->uq_key);
1691 			umtxq_unbusy(&uq->uq_key);
1692 			umtxq_unlock(&uq->uq_key);
1693 			error = EFAULT;
1694 			break;
1695 		}
1696 
1697 		umtxq_lock(&uq->uq_key);
1698 		/*
1699 		 * If we set the contested bit, sleep.  Otherwise the lock
1700 		 * changed and we need to retry, or we lost a race with the
1701 		 * thread unlocking the umtx.
1702 		 */
1703 		if (old == owner)
1704 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1705 			    "umtxpi", timeout == NULL ? NULL : &timo);
1706 		else {
1707 			umtxq_unbusy(&uq->uq_key);
1708 			umtxq_unlock(&uq->uq_key);
1709 		}
1710 
1711 		error = umtxq_check_susp(td);
1712 		if (error != 0)
1713 			break;
1714 	}
1715 
1716 	umtxq_lock(&uq->uq_key);
1717 	umtx_pi_unref(pi);
1718 	umtxq_unlock(&uq->uq_key);
1719 
1720 	umtx_key_release(&uq->uq_key);
1721 	return (error);
1722 }
1723 
1724 /*
1725  * Unlock a PI mutex.
1726  */
1727 static int
1728 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1729 {
1730 	struct umtx_key key;
1731 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1732 	struct umtx_pi *pi, *pi2;
1733 	uint32_t owner, old, id;
1734 	int error;
1735 	int count;
1736 	int pri;
1737 
1738 	id = td->td_tid;
1739 	/*
1740 	 * Make sure we own this mtx.
1741 	 */
1742 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1743 	if (owner == -1)
1744 		return (EFAULT);
1745 
1746 	if ((owner & ~UMUTEX_CONTESTED) != id)
1747 		return (EPERM);
1748 
1749 	/* This should be done in userland */
1750 	if ((owner & UMUTEX_CONTESTED) == 0) {
1751 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1752 		if (old == -1)
1753 			return (EFAULT);
1754 		if (old == owner)
1755 			return (0);
1756 		owner = old;
1757 	}
1758 
1759 	/* We should only ever be in here for contested locks */
1760 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1761 	    &key)) != 0)
1762 		return (error);
1763 
1764 	umtxq_lock(&key);
1765 	umtxq_busy(&key);
1766 	count = umtxq_count_pi(&key, &uq_first);
1767 	if (uq_first != NULL) {
1768 		mtx_lock_spin(&umtx_lock);
1769 		pi = uq_first->uq_pi_blocked;
1770 		KASSERT(pi != NULL, ("pi == NULL?"));
1771 		if (pi->pi_owner != curthread) {
1772 			mtx_unlock_spin(&umtx_lock);
1773 			umtxq_unbusy(&key);
1774 			umtxq_unlock(&key);
1775 			umtx_key_release(&key);
1776 			/* userland messed the mutex */
1777 			return (EPERM);
1778 		}
1779 		uq_me = curthread->td_umtxq;
1780 		pi->pi_owner = NULL;
1781 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1782 		/* Get the highest-priority thread that is still sleeping. */
1783 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1784 		while (uq_first != NULL &&
1785 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1786 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1787 		}
1788 		pri = PRI_MAX;
1789 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1790 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1791 			if (uq_first2 != NULL) {
1792 				if (pri > UPRI(uq_first2->uq_thread))
1793 					pri = UPRI(uq_first2->uq_thread);
1794 			}
1795 		}
1796 		thread_lock(curthread);
1797 		sched_lend_user_prio(curthread, pri);
1798 		thread_unlock(curthread);
1799 		mtx_unlock_spin(&umtx_lock);
1800 		if (uq_first)
1801 			umtxq_signal_thread(uq_first);
1802 	}
1803 	umtxq_unlock(&key);
1804 
1805 	/*
1806 	 * When unlocking the umtx, it must be marked as unowned if
1807 	 * there is zero or one thread only waiting for it.
1808 	 * Otherwise, it must be marked as contested.
1809 	 */
1810 	old = casuword32(&m->m_owner, owner,
1811 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1812 
1813 	umtxq_lock(&key);
1814 	umtxq_unbusy(&key);
1815 	umtxq_unlock(&key);
1816 	umtx_key_release(&key);
1817 	if (old == -1)
1818 		return (EFAULT);
1819 	if (old != owner)
1820 		return (EINVAL);
1821 	return (0);
1822 }
1823 
1824 /*
1825  * Lock a PP mutex.
1826  */
1827 static int
1828 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
1829     struct _umtx_time *timeout, int try)
1830 {
1831 	struct abs_timeout timo;
1832 	struct umtx_q *uq, *uq2;
1833 	struct umtx_pi *pi;
1834 	uint32_t ceiling;
1835 	uint32_t owner, id;
1836 	int error, pri, old_inherited_pri, su;
1837 
1838 	id = td->td_tid;
1839 	uq = td->td_umtxq;
1840 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1841 	    &uq->uq_key)) != 0)
1842 		return (error);
1843 
1844 	if (timeout != NULL)
1845 		abs_timeout_init2(&timo, timeout);
1846 
1847 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1848 	for (;;) {
1849 		old_inherited_pri = uq->uq_inherited_pri;
1850 		umtxq_lock(&uq->uq_key);
1851 		umtxq_busy(&uq->uq_key);
1852 		umtxq_unlock(&uq->uq_key);
1853 
1854 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1855 		if (ceiling > RTP_PRIO_MAX) {
1856 			error = EINVAL;
1857 			goto out;
1858 		}
1859 
1860 		mtx_lock_spin(&umtx_lock);
1861 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1862 			mtx_unlock_spin(&umtx_lock);
1863 			error = EINVAL;
1864 			goto out;
1865 		}
1866 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1867 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1868 			thread_lock(td);
1869 			if (uq->uq_inherited_pri < UPRI(td))
1870 				sched_lend_user_prio(td, uq->uq_inherited_pri);
1871 			thread_unlock(td);
1872 		}
1873 		mtx_unlock_spin(&umtx_lock);
1874 
1875 		owner = casuword32(&m->m_owner,
1876 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1877 
1878 		if (owner == UMUTEX_CONTESTED) {
1879 			error = 0;
1880 			break;
1881 		}
1882 
1883 		/* The address was invalid. */
1884 		if (owner == -1) {
1885 			error = EFAULT;
1886 			break;
1887 		}
1888 
1889 		if (try != 0) {
1890 			error = EBUSY;
1891 			break;
1892 		}
1893 
1894 		/*
1895 		 * If we caught a signal, we have already retried; now
1896 		 * exit immediately.
1897 		 */
1898 		if (error != 0)
1899 			break;
1900 
1901 		umtxq_lock(&uq->uq_key);
1902 		umtxq_insert(uq);
1903 		umtxq_unbusy(&uq->uq_key);
1904 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
1905 		    NULL : &timo);
1906 		umtxq_remove(uq);
1907 		umtxq_unlock(&uq->uq_key);
1908 
1909 		mtx_lock_spin(&umtx_lock);
1910 		uq->uq_inherited_pri = old_inherited_pri;
1911 		pri = PRI_MAX;
1912 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1913 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1914 			if (uq2 != NULL) {
1915 				if (pri > UPRI(uq2->uq_thread))
1916 					pri = UPRI(uq2->uq_thread);
1917 			}
1918 		}
1919 		if (pri > uq->uq_inherited_pri)
1920 			pri = uq->uq_inherited_pri;
1921 		thread_lock(td);
1922 		sched_lend_user_prio(td, pri);
1923 		thread_unlock(td);
1924 		mtx_unlock_spin(&umtx_lock);
1925 	}
1926 
1927 	if (error != 0) {
1928 		mtx_lock_spin(&umtx_lock);
1929 		uq->uq_inherited_pri = old_inherited_pri;
1930 		pri = PRI_MAX;
1931 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1932 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1933 			if (uq2 != NULL) {
1934 				if (pri > UPRI(uq2->uq_thread))
1935 					pri = UPRI(uq2->uq_thread);
1936 			}
1937 		}
1938 		if (pri > uq->uq_inherited_pri)
1939 			pri = uq->uq_inherited_pri;
1940 		thread_lock(td);
1941 		sched_lend_user_prio(td, pri);
1942 		thread_unlock(td);
1943 		mtx_unlock_spin(&umtx_lock);
1944 	}
1945 
1946 out:
1947 	umtxq_lock(&uq->uq_key);
1948 	umtxq_unbusy(&uq->uq_key);
1949 	umtxq_unlock(&uq->uq_key);
1950 	umtx_key_release(&uq->uq_key);
1951 	return (error);
1952 }
1953 
1954 /*
1955  * Unlock a PP mutex.
1956  */
1957 static int
1958 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
1959 {
1960 	struct umtx_key key;
1961 	struct umtx_q *uq, *uq2;
1962 	struct umtx_pi *pi;
1963 	uint32_t owner, id;
1964 	uint32_t rceiling;
1965 	int error, pri, new_inherited_pri, su;
1966 
1967 	id = td->td_tid;
1968 	uq = td->td_umtxq;
1969 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1970 
1971 	/*
1972 	 * Make sure we own this mtx.
1973 	 */
1974 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1975 	if (owner == -1)
1976 		return (EFAULT);
1977 
1978 	if ((owner & ~UMUTEX_CONTESTED) != id)
1979 		return (EPERM);
1980 
1981 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
1982 	if (error != 0)
1983 		return (error);
1984 
1985 	if (rceiling == -1)
1986 		new_inherited_pri = PRI_MAX;
1987 	else {
1988 		rceiling = RTP_PRIO_MAX - rceiling;
1989 		if (rceiling > RTP_PRIO_MAX)
1990 			return (EINVAL);
1991 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
1992 	}
1993 
1994 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1995 	    &key)) != 0)
1996 		return (error);
1997 	umtxq_lock(&key);
1998 	umtxq_busy(&key);
1999 	umtxq_unlock(&key);
2000 	/*
2001 	 * For a priority-protected mutex, always set the unlocked state
2002 	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2003 	 * to lock the mutex.  This is necessary because the thread
2004 	 * priority has to be adjusted for such a mutex.
2005 	 */
2006 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2007 		UMUTEX_CONTESTED);
2008 
2009 	umtxq_lock(&key);
2010 	if (error == 0)
2011 		umtxq_signal(&key, 1);
2012 	umtxq_unbusy(&key);
2013 	umtxq_unlock(&key);
2014 
2015 	if (error == -1)
2016 		error = EFAULT;
2017 	else {
2018 		mtx_lock_spin(&umtx_lock);
2019 		if (su != 0)
2020 			uq->uq_inherited_pri = new_inherited_pri;
2021 		pri = PRI_MAX;
2022 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2023 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2024 			if (uq2 != NULL) {
2025 				if (pri > UPRI(uq2->uq_thread))
2026 					pri = UPRI(uq2->uq_thread);
2027 			}
2028 		}
2029 		if (pri > uq->uq_inherited_pri)
2030 			pri = uq->uq_inherited_pri;
2031 		thread_lock(td);
2032 		sched_lend_user_prio(td, pri);
2033 		thread_unlock(td);
2034 		mtx_unlock_spin(&umtx_lock);
2035 	}
2036 	umtx_key_release(&key);
2037 	return (error);
2038 }
2039 
2040 static int
2041 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2042 	uint32_t *old_ceiling)
2043 {
2044 	struct umtx_q *uq;
2045 	uint32_t save_ceiling;
2046 	uint32_t owner, id;
2047 	uint32_t flags;
2048 	int error;
2049 
2050 	flags = fuword32(&m->m_flags);
2051 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2052 		return (EINVAL);
2053 	if (ceiling > RTP_PRIO_MAX)
2054 		return (EINVAL);
2055 	id = td->td_tid;
2056 	uq = td->td_umtxq;
2057 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2058 	   &uq->uq_key)) != 0)
2059 		return (error);
2060 	for (;;) {
2061 		umtxq_lock(&uq->uq_key);
2062 		umtxq_busy(&uq->uq_key);
2063 		umtxq_unlock(&uq->uq_key);
2064 
2065 		save_ceiling = fuword32(&m->m_ceilings[0]);
2066 
2067 		owner = casuword32(&m->m_owner,
2068 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2069 
2070 		if (owner == UMUTEX_CONTESTED) {
2071 			suword32(&m->m_ceilings[0], ceiling);
2072 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2073 				UMUTEX_CONTESTED);
2074 			error = 0;
2075 			break;
2076 		}
2077 
2078 		/* The address was invalid. */
2079 		if (owner == -1) {
2080 			error = EFAULT;
2081 			break;
2082 		}
2083 
2084 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2085 			suword32(&m->m_ceilings[0], ceiling);
2086 			error = 0;
2087 			break;
2088 		}
2089 
2090 		/*
2091 		 * If we were interrupted by a signal, we have already
2092 		 * retried above; exit immediately instead of sleeping.
2093 		 */
2094 		if (error != 0)
2095 			break;
2096 
2097 		/*
2098 		 * The mutex is held by another thread.  Sleep on the
2099 		 * umtx queue and retry once the owner wakes us up
2100 		 * after unlocking.
2101 		 */
2102 		umtxq_lock(&uq->uq_key);
2103 		umtxq_insert(uq);
2104 		umtxq_unbusy(&uq->uq_key);
2105 		error = umtxq_sleep(uq, "umtxpp", NULL);
2106 		umtxq_remove(uq);
2107 		umtxq_unlock(&uq->uq_key);
2108 	}
2109 	umtxq_lock(&uq->uq_key);
2110 	if (error == 0)
2111 		umtxq_signal(&uq->uq_key, INT_MAX);
2112 	umtxq_unbusy(&uq->uq_key);
2113 	umtxq_unlock(&uq->uq_key);
2114 	umtx_key_release(&uq->uq_key);
2115 	if (error == 0 && old_ceiling != NULL)
2116 		suword32(old_ceiling, save_ceiling);
2117 	return (error);
2118 }
2119 
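/*
 * Illustrative sketch (not compiled; assumes a umutex initialized with
 * UMUTEX_PRIO_PROTECT): driving do_set_ceiling() from userland through
 * the documented _umtx_op(2) entry point.  The previous ceiling is
 * written to *old_ceiling on success.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>

static int
set_pp_ceiling(struct umutex *mtx, uint32_t ceiling, uint32_t *old_ceiling)
{
	/* obj = mutex, val = new ceiling, uaddr1 = old-ceiling out-param. */
	return (_umtx_op(mtx, UMTX_OP_SET_CEILING, ceiling, old_ceiling,
	    NULL));
}
#endif
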
2120 /*
2121  * Lock a userland POSIX mutex.
2122  */
2123 static int
2124 do_lock_umutex(struct thread *td, struct umutex *m,
2125     struct _umtx_time *timeout, int mode)
2126 {
2127 	uint32_t flags;
2128 	int error;
2129 
2130 	flags = fuword32(&m->m_flags);
2131 	if (flags == -1)
2132 		return (EFAULT);
2133 
2134 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2135 	case 0:
2136 		error = do_lock_normal(td, m, flags, timeout, mode);
2137 		break;
2138 	case UMUTEX_PRIO_INHERIT:
2139 		error = do_lock_pi(td, m, flags, timeout, mode);
2140 		break;
2141 	case UMUTEX_PRIO_PROTECT:
2142 		error = do_lock_pp(td, m, flags, timeout, mode);
2143 		break;
2144 	default:
2145 		return (EINVAL);
2146 	}
2147 	if (timeout == NULL) {
2148 		if (error == EINTR && mode != _UMUTEX_WAIT)
2149 			error = ERESTART;
2150 	} else {
2151 		/* Timed-locking is not restarted. */
2152 		if (error == ERESTART)
2153 			error = EINTR;
2154 	}
2155 	return (error);
2156 }
2157 
2158 /*
2159  * Unlock a userland POSIX mutex.
2160  */
2161 static int
2162 do_unlock_umutex(struct thread *td, struct umutex *m)
2163 {
2164 	uint32_t flags;
2165 
2166 	flags = fuword32(&m->m_flags);
2167 	if (flags == -1)
2168 		return (EFAULT);
2169 
2170 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2171 	case 0:
2172 		return (do_unlock_normal(td, m, flags));
2173 	case UMUTEX_PRIO_INHERIT:
2174 		return (do_unlock_pi(td, m, flags));
2175 	case UMUTEX_PRIO_PROTECT:
2176 		return (do_unlock_pp(td, m, flags));
2177 	}
2178 
2179 	return (EINVAL);
2180 }
2181 
2182 static int
2183 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2184 	struct timespec *timeout, u_long wflags)
2185 {
2186 	struct abs_timeout timo;
2187 	struct umtx_q *uq;
2188 	uint32_t flags;
2189 	uint32_t clockid;
2190 	int error;
2191 
2192 	uq = td->td_umtxq;
2193 	flags = fuword32(&cv->c_flags);
2194 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2195 	if (error != 0)
2196 		return (error);
2197 
2198 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2199 		clockid = fuword32(&cv->c_clockid);
2200 		if (clockid < CLOCK_REALTIME ||
2201 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2202 			/* Only the predefined clock ids are valid. */
			umtx_key_release(&uq->uq_key);
2203 			return (EINVAL);
2204 		}
2205 	} else {
2206 		clockid = CLOCK_REALTIME;
2207 	}
2208 
2209 	umtxq_lock(&uq->uq_key);
2210 	umtxq_busy(&uq->uq_key);
2211 	umtxq_insert(uq);
2212 	umtxq_unlock(&uq->uq_key);
2213 
2214 	/*
2215 	 * Set c_has_waiters to 1 before releasing the user mutex, and
2216 	 * avoid dirtying the cache line when it is already set.
2217 	 */
2218 	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2219 		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2220 
2221 	umtxq_lock(&uq->uq_key);
2222 	umtxq_unbusy(&uq->uq_key);
2223 	umtxq_unlock(&uq->uq_key);
2224 
2225 	error = do_unlock_umutex(td, m);
2226 
2227 	if (timeout != NULL)
2228 		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2229 			timeout);
2230 
2231 	umtxq_lock(&uq->uq_key);
2232 	if (error == 0) {
2233 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2234 		    NULL : &timo);
2235 	}
2236 
2237 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2238 		error = 0;
2239 	else {
2240 		/*
2241 		 * This must be a timeout, an interruption by a signal,
2242 		 * or a spurious wakeup; clear the c_has_waiters flag
2243 		 * when necessary.
2244 		 */
2245 		umtxq_busy(&uq->uq_key);
2246 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2247 			int oldlen = uq->uq_cur_queue->length;
2248 			umtxq_remove(uq);
2249 			if (oldlen == 1) {
2250 				umtxq_unlock(&uq->uq_key);
2251 				suword32(
2252 				    __DEVOLATILE(uint32_t *,
2253 					 &cv->c_has_waiters), 0);
2254 				umtxq_lock(&uq->uq_key);
2255 			}
2256 		}
2257 		umtxq_unbusy(&uq->uq_key);
2258 		if (error == ERESTART)
2259 			error = EINTR;
2260 	}
2261 
2262 	umtxq_unlock(&uq->uq_key);
2263 	umtx_key_release(&uq->uq_key);
2264 	return (error);
2265 }
2266 
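/*
 * Illustrative sketch (not compiled): the userland side of do_cv_wait()
 * above.  The kernel queues the waiter and sets c_has_waiters before
 * unlocking the mutex, which closes the lost-wakeup window between the
 * unlock and the sleep.  CVWAIT_ABSTIME selects an absolute timeout; a
 * NULL timespec waits forever.
 */
#if 0
static int
cond_wait(struct ucond *cv, struct umutex *mtx, struct timespec *abstime)
{
	return (_umtx_op(cv, UMTX_OP_CV_WAIT,
	    abstime != NULL ? CVWAIT_ABSTIME : 0, mtx, abstime));
}
#endif
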
2267 /*
2268  * Signal a userland condition variable.
2269  */
2270 static int
2271 do_cv_signal(struct thread *td, struct ucond *cv)
2272 {
2273 	struct umtx_key key;
2274 	int error, cnt, nwake;
2275 	uint32_t flags;
2276 
2277 	flags = fuword32(&cv->c_flags);
2278 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2279 		return (error);
2280 	umtxq_lock(&key);
2281 	umtxq_busy(&key);
2282 	cnt = umtxq_count(&key);
2283 	nwake = umtxq_signal(&key, 1);
2284 	if (cnt <= nwake) {
2285 		umtxq_unlock(&key);
2286 		error = suword32(
2287 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2288 		umtxq_lock(&key);
2289 	}
2290 	umtxq_unbusy(&key);
2291 	umtxq_unlock(&key);
2292 	umtx_key_release(&key);
2293 	return (error);
2294 }
2295 
2296 static int
2297 do_cv_broadcast(struct thread *td, struct ucond *cv)
2298 {
2299 	struct umtx_key key;
2300 	int error;
2301 	uint32_t flags;
2302 
2303 	flags = fuword32(&cv->c_flags);
2304 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2305 		return (error);
2306 
2307 	umtxq_lock(&key);
2308 	umtxq_busy(&key);
2309 	umtxq_signal(&key, INT_MAX);
2310 	umtxq_unlock(&key);
2311 
2312 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2313 
2314 	umtxq_lock(&key);
2315 	umtxq_unbusy(&key);
2316 	umtxq_unlock(&key);
2317 
2318 	umtx_key_release(&key);
2319 	return (error);
2320 }
2321 
2322 static int
2323 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2324 {
2325 	struct abs_timeout timo;
2326 	struct umtx_q *uq;
2327 	uint32_t flags, wrflags;
2328 	int32_t state, oldstate;
2329 	int32_t blocked_readers;
2330 	int error;
2331 
2332 	uq = td->td_umtxq;
2333 	flags = fuword32(&rwlock->rw_flags);
2334 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2335 	if (error != 0)
2336 		return (error);
2337 
2338 	if (timeout != NULL)
2339 		abs_timeout_init2(&timo, timeout);
2340 
2341 	wrflags = URWLOCK_WRITE_OWNER;
2342 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2343 		wrflags |= URWLOCK_WRITE_WAITERS;
2344 
2345 	for (;;) {
2346 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2347 		/* try to lock it */
2348 		while (!(state & wrflags)) {
2349 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2350 				umtx_key_release(&uq->uq_key);
2351 				return (EAGAIN);
2352 			}
2353 			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2354 			if (oldstate == -1) {
2355 				umtx_key_release(&uq->uq_key);
2356 				return (EFAULT);
2357 			}
2358 			if (oldstate == state) {
2359 				umtx_key_release(&uq->uq_key);
2360 				return (0);
2361 			}
2362 			error = umtxq_check_susp(td);
2363 			if (error != 0)
2364 				break;
2365 			state = oldstate;
2366 		}
2367 
2368 		if (error)
2369 			break;
2370 
2371 		/* grab monitor lock */
2372 		umtxq_lock(&uq->uq_key);
2373 		umtxq_busy(&uq->uq_key);
2374 		umtxq_unlock(&uq->uq_key);
2375 
2376 		/*
2377 		 * re-read the state, in case it changed between the try-lock above
2378 		 * and the check below
2379 		 */
2380 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2381 
2382 		/* set read contention bit */
2383 		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2384 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2385 			if (oldstate == -1) {
2386 				error = EFAULT;
2387 				break;
2388 			}
2389 			if (oldstate == state)
2390 				goto sleep;
2391 			state = oldstate;
2392 			error = umtxq_check_susp(td);
2393 			if (error != 0)
2394 				break;
2395 		}
2396 		if (error != 0) {
2397 			umtxq_lock(&uq->uq_key);
2398 			umtxq_unbusy(&uq->uq_key);
2399 			umtxq_unlock(&uq->uq_key);
2400 			break;
2401 		}
2402 
2403 		/* The state changed while we were setting the flags; restart. */
2404 		if (!(state & wrflags)) {
2405 			umtxq_lock(&uq->uq_key);
2406 			umtxq_unbusy(&uq->uq_key);
2407 			umtxq_unlock(&uq->uq_key);
2408 			error = umtxq_check_susp(td);
2409 			if (error != 0)
2410 				break;
2411 			continue;
2412 		}
2413 
2414 sleep:
2415 		/* Contention bit is set; bump the read waiter count before sleeping. */
2416 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2417 		suword32(&rwlock->rw_blocked_readers, blocked_readers + 1);
2418 
2419 		while (state & wrflags) {
2420 			umtxq_lock(&uq->uq_key);
2421 			umtxq_insert(uq);
2422 			umtxq_unbusy(&uq->uq_key);
2423 
2424 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2425 			    NULL : &timo);
2426 
2427 			umtxq_busy(&uq->uq_key);
2428 			umtxq_remove(uq);
2429 			umtxq_unlock(&uq->uq_key);
2430 			if (error)
2431 				break;
2432 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2433 		}
2434 
2435 		/* Decrease the read waiter count; possibly clear the contention bit. */
2436 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2437 		suword32(&rwlock->rw_blocked_readers, blocked_readers - 1);
2438 		if (blocked_readers == 1) {
2439 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2440 			for (;;) {
2441 				oldstate = casuword32(&rwlock->rw_state, state,
2442 					 state & ~URWLOCK_READ_WAITERS);
2443 				if (oldstate == -1) {
2444 					error = EFAULT;
2445 					break;
2446 				}
2447 				if (oldstate == state)
2448 					break;
2449 				state = oldstate;
2450 				error = umtxq_check_susp(td);
2451 				if (error != 0)
2452 					break;
2453 			}
2454 		}
2455 
2456 		umtxq_lock(&uq->uq_key);
2457 		umtxq_unbusy(&uq->uq_key);
2458 		umtxq_unlock(&uq->uq_key);
2459 		if (error != 0)
2460 			break;
2461 	}
2462 	umtx_key_release(&uq->uq_key);
2463 	if (error == ERESTART)
2464 		error = EINTR;
2465 	return (error);
2466 }
2467 
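/*
 * Illustrative sketch (not compiled): the rw_state word decoded by the
 * loops above.  Per <sys/umtx.h>, the low bits count readers
 * (URWLOCK_READER_COUNT) while the URWLOCK_WRITE_OWNER,
 * URWLOCK_WRITE_WAITERS and URWLOCK_READ_WAITERS bits live at the top,
 * so an uncontended userland read lock is a single compare-and-set,
 * mirroring the kernel's casuword32() step:
 */
#if 0
static int
rw_tryrdlock_fast(struct urwlock *rw)
{
	int32_t state = rw->rw_state;

	if ((state & (URWLOCK_WRITE_OWNER | URWLOCK_WRITE_WAITERS)) != 0 ||
	    URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)
		return (EBUSY);
	return (atomic_cmpset_32((volatile uint32_t *)&rw->rw_state,
	    state, state + 1) ? 0 : EBUSY);
}
#endif
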
2468 static int
2469 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2470 {
2471 	struct abs_timeout timo;
2472 	struct umtx_q *uq;
2473 	uint32_t flags;
2474 	int32_t state, oldstate;
2475 	int32_t blocked_writers;
2476 	int32_t blocked_readers;
2477 	int error;
2478 
2479 	uq = td->td_umtxq;
2480 	flags = fuword32(&rwlock->rw_flags);
2481 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2482 	if (error != 0)
2483 		return (error);
2484 
2485 	if (timeout != NULL)
2486 		abs_timeout_init2(&timo, timeout);
2487 
2488 	blocked_readers = 0;
2489 	for (;;) {
2490 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2491 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2492 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2493 			if (oldstate == -1) {
2494 				umtx_key_release(&uq->uq_key);
2495 				return (EFAULT);
2496 			}
2497 			if (oldstate == state) {
2498 				umtx_key_release(&uq->uq_key);
2499 				return (0);
2500 			}
2501 			state = oldstate;
2502 			error = umtxq_check_susp(td);
2503 			if (error != 0)
2504 				break;
2505 		}
2506 
2507 		if (error) {
2508 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2509 			    blocked_readers != 0) {
2510 				umtxq_lock(&uq->uq_key);
2511 				umtxq_busy(&uq->uq_key);
2512 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2513 				umtxq_unbusy(&uq->uq_key);
2514 				umtxq_unlock(&uq->uq_key);
2515 			}
2516 
2517 			break;
2518 		}
2519 
2520 		/* grab monitor lock */
2521 		umtxq_lock(&uq->uq_key);
2522 		umtxq_busy(&uq->uq_key);
2523 		umtxq_unlock(&uq->uq_key);
2524 
2525 		/*
2526 		 * re-read the state, in case it changed between the try-lock above
2527 		 * and the check below
2528 		 */
2529 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2530 
2531 		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2532 		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2533 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2534 			if (oldstate == -1) {
2535 				error = EFAULT;
2536 				break;
2537 			}
2538 			if (oldstate == state)
2539 				goto sleep;
2540 			state = oldstate;
2541 			error = umtxq_check_susp(td);
2542 			if (error != 0)
2543 				break;
2544 		}
2545 		if (error != 0) {
2546 			umtxq_lock(&uq->uq_key);
2547 			umtxq_unbusy(&uq->uq_key);
2548 			umtxq_unlock(&uq->uq_key);
2549 			break;
2550 		}
2551 
2552 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2553 			umtxq_lock(&uq->uq_key);
2554 			umtxq_unbusy(&uq->uq_key);
2555 			umtxq_unlock(&uq->uq_key);
2556 			error = umtxq_check_susp(td);
2557 			if (error != 0)
2558 				break;
2559 			continue;
2560 		}
2561 sleep:
2562 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2563 		suword32(&rwlock->rw_blocked_writers, blocked_writers + 1);
2564 
2565 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2566 			umtxq_lock(&uq->uq_key);
2567 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2568 			umtxq_unbusy(&uq->uq_key);
2569 
2570 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2571 			    NULL : &timo);
2572 
2573 			umtxq_busy(&uq->uq_key);
2574 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2575 			umtxq_unlock(&uq->uq_key);
2576 			if (error)
2577 				break;
2578 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2579 		}
2580 
2581 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2582 		suword32(&rwlock->rw_blocked_writers, blocked_writers - 1);
2583 		if (blocked_writers == 1) {
2584 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2585 			for (;;) {
2586 				oldstate = casuword32(&rwlock->rw_state, state,
2587 					 state & ~URWLOCK_WRITE_WAITERS);
2588 				if (oldstate == -1) {
2589 					error = EFAULT;
2590 					break;
2591 				}
2592 				if (oldstate == state)
2593 					break;
2594 				state = oldstate;
2595 				error = umtxq_check_susp(td);
2596 				/*
2597 				 * We may leave the URWLOCK_WRITE_WAITERS
2598 				 * bit set behind, but this does not harm
2599 				 * correctness.
2600 				 */
2601 				if (error != 0)
2602 					break;
2603 			}
2604 			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2605 		} else
2606 			blocked_readers = 0;
2607 
2608 		umtxq_lock(&uq->uq_key);
2609 		umtxq_unbusy(&uq->uq_key);
2610 		umtxq_unlock(&uq->uq_key);
2611 	}
2612 
2613 	umtx_key_release(&uq->uq_key);
2614 	if (error == ERESTART)
2615 		error = EINTR;
2616 	return (error);
2617 }
2618 
2619 static int
2620 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2621 {
2622 	struct umtx_q *uq;
2623 	uint32_t flags;
2624 	int32_t state, oldstate;
2625 	int error, q, count;
2626 
2627 	uq = td->td_umtxq;
2628 	flags = fuword32(&rwlock->rw_flags);
2629 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2630 	if (error != 0)
2631 		return (error);
2632 
2633 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2634 	if (state & URWLOCK_WRITE_OWNER) {
2635 		for (;;) {
2636 			oldstate = casuword32(&rwlock->rw_state, state,
2637 				state & ~URWLOCK_WRITE_OWNER);
2638 			if (oldstate == -1) {
2639 				error = EFAULT;
2640 				goto out;
2641 			}
2642 			if (oldstate != state) {
2643 				state = oldstate;
2644 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2645 					error = EPERM;
2646 					goto out;
2647 				}
2648 				error = umtxq_check_susp(td);
2649 				if (error != 0)
2650 					goto out;
2651 			} else
2652 				break;
2653 		}
2654 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2655 		for (;;) {
2656 			oldstate = casuword32(&rwlock->rw_state, state,
2657 				state - 1);
2658 			if (oldstate == -1) {
2659 				error = EFAULT;
2660 				goto out;
2661 			}
2662 			if (oldstate != state) {
2663 				state = oldstate;
2664 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2665 					error = EPERM;
2666 					goto out;
2667 				}
2668 				error = umtxq_check_susp(td);
2669 				if (error != 0)
2670 					goto out;
2671 			} else
2672 				break;
2673 		}
2674 	} else {
2675 		error = EPERM;
2676 		goto out;
2677 	}
2678 
2679 	count = 0;
2680 
2681 	if (!(flags & URWLOCK_PREFER_READER)) {
2682 		if (state & URWLOCK_WRITE_WAITERS) {
2683 			count = 1;
2684 			q = UMTX_EXCLUSIVE_QUEUE;
2685 		} else if (state & URWLOCK_READ_WAITERS) {
2686 			count = INT_MAX;
2687 			q = UMTX_SHARED_QUEUE;
2688 		}
2689 	} else {
2690 		if (state & URWLOCK_READ_WAITERS) {
2691 			count = INT_MAX;
2692 			q = UMTX_SHARED_QUEUE;
2693 		} else if (state & URWLOCK_WRITE_WAITERS) {
2694 			count = 1;
2695 			q = UMTX_EXCLUSIVE_QUEUE;
2696 		}
2697 	}
2698 
2699 	if (count) {
2700 		umtxq_lock(&uq->uq_key);
2701 		umtxq_busy(&uq->uq_key);
2702 		umtxq_signal_queue(&uq->uq_key, count, q);
2703 		umtxq_unbusy(&uq->uq_key);
2704 		umtxq_unlock(&uq->uq_key);
2705 	}
2706 out:
2707 	umtx_key_release(&uq->uq_key);
2708 	return (error);
2709 }
2710 
2711 static int
2712 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2713 {
2714 	struct abs_timeout timo;
2715 	struct umtx_q *uq;
2716 	uint32_t flags, count;
2717 	int error;
2718 
2719 	uq = td->td_umtxq;
2720 	flags = fuword32(&sem->_flags);
2721 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2722 	if (error != 0)
2723 		return (error);
2724 
2725 	if (timeout != NULL)
2726 		abs_timeout_init2(&timo, timeout);
2727 
2728 	umtxq_lock(&uq->uq_key);
2729 	umtxq_busy(&uq->uq_key);
2730 	umtxq_insert(uq);
2731 	umtxq_unlock(&uq->uq_key);
2732 	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2733 	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2734 	if (count != 0) {
2735 		umtxq_lock(&uq->uq_key);
2736 		umtxq_unbusy(&uq->uq_key);
2737 		umtxq_remove(uq);
2738 		umtxq_unlock(&uq->uq_key);
2739 		umtx_key_release(&uq->uq_key);
2740 		return (0);
2741 	}
2742 	umtxq_lock(&uq->uq_key);
2743 	umtxq_unbusy(&uq->uq_key);
2744 
2745 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2746 
2747 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2748 		error = 0;
2749 	else {
2750 		umtxq_remove(uq);
2751 		/* A relative timeout cannot be restarted. */
2752 		if (error == ERESTART && timeout != NULL &&
2753 		    (timeout->_flags & UMTX_ABSTIME) == 0)
2754 			error = EINTR;
2755 	}
2756 	umtxq_unlock(&uq->uq_key);
2757 	umtx_key_release(&uq->uq_key);
2758 	return (error);
2759 }
2760 
2761 /*
2762  * Wake up threads waiting on a userland semaphore.
2763  */
2764 static int
2765 do_sem_wake(struct thread *td, struct _usem *sem)
2766 {
2767 	struct umtx_key key;
2768 	int error, cnt;
2769 	uint32_t flags;
2770 
2771 	flags = fuword32(&sem->_flags);
2772 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2773 		return (error);
2774 	umtxq_lock(&key);
2775 	umtxq_busy(&key);
2776 	cnt = umtxq_count(&key);
2777 	if (cnt > 0) {
2778 		umtxq_signal(&key, 1);
2779 		/*
2780 		 * A non-zero count means the memory is still referenced
2781 		 * by user code, so the store below is safe.  If we woke
2782 		 * the last waiter (cnt == 1), clear the _has_waiters flag.
2783 		 */
2784 		if (cnt == 1) {
2785 			umtxq_unlock(&key);
2786 			error = suword32(
2787 			    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2788 			umtxq_lock(&key);
2789 		}
2790 	}
2791 	umtxq_unbusy(&key);
2792 	umtxq_unlock(&key);
2793 	umtx_key_release(&key);
2794 	return (error);
2795 }
2796 
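/*
 * Illustrative sketch (not compiled): the userland post operation that
 * pairs with do_sem_wait()/do_sem_wake() above.  The waiter publishes
 * _has_waiters before re-checking _count, so the poster only needs to
 * enter the kernel when the flag is set.
 */
#if 0
static int
usem_post(struct _usem *sem)
{
	atomic_add_rel_32(&sem->_count, 1);
	if (sem->_has_waiters)
		return (_umtx_op(sem, UMTX_OP_SEM_WAKE, 0, NULL, NULL));
	return (0);
}
#endif
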
2797 inline int
2798 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
2799 {
2800 	int error;
2801 
2802 	error = copyin(addr, tsp, sizeof(struct timespec));
2803 	if (error == 0) {
2804 		if (tsp->tv_sec < 0 ||
2805 		    tsp->tv_nsec >= 1000000000 ||
2806 		    tsp->tv_nsec < 0)
2807 			error = EINVAL;
2808 	}
2809 	return (error);
2810 }
2811 
2812 static inline int
2813 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
2814 {
2815 	int error;
2816 
2817 	if (size <= sizeof(struct timespec)) {
2818 		tp->_clockid = CLOCK_REALTIME;
2819 		tp->_flags = 0;
2820 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
2821 	} else
2822 		error = copyin(addr, tp, sizeof(struct _umtx_time));
2823 	if (error != 0)
2824 		return (error);
2825 	if (tp->_timeout.tv_sec < 0 ||
2826 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
2827 		return (EINVAL);
2828 	return (0);
2829 }
2830 
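/*
 * Illustrative sketch (not compiled): callers pass the size of the
 * timeout object through uaddr1 (note the (size_t)uap->uaddr1 casts
 * below).  A bare timespec selects a relative CLOCK_REALTIME wait; a
 * full _umtx_time carries flags and a clock id, e.g. an absolute
 * CLOCK_MONOTONIC deadline:
 */
#if 0
static int
wait_abs_monotonic(uint32_t *word, uint32_t expected,
    const struct timespec *deadline)
{
	struct _umtx_time t;

	t._timeout = *deadline;
	t._flags = UMTX_ABSTIME;
	t._clockid = CLOCK_MONOTONIC;
	return (_umtx_op(word, UMTX_OP_WAIT_UINT_PRIVATE, expected,
	    (void *)(uintptr_t)sizeof(t), &t));
}
#endif
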
2831 static int
2832 __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
2833 {
2834 
2835 	return (EOPNOTSUPP);
2836 }
2837 
2838 static int
2839 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2840 {
2841 	struct _umtx_time timeout, *tm_p;
2842 	int error;
2843 
2844 	if (uap->uaddr2 == NULL)
2845 		tm_p = NULL;
2846 	else {
2847 		error = umtx_copyin_umtx_time(
2848 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2849 		if (error != 0)
2850 			return (error);
2851 		tm_p = &timeout;
2852 	}
2853 	return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0));
2854 }
2855 
2856 static int
2857 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2858 {
2859 	struct _umtx_time timeout, *tm_p;
2860 	int error;
2861 
2862 	if (uap->uaddr2 == NULL)
2863 		tm_p = NULL;
2864 	else {
2865 		error = umtx_copyin_umtx_time(
2866 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2867 		if (error != 0)
2868 			return (error);
2869 		tm_p = &timeout;
2870 	}
2871 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
2872 }
2873 
2874 static int
2875 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
2876 {
2877 	struct _umtx_time *tm_p, timeout;
2878 	int error;
2879 
2880 	if (uap->uaddr2 == NULL)
2881 		tm_p = NULL;
2882 	else {
2883 		error = umtx_copyin_umtx_time(
2884 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2885 		if (error != 0)
2886 			return (error);
2887 		tm_p = &timeout;
2888 	}
2889 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
2890 }
2891 
2892 static int
2893 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2894 {
2895 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
2896 }
2897 
2898 #define BATCH_SIZE	128
2899 static int
2900 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
2901 {
2902 	int count = uap->val;
2903 	void *uaddrs[BATCH_SIZE];
2904 	char **upp = (char **)uap->obj;
2905 	int tocopy;
2906 	int error = 0;
2907 	int i, pos = 0;
2908 
2909 	while (count > 0) {
2910 		tocopy = count;
2911 		if (tocopy > BATCH_SIZE)
2912 			tocopy = BATCH_SIZE;
2913 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
2914 		if (error != 0)
2915 			break;
2916 		for (i = 0; i < tocopy; ++i)
2917 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
2918 		count -= tocopy;
2919 		pos += tocopy;
2920 	}
2921 	return (error);
2922 }
2923 
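/*
 * Illustrative sketch (not compiled): batched wakeup in the style of
 * libthr's deferred-wakeup queue.  The kernel loop above drains the
 * user array in BATCH_SIZE chunks, so one syscall can wake an
 * arbitrary number of private words.
 */
#if 0
static int
wake_all(void **addrs, int naddrs)
{
	return (_umtx_op(addrs, UMTX_OP_NWAKE_PRIVATE, naddrs, NULL, NULL));
}
#endif
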
2924 static int
2925 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
2926 {
2927 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
2928 }
2929 
2930 static int
2931 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2932 {
2933 	struct _umtx_time *tm_p, timeout;
2934 	int error;
2935 
2936 	/* Allow a null timespec (wait forever). */
2937 	if (uap->uaddr2 == NULL)
2938 		tm_p = NULL;
2939 	else {
2940 		error = umtx_copyin_umtx_time(
2941 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2942 		if (error != 0)
2943 			return (error);
2944 		tm_p = &timeout;
2945 	}
2946 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
2947 }
2948 
2949 static int
2950 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2951 {
2952 	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
2953 }
2954 
2955 static int
2956 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
2957 {
2958 	struct _umtx_time *tm_p, timeout;
2959 	int error;
2960 
2961 	/* Allow a null timespec (wait forever). */
2962 	if (uap->uaddr2 == NULL)
2963 		tm_p = NULL;
2964 	else {
2965 		error = umtx_copyin_umtx_time(
2966 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2967 		if (error != 0)
2968 			return (error);
2969 		tm_p = &timeout;
2970 	}
2971 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
2972 }
2973 
2974 static int
2975 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
2976 {
2977 	return (do_wake_umutex(td, uap->obj));
2978 }
2979 
2980 static int
2981 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2982 {
2983 	return (do_unlock_umutex(td, uap->obj));
2984 }
2985 
2986 static int
2987 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2988 {
2989 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
2990 }
2991 
2992 static int
2993 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
2994 {
2995 	struct timespec *ts, timeout;
2996 	int error;
2997 
2998 	/* Allow a null timespec (wait forever). */
2999 	if (uap->uaddr2 == NULL)
3000 		ts = NULL;
3001 	else {
3002 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3003 		if (error != 0)
3004 			return (error);
3005 		ts = &timeout;
3006 	}
3007 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3008 }
3009 
3010 static int
3011 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3012 {
3013 	return (do_cv_signal(td, uap->obj));
3014 }
3015 
3016 static int
3017 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3018 {
3019 	return (do_cv_broadcast(td, uap->obj));
3020 }
3021 
3022 static int
3023 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3024 {
3025 	struct _umtx_time timeout;
3026 	int error;
3027 
3028 	/* Allow a null timespec (wait forever). */
3029 	if (uap->uaddr2 == NULL) {
3030 		error = do_rw_rdlock(td, uap->obj, uap->val, NULL);
3031 	} else {
3032 		error = umtx_copyin_umtx_time(uap->uaddr2,
3033 		   (size_t)uap->uaddr1, &timeout);
3034 		if (error != 0)
3035 			return (error);
3036 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3037 	}
3038 	return (error);
3039 }
3040 
3041 static int
3042 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3043 {
3044 	struct _umtx_time timeout;
3045 	int error;
3046 
3047 	/* Allow a null timespec (wait forever). */
3048 	if (uap->uaddr2 == NULL) {
3049 		error = do_rw_wrlock(td, uap->obj, NULL);
3050 	} else {
3051 		error = umtx_copyin_umtx_time(uap->uaddr2,
3052 		   (size_t)uap->uaddr1, &timeout);
3053 		if (error != 0)
3054 			return (error);
3055 
3056 		error = do_rw_wrlock(td, uap->obj, &timeout);
3057 	}
3058 	return (error);
3059 }
3060 
3061 static int
3062 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3063 {
3064 	return (do_rw_unlock(td, uap->obj));
3065 }
3066 
3067 static int
3068 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3069 {
3070 	struct _umtx_time *tm_p, timeout;
3071 	int error;
3072 
3073 	/* Allow a null timespec (wait forever). */
3074 	if (uap->uaddr2 == NULL)
3075 		tm_p = NULL;
3076 	else {
3077 		error = umtx_copyin_umtx_time(
3078 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3079 		if (error != 0)
3080 			return (error);
3081 		tm_p = &timeout;
3082 	}
3083 	return (do_sem_wait(td, uap->obj, tm_p));
3084 }
3085 
3086 static int
3087 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3088 {
3089 	return (do_sem_wake(td, uap->obj));
3090 }
3091 
3092 static int
3093 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3094 {
3095 	return (do_wake2_umutex(td, uap->obj, uap->val));
3096 }
3097 
3098 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3099 
3100 static _umtx_op_func op_table[] = {
3101 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED0 */
3102 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED1 */
3103 	__umtx_op_wait,			/* UMTX_OP_WAIT */
3104 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3105 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3106 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3107 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3108 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3109 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3110 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3111 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3112 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3113 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3114 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3115 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3116 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3117 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3118 	__umtx_op_wait_umutex,		/* UMTX_OP_MUTEX_WAIT */
3119 	__umtx_op_wake_umutex,		/* UMTX_OP_MUTEX_WAKE */
3120 	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3121 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3122 	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
3123 	__umtx_op_wake2_umutex		/* UMTX_OP_MUTEX_WAKE2 */
3124 };
3125 
3126 int
3127 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3128 {
3129 	if ((unsigned)uap->op < UMTX_OP_MAX)
3130 		return (*op_table[uap->op])(td, uap);
3131 	return (EINVAL);
3132 }
3133 
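/*
 * Illustrative sketch (not compiled): minimal futex-style use of the
 * dispatch table above.  UMTX_OP_WAIT_UINT sleeps only while *word
 * still equals the expected value, so the re-check loop is free of
 * lost wakeups.
 */
#if 0
static void
wait_on_word(uint32_t *word, uint32_t expected)
{
	while (*word == expected)
		_umtx_op(word, UMTX_OP_WAIT_UINT, expected, NULL, NULL);
}

static void
wake_one(uint32_t *word)
{
	_umtx_op(word, UMTX_OP_WAKE, 1, NULL, NULL);
}
#endif
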
3134 #ifdef COMPAT_FREEBSD32
3135 
3136 struct timespec32 {
3137 	int32_t tv_sec;
3138 	int32_t tv_nsec;
3139 };
3140 
3141 struct umtx_time32 {
3142 	struct	timespec32	timeout;
3143 	uint32_t		flags;
3144 	uint32_t		clockid;
3145 };
3146 
3147 static inline int
3148 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3149 {
3150 	struct timespec32 ts32;
3151 	int error;
3152 
3153 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3154 	if (error == 0) {
3155 		if (ts32.tv_sec < 0 ||
3156 		    ts32.tv_nsec >= 1000000000 ||
3157 		    ts32.tv_nsec < 0)
3158 			error = EINVAL;
3159 		else {
3160 			tsp->tv_sec = ts32.tv_sec;
3161 			tsp->tv_nsec = ts32.tv_nsec;
3162 		}
3163 	}
3164 	return (error);
3165 }
3166 
3167 static inline int
3168 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3169 {
3170 	struct umtx_time32 t32;
3171 	int error;
3172 
3173 	t32.clockid = CLOCK_REALTIME;
3174 	t32.flags   = 0;
3175 	if (size <= sizeof(struct timespec32))
3176 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3177 	else
3178 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3179 	if (error != 0)
3180 		return (error);
3181 	if (t32.timeout.tv_sec < 0 ||
3182 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3183 		return (EINVAL);
3184 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3185 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3186 	tp->_flags = t32.flags;
3187 	tp->_clockid = t32.clockid;
3188 	return (0);
3189 }
3190 
3191 static int
3192 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3193 {
3194 	struct _umtx_time *tm_p, timeout;
3195 	int error;
3196 
3197 	if (uap->uaddr2 == NULL)
3198 		tm_p = NULL;
3199 	else {
3200 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3201 			(size_t)uap->uaddr1, &timeout);
3202 		if (error != 0)
3203 			return (error);
3204 		tm_p = &timeout;
3205 	}
3206 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
3207 }
3208 
3209 static int
3210 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3211 {
3212 	struct _umtx_time *tm_p, timeout;
3213 	int error;
3214 
3215 	/* Allow a null timespec (wait forever). */
3216 	if (uap->uaddr2 == NULL)
3217 		tm_p = NULL;
3218 	else {
3219 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3220 		    (size_t)uap->uaddr1, &timeout);
3221 		if (error != 0)
3222 			return (error);
3223 		tm_p = &timeout;
3224 	}
3225 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
3226 }
3227 
3228 static int
3229 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3230 {
3231 	struct _umtx_time *tm_p, timeout;
3232 	int error;
3233 
3234 	/* Allow a null timespec (wait forever). */
3235 	if (uap->uaddr2 == NULL)
3236 		tm_p = NULL;
3237 	else {
3238 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3239 		    (size_t)uap->uaddr1, &timeout);
3240 		if (error != 0)
3241 			return (error);
3242 		tm_p = &timeout;
3243 	}
3244 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
3245 }
3246 
3247 static int
3248 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3249 {
3250 	struct timespec *ts, timeout;
3251 	int error;
3252 
3253 	/* Allow a null timespec (wait forever). */
3254 	if (uap->uaddr2 == NULL)
3255 		ts = NULL;
3256 	else {
3257 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3258 		if (error != 0)
3259 			return (error);
3260 		ts = &timeout;
3261 	}
3262 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3263 }
3264 
3265 static int
3266 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3267 {
3268 	struct _umtx_time timeout;
3269 	int error;
3270 
3271 	/* Allow a null timespec (wait forever). */
3272 	if (uap->uaddr2 == NULL) {
3273 		error = do_rw_rdlock(td, uap->obj, uap->val, NULL);
3274 	} else {
3275 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3276 		    (size_t)uap->uaddr1, &timeout);
3277 		if (error != 0)
3278 			return (error);
3279 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3280 	}
3281 	return (error);
3282 }
3283 
3284 static int
3285 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3286 {
3287 	struct _umtx_time timeout;
3288 	int error;
3289 
3290 	/* Allow a null timespec (wait forever). */
3291 	if (uap->uaddr2 == NULL) {
3292 		error = do_rw_wrlock(td, uap->obj, NULL);
3293 	} else {
3294 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3295 		    (size_t)uap->uaddr1, &timeout);
3296 		if (error != 0)
3297 			return (error);
3298 		error = do_rw_wrlock(td, uap->obj, &timeout);
3299 	}
3300 	return (error);
3301 }
3302 
3303 static int
3304 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3305 {
3306 	struct _umtx_time *tm_p, timeout;
3307 	int error;
3308 
3309 	if (uap->uaddr2 == NULL)
3310 		tm_p = NULL;
3311 	else {
3312 		error = umtx_copyin_umtx_time32(
3313 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3314 		if (error != 0)
3315 			return (error);
3316 		tm_p = &timeout;
3317 	}
3318 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
3319 }
3320 
3321 static int
3322 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3323 {
3324 	struct _umtx_time *tm_p, timeout;
3325 	int error;
3326 
3327 	/* Allow a null timespec (wait forever). */
3328 	if (uap->uaddr2 == NULL)
3329 		tm_p = NULL;
3330 	else {
3331 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3332 		    (size_t)uap->uaddr1, &timeout);
3333 		if (error != 0)
3334 			return (error);
3335 		tm_p = &timeout;
3336 	}
3337 	return (do_sem_wait(td, uap->obj, tm_p));
3338 }
3339 
3340 static int
3341 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3342 {
3343 	int count = uap->val;
3344 	uint32_t uaddrs[BATCH_SIZE];
3345 	uint32_t *upp = (uint32_t *)uap->obj;	/* array of 32-bit uaddrs */
3346 	int tocopy;
3347 	int error = 0;
3348 	int i, pos = 0;
3349 
3350 	while (count > 0) {
3351 		tocopy = count;
3352 		if (tocopy > BATCH_SIZE)
3353 			tocopy = BATCH_SIZE;
3354 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
3355 		if (error != 0)
3356 			break;
3357 		for (i = 0; i < tocopy; ++i)
3358 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3359 				INT_MAX, 1);
3360 		count -= tocopy;
3361 		pos += tocopy;
3362 	}
3363 	return (error);
3364 }
3365 
3366 static _umtx_op_func op_table_compat32[] = {
3367 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED0 */
3368 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED1 */
3369 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3370 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3371 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3372 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3373 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3374 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3375 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3376 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3377 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3378 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3379 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3380 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3381 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3382 	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3383 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3384 	__umtx_op_wait_umutex_compat32, /* UMTX_OP_MUTEX_WAIT */
3385 	__umtx_op_wake_umutex,		/* UMTX_OP_MUTEX_WAKE */
3386 	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3387 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3388 	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
3389 	__umtx_op_wake2_umutex		/* UMTX_OP_MUTEX_WAKE2 */
3390 };
3391 
3392 int
3393 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3394 {
3395 	if ((unsigned)uap->op < UMTX_OP_MAX)
3396 		return (*op_table_compat32[uap->op])(td,
3397 			(struct _umtx_op_args *)uap);
3398 	return (EINVAL);
3399 }
3400 #endif
3401 
3402 void
3403 umtx_thread_init(struct thread *td)
3404 {
3405 	td->td_umtxq = umtxq_alloc();
3406 	td->td_umtxq->uq_thread = td;
3407 }
3408 
3409 void
3410 umtx_thread_fini(struct thread *td)
3411 {
3412 	umtxq_free(td->td_umtxq);
3413 }
3414 
3415 /*
3416  * Called when a new thread is created, e.g. via fork().
3417  */
3418 void
3419 umtx_thread_alloc(struct thread *td)
3420 {
3421 	struct umtx_q *uq;
3422 
3423 	uq = td->td_umtxq;
3424 	uq->uq_inherited_pri = PRI_MAX;
3425 
3426 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3427 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3428 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3429 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3430 }
3431 
3432 /*
3433  * exec() hook.
3434  */
3435 static void
3436 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3437 	struct image_params *imgp __unused)
3438 {
3439 	umtx_thread_cleanup(curthread);
3440 }
3441 
3442 /*
3443  * thread_exit() hook.
3444  */
3445 void
3446 umtx_thread_exit(struct thread *td)
3447 {
3448 	umtx_thread_cleanup(td);
3449 }
3450 
3451 /*
3452  * clean up umtx data.
3453  * Clean up umtx data.
3454 static void
3455 umtx_thread_cleanup(struct thread *td)
3456 {
3457 	struct umtx_q *uq;
3458 	struct umtx_pi *pi;
3459 
3460 	if ((uq = td->td_umtxq) == NULL)
3461 		return;
3462 
3463 	mtx_lock_spin(&umtx_lock);
3464 	uq->uq_inherited_pri = PRI_MAX;
3465 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3466 		pi->pi_owner = NULL;
3467 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3468 	}
3469 	mtx_unlock_spin(&umtx_lock);
3470 	thread_lock(td);
3471 	sched_lend_user_prio(td, PRI_MAX);
3472 	thread_unlock(td);
3473 }
3474