xref: /freebsd/sys/kern/kern_umtx.c (revision f02f7422801bb39f5eaab8fc383fa7b70c467ff9)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include "opt_umtx_profiling.h"
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/priv.h>
41 #include <sys/proc.h>
42 #include <sys/sbuf.h>
43 #include <sys/sched.h>
44 #include <sys/smp.h>
45 #include <sys/sysctl.h>
46 #include <sys/sysent.h>
47 #include <sys/systm.h>
48 #include <sys/sysproto.h>
49 #include <sys/syscallsubr.h>
50 #include <sys/eventhandler.h>
51 #include <sys/umtx.h>
52 
53 #include <vm/vm.h>
54 #include <vm/vm_param.h>
55 #include <vm/pmap.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_object.h>
58 
59 #include <machine/cpu.h>
60 
61 #ifdef COMPAT_FREEBSD32
62 #include <compat/freebsd32/freebsd32_proto.h>
63 #endif
64 
65 #define _UMUTEX_TRY		1
66 #define _UMUTEX_WAIT		2
67 
68 #ifdef UMTX_PROFILING
69 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
70 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
71 #endif
72 
73 /* Priority inheritance mutex info. */
74 struct umtx_pi {
75 	/* Owner thread */
76 	struct thread		*pi_owner;
77 
78 	/* Reference count */
79 	int			pi_refcount;
80 
81 	/* List entry to link umtx held by a thread */
82 	TAILQ_ENTRY(umtx_pi)	pi_link;
83 
84 	/* List entry in hash */
85 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
86 
87 	/* List for waiters */
88 	TAILQ_HEAD(,umtx_q)	pi_blocked;
89 
90 	/* Identify a userland lock object */
91 	struct umtx_key		pi_key;
92 };
93 
94 /* A userland synchronization object user. */
95 struct umtx_q {
96 	/* Linked list for the hash. */
97 	TAILQ_ENTRY(umtx_q)	uq_link;
98 
99 	/* Umtx key. */
100 	struct umtx_key		uq_key;
101 
102 	/* Umtx flags. */
103 	int			uq_flags;
104 #define UQF_UMTXQ	0x0001
105 
106 	/* The thread that is waiting. */
107 	struct thread		*uq_thread;
108 
109 	/*
110 	 * The PI mutex this thread is blocked on.  Reads may be done
111 	 * under either the chain lock or umtx_lock; writes require
112 	 * both the chain lock and umtx_lock to be held.
113 	 */
114 	struct umtx_pi		*uq_pi_blocked;
115 
116 	/* On blocked list */
117 	TAILQ_ENTRY(umtx_q)	uq_lockq;
118 
119 	/* PI mutexes owned by us that other threads contend on */
120 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
121 
122 	/* Inherited priority from PP mutex */
123 	u_char			uq_inherited_pri;
124 
125 	/* Spare queue ready to be reused */
126 	struct umtxq_queue	*uq_spare_queue;
127 
128 	/* The queue we are on */
129 	struct umtxq_queue	*uq_cur_queue;
130 };
131 
132 TAILQ_HEAD(umtxq_head, umtx_q);
133 
134 /* Per-key wait-queue */
135 struct umtxq_queue {
136 	struct umtxq_head	head;
137 	struct umtx_key		key;
138 	LIST_ENTRY(umtxq_queue)	link;
139 	int			length;
140 };
141 
142 LIST_HEAD(umtxq_list, umtxq_queue);
143 
144 /* Userland lock object's wait-queue chain */
145 struct umtxq_chain {
146 	/* Lock for this chain. */
147 	struct mtx		uc_lock;
148 
149 	/* List of sleep queues. */
150 	struct umtxq_list	uc_queue[2];
151 #define UMTX_SHARED_QUEUE	0
152 #define UMTX_EXCLUSIVE_QUEUE	1
153 
154 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
155 
156 	/* Busy flag */
157 	char			uc_busy;
158 
159 	/* Chain lock waiters */
160 	int			uc_waiters;
161 
162 	/* All PI mutexes hashed to this chain */
163 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
164 
165 #ifdef UMTX_PROFILING
166 	u_int 			length;
167 	u_int			max_length;
168 #endif
169 };
170 
171 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
172 #define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
173 
174 /*
175  * Don't propagate time-sharing priority; there is a security reason.
176  * A user can simply introduce a PI mutex, let thread A lock it, and
177  * let another thread B block on it.  Because B is sleeping, its
178  * priority would be boosted, and this would boost A's priority via
179  * priority propagation too; A's priority would then never be lowered,
180  * even if it were using 100% CPU, which is unfair to other processes.
181  */
182 
183 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
184 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
185 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
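/*
 * Worked example (editor's addition): with this clamp, a time-sharing
 * thread, wherever its td_user_pri falls in [PRI_MIN_TIMESHARE,
 * PRI_MAX_TIMESHARE], contributes only PRI_MAX_TIMESHARE to
 * propagation, so time-sharing waiters effectively never boost a PI
 * mutex owner; real-time priorities pass through unchanged.
 */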
186 
187 #define	GOLDEN_RATIO_PRIME	2654404609U
188 #define	UMTX_CHAINS		512
189 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
190 
191 #define	GET_SHARE(flags)	\
192     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
193 
194 #define BUSY_SPINS		200
195 
196 struct abs_timeout {
197 	int clockid;
198 	struct timespec cur;
199 	struct timespec end;
200 };
201 
202 static uma_zone_t		umtx_pi_zone;
203 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
204 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
205 static int			umtx_pi_allocated;
206 
207 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
208 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
209     &umtx_pi_allocated, 0, "Allocated umtx_pi");
210 
211 #ifdef UMTX_PROFILING
212 static long max_length;
213 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
214 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
215 #endif
216 
217 static void umtxq_sysinit(void *);
218 static void umtxq_hash(struct umtx_key *key);
219 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
220 static void umtxq_lock(struct umtx_key *key);
221 static void umtxq_unlock(struct umtx_key *key);
222 static void umtxq_busy(struct umtx_key *key);
223 static void umtxq_unbusy(struct umtx_key *key);
224 static void umtxq_insert_queue(struct umtx_q *uq, int q);
225 static void umtxq_remove_queue(struct umtx_q *uq, int q);
226 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
227 static int umtxq_count(struct umtx_key *key);
228 static struct umtx_pi *umtx_pi_alloc(int);
229 static void umtx_pi_free(struct umtx_pi *pi);
230 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
231 static void umtx_thread_cleanup(struct thread *td);
232 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
233 	struct image_params *imgp __unused);
234 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
235 
236 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
237 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
238 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
239 
240 static struct mtx umtx_lock;
241 
242 #ifdef UMTX_PROFILING
243 static void
244 umtx_init_profiling(void)
245 {
246 	struct sysctl_oid *chain_oid;
247 	char chain_name[10];
248 	int i;
249 
250 	for (i = 0; i < UMTX_CHAINS; ++i) {
251 		snprintf(chain_name, sizeof(chain_name), "%d", i);
252 		chain_oid = SYSCTL_ADD_NODE(NULL,
253 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
254 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
255 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
256 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
257 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
258 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
259 	}
260 }
261 
262 static int
263 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
264 {
265 	char buf[512];
266 	struct sbuf sb;
267 	struct umtxq_chain *uc;
268 	u_int fract, i, j, tot, whole;
269 	u_int sf0, sf1, sf2, sf3, sf4;
270 	u_int si0, si1, si2, si3, si4;
271 	u_int sw0, sw1, sw2, sw3, sw4;
272 
273 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
274 	for (i = 0; i < 2; i++) {
275 		tot = 0;
276 		for (j = 0; j < UMTX_CHAINS; ++j) {
277 			uc = &umtxq_chains[i][j];
278 			mtx_lock(&uc->uc_lock);
279 			tot += uc->max_length;
280 			mtx_unlock(&uc->uc_lock);
281 		}
282 		if (tot == 0)
283 			sbuf_printf(&sb, "%u) Empty ", i);
284 		else {
285 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
286 			si0 = si1 = si2 = si3 = si4 = 0;
287 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
288 			for (j = 0; j < UMTX_CHAINS; j++) {
289 				uc = &umtxq_chains[i][j];
290 				mtx_lock(&uc->uc_lock);
291 				whole = uc->max_length * 100;
292 				mtx_unlock(&uc->uc_lock);
293 				fract = (whole % tot) * 100;
294 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
295 					sf0 = fract;
296 					si0 = j;
297 					sw0 = whole;
298 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
299 				    sf1)) {
300 					sf1 = fract;
301 					si1 = j;
302 					sw1 = whole;
303 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
304 				    sf2)) {
305 					sf2 = fract;
306 					si2 = j;
307 					sw2 = whole;
308 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
309 				    sf3)) {
310 					sf3 = fract;
311 					si3 = j;
312 					sw3 = whole;
313 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
314 				    sf4)) {
315 					sf4 = fract;
316 					si4 = j;
317 					sw4 = whole;
318 				}
319 			}
320 			sbuf_printf(&sb, "queue %u:\n", i);
321 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
322 			    sf0 / tot, si0);
323 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
324 			    sf1 / tot, si1);
325 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
326 			    sf2 / tot, si2);
327 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
328 			    sf3 / tot, si3);
329 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
330 			    sf4 / tot, si4);
331 		}
332 	}
333 	sbuf_trim(&sb);
334 	sbuf_finish(&sb);
335 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
336 	sbuf_delete(&sb);
337 	return (0);
338 }
339 
340 static int
341 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
342 {
343 	struct umtxq_chain *uc;
344 	u_int i, j;
345 	int clear, error;
346 
347 	clear = 0;
348 	error = sysctl_handle_int(oidp, &clear, 0, req);
349 	if (error != 0 || req->newptr == NULL)
350 		return (error);
351 
352 	if (clear != 0) {
353 		for (i = 0; i < 2; ++i) {
354 			for (j = 0; j < UMTX_CHAINS; ++j) {
355 				uc = &umtxq_chains[i][j];
356 				mtx_lock(&uc->uc_lock);
357 				uc->length = 0;
358 				uc->max_length = 0;
359 				mtx_unlock(&uc->uc_lock);
360 			}
361 		}
362 	}
363 	return (0);
364 }
365 
366 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
367     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
368     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
369 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
370     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
371     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
372 #endif
373 
374 static void
375 umtxq_sysinit(void *arg __unused)
376 {
377 	int i, j;
378 
379 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
380 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
381 	for (i = 0; i < 2; ++i) {
382 		for (j = 0; j < UMTX_CHAINS; ++j) {
383 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
384 				 MTX_DEF | MTX_DUPOK);
385 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
386 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
387 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
388 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
389 			umtxq_chains[i][j].uc_busy = 0;
390 			umtxq_chains[i][j].uc_waiters = 0;
391 #ifdef UMTX_PROFILING
392 			umtxq_chains[i][j].length = 0;
393 			umtxq_chains[i][j].max_length = 0;
394 #endif
395 		}
396 	}
397 #ifdef UMTX_PROFILING
398 	umtx_init_profiling();
399 #endif
400 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
401 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
402 	    EVENTHANDLER_PRI_ANY);
403 }
404 
405 struct umtx_q *
406 umtxq_alloc(void)
407 {
408 	struct umtx_q *uq;
409 
410 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
411 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
412 	TAILQ_INIT(&uq->uq_spare_queue->head);
413 	TAILQ_INIT(&uq->uq_pi_contested);
414 	uq->uq_inherited_pri = PRI_MAX;
415 	return (uq);
416 }
417 
418 void
419 umtxq_free(struct umtx_q *uq)
420 {
421 	MPASS(uq->uq_spare_queue != NULL);
422 	free(uq->uq_spare_queue, M_UMTX);
423 	free(uq, M_UMTX);
424 }
425 
426 static inline void
427 umtxq_hash(struct umtx_key *key)
428 {
429 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
430 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
431 }
432 
433 static inline struct umtxq_chain *
434 umtxq_getchain(struct umtx_key *key)
435 {
436 	if (key->type <= TYPE_SEM)
437 		return (&umtxq_chains[1][key->hash]);
438 	return (&umtxq_chains[0][key->hash]);
439 }
440 
441 /*
442  * Lock a chain.
443  */
444 static inline void
445 umtxq_lock(struct umtx_key *key)
446 {
447 	struct umtxq_chain *uc;
448 
449 	uc = umtxq_getchain(key);
450 	mtx_lock(&uc->uc_lock);
451 }
452 
453 /*
454  * Unlock a chain.
455  */
456 static inline void
457 umtxq_unlock(struct umtx_key *key)
458 {
459 	struct umtxq_chain *uc;
460 
461 	uc = umtxq_getchain(key);
462 	mtx_unlock(&uc->uc_lock);
463 }
464 
465 /*
466  * Set the chain to the busy state when a following operation
467  * may block (a kernel mutex cannot be used across it).
468  */
469 static inline void
470 umtxq_busy(struct umtx_key *key)
471 {
472 	struct umtxq_chain *uc;
473 
474 	uc = umtxq_getchain(key);
475 	mtx_assert(&uc->uc_lock, MA_OWNED);
476 	if (uc->uc_busy) {
477 #ifdef SMP
478 		if (smp_cpus > 1) {
479 			int count = BUSY_SPINS;
480 			if (count > 0) {
481 				umtxq_unlock(key);
482 				while (uc->uc_busy && --count > 0)
483 					cpu_spinwait();
484 				umtxq_lock(key);
485 			}
486 		}
487 #endif
488 		while (uc->uc_busy) {
489 			uc->uc_waiters++;
490 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
491 			uc->uc_waiters--;
492 		}
493 	}
494 	uc->uc_busy = 1;
495 }
496 
497 /*
498  * Unbusy a chain.
499  */
500 static inline void
501 umtxq_unbusy(struct umtx_key *key)
502 {
503 	struct umtxq_chain *uc;
504 
505 	uc = umtxq_getchain(key);
506 	mtx_assert(&uc->uc_lock, MA_OWNED);
507 	KASSERT(uc->uc_busy != 0, ("not busy"));
508 	uc->uc_busy = 0;
509 	if (uc->uc_waiters)
510 		wakeup_one(uc);
511 }
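/*
 * Illustrative usage sketch (editor's addition, not part of the
 * source): the busy/unbusy protocol used throughout this file.  The
 * chain is marked busy so its mutex can be dropped across operations
 * that may fault on userland memory.
 */
#if 0
	umtxq_lock(&key);
	umtxq_busy(&key);	/* serialize sleep/wakeup operations */
	umtxq_unlock(&key);	/* drop the mutex before touching userland */

	/* ... e.g. casuword32() on the lock word, which may fault ... */

	umtxq_lock(&key);
	umtxq_unbusy(&key);	/* wake a thread blocked in umtxq_busy() */
	umtxq_unlock(&key);
#endif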
512 
513 static struct umtxq_queue *
514 umtxq_queue_lookup(struct umtx_key *key, int q)
515 {
516 	struct umtxq_queue *uh;
517 	struct umtxq_chain *uc;
518 
519 	uc = umtxq_getchain(key);
520 	UMTXQ_LOCKED_ASSERT(uc);
521 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
522 		if (umtx_key_match(&uh->key, key))
523 			return (uh);
524 	}
525 
526 	return (NULL);
527 }
528 
529 static inline void
530 umtxq_insert_queue(struct umtx_q *uq, int q)
531 {
532 	struct umtxq_queue *uh;
533 	struct umtxq_chain *uc;
534 
535 	uc = umtxq_getchain(&uq->uq_key);
536 	UMTXQ_LOCKED_ASSERT(uc);
537 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
538 	uh = umtxq_queue_lookup(&uq->uq_key, q);
539 	if (uh != NULL) {
540 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
541 	} else {
542 		uh = uq->uq_spare_queue;
543 		uh->key = uq->uq_key;
544 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
545 #ifdef UMTX_PROFILING
546 		uc->length++;
547 		if (uc->length > uc->max_length) {
548 			uc->max_length = uc->length;
549 			if (uc->max_length > max_length)
550 				max_length = uc->max_length;
551 		}
552 #endif
553 	}
554 	uq->uq_spare_queue = NULL;
555 
556 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
557 	uh->length++;
558 	uq->uq_flags |= UQF_UMTXQ;
559 	uq->uq_cur_queue = uh;
560 	return;
561 }
562 
563 static inline void
564 umtxq_remove_queue(struct umtx_q *uq, int q)
565 {
566 	struct umtxq_chain *uc;
567 	struct umtxq_queue *uh;
568 
569 	uc = umtxq_getchain(&uq->uq_key);
570 	UMTXQ_LOCKED_ASSERT(uc);
571 	if (uq->uq_flags & UQF_UMTXQ) {
572 		uh = uq->uq_cur_queue;
573 		TAILQ_REMOVE(&uh->head, uq, uq_link);
574 		uh->length--;
575 		uq->uq_flags &= ~UQF_UMTXQ;
576 		if (TAILQ_EMPTY(&uh->head)) {
577 			KASSERT(uh->length == 0,
578 			    ("inconsistent umtxq_queue length"));
579 #ifdef UMTX_PROFILING
580 			uc->length--;
581 #endif
582 			LIST_REMOVE(uh, link);
583 		} else {
584 			uh = LIST_FIRST(&uc->uc_spare_queue);
585 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
586 			LIST_REMOVE(uh, link);
587 		}
588 		uq->uq_spare_queue = uh;
589 		uq->uq_cur_queue = NULL;
590 	}
591 }
592 
593 /*
594  * Return the number of waiters for the key
595  */
596 static int
597 umtxq_count(struct umtx_key *key)
598 {
599 	struct umtxq_chain *uc;
600 	struct umtxq_queue *uh;
601 
602 	uc = umtxq_getchain(key);
603 	UMTXQ_LOCKED_ASSERT(uc);
604 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
605 	if (uh != NULL)
606 		return (uh->length);
607 	return (0);
608 }
609 
610 /*
611  * Return the number of PI waiters and pass back the first
612  * waiter via *first.
613  */
614 static int
615 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
616 {
617 	struct umtxq_chain *uc;
618 	struct umtxq_queue *uh;
619 
620 	*first = NULL;
621 	uc = umtxq_getchain(key);
622 	UMTXQ_LOCKED_ASSERT(uc);
623 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
624 	if (uh != NULL) {
625 		*first = TAILQ_FIRST(&uh->head);
626 		return (uh->length);
627 	}
628 	return (0);
629 }
630 
631 static int
632 umtxq_check_susp(struct thread *td)
633 {
634 	struct proc *p;
635 	int error;
636 
637 	/*
638 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
639 	 * eventually break the lockstep loop.
640 	 */
641 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
642 		return (0);
643 	error = 0;
644 	p = td->td_proc;
645 	PROC_LOCK(p);
646 	if (P_SHOULDSTOP(p) ||
647 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
648 		if (p->p_flag & P_SINGLE_EXIT)
649 			error = EINTR;
650 		else
651 			error = ERESTART;
652 	}
653 	PROC_UNLOCK(p);
654 	return (error);
655 }
656 
657 /*
658  * Wake up threads waiting on a userland object.
659  */
660 
661 static int
662 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
663 {
664 	struct umtxq_chain *uc;
665 	struct umtxq_queue *uh;
666 	struct umtx_q *uq;
667 	int ret;
668 
669 	ret = 0;
670 	uc = umtxq_getchain(key);
671 	UMTXQ_LOCKED_ASSERT(uc);
672 	uh = umtxq_queue_lookup(key, q);
673 	if (uh != NULL) {
674 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
675 			umtxq_remove_queue(uq, q);
676 			wakeup(uq);
677 			if (++ret >= n_wake)
678 				return (ret);
679 		}
680 	}
681 	return (ret);
682 }
683 
684 
685 /*
686  * Wake up the specified thread.
687  */
688 static inline void
689 umtxq_signal_thread(struct umtx_q *uq)
690 {
691 	struct umtxq_chain *uc;
692 
693 	uc = umtxq_getchain(&uq->uq_key);
694 	UMTXQ_LOCKED_ASSERT(uc);
695 	umtxq_remove(uq);
696 	wakeup(uq);
697 }
698 
699 static inline int
700 tstohz(const struct timespec *tsp)
701 {
702 	struct timeval tv;
703 
704 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
705 	return tvtohz(&tv);
706 	return (tvtohz(&tv));
707 
708 static void
709 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
710 	const struct timespec *timeout)
711 {
712 
713 	timo->clockid = clockid;
714 	if (!absolute) {
715 		kern_clock_gettime(curthread, clockid, &timo->end);
716 		timo->cur = timo->end;
717 		timespecadd(&timo->end, timeout);
718 	} else {
719 		timo->end = *timeout;
720 		kern_clock_gettime(curthread, clockid, &timo->cur);
721 	}
722 }
723 
724 static void
725 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
726 {
727 
728 	abs_timeout_init(timo, umtxtime->_clockid,
729 		(umtxtime->_flags & UMTX_ABSTIME) != 0,
730 		&umtxtime->_timeout);
731 }
732 
733 static inline void
734 abs_timeout_update(struct abs_timeout *timo)
735 {
736 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
737 }
738 
739 static int
740 abs_timeout_gethz(struct abs_timeout *timo)
741 {
742 	struct timespec tts;
743 
744 	if (timespeccmp(&timo->end, &timo->cur, <=))
745 		return (-1);
746 	tts = timo->end;
747 	timespecsub(&tts, &timo->cur);
748 	return (tstohz(&tts));
749 }
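/*
 * Illustrative sketch (editor's addition, not part of the source): how
 * the abs_timeout helpers are meant to cooperate in a sleep loop, as
 * umtxq_sleep() below does.  "ts" is a hypothetical relative timeout.
 */
#if 0
	struct abs_timeout timo;
	int sleep_ticks;

	abs_timeout_init(&timo, CLOCK_REALTIME, 0 /* relative */, &ts);
	for (;;) {
		sleep_ticks = abs_timeout_gethz(&timo);
		if (sleep_ticks < 0)		/* deadline already passed */
			return (ETIMEDOUT);
		/* msleep(..., sleep_ticks), then recheck the condition. */
		abs_timeout_update(&timo);	/* refresh timo.cur */
	}
#endif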
750 
751 /*
752  * Put the thread into a sleep state.  Before sleeping, check
753  * whether the thread was removed from the umtx queue.
754  */
755 static inline int
756 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
757 {
758 	struct umtxq_chain *uc;
759 	int error, timo;
760 
761 	uc = umtxq_getchain(&uq->uq_key);
762 	UMTXQ_LOCKED_ASSERT(uc);
763 	for (;;) {
764 		if (!(uq->uq_flags & UQF_UMTXQ))
765 			return (0);
766 		if (abstime != NULL) {
767 			timo = abs_timeout_gethz(abstime);
768 			if (timo < 0)
769 				return (ETIMEDOUT);
770 		} else
771 			timo = 0;
772 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
773 		if (error != EWOULDBLOCK) {
774 			umtxq_lock(&uq->uq_key);
775 			break;
776 		}
777 		if (abstime != NULL)
778 			abs_timeout_update(abstime);
779 		umtxq_lock(&uq->uq_key);
780 	}
781 	return (error);
782 }
783 
784 /*
785  * Convert a userspace address into a unique logical address.
786  */
787 int
788 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
789 {
790 	struct thread *td = curthread;
791 	vm_map_t map;
792 	vm_map_entry_t entry;
793 	vm_pindex_t pindex;
794 	vm_prot_t prot;
795 	boolean_t wired;
796 
797 	key->type = type;
798 	if (share == THREAD_SHARE) {
799 		key->shared = 0;
800 		key->info.private.vs = td->td_proc->p_vmspace;
801 		key->info.private.addr = (uintptr_t)addr;
802 	} else {
803 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
804 		map = &td->td_proc->p_vmspace->vm_map;
805 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
806 		    &entry, &key->info.shared.object, &pindex, &prot,
807 		    &wired) != KERN_SUCCESS) {
808 			return (EFAULT);
809 		}
810 
811 		if ((share == PROCESS_SHARE) ||
812 		    (share == AUTO_SHARE &&
813 		     VM_INHERIT_SHARE == entry->inheritance)) {
814 			key->shared = 1;
815 			key->info.shared.offset = entry->offset + entry->start -
816 				(vm_offset_t)addr;
817 			vm_object_reference(key->info.shared.object);
818 		} else {
819 			key->shared = 0;
820 			key->info.private.vs = td->td_proc->p_vmspace;
821 			key->info.private.addr = (uintptr_t)addr;
822 		}
823 		vm_map_lookup_done(map, entry);
824 	}
825 
826 	umtxq_hash(key);
827 	return (0);
828 }
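/*
 * Example (editor's addition): a THREAD_SHARE key is identified by
 * (vmspace, virtual address), so the same address in two processes
 * yields distinct keys; a PROCESS_SHARE key is identified by
 * (vm_object, offset), so a shared lock word maps to the same chain
 * no matter where each process has it mapped.
 */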
829 
830 /*
831  * Release key.
832  */
833 void
834 umtx_key_release(struct umtx_key *key)
835 {
836 	if (key->shared)
837 		vm_object_deallocate(key->info.shared.object);
838 }
839 
840 /*
841  * Fetch and compare value; sleep on the address if the value is unchanged.
842  */
843 static int
844 do_wait(struct thread *td, void *addr, u_long id,
845 	struct _umtx_time *timeout, int compat32, int is_private)
846 {
847 	struct abs_timeout timo;
848 	struct umtx_q *uq;
849 	u_long tmp;
850 	int error = 0;
851 
852 	uq = td->td_umtxq;
853 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
854 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
855 		return (error);
856 
857 	if (timeout != NULL)
858 		abs_timeout_init2(&timo, timeout);
859 
860 	umtxq_lock(&uq->uq_key);
861 	umtxq_insert(uq);
862 	umtxq_unlock(&uq->uq_key);
863 	if (compat32 == 0)
864 		tmp = fuword(addr);
865 	else
866 		tmp = (unsigned int)fuword32(addr);
867 	umtxq_lock(&uq->uq_key);
868 	if (tmp == id)
869 		error = umtxq_sleep(uq, "uwait", timeout == NULL ?
870 		    NULL : &timo);
871 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
872 		error = 0;
873 	else
874 		umtxq_remove(uq);
875 	umtxq_unlock(&uq->uq_key);
876 	umtx_key_release(&uq->uq_key);
877 	if (error == ERESTART)
878 		error = EINTR;
879 	return (error);
880 }
881 
882 /*
883  * Wake up threads sleeping on the specified address.
884  */
885 int
886 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
887 {
888 	struct umtx_key key;
889 	int ret;
890 
891 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
892 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
893 		return (ret);
894 	umtxq_lock(&key);
895 	ret = umtxq_signal(&key, n_wake);
896 	umtxq_unlock(&key);
897 	umtx_key_release(&key);
898 	return (0);
899 }
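/*
 * Illustrative userland sketch (editor's addition): do_wait() and
 * kern_umtx_wake() back the futex-like wait/wake primitives exposed
 * through _umtx_op(2).  A minimal user-level pairing, assuming the
 * operation constants from <sys/umtx.h>:
 */
#if 0
	/* Waiter: sleep while the word at addr still equals expected. */
	_umtx_op(addr, UMTX_OP_WAIT_UINT_PRIVATE, expected, NULL, NULL);

	/* Waker: release one thread sleeping on addr. */
	_umtx_op(addr, UMTX_OP_WAKE_PRIVATE, 1, NULL, NULL);
#endif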
900 
901 /*
902  * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
903  */
904 static int
905 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
906 	struct _umtx_time *timeout, int mode)
907 {
908 	struct abs_timeout timo;
909 	struct umtx_q *uq;
910 	uint32_t owner, old, id;
911 	int error = 0;
912 
913 	id = td->td_tid;
914 	uq = td->td_umtxq;
915 
916 	if (timeout != NULL)
917 		abs_timeout_init2(&timo, timeout);
918 
919 	/*
920 	 * Care must be exercised when dealing with the umtx structure.  It
921 	 * can fault on any access.
922 	 */
923 	for (;;) {
924 		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
925 		if (mode == _UMUTEX_WAIT) {
926 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
927 				return (0);
928 		} else {
929 			/*
930 			 * Try the uncontested case.  This should be done in userland.
931 			 */
932 			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
933 
934 			/* The acquire succeeded. */
935 			if (owner == UMUTEX_UNOWNED)
936 				return (0);
937 
938 			/* The address was invalid. */
939 			if (owner == -1)
940 				return (EFAULT);
941 
942 			/* If no one owns it but it is contested, try to acquire it. */
943 			if (owner == UMUTEX_CONTESTED) {
944 				owner = casuword32(&m->m_owner,
945 				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
946 
947 				if (owner == UMUTEX_CONTESTED)
948 					return (0);
949 
950 				/* The address was invalid. */
951 				if (owner == -1)
952 					return (EFAULT);
953 
954 				error = umtxq_check_susp(td);
955 				if (error != 0)
956 					return (error);
957 
958 				/* If this failed, the lock has changed; restart. */
959 				continue;
960 			}
961 		}
962 
963 		if (mode == _UMUTEX_TRY)
964 			return (EBUSY);
965 
966 		/*
967 		 * If we caught a signal, we have retried and now
968 		 * exit immediately.
969 		 */
970 		if (error != 0)
971 			return (error);
972 
973 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
974 		    GET_SHARE(flags), &uq->uq_key)) != 0)
975 			return (error);
976 
977 		umtxq_lock(&uq->uq_key);
978 		umtxq_busy(&uq->uq_key);
979 		umtxq_insert(uq);
980 		umtxq_unlock(&uq->uq_key);
981 
982 		/*
983 		 * Set the contested bit so that a release in user space
984 		 * knows to use the system call for unlock.  If this fails
985 		 * either someone else has acquired the lock or it has been
986 		 * released.
987 		 */
988 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
989 
990 		/* The address was invalid. */
991 		if (old == -1) {
992 			umtxq_lock(&uq->uq_key);
993 			umtxq_remove(uq);
994 			umtxq_unbusy(&uq->uq_key);
995 			umtxq_unlock(&uq->uq_key);
996 			umtx_key_release(&uq->uq_key);
997 			return (EFAULT);
998 		}
999 
1000 		/*
1001 		 * If we set the contested bit, sleep.  Otherwise the lock changed
1002 		 * and we need to retry, or we lost a race to the thread
1003 		 * unlocking the umtx.
1004 		 */
1005 		umtxq_lock(&uq->uq_key);
1006 		umtxq_unbusy(&uq->uq_key);
1007 		if (old == owner)
1008 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1009 			    NULL : &timo);
1010 		umtxq_remove(uq);
1011 		umtxq_unlock(&uq->uq_key);
1012 		umtx_key_release(&uq->uq_key);
1013 
1014 		if (error == 0)
1015 			error = umtxq_check_susp(td);
1016 	}
1017 
1018 	return (0);
1019 }
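/*
 * Illustrative userland sketch (editor's addition): the uncontested
 * acquire that the comment above says "should be done in userland".
 * Only on contention does the thread enter the kernel; "tid" and "m"
 * are assumed to be the caller's thread id and mutex.
 */
#if 0
	if (atomic_cmpset_acq_32((volatile uint32_t *)&m->m_owner,
	    UMUTEX_UNOWNED, tid))
		return (0);	/* acquired without a syscall */
	return (_umtx_op(m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL));
#endif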
1020 
1021 /*
1022  * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1023  */
1024 static int
1025 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1026 {
1027 	struct umtx_key key;
1028 	uint32_t owner, old, id;
1029 	int error;
1030 	int count;
1031 
1032 	id = td->td_tid;
1033 	/*
1034 	 * Make sure we own this mtx.
1035 	 */
1036 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1037 	if (owner == -1)
1038 		return (EFAULT);
1039 
1040 	if ((owner & ~UMUTEX_CONTESTED) != id)
1041 		return (EPERM);
1042 
1043 	if ((owner & UMUTEX_CONTESTED) == 0) {
1044 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1045 		if (old == -1)
1046 			return (EFAULT);
1047 		if (old == owner)
1048 			return (0);
1049 		owner = old;
1050 	}
1051 
1052 	/* We should only ever be in here for contested locks */
1053 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1054 	    &key)) != 0)
1055 		return (error);
1056 
1057 	umtxq_lock(&key);
1058 	umtxq_busy(&key);
1059 	count = umtxq_count(&key);
1060 	umtxq_unlock(&key);
1061 
1062 	/*
1063 	 * When unlocking the umtx, it must be marked as unowned if
1064 	 * there is at most one thread waiting for it;
1065 	 * otherwise, it must be marked as contested.
1066 	 */
1067 	old = casuword32(&m->m_owner, owner,
1068 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1069 	umtxq_lock(&key);
1070 	umtxq_signal(&key, 1);
1071 	umtxq_unbusy(&key);
1072 	umtxq_unlock(&key);
1073 	umtx_key_release(&key);
1074 	if (old == -1)
1075 		return (EFAULT);
1076 	if (old != owner)
1077 		return (EINVAL);
1078 	return (0);
1079 }
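/*
 * Illustrative userland sketch (editor's addition): the matching
 * unlock fast path.  Only a lock word with UMUTEX_CONTESTED set needs
 * the syscall, so the kernel can wake a waiter and fix up the state.
 */
#if 0
	if (atomic_cmpset_rel_32((volatile uint32_t *)&m->m_owner,
	    tid, UMUTEX_UNOWNED))
		return (0);	/* uncontested: no syscall needed */
	return (_umtx_op(m, UMTX_OP_MUTEX_UNLOCK, 0, NULL, NULL));
#endif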
1080 
1081 /*
1082  * Check if the mutex is available and wake up a waiter;
1083  * this applies only to a simple (PTHREAD_PRIO_NONE) mutex.
1084  */
1085 static int
1086 do_wake_umutex(struct thread *td, struct umutex *m)
1087 {
1088 	struct umtx_key key;
1089 	uint32_t owner;
1090 	uint32_t flags;
1091 	int error;
1092 	int count;
1093 
1094 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1095 	if (owner == -1)
1096 		return (EFAULT);
1097 
1098 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1099 		return (0);
1100 
1101 	flags = fuword32(&m->m_flags);
1102 
1103 	/* We should only ever be in here for contested locks */
1104 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1105 	    &key)) != 0)
1106 		return (error);
1107 
1108 	umtxq_lock(&key);
1109 	umtxq_busy(&key);
1110 	count = umtxq_count(&key);
1111 	umtxq_unlock(&key);
1112 
1113 	if (count <= 1)
1114 		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1115 
1116 	umtxq_lock(&key);
1117 	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1118 		umtxq_signal(&key, 1);
1119 	umtxq_unbusy(&key);
1120 	umtxq_unlock(&key);
1121 	umtx_key_release(&key);
1122 	return (0);
1123 }
1124 
1125 /*
1126  * Check if the mutex has waiters and try to fix the contention bit.
1127  */
1128 static int
1129 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1130 {
1131 	struct umtx_key key;
1132 	uint32_t owner, old;
1133 	int type;
1134 	int error;
1135 	int count;
1136 
1137 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1138 	case 0:
1139 		type = TYPE_NORMAL_UMUTEX;
1140 		break;
1141 	case UMUTEX_PRIO_INHERIT:
1142 		type = TYPE_PI_UMUTEX;
1143 		break;
1144 	case UMUTEX_PRIO_PROTECT:
1145 		type = TYPE_PP_UMUTEX;
1146 		break;
1147 	default:
1148 		return (EINVAL);
1149 	}
1150 	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
1151 	    &key)) != 0)
1152 		return (error);
1153 
1154 	owner = 0;
1155 	umtxq_lock(&key);
1156 	umtxq_busy(&key);
1157 	count = umtxq_count(&key);
1158 	umtxq_unlock(&key);
1159 	/*
1160 	 * Only repair the contention bit if there is a waiter; this means
1161 	 * the mutex is still being referenced by userland code.  Otherwise,
1162 	 * don't update any memory.
1163 	 */
1164 	if (count > 1) {
1165 		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1166 		while ((owner & UMUTEX_CONTESTED) == 0) {
1167 			old = casuword32(&m->m_owner, owner,
1168 			    owner | UMUTEX_CONTESTED);
1169 			if (old == owner)
1170 				break;
1171 			owner = old;
1172 			if (old == -1)
1173 				break;
1174 			error = umtxq_check_susp(td);
1175 			if (error != 0)
1176 				break;
1177 		}
1178 	} else if (count == 1) {
1179 		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1180 		while ((owner & ~UMUTEX_CONTESTED) != 0 &&
1181 		       (owner & UMUTEX_CONTESTED) == 0) {
1182 			old = casuword32(&m->m_owner, owner,
1183 			    owner | UMUTEX_CONTESTED);
1184 			if (old == owner)
1185 				break;
1186 			owner = old;
1187 			if (old == -1)
1188 				break;
1189 			error = umtxq_check_susp(td);
1190 			if (error != 0)
1191 				break;
1192 		}
1193 	}
1194 	umtxq_lock(&key);
1195 	if (owner == -1) {
1196 		error = EFAULT;
1197 		umtxq_signal(&key, INT_MAX);
1198 	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1200 		umtxq_signal(&key, 1);
1201 	umtxq_unbusy(&key);
1202 	umtxq_unlock(&key);
1203 	umtx_key_release(&key);
1204 	return (error);
1205 }
1206 
1207 static inline struct umtx_pi *
1208 umtx_pi_alloc(int flags)
1209 {
1210 	struct umtx_pi *pi;
1211 
1212 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1213 	TAILQ_INIT(&pi->pi_blocked);
1214 	atomic_add_int(&umtx_pi_allocated, 1);
1215 	return (pi);
1216 }
1217 
1218 static inline void
1219 umtx_pi_free(struct umtx_pi *pi)
1220 {
1221 	uma_zfree(umtx_pi_zone, pi);
1222 	atomic_add_int(&umtx_pi_allocated, -1);
1223 }
1224 
1225 /*
1226  * Adjust the thread's position on a PI mutex's blocked list after
1227  * its priority has been changed.
1228  */
1229 static int
1230 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1231 {
1232 	struct umtx_q *uq, *uq1, *uq2;
1233 	struct thread *td1;
1234 
1235 	mtx_assert(&umtx_lock, MA_OWNED);
1236 	if (pi == NULL)
1237 		return (0);
1238 
1239 	uq = td->td_umtxq;
1240 
1241 	/*
1242 	 * Check if the thread needs to be moved on the blocked chain.
1243 	 * It needs to be moved if either its priority is lower than
1244 	 * the previous thread's or higher than the next thread's.
1245 	 */
1246 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1247 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1248 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1249 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1250 		/*
1251 		 * Remove thread from blocked chain and determine where
1252 		 * it should be moved to.
1253 		 */
1254 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1255 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1256 			td1 = uq1->uq_thread;
1257 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1258 			if (UPRI(td1) > UPRI(td))
1259 				break;
1260 		}
1261 
1262 		if (uq1 == NULL)
1263 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1264 		else
1265 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1266 	}
1267 	return (1);
1268 }
1269 
1270 /*
1271  * Propagate priority when a thread is blocked on a POSIX
1272  * PI mutex.
1273  */
1274 static void
1275 umtx_propagate_priority(struct thread *td)
1276 {
1277 	struct umtx_q *uq;
1278 	struct umtx_pi *pi;
1279 	int pri;
1280 
1281 	mtx_assert(&umtx_lock, MA_OWNED);
1282 	pri = UPRI(td);
1283 	uq = td->td_umtxq;
1284 	pi = uq->uq_pi_blocked;
1285 	if (pi == NULL)
1286 		return;
1287 
1288 	for (;;) {
1289 		td = pi->pi_owner;
1290 		if (td == NULL || td == curthread)
1291 			return;
1292 
1293 		MPASS(td->td_proc != NULL);
1294 		MPASS(td->td_proc->p_magic == P_MAGIC);
1295 
1296 		thread_lock(td);
1297 		if (td->td_lend_user_pri > pri)
1298 			sched_lend_user_prio(td, pri);
1299 		else {
1300 			thread_unlock(td);
1301 			break;
1302 		}
1303 		thread_unlock(td);
1304 
1305 		/*
1306 		 * Pick up the lock that td is blocked on.
1307 		 */
1308 		uq = td->td_umtxq;
1309 		pi = uq->uq_pi_blocked;
1310 		if (pi == NULL)
1311 			break;
1312 		/* Re-sort td on the list if needed. */
1313 		umtx_pi_adjust_thread(pi, td);
1314 	}
1315 }
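/*
 * Worked example (editor's addition): if thread C (user priority 100)
 * blocks on a PI mutex owned by B (priority 150), the loop above lends
 * priority 100 to B; if B is itself blocked on a PI mutex owned by A
 * (priority 200), the walk follows B's uq_pi_blocked edge and lends
 * 100 to A as well.  The walk stops at the first owner whose lent
 * priority is already numerically less than or equal to the propagated
 * one, i.e. at least as high.
 */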
1316 
1317 /*
1318  * Unpropagate priority for a PI mutex when a thread blocked on
1319  * it is interrupted by a signal or resumed by others.
1320  */
1321 static void
1322 umtx_repropagate_priority(struct umtx_pi *pi)
1323 {
1324 	struct umtx_q *uq, *uq_owner;
1325 	struct umtx_pi *pi2;
1326 	int pri;
1327 
1328 	mtx_assert(&umtx_lock, MA_OWNED);
1329 
1330 	while (pi != NULL && pi->pi_owner != NULL) {
1331 		pri = PRI_MAX;
1332 		uq_owner = pi->pi_owner->td_umtxq;
1333 
1334 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1335 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1336 			if (uq != NULL) {
1337 				if (pri > UPRI(uq->uq_thread))
1338 					pri = UPRI(uq->uq_thread);
1339 			}
1340 		}
1341 
1342 		if (pri > uq_owner->uq_inherited_pri)
1343 			pri = uq_owner->uq_inherited_pri;
1344 		thread_lock(pi->pi_owner);
1345 		sched_lend_user_prio(pi->pi_owner, pri);
1346 		thread_unlock(pi->pi_owner);
1347 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1348 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1349 	}
1350 }
1351 
1352 /*
1353  * Set the owner of a PI mutex and insert it into the owner's list.
1354  */
1355 static void
1356 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1357 {
1358 	struct umtx_q *uq_owner;
1359 
1360 	uq_owner = owner->td_umtxq;
1361 	mtx_assert(&umtx_lock, MA_OWNED);
1362 	if (pi->pi_owner != NULL)
1363 		panic("pi_owner != NULL");
1364 	pi->pi_owner = owner;
1365 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1366 }
1367 
1368 /*
1369  * Claim ownership of a PI mutex.
1370  */
1371 static int
1372 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1373 {
1374 	struct umtx_q *uq, *uq_owner;
1375 
1376 	uq_owner = owner->td_umtxq;
1377 	mtx_lock_spin(&umtx_lock);
1378 	if (pi->pi_owner == owner) {
1379 		mtx_unlock_spin(&umtx_lock);
1380 		return (0);
1381 	}
1382 
1383 	if (pi->pi_owner != NULL) {
1384 		/*
1385 		 * Userland may have already messed up the mutex, sigh.
1386 		 */
1387 		mtx_unlock_spin(&umtx_lock);
1388 		return (EPERM);
1389 	}
1390 	umtx_pi_setowner(pi, owner);
1391 	uq = TAILQ_FIRST(&pi->pi_blocked);
1392 	if (uq != NULL) {
1393 		int pri;
1394 
1395 		pri = UPRI(uq->uq_thread);
1396 		thread_lock(owner);
1397 		if (pri < UPRI(owner))
1398 			sched_lend_user_prio(owner, pri);
1399 		thread_unlock(owner);
1400 	}
1401 	mtx_unlock_spin(&umtx_lock);
1402 	return (0);
1403 }
1404 
1405 /*
1406  * Adjust a thread's position on the blocked list of its PI mutex;
1407  * this may start a new round of priority propagation.
1408  */
1409 void
1410 umtx_pi_adjust(struct thread *td, u_char oldpri)
1411 {
1412 	struct umtx_q *uq;
1413 	struct umtx_pi *pi;
1414 
1415 	uq = td->td_umtxq;
1416 	mtx_lock_spin(&umtx_lock);
1417 	/*
1418 	 * Pick up the lock that td is blocked on.
1419 	 */
1420 	pi = uq->uq_pi_blocked;
1421 	if (pi != NULL) {
1422 		umtx_pi_adjust_thread(pi, td);
1423 		umtx_repropagate_priority(pi);
1424 	}
1425 	mtx_unlock_spin(&umtx_lock);
1426 }
1427 
1428 /*
1429  * Sleep on a PI mutex.
1430  */
1431 static int
1432 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1433 	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
1434 {
1435 	struct umtxq_chain *uc;
1436 	struct thread *td, *td1;
1437 	struct umtx_q *uq1;
1438 	int pri;
1439 	int error = 0;
1440 
1441 	td = uq->uq_thread;
1442 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1443 	uc = umtxq_getchain(&uq->uq_key);
1444 	UMTXQ_LOCKED_ASSERT(uc);
1445 	UMTXQ_BUSY_ASSERT(uc);
1446 	umtxq_insert(uq);
1447 	mtx_lock_spin(&umtx_lock);
1448 	if (pi->pi_owner == NULL) {
1449 		mtx_unlock_spin(&umtx_lock);
1450 		/* XXX Only look up thread in current process. */
1451 		td1 = tdfind(owner, curproc->p_pid);
1452 		mtx_lock_spin(&umtx_lock);
1453 		if (td1 != NULL) {
1454 			if (pi->pi_owner == NULL)
1455 				umtx_pi_setowner(pi, td1);
1456 			PROC_UNLOCK(td1->td_proc);
1457 		}
1458 	}
1459 
1460 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1461 		pri = UPRI(uq1->uq_thread);
1462 		if (pri > UPRI(td))
1463 			break;
1464 	}
1465 
1466 	if (uq1 != NULL)
1467 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1468 	else
1469 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1470 
1471 	uq->uq_pi_blocked = pi;
1472 	thread_lock(td);
1473 	td->td_flags |= TDF_UPIBLOCKED;
1474 	thread_unlock(td);
1475 	umtx_propagate_priority(td);
1476 	mtx_unlock_spin(&umtx_lock);
1477 	umtxq_unbusy(&uq->uq_key);
1478 
1479 	error = umtxq_sleep(uq, wmesg, timo);
1480 	umtxq_remove(uq);
1481 
1482 	mtx_lock_spin(&umtx_lock);
1483 	uq->uq_pi_blocked = NULL;
1484 	thread_lock(td);
1485 	td->td_flags &= ~TDF_UPIBLOCKED;
1486 	thread_unlock(td);
1487 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1488 	umtx_repropagate_priority(pi);
1489 	mtx_unlock_spin(&umtx_lock);
1490 	umtxq_unlock(&uq->uq_key);
1491 
1492 	return (error);
1493 }
1494 
1495 /*
1496  * Add a reference to a PI mutex.
1497  */
1498 static void
1499 umtx_pi_ref(struct umtx_pi *pi)
1500 {
1501 	struct umtxq_chain *uc;
1502 
1503 	uc = umtxq_getchain(&pi->pi_key);
1504 	UMTXQ_LOCKED_ASSERT(uc);
1505 	pi->pi_refcount++;
1506 }
1507 
1508 /*
1509  * Drop a reference to a PI mutex; if the reference count
1510  * reaches zero, its memory is freed.
1511  */
1512 static void
1513 umtx_pi_unref(struct umtx_pi *pi)
1514 {
1515 	struct umtxq_chain *uc;
1516 
1517 	uc = umtxq_getchain(&pi->pi_key);
1518 	UMTXQ_LOCKED_ASSERT(uc);
1519 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1520 	if (--pi->pi_refcount == 0) {
1521 		mtx_lock_spin(&umtx_lock);
1522 		if (pi->pi_owner != NULL) {
1523 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1524 				pi, pi_link);
1525 			pi->pi_owner = NULL;
1526 		}
1527 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1528 			("blocked queue not empty"));
1529 		mtx_unlock_spin(&umtx_lock);
1530 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1531 		umtx_pi_free(pi);
1532 	}
1533 }
1534 
1535 /*
1536  * Find a PI mutex in the hash table.
1537  */
1538 static struct umtx_pi *
1539 umtx_pi_lookup(struct umtx_key *key)
1540 {
1541 	struct umtxq_chain *uc;
1542 	struct umtx_pi *pi;
1543 
1544 	uc = umtxq_getchain(key);
1545 	UMTXQ_LOCKED_ASSERT(uc);
1546 
1547 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1548 		if (umtx_key_match(&pi->pi_key, key)) {
1549 			return (pi);
1550 		}
1551 	}
1552 	return (NULL);
1553 }
1554 
1555 /*
1556  * Insert a PI mutex into the hash table.
1557  */
1558 static inline void
1559 umtx_pi_insert(struct umtx_pi *pi)
1560 {
1561 	struct umtxq_chain *uc;
1562 
1563 	uc = umtxq_getchain(&pi->pi_key);
1564 	UMTXQ_LOCKED_ASSERT(uc);
1565 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1566 }
1567 
1568 /*
1569  * Lock a PI mutex.
1570  */
1571 static int
1572 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1573     struct _umtx_time *timeout, int try)
1574 {
1575 	struct abs_timeout timo;
1576 	struct umtx_q *uq;
1577 	struct umtx_pi *pi, *new_pi;
1578 	uint32_t id, owner, old;
1579 	int error;
1580 
1581 	id = td->td_tid;
1582 	uq = td->td_umtxq;
1583 
1584 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1585 	    &uq->uq_key)) != 0)
1586 		return (error);
1587 
1588 	if (timeout != NULL)
1589 		abs_timeout_init2(&timo, timeout);
1590 
1591 	umtxq_lock(&uq->uq_key);
1592 	pi = umtx_pi_lookup(&uq->uq_key);
1593 	if (pi == NULL) {
1594 		new_pi = umtx_pi_alloc(M_NOWAIT);
1595 		if (new_pi == NULL) {
1596 			umtxq_unlock(&uq->uq_key);
1597 			new_pi = umtx_pi_alloc(M_WAITOK);
1598 			umtxq_lock(&uq->uq_key);
1599 			pi = umtx_pi_lookup(&uq->uq_key);
1600 			if (pi != NULL) {
1601 				umtx_pi_free(new_pi);
1602 				new_pi = NULL;
1603 			}
1604 		}
1605 		if (new_pi != NULL) {
1606 			new_pi->pi_key = uq->uq_key;
1607 			umtx_pi_insert(new_pi);
1608 			pi = new_pi;
1609 		}
1610 	}
1611 	umtx_pi_ref(pi);
1612 	umtxq_unlock(&uq->uq_key);
1613 
1614 	/*
1615 	 * Care must be exercised when dealing with the umtx structure.  It
1616 	 * can fault on any access.
1617 	 */
1618 	for (;;) {
1619 		/*
1620 		 * Try the uncontested case.  This should be done in userland.
1621 		 */
1622 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1623 
1624 		/* The acquire succeeded. */
1625 		if (owner == UMUTEX_UNOWNED) {
1626 			error = 0;
1627 			break;
1628 		}
1629 
1630 		/* The address was invalid. */
1631 		if (owner == -1) {
1632 			error = EFAULT;
1633 			break;
1634 		}
1635 
1636 		/* If no one owns it but it is contested, try to acquire it. */
1637 		if (owner == UMUTEX_CONTESTED) {
1638 			owner = casuword32(&m->m_owner,
1639 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1640 
1641 			if (owner == UMUTEX_CONTESTED) {
1642 				umtxq_lock(&uq->uq_key);
1643 				umtxq_busy(&uq->uq_key);
1644 				error = umtx_pi_claim(pi, td);
1645 				umtxq_unbusy(&uq->uq_key);
1646 				umtxq_unlock(&uq->uq_key);
1647 				break;
1648 			}
1649 
1650 			/* The address was invalid. */
1651 			if (owner == -1) {
1652 				error = EFAULT;
1653 				break;
1654 			}
1655 
1656 			error = umtxq_check_susp(td);
1657 			if (error != 0)
1658 				break;
1659 
1660 			/* If this failed, the lock has changed; restart. */
1661 			continue;
1662 		}
1663 
1664 		if (try != 0) {
1665 			error = EBUSY;
1666 			break;
1667 		}
1668 
1669 		/*
1670 		 * If we caught a signal, we have retried and now
1671 		 * exit immediately.
1672 		 */
1673 		if (error != 0)
1674 			break;
1675 
1676 		umtxq_lock(&uq->uq_key);
1677 		umtxq_busy(&uq->uq_key);
1678 		umtxq_unlock(&uq->uq_key);
1679 
1680 		/*
1681 		 * Set the contested bit so that a release in user space
1682 		 * knows to use the system call for unlock.  If this fails
1683 		 * either someone else has acquired the lock or it has been
1684 		 * released.
1685 		 */
1686 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1687 
1688 		/* The address was invalid. */
1689 		if (old == -1) {
1690 			umtxq_lock(&uq->uq_key);
1691 			umtxq_unbusy(&uq->uq_key);
1692 			umtxq_unlock(&uq->uq_key);
1693 			error = EFAULT;
1694 			break;
1695 		}
1696 
1697 		umtxq_lock(&uq->uq_key);
1698 		/*
1699 		 * If we set the contested bit, sleep.  Otherwise the lock changed
1700 		 * and we need to retry, or we lost a race to the thread
1701 		 * unlocking the umtx.
1702 		 */
1703 		if (old == owner) {
1704 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1705 			    "umtxpi", timeout == NULL ? NULL : &timo);
1706 			if (error != 0)
1707 				continue;
1708 		} else {
1709 			umtxq_unbusy(&uq->uq_key);
1710 			umtxq_unlock(&uq->uq_key);
1711 		}
1712 
1713 		error = umtxq_check_susp(td);
1714 		if (error != 0)
1715 			break;
1716 	}
1717 
1718 	umtxq_lock(&uq->uq_key);
1719 	umtx_pi_unref(pi);
1720 	umtxq_unlock(&uq->uq_key);
1721 
1722 	umtx_key_release(&uq->uq_key);
1723 	return (error);
1724 }
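/*
 * Illustrative sketch (editor's addition): a PI umutex is selected by
 * UMUTEX_PRIO_INHERIT in its flags word, which routes locking through
 * do_lock_pi() above.  A hypothetical userland setup:
 */
#if 0
	struct umutex m = { .m_owner = UMUTEX_UNOWNED,
	    .m_flags = UMUTEX_PRIO_INHERIT };

	_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);	/* may sleep */
#endif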
1725 
1726 /*
1727  * Unlock a PI mutex.
1728  */
1729 static int
1730 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1731 {
1732 	struct umtx_key key;
1733 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1734 	struct umtx_pi *pi, *pi2;
1735 	uint32_t owner, old, id;
1736 	int error;
1737 	int count;
1738 	int pri;
1739 
1740 	id = td->td_tid;
1741 	/*
1742 	 * Make sure we own this mtx.
1743 	 */
1744 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1745 	if (owner == -1)
1746 		return (EFAULT);
1747 
1748 	if ((owner & ~UMUTEX_CONTESTED) != id)
1749 		return (EPERM);
1750 
1751 	/* This should be done in userland */
1752 	if ((owner & UMUTEX_CONTESTED) == 0) {
1753 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1754 		if (old == -1)
1755 			return (EFAULT);
1756 		if (old == owner)
1757 			return (0);
1758 		owner = old;
1759 	}
1760 
1761 	/* We should only ever be in here for contested locks */
1762 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1763 	    &key)) != 0)
1764 		return (error);
1765 
1766 	umtxq_lock(&key);
1767 	umtxq_busy(&key);
1768 	count = umtxq_count_pi(&key, &uq_first);
1769 	if (uq_first != NULL) {
1770 		mtx_lock_spin(&umtx_lock);
1771 		pi = uq_first->uq_pi_blocked;
1772 		KASSERT(pi != NULL, ("pi == NULL?"));
1773 		if (pi->pi_owner != curthread) {
1774 			mtx_unlock_spin(&umtx_lock);
1775 			umtxq_unbusy(&key);
1776 			umtxq_unlock(&key);
1777 			umtx_key_release(&key);
1778 			/* Userland messed up the mutex. */
1779 			return (EPERM);
1780 		}
1781 		uq_me = curthread->td_umtxq;
1782 		pi->pi_owner = NULL;
1783 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1784 		/* Get the highest-priority thread that is still sleeping. */
1785 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1786 		while (uq_first != NULL &&
1787 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1788 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1789 		}
1790 		pri = PRI_MAX;
1791 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1792 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1793 			if (uq_first2 != NULL) {
1794 				if (pri > UPRI(uq_first2->uq_thread))
1795 					pri = UPRI(uq_first2->uq_thread);
1796 			}
1797 		}
1798 		thread_lock(curthread);
1799 		sched_lend_user_prio(curthread, pri);
1800 		thread_unlock(curthread);
1801 		mtx_unlock_spin(&umtx_lock);
1802 		if (uq_first)
1803 			umtxq_signal_thread(uq_first);
1804 	}
1805 	umtxq_unlock(&key);
1806 
1807 	/*
1808 	 * When unlocking the umtx, it must be marked as unowned if
1809 	 * there is at most one thread waiting for it;
1810 	 * otherwise, it must be marked as contested.
1811 	 */
1812 	old = casuword32(&m->m_owner, owner,
1813 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1814 
1815 	umtxq_lock(&key);
1816 	umtxq_unbusy(&key);
1817 	umtxq_unlock(&key);
1818 	umtx_key_release(&key);
1819 	if (old == -1)
1820 		return (EFAULT);
1821 	if (old != owner)
1822 		return (EINVAL);
1823 	return (0);
1824 }
1825 
1826 /*
1827  * Lock a PP mutex.
1828  */
1829 static int
1830 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
1831     struct _umtx_time *timeout, int try)
1832 {
1833 	struct abs_timeout timo;
1834 	struct umtx_q *uq, *uq2;
1835 	struct umtx_pi *pi;
1836 	uint32_t ceiling;
1837 	uint32_t owner, id;
1838 	int error, pri, old_inherited_pri, su;
1839 
1840 	id = td->td_tid;
1841 	uq = td->td_umtxq;
1842 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1843 	    &uq->uq_key)) != 0)
1844 		return (error);
1845 
1846 	if (timeout != NULL)
1847 		abs_timeout_init2(&timo, timeout);
1848 
1849 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1850 	for (;;) {
1851 		old_inherited_pri = uq->uq_inherited_pri;
1852 		umtxq_lock(&uq->uq_key);
1853 		umtxq_busy(&uq->uq_key);
1854 		umtxq_unlock(&uq->uq_key);
1855 
1856 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1857 		if (ceiling > RTP_PRIO_MAX) {
1858 			error = EINVAL;
1859 			goto out;
1860 		}
1861 
1862 		mtx_lock_spin(&umtx_lock);
1863 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1864 			mtx_unlock_spin(&umtx_lock);
1865 			error = EINVAL;
1866 			goto out;
1867 		}
1868 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1869 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1870 			thread_lock(td);
1871 			if (uq->uq_inherited_pri < UPRI(td))
1872 				sched_lend_user_prio(td, uq->uq_inherited_pri);
1873 			thread_unlock(td);
1874 		}
1875 		mtx_unlock_spin(&umtx_lock);
1876 
1877 		owner = casuword32(&m->m_owner,
1878 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1879 
1880 		if (owner == UMUTEX_CONTESTED) {
1881 			error = 0;
1882 			break;
1883 		}
1884 
1885 		/* The address was invalid. */
1886 		if (owner == -1) {
1887 			error = EFAULT;
1888 			break;
1889 		}
1890 
1891 		if (try != 0) {
1892 			error = EBUSY;
1893 			break;
1894 		}
1895 
1896 		/*
1897 		 * If we caught a signal, we have retried and now
1898 		 * exit immediately.
1899 		 */
1900 		if (error != 0)
1901 			break;
1902 
1903 		umtxq_lock(&uq->uq_key);
1904 		umtxq_insert(uq);
1905 		umtxq_unbusy(&uq->uq_key);
1906 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
1907 		    NULL : &timo);
1908 		umtxq_remove(uq);
1909 		umtxq_unlock(&uq->uq_key);
1910 
1911 		mtx_lock_spin(&umtx_lock);
1912 		uq->uq_inherited_pri = old_inherited_pri;
1913 		pri = PRI_MAX;
1914 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1915 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1916 			if (uq2 != NULL) {
1917 				if (pri > UPRI(uq2->uq_thread))
1918 					pri = UPRI(uq2->uq_thread);
1919 			}
1920 		}
1921 		if (pri > uq->uq_inherited_pri)
1922 			pri = uq->uq_inherited_pri;
1923 		thread_lock(td);
1924 		sched_lend_user_prio(td, pri);
1925 		thread_unlock(td);
1926 		mtx_unlock_spin(&umtx_lock);
1927 	}
1928 
1929 	if (error != 0) {
1930 		mtx_lock_spin(&umtx_lock);
1931 		uq->uq_inherited_pri = old_inherited_pri;
1932 		pri = PRI_MAX;
1933 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1934 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1935 			if (uq2 != NULL) {
1936 				if (pri > UPRI(uq2->uq_thread))
1937 					pri = UPRI(uq2->uq_thread);
1938 			}
1939 		}
1940 		if (pri > uq->uq_inherited_pri)
1941 			pri = uq->uq_inherited_pri;
1942 		thread_lock(td);
1943 		sched_lend_user_prio(td, pri);
1944 		thread_unlock(td);
1945 		mtx_unlock_spin(&umtx_lock);
1946 	}
1947 
1948 out:
1949 	umtxq_lock(&uq->uq_key);
1950 	umtxq_unbusy(&uq->uq_key);
1951 	umtxq_unlock(&uq->uq_key);
1952 	umtx_key_release(&uq->uq_key);
1953 	return (error);
1954 }
1955 
1956 /*
1957  * Unlock a PP mutex.
1958  */
1959 static int
1960 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
1961 {
1962 	struct umtx_key key;
1963 	struct umtx_q *uq, *uq2;
1964 	struct umtx_pi *pi;
1965 	uint32_t owner, id;
1966 	uint32_t rceiling;
1967 	int error, pri, new_inherited_pri, su;
1968 
1969 	id = td->td_tid;
1970 	uq = td->td_umtxq;
1971 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1972 
1973 	/*
1974 	 * Make sure we own this mtx.
1975 	 */
1976 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1977 	if (owner == -1)
1978 		return (EFAULT);
1979 
1980 	if ((owner & ~UMUTEX_CONTESTED) != id)
1981 		return (EPERM);
1982 
1983 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
1984 	if (error != 0)
1985 		return (error);
1986 
1987 	if (rceiling == -1)
1988 		new_inherited_pri = PRI_MAX;
1989 	else {
1990 		rceiling = RTP_PRIO_MAX - rceiling;
1991 		if (rceiling > RTP_PRIO_MAX)
1992 			return (EINVAL);
1993 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
1994 	}
1995 
1996 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1997 	    &key)) != 0)
1998 		return (error);
1999 	umtxq_lock(&key);
2000 	umtxq_busy(&key);
2001 	umtxq_unlock(&key);
2002 	/*
2003 	 * For a priority-protected mutex, always set the unlocked state
2004 	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2005 	 * to lock the mutex.  This is necessary because the thread's
2006 	 * priority has to be adjusted for such a mutex.
2007 	 */
2008 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2009 		UMUTEX_CONTESTED);
2010 
2011 	umtxq_lock(&key);
2012 	if (error == 0)
2013 		umtxq_signal(&key, 1);
2014 	umtxq_unbusy(&key);
2015 	umtxq_unlock(&key);
2016 
2017 	if (error == -1)
2018 		error = EFAULT;
2019 	else {
2020 		mtx_lock_spin(&umtx_lock);
2021 		if (su != 0)
2022 			uq->uq_inherited_pri = new_inherited_pri;
2023 		pri = PRI_MAX;
2024 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2025 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2026 			if (uq2 != NULL) {
2027 				if (pri > UPRI(uq2->uq_thread))
2028 					pri = UPRI(uq2->uq_thread);
2029 			}
2030 		}
2031 		if (pri > uq->uq_inherited_pri)
2032 			pri = uq->uq_inherited_pri;
2033 		thread_lock(td);
2034 		sched_lend_user_prio(td, pri);
2035 		thread_unlock(td);
2036 		mtx_unlock_spin(&umtx_lock);
2037 	}
2038 	umtx_key_release(&key);
2039 	return (error);
2040 }
2041 
2042 static int
2043 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2044 	uint32_t *old_ceiling)
2045 {
2046 	struct umtx_q *uq;
2047 	uint32_t save_ceiling;
2048 	uint32_t owner, id;
2049 	uint32_t flags;
2050 	int error;
2051 
2052 	flags = fuword32(&m->m_flags);
2053 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2054 		return (EINVAL);
2055 	if (ceiling > RTP_PRIO_MAX)
2056 		return (EINVAL);
2057 	id = td->td_tid;
2058 	uq = td->td_umtxq;
2059 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2060 	   &uq->uq_key)) != 0)
2061 		return (error);
2062 	for (;;) {
2063 		umtxq_lock(&uq->uq_key);
2064 		umtxq_busy(&uq->uq_key);
2065 		umtxq_unlock(&uq->uq_key);
2066 
2067 		save_ceiling = fuword32(&m->m_ceilings[0]);
2068 
2069 		owner = casuword32(&m->m_owner,
2070 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2071 
2072 		if (owner == UMUTEX_CONTESTED) {
2073 			suword32(&m->m_ceilings[0], ceiling);
2074 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2075 				UMUTEX_CONTESTED);
2076 			error = 0;
2077 			break;
2078 		}
2079 
2080 		/* The address was invalid. */
2081 		if (owner == -1) {
2082 			error = EFAULT;
2083 			break;
2084 		}
2085 
2086 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2087 			suword32(&m->m_ceilings[0], ceiling);
2088 			error = 0;
2089 			break;
2090 		}
2091 
2092 		/*
2093 		 * If we caught a signal, we have already retried once
2094 		 * and now exit immediately.
2095 		 */
2096 		if (error != 0)
2097 			break;
2098 
2099 		/*
2100 		 * We set the contested bit and go to sleep.  Otherwise the
2101 		 * lock changed and we need to retry, or we lost a race to
2102 		 * the thread unlocking the umtx.
2103 		 */
2104 		umtxq_lock(&uq->uq_key);
2105 		umtxq_insert(uq);
2106 		umtxq_unbusy(&uq->uq_key);
2107 		error = umtxq_sleep(uq, "umtxpp", NULL);
2108 		umtxq_remove(uq);
2109 		umtxq_unlock(&uq->uq_key);
2110 	}
2111 	umtxq_lock(&uq->uq_key);
2112 	if (error == 0)
2113 		umtxq_signal(&uq->uq_key, INT_MAX);
2114 	umtxq_unbusy(&uq->uq_key);
2115 	umtxq_unlock(&uq->uq_key);
2116 	umtx_key_release(&uq->uq_key);
2117 	if (error == 0 && old_ceiling != NULL)
2118 		suword32(old_ceiling, save_ceiling);
2119 	return (error);
2120 }
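
/*
 * A minimal usage sketch for the operation above, assuming the
 * _umtx_op(2) calling convention used by the handlers later in this
 * file; the kernel writes the previous ceiling through uaddr1 when it
 * is non-NULL:
 *
 *	uint32_t old_ceiling;
 *
 *	if (_umtx_op(&m, UMTX_OP_SET_CEILING, 16, &old_ceiling,
 *	    NULL) == -1)
 *		err(1, "UMTX_OP_SET_CEILING");
 */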
2121 
2122 /*
2123  * Lock a userland POSIX mutex.
2124  */
2125 static int
2126 do_lock_umutex(struct thread *td, struct umutex *m,
2127     struct _umtx_time *timeout, int mode)
2128 {
2129 	uint32_t flags;
2130 	int error;
2131 
2132 	flags = fuword32(&m->m_flags);
2133 	if (flags == -1)
2134 		return (EFAULT);
2135 
2136 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2137 	case 0:
2138 		error = do_lock_normal(td, m, flags, timeout, mode);
2139 		break;
2140 	case UMUTEX_PRIO_INHERIT:
2141 		error = do_lock_pi(td, m, flags, timeout, mode);
2142 		break;
2143 	case UMUTEX_PRIO_PROTECT:
2144 		error = do_lock_pp(td, m, flags, timeout, mode);
2145 		break;
2146 	default:
2147 		return (EINVAL);
2148 	}
2149 	if (timeout == NULL) {
2150 		if (error == EINTR && mode != _UMUTEX_WAIT)
2151 			error = ERESTART;
2152 	} else {
2153 		/* Timed-locking is not restarted. */
2154 		if (error == ERESTART)
2155 			error = EINTR;
2156 	}
2157 	return (error);
2158 }
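
/*
 * The restart handling above is visible to userland: an untimed lock
 * interrupted by a signal is transparently restarted by the kernel
 * (except in _UMUTEX_WAIT mode), while a timed lock fails with EINTR
 * and must be retried explicitly.  A hedged sketch of such a retry,
 * using an absolute timeout so that restarting does not extend the
 * total wait:
 *
 *	struct _umtx_time tmo;
 *
 *	clock_gettime(CLOCK_MONOTONIC, &tmo._timeout);
 *	tmo._timeout.tv_sec += 5;
 *	tmo._flags = UMTX_ABSTIME;
 *	tmo._clockid = CLOCK_MONOTONIC;
 *	while (_umtx_op(m, UMTX_OP_MUTEX_LOCK, 0, (void *)sizeof(tmo),
 *	    &tmo) == -1 && errno == EINTR)
 *		continue;
 */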
2159 
2160 /*
2161  * Unlock a userland POSIX mutex.
2162  */
2163 static int
2164 do_unlock_umutex(struct thread *td, struct umutex *m)
2165 {
2166 	uint32_t flags;
2167 
2168 	flags = fuword32(&m->m_flags);
2169 	if (flags == -1)
2170 		return (EFAULT);
2171 
2172 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2173 	case 0:
2174 		return (do_unlock_normal(td, m, flags));
2175 	case UMUTEX_PRIO_INHERIT:
2176 		return (do_unlock_pi(td, m, flags));
2177 	case UMUTEX_PRIO_PROTECT:
2178 		return (do_unlock_pp(td, m, flags));
2179 	}
2180 
2181 	return (EINVAL);
2182 }
2183 
2184 static int
2185 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2186 	struct timespec *timeout, u_long wflags)
2187 {
2188 	struct abs_timeout timo;
2189 	struct umtx_q *uq;
2190 	uint32_t flags;
2191 	uint32_t clockid;
2192 	int error;
2193 
2194 	uq = td->td_umtxq;
2195 	flags = fuword32(&cv->c_flags);
2196 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2197 	if (error != 0)
2198 		return (error);
2199 
2200 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2201 		clockid = fuword32(&cv->c_clockid);
2202 		if (clockid < CLOCK_REALTIME ||
2203 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2204 			/* Only predefined clock ids are allowed here. */
			umtx_key_release(&uq->uq_key);
2205 			return (EINVAL);
2206 		}
2207 	} else {
2208 		clockid = CLOCK_REALTIME;
2209 	}
2210 
2211 	umtxq_lock(&uq->uq_key);
2212 	umtxq_busy(&uq->uq_key);
2213 	umtxq_insert(uq);
2214 	umtxq_unlock(&uq->uq_key);
2215 
2216 	/*
2217 	 * Set c_has_waiters to 1 before releasing the user mutex, but
2218 	 * avoid dirtying the cache line when it is already set.
2219 	 */
2220 	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2221 		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2222 
2223 	umtxq_lock(&uq->uq_key);
2224 	umtxq_unbusy(&uq->uq_key);
2225 	umtxq_unlock(&uq->uq_key);
2226 
2227 	error = do_unlock_umutex(td, m);
2228 
2229 	if (timeout != NULL)
2230 		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2231 			timeout);
2232 
2233 	umtxq_lock(&uq->uq_key);
2234 	if (error == 0) {
2235 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2236 		    NULL : &timo);
2237 	}
2238 
2239 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2240 		error = 0;
2241 	else {
2242 		/*
2243 		 * This must be a timeout, an interruption by a signal or a
2244 		 * spurious wakeup; clear the c_has_waiters flag when
2245 		 * necessary.
2246 		 */
2247 		umtxq_busy(&uq->uq_key);
2248 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2249 			int oldlen = uq->uq_cur_queue->length;
2250 			umtxq_remove(uq);
2251 			if (oldlen == 1) {
2252 				umtxq_unlock(&uq->uq_key);
2253 				suword32(
2254 				    __DEVOLATILE(uint32_t *,
2255 					 &cv->c_has_waiters), 0);
2256 				umtxq_lock(&uq->uq_key);
2257 			}
2258 		}
2259 		umtxq_unbusy(&uq->uq_key);
2260 		if (error == ERESTART)
2261 			error = EINTR;
2262 	}
2263 
2264 	umtxq_unlock(&uq->uq_key);
2265 	umtx_key_release(&uq->uq_key);
2266 	return (error);
2267 }
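
/*
 * A sketch of the matching userland call (illustrative, not the libthr
 * source): the caller passes both the condition variable and the locked
 * mutex, the kernel queues the thread before dropping the mutex so no
 * wakeup can be lost, and the caller must re-lock the mutex itself
 * after waking up:
 *
 *	struct timespec ts = { 1, 0 };	(relative, CLOCK_REALTIME)
 *
 *	error = _umtx_op(cv, UMTX_OP_CV_WAIT, 0, m, &ts);
 */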
2268 
2269 /*
2270  * Signal a userland condition variable.
2271  */
2272 static int
2273 do_cv_signal(struct thread *td, struct ucond *cv)
2274 {
2275 	struct umtx_key key;
2276 	int error, cnt, nwake;
2277 	uint32_t flags;
2278 
2279 	flags = fuword32(&cv->c_flags);
2280 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2281 		return (error);
2282 	umtxq_lock(&key);
2283 	umtxq_busy(&key);
2284 	cnt = umtxq_count(&key);
2285 	nwake = umtxq_signal(&key, 1);
2286 	if (cnt <= nwake) {
2287 		umtxq_unlock(&key);
2288 		error = suword32(
2289 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2290 		umtxq_lock(&key);
2291 	}
2292 	umtxq_unbusy(&key);
2293 	umtxq_unlock(&key);
2294 	umtx_key_release(&key);
2295 	return (error);
2296 }
2297 
2298 static int
2299 do_cv_broadcast(struct thread *td, struct ucond *cv)
2300 {
2301 	struct umtx_key key;
2302 	int error;
2303 	uint32_t flags;
2304 
2305 	flags = fuword32(&cv->c_flags);
2306 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2307 		return (error);
2308 
2309 	umtxq_lock(&key);
2310 	umtxq_busy(&key);
2311 	umtxq_signal(&key, INT_MAX);
2312 	umtxq_unlock(&key);
2313 
2314 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2315 
2316 	umtxq_lock(&key);
2317 	umtxq_unbusy(&key);
2318 	umtxq_unlock(&key);
2319 
2320 	umtx_key_release(&key);
2321 	return (error);
2322 }
2323 
2324 static int
2325 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2326 {
2327 	struct abs_timeout timo;
2328 	struct umtx_q *uq;
2329 	uint32_t flags, wrflags;
2330 	int32_t state, oldstate;
2331 	int32_t blocked_readers;
2332 	int error;
2333 
2334 	uq = td->td_umtxq;
2335 	flags = fuword32(&rwlock->rw_flags);
2336 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2337 	if (error != 0)
2338 		return (error);
2339 
2340 	if (timeout != NULL)
2341 		abs_timeout_init2(&timo, timeout);
2342 
2343 	wrflags = URWLOCK_WRITE_OWNER;
2344 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2345 		wrflags |= URWLOCK_WRITE_WAITERS;
2346 
2347 	for (;;) {
2348 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2349 		/* try to lock it */
2350 		while (!(state & wrflags)) {
2351 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2352 				umtx_key_release(&uq->uq_key);
2353 				return (EAGAIN);
2354 			}
2355 			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2356 			if (oldstate == -1) {
2357 				umtx_key_release(&uq->uq_key);
2358 				return (EFAULT);
2359 			}
2360 			if (oldstate == state) {
2361 				umtx_key_release(&uq->uq_key);
2362 				return (0);
2363 			}
2364 			error = umtxq_check_susp(td);
2365 			if (error != 0)
2366 				break;
2367 			state = oldstate;
2368 		}
2369 
2370 		if (error)
2371 			break;
2372 
2373 		/* grab monitor lock */
2374 		umtxq_lock(&uq->uq_key);
2375 		umtxq_busy(&uq->uq_key);
2376 		umtxq_unlock(&uq->uq_key);
2377 
2378 		/*
2379 		 * re-read the state, in case it changed between the try-lock above
2380 		 * and the check below
2381 		 */
2382 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2383 
2384 		/* set read contention bit */
2385 		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2386 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2387 			if (oldstate == -1) {
2388 				error = EFAULT;
2389 				break;
2390 			}
2391 			if (oldstate == state)
2392 				goto sleep;
2393 			state = oldstate;
2394 			error = umtxq_check_susp(td);
2395 			if (error != 0)
2396 				break;
2397 		}
2398 		if (error != 0) {
2399 			umtxq_lock(&uq->uq_key);
2400 			umtxq_unbusy(&uq->uq_key);
2401 			umtxq_unlock(&uq->uq_key);
2402 			break;
2403 		}
2404 
2405 		/* The state changed while we were setting the flags; restart. */
2406 		if (!(state & wrflags)) {
2407 			umtxq_lock(&uq->uq_key);
2408 			umtxq_unbusy(&uq->uq_key);
2409 			umtxq_unlock(&uq->uq_key);
2410 			error = umtxq_check_susp(td);
2411 			if (error != 0)
2412 				break;
2413 			continue;
2414 		}
2415 
2416 sleep:
2417 		/* Contention bit is set; increase the read waiter count before sleeping. */
2418 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2419 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2420 
2421 		while (state & wrflags) {
2422 			umtxq_lock(&uq->uq_key);
2423 			umtxq_insert(uq);
2424 			umtxq_unbusy(&uq->uq_key);
2425 
2426 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2427 			    NULL : &timo);
2428 
2429 			umtxq_busy(&uq->uq_key);
2430 			umtxq_remove(uq);
2431 			umtxq_unlock(&uq->uq_key);
2432 			if (error)
2433 				break;
2434 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2435 		}
2436 
2437 		/* Decrease the read waiter count and possibly clear the read contention bit. */
2438 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2439 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2440 		if (blocked_readers == 1) {
2441 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2442 			for (;;) {
2443 				oldstate = casuword32(&rwlock->rw_state, state,
2444 					 state & ~URWLOCK_READ_WAITERS);
2445 				if (oldstate == -1) {
2446 					error = EFAULT;
2447 					break;
2448 				}
2449 				if (oldstate == state)
2450 					break;
2451 				state = oldstate;
2452 				error = umtxq_check_susp(td);
2453 				if (error != 0)
2454 					break;
2455 			}
2456 		}
2457 
2458 		umtxq_lock(&uq->uq_key);
2459 		umtxq_unbusy(&uq->uq_key);
2460 		umtxq_unlock(&uq->uq_key);
2461 		if (error != 0)
2462 			break;
2463 	}
2464 	umtx_key_release(&uq->uq_key);
2465 	if (error == ERESTART)
2466 		error = EINTR;
2467 	return (error);
2468 }
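
/*
 * Illustrative note on the rw_state word manipulated above: the low
 * bits count the readers (URWLOCK_READER_COUNT()) and the high bits
 * carry URWLOCK_WRITE_OWNER and the waiter flags, so a hypothetical
 * userland read-lock fast path is a single CAS that bumps the count
 * and only falls into the kernel on contention:
 *
 *	int32_t state = rw->rw_state;
 *
 *	if ((state & (URWLOCK_WRITE_OWNER | URWLOCK_WRITE_WAITERS)) == 0 &&
 *	    atomic_cmpset_acq_32(&rw->rw_state, state, state + 1))
 *		return (0);
 *	return (_umtx_op(rw, UMTX_OP_RW_RDLOCK, 0, NULL, NULL));
 */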
2469 
2470 static int
2471 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2472 {
2473 	struct abs_timeout timo;
2474 	struct umtx_q *uq;
2475 	uint32_t flags;
2476 	int32_t state, oldstate;
2477 	int32_t blocked_writers;
2478 	int32_t blocked_readers;
2479 	int error;
2480 
2481 	uq = td->td_umtxq;
2482 	flags = fuword32(&rwlock->rw_flags);
2483 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2484 	if (error != 0)
2485 		return (error);
2486 
2487 	if (timeout != NULL)
2488 		abs_timeout_init2(&timo, timeout);
2489 
2490 	blocked_readers = 0;
2491 	for (;;) {
2492 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2493 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2494 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2495 			if (oldstate == -1) {
2496 				umtx_key_release(&uq->uq_key);
2497 				return (EFAULT);
2498 			}
2499 			if (oldstate == state) {
2500 				umtx_key_release(&uq->uq_key);
2501 				return (0);
2502 			}
2503 			state = oldstate;
2504 			error = umtxq_check_susp(td);
2505 			if (error != 0)
2506 				break;
2507 		}
2508 
2509 		if (error) {
2510 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2511 			    blocked_readers != 0) {
2512 				umtxq_lock(&uq->uq_key);
2513 				umtxq_busy(&uq->uq_key);
2514 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2515 				umtxq_unbusy(&uq->uq_key);
2516 				umtxq_unlock(&uq->uq_key);
2517 			}
2518 
2519 			break;
2520 		}
2521 
2522 		/* grab monitor lock */
2523 		umtxq_lock(&uq->uq_key);
2524 		umtxq_busy(&uq->uq_key);
2525 		umtxq_unlock(&uq->uq_key);
2526 
2527 		/*
2528 		 * re-read the state, in case it changed between the try-lock above
2529 		 * and the check below
2530 		 */
2531 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2532 
2533 		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2534 		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2535 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2536 			if (oldstate == -1) {
2537 				error = EFAULT;
2538 				break;
2539 			}
2540 			if (oldstate == state)
2541 				goto sleep;
2542 			state = oldstate;
2543 			error = umtxq_check_susp(td);
2544 			if (error != 0)
2545 				break;
2546 		}
2547 		if (error != 0) {
2548 			umtxq_lock(&uq->uq_key);
2549 			umtxq_unbusy(&uq->uq_key);
2550 			umtxq_unlock(&uq->uq_key);
2551 			break;
2552 		}
2553 
2554 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2555 			umtxq_lock(&uq->uq_key);
2556 			umtxq_unbusy(&uq->uq_key);
2557 			umtxq_unlock(&uq->uq_key);
2558 			error = umtxq_check_susp(td);
2559 			if (error != 0)
2560 				break;
2561 			continue;
2562 		}
2563 sleep:
2564 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2565 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2566 
2567 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2568 			umtxq_lock(&uq->uq_key);
2569 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2570 			umtxq_unbusy(&uq->uq_key);
2571 
2572 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2573 			    NULL : &timo);
2574 
2575 			umtxq_busy(&uq->uq_key);
2576 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2577 			umtxq_unlock(&uq->uq_key);
2578 			if (error)
2579 				break;
2580 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2581 		}
2582 
2583 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2584 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2585 		if (blocked_writers == 1) {
2586 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2587 			for (;;) {
2588 				oldstate = casuword32(&rwlock->rw_state, state,
2589 					 state & ~URWLOCK_WRITE_WAITERS);
2590 				if (oldstate == -1) {
2591 					error = EFAULT;
2592 					break;
2593 				}
2594 				if (oldstate == state)
2595 					break;
2596 				state = oldstate;
2597 				error = umtxq_check_susp(td);
2598 				/*
2599 				 * We are leaving the URWLOCK_WRITE_WAITERS
2600 				 * flag behind, but this should not harm
2601 				 * correctness.
2602 				 */
2603 				if (error != 0)
2604 					break;
2605 			}
2606 			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2607 		} else
2608 			blocked_readers = 0;
2609 
2610 		umtxq_lock(&uq->uq_key);
2611 		umtxq_unbusy(&uq->uq_key);
2612 		umtxq_unlock(&uq->uq_key);
2613 	}
2614 
2615 	umtx_key_release(&uq->uq_key);
2616 	if (error == ERESTART)
2617 		error = EINTR;
2618 	return (error);
2619 }
2620 
2621 static int
2622 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2623 {
2624 	struct umtx_q *uq;
2625 	uint32_t flags;
2626 	int32_t state, oldstate;
2627 	int error, q, count;
2628 
2629 	uq = td->td_umtxq;
2630 	flags = fuword32(&rwlock->rw_flags);
2631 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2632 	if (error != 0)
2633 		return (error);
2634 
2635 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2636 	if (state & URWLOCK_WRITE_OWNER) {
2637 		for (;;) {
2638 			oldstate = casuword32(&rwlock->rw_state, state,
2639 				state & ~URWLOCK_WRITE_OWNER);
2640 			if (oldstate == -1) {
2641 				error = EFAULT;
2642 				goto out;
2643 			}
2644 			if (oldstate != state) {
2645 				state = oldstate;
2646 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2647 					error = EPERM;
2648 					goto out;
2649 				}
2650 				error = umtxq_check_susp(td);
2651 				if (error != 0)
2652 					goto out;
2653 			} else
2654 				break;
2655 		}
2656 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2657 		for (;;) {
2658 			oldstate = casuword32(&rwlock->rw_state, state,
2659 				state - 1);
2660 			if (oldstate == -1) {
2661 				error = EFAULT;
2662 				goto out;
2663 			}
2664 			if (oldstate != state) {
2665 				state = oldstate;
2666 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2667 					error = EPERM;
2668 					goto out;
2669 				}
2670 				error = umtxq_check_susp(td);
2671 				if (error != 0)
2672 					goto out;
2673 			} else
2674 				break;
2675 		}
2676 	} else {
2677 		error = EPERM;
2678 		goto out;
2679 	}
2680 
2681 	count = 0;
2682 
2683 	if (!(flags & URWLOCK_PREFER_READER)) {
2684 		if (state & URWLOCK_WRITE_WAITERS) {
2685 			count = 1;
2686 			q = UMTX_EXCLUSIVE_QUEUE;
2687 		} else if (state & URWLOCK_READ_WAITERS) {
2688 			count = INT_MAX;
2689 			q = UMTX_SHARED_QUEUE;
2690 		}
2691 	} else {
2692 		if (state & URWLOCK_READ_WAITERS) {
2693 			count = INT_MAX;
2694 			q = UMTX_SHARED_QUEUE;
2695 		} else if (state & URWLOCK_WRITE_WAITERS) {
2696 			count = 1;
2697 			q = UMTX_EXCLUSIVE_QUEUE;
2698 		}
2699 	}
2700 
2701 	if (count) {
2702 		umtxq_lock(&uq->uq_key);
2703 		umtxq_busy(&uq->uq_key);
2704 		umtxq_signal_queue(&uq->uq_key, count, q);
2705 		umtxq_unbusy(&uq->uq_key);
2706 		umtxq_unlock(&uq->uq_key);
2707 	}
2708 out:
2709 	umtx_key_release(&uq->uq_key);
2710 	return (error);
2711 }
2712 
2713 static int
2714 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2715 {
2716 	struct abs_timeout timo;
2717 	struct umtx_q *uq;
2718 	uint32_t flags, count;
2719 	int error;
2720 
2721 	uq = td->td_umtxq;
2722 	flags = fuword32(&sem->_flags);
2723 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2724 	if (error != 0)
2725 		return (error);
2726 
2727 	if (timeout != NULL)
2728 		abs_timeout_init2(&timo, timeout);
2729 
2730 	umtxq_lock(&uq->uq_key);
2731 	umtxq_busy(&uq->uq_key);
2732 	umtxq_insert(uq);
2733 	umtxq_unlock(&uq->uq_key);
2734 	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2735 	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2736 	if (count != 0) {
2737 		umtxq_lock(&uq->uq_key);
2738 		umtxq_unbusy(&uq->uq_key);
2739 		umtxq_remove(uq);
2740 		umtxq_unlock(&uq->uq_key);
2741 		umtx_key_release(&uq->uq_key);
2742 		return (0);
2743 	}
2744 	umtxq_lock(&uq->uq_key);
2745 	umtxq_unbusy(&uq->uq_key);
2746 
2747 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2748 
2749 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2750 		error = 0;
2751 	else {
2752 		umtxq_remove(uq);
2753 		/* A relative timeout cannot be restarted. */
2754 		if (error == ERESTART && timeout != NULL &&
2755 		    (timeout->_flags & UMTX_ABSTIME) == 0)
2756 			error = EINTR;
2757 	}
2758 	umtxq_unlock(&uq->uq_key);
2759 	umtx_key_release(&uq->uq_key);
2760 	return (error);
2761 }
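
/*
 * A minimal sketch of the userland side this protocol assumes
 * (illustrative only): grab a token with a CAS on _count and enter the
 * kernel only when the count is exhausted; the kernel re-reads _count
 * after setting _has_waiters above, which closes the race with a
 * concurrent post:
 *
 *	uint32_t c;
 *
 *	while ((c = sem->_count) > 0)
 *		if (atomic_cmpset_acq_32(&sem->_count, c, c - 1))
 *			return (0);
 *	return (_umtx_op(sem, UMTX_OP_SEM_WAIT, 0, NULL, NULL));
 */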
2762 
2763 /*
2764  * Wake up a waiter on a userland semaphore.
2765  */
2766 static int
2767 do_sem_wake(struct thread *td, struct _usem *sem)
2768 {
2769 	struct umtx_key key;
2770 	int error, cnt;
2771 	uint32_t flags;
2772 
2773 	flags = fuword32(&sem->_flags);
2774 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2775 		return (error);
2776 	umtxq_lock(&key);
2777 	umtxq_busy(&key);
2778 	cnt = umtxq_count(&key);
2779 	if (cnt > 0) {
2780 		umtxq_signal(&key, 1);
2781 		/*
2782 		 * The count is greater than zero, which means the memory
2783 		 * is still being referenced by user code, so we can safely
2784 		 * update the _has_waiters flag.
2785 		 */
2786 		if (cnt == 1) {
2787 			umtxq_unlock(&key);
2788 			error = suword32(
2789 			    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2790 			umtxq_lock(&key);
2791 		}
2792 	}
2793 	umtxq_unbusy(&key);
2794 	umtxq_unlock(&key);
2795 	umtx_key_release(&key);
2796 	return (error);
2797 }
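
/*
 * The _has_waiters bookkeeping above exists so that a hypothetical
 * userland post path like this sketch can skip the syscall entirely
 * when no thread is sleeping in the kernel:
 *
 *	atomic_add_rel_32(&sem->_count, 1);
 *	if (sem->_has_waiters)
 *		(void)_umtx_op(sem, UMTX_OP_SEM_WAKE, 0, NULL, NULL);
 */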
2798 
2799 inline int
2800 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
2801 {
2802 	int error;
2803 
2804 	error = copyin(addr, tsp, sizeof(struct timespec));
2805 	if (error == 0) {
2806 		if (tsp->tv_sec < 0 ||
2807 		    tsp->tv_nsec >= 1000000000 ||
2808 		    tsp->tv_nsec < 0)
2809 			error = EINVAL;
2810 	}
2811 	return (error);
2812 }
2813 
2814 static inline int
2815 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
2816 {
2817 	int error;
2818 
2819 	if (size <= sizeof(struct timespec)) {
2820 		tp->_clockid = CLOCK_REALTIME;
2821 		tp->_flags = 0;
2822 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
2823 	} else
2824 		error = copyin(addr, tp, sizeof(struct _umtx_time));
2825 	if (error != 0)
2826 		return (error);
2827 	if (tp->_timeout.tv_sec < 0 ||
2828 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
2829 		return (EINVAL);
2830 	return (0);
2831 }
2832 
2833 static int
2834 __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
2835 {
2836 
2837 	return (EOPNOTSUPP);
2838 }
2839 
2840 static int
2841 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2842 {
2843 	struct _umtx_time timeout, *tm_p;
2844 	int error;
2845 
2846 	if (uap->uaddr2 == NULL)
2847 		tm_p = NULL;
2848 	else {
2849 		error = umtx_copyin_umtx_time(
2850 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2851 		if (error != 0)
2852 			return (error);
2853 		tm_p = &timeout;
2854 	}
2855 	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
2856 }
2857 
2858 static int
2859 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2860 {
2861 	struct _umtx_time timeout, *tm_p;
2862 	int error;
2863 
2864 	if (uap->uaddr2 == NULL)
2865 		tm_p = NULL;
2866 	else {
2867 		error = umtx_copyin_umtx_time(
2868 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2869 		if (error != 0)
2870 			return (error);
2871 		tm_p = &timeout;
2872 	}
2873 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
2874 }
2875 
2876 static int
2877 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
2878 {
2879 	struct _umtx_time *tm_p, timeout;
2880 	int error;
2881 
2882 	if (uap->uaddr2 == NULL)
2883 		tm_p = NULL;
2884 	else {
2885 		error = umtx_copyin_umtx_time(
2886 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2887 		if (error != 0)
2888 			return (error);
2889 		tm_p = &timeout;
2890 	}
2891 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
2892 }
2893 
2894 static int
2895 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2896 {
2897 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
2898 }
2899 
2900 #define BATCH_SIZE	128
2901 static int
2902 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
2903 {
2904 	int count = uap->val;
2905 	void *uaddrs[BATCH_SIZE];
2906 	char **upp = (char **)uap->obj;
2907 	int tocopy;
2908 	int error = 0;
2909 	int i, pos = 0;
2910 
2911 	while (count > 0) {
2912 		tocopy = count;
2913 		if (tocopy > BATCH_SIZE)
2914 			tocopy = BATCH_SIZE;
2915 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
2916 		if (error != 0)
2917 			break;
2918 		for (i = 0; i < tocopy; ++i)
2919 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
2920 		count -= tocopy;
2921 		pos += tocopy;
2922 	}
2923 	return (error);
2924 }
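
/*
 * A hedged usage sketch ("pending" and "npending" are hypothetical
 * names): a thread library can defer wakeups while holding internal
 * locks and later flush them with a single syscall instead of one
 * UMTX_OP_WAKE_PRIVATE call per address:
 *
 *	void *pending[128];
 *	int npending;
 *
 *	(void)_umtx_op(pending, UMTX_OP_NWAKE_PRIVATE, npending,
 *	    NULL, NULL);
 */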
2925 
2926 static int
2927 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
2928 {
2929 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
2930 }
2931 
2932 static int
2933 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2934 {
2935 	struct _umtx_time *tm_p, timeout;
2936 	int error;
2937 
2938 	/* Allow a null timespec (wait forever). */
2939 	if (uap->uaddr2 == NULL)
2940 		tm_p = NULL;
2941 	else {
2942 		error = umtx_copyin_umtx_time(
2943 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2944 		if (error != 0)
2945 			return (error);
2946 		tm_p = &timeout;
2947 	}
2948 	return do_lock_umutex(td, uap->obj, tm_p, 0);
2949 }
2950 
2951 static int
2952 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2953 {
2954 	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
2955 }
2956 
2957 static int
2958 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
2959 {
2960 	struct _umtx_time *tm_p, timeout;
2961 	int error;
2962 
2963 	/* Allow a null timespec (wait forever). */
2964 	if (uap->uaddr2 == NULL)
2965 		tm_p = NULL;
2966 	else {
2967 		error = umtx_copyin_umtx_time(
2968 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2969 		if (error != 0)
2970 			return (error);
2971 		tm_p = &timeout;
2972 	}
2973 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
2974 }
2975 
2976 static int
2977 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
2978 {
2979 	return do_wake_umutex(td, uap->obj);
2980 }
2981 
2982 static int
2983 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2984 {
2985 	return do_unlock_umutex(td, uap->obj);
2986 }
2987 
2988 static int
2989 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2990 {
2991 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
2992 }
2993 
2994 static int
2995 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
2996 {
2997 	struct timespec *ts, timeout;
2998 	int error;
2999 
3000 	/* Allow a null timespec (wait forever). */
3001 	if (uap->uaddr2 == NULL)
3002 		ts = NULL;
3003 	else {
3004 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3005 		if (error != 0)
3006 			return (error);
3007 		ts = &timeout;
3008 	}
3009 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3010 }
3011 
3012 static int
3013 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3014 {
3015 	return do_cv_signal(td, uap->obj);
3016 }
3017 
3018 static int
3019 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3020 {
3021 	return do_cv_broadcast(td, uap->obj);
3022 }
3023 
3024 static int
3025 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3026 {
3027 	struct _umtx_time timeout;
3028 	int error;
3029 
3030 	/* Allow a null timespec (wait forever). */
3031 	if (uap->uaddr2 == NULL) {
3032 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3033 	} else {
3034 		error = umtx_copyin_umtx_time(uap->uaddr2,
3035 		   (size_t)uap->uaddr1, &timeout);
3036 		if (error != 0)
3037 			return (error);
3038 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3039 	}
3040 	return (error);
3041 }
3042 
3043 static int
3044 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3045 {
3046 	struct _umtx_time timeout;
3047 	int error;
3048 
3049 	/* Allow a null timespec (wait forever). */
3050 	if (uap->uaddr2 == NULL) {
3051 		error = do_rw_wrlock(td, uap->obj, 0);
3052 	} else {
3053 		error = umtx_copyin_umtx_time(uap->uaddr2,
3054 		   (size_t)uap->uaddr1, &timeout);
3055 		if (error != 0)
3056 			return (error);
3057 
3058 		error = do_rw_wrlock(td, uap->obj, &timeout);
3059 	}
3060 	return (error);
3061 }
3062 
3063 static int
3064 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3065 {
3066 	return do_rw_unlock(td, uap->obj);
3067 }
3068 
3069 static int
3070 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3071 {
3072 	struct _umtx_time *tm_p, timeout;
3073 	int error;
3074 
3075 	/* Allow a null timespec (wait forever). */
3076 	if (uap->uaddr2 == NULL)
3077 		tm_p = NULL;
3078 	else {
3079 		error = umtx_copyin_umtx_time(
3080 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3081 		if (error != 0)
3082 			return (error);
3083 		tm_p = &timeout;
3084 	}
3085 	return (do_sem_wait(td, uap->obj, tm_p));
3086 }
3087 
3088 static int
3089 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3090 {
3091 	return do_sem_wake(td, uap->obj);
3092 }
3093 
3094 static int
3095 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3096 {
3097 	return do_wake2_umutex(td, uap->obj, uap->val);
3098 }
3099 
3100 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3101 
3102 static _umtx_op_func op_table[] = {
3103 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED0 */
3104 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED1 */
3105 	__umtx_op_wait,			/* UMTX_OP_WAIT */
3106 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3107 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3108 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3109 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3110 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3111 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3112 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3113 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3114 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3115 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3116 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3117 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3118 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3119 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3120 	__umtx_op_wait_umutex,		/* UMTX_OP_MUTEX_WAIT */
3121 	__umtx_op_wake_umutex,		/* UMTX_OP_MUTEX_WAKE */
3122 	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3123 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3124 	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
3125 	__umtx_op_wake2_umutex		/* UMTX_OP_MUTEX_WAKE2 */
3126 };
3127 
3128 int
3129 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3130 {
3131 	if ((unsigned)uap->op < UMTX_OP_MAX)
3132 		return (*op_table[uap->op])(td, uap);
3133 	return (EINVAL);
3134 }
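
/*
 * Dispatch example (illustrative): every operation funnels through the
 * table above, so a bare futex-style wait/wake pair on a private word
 * looks like this, where the wait only sleeps while the word still
 * holds the expected value:
 *
 *	if (_umtx_op(&word, UMTX_OP_WAIT_UINT_PRIVATE, expected,
 *	    NULL, NULL) == -1)
 *		err(1, "wait");
 *	(void)_umtx_op(&word, UMTX_OP_WAKE_PRIVATE, 1, NULL, NULL);
 */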
3135 
3136 #ifdef COMPAT_FREEBSD32
3137 
3138 struct timespec32 {
3139 	int32_t tv_sec;
3140 	int32_t tv_nsec;
3141 };
3142 
3143 struct umtx_time32 {
3144 	struct	timespec32	timeout;
3145 	uint32_t		flags;
3146 	uint32_t		clockid;
3147 };
3148 
3149 static inline int
3150 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3151 {
3152 	struct timespec32 ts32;
3153 	int error;
3154 
3155 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3156 	if (error == 0) {
3157 		if (ts32.tv_sec < 0 ||
3158 		    ts32.tv_nsec >= 1000000000 ||
3159 		    ts32.tv_nsec < 0)
3160 			error = EINVAL;
3161 		else {
3162 			tsp->tv_sec = ts32.tv_sec;
3163 			tsp->tv_nsec = ts32.tv_nsec;
3164 		}
3165 	}
3166 	return (error);
3167 }
3168 
3169 static inline int
3170 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3171 {
3172 	struct umtx_time32 t32;
3173 	int error;
3174 
3175 	t32.clockid = CLOCK_REALTIME;
3176 	t32.flags   = 0;
3177 	if (size <= sizeof(struct timespec32))
3178 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3179 	else
3180 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3181 	if (error != 0)
3182 		return (error);
3183 	if (t32.timeout.tv_sec < 0 ||
3184 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3185 		return (EINVAL);
3186 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3187 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3188 	tp->_flags = t32.flags;
3189 	tp->_clockid = t32.clockid;
3190 	return (0);
3191 }
3192 
3193 static int
3194 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3195 {
3196 	struct _umtx_time *tm_p, timeout;
3197 	int error;
3198 
3199 	if (uap->uaddr2 == NULL)
3200 		tm_p = NULL;
3201 	else {
3202 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3203 			(size_t)uap->uaddr1, &timeout);
3204 		if (error != 0)
3205 			return (error);
3206 		tm_p = &timeout;
3207 	}
3208 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3209 }
3210 
3211 static int
3212 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3213 {
3214 	struct _umtx_time *tm_p, timeout;
3215 	int error;
3216 
3217 	/* Allow a null timespec (wait forever). */
3218 	if (uap->uaddr2 == NULL)
3219 		tm_p = NULL;
3220 	else {
3221 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3222 			    (size_t)uap->uaddr1, &timeout);
3223 		if (error != 0)
3224 			return (error);
3225 		tm_p = &timeout;
3226 	}
3227 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3228 }
3229 
3230 static int
3231 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3232 {
3233 	struct _umtx_time *tm_p, timeout;
3234 	int error;
3235 
3236 	/* Allow a null timespec (wait forever). */
3237 	if (uap->uaddr2 == NULL)
3238 		tm_p = NULL;
3239 	else {
3240 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3241 		    (size_t)uap->uaddr1, &timeout);
3242 		if (error != 0)
3243 			return (error);
3244 		tm_p = &timeout;
3245 	}
3246 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3247 }
3248 
3249 static int
3250 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3251 {
3252 	struct timespec *ts, timeout;
3253 	int error;
3254 
3255 	/* Allow a null timespec (wait forever). */
3256 	if (uap->uaddr2 == NULL)
3257 		ts = NULL;
3258 	else {
3259 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3260 		if (error != 0)
3261 			return (error);
3262 		ts = &timeout;
3263 	}
3264 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3265 }
3266 
3267 static int
3268 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3269 {
3270 	struct _umtx_time timeout;
3271 	int error;
3272 
3273 	/* Allow a null timespec (wait forever). */
3274 	if (uap->uaddr2 == NULL) {
3275 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3276 	} else {
3277 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3278 		    (size_t)uap->uaddr1, &timeout);
3279 		if (error != 0)
3280 			return (error);
3281 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3282 	}
3283 	return (error);
3284 }
3285 
3286 static int
3287 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3288 {
3289 	struct _umtx_time timeout;
3290 	int error;
3291 
3292 	/* Allow a null timespec (wait forever). */
3293 	if (uap->uaddr2 == NULL) {
3294 		error = do_rw_wrlock(td, uap->obj, 0);
3295 	} else {
3296 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3297 		    (size_t)uap->uaddr1, &timeout);
3298 		if (error != 0)
3299 			return (error);
3300 		error = do_rw_wrlock(td, uap->obj, &timeout);
3301 	}
3302 	return (error);
3303 }
3304 
3305 static int
3306 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3307 {
3308 	struct _umtx_time *tm_p, timeout;
3309 	int error;
3310 
3311 	if (uap->uaddr2 == NULL)
3312 		tm_p = NULL;
3313 	else {
3314 		error = umtx_copyin_umtx_time32(
3315 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3316 		if (error != 0)
3317 			return (error);
3318 		tm_p = &timeout;
3319 	}
3320 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3321 }
3322 
3323 static int
3324 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3325 {
3326 	struct _umtx_time *tm_p, timeout;
3327 	int error;
3328 
3329 	/* Allow a null timespec (wait forever). */
3330 	if (uap->uaddr2 == NULL)
3331 		tm_p = NULL;
3332 	else {
3333 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3334 		    (size_t)uap->uaddr1, &timeout);
3335 		if (error != 0)
3336 			return (error);
3337 		tm_p = &timeout;
3338 	}
3339 	return (do_sem_wait(td, uap->obj, tm_p));
3340 }
3341 
3342 static int
3343 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3344 {
3345 	int count = uap->val;
3346 	uint32_t uaddrs[BATCH_SIZE];
3347 	uint32_t *upp = (uint32_t *)uap->obj;
3348 	int tocopy;
3349 	int error = 0;
3350 	int i, pos = 0;
3351 
3352 	while (count > 0) {
3353 		tocopy = count;
3354 		if (tocopy > BATCH_SIZE)
3355 			tocopy = BATCH_SIZE;
3356 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3357 		if (error != 0)
3358 			break;
3359 		for (i = 0; i < tocopy; ++i)
3360 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3361 				INT_MAX, 1);
3362 		count -= tocopy;
3363 		pos += tocopy;
3364 	}
3365 	return (error);
3366 }
3367 
3368 static _umtx_op_func op_table_compat32[] = {
3369 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED0 */
3370 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED1 */
3371 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3372 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3373 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3374 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3375 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3376 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3377 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT*/
3378 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3379 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3380 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3381 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3382 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3383 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3384 	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3385 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3386 	__umtx_op_wait_umutex_compat32, /* UMTX_OP_MUTEX_WAIT */
3387 	__umtx_op_wake_umutex,		/* UMTX_OP_MUTEX_WAKE */
3388 	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3389 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3390 	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
3391 	__umtx_op_wake2_umutex		/* UMTX_OP_MUTEX_WAKE2 */
3392 };
3393 
3394 int
3395 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3396 {
3397 	if ((unsigned)uap->op < UMTX_OP_MAX)
3398 		return (*op_table_compat32[uap->op])(td,
3399 			(struct _umtx_op_args *)uap);
3400 	return (EINVAL);
3401 }
3402 #endif
3403 
3404 void
3405 umtx_thread_init(struct thread *td)
3406 {
3407 	td->td_umtxq = umtxq_alloc();
3408 	td->td_umtxq->uq_thread = td;
3409 }
3410 
3411 void
3412 umtx_thread_fini(struct thread *td)
3413 {
3414 	umtxq_free(td->td_umtxq);
3415 }
3416 
3417 /*
3418  * Called when a new thread is created, e.g. by fork().
3419  */
3420 void
3421 umtx_thread_alloc(struct thread *td)
3422 {
3423 	struct umtx_q *uq;
3424 
3425 	uq = td->td_umtxq;
3426 	uq->uq_inherited_pri = PRI_MAX;
3427 
3428 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3429 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3430 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3431 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3432 }
3433 
3434 /*
3435  * exec() hook.
3436  */
3437 static void
3438 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3439 	struct image_params *imgp __unused)
3440 {
3441 	umtx_thread_cleanup(curthread);
3442 }
3443 
3444 /*
3445  * thread_exit() hook.
3446  */
3447 void
3448 umtx_thread_exit(struct thread *td)
3449 {
3450 	umtx_thread_cleanup(td);
3451 }
3452 
3453 /*
3454  * Clean up the per-thread umtx state.
3455  */
3456 static void
3457 umtx_thread_cleanup(struct thread *td)
3458 {
3459 	struct umtx_q *uq;
3460 	struct umtx_pi *pi;
3461 
3462 	if ((uq = td->td_umtxq) == NULL)
3463 		return;
3464 
3465 	mtx_lock_spin(&umtx_lock);
3466 	uq->uq_inherited_pri = PRI_MAX;
3467 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3468 		pi->pi_owner = NULL;
3469 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3470 	}
3471 	mtx_unlock_spin(&umtx_lock);
3472 	thread_lock(td);
3473 	sched_lend_user_prio(td, PRI_MAX);
3474 	thread_unlock(td);
3475 }
3476