xref: /freebsd/sys/kern/kern_umtx.c (revision ae77177087c655fc883075af4f425b37e032cd05)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include "opt_umtx_profiling.h"
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/priv.h>
41 #include <sys/proc.h>
42 #include <sys/sched.h>
43 #include <sys/smp.h>
44 #include <sys/sysctl.h>
45 #include <sys/sysent.h>
46 #include <sys/systm.h>
47 #include <sys/sysproto.h>
48 #include <sys/syscallsubr.h>
49 #include <sys/eventhandler.h>
50 #include <sys/umtx.h>
51 
52 #include <vm/vm.h>
53 #include <vm/vm_param.h>
54 #include <vm/pmap.h>
55 #include <vm/vm_map.h>
56 #include <vm/vm_object.h>
57 
58 #include <machine/cpu.h>
59 
60 #ifdef COMPAT_FREEBSD32
61 #include <compat/freebsd32/freebsd32_proto.h>
62 #endif
63 
64 #define _UMUTEX_TRY		1
65 #define _UMUTEX_WAIT		2
66 
67 /* Priority inheritance mutex info. */
68 struct umtx_pi {
69 	/* Owner thread */
70 	struct thread		*pi_owner;
71 
72 	/* Reference count */
73 	int			pi_refcount;
74 
75  	/* List entry to link umtx holding by thread */
76 	TAILQ_ENTRY(umtx_pi)	pi_link;
77 
78 	/* List entry in hash */
79 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
80 
81 	/* List for waiters */
82 	TAILQ_HEAD(,umtx_q)	pi_blocked;
83 
84 	/* Identify a userland lock object */
85 	struct umtx_key		pi_key;
86 };
87 
88 /* A userland synchronous object user. */
89 struct umtx_q {
90 	/* Linked list for the hash. */
91 	TAILQ_ENTRY(umtx_q)	uq_link;
92 
93 	/* Umtx key. */
94 	struct umtx_key		uq_key;
95 
96 	/* Umtx flags. */
97 	int			uq_flags;
98 #define UQF_UMTXQ	0x0001
99 
100 	/* The thread waits on. */
101 	struct thread		*uq_thread;
102 
103 	/*
104 	 * Blocked on PI mutex. read can use chain lock
105 	 * or umtx_lock, write must have both chain lock and
106 	 * umtx_lock being hold.
107 	 */
108 	struct umtx_pi		*uq_pi_blocked;
109 
110 	/* On blocked list */
111 	TAILQ_ENTRY(umtx_q)	uq_lockq;
112 
113 	/* Thread contending with us */
114 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
115 
116 	/* Inherited priority from PP mutex */
117 	u_char			uq_inherited_pri;
118 
119 	/* Spare queue ready to be reused */
120 	struct umtxq_queue	*uq_spare_queue;
121 
122 	/* The queue we on */
123 	struct umtxq_queue	*uq_cur_queue;
124 };
125 
126 TAILQ_HEAD(umtxq_head, umtx_q);
127 
128 /* Per-key wait-queue */
129 struct umtxq_queue {
130 	struct umtxq_head	head;
131 	struct umtx_key		key;
132 	LIST_ENTRY(umtxq_queue)	link;
133 	int			length;
134 };
135 
136 LIST_HEAD(umtxq_list, umtxq_queue);
137 
138 /* Userland lock object's wait-queue chain */
139 struct umtxq_chain {
140 	/* Lock for this chain. */
141 	struct mtx		uc_lock;
142 
143 	/* List of sleep queues. */
144 	struct umtxq_list	uc_queue[2];
145 #define UMTX_SHARED_QUEUE	0
146 #define UMTX_EXCLUSIVE_QUEUE	1
147 
148 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
149 
150 	/* Busy flag */
151 	char			uc_busy;
152 
153 	/* Chain lock waiters */
154 	int			uc_waiters;
155 
156 	/* All PI in the list */
157 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
158 
159 #ifdef UMTX_PROFILING
160 	int 			length;
161 	int			max_length;
162 #endif
163 };
164 
165 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
166 #define	UMTXQ_BUSY_ASSERT(uc)	KASSERT(&(uc)->uc_busy, ("umtx chain is not busy"))
167 
168 /*
169  * Don't propagate time-sharing priority, there is a security reason,
170  * a user can simply introduce PI-mutex, let thread A lock the mutex,
171  * and let another thread B block on the mutex, because B is
172  * sleeping, its priority will be boosted, this causes A's priority to
173  * be boosted via priority propagating too and will never be lowered even
174  * if it is using 100%CPU, this is unfair to other processes.
175  */
176 
177 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
178 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
179 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
180 
181 #define	GOLDEN_RATIO_PRIME	2654404609U
182 #define	UMTX_CHAINS		512
183 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
184 
185 #define	GET_SHARE(flags)	\
186     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
187 
188 #define BUSY_SPINS		200
189 
190 static uma_zone_t		umtx_pi_zone;
191 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
192 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
193 static int			umtx_pi_allocated;
194 
195 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
196 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
197     &umtx_pi_allocated, 0, "Allocated umtx_pi");
198 
199 #ifdef UMTX_PROFILING
200 static long max_length;
201 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
202 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
203 #endif
204 
205 static void umtxq_sysinit(void *);
206 static void umtxq_hash(struct umtx_key *key);
207 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
208 static void umtxq_lock(struct umtx_key *key);
209 static void umtxq_unlock(struct umtx_key *key);
210 static void umtxq_busy(struct umtx_key *key);
211 static void umtxq_unbusy(struct umtx_key *key);
212 static void umtxq_insert_queue(struct umtx_q *uq, int q);
213 static void umtxq_remove_queue(struct umtx_q *uq, int q);
214 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
215 static int umtxq_count(struct umtx_key *key);
216 static struct umtx_pi *umtx_pi_alloc(int);
217 static void umtx_pi_free(struct umtx_pi *pi);
218 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
219 static void umtx_thread_cleanup(struct thread *td);
220 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
221 	struct image_params *imgp __unused);
222 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
223 
224 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
225 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
226 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
227 
228 static struct mtx umtx_lock;
229 
230 #ifdef UMTX_PROFILING
231 static void
232 umtx_init_profiling(void)
233 {
234 	struct sysctl_oid *chain_oid;
235 	char chain_name[10];
236 	int i;
237 
238 	for (i = 0; i < UMTX_CHAINS; ++i) {
239 		snprintf(chain_name, sizeof(chain_name), "%d", i);
240 		chain_oid = SYSCTL_ADD_NODE(NULL,
241 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
242 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
243 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
244 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
245 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
246 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
247 	}
248 }
249 #endif
250 
251 static void
252 umtxq_sysinit(void *arg __unused)
253 {
254 	int i, j;
255 
256 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
257 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
258 	for (i = 0; i < 2; ++i) {
259 		for (j = 0; j < UMTX_CHAINS; ++j) {
260 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
261 				 MTX_DEF | MTX_DUPOK);
262 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
263 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
264 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
265 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
266 			umtxq_chains[i][j].uc_busy = 0;
267 			umtxq_chains[i][j].uc_waiters = 0;
268 			#ifdef UMTX_PROFILING
269 			umtxq_chains[i][j].length = 0;
270 			umtxq_chains[i][j].max_length = 0;
271 			#endif
272 		}
273 	}
274 	#ifdef UMTX_PROFILING
275 	umtx_init_profiling();
276 	#endif
277 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
278 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
279 	    EVENTHANDLER_PRI_ANY);
280 }
281 
282 struct umtx_q *
283 umtxq_alloc(void)
284 {
285 	struct umtx_q *uq;
286 
287 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
288 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
289 	TAILQ_INIT(&uq->uq_spare_queue->head);
290 	TAILQ_INIT(&uq->uq_pi_contested);
291 	uq->uq_inherited_pri = PRI_MAX;
292 	return (uq);
293 }
294 
295 void
296 umtxq_free(struct umtx_q *uq)
297 {
298 	MPASS(uq->uq_spare_queue != NULL);
299 	free(uq->uq_spare_queue, M_UMTX);
300 	free(uq, M_UMTX);
301 }
302 
303 static inline void
304 umtxq_hash(struct umtx_key *key)
305 {
306 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
307 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
308 }
309 
310 static inline struct umtxq_chain *
311 umtxq_getchain(struct umtx_key *key)
312 {
313 	if (key->type <= TYPE_SEM)
314 		return (&umtxq_chains[1][key->hash]);
315 	return (&umtxq_chains[0][key->hash]);
316 }
317 
318 /*
319  * Lock a chain.
320  */
321 static inline void
322 umtxq_lock(struct umtx_key *key)
323 {
324 	struct umtxq_chain *uc;
325 
326 	uc = umtxq_getchain(key);
327 	mtx_lock(&uc->uc_lock);
328 }
329 
330 /*
331  * Unlock a chain.
332  */
333 static inline void
334 umtxq_unlock(struct umtx_key *key)
335 {
336 	struct umtxq_chain *uc;
337 
338 	uc = umtxq_getchain(key);
339 	mtx_unlock(&uc->uc_lock);
340 }
341 
342 /*
343  * Set chain to busy state when following operation
344  * may be blocked (kernel mutex can not be used).
345  */
346 static inline void
347 umtxq_busy(struct umtx_key *key)
348 {
349 	struct umtxq_chain *uc;
350 
351 	uc = umtxq_getchain(key);
352 	mtx_assert(&uc->uc_lock, MA_OWNED);
353 	if (uc->uc_busy) {
354 #ifdef SMP
355 		if (smp_cpus > 1) {
356 			int count = BUSY_SPINS;
357 			if (count > 0) {
358 				umtxq_unlock(key);
359 				while (uc->uc_busy && --count > 0)
360 					cpu_spinwait();
361 				umtxq_lock(key);
362 			}
363 		}
364 #endif
365 		while (uc->uc_busy) {
366 			uc->uc_waiters++;
367 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
368 			uc->uc_waiters--;
369 		}
370 	}
371 	uc->uc_busy = 1;
372 }
373 
374 /*
375  * Unbusy a chain.
376  */
377 static inline void
378 umtxq_unbusy(struct umtx_key *key)
379 {
380 	struct umtxq_chain *uc;
381 
382 	uc = umtxq_getchain(key);
383 	mtx_assert(&uc->uc_lock, MA_OWNED);
384 	KASSERT(uc->uc_busy != 0, ("not busy"));
385 	uc->uc_busy = 0;
386 	if (uc->uc_waiters)
387 		wakeup_one(uc);
388 }
389 
390 static struct umtxq_queue *
391 umtxq_queue_lookup(struct umtx_key *key, int q)
392 {
393 	struct umtxq_queue *uh;
394 	struct umtxq_chain *uc;
395 
396 	uc = umtxq_getchain(key);
397 	UMTXQ_LOCKED_ASSERT(uc);
398 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
399 		if (umtx_key_match(&uh->key, key))
400 			return (uh);
401 	}
402 
403 	return (NULL);
404 }
405 
406 static inline void
407 umtxq_insert_queue(struct umtx_q *uq, int q)
408 {
409 	struct umtxq_queue *uh;
410 	struct umtxq_chain *uc;
411 
412 	uc = umtxq_getchain(&uq->uq_key);
413 	UMTXQ_LOCKED_ASSERT(uc);
414 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
415 	uh = umtxq_queue_lookup(&uq->uq_key, q);
416 	if (uh != NULL) {
417 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
418 	} else {
419 		uh = uq->uq_spare_queue;
420 		uh->key = uq->uq_key;
421 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
422 	}
423 	uq->uq_spare_queue = NULL;
424 
425 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
426 	uh->length++;
427 	#ifdef UMTX_PROFILING
428 	uc->length++;
429 	if (uc->length > uc->max_length) {
430 		uc->max_length = uc->length;
431 		if (uc->max_length > max_length)
432 			max_length = uc->max_length;
433 	}
434 	#endif
435 	uq->uq_flags |= UQF_UMTXQ;
436 	uq->uq_cur_queue = uh;
437 	return;
438 }
439 
440 static inline void
441 umtxq_remove_queue(struct umtx_q *uq, int q)
442 {
443 	struct umtxq_chain *uc;
444 	struct umtxq_queue *uh;
445 
446 	uc = umtxq_getchain(&uq->uq_key);
447 	UMTXQ_LOCKED_ASSERT(uc);
448 	if (uq->uq_flags & UQF_UMTXQ) {
449 		uh = uq->uq_cur_queue;
450 		TAILQ_REMOVE(&uh->head, uq, uq_link);
451 		uh->length--;
452 		#ifdef UMTX_PROFILING
453 		uc->length--;
454 		#endif
455 		uq->uq_flags &= ~UQF_UMTXQ;
456 		if (TAILQ_EMPTY(&uh->head)) {
457 			KASSERT(uh->length == 0,
458 			    ("inconsistent umtxq_queue length"));
459 			LIST_REMOVE(uh, link);
460 		} else {
461 			uh = LIST_FIRST(&uc->uc_spare_queue);
462 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
463 			LIST_REMOVE(uh, link);
464 		}
465 		uq->uq_spare_queue = uh;
466 		uq->uq_cur_queue = NULL;
467 	}
468 }
469 
470 /*
471  * Check if there are multiple waiters
472  */
473 static int
474 umtxq_count(struct umtx_key *key)
475 {
476 	struct umtxq_chain *uc;
477 	struct umtxq_queue *uh;
478 
479 	uc = umtxq_getchain(key);
480 	UMTXQ_LOCKED_ASSERT(uc);
481 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
482 	if (uh != NULL)
483 		return (uh->length);
484 	return (0);
485 }
486 
487 /*
488  * Check if there are multiple PI waiters and returns first
489  * waiter.
490  */
491 static int
492 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
493 {
494 	struct umtxq_chain *uc;
495 	struct umtxq_queue *uh;
496 
497 	*first = NULL;
498 	uc = umtxq_getchain(key);
499 	UMTXQ_LOCKED_ASSERT(uc);
500 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
501 	if (uh != NULL) {
502 		*first = TAILQ_FIRST(&uh->head);
503 		return (uh->length);
504 	}
505 	return (0);
506 }
507 
508 /*
509  * Wake up threads waiting on an userland object.
510  */
511 
512 static int
513 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
514 {
515 	struct umtxq_chain *uc;
516 	struct umtxq_queue *uh;
517 	struct umtx_q *uq;
518 	int ret;
519 
520 	ret = 0;
521 	uc = umtxq_getchain(key);
522 	UMTXQ_LOCKED_ASSERT(uc);
523 	uh = umtxq_queue_lookup(key, q);
524 	if (uh != NULL) {
525 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
526 			umtxq_remove_queue(uq, q);
527 			wakeup(uq);
528 			if (++ret >= n_wake)
529 				return (ret);
530 		}
531 	}
532 	return (ret);
533 }
534 
535 
536 /*
537  * Wake up specified thread.
538  */
539 static inline void
540 umtxq_signal_thread(struct umtx_q *uq)
541 {
542 	struct umtxq_chain *uc;
543 
544 	uc = umtxq_getchain(&uq->uq_key);
545 	UMTXQ_LOCKED_ASSERT(uc);
546 	umtxq_remove(uq);
547 	wakeup(uq);
548 }
549 
550 /*
551  * Put thread into sleep state, before sleeping, check if
552  * thread was removed from umtx queue.
553  */
554 static inline int
555 umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
556 {
557 	struct umtxq_chain *uc;
558 	int error;
559 
560 	uc = umtxq_getchain(&uq->uq_key);
561 	UMTXQ_LOCKED_ASSERT(uc);
562 	if (!(uq->uq_flags & UQF_UMTXQ))
563 		return (0);
564 	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
565 	if (error == EWOULDBLOCK)
566 		error = ETIMEDOUT;
567 	return (error);
568 }
569 
570 /*
571  * Convert userspace address into unique logical address.
572  */
573 int
574 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
575 {
576 	struct thread *td = curthread;
577 	vm_map_t map;
578 	vm_map_entry_t entry;
579 	vm_pindex_t pindex;
580 	vm_prot_t prot;
581 	boolean_t wired;
582 
583 	key->type = type;
584 	if (share == THREAD_SHARE) {
585 		key->shared = 0;
586 		key->info.private.vs = td->td_proc->p_vmspace;
587 		key->info.private.addr = (uintptr_t)addr;
588 	} else {
589 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
590 		map = &td->td_proc->p_vmspace->vm_map;
591 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
592 		    &entry, &key->info.shared.object, &pindex, &prot,
593 		    &wired) != KERN_SUCCESS) {
594 			return EFAULT;
595 		}
596 
597 		if ((share == PROCESS_SHARE) ||
598 		    (share == AUTO_SHARE &&
599 		     VM_INHERIT_SHARE == entry->inheritance)) {
600 			key->shared = 1;
601 			key->info.shared.offset = entry->offset + entry->start -
602 				(vm_offset_t)addr;
603 			vm_object_reference(key->info.shared.object);
604 		} else {
605 			key->shared = 0;
606 			key->info.private.vs = td->td_proc->p_vmspace;
607 			key->info.private.addr = (uintptr_t)addr;
608 		}
609 		vm_map_lookup_done(map, entry);
610 	}
611 
612 	umtxq_hash(key);
613 	return (0);
614 }
615 
616 /*
617  * Release key.
618  */
619 void
620 umtx_key_release(struct umtx_key *key)
621 {
622 	if (key->shared)
623 		vm_object_deallocate(key->info.shared.object);
624 }
625 
626 /*
627  * Lock a umtx object.
628  */
629 static int
630 _do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
631 {
632 	struct umtx_q *uq;
633 	u_long owner;
634 	u_long old;
635 	int error = 0;
636 
637 	uq = td->td_umtxq;
638 
639 	/*
640 	 * Care must be exercised when dealing with umtx structure. It
641 	 * can fault on any access.
642 	 */
643 	for (;;) {
644 		/*
645 		 * Try the uncontested case.  This should be done in userland.
646 		 */
647 		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
648 
649 		/* The acquire succeeded. */
650 		if (owner == UMTX_UNOWNED)
651 			return (0);
652 
653 		/* The address was invalid. */
654 		if (owner == -1)
655 			return (EFAULT);
656 
657 		/* If no one owns it but it is contested try to acquire it. */
658 		if (owner == UMTX_CONTESTED) {
659 			owner = casuword(&umtx->u_owner,
660 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
661 
662 			if (owner == UMTX_CONTESTED)
663 				return (0);
664 
665 			/* The address was invalid. */
666 			if (owner == -1)
667 				return (EFAULT);
668 
669 			/* If this failed the lock has changed, restart. */
670 			continue;
671 		}
672 
673 		/*
674 		 * If we caught a signal, we have retried and now
675 		 * exit immediately.
676 		 */
677 		if (error != 0)
678 			return (error);
679 
680 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
681 			AUTO_SHARE, &uq->uq_key)) != 0)
682 			return (error);
683 
684 		umtxq_lock(&uq->uq_key);
685 		umtxq_busy(&uq->uq_key);
686 		umtxq_insert(uq);
687 		umtxq_unbusy(&uq->uq_key);
688 		umtxq_unlock(&uq->uq_key);
689 
690 		/*
691 		 * Set the contested bit so that a release in user space
692 		 * knows to use the system call for unlock.  If this fails
693 		 * either some one else has acquired the lock or it has been
694 		 * released.
695 		 */
696 		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
697 
698 		/* The address was invalid. */
699 		if (old == -1) {
700 			umtxq_lock(&uq->uq_key);
701 			umtxq_remove(uq);
702 			umtxq_unlock(&uq->uq_key);
703 			umtx_key_release(&uq->uq_key);
704 			return (EFAULT);
705 		}
706 
707 		/*
708 		 * We set the contested bit, sleep. Otherwise the lock changed
709 		 * and we need to retry or we lost a race to the thread
710 		 * unlocking the umtx.
711 		 */
712 		umtxq_lock(&uq->uq_key);
713 		if (old == owner)
714 			error = umtxq_sleep(uq, "umtx", timo);
715 		umtxq_remove(uq);
716 		umtxq_unlock(&uq->uq_key);
717 		umtx_key_release(&uq->uq_key);
718 	}
719 
720 	return (0);
721 }
722 
723 /*
724  * Lock a umtx object.
725  */
726 static int
727 do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
728 	struct timespec *timeout)
729 {
730 	struct timespec ts, ts2, ts3;
731 	struct timeval tv;
732 	int error;
733 
734 	if (timeout == NULL) {
735 		error = _do_lock_umtx(td, umtx, id, 0);
736 		/* Mutex locking is restarted if it is interrupted. */
737 		if (error == EINTR)
738 			error = ERESTART;
739 	} else {
740 		getnanouptime(&ts);
741 		timespecadd(&ts, timeout);
742 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
743 		for (;;) {
744 			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
745 			if (error != ETIMEDOUT)
746 				break;
747 			getnanouptime(&ts2);
748 			if (timespeccmp(&ts2, &ts, >=)) {
749 				error = ETIMEDOUT;
750 				break;
751 			}
752 			ts3 = ts;
753 			timespecsub(&ts3, &ts2);
754 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
755 		}
756 		/* Timed-locking is not restarted. */
757 		if (error == ERESTART)
758 			error = EINTR;
759 	}
760 	return (error);
761 }
762 
763 /*
764  * Unlock a umtx object.
765  */
766 static int
767 do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
768 {
769 	struct umtx_key key;
770 	u_long owner;
771 	u_long old;
772 	int error;
773 	int count;
774 
775 	/*
776 	 * Make sure we own this mtx.
777 	 */
778 	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
779 	if (owner == -1)
780 		return (EFAULT);
781 
782 	if ((owner & ~UMTX_CONTESTED) != id)
783 		return (EPERM);
784 
785 	/* This should be done in userland */
786 	if ((owner & UMTX_CONTESTED) == 0) {
787 		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
788 		if (old == -1)
789 			return (EFAULT);
790 		if (old == owner)
791 			return (0);
792 		owner = old;
793 	}
794 
795 	/* We should only ever be in here for contested locks */
796 	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
797 		&key)) != 0)
798 		return (error);
799 
800 	umtxq_lock(&key);
801 	umtxq_busy(&key);
802 	count = umtxq_count(&key);
803 	umtxq_unlock(&key);
804 
805 	/*
806 	 * When unlocking the umtx, it must be marked as unowned if
807 	 * there is zero or one thread only waiting for it.
808 	 * Otherwise, it must be marked as contested.
809 	 */
810 	old = casuword(&umtx->u_owner, owner,
811 		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
812 	umtxq_lock(&key);
813 	umtxq_signal(&key,1);
814 	umtxq_unbusy(&key);
815 	umtxq_unlock(&key);
816 	umtx_key_release(&key);
817 	if (old == -1)
818 		return (EFAULT);
819 	if (old != owner)
820 		return (EINVAL);
821 	return (0);
822 }
823 
824 #ifdef COMPAT_FREEBSD32
825 
826 /*
827  * Lock a umtx object.
828  */
829 static int
830 _do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
831 {
832 	struct umtx_q *uq;
833 	uint32_t owner;
834 	uint32_t old;
835 	int error = 0;
836 
837 	uq = td->td_umtxq;
838 
839 	/*
840 	 * Care must be exercised when dealing with umtx structure. It
841 	 * can fault on any access.
842 	 */
843 	for (;;) {
844 		/*
845 		 * Try the uncontested case.  This should be done in userland.
846 		 */
847 		owner = casuword32(m, UMUTEX_UNOWNED, id);
848 
849 		/* The acquire succeeded. */
850 		if (owner == UMUTEX_UNOWNED)
851 			return (0);
852 
853 		/* The address was invalid. */
854 		if (owner == -1)
855 			return (EFAULT);
856 
857 		/* If no one owns it but it is contested try to acquire it. */
858 		if (owner == UMUTEX_CONTESTED) {
859 			owner = casuword32(m,
860 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
861 			if (owner == UMUTEX_CONTESTED)
862 				return (0);
863 
864 			/* The address was invalid. */
865 			if (owner == -1)
866 				return (EFAULT);
867 
868 			/* If this failed the lock has changed, restart. */
869 			continue;
870 		}
871 
872 		/*
873 		 * If we caught a signal, we have retried and now
874 		 * exit immediately.
875 		 */
876 		if (error != 0)
877 			return (error);
878 
879 		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
880 			AUTO_SHARE, &uq->uq_key)) != 0)
881 			return (error);
882 
883 		umtxq_lock(&uq->uq_key);
884 		umtxq_busy(&uq->uq_key);
885 		umtxq_insert(uq);
886 		umtxq_unbusy(&uq->uq_key);
887 		umtxq_unlock(&uq->uq_key);
888 
889 		/*
890 		 * Set the contested bit so that a release in user space
891 		 * knows to use the system call for unlock.  If this fails
892 		 * either some one else has acquired the lock or it has been
893 		 * released.
894 		 */
895 		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
896 
897 		/* The address was invalid. */
898 		if (old == -1) {
899 			umtxq_lock(&uq->uq_key);
900 			umtxq_remove(uq);
901 			umtxq_unlock(&uq->uq_key);
902 			umtx_key_release(&uq->uq_key);
903 			return (EFAULT);
904 		}
905 
906 		/*
907 		 * We set the contested bit, sleep. Otherwise the lock changed
908 		 * and we need to retry or we lost a race to the thread
909 		 * unlocking the umtx.
910 		 */
911 		umtxq_lock(&uq->uq_key);
912 		if (old == owner)
913 			error = umtxq_sleep(uq, "umtx", timo);
914 		umtxq_remove(uq);
915 		umtxq_unlock(&uq->uq_key);
916 		umtx_key_release(&uq->uq_key);
917 	}
918 
919 	return (0);
920 }
921 
922 /*
923  * Lock a umtx object.
924  */
925 static int
926 do_lock_umtx32(struct thread *td, void *m, uint32_t id,
927 	struct timespec *timeout)
928 {
929 	struct timespec ts, ts2, ts3;
930 	struct timeval tv;
931 	int error;
932 
933 	if (timeout == NULL) {
934 		error = _do_lock_umtx32(td, m, id, 0);
935 		/* Mutex locking is restarted if it is interrupted. */
936 		if (error == EINTR)
937 			error = ERESTART;
938 	} else {
939 		getnanouptime(&ts);
940 		timespecadd(&ts, timeout);
941 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
942 		for (;;) {
943 			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
944 			if (error != ETIMEDOUT)
945 				break;
946 			getnanouptime(&ts2);
947 			if (timespeccmp(&ts2, &ts, >=)) {
948 				error = ETIMEDOUT;
949 				break;
950 			}
951 			ts3 = ts;
952 			timespecsub(&ts3, &ts2);
953 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
954 		}
955 		/* Timed-locking is not restarted. */
956 		if (error == ERESTART)
957 			error = EINTR;
958 	}
959 	return (error);
960 }
961 
962 /*
963  * Unlock a umtx object.
964  */
965 static int
966 do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
967 {
968 	struct umtx_key key;
969 	uint32_t owner;
970 	uint32_t old;
971 	int error;
972 	int count;
973 
974 	/*
975 	 * Make sure we own this mtx.
976 	 */
977 	owner = fuword32(m);
978 	if (owner == -1)
979 		return (EFAULT);
980 
981 	if ((owner & ~UMUTEX_CONTESTED) != id)
982 		return (EPERM);
983 
984 	/* This should be done in userland */
985 	if ((owner & UMUTEX_CONTESTED) == 0) {
986 		old = casuword32(m, owner, UMUTEX_UNOWNED);
987 		if (old == -1)
988 			return (EFAULT);
989 		if (old == owner)
990 			return (0);
991 		owner = old;
992 	}
993 
994 	/* We should only ever be in here for contested locks */
995 	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
996 		&key)) != 0)
997 		return (error);
998 
999 	umtxq_lock(&key);
1000 	umtxq_busy(&key);
1001 	count = umtxq_count(&key);
1002 	umtxq_unlock(&key);
1003 
1004 	/*
1005 	 * When unlocking the umtx, it must be marked as unowned if
1006 	 * there is zero or one thread only waiting for it.
1007 	 * Otherwise, it must be marked as contested.
1008 	 */
1009 	old = casuword32(m, owner,
1010 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1011 	umtxq_lock(&key);
1012 	umtxq_signal(&key,1);
1013 	umtxq_unbusy(&key);
1014 	umtxq_unlock(&key);
1015 	umtx_key_release(&key);
1016 	if (old == -1)
1017 		return (EFAULT);
1018 	if (old != owner)
1019 		return (EINVAL);
1020 	return (0);
1021 }
1022 #endif
1023 
1024 static inline int
1025 tstohz(const struct timespec *tsp)
1026 {
1027 	struct timeval tv;
1028 
1029 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
1030 	return tvtohz(&tv);
1031 }
1032 
1033 /*
1034  * Fetch and compare value, sleep on the address if value is not changed.
1035  */
1036 static int
1037 do_wait(struct thread *td, void *addr, u_long id,
1038 	struct _umtx_time *timeout, int compat32, int is_private)
1039 {
1040 	struct umtx_q *uq;
1041 	struct timespec ets, cts, tts;
1042 	u_long tmp;
1043 	int error = 0;
1044 
1045 	uq = td->td_umtxq;
1046 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
1047 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
1048 		return (error);
1049 
1050 	umtxq_lock(&uq->uq_key);
1051 	umtxq_insert(uq);
1052 	umtxq_unlock(&uq->uq_key);
1053 	if (compat32 == 0)
1054 		tmp = fuword(addr);
1055         else
1056 		tmp = (unsigned int)fuword32(addr);
1057 	if (tmp != id) {
1058 		umtxq_lock(&uq->uq_key);
1059 		umtxq_remove(uq);
1060 		umtxq_unlock(&uq->uq_key);
1061 	} else if (timeout == NULL) {
1062 		umtxq_lock(&uq->uq_key);
1063 		error = umtxq_sleep(uq, "uwait", 0);
1064 		umtxq_remove(uq);
1065 		umtxq_unlock(&uq->uq_key);
1066 	} else {
1067 		kern_clock_gettime(td, timeout->_clockid, &cts);
1068 		if ((timeout->_flags & UMTX_ABSTIME) == 0) {
1069 			ets = cts;
1070 			timespecadd(&ets, &timeout->_timeout);
1071 		} else {
1072 			ets = timeout->_timeout;
1073 		}
1074 		umtxq_lock(&uq->uq_key);
1075 		for (;;) {
1076 			if (timespeccmp(&cts, &ets, >=)) {
1077 				error = ETIMEDOUT;
1078 				break;
1079 			}
1080 			tts = ets;
1081 			timespecsub(&tts, &cts);
1082 			error = umtxq_sleep(uq, "uwait", tstohz(&tts));
1083 			if (!(uq->uq_flags & UQF_UMTXQ)) {
1084 				error = 0;
1085 				break;
1086 			}
1087 			if (error != ETIMEDOUT)
1088 				break;
1089 			umtxq_unlock(&uq->uq_key);
1090 			kern_clock_gettime(td, timeout->_clockid, &cts);
1091 			umtxq_lock(&uq->uq_key);
1092 		}
1093 		umtxq_remove(uq);
1094 		umtxq_unlock(&uq->uq_key);
1095 	}
1096 	umtx_key_release(&uq->uq_key);
1097 	if (error == ERESTART)
1098 		error = EINTR;
1099 	return (error);
1100 }
1101 
1102 /*
1103  * Wake up threads sleeping on the specified address.
1104  */
1105 int
1106 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1107 {
1108 	struct umtx_key key;
1109 	int ret;
1110 
1111 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1112 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1113 		return (ret);
1114 	umtxq_lock(&key);
1115 	ret = umtxq_signal(&key, n_wake);
1116 	umtxq_unlock(&key);
1117 	umtx_key_release(&key);
1118 	return (0);
1119 }
1120 
1121 /*
1122  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1123  */
1124 static int
1125 _do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1126 	int mode)
1127 {
1128 	struct umtx_q *uq;
1129 	uint32_t owner, old, id;
1130 	int error = 0;
1131 
1132 	id = td->td_tid;
1133 	uq = td->td_umtxq;
1134 
1135 	/*
1136 	 * Care must be exercised when dealing with umtx structure. It
1137 	 * can fault on any access.
1138 	 */
1139 	for (;;) {
1140 		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
1141 		if (mode == _UMUTEX_WAIT) {
1142 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1143 				return (0);
1144 		} else {
1145 			/*
1146 			 * Try the uncontested case.  This should be done in userland.
1147 			 */
1148 			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1149 
1150 			/* The acquire succeeded. */
1151 			if (owner == UMUTEX_UNOWNED)
1152 				return (0);
1153 
1154 			/* The address was invalid. */
1155 			if (owner == -1)
1156 				return (EFAULT);
1157 
1158 			/* If no one owns it but it is contested try to acquire it. */
1159 			if (owner == UMUTEX_CONTESTED) {
1160 				owner = casuword32(&m->m_owner,
1161 				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1162 
1163 				if (owner == UMUTEX_CONTESTED)
1164 					return (0);
1165 
1166 				/* The address was invalid. */
1167 				if (owner == -1)
1168 					return (EFAULT);
1169 
1170 				/* If this failed the lock has changed, restart. */
1171 				continue;
1172 			}
1173 		}
1174 
1175 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1176 		    (owner & ~UMUTEX_CONTESTED) == id)
1177 			return (EDEADLK);
1178 
1179 		if (mode == _UMUTEX_TRY)
1180 			return (EBUSY);
1181 
1182 		/*
1183 		 * If we caught a signal, we have retried and now
1184 		 * exit immediately.
1185 		 */
1186 		if (error != 0)
1187 			return (error);
1188 
1189 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1190 		    GET_SHARE(flags), &uq->uq_key)) != 0)
1191 			return (error);
1192 
1193 		umtxq_lock(&uq->uq_key);
1194 		umtxq_busy(&uq->uq_key);
1195 		umtxq_insert(uq);
1196 		umtxq_unlock(&uq->uq_key);
1197 
1198 		/*
1199 		 * Set the contested bit so that a release in user space
1200 		 * knows to use the system call for unlock.  If this fails
1201 		 * either some one else has acquired the lock or it has been
1202 		 * released.
1203 		 */
1204 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1205 
1206 		/* The address was invalid. */
1207 		if (old == -1) {
1208 			umtxq_lock(&uq->uq_key);
1209 			umtxq_remove(uq);
1210 			umtxq_unbusy(&uq->uq_key);
1211 			umtxq_unlock(&uq->uq_key);
1212 			umtx_key_release(&uq->uq_key);
1213 			return (EFAULT);
1214 		}
1215 
1216 		/*
1217 		 * We set the contested bit, sleep. Otherwise the lock changed
1218 		 * and we need to retry or we lost a race to the thread
1219 		 * unlocking the umtx.
1220 		 */
1221 		umtxq_lock(&uq->uq_key);
1222 		umtxq_unbusy(&uq->uq_key);
1223 		if (old == owner)
1224 			error = umtxq_sleep(uq, "umtxn", timo);
1225 		umtxq_remove(uq);
1226 		umtxq_unlock(&uq->uq_key);
1227 		umtx_key_release(&uq->uq_key);
1228 	}
1229 
1230 	return (0);
1231 }
1232 
1233 /*
1234  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1235  */
1236 /*
1237  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1238  */
1239 static int
1240 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1241 {
1242 	struct umtx_key key;
1243 	uint32_t owner, old, id;
1244 	int error;
1245 	int count;
1246 
1247 	id = td->td_tid;
1248 	/*
1249 	 * Make sure we own this mtx.
1250 	 */
1251 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1252 	if (owner == -1)
1253 		return (EFAULT);
1254 
1255 	if ((owner & ~UMUTEX_CONTESTED) != id)
1256 		return (EPERM);
1257 
1258 	if ((owner & UMUTEX_CONTESTED) == 0) {
1259 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1260 		if (old == -1)
1261 			return (EFAULT);
1262 		if (old == owner)
1263 			return (0);
1264 		owner = old;
1265 	}
1266 
1267 	/* We should only ever be in here for contested locks */
1268 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1269 	    &key)) != 0)
1270 		return (error);
1271 
1272 	umtxq_lock(&key);
1273 	umtxq_busy(&key);
1274 	count = umtxq_count(&key);
1275 	umtxq_unlock(&key);
1276 
1277 	/*
1278 	 * When unlocking the umtx, it must be marked as unowned if
1279 	 * there is zero or one thread only waiting for it.
1280 	 * Otherwise, it must be marked as contested.
1281 	 */
1282 	old = casuword32(&m->m_owner, owner,
1283 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1284 	umtxq_lock(&key);
1285 	umtxq_signal(&key,1);
1286 	umtxq_unbusy(&key);
1287 	umtxq_unlock(&key);
1288 	umtx_key_release(&key);
1289 	if (old == -1)
1290 		return (EFAULT);
1291 	if (old != owner)
1292 		return (EINVAL);
1293 	return (0);
1294 }
1295 
1296 /*
1297  * Check if the mutex is available and wake up a waiter,
1298  * only for simple mutex.
1299  */
1300 static int
1301 do_wake_umutex(struct thread *td, struct umutex *m)
1302 {
1303 	struct umtx_key key;
1304 	uint32_t owner;
1305 	uint32_t flags;
1306 	int error;
1307 	int count;
1308 
1309 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1310 	if (owner == -1)
1311 		return (EFAULT);
1312 
1313 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1314 		return (0);
1315 
1316 	flags = fuword32(&m->m_flags);
1317 
1318 	/* We should only ever be in here for contested locks */
1319 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1320 	    &key)) != 0)
1321 		return (error);
1322 
1323 	umtxq_lock(&key);
1324 	umtxq_busy(&key);
1325 	count = umtxq_count(&key);
1326 	umtxq_unlock(&key);
1327 
1328 	if (count <= 1)
1329 		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1330 
1331 	umtxq_lock(&key);
1332 	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1333 		umtxq_signal(&key, 1);
1334 	umtxq_unbusy(&key);
1335 	umtxq_unlock(&key);
1336 	umtx_key_release(&key);
1337 	return (0);
1338 }
1339 
1340 static inline struct umtx_pi *
1341 umtx_pi_alloc(int flags)
1342 {
1343 	struct umtx_pi *pi;
1344 
1345 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1346 	TAILQ_INIT(&pi->pi_blocked);
1347 	atomic_add_int(&umtx_pi_allocated, 1);
1348 	return (pi);
1349 }
1350 
1351 static inline void
1352 umtx_pi_free(struct umtx_pi *pi)
1353 {
1354 	uma_zfree(umtx_pi_zone, pi);
1355 	atomic_add_int(&umtx_pi_allocated, -1);
1356 }
1357 
1358 /*
1359  * Adjust the thread's position on a pi_state after its priority has been
1360  * changed.
1361  */
1362 static int
1363 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1364 {
1365 	struct umtx_q *uq, *uq1, *uq2;
1366 	struct thread *td1;
1367 
1368 	mtx_assert(&umtx_lock, MA_OWNED);
1369 	if (pi == NULL)
1370 		return (0);
1371 
1372 	uq = td->td_umtxq;
1373 
1374 	/*
1375 	 * Check if the thread needs to be moved on the blocked chain.
1376 	 * It needs to be moved if either its priority is lower than
1377 	 * the previous thread or higher than the next thread.
1378 	 */
1379 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1380 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1381 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1382 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1383 		/*
1384 		 * Remove thread from blocked chain and determine where
1385 		 * it should be moved to.
1386 		 */
1387 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1388 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1389 			td1 = uq1->uq_thread;
1390 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1391 			if (UPRI(td1) > UPRI(td))
1392 				break;
1393 		}
1394 
1395 		if (uq1 == NULL)
1396 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1397 		else
1398 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1399 	}
1400 	return (1);
1401 }
1402 
1403 /*
1404  * Propagate priority when a thread is blocked on POSIX
1405  * PI mutex.
1406  */
1407 static void
1408 umtx_propagate_priority(struct thread *td)
1409 {
1410 	struct umtx_q *uq;
1411 	struct umtx_pi *pi;
1412 	int pri;
1413 
1414 	mtx_assert(&umtx_lock, MA_OWNED);
1415 	pri = UPRI(td);
1416 	uq = td->td_umtxq;
1417 	pi = uq->uq_pi_blocked;
1418 	if (pi == NULL)
1419 		return;
1420 
1421 	for (;;) {
1422 		td = pi->pi_owner;
1423 		if (td == NULL || td == curthread)
1424 			return;
1425 
1426 		MPASS(td->td_proc != NULL);
1427 		MPASS(td->td_proc->p_magic == P_MAGIC);
1428 
1429 		thread_lock(td);
1430 		if (td->td_lend_user_pri > pri)
1431 			sched_lend_user_prio(td, pri);
1432 		else {
1433 			thread_unlock(td);
1434 			break;
1435 		}
1436 		thread_unlock(td);
1437 
1438 		/*
1439 		 * Pick up the lock that td is blocked on.
1440 		 */
1441 		uq = td->td_umtxq;
1442 		pi = uq->uq_pi_blocked;
1443 		if (pi == NULL)
1444 			break;
1445 		/* Resort td on the list if needed. */
1446 		umtx_pi_adjust_thread(pi, td);
1447 	}
1448 }
1449 
1450 /*
1451  * Unpropagate priority for a PI mutex when a thread blocked on
1452  * it is interrupted by signal or resumed by others.
1453  */
1454 static void
1455 umtx_repropagate_priority(struct umtx_pi *pi)
1456 {
1457 	struct umtx_q *uq, *uq_owner;
1458 	struct umtx_pi *pi2;
1459 	int pri;
1460 
1461 	mtx_assert(&umtx_lock, MA_OWNED);
1462 
1463 	while (pi != NULL && pi->pi_owner != NULL) {
1464 		pri = PRI_MAX;
1465 		uq_owner = pi->pi_owner->td_umtxq;
1466 
1467 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1468 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1469 			if (uq != NULL) {
1470 				if (pri > UPRI(uq->uq_thread))
1471 					pri = UPRI(uq->uq_thread);
1472 			}
1473 		}
1474 
1475 		if (pri > uq_owner->uq_inherited_pri)
1476 			pri = uq_owner->uq_inherited_pri;
1477 		thread_lock(pi->pi_owner);
1478 		sched_lend_user_prio(pi->pi_owner, pri);
1479 		thread_unlock(pi->pi_owner);
1480 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1481 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1482 	}
1483 }
1484 
1485 /*
1486  * Insert a PI mutex into owned list.
1487  */
1488 static void
1489 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1490 {
1491 	struct umtx_q *uq_owner;
1492 
1493 	uq_owner = owner->td_umtxq;
1494 	mtx_assert(&umtx_lock, MA_OWNED);
1495 	if (pi->pi_owner != NULL)
1496 		panic("pi_ower != NULL");
1497 	pi->pi_owner = owner;
1498 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1499 }
1500 
1501 /*
1502  * Claim ownership of a PI mutex.
1503  */
1504 static int
1505 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1506 {
1507 	struct umtx_q *uq, *uq_owner;
1508 
1509 	uq_owner = owner->td_umtxq;
1510 	mtx_lock_spin(&umtx_lock);
1511 	if (pi->pi_owner == owner) {
1512 		mtx_unlock_spin(&umtx_lock);
1513 		return (0);
1514 	}
1515 
1516 	if (pi->pi_owner != NULL) {
1517 		/*
1518 		 * userland may have already messed the mutex, sigh.
1519 		 */
1520 		mtx_unlock_spin(&umtx_lock);
1521 		return (EPERM);
1522 	}
1523 	umtx_pi_setowner(pi, owner);
1524 	uq = TAILQ_FIRST(&pi->pi_blocked);
1525 	if (uq != NULL) {
1526 		int pri;
1527 
1528 		pri = UPRI(uq->uq_thread);
1529 		thread_lock(owner);
1530 		if (pri < UPRI(owner))
1531 			sched_lend_user_prio(owner, pri);
1532 		thread_unlock(owner);
1533 	}
1534 	mtx_unlock_spin(&umtx_lock);
1535 	return (0);
1536 }
1537 
1538 /*
1539  * Adjust a thread's order position in its blocked PI mutex,
1540  * this may result new priority propagating process.
1541  */
1542 void
1543 umtx_pi_adjust(struct thread *td, u_char oldpri)
1544 {
1545 	struct umtx_q *uq;
1546 	struct umtx_pi *pi;
1547 
1548 	uq = td->td_umtxq;
1549 	mtx_lock_spin(&umtx_lock);
1550 	/*
1551 	 * Pick up the lock that td is blocked on.
1552 	 */
1553 	pi = uq->uq_pi_blocked;
1554 	if (pi != NULL) {
1555 		umtx_pi_adjust_thread(pi, td);
1556 		umtx_repropagate_priority(pi);
1557 	}
1558 	mtx_unlock_spin(&umtx_lock);
1559 }
1560 
1561 /*
1562  * Sleep on a PI mutex.
1563  */
1564 static int
1565 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1566 	uint32_t owner, const char *wmesg, int timo)
1567 {
1568 	struct umtxq_chain *uc;
1569 	struct thread *td, *td1;
1570 	struct umtx_q *uq1;
1571 	int pri;
1572 	int error = 0;
1573 
1574 	td = uq->uq_thread;
1575 	KASSERT(td == curthread, ("inconsistent uq_thread"));
1576 	uc = umtxq_getchain(&uq->uq_key);
1577 	UMTXQ_LOCKED_ASSERT(uc);
1578 	UMTXQ_BUSY_ASSERT(uc);
1579 	umtxq_insert(uq);
1580 	mtx_lock_spin(&umtx_lock);
1581 	if (pi->pi_owner == NULL) {
1582 		mtx_unlock_spin(&umtx_lock);
1583 		/* XXX Only look up thread in current process. */
1584 		td1 = tdfind(owner, curproc->p_pid);
1585 		mtx_lock_spin(&umtx_lock);
1586 		if (td1 != NULL) {
1587 			if (pi->pi_owner == NULL)
1588 				umtx_pi_setowner(pi, td1);
1589 			PROC_UNLOCK(td1->td_proc);
1590 		}
1591 	}
1592 
1593 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1594 		pri = UPRI(uq1->uq_thread);
1595 		if (pri > UPRI(td))
1596 			break;
1597 	}
1598 
1599 	if (uq1 != NULL)
1600 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1601 	else
1602 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1603 
1604 	uq->uq_pi_blocked = pi;
1605 	thread_lock(td);
1606 	td->td_flags |= TDF_UPIBLOCKED;
1607 	thread_unlock(td);
1608 	umtx_propagate_priority(td);
1609 	mtx_unlock_spin(&umtx_lock);
1610 	umtxq_unbusy(&uq->uq_key);
1611 
1612 	if (uq->uq_flags & UQF_UMTXQ) {
1613 		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
1614 		if (error == EWOULDBLOCK)
1615 			error = ETIMEDOUT;
1616 		if (uq->uq_flags & UQF_UMTXQ) {
1617 			umtxq_remove(uq);
1618 		}
1619 	}
1620 	mtx_lock_spin(&umtx_lock);
1621 	uq->uq_pi_blocked = NULL;
1622 	thread_lock(td);
1623 	td->td_flags &= ~TDF_UPIBLOCKED;
1624 	thread_unlock(td);
1625 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1626 	umtx_repropagate_priority(pi);
1627 	mtx_unlock_spin(&umtx_lock);
1628 	umtxq_unlock(&uq->uq_key);
1629 
1630 	return (error);
1631 }
1632 
1633 /*
1634  * Add reference count for a PI mutex.
1635  */
1636 static void
1637 umtx_pi_ref(struct umtx_pi *pi)
1638 {
1639 	struct umtxq_chain *uc;
1640 
1641 	uc = umtxq_getchain(&pi->pi_key);
1642 	UMTXQ_LOCKED_ASSERT(uc);
1643 	pi->pi_refcount++;
1644 }
1645 
1646 /*
1647  * Decrease reference count for a PI mutex, if the counter
1648  * is decreased to zero, its memory space is freed.
1649  */
1650 static void
1651 umtx_pi_unref(struct umtx_pi *pi)
1652 {
1653 	struct umtxq_chain *uc;
1654 
1655 	uc = umtxq_getchain(&pi->pi_key);
1656 	UMTXQ_LOCKED_ASSERT(uc);
1657 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1658 	if (--pi->pi_refcount == 0) {
1659 		mtx_lock_spin(&umtx_lock);
1660 		if (pi->pi_owner != NULL) {
1661 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1662 				pi, pi_link);
1663 			pi->pi_owner = NULL;
1664 		}
1665 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1666 			("blocked queue not empty"));
1667 		mtx_unlock_spin(&umtx_lock);
1668 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1669 		umtx_pi_free(pi);
1670 	}
1671 }
1672 
1673 /*
1674  * Find a PI mutex in hash table.
1675  */
1676 static struct umtx_pi *
1677 umtx_pi_lookup(struct umtx_key *key)
1678 {
1679 	struct umtxq_chain *uc;
1680 	struct umtx_pi *pi;
1681 
1682 	uc = umtxq_getchain(key);
1683 	UMTXQ_LOCKED_ASSERT(uc);
1684 
1685 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1686 		if (umtx_key_match(&pi->pi_key, key)) {
1687 			return (pi);
1688 		}
1689 	}
1690 	return (NULL);
1691 }
1692 
1693 /*
1694  * Insert a PI mutex into hash table.
1695  */
1696 static inline void
1697 umtx_pi_insert(struct umtx_pi *pi)
1698 {
1699 	struct umtxq_chain *uc;
1700 
1701 	uc = umtxq_getchain(&pi->pi_key);
1702 	UMTXQ_LOCKED_ASSERT(uc);
1703 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1704 }
1705 
1706 /*
1707  * Lock a PI mutex.
1708  */
1709 static int
1710 _do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1711 	int try)
1712 {
1713 	struct umtx_q *uq;
1714 	struct umtx_pi *pi, *new_pi;
1715 	uint32_t id, owner, old;
1716 	int error;
1717 
1718 	id = td->td_tid;
1719 	uq = td->td_umtxq;
1720 
1721 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1722 	    &uq->uq_key)) != 0)
1723 		return (error);
1724 	umtxq_lock(&uq->uq_key);
1725 	pi = umtx_pi_lookup(&uq->uq_key);
1726 	if (pi == NULL) {
1727 		new_pi = umtx_pi_alloc(M_NOWAIT);
1728 		if (new_pi == NULL) {
1729 			umtxq_unlock(&uq->uq_key);
1730 			new_pi = umtx_pi_alloc(M_WAITOK);
1731 			umtxq_lock(&uq->uq_key);
1732 			pi = umtx_pi_lookup(&uq->uq_key);
1733 			if (pi != NULL) {
1734 				umtx_pi_free(new_pi);
1735 				new_pi = NULL;
1736 			}
1737 		}
1738 		if (new_pi != NULL) {
1739 			new_pi->pi_key = uq->uq_key;
1740 			umtx_pi_insert(new_pi);
1741 			pi = new_pi;
1742 		}
1743 	}
1744 	umtx_pi_ref(pi);
1745 	umtxq_unlock(&uq->uq_key);
1746 
1747 	/*
1748 	 * Care must be exercised when dealing with umtx structure.  It
1749 	 * can fault on any access.
1750 	 */
1751 	for (;;) {
1752 		/*
1753 		 * Try the uncontested case.  This should be done in userland.
1754 		 */
1755 		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1756 
1757 		/* The acquire succeeded. */
1758 		if (owner == UMUTEX_UNOWNED) {
1759 			error = 0;
1760 			break;
1761 		}
1762 
1763 		/* The address was invalid. */
1764 		if (owner == -1) {
1765 			error = EFAULT;
1766 			break;
1767 		}
1768 
1769 		/* If no one owns it but it is contested try to acquire it. */
1770 		if (owner == UMUTEX_CONTESTED) {
1771 			owner = casuword32(&m->m_owner,
1772 			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1773 
1774 			if (owner == UMUTEX_CONTESTED) {
1775 				umtxq_lock(&uq->uq_key);
1776 				umtxq_busy(&uq->uq_key);
1777 				error = umtx_pi_claim(pi, td);
1778 				umtxq_unbusy(&uq->uq_key);
1779 				umtxq_unlock(&uq->uq_key);
1780 				break;
1781 			}
1782 
1783 			/* The address was invalid. */
1784 			if (owner == -1) {
1785 				error = EFAULT;
1786 				break;
1787 			}
1788 
1789 			/* If this failed the lock has changed, restart. */
1790 			continue;
1791 		}
1792 
1793 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1794 		    (owner & ~UMUTEX_CONTESTED) == id) {
1795 			error = EDEADLK;
1796 			break;
1797 		}
1798 
1799 		if (try != 0) {
1800 			error = EBUSY;
1801 			break;
1802 		}
1803 
1804 		/*
1805 		 * If we caught a signal, we have retried and now
1806 		 * exit immediately.
1807 		 */
1808 		if (error != 0)
1809 			break;
1810 
1811 		umtxq_lock(&uq->uq_key);
1812 		umtxq_busy(&uq->uq_key);
1813 		umtxq_unlock(&uq->uq_key);
1814 
1815 		/*
1816 		 * Set the contested bit so that a release in user space
1817 		 * knows to use the system call for unlock.  If this fails
1818 		 * either some one else has acquired the lock or it has been
1819 		 * released.
1820 		 */
1821 		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1822 
1823 		/* The address was invalid. */
1824 		if (old == -1) {
1825 			umtxq_lock(&uq->uq_key);
1826 			umtxq_unbusy(&uq->uq_key);
1827 			umtxq_unlock(&uq->uq_key);
1828 			error = EFAULT;
1829 			break;
1830 		}
1831 
1832 		umtxq_lock(&uq->uq_key);
1833 		/*
1834 		 * We set the contested bit, sleep. Otherwise the lock changed
1835 		 * and we need to retry or we lost a race to the thread
1836 		 * unlocking the umtx.
1837 		 */
1838 		if (old == owner)
1839 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1840 				 "umtxpi", timo);
1841 		else {
1842 			umtxq_unbusy(&uq->uq_key);
1843 			umtxq_unlock(&uq->uq_key);
1844 		}
1845 	}
1846 
1847 	umtxq_lock(&uq->uq_key);
1848 	umtx_pi_unref(pi);
1849 	umtxq_unlock(&uq->uq_key);
1850 
1851 	umtx_key_release(&uq->uq_key);
1852 	return (error);
1853 }
1854 
1855 /*
1856  * Unlock a PI mutex.
1857  */
1858 static int
1859 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1860 {
1861 	struct umtx_key key;
1862 	struct umtx_q *uq_first, *uq_first2, *uq_me;
1863 	struct umtx_pi *pi, *pi2;
1864 	uint32_t owner, old, id;
1865 	int error;
1866 	int count;
1867 	int pri;
1868 
1869 	id = td->td_tid;
1870 	/*
1871 	 * Make sure we own this mtx.
1872 	 */
1873 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1874 	if (owner == -1)
1875 		return (EFAULT);
1876 
1877 	if ((owner & ~UMUTEX_CONTESTED) != id)
1878 		return (EPERM);
1879 
1880 	/* This should be done in userland */
1881 	if ((owner & UMUTEX_CONTESTED) == 0) {
1882 		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1883 		if (old == -1)
1884 			return (EFAULT);
1885 		if (old == owner)
1886 			return (0);
1887 		owner = old;
1888 	}
1889 
1890 	/* We should only ever be in here for contested locks */
1891 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1892 	    &key)) != 0)
1893 		return (error);
1894 
1895 	umtxq_lock(&key);
1896 	umtxq_busy(&key);
1897 	count = umtxq_count_pi(&key, &uq_first);
1898 	if (uq_first != NULL) {
1899 		mtx_lock_spin(&umtx_lock);
1900 		pi = uq_first->uq_pi_blocked;
1901 		KASSERT(pi != NULL, ("pi == NULL?"));
1902 		if (pi->pi_owner != curthread) {
1903 			mtx_unlock_spin(&umtx_lock);
1904 			umtxq_unbusy(&key);
1905 			umtxq_unlock(&key);
1906 			umtx_key_release(&key);
1907 			/* userland messed the mutex */
1908 			return (EPERM);
1909 		}
1910 		uq_me = curthread->td_umtxq;
1911 		pi->pi_owner = NULL;
1912 		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1913 		/* get highest priority thread which is still sleeping. */
1914 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1915 		while (uq_first != NULL &&
1916 		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1917 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1918 		}
1919 		pri = PRI_MAX;
1920 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1921 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1922 			if (uq_first2 != NULL) {
1923 				if (pri > UPRI(uq_first2->uq_thread))
1924 					pri = UPRI(uq_first2->uq_thread);
1925 			}
1926 		}
1927 		thread_lock(curthread);
1928 		sched_lend_user_prio(curthread, pri);
1929 		thread_unlock(curthread);
1930 		mtx_unlock_spin(&umtx_lock);
1931 		if (uq_first)
1932 			umtxq_signal_thread(uq_first);
1933 	}
1934 	umtxq_unlock(&key);
1935 
1936 	/*
1937 	 * When unlocking the umtx, it must be marked as unowned if
1938 	 * there is zero or one thread only waiting for it.
1939 	 * Otherwise, it must be marked as contested.
1940 	 */
1941 	old = casuword32(&m->m_owner, owner,
1942 		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1943 
1944 	umtxq_lock(&key);
1945 	umtxq_unbusy(&key);
1946 	umtxq_unlock(&key);
1947 	umtx_key_release(&key);
1948 	if (old == -1)
1949 		return (EFAULT);
1950 	if (old != owner)
1951 		return (EINVAL);
1952 	return (0);
1953 }
1954 
1955 /*
1956  * Lock a PP mutex.
1957  */
1958 static int
1959 _do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1960 	int try)
1961 {
1962 	struct umtx_q *uq, *uq2;
1963 	struct umtx_pi *pi;
1964 	uint32_t ceiling;
1965 	uint32_t owner, id;
1966 	int error, pri, old_inherited_pri, su;
1967 
1968 	id = td->td_tid;
1969 	uq = td->td_umtxq;
1970 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1971 	    &uq->uq_key)) != 0)
1972 		return (error);
1973 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1974 	for (;;) {
1975 		old_inherited_pri = uq->uq_inherited_pri;
1976 		umtxq_lock(&uq->uq_key);
1977 		umtxq_busy(&uq->uq_key);
1978 		umtxq_unlock(&uq->uq_key);
1979 
1980 		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1981 		if (ceiling > RTP_PRIO_MAX) {
1982 			error = EINVAL;
1983 			goto out;
1984 		}
1985 
1986 		mtx_lock_spin(&umtx_lock);
1987 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1988 			mtx_unlock_spin(&umtx_lock);
1989 			error = EINVAL;
1990 			goto out;
1991 		}
1992 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1993 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1994 			thread_lock(td);
1995 			if (uq->uq_inherited_pri < UPRI(td))
1996 				sched_lend_user_prio(td, uq->uq_inherited_pri);
1997 			thread_unlock(td);
1998 		}
1999 		mtx_unlock_spin(&umtx_lock);
2000 
2001 		owner = casuword32(&m->m_owner,
2002 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2003 
2004 		if (owner == UMUTEX_CONTESTED) {
2005 			error = 0;
2006 			break;
2007 		}
2008 
2009 		/* The address was invalid. */
2010 		if (owner == -1) {
2011 			error = EFAULT;
2012 			break;
2013 		}
2014 
2015 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2016 		    (owner & ~UMUTEX_CONTESTED) == id) {
2017 			error = EDEADLK;
2018 			break;
2019 		}
2020 
2021 		if (try != 0) {
2022 			error = EBUSY;
2023 			break;
2024 		}
2025 
2026 		/*
2027 		 * If we caught a signal, we have retried and now
2028 		 * exit immediately.
2029 		 */
2030 		if (error != 0)
2031 			break;
2032 
2033 		umtxq_lock(&uq->uq_key);
2034 		umtxq_insert(uq);
2035 		umtxq_unbusy(&uq->uq_key);
2036 		error = umtxq_sleep(uq, "umtxpp", timo);
2037 		umtxq_remove(uq);
2038 		umtxq_unlock(&uq->uq_key);
2039 
2040 		mtx_lock_spin(&umtx_lock);
2041 		uq->uq_inherited_pri = old_inherited_pri;
2042 		pri = PRI_MAX;
2043 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2044 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2045 			if (uq2 != NULL) {
2046 				if (pri > UPRI(uq2->uq_thread))
2047 					pri = UPRI(uq2->uq_thread);
2048 			}
2049 		}
2050 		if (pri > uq->uq_inherited_pri)
2051 			pri = uq->uq_inherited_pri;
2052 		thread_lock(td);
2053 		sched_lend_user_prio(td, pri);
2054 		thread_unlock(td);
2055 		mtx_unlock_spin(&umtx_lock);
2056 	}
2057 
2058 	if (error != 0) {
2059 		mtx_lock_spin(&umtx_lock);
2060 		uq->uq_inherited_pri = old_inherited_pri;
2061 		pri = PRI_MAX;
2062 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2063 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2064 			if (uq2 != NULL) {
2065 				if (pri > UPRI(uq2->uq_thread))
2066 					pri = UPRI(uq2->uq_thread);
2067 			}
2068 		}
2069 		if (pri > uq->uq_inherited_pri)
2070 			pri = uq->uq_inherited_pri;
2071 		thread_lock(td);
2072 		sched_lend_user_prio(td, pri);
2073 		thread_unlock(td);
2074 		mtx_unlock_spin(&umtx_lock);
2075 	}
2076 
2077 out:
2078 	umtxq_lock(&uq->uq_key);
2079 	umtxq_unbusy(&uq->uq_key);
2080 	umtxq_unlock(&uq->uq_key);
2081 	umtx_key_release(&uq->uq_key);
2082 	return (error);
2083 }
2084 
2085 /*
2086  * Unlock a PP mutex.
2087  */
2088 static int
2089 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2090 {
2091 	struct umtx_key key;
2092 	struct umtx_q *uq, *uq2;
2093 	struct umtx_pi *pi;
2094 	uint32_t owner, id;
2095 	uint32_t rceiling;
2096 	int error, pri, new_inherited_pri, su;
2097 
2098 	id = td->td_tid;
2099 	uq = td->td_umtxq;
2100 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2101 
2102 	/*
2103 	 * Make sure we own this mtx.
2104 	 */
2105 	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2106 	if (owner == -1)
2107 		return (EFAULT);
2108 
2109 	if ((owner & ~UMUTEX_CONTESTED) != id)
2110 		return (EPERM);
2111 
2112 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2113 	if (error != 0)
2114 		return (error);
2115 
2116 	if (rceiling == -1)
2117 		new_inherited_pri = PRI_MAX;
2118 	else {
2119 		rceiling = RTP_PRIO_MAX - rceiling;
2120 		if (rceiling > RTP_PRIO_MAX)
2121 			return (EINVAL);
2122 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2123 	}
2124 
2125 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2126 	    &key)) != 0)
2127 		return (error);
2128 	umtxq_lock(&key);
2129 	umtxq_busy(&key);
2130 	umtxq_unlock(&key);
2131 	/*
2132 	 * For priority protected mutex, always set unlocked state
2133 	 * to UMUTEX_CONTESTED, so that userland always enters kernel
2134 	 * to lock the mutex, it is necessary because thread priority
2135 	 * has to be adjusted for such mutex.
2136 	 */
2137 	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2138 		UMUTEX_CONTESTED);
2139 
2140 	umtxq_lock(&key);
2141 	if (error == 0)
2142 		umtxq_signal(&key, 1);
2143 	umtxq_unbusy(&key);
2144 	umtxq_unlock(&key);
2145 
2146 	if (error == -1)
2147 		error = EFAULT;
2148 	else {
2149 		mtx_lock_spin(&umtx_lock);
2150 		if (su != 0)
2151 			uq->uq_inherited_pri = new_inherited_pri;
2152 		pri = PRI_MAX;
2153 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2154 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2155 			if (uq2 != NULL) {
2156 				if (pri > UPRI(uq2->uq_thread))
2157 					pri = UPRI(uq2->uq_thread);
2158 			}
2159 		}
2160 		if (pri > uq->uq_inherited_pri)
2161 			pri = uq->uq_inherited_pri;
2162 		thread_lock(td);
2163 		sched_lend_user_prio(td, pri);
2164 		thread_unlock(td);
2165 		mtx_unlock_spin(&umtx_lock);
2166 	}
2167 	umtx_key_release(&key);
2168 	return (error);
2169 }
2170 
2171 static int
2172 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2173 	uint32_t *old_ceiling)
2174 {
2175 	struct umtx_q *uq;
2176 	uint32_t save_ceiling;
2177 	uint32_t owner, id;
2178 	uint32_t flags;
2179 	int error;
2180 
2181 	flags = fuword32(&m->m_flags);
2182 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2183 		return (EINVAL);
2184 	if (ceiling > RTP_PRIO_MAX)
2185 		return (EINVAL);
2186 	id = td->td_tid;
2187 	uq = td->td_umtxq;
2188 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2189 	   &uq->uq_key)) != 0)
2190 		return (error);
2191 	for (;;) {
2192 		umtxq_lock(&uq->uq_key);
2193 		umtxq_busy(&uq->uq_key);
2194 		umtxq_unlock(&uq->uq_key);
2195 
2196 		save_ceiling = fuword32(&m->m_ceilings[0]);
2197 
2198 		owner = casuword32(&m->m_owner,
2199 		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2200 
2201 		if (owner == UMUTEX_CONTESTED) {
2202 			suword32(&m->m_ceilings[0], ceiling);
2203 			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2204 				UMUTEX_CONTESTED);
2205 			error = 0;
2206 			break;
2207 		}
2208 
2209 		/* The address was invalid. */
2210 		if (owner == -1) {
2211 			error = EFAULT;
2212 			break;
2213 		}
2214 
2215 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2216 			suword32(&m->m_ceilings[0], ceiling);
2217 			error = 0;
2218 			break;
2219 		}
2220 
2221 		/*
2222 		 * If we caught a signal, we have retried and now
2223 		 * exit immediately.
2224 		 */
2225 		if (error != 0)
2226 			break;
2227 
2228 		/*
2229 		 * We set the contested bit, sleep. Otherwise the lock changed
2230 		 * and we need to retry or we lost a race to the thread
2231 		 * unlocking the umtx.
2232 		 */
2233 		umtxq_lock(&uq->uq_key);
2234 		umtxq_insert(uq);
2235 		umtxq_unbusy(&uq->uq_key);
2236 		error = umtxq_sleep(uq, "umtxpp", 0);
2237 		umtxq_remove(uq);
2238 		umtxq_unlock(&uq->uq_key);
2239 	}
2240 	umtxq_lock(&uq->uq_key);
2241 	if (error == 0)
2242 		umtxq_signal(&uq->uq_key, INT_MAX);
2243 	umtxq_unbusy(&uq->uq_key);
2244 	umtxq_unlock(&uq->uq_key);
2245 	umtx_key_release(&uq->uq_key);
2246 	if (error == 0 && old_ceiling != NULL)
2247 		suword32(old_ceiling, save_ceiling);
2248 	return (error);
2249 }
2250 
2251 static int
2252 _do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2253 	int mode)
2254 {
2255 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2256 	case 0:
2257 		return (_do_lock_normal(td, m, flags, timo, mode));
2258 	case UMUTEX_PRIO_INHERIT:
2259 		return (_do_lock_pi(td, m, flags, timo, mode));
2260 	case UMUTEX_PRIO_PROTECT:
2261 		return (_do_lock_pp(td, m, flags, timo, mode));
2262 	}
2263 	return (EINVAL);
2264 }
2265 
2266 /*
2267  * Lock a userland POSIX mutex.
2268  */
2269 static int
2270 do_lock_umutex(struct thread *td, struct umutex *m,
2271 	struct _umtx_time *timeout, int mode)
2272 {
2273 	struct timespec cts, ets, tts;
2274 	uint32_t flags;
2275 	int error;
2276 
2277 	flags = fuword32(&m->m_flags);
2278 	if (flags == -1)
2279 		return (EFAULT);
2280 
2281 	if (timeout == NULL) {
2282 		error = _do_lock_umutex(td, m, flags, 0, mode);
2283 		/* Mutex locking is restarted if it is interrupted. */
2284 		if (error == EINTR && mode != _UMUTEX_WAIT)
2285 			error = ERESTART;
2286 	} else {
2287 		kern_clock_gettime(td, timeout->_clockid, &cts);
2288 		if ((timeout->_flags & UMTX_ABSTIME) == 0) {
2289 			ets = cts;
2290 			timespecadd(&ets, &timeout->_timeout);
2291 			tts = timeout->_timeout;
2292 		} else {
2293 			ets = timeout->_timeout;
2294 			tts = timeout->_timeout;
2295 			timespecsub(&tts, &cts);
2296 		}
2297 		for (;;) {
2298 			error = _do_lock_umutex(td, m, flags, tstohz(&tts), mode);
2299 			if (error != ETIMEDOUT)
2300 				break;
2301 			kern_clock_gettime(td, timeout->_clockid, &cts);
2302 			if (timespeccmp(&cts, &ets, >=))
2303 				break;
2304 			tts = ets;
2305 			timespecsub(&tts, &cts);
2306 		}
2307 		/* Timed-locking is not restarted. */
2308 		if (error == ERESTART)
2309 			error = EINTR;
2310 	}
2311 	return (error);
2312 }
2313 
2314 /*
2315  * Unlock a userland POSIX mutex.
2316  */
2317 static int
2318 do_unlock_umutex(struct thread *td, struct umutex *m)
2319 {
2320 	uint32_t flags;
2321 
2322 	flags = fuword32(&m->m_flags);
2323 	if (flags == -1)
2324 		return (EFAULT);
2325 
2326 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2327 	case 0:
2328 		return (do_unlock_normal(td, m, flags));
2329 	case UMUTEX_PRIO_INHERIT:
2330 		return (do_unlock_pi(td, m, flags));
2331 	case UMUTEX_PRIO_PROTECT:
2332 		return (do_unlock_pp(td, m, flags));
2333 	}
2334 
2335 	return (EINVAL);
2336 }
2337 
2338 static int
2339 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2340 	struct timespec *timeout, u_long wflags)
2341 {
2342 	struct umtx_q *uq;
2343 	struct timespec cts, ets, tts;
2344 	uint32_t flags;
2345 	uint32_t clockid;
2346 	int error;
2347 
2348 	uq = td->td_umtxq;
2349 	flags = fuword32(&cv->c_flags);
2350 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2351 	if (error != 0)
2352 		return (error);
2353 
2354 	if ((wflags & CVWAIT_CLOCKID) != 0) {
2355 		clockid = fuword32(&cv->c_clockid);
2356 		if (clockid < CLOCK_REALTIME ||
2357 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2358 			/* hmm, only HW clock id will work. */
2359 			return (EINVAL);
2360 		}
2361 	} else {
2362 		clockid = CLOCK_REALTIME;
2363 	}
2364 
2365 	umtxq_lock(&uq->uq_key);
2366 	umtxq_busy(&uq->uq_key);
2367 	umtxq_insert(uq);
2368 	umtxq_unlock(&uq->uq_key);
2369 
2370 	/*
2371 	 * Set c_has_waiters to 1 before releasing user mutex, also
2372 	 * don't modify cache line when unnecessary.
2373 	 */
2374 	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2375 		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2376 
2377 	umtxq_lock(&uq->uq_key);
2378 	umtxq_unbusy(&uq->uq_key);
2379 	umtxq_unlock(&uq->uq_key);
2380 
2381 	error = do_unlock_umutex(td, m);
2382 
2383 	umtxq_lock(&uq->uq_key);
2384 	if (error == 0) {
2385 		if (timeout == NULL) {
2386 			error = umtxq_sleep(uq, "ucond", 0);
2387 		} else {
2388 			if ((wflags & CVWAIT_ABSTIME) == 0) {
2389 				kern_clock_gettime(td, clockid, &ets);
2390 				timespecadd(&ets, timeout);
2391 				tts = *timeout;
2392 			} else { /* absolute time */
2393 				ets = *timeout;
2394 				tts = *timeout;
2395 				kern_clock_gettime(td, clockid, &cts);
2396 				timespecsub(&tts, &cts);
2397 			}
2398 			for (;;) {
2399 				error = umtxq_sleep(uq, "ucond", tstohz(&tts));
2400 				if (error != ETIMEDOUT)
2401 					break;
2402 				kern_clock_gettime(td, clockid, &cts);
2403 				if (timespeccmp(&cts, &ets, >=)) {
2404 					error = ETIMEDOUT;
2405 					break;
2406 				}
2407 				tts = ets;
2408 				timespecsub(&tts, &cts);
2409 			}
2410 		}
2411 	}
2412 
2413 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2414 		error = 0;
2415 	else {
2416 		/*
2417 		 * This must be timeout,interrupted by signal or
2418 		 * surprious wakeup, clear c_has_waiter flag when
2419 		 * necessary.
2420 		 */
2421 		umtxq_busy(&uq->uq_key);
2422 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2423 			int oldlen = uq->uq_cur_queue->length;
2424 			umtxq_remove(uq);
2425 			if (oldlen == 1) {
2426 				umtxq_unlock(&uq->uq_key);
2427 				suword32(
2428 				    __DEVOLATILE(uint32_t *,
2429 					 &cv->c_has_waiters), 0);
2430 				umtxq_lock(&uq->uq_key);
2431 			}
2432 		}
2433 		umtxq_unbusy(&uq->uq_key);
2434 		if (error == ERESTART)
2435 			error = EINTR;
2436 	}
2437 
2438 	umtxq_unlock(&uq->uq_key);
2439 	umtx_key_release(&uq->uq_key);
2440 	return (error);
2441 }
2442 
2443 /*
2444  * Signal a userland condition variable.
2445  */
2446 static int
2447 do_cv_signal(struct thread *td, struct ucond *cv)
2448 {
2449 	struct umtx_key key;
2450 	int error, cnt, nwake;
2451 	uint32_t flags;
2452 
2453 	flags = fuword32(&cv->c_flags);
2454 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2455 		return (error);
2456 	umtxq_lock(&key);
2457 	umtxq_busy(&key);
2458 	cnt = umtxq_count(&key);
2459 	nwake = umtxq_signal(&key, 1);
2460 	if (cnt <= nwake) {
2461 		umtxq_unlock(&key);
2462 		error = suword32(
2463 		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2464 		umtxq_lock(&key);
2465 	}
2466 	umtxq_unbusy(&key);
2467 	umtxq_unlock(&key);
2468 	umtx_key_release(&key);
2469 	return (error);
2470 }
2471 
2472 static int
2473 do_cv_broadcast(struct thread *td, struct ucond *cv)
2474 {
2475 	struct umtx_key key;
2476 	int error;
2477 	uint32_t flags;
2478 
2479 	flags = fuword32(&cv->c_flags);
2480 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2481 		return (error);
2482 
2483 	umtxq_lock(&key);
2484 	umtxq_busy(&key);
2485 	umtxq_signal(&key, INT_MAX);
2486 	umtxq_unlock(&key);
2487 
2488 	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2489 
2490 	umtxq_lock(&key);
2491 	umtxq_unbusy(&key);
2492 	umtxq_unlock(&key);
2493 
2494 	umtx_key_release(&key);
2495 	return (error);
2496 }
2497 
2498 static int
2499 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2500 {
2501 	struct umtx_q *uq;
2502 	uint32_t flags, wrflags;
2503 	int32_t state, oldstate;
2504 	int32_t blocked_readers;
2505 	int error;
2506 
2507 	uq = td->td_umtxq;
2508 	flags = fuword32(&rwlock->rw_flags);
2509 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2510 	if (error != 0)
2511 		return (error);
2512 
2513 	wrflags = URWLOCK_WRITE_OWNER;
2514 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2515 		wrflags |= URWLOCK_WRITE_WAITERS;
2516 
2517 	for (;;) {
2518 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2519 		/* try to lock it */
2520 		while (!(state & wrflags)) {
2521 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2522 				umtx_key_release(&uq->uq_key);
2523 				return (EAGAIN);
2524 			}
2525 			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2526 			if (oldstate == state) {
2527 				umtx_key_release(&uq->uq_key);
2528 				return (0);
2529 			}
2530 			state = oldstate;
2531 		}
2532 
2533 		if (error)
2534 			break;
2535 
2536 		/* grab monitor lock */
2537 		umtxq_lock(&uq->uq_key);
2538 		umtxq_busy(&uq->uq_key);
2539 		umtxq_unlock(&uq->uq_key);
2540 
2541 		/*
2542 		 * re-read the state, in case it changed between the try-lock above
2543 		 * and the check below
2544 		 */
2545 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2546 
2547 		/* set read contention bit */
2548 		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2549 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2550 			if (oldstate == state)
2551 				goto sleep;
2552 			state = oldstate;
2553 		}
2554 
2555 		/* state is changed while setting flags, restart */
2556 		if (!(state & wrflags)) {
2557 			umtxq_lock(&uq->uq_key);
2558 			umtxq_unbusy(&uq->uq_key);
2559 			umtxq_unlock(&uq->uq_key);
2560 			continue;
2561 		}
2562 
2563 sleep:
2564 		/* contention bit is set, before sleeping, increase read waiter count */
2565 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2566 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2567 
2568 		while (state & wrflags) {
2569 			umtxq_lock(&uq->uq_key);
2570 			umtxq_insert(uq);
2571 			umtxq_unbusy(&uq->uq_key);
2572 
2573 			error = umtxq_sleep(uq, "urdlck", timo);
2574 
2575 			umtxq_busy(&uq->uq_key);
2576 			umtxq_remove(uq);
2577 			umtxq_unlock(&uq->uq_key);
2578 			if (error)
2579 				break;
2580 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2581 		}
2582 
2583 		/* decrease read waiter count, and may clear read contention bit */
2584 		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2585 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2586 		if (blocked_readers == 1) {
2587 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2588 			for (;;) {
2589 				oldstate = casuword32(&rwlock->rw_state, state,
2590 					 state & ~URWLOCK_READ_WAITERS);
2591 				if (oldstate == state)
2592 					break;
2593 				state = oldstate;
2594 			}
2595 		}
2596 
2597 		umtxq_lock(&uq->uq_key);
2598 		umtxq_unbusy(&uq->uq_key);
2599 		umtxq_unlock(&uq->uq_key);
2600 	}
2601 	umtx_key_release(&uq->uq_key);
2602 	return (error);
2603 }
2604 
2605 static int
2606 do_rw_rdlock2(struct thread *td, void *obj, long val, struct _umtx_time *timeout)
2607 {
2608 	struct timespec cts, ets, tts;
2609 	int error;
2610 
2611 	kern_clock_gettime(td, timeout->_clockid, &cts);
2612 	if ((timeout->_flags & UMTX_ABSTIME) == 0) {
2613 		ets = cts;
2614 		timespecadd(&ets, &timeout->_timeout);
2615 		tts = timeout->_timeout;
2616 	} else {
2617 		ets = timeout->_timeout;
2618 		tts = timeout->_timeout;
2619 		timespecsub(&tts, &cts);
2620 	}
2621 	for (;;) {
2622 		error = do_rw_rdlock(td, obj, val, tstohz(&tts));
2623 		if (error != ETIMEDOUT)
2624 			break;
2625 		kern_clock_gettime(td, timeout->_clockid, &cts);
2626 		if (timespeccmp(&cts, &ets, >=))
2627 			break;
2628 		tts = ets;
2629 		timespecsub(&tts, &cts);
2630 	}
2631 	if (error == ERESTART)
2632 		error = EINTR;
2633 	return (error);
2634 }
2635 
2636 static int
2637 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2638 {
2639 	struct umtx_q *uq;
2640 	uint32_t flags;
2641 	int32_t state, oldstate;
2642 	int32_t blocked_writers;
2643 	int32_t blocked_readers;
2644 	int error;
2645 
2646 	uq = td->td_umtxq;
2647 	flags = fuword32(&rwlock->rw_flags);
2648 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2649 	if (error != 0)
2650 		return (error);
2651 
2652 	blocked_readers = 0;
2653 	for (;;) {
2654 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2655 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2656 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2657 			if (oldstate == state) {
2658 				umtx_key_release(&uq->uq_key);
2659 				return (0);
2660 			}
2661 			state = oldstate;
2662 		}
2663 
2664 		if (error) {
2665 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2666 			    blocked_readers != 0) {
2667 				umtxq_lock(&uq->uq_key);
2668 				umtxq_busy(&uq->uq_key);
2669 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2670 				umtxq_unbusy(&uq->uq_key);
2671 				umtxq_unlock(&uq->uq_key);
2672 			}
2673 
2674 			break;
2675 		}
2676 
2677 		/* grab monitor lock */
2678 		umtxq_lock(&uq->uq_key);
2679 		umtxq_busy(&uq->uq_key);
2680 		umtxq_unlock(&uq->uq_key);
2681 
2682 		/*
2683 		 * re-read the state, in case it changed between the try-lock above
2684 		 * and the check below
2685 		 */
2686 		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2687 
2688 		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2689 		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2690 			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2691 			if (oldstate == state)
2692 				goto sleep;
2693 			state = oldstate;
2694 		}
2695 
2696 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2697 			umtxq_lock(&uq->uq_key);
2698 			umtxq_unbusy(&uq->uq_key);
2699 			umtxq_unlock(&uq->uq_key);
2700 			continue;
2701 		}
2702 sleep:
2703 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2704 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2705 
2706 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2707 			umtxq_lock(&uq->uq_key);
2708 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2709 			umtxq_unbusy(&uq->uq_key);
2710 
2711 			error = umtxq_sleep(uq, "uwrlck", timo);
2712 
2713 			umtxq_busy(&uq->uq_key);
2714 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2715 			umtxq_unlock(&uq->uq_key);
2716 			if (error)
2717 				break;
2718 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2719 		}
2720 
2721 		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2722 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2723 		if (blocked_writers == 1) {
2724 			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2725 			for (;;) {
2726 				oldstate = casuword32(&rwlock->rw_state, state,
2727 					 state & ~URWLOCK_WRITE_WAITERS);
2728 				if (oldstate == state)
2729 					break;
2730 				state = oldstate;
2731 			}
2732 			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2733 		} else
2734 			blocked_readers = 0;
2735 
2736 		umtxq_lock(&uq->uq_key);
2737 		umtxq_unbusy(&uq->uq_key);
2738 		umtxq_unlock(&uq->uq_key);
2739 	}
2740 
2741 	umtx_key_release(&uq->uq_key);
2742 	return (error);
2743 }
2744 
2745 static int
2746 do_rw_wrlock2(struct thread *td, void *obj, struct _umtx_time *timeout)
2747 {
2748 	struct timespec cts, ets, tts;
2749 	int error;
2750 
2751 	kern_clock_gettime(td, timeout->_clockid, &cts);
2752 	if ((timeout->_flags & UMTX_ABSTIME) == 0) {
2753 		ets = cts;
2754 		timespecadd(&ets, &timeout->_timeout);
2755 		tts = timeout->_timeout;
2756 	} else {
2757 		ets = timeout->_timeout;
2758 		tts = timeout->_timeout;
2759 		timespecsub(&tts, &cts);
2760 	}
2761 	for (;;) {
2762 		error = do_rw_wrlock(td, obj, tstohz(&tts));
2763 		if (error != ETIMEDOUT)
2764 			break;
2765 		kern_clock_gettime(td, timeout->_clockid, &cts);
2766 		if (timespeccmp(&cts, &ets, >=))
2767 			break;
2768 		tts = ets;
2769 		timespecsub(&tts, &cts);
2770 	}
2771 	if (error == ERESTART)
2772 		error = EINTR;
2773 	return (error);
2774 }
2775 
2776 static int
2777 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2778 {
2779 	struct umtx_q *uq;
2780 	uint32_t flags;
2781 	int32_t state, oldstate;
2782 	int error, q, count;
2783 
2784 	uq = td->td_umtxq;
2785 	flags = fuword32(&rwlock->rw_flags);
2786 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2787 	if (error != 0)
2788 		return (error);
2789 
2790 	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2791 	if (state & URWLOCK_WRITE_OWNER) {
2792 		for (;;) {
2793 			oldstate = casuword32(&rwlock->rw_state, state,
2794 				state & ~URWLOCK_WRITE_OWNER);
2795 			if (oldstate != state) {
2796 				state = oldstate;
2797 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2798 					error = EPERM;
2799 					goto out;
2800 				}
2801 			} else
2802 				break;
2803 		}
2804 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2805 		for (;;) {
2806 			oldstate = casuword32(&rwlock->rw_state, state,
2807 				state - 1);
2808 			if (oldstate != state) {
2809 				state = oldstate;
2810 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2811 					error = EPERM;
2812 					goto out;
2813 				}
2814 			}
2815 			else
2816 				break;
2817 		}
2818 	} else {
2819 		error = EPERM;
2820 		goto out;
2821 	}
2822 
2823 	count = 0;
2824 
2825 	if (!(flags & URWLOCK_PREFER_READER)) {
2826 		if (state & URWLOCK_WRITE_WAITERS) {
2827 			count = 1;
2828 			q = UMTX_EXCLUSIVE_QUEUE;
2829 		} else if (state & URWLOCK_READ_WAITERS) {
2830 			count = INT_MAX;
2831 			q = UMTX_SHARED_QUEUE;
2832 		}
2833 	} else {
2834 		if (state & URWLOCK_READ_WAITERS) {
2835 			count = INT_MAX;
2836 			q = UMTX_SHARED_QUEUE;
2837 		} else if (state & URWLOCK_WRITE_WAITERS) {
2838 			count = 1;
2839 			q = UMTX_EXCLUSIVE_QUEUE;
2840 		}
2841 	}
2842 
2843 	if (count) {
2844 		umtxq_lock(&uq->uq_key);
2845 		umtxq_busy(&uq->uq_key);
2846 		umtxq_signal_queue(&uq->uq_key, count, q);
2847 		umtxq_unbusy(&uq->uq_key);
2848 		umtxq_unlock(&uq->uq_key);
2849 	}
2850 out:
2851 	umtx_key_release(&uq->uq_key);
2852 	return (error);
2853 }
2854 
2855 static int
2856 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2857 {
2858 	struct umtx_q *uq;
2859 	struct timespec cts, ets, tts;
2860 	uint32_t flags, count;
2861 	int error;
2862 
2863 	uq = td->td_umtxq;
2864 	flags = fuword32(&sem->_flags);
2865 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2866 	if (error != 0)
2867 		return (error);
2868 	umtxq_lock(&uq->uq_key);
2869 	umtxq_busy(&uq->uq_key);
2870 	umtxq_insert(uq);
2871 	umtxq_unlock(&uq->uq_key);
2872 
2873 	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2874 	rmb();
2875 	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2876 	if (count != 0) {
2877 		umtxq_lock(&uq->uq_key);
2878 		umtxq_unbusy(&uq->uq_key);
2879 		umtxq_remove(uq);
2880 		umtxq_unlock(&uq->uq_key);
2881 		umtx_key_release(&uq->uq_key);
2882 		return (0);
2883 	}
2884 
2885 	umtxq_lock(&uq->uq_key);
2886 	umtxq_unbusy(&uq->uq_key);
2887 
2888 	if (timeout == NULL) {
2889 		error = umtxq_sleep(uq, "usem", 0);
2890 	} else {
2891 		umtxq_unlock(&uq->uq_key);
2892 		kern_clock_gettime(td, timeout->_clockid, &cts);
2893 		if ((timeout->_flags & UMTX_ABSTIME) == 0) {
2894 			ets = cts;
2895 			timespecadd(&ets, &timeout->_timeout);
2896 		} else {
2897 			ets = timeout->_timeout;
2898 		}
2899 		umtxq_lock(&uq->uq_key);
2900 		for (;;) {
2901 			if (timespeccmp(&cts, &ets, >=)) {
2902 				error = ETIMEDOUT;
2903 				break;
2904 			}
2905 			tts = ets;
2906 			timespecsub(&tts, &cts);
2907 			error = umtxq_sleep(uq, "usem", tstohz(&tts));
2908 			if (error != ETIMEDOUT)
2909 				break;
2910 			umtxq_unlock(&uq->uq_key);
2911 			kern_clock_gettime(td, timeout->_clockid, &cts);
2912 			umtxq_lock(&uq->uq_key);
2913 		}
2914 	}
2915 
2916 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2917 		error = 0;
2918 	else {
2919 		umtxq_remove(uq);
2920 		if (error == ERESTART)
2921 			error = EINTR;
2922 	}
2923 	umtxq_unlock(&uq->uq_key);
2924 	umtx_key_release(&uq->uq_key);
2925 	return (error);
2926 }
2927 
2928 /*
2929  * Signal a userland condition variable.
2930  */
2931 static int
2932 do_sem_wake(struct thread *td, struct _usem *sem)
2933 {
2934 	struct umtx_key key;
2935 	int error, cnt, nwake;
2936 	uint32_t flags;
2937 
2938 	flags = fuword32(&sem->_flags);
2939 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2940 		return (error);
2941 	umtxq_lock(&key);
2942 	umtxq_busy(&key);
2943 	cnt = umtxq_count(&key);
2944 	nwake = umtxq_signal(&key, 1);
2945 	if (cnt <= nwake) {
2946 		umtxq_unlock(&key);
2947 		error = suword32(
2948 		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2949 		umtxq_lock(&key);
2950 	}
2951 	umtxq_unbusy(&key);
2952 	umtxq_unlock(&key);
2953 	umtx_key_release(&key);
2954 	return (error);
2955 }
2956 
2957 int
2958 sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2959     /* struct umtx *umtx */
2960 {
2961 	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2962 }
2963 
2964 int
2965 sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2966     /* struct umtx *umtx */
2967 {
2968 	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2969 }
2970 
2971 inline int
2972 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
2973 {
2974 	int error;
2975 
2976 	error = copyin(addr, tsp, sizeof(struct timespec));
2977 	if (error == 0) {
2978 		if (tsp->tv_sec < 0 ||
2979 		    tsp->tv_nsec >= 1000000000 ||
2980 		    tsp->tv_nsec < 0)
2981 			error = EINVAL;
2982 	}
2983 	return (error);
2984 }
2985 
2986 static inline int
2987 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
2988 {
2989 	int error;
2990 
2991 	if (size <= sizeof(struct timespec)) {
2992 		tp->_clockid = CLOCK_REALTIME;
2993 		tp->_flags = 0;
2994 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
2995 	} else
2996 		error = copyin(addr, tp, sizeof(struct _umtx_time));
2997 	if (error != 0)
2998 		return (error);
2999 	if (tp->_timeout.tv_sec < 0 ||
3000 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3001 		return (EINVAL);
3002 	return (0);
3003 }
3004 
3005 static int
3006 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
3007 {
3008 	struct timespec *ts, timeout;
3009 	int error;
3010 
3011 	/* Allow a null timespec (wait forever). */
3012 	if (uap->uaddr2 == NULL)
3013 		ts = NULL;
3014 	else {
3015 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3016 		if (error != 0)
3017 			return (error);
3018 		ts = &timeout;
3019 	}
3020 	return (do_lock_umtx(td, uap->obj, uap->val, ts));
3021 }
3022 
3023 static int
3024 __umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
3025 {
3026 	return (do_unlock_umtx(td, uap->obj, uap->val));
3027 }
3028 
3029 static int
3030 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3031 {
3032 	struct _umtx_time timeout, *tm_p;
3033 	int error;
3034 
3035 	if (uap->uaddr2 == NULL)
3036 		tm_p = NULL;
3037 	else {
3038 		error = umtx_copyin_umtx_time(
3039 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3040 		if (error != 0)
3041 			return (error);
3042 		tm_p = &timeout;
3043 	}
3044 	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
3045 }
3046 
3047 static int
3048 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3049 {
3050 	struct _umtx_time timeout, *tm_p;
3051 	int error;
3052 
3053 	if (uap->uaddr2 == NULL)
3054 		tm_p = NULL;
3055 	else {
3056 		error = umtx_copyin_umtx_time(
3057 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3058 		if (error != 0)
3059 			return (error);
3060 		tm_p = &timeout;
3061 	}
3062 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3063 }
3064 
3065 static int
3066 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3067 {
3068 	struct _umtx_time *tm_p, timeout;
3069 	int error;
3070 
3071 	if (uap->uaddr2 == NULL)
3072 		tm_p = NULL;
3073 	else {
3074 		error = umtx_copyin_umtx_time(
3075 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3076 		if (error != 0)
3077 			return (error);
3078 		tm_p = &timeout;
3079 	}
3080 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3081 }
3082 
3083 static int
3084 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3085 {
3086 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3087 }
3088 
3089 #define BATCH_SIZE	128
3090 static int
3091 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3092 {
3093 	int count = uap->val;
3094 	void *uaddrs[BATCH_SIZE];
3095 	char **upp = (char **)uap->obj;
3096 	int tocopy;
3097 	int error = 0;
3098 	int i, pos = 0;
3099 
3100 	while (count > 0) {
3101 		tocopy = count;
3102 		if (tocopy > BATCH_SIZE)
3103 			tocopy = BATCH_SIZE;
3104 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3105 		if (error != 0)
3106 			break;
3107 		for (i = 0; i < tocopy; ++i)
3108 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3109 		count -= tocopy;
3110 		pos += tocopy;
3111 	}
3112 	return (error);
3113 }
3114 
3115 static int
3116 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3117 {
3118 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3119 }
3120 
3121 static int
3122 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3123 {
3124 	struct _umtx_time *tm_p, timeout;
3125 	int error;
3126 
3127 	/* Allow a null timespec (wait forever). */
3128 	if (uap->uaddr2 == NULL)
3129 		tm_p = NULL;
3130 	else {
3131 		error = umtx_copyin_umtx_time(
3132 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3133 		if (error != 0)
3134 			return (error);
3135 		tm_p = &timeout;
3136 	}
3137 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3138 }
3139 
3140 static int
3141 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3142 {
3143 	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3144 }
3145 
3146 static int
3147 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3148 {
3149 	struct _umtx_time *tm_p, timeout;
3150 	int error;
3151 
3152 	/* Allow a null timespec (wait forever). */
3153 	if (uap->uaddr2 == NULL)
3154 		tm_p = NULL;
3155 	else {
3156 		error = umtx_copyin_umtx_time(
3157 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3158 		if (error != 0)
3159 			return (error);
3160 		tm_p = &timeout;
3161 	}
3162 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3163 }
3164 
3165 static int
3166 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3167 {
3168 	return do_wake_umutex(td, uap->obj);
3169 }
3170 
3171 static int
3172 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3173 {
3174 	return do_unlock_umutex(td, uap->obj);
3175 }
3176 
3177 static int
3178 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3179 {
3180 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3181 }
3182 
3183 static int
3184 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3185 {
3186 	struct timespec *ts, timeout;
3187 	int error;
3188 
3189 	/* Allow a null timespec (wait forever). */
3190 	if (uap->uaddr2 == NULL)
3191 		ts = NULL;
3192 	else {
3193 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3194 		if (error != 0)
3195 			return (error);
3196 		ts = &timeout;
3197 	}
3198 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3199 }
3200 
3201 static int
3202 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3203 {
3204 	return do_cv_signal(td, uap->obj);
3205 }
3206 
3207 static int
3208 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3209 {
3210 	return do_cv_broadcast(td, uap->obj);
3211 }
3212 
3213 static int
3214 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3215 {
3216 	struct _umtx_time timeout;
3217 	int error;
3218 
3219 	/* Allow a null timespec (wait forever). */
3220 	if (uap->uaddr2 == NULL) {
3221 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3222 	} else {
3223 		error = umtx_copyin_umtx_time(uap->uaddr2,
3224 		   (size_t)uap->uaddr1, &timeout);
3225 		if (error != 0)
3226 			return (error);
3227 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3228 	}
3229 	return (error);
3230 }
3231 
3232 static int
3233 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3234 {
3235 	struct _umtx_time timeout;
3236 	int error;
3237 
3238 	/* Allow a null timespec (wait forever). */
3239 	if (uap->uaddr2 == NULL) {
3240 		error = do_rw_wrlock(td, uap->obj, 0);
3241 	} else {
3242 		error = umtx_copyin_umtx_time(uap->uaddr2,
3243 		   (size_t)uap->uaddr1, &timeout);
3244 		if (error != 0)
3245 			return (error);
3246 
3247 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3248 	}
3249 	return (error);
3250 }
3251 
3252 static int
3253 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3254 {
3255 	return do_rw_unlock(td, uap->obj);
3256 }
3257 
3258 static int
3259 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3260 {
3261 	struct _umtx_time *tm_p, timeout;
3262 	int error;
3263 
3264 	/* Allow a null timespec (wait forever). */
3265 	if (uap->uaddr2 == NULL)
3266 		tm_p = NULL;
3267 	else {
3268 		error = umtx_copyin_umtx_time(
3269 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3270 		if (error != 0)
3271 			return (error);
3272 		tm_p = &timeout;
3273 	}
3274 	return (do_sem_wait(td, uap->obj, tm_p));
3275 }
3276 
3277 static int
3278 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3279 {
3280 	return do_sem_wake(td, uap->obj);
3281 }
3282 
3283 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3284 
3285 static _umtx_op_func op_table[] = {
3286 	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3287 	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3288 	__umtx_op_wait,			/* UMTX_OP_WAIT */
3289 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3290 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3291 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3292 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3293 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3294 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT*/
3295 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3296 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3297 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3298 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3299 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3300 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3301 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3302 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3303 	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3304 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3305 	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3306 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3307 	__umtx_op_nwake_private		/* UMTX_OP_NWAKE_PRIVATE */
3308 };
3309 
3310 int
3311 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3312 {
3313 	if ((unsigned)uap->op < UMTX_OP_MAX)
3314 		return (*op_table[uap->op])(td, uap);
3315 	return (EINVAL);
3316 }
3317 
3318 #ifdef COMPAT_FREEBSD32
3319 int
3320 freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3321     /* struct umtx *umtx */
3322 {
3323 	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3324 }
3325 
3326 int
3327 freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3328     /* struct umtx *umtx */
3329 {
3330 	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3331 }
3332 
3333 struct timespec32 {
3334 	uint32_t tv_sec;
3335 	uint32_t tv_nsec;
3336 };
3337 
3338 struct umtx_time32 {
3339 	struct	timespec32	timeout;
3340 	uint32_t		flags;
3341 	uint32_t		clockid;
3342 };
3343 
3344 static inline int
3345 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3346 {
3347 	struct timespec32 ts32;
3348 	int error;
3349 
3350 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3351 	if (error == 0) {
3352 		if (ts32.tv_sec < 0 ||
3353 		    ts32.tv_nsec >= 1000000000 ||
3354 		    ts32.tv_nsec < 0)
3355 			error = EINVAL;
3356 		else {
3357 			tsp->tv_sec = ts32.tv_sec;
3358 			tsp->tv_nsec = ts32.tv_nsec;
3359 		}
3360 	}
3361 	return (error);
3362 }
3363 
3364 static inline int
3365 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3366 {
3367 	struct umtx_time32 t32;
3368 	int error;
3369 
3370 	t32.clockid = CLOCK_REALTIME;
3371 	t32.flags   = 0;
3372 	if (size <= sizeof(struct timespec32))
3373 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3374 	else
3375 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3376 	if (error != 0)
3377 		return (error);
3378 	if (t32.timeout.tv_sec < 0 ||
3379 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3380 		return (EINVAL);
3381 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3382 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3383 	tp->_flags = t32.flags;
3384 	tp->_clockid = t32.clockid;
3385 	return (0);
3386 }
3387 
3388 static int
3389 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3390 {
3391 	struct timespec *ts, timeout;
3392 	int error;
3393 
3394 	/* Allow a null timespec (wait forever). */
3395 	if (uap->uaddr2 == NULL)
3396 		ts = NULL;
3397 	else {
3398 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3399 		if (error != 0)
3400 			return (error);
3401 		ts = &timeout;
3402 	}
3403 	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3404 }
3405 
3406 static int
3407 __umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3408 {
3409 	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3410 }
3411 
3412 static int
3413 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3414 {
3415 	struct _umtx_time *tm_p, timeout;
3416 	int error;
3417 
3418 	if (uap->uaddr2 == NULL)
3419 		tm_p = NULL;
3420 	else {
3421 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3422 			(size_t)uap->uaddr1, &timeout);
3423 		if (error != 0)
3424 			return (error);
3425 		tm_p = &timeout;
3426 	}
3427 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3428 }
3429 
3430 static int
3431 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3432 {
3433 	struct _umtx_time *tm_p, timeout;
3434 	int error;
3435 
3436 	/* Allow a null timespec (wait forever). */
3437 	if (uap->uaddr2 == NULL)
3438 		tm_p = NULL;
3439 	else {
3440 		error = umtx_copyin_umtx_time(uap->uaddr2,
3441 			    (size_t)uap->uaddr1, &timeout);
3442 		if (error != 0)
3443 			return (error);
3444 		tm_p = &timeout;
3445 	}
3446 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3447 }
3448 
3449 static int
3450 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3451 {
3452 	struct _umtx_time *tm_p, timeout;
3453 	int error;
3454 
3455 	/* Allow a null timespec (wait forever). */
3456 	if (uap->uaddr2 == NULL)
3457 		tm_p = NULL;
3458 	else {
3459 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3460 		    (size_t)uap->uaddr1, &timeout);
3461 		if (error != 0)
3462 			return (error);
3463 		tm_p = &timeout;
3464 	}
3465 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3466 }
3467 
3468 static int
3469 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3470 {
3471 	struct timespec *ts, timeout;
3472 	int error;
3473 
3474 	/* Allow a null timespec (wait forever). */
3475 	if (uap->uaddr2 == NULL)
3476 		ts = NULL;
3477 	else {
3478 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3479 		if (error != 0)
3480 			return (error);
3481 		ts = &timeout;
3482 	}
3483 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3484 }
3485 
3486 static int
3487 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3488 {
3489 	struct _umtx_time timeout;
3490 	int error;
3491 
3492 	/* Allow a null timespec (wait forever). */
3493 	if (uap->uaddr2 == NULL) {
3494 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3495 	} else {
3496 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3497 		    (size_t)uap->uaddr1, &timeout);
3498 		if (error != 0)
3499 			return (error);
3500 		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3501 	}
3502 	return (error);
3503 }
3504 
3505 static int
3506 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3507 {
3508 	struct _umtx_time timeout;
3509 	int error;
3510 
3511 	/* Allow a null timespec (wait forever). */
3512 	if (uap->uaddr2 == NULL) {
3513 		error = do_rw_wrlock(td, uap->obj, 0);
3514 	} else {
3515 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3516 		    (size_t)uap->uaddr1, &timeout);
3517 		if (error != 0)
3518 			return (error);
3519 		error = do_rw_wrlock2(td, uap->obj, &timeout);
3520 	}
3521 	return (error);
3522 }
3523 
3524 static int
3525 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3526 {
3527 	struct _umtx_time *tm_p, timeout;
3528 	int error;
3529 
3530 	if (uap->uaddr2 == NULL)
3531 		tm_p = NULL;
3532 	else {
3533 		error = umtx_copyin_umtx_time32(
3534 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3535 		if (error != 0)
3536 			return (error);
3537 		tm_p = &timeout;
3538 	}
3539 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3540 }
3541 
3542 static int
3543 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3544 {
3545 	struct _umtx_time *tm_p, timeout;
3546 	int error;
3547 
3548 	/* Allow a null timespec (wait forever). */
3549 	if (uap->uaddr2 == NULL)
3550 		tm_p = NULL;
3551 	else {
3552 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3553 		    (size_t)uap->uaddr1, &timeout);
3554 		if (error != 0)
3555 			return (error);
3556 		tm_p = &timeout;
3557 	}
3558 	return (do_sem_wait(td, uap->obj, tm_p));
3559 }
3560 
3561 static int
3562 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3563 {
3564 	int count = uap->val;
3565 	uint32_t uaddrs[BATCH_SIZE];
3566 	uint32_t **upp = (uint32_t **)uap->obj;
3567 	int tocopy;
3568 	int error = 0;
3569 	int i, pos = 0;
3570 
3571 	while (count > 0) {
3572 		tocopy = count;
3573 		if (tocopy > BATCH_SIZE)
3574 			tocopy = BATCH_SIZE;
3575 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3576 		if (error != 0)
3577 			break;
3578 		for (i = 0; i < tocopy; ++i)
3579 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3580 				INT_MAX, 1);
3581 		count -= tocopy;
3582 		pos += tocopy;
3583 	}
3584 	return (error);
3585 }
3586 
3587 static _umtx_op_func op_table_compat32[] = {
3588 	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3589 	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3590 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3591 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3592 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_LOCK */
3593 	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_TRYLOCK */
3594 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
3595 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3596 	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT*/
3597 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3598 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3599 	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3600 	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3601 	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3602 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3603 	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3604 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3605 	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3606 	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3607 	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3608 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3609 	__umtx_op_nwake_private32	/* UMTX_OP_NWAKE_PRIVATE */
3610 };
3611 
3612 int
3613 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3614 {
3615 	if ((unsigned)uap->op < UMTX_OP_MAX)
3616 		return (*op_table_compat32[uap->op])(td,
3617 			(struct _umtx_op_args *)uap);
3618 	return (EINVAL);
3619 }
3620 #endif
3621 
3622 void
3623 umtx_thread_init(struct thread *td)
3624 {
3625 	td->td_umtxq = umtxq_alloc();
3626 	td->td_umtxq->uq_thread = td;
3627 }
3628 
3629 void
3630 umtx_thread_fini(struct thread *td)
3631 {
3632 	umtxq_free(td->td_umtxq);
3633 }
3634 
3635 /*
3636  * It will be called when new thread is created, e.g fork().
3637  */
3638 void
3639 umtx_thread_alloc(struct thread *td)
3640 {
3641 	struct umtx_q *uq;
3642 
3643 	uq = td->td_umtxq;
3644 	uq->uq_inherited_pri = PRI_MAX;
3645 
3646 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3647 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3648 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3649 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3650 }
3651 
3652 /*
3653  * exec() hook.
3654  */
3655 static void
3656 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3657 	struct image_params *imgp __unused)
3658 {
3659 	umtx_thread_cleanup(curthread);
3660 }
3661 
3662 /*
3663  * thread_exit() hook.
3664  */
3665 void
3666 umtx_thread_exit(struct thread *td)
3667 {
3668 	umtx_thread_cleanup(td);
3669 }
3670 
3671 /*
3672  * clean up umtx data.
3673  */
3674 static void
3675 umtx_thread_cleanup(struct thread *td)
3676 {
3677 	struct umtx_q *uq;
3678 	struct umtx_pi *pi;
3679 
3680 	if ((uq = td->td_umtxq) == NULL)
3681 		return;
3682 
3683 	mtx_lock_spin(&umtx_lock);
3684 	uq->uq_inherited_pri = PRI_MAX;
3685 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3686 		pi->pi_owner = NULL;
3687 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3688 	}
3689 	mtx_unlock_spin(&umtx_lock);
3690 	thread_lock(td);
3691 	sched_lend_user_prio(td, PRI_MAX);
3692 	thread_unlock(td);
3693 }
3694