xref: /freebsd/sys/kern/kern_umtx.c (revision ff0ba87247820afbdfdc1b307c803f7923d0e4d3)
1 /*-
2  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice unmodified, this list of conditions, and the following
11  *    disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include "opt_compat.h"
32 #include "opt_umtx_profiling.h"
33 
34 #include <sys/param.h>
35 #include <sys/kernel.h>
36 #include <sys/limits.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/priv.h>
41 #include <sys/proc.h>
42 #include <sys/sbuf.h>
43 #include <sys/sched.h>
44 #include <sys/smp.h>
45 #include <sys/sysctl.h>
46 #include <sys/sysent.h>
47 #include <sys/systm.h>
48 #include <sys/sysproto.h>
49 #include <sys/syscallsubr.h>
50 #include <sys/eventhandler.h>
51 #include <sys/umtx.h>
52 
53 #include <vm/vm.h>
54 #include <vm/vm_param.h>
55 #include <vm/pmap.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_object.h>
58 
59 #include <machine/cpu.h>
60 
61 #ifdef COMPAT_FREEBSD32
62 #include <compat/freebsd32/freebsd32_proto.h>
63 #endif
64 
/* Values compared against the "mode" argument of do_lock_normal(). */
65 #define _UMUTEX_TRY		1
66 #define _UMUTEX_WAIT		2
67 
68 #ifdef UMTX_PROFILING
/*
 * True when the pair (w, f) is strictly greater than the pair (sw, sf):
 * w and sw are compared first, f and sf only break a tie.
 */
69 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
70 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
71 #endif
72 
73 /* Priority inheritance mutex info. */
74 struct umtx_pi {
75 	/* Owner thread */
76 	struct thread		*pi_owner;
77 
78 	/* Reference count */
79 	int			pi_refcount;
80 
81  	/* List entry to link umtx holding by thread */
82 	TAILQ_ENTRY(umtx_pi)	pi_link;
83 
84 	/* List entry in hash */
85 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
86 
87 	/* List for waiters */
88 	TAILQ_HEAD(,umtx_q)	pi_blocked;
89 
90 	/* Identify a userland lock object */
91 	struct umtx_key		pi_key;
92 };
93 
94 /* A userland synchronous object user. */
95 struct umtx_q {
96 	/* Linked list for the hash. */
97 	TAILQ_ENTRY(umtx_q)	uq_link;
98 
99 	/* Umtx key. */
100 	struct umtx_key		uq_key;
101 
102 	/* Umtx flags. */
103 	int			uq_flags;
104 #define UQF_UMTXQ	0x0001
105 
106 	/* The thread waits on. */
107 	struct thread		*uq_thread;
108 
109 	/*
110 	 * Blocked on PI mutex. read can use chain lock
111 	 * or umtx_lock, write must have both chain lock and
112 	 * umtx_lock being hold.
113 	 */
114 	struct umtx_pi		*uq_pi_blocked;
115 
116 	/* On blocked list */
117 	TAILQ_ENTRY(umtx_q)	uq_lockq;
118 
119 	/* Thread contending with us */
120 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
121 
122 	/* Inherited priority from PP mutex */
123 	u_char			uq_inherited_pri;
124 
125 	/* Spare queue ready to be reused */
126 	struct umtxq_queue	*uq_spare_queue;
127 
128 	/* The queue we on */
129 	struct umtxq_queue	*uq_cur_queue;
130 };
131 
132 TAILQ_HEAD(umtxq_head, umtx_q);
133 
134 /* Per-key wait-queue */
135 struct umtxq_queue {
136 	struct umtxq_head	head;
137 	struct umtx_key		key;
138 	LIST_ENTRY(umtxq_queue)	link;
139 	int			length;
140 };
141 
142 LIST_HEAD(umtxq_list, umtxq_queue);
143 
144 /* Userland lock object's wait-queue chain */
145 struct umtxq_chain {
146 	/* Lock for this chain. */
147 	struct mtx		uc_lock;
148 
149 	/* List of sleep queues. */
150 	struct umtxq_list	uc_queue[2];
151 #define UMTX_SHARED_QUEUE	0
152 #define UMTX_EXCLUSIVE_QUEUE	1
153 
154 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
155 
156 	/* Busy flag */
157 	char			uc_busy;
158 
159 	/* Chain lock waiters */
160 	int			uc_waiters;
161 
162 	/* All PI in the list */
163 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
164 
165 #ifdef UMTX_PROFILING
166 	u_int 			length;
167 	u_int			max_length;
168 #endif
169 };
170 
/* Assert that the caller owns the mutex of chain uc. */
171 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
172 
173 /*
 * Do not propagate time-sharing priority.  There is a security reason:
 * a user could simply create a PI mutex, let thread A lock it and let
 * another thread B block on it.  Because B is sleeping its priority is
 * boosted, which boosts A's priority through priority propagation as
 * well, and A's priority would never be lowered even if it used 100% of
 * the CPU.  That is unfair to other processes.
 *
 * UPRI() therefore maps any td_user_pri inside the time-sharing range
 * to PRI_MAX_TIMESHARE and yields every other priority unchanged.
 */
181 
182 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
183 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
184 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
185 
/* Multiplier, table size and shift used by umtxq_hash(). */
186 #define	GOLDEN_RATIO_PRIME	2654404609U
187 #define	UMTX_CHAINS		512
188 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
189 
/* Map the USYNC_PROCESS_SHARED flag bit to a share mode for umtx_key_get(). */
190 #define	GET_SHARE(flags)	\
191     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
192 
/* Initial spin count used by umtxq_busy() before it falls back to sleeping. */
193 #define BUSY_SPINS		200
194 
195 struct abs_timeout {
196 	int clockid;
197 	struct timespec cur;
198 	struct timespec end;
199 };
200 
/* UMA zone for struct umtx_pi; created in umtxq_sysinit(). */
201 static uma_zone_t		umtx_pi_zone;
/* Two tables of chains; umtxq_getchain() picks the table from the key type. */
202 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
203 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
204 static int			umtx_pi_allocated;
205 
/* debug.umtx sysctl tree; umtx_pi_allocated is exported read-only. */
206 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
207 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
208     &umtx_pi_allocated, 0, "Allocated umtx_pi");
209 
210 #ifdef UMTX_PROFILING
/* Largest per-chain max_length seen; raised in umtxq_insert_queue(). */
211 static long max_length;
212 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
213 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
214 #endif
215 
/* Forward declarations for the file-local helpers. */
216 static void umtxq_sysinit(void *);
217 static void umtxq_hash(struct umtx_key *key);
218 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
219 static void umtxq_lock(struct umtx_key *key);
220 static void umtxq_unlock(struct umtx_key *key);
221 static void umtxq_busy(struct umtx_key *key);
222 static void umtxq_unbusy(struct umtx_key *key);
223 static void umtxq_insert_queue(struct umtx_q *uq, int q);
224 static void umtxq_remove_queue(struct umtx_q *uq, int q);
225 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
226 static int umtxq_count(struct umtx_key *key);
227 static struct umtx_pi *umtx_pi_alloc(int);
228 static void umtx_pi_free(struct umtx_pi *pi);
229 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
230 static void umtx_thread_cleanup(struct thread *td);
231 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
232 	struct image_params *imgp __unused);
/* Register umtxq_sysinit() as a system initialization routine. */
233 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
234 
/* Convenience wrappers that operate on the shared queue (UMTX_SHARED_QUEUE). */
235 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
236 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
237 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
238 
/* Initialized as a spin mutex (MTX_SPIN) in umtxq_sysinit(). */
239 static struct mtx umtx_lock;
240 
241 #ifdef UMTX_PROFILING
242 static void
243 umtx_init_profiling(void)
244 {
245 	struct sysctl_oid *chain_oid;
246 	char chain_name[10];
247 	int i;
248 
249 	for (i = 0; i < UMTX_CHAINS; ++i) {
250 		snprintf(chain_name, sizeof(chain_name), "%d", i);
251 		chain_oid = SYSCTL_ADD_NODE(NULL,
252 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
253 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
254 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
255 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
256 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
257 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
258 	}
259 }
260 
261 static int
262 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
263 {
264 	char buf[512];
265 	struct sbuf sb;
266 	struct umtxq_chain *uc;
267 	u_int fract, i, j, tot, whole;
268 	u_int sf0, sf1, sf2, sf3, sf4;
269 	u_int si0, si1, si2, si3, si4;
270 	u_int sw0, sw1, sw2, sw3, sw4;
271 
272 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
273 	for (i = 0; i < 2; i++) {
274 		tot = 0;
275 		for (j = 0; j < UMTX_CHAINS; ++j) {
276 			uc = &umtxq_chains[i][j];
277 			mtx_lock(&uc->uc_lock);
278 			tot += uc->max_length;
279 			mtx_unlock(&uc->uc_lock);
280 		}
281 		if (tot == 0)
282 			sbuf_printf(&sb, "%u) Empty ", i);
283 		else {
284 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
285 			si0 = si1 = si2 = si3 = si4 = 0;
286 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
287 			for (j = 0; j < UMTX_CHAINS; j++) {
288 				uc = &umtxq_chains[i][j];
289 				mtx_lock(&uc->uc_lock);
290 				whole = uc->max_length * 100;
291 				mtx_unlock(&uc->uc_lock);
292 				fract = (whole % tot) * 100;
293 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
294 					sf0 = fract;
295 					si0 = j;
296 					sw0 = whole;
297 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
298 				    sf1)) {
299 					sf1 = fract;
300 					si1 = j;
301 					sw1 = whole;
302 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
303 				    sf2)) {
304 					sf2 = fract;
305 					si2 = j;
306 					sw2 = whole;
307 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
308 				    sf3)) {
309 					sf3 = fract;
310 					si3 = j;
311 					sw3 = whole;
312 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
313 				    sf4)) {
314 					sf4 = fract;
315 					si4 = j;
316 					sw4 = whole;
317 				}
318 			}
319 			sbuf_printf(&sb, "queue %u:\n", i);
320 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
321 			    sf0 / tot, si0);
322 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
323 			    sf1 / tot, si1);
324 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
325 			    sf2 / tot, si2);
326 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
327 			    sf3 / tot, si3);
328 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
329 			    sf4 / tot, si4);
330 		}
331 	}
332 	sbuf_trim(&sb);
333 	sbuf_finish(&sb);
334 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
335 	sbuf_delete(&sb);
336 	return (0);
337 }
338 
339 static int
340 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
341 {
342 	struct umtxq_chain *uc;
343 	u_int i, j;
344 	int clear, error;
345 
346 	clear = 0;
347 	error = sysctl_handle_int(oidp, &clear, 0, req);
348 	if (error != 0 || req->newptr == NULL)
349 		return (error);
350 
351 	if (clear != 0) {
352 		for (i = 0; i < 2; ++i) {
353 			for (j = 0; j < UMTX_CHAINS; ++j) {
354 				uc = &umtxq_chains[i][j];
355 				mtx_lock(&uc->uc_lock);
356 				uc->length = 0;
357 				uc->max_length = 0;
358 				mtx_unlock(&uc->uc_lock);
359 			}
360 		}
361 	}
362 	return (0);
363 }
364 
/* Writable integer "clear", handled by sysctl_debug_umtx_chains_clear(). */
365 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
366     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
367     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
/* Read-only string "peaks", produced by sysctl_debug_umtx_chains_peaks(). */
368 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
369     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
370     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
371 #endif
372 
373 static void
374 umtxq_sysinit(void *arg __unused)
375 {
376 	int i, j;
377 
378 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
379 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
380 	for (i = 0; i < 2; ++i) {
381 		for (j = 0; j < UMTX_CHAINS; ++j) {
382 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
383 				 MTX_DEF | MTX_DUPOK);
384 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
385 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
386 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
387 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
388 			umtxq_chains[i][j].uc_busy = 0;
389 			umtxq_chains[i][j].uc_waiters = 0;
390 #ifdef UMTX_PROFILING
391 			umtxq_chains[i][j].length = 0;
392 			umtxq_chains[i][j].max_length = 0;
393 #endif
394 		}
395 	}
396 #ifdef UMTX_PROFILING
397 	umtx_init_profiling();
398 #endif
399 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
400 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
401 	    EVENTHANDLER_PRI_ANY);
402 }
403 
404 struct umtx_q *
405 umtxq_alloc(void)
406 {
407 	struct umtx_q *uq;
408 
409 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
410 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
411 	TAILQ_INIT(&uq->uq_spare_queue->head);
412 	TAILQ_INIT(&uq->uq_pi_contested);
413 	uq->uq_inherited_pri = PRI_MAX;
414 	return (uq);
415 }
416 
417 void
418 umtxq_free(struct umtx_q *uq)
419 {
420 	MPASS(uq->uq_spare_queue != NULL);
421 	free(uq->uq_spare_queue, M_UMTX);
422 	free(uq, M_UMTX);
423 }
424 
425 static inline void
426 umtxq_hash(struct umtx_key *key)
427 {
428 	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
429 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
430 }
431 
432 static inline struct umtxq_chain *
433 umtxq_getchain(struct umtx_key *key)
434 {
435 	if (key->type <= TYPE_SEM)
436 		return (&umtxq_chains[1][key->hash]);
437 	return (&umtxq_chains[0][key->hash]);
438 }
439 
440 /*
441  * Lock a chain.
442  */
443 static inline void
444 umtxq_lock(struct umtx_key *key)
445 {
446 	struct umtxq_chain *uc;
447 
448 	uc = umtxq_getchain(key);
449 	mtx_lock(&uc->uc_lock);
450 }
451 
452 /*
453  * Unlock a chain.
454  */
455 static inline void
456 umtxq_unlock(struct umtx_key *key)
457 {
458 	struct umtxq_chain *uc;
459 
460 	uc = umtxq_getchain(key);
461 	mtx_unlock(&uc->uc_lock);
462 }
463 
464 /*
465  * Set chain to busy state when following operation
466  * may be blocked (kernel mutex can not be used).
467  */
468 static inline void
469 umtxq_busy(struct umtx_key *key)
470 {
471 	struct umtxq_chain *uc;
472 
473 	uc = umtxq_getchain(key);
474 	mtx_assert(&uc->uc_lock, MA_OWNED);
475 	if (uc->uc_busy) {
476 #ifdef SMP
477 		if (smp_cpus > 1) {
478 			int count = BUSY_SPINS;
479 			if (count > 0) {
480 				umtxq_unlock(key);
481 				while (uc->uc_busy && --count > 0)
482 					cpu_spinwait();
483 				umtxq_lock(key);
484 			}
485 		}
486 #endif
487 		while (uc->uc_busy) {
488 			uc->uc_waiters++;
489 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
490 			uc->uc_waiters--;
491 		}
492 	}
493 	uc->uc_busy = 1;
494 }
495 
496 /*
497  * Unbusy a chain.
498  */
499 static inline void
500 umtxq_unbusy(struct umtx_key *key)
501 {
502 	struct umtxq_chain *uc;
503 
504 	uc = umtxq_getchain(key);
505 	mtx_assert(&uc->uc_lock, MA_OWNED);
506 	KASSERT(uc->uc_busy != 0, ("not busy"));
507 	uc->uc_busy = 0;
508 	if (uc->uc_waiters)
509 		wakeup_one(uc);
510 }
511 
512 static inline void
513 umtxq_unbusy_unlocked(struct umtx_key *key)
514 {
515 
516 	umtxq_lock(key);
517 	umtxq_unbusy(key);
518 	umtxq_unlock(key);
519 }
520 
521 static struct umtxq_queue *
522 umtxq_queue_lookup(struct umtx_key *key, int q)
523 {
524 	struct umtxq_queue *uh;
525 	struct umtxq_chain *uc;
526 
527 	uc = umtxq_getchain(key);
528 	UMTXQ_LOCKED_ASSERT(uc);
529 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
530 		if (umtx_key_match(&uh->key, key))
531 			return (uh);
532 	}
533 
534 	return (NULL);
535 }
536 
537 static inline void
538 umtxq_insert_queue(struct umtx_q *uq, int q)
539 {
540 	struct umtxq_queue *uh;
541 	struct umtxq_chain *uc;
542 
543 	uc = umtxq_getchain(&uq->uq_key);
544 	UMTXQ_LOCKED_ASSERT(uc);
545 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
546 	uh = umtxq_queue_lookup(&uq->uq_key, q);
547 	if (uh != NULL) {
548 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
549 	} else {
550 		uh = uq->uq_spare_queue;
551 		uh->key = uq->uq_key;
552 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
553 #ifdef UMTX_PROFILING
554 		uc->length++;
555 		if (uc->length > uc->max_length) {
556 			uc->max_length = uc->length;
557 			if (uc->max_length > max_length)
558 				max_length = uc->max_length;
559 		}
560 #endif
561 	}
562 	uq->uq_spare_queue = NULL;
563 
564 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
565 	uh->length++;
566 	uq->uq_flags |= UQF_UMTXQ;
567 	uq->uq_cur_queue = uh;
568 	return;
569 }
570 
571 static inline void
572 umtxq_remove_queue(struct umtx_q *uq, int q)
573 {
574 	struct umtxq_chain *uc;
575 	struct umtxq_queue *uh;
576 
577 	uc = umtxq_getchain(&uq->uq_key);
578 	UMTXQ_LOCKED_ASSERT(uc);
579 	if (uq->uq_flags & UQF_UMTXQ) {
580 		uh = uq->uq_cur_queue;
581 		TAILQ_REMOVE(&uh->head, uq, uq_link);
582 		uh->length--;
583 		uq->uq_flags &= ~UQF_UMTXQ;
584 		if (TAILQ_EMPTY(&uh->head)) {
585 			KASSERT(uh->length == 0,
586 			    ("inconsistent umtxq_queue length"));
587 #ifdef UMTX_PROFILING
588 			uc->length--;
589 #endif
590 			LIST_REMOVE(uh, link);
591 		} else {
592 			uh = LIST_FIRST(&uc->uc_spare_queue);
593 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
594 			LIST_REMOVE(uh, link);
595 		}
596 		uq->uq_spare_queue = uh;
597 		uq->uq_cur_queue = NULL;
598 	}
599 }
600 
601 /*
602  * Check if there are multiple waiters
603  */
604 static int
605 umtxq_count(struct umtx_key *key)
606 {
607 	struct umtxq_chain *uc;
608 	struct umtxq_queue *uh;
609 
610 	uc = umtxq_getchain(key);
611 	UMTXQ_LOCKED_ASSERT(uc);
612 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
613 	if (uh != NULL)
614 		return (uh->length);
615 	return (0);
616 }
617 
618 /*
619  * Check if there are multiple PI waiters and returns first
620  * waiter.
621  */
622 static int
623 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
624 {
625 	struct umtxq_chain *uc;
626 	struct umtxq_queue *uh;
627 
628 	*first = NULL;
629 	uc = umtxq_getchain(key);
630 	UMTXQ_LOCKED_ASSERT(uc);
631 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
632 	if (uh != NULL) {
633 		*first = TAILQ_FIRST(&uh->head);
634 		return (uh->length);
635 	}
636 	return (0);
637 }
638 
639 static int
640 umtxq_check_susp(struct thread *td)
641 {
642 	struct proc *p;
643 	int error;
644 
645 	/*
646 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
647 	 * eventually break the lockstep loop.
648 	 */
649 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
650 		return (0);
651 	error = 0;
652 	p = td->td_proc;
653 	PROC_LOCK(p);
654 	if (P_SHOULDSTOP(p) ||
655 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
656 		if (p->p_flag & P_SINGLE_EXIT)
657 			error = EINTR;
658 		else
659 			error = ERESTART;
660 	}
661 	PROC_UNLOCK(p);
662 	return (error);
663 }
664 
665 /*
666  * Wake up threads waiting on an userland object.
667  */
668 
669 static int
670 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
671 {
672 	struct umtxq_chain *uc;
673 	struct umtxq_queue *uh;
674 	struct umtx_q *uq;
675 	int ret;
676 
677 	ret = 0;
678 	uc = umtxq_getchain(key);
679 	UMTXQ_LOCKED_ASSERT(uc);
680 	uh = umtxq_queue_lookup(key, q);
681 	if (uh != NULL) {
682 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
683 			umtxq_remove_queue(uq, q);
684 			wakeup(uq);
685 			if (++ret >= n_wake)
686 				return (ret);
687 		}
688 	}
689 	return (ret);
690 }
691 
692 
693 /*
694  * Wake up specified thread.
695  */
696 static inline void
697 umtxq_signal_thread(struct umtx_q *uq)
698 {
699 	struct umtxq_chain *uc;
700 
701 	uc = umtxq_getchain(&uq->uq_key);
702 	UMTXQ_LOCKED_ASSERT(uc);
703 	umtxq_remove(uq);
704 	wakeup(uq);
705 }
706 
707 static inline int
708 tstohz(const struct timespec *tsp)
709 {
710 	struct timeval tv;
711 
712 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
713 	return tvtohz(&tv);
714 }
715 
716 static void
717 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
718 	const struct timespec *timeout)
719 {
720 
721 	timo->clockid = clockid;
722 	if (!absolute) {
723 		kern_clock_gettime(curthread, clockid, &timo->end);
724 		timo->cur = timo->end;
725 		timespecadd(&timo->end, timeout);
726 	} else {
727 		timo->end = *timeout;
728 		kern_clock_gettime(curthread, clockid, &timo->cur);
729 	}
730 }
731 
732 static void
733 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
734 {
735 
736 	abs_timeout_init(timo, umtxtime->_clockid,
737 		(umtxtime->_flags & UMTX_ABSTIME) != 0,
738 		&umtxtime->_timeout);
739 }
740 
741 static inline void
742 abs_timeout_update(struct abs_timeout *timo)
743 {
744 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
745 }
746 
747 static int
748 abs_timeout_gethz(struct abs_timeout *timo)
749 {
750 	struct timespec tts;
751 
752 	if (timespeccmp(&timo->end, &timo->cur, <=))
753 		return (-1);
754 	tts = timo->end;
755 	timespecsub(&tts, &timo->cur);
756 	return (tstohz(&tts));
757 }
758 
759 /*
760  * Put thread into sleep state, before sleeping, check if
761  * thread was removed from umtx queue.
762  */
763 static inline int
764 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
765 {
766 	struct umtxq_chain *uc;
767 	int error, timo;
768 
769 	uc = umtxq_getchain(&uq->uq_key);
770 	UMTXQ_LOCKED_ASSERT(uc);
771 	for (;;) {
772 		if (!(uq->uq_flags & UQF_UMTXQ))
773 			return (0);
774 		if (abstime != NULL) {
775 			timo = abs_timeout_gethz(abstime);
776 			if (timo < 0)
777 				return (ETIMEDOUT);
778 		} else
779 			timo = 0;
780 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
781 		if (error != EWOULDBLOCK) {
782 			umtxq_lock(&uq->uq_key);
783 			break;
784 		}
785 		if (abstime != NULL)
786 			abs_timeout_update(abstime);
787 		umtxq_lock(&uq->uq_key);
788 	}
789 	return (error);
790 }
791 
792 /*
793  * Convert userspace address into unique logical address.
794  */
795 int
796 umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
797 {
798 	struct thread *td = curthread;
799 	vm_map_t map;
800 	vm_map_entry_t entry;
801 	vm_pindex_t pindex;
802 	vm_prot_t prot;
803 	boolean_t wired;
804 
805 	key->type = type;
806 	if (share == THREAD_SHARE) {
807 		key->shared = 0;
808 		key->info.private.vs = td->td_proc->p_vmspace;
809 		key->info.private.addr = (uintptr_t)addr;
810 	} else {
811 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
812 		map = &td->td_proc->p_vmspace->vm_map;
813 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
814 		    &entry, &key->info.shared.object, &pindex, &prot,
815 		    &wired) != KERN_SUCCESS) {
816 			return EFAULT;
817 		}
818 
819 		if ((share == PROCESS_SHARE) ||
820 		    (share == AUTO_SHARE &&
821 		     VM_INHERIT_SHARE == entry->inheritance)) {
822 			key->shared = 1;
823 			key->info.shared.offset = entry->offset + entry->start -
824 				(vm_offset_t)addr;
825 			vm_object_reference(key->info.shared.object);
826 		} else {
827 			key->shared = 0;
828 			key->info.private.vs = td->td_proc->p_vmspace;
829 			key->info.private.addr = (uintptr_t)addr;
830 		}
831 		vm_map_lookup_done(map, entry);
832 	}
833 
834 	umtxq_hash(key);
835 	return (0);
836 }
837 
838 /*
839  * Release key.
840  */
841 void
842 umtx_key_release(struct umtx_key *key)
843 {
844 	if (key->shared)
845 		vm_object_deallocate(key->info.shared.object);
846 }
847 
848 /*
849  * Fetch and compare value, sleep on the address if value is not changed.
850  */
851 static int
852 do_wait(struct thread *td, void *addr, u_long id,
853 	struct _umtx_time *timeout, int compat32, int is_private)
854 {
855 	struct abs_timeout timo;
856 	struct umtx_q *uq;
857 	u_long tmp;
858 	uint32_t tmp32;
859 	int error = 0;
860 
861 	uq = td->td_umtxq;
862 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
863 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
864 		return (error);
865 
866 	if (timeout != NULL)
867 		abs_timeout_init2(&timo, timeout);
868 
869 	umtxq_lock(&uq->uq_key);
870 	umtxq_insert(uq);
871 	umtxq_unlock(&uq->uq_key);
872 	if (compat32 == 0) {
873 		error = fueword(addr, &tmp);
874 		if (error != 0)
875 			error = EFAULT;
876 	} else {
877 		error = fueword32(addr, &tmp32);
878 		if (error == 0)
879 			tmp = tmp32;
880 		else
881 			error = EFAULT;
882 	}
883 	umtxq_lock(&uq->uq_key);
884 	if (error == 0) {
885 		if (tmp == id)
886 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
887 			    NULL : &timo);
888 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
889 			error = 0;
890 		else
891 			umtxq_remove(uq);
892 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
893 		umtxq_remove(uq);
894 	}
895 	umtxq_unlock(&uq->uq_key);
896 	umtx_key_release(&uq->uq_key);
897 	if (error == ERESTART)
898 		error = EINTR;
899 	return (error);
900 }
901 
902 /*
903  * Wake up threads sleeping on the specified address.
904  */
905 int
906 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
907 {
908 	struct umtx_key key;
909 	int ret;
910 
911 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
912 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
913 		return (ret);
914 	umtxq_lock(&key);
915 	ret = umtxq_signal(&key, n_wake);
916 	umtxq_unlock(&key);
917 	umtx_key_release(&key);
918 	return (0);
919 }
920 
921 /*
922  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
 *
 * td       calling thread; its tid is the value stored as owner
 * m        userland mutex
 * flags    mutex flags, used only to choose the key's share mode
 * timeout  optional timeout for the sleep, NULL to wait indefinitely
 * mode     0 to lock, _UMUTEX_TRY to fail with EBUSY instead of
 *          sleeping, _UMUTEX_WAIT to wait until the mutex is unowned
 *          without acquiring it
 *
 * Returns 0 on success, EFAULT when the owner word cannot be accessed,
 * EBUSY in try mode when the mutex is held, or an error coming from
 * umtx_key_get(), umtxq_sleep() or umtxq_check_susp().
923  */
924 static int
925 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
926 	struct _umtx_time *timeout, int mode)
927 {
928 	struct abs_timeout timo;
929 	struct umtx_q *uq;
930 	uint32_t owner, old, id;
931 	int error, rv;
932 
933 	id = td->td_tid;
934 	uq = td->td_umtxq;
935 	error = 0;
936 	if (timeout != NULL)
937 		abs_timeout_init2(&timo, timeout);
938 
939 	/*
940 	 * Care must be exercised when dealing with umtx structure. It
941 	 * can fault on any access.
942 	 */
943 	for (;;) {
944 		rv = fueword32(&m->m_owner, &owner);
945 		if (rv == -1)
946 			return (EFAULT);
		/*
		 * Wait-only mode: succeed as soon as nobody owns the
		 * mutex; the owner word is not modified.
		 */
947 		if (mode == _UMUTEX_WAIT) {
948 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
949 				return (0);
950 		} else {
951 			/*
952 			 * Try the uncontested case.  This should be done in userland.
953 			 */
954 			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
955 			    &owner, id);
956 			/* The address was invalid. */
957 			if (rv == -1)
958 				return (EFAULT);
959 
960 			/* The acquire succeeded. */
961 			if (owner == UMUTEX_UNOWNED)
962 				return (0);
963 
964 			/* If no one owns it but it is contested try to acquire it. */
965 			if (owner == UMUTEX_CONTESTED) {
966 				rv = casueword32(&m->m_owner,
967 				    UMUTEX_CONTESTED, &owner,
968 				    id | UMUTEX_CONTESTED);
969 				/* The address was invalid. */
970 				if (rv == -1)
971 					return (EFAULT);
972 
973 				if (owner == UMUTEX_CONTESTED)
974 					return (0);
975 
976 				rv = umtxq_check_susp(td);
977 				if (rv != 0)
978 					return (rv);
979 
980 				/* If this failed the lock has changed, restart. */
981 				continue;
982 			}
983 		}
984 
985 		if (mode == _UMUTEX_TRY)
986 			return (EBUSY);
987 
988 		/*
989 		 * If we caught a signal, we have retried and now
990 		 * exit immediately.
991 		 */
992 		if (error != 0)
993 			return (error);
994 
995 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
996 		    GET_SHARE(flags), &uq->uq_key)) != 0)
997 			return (error);
998 
		/*
		 * Queue ourselves and mark the chain busy; the chain lock
		 * is dropped across the userland access below.
		 */
999 		umtxq_lock(&uq->uq_key);
1000 		umtxq_busy(&uq->uq_key);
1001 		umtxq_insert(uq);
1002 		umtxq_unlock(&uq->uq_key);
1003 
1004 		/*
1005 		 * Set the contested bit so that a release in user space
1006 		 * knows to use the system call for unlock.  If this fails
1007 		 * either some one else has acquired the lock or it has been
1008 		 * released.
1009 		 */
1010 		rv = casueword32(&m->m_owner, owner, &old,
1011 		    owner | UMUTEX_CONTESTED);
1012 
1013 		/* The address was invalid. */
1014 		if (rv == -1) {
1015 			umtxq_lock(&uq->uq_key);
1016 			umtxq_remove(uq);
1017 			umtxq_unbusy(&uq->uq_key);
1018 			umtxq_unlock(&uq->uq_key);
1019 			umtx_key_release(&uq->uq_key);
1020 			return (EFAULT);
1021 		}
1022 
1023 		/*
1024 		 * We set the contested bit, sleep. Otherwise the lock changed
1025 		 * and we need to retry or we lost a race to the thread
1026 		 * unlocking the umtx.
1027 		 */
1028 		umtxq_lock(&uq->uq_key);
1029 		umtxq_unbusy(&uq->uq_key);
1030 		if (old == owner)
1031 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1032 			    NULL : &timo);
1033 		umtxq_remove(uq);
1034 		umtxq_unlock(&uq->uq_key);
1035 		umtx_key_release(&uq->uq_key);
1036 
		/*
		 * A non-zero error is not returned here: the loop makes one
		 * more attempt first and returns it from the check above.
		 */
1037 		if (error == 0)
1038 			error = umtxq_check_susp(td);
1039 	}
1040 
	/* Not reached: the loop above is only left through a return. */
1041 	return (0);
1042 }
1043 
1044 /*
1045  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
 *
 * Returns 0 on success, EFAULT when the owner word cannot be accessed,
 * EPERM when the owner word (ignoring the contested bit) is not the
 * caller's tid, EINVAL when the owner word changed before the final
 * compare-and-set, or the error from umtx_key_get().
1046  */
1047 static int
1048 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1049 {
1050 	struct umtx_key key;
1051 	uint32_t owner, old, id;
1052 	int error;
1053 	int count;
1054 
1055 	id = td->td_tid;
1056 	/*
1057 	 * Make sure we own this mtx.
1058 	 */
1059 	error = fueword32(&m->m_owner, &owner);
1060 	if (error == -1)
1061 		return (EFAULT);
1062 
1063 	if ((owner & ~UMUTEX_CONTESTED) != id)
1064 		return (EPERM);
1065 
	/*
	 * Uncontested: try to release directly.  If the word changed in
	 * the meantime, continue with the value just read.
	 */
1066 	if ((owner & UMUTEX_CONTESTED) == 0) {
1067 		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
1068 		if (error == -1)
1069 			return (EFAULT);
1070 		if (old == owner)
1071 			return (0);
1072 		owner = old;
1073 	}
1074 
1075 	/* We should only ever be in here for contested locks */
1076 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1077 	    &key)) != 0)
1078 		return (error);
1079 
1080 	umtxq_lock(&key);
1081 	umtxq_busy(&key);
1082 	count = umtxq_count(&key);
1083 	umtxq_unlock(&key);
1084 
1085 	/*
1086 	 * When unlocking the umtx, it must be marked as unowned if
1087 	 * there is zero or one thread only waiting for it.
1088 	 * Otherwise, it must be marked as contested.
1089 	 */
1090 	error = casueword32(&m->m_owner, owner, &old,
1091 	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	/*
	 * One waiter is signalled and the chain is unbusied before the
	 * result of the compare-and-set is examined.
	 */
1092 	umtxq_lock(&key);
1093 	umtxq_signal(&key,1);
1094 	umtxq_unbusy(&key);
1095 	umtxq_unlock(&key);
1096 	umtx_key_release(&key);
1097 	if (error == -1)
1098 		return (EFAULT);
1099 	if (old != owner)
1100 		return (EINVAL);
1101 	return (0);
1102 }
1103 
1104 /*
1105  * Check if the mutex is available and wake up a waiter,
1106  * only for simple mutex.
 *
 * Returns 0 without doing anything when the owner word names an owner.
 * Otherwise, if at most one waiter is queued, the owner word is changed
 * from UMUTEX_CONTESTED to UMUTEX_UNOWNED, and one waiter is woken when
 * there is one and the owner word still shows no owner.  Returns EFAULT
 * when user memory cannot be accessed, or the error from umtx_key_get().
1107  */
1108 static int
1109 do_wake_umutex(struct thread *td, struct umutex *m)
1110 {
1111 	struct umtx_key key;
1112 	uint32_t owner;
1113 	uint32_t flags;
1114 	int error;
1115 	int count;
1116 
1117 	error = fueword32(&m->m_owner, &owner);
1118 	if (error == -1)
1119 		return (EFAULT);
1120 
	/* The mutex has an owner: nothing to wake. */
1121 	if ((owner & ~UMUTEX_CONTESTED) != 0)
1122 		return (0);
1123 
1124 	error = fueword32(&m->m_flags, &flags);
1125 	if (error == -1)
1126 		return (EFAULT);
1127 
1128 	/* We should only ever be in here for contested locks */
1129 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1130 	    &key)) != 0)
1131 		return (error);
1132 
1133 	umtxq_lock(&key);
1134 	umtxq_busy(&key);
1135 	count = umtxq_count(&key);
1136 	umtxq_unlock(&key);
1137 
	/*
	 * With at most one waiter the contested mark can be dropped; the
	 * compare-and-set also refreshes owner with the current word.
	 */
1138 	if (count <= 1) {
1139 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
1140 		    UMUTEX_UNOWNED);
1141 		if (error == -1)
1142 			error = EFAULT;
1143 	}
1144 
1145 	umtxq_lock(&key);
1146 	if (error == 0 && count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1147 		umtxq_signal(&key, 1);
1148 	umtxq_unbusy(&key);
1149 	umtxq_unlock(&key);
1150 	umtx_key_release(&key);
1151 	return (error);
1152 }
1153 
1154 /*
1155  * Check if the mutex has waiters and tries to fix contention bit.
1156  */
1157 static int
1158 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1159 {
1160 	struct umtx_key key;
1161 	uint32_t owner, old;
1162 	int type;
1163 	int error;
1164 	int count;
1165 
1166 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1167 	case 0:
1168 		type = TYPE_NORMAL_UMUTEX;
1169 		break;
1170 	case UMUTEX_PRIO_INHERIT:
1171 		type = TYPE_PI_UMUTEX;
1172 		break;
1173 	case UMUTEX_PRIO_PROTECT:
1174 		type = TYPE_PP_UMUTEX;
1175 		break;
1176 	default:
1177 		return (EINVAL);
1178 	}
1179 	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
1180 	    &key)) != 0)
1181 		return (error);
1182 
1183 	owner = 0;
1184 	umtxq_lock(&key);
1185 	umtxq_busy(&key);
1186 	count = umtxq_count(&key);
1187 	umtxq_unlock(&key);
1188 	/*
1189 	 * Only repair contention bit if there is a waiter, this means the mutex
1190 	 * is still being referenced by userland code, otherwise don't update
1191 	 * any memory.
1192 	 */
1193 	if (count > 1) {
1194 		error = fueword32(&m->m_owner, &owner);
1195 		if (error == -1)
1196 			error = EFAULT;
1197 		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
1198 			error = casueword32(&m->m_owner, owner, &old,
1199 			    owner | UMUTEX_CONTESTED);
1200 			if (error == -1) {
1201 				error = EFAULT;
1202 				break;
1203 			}
1204 			if (old == owner)
1205 				break;
1206 			owner = old;
1207 			error = umtxq_check_susp(td);
1208 			if (error != 0)
1209 				break;
1210 		}
1211 	} else if (count == 1) {
1212 		error = fueword32(&m->m_owner, &owner);
1213 		if (error == -1)
1214 			error = EFAULT;
1215 		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
1216 		       (owner & UMUTEX_CONTESTED) == 0) {
1217 			error = casueword32(&m->m_owner, owner, &old,
1218 			    owner | UMUTEX_CONTESTED);
1219 			if (error == -1) {
1220 				error = EFAULT;
1221 				break;
1222 			}
1223 			if (old == owner)
1224 				break;
1225 			owner = old;
1226 			error = umtxq_check_susp(td);
1227 			if (error != 0)
1228 				break;
1229 		}
1230 	}
1231 	umtxq_lock(&key);
1232 	if (error == EFAULT) {
1233 		umtxq_signal(&key, INT_MAX);
1234 	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1235 		umtxq_signal(&key, 1);
1236 	umtxq_unbusy(&key);
1237 	umtxq_unlock(&key);
1238 	umtx_key_release(&key);
1239 	return (error);
1240 }
1241 
1242 static inline struct umtx_pi *
1243 umtx_pi_alloc(int flags)
1244 {
1245 	struct umtx_pi *pi;
1246 
1247 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1248 	TAILQ_INIT(&pi->pi_blocked);
1249 	atomic_add_int(&umtx_pi_allocated, 1);
1250 	return (pi);
1251 }
1252 
/*
 * Release a PI mutex record: hand it back to umtx_pi_zone and
 * atomically decrement the umtx_pi_allocated counter.
 */
static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}
1259 
1260 /*
1261  * Adjust the thread's position on a pi_state after its priority has been
1262  * changed.
1263  */
1264 static int
1265 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1266 {
1267 	struct umtx_q *uq, *uq1, *uq2;
1268 	struct thread *td1;
1269 
1270 	mtx_assert(&umtx_lock, MA_OWNED);
1271 	if (pi == NULL)
1272 		return (0);
1273 
1274 	uq = td->td_umtxq;
1275 
1276 	/*
1277 	 * Check if the thread needs to be moved on the blocked chain.
1278 	 * It needs to be moved if either its priority is lower than
1279 	 * the previous thread or higher than the next thread.
1280 	 */
1281 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1282 	uq2 = TAILQ_NEXT(uq, uq_lockq);
1283 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1284 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1285 		/*
1286 		 * Remove thread from blocked chain and determine where
1287 		 * it should be moved to.
1288 		 */
1289 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1290 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1291 			td1 = uq1->uq_thread;
1292 			MPASS(td1->td_proc->p_magic == P_MAGIC);
1293 			if (UPRI(td1) > UPRI(td))
1294 				break;
1295 		}
1296 
1297 		if (uq1 == NULL)
1298 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1299 		else
1300 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1301 	}
1302 	return (1);
1303 }
1304 
1305 /*
1306  * Propagate priority when a thread is blocked on POSIX
1307  * PI mutex.
1308  */
1309 static void
1310 umtx_propagate_priority(struct thread *td)
1311 {
1312 	struct umtx_q *uq;
1313 	struct umtx_pi *pi;
1314 	int pri;
1315 
1316 	mtx_assert(&umtx_lock, MA_OWNED);
1317 	pri = UPRI(td);
1318 	uq = td->td_umtxq;
1319 	pi = uq->uq_pi_blocked;
1320 	if (pi == NULL)
1321 		return;
1322 
1323 	for (;;) {
1324 		td = pi->pi_owner;
1325 		if (td == NULL || td == curthread)
1326 			return;
1327 
1328 		MPASS(td->td_proc != NULL);
1329 		MPASS(td->td_proc->p_magic == P_MAGIC);
1330 
1331 		thread_lock(td);
1332 		if (td->td_lend_user_pri > pri)
1333 			sched_lend_user_prio(td, pri);
1334 		else {
1335 			thread_unlock(td);
1336 			break;
1337 		}
1338 		thread_unlock(td);
1339 
1340 		/*
1341 		 * Pick up the lock that td is blocked on.
1342 		 */
1343 		uq = td->td_umtxq;
1344 		pi = uq->uq_pi_blocked;
1345 		if (pi == NULL)
1346 			break;
1347 		/* Resort td on the list if needed. */
1348 		umtx_pi_adjust_thread(pi, td);
1349 	}
1350 }
1351 
1352 /*
1353  * Unpropagate priority for a PI mutex when a thread blocked on
1354  * it is interrupted by signal or resumed by others.
1355  */
1356 static void
1357 umtx_repropagate_priority(struct umtx_pi *pi)
1358 {
1359 	struct umtx_q *uq, *uq_owner;
1360 	struct umtx_pi *pi2;
1361 	int pri;
1362 
1363 	mtx_assert(&umtx_lock, MA_OWNED);
1364 
1365 	while (pi != NULL && pi->pi_owner != NULL) {
1366 		pri = PRI_MAX;
1367 		uq_owner = pi->pi_owner->td_umtxq;
1368 
1369 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1370 			uq = TAILQ_FIRST(&pi2->pi_blocked);
1371 			if (uq != NULL) {
1372 				if (pri > UPRI(uq->uq_thread))
1373 					pri = UPRI(uq->uq_thread);
1374 			}
1375 		}
1376 
1377 		if (pri > uq_owner->uq_inherited_pri)
1378 			pri = uq_owner->uq_inherited_pri;
1379 		thread_lock(pi->pi_owner);
1380 		sched_lend_user_prio(pi->pi_owner, pri);
1381 		thread_unlock(pi->pi_owner);
1382 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1383 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1384 	}
1385 }
1386 
1387 /*
1388  * Insert a PI mutex into owned list.
1389  */
1390 static void
1391 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1392 {
1393 	struct umtx_q *uq_owner;
1394 
1395 	uq_owner = owner->td_umtxq;
1396 	mtx_assert(&umtx_lock, MA_OWNED);
1397 	if (pi->pi_owner != NULL)
1398 		panic("pi_ower != NULL");
1399 	pi->pi_owner = owner;
1400 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1401 }
1402 
1403 /*
1404  * Claim ownership of a PI mutex.
1405  */
1406 static int
1407 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1408 {
1409 	struct umtx_q *uq, *uq_owner;
1410 
1411 	uq_owner = owner->td_umtxq;
1412 	mtx_lock_spin(&umtx_lock);
1413 	if (pi->pi_owner == owner) {
1414 		mtx_unlock_spin(&umtx_lock);
1415 		return (0);
1416 	}
1417 
1418 	if (pi->pi_owner != NULL) {
1419 		/*
1420 		 * userland may have already messed the mutex, sigh.
1421 		 */
1422 		mtx_unlock_spin(&umtx_lock);
1423 		return (EPERM);
1424 	}
1425 	umtx_pi_setowner(pi, owner);
1426 	uq = TAILQ_FIRST(&pi->pi_blocked);
1427 	if (uq != NULL) {
1428 		int pri;
1429 
1430 		pri = UPRI(uq->uq_thread);
1431 		thread_lock(owner);
1432 		if (pri < UPRI(owner))
1433 			sched_lend_user_prio(owner, pri);
1434 		thread_unlock(owner);
1435 	}
1436 	mtx_unlock_spin(&umtx_lock);
1437 	return (0);
1438 }
1439 
1440 /*
1441  * Adjust a thread's order position in its blocked PI mutex,
1442  * this may result new priority propagating process.
1443  */
1444 void
1445 umtx_pi_adjust(struct thread *td, u_char oldpri)
1446 {
1447 	struct umtx_q *uq;
1448 	struct umtx_pi *pi;
1449 
1450 	uq = td->td_umtxq;
1451 	mtx_lock_spin(&umtx_lock);
1452 	/*
1453 	 * Pick up the lock that td is blocked on.
1454 	 */
1455 	pi = uq->uq_pi_blocked;
1456 	if (pi != NULL) {
1457 		umtx_pi_adjust_thread(pi, td);
1458 		umtx_repropagate_priority(pi);
1459 	}
1460 	mtx_unlock_spin(&umtx_lock);
1461 }
1462 
/*
 * Sleep on a PI mutex.
 *
 * "uq" must belong to the current thread, and the umtx chain of
 * uq->uq_key must be locked and marked busy on entry (both are
 * asserted).  "owner" is the thread id taken from the userland owner
 * word; it is used to look up the owning thread when the PI record has
 * no owner yet.  "wmesg" and "timo" are passed through to umtxq_sleep().
 *
 * The chain is un-busied before sleeping and the chain lock is released
 * before returning.  Returns the result of umtxq_sleep().
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
	umtxq_insert(uq);
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == NULL) {
		/*
		 * The spin lock is dropped around the thread lookup, so
		 * the owner has to be re-checked once it is held again.
		 */
		mtx_unlock_spin(&umtx_lock);
		/* XXX Only look up thread in current process. */
		td1 = tdfind(owner, curproc->p_pid);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL) {
			if (pi->pi_owner == NULL)
				umtx_pi_setowner(pi, td1);
			/*
			 * NOTE(review): tdfind() apparently returns with
			 * the process locked -- confirm.
			 */
			PROC_UNLOCK(td1->td_proc);
		}
	}

	/*
	 * Insert ourselves into the blocked queue in front of the first
	 * entry with a larger UPRI() value, or at the tail.
	 */
	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unbusy(&uq->uq_key);

	error = umtxq_sleep(uq, wmesg, timo);
	umtxq_remove(uq);

	/* Undo the blocked state and recompute the lent priorities. */
	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_repropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	return (error);
}
1529 
1530 /*
1531  * Add reference count for a PI mutex.
1532  */
1533 static void
1534 umtx_pi_ref(struct umtx_pi *pi)
1535 {
1536 	struct umtxq_chain *uc;
1537 
1538 	uc = umtxq_getchain(&pi->pi_key);
1539 	UMTXQ_LOCKED_ASSERT(uc);
1540 	pi->pi_refcount++;
1541 }
1542 
1543 /*
1544  * Decrease reference count for a PI mutex, if the counter
1545  * is decreased to zero, its memory space is freed.
1546  */
1547 static void
1548 umtx_pi_unref(struct umtx_pi *pi)
1549 {
1550 	struct umtxq_chain *uc;
1551 
1552 	uc = umtxq_getchain(&pi->pi_key);
1553 	UMTXQ_LOCKED_ASSERT(uc);
1554 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1555 	if (--pi->pi_refcount == 0) {
1556 		mtx_lock_spin(&umtx_lock);
1557 		if (pi->pi_owner != NULL) {
1558 			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1559 				pi, pi_link);
1560 			pi->pi_owner = NULL;
1561 		}
1562 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1563 			("blocked queue not empty"));
1564 		mtx_unlock_spin(&umtx_lock);
1565 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1566 		umtx_pi_free(pi);
1567 	}
1568 }
1569 
1570 /*
1571  * Find a PI mutex in hash table.
1572  */
1573 static struct umtx_pi *
1574 umtx_pi_lookup(struct umtx_key *key)
1575 {
1576 	struct umtxq_chain *uc;
1577 	struct umtx_pi *pi;
1578 
1579 	uc = umtxq_getchain(key);
1580 	UMTXQ_LOCKED_ASSERT(uc);
1581 
1582 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1583 		if (umtx_key_match(&pi->pi_key, key)) {
1584 			return (pi);
1585 		}
1586 	}
1587 	return (NULL);
1588 }
1589 
1590 /*
1591  * Insert a PI mutex into hash table.
1592  */
1593 static inline void
1594 umtx_pi_insert(struct umtx_pi *pi)
1595 {
1596 	struct umtxq_chain *uc;
1597 
1598 	uc = umtxq_getchain(&pi->pi_key);
1599 	UMTXQ_LOCKED_ASSERT(uc);
1600 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1601 }
1602 
/*
 * Lock a PI mutex.
 *
 * td      - the calling thread; its tid becomes the owner value.
 * m       - the userland mutex.
 * flags   - mutex flags; only the share mode (GET_SHARE) is used here.
 * timeout - optional timeout for the sleep, NULL to wait indefinitely.
 * try     - when non-zero, return EBUSY instead of sleeping.
 *
 * Returns 0 once the mutex is acquired, EFAULT when the owner word
 * cannot be accessed, EBUSY for a failed try-lock, or the error from
 * umtx_key_get(), umtx_pi_claim(), umtxq_check_susp() or
 * umtxq_sleep_pi().
 */
static int
do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
    struct _umtx_time *timeout, int try)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error, rv;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * Find the PI record for this key or create one.  The first
	 * allocation attempt is made without sleeping because the chain
	 * lock is held; if it fails, the lock is dropped for a sleeping
	 * allocation and the lookup is repeated, since another thread
	 * may have inserted a record in the meantime.
	 */
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	/* Hold a reference for the duration of this call. */
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with umtx structure.  It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
		/* The address was invalid. */
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			rv = casueword32(&m->m_owner,
			    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
			/* The address was invalid. */
			if (rv == -1) {
				error = EFAULT;
				break;
			}

			if (owner == UMUTEX_CONTESTED) {
				/*
				 * The owner word is ours; record the
				 * ownership in the kernel PI record too.
				 */
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			error = umtxq_check_susp(td);
			if (error != 0)
				break;

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		rv = casueword32(&m->m_owner, owner, &old,
		    owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		if (old == owner) {
			/*
			 * umtxq_sleep_pi() un-busies the chain and drops
			 * the chain lock itself.  On error, loop once
			 * more so the lock is tried a final time before
			 * the error is returned.
			 */
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
			    "umtxpi", timeout == NULL ? NULL : &timo);
			if (error != 0)
				continue;
		} else {
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
		}

		error = umtxq_check_susp(td);
		if (error != 0)
			break;
	}

	/* Drop the reference taken above; this may free the PI record. */
	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}
1757 
/*
 * Unlock a PI mutex.
 *
 * td    - the calling thread, which must be the owner recorded in the
 *         userland owner word.
 * m     - the userland mutex.
 * flags - mutex flags; only the share mode (GET_SHARE) is used here.
 *
 * Returns 0 on success, EFAULT when the owner word cannot be accessed,
 * EPERM when the caller does not own the mutex (according to either
 * the owner word or the kernel PI record), EINVAL when the owner word
 * changed while unlocking, or the error from umtx_key_get().
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(&m->m_owner, &owner);
	if (error == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
		if (error == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		/* The word changed under us; continue with the new value. */
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	/*
	 * NOTE(review): umtxq_count_pi() presumably returns the number of
	 * waiters and stores the first one in uq_first -- confirm.
	 */
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		mtx_lock_spin(&umtx_lock);
		pi = uq_first->uq_pi_blocked;
		KASSERT(pi != NULL, ("pi == NULL?"));
		if (pi->pi_owner != curthread) {
			mtx_unlock_spin(&umtx_lock);
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			umtx_key_release(&key);
			/* userland messed the mutex */
			return (EPERM);
		}
		/* Give up kernel-side ownership of the PI record. */
		uq_me = curthread->td_umtxq;
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		/* get highest priority thread which is still sleeping. */
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		while (uq_first != NULL &&
		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
		}
		/*
		 * Recompute our lent priority from the first waiters of
		 * the PI mutexes we still have contested.
		 */
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_lend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
		if (uq_first)
			umtxq_signal_thread(uq_first);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	error = casueword32(&m->m_owner, owner, &old,
	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_unbusy_unlocked(&key);
	umtx_key_release(&key);
	if (error == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
1855 
/*
 * Lock a PP mutex.
 *
 * td      - the calling thread; its tid becomes the owner value.
 * m       - the userland mutex; m_ceilings[0] holds the ceiling.
 * flags   - mutex flags; only the share mode (GET_SHARE) is used here.
 * timeout - optional timeout for the sleep, NULL to wait indefinitely.
 * try     - when non-zero, return EBUSY instead of sleeping.
 *
 * Returns 0 once the mutex is acquired, EFAULT when the mutex words
 * cannot be accessed, EINVAL when the ceiling is out of range or the
 * thread's UPRI() value is below the ceiling priority, EBUSY for a
 * failed try-lock, or the error from umtx_key_get() or umtxq_sleep().
 */
static int
do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
    struct _umtx_time *timeout, int try)
{
	struct abs_timeout timo;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su, rv;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/* The ceiling priority is only applied when this check passes. */
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		rv = fueword32(&m->m_ceilings[0], &ceiling);
		if (rv == -1) {
			error = EFAULT;
			goto out;
		}
		/*
		 * The subtraction is unsigned, so a ceiling above
		 * RTP_PRIO_MAX wraps around and fails the range check.
		 */
		ceiling = RTP_PRIO_MAX - ceiling;
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		/* An unlocked PP mutex carries UMUTEX_CONTESTED. */
		rv = casueword32(&m->m_owner,
		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
		/* The address was invalid. */
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
		    NULL : &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * The lock was not obtained in this round: restore the
		 * previous inherited priority and recompute the lent
		 * priority from the PI mutexes we have contested.
		 */
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	/* On failure, undo the priority change made for the last attempt. */
	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	/* Every path reaching here still holds the chain busy. */
	umtxq_unbusy_unlocked(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
1987 
/*
 * Unlock a PP mutex.
 *
 * td    - the calling thread, which must be the owner recorded in the
 *         userland owner word.
 * m     - the userland mutex; m_ceilings[1] supplies the ceiling used
 *         for the new inherited priority, with -1 meaning PRI_MAX.
 * flags - mutex flags; only the share mode (GET_SHARE) is used here.
 *
 * Returns 0 on success, EFAULT when the owner word cannot be read or
 * written, EPERM when the caller is not the owner, EINVAL when the
 * stored ceiling is out of range, or the error from copyin() or
 * umtx_key_get().
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(&m->m_owner, &owner);
	if (error == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		/* Unsigned wrap-around catches ceilings above RTP_PRIO_MAX. */
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For priority protected mutex, always set unlocked state
	 * to UMUTEX_CONTESTED, so that userland always enters kernel
	 * to lock the mutex, it is necessary because thread priority
	 * has to be adjusted for such mutex.
	 */
	error = suword32(&m->m_owner, UMUTEX_CONTESTED);

	/* Wake one waiter only if the owner word was actually released. */
	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		/*
		 * Install the new inherited priority (privileged callers
		 * only) and recompute the lent priority from the PI
		 * mutexes we still have contested.
		 */
		mtx_lock_spin(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}
2072 
2073 static int
2074 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2075 	uint32_t *old_ceiling)
2076 {
2077 	struct umtx_q *uq;
2078 	uint32_t save_ceiling;
2079 	uint32_t owner, id;
2080 	uint32_t flags;
2081 	int error, rv;
2082 
2083 	error = fueword32(&m->m_flags, &flags);
2084 	if (error == -1)
2085 		return (EFAULT);
2086 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2087 		return (EINVAL);
2088 	if (ceiling > RTP_PRIO_MAX)
2089 		return (EINVAL);
2090 	id = td->td_tid;
2091 	uq = td->td_umtxq;
2092 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2093 	   &uq->uq_key)) != 0)
2094 		return (error);
2095 	for (;;) {
2096 		umtxq_lock(&uq->uq_key);
2097 		umtxq_busy(&uq->uq_key);
2098 		umtxq_unlock(&uq->uq_key);
2099 
2100 		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2101 		if (rv == -1) {
2102 			error = EFAULT;
2103 			break;
2104 		}
2105 
2106 		rv = casueword32(&m->m_owner,
2107 		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2108 		if (rv == -1) {
2109 			error = EFAULT;
2110 			break;
2111 		}
2112 
2113 		if (owner == UMUTEX_CONTESTED) {
2114 			suword32(&m->m_ceilings[0], ceiling);
2115 			suword32(&m->m_owner, UMUTEX_CONTESTED);
2116 			error = 0;
2117 			break;
2118 		}
2119 
2120 		if ((owner & ~UMUTEX_CONTESTED) == id) {
2121 			suword32(&m->m_ceilings[0], ceiling);
2122 			error = 0;
2123 			break;
2124 		}
2125 
2126 		/*
2127 		 * If we caught a signal, we have retried and now
2128 		 * exit immediately.
2129 		 */
2130 		if (error != 0)
2131 			break;
2132 
2133 		/*
2134 		 * We set the contested bit, sleep. Otherwise the lock changed
2135 		 * and we need to retry or we lost a race to the thread
2136 		 * unlocking the umtx.
2137 		 */
2138 		umtxq_lock(&uq->uq_key);
2139 		umtxq_insert(uq);
2140 		umtxq_unbusy(&uq->uq_key);
2141 		error = umtxq_sleep(uq, "umtxpp", NULL);
2142 		umtxq_remove(uq);
2143 		umtxq_unlock(&uq->uq_key);
2144 	}
2145 	umtxq_lock(&uq->uq_key);
2146 	if (error == 0)
2147 		umtxq_signal(&uq->uq_key, INT_MAX);
2148 	umtxq_unbusy(&uq->uq_key);
2149 	umtxq_unlock(&uq->uq_key);
2150 	umtx_key_release(&uq->uq_key);
2151 	if (error == 0 && old_ceiling != NULL)
2152 		suword32(old_ceiling, save_ceiling);
2153 	return (error);
2154 }
2155 
2156 /*
2157  * Lock a userland POSIX mutex.
2158  */
2159 static int
2160 do_lock_umutex(struct thread *td, struct umutex *m,
2161     struct _umtx_time *timeout, int mode)
2162 {
2163 	uint32_t flags;
2164 	int error;
2165 
2166 	error = fueword32(&m->m_flags, &flags);
2167 	if (error == -1)
2168 		return (EFAULT);
2169 
2170 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2171 	case 0:
2172 		error = do_lock_normal(td, m, flags, timeout, mode);
2173 		break;
2174 	case UMUTEX_PRIO_INHERIT:
2175 		error = do_lock_pi(td, m, flags, timeout, mode);
2176 		break;
2177 	case UMUTEX_PRIO_PROTECT:
2178 		error = do_lock_pp(td, m, flags, timeout, mode);
2179 		break;
2180 	default:
2181 		return (EINVAL);
2182 	}
2183 	if (timeout == NULL) {
2184 		if (error == EINTR && mode != _UMUTEX_WAIT)
2185 			error = ERESTART;
2186 	} else {
2187 		/* Timed-locking is not restarted. */
2188 		if (error == ERESTART)
2189 			error = EINTR;
2190 	}
2191 	return (error);
2192 }
2193 
2194 /*
2195  * Unlock a userland POSIX mutex.
2196  */
2197 static int
2198 do_unlock_umutex(struct thread *td, struct umutex *m)
2199 {
2200 	uint32_t flags;
2201 	int error;
2202 
2203 	error = fueword32(&m->m_flags, &flags);
2204 	if (error == -1)
2205 		return (EFAULT);
2206 
2207 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2208 	case 0:
2209 		return (do_unlock_normal(td, m, flags));
2210 	case UMUTEX_PRIO_INHERIT:
2211 		return (do_unlock_pi(td, m, flags));
2212 	case UMUTEX_PRIO_PROTECT:
2213 		return (do_unlock_pp(td, m, flags));
2214 	}
2215 
2216 	return (EINVAL);
2217 }
2218 
/*
 * Wait on a userland condition variable.
 *
 * td      - the calling thread.
 * cv      - the userland condition variable.
 * m       - the userland mutex to release before sleeping.
 * timeout - optional timeout, NULL to wait indefinitely.
 * wflags  - CVWAIT_CLOCKID selects the clock stored in cv->c_clockid
 *           (CLOCK_REALTIME otherwise); CVWAIT_ABSTIME is forwarded to
 *           abs_timeout_init() for the timeout.
 *
 * The thread is queued on the condition variable before the mutex is
 * unlocked.  Returns 0 if the thread was taken off the queue by a
 * waker, EFAULT or EINVAL for bad userland data, the error from
 * do_unlock_umutex(), or the sleep error with ERESTART mapped to EINTR.
 */
static int
do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
	struct timespec *timeout, u_long wflags)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t flags, clockid, hasw;
	int error;

	uq = td->td_umtxq;
	error = fueword32(&cv->c_flags, &flags);
	if (error == -1)
		return (EFAULT);
	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);

	if ((wflags & CVWAIT_CLOCKID) != 0) {
		error = fueword32(&cv->c_clockid, &clockid);
		if (error == -1) {
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}
		if (clockid < CLOCK_REALTIME ||
		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
			/* hmm, only HW clock id will work. */
			umtx_key_release(&uq->uq_key);
			return (EINVAL);
		}
	} else {
		clockid = CLOCK_REALTIME;
	}

	/* Queue ourselves before the mutex is released below. */
	umtxq_lock(&uq->uq_key);
	umtxq_busy(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Set c_has_waiters to 1 before releasing user mutex, also
	 * don't modify cache line when unnecessary.
	 */
	error = fueword32(&cv->c_has_waiters, &hasw);
	if (error == 0 && hasw == 0)
		suword32(&cv->c_has_waiters, 1);

	umtxq_unbusy_unlocked(&uq->uq_key);

	error = do_unlock_umutex(td, m);

	if (timeout != NULL)
		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
			timeout);

	/* Sleep only if the mutex was released successfully. */
	umtxq_lock(&uq->uq_key);
	if (error == 0) {
		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
		    NULL : &timo);
	}

	/* No longer on the queue: report success whatever the sleep said. */
	if ((uq->uq_flags & UQF_UMTXQ) == 0)
		error = 0;
	else {
		/*
		 * This must be a timeout, an interruption by a signal or
		 * a spurious wakeup; clear the c_has_waiters flag when
		 * necessary.
		 */
		umtxq_busy(&uq->uq_key);
		/* Re-check the queue state once the chain is busy. */
		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
			int oldlen = uq->uq_cur_queue->length;
			umtxq_remove(uq);
			/* We were the last entry on this queue. */
			if (oldlen == 1) {
				umtxq_unlock(&uq->uq_key);
				suword32(&cv->c_has_waiters, 0);
				umtxq_lock(&uq->uq_key);
			}
		}
		umtxq_unbusy(&uq->uq_key);
		if (error == ERESTART)
			error = EINTR;
	}

	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
2306 
2307 /*
2308  * Signal a userland condition variable.
2309  */
2310 static int
2311 do_cv_signal(struct thread *td, struct ucond *cv)
2312 {
2313 	struct umtx_key key;
2314 	int error, cnt, nwake;
2315 	uint32_t flags;
2316 
2317 	error = fueword32(&cv->c_flags, &flags);
2318 	if (error == -1)
2319 		return (EFAULT);
2320 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2321 		return (error);
2322 	umtxq_lock(&key);
2323 	umtxq_busy(&key);
2324 	cnt = umtxq_count(&key);
2325 	nwake = umtxq_signal(&key, 1);
2326 	if (cnt <= nwake) {
2327 		umtxq_unlock(&key);
2328 		error = suword32(&cv->c_has_waiters, 0);
2329 		if (error == -1)
2330 			error = EFAULT;
2331 		umtxq_lock(&key);
2332 	}
2333 	umtxq_unbusy(&key);
2334 	umtxq_unlock(&key);
2335 	umtx_key_release(&key);
2336 	return (error);
2337 }
2338 
2339 static int
2340 do_cv_broadcast(struct thread *td, struct ucond *cv)
2341 {
2342 	struct umtx_key key;
2343 	int error;
2344 	uint32_t flags;
2345 
2346 	error = fueword32(&cv->c_flags, &flags);
2347 	if (error == -1)
2348 		return (EFAULT);
2349 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2350 		return (error);
2351 
2352 	umtxq_lock(&key);
2353 	umtxq_busy(&key);
2354 	umtxq_signal(&key, INT_MAX);
2355 	umtxq_unlock(&key);
2356 
2357 	error = suword32(&cv->c_has_waiters, 0);
2358 	if (error == -1)
2359 		error = EFAULT;
2360 
2361 	umtxq_unbusy_unlocked(&key);
2362 
2363 	umtx_key_release(&key);
2364 	return (error);
2365 }
2366 
/*
 * Acquire a read (shared) lock on the userland rwlock on behalf of td.
 *
 * The fast path increments the reader count kept in rw_state with a
 * compare-and-swap for as long as none of the bits in wrflags are set.
 * Otherwise the thread sets URWLOCK_READ_WAITERS, increments
 * rw_blocked_readers and sleeps on the umtx queue (wmesg "urdlck") until
 * the blocking bits clear, an error occurs, or the optional timeout
 * expires.  After sleeping it decrements rw_blocked_readers again and, if
 * it was the last blocked reader, clears URWLOCK_READ_WAITERS.
 *
 * Returns 0 once the read lock is held, EAGAIN when the reader count is
 * already URWLOCK_MAX_READERS, EFAULT when the lock words cannot be
 * accessed, or the error from umtx_key_get(), umtxq_check_susp() or
 * umtxq_sleep(); ERESTART is reported to the caller as EINTR.
 */
2367 static int
2368 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2369 {
2370 	struct abs_timeout timo;
2371 	struct umtx_q *uq;
2372 	uint32_t flags, wrflags;
2373 	int32_t state, oldstate;
2374 	int32_t blocked_readers;
2375 	int error, rv;
2376 
2377 	uq = td->td_umtxq;
2378 	error = fueword32(&rwlock->rw_flags, &flags);
2379 	if (error == -1)
2380 		return (EFAULT);
2381 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2382 	if (error != 0)
2383 		return (error);
2384 
2385 	if (timeout != NULL)
2386 		abs_timeout_init2(&timo, timeout);
2387 
	/*
	 * A reader always backs off for a write owner.  Unless reader
	 * preference is requested through fflag or the lock's own flags,
	 * it also backs off for waiting writers.
	 */
2388 	wrflags = URWLOCK_WRITE_OWNER;
2389 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2390 		wrflags |= URWLOCK_WRITE_WAITERS;
2391 
2392 	for (;;) {
2393 		rv = fueword32(&rwlock->rw_state, &state);
2394 		if (rv == -1) {
2395 			umtx_key_release(&uq->uq_key);
2396 			return (EFAULT);
2397 		}
2398 
2399 		/* try to lock it */
2400 		while (!(state & wrflags)) {
2401 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2402 				umtx_key_release(&uq->uq_key);
2403 				return (EAGAIN);
2404 			}
2405 			rv = casueword32(&rwlock->rw_state, state,
2406 			    &oldstate, state + 1);
2407 			if (rv == -1) {
2408 				umtx_key_release(&uq->uq_key);
2409 				return (EFAULT);
2410 			}
			/* The swap took effect: the read lock is now held. */
2411 			if (oldstate == state) {
2412 				umtx_key_release(&uq->uq_key);
2413 				return (0);
2414 			}
2415 			error = umtxq_check_susp(td);
2416 			if (error != 0)
2417 				break;
2418 			state = oldstate;
2419 		}
2420 
2421 		if (error)
2422 			break;
2423 
2424 		/* grab monitor lock */
2425 		umtxq_lock(&uq->uq_key);
2426 		umtxq_busy(&uq->uq_key);
2427 		umtxq_unlock(&uq->uq_key);
2428 
2429 		/*
2430 		 * re-read the state, in case it changed between the try-lock above
2431 		 * and the check below
2432 		 */
2433 		rv = fueword32(&rwlock->rw_state, &state);
2434 		if (rv == -1)
2435 			error = EFAULT;
2436 
2437 		/* set read contention bit */
2438 		while (error == 0 && (state & wrflags) &&
2439 		    !(state & URWLOCK_READ_WAITERS)) {
2440 			rv = casueword32(&rwlock->rw_state, state,
2441 			    &oldstate, state | URWLOCK_READ_WAITERS);
2442 			if (rv == -1) {
2443 				error = EFAULT;
2444 				break;
2445 			}
2446 			if (oldstate == state)
2447 				goto sleep;
2448 			state = oldstate;
2449 			error = umtxq_check_susp(td);
2450 			if (error != 0)
2451 				break;
2452 		}
2453 		if (error != 0) {
2454 			umtxq_unbusy_unlocked(&uq->uq_key);
2455 			break;
2456 		}
2457 
2458 		/* state is changed while setting flags, restart */
2459 		if (!(state & wrflags)) {
2460 			umtxq_unbusy_unlocked(&uq->uq_key);
2461 			error = umtxq_check_susp(td);
2462 			if (error != 0)
2463 				break;
2464 			continue;
2465 		}
2466 
2467 sleep:
2468 		/* contention bit is set, before sleeping, increase read waiter count */
2469 		rv = fueword32(&rwlock->rw_blocked_readers,
2470 		    &blocked_readers);
2471 		if (rv == -1) {
2472 			umtxq_unbusy_unlocked(&uq->uq_key);
2473 			error = EFAULT;
2474 			break;
2475 		}
2476 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2477 
		/*
		 * Sleep until none of the bits in wrflags is set.  The thread
		 * is queued with the queue locked and the busy mark is
		 * dropped just before sleeping; after waking the queue is
		 * marked busy again before the thread is dequeued.
		 */
2478 		while (state & wrflags) {
2479 			umtxq_lock(&uq->uq_key);
2480 			umtxq_insert(uq);
2481 			umtxq_unbusy(&uq->uq_key);
2482 
2483 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2484 			    NULL : &timo);
2485 
2486 			umtxq_busy(&uq->uq_key);
2487 			umtxq_remove(uq);
2488 			umtxq_unlock(&uq->uq_key);
2489 			if (error)
2490 				break;
2491 			rv = fueword32(&rwlock->rw_state, &state);
2492 			if (rv == -1) {
2493 				error = EFAULT;
2494 				break;
2495 			}
2496 		}
2497 
2498 		/* decrease read waiter count, and may clear read contention bit */
2499 		rv = fueword32(&rwlock->rw_blocked_readers,
2500 		    &blocked_readers);
2501 		if (rv == -1) {
2502 			umtxq_unbusy_unlocked(&uq->uq_key);
2503 			error = EFAULT;
2504 			break;
2505 		}
2506 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
		/*
		 * This thread was the last blocked reader: clear
		 * URWLOCK_READ_WAITERS.  The loop is skipped when an error is
		 * already pending.
		 */
2507 		if (blocked_readers == 1) {
2508 			rv = fueword32(&rwlock->rw_state, &state);
2509 			if (rv == -1)
2510 				error = EFAULT;
2511 			while (error == 0) {
2512 				rv = casueword32(&rwlock->rw_state, state,
2513 				    &oldstate, state & ~URWLOCK_READ_WAITERS);
2514 				if (rv == -1) {
2515 					error = EFAULT;
2516 					break;
2517 				}
2518 				if (oldstate == state)
2519 					break;
2520 				state = oldstate;
2521 				error = umtxq_check_susp(td);
2522 			}
2523 		}
2524 
2525 		umtxq_unbusy_unlocked(&uq->uq_key);
2526 		if (error != 0)
2527 			break;
2528 	}
2529 	umtx_key_release(&uq->uq_key);
	/* ERESTART is never returned from here; report EINTR instead. */
2530 	if (error == ERESTART)
2531 		error = EINTR;
2532 	return (error);
2533 }
2534 
/*
 * Acquire the write (exclusive) lock on the userland rwlock on behalf of
 * td.
 *
 * The fast path sets URWLOCK_WRITE_OWNER in rw_state with a
 * compare-and-swap while there is no write owner and the reader count is
 * zero.  Otherwise the thread sets URWLOCK_WRITE_WAITERS, increments
 * rw_blocked_writers and sleeps on the exclusive queue (wmesg "uwrlck")
 * until the lock is free, an error occurs, or the optional timeout
 * expires.  After sleeping it decrements rw_blocked_writers; the last
 * blocked writer clears URWLOCK_WRITE_WAITERS and samples
 * rw_blocked_readers.
 *
 * Returns 0 once the write lock is held, EFAULT when the lock words cannot
 * be accessed, or the error from umtx_key_get(), umtxq_check_susp() or
 * umtxq_sleep(); ERESTART is reported to the caller as EINTR.
 */
2535 static int
2536 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2537 {
2538 	struct abs_timeout timo;
2539 	struct umtx_q *uq;
2540 	uint32_t flags;
2541 	int32_t state, oldstate;
2542 	int32_t blocked_writers;
2543 	int32_t blocked_readers;
2544 	int error, rv;
2545 
2546 	uq = td->td_umtxq;
2547 	error = fueword32(&rwlock->rw_flags, &flags);
2548 	if (error == -1)
2549 		return (EFAULT);
2550 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2551 	if (error != 0)
2552 		return (error);
2553 
2554 	if (timeout != NULL)
2555 		abs_timeout_init2(&timo, timeout);
2556 
2557 	blocked_readers = 0;
2558 	for (;;) {
2559 		rv = fueword32(&rwlock->rw_state, &state);
2560 		if (rv == -1) {
2561 			umtx_key_release(&uq->uq_key);
2562 			return (EFAULT);
2563 		}
		/* Try to take the lock while it has neither owner nor readers. */
2564 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2565 			rv = casueword32(&rwlock->rw_state, state,
2566 			    &oldstate, state | URWLOCK_WRITE_OWNER);
2567 			if (rv == -1) {
2568 				umtx_key_release(&uq->uq_key);
2569 				return (EFAULT);
2570 			}
2571 			if (oldstate == state) {
2572 				umtx_key_release(&uq->uq_key);
2573 				return (0);
2574 			}
2575 			state = oldstate;
2576 			error = umtxq_check_susp(td);
2577 			if (error != 0)
2578 				break;
2579 		}
2580 
		/*
		 * error is not checked at the bottom of the outer loop, so an
		 * error left behind by the sleep path of the previous
		 * iteration is examined here, after one more acquisition
		 * attempt.  Before giving up, wake every thread on the shared
		 * queue if no writer owns or waits for the lock and blocked
		 * readers were seen when this thread stopped waiting.
		 */
2581 		if (error) {
2582 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2583 			    blocked_readers != 0) {
2584 				umtxq_lock(&uq->uq_key);
2585 				umtxq_busy(&uq->uq_key);
2586 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2587 				umtxq_unbusy(&uq->uq_key);
2588 				umtxq_unlock(&uq->uq_key);
2589 			}
2590 
2591 			break;
2592 		}
2593 
2594 		/* grab monitor lock */
2595 		umtxq_lock(&uq->uq_key);
2596 		umtxq_busy(&uq->uq_key);
2597 		umtxq_unlock(&uq->uq_key);
2598 
2599 		/*
2600 		 * re-read the state, in case it changed between the try-lock above
2601 		 * and the check below
2602 		 */
2603 		rv = fueword32(&rwlock->rw_state, &state);
2604 		if (rv == -1)
2605 			error = EFAULT;
2606 
		/* Set the write contention bit while the lock is still held. */
2607 		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
2608 		    URWLOCK_READER_COUNT(state) != 0) &&
2609 		    (state & URWLOCK_WRITE_WAITERS) == 0) {
2610 			rv = casueword32(&rwlock->rw_state, state,
2611 			    &oldstate, state | URWLOCK_WRITE_WAITERS);
2612 			if (rv == -1) {
2613 				error = EFAULT;
2614 				break;
2615 			}
2616 			if (oldstate == state)
2617 				goto sleep;
2618 			state = oldstate;
2619 			error = umtxq_check_susp(td);
2620 			if (error != 0)
2621 				break;
2622 		}
2623 		if (error != 0) {
2624 			umtxq_unbusy_unlocked(&uq->uq_key);
2625 			break;
2626 		}
2627 
		/* The lock became free in the meantime: restart from the top. */
2628 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2629 			umtxq_unbusy_unlocked(&uq->uq_key);
2630 			error = umtxq_check_susp(td);
2631 			if (error != 0)
2632 				break;
2633 			continue;
2634 		}
2635 sleep:
		/* Count this thread as a blocked writer before sleeping. */
2636 		rv = fueword32(&rwlock->rw_blocked_writers,
2637 		    &blocked_writers);
2638 		if (rv == -1) {
2639 			umtxq_unbusy_unlocked(&uq->uq_key);
2640 			error = EFAULT;
2641 			break;
2642 		}
2643 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2644 
		/*
		 * Sleep on the exclusive queue until there is neither a write
		 * owner nor any reader.  The busy mark is dropped just before
		 * sleeping and taken again before the thread is dequeued.
		 */
2645 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2646 			umtxq_lock(&uq->uq_key);
2647 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2648 			umtxq_unbusy(&uq->uq_key);
2649 
2650 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2651 			    NULL : &timo);
2652 
2653 			umtxq_busy(&uq->uq_key);
2654 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2655 			umtxq_unlock(&uq->uq_key);
2656 			if (error)
2657 				break;
2658 			rv = fueword32(&rwlock->rw_state, &state);
2659 			if (rv == -1) {
2660 				error = EFAULT;
2661 				break;
2662 			}
2663 		}
2664 
		/* No longer blocked: take this thread out of the writer count. */
2665 		rv = fueword32(&rwlock->rw_blocked_writers,
2666 		    &blocked_writers);
2667 		if (rv == -1) {
2668 			umtxq_unbusy_unlocked(&uq->uq_key);
2669 			error = EFAULT;
2670 			break;
2671 		}
2672 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
		/*
		 * This thread was the last blocked writer: clear
		 * URWLOCK_WRITE_WAITERS and remember how many readers are
		 * blocked, for the wakeup decision at the top of the loop.
		 */
2673 		if (blocked_writers == 1) {
2674 			rv = fueword32(&rwlock->rw_state, &state);
2675 			if (rv == -1) {
2676 				umtxq_unbusy_unlocked(&uq->uq_key);
2677 				error = EFAULT;
2678 				break;
2679 			}
2680 			for (;;) {
2681 				rv = casueword32(&rwlock->rw_state, state,
2682 				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
2683 				if (rv == -1) {
2684 					error = EFAULT;
2685 					break;
2686 				}
2687 				if (oldstate == state)
2688 					break;
2689 				state = oldstate;
2690 				error = umtxq_check_susp(td);
2691 				/*
2692 				 * We are leaving the URWLOCK_WRITE_WAITERS
2693 				 * behind, but this should not harm the
2694 				 * correctness.
2695 				 */
2696 				if (error != 0)
2697 					break;
2698 			}
2699 			rv = fueword32(&rwlock->rw_blocked_readers,
2700 			    &blocked_readers);
2701 			if (rv == -1) {
2702 				umtxq_unbusy_unlocked(&uq->uq_key);
2703 				error = EFAULT;
2704 				break;
2705 			}
2706 		} else
2707 			blocked_readers = 0;
2708 
2709 		umtxq_unbusy_unlocked(&uq->uq_key);
2710 	}
2711 
2712 	umtx_key_release(&uq->uq_key);
	/* ERESTART is never returned from here; report EINTR instead. */
2713 	if (error == ERESTART)
2714 		error = EINTR;
2715 	return (error);
2716 }
2717 
2718 static int
2719 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2720 {
2721 	struct umtx_q *uq;
2722 	uint32_t flags;
2723 	int32_t state, oldstate;
2724 	int error, rv, q, count;
2725 
2726 	uq = td->td_umtxq;
2727 	error = fueword32(&rwlock->rw_flags, &flags);
2728 	if (error == -1)
2729 		return (EFAULT);
2730 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2731 	if (error != 0)
2732 		return (error);
2733 
2734 	error = fueword32(&rwlock->rw_state, &state);
2735 	if (error == -1) {
2736 		error = EFAULT;
2737 		goto out;
2738 	}
2739 	if (state & URWLOCK_WRITE_OWNER) {
2740 		for (;;) {
2741 			rv = casueword32(&rwlock->rw_state, state,
2742 			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
2743 			if (rv == -1) {
2744 				error = EFAULT;
2745 				goto out;
2746 			}
2747 			if (oldstate != state) {
2748 				state = oldstate;
2749 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2750 					error = EPERM;
2751 					goto out;
2752 				}
2753 				error = umtxq_check_susp(td);
2754 				if (error != 0)
2755 					goto out;
2756 			} else
2757 				break;
2758 		}
2759 	} else if (URWLOCK_READER_COUNT(state) != 0) {
2760 		for (;;) {
2761 			rv = casueword32(&rwlock->rw_state, state,
2762 			    &oldstate, state - 1);
2763 			if (rv == -1) {
2764 				error = EFAULT;
2765 				goto out;
2766 			}
2767 			if (oldstate != state) {
2768 				state = oldstate;
2769 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2770 					error = EPERM;
2771 					goto out;
2772 				}
2773 				error = umtxq_check_susp(td);
2774 				if (error != 0)
2775 					goto out;
2776 			} else
2777 				break;
2778 		}
2779 	} else {
2780 		error = EPERM;
2781 		goto out;
2782 	}
2783 
2784 	count = 0;
2785 
2786 	if (!(flags & URWLOCK_PREFER_READER)) {
2787 		if (state & URWLOCK_WRITE_WAITERS) {
2788 			count = 1;
2789 			q = UMTX_EXCLUSIVE_QUEUE;
2790 		} else if (state & URWLOCK_READ_WAITERS) {
2791 			count = INT_MAX;
2792 			q = UMTX_SHARED_QUEUE;
2793 		}
2794 	} else {
2795 		if (state & URWLOCK_READ_WAITERS) {
2796 			count = INT_MAX;
2797 			q = UMTX_SHARED_QUEUE;
2798 		} else if (state & URWLOCK_WRITE_WAITERS) {
2799 			count = 1;
2800 			q = UMTX_EXCLUSIVE_QUEUE;
2801 		}
2802 	}
2803 
2804 	if (count) {
2805 		umtxq_lock(&uq->uq_key);
2806 		umtxq_busy(&uq->uq_key);
2807 		umtxq_signal_queue(&uq->uq_key, count, q);
2808 		umtxq_unbusy(&uq->uq_key);
2809 		umtxq_unlock(&uq->uq_key);
2810 	}
2811 out:
2812 	umtx_key_release(&uq->uq_key);
2813 	return (error);
2814 }
2815 
2816 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
2817 static int
2818 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2819 {
2820 	struct abs_timeout timo;
2821 	struct umtx_q *uq;
2822 	uint32_t flags, count, count1;
2823 	int error, rv;
2824 
2825 	uq = td->td_umtxq;
2826 	error = fueword32(&sem->_flags, &flags);
2827 	if (error == -1)
2828 		return (EFAULT);
2829 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2830 	if (error != 0)
2831 		return (error);
2832 
2833 	if (timeout != NULL)
2834 		abs_timeout_init2(&timo, timeout);
2835 
2836 	umtxq_lock(&uq->uq_key);
2837 	umtxq_busy(&uq->uq_key);
2838 	umtxq_insert(uq);
2839 	umtxq_unlock(&uq->uq_key);
2840 	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
2841 	if (rv == 0)
2842 		rv = fueword32(&sem->_count, &count);
2843 	if (rv == -1 || count != 0) {
2844 		umtxq_lock(&uq->uq_key);
2845 		umtxq_unbusy(&uq->uq_key);
2846 		umtxq_remove(uq);
2847 		umtxq_unlock(&uq->uq_key);
2848 		umtx_key_release(&uq->uq_key);
2849 		return (rv == -1 ? EFAULT : 0);
2850 	}
2851 	umtxq_lock(&uq->uq_key);
2852 	umtxq_unbusy(&uq->uq_key);
2853 
2854 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2855 
2856 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2857 		error = 0;
2858 	else {
2859 		umtxq_remove(uq);
2860 		/* A relative timeout cannot be restarted. */
2861 		if (error == ERESTART && timeout != NULL &&
2862 		    (timeout->_flags & UMTX_ABSTIME) == 0)
2863 			error = EINTR;
2864 	}
2865 	umtxq_unlock(&uq->uq_key);
2866 	umtx_key_release(&uq->uq_key);
2867 	return (error);
2868 }
2869 
2870 /*
2871  * Signal a userland semaphore.
2872  */
2873 static int
2874 do_sem_wake(struct thread *td, struct _usem *sem)
2875 {
2876 	struct umtx_key key;
2877 	int error, cnt;
2878 	uint32_t flags;
2879 
2880 	error = fueword32(&sem->_flags, &flags);
2881 	if (error == -1)
2882 		return (EFAULT);
2883 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2884 		return (error);
2885 	umtxq_lock(&key);
2886 	umtxq_busy(&key);
2887 	cnt = umtxq_count(&key);
2888 	if (cnt > 0) {
2889 		umtxq_signal(&key, 1);
2890 		/*
2891 		 * Check if count is greater than 0, this means the memory is
2892 		 * still being referenced by user code, so we can safely
2893 		 * update _has_waiters flag.
2894 		 */
2895 		if (cnt == 1) {
2896 			umtxq_unlock(&key);
2897 			error = suword32(&sem->_has_waiters, 0);
2898 			umtxq_lock(&key);
2899 			if (error == -1)
2900 				error = EFAULT;
2901 		}
2902 	}
2903 	umtxq_unbusy(&key);
2904 	umtxq_unlock(&key);
2905 	umtx_key_release(&key);
2906 	return (error);
2907 }
2908 #endif
2909 
2910 static int
2911 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
2912 {
2913 	struct abs_timeout timo;
2914 	struct umtx_q *uq;
2915 	uint32_t count, flags;
2916 	int error, rv;
2917 
2918 	uq = td->td_umtxq;
2919 	flags = fuword32(&sem->_flags);
2920 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2921 	if (error != 0)
2922 		return (error);
2923 
2924 	if (timeout != NULL)
2925 		abs_timeout_init2(&timo, timeout);
2926 
2927 	umtxq_lock(&uq->uq_key);
2928 	umtxq_busy(&uq->uq_key);
2929 	umtxq_insert(uq);
2930 	umtxq_unlock(&uq->uq_key);
2931 	rv = fueword32(&sem->_count, &count);
2932 	if (rv == -1) {
2933 		umtxq_lock(&uq->uq_key);
2934 		umtxq_unbusy(&uq->uq_key);
2935 		umtxq_remove(uq);
2936 		umtxq_unlock(&uq->uq_key);
2937 		umtx_key_release(&uq->uq_key);
2938 		return (EFAULT);
2939 	}
2940 	for (;;) {
2941 		if (USEM_COUNT(count) != 0) {
2942 			umtxq_lock(&uq->uq_key);
2943 			umtxq_unbusy(&uq->uq_key);
2944 			umtxq_remove(uq);
2945 			umtxq_unlock(&uq->uq_key);
2946 			umtx_key_release(&uq->uq_key);
2947 			return (0);
2948 		}
2949 		if (count == USEM_HAS_WAITERS)
2950 			break;
2951 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
2952 		if (rv == -1) {
2953 			umtxq_lock(&uq->uq_key);
2954 			umtxq_unbusy(&uq->uq_key);
2955 			umtxq_remove(uq);
2956 			umtxq_unlock(&uq->uq_key);
2957 			umtx_key_release(&uq->uq_key);
2958 			return (EFAULT);
2959 		}
2960 		if (count == 0)
2961 			break;
2962 	}
2963 	umtxq_lock(&uq->uq_key);
2964 	umtxq_unbusy(&uq->uq_key);
2965 
2966 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2967 
2968 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2969 		error = 0;
2970 	else {
2971 		umtxq_remove(uq);
2972 		/* A relative timeout cannot be restarted. */
2973 		if (error == ERESTART && timeout != NULL &&
2974 		    (timeout->_flags & UMTX_ABSTIME) == 0)
2975 			error = EINTR;
2976 	}
2977 	umtxq_unlock(&uq->uq_key);
2978 	umtx_key_release(&uq->uq_key);
2979 	return (error);
2980 }
2981 
2982 /*
2983  * Signal a userland semaphore.
2984  */
2985 static int
2986 do_sem2_wake(struct thread *td, struct _usem2 *sem)
2987 {
2988 	struct umtx_key key;
2989 	int error, cnt, rv;
2990 	uint32_t count, flags;
2991 
2992 	rv = fueword32(&sem->_flags, &flags);
2993 	if (rv == -1)
2994 		return (EFAULT);
2995 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2996 		return (error);
2997 	umtxq_lock(&key);
2998 	umtxq_busy(&key);
2999 	cnt = umtxq_count(&key);
3000 	if (cnt > 0) {
3001 		umtxq_signal(&key, 1);
3002 
3003 		/*
3004 		 * If this was the last sleeping thread, clear the waiters
3005 		 * flag in _count.
3006 		 */
3007 		if (cnt == 1) {
3008 			umtxq_unlock(&key);
3009 			rv = fueword32(&sem->_count, &count);
3010 			while (rv != -1 && count & USEM_HAS_WAITERS)
3011 				rv = casueword32(&sem->_count, count, &count,
3012 				    count & ~USEM_HAS_WAITERS);
3013 			if (rv == -1)
3014 				error = EFAULT;
3015 			umtxq_lock(&key);
3016 		}
3017 	}
3018 	umtxq_unbusy(&key);
3019 	umtxq_unlock(&key);
3020 	umtx_key_release(&key);
3021 	return (error);
3022 }
3023 
3024 inline int
3025 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3026 {
3027 	int error;
3028 
3029 	error = copyin(addr, tsp, sizeof(struct timespec));
3030 	if (error == 0) {
3031 		if (tsp->tv_sec < 0 ||
3032 		    tsp->tv_nsec >= 1000000000 ||
3033 		    tsp->tv_nsec < 0)
3034 			error = EINVAL;
3035 	}
3036 	return (error);
3037 }
3038 
3039 static inline int
3040 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3041 {
3042 	int error;
3043 
3044 	if (size <= sizeof(struct timespec)) {
3045 		tp->_clockid = CLOCK_REALTIME;
3046 		tp->_flags = 0;
3047 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3048 	} else
3049 		error = copyin(addr, tp, sizeof(struct _umtx_time));
3050 	if (error != 0)
3051 		return (error);
3052 	if (tp->_timeout.tv_sec < 0 ||
3053 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3054 		return (EINVAL);
3055 	return (0);
3056 }
3057 
3058 static int
3059 __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
3060 {
3061 
3062 	return (EOPNOTSUPP);
3063 }
3064 
3065 static int
3066 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3067 {
3068 	struct _umtx_time timeout, *tm_p;
3069 	int error;
3070 
3071 	if (uap->uaddr2 == NULL)
3072 		tm_p = NULL;
3073 	else {
3074 		error = umtx_copyin_umtx_time(
3075 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3076 		if (error != 0)
3077 			return (error);
3078 		tm_p = &timeout;
3079 	}
3080 	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
3081 }
3082 
3083 static int
3084 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3085 {
3086 	struct _umtx_time timeout, *tm_p;
3087 	int error;
3088 
3089 	if (uap->uaddr2 == NULL)
3090 		tm_p = NULL;
3091 	else {
3092 		error = umtx_copyin_umtx_time(
3093 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3094 		if (error != 0)
3095 			return (error);
3096 		tm_p = &timeout;
3097 	}
3098 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3099 }
3100 
3101 static int
3102 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3103 {
3104 	struct _umtx_time *tm_p, timeout;
3105 	int error;
3106 
3107 	if (uap->uaddr2 == NULL)
3108 		tm_p = NULL;
3109 	else {
3110 		error = umtx_copyin_umtx_time(
3111 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3112 		if (error != 0)
3113 			return (error);
3114 		tm_p = &timeout;
3115 	}
3116 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3117 }
3118 
3119 static int
3120 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3121 {
3122 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3123 }
3124 
3125 #define BATCH_SIZE	128
3126 static int
3127 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3128 {
3129 	int count = uap->val;
3130 	void *uaddrs[BATCH_SIZE];
3131 	char **upp = (char **)uap->obj;
3132 	int tocopy;
3133 	int error = 0;
3134 	int i, pos = 0;
3135 
3136 	while (count > 0) {
3137 		tocopy = count;
3138 		if (tocopy > BATCH_SIZE)
3139 			tocopy = BATCH_SIZE;
3140 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3141 		if (error != 0)
3142 			break;
3143 		for (i = 0; i < tocopy; ++i)
3144 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3145 		count -= tocopy;
3146 		pos += tocopy;
3147 	}
3148 	return (error);
3149 }
3150 
3151 static int
3152 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3153 {
3154 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3155 }
3156 
3157 static int
3158 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3159 {
3160 	struct _umtx_time *tm_p, timeout;
3161 	int error;
3162 
3163 	/* Allow a null timespec (wait forever). */
3164 	if (uap->uaddr2 == NULL)
3165 		tm_p = NULL;
3166 	else {
3167 		error = umtx_copyin_umtx_time(
3168 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3169 		if (error != 0)
3170 			return (error);
3171 		tm_p = &timeout;
3172 	}
3173 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3174 }
3175 
3176 static int
3177 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3178 {
3179 	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3180 }
3181 
3182 static int
3183 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3184 {
3185 	struct _umtx_time *tm_p, timeout;
3186 	int error;
3187 
3188 	/* Allow a null timespec (wait forever). */
3189 	if (uap->uaddr2 == NULL)
3190 		tm_p = NULL;
3191 	else {
3192 		error = umtx_copyin_umtx_time(
3193 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3194 		if (error != 0)
3195 			return (error);
3196 		tm_p = &timeout;
3197 	}
3198 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3199 }
3200 
3201 static int
3202 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3203 {
3204 	return do_wake_umutex(td, uap->obj);
3205 }
3206 
3207 static int
3208 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3209 {
3210 	return do_unlock_umutex(td, uap->obj);
3211 }
3212 
3213 static int
3214 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3215 {
3216 	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3217 }
3218 
3219 static int
3220 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3221 {
3222 	struct timespec *ts, timeout;
3223 	int error;
3224 
3225 	/* Allow a null timespec (wait forever). */
3226 	if (uap->uaddr2 == NULL)
3227 		ts = NULL;
3228 	else {
3229 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3230 		if (error != 0)
3231 			return (error);
3232 		ts = &timeout;
3233 	}
3234 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3235 }
3236 
3237 static int
3238 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3239 {
3240 	return do_cv_signal(td, uap->obj);
3241 }
3242 
3243 static int
3244 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3245 {
3246 	return do_cv_broadcast(td, uap->obj);
3247 }
3248 
3249 static int
3250 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3251 {
3252 	struct _umtx_time timeout;
3253 	int error;
3254 
3255 	/* Allow a null timespec (wait forever). */
3256 	if (uap->uaddr2 == NULL) {
3257 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3258 	} else {
3259 		error = umtx_copyin_umtx_time(uap->uaddr2,
3260 		   (size_t)uap->uaddr1, &timeout);
3261 		if (error != 0)
3262 			return (error);
3263 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3264 	}
3265 	return (error);
3266 }
3267 
3268 static int
3269 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3270 {
3271 	struct _umtx_time timeout;
3272 	int error;
3273 
3274 	/* Allow a null timespec (wait forever). */
3275 	if (uap->uaddr2 == NULL) {
3276 		error = do_rw_wrlock(td, uap->obj, 0);
3277 	} else {
3278 		error = umtx_copyin_umtx_time(uap->uaddr2,
3279 		   (size_t)uap->uaddr1, &timeout);
3280 		if (error != 0)
3281 			return (error);
3282 
3283 		error = do_rw_wrlock(td, uap->obj, &timeout);
3284 	}
3285 	return (error);
3286 }
3287 
3288 static int
3289 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3290 {
3291 	return do_rw_unlock(td, uap->obj);
3292 }
3293 
3294 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3295 static int
3296 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3297 {
3298 	struct _umtx_time *tm_p, timeout;
3299 	int error;
3300 
3301 	/* Allow a null timespec (wait forever). */
3302 	if (uap->uaddr2 == NULL)
3303 		tm_p = NULL;
3304 	else {
3305 		error = umtx_copyin_umtx_time(
3306 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3307 		if (error != 0)
3308 			return (error);
3309 		tm_p = &timeout;
3310 	}
3311 	return (do_sem_wait(td, uap->obj, tm_p));
3312 }
3313 
3314 static int
3315 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3316 {
3317 	return do_sem_wake(td, uap->obj);
3318 }
3319 #endif
3320 
3321 static int
3322 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3323 {
3324 	return do_wake2_umutex(td, uap->obj, uap->val);
3325 }
3326 
3327 static int
3328 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
3329 {
3330 	struct _umtx_time *tm_p, timeout;
3331 	int error;
3332 
3333 	/* Allow a null timespec (wait forever). */
3334 	if (uap->uaddr2 == NULL)
3335 		tm_p = NULL;
3336 	else {
3337 		error = umtx_copyin_umtx_time(
3338 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3339 		if (error != 0)
3340 			return (error);
3341 		tm_p = &timeout;
3342 	}
3343 	return (do_sem2_wait(td, uap->obj, tm_p));
3344 }
3345 
3346 static int
3347 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
3348 {
3349 	return do_sem2_wake(td, uap->obj);
3350 }
3351 
/* Signature shared by every _umtx_op operation handler. */
3352 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3353 
/*
 * Handler table for sys__umtx_op(), which indexes it directly with the
 * operation code (uap->op).  The position of each entry is therefore
 * expected to equal the numeric value of the UMTX_OP_* constant named
 * beside it; entries must not be reordered.  Operations that are not
 * available map to __umtx_op_unimpl.
 */
3354 static _umtx_op_func op_table[] = {
3355 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED0 */
3356 	__umtx_op_unimpl,		/* UMTX_OP_RESERVED1 */
3357 	__umtx_op_wait,			/* UMTX_OP_WAIT */
3358 	__umtx_op_wake,			/* UMTX_OP_WAKE */
3359 	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3360 	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3361 	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3362 	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3363 	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3364 	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3365 	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3366 	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3367 	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3368 	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3369 	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3370 	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3371 	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3372 	__umtx_op_wait_umutex,		/* UMTX_OP_MUTEX_WAIT */
3373 	__umtx_op_wake_umutex,		/* UMTX_OP_MUTEX_WAKE */
	/* The old semaphore operations exist only with the compat options. */
3374 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3375 	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3376 	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3377 #else
3378 	__umtx_op_unimpl,		/* UMTX_OP_SEM_WAIT */
3379 	__umtx_op_unimpl,		/* UMTX_OP_SEM_WAKE */
3380 #endif
3381 	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
3382 	__umtx_op_wake2_umutex,		/* UMTX_OP_MUTEX_WAKE2 */
3383 	__umtx_op_sem2_wait,		/* UMTX_OP_SEM2_WAIT */
3384 	__umtx_op_sem2_wake,		/* UMTX_OP_SEM2_WAKE */
3385 };
3386 
3387 int
3388 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3389 {
3390 	if ((unsigned)uap->op < UMTX_OP_MAX)
3391 		return (*op_table[uap->op])(td, uap);
3392 	return (EINVAL);
3393 }
3394 
3395 #ifdef COMPAT_FREEBSD32
3396 
/*
 * Time value with two 32-bit fields, as copied in from user space by
 * umtx_copyin_timeout32() and umtx_copyin_umtx_time32().
 */
3397 struct timespec32 {
3398 	int32_t tv_sec;
3399 	int32_t tv_nsec;
3400 };
3401 
/*
 * 32-bit counterpart of struct _umtx_time; umtx_copyin_umtx_time32()
 * copies it in and converts it field by field.
 */
3402 struct umtx_time32 {
3403 	struct	timespec32	timeout;
3404 	uint32_t		flags;
3405 	uint32_t		clockid;
3406 };
3407 
3408 static inline int
3409 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3410 {
3411 	struct timespec32 ts32;
3412 	int error;
3413 
3414 	error = copyin(addr, &ts32, sizeof(struct timespec32));
3415 	if (error == 0) {
3416 		if (ts32.tv_sec < 0 ||
3417 		    ts32.tv_nsec >= 1000000000 ||
3418 		    ts32.tv_nsec < 0)
3419 			error = EINVAL;
3420 		else {
3421 			tsp->tv_sec = ts32.tv_sec;
3422 			tsp->tv_nsec = ts32.tv_nsec;
3423 		}
3424 	}
3425 	return (error);
3426 }
3427 
3428 static inline int
3429 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3430 {
3431 	struct umtx_time32 t32;
3432 	int error;
3433 
3434 	t32.clockid = CLOCK_REALTIME;
3435 	t32.flags   = 0;
3436 	if (size <= sizeof(struct timespec32))
3437 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3438 	else
3439 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3440 	if (error != 0)
3441 		return (error);
3442 	if (t32.timeout.tv_sec < 0 ||
3443 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3444 		return (EINVAL);
3445 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3446 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3447 	tp->_flags = t32.flags;
3448 	tp->_clockid = t32.clockid;
3449 	return (0);
3450 }
3451 
3452 static int
3453 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3454 {
3455 	struct _umtx_time *tm_p, timeout;
3456 	int error;
3457 
3458 	if (uap->uaddr2 == NULL)
3459 		tm_p = NULL;
3460 	else {
3461 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3462 			(size_t)uap->uaddr1, &timeout);
3463 		if (error != 0)
3464 			return (error);
3465 		tm_p = &timeout;
3466 	}
3467 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3468 }
3469 
3470 static int
3471 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3472 {
3473 	struct _umtx_time *tm_p, timeout;
3474 	int error;
3475 
3476 	/* Allow a null timespec (wait forever). */
3477 	if (uap->uaddr2 == NULL)
3478 		tm_p = NULL;
3479 	else {
3480 		error = umtx_copyin_umtx_time(uap->uaddr2,
3481 			    (size_t)uap->uaddr1, &timeout);
3482 		if (error != 0)
3483 			return (error);
3484 		tm_p = &timeout;
3485 	}
3486 	return do_lock_umutex(td, uap->obj, tm_p, 0);
3487 }
3488 
3489 static int
3490 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3491 {
3492 	struct _umtx_time *tm_p, timeout;
3493 	int error;
3494 
3495 	/* Allow a null timespec (wait forever). */
3496 	if (uap->uaddr2 == NULL)
3497 		tm_p = NULL;
3498 	else {
3499 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3500 		    (size_t)uap->uaddr1, &timeout);
3501 		if (error != 0)
3502 			return (error);
3503 		tm_p = &timeout;
3504 	}
3505 	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3506 }
3507 
3508 static int
3509 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3510 {
3511 	struct timespec *ts, timeout;
3512 	int error;
3513 
3514 	/* Allow a null timespec (wait forever). */
3515 	if (uap->uaddr2 == NULL)
3516 		ts = NULL;
3517 	else {
3518 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3519 		if (error != 0)
3520 			return (error);
3521 		ts = &timeout;
3522 	}
3523 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3524 }
3525 
3526 static int
3527 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3528 {
3529 	struct _umtx_time timeout;
3530 	int error;
3531 
3532 	/* Allow a null timespec (wait forever). */
3533 	if (uap->uaddr2 == NULL) {
3534 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3535 	} else {
3536 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3537 		    (size_t)uap->uaddr1, &timeout);
3538 		if (error != 0)
3539 			return (error);
3540 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3541 	}
3542 	return (error);
3543 }
3544 
3545 static int
3546 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3547 {
3548 	struct _umtx_time timeout;
3549 	int error;
3550 
3551 	/* Allow a null timespec (wait forever). */
3552 	if (uap->uaddr2 == NULL) {
3553 		error = do_rw_wrlock(td, uap->obj, 0);
3554 	} else {
3555 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3556 		    (size_t)uap->uaddr1, &timeout);
3557 		if (error != 0)
3558 			return (error);
3559 		error = do_rw_wrlock(td, uap->obj, &timeout);
3560 	}
3561 	return (error);
3562 }
3563 
3564 static int
3565 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3566 {
3567 	struct _umtx_time *tm_p, timeout;
3568 	int error;
3569 
3570 	if (uap->uaddr2 == NULL)
3571 		tm_p = NULL;
3572 	else {
3573 		error = umtx_copyin_umtx_time32(
3574 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3575 		if (error != 0)
3576 			return (error);
3577 		tm_p = &timeout;
3578 	}
3579 	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3580 }
3581 
3582 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
3583 static int
3584 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3585 {
3586 	struct _umtx_time *tm_p, timeout;
3587 	int error;
3588 
3589 	/* Allow a null timespec (wait forever). */
3590 	if (uap->uaddr2 == NULL)
3591 		tm_p = NULL;
3592 	else {
3593 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3594 		    (size_t)uap->uaddr1, &timeout);
3595 		if (error != 0)
3596 			return (error);
3597 		tm_p = &timeout;
3598 	}
3599 	return (do_sem_wait(td, uap->obj, tm_p));
3600 }
3601 #endif
3602 
3603 static int
3604 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3605 {
3606 	struct _umtx_time *tm_p, timeout;
3607 	int error;
3608 
3609 	/* Allow a null timespec (wait forever). */
3610 	if (uap->uaddr2 == NULL)
3611 		tm_p = NULL;
3612 	else {
3613 		error = umtx_copyin_umtx_time32(uap->uaddr2,
3614 		    (size_t)uap->uaddr1, &timeout);
3615 		if (error != 0)
3616 			return (error);
3617 		tm_p = &timeout;
3618 	}
3619 	return (do_sem2_wait(td, uap->obj, tm_p));
3620 }
3621 
3622 static int
3623 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3624 {
3625 	int count = uap->val;
3626 	uint32_t uaddrs[BATCH_SIZE];
3627 	uint32_t **upp = (uint32_t **)uap->obj;
3628 	int tocopy;
3629 	int error = 0;
3630 	int i, pos = 0;
3631 
3632 	while (count > 0) {
3633 		tocopy = count;
3634 		if (tocopy > BATCH_SIZE)
3635 			tocopy = BATCH_SIZE;
3636 		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3637 		if (error != 0)
3638 			break;
3639 		for (i = 0; i < tocopy; ++i)
3640 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3641 				INT_MAX, 1);
3642 		count -= tocopy;
3643 		pos += tocopy;
3644 	}
3645 	return (error);
3646 }
3647 
/*
 * Dispatch table used by freebsd32_umtx_op(): the handler is selected by
 * indexing with the operation number, so the order of the entries is
 * significant.  The trailing comment on each entry names the operation
 * that slot serves.  Entries with a compat32/32 suffix are the 32-bit
 * aware handlers; the remaining handlers are defined elsewhere in this
 * file.
 */
static _umtx_op_func op_table_compat32[] = {
	__umtx_op_unimpl,		/* UMTX_OP_RESERVED0 */
	__umtx_op_unimpl,		/* UMTX_OP_RESERVED1 */
	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
	__umtx_op_wait_umutex_compat32, /* UMTX_OP_MUTEX_WAIT */
	__umtx_op_wake_umutex,		/* UMTX_OP_MUTEX_WAKE */
#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
#else
	/* Without the compat options the old semaphore ops are unimplemented. */
	__umtx_op_unimpl,		/* UMTX_OP_SEM_WAIT */
	__umtx_op_unimpl,		/* UMTX_OP_SEM_WAKE */
#endif
	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
	__umtx_op_wake2_umutex,		/* UMTX_OP_MUTEX_WAKE2 */
	__umtx_op_sem2_wait_compat32,	/* UMTX_OP_SEM2_WAIT */
	__umtx_op_sem2_wake,		/* UMTX_OP_SEM2_WAKE */
};
3680 
3681 int
3682 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3683 {
3684 	if ((unsigned)uap->op < UMTX_OP_MAX)
3685 		return (*op_table_compat32[uap->op])(td,
3686 			(struct _umtx_op_args *)uap);
3687 	return (EINVAL);
3688 }
3689 #endif
3690 
3691 void
3692 umtx_thread_init(struct thread *td)
3693 {
3694 	td->td_umtxq = umtxq_alloc();
3695 	td->td_umtxq->uq_thread = td;
3696 }
3697 
3698 void
3699 umtx_thread_fini(struct thread *td)
3700 {
3701 	umtxq_free(td->td_umtxq);
3702 }
3703 
3704 /*
3705  * It will be called when new thread is created, e.g fork().
3706  */
3707 void
3708 umtx_thread_alloc(struct thread *td)
3709 {
3710 	struct umtx_q *uq;
3711 
3712 	uq = td->td_umtxq;
3713 	uq->uq_inherited_pri = PRI_MAX;
3714 
3715 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3716 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3717 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3718 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3719 }
3720 
3721 /*
3722  * exec() hook.
3723  */
3724 static void
3725 umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3726 	struct image_params *imgp __unused)
3727 {
3728 	umtx_thread_cleanup(curthread);
3729 }
3730 
/*
 * thread_exit() hook: clean up the umtx state of the given thread by
 * delegating to umtx_thread_cleanup().
 */
void
umtx_thread_exit(struct thread *td)
{
	umtx_thread_cleanup(td);
}
3739 
3740 /*
3741  * clean up umtx data.
3742  */
3743 static void
3744 umtx_thread_cleanup(struct thread *td)
3745 {
3746 	struct umtx_q *uq;
3747 	struct umtx_pi *pi;
3748 
3749 	if ((uq = td->td_umtxq) == NULL)
3750 		return;
3751 
3752 	mtx_lock_spin(&umtx_lock);
3753 	uq->uq_inherited_pri = PRI_MAX;
3754 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3755 		pi->pi_owner = NULL;
3756 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3757 	}
3758 	mtx_unlock_spin(&umtx_lock);
3759 	thread_lock(td);
3760 	sched_lend_user_prio(td, PRI_MAX);
3761 	thread_unlock(td);
3762 }
3763